
Commit b208c53

feat: add linkml schemas for data source files (#269, #270) (#283)
1 parent b9e3a38 commit b208c53

25 files changed: +568 −47 lines

.github/workflows/run-checks.yml

Lines changed: 34 additions & 8 deletions
```diff
@@ -2,15 +2,41 @@ name: Run checks
 on: [pull_request]
 
 jobs:
-  build:
+  run-checks:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-node@v2
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
         with:
           node-version: "20.10.0"
-      - run: |
-          npm ci
-          npm run check-format
-          npm run lint
-          npx tsc --noEmit
+      - name: Cache npm cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.npm
+          key: ${{ runner.os }}-node-${{ hashFiles('package-lock.json') }}
+      - name: Install dependencies
+        run: npm ci
+      - name: Run Prettier
+        run: npm run check-format
+      - name: Run Linter (ESLint)
+        run: npm run lint
+      - name: Type Check
+        run: npx tsc --noEmit
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12.4"
+          cache: "pip"
+          cache-dependency-path: "./catalog/build/py/requirements.txt"
+      - name: Install Python dependencies
+        run: pip install -r ./catalog/build/py/requirements.txt
+      - name: Run linkml-lint
+        # Run linting on the LinkML schemas, to enforce conventions such as in naming, and to catch simple errors.
+        run: linkml-lint ./catalog/schema --validate --verbose
+      - name: Test LinkML Python generation
+        # Generate Python code from the main LinkML schemas, discarding the output; this will catch more subtle errors such as references to nonexistent elements.
+        run: npm run test-gen-python
+      - name: Validate catalog files
+        # Validate the catalog source files against their corresponding LinkML schemas.
+        run: npm run validate-catalog
```

.linkmllint.yaml

Lines changed: 4 additions & 0 deletions
```diff
@@ -0,0 +1,4 @@
+extends: recommended
+rules:
+  standard_naming:
+    permissible_values_upper_case: true
```

README.md

Lines changed: 10 additions & 0 deletions
````diff
@@ -16,6 +16,8 @@ Run `npm run build:local` to build. The built app can be run using `npm start`,
 
 ## Building the data source files
 
+Using Python version 3.12.4 is recommended.
+
 Create a Python virtual environment and install requirements:
 
 ```shell
@@ -55,3 +57,11 @@ the `# Anopheles gambiae` comment:
 ```yaml
 - accession: XXX_000000000.0
 ```
+
+## Overview of automated checks on catalog content
+
+The `run-checks` GitHub workflow performs checks to ensure that the catalog data and schemas are well-formed; this is done by:
+
+- Linting the schemas via `linkml-lint`.
+- Converting the schemas to Python, to catch any errors that occur.
+- Validating the catalog source files against their corresponding schemas.
````
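To make the third check concrete, here is a minimal sketch of what a `validate-catalog` script could look like; the actual npm script's implementation is not shown in this commit, and the schema/source filenames below are assumptions for illustration. It assumes the `linkml-validate` CLI that ships with the pinned `linkml` package:

```typescript
// validate-catalog.ts - hypothetical sketch, not the repository's actual script.
import { execFileSync } from "child_process";

// Assumed schema-to-source pairings; the real filenames may differ.
const PAIRS: [schema: string, source: string][] = [
  ["./catalog/schema/workflows.yaml", "./catalog/source/workflows.yml"],
];

for (const [schema, source] of PAIRS) {
  // linkml-validate exits nonzero on a validation failure; execFileSync then
  // throws, which in turn fails the CI step.
  execFileSync("linkml-validate", ["--schema", schema, source], {
    stdio: "inherit",
  });
}
```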

app/apis/catalog/brc-analytics-catalog/common/entities.ts

Lines changed: 5 additions & 3 deletions
```diff
@@ -44,9 +44,9 @@ export interface EntitiesResponsePagination {
 }
 
 export interface WorkflowCategory {
+  category: string;
   description: string;
   name: string;
-  type: string;
   workflows: Workflow[];
 }
 
@@ -58,6 +58,8 @@ export interface Workflow {
 }
 
 export enum WORKFLOW_PLOIDY {
-  ANY = "any",
-  HAPLOID = "haploid",
+  ANY = "ANY",
+  DIPLOID = "DIPLOID",
+  HAPLOID = "HAPLOID",
+  POLYPLOID = "POLYPLOID",
 }
```
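The renamed `category` field and the upper-cased ploidy values (consistent with the `permissible_values_upper_case` rule in `.linkmllint.yaml`) change the shape of catalog objects. A small sketch of a conforming value, assuming the `WorkflowCategory`, `Workflow`, and `WORKFLOW_PLOIDY` types shown above, with all field values invented:

```typescript
// Sketch only; the types are those defined in entities.ts above.
const exampleCategory: WorkflowCategory = {
  category: "VARIANT_CALLING", // stable identifier, formerly `type`
  description: "Example description.",
  name: "Variant calling",
  workflows: [
    {
      ploidy: WORKFLOW_PLOIDY.HAPLOID, // serializes as "HAPLOID"
      trsId: "https://dockstore.org/api/ga4gh/trs/v2/tools/...",
      workflowDescription: "Example workflow description.",
      workflowName: "Example workflow",
    },
  ],
};
```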

app/components/Entity/components/AnalysisMethodsCatalog/analysisMethodsCatalog.tsx

Lines changed: 1 addition & 1 deletion
```diff
@@ -21,7 +21,7 @@ export const AnalysisMethodsCatalog = ({
   );
   return (
     <AnalysisMethod
-      key={workflowCategory.type}
+      key={workflowCategory.category}
       geneModelUrl={geneModelUrl}
       genomeVersionAssemblyId={genomeVersionAssemblyId}
       content={
```

catalog/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -7,4 +7,5 @@ This directory provides the catalog data (information on genome assemblies, orga
 - `py` - Python scripts.
 - `ts` - Typescript scripts.
 - `output` - JSON files output by the catalog build process, to be consumed by the app.
+- `schema` - LinkML schemas for source files.
 - `source` - YAML files providing data used as input for building the catalog.
```

catalog/build/py/requirements.txt

Lines changed: 65 additions & 0 deletions
```diff
@@ -1,12 +1,77 @@
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+arrow==1.3.0
+attrs==25.1.0
 certifi==2024.8.30
+CFGraph==0.2.1
+chardet==5.2.0
 charset-normalizer==3.3.2
+click==8.1.8
+curies==0.10.4
+Deprecated==1.2.18
+et_xmlfile==2.0.0
+fqdn==1.5.1
+graphviz==0.20.3
+hbreader==0.9.1
 idna==3.8
+iniconfig==2.0.0
+isodate==0.7.2
+isoduration==20.11.0
+Jinja2==3.1.5
+json-flattener==0.1.9
+jsonasobj==1.3.1
+jsonasobj2==1.0.4
+jsonpatch==1.33
+jsonpath-ng==1.7.0
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+linkml==1.8.6
+linkml-dataops==0.1.0
+linkml-runtime==1.8.3
+MarkupSafe==3.0.2
 numpy==2.1.0
+openpyxl==3.1.5
+packaging==24.2
 pandas==2.2.2
+parse==1.20.2
+pluggy==1.5.0
+ply==3.11
+prefixcommons==0.1.12
+prefixmaps==0.2.6
+pydantic==2.10.6
+pydantic_core==2.27.2
+PyJSG==0.11.10
+pyparsing==3.2.1
+PyShEx==0.8.1
+PyShExC==0.9.1
+pytest==8.3.4
+pytest-logging==2015.11.4
 python-dateutil==2.9.0.post0
+PyTrie==0.4.0
 pytz==2024.1
 PyYAML==6.0.2
+rdflib==7.1.3
+rdflib-jsonld==0.6.1
+rdflib-shim==1.0.3
+referencing==0.36.2
 requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3987==1.3.8
+rpds-py==0.22.3
+ruamel.yaml==0.18.10
+ruamel.yaml.clib==0.2.12
+ShExJSG==0.8.2
 six==1.16.0
+sortedcontainers==2.4.0
+sparqlslurper==0.5.1
+SPARQLWrapper==2.0.0
+SQLAlchemy==2.0.38
+types-python-dateutil==2.9.0.20241206
+typing_extensions==4.12.2
 tzdata==2024.1
+uri-template==1.3.0
 urllib3==2.2.2
+watchdog==6.0.0
+webcolors==24.11.1
+wrapt==1.17.2
```

catalog/build/ts/build-catalog.ts

Lines changed: 13 additions & 11 deletions
```diff
@@ -105,10 +105,10 @@ async function buildWorkflows(): Promise<WorkflowCategory[]> {
 
   const workflowCategories: WorkflowCategory[] =
     sourceWorkflowCategories.workflow_categories.map(
-      ({ description, name, type }) => ({
+      ({ category, description, name }) => ({
+        category,
         description,
         name,
-        type,
         workflows: [],
       })
     );
@@ -123,21 +123,23 @@ async function buildWorkflows(): Promise<WorkflowCategory[]> {
 function buildWorkflow(
   workflowCategories: WorkflowCategory[],
   {
+    categories,
     ploidy,
     trs_id: trsId,
-    type,
     workflow_description: workflowDescription,
     workflow_name: workflowName,
   }: SourceWorkflow
 ): void {
-  const category = workflowCategories.find((c) => c.type === type);
-  if (!category) throw new Error(`Unknown workflow category: ${type}`);
-  category.workflows.push({
-    ploidy,
-    trsId,
-    workflowDescription,
-    workflowName,
-  });
+  for (const category of categories) {
+    const workflowCategory = workflowCategories.find((c) => c.category === category);
+    if (!workflowCategory) throw new Error(`Unknown workflow category: ${category}`);
+    workflowCategory.workflows.push({
+      ploidy,
+      trsId,
+      workflowDescription,
+      workflowName,
+    });
+  }
 }
 
 async function readValuesFile<T>(
```
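The key behavioral change here is that a single source workflow can now appear under several categories: `buildWorkflow` loops over `categories` instead of looking up a single `type`. A self-contained sketch of that fan-out, with the types inlined and the data invented for illustration:

```typescript
// Illustrative only - the real types live in entities.ts.
interface Workflow {
  ploidy: string;
  trsId: string;
}
interface WorkflowCategory {
  category: string;
  workflows: Workflow[];
}

const workflowCategories: WorkflowCategory[] = [
  { category: "VARIANT_CALLING", workflows: [] },
  { category: "ASSEMBLY", workflows: [] },
];

// One source workflow referencing two categories.
const categories = ["VARIANT_CALLING", "ASSEMBLY"];
const workflow: Workflow = { ploidy: "ANY", trsId: "example-trs-id" };

for (const category of categories) {
  const target = workflowCategories.find((c) => c.category === category);
  // An unknown category name is a hard error, as in buildWorkflow above.
  if (!target) throw new Error(`Unknown workflow category: ${category}`);
  target.workflows.push(workflow);
}
// Both the "VARIANT_CALLING" and "ASSEMBLY" buckets now contain the workflow.
```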

catalog/build/ts/entities.ts

Lines changed: 2 additions & 2 deletions
```diff
@@ -23,9 +23,9 @@ export interface SourceGenome {
 
 export interface SourceWorkflowCategories {
   workflow_categories: {
+    category: string;
     description: string;
     name: string;
-    type: string;
   }[];
 }
 
@@ -34,9 +34,9 @@
 }
 
 export interface SourceWorkflow {
+  categories: string[];
   ploidy: WORKFLOW_PLOIDY;
   trs_id: string;
-  type: string;
   workflow_description: string;
   workflow_name: string;
 }
```
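For reference, a source entry matching the updated `SourceWorkflow` shape would look like the following sketch: an invented TypeScript literal mirroring the YAML source format, which uses snake_case keys in contrast to the camelCase output shown earlier.

```typescript
// Invented example values; SourceWorkflow is the interface defined above and
// WORKFLOW_PLOIDY is the enum from the app entities.
const exampleSourceWorkflow: SourceWorkflow = {
  categories: ["TRANSCRIPTOMICS"], // now a list, replacing the single `type`
  ploidy: WORKFLOW_PLOIDY.ANY,
  trs_id: "https://dockstore.org/api/ga4gh/trs/v2/tools/...",
  workflow_description: "Example description.",
  workflow_name: "Example workflow",
};
```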

catalog/output/workflows.json

Lines changed: 9 additions & 9 deletions
```diff
@@ -1,59 +1,59 @@
 [
   {
+    "category": "VARIANT_CALLING",
     "description": "Identify nucleotide polymorphisms and short indels from Illumina and Element data.",
     "name": "Variant calling",
-    "type": "VARIANT_CALLING",
     "workflows": [
       {
-        "ploidy": "haploid",
+        "ploidy": "HAPLOID",
         "trsId": "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/haploid-variant-calling-wgs-pe/main/versions/v0.1",
         "workflowDescription": "Workflow for variant analysis against a reference genome in GenBank format",
         "workflowName": "Paired end variant calling in haploid system"
       }
     ]
   },
   {
+    "category": "TRANSCRIPTOMICS",
     "description": "Analyze bulk or single-cell RNA seq data using a variety of approaches.",
     "name": "Transcriptomics",
-    "type": "TRANSCRIPTOMICS",
     "workflows": [
       {
-        "ploidy": "any",
+        "ploidy": "ANY",
         "trsId": "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/rnaseq-pe/main/versions/v0.9",
         "workflowDescription": "This workflow takes as input a list of paired-end fastqs. Adapters and bad quality bases are removed with cutadapt. Reads are mapped with STAR with ENCODE parameters and genes are counted simultaneously as well as normalized coverage (per million mapped reads) on uniquely mapped reads. The counts are reprocessed to be similar to HTSeq-count output. FPKM are computed with cufflinks and/or with StringTie. The unstranded normalized coverage is computed with bedtools.",
         "workflowName": "RNAseq_PE"
       }
     ]
   },
   {
+    "category": "REGULATION",
     "description": "Workflows for the analysis of ChIP-seq, ATAC-Seq, and beyond.",
     "name": "Regulation",
-    "type": "REGULATION",
     "workflows": [
       {
-        "ploidy": "any",
+        "ploidy": "ANY",
         "trsId": "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/chipseq-pe/main/versions/v0.12",
         "workflowDescription": "This workflow takes as input a collection of paired fastqs. Remove adapters with cutadapt, map pairs with bowtie2. Keep MAPQ30 and concordant pairs. MACS2 for paired bam.",
         "workflowName": "ChIPseq_PE"
       }
     ]
   },
   {
+    "category": "ASSEMBLY",
     "description": "Assemble prokaryotic and eukaryotic genomes sequenced with a variety of technologies.",
     "name": "Assembly",
-    "type": "ASSEMBLY",
     "workflows": []
   },
   {
+    "category": "GENOME_COMPARISONS",
     "description": "Workflows for creation of pairwise and multiple genome alignments.",
     "name": "Genome comparisons",
-    "type": "GENOME_COMPARISONS",
     "workflows": []
   },
   {
+    "category": "PROTEIN_FOLDING",
     "description": "Analysis of protein folding using the ColabFold framework.",
     "name": "Protein folding",
-    "type": "PROTEIN_FOLDING",
     "workflows": []
   }
 ]
```
