Skip to content

Commit 06b0851

Browse files
authored
feat: add ga2 catalog data and scripts (#752) (#772)
1 parent 7fac580 commit 06b0851

28 files changed

+400547
-60
lines changed

.github/workflows/run-checks.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ jobs:
3737
- name: Test LinkML Python generation
3838
# Generate Python code from the main LinkML schemas, discarding the output; this will catch more subtle errors such as references to nonexistent elements.
3939
run: npm run test-gen-python
40-
- name: Validate catalog files
40+
- name: Validate BRC catalog files
4141
# Validate the catalog source files against their corresponding LinkML schemas.
42-
run: npm run validate-catalog
42+
run: npm run validate-brc-catalog
43+
- name: Validate GA2 catalog files
44+
# Validate the GA2 catalog source files.
45+
run: npm run validate-ga2-catalog

.github/workflows/update-catalog.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ jobs:
2727
- name: Install npm dependencies
2828
run: npm ci
2929
- name: Run catalog script
30-
run: npm run build-files-from-ncbi
30+
run: npm run build-brc-from-ncbi
3131
- name: Get current date
3232
id: date
3333
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT

.prettierignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,5 @@ venv
1919

2020
#catalog
2121
/catalog/output
22+
/catalog/ga2/output
2223
/catalog/schema/generated

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ pip install -r ./catalog/build/py/requirements.txt
2929
Then run the script:
3030

3131
```shell
32-
npm run build-files-from-ncbi
32+
npm run build-brc-from-ncbi
3333
```
3434

3535
The environment can be deactivated by running `deactivate`, and re-activated by running `source ./venv/bin/activate`.

app/apis/catalog/common/utils.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
/**
2+
* Sanitize an entity ID by replacing all periods with underscores.
3+
* @param entityId - Entity ID.
4+
* @returns sanitized entity ID.
5+
*/
6+
export function sanitizeEntityId(entityId?: string): string {
7+
if (!entityId) return "";
8+
return entityId.replace(/\./g, "_");
9+
}

app/apis/catalog/ga2/entities.ts

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import { ORGANISM_PLOIDY } from "../brc-analytics-catalog/common/schema-entities";
2+
3+
export type GA2Catalog = GA2AssemblyEntity | GA2OrganismEntity;
4+
5+
export interface GA2AssemblyEntity {
6+
accession: string;
7+
annotationStatus: string | null;
8+
chromosomes: number | null;
9+
coverage: string | null;
10+
gcPercent: number | null;
11+
geneModelUrl: string | null;
12+
isRef: "No" | "Yes";
13+
length: number;
14+
level: string;
15+
lineageTaxonomyIds: string[];
16+
ncbiTaxonomyId: string;
17+
ploidy: ORGANISM_PLOIDY[];
18+
scaffoldCount: number | null;
19+
scaffoldL50: number | null;
20+
scaffoldN50: number | null;
21+
speciesTaxonomyId: string;
22+
sra_data: SRAData[];
23+
strainName: string | null;
24+
taxonomicGroup: string[];
25+
taxonomicLevelSpecies: string;
26+
taxonomicLevelStrain: string;
27+
tolId: string;
28+
ucscBrowserUrl: string | null;
29+
}
30+
31+
export interface GA2OrganismEntity {
32+
assemblyCount: number;
33+
assemblyTaxonomyIds: string[];
34+
genomes: GA2AssemblyEntity[];
35+
maxScaffoldN50: number | null;
36+
ncbiTaxonomyId: string;
37+
taxonomicGroup: string[];
38+
taxonomicLevelSpecies: string;
39+
tolId: string;
40+
}
41+
42+
export interface SRAData {
43+
accession: string;
44+
biosample: string;
45+
instrument: string;
46+
library_layout: string;
47+
library_source: string;
48+
library_strategy: string;
49+
platform: string;
50+
run_total_bases: number | null;
51+
sra_run_acc: string;
52+
sra_sample_acc: string;
53+
sra_study_acc: string;
54+
total_bases: number | null;
55+
}

app/apis/catalog/ga2/utils.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import { sanitizeEntityId } from "../common/utils";
2+
import { GA2AssemblyEntity, GA2OrganismEntity } from "./entities";
3+
4+
/**
5+
* Get the ID of the given assembly entity.
6+
* @param entity - Entity.
7+
* @returns entity ID.
8+
*/
9+
export function getAssemblyId(entity?: GA2AssemblyEntity): string {
10+
return sanitizeEntityId(entity?.accession);
11+
}
12+
13+
/**
14+
* Get the title of the given assembly entity.
15+
* @param entity - Entity.
16+
* @returns entity title.
17+
*/
18+
export function getAssemblyTitle(entity?: GA2AssemblyEntity): string {
19+
return entity?.taxonomicLevelSpecies || "";
20+
}
21+
22+
/**
23+
* Get the ID of the given organism entity.
24+
* @param entity - Entity.
25+
* @returns entity ID.
26+
*/
27+
export function getOrganismId(entity?: GA2OrganismEntity): string {
28+
return entity?.ncbiTaxonomyId || "";
29+
}

catalog/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ This directory provides the catalog data (information on genome assemblies, orga
66
- `intermediate` - Intermediate data files.
77
- `py` - Python scripts.
88
- `ts` - TypeScript scripts.
9+
- `ga2` - Catalog directory for Genome Ark 2, with contents laid out in an analogous manner to this one.
910
- `output` - JSON files output by the catalog build process, to be consumed by the app.
1011
- `py_package` - Python package used to share catalog features, such as the schemas and build process, with other projects.
1112
- `schema` - Schema-related scripts and derived models.

catalog/build/ts/build-assemblies.ts

Lines changed: 20 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,21 @@ import {
33
Outbreak,
44
} from "../../../app/apis/catalog/brc-analytics-catalog/common/entities";
55
import { getGenomeId } from "../../../app/apis/catalog/brc-analytics-catalog/common/utils";
6-
import { Organisms as SourceOrganisms } from "../../schema/generated/schema";
76
import { SOURCE_GENOME_KEYS } from "./constants";
87
import { SourceGenome } from "./entities";
98
import {
109
defaultStringToNone,
10+
getOutbreakMatchingLineage,
11+
getPloidyForAssembly,
12+
getSourceOrganismsByTaxonomyId,
13+
getSpeciesStrainName,
1114
parseBoolean,
1215
parseList,
16+
parseListOrNull,
1317
parseNumber,
1418
parseNumberOrNull,
1519
parseStringOrNull,
1620
readValuesFile,
17-
readYamlFile,
1821
verifyUniqueIds,
1922
} from "./utils";
2023

@@ -29,31 +32,18 @@ export async function buildAssemblies(
2932
undefined,
3033
SOURCE_GENOME_KEYS
3134
);
32-
const sourceOrganisms = await readYamlFile<SourceOrganisms>(
35+
const sourceOrganismsByTaxonomyId = await getSourceOrganismsByTaxonomyId(
3336
SOURCE_PATH_ORGANISMS
3437
);
35-
const sourceOrganismsByTaxonomyId = new Map(
36-
sourceOrganisms.organisms.map((sourceOrganism) => [
37-
String(sourceOrganism.taxonomy_id),
38-
sourceOrganism,
39-
])
40-
);
4138
const mappedRows: BRCDataCatalogGenome[] = [];
4239
for (const row of sourceRows) {
43-
const ploidy = sourceOrganismsByTaxonomyId.get(
44-
row.speciesTaxonomyId
45-
)?.ploidy;
46-
if (ploidy === undefined) {
47-
console.log(
48-
`Skipping assembly ${row.accession} [tax_id: ${row.speciesTaxonomyId}] - ploidy not found`
49-
);
50-
continue;
51-
}
52-
const taxonomicLevelStrain =
53-
row.taxonomicLevelStrain ||
54-
(row.strain
55-
? `${row.taxonomicLevelSpecies} strain ${row.strain}`
56-
: "None");
40+
const ploidy = getPloidyForAssembly(
41+
sourceOrganismsByTaxonomyId,
42+
row.speciesTaxonomyId,
43+
true,
44+
row.accession
45+
);
46+
if (ploidy === null) continue;
5747
const lineageTaxonomyIds = parseList(row.lineageTaxonomyIds);
5848
const outbreak = getOutbreakMatchingLineage(
5949
outbreaksByTaxonomyId,
@@ -72,7 +62,7 @@ export async function buildAssemblies(
7262
level: row.level,
7363
lineageTaxonomyIds,
7464
ncbiTaxonomyId: row.taxonomyId,
75-
otherTaxa: row.otherTaxa ? row.otherTaxa.split(",") : null,
65+
otherTaxa: parseListOrNull(row.otherTaxa),
7666
ploidy,
7767
priority: outbreak?.priority ?? null,
7868
priorityPathogenName: outbreak?.name ?? null,
@@ -81,7 +71,7 @@ export async function buildAssemblies(
8171
scaffoldN50: parseNumberOrNull(row.scaffoldN50),
8272
speciesTaxonomyId: row.speciesTaxonomyId,
8373
strainName: parseStringOrNull(row.strain),
84-
taxonomicGroup: row.taxonomicGroup ? row.taxonomicGroup.split(",") : [],
74+
taxonomicGroup: parseList(row.taxonomicGroup),
8575
taxonomicLevelClass: defaultStringToNone(row.taxonomicLevelClass),
8676
taxonomicLevelDomain: defaultStringToNone(row.taxonomicLevelDomain),
8777
taxonomicLevelFamily: defaultStringToNone(row.taxonomicLevelFamily),
@@ -93,7 +83,11 @@ export async function buildAssemblies(
9383
taxonomicLevelRealm: defaultStringToNone(row.taxonomicLevelRealm),
9484
taxonomicLevelSerotype: defaultStringToNone(row.taxonomicLevelSerotype),
9585
taxonomicLevelSpecies: defaultStringToNone(row.taxonomicLevelSpecies),
96-
taxonomicLevelStrain,
86+
taxonomicLevelStrain: getSpeciesStrainName(
87+
row.taxonomicLevelSpecies,
88+
row.taxonomicLevelStrain,
89+
row.strain
90+
),
9791
ucscBrowserUrl: parseStringOrNull(row.ucscBrowser),
9892
});
9993
}
@@ -103,20 +97,3 @@ export async function buildAssemblies(
10397
verifyUniqueIds("assembly", sortedRows, getGenomeId);
10498
return sortedRows;
10599
}
106-
107-
/**
108-
* Get the outbreak associated with the first of the given lineage taxa that has an associated outbreak, or null if none is found.
109-
* @param outbreaksByTaxonomyId - Map from taxonomy ID (number) to outbreak.
110-
* @param lineageTaxonomyIds - Taxonomic lineage (array of taxonomy ID strings).
111-
* @returns matching outbreak, or null.
112-
*/
113-
function getOutbreakMatchingLineage(
114-
outbreaksByTaxonomyId: Map<number, Outbreak>,
115-
lineageTaxonomyIds: string[]
116-
): Outbreak | null {
117-
for (const stringId of lineageTaxonomyIds) {
118-
const outbreak = outbreaksByTaxonomyId.get(Number(stringId));
119-
if (outbreak !== undefined) return outbreak;
120-
}
121-
return null;
122-
}

catalog/build/ts/build-organisms.ts

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,12 @@ import {
33
BRCDataCatalogOrganism,
44
} from "../../../app/apis/catalog/brc-analytics-catalog/common/entities";
55
import { getOrganismId } from "../../../app/apis/catalog/brc-analytics-catalog/common/utils";
6-
import { accumulateArrayValue, verifyUniqueIds } from "./utils";
6+
import {
7+
accumulateArrayOrNullValues,
8+
accumulateArrayValue,
9+
incrementValue,
10+
verifyUniqueIds,
11+
} from "./utils";
712

813
export function buildOrganisms(
914
genomes: BRCDataCatalogGenome[]
@@ -27,17 +32,18 @@ function buildOrganism(
2732
genome: BRCDataCatalogGenome
2833
): BRCDataCatalogOrganism {
2934
return {
30-
assemblyCount: (organism?.assemblyCount ?? 0) + 1,
35+
assemblyCount: incrementValue(organism?.assemblyCount),
3136
assemblyTaxonomyIds: accumulateArrayValue(
3237
organism?.assemblyTaxonomyIds,
3338
genome.ncbiTaxonomyId
3439
),
3540
commonName: genome.commonName,
3641
genomes: accumulateArrayValue(organism?.genomes, genome),
3742
ncbiTaxonomyId: genome.speciesTaxonomyId,
38-
otherTaxa: genome.otherTaxa
39-
? accumulateArrayValue(organism?.otherTaxa || [], ...genome.otherTaxa)
40-
: organism?.otherTaxa || null,
43+
otherTaxa: accumulateArrayOrNullValues(
44+
organism?.otherTaxa,
45+
genome.otherTaxa
46+
),
4147
priority: organism?.priority ?? genome.priority,
4248
priorityPathogenName:
4349
organism?.priorityPathogenName ?? genome.priorityPathogenName,

0 commit comments

Comments
 (0)