Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/workflows/run-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ jobs:
- name: Test LinkML Python generation
# Generate Python code from the main LinkML schemas, discarding the output; this will catch more subtle errors such as references to nonexistent elements.
run: npm run test-gen-python
- name: Validate catalog files
- name: Validate BRC catalog files
# Validate the catalog source files against their corresponding LinkML schemas.
run: npm run validate-catalog
run: npm run validate-brc-catalog
- name: Validate GA2 catalog files
# Validate the GA2 catalog source files.
run: npm run validate-ga2-catalog
2 changes: 1 addition & 1 deletion .github/workflows/update-catalog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
- name: Install npm dependencies
run: npm ci
- name: Run catalog script
run: npm run build-files-from-ncbi
run: npm run build-brc-from-ncbi
- name: Get current date
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
Expand Down
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ venv

#catalog
/catalog/output
/catalog/ga2/output
/catalog/schema/generated
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pip install -r ./catalog/build/py/requirements.txt
Then run the script:

```shell
npm run build-files-from-ncbi
npm run build-brc-from-ncbi
```

The environment can be deactivated by running `deactivate`, and re-activated by running `source ./venv/bin/activate`.
Expand Down
9 changes: 9 additions & 0 deletions app/apis/catalog/common/utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/**
 * Produce a sanitized form of an entity ID in which every "." is swapped for "_".
 * @param entityId - Entity ID; may be omitted.
 * @returns sanitized entity ID, or an empty string when no ID (or an empty ID) is given.
 */
export function sanitizeEntityId(entityId?: string): string {
  // A missing ID collapses to the empty string, which passes through unchanged.
  const rawId = entityId ?? "";
  return rawId.split(".").join("_");
}
55 changes: 55 additions & 0 deletions app/apis/catalog/ga2/entities.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import { ORGANISM_PLOIDY } from "../brc-analytics-catalog/common/schema-entities";

/**
* Union of the GA2 catalog entity types: a value is either an assembly entity or an organism entity.
*/
export type GA2Catalog = GA2AssemblyEntity | GA2OrganismEntity;

/**
* Shape of a single assembly record in the GA2 catalog.
* Properties typed with `| null` are nullable; all others are always present.
*/
export interface GA2AssemblyEntity {
accession: string;
annotationStatus: string | null;
chromosomes: number | null;
coverage: string | null;
gcPercent: number | null;
geneModelUrl: string | null;
// String flag restricted to the literals "No" and "Yes" (not a boolean).
isRef: "No" | "Yes";
length: number;
level: string;
// Taxonomy IDs are carried as strings throughout this interface.
lineageTaxonomyIds: string[];
ncbiTaxonomyId: string;
// Uses the ploidy type shared with the BRC Analytics catalog schema entities.
ploidy: ORGANISM_PLOIDY[];
scaffoldCount: number | null;
scaffoldL50: number | null;
scaffoldN50: number | null;
speciesTaxonomyId: string;
// NOTE(review): snake_case key, unlike the camelCase keys around it — presumably mirrors the source data; confirm before renaming.
sra_data: SRAData[];
strainName: string | null;
taxonomicGroup: string[];
taxonomicLevelSpecies: string;
taxonomicLevelStrain: string;
tolId: string;
ucscBrowserUrl: string | null;
}

/**
* Shape of a single organism record in the GA2 catalog; its assemblies are held in `genomes`.
*/
export interface GA2OrganismEntity {
// NOTE(review): presumably the number of entries in `genomes` — confirm against the build script.
assemblyCount: number;
assemblyTaxonomyIds: string[];
genomes: GA2AssemblyEntity[];
maxScaffoldN50: number | null;
ncbiTaxonomyId: string;
taxonomicGroup: string[];
taxonomicLevelSpecies: string;
tolId: string;
}

/**
* Shape of one entry in the `sra_data` array of a GA2 assembly entity.
* Keys are snake_case; the two base-count properties are nullable numbers, all other properties are strings.
* NOTE(review): the field names look like sequencing-run metadata columns — confirm their meaning against the data source.
*/
export interface SRAData {
accession: string;
biosample: string;
instrument: string;
library_layout: string;
library_source: string;
library_strategy: string;
platform: string;
run_total_bases: number | null;
sra_run_acc: string;
sra_sample_acc: string;
sra_study_acc: string;
total_bases: number | null;
}
29 changes: 29 additions & 0 deletions app/apis/catalog/ga2/utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import { sanitizeEntityId } from "../common/utils";
import { GA2AssemblyEntity, GA2OrganismEntity } from "./entities";

/**
 * Derive the ID of an assembly entity from its accession, sanitized via `sanitizeEntityId`.
 * @param entity - Assembly entity; may be omitted.
 * @returns sanitized accession of the entity.
 */
export function getAssemblyId(entity?: GA2AssemblyEntity): string {
  const accession = entity?.accession;
  return sanitizeEntityId(accession);
}

/**
 * Derive the display title of an assembly entity from its species-level taxonomic name.
 * @param entity - Assembly entity; may be omitted.
 * @returns the entity's `taxonomicLevelSpecies`, or an empty string when it is missing or empty.
 */
export function getAssemblyTitle(entity?: GA2AssemblyEntity): string {
  const species = entity?.taxonomicLevelSpecies;
  if (species) return species;
  return "";
}

/**
 * Derive the ID of an organism entity from its NCBI taxonomy ID.
 * @param entity - Organism entity; may be omitted.
 * @returns the entity's `ncbiTaxonomyId`, or an empty string when it is missing or empty.
 */
export function getOrganismId(entity?: GA2OrganismEntity): string {
  const taxonomyId = entity?.ncbiTaxonomyId;
  if (taxonomyId) return taxonomyId;
  return "";
}
1 change: 1 addition & 0 deletions catalog/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ This directory provides the catalog data (information on genome assemblies, orga
- `intermediate` - Intermediate data files.
- `py` - Python scripts.
- `ts` - TypeScript scripts.
- `ga2` - Catalog directory for Genome Ark 2, with contents laid out in an analogous manner to this one.
- `output` - JSON files output by the catalog build process, to be consumed by the app.
- `py_package` - Python package used to share catalog features, such as the schemas and build process, with other projects.
- `schema` - Schema-related scripts and derived models.
Expand Down
63 changes: 20 additions & 43 deletions catalog/build/ts/build-assemblies.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,21 @@ import {
Outbreak,
} from "../../../app/apis/catalog/brc-analytics-catalog/common/entities";
import { getGenomeId } from "../../../app/apis/catalog/brc-analytics-catalog/common/utils";
import { Organisms as SourceOrganisms } from "../../schema/generated/schema";
import { SOURCE_GENOME_KEYS } from "./constants";
import { SourceGenome } from "./entities";
import {
defaultStringToNone,
getOutbreakMatchingLineage,
getPloidyForAssembly,
getSourceOrganismsByTaxonomyId,
getSpeciesStrainName,
parseBoolean,
parseList,
parseListOrNull,
parseNumber,
parseNumberOrNull,
parseStringOrNull,
readValuesFile,
readYamlFile,
verifyUniqueIds,
} from "./utils";

Expand All @@ -29,31 +32,18 @@ export async function buildAssemblies(
undefined,
SOURCE_GENOME_KEYS
);
const sourceOrganisms = await readYamlFile<SourceOrganisms>(
const sourceOrganismsByTaxonomyId = await getSourceOrganismsByTaxonomyId(
SOURCE_PATH_ORGANISMS
);
const sourceOrganismsByTaxonomyId = new Map(
sourceOrganisms.organisms.map((sourceOrganism) => [
String(sourceOrganism.taxonomy_id),
sourceOrganism,
])
);
const mappedRows: BRCDataCatalogGenome[] = [];
for (const row of sourceRows) {
const ploidy = sourceOrganismsByTaxonomyId.get(
row.speciesTaxonomyId
)?.ploidy;
if (ploidy === undefined) {
console.log(
`Skipping assembly ${row.accession} [tax_id: ${row.speciesTaxonomyId}] - ploidy not found`
);
continue;
}
const taxonomicLevelStrain =
row.taxonomicLevelStrain ||
(row.strain
? `${row.taxonomicLevelSpecies} strain ${row.strain}`
: "None");
const ploidy = getPloidyForAssembly(
sourceOrganismsByTaxonomyId,
row.speciesTaxonomyId,
true,
row.accession
);
if (ploidy === null) continue;
const lineageTaxonomyIds = parseList(row.lineageTaxonomyIds);
const outbreak = getOutbreakMatchingLineage(
outbreaksByTaxonomyId,
Expand All @@ -72,7 +62,7 @@ export async function buildAssemblies(
level: row.level,
lineageTaxonomyIds,
ncbiTaxonomyId: row.taxonomyId,
otherTaxa: row.otherTaxa ? row.otherTaxa.split(",") : null,
otherTaxa: parseListOrNull(row.otherTaxa),
ploidy,
priority: outbreak?.priority ?? null,
priorityPathogenName: outbreak?.name ?? null,
Expand All @@ -81,7 +71,7 @@ export async function buildAssemblies(
scaffoldN50: parseNumberOrNull(row.scaffoldN50),
speciesTaxonomyId: row.speciesTaxonomyId,
strainName: parseStringOrNull(row.strain),
taxonomicGroup: row.taxonomicGroup ? row.taxonomicGroup.split(",") : [],
taxonomicGroup: parseList(row.taxonomicGroup),
taxonomicLevelClass: defaultStringToNone(row.taxonomicLevelClass),
taxonomicLevelDomain: defaultStringToNone(row.taxonomicLevelDomain),
taxonomicLevelFamily: defaultStringToNone(row.taxonomicLevelFamily),
Expand All @@ -93,7 +83,11 @@ export async function buildAssemblies(
taxonomicLevelRealm: defaultStringToNone(row.taxonomicLevelRealm),
taxonomicLevelSerotype: defaultStringToNone(row.taxonomicLevelSerotype),
taxonomicLevelSpecies: defaultStringToNone(row.taxonomicLevelSpecies),
taxonomicLevelStrain,
taxonomicLevelStrain: getSpeciesStrainName(
row.taxonomicLevelSpecies,
row.taxonomicLevelStrain,
row.strain
),
ucscBrowserUrl: parseStringOrNull(row.ucscBrowser),
});
}
Expand All @@ -103,20 +97,3 @@ export async function buildAssemblies(
verifyUniqueIds("assembly", sortedRows, getGenomeId);
return sortedRows;
}

/**
 * Find the outbreak for a taxonomic lineage: walk the lineage in order and return the outbreak
 * of the first taxon that has one.
 * @param outbreaksByTaxonomyId - Map from taxonomy ID (number) to outbreak.
 * @param lineageTaxonomyIds - Taxonomic lineage (array of taxonomy ID strings).
 * @returns the first matching outbreak, or null when no lineage taxon has an associated outbreak.
 */
function getOutbreakMatchingLineage(
  outbreaksByTaxonomyId: Map<number, Outbreak>,
  lineageTaxonomyIds: string[]
): Outbreak | null {
  // Lineage IDs are strings while the map is keyed by number, so convert before each lookup.
  const firstMatch = lineageTaxonomyIds
    .map((taxonomyId) => outbreaksByTaxonomyId.get(Number(taxonomyId)))
    .find((candidate) => candidate !== undefined);
  return firstMatch ?? null;
}
16 changes: 11 additions & 5 deletions catalog/build/ts/build-organisms.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ import {
BRCDataCatalogOrganism,
} from "../../../app/apis/catalog/brc-analytics-catalog/common/entities";
import { getOrganismId } from "../../../app/apis/catalog/brc-analytics-catalog/common/utils";
import { accumulateArrayValue, verifyUniqueIds } from "./utils";
import {
accumulateArrayOrNullValues,
accumulateArrayValue,
incrementValue,
verifyUniqueIds,
} from "./utils";

export function buildOrganisms(
genomes: BRCDataCatalogGenome[]
Expand All @@ -27,17 +32,18 @@ function buildOrganism(
genome: BRCDataCatalogGenome
): BRCDataCatalogOrganism {
return {
assemblyCount: (organism?.assemblyCount ?? 0) + 1,
assemblyCount: incrementValue(organism?.assemblyCount),
assemblyTaxonomyIds: accumulateArrayValue(
organism?.assemblyTaxonomyIds,
genome.ncbiTaxonomyId
),
commonName: genome.commonName,
genomes: accumulateArrayValue(organism?.genomes, genome),
ncbiTaxonomyId: genome.speciesTaxonomyId,
otherTaxa: genome.otherTaxa
? accumulateArrayValue(organism?.otherTaxa || [], ...genome.otherTaxa)
: organism?.otherTaxa || null,
otherTaxa: accumulateArrayOrNullValues(
organism?.otherTaxa,
genome.otherTaxa
),
priority: organism?.priority ?? genome.priority,
priorityPathogenName:
organism?.priorityPathogenName ?? genome.priorityPathogenName,
Expand Down
17 changes: 12 additions & 5 deletions catalog/build/ts/constants.ts
Original file line number Diff line number Diff line change
@@ -1,21 +1,31 @@
export const SOURCE_GENOME_KEYS = [
/**
* Names of columns that are expected to be in the TSV output by the initial build script, regardless of configuration.
*/
export const CORE_SOURCE_GENOME_KEYS = [
"accession",
"annotationStatus",
"chromosomeCount",
"commonName",
"coverage",
"gcPercent",
"geneModelUrl",
"isRef",
"length",
"level",
"lineageTaxonomyIds",
"otherTaxa",
"scaffoldCount",
"scaffoldL50",
"scaffoldN50",
"species",
"speciesTaxonomyId",
"strain",
"taxonomyId",
"ucscBrowser",
] as const;

export const SOURCE_GENOME_KEYS = [
...CORE_SOURCE_GENOME_KEYS,
"otherTaxa",
"taxonomicGroup",
"taxonomicLevelClass",
"taxonomicLevelFamily",
Expand All @@ -29,7 +39,4 @@ export const SOURCE_GENOME_KEYS = [
"taxonomicLevelSerotype",
"taxonomicLevelIsolate",
"taxonomicLevelDomain",
"commonName",
"taxonomyId",
"ucscBrowser",
] as const;
Loading
Loading