Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
e3972bd
feat: adding flu (#379)
d-callan Mar 12, 2025
2e86135
feat: adding the niaid biodefense viruses (#379)
d-callan Mar 12, 2025
82347fe
Merge branch 'main' into even-more-organisms
d-callan Mar 13, 2025
b38db38
Merge branch 'main' into even-more-organisms
d-callan Mar 24, 2025
c312a10
Merge branch 'main' into even-more-organisms
d-callan Apr 8, 2025
df7cff2
fix: some accessions were being duplicated in the intermediate tsv
d-callan Apr 8, 2025
da43936
Merge branch 'main' into even-more-organisms
d-callan Apr 15, 2025
d0442d2
feat: get gene model urls more efficiently
d-callan Apr 15, 2025
c4904fb
feat: all requested organisms now in ucsc
d-callan Apr 17, 2025
72483a3
fix: move imports in data catalog build.py
d-callan Apr 17, 2025
63d1c2c
fix: revert accidentally added npm package
d-callan Apr 19, 2025
ee27f7d
Merge branch 'main' into even-more-organisms
d-callan Apr 28, 2025
08a096f
feat: first pass adding niaid biodefense bacteria
d-callan Apr 28, 2025
2f529a6
Merge branch 'main' into even-more-organisms
d-callan Apr 28, 2025
c78181a
fix: fetching parent taxa occasionally fails
d-callan Apr 29, 2025
b02e366
fix: update to produced json
d-callan Apr 29, 2025
8a66e7c
Merge branch 'main' into even-more-organisms
d-callan Apr 29, 2025
8a1812c
fix: case where a genbank id was provided where it should have been t…
d-callan Apr 30, 2025
25492ff
Merge branch 'even-more-organisms' of github.com:galaxyproject/brc-an…
d-callan May 6, 2025
117f036
Merge branch 'main' into even-more-organisms
d-callan May 6, 2025
4dbe33a
Merge branch 'even-more-organisms' of github.com:galaxyproject/brc-an…
d-callan May 6, 2025
606b382
Merge branch 'main' into even-more-organisms
d-callan May 12, 2025
d1be910
Merge branch 'main' into even-more-organisms
d-callan May 16, 2025
230f98b
feat: update data qc report and ucsc urls for bacteria assemblies
d-callan May 16, 2025
12c881e
feat: all seqs in ucsc brc hub
d-callan May 19, 2025
06b25a2
Merge branch 'main' into even-more-organisms
d-callan May 21, 2025
3007ae4
Merge branch 'main' into even-more-organisms
d-callan May 23, 2025
cba3819
feat: all bacteria have gene model urls
d-callan May 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 203 additions & 6 deletions catalog/build/intermediate/genomes-from-ncbi.tsv

Large diffs are not rendered by default.

34 changes: 17 additions & 17 deletions catalog/build/intermediate/outbreak-taxonomy-mapping.tsv
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
taxonomy_id name rank
5763 Naegleria fowleri SPECIES
5207 Cryptococcus neoformans SPECIES
11050 Flaviviridae FAMILY
498019 Candidozyma auris SPECIES
1980415 Nairoviridae FAMILY
11018 Togaviridae FAMILY
1980418 Phenuiviridae FAMILY
11320 Influenza A virus
4827 Mucorales ORDER
5833 Plasmodium falciparum SPECIES
10244 Monkeypox virus SPECIES
5037 Histoplasma capsulatum SPECIES
3418604 Betacoronavirus pandemicum SPECIES
5052 Aspergillus GENUS
199306 Coccidioides posadasii SPECIES
11617 Arenaviridae FAMILY
1980416 Peribunyaviridae FAMILY
12058 Picornaviridae FAMILY
1980413 Hantaviridae FAMILY
5807 Cryptosporidium parvum SPECIES
5037 Histoplasma capsulatum SPECIES
1980415 Nairoviridae FAMILY
3418604 Betacoronavirus pandemicum SPECIES
1773 Mycobacterium tuberculosis SPECIES
5052 Aspergillus GENUS
11320 Influenza A virus
5207 Cryptococcus neoformans SPECIES
11158 Paramyxoviridae FAMILY
5807 Cryptosporidium parvum SPECIES
1980418 Phenuiviridae FAMILY
1980413 Hantaviridae FAMILY
11266 Filoviridae FAMILY
11018 Togaviridae FAMILY
10244 Monkeypox virus SPECIES
5763 Naegleria fowleri SPECIES
1980416 Peribunyaviridae FAMILY
5833 Plasmodium falciparum SPECIES
498019 Candidozyma auris SPECIES
11158 Paramyxoviridae FAMILY
38574 Leishmania donovani species complex SPECIES_GROUP
11617 Arenaviridae FAMILY
46 changes: 30 additions & 16 deletions catalog/build/py/package/catalog_build/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,9 @@ def get_species_tree(taxonomy_ids, taxonomic_levels, species_info=None):
child for edge in edges.values() for child in edge.get("visible_children", [])
}
all_children = [str(num) for num in all_children]
root_ids = [node_id for node_id in edges if node_id not in all_children]
root_ids = [str(num) for num in root_ids]
# Determine root IDs and sort them for consistent ordering
roots = [node_id for node_id in edges if node_id not in all_children]
root_ids = sorted([str(num) for num in roots], key=lambda x: int(x))

if not root_ids:
return {}
Expand Down Expand Up @@ -372,25 +373,37 @@ def fetch_taxa_info(tax_ids, taxon_name_map, taxon_rank_map, description="taxa")
if not missing_tax_ids:
return

print(f"Fetching information for {len(missing_tax_ids)} {description}")

# Fetch in batches to avoid overwhelming the NCBI server
url = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/dataset_report"
reports = post_ncbi_request(url, {"taxons": missing_tax_ids})

for report in reports:
tax_id = str(report["taxonomy"]["tax_id"])
taxon_name_map[tax_id] = report["taxonomy"]["current_scientific_name"]["name"]
if "rank" in report["taxonomy"]:
taxon_rank_map[tax_id] = report["taxonomy"]["rank"]
else:
print(f"rank not found for tax_id: {tax_id}")
batch_size = 2500
total = len(missing_tax_ids)
total_batches = (total + batch_size - 1) // batch_size
print(f"Fetching information for {total} {description} in {total_batches} batches")
for batch_index in range(total_batches):
batch = missing_tax_ids[
batch_index * batch_size : (batch_index + 1) * batch_size
]
print(
f" Batch {batch_index + 1}/{total_batches}: fetching {len(batch)} {description}"
)
reports = post_ncbi_request(url, {"taxons": batch})
for report in reports:
tax_id = str(report["taxonomy"]["tax_id"])
taxon_name_map[tax_id] = report["taxonomy"]["current_scientific_name"][
"name"
]
if "rank" in report["taxonomy"]:
taxon_rank_map[tax_id] = report["taxonomy"]["rank"]
else:
print(f"rank not found for tax_id: {tax_id}")


def ncbi_tree_to_nested_tree(node_id, edges, taxonomy_ids):
children = edges.get(str(node_id), {}).get("visible_children", [])
children = [str(num) for num in children]
# ncbi results odd again, dup children
children = set(children)
# Deduplicate and sort children by taxonomy ID for consistent ordering
children = sorted(set(children), key=lambda x: int(x))
if len(children) > 0 or int(node_id) in taxonomy_ids:
child_trees = [
ncbi_tree_to_nested_tree(child, edges, taxonomy_ids) for child in children
Expand Down Expand Up @@ -1213,7 +1226,7 @@ def get_other_taxa(lineage_ids):
"accessions",
"matched with gene model URLs",
genomes_df["accession"],
genomes_df["geneModelUrl"].astype(bool),
genomes_df["geneModelUrl"].notna(),
)
else:
genomes_df["geneModelUrl"] = ""
Expand All @@ -1239,7 +1252,8 @@ def get_other_taxa(lineage_ids):
list(genomes_df["taxonomyId"]), taxonomic_levels_for_tree, species_info
)
with open(tree_output_path, "w") as outfile:
json.dump(species_tree, outfile, indent=4)
# Dump with sorted keys and consistent indentation
json.dump(species_tree, outfile, indent=4, sort_keys=True)
print(f"Wrote to {tree_output_path}")

if organisms_path is not None:
Expand Down
Loading
Loading