
Commit a3a70ca

feat: build catalog and add qc report (#344) (#364)
1 parent bebcfb2 commit a3a70ca

File tree

7 files changed: +11402 -521 lines


catalog/build/intermediate/genomes-from-ncbi.tsv

Lines changed: 363 additions & 202 deletions
Large diffs are not rendered by default.

catalog/build/py/build-files-from-ncbi.py

Lines changed: 4 additions & 1 deletion
@@ -6,6 +6,8 @@
 
 GENOMES_OUTPUT_PATH = "catalog/build/intermediate/genomes-from-ncbi.tsv"
 
+QC_REPORT_PATH = "catalog/output/qc-report.md"
+
 TAXONOMIC_GROUPS_BY_TAXONOMY_ID = {
     2: "Bacteria",
     10239: "Viruses",
@@ -41,5 +43,6 @@
     GENOMES_OUTPUT_PATH,
     UCSC_ASSEMBLIES_URL,
     TAXANOMIC_LEVELS_FOR_TREE,
-    {"taxonomicGroup": TAXONOMIC_GROUPS_BY_TAXONOMY_ID}
+    {"taxonomicGroup": TAXONOMIC_GROUPS_BY_TAXONOMY_ID},
+    qc_report_path=QC_REPORT_PATH
 )

catalog/build/py/package/catalog_build/build.py

Lines changed: 78 additions & 18 deletions
@@ -7,6 +7,8 @@
 from bs4 import BeautifulSoup
 import logging
 
+MAX_NCBI_URL_LENGTH = 2000 # The actual limit seems to be a bit over 4000
+
 log = logging.getLogger(__name__)
 
 
@@ -31,6 +33,37 @@ def get_paginated_ncbi_results(base_url, query_description):
     return results
 
 
+def get_next_ncbi_url_batch(get_url, items, start_index):
+    # Do a binary search to find the longest possible URL
+    max_valid_end_index = start_index
+    min_invalid_end_index = len(items)
+    end_index = len(items)
+    while end_index != max_valid_end_index:
+        test_url = get_url(items[start_index:end_index])
+        if len(test_url) > MAX_NCBI_URL_LENGTH:
+            min_invalid_end_index = end_index
+        else:
+            max_valid_end_index = end_index
+        end_index = int((max_valid_end_index + min_invalid_end_index)/2)
+    return get_url(items[start_index:end_index]), end_index
+
+
+def get_batched_ncbi_urls(get_url, items):
+    urls = []
+    index = 0
+    while index < len(items):
+        url, index = get_next_ncbi_url_batch(get_url, items, index)
+        urls.append(url)
+    return urls
+
+
+def get_batched_ncbi_results(get_base_url, items, query_description):
+    results = []
+    for batch_index, batch_url in enumerate(get_batched_ncbi_urls(get_base_url, items)):
+        results.extend(get_paginated_ncbi_results(batch_url, f"{query_description} batch {batch_index + 1}"))
+    return results
+
+
 def match_taxonomic_group(tax_id, lineage, taxonomic_groups):
     if tax_id not in taxonomic_groups:
         return None
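
For illustration, a minimal usage sketch of the new batching helpers, assuming they can be imported from the catalog_build.build module; the accessions and URL builder below are made up and only exercise the batching logic, no request is sent:

from catalog_build.build import MAX_NCBI_URL_LENGTH, get_batched_ncbi_urls

# Hypothetical accession list, long enough that a single request URL would exceed the limit.
accessions = [f"GCA_{n:09d}.1" for n in range(1, 500)]

def make_url(batch):
    # Same shape as the lambdas used in this commit: join one batch into one request URL.
    return f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{','.join(batch)}/dataset_report"

urls = get_batched_ncbi_urls(make_url, accessions)

# Each batch URL found by the binary search stays within MAX_NCBI_URL_LENGTH (2000 characters);
# get_batched_ncbi_results(make_url, accessions, "genomes") would then fetch the batches in turn.
print(len(urls), max(len(url) for url in urls))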
@@ -74,7 +107,11 @@ def get_species_row(taxon_info, taxonomic_group_sets, taxonomic_levels):
 
 
 def get_species_df(taxonomy_ids, taxonomic_group_sets, taxonomic_levels):
-    species_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{",".join([str(id) for id in taxonomy_ids])}/dataset_report", "taxa")
+    species_info = get_batched_ncbi_results(
+        lambda ids: f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{",".join(ids)}/dataset_report",
+        [str(id) for id in taxonomy_ids],
+        "taxa"
+    )
     return pd.DataFrame([get_species_row(info, taxonomic_group_sets, taxonomic_levels) for info in species_info])
 
 
@@ -107,20 +144,15 @@ def get_biosample_data(genome_info):
 
 
 def get_genomes_and_primarydata_df(accessions):
-    # Send accessions in batches of 100
-    # ncbi api produces a 414 error if there are too many accessions
-    batch_size = 100
-    genomes_info = []
-    biosamples_info = []
-    for i in range(0, len(accessions), batch_size):
-        batch = accessions[i:i+batch_size]
-        batch_genomes_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{','.join(batch)}/dataset_report", "genomes")
-        genomes_info.extend([get_genome_row(info) for info in batch_genomes_info])
-        biosamples_info.extend([get_biosample_data(info) for info in batch_genomes_info if 'biosample' in info['assembly_info']])
+    genomes_info = get_batched_ncbi_results(
+        lambda a: f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{",".join(a)}/dataset_report",
+        accessions,
+        "genomes"
+    )
 
     return (
-        pd.DataFrame(data=genomes_info),
-        pd.DataFrame(data=biosamples_info))
+        pd.DataFrame(data=[get_genome_row(info) for info in genomes_info]),
+        pd.DataFrame(data=[get_biosample_data(info) for info in genomes_info if 'biosample' in info['assembly_info']]))
 
 
 def _id_to_gene_model_url(asm_id):
@@ -161,7 +193,7 @@ def report_missing_values_from(values_name, message_predicate, all_values_series
     present_values_mask[:] = False
     for series in partial_values_series:
         present_values_mask |= all_values_series.isin(series)
-    report_missing_values(values_name, message_predicate, all_values_series, present_values_mask)
+    return report_missing_values(values_name, message_predicate, all_values_series, present_values_mask)
 
 
 def report_missing_values(values_name, message_predicate, values_series, present_values_mask):
@@ -172,6 +204,15 @@ def report_missing_values(values_name, message_predicate, values_series, present
         print(f"Only {len(present_values)} of {len(values_series)} {values_name} {message_predicate}: {", ".join(present_values)}")
     else:
         print(f"{len(missing_values)} {values_name} not {message_predicate}: {", ".join(missing_values)}")
+    return missing_values
+
+
+def report_inconsistent_taxonomy_ids(df):
+    inconsistent_ids_series = df.groupby(["species", "strain"]).filter(lambda g: g["taxonomyId"].nunique() > 1).groupby(["species", "strain"])["taxonomyId"].apply(set)
+    inconsistent_ids_strings = [(f"{species} strain {strain}" if strain else species, ", ".join([str(id) for id in ids])) for (species, strain), ids in inconsistent_ids_series.items()]
+    if len(inconsistent_ids_strings) > 0:
+        print(f"Taxa with inconsistent taxonomy IDs: {", ".join([f"{taxon} ({ids})" for taxon, ids in inconsistent_ids_strings])}")
+    return inconsistent_ids_strings
 
 
 def fetch_sra_metadata(srs_ids, batch_size=20):
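
As a rough illustration of what the new consistency check flags, a small sketch with made-up rows, assuming report_inconsistent_taxonomy_ids can be imported from catalog_build.build:

import pandas as pd

from catalog_build.build import report_inconsistent_taxonomy_ids

# Hypothetical genome rows: one species/strain combination carries two different taxonomy IDs.
df = pd.DataFrame({
    "species": ["Escherichia coli", "Escherichia coli", "Mus musculus"],
    "strain": ["K-12", "K-12", ""],
    "taxonomyId": [511145, 83333, 10090],
})

flagged = report_inconsistent_taxonomy_ids(df)
# Prints something like (set ordering of the IDs may vary):
#   Taxa with inconsistent taxonomy IDs: Escherichia coli strain K-12 (511145, 83333)
# and returns [("Escherichia coli strain K-12", "511145, 83333")], which build_files
# stores under qc_report_params["inconsistent_taxonomy_ids"].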
@@ -344,6 +385,14 @@ def fetch_url_data(url, counter=0, counter_limit=2, wait_time=2, num_retry=3):
     return data
 
 
+def make_qc_report(missing_ncbi_assemblies, inconsistent_taxonomy_ids, missing_ucsc_assemblies, missing_gene_model_urls=None):
+    ncbi_assemblies_text = "None" if len(missing_ncbi_assemblies) == 0 else "\n".join([f"- {accession}" for accession in missing_ncbi_assemblies])
+    ucsc_assemblies_text = "None" if len(missing_ucsc_assemblies) == 0 else "\n".join([f"- {accession}" for accession in missing_ucsc_assemblies])
+    gene_model_urls_text = "N/A" if missing_gene_model_urls is None else "None" if len(missing_gene_model_urls) == 0 else "\n".join([f"- {accession}" for accession in missing_gene_model_urls])
+    taxonomy_ids_text = "None" if len(inconsistent_taxonomy_ids) == 0 else "\n".join([f"- {taxon}: {ids}" for taxon, ids in inconsistent_taxonomy_ids])
+    return f"# Catalog QC report\n\n## Assemblies not found on NCBI\n\n{ncbi_assemblies_text}\n\n## Assemblies not found in UCSC list\n\n{ucsc_assemblies_text}\n\n## Assemblies with gene model URLs not found\n\n{gene_model_urls_text}\n\n## Species and strain combinations with multiple taxonomy IDs\n\n{taxonomy_ids_text}\n"
+
+
 def build_files(
     assemblies_path,
     genomes_output_path,
@@ -352,10 +401,13 @@
     taxonomic_group_sets={},
     do_gene_model_urls=True,
     extract_primary_data=False,
-    primary_output_path=None
+    primary_output_path=None,
+    qc_report_path=None
 ):
     print("Building files")
 
+    qc_report_params = {}
+
     source_list_df = read_assemblies(assemblies_path)
 
     base_genomes_df, primarydata_df = get_genomes_and_primarydata_df(source_list_df["accession"])
@@ -369,26 +421,29 @@
         sra_metadata = fetch_sra_metadata(sra_ids_list)
         sra_metadata_df = pd.DataFrame([sra_metadata[sra][srr] for sra in sra_metadata for srr in sra_metadata[sra]])
         primarydata_df = primarydata_df.merge(sra_metadata_df, how="left", left_on="sra_sample_acc", right_on="sra_sample_acc")
-    report_missing_values_from("accessions", "found on NCBI", source_list_df["accession"], base_genomes_df["accession"])
+
+    qc_report_params["missing_ncbi_assemblies"] = report_missing_values_from("accessions", "found on NCBI", source_list_df["accession"], base_genomes_df["accession"])
 
     species_df = get_species_df(base_genomes_df["taxonomyId"], taxonomic_group_sets, taxonomic_levels_for_tree)
 
     report_missing_values_from("species", "found on NCBI", base_genomes_df["taxonomyId"], species_df["taxonomyId"])
 
     genomes_with_species_df = base_genomes_df.merge(species_df, how="left", on="taxonomyId")
 
+    qc_report_params["inconsistent_taxonomy_ids"] = report_inconsistent_taxonomy_ids(genomes_with_species_df)
+
     assemblies_df = pd.DataFrame(requests.get(ucsc_assemblies_url).json()["data"])[["ucscBrowser", "genBank", "refSeq"]]
 
     gen_bank_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="genBank")
     ref_seq_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="refSeq")
 
-    report_missing_values_from("accessions", "matched in assembly list", genomes_with_species_df["accession"], assemblies_df["genBank"], assemblies_df["refSeq"])
+    qc_report_params["missing_ucsc_assemblies"] = report_missing_values_from("accessions", "matched in assembly list", genomes_with_species_df["accession"], assemblies_df["genBank"], assemblies_df["refSeq"])
 
     genomes_df = gen_bank_merge_df.combine_first(ref_seq_merge_df)
 
     if do_gene_model_urls:
         genomes_df = add_gene_model_url(genomes_df)
-        report_missing_values("accessions", "matched with gene model URLs", genomes_df["accession"], genomes_df["geneModelUrl"].astype(bool))
+        qc_report_params["missing_gene_model_urls"] = report_missing_values("accessions", "matched with gene model URLs", genomes_df["accession"], genomes_df["geneModelUrl"].astype(bool))
     else:
         genomes_df["geneModelUrl"] = ""
 
@@ -400,3 +455,8 @@
         primarydata_df.to_csv(primary_output_path, index=False, sep="\t")
 
         print(f"Wrote to {primary_output_path}")
+
+    if qc_report_path is not None:
+        qc_report_text = make_qc_report(**qc_report_params)
+        with open(qc_report_path, "w") as file:
+            file.write(qc_report_text)
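
For reference, a hedged sketch of the report that build_files now writes when qc_report_path is set; the accession and taxa values are placeholders, and the section layout follows the make_qc_report template added above:

from catalog_build.build import make_qc_report

# Hypothetical QC findings; in build_files these come from the report_* helpers via qc_report_params.
report_text = make_qc_report(
    missing_ncbi_assemblies=["GCA_000000000.1"],
    inconsistent_taxonomy_ids=[("Escherichia coli strain K-12", "511145, 83333")],
    missing_ucsc_assemblies=[],
    missing_gene_model_urls=None,  # None means gene model URLs were not checked, rendered as "N/A"
)

# report_text is Markdown shaped like:
#   # Catalog QC report
#   ## Assemblies not found on NCBI                -> - GCA_000000000.1
#   ## Assemblies not found in UCSC list           -> None
#   ## Assemblies with gene model URLs not found   -> N/A
#   ## Species and strain combinations with multiple taxonomy IDs
#                                                  -> - Escherichia coli strain K-12: 511145, 83333
with open("qc-report.md", "w") as file:  # build_files writes to the configured qc_report_path instead
    file.write(report_text)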

catalog/build/py/package/setup.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 setup(
     name="catalog_build",
-    version="1.3.1",
+    version="1.4.0",
     packages=["catalog_build"],
     install_requires=["pandas", "requests", "PyYAML", "BeautifulSoup4", "lxml"],
 )
