7
7
from bs4 import BeautifulSoup
8
8
import logging
9
9
10
# Cap on the length of generated NCBI request URLs; over-long URLs make the
# NCBI API return an error (the actual limit seems to be a bit over 4000
# characters), so stay well under it.
MAX_NCBI_URL_LENGTH = 2000
11
+
10
12
log = logging .getLogger (__name__ )
11
13
12
14
@@ -31,6 +33,37 @@ def get_paginated_ncbi_results(base_url, query_description):
31
33
return results
32
34
33
35
36
def get_next_ncbi_url_batch(get_url, items, start_index, max_url_length=None):
    """Find the largest batch of items, starting at start_index, whose URL fits the limit.

    Binary-searches for the largest end index such that
    get_url(items[start_index:end_index]) is at most max_url_length characters.

    Args:
        get_url: Callable mapping a slice of items to a request URL string.
        items: Sequence of items to batch (e.g. accession strings).
        start_index: Index of the first item to include in this batch.
        max_url_length: Maximum allowed URL length; defaults to MAX_NCBI_URL_LENGTH.

    Returns:
        Tuple of (URL for the batch, end index one past the last item included).

    Raises:
        ValueError: If even a single item yields a URL over the limit; without
            this guard the batch would be empty and the caller's batching loop
            would never advance (infinite loop).
    """
    if max_url_length is None:
        max_url_length = MAX_NCBI_URL_LENGTH
    # Invariants: a batch ending at max_valid_end_index fits; a batch ending at
    # min_invalid_end_index is too long (once one has been observed).
    max_valid_end_index = start_index
    min_invalid_end_index = len(items)
    end_index = len(items)
    while end_index != max_valid_end_index:
        test_url = get_url(items[start_index:end_index])
        if len(test_url) > max_url_length:
            min_invalid_end_index = end_index
        else:
            max_valid_end_index = end_index
        end_index = (max_valid_end_index + min_invalid_end_index) // 2
    if end_index == start_index and start_index < len(items):
        # Not even one item fits within the limit; fail loudly instead of
        # letting get_batched_ncbi_urls spin forever on the same index.
        raise ValueError(
            f"URL for single item at index {start_index} exceeds {max_url_length} characters"
        )
    return get_url(items[start_index:end_index]), end_index
49
+
50
+
51
def get_batched_ncbi_urls(get_url, items):
    """Split items into consecutive batches and return one request URL per batch.

    Batches are non-overlapping slices covering all of items; each URL is made
    as long as the NCBI URL length limit allows.

    Args:
        get_url: Callable mapping a slice of items to a request URL string.
        items: Sequence of items to batch.

    Returns:
        List of request URLs, one per batch.
    """
    batch_urls = []
    next_start = 0
    item_count = len(items)
    while next_start < item_count:
        batch_url, next_start = get_next_ncbi_url_batch(get_url, items, next_start)
        batch_urls.append(batch_url)
    return batch_urls
58
+
59
+
60
def get_batched_ncbi_results(get_base_url, items, query_description):
    """Fetch paginated NCBI results for items, split across multiple batched URLs.

    Args:
        get_base_url: Callable mapping a slice of items to a base request URL.
        items: Sequence of items to query.
        query_description: Human-readable description used in progress/log output.

    Returns:
        Combined list of results from every batch, in batch order.
    """
    all_results = []
    batch_urls = get_batched_ncbi_urls(get_base_url, items)
    for batch_number, url in enumerate(batch_urls, start=1):
        batch_results = get_paginated_ncbi_results(url, f"{query_description} batch {batch_number}")
        all_results.extend(batch_results)
    return all_results
65
+
66
+
34
67
def match_taxonomic_group (tax_id , lineage , taxonomic_groups ):
35
68
if tax_id not in taxonomic_groups :
36
69
return None
@@ -74,7 +107,11 @@ def get_species_row(taxon_info, taxonomic_group_sets, taxonomic_levels):
74
107
75
108
76
109
def get_species_df(taxonomy_ids, taxonomic_group_sets, taxonomic_levels):
    """Fetch NCBI taxonomy dataset reports and build a DataFrame of species rows.

    Args:
        taxonomy_ids: Iterable of taxonomy IDs to look up.
        taxonomic_group_sets: Passed through to get_species_row.
        taxonomic_levels: Passed through to get_species_row.

    Returns:
        DataFrame with one row per species returned by NCBI.
    """
    def build_url(id_strings):
        joined_ids = ",".join(id_strings)
        return f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{joined_ids}/dataset_report"

    id_strings = [str(tax_id) for tax_id in taxonomy_ids]
    species_info = get_batched_ncbi_results(build_url, id_strings, "taxa")
    rows = [get_species_row(info, taxonomic_group_sets, taxonomic_levels) for info in species_info]
    return pd.DataFrame(rows)
79
116
80
117
@@ -107,20 +144,15 @@ def get_biosample_data(genome_info):
107
144
108
145
109
146
def get_genomes_and_primarydata_df(accessions):
    """Fetch NCBI genome dataset reports for the given assembly accessions.

    Accessions are batched into multiple request URLs to stay under the NCBI
    URL length limit.

    Args:
        accessions: Sequence of assembly accession strings.

    Returns:
        Tuple of (genomes DataFrame, biosamples DataFrame); the biosamples frame
        only includes reports whose assembly_info contains a biosample entry.
    """
    def build_url(accession_batch):
        joined_accessions = ",".join(accession_batch)
        return f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{joined_accessions}/dataset_report"

    genomes_info = get_batched_ncbi_results(build_url, accessions, "genomes")

    genome_rows = [get_genome_row(info) for info in genomes_info]
    biosample_rows = [
        get_biosample_data(info)
        for info in genomes_info
        if 'biosample' in info['assembly_info']
    ]
    return (
        pd.DataFrame(data=genome_rows),
        pd.DataFrame(data=biosample_rows))
124
156
125
157
126
158
def _id_to_gene_model_url (asm_id ):
@@ -161,7 +193,7 @@ def report_missing_values_from(values_name, message_predicate, all_values_series
161
193
present_values_mask [:] = False
162
194
for series in partial_values_series :
163
195
present_values_mask |= all_values_series .isin (series )
164
- report_missing_values (values_name , message_predicate , all_values_series , present_values_mask )
196
+ return report_missing_values (values_name , message_predicate , all_values_series , present_values_mask )
165
197
166
198
167
199
def report_missing_values (values_name , message_predicate , values_series , present_values_mask ):
@@ -172,6 +204,15 @@ def report_missing_values(values_name, message_predicate, values_series, present
172
204
print (f"Only { len (present_values )} of { len (values_series )} { values_name } { message_predicate } : { ", " .join (present_values )} " )
173
205
else :
174
206
print (f"{ len (missing_values )} { values_name } not { message_predicate } : { ", " .join (missing_values )} " )
207
+ return missing_values
208
+
209
+
210
def report_inconsistent_taxonomy_ids(df):
    """Report (species, strain) combinations mapped to more than one taxonomy ID.

    Prints a summary line when any inconsistencies are found.

    Args:
        df: DataFrame with "species", "strain" and "taxonomyId" columns.

    Returns:
        List of (taxon description, comma-separated taxonomy IDs) tuples; empty
        when every combination has a single taxonomy ID.
    """
    inconsistent_ids_series = (
        df.groupby(["species", "strain"])
        .filter(lambda g: g["taxonomyId"].nunique() > 1)
        .groupby(["species", "strain"])["taxonomyId"]
        .apply(set)
    )
    inconsistent_ids_strings = [
        (
            f"{species} strain {strain}" if strain else species,
            # Sort the IDs so output is deterministic (set iteration order is
            # not); assumes taxonomy IDs are mutually comparable (numeric).
            ", ".join([str(tax_id) for tax_id in sorted(ids)]),
        )
        for (species, strain), ids in inconsistent_ids_series.items()
    ]
    if len(inconsistent_ids_strings) > 0:
        taxa_text = ", ".join([f"{taxon} ({ids})" for taxon, ids in inconsistent_ids_strings])
        print(f"Taxa with inconsistent taxonomy IDs: {taxa_text}")
    return inconsistent_ids_strings
175
216
176
217
177
218
def fetch_sra_metadata (srs_ids , batch_size = 20 ):
@@ -344,6 +385,14 @@ def fetch_url_data(url, counter=0, counter_limit=2, wait_time=2, num_retry=3):
344
385
return data
345
386
346
387
388
def make_qc_report(missing_ncbi_assemblies, inconsistent_taxonomy_ids, missing_ucsc_assemblies, missing_gene_model_urls=None):
    """Build the Markdown catalog QC report.

    Args:
        missing_ncbi_assemblies: Accessions with no NCBI dataset report.
        inconsistent_taxonomy_ids: (taxon description, IDs string) tuples.
        missing_ucsc_assemblies: Accessions absent from the UCSC assembly list.
        missing_gene_model_urls: Accessions lacking gene model URLs, or None
            when gene model URLs were not generated (section reads "N/A").

    Returns:
        The report as a Markdown-formatted string.
    """

    def bullet_list(values):
        # Render values as a Markdown bullet list, or "None" when empty.
        return "None" if len(values) == 0 else "\n".join([f"- {value}" for value in values])

    ncbi_assemblies_text = bullet_list(missing_ncbi_assemblies)
    ucsc_assemblies_text = bullet_list(missing_ucsc_assemblies)
    gene_model_urls_text = "N/A" if missing_gene_model_urls is None else bullet_list(missing_gene_model_urls)
    taxonomy_ids_text = "None" if len(inconsistent_taxonomy_ids) == 0 else "\n".join([f"- {taxon}: {ids}" for taxon, ids in inconsistent_taxonomy_ids])
    return (
        "# Catalog QC report\n\n"
        "## Assemblies not found on NCBI\n\n"
        f"{ncbi_assemblies_text}\n\n"
        "## Assemblies not found in UCSC list\n\n"
        f"{ucsc_assemblies_text}\n\n"
        "## Assemblies with gene model URLs not found\n\n"
        f"{gene_model_urls_text}\n\n"
        "## Species and strain combinations with multiple taxonomy IDs\n\n"
        f"{taxonomy_ids_text}\n"
    )
394
+
395
+
347
396
def build_files (
348
397
assemblies_path ,
349
398
genomes_output_path ,
@@ -352,10 +401,13 @@ def build_files(
352
401
taxonomic_group_sets = {},
353
402
do_gene_model_urls = True ,
354
403
extract_primary_data = False ,
355
- primary_output_path = None
404
+ primary_output_path = None ,
405
+ qc_report_path = None
356
406
):
357
407
print ("Building files" )
358
408
409
+ qc_report_params = {}
410
+
359
411
source_list_df = read_assemblies (assemblies_path )
360
412
361
413
base_genomes_df , primarydata_df = get_genomes_and_primarydata_df (source_list_df ["accession" ])
@@ -369,26 +421,29 @@ def build_files(
369
421
sra_metadata = fetch_sra_metadata (sra_ids_list )
370
422
sra_metadata_df = pd .DataFrame ([sra_metadata [sra ][srr ] for sra in sra_metadata for srr in sra_metadata [sra ]])
371
423
primarydata_df = primarydata_df .merge (sra_metadata_df , how = "left" , left_on = "sra_sample_acc" , right_on = "sra_sample_acc" )
372
- report_missing_values_from ("accessions" , "found on NCBI" , source_list_df ["accession" ], base_genomes_df ["accession" ])
424
+
425
+ qc_report_params ["missing_ncbi_assemblies" ] = report_missing_values_from ("accessions" , "found on NCBI" , source_list_df ["accession" ], base_genomes_df ["accession" ])
373
426
374
427
species_df = get_species_df (base_genomes_df ["taxonomyId" ], taxonomic_group_sets , taxonomic_levels_for_tree )
375
428
376
429
report_missing_values_from ("species" , "found on NCBI" , base_genomes_df ["taxonomyId" ], species_df ["taxonomyId" ])
377
430
378
431
genomes_with_species_df = base_genomes_df .merge (species_df , how = "left" , on = "taxonomyId" )
379
432
433
+ qc_report_params ["inconsistent_taxonomy_ids" ] = report_inconsistent_taxonomy_ids (genomes_with_species_df )
434
+
380
435
assemblies_df = pd .DataFrame (requests .get (ucsc_assemblies_url ).json ()["data" ])[["ucscBrowser" , "genBank" , "refSeq" ]]
381
436
382
437
gen_bank_merge_df = genomes_with_species_df .merge (assemblies_df , how = "left" , left_on = "accession" , right_on = "genBank" )
383
438
ref_seq_merge_df = genomes_with_species_df .merge (assemblies_df , how = "left" , left_on = "accession" , right_on = "refSeq" )
384
439
385
- report_missing_values_from ("accessions" , "matched in assembly list" , genomes_with_species_df ["accession" ], assemblies_df ["genBank" ], assemblies_df ["refSeq" ])
440
+ qc_report_params [ "missing_ucsc_assemblies" ] = report_missing_values_from ("accessions" , "matched in assembly list" , genomes_with_species_df ["accession" ], assemblies_df ["genBank" ], assemblies_df ["refSeq" ])
386
441
387
442
genomes_df = gen_bank_merge_df .combine_first (ref_seq_merge_df )
388
443
389
444
if do_gene_model_urls :
390
445
genomes_df = add_gene_model_url (genomes_df )
391
- report_missing_values ("accessions" , "matched with gene model URLs" , genomes_df ["accession" ], genomes_df ["geneModelUrl" ].astype (bool ))
446
+ qc_report_params [ "missing_gene_model_urls" ] = report_missing_values ("accessions" , "matched with gene model URLs" , genomes_df ["accession" ], genomes_df ["geneModelUrl" ].astype (bool ))
392
447
else :
393
448
genomes_df ["geneModelUrl" ] = ""
394
449
@@ -400,3 +455,8 @@ def build_files(
400
455
primarydata_df .to_csv (primary_output_path , index = False , sep = "\t " )
401
456
402
457
print (f"Wrote to { primary_output_path } " )
458
+
459
+ if qc_report_path is not None :
460
+ qc_report_text = make_qc_report (** qc_report_params )
461
+ with open (qc_report_path , "w" ) as file :
462
+ file .write (qc_report_text )
0 commit comments