3
3
import requests
4
4
import urllib
5
5
import re
6
+ import time
7
+ from bs4 import BeautifulSoup
8
+ import logging
9
+
10
+ log = logging .getLogger (__name__ )
11
+
6
12
7
13
def read_assemblies(assemblies_path):
    """Load the assembly list from a YAML file into a DataFrame.

    Args:
        assemblies_path: Path to a YAML file with a top-level "assemblies" key.

    Returns:
        pd.DataFrame with one row per assembly entry.
    """
    with open(assemblies_path) as assemblies_file:
        parsed = yaml.safe_load(assemblies_file)
    return pd.DataFrame(parsed["assemblies"])
10
16
17
+
11
18
def get_paginated_ncbi_results (base_url , query_description ):
12
19
page = 1
13
20
next_page_token = None
@@ -23,8 +30,9 @@ def get_paginated_ncbi_results(base_url, query_description):
23
30
page += 1
24
31
return results
25
32
33
+
26
34
def match_taxonomic_group (tax_id , lineage , taxonomic_groups ):
27
- if not tax_id in taxonomic_groups :
35
+ if tax_id not in taxonomic_groups :
28
36
return None
29
37
taxon_info = taxonomic_groups [tax_id ]
30
38
name , exclude = (taxon_info ["value" ], taxon_info .get ("exclude" )) if isinstance (taxon_info , dict ) else (taxon_info , None )
@@ -36,12 +44,15 @@ def match_taxonomic_group(tax_id, lineage, taxonomic_groups):
36
44
return name
37
45
return None
38
46
47
+
39
48
def get_taxonomic_groups(lineage, taxonomic_groups):
    """Collect the group names matched by any tax ID in the lineage.

    Delegates each tax ID to match_taxonomic_group and keeps only the
    non-None results, preserving lineage order.
    """
    matched = []
    for tax_id in lineage:
        group = match_taxonomic_group(tax_id, lineage, taxonomic_groups)
        if group is not None:
            matched.append(group)
    return matched
41
50
51
+
42
52
def get_taxonomic_group_sets(lineage, taxonomic_group_sets):
    """Build one comma-joined group string per configured group-set field.

    Returns a dict mapping each field name to the comma-separated taxonomic
    groups from its group set that match the given lineage.
    """
    field_values = {}
    for field, taxonomic_groups in taxonomic_group_sets.items():
        field_values[field] = ",".join(get_taxonomic_groups(lineage, taxonomic_groups))
    return field_values
44
54
55
+
45
56
def get_species_row (taxon_info , taxonomic_group_sets ):
46
57
species_info = taxon_info ["taxonomy" ]["classification" ]["species" ]
47
58
return {
@@ -51,10 +62,12 @@ def get_species_row(taxon_info, taxonomic_group_sets):
51
62
** get_taxonomic_group_sets (taxon_info ["taxonomy" ]["parents" ], taxonomic_group_sets )
52
63
}
53
64
65
+
54
66
def get_species_df(taxonomy_ids, taxonomic_group_sets):
    """Fetch NCBI taxonomy reports and build a species DataFrame.

    Args:
        taxonomy_ids: Iterable of taxonomy IDs (ints or strings).
        taxonomic_group_sets: Group-set config forwarded to get_species_row.

    Returns:
        pd.DataFrame with one row per returned taxon report.
    """
    ids_param = ",".join(str(taxon_id) for taxon_id in taxonomy_ids)
    species_info = get_paginated_ncbi_results(
        f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{ids_param}/dataset_report",
        "taxa",
    )
    return pd.DataFrame([get_species_row(info, taxonomic_group_sets) for info in species_info])
57
69
70
+
58
71
def get_genome_row (genome_info ):
59
72
refseq_category = genome_info ["assembly_info" ].get ("refseq_category" )
60
73
return {
@@ -74,9 +87,22 @@ def get_genome_row(genome_info):
74
87
"pairedAccession" : genome_info .get ("paired_accession" ),
75
88
}
76
89
77
- def get_genomes_df (accessions ):
90
+
91
def get_biosample_data(genome_info):
    """Extract biosample info for one genome dataset report.

    Returns a dict with the genome accession, its biosample accession, and a
    comma-separated "db:value" list of the biosample's external sample IDs
    (sample entries without a "db" field are skipped).
    """
    biosample = genome_info["assembly_info"]["biosample"]
    id_pairs = [
        f"{sample['db']}:{sample['value']}"
        for sample in biosample["sample_ids"]
        if "db" in sample
    ]
    return {
        "accession": genome_info["accession"],
        "biosample": biosample["accession"],
        "sample_ids": ",".join(id_pairs),
    }
97
+
98
+
99
def get_genomes_and_primarydata_df(accessions):
    """Fetch genome dataset reports and derive genome and biosample DataFrames.

    Args:
        accessions: Iterable of assembly accession strings.

    Returns:
        Tuple of (genomes_df, biosamples_df). biosamples_df only includes
        reports that carry a biosample record.
    """
    accessions_param = ",".join(accessions)
    genomes_info = get_paginated_ncbi_results(
        f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accessions_param}/dataset_report",
        "genomes",
    )
    genome_rows = [get_genome_row(info) for info in genomes_info]
    biosample_rows = [
        get_biosample_data(info)
        for info in genomes_info
        if "biosample" in info["assembly_info"]
    ]
    return (pd.DataFrame(data=genome_rows), pd.DataFrame(data=biosample_rows))
105
+
80
106
81
107
def _id_to_gene_model_url (asm_id ):
82
108
hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/"
@@ -106,16 +132,19 @@ def _id_to_gene_model_url(asm_id):
106
132
# No match, I guess that's OK ?
107
133
return None
108
134
135
+
109
136
def add_gene_model_url(genomes_df: pd.DataFrame):
    """Return genomes_df with a derived "geneModelUrl" column appended.

    The URL is computed per row from the "accession" column via
    _id_to_gene_model_url; the input frame is not modified.
    """
    gene_model_urls = genomes_df["accession"].apply(_id_to_gene_model_url).rename("geneModelUrl")
    return pd.concat([genomes_df, gene_model_urls], axis="columns")
111
138
139
+
112
140
def report_missing_values_from(values_name, message_predicate, all_values_series, *partial_values_series):
    """Report values of all_values_series that appear in none of the partial series.

    Builds a boolean mask marking values present in at least one partial
    series, then delegates the actual reporting to report_missing_values.
    """
    # Start with an all-False mask aligned to all_values_series's index.
    present_mask = all_values_series.astype(bool)
    present_mask[:] = False
    for partial_series in partial_values_series:
        present_mask = present_mask | all_values_series.isin(partial_series)
    report_missing_values(values_name, message_predicate, all_values_series, present_mask)
118
146
147
+
119
148
def report_missing_values (values_name , message_predicate , values_series , present_values_mask ):
120
149
missing_values = values_series [~ present_values_mask ]
121
150
if len (missing_values ) > 0 :
@@ -125,13 +154,193 @@ def report_missing_values(values_name, message_predicate, values_series, present
125
154
else :
126
155
print (f"{ len (missing_values )} { values_name } not { message_predicate } : { ", " .join (missing_values )} " )
127
156
128
- def build_files (assemblies_path , genomes_output_path , ucsc_assemblies_url , taxonomic_group_sets = {}, do_gene_model_urls = True ):
157
+
158
def fetch_sra_metadata(srs_ids, batch_size=20):
    """
    Fetch metadata for a list of SRS IDs from the SRA database.

    Retrieves run-level metadata for the given SRS (SRA sample) accessions by
    querying NCBI eutils (esearch + esummary) in batches, then augments each
    run with file URLs, sizes and MD5 checksums from the EBI ENA file report
    API. Requests are rate-limited and retried via fetch_url_data.

    Args:
        srs_ids (list): SRS accessions to fetch metadata for.
        batch_size (int, optional): Number of IDs per eutils request. Defaults to 20.

    Returns:
        dict: {sample_accession: {run_accession: metadata_dict}}, or None when
        srs_ids is None or an esearch batch returns no results.

    Raises:
        Exception: If a request keeps failing after its retries, if duplicate
        run entries are found, or if a file report references an unknown run.
    """
    def fetch_url_data(url, counter=0, counter_limit=2, wait_time=2, num_retry=3):
        """
        GET a URL with a simple rate-limit pause and retry loop.

        Args:
            url (str): The URL to fetch.
            counter (int, optional): Requests made since the last pause; when it
                exceeds counter_limit, sleep wait_time seconds and reset. Defaults to 0.
            counter_limit (int, optional): Max requests between pauses. Defaults to 2.
            wait_time (int, optional): Seconds to sleep before retrying. Defaults to 2.
            num_retry (int, optional): Retry attempts for non-200 responses. Defaults to 3.

        Returns:
            tuple: (response, updated counter).

        Raises:
            Exception: If the URL still fails after num_retry attempts.
        """
        if counter > counter_limit:
            time.sleep(wait_time)
            counter = 0

        response = requests.get(url)
        while num_retry > 0 and response.status_code != 200:
            time.sleep(wait_time)
            log.debug(f"Failed to fetch, status: {response.status_code}, url: {url}. Retrying...")
            response = requests.get(url)
            num_retry -= 1

        if num_retry <= 0:
            raise Exception(f"Failed to fetch, status: {response.status_code}, url: {url}")
        log.debug(f"Fetching data from {url}")
        return response, counter + 1

    if srs_ids is None:
        return None

    data = {}
    counter = 0
    samples_processed = 0
    for batch_start in range(0, len(srs_ids), batch_size):
        print(f"Processing metadata for samples: {samples_processed} of {len(srs_ids)}", end='\r')
        batch_srs_id = srs_ids[batch_start:batch_start + batch_size]
        samples_processed += len(batch_srs_id)
        search_data, counter = fetch_url_data(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term={'+OR+'.join(batch_srs_id)}&retmode=json&retmax=1000", counter)
        search_data = search_data.json()

        if int(search_data.get("esearchresult", {}).get("count", 0)) == 0:
            log.debug(f"No SRR IDs found for SRS {batch_srs_id}")
            return None

        # Extract SRR UIDs returned by the search.
        srr_ids = search_data.get("esearchresult", {}).get("idlist", [])
        if srr_ids:
            # Inner batching uses its own index so it doesn't shadow the outer loop's.
            for srr_start in range(0, len(srr_ids), batch_size):
                batch_srr_id = srr_ids[srr_start:srr_start + batch_size]
                summary_data, counter = fetch_url_data(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=sra&id={','.join(batch_srr_id)}&retmode=json&retmax=1000", counter)
                summary_data = summary_data.json()
                if 'result' in summary_data:
                    for result in summary_data['result']['uids']:
                        # esummary embeds experiment/run details as XML fragments;
                        # wrap them in a root element so the XML parser accepts them.
                        exp_soup = BeautifulSoup(f"<Root>{summary_data['result'][result]['expxml']}</Root>", 'xml')
                        run_soup = BeautifulSoup(f"<Root>{summary_data['result'][result]['runs']}</Root>", 'xml')

                        # Layout is encoded as the name of LIBRARY_LAYOUT's child element
                        # (e.g. <PAIRED/> or <SINGLE/>).
                        library_layout = exp_soup.find("LIBRARY_LAYOUT").find().name
                        title = exp_soup.find("Title").text
                        platform = exp_soup.find("Platform").text
                        instrument = exp_soup.find("Platform")["instrument_model"]
                        organism_name = exp_soup.find("Organism").get("ScientificName", "")
                        total_spots = exp_soup.find("Statistics")["total_spots"]
                        total_bases = exp_soup.find("Statistics")["total_bases"]

                        sra_experiment_acc = exp_soup.find("Experiment")["acc"]
                        sra_sample_acc = exp_soup.find("Sample")["acc"]
                        sra_study_acc = exp_soup.find("Study")["acc"]
                        sra_submitter_acc = exp_soup.find("Submitter")["acc"]

                        # Library fields are optional in the payload; default to "".
                        library_name = exp_soup.find("LIBRARY_NAME").text if exp_soup.find("LIBRARY_NAME") else ""
                        library_strategy = exp_soup.find("LIBRARY_STRATEGY").text if exp_soup.find("LIBRARY_STRATEGY") else ""
                        library_source = exp_soup.find("LIBRARY_SOURCE").text if exp_soup.find("LIBRARY_SOURCE") else ""
                        library_selection = exp_soup.find("LIBRARY_SELECTION").text if exp_soup.find("LIBRARY_SELECTION") else ""
                        bioproject_elem = exp_soup.find("Bioproject")
                        bioproject = bioproject_elem.text if bioproject_elem else ""

                        # One experiment can span several runs; record each run separately.
                        for run in run_soup.find_all("Run"):
                            sra_run_acc = run["acc"]
                            run_total_bases = run["total_bases"]
                            run_total_spots = run["total_spots"]

                            run_record = {
                                "title": title,
                                "platform": platform,
                                "instrument": instrument,
                                "total_spots": total_spots,
                                "total_bases": total_bases,
                                "bioproject": bioproject,
                                "organism_name": organism_name,
                                "library_name": library_name,
                                "library_layout": library_layout,
                                "library_strategy": library_strategy,
                                "library_source": library_source,
                                "library_selection": library_selection,
                                "sra_experiment_acc": sra_experiment_acc,
                                "sra_run_acc": sra_run_acc,
                                "sra_sample_acc": sra_sample_acc,
                                "sra_study_acc": sra_study_acc,
                                "sra_submitter_acc": sra_submitter_acc,
                                "run_total_bases": run_total_bases,
                                "run_total_spots": run_total_spots,
                            }

                            if sra_sample_acc in data:
                                if sra_run_acc in data[sra_sample_acc]:
                                    raise Exception(f"Duplicate biosample run_acc {sra_run_acc} found {sra_sample_acc}")
                                data[sra_sample_acc][sra_run_acc] = run_record
                            else:
                                data[sra_sample_acc] = {sra_run_acc: run_record}
    print(f"Processing metadata for samples: {samples_processed} of {len(srs_ids)}", end='\n')

    samples_processed = 0
    for sample_acc in data:
        print(f"Adding file urls to : {samples_processed} of {len(data)}", end='\r')
        samples_processed += 1
        # Fetch url, file size and md5 for raw/primary data files.
        file_list_data, counter = fetch_url_data(f"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={sample_acc}&result=read_run&format=json&retmax=1000", counter)
        file_list_data = file_list_data.json()
        for result in file_list_data:
            run_acc = result['run_accession']
            if run_acc not in data[sample_acc]:
                raise Exception(f"No metadata found for {run_acc} {sample_acc}")
            run_record = data[sample_acc][run_acc]
            # Guard against double-processing a run. (Fix: the previous check
            # looked for a 'fastq_ftp' key that was never stored, so it could
            # never fire; test the key this loop actually sets.)
            if 'file_urls' in run_record:
                raise Exception(f"Duplicate file list entry for {run_acc} {sample_acc}")

            run_record['file_urls'] = result['fastq_ftp']
            run_record['file_size'] = result['fastq_bytes']
            run_record['file_md5'] = result['fastq_md5']

            if not len(run_record['file_urls']):
                # Some raw or primary data has been uploaded but not properly processed by SRA.
                # These files will lack https/ftp URLs and statistics, looks like these are
                # BAM files that are labeled as FASTQ.
                # For these, we can retrieve S3 links instead.
                # (Fix: query the current run accession; this previously used a
                # hard-coded SRR id, returning the wrong file info for every run.)
                sra_file_response, counter = fetch_url_data(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id={run_acc}&retmode=xml", counter)
                srafile_soup = BeautifulSoup(sra_file_response.text, 'xml')
                for sra_file in srafile_soup.findAll("SRAFile"):
                    if sra_file['supertype'] == "Original":
                        alternatives = sra_file.find('Alternatives')
                        run_record['file_urls'] = alternatives['url']
                        run_record['file_size'] = sra_file['size']
                        run_record['file_md5'] = sra_file['md5']
    print(f"Adding file urls to : {samples_processed} of {len(data)}", end='\n')
    return data
326
+
327
+
328
+ def build_files (assemblies_path , genomes_output_path , ucsc_assemblies_url , taxonomic_group_sets = {}, do_gene_model_urls = True , extract_primary_data = False , primary_output_path = None ):
129
329
print ("Building files" )
130
330
131
331
source_list_df = read_assemblies (assemblies_path )
132
332
133
- base_genomes_df = get_genomes_df (source_list_df ["accession" ])
333
+ base_genomes_df , primarydata_df = get_genomes_and_primarydata_df (source_list_df ["accession" ])
134
334
335
+ primarydata_df ['sra_sample_acc' ] = primarydata_df ["sample_ids" ].str .split ("," )
336
+ primarydata_df = primarydata_df .explode ("sra_sample_acc" )
337
+ primarydata_df = primarydata_df [~ primarydata_df ["sra_sample_acc" ].isnull () & primarydata_df ["sra_sample_acc" ].str .startswith ('SRA' )]
338
+ primarydata_df ["sra_sample_acc" ] = primarydata_df ["sra_sample_acc" ].str .replace ("SRA:" , "" )
339
+ if extract_primary_data :
340
+ sra_ids_list = primarydata_df ["sra_sample_acc" ].dropna ().unique ().tolist ()
341
+ sra_metadata = fetch_sra_metadata (sra_ids_list )
342
+ sra_metadata_df = pd .DataFrame ([sra_metadata [sra ][srr ] for sra in sra_metadata for srr in sra_metadata [sra ]])
343
+ primarydata_df = primarydata_df .merge (sra_metadata_df , how = "left" , left_on = "sra_sample_acc" , right_on = "sra_sample_acc" )
135
344
report_missing_values_from ("accessions" , "found on NCBI" , source_list_df ["accession" ], base_genomes_df ["accession" ])
136
345
137
346
species_df = get_species_df (base_genomes_df ["taxonomyId" ], taxonomic_group_sets )
@@ -146,7 +355,7 @@ def build_files(assemblies_path, genomes_output_path, ucsc_assemblies_url, taxon
146
355
ref_seq_merge_df = genomes_with_species_df .merge (assemblies_df , how = "left" , left_on = "accession" , right_on = "refSeq" )
147
356
148
357
report_missing_values_from ("accessions" , "matched in assembly list" , genomes_with_species_df ["accession" ], assemblies_df ["genBank" ], assemblies_df ["refSeq" ])
149
-
358
+
150
359
genomes_df = gen_bank_merge_df .combine_first (ref_seq_merge_df )
151
360
152
361
if do_gene_model_urls :
@@ -158,3 +367,8 @@ def build_files(assemblies_path, genomes_output_path, ucsc_assemblies_url, taxon
158
367
genomes_df .to_csv (genomes_output_path , index = False , sep = "\t " )
159
368
160
369
print (f"Wrote to { genomes_output_path } " )
370
+
371
+ if extract_primary_data :
372
+ primarydata_df .to_csv (primary_output_path , index = False , sep = "\t " )
373
+
374
+ print (f"Wrote to { primary_output_path } " )
0 commit comments