Skip to content

Commit 5ded790

Browse files
authored
feat: fetch raw/primary data information from sra (#295)
* feat: fetch raw/primary data information from SRA
* style: minor changes to make pycodestyle happy
* chore: update according to reviewer comments
1 parent 57d55db commit 5ded790

File tree

4 files changed

+229
-11
lines changed

4 files changed

+229
-11
lines changed

catalog/build/py/build-files-from-ncbi.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
GENOMES_OUTPUT_PATH = "catalog/build/intermediate/genomes-from-ncbi.tsv"
88

9+
PRIMARYDATA_OUTPUT_PATH = "catalog/build/intermediate/primary-data-ncbi.tsv"
10+
911
TAXONOMIC_GROUPS_BY_TAXONOMY_ID = {
1012
2: "Bacteria",
1113
10239: "Viruses",
@@ -16,4 +18,4 @@
1618
}
1719

1820
if __name__ == "__main__":
19-
build_files(ASSEMBLIES_PATH, GENOMES_OUTPUT_PATH, UCSC_ASSEMBLIES_URL, {"taxonomicGroup": TAXONOMIC_GROUPS_BY_TAXONOMY_ID})
21+
build_files(ASSEMBLIES_PATH, GENOMES_OUTPUT_PATH, UCSC_ASSEMBLIES_URL, {"taxonomicGroup": TAXONOMIC_GROUPS_BY_TAXONOMY_ID}, extract_primary_data=True, primary_output_path=PRIMARYDATA_OUTPUT_PATH)

catalog/build/py/package/catalog_build/build.py

Lines changed: 220 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,18 @@
33
import requests
44
import urllib
55
import re
6+
import time
7+
from bs4 import BeautifulSoup
8+
import logging
9+
10+
log = logging.getLogger(__name__)
11+
612

713
def read_assemblies(assemblies_path):
814
with open(assemblies_path) as stream:
915
return pd.DataFrame(yaml.safe_load(stream)["assemblies"])
1016

17+
1118
def get_paginated_ncbi_results(base_url, query_description):
1219
page = 1
1320
next_page_token = None
@@ -23,8 +30,9 @@ def get_paginated_ncbi_results(base_url, query_description):
2330
page += 1
2431
return results
2532

33+
2634
def match_taxonomic_group(tax_id, lineage, taxonomic_groups):
27-
if not tax_id in taxonomic_groups:
35+
if tax_id not in taxonomic_groups:
2836
return None
2937
taxon_info = taxonomic_groups[tax_id]
3038
name, exclude = (taxon_info["value"], taxon_info.get("exclude")) if isinstance(taxon_info, dict) else (taxon_info, None)
@@ -36,12 +44,15 @@ def match_taxonomic_group(tax_id, lineage, taxonomic_groups):
3644
return name
3745
return None
3846

47+
3948
def get_taxonomic_groups(lineage, taxonomic_groups):
4049
return [group for group in (match_taxonomic_group(tax_id, lineage, taxonomic_groups) for tax_id in lineage) if group is not None]
4150

51+
4252
def get_taxonomic_group_sets(lineage, taxonomic_group_sets):
4353
return {field: ",".join(get_taxonomic_groups(lineage, taxonomic_groups)) for field, taxonomic_groups in taxonomic_group_sets.items()}
4454

55+
4556
def get_species_row(taxon_info, taxonomic_group_sets):
4657
species_info = taxon_info["taxonomy"]["classification"]["species"]
4758
return {
@@ -51,10 +62,12 @@ def get_species_row(taxon_info, taxonomic_group_sets):
5162
**get_taxonomic_group_sets(taxon_info["taxonomy"]["parents"], taxonomic_group_sets)
5263
}
5364

65+
5466
def get_species_df(taxonomy_ids, taxonomic_group_sets):
5567
species_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{",".join([str(id) for id in taxonomy_ids])}/dataset_report", "taxa")
5668
return pd.DataFrame([get_species_row(info, taxonomic_group_sets) for info in species_info])
5769

70+
5871
def get_genome_row(genome_info):
5972
refseq_category = genome_info["assembly_info"].get("refseq_category")
6073
return {
@@ -74,9 +87,22 @@ def get_genome_row(genome_info):
7487
"pairedAccession": genome_info.get("paired_accession"),
7588
}
7689

77-
def get_genomes_df(accessions):
90+
91+
def get_biosample_data(genome_info):
    """Extract biosample accession info from a genome dataset report.

    Returns a row dict with the assembly accession, the biosample accession,
    and a comma-separated list of db-qualified sample IDs (entries lacking a
    'db' field are dropped).
    """
    biosample = genome_info["assembly_info"]["biosample"]
    qualified_ids = [
        f"{sample['db']}:{sample['value']}"
        for sample in biosample["sample_ids"]
        if "db" in sample
    ]
    return {
        "accession": genome_info["accession"],
        "biosample": biosample["accession"],
        "sample_ids": ",".join(qualified_ids),
    }
97+
98+
99+
def get_genomes_and_primarydata_df(accessions):
    """Fetch genome dataset reports from NCBI for the given accessions.

    Returns a tuple of two DataFrames: one genome row per report, and one
    biosample row per report that carries biosample info in its assembly_info.
    """
    joined_accessions = ",".join(accessions)
    report_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{joined_accessions}/dataset_report"
    genomes_info = get_paginated_ncbi_results(report_url, "genomes")

    genome_rows = [get_genome_row(info) for info in genomes_info]
    biosample_rows = [
        get_biosample_data(info)
        for info in genomes_info
        if "biosample" in info["assembly_info"]
    ]
    return pd.DataFrame(data=genome_rows), pd.DataFrame(data=biosample_rows)
105+
80106

81107
def _id_to_gene_model_url(asm_id):
82108
hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/"
@@ -106,16 +132,19 @@ def _id_to_gene_model_url(asm_id):
106132
# No match, I guess that's OK ?
107133
return None
108134

135+
109136
def add_gene_model_url(genomes_df: pd.DataFrame):
110137
return pd.concat([genomes_df, genomes_df["accession"].apply(_id_to_gene_model_url).rename("geneModelUrl")], axis="columns")
111138

139+
112140
def report_missing_values_from(values_name, message_predicate, all_values_series, *partial_values_series):
113141
present_values_mask = all_values_series.astype(bool)
114142
present_values_mask[:] = False
115143
for series in partial_values_series:
116144
present_values_mask |= all_values_series.isin(series)
117145
report_missing_values(values_name, message_predicate, all_values_series, present_values_mask)
118146

147+
119148
def report_missing_values(values_name, message_predicate, values_series, present_values_mask):
120149
missing_values = values_series[~present_values_mask]
121150
if len(missing_values) > 0:
@@ -125,13 +154,193 @@ def report_missing_values(values_name, message_predicate, values_series, present
125154
else:
126155
print(f"{len(missing_values)} {values_name} not {message_predicate}: {", ".join(missing_values)}")
127156

128-
def build_files(assemblies_path, genomes_output_path, ucsc_assemblies_url, taxonomic_group_sets={}, do_gene_model_urls=True):
157+
158+
def fetch_sra_metadata(srs_ids, batch_size=20):
    """
    Fetch metadata for a list of SRS (SRA sample) IDs from the SRA database.

    Resolves each batch of SRS IDs to SRA run UIDs via NCBI esearch, pulls
    experiment/platform/library metadata via NCBI esummary, then adds file
    URLs, sizes and MD5 checksums from the EBI ENA filereport API. Requests
    are issued in batches with simple client-side throttling and retries.

    Args:
        srs_ids (list): SRS IDs to fetch metadata for.
        batch_size (int, optional): Number of IDs per request batch. Defaults to 20.

    Returns:
        dict: Metadata keyed by sample accession, then by run accession,
        or None if srs_ids is None.

    Raises:
        Exception: If a request still fails after the configured retries, or
            if duplicate/inconsistent entries are found in the fetched data.
    """
    def fetch_url_data(url, counter=0, counter_limit=2, wait_time=2, num_retry=3):
        """
        Fetch a URL with client-side throttling and retries.

        Args:
            url (str): The URL to fetch data from.
            counter (int, optional): Requests made since the last pause; used
                for throttling. Defaults to 0.
            counter_limit (int, optional): Requests allowed before pausing.
                Defaults to 2.
            wait_time (int, optional): Seconds to wait when throttling or
                before a retry. Defaults to 2.
            num_retry (int, optional): Number of retry attempts. Defaults to 3.

        Returns:
            tuple: (response, updated counter).

        Raises:
            Exception: If the request still fails after all retries.
        """
        # Throttle: after counter_limit consecutive requests, pause so we do
        # not exceed NCBI's rate limit for unauthenticated clients.
        if counter > counter_limit:
            time.sleep(wait_time)
            counter = 0

        response = requests.get(url)
        while num_retry > 0 and response.status_code != 200:
            time.sleep(wait_time)
            log.debug(f"Failed to fetch, status: {response.status_code}, url: {url}. Retrying...")
            response = requests.get(url)
            num_retry -= 1

        if num_retry <= 0:
            raise Exception(f"Failed to fetch, status: {response.status_code}, url: {url} ")
        log.debug(f"Fetching data from {url}")
        return response, counter + 1

    if srs_ids is None:
        return None

    data = {}
    counter = 0
    samples_processed = 0
    for srs_start in range(0, len(srs_ids), batch_size):
        print(f"Processing metadata for samples: {samples_processed} of {len(srs_ids)}", end='\r')
        batch_srs_id = srs_ids[srs_start:srs_start + batch_size]
        samples_processed += len(batch_srs_id)
        search_data, counter = fetch_url_data(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term={'+OR+'.join(batch_srs_id)}&retmode=json&retmax=1000", counter)
        search_data = search_data.json()

        if int(search_data.get("esearchresult", {}).get("count", 0)) == 0:
            # BUGFIX: this used to `return None`, silently discarding the
            # metadata already collected for earlier batches. Skip the batch.
            log.debug(f"No SRR IDs found for SRS {batch_srs_id}")
            continue

        # Extract SRR UIDs for this batch of samples.
        # BUGFIX: the inner loop index previously shadowed the outer loop's `i`.
        srr_ids = search_data.get("esearchresult", {}).get("idlist", [])
        for srr_start in range(0, len(srr_ids), batch_size):
            batch_srr_id = srr_ids[srr_start:srr_start + batch_size]
            summary_data, counter = fetch_url_data(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=sra&id={','.join(batch_srr_id)}&retmode=json&retmax=1000", counter)
            summary_data = summary_data.json()
            if 'result' not in summary_data:
                continue
            for result in summary_data['result']['uids']:
                # expxml/runs are XML fragments embedded in the JSON summary;
                # wrap in a root element so they parse as valid XML.
                exp_soup = BeautifulSoup(f"<Root>{summary_data['result'][result]['expxml']}</Root>", 'xml')
                run_soup = BeautifulSoup(f"<Root>{summary_data['result'][result]['runs']}</Root>", 'xml')

                # Layout is encoded as the name of the child element
                # (SINGLE / PAIRED) under LIBRARY_LAYOUT.
                library_layout = exp_soup.find("LIBRARY_LAYOUT").find().name
                title = exp_soup.find("Title").text
                platform = exp_soup.find("Platform").text
                instrument = exp_soup.find("Platform")["instrument_model"]
                organism_name = exp_soup.find("Organism").get("ScientificName", "")
                total_spots = exp_soup.find("Statistics")["total_spots"]
                total_bases = exp_soup.find("Statistics")["total_bases"]

                sra_experiment_acc = exp_soup.find("Experiment")["acc"]
                sra_sample_acc = exp_soup.find("Sample")["acc"]
                sra_study_acc = exp_soup.find("Study")["acc"]
                sra_submitter_acc = exp_soup.find("Submitter")["acc"]

                # These elements are optional in the experiment XML.
                library_name = exp_soup.find("LIBRARY_NAME").text if exp_soup.find("LIBRARY_NAME") else ""
                library_strategy = exp_soup.find("LIBRARY_STRATEGY").text if exp_soup.find("LIBRARY_STRATEGY") else ""
                library_source = exp_soup.find("LIBRARY_SOURCE").text if exp_soup.find("LIBRARY_SOURCE") else ""
                library_selection = exp_soup.find("LIBRARY_SELECTION").text if exp_soup.find("LIBRARY_SELECTION") else ""
                bioproject_elem = exp_soup.find("Bioproject")
                bioproject = bioproject_elem.text if bioproject_elem else ""

                # One experiment may have several runs; record each run.
                for run in run_soup.find_all("Run"):
                    sra_run_acc = run["acc"]
                    run_total_bases = run["total_bases"]
                    run_total_spots = run["total_spots"]

                    d = {
                        "title": title,
                        "platform": platform,
                        "instrument": instrument,
                        "total_spots": total_spots,
                        "total_bases": total_bases,
                        "bioproject": bioproject,
                        "organism_name": organism_name,
                        "library_name": library_name,
                        "library_layout": library_layout,
                        "library_strategy": library_strategy,
                        "library_source": library_source,
                        "library_selection": library_selection,
                        "sra_experiment_acc": sra_experiment_acc,
                        "sra_run_acc": sra_run_acc,
                        "sra_sample_acc": sra_sample_acc,
                        "sra_study_acc": sra_study_acc,
                        "sra_submitter_acc": sra_submitter_acc,
                        "run_total_bases": run_total_bases,
                        "run_total_spots": run_total_spots,
                    }

                    if sra_sample_acc in data:
                        if sra_run_acc in data[sra_sample_acc]:
                            raise Exception(f"Duplicate biosample run_acc {sra_run_acc} found {sra_sample_acc}")
                        data[sra_sample_acc][sra_run_acc] = d
                    else:
                        data[sra_sample_acc] = {sra_run_acc: d}
    print(f"Processing metadata for samples: {samples_processed} of {len(srs_ids)}", end='\n')

    samples_processed = 0
    for sample_acc in data:
        print(f"Adding file urls to : {samples_processed} of {len(data)}", end='\r')
        samples_processed += 1
        # Fetch url, file size and md5 for raw/primary data files.
        file_list_data, counter = fetch_url_data(f"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={sample_acc}&result=read_run&format=json&retmax=1000", counter)
        file_list_data = file_list_data.json()
        for result in file_list_data:
            run_acc = result['run_accession']
            run_data = data[sample_acc].get(run_acc)
            if run_data is None:
                # BUGFIX: message previously read "Not metadata found".
                raise Exception(f"No metadata found for {run_acc} {sample_acc}")
            if 'file_urls' in run_data:
                # BUGFIX: this duplicate guard previously tested the key
                # 'fastq_ftp', which is never stored, so it could never fire.
                raise Exception(f"Duplicate file list entry for {run_acc} {sample_acc}")

            # BUGFIX: these three assignments were duplicated verbatim.
            run_data['file_urls'] = result['fastq_ftp']
            run_data['file_size'] = result['fastq_bytes']
            run_data['file_md5'] = result['fastq_md5']

            if not len(run_data['file_urls']):
                # Some raw or primary data has been uploaded but not properly
                # processed by SRA. These files lack https/ftp URLs and
                # statistics (they appear to be BAM files labeled as FASTQ).
                # For these, retrieve S3 links instead.
                # BUGFIX: the efetch id was hard-coded to SRR25741043; use the
                # current run accession instead.
                srafile_response, counter = fetch_url_data(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id={run_acc}&retmode=xml", counter)
                srafile_soup = BeautifulSoup(srafile_response.text, 'xml')
                # `findAll` is deprecated in bs4; use `find_all`.
                for file in srafile_soup.find_all("SRAFile"):
                    if file['supertype'] == "Original":
                        alternatives = file.find('Alternatives')
                        run_data['file_urls'] = alternatives['url']
                        run_data['file_size'] = file['size']
                        run_data['file_md5'] = file['md5']
    print(f"Adding file urls to : {samples_processed} of {len(data)}", end='\n')
    return data
326+
327+
328+
def build_files(assemblies_path, genomes_output_path, ucsc_assemblies_url, taxonomic_group_sets={}, do_gene_model_urls=True, extract_primary_data=False, primary_output_path=None ):
129329
print("Building files")
130330

131331
source_list_df = read_assemblies(assemblies_path)
132332

133-
base_genomes_df = get_genomes_df(source_list_df["accession"])
333+
base_genomes_df, primarydata_df = get_genomes_and_primarydata_df(source_list_df["accession"])
134334

335+
primarydata_df['sra_sample_acc'] = primarydata_df["sample_ids"].str.split(",")
336+
primarydata_df = primarydata_df.explode("sra_sample_acc")
337+
primarydata_df = primarydata_df[~primarydata_df["sra_sample_acc"].isnull() & primarydata_df["sra_sample_acc"].str.startswith('SRA')]
338+
primarydata_df["sra_sample_acc"] = primarydata_df["sra_sample_acc"].str.replace("SRA:", "")
339+
if extract_primary_data:
340+
sra_ids_list = primarydata_df["sra_sample_acc"].dropna().unique().tolist()
341+
sra_metadata = fetch_sra_metadata(sra_ids_list)
342+
sra_metadata_df = pd.DataFrame([sra_metadata[sra][srr] for sra in sra_metadata for srr in sra_metadata[sra]])
343+
primarydata_df = primarydata_df.merge(sra_metadata_df, how="left", left_on="sra_sample_acc", right_on="sra_sample_acc")
135344
report_missing_values_from("accessions", "found on NCBI", source_list_df["accession"], base_genomes_df["accession"])
136345

137346
species_df = get_species_df(base_genomes_df["taxonomyId"], taxonomic_group_sets)
@@ -146,7 +355,7 @@ def build_files(assemblies_path, genomes_output_path, ucsc_assemblies_url, taxon
146355
ref_seq_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="refSeq")
147356

148357
report_missing_values_from("accessions", "matched in assembly list", genomes_with_species_df["accession"], assemblies_df["genBank"], assemblies_df["refSeq"])
149-
358+
150359
genomes_df = gen_bank_merge_df.combine_first(ref_seq_merge_df)
151360

152361
if do_gene_model_urls:
@@ -158,3 +367,8 @@ def build_files(assemblies_path, genomes_output_path, ucsc_assemblies_url, taxon
158367
genomes_df.to_csv(genomes_output_path, index=False, sep="\t")
159368

160369
print(f"Wrote to {genomes_output_path}")
370+
371+
if extract_primary_data:
372+
primarydata_df.to_csv(primary_output_path, index=False, sep="\t")
373+
374+
print(f"Wrote to {primary_output_path}")

catalog/build/py/package/setup.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from setuptools import setup
22

33
# Package metadata for catalog_build.
# NOTE(review): this release adds a new feature (raw/primary-data fetching
# from SRA) and two new runtime dependencies (BeautifulSoup4, lxml), so the
# version is bumped from 1.2.1 to 1.3.0 per semantic versioning; without a
# bump the new release would not be installable as an upgrade.
setup(
    name="catalog_build",
    version="1.3.0",
    packages=["catalog_build"],
    install_requires=["pandas", "requests", "PyYAML", "BeautifulSoup4", "lxml"],
)

catalog/build/py/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ annotated-types==0.7.0
22
antlr4-python3-runtime==4.9.3
33
arrow==1.3.0
44
attrs==25.1.0
5+
beautifulsoup4==4.13.3
56
certifi==2024.8.30
67
CFGraph==0.2.1
78
chardet==5.2.0
@@ -30,6 +31,7 @@ linkml==1.8.6
3031
linkml-dataops==0.1.0
3132
linkml-runtime==1.8.3
3233
MarkupSafe==3.0.2
34+
lxml==5.3.1
3335
numpy==2.1.0
3436
openpyxl==3.1.5
3537
packaging==24.2

0 commit comments

Comments
 (0)