Skip to content

Commit

Permalink
release 1.6 with fastq download
Browse files Browse the repository at this point in the history
  • Loading branch information
ialbert committed Oct 20, 2023
1 parent 2248566 commit 116a49b
Show file tree
Hide file tree
Showing 14 changed files with 289 additions and 263 deletions.
1 change: 1 addition & 0 deletions biorun/api/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Contains the backend classes for the biorun package
153 changes: 153 additions & 0 deletions biorun/api/ena_fastq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""
Connects to the ENA (European Nucleotide Archive) to download FASTQ files.
"""

import json, sys, gzip, os
from biorun import utils
from biorun.libs import placlib as plac

# Reuse the package logger exposed by biorun.utils rather than creating a
# separate logger for this module.
logger = utils.logger

# Base address of the ENA portal API; the endpoints below are derived from it.
ENA_API = "https://www.ebi.ac.uk/ena/portal/api"

# Endpoint listing the returnable fields, and endpoint producing file reports.
ENA_FIELDS = ENA_API + "/returnFields"
ENA_REPORT = ENA_API + "/filereport"

# Command prefix used for full downloads with aria2c. The output file name
# and the URL are appended to it by the caller.
CMD = "aria2c -x 5 -c --summary-interval 10"

# Fetch metadata
def get_metadata(srr):
    """
    Fetches the ENA file report for an accession and returns the decoded JSON.

    The report is requested from the ENA filereport endpoint in JSON format
    for the 'read_run' result type, limited to the run accession, the FASTQ
    locations, their MD5 sums and byte sizes, and the read count.

    A response that cannot be decoded as JSON is reported via utils.error.
    """
    # Columns requested from the report, sent as one comma separated value.
    columns = ",".join(
        ("run_accession", "fastq_ftp", "fastq_md5", "fastq_bytes", "read_count")
    )

    # Query string parameters of the request.
    query = {
        "accession": srr,
        "fields": columns,
        "format": "json",
        "result": "read_run",
    }

    # Fetch the raw report body.
    payload = utils.get_url(ENA_REPORT, params=query)

    # Turn the body into Python objects.
    try:
        payload = json.loads(payload)
    except Exception as exc:
        utils.error(f"JSON decoding error: {exc}")

    return payload


def test_metadata(srr='ERR12058121'):
    """
    Manual check of the metadata fetch: prints the report as indented JSON.
    """
    report = json.dumps(get_metadata(srr=srr), indent=4)
    print(report)


@plac.pos("srr", help="the srr numbers", )
@plac.opt("out", help="optional output to the file", abbrev='o')
@plac.opt("limit", help="how many reads to download", abbrev='l', type=int)
def run(srr='ERR12058121', limit=10, out=''):
    """
    Downloads the FASTQ files of a sequencing run listed in ENA.

    srr:   the run accession to download.
    limit: number of reads to stream from each remote file. A false value
           (0) downloads the complete files with aria2c instead.
    out:   output prefix, defaults to the accession. Missing parent
           directories are created. Files are written as
           {out}_1.fastq.gz, {out}_2.fastq.gz, ... in the order the
           locations appear in the metadata.
    """
    # Function-scope imports keep this block self-contained.
    import shlex
    import shutil
    import subprocess

    # Sets the logger level.
    logger.setLevel("INFO")

    # Set the prefix
    out = out or srr

    # Make the directory of the prefix if it does not exist yet.
    dname = os.path.dirname(out)
    if dname and not os.path.isdir(dname):
        logger.info(f"Creating directory: {dname}")
        os.makedirs(dname, exist_ok=True)

    # A false limit means the complete files are requested.
    fetch_all = not limit

    # Obtain the metadata.
    meta = get_metadata(srr=srr)

    # An empty or malformed report carries no file locations; report that
    # explicitly instead of failing with an IndexError/KeyError traceback.
    try:
        record = meta[0]
        locations = record['fastq_ftp'] or ''
    except (IndexError, KeyError, TypeError):
        record, locations = {}, ''

    # Create URLs from metadata, skipping empty entries so that a blank
    # field does not turn into the bogus URL "https://".
    urls = [f"https://{loc}" for loc in locations.split(';') if loc]

    if not urls:
        utils.error(f"No FASTQ files listed in the ENA metadata for: {srr}")
        # Only reached if utils.error does not terminate the program.
        return

    # Informational message only; bad metadata may make this fail.
    try:
        if fetch_all:
            count = int(record.get('read_count', 0))
            sizes = record.get('fastq_bytes', "0;0").split(";")
            # Each size already carries its unit.
            sizes = ", ".join(f"{float(x) / 1024 ** 3:.1f} GB" for x in sizes)
            logger.info(f"Downloading {sizes} with {count:,} reads for {srr}")
        else:
            logger.info(f"Downloading {limit:,} reads for {srr}")
    except Exception:
        logger.warning("metadata parsing problem, might still work ...")

    if fetch_all:
        # Verify once, before any download, that the downloader exists.
        pname = CMD.split()[0]
        if shutil.which(pname) is None:
            utils.error(f"Unable to run: {pname}", stop=False)
            # NOTE(review): the conda package may be named "aria2" rather
            # than "aria2c" - confirm before changing the message.
            utils.error(f"Installation: micromamba install aria2c")
            # Only reached if utils.error does not terminate the program.
            return

    # Iterate over urls and download each file.
    for idx, url in enumerate(urls, start=1):

        fpath = f"{out}_{idx}.fastq.gz"

        if fetch_all:
            # Pass the arguments as a list so that paths with spaces or
            # shell metacharacters are handed over verbatim.
            args = CMD.split() + ["-o", fpath, url]

            # Shell-quoted rendering, used for messages only.
            cmd = " ".join(shlex.quote(arg) for arg in args)
            logger.info(f"Running: {cmd}")

            # Run the download command.
            if subprocess.run(args).returncode != 0:
                utils.error(f"Error when running: {cmd}")

        else:
            # Open stream to the remote gzipped file; a FASTQ record spans
            # four lines, hence the line limit is four times the read limit.
            stream = utils.get_gz_lines(url, limit=limit * 4)

            logger.info(f"Saving to {fpath}")

            # The context manager closes the file, which finalizes the
            # gzip stream even when the transfer fails midway.
            with gzip.open(fpath, mode='wb') as fp:
                for line in stream:
                    fp.write(line.encode("utf-8"))


# Script entry point: hand the run function over to plac.
if __name__ == "__main__":
    plac.call(run)


4 changes: 4 additions & 0 deletions biorun/api/ncbi_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""
A more helpful way to interface with NCBI datasets.
"""

3 changes: 2 additions & 1 deletion biorun/data/fetch_prot.fa
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ SOURCE Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
ORGANISM Severe acute respiratory syndrome coronavirus 2
Viruses; Riboviria; Orthornavirae; Pisuviricota; Pisoniviricetes;
Nidovirales; Cornidovirineae; Coronaviridae; Orthocoronavirinae;
Betacoronavirus; Sarbecovirus.
Betacoronavirus; Sarbecovirus; Severe acute respiratory
syndrome-related coronavirus.
REFERENCE 1 (residues 1 to 1273)
AUTHORS Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y.,
Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H.,
Expand Down
6 changes: 4 additions & 2 deletions biorun/data/genomes.gb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ SOURCE Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
ORGANISM Severe acute respiratory syndrome coronavirus 2
Viruses; Riboviria; Orthornavirae; Pisuviricota; Pisoniviricetes;
Nidovirales; Cornidovirineae; Coronaviridae; Orthocoronavirinae;
Betacoronavirus; Sarbecovirus.
Betacoronavirus; Sarbecovirus; Severe acute respiratory
syndrome-related coronavirus.
REFERENCE 1 (bases 1 to 29903)
AUTHORS Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y.,
Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H.,
Expand Down Expand Up @@ -1199,7 +1200,8 @@ SOURCE Bat coronavirus RaTG13
ORGANISM Bat coronavirus RaTG13
Viruses; Riboviria; Orthornavirae; Pisuviricota; Pisoniviricetes;
Nidovirales; Cornidovirineae; Coronaviridae; Orthocoronavirinae;
Betacoronavirus; Sarbecovirus.
Betacoronavirus; Sarbecovirus; Severe acute respiratory
syndrome-related coronavirus.
REFERENCE 1 (bases 1 to 29855)
AUTHORS Zhou,P., Yang,X.-L., Wang,X.-G., Hu,B., Zhang,L., Zhang,W.,
Si,H.-R., Zhu,Y., Li,B., Huang,C.-L., Chen,H.-D., Chen,J., Luo,Y.,
Expand Down
6 changes: 4 additions & 2 deletions biorun/data/genomes.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
"Coronaviridae",
"Orthocoronavirinae",
"Betacoronavirus",
"Sarbecovirus"
"Sarbecovirus",
"Severe acute respiratory syndrome-related coronavirus"
],
"references": [
{
Expand Down Expand Up @@ -1889,7 +1890,8 @@
"Coronaviridae",
"Orthocoronavirinae",
"Betacoronavirus",
"Sarbecovirus"
"Sarbecovirus",
"Severe acute respiratory syndrome-related coronavirus"
],
"references": [
{
Expand Down
Loading

0 comments on commit 116a49b

Please sign in to comment.