From 495fcf5c7ff5ffdb5c581af79baebd5b36b2f351 Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Mon, 25 Oct 2021 07:35:29 -0400 Subject: [PATCH 1/6] Try stdin compatibility --- pangolin/command.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index b3fa803..8ad3a27 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -10,6 +10,7 @@ import gzip import joblib from pangolin.utils.log_colours import green,cyan,red +import select try: import pangoLEARN @@ -185,8 +186,6 @@ def main(sysargs = sys.argv[1:]): sys.exit(0) - - dependency_checks.check_dependencies(args.usher) # to enable not having to pass a query if running update @@ -198,13 +197,16 @@ def main(sysargs = sys.argv[1:]): else: # find the query fasta if not args.decompress: - query = os.path.join(cwd, args.query[0]) - if not os.path.exists(query): - sys.stderr.write(cyan(f'Error: cannot find query (input) fasta file at:') + f'{query}\n' + + if not os.path.exists(os.path.join(cwd, args.query[0])): + if select.select([sys.stdin,],[],[],0.0)[0]: + query = sys.stdin + else: + sys.stderr.write(cyan(f'Error: cannot find query (input) fasta file at:') + f'{query}\n' + 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + ' for detailed instructions.\n') - sys.exit(-1) + sys.exit(-1) else: + query = os.path.join(cwd, args.query[0]) print(green(f"The query file is:") + f"{query}") # default output dir @@ -261,11 +263,12 @@ def main(sysargs = sys.argv[1:]): print("{:<30}\t{:>25}\t{:<10}\n".format("Sequence name","Reason","Value")) - file_ending = query.split(".")[-1] - if file_ending in ["gz","gzip","tgz"]: - query = gzip.open(query, 'rt') - elif file_ending in ["xz","lzma"]: - query = lzma.open(query, 'rt') + if not select.select([sys.stdin,],[],[],0.0)[0]: + file_ending = query.split(".")[-1] + if file_ending in ["gz","gzip","tgz"]: + query = gzip.open(query, 
'rt') + elif file_ending in ["xz","lzma"]: + query = lzma.open(query, 'rt') for record in SeqIO.parse(query, "fasta"): total_input +=1 From 5a718946ed9704216ff4b0d1490086a70407f380 Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Mon, 25 Oct 2021 10:38:17 -0400 Subject: [PATCH 2/6] Fix logic for stdin vs query fasta --- pangolin/command.py | 62 +++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index 8ad3a27..c867fd2 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -200,10 +200,17 @@ def main(sysargs = sys.argv[1:]): if not os.path.exists(os.path.join(cwd, args.query[0])): if select.select([sys.stdin,],[],[],0.0)[0]: query = sys.stdin - else: - sys.stderr.write(cyan(f'Error: cannot find query (input) fasta file at:') + f'{query}\n' + - 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + - ' for detailed instructions.\n') + elif not select.select([sys.stdin,],[],[],0.0)[0]: + tried_path = os.path.join(cwd, args.query[0]) + if tried_path.endswith("-"): + sys.stderr.write(cyan( + f'Error: cannot find query (input) fasta file using stdin.\n' + + 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + + ' for detailed instructions.\n')) + else: + sys.stderr.write(cyan(f'Error: cannot find query (input) fasta file at:') + f'{tried_path}\n' + + 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + + ' for detailed instructions.\n') sys.exit(-1) else: query = os.path.join(cwd, args.query[0]) @@ -270,29 +277,34 @@ def main(sysargs = sys.argv[1:]): elif file_ending in ["xz","lzma"]: query = lzma.open(query, 'rt') - for record in SeqIO.parse(query, "fasta"): - total_input +=1 - # replace spaces in sequence headers with underscores - record.description = record.description.replace(' ', 
'_').replace(",","_") - record.id = record.description - if "," in record.id: - record.id=record.id.replace(",","_") - - if len(record) args.maxambig: - record.description = record.description + f" fail=N_content:{prop_N}" + try: + for record in SeqIO.parse(query, "fasta"): + total_input +=1 + # replace spaces in sequence headers with underscores + record.description = record.description.replace(' ', '_').replace(",","_") + record.id = record.description + if "," in record.id: + record.id=record.id.replace(",","_") + + if len(record) args.maxambig: + record.description = record.description + f" fail=N_content:{prop_N}" + do_not_run.append(record) + print(fmt.format(record.id, "N content too high", prop_N)) + # print("{record.id} | has an N content of {prop_N}") + else: + run.append(record) + except UnicodeDecodeError: + sys.stderr.write(cyan( + f'Error: error when reading query fasta.\n' + + 'It is possible that compressed stdin was passed.')) print(green("\nNumber of sequences detected: ") + f"{total_input}") print(green("Total passing QC: ") + f"{len(run)}") From 12da6a0f44b8e7d01add5a35eadd1bb8a00e4207 Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Mon, 25 Oct 2021 11:07:39 -0400 Subject: [PATCH 3/6] handling if no filepath or stdin --- pangolin/command.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index c867fd2..931badb 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -197,24 +197,32 @@ def main(sysargs = sys.argv[1:]): else: # find the query fasta if not args.decompress: - if not os.path.exists(os.path.join(cwd, args.query[0])): - if select.select([sys.stdin,],[],[],0.0)[0]: - query = sys.stdin - elif not select.select([sys.stdin,],[],[],0.0)[0]: - tried_path = os.path.join(cwd, args.query[0]) - if tried_path.endswith("-"): - sys.stderr.write(cyan( - f'Error: cannot find query (input) fasta file using stdin.\n' + + try: + if not 
os.path.exists(os.path.join(cwd, args.query[0])): + if select.select([sys.stdin,],[],[],0.0)[0]: + query = sys.stdin + elif not select.select([sys.stdin,],[],[],0.0)[0]: + tried_path = os.path.join(cwd, args.query[0]) + if tried_path.endswith("-"): + sys.stderr.write(cyan( + f'Error: cannot find query (input) fasta file using stdin.\n' + 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + ' for detailed instructions.\n')) - else: - sys.stderr.write(cyan(f'Error: cannot find query (input) fasta file at:') + f'{tried_path}\n' + + sys.exit(-1) + else: + sys.stderr.write(cyan(f'Error: cannot find query (input) fasta file at:') + f'{tried_path}\n' + 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + ' for detailed instructions.\n') - sys.exit(-1) - else: - query = os.path.join(cwd, args.query[0]) - print(green(f"The query file is:") + f"{query}") + sys.exit(-1) + else: + query = os.path.join(cwd, args.query[0]) + print(green(f"The query file is:") + f"{query}") + except IndexError: + sys.stderr.write(cyan( + f'Error: input query fasta could not be detected from a filepath or through stdin.\n' + + 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + + ' for detailed instructions.\n')) + sys.exit(-1) # default output dir From b1241a4f6ca47335307b0755be9e35cf7de847f0 Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Mon, 25 Oct 2021 17:15:30 -0400 Subject: [PATCH 4/6] Fix syntax errors from master merge --- pangolin/command.py | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index 905fab1..e5b3afc 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -285,33 +285,39 @@ def main(sysargs = sys.argv[1:]): post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta') fw_pass = 
open(post_qc_query,"w") qc_fail = os.path.join(tempdir,'query.failed_qc.fasta') - fw_fail = open(qc_fail,"w" + fw_fail = open(qc_fail,"w") total_input = 0 total_pass = 0 try: for record in SeqIO.parse(query, "fasta"): - total_input +=1 - record.description = record.description.replace(' ', '_').replace(",","_") - record.id = record.description - if "," in record.id: - record.id=record.id.replace(",","_") - - if len(record) {record.description}\n{record.seq}\n") - else: - num_N = str(record.seq).upper().count("N") - prop_N = round((num_N)/len(record.seq), 2) - if prop_N > args.maxambig: - record.description = record.description + f" fail=N_content:{prop_N}" + total_input +=1 + record.description = record.description.replace(' ', '_').replace(",","_") + record.id = record.description + if "," in record.id: + record.id=record.id.replace(",","_") + + if len(record) {record.description}\n{record.seq}\n") else: - total_pass +=1 - seq = str(record.seq).replace("-","") - fw_pass.write(f">{record.description}\n{seq}\n") - + num_N = str(record.seq).upper().count("N") + prop_N = round((num_N)/len(record.seq), 2) + if prop_N > args.maxambig: + record.description = record.description + f" fail=N_content:{prop_N}" + fw_fail.write(f">{record.description}\n{record.seq}\n") + else: + total_pass +=1 + seq = str(record.seq).replace("-","") + fw_pass.write(f">{record.description}\n{seq}\n") + except UnicodeDecodeError: + sys.stderr.write(cyan( + f'Error: input query fasta could not be detected from a filepath or through stdin.\n' + + 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + + ' for detailed instructions.\n')) + sys.exit(-1) + print(green("Number of sequences detected: ") + f"{total_input}") print(green("Total passing QC: ") + f"{total_pass}") fw_fail.close() From f6552bc887e110e456b9fb5715e4801cc07ba2dd Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Tue, 26 Oct 2021 12:42:59 -0400 Subject: [PATCH 5/6] Change 
logic statement to assess file opening --- pangolin/command.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pangolin/command.py b/pangolin/command.py index e5b3afc..3960145 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -11,6 +11,7 @@ import joblib from pangolin.utils.log_colours import green,cyan,red import select +import lzma try: import pangoLEARN @@ -275,7 +276,7 @@ def main(sysargs = sys.argv[1:]): print(green("** Running sequence QC **")) - if not select.select([sys.stdin,],[],[],0.0)[0]: + if os.path.exists(os.path.join(cwd, args.query[0])): file_ending = query.split(".")[-1] if file_ending in ["gz","gzip","tgz"]: query = gzip.open(query, 'rt') From 5dd90ff02159f095dadc6e9d408b92b32b9d41dd Mon Sep 17 00:00:00 2001 From: matt-sd-watson Date: Tue, 26 Oct 2021 13:23:50 -0400 Subject: [PATCH 6/6] Fix error message for bad fasta parsing --- pangolin/command.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index 3960145..02d9311 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -271,8 +271,8 @@ def main(sysargs = sys.argv[1:]): 3) write a file that contains just the seqs to run """ if not args.decompress: - do_not_run = [] - run = [] + # do_not_run = [] + # run = [] print(green("** Running sequence QC **")) @@ -314,7 +314,8 @@ def main(sysargs = sys.argv[1:]): fw_pass.write(f">{record.description}\n{seq}\n") except UnicodeDecodeError: sys.stderr.write(cyan( - f'Error: input query fasta could not be detected from a filepath or through stdin.\n' + + f'Error: the input query fasta could not be parsed.\n' + + 'Double check your query fasta and that compressed stdin was not passed.\n' + 'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' + ' for detailed instructions.\n')) sys.exit(-1)