From f55cfc89a937c0d286132faa20c3ca14067e86f4 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Wed, 5 Jul 2023 15:08:34 +0100 Subject: [PATCH] Handle cortex not making VCF; also empty/nonexistent input files --- python/clockwork/gvcf.py | 3 +- python/clockwork/tests/utils_test.py | 22 +++++ python/clockwork/utils.py | 30 +++++-- .../clockwork/var_call_one_sample_pipeline.py | 87 ++++++++++++++----- 4 files changed, 114 insertions(+), 28 deletions(-) diff --git a/python/clockwork/gvcf.py b/python/clockwork/gvcf.py index e89358e..b99e692 100644 --- a/python/clockwork/gvcf.py +++ b/python/clockwork/gvcf.py @@ -203,7 +203,8 @@ def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, ou print(samtools_record, file=f_out) ref_pos = samtools_record.POS + 1 - _finish_contig(ref_pos, ref_seq, minos_record, minos_iter, f_out) + if ref_seq is not None: # happens if the samtools VCF was empty + _finish_contig(ref_pos, ref_seq, minos_record, minos_iter, f_out) _print_non_samtools_seqs(ref_seqs, used_ref_seqs, minos_records, f_out) diff --git a/python/clockwork/tests/utils_test.py b/python/clockwork/tests/utils_test.py index 2c35ee1..989cfcc 100644 --- a/python/clockwork/tests/utils_test.py +++ b/python/clockwork/tests/utils_test.py @@ -106,3 +106,25 @@ def test_sam_record_count(self): self.assertEqual(42, utils.sam_record_count(tmpfile)) os.unlink(tmpfile) + + def test_vcf_has_records(self): + """test vcf_has_records""" + tmpfile = "tmp.vcf_has_records.vcf" + with open(tmpfile, "w") as f: + print("#foo", file=f) + self.assertFalse(utils.vcf_has_records(tmpfile)) + with open(tmpfile, "a") as f: + print("ref\t42\t.\tA\tG\t.\tPASS\t.", file=f) + self.assertTrue(utils.vcf_has_records(tmpfile)) + os.unlink(tmpfile) + + def test_file_has_at_least_one_line(self): + """test file_has_at_least_one_line""" + tmpfile = "tmp.file_has_at_least_one_line" + with open(tmpfile, "w") as f: + pass + self.assertFalse(utils.file_has_at_least_one_line(tmpfile)) + with open(tmpfile, "w") as f: + print("hello there", file=f) + self.assertTrue(utils.file_has_at_least_one_line(tmpfile)) + os.unlink(tmpfile) diff --git a/python/clockwork/utils.py b/python/clockwork/utils.py index f82c7c1..1fee48b 100644 --- a/python/clockwork/utils.py +++ b/python/clockwork/utils.py @@ -5,6 +5,8 @@ import subprocess import sys +import pyfastaq + def decode(x): try: @@ -121,10 +123,26 @@ def make_empty_file(filename): def sam_record_count(filename): """Returns number of sam records in file""" - completed_process = syscall(r"""grep -c -v '^@' """ + filename) - try: - count = int(completed_process.stdout.rstrip()) - except: - raise Exception("Error counting sam records in file " + filename) - + count = 0 + with open(filename) as f: + for line in f: + if not line.startswith("@"): + count += 1 return count + + +def vcf_has_records(filename): + """Returns true if there is at least 1 record in VCF file""" + with open(filename) as f: + for line in f: + if not line.startswith("#"): + return True + return False + + +def file_has_at_least_one_line(filename): + """Returns true if there is at least 1 line in the file. Can be gzipped""" + f = pyfastaq.utils.open_file_read(filename) + has_line = any(f) + f.close() + return has_line diff --git a/python/clockwork/var_call_one_sample_pipeline.py b/python/clockwork/var_call_one_sample_pipeline.py index e35c511..34c4fcb 100644 --- a/python/clockwork/var_call_one_sample_pipeline.py +++ b/python/clockwork/var_call_one_sample_pipeline.py @@ -4,6 +4,20 @@ from clockwork import cortex, gvcf, read_map, read_trim, reference_dir, utils +def check_reads_files(read_files): + ok = True + for filename in read_files: + if os.path.exists(filename): + if not utils.file_has_at_least_one_line(filename): + logging.error(f"NO READS IN FILE: {filename}") + ok = False + else: + logging.error(f"READS FILE NOT FOUND: {filename}") + ok = False + + return ok + + def run( reads1_list, reads2_list, @@ -19,6 +33,10 @@ def run( raise Exception( "Must give same number of forward and reverse reads files. Got:\nForward:{reads1_list}\nReverse:{reads2_list}" ) + if not check_reads_files(reads1_list + reads2_list): + raise Exception("Reads file(s) not found or are empty. See previous error messages. Cannot continue") + if not os.path.exists(ref_dir): + raise FileNotFoundError(f"Reference directory not found: {ref_dir}. Cannot continue") os.mkdir(outdir) @@ -57,6 +75,13 @@ def run( samtools_vcf = os.path.join(outdir, "samtools.vcf") cmd = f"bcftools mpileup --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -vm -O v -o {samtools_vcf}" utils.syscall(cmd) + samtools_vcf_has_vars = utils.vcf_has_records(samtools_vcf) + if not samtools_vcf_has_vars: + logging.warn("SAMTOOLS VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA") + + samtools_gvcf = os.path.join(outdir, "samtools.gvcf") + cmd = f"bcftools mpileup -I --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -c -O v -o {samtools_gvcf}" + utils.syscall(cmd) cortex_dir = os.path.join(outdir, "cortex") ctx = cortex.CortexRunCalls( @@ -66,29 +91,50 @@ def run( sample_name, mem_height=cortex_mem_height, ) - ctx.run(run_mccortex_view_kmers=False) + try: + ctx.run(run_mccortex_view_kmers=False) + except: + logging.error("ERROR RUNNING CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK CORTEX LOGS") + ctx_vcf_dir = os.path.join(cortex_dir, "cortex.out", "vcfs") - cortex_vcfs = [ - os.path.join(ctx_vcf_dir, x) - for x in os.listdir(ctx_vcf_dir) - if x.endswith("raw.vcf") - ] + try: + cortex_vcfs = [ + os.path.join(ctx_vcf_dir, x) + for x in os.listdir(ctx_vcf_dir) + if x.endswith("raw.vcf") + ] + except: + cortex_vcfs = [] + + cortex_vcf_has_vars = False if len(cortex_vcfs) != 1: - raise Exception("Error running cortex. Could not find output VCF file") - cortex_vcf = os.path.join(outdir, "cortex.vcf") - os.rename(cortex_vcfs[0], cortex_vcf) - if not debug: - utils.syscall(f"rm -rf {cortex_dir}") + logging.error("NO VCF FILE MADE BY CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK THE QUALITY OF INPUT DATA, AND CORTEX LOGS") + cortex_vcf = "" + else: + cortex_vcf = os.path.join(outdir, "cortex.vcf") + os.rename(cortex_vcfs[0], cortex_vcf) + cortex_vcf_has_vars = utils.vcf_has_records(cortex_vcf) + if not cortex_vcf_has_vars: + logging.warn("CORTEX VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA") + if not debug: + utils.syscall(f"rm -rf {cortex_dir}") - minos_dir = os.path.join(outdir, "minos") - cmd = f"minos adjudicate --reads {rmdup_bam} {minos_dir} {refdir.ref_fasta} {samtools_vcf} {cortex_vcf}" - utils.syscall(cmd) final_vcf = os.path.join(outdir, "final.vcf") - os.rename(os.path.join(minos_dir, "final.vcf"), final_vcf) - samtools_gvcf = os.path.join(outdir, "samtools.gvcf") - cmd = f"bcftools mpileup -I --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -c -O v -o {samtools_gvcf}" - utils.syscall(cmd) + if not (samtools_vcf_has_vars and cortex_vcf_has_vars): + logging.error("NO VARIANTS FOUND BY CORTEX OR SAMTOOLS. WRITING HEADER-ONLY FINAL VCF FILE INSTEAD OF RUNNING MINOS") + with open(final_vcf, "w") as f: + print("##fileformat=VCFv4.2", file=f) + print("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "sample", sep="\t", file=f) + minos_vcf_has_vars = False + else: + minos_dir = os.path.join(outdir, "minos") + cmd = f"minos adjudicate --reads {rmdup_bam} {minos_dir} {refdir.ref_fasta} {samtools_vcf} {cortex_vcf}" + utils.syscall(cmd) + os.rename(os.path.join(minos_dir, "final.vcf"), final_vcf) + if not debug: + utils.syscall(f"rm -rf {minos_dir}") + final_gvcf = os.path.join(outdir, "final.gvcf") gvcf.gvcf_from_minos_vcf_and_samtools_gvcf( refdir.ref_fasta, final_vcf, samtools_gvcf, final_gvcf, @@ -97,9 +143,6 @@ def run( os.unlink(samtools_gvcf) gvcf.gvcf_to_fasta(final_gvcf, f"{final_gvcf}.fasta") - if not debug: - utils.syscall(f"rm -rf {minos_dir}") - if not (keep_bam or debug): os.unlink(rmdup_bam) os.unlink(rmdup_bam + ".bai") @@ -109,3 +152,5 @@ def run( raise Exception(f"Error. Final VCF file not found: {final_vcf}") logging.info(f"Finished variant calling. Final VCF file: {final_vcf}") + if not utils.vcf_has_records(final_vcf): + logging.error("NO VARIANTS FOUND! PLEASE CHECK QUALITY OF INPUT DATA")