Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle cortex fails in variant call script #115

Merged
merged 1 commit into from
Jul 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/clockwork/gvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,8 @@ def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, ou
print(samtools_record, file=f_out)
ref_pos = samtools_record.POS + 1

_finish_contig(ref_pos, ref_seq, minos_record, minos_iter, f_out)
if ref_seq is not None: # happens if the samtools VCF was empty
_finish_contig(ref_pos, ref_seq, minos_record, minos_iter, f_out)
_print_non_samtools_seqs(ref_seqs, used_ref_seqs, minos_records, f_out)


Expand Down
22 changes: 22 additions & 0 deletions python/clockwork/tests/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,25 @@ def test_sam_record_count(self):

self.assertEqual(42, utils.sam_record_count(tmpfile))
os.unlink(tmpfile)

def test_vcf_has_records(self):
"""test vcf_has_records"""
tmpfile = "tmp.vcf_has_records.vcf"
with open(tmpfile, "w") as f:
print("#foo", file=f)
self.assertFalse(utils.vcf_has_records(tmpfile))
with open(tmpfile, "a") as f:
print("ref\t42\t.\tA\tG\t.\tPASS\t.", file=f)
self.assertTrue(utils.vcf_has_records(tmpfile))
os.unlink(tmpfile)

def test_file_has_at_least_one_line(self):
"""test file_has_at_least_one_line"""
tmpfile = "tmp.file_has_at_least_one_line"
with open(tmpfile, "w") as f:
pass
self.assertFalse(utils.file_has_at_least_one_line(tmpfile))
with open(tmpfile, "w") as f:
print("hello there", file=f)
self.assertTrue(utils.file_has_at_least_one_line(tmpfile))
os.unlink(tmpfile)
30 changes: 24 additions & 6 deletions python/clockwork/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import subprocess
import sys

import pyfastaq


def decode(x):
try:
Expand Down Expand Up @@ -121,10 +123,26 @@ def make_empty_file(filename):

def sam_record_count(filename):
"""Returns number of sam records in file"""
completed_process = syscall(r"""grep -c -v '^@' """ + filename)
try:
count = int(completed_process.stdout.rstrip())
except:
raise Exception("Error counting sam records in file " + filename)

count = 0
with open(filename) as f:
for line in f:
if not line.startswith("@"):
count += 1
return count


def vcf_has_records(filename):
"""Returns true if there is at least 1 record in VCF file"""
with open(filename) as f:
for line in f:
if not line.startswith("#"):
return True
return False


def file_has_at_least_one_line(filename):
"""Returns true if there is at least 1 line in the file. Can be gzipped"""
f = pyfastaq.utils.open_file_read(filename)
has_line = any(f)
f.close()
return has_line
87 changes: 66 additions & 21 deletions python/clockwork/var_call_one_sample_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,20 @@
from clockwork import cortex, gvcf, read_map, read_trim, reference_dir, utils


def check_reads_files(read_files):
ok = True
for filename in read_files:
if os.path.exists(filename):
if not utils.file_has_at_least_one_line(filename):
logging.error(f"NO READS IN FILE: {filename}")
ok = False
else:
logging.error(f"READS FILE NOT FOUND: {filename}")
ok = False

return ok


def run(
reads1_list,
reads2_list,
Expand All @@ -19,6 +33,10 @@ def run(
raise Exception(
"Must give same number of forward and reverse reads files. Got:\nForward:{reads1_list}\nReverse:{reads2_list}"
)
if not check_reads_files(reads1_list + reads2_list):
raise Exception("Reads file(s) not found or are empty. See previous error messages. Cannot continue")
if not os.path.exists(ref_dir):
raise FileNotFoundError(f"Reference directory not found: {ref_dir}. Cannot continue")

os.mkdir(outdir)

Expand Down Expand Up @@ -57,6 +75,13 @@ def run(
samtools_vcf = os.path.join(outdir, "samtools.vcf")
cmd = f"bcftools mpileup --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -vm -O v -o {samtools_vcf}"
utils.syscall(cmd)
samtools_vcf_has_vars = utils.vcf_has_records(samtools_vcf)
if not samtools_vcf_has_vars:
logging.warn("SAMTOOLS VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA")

samtools_gvcf = os.path.join(outdir, "samtools.gvcf")
cmd = f"bcftools mpileup -I --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -c -O v -o {samtools_gvcf}"
utils.syscall(cmd)

cortex_dir = os.path.join(outdir, "cortex")
ctx = cortex.CortexRunCalls(
Expand All @@ -66,29 +91,50 @@ def run(
sample_name,
mem_height=cortex_mem_height,
)
ctx.run(run_mccortex_view_kmers=False)
try:
ctx.run(run_mccortex_view_kmers=False)
except:
logging.error("ERROR RUNNING CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK CORTEX LOGS")

ctx_vcf_dir = os.path.join(cortex_dir, "cortex.out", "vcfs")
cortex_vcfs = [
os.path.join(ctx_vcf_dir, x)
for x in os.listdir(ctx_vcf_dir)
if x.endswith("raw.vcf")
]
try:
cortex_vcfs = [
os.path.join(ctx_vcf_dir, x)
for x in os.listdir(ctx_vcf_dir)
if x.endswith("raw.vcf")
]
except:
cortex_vcfs = []

cortex_vcf_has_vars = False
if len(cortex_vcfs) != 1:
raise Exception("Error running cortex. Could not find output VCF file")
cortex_vcf = os.path.join(outdir, "cortex.vcf")
os.rename(cortex_vcfs[0], cortex_vcf)
if not debug:
utils.syscall(f"rm -rf {cortex_dir}")
logging.error("NO VCF FILE MADE BY CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK THE QUALITY OF INPUT DATA, AND CORTEX LOGS")
cortex_vcf = ""
else:
cortex_vcf = os.path.join(outdir, "cortex.vcf")
os.rename(cortex_vcfs[0], cortex_vcf)
cortex_vcf_has_vars = utils.vcf_has_records(cortex_vcf)
if not cortex_vcf_has_vars:
logging.warn("CORTEX VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA")
if not debug:
utils.syscall(f"rm -rf {cortex_dir}")

minos_dir = os.path.join(outdir, "minos")
cmd = f"minos adjudicate --reads {rmdup_bam} {minos_dir} {refdir.ref_fasta} {samtools_vcf} {cortex_vcf}"
utils.syscall(cmd)
final_vcf = os.path.join(outdir, "final.vcf")
os.rename(os.path.join(minos_dir, "final.vcf"), final_vcf)

samtools_gvcf = os.path.join(outdir, "samtools.gvcf")
cmd = f"bcftools mpileup -I --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -c -O v -o {samtools_gvcf}"
utils.syscall(cmd)
if not (samtools_vcf_has_vars and cortex_vcf_has_vars):
logging.error("NO VARIANTS FOUND BY CORTEX OR SAMTOOLS. WRITING HEADER-ONLY FINAL VCF FILE INSTEAD OF RUNNING MINOS")
with open(final_vcf, "w") as f:
print("##fileformat=VCFv4.2", file=f)
print("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "sample", sep="\t", file=f)
minos_vcf_has_vars = False
else:
minos_dir = os.path.join(outdir, "minos")
cmd = f"minos adjudicate --reads {rmdup_bam} {minos_dir} {refdir.ref_fasta} {samtools_vcf} {cortex_vcf}"
utils.syscall(cmd)
os.rename(os.path.join(minos_dir, "final.vcf"), final_vcf)
if not debug:
utils.syscall(f"rm -rf {minos_dir}")

final_gvcf = os.path.join(outdir, "final.gvcf")
gvcf.gvcf_from_minos_vcf_and_samtools_gvcf(
refdir.ref_fasta, final_vcf, samtools_gvcf, final_gvcf,
Expand All @@ -97,9 +143,6 @@ def run(
os.unlink(samtools_gvcf)
gvcf.gvcf_to_fasta(final_gvcf, f"{final_gvcf}.fasta")

if not debug:
utils.syscall(f"rm -rf {minos_dir}")

if not (keep_bam or debug):
os.unlink(rmdup_bam)
os.unlink(rmdup_bam + ".bai")
Expand All @@ -109,3 +152,5 @@ def run(
raise Exception(f"Error. Final VCF file not found: {final_vcf}")

logging.info(f"Finished variant calling. Final VCF file: {final_vcf}")
if not utils.vcf_has_records(final_vcf):
logging.error("NO VARIANTS FOUND! PLEASE CHECK QUALITY OF INPUT DATA")