Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisamiller committed Feb 13, 2019
0 parents commit f18d76a
Show file tree
Hide file tree
Showing 2 changed files with 340 additions and 0 deletions.
8 changes: 8 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM continuumio/miniconda3
# RUN apt-get update && apt-get install -y \
# vim \
# git
# WORKDIR /
# RUN pip install --extra-index-url https://testpypi.python.org/pypi vcf-annotation-tools
RUN pip install vcfpy pysam
COPY somatic_llr_filter.py /usr/bin/somatic_llr_filter.py
332 changes: 332 additions & 0 deletions somatic_llr_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,332 @@
import argparse
import sys
import os
import re
import vcfpy
import tempfile
import csv
import math
from collections import OrderedDict

def define_parser():
parser = argparse.ArgumentParser('somatic-llr-filter')
parser.add_argument(
"input_vcf",
help="A VCF file with at least two samples (tumor and normal) and readcount information"
)
parser.add_argument(
"output_vcf",
help="Path to write the output VCF file"
)
parser.add_argument(
"--sequence-error-rate",
help="expected sequencing error rate (range 0 to 1) - default 0.005",
type=float,
default=0.005
)
parser.add_argument(
"--tumor-purity",
help="tumor purity (range 0 to 1) - default 1",
type=float,
default=1
)
parser.add_argument(
"--normal-contamination-rate",
help="normal contamination rate (range 0 to 1) - default 0",
type=float,
default=0
)
parser.add_argument(
"--allele-depth-field",
help="field corresponding to allele depth - default AD",
default="AD"
)
parser.add_argument(
"--site-depth-field",
help="field corresponding to site depth - default DP",
default="DP"
)
parser.add_argument(
"--tumor-sample-name",
help="name of the vcf sample corresponding to the tumor - default TUMOR",
default="TUMOR"
)
parser.add_argument(
"--normal-sample-name",
help="name of the vcf sample corresponding to the normal - default NORMAL",
default="NORMAL"
)
parser.add_argument(
"--llr-field",
help="name of the vcf info field in which to store the log-likelihood ratio value",
default="LLR"
)
parser.add_argument(
"--somatic-field",
help="name of the vcf info field in which to store the classification call from this script (somatic = 0 or 1)",
default="SOMATIC"
)
parser.add_argument(
"--llr-threshold",
type=float,
help="if set, variants that are not somatic or have an LLR value below this threshold will have their FILTER field set appropriately"
)
parser.add_argument(
"--filter-field",
help="if --llr-threshold is given, then failing variants will have this string added to their FILTER field",
default="SOMATIC_LLR"
)

parser.add_argument('-w', "--overwrite", action='store_true',
help="by default, this tool will raise an exception if the LLR or SOMATIC fields already exist in the VCF. This flag allows existing fields to be overwritten."
)
return parser


def create_vcf_reader(args):
vcf_reader = vcfpy.Reader.from_path(args.input_vcf)
#do some sanity checking
sample_names = vcf_reader.header.samples.names
is_multi_sample = len(sample_names) > 1
if(not is_multi_sample):
vcf_reader.close()
raise Exception("A multisample VCF with both tumor and normal data is required")

if not args.tumor_sample_name in sample_names:
raise Exception("Could not find tumor sample name {} in sample names".format(args.tumor_sample_name))
if not args.normal_sample_name in sample_names:
raise Exception("Could not find normal sample name {} in sample names".format(args.normal_sample_name))

#check for needed format fields
if not args.allele_depth_field in vcf_reader.header.format_ids():
vcf_reader.close()
raise Exception("No " + args.allele_depth_field + " format field found. Annotate your VCF with readcounts first")
if not args.site_depth_field in vcf_reader.header.format_ids():
vcf_reader.close()
raise Exception("No {} format field found. Annotate your VCF with readcounts first".format(args.site_depth_field))

return vcf_reader


def create_vcf_writer(args, vcf_reader):
output_file = args.output_vcf

new_header = vcf_reader.header.copy()

#check/add llr field in header
if args.llr_field in vcf_reader.header.info_ids():
if args.overwrite: #verify compatibility
if not vcf_reader.header.get_info_field_info(args.llr_field).type == "Float":
vcf_reader.close()
raise Exception("{} field to be overwritten must be of type 'Float'. Either modify this, or choose a new {} field".format(args.llr_field,args.llr_field))
if not vcf_reader.header.get_info_field_info(args.llr_field).number == 1:
vcf_reader.close()
raise Exception("{} field to be overwritten must have Number '1'. Either modify this, or choose a new {} field".format(args.llr_field,args.llr_field))
else:
vcf_reader.close()
raise Exception("INFO already contains a {} field. Choose a different label, or use the --overwrite flag to retain this field description and overwrite values".format(args.llr_field))

else:
od = OrderedDict([('ID', args.llr_field), ('Number', '1'), ('Type', 'Float'), ('Description', 'log-likelihood ratio for the binomial filter call')])
new_header.add_info_line(od)

#check/add somatic field in header
if args.somatic_field in vcf_reader.header.info_ids():
if args.overwrite:
if not vcf_reader.header.get_info_field_info(args.somatic_field).type == "Flag":
vcf_reader.close()
raise Exception("{} field to be overwritten must be of type 'Flag'. Either modify this, or choose a new {} field".format(args.somatic_field,args.somatic_field))
if not vcf_reader.header.get_info_field_info(args.somatic_field).number == 0:
vcf_reader.close()
raise Exception("{} field to be overwritten must have Number '0'. Either modify this, or choose a new {} field".format(args.somatic_field,args.somatic_field))
else:
vcf_reader.close()
raise Exception("INFO already contains a {} field. Choose a different label, or use the --overwrite flag to retain this field description and overwrite values".format(args.somatic_field))
else:
od = OrderedDict([('ID', args.somatic_field), ('Number', '0'), ('Type', 'Flag'), ('Description', 'Is a somatic mutation')])
new_header.add_info_line(od)

#check/add FILTER field in header
if args.llr_threshold is not None: #filtering may not even be specified
if args.filter_field in vcf_reader.header.filter_ids():
if not args.overwrite:
vcf_reader.close()
raise Exception("FILTER {} already exists. Choose a different --filter-field, or use the --overwrite flag to retain this filter description and overwrite values".format(args.filter_field))
else:
od = OrderedDict([('ID', args.filter_field), ('Description', 'Is a somatic mutation with LLR greater than {}'.format(args.llr_threshold))])
new_header.add_filter_line(od)


return vcfpy.Writer.from_path(output_file, new_header)


#for each call possibility (ref, germ het, germ hom, etc)
#calculate the llr here
def calc_llr(normal_expect, tumor_expect, normal_ref, normal_var, tumor_ref, tumor_var):
return(normal_ref * math.log(1-normal_expect) +
normal_var * math.log(normal_expect) +
tumor_ref * math.log(1-tumor_expect) +
tumor_var * math.log(tumor_expect))

def get_llr(call, normal_ref, normal_var, tumor_ref, tumor_var, error_rate, heterozygous_expect,
homozygous_expect, tumor_freq, tumor_purity, normal_contamination_rate, error_expect):
if call == "Germline_het":
return(calc_llr(heterozygous_expect, heterozygous_expect,
normal_ref, normal_var, tumor_ref, tumor_var))
elif call == "Germline_hom":
return(calc_llr(homozygous_expect, homozygous_expect,
normal_ref, normal_var, tumor_ref, tumor_var))
elif call == "LOH_ref":
return(calc_llr(heterozygous_expect, error_rate,
normal_ref, normal_var, tumor_ref, tumor_var))
elif call == "LOH_variant":
return(calc_llr(heterozygous_expect, homozygous_expect,
normal_ref, normal_var, tumor_ref, tumor_var))
elif call == "NotSomatic":
return(calc_llr(error_expect, error_expect,
normal_ref, normal_var, tumor_ref, tumor_var))
elif call == "Reference":
return(calc_llr(error_rate, error_rate,
normal_ref, normal_var, tumor_ref, tumor_var))
elif call == "Somatic":
return(calc_llr(error_rate + (tumor_freq / tumor_purity * normal_contamination_rate), tumor_freq,
normal_ref, normal_var, tumor_ref, tumor_var))
raise Exception('Call "' + call + '" is not a recognized type')


def make_call(normal_ref, normal_var, tumor_ref, tumor_var, error_rate, heterozygous_expect,
homozygous_expect, tumor_freq, tumor_purity, normal_contamination_rate, error_expect):
#Want to test several models and pick the most likely one
#Reference: normal expectation is 0.001 and tumor expectation is 0.01
#Germline Het: normal expectation is 0.5 and tumor expectation is 0.5
#Somatic Het: normal expectation is 0.001 and tumor expectation in 0.5
#Germline Homozygote: normal expectation is 0.999 and tumor expectation is 0.999
#LOH (variant): germline is 0.5 and tumor is 0.999
#LOH (ref): germline is 0.5 and tumor is 0.001

marginal_probability = None
max_llr = None
max2_llr = None
max_call = None
max2_call = None
llr = 0

if(tumor_var == 0): #special case that chokes some of the log code. LLR is always zero if we have no supp reads
return([0,"Reference"])

call_types = ["Germline_het", "Germline_hom", "LOH_ref", "LOH_variant", "NotSomatic", "Reference", "Somatic"];
for call in call_types:
llr = get_llr(call, normal_ref, normal_var, tumor_ref, tumor_var, error_rate, heterozygous_expect,
homozygous_expect, tumor_freq, tumor_purity, normal_contamination_rate, error_expect)
if marginal_probability is None:
marginal_probability=llr
elif llr > marginal_probability:
marginal_probability = llr + math.log(1 + math.exp(marginal_probability-llr))
else:
marginal_probability = marginal_probability + math.log(1 + math.exp(llr-marginal_probability))

max_llr_def = (max_llr is not None)
max2_llr_def = (max2_llr is not None)

if not (max_llr_def and (llr < max_llr)):
#save the second-place, store the new best
max2_call = max_call
max2_llr = max_llr
max_llr = llr
max_call = call
elif not (max2_llr_def and (llr < max2_llr)):
max2_call = call
max2_llr = llr
#end for

#get the final llr
if max_llr is not None:
llr = max_llr-max2_llr
else:
llr = 0

#get the final call
if max_call is None:
max_call = "-"

return (llr,max_call)



def main(args_input = sys.argv[1:]):
parser = define_parser()
args = parser.parse_args(args_input)

#these aren't going to change per site
homozygous_expect = 1 - args.sequence_error_rate;
heterozygous_expect = 0.5;

vcf_reader = create_vcf_reader(args)
vcf_writer = create_vcf_writer(args, vcf_reader)

for entry in vcf_reader:
#collect the needed info
ref = entry.REF
alts = entry.ALT

#this code will mostly handle multiple alleles, as written, but the issue is that there is
#no way to set the INFO (SOMATIC) field appropriately when there are multiple alleles per line,
# so we're going to restrict it to decomposed VCFs - one variant per line
if len(alts) > 1:
raise Exception("site with multiple alleles detected. This tool requires a decomposed vcf (one allele per line) so that INFO fields can be set appropriately")

def getFormatField(sample_name, field_name):
if(sample_name in entry.call_for_sample and field_name in entry.call_for_sample[sample_name].data):
return entry.call_for_sample[sample_name].data[field_name]
raise Exception("Field {} missing in an entry for Sample {}".format(field_name, sample_name))


ad_nrm = getFormatField(args.normal_sample_name,args.allele_depth_field)
ad_tum = getFormatField(args.tumor_sample_name,args.allele_depth_field)
normal_depth = getFormatField(args.tumor_sample_name,args.site_depth_field)
tumor_depth = getFormatField(args.tumor_sample_name,args.site_depth_field)
normal_ref = ad_nrm[0]
tumor_ref = ad_tum[0]

#TODO parse out per alt, retrieve calls
call = []
for i in range(1,(len(alts)+1)):
normal_var = ad_nrm[i]
tumor_var = ad_tum[i]

#weighted average of the frequencies
error_expect = (tumor_var + normal_var)/(tumor_depth + normal_depth)
#dave had this line in there, but I don't know why...
#error_expect ||= error_rate
if error_expect == 1:
error_expect = 1 - args.sequence_error_rate

tumor_freq = tumor_var/tumor_depth
if tumor_freq == 0:
tumor_freq = args.sequence_error_rate
elif tumor_freq == 1:
tumor_freq = tumor_freq - args.sequence_error_rate

(llr,call) = make_call(normal_ref, normal_var, tumor_ref, tumor_var, args.sequence_error_rate, heterozygous_expect,
homozygous_expect, tumor_freq, args.tumor_purity, args.normal_contamination_rate, error_expect)

#Store it back in the entry before writing out.
if call == "Somatic":
entry.INFO[args.somatic_field] = 1
else:
entry.INFO.pop('SOMATIC',None)

entry.INFO[args.llr_field] = llr

#do filtering if specified
if args.llr_threshold is not None:
if (not call == "Somatic") or (llr < args.llr_threshold):
entry.FILTER.append(args.filter_field);

vcf_writer.write_record(entry)

vcf_reader.close()
vcf_writer.close()

if __name__ == '__main__':
main()

0 comments on commit f18d76a

Please sign in to comment.