diff --git a/preprocessor/preprocessor/utilities.py b/preprocessor/preprocessor/utilities.py index affda6a8..901a7da5 100644 --- a/preprocessor/preprocessor/utilities.py +++ b/preprocessor/preprocessor/utilities.py @@ -988,7 +988,7 @@ def extract_pgx_variants(pharmcat_positions: Path, reference_fasta: Path, vcf_fi # update info ref_info = list(set([x[7] for x in ref_pos_static[input_chr_pos].values()])) - if fields[7] != '.': + if fields[7] != '.' and fields[7] not in ref_info: ref_info.append(fields[7]) updated_info = ';'.join(ref_info) diff --git a/preprocessor/tests/helpers.py b/preprocessor/tests/helpers.py index 8a7355f0..7dcf6464 100644 --- a/preprocessor/tests/helpers.py +++ b/preprocessor/tests/helpers.py @@ -99,6 +99,29 @@ def compare_vcf_files(expected: Path, tmp_dir: Path, basename: str, sample: str assert orig_actual_vcf in results, ('%s not in results (%s)' % (actual, results)) if copy_to_test_dir: shutil.copyfile(actual, actual.parent / actual.name) - print(read_vcf(actual)) - assert read_vcf(expected) == read_vcf(actual), '%s mismatch' % key + # compare vcfs line by line + expected_lines = read_vcf(expected).split('\n') + actual_lines = read_vcf(actual).split('\n') + + if len(expected_lines) != len(actual_lines): + assert False, f'mismatching number of lines between {expected} and {actual}' + + for expected_line, actual_line in zip(expected_lines, actual_lines): + columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Samples'] + expected_fields = expected_line.split('\t') + actual_fields = actual_line.split('\t') + + if len(expected_fields) != len(actual_fields): + assert False, f'mismatching number of samples between {expected} and {actual}' + + for i, col in enumerate(columns): + # compare the ALT alleles + if i == 4 and set(actual_fields[3]) != set(expected_fields[3]): + assert False, f'mismatching {col}:\nexpected: {expected_line}\nactual: {actual_line}' + # compare genotypes + if i == 9 and actual_fields[9:] != expected_fields[9:]: + assert False, f'mismatching {col}:\nexpected: {expected_line}\nactual: {actual_line}' + # compare the rest + if actual_line[i] != expected_line[i]: + assert False, f'mismatching {col}:\nexpected: {expected_line}\nactual: {actual_line}' diff --git a/preprocessor/tests/test_utilities.py b/preprocessor/tests/test_utilities.py index b3cd31be..3749e75e 100644 --- a/preprocessor/tests/test_utilities.py +++ b/preprocessor/tests/test_utilities.py @@ -1,6 +1,7 @@ import os import shutil import tempfile +import random from pathlib import Path from timeit import default_timer as timer from typing import List @@ -546,24 +547,52 @@ def test_export_single_sample_concurrent(): def test_absent_and_unspecified_to_ref(): reference_fasta: Path = helpers.get_reference_fasta(helpers.pharmcat_positions_file) - vcf_file = helpers.test_dir / 'raw_unspecified_genotypes.vcf.bgz' - expected_file = helpers.test_dir / 'test.unspecified_to_ref.absent_to_ref.vcf' + expected_file = helpers.pharmcat_positions_file with tempfile.TemporaryDirectory() as td: - # copy the input test file to the temporary folder + # copy the expected file to the temporary folder tmp_dir = Path(td) - tmp_vcf = tmp_dir / vcf_file.name - shutil.copyfile(vcf_file, tmp_vcf) - # copy the expected output files to the temporary folder - expected_vcf = tmp_dir / expected_file.name - shutil.copyfile(expected_file, expected_vcf) + expected_bgz = tmp_dir / 'pharmcat_positions.vcf.gz' + shutil.copyfile(expected_file, expected_bgz) + raw_lines = helpers.read_vcf(expected_bgz, bgzipped=True, skip_comments=False).split('\n') + + # convert "0/0" to "0|0" to avoid irrelevant mismatching error + expected_vcf = tmp_dir / 'random_pharmcat_positions.vcf' + expected_lines = [] + with open(expected_vcf, 'w') as f: + for line in raw_lines: + if not line.startswith('#'): + fields = line.split('\t') + fields[9] = '0|0' + line = '\t'.join(fields) + f.write(line + '\n') + expected_lines.append(line) + + # create the test file by randomly knocking down positions or genotypes + test_file = tmp_dir / 'random_absent_and_unspecified_genotypes.vcf' + random.seed(37) + with open(test_file, 'w') as f: + for line in expected_lines: + if line.startswith('#'): # metadata and the header line + f.write(line + '\n') + else: + fields = line.split('\t') + rnd_int = random.randrange(10) + if rnd_int == 1: # make 10% of the positions absent + continue + if rnd_int == 2: # convert 10% of the positions to unspecified genotypes + fields[9] = './.' + line = '\t'.join(fields) + f.write(line + '\n') + test_vcf = utils.bgzip_file(test_file, True) + # copy the necessary helper files to the temporary folder shutil.copyfile(preprocessor.CHR_RENAME_FILE, tmp_dir / preprocessor.CHR_RENAME_MAP_FILENAME) # run the utility function to convert absent positions to reference - basename = 'test1' - samples = ['Sample_1', 'Sample_2'] - input_basename = tmp_vcf.name - preprocessor.preprocess(helpers.pharmcat_positions_file, reference_fasta, [tmp_vcf], samples, + basename = 'test_absent_and_unspecified_to_ref' + samples = ['PharmCAT'] + input_basename = test_vcf.name + preprocessor.preprocess(helpers.pharmcat_positions_file, reference_fasta, [test_vcf], samples, input_basename, tmp_dir, basename, absent_to_ref=True, unspecified_to_ref=True) helpers.compare_vcf_files(expected_vcf, tmp_dir, basename)