Clinical-Genomics · hassanfa · Nov 4, 2020 · Nov 3, 2020 · Nov 3, 2020 · Nov 3, 2020
@@ -3,6 +3,7 @@
 
 from BALSAMIC.utils.rule import get_conda_env
 from BALSAMIC.utils.rule import get_threads
+from BALSAMIC.utils.workflowscripts import get_densityplot
 
 ## UmiAwareMarkDuplicatesWithMateCigar - umimetrics
 rule picard_umiaware:
@@ -134,7 +135,7 @@ rule bcftools_query_calculatenoiseAF_umi:
 source activate {params.conda};
 
 bcftools query \
--f \"%CHROM:%POS:REF->%ALT\\t[%AF]\\n\" \
+-f \"%CHROM:%POS:%REF->%ALT\\t[%AF]\\n\" \
 {input.umiextract} > \
 {output.umiextractAF};
 
@@ -153,8 +154,7 @@ END {{print(\"NoiseAF: \"(sum1/f1_nr)/(sum2/(NR-f1_nr)))}}' \
 {output.noiseAF}
         """
 
-
-# Plot the TNscope calculated AFs befor and after consensuscall
+# Plot the TNscope calculated AFs before and after consensuscall
 rule seaborn_densityplot_umi:
     input:
         table1 = qc_dir + "{sample}.TNscope.umialign.AF.txt",
@@ -170,17 +170,4 @@ rule seaborn_densityplot_umi:
     message:
         "Density AF plots for sample {params.sample_id}"
     run:
-        import matplotlib.pyplot as plt
-        import pandas as pd
-        import seaborn as sns
-        umi= pd.read_csv(input.table1, sep='\t', header=None)
-        umi.columns = ['id', 'AF']
-        umi['method'] = 'umiextract'
-        con = pd.read_csv(input.table2, sep='\t', header=None)
-        con.columns = ['id', 'AF']
-        con['method'] = 'consensuscall'
-        sns.kdeplot(umi['AF'], color='r', shade=True, Label=set(umi['method']))
-        sns.kdeplot(con['AF'], color='b', shade=True, Label=set(con['method']))
-        plt.xlabel('Allelic Frequency (AF)')
-        plt.ylabel('Probability Density')
-        return (plt.savefig(output.AF_plot))
+        get_densityplot(input.table1, input.table2, "umiextracted", "consensuscalled", output.AF_plot)
@@ -0,0 +1,44 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def get_file_contents(input_file, prefix_name):
+    """ Reads a headerless 2-column tsv file and returns it with header names.
+
+    Arguments:
+    input_file: Path to the TSV file (first column 'id', second column 'AF')
+    prefix_name: Value written to the added 'method' column of every row
+
+    Returns:
+    pandas.DataFrame with the columns 'id', 'AF' and 'method'
+
+    Raises:
+    ValueError: if the file does not hold exactly two tab-separated columns
+    """
+
+    input_df = pd.read_csv(input_file, sep='\t', header=None)
+    # Name the offending file up front; assigning two column names to a frame
+    # of another width would only raise pandas' generic "Length mismatch".
+    n_columns = input_df.shape[1]
+    if n_columns != 2:
+        raise ValueError(
+            "Expected 2 tab-separated columns in {}, found {}".format(
+                input_file, n_columns))
+    input_df.columns = ['id', 'AF']
+    input_df['method'] = prefix_name
+    return input_df
+
+
+def get_densityplot(input_file1, input_file2, prefix_name1, prefix_name2,
+                    output_file):
+    """ Plots the AF density of two TSV files on shared axes and saves it.
+
+    Arguments:
+    input_file1: Path to the TSV file1 (e.g: AFs calculated without consensuscall step)
+    input_file2: Path to the TSV file2 (e.g: AFs calculated after consensuscall step)
+    prefix_name1: Label legend w.r.t file1 (e.g: "umiextract")
+    prefix_name2: Label legend w.r.t file2 (e.g: "consensuscall")
+    output_file: Path to output filename with '.pdf' extn (e.g: "outplot.pdf")
+
+    Returns:
+    output_file, unchanged, once the figure has been written to it
+    """
+
+    dataframe1 = get_file_contents(input_file1, prefix_name1)
+    dataframe2 = get_file_contents(input_file2, prefix_name2)
+
+    # Draw on a dedicated figure instead of pyplot's implicit current one, so
+    # curves from an earlier call in the same process never end up in this plot.
+    fig, ax = plt.subplots()
+    try:
+        # Labels are passed as plain strings; a set would be stringified for
+        # the legend, braces and quotes included.
+        sns.kdeplot(dataframe1['AF'],
+                    color='r',
+                    shade=True,
+                    label=prefix_name1,
+                    ax=ax)
+        sns.kdeplot(dataframe2['AF'],
+                    color='b',
+                    shade=True,
+                    label=prefix_name2,
+                    ax=ax)
+        ax.set_xlabel('Allelic Frequency (AF)')
+        ax.set_ylabel('Probability Density')
+        ax.legend()
+        fig.savefig(output_file)
+    finally:
+        # Release the figure even when plotting or saving fails.
+        plt.close(fig)
+    return output_file
@@ -13,7 +13,7 @@ psutil>=5.7.0
 pydantic>=1.5.1
 pygments>=2.6.1
 pyyaml>=5.3.1
-seaborn>=0.11.0
+seaborn==0.10.1
 six>=1.12.0
 snakemake==5.13.0
 yapf>=0.30.0
@@ -0,0 +1,10 @@
+1:124845:C->CA	0.667
+1:458764:C->G	0.6
+1:458793:G->A	0.6
+1:569492:T->C	1
+1:753405:C->A	1
+1:753425:T->C	1
+1:753474:C->G	1
+1:757896:C->A	1
+1:757936:C->A	1
+1:808223:G->C	1
@@ -0,0 +1,10 @@
+1:21502724:TGCC->T	0.75
+1:23530237:T->C	0.455
+1:27023140:TGGC->T	0.008
+1:27023450:AGCG->A	0.006
+1:27023503:C->A	0.526
+1:27058056:G->A	0.002
+1:27089670:C->T	0.003
+1:27098991:C->T	0.003
+1:27100181:CGCA->C	0.051
+1:27101370:G->A	0.003
@@ -0,0 +1,10 @@
+1:124845:C->CA
+1:458764:C->G
+1:458793:G->A
+1:569492:T->C
+1:753405:C->A
+1:753425:T->C
+1:753474:C->G
+1:757896:C->A
+1:757936:C->A
+1:808223:G->C
@@ -27,6 +27,7 @@
     get_variant_callers, get_script_path, get_result_dir, get_threads,
     get_delivery_id, get_reference_output_files)
 
+from BALSAMIC.utils.workflowscripts import get_file_contents, get_densityplot
 
 def test_get_variant_callers_wrong_analysis_type(tumor_normal_config):
     # GIVEN a wrong analysis_type
@@ -721,3 +722,43 @@ def test_get_fastq_bind_path(tmpdir_factory):
     create_fastq_symlink(casefiles=casefiles, symlink_dir=symlink_to_path)
     #THEN function returns list containing the original parent path!
     assert get_fastq_bind_path(symlink_to_path) == [symlink_from_path]
+
+
+def test_get_file_contents():
+    """get_file_contents labels the frame with 'id', 'AF' and 'method'."""
+    # GIVEN a test input file and the header it should end up with
+    tsv_path = 'tests/test_data/densityplots/dummy_file1.txt'
+    expected_columns = ['id', 'AF', 'method']
+
+    # WHEN reading the file with a method label
+    contents = get_file_contents(tsv_path, 'umi')
+    found_columns = contents.columns
+
+    # THEN the column names match and there are exactly three of them
+    assert all(found_columns == expected_columns)
+    assert len(found_columns) == 3
+
+def test_get_wrongfile_contents():
+    """get_file_contents raises ValueError for the malformed fixture file."""
+    # GIVEN a malformed test input file
+    test_wrongfile = 'tests/test_data/densityplots/dummy_wrongfile.txt'
+
+    # WHEN invoking function
+    # THEN a ValueError is raised; nothing may follow the call inside the
+    # block, since statements after the raising line never execute
+    with pytest.raises(ValueError):
+        get_file_contents(test_wrongfile, 'umi')
+
+def test_get_densityplot():
+    """get_densityplot writes the plot file and returns its path."""
+    import tempfile  # local import: only this test needs it
+
+    # GIVEN prefix names and input files
+    test_file1 = 'tests/test_data/densityplots/dummy_file1.txt'
+    test_file2 = 'tests/test_data/densityplots/dummy_file2.txt'
+    name1 = 'testnam1'
+    name2 = 'testnam2'
+
+    # Write into a throwaway directory: a plot left in tests/test_data by an
+    # earlier run would satisfy the existence check without any new output.
+    with tempfile.TemporaryDirectory() as out_dir:
+        out_file = str(Path(out_dir) / 'dummy_plot.pdf')
+
+        # WHEN invoking function out_file is created
+        test_result = get_densityplot(test_file1, test_file2, name1, name2,
+                                      out_file)
+
+        # THEN the plot exists and the returned path points at it
+        assert Path(out_file).exists()
+        assert Path(test_result).name == "dummy_plot.pdf"