From a4b6f334294797ca87db4b70af2cd7f406ff92f3 Mon Sep 17 00:00:00 2001 From: George Tollefson <gatollefson@gmail.com> Date: Thu, 1 Nov 2018 16:12:35 -0400 Subject: [PATCH 1/6] finished most tests. Fixed siglist filter. --- src/ViVa.jl | 2 +- src/vcf_utils_complete.jl | 13 +- test/.DS_Store | Bin 8196 -> 8196 bytes test/new_vcf_utils.jl | 244 ++++++++---------- ...olumn_list.txt => select_samples_list.txt} | 0 viva_cli.jl | 13 +- 6 files changed, 124 insertions(+), 148 deletions(-) rename test/test_files/{select_column_list.txt => select_samples_list.txt} (100%) diff --git a/src/ViVa.jl b/src/ViVa.jl index 67780f7..be303b2 100644 --- a/src/ViVa.jl +++ b/src/ViVa.jl @@ -4,7 +4,7 @@ using DataFrames #use CSV.jl ? depwarnings using PlotlyJS using Rsvg using Blink -using CSV +#using CSV using GeneticVariation using ArgParse using VCFTools diff --git a/src/vcf_utils_complete.jl b/src/vcf_utils_complete.jl index 68b749b..fb8964e 100644 --- a/src/vcf_utils_complete.jl +++ b/src/vcf_utils_complete.jl @@ -51,7 +51,6 @@ function sort_genotype_array(genotype_array) data=genotype_array[:,3:size(genotype_array,2)] chrom_positions = [parse(Int, i) for i in genotype_array[:,1:2]] genotype_array = hcat(chrom_positions,data) - genotype_array = sortrows(genotype_array, by=x->(x[1],x[2])) return genotype_array @@ -122,16 +121,18 @@ function io_sig_list_vcf_filter(sig_list,vcf_filename) pos=(sig_list[row,2]) reader = VCF.Reader(open(vcf_filename, "r")) - + tic() for record in reader if typeof(VCF.chrom(record)) == String chr = string(chr) - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) push!(vcf_subarray,record) end + + end#remove this if need code below +#= else if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) @@ -139,7 +140,9 @@ function io_sig_list_vcf_filter(sig_list,vcf_filename) end end + =# end + toc() end return vcf_subarray @@ -367,9 +370,7 @@ convert sub from variant filters to gt_num_array and gt_chromosome_labels for pl """ function combined_all_genotype_array_functions(sub) genotype_array = generate_genotype_array(sub,"GT") - map!(s->replace(s, "chr", ""), genotype_array, genotype_array) - clean_column1!(genotype_array) genotype_array=ViVa.sort_genotype_array(genotype_array) geno_dict = define_geno_dict() @@ -508,7 +509,7 @@ function get_sample_names(reader) end """ - find_group_label_indices(pheno) + find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by) find indices and determines names for group 1 and group 2 labels on plots. finds index of center of each sample group to place tick mark and label. """ function find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by) diff --git a/test/.DS_Store b/test/.DS_Store index e9bcd34daba8bc5a670b0d044f29f56de51d96ab..c02d3ce4ddfb142ddb2f95a94384b282c69c667d 100644 GIT binary patch delta 16 XcmZp1XmQxES8(!r0m;qR1oij;I<5vt delta 18 ZcmZp1XmQxESCEl$@&*Bk&DR9=_y9oD21)<` diff --git a/test/new_vcf_utils.jl b/test/new_vcf_utils.jl index a0ee61b..a0825e9 100644 --- a/test/new_vcf_utils.jl +++ b/test/new_vcf_utils.jl @@ -26,18 +26,21 @@ sample_names = get_sample_names(reader) @test df[3,1] == 2 end +#functions for variant filters + @testset "io_chromosome_range_vcf_filter" begin sub = io_chromosome_range_vcf_filter("chr4:0-400000000",reader) println(sub[1:2]) println(size(sub,2)) end -@testset "io_sig_list_vcf_filter" begin +#= +@testset "filters_with_siglist" begin @testset "load_siglist" begin - sig_list=load_siglist("test_files/significantList_for_proteinstructures.csv") - println(sig_list[2:1]) - println(size(sig_list,1)) + sig_list=load_siglist("test_files/significantList_for_proteinstructures.csv") + println(sig_list[2:1]) + println(size(sig_list,1)) @testset "clean_column1_siglist!" begin clean_column1_siglist!(sig_list) @@ -45,170 +48,145 @@ end println(size(sig_list,1)) end - sub=io_sig_list_vcf_filter(sig_list,vcf_filename) - println(sub[1,5]) - - @testset "pass_chrrange_siglist_filter" begin - sub = pass_chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000") - println(sub[1,5]) + @testset "io_sig_list_vcf_filter" begin + sub=io_sig_list_vcf_filter(sig_list,vcf_filename) + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 13 + end - @testset pass_siglist_filter begin - sub = pass_siglist_filter(vcf_filename,sig_list) - end + @testset "pass_chrrange_siglist_filter" begin + sub = pass_chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000") + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 12 + end - @testset "chrrange_siglist_filter" begin - sub = chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000") - end + @testset "pass_siglist_filter" begin + sub = pass_siglist_filter(vcf_filename, sig_list) + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 12 + end + @testset "chrrange_siglist_filter" begin + sub = chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000") + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 13 + end - end + end end -end +=# @testset "io_pass_filter" begin + reader = VCF.Reader(open(vcf_filename, "r")) sub = io_pass_filter(reader) - println(sub[2,1]) + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 1164 end @testset "pass_chrrange_filter" begin + reader = VCF.Reader(open(vcf_filename, "r")) sub = pass_chrrange_filter(reader,"chr4:0-400000000") + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 856 end - - - - - -#= -#functions for variant filters - - #functions for converting vcf record array to numerical array +@testset "combined_all_genotype_array_functions" begin -""" - create_chr_dict() -creates dict for use in combined_all_genotype_array_functions() for removing 'chr' from chromosome labels to allow sorting variant records by chromosome position. -""" -@testset create_chr_dict() begin - -end - -""" - combined_all_genotype_array_functions(sub) -convert sub from variant filters to gt_num_array and gt_chromosome_labels for plot functions. -""" -@testset combined_all_genotype_array_functions(sub) begin - -end - -""" - combined_all_read_depth_array_functions(sub) -convert sub from variant filters to dp_num_array and dp_chromosome_labels for plot functions. -""" -@testset combined_all_read_depth_array_functions(sub) begin - -end - -""" - generate_genotype_array(record_sub::Array{Any,1},genotype_field::String) -Returns numerical array of genotype values (either genotype or read_depth values) which are translated by another function into num_array -Where genotype_field is either GT or DP to visualize genotype or read_depth -""" -@testset generate_genotype_array(record_sub::Array{Any,1},y) begin - -end - -""" - define_geno_dict() -returns dictionary of values for use in replace_genotype_with_vals() -""" -@testset define_geno_dict() begin +reader = VCF.Reader(open(vcf_filename, "r")) +sub = io_pass_filter(reader) -end +gt_num_array,gt_chromosome_labels=combined_all_genotype_array_functions(sub) +println(typeof(gt_num_array)) +println(length(gt_num_array)) +println(typeof(gt_chromosome_labels)) +println(length(gt_chromosome_labels)) -""" - translate_genotype_to_num_array(genotype_array,geno_dict) -returns a tuple of num_array for plotting, and chromosome labels for plotting as label bar. -Translates array of genotype values to numerical array of categorical values. -Genotype values are converted to categorical values. No_call=0, 0/0=400, heterozygous_variant=600, homozygous_variant=800 -""" -@testset translate_genotype_to_num_array(genotype_array,geno_dict) begin + @testset "generate_genotype_array" begin + reader = VCF.Reader(open(vcf_filename, "r")) + sub = io_pass_filter(reader) + genotype_array=generate_genotype_array(sub,"GT") + println(typeof(genotype_array)) + println(length(genotype_array)) + println(genotype_array[3:5]) + + @testset "define_geno_dict" begin + geno_dict = define_geno_dict() + println(typeof(geno_dict)) + println(length(geno_dict)) + + @testset "translate_genotype_to_num_array" begin + gt_num_array,gt_chromosome_labels=translate_genotype_to_num_array(genotype_array,geno_dict) + println(typeof(gt_num_array)) + println(length(gt_num_array)) + println(typeof(gt_chromosome_labels)) + println(length(gt_chromosome_labels)) + end + end + end end -""" - translate_readdepth_strings_to_num_array(read_depth_array::Array{Any,2}) -Returns array of read_depth as int for plotting and average calculation. -By default, read depth values over 100 are replaced with 100 to improve heatmap visualization (see read_depth_threshhold() ). -Where read_depth_array is output of generate_genotype_array() for DP option -returns a tuple of num_array type Int for average calculation and plotting, and chromosome labels for plotting as label bar -""" -@testset translate_readdepth_strings_to_num_array(read_depth_array::Array{Any,2}) begin +@testset "combined_all_read_depth_array_functions" begin #inside functions same used in combined_all_genotype_array_functions +reader = VCF.Reader(open(vcf_filename, "r")) +sub = io_pass_filter(reader) +dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub) +println(typeof(dp_num_array)) +println(length(dp_num_array)) +println(typeof(dp_chromosome_labels)) +println(length(dp_chromosome_labels)) + +@testset "get_sample_names" begin +reader = VCF.Reader(open(vcf_filename, "r")) +sample_names=get_sample_names(reader) +println("get_sample_names") +println(typeof(sample_names)) +println(length(sample_names)) + +@testset "avg_dp_samples" begin +avg_sample_list=avg_dp_samples(dp_num_array) +println("avg_sample_list is $avg_sample_list") + +@testset "list_sample_names_low_dp" begin +list=list_sample_names_low_dp(avg_sample_list,sample_names) +println(list) end - -#functions for sample filters - -""" - get_sample_names(reader) -returns sample ids of vcf file as a vector of symbols for naming columns of num_array dataframe object for column filter functions -""" -@testset get_sample_names(reader) begin - end -""" - find_group_label_indices(pheno) -find indices and determines names for group 1 and group 2 labels on plots. finds index of center of each sample group to place tick mark and label. -""" -@testset find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by) begin -end - -""" - sortcols_by_phenotype_matrix(pheno_matrix_filename::String,trait_to_group_by::String,num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) -group samples by a common trait using a user generated key matrix ("phenotype matrix") -""" -@testset sortcols_by_phenotype_matrix(pheno_matrix_filename::String,trait_to_group_by::String, num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) begin - -end - -""" - select_columns(filename_sample_list::AbstractString, num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) -returns num_array with columns matching user generated list of sample ids to select for analysis. num_array now has sample ids in first row. -""" -@testset select_columns(filename_sample_list::AbstractString, num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) begin +@testset "avg_dp_variant" begin +avg_variant_list=avg_dp_variant(dp_num_array) +println("avg_dp_variant is $avg_variant_list") end +@testset "sortcols_by_phenotype_matrix" begin +vcf,group_label_pack=sortcols_by_phenotype_matrix("test_files/sample_phenotype_matrix.csv","control,case", dp_num_array, sample_names) +println(typeof(vcf)) +println(size(vcf,1)) +println(typeof(group_label_pack)) +println(length(group_label_pack)) + + @testset "find_group_label_indices" begin + pheno = readdlm("test_files/sample_phenotype_matrix.csv", ',') + row_to_sort_by = find(x -> x == "control,case", pheno) + row_to_sort_by = row_to_sort_by[1] + group_label_pack=find_group_label_indices(pheno,"control,case",row_to_sort_by) + println(typeof(group_label_pack)) + println(length(group_label_pack)) + end -#functions for mathematic analysis -""" - avg_dp_samples(dp_num_array::Array{Int64,2}) -create sample_avg_list vector that lists averages of read depth for each sample for input into avg_sample_dp_line_chart(sample_avg_list) -dp_num_array must contain dp values as Int64 and be without chromosome position columns -""" -@testset avg_dp_samples(dp_num_array::Array{Int64,2}) begin + @testset "select_columns" begin + dp_num_array=select_columns("test_files/select_samples_list.txt", dp_num_array, sample_names) + println(typeof(dp_num_array)) + println(length(dp_num_array)) + end end - - -""" - avg_dp_variant(dp_num_array::Array{Int64,2}) -create variant_avg_list vector that lists averages of read depth for each variant for input into avg_variant_dp_line_chart(variant_avg_list) -""" -@testset avg_dp_variant(dp_num_array::Array{Int64,2}) begin - end - -""" - list_sample_names_low_dp(sample_avg_list::Array{Float64,2},sample_names) -returns list of sample ids that have an average read depth of under 15 across all variant positions -""" -@testset list_sample_names_low_dp(sample_avg_list::Array{Float64,1},sample_names) begin - end """ diff --git a/test/test_files/select_column_list.txt b/test/test_files/select_samples_list.txt similarity index 100% rename from test/test_files/select_column_list.txt rename to test/test_files/select_samples_list.txt diff --git a/viva_cli.jl b/viva_cli.jl index 8209662..6171d20 100644 --- a/viva_cli.jl +++ b/viva_cli.jl @@ -268,6 +268,8 @@ if parsed_args["heatmap"] == "genotype" graphic = ViVa.genotype_heatmap2(gt_num_array,title,chrom_label_info,sample_names) end + println("saving now") + PlotlyJS.savefig(graphic, joinpath("$(parsed_args["output_directory"])" ,"$title.$(parsed_args["save_format"])"), js=:remote) end @@ -327,10 +329,7 @@ println("_______________________________________________") if parsed_args["avg_dp"] == "sample" if isdefined(:dp_num_array) == false - read_depth_array = ViVa.generate_genotype_array(sub,"DP") - clean_column1!(read_depth_array) - read_depth_array=ViVa.sort_genotype_array(read_depth_array) - dp_num_array,dp_chromosome_labels = translate_readdepth_strings_to_num_array(read_depth_array) + dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub) end chr_pos_tuple_list = generate_chromosome_positions_for_hover_labels(dp_chromosome_labels) @@ -343,11 +342,9 @@ if parsed_args["avg_dp"] == "sample" PlotlyJS.savefig(graphic, joinpath("$(parsed_args["output_directory"])" ,"Average Sample Read Depth.$(parsed_args["save_format"])"), js=:remote) elseif parsed_args["avg_dp"] == "variant" + if isdefined(:dp_num_array) == false - read_depth_array = ViVa.generate_genotype_array(sub,"DP") - clean_column1!(read_depth_array) - read_depth_array=ViVa.sort_genotype_array(read_depth_array) - dp_num_array,dp_chromosome_labels = translate_readdepth_strings_to_num_array(read_depth_array) + dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub) end chr_pos_tuple_list = generate_chromosome_positions_for_hover_labels(dp_chromosome_labels) From 22f2a8411c7fc84b982ddebe0860dca4549123a2 Mon Sep 17 00:00:00 2001 From: George Tollefson <gatollefson@gmail.com> Date: Mon, 5 Nov 2018 14:20:45 -0500 Subject: [PATCH 2/6] Wrote all tests for test/new_vcf_utils. All are passing. Fixed 4 sig list filter functions to iterate over VCF.Reader object first then siglist to make faster. Allowed inputing vcf filename as path to filename. --- .gitignore | 1 + .travis.yml | 35 ++-- ViVa_v2.jl | 15 -- open_jupyter_notebook.jl | 2 - src/ViVa.jl | 8 +- src/plot_utils.jl | 8 +- src/vcf_utils_complete.jl | 142 ++++++++-------- test/.DS_Store | Bin 8196 -> 8196 bytes test/new_vcf_utils.jl | 329 +++++++++++++++----------------------- viva_cli.jl | 53 +++--- 10 files changed, 255 insertions(+), 338 deletions(-) delete mode 100644 ViVa_v2.jl delete mode 100644 open_jupyter_notebook.jl diff --git a/.gitignore b/.gitignore index ea423ef..5f5d647 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ *.html .DS_Store .ipynb_checkpoints/ +output diff --git a/.travis.yml b/.travis.yml index df211b0..ddcbd6d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,24 +1,23 @@ -## Documentation: http://docs.travis-ci.com/user/languages/julia/ language: julia -# os: -# - linux -# - osx -# -# julia: -# - 0.6 -# -# notifications: -# email: false -# -# git: -# depth: 99999999 -# -# before_script: -# - export PATH=$HOME/.local/bin:$PATH +os: + - linux + - osx -# after_success: -# - julia -e 'import Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())' +julia: + - 0.6 + +notifications: + email: false + +git: + depth: 99999999 + +before_script: + - export PATH=$HOME/.local/bin:$PATH + +after_success: + - julia -e 'Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())' jobs: include: diff --git a/ViVa_v2.jl b/ViVa_v2.jl deleted file mode 100644 index 9822dfa..0000000 --- a/ViVa_v2.jl +++ /dev/null @@ -1,15 +0,0 @@ - -vcf_filename = "test_4X_191.vcf" -field_to_visualize = "read_depth" # field(s) = "read_depth", *etc. -variant_filter = ["range","chr4:0-4000000000"] #"range","chr4:0-4000000000" -sample_filter = ["reorder_columns", "sample_phenotype_matrix.csv", "case_control_status"] # "select_columns", "select_column_list.txt", -plot_types = "sample_line_chart" -save_format = "html" -plot_title = "Read_depth_test" - - -using ViVa -using GeneticVariation -using VCFTools -plot=ViVa.jupyter_main_new(vcf_filename,field_to_visualize,variant_filter,sample_filter,plot_types,save_format,plot_title) - diff --git a/open_jupyter_notebook.jl b/open_jupyter_notebook.jl deleted file mode 100644 index bcac03b..0000000 --- a/open_jupyter_notebook.jl +++ /dev/null @@ -1,2 +0,0 @@ -using IJulia -notebook() diff --git a/src/ViVa.jl b/src/ViVa.jl index be303b2..3e97735 100644 --- a/src/ViVa.jl +++ b/src/ViVa.jl @@ -1,13 +1,13 @@ module ViVa -using DataFrames #use CSV.jl ? depwarnings +using DataFrames using PlotlyJS using Rsvg using Blink -#using CSV using GeneticVariation using ArgParse using VCFTools +#using Base.Filesystem @@ -42,9 +42,9 @@ export avg_variant_dp_line_chart, read_depth_threshhold, list_variant_positions_low_dp, - list_sample_positions_low_dp, + list_sample_names_low_dp, avg_dp_variant, - avg_dp_patients, + avg_dp_samples, jupyter_main, save_numerical_array, io_pass_filter, diff --git a/src/plot_utils.jl b/src/plot_utils.jl index f86e74f..cdc7941 100644 --- a/src/plot_utils.jl +++ b/src/plot_utils.jl @@ -42,8 +42,8 @@ function genotype_heatmap2(input,title,chrom_label_info,sample_names) end """ - genotype_heatmap_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1},sample_names) - generate heatmap of genotype data. + genotype_heatmap_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1},sample_names) +generate heatmap of genotype data. """ function genotype_heatmap_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1},sample_names) @@ -127,7 +127,7 @@ function dp_heatmap2(input, title, chrom_label_info, sample_names) end """ - dp_heatmap2_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1}) + dp_heatmap2_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1}) generate heatmap of read depth data with grouped samples. """ function dp_heatmap2_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1}) @@ -174,7 +174,7 @@ function dp_heatmap2_with_groups(input::Array{Int64,2},title::String,chrom_label end """ - avg_sample_dp_scatter(sample_avg_list::Array{Float64,1},sample_names) + avg_sample_dp_scatter(sample_avg_list::Array{Float64,1},sample_names) generate line chart of average read depths of each sample. """ function avg_sample_dp_scatter(sample_avg_list::Array{Float64,1},sample_names) diff --git a/src/vcf_utils_complete.jl b/src/vcf_utils_complete.jl index fb8964e..de3b55a 100644 --- a/src/vcf_utils_complete.jl +++ b/src/vcf_utils_complete.jl @@ -110,42 +110,37 @@ end io_sig_list_vcf_filter(sig_list,vcf_filename) returns subarray of variant records matching a list of variant positions returned from load_siglist() """ -function io_sig_list_vcf_filter(sig_list,vcf_filename) +function io_sig_list_vcf_filter(sig_list,vcf_filename) - vcf_subarray = Array{Any}(0) + reader = VCF.Reader(open(vcf_filename, "r")) - for row= 1:size(sig_list,1) - dimension = size(sig_list,1) + vcf_subarray = Array{Any}(0) - chr=(sig_list[row,1]) - pos=(sig_list[row,2]) + for record in reader - reader = VCF.Reader(open(vcf_filename, "r")) - tic() - for record in reader + for row= 1:size(sig_list,1) + dimension = size(sig_list,1) - if typeof(VCF.chrom(record)) == String - chr = string(chr) - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) - push!(vcf_subarray,record) - end + chr=(sig_list[row,1]) + pos=(sig_list[row,2]) + if typeof(VCF.chrom(record)) == String + chr = string(chr) + if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) + push!(vcf_subarray,record) + end - end#remove this if need code below -#= - else + else - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) - push!(vcf_subarray,record) + if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) + push!(vcf_subarray,record) - end - end - =# + end end - toc() - end + end + end - return vcf_subarray + return vcf_subarray end """ @@ -186,16 +181,16 @@ function pass_chrrange_siglist_filter(vcf_filename,sig_list,chr_range::AbstractS vcf_subarray = Array{Any}(0) - for row= 1:size(sig_list,1) + reader = VCF.Reader(open(vcf_filename, "r")) + + for record in reader + + for row = 1:size(sig_list,1) dimension = size(sig_list,1) chr=(sig_list[row,1]) pos=(sig_list[row,2]) - reader = VCF.Reader(open(vcf_filename, "r")) - - for record in reader - if typeof(VCF.chrom(record)) == String chr = string(chr) @@ -258,12 +253,16 @@ returns subarray of vcf records with io_pass_filter and io_chromosome_range_vcf_ end """ - pass_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) - returns subarray of vcf records with io_pass_filter, io_sig_list_vcf_filter, and io_chromosome_range_vcf_filter applied. + pass_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) +returns subarray of vcf records with io_pass_filter, io_sig_list_vcf_filter, and io_chromosome_range_vcf_filter applied. """ - function pass_siglist_filter(vcf_filename,sig_list) +function pass_siglist_filter(vcf_filename,sig_list) - vcf_subarray = Array{Any}(0) + vcf_subarray = Array{Any}(0) + + reader = VCF.Reader(open(vcf_filename, "r")) + + for record in reader for row= 1:size(sig_list,1) dimension = size(sig_list,1) @@ -271,10 +270,6 @@ returns subarray of vcf records with io_pass_filter and io_chromosome_range_vcf_ chr=(sig_list[row,1]) pos=(sig_list[row,2]) - reader = VCF.Reader(open(vcf_filename, "r")) - - for record in reader - if typeof(VCF.chrom(record)) == String chr = string(chr) @@ -286,64 +281,63 @@ returns subarray of vcf records with io_pass_filter and io_chromosome_range_vcf_ if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && (VCF.hasfilter(record)) && (VCF.filter(record) == String["PASS"]) push!(vcf_subarray,record) - end end - end end + end - return vcf_subarray - end + return vcf_subarray +end """ chrrange_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) returns subarray of vcf records with io_pass_filter, io_sig_list_vcf_filter, and io_chromosome_range_vcf_filter applied. """ - function chrrange_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) +function chrrange_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) - a=split(chr_range,":") - chrwhole=a[1] - chrnumber=split(chrwhole,"r") - string_chr=chrnumber[2] - chr=String(string_chr) - range=a[2] - splitrange=split(range, "-") - lower_limit=splitrange[1] - chr_range_low=parse(lower_limit) - upper_limit=splitrange[2] - chr_range_high=parse(upper_limit) + a=split(chr_range,":") + chrwhole=a[1] + chrnumber=split(chrwhole,"r") + string_chr=chrnumber[2] + chr=String(string_chr) + range=a[2] + splitrange=split(range, "-") + lower_limit=splitrange[1] + chr_range_low=parse(lower_limit) + upper_limit=splitrange[2] + chr_range_high=parse(upper_limit) - vcf_subarray = Array{Any}(0) + vcf_subarray = Array{Any}(0) - for row= 1:size(sig_list,1) - dimension = size(sig_list,1) + reader = VCF.Reader(open(vcf_filename, "r")) - chr=(sig_list[row,1]) - pos=(sig_list[row,2]) + for record in reader - reader = VCF.Reader(open(vcf_filename, "r")) + for row= 1:size(sig_list,1) + dimension = size(sig_list,1) - for record in reader + chr=(sig_list[row,1]) + pos=(sig_list[row,2]) - if typeof(VCF.chrom(record)) == String - chr = string(chr) + if typeof(VCF.chrom(record)) == String + chr = string(chr) - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && ((VCF.chrom(record) == chr)) && ((chr_range_high > VCF.pos(record) > chr_range_low)) - push!(vcf_subarray,record) - end + if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && ((VCF.chrom(record) == chr)) && ((chr_range_high > VCF.pos(record) > chr_range_low)) + push!(vcf_subarray,record) + end - else + else - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && ((VCF.chrom(record) == chr)) && ((chr_range_high > VCF.pos(record) > chr_range_low)) - push!(vcf_subarray,record) + if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && ((VCF.chrom(record) == chr)) && ((chr_range_high > VCF.pos(record) > chr_range_low)) + push!(vcf_subarray,record) - end - end - end + end + end end + end - return vcf_subarray - end + return vcf_subarray +end #functions for converting vcf record array to numerical array diff --git a/test/.DS_Store b/test/.DS_Store index c02d3ce4ddfb142ddb2f95a94384b282c69c667d..058d747c0a0ffc8e96385d11f1e9bc0b65dcdf0d 100644 GIT binary patch delta 1325 zcmeIxO>7Kd7zgm@@AI9Bu6Nsdce|u*y4|FtR<~c;Qnnu&QnaDfmP+k*2BX_uI@=Fj zt0h8E93~Y+99%?PXmLW~B2F&d_`3KA7Y7HI3K#G0Cc?qN$;BLI=J`MK@yz?nobaFU z-=5)kY9;u1ks}r>t*$E0uc(R_=f|pjmGZP=bv&*xhsRgDX;1&*(b$pr(Mwj`o`=iU zZSGVwvo{osL_$bb)^qztn`pAD8#|5ONO-$3s$0*p>wP_(51RTmJrsx<LwaK%s+&DL zbI8z#TMkBzNZ8z=518`k_Lop<vo2*ej&a-K4B-$tjJsLqV9+=;XhaVulBv2(<SP6g z>(=C>wnQ{zuc2$Hj#wrN7~7T18wPZ_*Af}F#k|lX)HYL&!vjo7CCgW+YFnQX*0`yp z6YnZjw1)ygJ;XJIB{E9oeP!jU=om^5$D5O>46CYD(b*g6@7Kdo(~e>pr$${TtaDjQ z$AsugzP?V=bd53Q_7|nUN#P(yIaEO9w3b?_i!>Uh37Vvfbe(R|eR@Do=o!t>b9zB9 z=^eeN5A=~f(=2_ZZ}c4uc4Q(8c__k4tU?KVsKI*Fp&kv`jBRK`yB9mqgMBb?0DXue zj&U5rah$~j&f@~E;Rfzt3e&iYhj?UlXLs6MP1$PUZ)6+(noN1MD7iTH%&hDsx$dH3 zU*#IVRllTTtYgS9jbKP`A2zh;{;q)8tA{lsysup@zCOZ|3&<BHavV;Pm+x|M#-B2l zDhoX>l|wQssdBkTcIU{Ra5!^Rud76nn0#)kbd{+UOtSI$>28%;qexgjFCoU{)vj7~ zJ(GTXLAqV9HY#?!RXF{^^GBSUG)0dk%xiihIX=-BngvM=8=T0M7zJ>{1Fz&LMH$Lb z`wPidRJ5QKUD%0k?3OV5B}^Ye2xAx{7)4A1ox~}e#u*7TiHo?5E4Vs6Bg!$oz&Sgw Uxgio7><|C<`1!N^Bac%20J(EL&Hw-a delta 1241 zcmeH_OGs2v7{|YVX`Or1JTpG#3^``0SUQ%YHa2B7X_`@BnAVh%jm}*R%|pFcNvCOg z7`5oh4WfdGXw`#+s6`uTRYB0ppjt!_)uK&N1g-3hH(ExUcAdq6^Zm~A_djF)G5`2; zj)&$X-@jNS61CMncX5r+n{*c^eDxl8ab2x9nN*mg%(H64uIR{UVsCQaIdiXVKF*ut zT(FSd7wL`1Bj!!+G+)?SGewJSy=v~a=bQKP8jAwqzIbd)cvS1*d`Q>YwMfti4{Pg! zhNg3v7wzj@*>iJvfkTMEP;Yo(C~S=I62>O^+?;H+P?%G8M@kfmg^YED<f5iQP4*!R zmx*$P2U&+M_qQq)s$1c@(y==nv*w7Us)zBA(isW%YLV$lqE7WPzERmWtPSenc+4vL z)RoKzw-V@!N26NI(5>=n;b-&s1GY1L_@<z4wDoH-d4BrRjfFy(<GBT?5`Q&KH-#8E zX$kqMmAXlzej1@;bb?OO1-e2v={8N!eR@QX=?Oih*Yu9w(+B!UpXm#IrEdUK%t00e z7GNPtQHBcCq7GivqXErmK`S<(4FPON7rGHb6fw94Fo*=ka2Q8$6lZY>S8)y3aRYZS zVRq#OGF{DiEx$vl`Ew}BtE}|vZ1&u|{GyWb%Iak+)|jpN+h%o2soEZM!Y)#LcG?x2 zl({+Q7rR`XNk`UDDRrs*HdChAqEg9b{37luNoLBxEas_ET|84}W3BVnt18cs;aPNL zgQT-$Ol;EW5C!U5wMntzt-^_w_*084-+v+gGF_v)Qu=dxNpGa^Pg3>|nnEUW;D8f_ zaG?n0Sd2=zQ6*I`hYu^zG_89*HlrOK=)_j^Ac#JxTtln~{ZhFO14*QC00(gh$8a2{ qa2h7g+<I0}hg-A7_l#yk+he4^{|-GqXf(wmL($lO`~Td&2R{J~eIF?R diff --git a/test/new_vcf_utils.jl b/test/new_vcf_utils.jl index a0825e9..c1ee9fd 100644 --- a/test/new_vcf_utils.jl +++ b/test/new_vcf_utils.jl @@ -30,22 +30,24 @@ end @testset "io_chromosome_range_vcf_filter" begin sub = io_chromosome_range_vcf_filter("chr4:0-400000000",reader) -println(sub[1:2]) -println(size(sub,2)) +@test typeof(sub) == Array{Any,1} +@test size(sub,1) == 1012 +#println("io_chromosome_range_vcf_filter type is $(typeof(sub))") +#println("io_chromosome_range_vcf_filter size is $(size(sub,1))") end -#= + @testset "filters_with_siglist" begin @testset "load_siglist" begin sig_list=load_siglist("test_files/significantList_for_proteinstructures.csv") - println(sig_list[2:1]) - println(size(sig_list,1)) + #println(sig_list[2:1]) + #println(size(sig_list,1)) @testset "clean_column1_siglist!" begin clean_column1_siglist!(sig_list) - println(sig_list[1,2]) - println(size(sig_list,1)) + #println(sig_list[1,2]) + #println(size(sig_list,1)) end @testset "io_sig_list_vcf_filter" begin @@ -75,7 +77,7 @@ end end end -=# + @testset "io_pass_filter" begin reader = VCF.Reader(open(vcf_filename, "r")) @@ -98,237 +100,164 @@ reader = VCF.Reader(open(vcf_filename, "r")) sub = io_pass_filter(reader) gt_num_array,gt_chromosome_labels=combined_all_genotype_array_functions(sub) -println(typeof(gt_num_array)) -println(length(gt_num_array)) -println(typeof(gt_chromosome_labels)) -println(length(gt_chromosome_labels)) +#println("combined_all_genotype_array_functions gt array is type: $(typeof(gt_num_array))") +#println("combined_all_genotype_array_functions gt array is length: $(length(gt_num_array))") +#println("combined_all_genotype_array_functions gt_chromosome_labels is typeof: $(typeof(gt_chromosome_labels))") +#println("combined_all_genotype_array_functions gt_chromosome_labels is length: $(length(gt_chromosome_labels))") +@test typeof(gt_num_array) == Array{Int64,2} +@test length(gt_num_array) == 222324 +@test typeof(gt_chromosome_labels) == Array{Any,2} +@test length(gt_chromosome_labels) == 2328 + + @testset "chromosome_label_generator" begin + chrom_label_info = ViVa.chromosome_label_generator(gt_chromosome_labels[:,1]) + #println("chromosome_label_generator chrom_label_info is type $(typeof(chrom_label_info))") + #println("chromosome_label_generator chrom_label_info is length $(length(chrom_label_info))") + @test typeof(chrom_label_info) == Tuple{Array{String,1},Array{Int64,1},String} + @test size(chrom_label_info,1) == 3 + end + + @testset "generate_chromosome_positions_for_hover_labels" begin + chr_pos_tuple_list = generate_chromosome_positions_for_hover_labels(gt_chromosome_labels) + #println("generate_chromosome_positions_for_hover_labels chr_pos_tuple_list is type $(typeof(chr_pos_tuple_list))") + #println("generate_chromosome_positions_for_hover_labels chr_pos_tuple_list is length $(length(chr_pos_tuple_list))") + @test typeof(chr_pos_tuple_list) == Array{Tuple,1} + @test size(chr_pos_tuple_list,1) == 1164 + end @testset "generate_genotype_array" begin reader = VCF.Reader(open(vcf_filename, "r")) sub = io_pass_filter(reader) genotype_array=generate_genotype_array(sub,"GT") - println(typeof(genotype_array)) - println(length(genotype_array)) - println(genotype_array[3:5]) + + #println("generate_genotype_array is $(typeof(genotype_array))") + #println("generate_genotype_array is $(size(genotype_array,1))") + @test typeof(genotype_array) == Array{String,2} + @test size(genotype_array,1) == 1164 @testset "define_geno_dict" begin geno_dict = define_geno_dict() - println(typeof(geno_dict)) - println(length(geno_dict)) + #println("define_geno_dict is type is $(typeof(geno_dict))") + #println("define_geno_dict is length is $(length(geno_dict))") + @test typeof(geno_dict) == Dict{Any,Any} + @test length(geno_dict) == 92 @testset "translate_genotype_to_num_array" begin gt_num_array,gt_chromosome_labels=translate_genotype_to_num_array(genotype_array,geno_dict) - println(typeof(gt_num_array)) - println(length(gt_num_array)) - println(typeof(gt_chromosome_labels)) - println(length(gt_chromosome_labels)) + #println("translate_genotype_to_num_array gt_num_array type is $(typeof(gt_num_array))") + #println("translate_genotype_to_num_array gt_num_array length is $(length(gt_num_array))") + #println("translate_genotype_to_num_array gt_chromosome_labels typeof is $(typeof(gt_chromosome_labels))") + #println("translate_genotype_to_num_array gt_chromosome_labels length is $(length(gt_chromosome_labels))") + @test typeof(gt_num_array) == Array{Int64,2} + @test length(gt_num_array) == 222324 + @test typeof(gt_chromosome_labels) == Array{String,2} + @test length(gt_chromosome_labels) == 2328 end end end end -@testset "combined_all_read_depth_array_functions" begin #inside functions same used in combined_all_genotype_array_functions +@testset "combined_all_read_depth_array_functions" begin reader = VCF.Reader(open(vcf_filename, "r")) sub = io_pass_filter(reader) dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub) -println(typeof(dp_num_array)) -println(length(dp_num_array)) -println(typeof(dp_chromosome_labels)) -println(length(dp_chromosome_labels)) - -@testset "get_sample_names" begin -reader = VCF.Reader(open(vcf_filename, "r")) -sample_names=get_sample_names(reader) -println("get_sample_names") -println(typeof(sample_names)) -println(length(sample_names)) - -@testset "avg_dp_samples" begin -avg_sample_list=avg_dp_samples(dp_num_array) -println("avg_sample_list is $avg_sample_list") - -@testset "list_sample_names_low_dp" begin -list=list_sample_names_low_dp(avg_sample_list,sample_names) -println(list) -end - -end - +#println("combined_all_read_depth_array_functions dp_num_array type is $(typeof(dp_num_array))") +#println("combined_all_read_depth_array_functions dp_num_array length is $(length(dp_num_array))") +#println("combined_all_read_depth_array_functions gt_chromosome_labels typeof is $(typeof(dp_chromosome_labels))") +#println("combined_all_read_depth_array_functions gt_chromosome_labels length is $(length(dp_chromosome_labels))") +@test typeof(dp_num_array) == Array{Int64,2} +@test length(dp_num_array) == 222324 +@test typeof(dp_chromosome_labels) == Array{Any,2} +@test length(dp_chromosome_labels) == 2328 -@testset "avg_dp_variant" begin -avg_variant_list=avg_dp_variant(dp_num_array) -println("avg_dp_variant is $avg_variant_list") -end + @testset "get_sample_names" begin + reader = VCF.Reader(open(vcf_filename, "r")) + sample_names=get_sample_names(reader) + #println("get_sample_names") + #println("get_sample_names sample_names type is $(typeof(sample_names))") + #println("get_sample_names sample_names length is $(length(sample_names))") + @test typeof(sample_names) == Array{Symbol,2} + @test length(sample_names) == 191 + + @testset "read_depth_threshhold" begin + dp_num_array=read_depth_threshhold(dp_num_array) + #println("read_depth_threshhold dp_num_array is type $(typeof(dp_num_array))") + #println("read_depth_threshhold dp_num_array is size $(size(dp_num_array,1))") + @test typeof(dp_num_array) == Array{Int64,2} + @test size(dp_num_array,1) == 1164 + end -@testset "sortcols_by_phenotype_matrix" begin -vcf,group_label_pack=sortcols_by_phenotype_matrix("test_files/sample_phenotype_matrix.csv","control,case", dp_num_array, sample_names) -println(typeof(vcf)) -println(size(vcf,1)) -println(typeof(group_label_pack)) -println(length(group_label_pack)) - - @testset "find_group_label_indices" begin - pheno = readdlm("test_files/sample_phenotype_matrix.csv", ',') - row_to_sort_by = find(x -> x == "control,case", pheno) - row_to_sort_by = row_to_sort_by[1] - group_label_pack=find_group_label_indices(pheno,"control,case",row_to_sort_by) - println(typeof(group_label_pack)) - println(length(group_label_pack)) - end + @testset "avg_dp_samples" begin + avg_sample_list=avg_dp_samples(dp_num_array) + #println("avg_dp_samples avg_sample_list type is $(typeof(avg_sample_list))") + #println("avg_dp_samples avg_sample_list length is $(length(avg_sample_list))") + @test typeof(avg_sample_list) == Array{Float64,1} + @test size(avg_sample_list,1) == 191 + + @testset "list_sample_names_low_dp" begin + list=list_sample_names_low_dp(avg_sample_list,sample_names) + #println("list_sample_names_low_dp list type is $(typeof(list))") + #println("list_sample_names_low_dp list length is $(length(list))") + @test typeof(list) == Array{String,1} + @test size(list,1) == 6 + end - @testset "select_columns" begin - dp_num_array=select_columns("test_files/select_samples_list.txt", dp_num_array, sample_names) - println(typeof(dp_num_array)) - println(length(dp_num_array)) end -end -end -end - -""" - list_variant_positions_low_dp(variant_avg_list::Array{Float64,2},chrom_labels) -finds variant positions that have an average read depth of under 15 across all patients -""" -function list_variant_positions_low_dp(variant_avg_list::Array{Float64,1},chrom_labels) + @testset "avg_dp_variant" begin + avg_variant_list=avg_dp_variant(dp_num_array) + #println("avg_dp_variant avg_variant_list type is $(typeof(avg_variant_list))") + #println("avg_dp_variant avg_variant_list length is $(length(avg_variant_list))") + @test typeof(avg_variant_list) == Array{Float64,1} + @test size(avg_variant_list,1) == 1164 - low_dp_index_list = Array{Int64}(0) + @testset "list_variant_positions_low_dp" begin + list=list_variant_positions_low_dp(avg_variant_list,dp_chromosome_labels) + #println("list_variant_positions_low_dp list type is $(typeof(list))") + #println("list_variant_positions_low_dp list length is $(length(list))") + @test typeof(list) == Array{Tuple{Int64,Int64},1} + @test size(list,1) == 33 - for item = 1:length(variant_avg_list) - if variant_avg_list[item] < 15 - push!(low_dp_index_list,item) - end end - low_dp_positions = Array{Tuple{Int64,Int64}}(0) + end - for i in low_dp_index_list - chrom_position = chrom_labels[i,1],chrom_labels[i,2] - push!(low_dp_positions,chrom_position) + @testset "sortcols_by_phenotype_matrix" begin + vcf,group_label_pack=sortcols_by_phenotype_matrix("test_files/sample_phenotype_matrix.csv","control,case", dp_num_array, sample_names) + #println("sortcols_by_phenotype_matrix vcf type is $(typeof(vcf))") + #println("sortcols_by_phenotype_matrix vcf size is $(size(vcf,1))") + #println("sortcols_by_phenotype_matrix group_label_pack type is $(typeof(group_label_pack))") + #println("sortcols_by_phenotype_matrix group_label_pack size is $(size(group_label_pack,1))") + @test typeof(vcf) == Array{Int64,2} + @test size(vcf,1) == 1164 + @test typeof(group_label_pack) == Array{Any,1} + @test size(group_label_pack,1) == 5 + + @testset "find_group_label_indices" begin + pheno = readdlm("test_files/sample_phenotype_matrix.csv", ',') + row_to_sort_by = find(x -> x == "control,case", pheno) + row_to_sort_by = row_to_sort_by[1] + group_label_pack=find_group_label_indices(pheno,"control,case",row_to_sort_by) + #println("find_group_label_indices group_label_pack type is $(typeof(group_label_pack))") + #println("find_group_label_indices group_label_pack length is $(length(group_label_pack))") + @test typeof(group_label_pack) == Array{Any,1} + @test size(group_label_pack,1) == 5 end - return low_dp_positions -end - -#functions for producing objects for plot functions - -""" - read_depth_threshhold(dp_array::Array{Int64,2}) -sets ceiling for read depth values at dp = 100. All dp over 100 are set to 100 to visualize read depth values between 0 < dp > 100 in better definition -""" -function read_depth_threshhold(dp_array::Array{Int64,2}) - - dp_array[dp_array[:,:].>100].=100 - - return dp_array -end - -""" - save_numerical_array(num_array::Matrix{Any},sample_names,chr_labels) -save numerical array with chr labels and sample ids to working directory -""" -function save_numerical_array(num_array,sample_names,chr_labels) - - #samplenames=sample_names - #samplenames=Matrix(samplenames) - - headings = hcat("chr","position") - sample_names = hcat(headings,sample_names) - - chr_labeled_array_for_plotly=hcat(chr_labels, num_array) - labeled_value_matrix_withsamplenames= vcat(sample_names,chr_labeled_array_for_plotly) - - writedlm("AC_gatk406_eh_PASS_withheader_value_matrix_.txt", labeled_value_matrix_withsamplenames, "\t") -end - -""" - chromosome_label_generator(chromosome_labels::Array{String,2}) -Returns vector of chr labels and indices to mark chromosomes in plotly heatmap -Specifically, saves indexes and chrom labels in vectors to pass into heatmap function to ticvals and tictext respectively -Input is either gt_chromosome_labels or dp_chromosome_labels from translate_gt/dp_to_num_array() -""" - -function chromosome_label_generator(chromosome_labels::Array{Any,1}) - chrom_label_indices = findfirst.(map(a -> (y -> isequal(a, y)), unique(chromosome_labels)), [chromosome_labels]) - chrom_labels = unique(chromosome_labels) - chrom_labels = [string(i) for i in chrom_labels] - - if length(chrom_labels) > 1 - for item=2:(length(chrom_labels)) - - ratio=((chrom_label_indices[item])-(chrom_label_indices[item-1]))/(length(chromosome_labels)) - println(ratio) - - if ratio < 0.2 - font_size = "8" - println("font size is $font_size") - return chrom_labels,chrom_label_indices,font_size - else - font_size = "10" - println("font size is $font_size") - - return chrom_labels,chrom_label_indices,font_size - end + @testset "select_columns" begin + dp_num_array=select_columns("test_files/select_samples_list.txt", dp_num_array, sample_names) + #println("select_columns dp_num_array type is $(typeof(dp_num_array))") + #println("select_columns dp_num_array size is $(size(dp_num_array,1))") + @test typeof(dp_num_array) == Array{Int64,2} + @test size(dp_num_array,1) == 1164 end - else - font_size = "10" - return chrom_labels,chrom_label_indices,font_size end -end -""" - checkfor_outputdirectory(path::String) -Checks to see if output directory exists already. If it doesn't, it creates the new directory to write output files to. -""" -function checkfor_outputdirectory(path::String) - if isdir(path) == true - else - mkdir(path) end -end - -""" - generate_chromosome_positions_for_hover_labels(chr_labels::Array{Any,2}) -creates tuple of genomic locations to set as tick labels. This is automatically store chromosome positions in hover labels. However tick labels are set to hidden with showticklabels=false so they will not crowd the y axis. -""" -function generate_chromosome_positions_for_hover_labels(chr_labels::Array{Any,2}) - -returnXY_column1!(chr_labels) #not working yet -#println(chr_labels) - -chr_pos_tuple_list=Array{Tuple}(0) - - for row = 1:size(chr_labels,1) - - chr=chr_labels[row,1] - pos=chr_labels[row,2] - chr_pos_tuple=chr,pos - push!(chr_pos_tuple_list,chr_pos_tuple) end - return chr_pos_tuple_list -end - -""" - returnXY_column1!(chr_label_vector) -Replace String "23","24","25" with "X","Y","M" in chromosome label vector used for plot labels -""" -@testset "returnXY_column1!" begin - -end - -""" - sort_genotype_array(genotype_array) -sorts genotype array for GT or DP by chromosomal location -""" -@testset sort_genotype_array(genotype_array) begin - -end - - -=# end diff --git a/viva_cli.jl b/viva_cli.jl index 6171d20..6cfdd7a 100644 --- a/viva_cli.jl +++ b/viva_cli.jl @@ -1,8 +1,3 @@ -#= -read_depth heatmap up to 15 white, up to 30 light blue, -don't output files in same folder -=# - println("Welcome to ViVa.") println() println("Loading packages:") @@ -59,7 +54,7 @@ function test_parse_main(ARGS::Vector{String}) ) @add_arg_table s begin - "--vcf_file", "-f" # vcf filename + "--vcf_file", "-f" help = "vcf filename in format: file.vcf" arg_type = String required = true @@ -73,7 +68,7 @@ function test_parse_main(ARGS::Vector{String}) arg_type = String default = "output" - "--save_format", "-s" # format to save graphics in + "--save_format", "-s" help = "file format you wish to save graphics as (eg. pdf, html, png). Defaults to html" arg_type = String default = "html" @@ -129,9 +124,7 @@ end parsed_args = test_parse_main(ARGS) -#begin main program - -#filter vcf and load matrix +#filter vcf and load matrix of filtered vcf records vcf_filename = (parsed_args["vcf_file"]) println("Reading $vcf_filename ...") @@ -175,7 +168,7 @@ if parsed_args["show_stats"] == true end tic() - +#pass_filter if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] == nothing && parsed_args["positions_list"] == nothing sub = ViVa.io_pass_filter(reader) number_rows = size(sub,1) @@ -183,6 +176,7 @@ if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] == noth heatmap_input = "pass_filtered" end +#chr_range if parsed_args["chromosome_range"] != nothing && parsed_args["pass_filter"] == false && parsed_args["positions_list"] == nothing sub = ViVa.io_chromosome_range_vcf_filter(parsed_args["chromosome_range"],reader) number_rows = size(sub,1) @@ -190,6 +184,7 @@ if parsed_args["chromosome_range"] != nothing && parsed_args["pass_filter"] == f heatmap_input = "range_filtered" end +#list if parsed_args["positions_list"] != nothing && parsed_args["chromosome_range"] == nothing && parsed_args["pass_filter"] == false sig_list = load_siglist(parsed_args["positions_list"]) sub = ViVa.io_sig_list_vcf_filter(sig_list,vcf_filename) @@ -198,6 +193,7 @@ if parsed_args["positions_list"] != nothing && parsed_args["chromosome_range"] = heatmap_input = "positions_filtered" end +#pass_filter and chr_range and list if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] != nothing && parsed_args["positions_list"] != nothing sig_list = load_siglist(parsed_args["positions_list"]) sub = ViVa.pass_chrrange_siglist_filter(vcf_filename, sig_list, parsed_args["chromosome_range"]) @@ -205,12 +201,14 @@ if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] != noth println("selected $number_rows variants with Filter status: PASS, that match list of chromosome positions of interest, and are within chromosome range: $(parsed_args["chromosome_range"])") end +#pass_filter and chr_range if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] != nothing && parsed_args["positions_list"] == nothing sub = ViVa.pass_chrrange_filter(reader, parsed_args["chromosome_range"]) number_rows = size(sub,1) println("selected $number_rows variants with Filter status: PASS and are within chromosome range: $(parsed_args["chromosome_range"])") end +#pass_filter and list if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] == nothing && parsed_args["positions_list"] != nothing sig_list = load_siglist(parsed_args["positions_list"]) sub = ViVa.pass_siglist_filter(vcf_filename, sig_list) @@ -218,6 +216,15 @@ if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] == noth println("selected $number_rows variants with Filter status: PASS and that match list of chromosome positions of interest") end +#chr_range and list +if parsed_args["pass_filter"] == false && parsed_args["chromosome_range"] != nothing && parsed_args["positions_list"] != nothing + sig_list = load_siglist(parsed_args["positions_list"]) + sub = ViVa.chrrange_siglist_filter(vcf_filename, sig_list, parsed_args["chromosome_range"]) + number_rows = size(sub,1) + println("selected $number_rows variants that are within chromosome range: $(parsed_args["chromosome_range"]) and that match list of chromosome positions of interest") +end + +#no filters if parsed_args["pass_filter"] == false && parsed_args["chromosome_range"] == nothing && parsed_args["positions_list"] == nothing println("no filters applied. Large vcf files will take a long time to process and heatmap visualizations will lose resolution at this scale unless viewed in interactive html for zooming.") @@ -232,13 +239,22 @@ if parsed_args["pass_filter"] == false && parsed_args["chromosome_range"] == not end +println() +println("Finished Filtering. Total time to filter:") +toc() +println("_______________________________________________") + +#convert to numeric array for plotting and generate/save heatmaps and scatter plots + if parsed_args["heatmap"] == "genotype" gt_num_array,gt_chromosome_labels = combined_all_genotype_array_functions(sub) if parsed_args["heatmap_title"] != nothing title = parsed_args["heatmap_title"] else - title = "Genotype_$(parsed_args["vcf_file"])" + bn = Base.Filesystem.basename(parsed_args["vcf_file"]) + title = "Genotype_$bn" + println(title) end chr_pos_tuple_list = generate_chromosome_positions_for_hover_labels(gt_chromosome_labels) @@ -249,7 +265,9 @@ if parsed_args["heatmap"] == "genotype" if parsed_args["select_samples"] != nothing println("selecting samples listed in $(parsed_args["select_samples"])") - gt_num_array = select_columns(parsed_args["select_samples"], gt_num_array, sample_names) + gt_num_array = select_columns(parsed_args["select_samples"], + gt_num_array, + sample_names) end group_trait_matrix_filename=((parsed_args["group_samples"])[1]) @@ -319,13 +337,6 @@ if parsed_args["heatmap"] == "read_depth" end -#println("_______________________________________________") -println() -println("Finished Filtering. Total time to filter:") -toc() -println("_______________________________________________") -#println() - if parsed_args["avg_dp"] == "sample" if isdefined(:dp_num_array) == false @@ -342,7 +353,7 @@ if parsed_args["avg_dp"] == "sample" PlotlyJS.savefig(graphic, joinpath("$(parsed_args["output_directory"])" ,"Average Sample Read Depth.$(parsed_args["save_format"])"), js=:remote) elseif parsed_args["avg_dp"] == "variant" - + if isdefined(:dp_num_array) == false dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub) end From 8c27f4ab3399b9e4d8c8a4ee3f3befeec7223f03 Mon Sep 17 00:00:00 2001 From: Fernando Gelin <fernando_gelin@brown.edu> Date: Mon, 5 Nov 2018 14:33:19 -0500 Subject: [PATCH 3/6] Update .travis.yml --- .travis.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ddcbd6d..fe06d90 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,11 @@ git: before_script: - export PATH=$HOME/.local/bin:$PATH - + +script: + - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi + - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("ViVa"); Pkg.test("ViVa"; coverage=true)' + after_success: - julia -e 'Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())' From 986b475e4daab2a8cdc13db36aba836e3f85d60f Mon Sep 17 00:00:00 2001 From: Fernando Gelin <fernando_gelin@brown.edu> Date: Mon, 5 Nov 2018 14:46:26 -0500 Subject: [PATCH 4/6] Update .travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index fe06d90..9ca2b70 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ git: before_script: - export PATH=$HOME/.local/bin:$PATH + - julia -e 'Pkg.pin("Blink", v"0.6.2"); Pkg.pin("PlotlyJS", v"0.10.2"); Pkg.build("Blink")' script: - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi From 77f4817fb0501401df8b4775b2939317b675bdc2 Mon Sep 17 00:00:00 2001 From: Fernando Gelin <fernando_gelin@brown.edu> Date: Mon, 5 Nov 2018 14:52:27 -0500 Subject: [PATCH 5/6] Update .travis.yml --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9ca2b70..0992315 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,11 +15,10 @@ git: before_script: - export PATH=$HOME/.local/bin:$PATH - - julia -e 'Pkg.pin("Blink", v"0.6.2"); Pkg.pin("PlotlyJS", v"0.10.2"); Pkg.build("Blink")' script: - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi - - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("ViVa"); Pkg.test("ViVa"; coverage=true)' + - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.pin("Blink", v"0.6.2"); Pkg.pin("PlotlyJS", v"0.10.2"); Pkg.build("Blink"); Pkg.build("ViVa"); Pkg.test("ViVa"; coverage=true)' after_success: - julia -e 'Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())' From ff801a22b1ea62afb0e4de6c6c6826a6d1634ddc Mon Sep 17 00:00:00 2001 From: Fernando Gelin <fernando_gelin@brown.edu> Date: Mon, 5 Nov 2018 15:15:41 -0500 Subject: [PATCH 6/6] Update ViVa.jl --- src/ViVa.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ViVa.jl b/src/ViVa.jl index 3e97735..8db9b81 100644 --- a/src/ViVa.jl +++ b/src/ViVa.jl @@ -6,7 +6,7 @@ using Rsvg using Blink using GeneticVariation using ArgParse -using VCFTools +#using VCFTools #using Base.Filesystem