diff --git a/.gitignore b/.gitignore index ea423ef..5f5d647 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ *.html .DS_Store .ipynb_checkpoints/ +output diff --git a/.travis.yml b/.travis.yml index df211b0..0992315 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,24 +1,27 @@ -## Documentation: http://docs.travis-ci.com/user/languages/julia/ language: julia -# os: -# - linux -# - osx -# -# julia: -# - 0.6 -# -# notifications: -# email: false -# -# git: -# depth: 99999999 -# -# before_script: -# - export PATH=$HOME/.local/bin:$PATH +os: + - linux + - osx -# after_success: -# - julia -e 'import Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())' +julia: + - 0.6 + +notifications: + email: false + +git: + depth: 99999999 + +before_script: + - export PATH=$HOME/.local/bin:$PATH + +script: + - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi + - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.pin("Blink", v"0.6.2"); Pkg.pin("PlotlyJS", v"0.10.2"); Pkg.build("Blink"); Pkg.build("ViVa"); Pkg.test("ViVa"; coverage=true)' + +after_success: + - julia -e 'Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())' jobs: include: diff --git a/ViVa_v2.jl b/ViVa_v2.jl deleted file mode 100644 index 9822dfa..0000000 --- a/ViVa_v2.jl +++ /dev/null @@ -1,15 +0,0 @@ - -vcf_filename = "test_4X_191.vcf" -field_to_visualize = "read_depth" # field(s) = "read_depth", *etc. -variant_filter = ["range","chr4:0-4000000000"] #"range","chr4:0-4000000000" -sample_filter = ["reorder_columns", "sample_phenotype_matrix.csv", "case_control_status"] # "select_columns", "select_column_list.txt", -plot_types = "sample_line_chart" -save_format = "html" -plot_title = "Read_depth_test" - - -using ViVa -using GeneticVariation -using VCFTools -plot=ViVa.jupyter_main_new(vcf_filename,field_to_visualize,variant_filter,sample_filter,plot_types,save_format,plot_title) - diff --git a/open_jupyter_notebook.jl b/open_jupyter_notebook.jl deleted file mode 100644 index bcac03b..0000000 --- a/open_jupyter_notebook.jl +++ /dev/null @@ -1,2 +0,0 @@ -using IJulia -notebook() diff --git a/src/ViVa.jl b/src/ViVa.jl index 67780f7..8db9b81 100644 --- a/src/ViVa.jl +++ b/src/ViVa.jl @@ -1,13 +1,13 @@ module ViVa -using DataFrames #use CSV.jl ? depwarnings +using DataFrames using PlotlyJS using Rsvg using Blink -using CSV using GeneticVariation using ArgParse -using VCFTools +#using VCFTools +#using Base.Filesystem @@ -42,9 +42,9 @@ export avg_variant_dp_line_chart, read_depth_threshhold, list_variant_positions_low_dp, - list_sample_positions_low_dp, + list_sample_names_low_dp, avg_dp_variant, - avg_dp_patients, + avg_dp_samples, jupyter_main, save_numerical_array, io_pass_filter, diff --git a/src/plot_utils.jl b/src/plot_utils.jl index f86e74f..cdc7941 100644 --- a/src/plot_utils.jl +++ b/src/plot_utils.jl @@ -42,8 +42,8 @@ function genotype_heatmap2(input,title,chrom_label_info,sample_names) end """ - genotype_heatmap_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1},sample_names) - generate heatmap of genotype data. + genotype_heatmap_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1},sample_names) +generate heatmap of genotype data. """ function genotype_heatmap_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1},sample_names) @@ -127,7 +127,7 @@ function dp_heatmap2(input, title, chrom_label_info, sample_names) end """ - dp_heatmap2_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1}) + dp_heatmap2_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1}) generate heatmap of read depth data with grouped samples. """ function dp_heatmap2_with_groups(input::Array{Int64,2},title::String,chrom_label_info::Tuple{Array{String,1},Array{Int64,1},String},group_label_pack::Array{Any,1}) @@ -174,7 +174,7 @@ function dp_heatmap2_with_groups(input::Array{Int64,2},title::String,chrom_label end """ - avg_sample_dp_scatter(sample_avg_list::Array{Float64,1},sample_names) + avg_sample_dp_scatter(sample_avg_list::Array{Float64,1},sample_names) generate line chart of average read depths of each sample. """ function avg_sample_dp_scatter(sample_avg_list::Array{Float64,1},sample_names) diff --git a/src/vcf_utils_complete.jl b/src/vcf_utils_complete.jl index 68b749b..de3b55a 100644 --- a/src/vcf_utils_complete.jl +++ b/src/vcf_utils_complete.jl @@ -51,7 +51,6 @@ function sort_genotype_array(genotype_array) data=genotype_array[:,3:size(genotype_array,2)] chrom_positions = [parse(Int, i) for i in genotype_array[:,1:2]] genotype_array = hcat(chrom_positions,data) - genotype_array = sortrows(genotype_array, by=x->(x[1],x[2])) return genotype_array @@ -111,38 +110,37 @@ end io_sig_list_vcf_filter(sig_list,vcf_filename) returns subarray of variant records matching a list of variant positions returned from load_siglist() """ -function io_sig_list_vcf_filter(sig_list,vcf_filename) - - vcf_subarray = Array{Any}(0) +function io_sig_list_vcf_filter(sig_list,vcf_filename) - for row= 1:size(sig_list,1) - dimension = size(sig_list,1) + reader = VCF.Reader(open(vcf_filename, "r")) - chr=(sig_list[row,1]) - pos=(sig_list[row,2]) + vcf_subarray = Array{Any}(0) - reader = VCF.Reader(open(vcf_filename, "r")) + for record in reader - for record in reader + for row= 1:size(sig_list,1) + dimension = size(sig_list,1) - if typeof(VCF.chrom(record)) == String - chr = string(chr) + chr=(sig_list[row,1]) + pos=(sig_list[row,2]) - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) - push!(vcf_subarray,record) - end + if typeof(VCF.chrom(record)) == String + chr = string(chr) + if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) + push!(vcf_subarray,record) + end - else + else - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) - push!(vcf_subarray,record) + if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) + push!(vcf_subarray,record) - end - end + end end - end + end + end - return vcf_subarray + return vcf_subarray end """ @@ -183,16 +181,16 @@ function pass_chrrange_siglist_filter(vcf_filename,sig_list,chr_range::AbstractS vcf_subarray = Array{Any}(0) - for row= 1:size(sig_list,1) + reader = VCF.Reader(open(vcf_filename, "r")) + + for record in reader + + for row = 1:size(sig_list,1) dimension = size(sig_list,1) chr=(sig_list[row,1]) pos=(sig_list[row,2]) - reader = VCF.Reader(open(vcf_filename, "r")) - - for record in reader - if typeof(VCF.chrom(record)) == String chr = string(chr) @@ -255,12 +253,16 @@ returns subarray of vcf records with io_pass_filter and io_chromosome_range_vcf_ end """ - pass_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) - returns subarray of vcf records with io_pass_filter, io_sig_list_vcf_filter, and io_chromosome_range_vcf_filter applied. + pass_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) +returns subarray of vcf records with io_pass_filter, io_sig_list_vcf_filter, and io_chromosome_range_vcf_filter applied. """ - function pass_siglist_filter(vcf_filename,sig_list) +function pass_siglist_filter(vcf_filename,sig_list) - vcf_subarray = Array{Any}(0) + vcf_subarray = Array{Any}(0) + + reader = VCF.Reader(open(vcf_filename, "r")) + + for record in reader for row= 1:size(sig_list,1) dimension = size(sig_list,1) @@ -268,10 +270,6 @@ returns subarray of vcf records with io_pass_filter and io_chromosome_range_vcf_ chr=(sig_list[row,1]) pos=(sig_list[row,2]) - reader = VCF.Reader(open(vcf_filename, "r")) - - for record in reader - if typeof(VCF.chrom(record)) == String chr = string(chr) @@ -283,64 +281,63 @@ returns subarray of vcf records with io_pass_filter and io_chromosome_range_vcf_ if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && (VCF.hasfilter(record)) && (VCF.filter(record) == String["PASS"]) push!(vcf_subarray,record) - end end - end end + end - return vcf_subarray - end + return vcf_subarray +end """ chrrange_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) returns subarray of vcf records with io_pass_filter, io_sig_list_vcf_filter, and io_chromosome_range_vcf_filter applied. """ - function chrrange_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) +function chrrange_siglist_filter(vcf_filename,sig_list,chr_range::AbstractString) - a=split(chr_range,":") - chrwhole=a[1] - chrnumber=split(chrwhole,"r") - string_chr=chrnumber[2] - chr=String(string_chr) - range=a[2] - splitrange=split(range, "-") - lower_limit=splitrange[1] - chr_range_low=parse(lower_limit) - upper_limit=splitrange[2] - chr_range_high=parse(upper_limit) + a=split(chr_range,":") + chrwhole=a[1] + chrnumber=split(chrwhole,"r") + string_chr=chrnumber[2] + chr=String(string_chr) + range=a[2] + splitrange=split(range, "-") + lower_limit=splitrange[1] + chr_range_low=parse(lower_limit) + upper_limit=splitrange[2] + chr_range_high=parse(upper_limit) - vcf_subarray = Array{Any}(0) + vcf_subarray = Array{Any}(0) - for row= 1:size(sig_list,1) - dimension = size(sig_list,1) + reader = VCF.Reader(open(vcf_filename, "r")) - chr=(sig_list[row,1]) - pos=(sig_list[row,2]) + for record in reader - reader = VCF.Reader(open(vcf_filename, "r")) + for row= 1:size(sig_list,1) + dimension = size(sig_list,1) - for record in reader + chr=(sig_list[row,1]) + pos=(sig_list[row,2]) - if typeof(VCF.chrom(record)) == String - chr = string(chr) + if typeof(VCF.chrom(record)) == String + chr = string(chr) - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && ((VCF.chrom(record) == chr)) && ((chr_range_high > VCF.pos(record) > chr_range_low)) - push!(vcf_subarray,record) - end + if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && ((VCF.chrom(record) == chr)) && ((chr_range_high > VCF.pos(record) > chr_range_low)) + push!(vcf_subarray,record) + end - else + else - if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && ((VCF.chrom(record) == chr)) && ((chr_range_high > VCF.pos(record) > chr_range_low)) - push!(vcf_subarray,record) + if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos) && ((VCF.chrom(record) == chr)) && ((chr_range_high > VCF.pos(record) > chr_range_low)) + push!(vcf_subarray,record) - end - end - end + end + end end + end - return vcf_subarray - end + return vcf_subarray +end #functions for converting vcf record array to numerical array @@ -367,9 +364,7 @@ convert sub from variant filters to gt_num_array and gt_chromosome_labels for pl """ function combined_all_genotype_array_functions(sub) genotype_array = generate_genotype_array(sub,"GT") - map!(s->replace(s, "chr", ""), genotype_array, genotype_array) - clean_column1!(genotype_array) genotype_array=ViVa.sort_genotype_array(genotype_array) geno_dict = define_geno_dict() @@ -508,7 +503,7 @@ function get_sample_names(reader) end """ - find_group_label_indices(pheno) + find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by) find indices and determines names for group 1 and group 2 labels on plots. finds index of center of each sample group to place tick mark and label. """ function find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by) diff --git a/test/.DS_Store b/test/.DS_Store index e9bcd34..058d747 100644 Binary files a/test/.DS_Store and b/test/.DS_Store differ diff --git a/test/new_vcf_utils.jl b/test/new_vcf_utils.jl index a0ee61b..c1ee9fd 100644 --- a/test/new_vcf_utils.jl +++ b/test/new_vcf_utils.jl @@ -26,331 +26,238 @@ sample_names = get_sample_names(reader) @test df[3,1] == 2 end +#functions for variant filters + @testset "io_chromosome_range_vcf_filter" begin sub = io_chromosome_range_vcf_filter("chr4:0-400000000",reader) -println(sub[1:2]) -println(size(sub,2)) +@test typeof(sub) == Array{Any,1} +@test size(sub,1) == 1012 +#println("io_chromosome_range_vcf_filter type is $(typeof(sub))") +#println("io_chromosome_range_vcf_filter size is $(size(sub,1))") end -@testset "io_sig_list_vcf_filter" begin + +@testset "filters_with_siglist" begin @testset "load_siglist" begin - sig_list=load_siglist("test_files/significantList_for_proteinstructures.csv") - println(sig_list[2:1]) - println(size(sig_list,1)) + sig_list=load_siglist("test_files/significantList_for_proteinstructures.csv") + #println(sig_list[2:1]) + #println(size(sig_list,1)) @testset "clean_column1_siglist!" begin clean_column1_siglist!(sig_list) - println(sig_list[1,2]) - println(size(sig_list,1)) + #println(sig_list[1,2]) + #println(size(sig_list,1)) end - sub=io_sig_list_vcf_filter(sig_list,vcf_filename) - println(sub[1,5]) - - @testset "pass_chrrange_siglist_filter" begin - sub = pass_chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000") - println(sub[1,5]) + @testset "io_sig_list_vcf_filter" begin + sub=io_sig_list_vcf_filter(sig_list,vcf_filename) + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 13 + end - @testset pass_siglist_filter begin - sub = pass_siglist_filter(vcf_filename,sig_list) - end + @testset "pass_chrrange_siglist_filter" begin + sub = pass_chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000") + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 12 + end - @testset "chrrange_siglist_filter" begin - sub = chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000") - end + @testset "pass_siglist_filter" begin + sub = pass_siglist_filter(vcf_filename, sig_list) + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 12 + end + @testset "chrrange_siglist_filter" begin + sub = chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000") + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 13 + end - end + end end -end + @testset "io_pass_filter" begin + reader = VCF.Reader(open(vcf_filename, "r")) sub = io_pass_filter(reader) - println(sub[2,1]) + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 1164 end @testset "pass_chrrange_filter" begin + reader = VCF.Reader(open(vcf_filename, "r")) sub = pass_chrrange_filter(reader,"chr4:0-400000000") + @test (typeof(sub[1])) == GeneticVariation.VCF.Record + @test (length(sub)) == 856 end - - - - - -#= -#functions for variant filters - - #functions for converting vcf record array to numerical array +@testset "combined_all_genotype_array_functions" begin -""" - create_chr_dict() -creates dict for use in combined_all_genotype_array_functions() for removing 'chr' from chromosome labels to allow sorting variant records by chromosome position. -""" -@testset create_chr_dict() begin - -end - -""" - combined_all_genotype_array_functions(sub) -convert sub from variant filters to gt_num_array and gt_chromosome_labels for plot functions. -""" -@testset combined_all_genotype_array_functions(sub) begin - -end - -""" - combined_all_read_depth_array_functions(sub) -convert sub from variant filters to dp_num_array and dp_chromosome_labels for plot functions. -""" -@testset combined_all_read_depth_array_functions(sub) begin - -end - -""" - generate_genotype_array(record_sub::Array{Any,1},genotype_field::String) -Returns numerical array of genotype values (either genotype or read_depth values) which are translated by another function into num_array -Where genotype_field is either GT or DP to visualize genotype or read_depth -""" -@testset generate_genotype_array(record_sub::Array{Any,1},y) begin - -end - -""" - define_geno_dict() -returns dictionary of values for use in replace_genotype_with_vals() -""" -@testset define_geno_dict() begin - -end - -""" - translate_genotype_to_num_array(genotype_array,geno_dict) -returns a tuple of num_array for plotting, and chromosome labels for plotting as label bar. -Translates array of genotype values to numerical array of categorical values. -Genotype values are converted to categorical values. No_call=0, 0/0=400, heterozygous_variant=600, homozygous_variant=800 -""" -@testset translate_genotype_to_num_array(genotype_array,geno_dict) begin - -end - -""" - translate_readdepth_strings_to_num_array(read_depth_array::Array{Any,2}) -Returns array of read_depth as int for plotting and average calculation. -By default, read depth values over 100 are replaced with 100 to improve heatmap visualization (see read_depth_threshhold() ). -Where read_depth_array is output of generate_genotype_array() for DP option -returns a tuple of num_array type Int for average calculation and plotting, and chromosome labels for plotting as label bar -""" -@testset translate_readdepth_strings_to_num_array(read_depth_array::Array{Any,2}) begin - -end - - -#functions for sample filters - -""" - get_sample_names(reader) -returns sample ids of vcf file as a vector of symbols for naming columns of num_array dataframe object for column filter functions -""" -@testset get_sample_names(reader) begin - -end - -""" - find_group_label_indices(pheno) -find indices and determines names for group 1 and group 2 labels on plots. finds index of center of each sample group to place tick mark and label. -""" -@testset find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by) begin - -end - -""" - sortcols_by_phenotype_matrix(pheno_matrix_filename::String,trait_to_group_by::String,num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) -group samples by a common trait using a user generated key matrix ("phenotype matrix") -""" -@testset sortcols_by_phenotype_matrix(pheno_matrix_filename::String,trait_to_group_by::String, num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) begin - -end - -""" - select_columns(filename_sample_list::AbstractString, num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) -returns num_array with columns matching user generated list of sample ids to select for analysis. num_array now has sample ids in first row. -""" -@testset select_columns(filename_sample_list::AbstractString, num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) begin - -end - - -#functions for mathematic analysis -""" - avg_dp_samples(dp_num_array::Array{Int64,2}) -create sample_avg_list vector that lists averages of read depth for each sample for input into avg_sample_dp_line_chart(sample_avg_list) -dp_num_array must contain dp values as Int64 and be without chromosome position columns -""" -@testset avg_dp_samples(dp_num_array::Array{Int64,2}) begin - -end - - -""" - avg_dp_variant(dp_num_array::Array{Int64,2}) -create variant_avg_list vector that lists averages of read depth for each variant for input into avg_variant_dp_line_chart(variant_avg_list) -""" -@testset avg_dp_variant(dp_num_array::Array{Int64,2}) begin +reader = VCF.Reader(open(vcf_filename, "r")) +sub = io_pass_filter(reader) + +gt_num_array,gt_chromosome_labels=combined_all_genotype_array_functions(sub) +#println("combined_all_genotype_array_functions gt array is type: $(typeof(gt_num_array))") +#println("combined_all_genotype_array_functions gt array is length: $(length(gt_num_array))") +#println("combined_all_genotype_array_functions gt_chromosome_labels is typeof: $(typeof(gt_chromosome_labels))") +#println("combined_all_genotype_array_functions gt_chromosome_labels is length: $(length(gt_chromosome_labels))") +@test typeof(gt_num_array) == Array{Int64,2} +@test length(gt_num_array) == 222324 +@test typeof(gt_chromosome_labels) == Array{Any,2} +@test length(gt_chromosome_labels) == 2328 + + @testset "chromosome_label_generator" begin + chrom_label_info = ViVa.chromosome_label_generator(gt_chromosome_labels[:,1]) + #println("chromosome_label_generator chrom_label_info is type $(typeof(chrom_label_info))") + #println("chromosome_label_generator chrom_label_info is length $(length(chrom_label_info))") + @test typeof(chrom_label_info) == Tuple{Array{String,1},Array{Int64,1},String} + @test size(chrom_label_info,1) == 3 + end -end + @testset "generate_chromosome_positions_for_hover_labels" begin + chr_pos_tuple_list = generate_chromosome_positions_for_hover_labels(gt_chromosome_labels) + #println("generate_chromosome_positions_for_hover_labels chr_pos_tuple_list is type $(typeof(chr_pos_tuple_list))") + #println("generate_chromosome_positions_for_hover_labels chr_pos_tuple_list is length $(length(chr_pos_tuple_list))") + @test typeof(chr_pos_tuple_list) == Array{Tuple,1} + @test size(chr_pos_tuple_list,1) == 1164 + end -""" - list_sample_names_low_dp(sample_avg_list::Array{Float64,2},sample_names) -returns list of sample ids that have an average read depth of under 15 across all variant positions -""" -@testset list_sample_names_low_dp(sample_avg_list::Array{Float64,1},sample_names) begin + @testset "generate_genotype_array" begin + reader = VCF.Reader(open(vcf_filename, "r")) + sub = io_pass_filter(reader) + genotype_array=generate_genotype_array(sub,"GT") + + #println("generate_genotype_array is $(typeof(genotype_array))") + #println("generate_genotype_array is $(size(genotype_array,1))") + @test typeof(genotype_array) == Array{String,2} + @test size(genotype_array,1) == 1164 + + @testset "define_geno_dict" begin + geno_dict = define_geno_dict() + #println("define_geno_dict is type is $(typeof(geno_dict))") + #println("define_geno_dict is length is $(length(geno_dict))") + @test typeof(geno_dict) == Dict{Any,Any} + @test length(geno_dict) == 92 + + @testset "translate_genotype_to_num_array" begin + gt_num_array,gt_chromosome_labels=translate_genotype_to_num_array(genotype_array,geno_dict) + #println("translate_genotype_to_num_array gt_num_array type is $(typeof(gt_num_array))") + #println("translate_genotype_to_num_array gt_num_array length is $(length(gt_num_array))") + #println("translate_genotype_to_num_array gt_chromosome_labels typeof is $(typeof(gt_chromosome_labels))") + #println("translate_genotype_to_num_array gt_chromosome_labels length is $(length(gt_chromosome_labels))") + @test typeof(gt_num_array) == Array{Int64,2} + @test length(gt_num_array) == 222324 + @test typeof(gt_chromosome_labels) == Array{String,2} + @test length(gt_chromosome_labels) == 2328 + end + end + end end -""" - list_variant_positions_low_dp(variant_avg_list::Array{Float64,2},chrom_labels) -finds variant positions that have an average read depth of under 15 across all patients -""" -function list_variant_positions_low_dp(variant_avg_list::Array{Float64,1},chrom_labels) - - low_dp_index_list = Array{Int64}(0) +@testset "combined_all_read_depth_array_functions" begin - for item = 1:length(variant_avg_list) - if variant_avg_list[item] < 15 - push!(low_dp_index_list,item) +reader = VCF.Reader(open(vcf_filename, "r")) +sub = io_pass_filter(reader) +dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub) + +#println("combined_all_read_depth_array_functions dp_num_array type is $(typeof(dp_num_array))") +#println("combined_all_read_depth_array_functions dp_num_array length is $(length(dp_num_array))") +#println("combined_all_read_depth_array_functions gt_chromosome_labels typeof is $(typeof(dp_chromosome_labels))") +#println("combined_all_read_depth_array_functions gt_chromosome_labels length is $(length(dp_chromosome_labels))") +@test typeof(dp_num_array) == Array{Int64,2} +@test length(dp_num_array) == 222324 +@test typeof(dp_chromosome_labels) == Array{Any,2} +@test length(dp_chromosome_labels) == 2328 + + @testset "get_sample_names" begin + reader = VCF.Reader(open(vcf_filename, "r")) + sample_names=get_sample_names(reader) + #println("get_sample_names") + #println("get_sample_names sample_names type is $(typeof(sample_names))") + #println("get_sample_names sample_names length is $(length(sample_names))") + @test typeof(sample_names) == Array{Symbol,2} + @test length(sample_names) == 191 + + @testset "read_depth_threshhold" begin + dp_num_array=read_depth_threshhold(dp_num_array) + #println("read_depth_threshhold dp_num_array is type $(typeof(dp_num_array))") + #println("read_depth_threshhold dp_num_array is size $(size(dp_num_array,1))") + @test typeof(dp_num_array) == Array{Int64,2} + @test size(dp_num_array,1) == 1164 end - end - - low_dp_positions = Array{Tuple{Int64,Int64}}(0) - for i in low_dp_index_list - chrom_position = chrom_labels[i,1],chrom_labels[i,2] - push!(low_dp_positions,chrom_position) + @testset "avg_dp_samples" begin + avg_sample_list=avg_dp_samples(dp_num_array) + #println("avg_dp_samples avg_sample_list type is $(typeof(avg_sample_list))") + #println("avg_dp_samples avg_sample_list length is $(length(avg_sample_list))") + @test typeof(avg_sample_list) == Array{Float64,1} + @test size(avg_sample_list,1) == 191 + + @testset "list_sample_names_low_dp" begin + list=list_sample_names_low_dp(avg_sample_list,sample_names) + #println("list_sample_names_low_dp list type is $(typeof(list))") + #println("list_sample_names_low_dp list length is $(length(list))") + @test typeof(list) == Array{String,1} + @test size(list,1) == 6 end - return low_dp_positions -end - -#functions for producing objects for plot functions - -""" - read_depth_threshhold(dp_array::Array{Int64,2}) -sets ceiling for read depth values at dp = 100. All dp over 100 are set to 100 to visualize read depth values between 0 < dp > 100 in better definition -""" -function read_depth_threshhold(dp_array::Array{Int64,2}) - - dp_array[dp_array[:,:].>100].=100 - - return dp_array -end + end -""" - save_numerical_array(num_array::Matrix{Any},sample_names,chr_labels) -save numerical array with chr labels and sample ids to working directory -""" -function save_numerical_array(num_array,sample_names,chr_labels) + @testset "avg_dp_variant" begin + avg_variant_list=avg_dp_variant(dp_num_array) + #println("avg_dp_variant avg_variant_list type is $(typeof(avg_variant_list))") + #println("avg_dp_variant avg_variant_list length is $(length(avg_variant_list))") + @test typeof(avg_variant_list) == Array{Float64,1} + @test size(avg_variant_list,1) == 1164 - #samplenames=sample_names - #samplenames=Matrix(samplenames) + @testset "list_variant_positions_low_dp" begin + list=list_variant_positions_low_dp(avg_variant_list,dp_chromosome_labels) + #println("list_variant_positions_low_dp list type is $(typeof(list))") + #println("list_variant_positions_low_dp list length is $(length(list))") + @test typeof(list) == Array{Tuple{Int64,Int64},1} + @test size(list,1) == 33 - headings = hcat("chr","position") - sample_names = hcat(headings,sample_names) + end - chr_labeled_array_for_plotly=hcat(chr_labels, num_array) - labeled_value_matrix_withsamplenames= vcat(sample_names,chr_labeled_array_for_plotly) + end - writedlm("AC_gatk406_eh_PASS_withheader_value_matrix_.txt", labeled_value_matrix_withsamplenames, "\t") -end + @testset "sortcols_by_phenotype_matrix" begin + vcf,group_label_pack=sortcols_by_phenotype_matrix("test_files/sample_phenotype_matrix.csv","control,case", dp_num_array, sample_names) + #println("sortcols_by_phenotype_matrix vcf type is $(typeof(vcf))") + #println("sortcols_by_phenotype_matrix vcf size is $(size(vcf,1))") + #println("sortcols_by_phenotype_matrix group_label_pack type is $(typeof(group_label_pack))") + #println("sortcols_by_phenotype_matrix group_label_pack size is $(size(group_label_pack,1))") + @test typeof(vcf) == Array{Int64,2} + @test size(vcf,1) == 1164 + @test typeof(group_label_pack) == Array{Any,1} + @test size(group_label_pack,1) == 5 + + @testset "find_group_label_indices" begin + pheno = readdlm("test_files/sample_phenotype_matrix.csv", ',') + row_to_sort_by = find(x -> x == "control,case", pheno) + row_to_sort_by = row_to_sort_by[1] + group_label_pack=find_group_label_indices(pheno,"control,case",row_to_sort_by) + #println("find_group_label_indices group_label_pack type is $(typeof(group_label_pack))") + #println("find_group_label_indices group_label_pack length is $(length(group_label_pack))") + @test typeof(group_label_pack) == Array{Any,1} + @test size(group_label_pack,1) == 5 + end -""" - chromosome_label_generator(chromosome_labels::Array{String,2}) -Returns vector of chr labels and indices to mark chromosomes in plotly heatmap -Specifically, saves indexes and chrom labels in vectors to pass into heatmap function to ticvals and tictext respectively -Input is either gt_chromosome_labels or dp_chromosome_labels from translate_gt/dp_to_num_array() -""" - -function chromosome_label_generator(chromosome_labels::Array{Any,1}) - chrom_label_indices = findfirst.(map(a -> (y -> isequal(a, y)), unique(chromosome_labels)), [chromosome_labels]) - chrom_labels = unique(chromosome_labels) - chrom_labels = [string(i) for i in chrom_labels] - - if length(chrom_labels) > 1 - for item=2:(length(chrom_labels)) - - ratio=((chrom_label_indices[item])-(chrom_label_indices[item-1]))/(length(chromosome_labels)) - println(ratio) - - if ratio < 0.2 - font_size = "8" - println("font size is $font_size") - return chrom_labels,chrom_label_indices,font_size - else - font_size = "10" - println("font size is $font_size") - - return chrom_labels,chrom_label_indices,font_size - end + @testset "select_columns" begin + dp_num_array=select_columns("test_files/select_samples_list.txt", dp_num_array, sample_names) + #println("select_columns dp_num_array type is $(typeof(dp_num_array))") + #println("select_columns dp_num_array size is $(size(dp_num_array,1))") + @test typeof(dp_num_array) == Array{Int64,2} + @test size(dp_num_array,1) == 1164 end - else - font_size = "10" - return chrom_labels,chrom_label_indices,font_size end -end -""" - checkfor_outputdirectory(path::String) -Checks to see if output directory exists already. If it doesn't, it creates the new directory to write output files to. -""" -function checkfor_outputdirectory(path::String) - if isdir(path) == true - else - mkdir(path) end -end - -""" - generate_chromosome_positions_for_hover_labels(chr_labels::Array{Any,2}) -creates tuple of genomic locations to set as tick labels. This is automatically store chromosome positions in hover labels. However tick labels are set to hidden with showticklabels=false so they will not crowd the y axis. -""" -function generate_chromosome_positions_for_hover_labels(chr_labels::Array{Any,2}) - -returnXY_column1!(chr_labels) #not working yet -#println(chr_labels) - -chr_pos_tuple_list=Array{Tuple}(0) - - for row = 1:size(chr_labels,1) - - chr=chr_labels[row,1] - pos=chr_labels[row,2] - chr_pos_tuple=chr,pos - push!(chr_pos_tuple_list,chr_pos_tuple) end - return chr_pos_tuple_list -end - -""" - returnXY_column1!(chr_label_vector) -Replace String "23","24","25" with "X","Y","M" in chromosome label vector used for plot labels -""" -@testset "returnXY_column1!" begin - -end - -""" - sort_genotype_array(genotype_array) -sorts genotype array for GT or DP by chromosomal location -""" -@testset sort_genotype_array(genotype_array) begin - -end - - -=# end diff --git a/test/test_files/select_column_list.txt b/test/test_files/select_samples_list.txt similarity index 100% rename from test/test_files/select_column_list.txt rename to test/test_files/select_samples_list.txt diff --git a/viva_cli.jl b/viva_cli.jl index 8209662..6cfdd7a 100644 --- a/viva_cli.jl +++ b/viva_cli.jl @@ -1,8 +1,3 @@ -#= -read_depth heatmap up to 15 white, up to 30 light blue, -don't output files in same folder -=# - println("Welcome to ViVa.") println() println("Loading packages:") @@ -59,7 +54,7 @@ function test_parse_main(ARGS::Vector{String}) ) @add_arg_table s begin - "--vcf_file", "-f" # vcf filename + "--vcf_file", "-f" help = "vcf filename in format: file.vcf" arg_type = String required = true @@ -73,7 +68,7 @@ function test_parse_main(ARGS::Vector{String}) arg_type = String default = "output" - "--save_format", "-s" # format to save graphics in + "--save_format", "-s" help = "file format you wish to save graphics as (eg. pdf, html, png). Defaults to html" arg_type = String default = "html" @@ -129,9 +124,7 @@ end parsed_args = test_parse_main(ARGS) -#begin main program - -#filter vcf and load matrix +#filter vcf and load matrix of filtered vcf records vcf_filename = (parsed_args["vcf_file"]) println("Reading $vcf_filename ...") @@ -175,7 +168,7 @@ if parsed_args["show_stats"] == true end tic() - +#pass_filter if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] == nothing && parsed_args["positions_list"] == nothing sub = ViVa.io_pass_filter(reader) number_rows = size(sub,1) @@ -183,6 +176,7 @@ if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] == noth heatmap_input = "pass_filtered" end +#chr_range if parsed_args["chromosome_range"] != nothing && parsed_args["pass_filter"] == false && parsed_args["positions_list"] == nothing sub = ViVa.io_chromosome_range_vcf_filter(parsed_args["chromosome_range"],reader) number_rows = size(sub,1) @@ -190,6 +184,7 @@ if parsed_args["chromosome_range"] != nothing && parsed_args["pass_filter"] == f heatmap_input = "range_filtered" end +#list if parsed_args["positions_list"] != nothing && parsed_args["chromosome_range"] == nothing && parsed_args["pass_filter"] == false sig_list = load_siglist(parsed_args["positions_list"]) sub = ViVa.io_sig_list_vcf_filter(sig_list,vcf_filename) @@ -198,6 +193,7 @@ if parsed_args["positions_list"] != nothing && parsed_args["chromosome_range"] = heatmap_input = "positions_filtered" end +#pass_filter and chr_range and list if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] != nothing && parsed_args["positions_list"] != nothing sig_list = load_siglist(parsed_args["positions_list"]) sub = ViVa.pass_chrrange_siglist_filter(vcf_filename, sig_list, parsed_args["chromosome_range"]) @@ -205,12 +201,14 @@ if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] != noth println("selected $number_rows variants with Filter status: PASS, that match list of chromosome positions of interest, and are within chromosome range: $(parsed_args["chromosome_range"])") end +#pass_filter and chr_range if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] != nothing && parsed_args["positions_list"] == nothing sub = ViVa.pass_chrrange_filter(reader, parsed_args["chromosome_range"]) number_rows = size(sub,1) println("selected $number_rows variants with Filter status: PASS and are within chromosome range: $(parsed_args["chromosome_range"])") end +#pass_filter and list if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] == nothing && parsed_args["positions_list"] != nothing sig_list = load_siglist(parsed_args["positions_list"]) sub = ViVa.pass_siglist_filter(vcf_filename, sig_list) @@ -218,6 +216,15 @@ if parsed_args["pass_filter"] == true && parsed_args["chromosome_range"] == noth println("selected $number_rows variants with Filter status: PASS and that match list of chromosome positions of interest") end +#chr_range and list +if parsed_args["pass_filter"] == false && parsed_args["chromosome_range"] != nothing && parsed_args["positions_list"] != nothing + sig_list = load_siglist(parsed_args["positions_list"]) + sub = ViVa.chrrange_siglist_filter(vcf_filename, sig_list, parsed_args["chromosome_range"]) + number_rows = size(sub,1) + println("selected $number_rows variants that are within chromosome range: $(parsed_args["chromosome_range"]) and that match list of chromosome positions of interest") +end + +#no filters if parsed_args["pass_filter"] == false && parsed_args["chromosome_range"] == nothing && parsed_args["positions_list"] == nothing println("no filters applied. Large vcf files will take a long time to process and heatmap visualizations will lose resolution at this scale unless viewed in interactive html for zooming.") @@ -232,13 +239,22 @@ if parsed_args["pass_filter"] == false && parsed_args["chromosome_range"] == not end +println() +println("Finished Filtering. Total time to filter:") +toc() +println("_______________________________________________") + +#convert to numeric array for plotting and generate/save heatmaps and scatter plots + if parsed_args["heatmap"] == "genotype" gt_num_array,gt_chromosome_labels = combined_all_genotype_array_functions(sub) if parsed_args["heatmap_title"] != nothing title = parsed_args["heatmap_title"] else - title = "Genotype_$(parsed_args["vcf_file"])" + bn = Base.Filesystem.basename(parsed_args["vcf_file"]) + title = "Genotype_$bn" + println(title) end chr_pos_tuple_list = generate_chromosome_positions_for_hover_labels(gt_chromosome_labels) @@ -249,7 +265,9 @@ if parsed_args["heatmap"] == "genotype" if parsed_args["select_samples"] != nothing println("selecting samples listed in $(parsed_args["select_samples"])") - gt_num_array = select_columns(parsed_args["select_samples"], gt_num_array, sample_names) + gt_num_array = select_columns(parsed_args["select_samples"], + gt_num_array, + sample_names) end group_trait_matrix_filename=((parsed_args["group_samples"])[1]) @@ -268,6 +286,8 @@ if parsed_args["heatmap"] == "genotype" graphic = ViVa.genotype_heatmap2(gt_num_array,title,chrom_label_info,sample_names) end + println("saving now") + PlotlyJS.savefig(graphic, joinpath("$(parsed_args["output_directory"])" ,"$title.$(parsed_args["save_format"])"), js=:remote) end @@ -317,20 +337,10 @@ if parsed_args["heatmap"] == "read_depth" end -#println("_______________________________________________") -println() -println("Finished Filtering. Total time to filter:") -toc() -println("_______________________________________________") -#println() - if parsed_args["avg_dp"] == "sample" if isdefined(:dp_num_array) == false - read_depth_array = ViVa.generate_genotype_array(sub,"DP") - clean_column1!(read_depth_array) - read_depth_array=ViVa.sort_genotype_array(read_depth_array) - dp_num_array,dp_chromosome_labels = translate_readdepth_strings_to_num_array(read_depth_array) + dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub) end chr_pos_tuple_list = generate_chromosome_positions_for_hover_labels(dp_chromosome_labels) @@ -343,11 +353,9 @@ if parsed_args["avg_dp"] == "sample" PlotlyJS.savefig(graphic, joinpath("$(parsed_args["output_directory"])" ,"Average Sample Read Depth.$(parsed_args["save_format"])"), js=:remote) elseif parsed_args["avg_dp"] == "variant" + if isdefined(:dp_num_array) == false - read_depth_array = ViVa.generate_genotype_array(sub,"DP") - clean_column1!(read_depth_array) - read_depth_array=ViVa.sort_genotype_array(read_depth_array) - dp_num_array,dp_chromosome_labels = translate_readdepth_strings_to_num_array(read_depth_array) + dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub) end chr_pos_tuple_list = generate_chromosome_positions_for_hover_labels(dp_chromosome_labels)