Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Finished most tests. Fixed siglist filter functions. #8

Merged
merged 6 commits into from
Nov 5, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/ViVa.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ using DataFrames #use CSV.jl ? depwarnings
using PlotlyJS
using Rsvg
using Blink
using CSV
#using CSV
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#using CSV

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're not using CSV, just remove that line instead of commenting it out.

using GeneticVariation
using ArgParse
using VCFTools
Expand Down
13 changes: 7 additions & 6 deletions src/vcf_utils_complete.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ function sort_genotype_array(genotype_array)
data=genotype_array[:,3:size(genotype_array,2)]
chrom_positions = [parse(Int, i) for i in genotype_array[:,1:2]]
genotype_array = hcat(chrom_positions,data)

genotype_array = sortrows(genotype_array, by=x->(x[1],x[2]))

return genotype_array
Expand Down Expand Up @@ -122,24 +121,28 @@ function io_sig_list_vcf_filter(sig_list,vcf_filename)
pos=(sig_list[row,2])

reader = VCF.Reader(open(vcf_filename, "r"))

tic()
for record in reader

if typeof(VCF.chrom(record)) == String
chr = string(chr)

if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos)
push!(vcf_subarray,record)
end


end#remove this if need code below
#=
else

if (VCF.chrom(record) == chr) && (VCF.pos(record) == pos)
push!(vcf_subarray,record)

end
end
=#
end
toc()
end

return vcf_subarray
Expand Down Expand Up @@ -367,9 +370,7 @@ convert sub from variant filters to gt_num_array and gt_chromosome_labels for pl
"""
function combined_all_genotype_array_functions(sub)
genotype_array = generate_genotype_array(sub,"GT")

map!(s->replace(s, "chr", ""), genotype_array, genotype_array)

clean_column1!(genotype_array)
genotype_array=ViVa.sort_genotype_array(genotype_array)
geno_dict = define_geno_dict()
Expand Down Expand Up @@ -508,7 +509,7 @@ function get_sample_names(reader)
end

"""
find_group_label_indices(pheno)
find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by)
Finds the indices and determines the names for the group 1 and group 2 labels on plots. Finds the index of the center of each sample group to place the tick mark and label.
"""
function find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by)
Expand Down
Binary file modified test/.DS_Store
Binary file not shown.
244 changes: 111 additions & 133 deletions test/new_vcf_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,189 +26,167 @@ sample_names = get_sample_names(reader)
@test df[3,1] == 2
end

#functions for variant filters

@testset "io_chromosome_range_vcf_filter" begin
sub = io_chromosome_range_vcf_filter("chr4:0-400000000",reader)
println(sub[1:2])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the print statements from the tests and replace them with @test assertions.

println(size(sub,2))
end

@testset "io_sig_list_vcf_filter" begin
#=
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this commented out?

@testset "filters_with_siglist" begin

@testset "load_siglist" begin
sig_list=load_siglist("test_files/significantList_for_proteinstructures.csv")
println(sig_list[2:1])
println(size(sig_list,1))
sig_list=load_siglist("test_files/significantList_for_proteinstructures.csv")
println(sig_list[2:1])
println(size(sig_list,1))

@testset "clean_column1_siglist!" begin
clean_column1_siglist!(sig_list)
println(sig_list[1,2])
println(size(sig_list,1))
end

sub=io_sig_list_vcf_filter(sig_list,vcf_filename)
println(sub[1,5])

@testset "pass_chrrange_siglist_filter" begin
sub = pass_chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000")
println(sub[1,5])
@testset "io_sig_list_vcf_filter" begin
sub=io_sig_list_vcf_filter(sig_list,vcf_filename)
@test (typeof(sub[1])) == GeneticVariation.VCF.Record
@test (length(sub)) == 13
end

@testset pass_siglist_filter begin
sub = pass_siglist_filter(vcf_filename,sig_list)
end
@testset "pass_chrrange_siglist_filter" begin
sub = pass_chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000")
@test (typeof(sub[1])) == GeneticVariation.VCF.Record
@test (length(sub)) == 12
end

@testset "chrrange_siglist_filter" begin
sub = chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000")
end
@testset "pass_siglist_filter" begin
sub = pass_siglist_filter(vcf_filename, sig_list)
@test (typeof(sub[1])) == GeneticVariation.VCF.Record
@test (length(sub)) == 12
end

@testset "chrrange_siglist_filter" begin
sub = chrrange_siglist_filter(vcf_filename,sig_list,"chr4:0-400000000")
@test (typeof(sub[1])) == GeneticVariation.VCF.Record
@test (length(sub)) == 13
end

end
end

end
end
=#

@testset "io_pass_filter" begin
reader = VCF.Reader(open(vcf_filename, "r"))
sub = io_pass_filter(reader)
println(sub[2,1])
@test (typeof(sub[1])) == GeneticVariation.VCF.Record
@test (length(sub)) == 1164
end

# Test set for pass_chrrange_filter: opens a new VCF reader on vcf_filename,
# filters it with the range string "chr4:0-400000000", then checks the type of
# the first returned element and the number of elements returned.
# NOTE(review): the stream opened here is not closed within this block.
@testset "pass_chrrange_filter" begin
reader = VCF.Reader(open(vcf_filename, "r")) # vcf_filename is defined outside this block
sub = pass_chrrange_filter(reader,"chr4:0-400000000")
@test (typeof(sub[1])) == GeneticVariation.VCF.Record
@test (length(sub)) == 856 # expected count presumably depends on the test VCF file; re-check if the fixture changes
end






#=
#functions for variant filters


#functions for converting vcf record array to numerical array
@testset "combined_all_genotype_array_functions" begin
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The next few test sets have no tests, only print statements. Make sure every test set actually tests something. Also, it looks like there are nested test sets here; is this necessary? Try to use indentation to match the begin and end statements.


"""
create_chr_dict()
creates dict for use in combined_all_genotype_array_functions() for removing 'chr' from chromosome labels to allow sorting variant records by chromosome position.
"""
@testset create_chr_dict() begin

end

"""
combined_all_genotype_array_functions(sub)
convert sub from variant filters to gt_num_array and gt_chromosome_labels for plot functions.
"""
@testset combined_all_genotype_array_functions(sub) begin

end

"""
combined_all_read_depth_array_functions(sub)
convert sub from variant filters to dp_num_array and dp_chromosome_labels for plot functions.
"""
@testset combined_all_read_depth_array_functions(sub) begin

end

"""
generate_genotype_array(record_sub::Array{Any,1},genotype_field::String)
Returns numerical array of genotype values (either genotype or read_depth values) which are translated by another function into num_array
Where genotype_field is either GT or DP to visualize genotype or read_depth
"""
@testset generate_genotype_array(record_sub::Array{Any,1},y) begin

end

"""
define_geno_dict()
returns dictionary of values for use in replace_genotype_with_vals()
"""
@testset define_geno_dict() begin
reader = VCF.Reader(open(vcf_filename, "r"))
sub = io_pass_filter(reader)

end
gt_num_array,gt_chromosome_labels=combined_all_genotype_array_functions(sub)
println(typeof(gt_num_array))
println(length(gt_num_array))
println(typeof(gt_chromosome_labels))
println(length(gt_chromosome_labels))

"""
translate_genotype_to_num_array(genotype_array,geno_dict)
returns a tuple of num_array for plotting, and chromosome labels for plotting as label bar.
Translates array of genotype values to numerical array of categorical values.
Genotype values are converted to categorical values. No_call=0, 0/0=400, heterozygous_variant=600, homozygous_variant=800
"""
@testset translate_genotype_to_num_array(genotype_array,geno_dict) begin
@testset "generate_genotype_array" begin
reader = VCF.Reader(open(vcf_filename, "r"))
sub = io_pass_filter(reader)
genotype_array=generate_genotype_array(sub,"GT")
println(typeof(genotype_array))
println(length(genotype_array))
println(genotype_array[3:5])

@testset "define_geno_dict" begin
geno_dict = define_geno_dict()
println(typeof(geno_dict))
println(length(geno_dict))

@testset "translate_genotype_to_num_array" begin
gt_num_array,gt_chromosome_labels=translate_genotype_to_num_array(genotype_array,geno_dict)
println(typeof(gt_num_array))
println(length(gt_num_array))
println(typeof(gt_chromosome_labels))
println(length(gt_chromosome_labels))
end
end
end

end

"""
translate_readdepth_strings_to_num_array(read_depth_array::Array{Any,2})
Returns array of read_depth as int for plotting and average calculation.
By default, read depth values over 100 are replaced with 100 to improve heatmap visualization (see read_depth_threshhold() ).
Where read_depth_array is output of generate_genotype_array() for DP option
returns a tuple of num_array type Int for average calculation and plotting, and chromosome labels for plotting as label bar
"""
@testset translate_readdepth_strings_to_num_array(read_depth_array::Array{Any,2}) begin
@testset "combined_all_read_depth_array_functions" begin #inside functions same used in combined_all_genotype_array_functions

reader = VCF.Reader(open(vcf_filename, "r"))
sub = io_pass_filter(reader)
dp_num_array,dp_chromosome_labels=combined_all_read_depth_array_functions(sub)
println(typeof(dp_num_array))
println(length(dp_num_array))
println(typeof(dp_chromosome_labels))
println(length(dp_chromosome_labels))

@testset "get_sample_names" begin
reader = VCF.Reader(open(vcf_filename, "r"))
sample_names=get_sample_names(reader)
println("get_sample_names")
println(typeof(sample_names))
println(length(sample_names))

@testset "avg_dp_samples" begin
avg_sample_list=avg_dp_samples(dp_num_array)
println("avg_sample_list is $avg_sample_list")

@testset "list_sample_names_low_dp" begin
list=list_sample_names_low_dp(avg_sample_list,sample_names)
println(list)
end


#functions for sample filters

"""
get_sample_names(reader)
returns sample ids of vcf file as a vector of symbols for naming columns of num_array dataframe object for column filter functions
"""
@testset get_sample_names(reader) begin

end

"""
find_group_label_indices(pheno)
find indices and determines names for group 1 and group 2 labels on plots. finds index of center of each sample group to place tick mark and label.
"""
@testset find_group_label_indices(pheno,trait_to_group_by,row_to_sort_by) begin

end

"""
sortcols_by_phenotype_matrix(pheno_matrix_filename::String,trait_to_group_by::String,num_array::Array{Int64,2}, sample_names::Array{Symbol,2})
group samples by a common trait using a user generated key matrix ("phenotype matrix")
"""
@testset sortcols_by_phenotype_matrix(pheno_matrix_filename::String,trait_to_group_by::String, num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) begin

end

"""
select_columns(filename_sample_list::AbstractString, num_array::Array{Int64,2}, sample_names::Array{Symbol,2})
returns num_array with columns matching user generated list of sample ids to select for analysis. num_array now has sample ids in first row.
"""
@testset select_columns(filename_sample_list::AbstractString, num_array::Array{Int64,2}, sample_names::Array{Symbol,2}) begin

@testset "avg_dp_variant" begin
avg_variant_list=avg_dp_variant(dp_num_array)
println("avg_dp_variant is $avg_variant_list")
end

@testset "sortcols_by_phenotype_matrix" begin
vcf,group_label_pack=sortcols_by_phenotype_matrix("test_files/sample_phenotype_matrix.csv","control,case", dp_num_array, sample_names)
println(typeof(vcf))
println(size(vcf,1))
println(typeof(group_label_pack))
println(length(group_label_pack))

@testset "find_group_label_indices" begin
pheno = readdlm("test_files/sample_phenotype_matrix.csv", ',')
row_to_sort_by = find(x -> x == "control,case", pheno)
row_to_sort_by = row_to_sort_by[1]
group_label_pack=find_group_label_indices(pheno,"control,case",row_to_sort_by)
println(typeof(group_label_pack))
println(length(group_label_pack))
end

#functions for mathematic analysis
"""
avg_dp_samples(dp_num_array::Array{Int64,2})
create sample_avg_list vector that lists averages of read depth for each sample for input into avg_sample_dp_line_chart(sample_avg_list)
dp_num_array must contain dp values as Int64 and be without chromosome position columns
"""
@testset avg_dp_samples(dp_num_array::Array{Int64,2}) begin
@testset "select_columns" begin
dp_num_array=select_columns("test_files/select_samples_list.txt", dp_num_array, sample_names)
println(typeof(dp_num_array))
println(length(dp_num_array))
end

end


"""
avg_dp_variant(dp_num_array::Array{Int64,2})
create variant_avg_list vector that lists averages of read depth for each variant for input into avg_variant_dp_line_chart(variant_avg_list)
"""
@testset avg_dp_variant(dp_num_array::Array{Int64,2}) begin

end

"""
list_sample_names_low_dp(sample_avg_list::Array{Float64,2},sample_names)
returns list of sample ids that have an average read depth of under 15 across all variant positions
"""
@testset list_sample_names_low_dp(sample_avg_list::Array{Float64,1},sample_names) begin

end

"""
Expand Down
Loading