-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GenomicsDB upgrade fixes #7257
GenomicsDB upgrade fixes #7257
Changes from all commits
d66f3d4
6380374
c4bb467
824e3ca
75be1de
bcd7f26
f1a013e
34b8fed
05a0171
b8b1c11
559955c
fb09132
93a8c03
81a37f1
2be62f5
6550f6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -371,6 +371,11 @@ protected List<SimpleInterval> transformTraversalIntervals(final List<SimpleInte | |
} | ||
} | ||
|
||
@Override | ||
public boolean disableProgressMeter() { | ||
return true; | ||
} | ||
|
||
@Override | ||
public int getDefaultCloudPrefetchBufferSize() { | ||
// Empirical testing has shown that this tool performs best at scale with cloud buffering | ||
|
@@ -717,8 +722,20 @@ private Map<String, FeatureReader<VariantContext>> createSampleToReaderMap( | |
} | ||
|
||
private Void logMessageOnBatchCompletion(final BatchCompletionCallbackFunctionArgument arg) { | ||
progressMeter.update(null); | ||
logger.info("Done importing batch " + arg.batchCount + "/" + arg.totalBatchCount); | ||
logger.debug("List of samples imported in batch " + arg.batchCount + ":"); | ||
int index = 0; | ||
final int sampleCount = sampleNameToVcfPath.size(); | ||
final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? sampleCount : batchSize; | ||
final int startBatch = (arg.batchCount - 1) * updatedBatchSize; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see we decrement startBatch here but then we increment for the new batch later. Just checking to make sure we're not off by 1 in one direction. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think there is an off by 1 issue here. Minus 1 here just to get the upper and lower bounds |
||
final int stopBatch = arg.batchCount * updatedBatchSize; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like these calculations could be moved into the batch argument class itself but maybe that's not worth it since they're only used here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Left these since they're only used here... |
||
for(String key : sampleNameToVcfPath.keySet()) { | ||
index++; | ||
if (index <= startBatch || index > stopBatch) { | ||
continue; | ||
} | ||
logger.debug("\t"+key); | ||
} | ||
this.batchCount = arg.batchCount + 1; | ||
return null; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
##fileformat=VCFv4.1 | ||
##FILTER=<ID=PASS,Description="All filters passed"> | ||
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location"> | ||
##FILTER=<ID=LowQual,Description="Low quality"> | ||
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another"> | ||
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group"> | ||
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> | ||
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> | ||
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."> | ||
##GATKCommandLine=<ID=HaplotypeCaller,Version=3.1-1-g07a4bf8,Date="Fri Apr 04 07:31:03 EDT 2014",Epoch=1396611063032,CommandLineOptions="analysis_type=HaplotypeCaller input_file=[/seq/external-data/1kg/ACB/exome/HG01958/HG01958.bam] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/seq/picardtemp3/seq/sample_vcf/1kg_ACB/Exome/Homo_sapiens_assembly19/89742746-5001-317d-8b64-97264efbcfd9/scattered/temp_0001_of_10/scattered.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=true num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=LINEAR variant_index_parameter=128000 logging_level=INFO log_to_file=null help=false version=false likelihoodCalculationEngine=PairHMM heterogeneousKmerSizeResolution=COMBO_MIN graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES dbsnp=(RodBinding name= source=UNBOUND) dontTrimActiveRegions=false maxDiscARExtension=25 maxGGAARExtension=300 paddingAroundIndels=150 paddingAroundSNPs=20 comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC, StrandBiasBySample] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator, ChromosomeCounts, FisherStrand, QualByDepth] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY standard_min_confidence_threshold_for_calling=-0.0 standard_min_confidence_threshold_for_emitting=-0.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=3 input_prior=[] contamination_fraction_to_filter=0.002 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=1 recoverDanglingHeads=false dontRecoverDanglingTails=false consensus=false emitRefConfidence=GVCF GVCFGQBands=[5, 20, 60] indelSizeToEliminateInRefModel=10 min_base_quality_score=10 minPruning=3 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=200 mergeVariantsViaLD=false pair_hmm_implementation=VECTOR_LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false noFpga=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 pcr_indel_model=CONSERVATIVE activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"> | ||
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive) | ||
##GVCFBlock=minGQ=20(inclusive),maxGQ=60(exclusive) | ||
##GVCFBlock=minGQ=5(inclusive),maxGQ=20(exclusive) | ||
##GVCFBlock=minGQ=60(inclusive),maxGQ=2147483647(exclusive) | ||
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> | ||
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?"> | ||
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> | ||
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> | ||
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"> | ||
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> | ||
##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality"> | ||
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads"> | ||
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> | ||
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"> | ||
##contig=<ID=chr1,length=100000> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT IUPAC1 | ||
chr1 12000 . CGG C,<NON_REF> . . END=12277;DS GT:DP:GQ:MIN_DP:PL 0/0:3:0:0:0,0,0 | ||
chr1 17385 db567;rs890 G T,<NON_REF> 3302.77 PASS BaseQRankSum=-2.074;ClippingRankSum=0.555;DP=120;MLEAC=2,7;MLEAF=1,0.7;MQ=29.82;RAW_MQ=2.5;MQ0=3;MQRankSum=-1.369;ReadPosRankSum=-0.101 GT:AD:DP:GQ:PL:SB:PGT:PID 1/1:0,120,37:120:99:3336,358,0,4536,958,7349:0,0,0,0:0|1:17385_G_T |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are some issues with integrating this new method into the various traversals, but I will address those myself in a separate PR after this is merged.