Skip to content

Commit

Permalink
GenomicsDB upgrade fixes (#7257)
Browse files Browse the repository at this point in the history
* support to not remap missing to nonref for combine operations
* test for soft masked regions showing up as N allele
* remove genomicsdbimport progress meter, print samples in batch

Co-authored-by: nalinigans <[email protected]>
  • Loading branch information
mlathara and nalinigans authored Jul 16, 2021
1 parent 688515a commit 934a39d
Show file tree
Hide file tree
Showing 10 changed files with 357 additions and 3 deletions.
14 changes: 12 additions & 2 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,14 @@ public boolean requiresIntervals() {
return false;
}

/**
* Does this tool want to disable the progress meter? If so, override here to return true
*
* @return true if this tools wants to disable progress meter output, otherwise false
*/
public boolean disableProgressMeter() {
return false;
}

/**
* Get the {@link SequenceDictionaryValidationArgumentCollection} for the tool.
Expand Down Expand Up @@ -1073,9 +1081,11 @@ public void onTraversalStart() {}
protected final Object doWork() {
try {
onTraversalStart();
progressMeter.start();
if (!disableProgressMeter()) {
progressMeter.start();
}
traverse();
if (!progressMeter.stopped()) {
if (!disableProgressMeter() && !progressMeter.stopped()) {
progressMeter.stop();
}
return onTraversalSuccess();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ protected List<SimpleInterval> transformTraversalIntervals(final List<SimpleInte
}
}

@Override
public boolean disableProgressMeter() {
return true;
}

@Override
public int getDefaultCloudPrefetchBufferSize() {
// Empirical testing has shown that this tool performs best at scale with cloud buffering
Expand Down Expand Up @@ -717,8 +722,20 @@ private Map<String, FeatureReader<VariantContext>> createSampleToReaderMap(
}

private Void logMessageOnBatchCompletion(final BatchCompletionCallbackFunctionArgument arg) {
progressMeter.update(null);
logger.info("Done importing batch " + arg.batchCount + "/" + arg.totalBatchCount);
logger.debug("List of samples imported in batch " + arg.batchCount + ":");
int index = 0;
final int sampleCount = sampleNameToVcfPath.size();
final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? sampleCount : batchSize;
final int startBatch = (arg.batchCount - 1) * updatedBatchSize;
final int stopBatch = arg.batchCount * updatedBatchSize;
for(String key : sampleNameToVcfPath.keySet()) {
index++;
if (index <= startBatch || index > stopBatch) {
continue;
}
logger.debug("\t"+key);
}
this.batchCount = arg.batchCount + 1;
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ public static void updateImportProtobufVidMapping(GenomicsDBImporter importer) {
GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, ELEMENT_WISE_SUM);
vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY, ELEMENT_WISE_INT_SUM);
vidMapPB = updateFieldSetDisableRemapMissingAlleleToNonRef(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.AS_RAW_RMS_MAPPING_QUALITY_KEY, true);
vidMapPB = updateFieldSetDisableRemapMissingAlleleToNonRef(vidMapPB, fieldNameToIndexInVidFieldsList,
GATKVCFConstants.AS_SB_TABLE_KEY, true);

importer.updateProtobufVidMapping(vidMapPB);
}
Expand Down Expand Up @@ -261,6 +265,36 @@ public static GenomicsDBVidMapProto.VidMappingPB updateAlleleSpecificINFOFieldCo
}

/**
* Update vid Protobuf field to set whether missing allele should be remapped with value from NON_REF
* @param vidMapPB input vid object
* @param fieldNameToIndexInVidFieldsList name to index in list
* @param fieldName INFO field name
* @param value boolean value - true to disable remapping missing value with value from NON_REF
* @return updated vid Protobuf object if field exists, else return original protobuf object
*/
public static GenomicsDBVidMapProto.VidMappingPB updateFieldSetDisableRemapMissingAlleleToNonRef(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
final Map<String, Integer> fieldNameToIndexInVidFieldsList,
final String fieldName,
final boolean value)
{
int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName)
? fieldNameToIndexInVidFieldsList.get(fieldName) : -1;
if(fieldIdx >= 0) {
//Would need to rebuild vidMapPB - so get top level builder first
GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
//To update the list element corresponding to fieldName, we get the builder for that specific list element
GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder infoBuilder =
updatedVidMapBuilder.getFieldsBuilder(fieldIdx);

infoBuilder.setDisableRemapMissingWithNonRef(value);
//Rebuild full vidMap
return updatedVidMapBuilder.build();
}
return vidMapPB;
}

/*
* Changes relative local file paths to be absolute file paths. Cloud Datastore paths are left unchanged
* @param path to the resource
* @return an absolute file path if the original path was a relative file path, otherwise the original path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTes
private static final int SEVERAL_CONTIGS = 7;
private static final String MANY_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED =
toolsTestDir + "GenomicsDBImport/Ptrichocarpa.v3.expected.interval_list";
private static String IUPAC_REF = publicTestDir + "/iupacFASTA.fasta";

@Override
public String getTestedClassName() {
Expand Down Expand Up @@ -447,6 +448,37 @@ public void testGenomicsDBAlleleSpecificAnnotationsInTheMiddleOfSpanningDeletion
new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"});
}

@Test
public void testGenomicsDBNoRemapMissingToNonRef() throws IOException {
testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.NON_REF_remap_check.chr20snippet.g.vcf",
COMBINEGVCFS_TEST_DIR+"NA12892.AS.chr20snippet.g.vcf"),
new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10433313, 10700000))),
b37_reference_20_21,
new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"});
}

@Test
public void testGenomicsDBSoftMaskedRegion() throws IOException {
final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
final List<String> vcfInputs = Arrays.asList(GENOMICSDB_TEST_DIR+"iupacTestSoftMasked.1.vcf",
GENOMICSDB_TEST_DIR+"iupacTestSoftMasked.2.vcf");
final List<SimpleInterval> intervals = Arrays.asList(new SimpleInterval("chr1", 1, 18000));

writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1);
checkNoNAlleleInRef(workspace, IUPAC_REF);
}

private void checkNoNAlleleInRef(final String workspace, final String referenceFile) throws IOException {
try(final FeatureReader<VariantContext> reader = getGenomicsDBFeatureReader(workspace, referenceFile)) {
final CloseableTribbleIterator<VariantContext> iterator = reader.iterator();
Assert.assertTrue(iterator.hasNext(), "expected to see a variant");
iterator.forEachRemaining(vc -> {
Allele refAllele = vc.getReference();
Assert.assertFalse(refAllele.basesMatch("N"), vc.getContig()+":"+Integer.toString(vc.getStart()));
});
}
}

/**
* Converts a list of large file paths into equivalent cloud paths
* This must be done non-statically because any failure during static initialization results in hard to understand
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=HaplotypeCaller,Version=3.1-1-g07a4bf8,Date="Fri Apr 04 07:31:03 EDT 2014",Epoch=1396611063032,CommandLineOptions="analysis_type=HaplotypeCaller input_file=[/seq/external-data/1kg/ACB/exome/HG01958/HG01958.bam] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/seq/picardtemp3/seq/sample_vcf/1kg_ACB/Exome/Homo_sapiens_assembly19/89742746-5001-317d-8b64-97264efbcfd9/scattered/temp_0001_of_10/scattered.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=true num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=LINEAR variant_index_parameter=128000 logging_level=INFO log_to_file=null help=false version=false likelihoodCalculationEngine=PairHMM heterogeneousKmerSizeResolution=COMBO_MIN graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES dbsnp=(RodBinding name= source=UNBOUND) dontTrimActiveRegions=false maxDiscARExtension=25 maxGGAARExtension=300 paddingAroundIndels=150 paddingAroundSNPs=20 comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC, StrandBiasBySample] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator, ChromosomeCounts, FisherStrand, QualByDepth] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY standard_min_confidence_threshold_for_calling=-0.0 standard_min_confidence_threshold_for_emitting=-0.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=3 input_prior=[] contamination_fraction_to_filter=0.002 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=1 recoverDanglingHeads=false dontRecoverDanglingTails=false consensus=false emitRefConfidence=GVCF GVCFGQBands=[5, 20, 60] indelSizeToEliminateInRefModel=10 min_base_quality_score=10 minPruning=3 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=200 mergeVariantsViaLD=false pair_hmm_implementation=VECTOR_LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false noFpga=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 pcr_indel_model=CONSERVATIVE activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##GVCFBlock=minGQ=20(inclusive),maxGQ=60(exclusive)
##GVCFBlock=minGQ=5(inclusive),maxGQ=20(exclusive)
##GVCFBlock=minGQ=60(inclusive),maxGQ=2147483647(exclusive)
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr1,length=100000>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT IUPAC1
chr1 12000 . CGG C,<NON_REF> . . END=12277;DS GT:DP:GQ:MIN_DP:PL 0/0:3:0:0:0,0,0
chr1 17385 db567;rs890 G T,<NON_REF> 3302.77 PASS BaseQRankSum=-2.074;ClippingRankSum=0.555;DP=120;MLEAC=2,7;MLEAF=1,0.7;MQ=29.82;RAW_MQ=2.5;MQ0=3;MQRankSum=-1.369;ReadPosRankSum=-0.101 GT:AD:DP:GQ:PL:SB:PGT:PID 1/1:0,120,37:120:99:3336,358,0,4536,958,7349:0,0,0,0:0|1:17385_G_T
Binary file not shown.
Loading

0 comments on commit 934a39d

Please sign in to comment.