GenomicsDB upgrade fixes (#7257)

* support to not remap missing to nonref for combine operations * test for soft masked regions showing up as N allele * remove genomicsdbimport progress meter, print samples in batch Co-authored-by: nalinigans <[email protected]>
broadinstitute · Jul 16, 2021 · 934a39d · 934a39d
1 parent 688515a
commit 934a39d
Show file tree

Hide file tree

Showing 10 changed files with 357 additions and 3 deletions.
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
@@ -598,6 +598,14 @@ public boolean requiresIntervals() {
         return false;
     }
 
+    /**
+     * Does this tool want to disable the progress meter? If so, override here to return true
+     * 
+     * @return true if this tools wants to disable progress meter output, otherwise false
+     */
+    public boolean disableProgressMeter() {
+        return false;
+    }
 
     /**
      * Get the {@link SequenceDictionaryValidationArgumentCollection} for the tool.
@@ -1073,9 +1081,11 @@ public void onTraversalStart() {}
     protected final Object doWork() {
         try {
             onTraversalStart();
-            progressMeter.start();
+            if (!disableProgressMeter()) {
+                progressMeter.start();
+            }
             traverse();
-            if (!progressMeter.stopped()) {
+            if (!disableProgressMeter() && !progressMeter.stopped()) {
                 progressMeter.stop();
             }
             return onTraversalSuccess();

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java
@@ -371,6 +371,11 @@ protected List<SimpleInterval> transformTraversalIntervals(final List<SimpleInte
         }
     }
 
+    @Override
+    public boolean disableProgressMeter() {
+        return true;
+    }
+
     @Override
     public int getDefaultCloudPrefetchBufferSize() {
         // Empirical testing has shown that this tool performs best at scale with cloud buffering
@@ -717,8 +722,20 @@ private Map<String, FeatureReader<VariantContext>> createSampleToReaderMap(
     }
 
     private Void logMessageOnBatchCompletion(final BatchCompletionCallbackFunctionArgument arg) {
-        progressMeter.update(null);
         logger.info("Done importing batch " + arg.batchCount + "/" + arg.totalBatchCount);
+        logger.debug("List of samples imported in batch " + arg.batchCount + ":");
+        int index = 0;
+        final int sampleCount = sampleNameToVcfPath.size();
+        final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? sampleCount : batchSize;
+        final int startBatch = (arg.batchCount - 1) * updatedBatchSize;
+        final int stopBatch = arg.batchCount * updatedBatchSize;
+        for(String key : sampleNameToVcfPath.keySet()) {
+            index++;
+            if (index <= startBatch || index > stopBatch) {
+                continue;
+            }
+            logger.debug("\t"+key);
+        }
         this.batchCount = arg.batchCount + 1;
         return null;
     }

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBUtils.java
@@ -80,6 +80,10 @@ public static void updateImportProtobufVidMapping(GenomicsDBImporter importer) {
                 GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY, ELEMENT_WISE_SUM);
         vidMapPB = updateAlleleSpecificINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
                 GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY, ELEMENT_WISE_INT_SUM);
+        vidMapPB = updateFieldSetDisableRemapMissingAlleleToNonRef(vidMapPB, fieldNameToIndexInVidFieldsList,
+                GATKVCFConstants.AS_RAW_RMS_MAPPING_QUALITY_KEY, true);
+        vidMapPB = updateFieldSetDisableRemapMissingAlleleToNonRef(vidMapPB, fieldNameToIndexInVidFieldsList,
+                GATKVCFConstants.AS_SB_TABLE_KEY, true);
 
         importer.updateProtobufVidMapping(vidMapPB);
     }
@@ -261,6 +265,36 @@ public static GenomicsDBVidMapProto.VidMappingPB updateAlleleSpecificINFOFieldCo
     }
 
     /**
+     * Update vid Protobuf field to set whether missing allele should be remapped with value from NON_REF
+     * @param vidMapPB input vid object
+     * @param fieldNameToIndexInVidFieldsList name to index in list
+     * @param fieldName INFO field name
+     * @param value boolean value - true to disable remapping missing value with value from NON_REF
+     * @return updated vid Protobuf object if field exists, else return original protobuf object
+     */
+    public static GenomicsDBVidMapProto.VidMappingPB updateFieldSetDisableRemapMissingAlleleToNonRef(
+            final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
+            final Map<String, Integer> fieldNameToIndexInVidFieldsList,
+            final String fieldName,
+            final boolean value)
+    {
+        int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName)
+                ? fieldNameToIndexInVidFieldsList.get(fieldName) : -1;
+        if(fieldIdx >= 0) {
+            //Would need to rebuild vidMapPB - so get top level builder first
+            GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
+            //To update the list element corresponding to fieldName, we get the builder for that specific list element
+            GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder infoBuilder =
+                    updatedVidMapBuilder.getFieldsBuilder(fieldIdx);
+
+            infoBuilder.setDisableRemapMissingWithNonRef(value);
+            //Rebuild full vidMap
+            return updatedVidMapBuilder.build();
+        }
+        return vidMapPB;
+    }
+
+    /*
      * Changes relative local file paths to be absolute file paths. Cloud Datastore paths are left unchanged
      * @param path to the resource
      * @return an absolute file path if the original path was a relative file path, otherwise the original path

diff --git a/.../java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java b/.../java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImportIntegrationTest.java
@@ -180,6 +180,7 @@ public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTes
     private static final int SEVERAL_CONTIGS = 7;
     private static final String MANY_CONTIGS_INTERVAL_PICARD_STYLE_EXPECTED =
             toolsTestDir + "GenomicsDBImport/Ptrichocarpa.v3.expected.interval_list";
+    private static String IUPAC_REF = publicTestDir + "/iupacFASTA.fasta";
 
     @Override
     public String getTestedClassName() {
@@ -447,6 +448,37 @@ public void testGenomicsDBAlleleSpecificAnnotationsInTheMiddleOfSpanningDeletion
                 new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"});
     }
 
+    @Test
+    public void testGenomicsDBNoRemapMissingToNonRef() throws IOException {
+        testGenomicsDBAgainstCombineGVCFs(Arrays.asList(COMBINEGVCFS_TEST_DIR+"NA12878.AS.NON_REF_remap_check.chr20snippet.g.vcf", 
+                COMBINEGVCFS_TEST_DIR+"NA12892.AS.chr20snippet.g.vcf"),
+                new ArrayList<SimpleInterval>(Arrays.asList(new SimpleInterval("20", 10433313, 10700000))),
+                b37_reference_20_21,
+                new String[]{"-G", "StandardAnnotation", "-G", "AS_StandardAnnotation"});
+    }
+
+    @Test
+    public void testGenomicsDBSoftMaskedRegion() throws IOException {
+        final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";
+        final List<String> vcfInputs = Arrays.asList(GENOMICSDB_TEST_DIR+"iupacTestSoftMasked.1.vcf",
+                GENOMICSDB_TEST_DIR+"iupacTestSoftMasked.2.vcf");
+        final List<SimpleInterval> intervals = Arrays.asList(new SimpleInterval("chr1", 1, 18000));
+
+        writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1);
+        checkNoNAlleleInRef(workspace, IUPAC_REF);
+    }
+
+    private void checkNoNAlleleInRef(final String workspace, final String referenceFile) throws IOException {
+        try(final FeatureReader<VariantContext> reader = getGenomicsDBFeatureReader(workspace, referenceFile)) {
+            final CloseableTribbleIterator<VariantContext> iterator = reader.iterator();
+            Assert.assertTrue(iterator.hasNext(), "expected to see a variant");
+            iterator.forEachRemaining(vc -> {
+                Allele refAllele = vc.getReference();
+                Assert.assertFalse(refAllele.basesMatch("N"), vc.getContig()+":"+Integer.toString(vc.getStart()));
+            });
+        }
+    }
+
     /**
      * Converts a list of large file paths into equivalent cloud paths
      * This must be done non-statically because any failure during static initialization results in hard to understand

diff --git a/.../resources/org/broadinstitute/hellbender/tools/GenomicsDBImport/iupacTestSoftMasked.1.vcf b/.../resources/org/broadinstitute/hellbender/tools/GenomicsDBImport/iupacTestSoftMasked.1.vcf
@@ -0,0 +1,36 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FILTER=<ID=LowQual,Description="Low quality">
+##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
+##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
+##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
+##GATKCommandLine=<ID=HaplotypeCaller,Version=3.1-1-g07a4bf8,Date="Fri Apr 04 07:31:03 EDT 2014",Epoch=1396611063032,CommandLineOptions="analysis_type=HaplotypeCaller input_file=[/seq/external-data/1kg/ACB/exome/HG01958/HG01958.bam] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/seq/picardtemp3/seq/sample_vcf/1kg_ACB/Exome/Homo_sapiens_assembly19/89742746-5001-317d-8b64-97264efbcfd9/scattered/temp_0001_of_10/scattered.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=true num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=LINEAR variant_index_parameter=128000 logging_level=INFO log_to_file=null help=false version=false likelihoodCalculationEngine=PairHMM heterogeneousKmerSizeResolution=COMBO_MIN graphOutput=null bamOutput=null bam_compression=null disable_bam_indexing=null generate_md5=null simplifyBAM=null bamWriterType=CALLED_HAPLOTYPES dbsnp=(RodBinding name= source=UNBOUND) dontTrimActiveRegions=false maxDiscARExtension=25 maxGGAARExtension=300 paddingAroundIndels=150 paddingAroundSNPs=20 comp=[] annotation=[ClippingRankSumTest, DepthPerSampleHC, StrandBiasBySample] excludeAnnotation=[SpanningDeletions, TandemRepeatAnnotator, ChromosomeCounts, FisherStrand, QualByDepth] heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY standard_min_confidence_threshold_for_calling=-0.0 standard_min_confidence_threshold_for_emitting=-0.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=3 input_prior=[] contamination_fraction_to_filter=0.002 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null kmerSize=[10, 25] dontIncreaseKmerSizesForCycles=false numPruningSamples=1 recoverDanglingHeads=false dontRecoverDanglingTails=false consensus=false emitRefConfidence=GVCF GVCFGQBands=[5, 20, 60] indelSizeToEliminateInRefModel=10 min_base_quality_score=10 minPruning=3 gcpHMM=10 includeUmappedReads=false useAllelesTrigger=false useFilteredReadsForAnnotations=false phredScaledGlobalReadMismappingRate=45 maxNumHaplotypesInPopulation=200 mergeVariantsViaLD=false pair_hmm_implementation=VECTOR_LOGLESS_CACHING keepRG=null justDetermineActiveRegions=false dontGenotype=false errorCorrectKmers=false debug=false debugGraphTransformations=false dontUseSoftClippedBases=false captureAssemblyFailureBAM=false allowCyclesInKmerGraphToGeneratePaths=false noFpga=false errorCorrectReads=false kmerLengthForReadErrorCorrection=25 minObservationsForKmerToBeSolid=20 pcr_indel_model=CONSERVATIVE activityProfileOut=null activeRegionOut=null activeRegionIn=null activeRegionExtension=null forceActive=false activeRegionMaxSize=null bandPassSigma=null min_mapping_quality_score=20 filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
+##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
+##GVCFBlock=minGQ=20(inclusive),maxGQ=60(exclusive)
+##GVCFBlock=minGQ=5(inclusive),maxGQ=20(exclusive)
+##GVCFBlock=minGQ=60(inclusive),maxGQ=2147483647(exclusive)
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
+##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##contig=<ID=chr1,length=100000>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	IUPAC1
+chr1	12000	.	CGG	C,<NON_REF>	.	.	END=12277;DS	GT:DP:GQ:MIN_DP:PL	0/0:3:0:0:0,0,0
+chr1	17385	db567;rs890	G	T,<NON_REF>	3302.77	PASS	BaseQRankSum=-2.074;ClippingRankSum=0.555;DP=120;MLEAC=2,7;MLEAF=1,0.7;MQ=29.82;RAW_MQ=2.5;MQ0=3;MQRankSum=-1.369;ReadPosRankSum=-0.101	GT:AD:DP:GQ:PL:SB:PGT:PID	1/1:0,120,37:120:99:3336,358,0,4536,958,7349:0,0,0,0:0|1:17385_G_T
diff --git a/...ources/org/broadinstitute/hellbender/tools/GenomicsDBImport/iupacTestSoftMasked.1.vcf.idx b/...ources/org/broadinstitute/hellbender/tools/GenomicsDBImport/iupacTestSoftMasked.1.vcf.idx