Skip to content

Commit

Permalink
Disentangle non-GVS code from GVS code [VS-834] (#8229)
Browse files Browse the repository at this point in the history
  • Loading branch information
mcovarr authored Mar 3, 2023
1 parent 6d41adf commit 3a7f6e2
Show file tree
Hide file tree
Showing 30 changed files with 138 additions and 400 deletions.
3 changes: 1 addition & 2 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -216,15 +216,14 @@ workflows:
branches:
- master
- ah_var_store
- vs_639_hail_testing_spike
- name: GvsQuickstartIntegration
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
filters:
branches:
- master
- ah_var_store
- vs_639_hail_testing_spike
- vs_834_disentangle
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
Expand Down
96 changes: 96 additions & 0 deletions scripts/variantstore/repo/generate_git_filter_repo_command.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env bash
#
# VS-834 Generate the `git filter-repo` command for moving files and history from the ah_var_store branch of `gatk`
# to the `variantstore` repo. Find files that were/are:
#
# * Added on ah_var_store after it branched from master.
# * Not on master (exclude files cherry picked from master).
# * Were never deleted from master (exclude files cherry picked from master that were subsequently deleted).

# Debug only
# PS4='\D{+%F %T} \w $ '
# set -o errexit -o nounset -o pipefail -o xtrace

# Make the variants branch a variable to be able to experiment with the effect of moving files around on a branch
# of ah_var_store.
# variants_branch=ah_var_store
variants_branch=vs_834_disentangle

files_on_master() {
git ls-tree -r --name-only master
}


ah_var_store_branch_point() {
# Finds the commit at which ah_var_store branched off master.
# https://stackoverflow.com/a/4991675/21269164
# The more obvious `git merge-base --fork-point master ah_var_store` does not work, exits 1 with no output.
diff -u <(git rev-list --first-parent $variants_branch) <(git rev-list --first-parent master) |
sed -ne 's/^ //p' | head -1
}


files_added_on_ah_var_store() {
# Look for files added to ah_var_store since the branch point. Note that these files were not necessarily *uniquely*
# added to ah_var_store and might represent cherry picks from master (e.g. scripts and build files for the migration
# from Travis to GitHub Actions, VQSR Lite work, etc.)
git diff "$(ah_var_store_branch_point)" $variants_branch --name-status | grep -E '^A' | cut -f 2-
}


files_added_on_ah_var_store_not_on_master() {
# `comm` excluding files only on master and files on master that were also added on ah_var_store
comm -1 -3 <(files_on_master) <(files_added_on_ah_var_store)
}


files_deleted_from_master() {
# This intentionally does not use `git diff` as is used in `files_added_on_ah_var_store` since that would only show
# files deleted from the branch point to the head of master. There are numerous files here (mostly related to VQSR
# Lite) where files added to master after the branch point were cherry picked onto ah_var_store and subsequently
# deleted from master. This `git log` finds these while the `git diff` does not.
#
# https://waylonwalker.com/git-find-deleted-files/#git-log-diff-filter
# That `sort` at the end is critical to avoid confusing `comm`
git log master --diff-filter D --pretty="format:" --name-only | sed '/^$/d' | sort
}


files_to_move() {
# `comm` excluding files deleted from master from our previous working set of files.
comm -1 -3 <(files_deleted_from_master) <(files_added_on_ah_var_store_not_on_master)
}

paths_to_move() {
# Call out the paths that we want to move from gatk to variantstore.
# Partition into paths that are unique to Variants (containing '/gvs/' or '/variantstore/') and those that are not.
# Use perl for the non-greedy `*?` quantifier since we're trying to grab as short of a 'variantstore' or 'gvs' path
# prefix as possible.
files_to_move | perl -ne 'if (/(.*?(\/gvs\/|\/variantstore\/)).*/) { print "$1\n" } else { print "$_\n" }' |
sed '/^$/d' | sort -u
}

# Files that were legitimately created on ah_var_store but currently need to stay in master barring some significant
# refactoring:
KEEP_ON_MASTER=(
# Added on ah_var_store but referenced by BigQueryUtils which was created on master before the branch point.
src/main/java/org/broadinstitute/hellbender/utils/bigquery/BigQueryResultAndStatistics.java
# Same
src/main/java/org/broadinstitute/hellbender/utils/bigquery/StorageAPIAvroReaderAndBigQueryStatistics.java
)

# https://stackoverflow.com/a/17841619/21269164
function join_by { local IFS="$1"; shift; echo "$*"; }

KEEP_ON_MASTER_FILTER=$(join_by '|' "${KEEP_ON_MASTER[@]}")
paths_to_move | grep -v -E "$KEEP_ON_MASTER_FILTER" > /tmp/gvs_paths.txt

# Build the command to be executed in *a clone of* the `gatk` repo prior to pushing to `variantstore`.

# Disable the ShellCheck warning about word splitting since that's a feature here. File names are individually quoted
# within the expression.
# shellcheck disable=SC2046

# The 'Useless cat' makes this more readable left to right.
# shellcheck disable=SC2002
echo git filter-repo $(cat /tmp/gvs_paths.txt | sed 's/^/'\''/' | sed 's/$/'\''/' | sed -E 's/^/ --path /' | tr -d '\n')
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.bigquery.*;
import org.broadinstitute.hellbender.utils.localsort.AvroSortingCollectionCodec;
import org.broadinstitute.hellbender.utils.localsort.SortingCollection;
import org.broadinstitute.hellbender.utils.gvs.bigquery.AvroFileReader;
import org.broadinstitute.hellbender.utils.gvs.localsort.AvroSortingCollectionCodec;
import org.broadinstitute.hellbender.utils.gvs.localsort.SortingCollection;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

Expand Down Expand Up @@ -1150,7 +1151,7 @@ void createVariantsFromSortedRanges(final SortedSet<Long> sampleIdsToExtract,
// TODO: use the genotypes of this specific sample (e.g. 0/1 vs 1/2) to decide how big the spanning deletion is. The current logic matches what we do on ingest though
// TODO: really, really, really think this through!!!
handlePotentialSpanningDeletion(vetRow, referenceCache);

lastPosition = variantLocation;
lastSample = variantSample;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.bigquery.AvroFileReader;
import org.broadinstitute.hellbender.utils.gvs.bigquery.AvroFileReader;
import org.broadinstitute.hellbender.utils.bigquery.StorageAPIAvroReader;
import org.broadinstitute.hellbender.utils.bigquery.TableReference;
import org.broadinstitute.hellbender.utils.localsort.SortingCollection;
import org.broadinstitute.hellbender.utils.gvs.localsort.SortingCollection;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

Expand Down Expand Up @@ -1116,7 +1116,7 @@ void createVariantsFromSortedRanges(final SortedSet<Long> sampleIdsToExtract,
// TODO: use the genotypes of this specific sample (e.g. 0/1 vs 1/2) to decide how big the spanning deletion is. The current logic matches what we do on ingest though
// TODO: really, really, really think this through!!!
handlePotentialSpanningDeletion(vetRow, referenceCache);

lastPosition = variantLocation;
lastSample = variantSample;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.bigquery.*;
import org.broadinstitute.hellbender.utils.localsort.AvroSortingCollection;
import org.broadinstitute.hellbender.utils.localsort.SortingCollection;
import org.broadinstitute.hellbender.utils.gvs.localsort.AvroSortingCollection;
import org.broadinstitute.hellbender.utils.gvs.localsort.SortingCollection;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.HomoSapiensConstants;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public final class RefCreator {

public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, String tableNumber, String sampleId) {
if (outputType != CommonCode.OutputType.BQ) return false;
return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, sampleId);
return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
}

public RefCreator(String sampleIdentifierForOutputFileName, String sampleId, String tableNumber, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final boolean dropAboveGqThreshold, final File outputDirectory, final CommonCode.OutputType outputType, final boolean writeReferenceRanges, final String projectId, final String datasetName) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,9 @@
package org.broadinstitute.hellbender.tools.gvs.ingest;

import com.google.protobuf.Descriptors;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.io.DatumWriter;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.bigquery.PendingBQWriter;
import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter;
import org.json.JSONObject;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.ExecutionException;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import org.broadinstitute.hellbender.utils.bigquery.BigQueryUtils;
import org.broadinstitute.hellbender.utils.bigquery.PendingBQWriter;
import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter;
import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter;
import org.json.JSONObject;

Expand All @@ -33,7 +33,7 @@ public class VetCreator {

public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, String tableNumber, String sampleId) {
if (outputType != CommonCode.OutputType.BQ) return false;
return BigQueryUtils.doRowsExistFor(projectId, datasetName, VET_FILETYPE_PREFIX + tableNumber, sampleId);
return BigQueryUtils.doRowsExistFor(projectId, datasetName, VET_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
}

public VetCreator(String sampleIdentifierForOutputFileName, String sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific, final boolean skipLoadingVqsrFields) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;

import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -451,17 +450,17 @@ public static StorageAPIAvroReaderAndBigQueryStatistics executeQueryWithStorageA
return new StorageAPIAvroReaderAndBigQueryStatistics(new StorageAPIAvroReader(tr), bigQueryResultAndStatistics.queryStatistics);
}

public static boolean doRowsExistFor(String projectID, String datasetName, String tableName, String sampleId) {
public static boolean doRowsExistFor(String projectID, String datasetName, String tableName, String columnName, String value) {
String template = "SELECT COUNT(*) FROM `%s.%s.%s` WHERE %s = %s";
String query = String.format(template, projectID, datasetName, tableName, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
String query = String.format(template, projectID, datasetName, tableName, columnName, value);

BigQueryResultAndStatistics resultAndStatistics = BigQueryUtils.executeQuery(projectID, query, true, null);
for (final FieldValueList row : resultAndStatistics.result.iterateAll()) {
final long count = row.get(0).getLongValue();
return count != 0;
}
throw new GATKException(String.format("No rows returned from count of `%s.%s.%s` for sample id %s",
projectID, datasetName, tableName, sampleId));
throw new GATKException(String.format("No rows returned from count of `%s.%s.%s` for %s = %s",
projectID, datasetName, tableName, columnName, value));
}

private static StatusRuntimeException extractCausalStatusRuntimeExceptionOrThrow(Throwable original, Throwable current) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.api.client.util.ExponentialBackOff;
import com.google.api.core.ApiFuture;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;

import com.google.cloud.bigquery.storage.v1beta2.*;
import org.apache.logging.log4j.LogManager;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.localsort;
package org.broadinstitute.hellbender.utils.gvs.localsort;

import org.apache.avro.generic.GenericRecord;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.localsort;
package org.broadinstitute.hellbender.utils.gvs.localsort;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.localsort;
package org.broadinstitute.hellbender.utils.gvs.localsort;

import htsjdk.samtools.Defaults;
import htsjdk.samtools.util.*;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import htsjdk.variant.vcf.*;

import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
import org.broadinstitute.hellbender.utils.Utils;

Expand Down Expand Up @@ -77,7 +76,7 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
addFilterLine(new VCFFilterHeaderLine(VCFConstants.PASSES_FILTERS_v4, "Site contains at least one allele that passes filters"));

addFilterLine(new VCFFilterHeaderLine(NAY_FROM_YNG, "Considered a NAY in the Yay, Nay, Grey table"));
addFilterLine(new VCFFilterHeaderLine(EXCESS_ALLELES,"Site has an excess of alternate alleles based on the input threshold (default is " + CommonCode.EXCESS_ALLELES_THRESHOLD + ")"));
addFilterLine(new VCFFilterHeaderLine(EXCESS_ALLELES,"Site has an excess of alternate alleles based on the input threshold"));
addFilterLine(new VCFFilterHeaderLine(NO_HQ_GENOTYPES, "Site has no high quality variant genotypes"));
addFilterLine(new VCFFilterHeaderLine(EXCESS_HET_KEY, "Site has excess het value larger than the threshold"));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@
@SuppressWarnings("ConstantConditions")
public class WeightedSplitIntervalsIntegrationTest extends CommandLineProgramTest {

private final String testDir = getToolTestDataDir();

@Test
public void testNoLossSimple() {
final File weights = new File(publicTestDir + "example_weights_chr20_chr21.bed.gz");
final File weights = new File(testDir + "example_weights_chr20_chr21.bed.gz");
final int scatterCount = 100;
final File outputDir = createTempDir("output");
final Interval interval = new Interval("chr20", 1000000, 2000000);
Expand Down Expand Up @@ -49,8 +51,8 @@ public void testNoLossSimple() {

@Test
public void testNoLossRealisticWgs() {
final File weights = new File(publicTestDir + "example_weights_chr20_chr21.bed.gz");
final File wgsIntervalList = new File(publicTestDir + "hg38.handcurated.noCentromeres.noTelomeres.chr20_chr21.interval_list");
final File weights = new File(testDir + "example_weights_chr20_chr21.bed.gz");
final File wgsIntervalList = new File(testDir + "hg38.handcurated.noCentromeres.noTelomeres.chr20_chr21.interval_list");

final int scatterCount = 100;
final File outputDir = createTempDir("output");
Expand Down Expand Up @@ -89,7 +91,7 @@ public void testHandleNotOneMoreBase() {
// target: sum 10200 / 10 -> 1020 per shard
// chr20 60000 60001 . 200
// chr20 60001 60011 . 10000
final File weights = new File(publicTestDir + "example_weights_chr20_test_zero.bed");
final File weights = new File(testDir + "example_weights_chr20_test_zero.bed");
final int scatterCount = 10;
final File outputDir = createTempDir("output");
final Interval inputInterval = new Interval("chr20",60000,60109);
Expand Down Expand Up @@ -119,7 +121,7 @@ public void testHandleNotOneMoreBase() {

@Test
public void testDontMixContigs() {
final File weights = new File(publicTestDir + "example_weights_chr20_chr21.bed.gz");
final File weights = new File(testDir + "example_weights_chr20_chr21.bed.gz");
final int scatterCount = 1;
final File outputDir = createTempDir("output");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import org.broadinstitute.hellbender.GATKBaseTest;

import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.utils.bigquery.AvroFileReader;
import org.broadinstitute.hellbender.utils.gvs.bigquery.AvroFileReader;
import org.testng.Assert;
import org.testng.annotations.Test;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
package org.broadinstitute.hellbender.tools.gvs.extract;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.broadinstitute.hellbender.GATKBaseTest;

import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.utils.bigquery.AvroFileReader;
import org.broadinstitute.hellbender.utils.gvs.bigquery.AvroFileReader;
import org.testng.Assert;
import org.testng.annotations.Test;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import org.broadinstitute.hellbender.GATKBaseTest;

import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.utils.bigquery.AvroFileReader;
import org.broadinstitute.hellbender.utils.gvs.bigquery.AvroFileReader;
import org.testng.Assert;
import org.testng.annotations.Test;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.hellbender.utils.bigquery;
package org.broadinstitute.hellbender.utils.gvs.bigquery;


import org.apache.avro.generic.GenericRecord;
Expand All @@ -8,14 +8,12 @@
import org.testng.annotations.Test;
import org.apache.avro.Schema;

import java.util.*;

/**
* A class to test the functionality of {@link AvroFileReader}.
*/
public class AvroFileReaderUnitTest extends GATKBaseTest {

private static final String avroFileName = "src/test/java/org/broadinstitute/hellbender/utils/bigquery/avro_test_avro_test_file.avro";
private static final String avroFileName = "src/test/resources/org/broadinstitute/hellbender/utils/gvs/bigquery/avro_test_avro_test_file.avro";
private static final AvroFileReader avroFile = new AvroFileReader(new GATKPath(avroFileName));

@Test()
Expand Down
Loading

0 comments on commit 3a7f6e2

Please sign in to comment.