-
Notifications
You must be signed in to change notification settings - Fork 598
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Vs 822 Add documentation for the work that we did on the latest itera…
…tion of Delta (#8205) * Lees name * add vds validation script written by Tim * fix rd tim typo * make sure temp dir is set and not default for validate() * swap to consistent kebab case Co-authored-by: Miguel Covarrubias <[email protected]> * clean up validation * put init in the right place * add proper example to notes * update code formatting --------- Co-authored-by: Miguel Covarrubias <[email protected]>
- Loading branch information
1 parent
b09d909
commit 3e5a756
Showing
2 changed files
with
117 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Creating a VDS for AoU | ||
|
||
**NOTE:** This doc is very much a work in process and is serving as a way to keep track of Hail commands that were used for Delta after the initial VDS delivery. | ||
This example walks through the procedure we followed for creating a VDS. | ||
|
||
## Configuring the Terra notebook cluster | ||
|
||
Please follow the instructions | ||
in [AoU Delta VDS Cluster Configuration](cluster/AoU%20Delta%20VDS%20Cluster%20Configuration.md) to set up a Delta-scale | ||
cluster. | ||
|
||
|
||
## Now we made a VDS for Delta, and then needed to run the following to correct the GQ0s and then re-deliver it | ||
|
||
``` | ||
pip install --force-reinstall hail==0.2.109 | ||
python3 | ||
>>> import hail as hl | ||
>>> vds = hl.vds.read_vds('gs://fc-secure-fb908548-fe3c-41d6-adaf-7ac20d541375/vds/01-04-2023-correct-GT/aou_srwgs_short_variants_v7_without_ext_aian_prod.vds') | ||
``` | ||
|
||
## Turn the GQ0s into no calls | ||
|
||
```python | ||
>>> rd = vds.reference_data | ||
>>> rd = rd.filter_entries(rd.GQ > 0) | ||
>>> rd = rd.filter_rows(hl.agg.count() > 0) # remove sites with no block starts | ||
>>> rd = rd.annotate_globals(ref_block_max_length=1000) # set the max length | ||
>>> vds2 = hl.vds.VariantDataset(reference_data=rd, variant_data=vds.variant_data) | ||
``` | ||
|
||
## Put it back in the cloud | ||
|
||
```python | ||
>>> final_path = 'gs://prod-drc-broad/aou-wgs-delta/vds/2023-02-13-GQ0/aou_srwgs_short_variants_v7_without_ext_aian_prod_gq0.vds' | ||
>>> vds2.write(final_path, overwrite=True) | ||
``` | ||
|
||
## Validate it to ensure that it is ready to be shared | ||
|
||
Copy the [VDS Validation python script](vds_validation.py) to the notebook environment | ||
Run it with the following arguments: | ||
|
||
`--vds-path`: the GCS path to the newly-created VDS | ||
|
||
`--temp-path`: a GCS path for temp files to live | ||
|
||
|
||
An example run (on the Anvil 3k) is illustrated below | ||
|
||
``` | ||
python3 validation_script.py --vds-path 'gs://fc-3ecd704c-4d2c-4360-bae1-093e214abce2/vds/vds_GQ0_max_ref_1k' --temp-path 'gs://fc-62891290-7fa3-434d-a39e-c64eeee4db8d/temp' | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import argparse | ||
import hail as hl | ||
|
||
### | ||
# VDS validation: | ||
# check that the reference and variant matrix tables contain the same samples | ||
# check that the reference blocks: | ||
# * do not have any GQ0s (they should be no calls instead) | ||
# * are valid in that they have an END after START | ||
# * are not longer than 1000 bases (this should be true for the ref tables in BQ) | ||
# spot check a small subset of the VDS ## TODO it might be helpful to print rows from this subset | ||
# run the classic Hail method vds.validate() | ||
### | ||
|
||
|
||
|
||
|
||
def check_samples_match(vds): | ||
print('checking sample equivalence between reference and variant MTs') | ||
assert vds.reference_data.cols().select().collect() == vds.variant_data.cols().select().collect() | ||
|
||
def check_ref_blocks(vds): | ||
print('checking that:\n * no reference blocks have GQ=0\n * all ref blocks have END after start\n * all ref blocks are max 1000 bases long') | ||
rd = vds.reference_data | ||
rd = rd.annotate_rows(locus_start = rd.locus.position) | ||
|
||
LEN = rd.END - rd.locus_start + 1 | ||
|
||
assert rd.aggregate_entries(hl.agg.all(hl.all(rd.GQ > 0, LEN >= 0, LEN <= rd.ref_block_max_length))) | ||
|
||
def check_densify_small_region(vds): | ||
print('running densify on 200kb region') | ||
from time import time | ||
t1 = time() | ||
|
||
filt = hl.vds.filter_intervals(vds, [hl.parse_locus_interval('chr16:29.5M-29.7M', reference_genome='GRCh38')]) | ||
n=hl.vds.to_dense_mt(filt).select_entries('LGT')._force_count_rows() | ||
|
||
print(f'took {time() - t1:.1f}s to densify {n} rows after interval query') | ||
|
||
|
||
|
||
def main(vds): | ||
check_samples_match(vds) | ||
check_ref_blocks(vds) | ||
check_densify_small_region(vds) | ||
vds.validate(); print('Hail VDS validation successful') | ||
|
||
|
||
|
||
if __name__ == '__main__': | ||
parser = argparse.ArgumentParser(allow_abbrev=False, description='Create VAT inputs TSV') | ||
parser.add_argument('--vds-path', type=str, help='Input VDS Path', default="@VDS_INPUT_PATH@") | ||
parser.add_argument('--temp-path', type=str, help='Path to temporary directory', default="@TEMP_DIR@", | ||
required=True) | ||
|
||
args = parser.parse_args() | ||
|
||
hl.init(tmp_dir=f'{args.temp_path}/hail_tmp_general') | ||
|
||
vds = hl.vds.read_vds(args.vds_path) | ||
|
||
main(vds) |