From a89690e6cc00ca775e28de4327807f09a155943d Mon Sep 17 00:00:00 2001
From: vruano
- * This command takes as input the reference (possibly traversal intervals) and a {@link STRDecimationTable decimation table} herein
- * referred as DT.
+ * This command takes as input the reference (possibly a subset of intervals) and an optional {@link STRDecimationTable decimation table} (herein referred to as DT).
*
* The DT modulates how often we sample a site for each possible period and repeat length. Since there is far more
@@ -33,32 +33,53 @@
* For further details about the format of this table and interpretation of its values please check the documentation
* in class {@link STRDecimationTable}.
*
- * The output of this command is a zip file that contain the collection of sampled sites in binary form (all.bin),
- * and index for that file for quick access by location interval (all.idx), a copy of the reference sequence dictionary
- * (reference.dict), a copy of the DT (decimation.txt) and additional information and stats (e.g. summary.txt)
- *
- * The reference dictionary file may be used by commands downstream that need to verify that
- * the reference that wa use to generate the sample sites matches the one that is provided by the user to that command.
- * Inputs
* Output
- *
- * The DT also provide downstream commands with the information as to how the resulting collection of sites was downSampled, - * in case further down-sampling is necessary. + * The output of this command is a zip file that contains the collection of sampled sites in + * binary form ({@code all.bin}), an index for that file for quick access by location interval + * ({@code all.idx}). Other files in the zip provide some summary and tracking information, for example + * the reference sequence dictionary ({@code reference.dict}), a copy of the DT ({@code decimation.txt}) + * and summarized stats ({@code summary.txt}). *
*- *
+ + * # Human? just use the default. + * gatk ComposeSTRTableFile -R hg19.fasta -O hg19.str.zip + * # or ... + * gatk ComposeSTRTableFile -R hg19.fasta --decimation DEFAULT -O hg19.str.zip + * + *+ *
+ * + * # The yeast genome is roughly 12Mbp long. + * gatk ComposeSTRTableFile -R yeast.fasta --decimation custom-yeast.dt -O yeast.str.zip + * + **
- * gatk ComposeSTRTableFile -R ref.fasta -O ref.str + * + * # Carsonella ruddii is just about 160Kbp long; probably we don't want to decimate at all: + * gatk ComposeSTRTableFile -R Cruddii.fasta --decimation NONE -O Cruddii.str.zip + * ** */ @CommandLineProgramProperties( programGroup = ReferenceProgramGroup.class, - summary = "Determine the presence of STR in a reference sequence", - oneLineSummary = "Determines the presence of STR in a reference sequence" + oneLineSummary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration", + summary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration" ) +@DocumentedFeature public final class ComposeSTRTableFile extends GATKTool { public static final String REFERENCE_SEQUENCE_BUFFER_SIZE_FULL_NAME = "reference-sequence-buffer-size"; @@ -67,9 +88,11 @@ public final class ComposeSTRTableFile extends GATKTool { public static final int MAXIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000_000; public static final int DEFAULT_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000; - @Argument(fullName="decimation", doc="decimation per period and repeat. It can be \"DEFAULT\" to use the default values (default), " + - " \"NONE\" to deactivate decimation (potentially resulting in a very large output file) or indicate the path to a file" + - " that contains the decimation matrix.", optional = true) + @Argument(fullName = "decimation", + doc = "decimation per period and repeat. 
It can be \"DEFAULT\" to use the default values (DEFAULT), " + + " \"NONE\" to deactivate decimation (potentially resulting in a very large output file) " + + "or indicate the path to a file that contains the decimation matrix.", + optional = true) private STRDecimationTable decimationTable = STRDecimationTable.DEFAULT; @Argument(doc = "name of the zip file where the sites sampled will be stored", diff --git a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java index f9efe0b83d3..ec68533bb97 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java @@ -27,6 +27,39 @@ * Thus if the DT entry for a period and repeat-length combination is 0, all sites are sampled, if it is 1 * every second site is discarded, if it 10, one every 1024 sites is sampled and the rest discarded. * + *
+ *
+ * ### + * # Human b38 decimation table (3Gbp) + * # + * 0 + * 0 10 10 9 8 7 5 3 1 0 + * 0 0 9 6 3 0 + * 0 0 8 4 1 0 + * 0 0 6 0 + * 0 0 5 0 + * 0 0 4 0 + * 0 0 1 0 + * 0 + *+ * + *
+ * For historical reasons the first row and column represent the absurd + * 0-period length and 0-repeat length STR. We recommend setting them to zero, although the values + * in these should have no effect. + *
+ *+ * That is also true for the values of the 1-repeat, n-period entries, for n > 1, since in practice these are + * interpreted as 1-period n-repeat STR. We recommend keeping them as 0. + *
+ *+ * Missing entries (lower-right portion of the matrix) are assumed to be 0. + * Therefore the 0 entries at the end of each row, including the lonely zero + * in the last one, could be omitted. However, adding that extra 0 gives some reassurance that the row is not truncated + * by mistake. Since decimation is unnecessary (even counterproductive) for longer periods and larger + * repeat counts, the resulting matrix is an upper-left triangular one. + *
*/ public class STRDecimationTable {