broadinstitute · droazen · Aug 17, 2021 · Aug 12, 2021
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/ComposeSTRTableFile.java b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/ComposeSTRTableFile.java
@@ -6,6 +6,7 @@
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.argparser.Hidden;
+import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.engine.GATKPath;
 import org.broadinstitute.hellbender.engine.GATKTool;
@@ -20,45 +21,65 @@
 import java.util.stream.Collectors;
 
 /**
- * This tools look for STR sequences in the reference that will used later to estimate the Dragstr parameters values
- * using {@link CalibrateDragstrModel}.
+ * This tool looks for low-complexity STR sequences along the reference that are later used to estimate the Dragstr model
+ * during single-sample auto-calibration (see {@link CalibrateDragstrModel}).
  * <h3>Inputs</h3>
  * <p>
- *     This command takes as input the reference (possibly traversal intervals) and a {@link STRDecimationTable decimation table} herein
- *     referred as DT.
+ *     This command takes as input the reference (possibly a subset of intervals) and an optional {@link STRDecimationTable decimation table} (herein referred to as DT).
  * </p>
  * <p>
  *     The DT modulates how often we sample a site for each possible period and repeat length. Since there is far more
  *     positions with short period and short repeat length sampling for those combinations should be less frequent.
  *     For further details about the format of this table and interpretation of its values please check the documentation
  *     in class {@link STRDecimationTable}.
  * </p>
- * <h3>Output</h3>
- * <p>
- *     The output of this command is a zip file that contain the collection of sampled sites in binary form (all.bin),
- *     and index for that file for quick access by location interval (all.idx), a copy of the reference sequence dictionary
- *     (reference.dict), a copy of the DT (decimation.txt) and additional information and stats (e.g. summary.txt)
- * </p>
  * <p>
- *     The reference dictionary file may be used by commands downstream that need to verify that
- *     the reference that wa use to generate the sample sites matches the one that is provided by the user to that command.
- * </p>
+ *    If no DT is provided, the tool uses a default one that has been tailored to work well when run over
+ *    the entire human genome, and it should also be adequate for other genomes of comparable size (i.e. 1 to 10 Gbp).
+ *    With larger genomes, that default DT will likely result in an unnecessarily large number of sampled sites
+ *    that in turn may increase the run time of tools that depend on the output. In contrast,
+ *    with smaller genomes or subsets (using targeted intervals) it might result in a number of sampled sites
+ *    too small to build an accurate Dragstr model. In this case you really need to compose and provide
+ *    your own DT or perhaps try not decimating at all ({@code --decimation NONE}).
+ * </p> 
+ * <h3>Output</h3>
  * <p>
- *     The DT also provide downstream commands with the information as to how the resulting collection of sites was downSampled,
- *     in case further down-sampling is necessary.
+ *    The output of this command is a zip file that contains the collection of sampled sites in 
+ *    binary form ({@code all.bin}), and an index for that file for quick access by location interval
+ *    ({@code all.idx}). Other files in the zip provide some summary and tracking information, for example
+ *    the reference sequence dictionary ({@code reference.dict}), a copy of the DT ({@code decimation.txt}) 
+ *    and summarized stats ({@code summary.txt}).
  * </p>
  * <p>
- * <h3>Example</h3>
+ * <h3>Examples</h3>
+ * <pre>
+ *
+ *     # Human? just use the default.
+ *     gatk ComposeSTRTableFile -R hg19.fasta -O hg19.str.zip
+ *     # or ...
+ *     gatk ComposeSTRTableFile -R hg19.fasta --decimation DEFAULT -O hg19.str.zip
+ *
+ * </pre>
+ * <pre>
+ *
+ *     # The yeast genome is roughly 12Mbp long.
+ *     gatk ComposeSTRTableFile -R yeast.fasta --decimation custom-yeast.dt -O yeast.str.zip
+ *
+ * </pre>
  * <pre>
- *     gatk ComposeSTRTableFile -R ref.fasta -O ref.str
+ *
+ *     # Carsonella ruddii is just about 160Kbp, so we probably don't want to decimate at all:
+ *     gatk ComposeSTRTableFile -R Cruddii.fasta --decimation NONE -O Cruddii.str.zip
+ *
  * </pre>
  * </p>
  */
 @CommandLineProgramProperties(
         programGroup = ReferenceProgramGroup.class,
-        summary = "Determine the presence of STR in a reference sequence",
-        oneLineSummary = "Determines the presence of STR in a reference sequence"
+        oneLineSummary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration",
+        summary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration"
 )
+@DocumentedFeature
 public final class ComposeSTRTableFile extends GATKTool {
 
     public static final String REFERENCE_SEQUENCE_BUFFER_SIZE_FULL_NAME = "reference-sequence-buffer-size";
@@ -67,9 +88,11 @@ public final class ComposeSTRTableFile extends GATKTool {
     public static final int MAXIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000_000;
     public static final int DEFAULT_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000;
 
-    @Argument(fullName="decimation", doc="decimation per period and repeat. It can be \"DEFAULT\" to use the default values (default), " +
-            " \"NONE\" to deactivate decimation (potentially resulting in a very large output file) or indicate the path to a file" +
-            " that contains the decimation matrix.", optional = true)
+    @Argument(fullName = "decimation", 
+              doc = "decimation per period and repeat. It can be \"DEFAULT\" to use the default values (DEFAULT), " +
+                    " \"NONE\" to deactivate decimation (potentially resulting in a very large output file) " + 
+                    "or indicate the path to a file that contains the decimation matrix.", 
+              optional = true)
     private STRDecimationTable decimationTable = STRDecimationTable.DEFAULT;
 
     @Argument(doc = "name of the zip file where the sites sampled will be stored",

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java
@@ -27,6 +27,39 @@
  *     Thus if the DT entry for a period and repeat-length combination is 0, all sites are sampled, if it is 1
  *     every second site is discarded, if it 10, one every 1024 sites is sampled and the rest discarded.
  * </p>
+ * <p>
+ * <h3>Example (the default) table in a text file:</h3>
+ * <pre>
+ * ###
+ * # Human b38 decimation table (3Gbp)
+ * #
+ * 0
+ * 0 10 10  9  8  7  5  3  1  0
+ * 0  0  9  6  3  0
+ * 0  0  8  4  1  0
+ * 0  0  6  0
+ * 0  0  5  0
+ * 0  0  4  0
+ * 0  0  1  0
+ * 0
+ * </pre>
+ * </p>
+ * <p>
+ * For historical reasons the first row and column represent the absurd
+ * 0-period length and 0-repeat length STR. We recommend setting them to zero, although the values
+ * in these should have no effect.
+ * </p>
+ * <p>
+ * That is also true for the values for the 1-repeat and n-period, for n > 1, since in practice these are
+ * interpreted as 1-period n-repeat STR. We recommend keeping them as 0.
+ * </p>
+ * <p>
+ * Missing entries (bottom-right portion of the matrix) are assumed to be 0.
+ * Therefore the 0 entries at the end of each row, including the lonely zero
+ * in the last one, could be omitted. However, adding that extra 0 gives some reassurance that the row is not truncated
+ * by mistake. Since decimation is unnecessary (even counterproductive) for longer periods and larger
+ * repeat counts, the resulting matrix is an upper-left triangular one.
+ * </p>
  */
 public class STRDecimationTable {