From a89690e6cc00ca775e28de4327807f09a155943d Mon Sep 17 00:00:00 2001 From: vruano Date: Thu, 12 Aug 2021 17:54:30 -0400 Subject: [PATCH] Fix issues with missing documentation for ComposeSTRTableFile. Improved the documentation text for ComposeSTRTableFile and related STRDecimationTable. --- .../tools/dragstr/ComposeSTRTableFile.java | 67 +++++++++++++------ .../tools/dragstr/STRDecimationTable.java | 33 +++++++++ 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/ComposeSTRTableFile.java b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/ComposeSTRTableFile.java index 55f85863fdd..410d7f76901 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/ComposeSTRTableFile.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/ComposeSTRTableFile.java @@ -6,6 +6,7 @@ import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.argparser.Hidden; +import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.engine.GATKTool; @@ -20,12 +21,11 @@ import java.util.stream.Collectors; /** - * This tools look for STR sequences in the reference that will used later to estimate the Dragstr parameters values - * using {@link CalibrateDragstrModel}. + * This tool looks for low-complexity STR sequences along the reference that are later used to estimate the Dragstr model + * during single sample auto calibration {@link CalibrateDragstrModel}. *

Inputs

*

- * This command takes as input the reference (possibly traversal intervals) and a {@link STRDecimationTable decimation table} herein - * referred as DT. + * This command takes as input the reference (possibly a subset of intervals) and an optional {@link STRDecimationTable decimation table} (herein referred to as DT).

*

* The DT modulates how often we sample a site for each possible period and repeat length. Since there is far more @@ -33,32 +33,53 @@ * For further details about the format of this table and interpretation of its values please check the documentation * in class {@link STRDecimationTable}. *

- *

Output

- *

- * The output of this command is a zip file that contain the collection of sampled sites in binary form (all.bin), - * and index for that file for quick access by location interval (all.idx), a copy of the reference sequence dictionary - * (reference.dict), a copy of the DT (decimation.txt) and additional information and stats (e.g. summary.txt) - *

*

- * The reference dictionary file may be used by commands downstream that need to verify that - * the reference that wa use to generate the sample sites matches the one that is provided by the user to that command. - *

+ * If no DT is provided, the tool uses a default one that has been tailored to work fine when run over + * the entire Human genome and it should be alright with other genomes of comparable size (i.e. 1 to 10Gbp). + * With larger genomes, that default DT will likely result in an unnecessarily large number of sampled sites + * that in turn may increase the run time of tools that depend on the output. In contrast, + * with smaller genomes or subsets (using targeted intervals) it might result in a number of sampled sites + * too small to build an accurate Dragstr model. In this case you really need to compose and provide + * your own DT or perhaps try out not to decimate at all ({@code --decimation NONE}). + *

+ *

Output

*

- * The DT also provide downstream commands with the information as to how the resulting collection of sites was downSampled, - * in case further down-sampling is necessary. + * The output of this command is a zip file that contains the collection of sampled sites in + * binary form ({@code all.bin}), and an index for that file for quick access by location interval + * ({@code all.idx}). Other files in the zip provide some summary and tracking information, for example + * the reference sequence dictionary ({@code reference.dict}), a copy of the DT ({@code decimation.txt}) + * and summarized stats ({@code summary.txt}). *

*

- *

Example

+ *

Examples

+ *
+
+ *     # Human? just use the default.
+ *     gatk ComposeSTRTableFile -R hg19.fasta -O hg19.str.zip
+ *     # or ...
+ *     gatk ComposeSTRTableFile -R hg19.fasta --decimation DEFAULT -O hg19.str.zip
+ *
+ * 
+ *
+ *
+ *     # yeast genome is roughly ~ 12Mbp long.
+ *     gatk ComposeSTRTableFile -R yeast.fasta --decimation custom-yeast.dt -O yeast.str.zip
+ *
+ * 
*
- *     gatk ComposeSTRTableFile -R ref.fasta -O ref.str
+ *
+ *     #  Carsonella ruddii is just about 160Kbp long, probably we don't want to decimate at all:
+ *     gatk ComposeSTRTableFile -R Cruddii.fasta --decimation NONE -O Cruddii.str.zip
+ *
  * 
*

*/ @CommandLineProgramProperties( programGroup = ReferenceProgramGroup.class, - summary = "Determine the presence of STR in a reference sequence", - oneLineSummary = "Determines the presence of STR in a reference sequence" + oneLineSummary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration", + summary = "Composes a genome-wide STR location table used for DragSTR model auto-calibration" ) +@DocumentedFeature public final class ComposeSTRTableFile extends GATKTool { public static final String REFERENCE_SEQUENCE_BUFFER_SIZE_FULL_NAME = "reference-sequence-buffer-size"; @@ -67,9 +88,11 @@ public final class ComposeSTRTableFile extends GATKTool { public static final int MAXIMUM_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000_000; public static final int DEFAULT_REFERENCE_SEQUENCE_BUFFER_SIZE = 100_000; - @Argument(fullName="decimation", doc="decimation per period and repeat. It can be \"DEFAULT\" to use the default values (default), " + - " \"NONE\" to deactivate decimation (potentially resulting in a very large output file) or indicate the path to a file" + - " that contains the decimation matrix.", optional = true) + @Argument(fullName = "decimation", + doc = "decimation per period and repeat. 
It can be \"DEFAULT\" to use the default values (DEFAULT), " + + " \"NONE\" to deactivate decimation (potentially resulting in a very large output file) " + + "or indicate the path to a file that contains the decimation matrix.", + optional = true) private STRDecimationTable decimationTable = STRDecimationTable.DEFAULT; @Argument(doc = "name of the zip file where the sites sampled will be stored", diff --git a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java index f9efe0b83d3..ec68533bb97 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/STRDecimationTable.java @@ -27,6 +27,39 @@ * Thus if the DT entry for a period and repeat-length combination is 0, all sites are sampled, if it is 1 * every second site is discarded, if it 10, one every 1024 sites is sampled and the rest discarded. *

+ *

+ *

Example (the default) table in a text file:

+ *
+ * ###
+ * # Human b38 decimation table (3Gbp)
+ * #
+ * 0
+ * 0 10 10  9  8  7  5  3  1  0
+ * 0  0  9  6  3  0
+ * 0  0  8  4  1  0
+ * 0  0  6  0
+ * 0  0  5  0
+ * 0  0  4  0
+ * 0  0  1  0
+ * 0
+ * 
+ *

+ *

+ * For historical reasons the first row and column represent the absurd + * 0-period length and 0-repeat length STR. We recommend setting them to zero although the values + * in these should have no effect. + *

+ *

+ * That is also true for values for the 1-repeat and n-period, for n > 1, since in practice these are + * interpreted as 1-period n-repeat STR. We recommend keeping them as 0. + *

+ *

+ * Missing entries (bottom-right portion of the matrix) are assumed to be 0. + * Therefore the 0 entries at the end of each row including the lonely zero + * in the last one could be omitted. However adding that extra 0 gives some reassurance that the row is not truncated + * by mistake. Since decimation is unnecessary (even counterproductive) for longer period and larger + * repeat count the resulting matrix is an upper-left triangular one. + *

*/ public class STRDecimationTable {