[CANNOLI-182] Add bcftools call command. #184

Merged
merged 1 commit on Aug 26, 2019
91 changes: 91 additions & 0 deletions cli/src/main/scala/org/bdgenomics/cannoli/cli/BcftoolsCall.scala
@@ -0,0 +1,91 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.cannoli.cli

import grizzled.slf4j.Logging
import htsjdk.samtools.ValidationStringency
import org.apache.spark.SparkContext
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs
import org.bdgenomics.adam.util.FileExtensions._
import org.bdgenomics.cannoli.{
BcftoolsCall => BcftoolsCallFn,
BcftoolsCallArgs => BcftoolsCallFnArgs
}
import org.bdgenomics.utils.cli._
import org.kohsuke.args4j.{ Argument, Option => Args4jOption }

object BcftoolsCall extends BDGCommandCompanion {
val commandName = "bcftoolsCall"
val commandDescription = "ADAM Pipe API wrapper for bcftools call."

def apply(cmdLine: Array[String]) = {
new BcftoolsCall(Args4j[BcftoolsCallArgs](cmdLine))
}
}

/**
* Bcftools call command line arguments.
*/
class BcftoolsCallArgs extends BcftoolsCallFnArgs with ADAMSaveAnyArgs with ParquetArgs {
@Argument(required = true, metaVar = "INPUT", usage = "Location to pipe variant contexts from (e.g. .vcf, .vcf.gz, .vcf.bgz). If extension is not detected, Parquet is assumed.", index = 0)
var inputPath: String = null

@Argument(required = true, metaVar = "OUTPUT", usage = "Location to pipe variant contexts to (e.g. .vcf, .vcf.gz, .vcf.bgz). If extension is not detected, Parquet is assumed.", index = 1)
var outputPath: String = null

@Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file.")
var asSingleFile: Boolean = false

@Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output.")
var deferMerging: Boolean = false

@Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.")
var disableFastConcat: Boolean = false

@Args4jOption(required = false, name = "-stringency", usage = "Stringency level for various checks; can be SILENT, LENIENT, or STRICT. Defaults to STRICT.")
var stringency: String = "STRICT"

// must be defined due to ADAMSaveAnyArgs, but unused here
var sortFastqOutput: Boolean = false
}

/**
* Bcftools call command line wrapper.
*/
class BcftoolsCall(protected val args: BcftoolsCallArgs) extends BDGSparkCommand[BcftoolsCallArgs] with Logging {
val companion = BcftoolsCall
val stringency: ValidationStringency = ValidationStringency.valueOf(args.stringency)

def run(sc: SparkContext) {
val variantContexts = sc.loadVariantContexts(args.inputPath)
val pipedVariantContexts = new BcftoolsCallFn(args, stringency, sc).apply(variantContexts)

if (isVcfExt(args.outputPath)) {
pipedVariantContexts.saveAsVcf(
args.outputPath,
asSingleFile = args.asSingleFile,
deferMerging = args.deferMerging,
disableFastConcat = args.disableFastConcat,
stringency
)
} else {
pipedVariantContexts.saveAsParquet(args)
}
}
}
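
Outside of this diff, the new subcommand would normally be launched through the cannoli-submit script, but it can also be driven programmatically. The following is a minimal sketch, assuming an existing SparkContext named sc (as in cannoli-shell); the input and output paths are hypothetical placeholders:

    import org.bdgenomics.cannoli.cli.BcftoolsCall

    // Hypothetical invocation of the CLI companion; paths are placeholders.
    val cmd = BcftoolsCall(Array(
      "sample.mpileup.vcf",   // INPUT: a .vcf/.vcf.gz/.vcf.bgz extension selects VCF I/O, otherwise Parquet
      "sample.called.vcf",    // OUTPUT
      "-single"               // merge sharded output into a single VCF file
    ))
    cmd.run(sc)               // sc: an existing SparkContext, e.g. from cannoli-shell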
3 changes: 2 additions & 1 deletion cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala
@@ -28,7 +28,8 @@ import org.bdgenomics.utils.cli._

object Cannoli {

val defaultCommandGroups = List(CommandGroup("CANNOLI", List(BcftoolsMpileup,
val defaultCommandGroups = List(CommandGroup("CANNOLI", List(BcftoolsCall,
BcftoolsMpileup,
BcftoolsNorm,
BedtoolsIntersect,
Bowtie,
97 changes: 97 additions & 0 deletions core/src/main/scala/org/bdgenomics/cannoli/BcftoolsCall.scala
@@ -0,0 +1,97 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.cannoli

import htsjdk.samtools.ValidationStringency
import org.apache.spark.SparkContext
import org.bdgenomics.adam.models.VariantContext
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.variant.{ VariantContextDataset, VCFInFormatter, VCFOutFormatter }
import org.bdgenomics.adam.sql.{ VariantContext => VariantContextProduct }
import org.bdgenomics.cannoli.builder.CommandBuilders
import org.bdgenomics.utils.cli._
import org.kohsuke.args4j.{ Option => Args4jOption }
import scala.collection.JavaConversions._

/**
* Bcftools call function arguments.
*/
class BcftoolsCallArgs extends Args4jBase {
@Args4jOption(required = false, name = "-executable", usage = "Path to the bcftools executable. Defaults to bcftools.")
var executable: String = "bcftools"

@Args4jOption(required = false, name = "-docker_image", usage = "Container image to use. Defaults to quay.io/biocontainers/bcftools:1.9--ha228f0b_3.")
var image: String = "quay.io/biocontainers/bcftools:1.9--ha228f0b_3"

@Args4jOption(required = false, name = "-sudo", usage = "Run via sudo.")
var sudo: Boolean = false

@Args4jOption(required = false, name = "-add_files", usage = "If true, use the SparkFiles mechanism to distribute files to executors.")
var addFiles: Boolean = false

@Args4jOption(required = false, name = "-use_docker", usage = "If true, uses Docker to launch bcftools.")
var useDocker: Boolean = false

@Args4jOption(required = false, name = "-use_singularity", usage = "If true, uses Singularity to launch bcftools.")
var useSingularity: Boolean = false

@Args4jOption(required = false, name = "-bcftools_args", usage = "Additional arguments for Bcftools, must be double-quoted, e.g. -bcftools_args \"--gcvf 5,15\"")
var bcftoolsArgs: String = null
}

/**
* Bcftools call wrapper as a function VariantContextDataset → VariantContextDataset,
* for use in cannoli-shell or notebooks.
*
* @param args Bcftools call function arguments.
* @param stringency Validation stringency. Defaults to ValidationStringency.LENIENT.
* @param sc Spark context.
*/
class BcftoolsCall(
val args: BcftoolsCallArgs,
val stringency: ValidationStringency = ValidationStringency.LENIENT,
sc: SparkContext) extends CannoliFn[VariantContextDataset, VariantContextDataset](sc) {

override def apply(variants: VariantContextDataset): VariantContextDataset = {

val builder = CommandBuilders.create(args.useDocker, args.useSingularity)
.setExecutable(args.executable)
.add("call")
.add("--output-type")
.add("v")

Option(args.bcftoolsArgs).foreach(builder.add(_))

if (args.useDocker || args.useSingularity) {
builder
.setImage(args.image)
.setSudo(args.sudo)
}

info("Piping %s to bcftools with command: %s files: %s".format(
variants, builder.build(), builder.getFiles()))

implicit val tFormatter = VCFInFormatter
implicit val uFormatter = new VCFOutFormatter(sc.hadoopConfiguration, stringency)

variants.pipe[VariantContext, VariantContextProduct, VariantContextDataset, VCFInFormatter](
cmd = builder.build(),
files = builder.getFiles()
)
}
}
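
The function form above can also be applied directly to a loaded VariantContextDataset. Below is a minimal sketch for a cannoli-shell session, assuming sc is the ambient SparkContext; the paths and the choice of Docker are illustrative assumptions, not part of this diff:

    import htsjdk.samtools.ValidationStringency
    import org.bdgenomics.adam.rdd.ADAMContext._
    import org.bdgenomics.cannoli.{ BcftoolsCall, BcftoolsCallArgs }

    // Hypothetical session; paths are placeholders.
    val args = new BcftoolsCallArgs()
    args.useDocker = true    // run bcftools in the default container image instead of a local install

    val variants = sc.loadVariantContexts("sample.mpileup.vcf")
    val called = new BcftoolsCall(args, ValidationStringency.LENIENT, sc).apply(variants)
    called.saveAsVcf("sample.called.vcf",
      asSingleFile = true,
      deferMerging = false,
      disableFastConcat = false,
      ValidationStringency.LENIENT)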
13 changes: 13 additions & 0 deletions core/src/main/scala/org/bdgenomics/cannoli/Cannoli.scala
@@ -173,6 +173,19 @@ object Cannoli {
new Vep(args, stringency, vcs.rdd.context).apply(vcs)
}

/**
* Call variant contexts in this VariantContextDataset with bcftools call via Cannoli.
*
* @param args Bcftools call function arguments.
* @param stringency Validation stringency. Defaults to ValidationStringency.LENIENT.
* @return VariantContextDataset.
*/
def callWithBcftools(
args: BcftoolsCallArgs,
stringency: ValidationStringency = ValidationStringency.LENIENT): VariantContextDataset = {
new BcftoolsCall(args, stringency, vcs.rdd.context).apply(vcs)
}

/**
* Normalize the variant contexts in this VariantContextDataset with bcftools norm via Cannoli.
*
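
Assuming the implicit enrichment in org.bdgenomics.cannoli.Cannoli brings callWithBcftools into scope on VariantContextDataset (as the neighbouring methods in the same object suggest), usage from cannoli-shell could look like the sketch below; the input path is a placeholder:

    import org.bdgenomics.adam.rdd.ADAMContext._
    import org.bdgenomics.cannoli.Cannoli._
    import org.bdgenomics.cannoli.BcftoolsCallArgs

    // Hypothetical usage via the implicit conversion imported from Cannoli._
    val args = new BcftoolsCallArgs()
    val called = sc.loadVariantContexts("sample.mpileup.vcf").callWithBcftools(args)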