diff --git a/cli/src/main/scala/org/bdgenomics/cannoli/cli/BcftoolsCall.scala b/cli/src/main/scala/org/bdgenomics/cannoli/cli/BcftoolsCall.scala
new file mode 100644
index 0000000..06f01ee
--- /dev/null
+++ b/cli/src/main/scala/org/bdgenomics/cannoli/cli/BcftoolsCall.scala
@@ -0,0 +1,91 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.cannoli.cli
+
+import grizzled.slf4j.Logging
+import htsjdk.samtools.ValidationStringency
+import org.apache.spark.SparkContext
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs
+import org.bdgenomics.adam.util.FileExtensions._
+import org.bdgenomics.cannoli.{
+  BcftoolsCall => BcftoolsCallFn,
+  BcftoolsCallArgs => BcftoolsCallFnArgs
+}
+import org.bdgenomics.utils.cli._
+import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
+
+object BcftoolsCall extends BDGCommandCompanion {
+  val commandName = "bcftoolsCall"
+  val commandDescription = "ADAM Pipe API wrapper for bcftools call."
+
+  def apply(cmdLine: Array[String]) = {
+    new BcftoolsCall(Args4j[BcftoolsCallArgs](cmdLine))
+  }
+}
+
+/**
+ * Bcftools call command line arguments.
+ */
+class BcftoolsCallArgs extends BcftoolsCallFnArgs with ADAMSaveAnyArgs with ParquetArgs {
+  @Argument(required = true, metaVar = "INPUT", usage = "Location to pipe variant contexts from (e.g. .vcf, .vcf.gz, .vcf.bgz). If extension is not detected, Parquet is assumed.", index = 0)
+  var inputPath: String = null
+
+  @Argument(required = true, metaVar = "OUTPUT", usage = "Location to pipe variant contexts to (e.g. .vcf, .vcf.gz, .vcf.bgz). If extension is not detected, Parquet is assumed.", index = 1)
+  var outputPath: String = null
+
+  @Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file.")
+  var asSingleFile: Boolean = false
+
+  @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output.")
+  var deferMerging: Boolean = false
+
+  @Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.")
+  var disableFastConcat: Boolean = false
+
+  @Args4jOption(required = false, name = "-stringency", usage = "Stringency level for various checks; can be SILENT, LENIENT, or STRICT. Defaults to STRICT.")
+  var stringency: String = "STRICT"
+
+  // must be defined due to ADAMSaveAnyArgs, but unused here
+  var sortFastqOutput: Boolean = false
+}
+
+/**
+ * Bcftools call command line wrapper.
+ */
+class BcftoolsCall(protected val args: BcftoolsCallArgs) extends BDGSparkCommand[BcftoolsCallArgs] with Logging {
+  val companion = BcftoolsCall
+  val stringency: ValidationStringency = ValidationStringency.valueOf(args.stringency)
+
+  def run(sc: SparkContext) {
+    val variantContexts = sc.loadVariantContexts(args.inputPath)
+    val pipedVariantContexts = new BcftoolsCallFn(args, stringency, sc).apply(variantContexts)
+
+    if (isVcfExt(args.outputPath)) {
+      pipedVariantContexts.saveAsVcf(
+        args.outputPath,
+        asSingleFile = args.asSingleFile,
+        deferMerging = args.deferMerging,
+        disableFastConcat = args.disableFastConcat,
+        stringency
+      )
+    } else {
+      pipedVariantContexts.saveAsParquet(args)
+    }
+  }
+}
diff --git a/cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala b/cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala
index 23e53e4..bd3f992 100644
--- a/cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala
+++ b/cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala
@@ -28,7 +28,8 @@ import org.bdgenomics.utils.cli._
 
 object Cannoli {
 
-  val defaultCommandGroups = List(CommandGroup("CANNOLI", List(BcftoolsMpileup,
+  val defaultCommandGroups = List(CommandGroup("CANNOLI", List(BcftoolsCall,
+    BcftoolsMpileup,
     BcftoolsNorm,
     BedtoolsIntersect,
     Bowtie,
diff --git a/core/src/main/scala/org/bdgenomics/cannoli/BcftoolsCall.scala b/core/src/main/scala/org/bdgenomics/cannoli/BcftoolsCall.scala
new file mode 100644
index 0000000..b367982
--- /dev/null
+++ b/core/src/main/scala/org/bdgenomics/cannoli/BcftoolsCall.scala
@@ -0,0 +1,97 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.cannoli
+
+import htsjdk.samtools.ValidationStringency
+import org.apache.spark.SparkContext
+import org.bdgenomics.adam.models.VariantContext
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.variant.{ VariantContextDataset, VCFInFormatter, VCFOutFormatter }
+import org.bdgenomics.adam.sql.{ VariantContext => VariantContextProduct }
+import org.bdgenomics.cannoli.builder.CommandBuilders
+import org.bdgenomics.utils.cli._
+import org.kohsuke.args4j.{ Option => Args4jOption }
+import scala.collection.JavaConversions._
+
+/**
+ * Bcftools call function arguments.
+ */
+class BcftoolsCallArgs extends Args4jBase {
+  @Args4jOption(required = false, name = "-executable", usage = "Path to the bcftools executable. Defaults to bcftools.")
+  var executable: String = "bcftools"
+
+  @Args4jOption(required = false, name = "-docker_image", usage = "Container image to use. Defaults to quay.io/biocontainers/bcftools:1.9--ha228f0b_3.")
+  var image: String = "quay.io/biocontainers/bcftools:1.9--ha228f0b_3"
+
+  @Args4jOption(required = false, name = "-sudo", usage = "Run via sudo.")
+  var sudo: Boolean = false
+
+  @Args4jOption(required = false, name = "-add_files", usage = "If true, use the SparkFiles mechanism to distribute files to executors.")
+  var addFiles: Boolean = false
+
+  @Args4jOption(required = false, name = "-use_docker", usage = "If true, uses Docker to launch bcftools.")
+  var useDocker: Boolean = false
+
+  @Args4jOption(required = false, name = "-use_singularity", usage = "If true, uses Singularity to launch bcftools.")
+  var useSingularity: Boolean = false
+
+  @Args4jOption(required = false, name = "-bcftools_args", usage = "Additional arguments for Bcftools, must be double-quoted, e.g. -bcftools_args \"--gvcf 5,15\"")
+  var bcftoolsArgs: String = null
+}
+
+/**
+ * Bcftools call wrapper as a function VariantContextDataset → VariantContextDataset,
+ * for use in cannoli-shell or notebooks.
+ *
+ * @param args Bcftools call function arguments.
+ * @param stringency Validation stringency. Defaults to ValidationStringency.LENIENT.
+ * @param sc Spark context.
+ */
+class BcftoolsCall(
+    val args: BcftoolsCallArgs,
+    val stringency: ValidationStringency = ValidationStringency.LENIENT,
+    sc: SparkContext) extends CannoliFn[VariantContextDataset, VariantContextDataset](sc) {
+
+  override def apply(variants: VariantContextDataset): VariantContextDataset = {
+
+    val builder = CommandBuilders.create(args.useDocker, args.useSingularity)
+      .setExecutable(args.executable)
+      .add("call")
+      .add("--output-type")
+      .add("v")
+
+    Option(args.bcftoolsArgs).foreach(builder.add(_))
+
+    if (args.useDocker || args.useSingularity) {
+      builder
+        .setImage(args.image)
+        .setSudo(args.sudo)
+    }
+
+    info("Piping %s to bcftools with command: %s files: %s".format(
+      variants, builder.build(), builder.getFiles()))
+
+    implicit val tFormatter = VCFInFormatter
+    implicit val uFormatter = new VCFOutFormatter(sc.hadoopConfiguration, stringency)
+
+    variants.pipe[VariantContext, VariantContextProduct, VariantContextDataset, VCFInFormatter](
+      cmd = builder.build(),
+      files = builder.getFiles()
+    )
+  }
+}
diff --git a/core/src/main/scala/org/bdgenomics/cannoli/Cannoli.scala b/core/src/main/scala/org/bdgenomics/cannoli/Cannoli.scala
index 4354907..8caede8 100644
--- a/core/src/main/scala/org/bdgenomics/cannoli/Cannoli.scala
+++ b/core/src/main/scala/org/bdgenomics/cannoli/Cannoli.scala
@@ -173,6 +173,19 @@ object Cannoli {
       new Vep(args, stringency, vcs.rdd.context).apply(vcs)
     }
 
+    /**
+     * Call variant contexts in this VariantContextDataset with bcftools call via Cannoli.
+     *
+     * @param args Bcftools call function arguments.
+     * @param stringency Validation stringency. Defaults to ValidationStringency.LENIENT.
+     * @return VariantContextDataset.
+     */
+    def callWithBcftools(
+      args: BcftoolsCallArgs,
+      stringency: ValidationStringency = ValidationStringency.LENIENT): VariantContextDataset = {
+      new BcftoolsCall(args, stringency, vcs.rdd.context).apply(vcs)
+    }
+
     /**
      * Normalize the variant contexts in this VariantContextDataset with bcftools norm via Cannoli.
      *
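
Usage note (a minimal sketch, not part of the patch): assuming the implicit conversions added to org.bdgenomics.cannoli.Cannoli are in scope via import org.bdgenomics.cannoli.Cannoli._, the new callWithBcftools function could be exercised from cannoli-shell roughly as below. The input and output paths and the Docker flag are illustrative only.

  import htsjdk.samtools.ValidationStringency
  import org.bdgenomics.adam.rdd.ADAMContext._
  import org.bdgenomics.cannoli.Cannoli._
  import org.bdgenomics.cannoli.BcftoolsCallArgs

  // configure the function arguments; run bcftools inside its biocontainers image
  val args = new BcftoolsCallArgs()
  args.useDocker = true

  // load genotype likelihoods (e.g. produced by bcftools mpileup), call variants,
  // and save the piped variant contexts as a single VCF
  val variantContexts = sc.loadVariantContexts("sample.mpileup.vcf.bgz")
  val called = variantContexts.callWithBcftools(args)
  called.saveAsVcf("sample.called.vcf.bgz",
    asSingleFile = true,
    deferMerging = false,
    disableFastConcat = false,
    ValidationStringency.LENIENT)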