From 13c70cbc775d0e52acd60ddd9f711dc8f46a3446 Mon Sep 17 00:00:00 2001
From: Michael L Heuer
Date: Wed, 13 Sep 2017 08:53:41 -0500
Subject: [PATCH] Adding wrapper for samtools mpileup.

---
 .../org/bdgenomics/cannoli/cli/Cannoli.scala  |   1 +
 .../org/bdgenomics/cannoli/cli/Samtools.scala | 128 ++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 cli/src/main/scala/org/bdgenomics/cannoli/cli/Samtools.scala

diff --git a/cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala b/cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala
index deaa506..de09f30 100644
--- a/cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala
+++ b/cli/src/main/scala/org/bdgenomics/cannoli/cli/Cannoli.scala
@@ -34,6 +34,7 @@ object Cannoli {
       Bowtie2,
       Bwa,
       Freebayes,
+      Samtools,
       SnpEff)),
     CommandGroup("CANNOLI TOOLS", List(InterleaveFastq,
       SampleReads)))
diff --git a/cli/src/main/scala/org/bdgenomics/cannoli/cli/Samtools.scala b/cli/src/main/scala/org/bdgenomics/cannoli/cli/Samtools.scala
new file mode 100644
index 0000000..b97a4ba
--- /dev/null
+++ b/cli/src/main/scala/org/bdgenomics/cannoli/cli/Samtools.scala
@@ -0,0 +1,128 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.cannoli.cli
+
+import htsjdk.samtools.ValidationStringency
+import org.apache.spark.SparkContext
+import org.bdgenomics.adam.models.VariantContext
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs
+import org.bdgenomics.adam.rdd.read.{ AlignmentRecordRDD, BAMInFormatter }
+import org.bdgenomics.adam.rdd.variant.{ VariantContextRDD, VCFOutFormatter }
+import org.bdgenomics.utils.cli._
+import org.bdgenomics.utils.misc.Logging
+import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
+
+/**
+ * Companion object for the samtools command.
+ */
+object Samtools extends BDGCommandCompanion {
+  val commandName = "samtools"
+  val commandDescription = "ADAM Pipe API wrapper for samtools mpileup."
+
+  /**
+   * Parses command line arguments and creates the command.
+   *
+   * @param cmdLine Command line arguments to parse.
+   * @return Samtools command configured with the parsed arguments.
+   */
+  def apply(cmdLine: Array[String]): Samtools = {
+    new Samtools(Args4j[SamtoolsArgs](cmdLine))
+  }
+}
+
+/**
+ * Command line arguments for the samtools mpileup wrapper.
+ */
+class SamtoolsArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs {
+  @Argument(required = true, metaVar = "INPUT", usage = "Location to pipe from.", index = 0)
+  var inputPath: String = null
+
+  @Argument(required = true, metaVar = "OUTPUT", usage = "Location to pipe to, in VCF format.", index = 1)
+  var outputPath: String = null
+
+  @Args4jOption(required = false, name = "-samtools_path", usage = "Path to the samtools executable. Defaults to samtools.")
+  var samtoolsPath: String = "samtools"
+
+  @Args4jOption(required = true, name = "-samtools_reference", usage = "Reference sequence for analysis. An index file (.fai) will be created if none exists.")
+  var referencePath: String = null
+
+  @Args4jOption(required = false, name = "-docker_image", usage = "Docker image to use. Defaults to heuermh/samtools.")
+  var dockerImage: String = "heuermh/samtools"
+
+  @Args4jOption(required = false, name = "-use_docker", usage = "If true, uses Docker to launch samtools. If false, uses the samtools executable path.")
+  var useDocker: Boolean = false
+
+  @Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file.")
+  var asSingleFile: Boolean = false
+
+  @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output.")
+  var deferMerging: Boolean = false
+
+  @Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.")
+  var disableFastConcat: Boolean = false
+
+  @Args4jOption(required = false, name = "-stringency", usage = "Stringency level for various checks; can be SILENT, LENIENT, or STRICT. Defaults to STRICT.")
+  var stringency: String = "STRICT"
+
+  // must be defined due to ADAMSaveAnyArgs, but unused here
+  var sortFastqOutput: Boolean = false
+}
+
+/**
+ * Pipes alignments to samtools mpileup and saves the variant contexts it emits as VCF.
+ *
+ * @param args Parsed command line arguments.
+ */
+class Samtools(protected val args: SamtoolsArgs) extends BDGSparkCommand[SamtoolsArgs] with Logging {
+  val companion = Samtools
+
+  // valueOf throws IllegalArgumentException when -stringency does not name an enum constant
+  val stringency: ValidationStringency = ValidationStringency.valueOf(args.stringency)
+
+  /**
+   * Loads the input alignments, pipes them in BAM format through samtools
+   * mpileup, and saves the piped output to the output path as VCF.
+   *
+   * @param sc Spark context used to load the input alignments.
+   */
+  def run(sc: SparkContext): Unit = {
+    val input: AlignmentRecordRDD = sc.loadAlignments(args.inputPath, stringency = stringency)
+
+    implicit val tFormatter = BAMInFormatter
+    implicit val uFormatter = new VCFOutFormatter
+
+    // launch prefix: samtools inside a Docker container, or the configured executable
+    val launcher = if (args.useDocker) {
+      Seq("docker", "run", "--interactive", "--rm", args.dockerImage, "samtools")
+    } else {
+      Seq(args.samtoolsPath)
+    }
+
+    // mpileup arguments shared by both launch modes; "-" is the piped input, -v -u select uncompressed VCF
+    val mpileupArgs = Seq("mpileup", "-", "--reference", args.referencePath, "-v", "-u")
+
+    val samtoolsCommand = (launcher ++ mpileupArgs).mkString(" ")
+
+    // NOTE(review): presumably cached so saving does not re-run the piped command -- confirm
+    val output: VariantContextRDD = input.pipe[VariantContext, VariantContextRDD, BAMInFormatter](samtoolsCommand)
+      .transform(_.cache())
+
+    output.saveAsVcf(args, stringency)
+  }
+}