diff --git a/README.md b/README.md index f8bfb4ee..8205fc6a 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,11 @@ RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. It is based on the novel idea of _pseudoalignment_ for rapidly determining the compatibility of reads with targets, without the need for alignment. On benchmarks with standard RNA-Seq data, __kallisto__ can -quantify 30 million human reads in less than 3 minutes on a Mac desktop +quantify 30 million human bulk RNA-seq reads in less than 3 minutes on a Mac desktop computer using only the read sequences and a transcriptome index that -itself takes less than 10 minutes to build. Pseudoalignment of reads +itself takes than 10 minutes to build. Pseudoalignment of reads preserves the key information needed for quantification, and __kallisto__ -is therefore not only fast, but also as accurate than existing +is therefore not only fast, but also comparably accurate to other existing quantification tools. In fact, because the pseudoalignment procedure is robust to errors in the reads, in many benchmarks __kallisto__ significantly outperforms existing tools. The __kallisto__ algorithms are described in more detail in: @@ -18,7 +18,9 @@ NL Bray, H Pimentel, P Melsted and L Pachter, [Near optimal probabilistic RNA-se Scripts reproducing all the results of the paper are available [here](https://github.com/pachterlab/kallisto_paper_analysis). -__kallisto__ quantified RNA-Seq can be analyzed with [__sleuth__](https://github.com/pachterlab/sleuth/). +__kallisto__ quantified bulk RNA-Seq can be analyzed with [__sleuth__](https://github.com/pachterlab/sleuth/). + +__kallisto__ can be used together with [__bustools__](https://bustools.github.io/) to pre-process single-cell RNA-seq data. See the [kallistobus.tools](https://www.kallistobus.tools/) website for instructions. ## Manual @@ -26,24 +28,15 @@ Please visit http://pachterlab.github.io/kallisto/manual.html for the manual. ## License -Please read the license before using kallisto. The license is distributed with __kallisto__ in the file `license.txt` also viewable [here](https://pachterlab.github.io/kallisto/download). - -## Announcements - -There is a low traffic Google Group, -[kallisto-sleuth-announcements](https://groups.google.com/d/forum/kallisto-sleuth-announcements) -where we make announcements about new releases. This is a read-only mailing -list. +__kallisto__ is distributed under the BSD-2 license. The license is distributed with __kallisto__ in the file `license.txt`, which is also viewable [here](https://pachterlab.github.io/kallisto/download). Please read the license before using __kallisto__. ## Getting help -For help running __kallisto__, please post to the [kallisto-sleuth-users -Google Group](https://groups.google.com/d/forum/kallisto-sleuth-users). +For help running __kallisto__, please post to the [kallisto-and-applications Google Group](https://groups.google.com/forum/#!forum/kallisto-and-applications). ## Reporting bugs -Please report bugs to the [Github issues -page](https://github.com/pachterlab/kallisto/issues) +Please report bugs to the [Github issues page](https://github.com/pachterlab/kallisto/issues) ## Development and pull requests diff --git a/src/ProcessReads.cpp b/src/ProcessReads.cpp index b6702d25..d8f151bd 100644 --- a/src/ProcessReads.cpp +++ b/src/ProcessReads.cpp @@ -256,7 +256,7 @@ int64_t ProcessBUSReads(MasterProcessor& MP, const ProgramOptions& opt) { if (nummapped == 0) { std::cerr << "[~warn] no reads pseudoaligned." << std::endl; } - + return numreads; } @@ -565,7 +565,7 @@ void MasterProcessor::processAln(const EMAlgorithm& em, bool useEM = true) { assert(opt.pseudobam); pseudobatchf_in.open(opt.output + "/pseudoaln.bin", std::ios::in | std::ios::binary); - SR.reset(); + SR->reset(); std::vector workers; for (int i = 0; i < opt.threads; i++) { @@ -893,6 +893,7 @@ ReadProcessor::ReadProcessor(ReadProcessor && o) : seqs(std::move(o.seqs)), names(std::move(o.names)), quals(std::move(o.quals)), + flags(std::move(o.flags)), umis(std::move(o.umis)), newEcs(std::move(o.newEcs)), flens(std::move(o.flens)), @@ -919,16 +920,16 @@ void ReadProcessor::operator()() { if (batchSR.empty()) { return; } else { - batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, mp.opt.pseudobam ); + batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam ); } } else { std::lock_guard lock(mp.reader_lock); - if (mp.SR.empty()) { + if (mp.SR->empty()) { // nothing to do return; } else { // get new sequences - mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion); + mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion); } // release the reader lock } @@ -1222,7 +1223,7 @@ void ReadProcessor::clear() { BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, const MinCollector& tc, MasterProcessor& mp, int _id, int _local_id) : - paired(!opt.single_end), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) { + paired(!opt.single_end), bam(opt.bam), num(opt.num), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) { // initialize buffer bufsize = mp.bufsize; buffer = new char[bufsize]; @@ -1238,6 +1239,8 @@ BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, co BUSProcessor::BUSProcessor(BUSProcessor && o) : paired(o.paired), + bam(o.bam), + num(o.num), tc(o.tc), index(o.index), mp(o.mp), @@ -1248,6 +1251,7 @@ BUSProcessor::BUSProcessor(BUSProcessor && o) : seqs(std::move(o.seqs)), names(std::move(o.names)), quals(std::move(o.quals)), + flags(std::move(o.flags)), newEcs(std::move(o.newEcs)), flens(std::move(o.flens)), bias5(std::move(o.bias5)), @@ -1273,13 +1277,13 @@ void BUSProcessor::operator()() { // grab the reader lock { std::lock_guard lock(mp.reader_lock); - if (mp.SR.empty()) { + if (mp.SR->empty()) { // nothing to do return; } else { // get new sequences std::vector umis; - mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, false); + mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, false); } // release the reader lock } @@ -1304,7 +1308,7 @@ void BUSProcessor::processBuffer() { std::vector> v,v2; std::vector vtmp; std::vector u; - + u.reserve(1000); v.reserve(1000); vtmp.reserve(1000); @@ -1326,9 +1330,17 @@ void BUSProcessor::processBuffer() { // actually process the sequences const BUSOptions& busopt = mp.opt.busOptions; - int incf = busopt.nfiles-1; + int incf, jmax; + if (bam) { + incf = 1; + jmax = 2; + } else { + incf = busopt.nfiles-1; + jmax = busopt.nfiles; + } + //int incf = (bam) ? 1 : busopt.nfiles-1; for (int i = 0; i + incf < seqs.size(); i++) { - for (int j = 0; j < busopt.nfiles; j++) { + for (int j = 0; j < jmax /*(bam) ? 2 : busopt.nfiles*/; j++) { s[j] = seqs[i+j].first; l[j] = seqs[i+j].second; } @@ -1350,7 +1362,7 @@ void BUSProcessor::processBuffer() { } - auto &bcc = busopt.bc[0]; +// auto &bcc = busopt.bc[0]; int blen = 0; bool bad_bc = false; for (auto &bcc : busopt.bc) { @@ -1401,6 +1413,9 @@ void BUSProcessor::processBuffer() { b.flags |= (f) << 8; b.count = 1; //std::cout << std::string(s1,10) << "\t" << b.barcode << "\t" << std::string(s1+10,16) << "\t" << b.UMI << "\n"; + if (num) { + b.flags = (uint32_t) flags[i / jmax]; + } //ec = tc.findEC(u); @@ -1438,8 +1453,7 @@ AlnProcessor::AlnProcessor(const KmerIndex& index, const ProgramOptions& opt, Ma buffer = new char[bufsize]; bambufsize = 1<<20; bambuffer = new char[bambufsize]; // refactor this? - - + if (opt.batch_mode) { /* need to check this later */ assert(id != -1); @@ -1475,6 +1489,7 @@ AlnProcessor::AlnProcessor(AlnProcessor && o) : seqs(std::move(o.seqs)), names(std::move(o.names)), quals(std::move(o.quals)), + flags(std::move(o.flags)), umis(std::move(o.umis)), batchSR(std::move(o.batchSR)) { buffer = o.buffer; @@ -1515,19 +1530,19 @@ void AlnProcessor::operator()() { if (batchSR.empty()) { return; } else { - batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, true ); + batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true ); readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch); assert(pseudobatch.batch_id == readbatch_id); assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks } } else { std::lock_guard lock(mp.reader_lock); - if (mp.SR.empty()) { + if (mp.SR->empty()) { // nothing to do return; } else { // get new sequences - mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, true); + mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true); readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch); assert(pseudobatch.batch_id == readbatch_id); assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks @@ -2557,9 +2572,14 @@ int fillBamRecord(bam1_t &b, uint8_t* buf, const char *seq, const char *name, co } -/** -- sequence reader -- **/ +/** -- sequence readers -- **/ + +void SequenceReader::reset() { + state = false; + readbatch_id = -1; +} -SequenceReader::~SequenceReader() { +FastqSequenceReader::~FastqSequenceReader() { for (auto &f : fp) { if (f) { gzclose(f); @@ -2578,26 +2598,20 @@ SequenceReader::~SequenceReader() { } -void SequenceReader::reserveNfiles(int n) { - fp.resize(nfiles); - seq.resize(nfiles, nullptr); - l.resize(nfiles, 0); - nl.resize(nfiles, 0); +bool FastqSequenceReader::empty() { + return (!state && current_file >= files.size()); } -void SequenceReader::reset() { - for (auto &f : fp) { +void FastqSequenceReader::reset() { + SequenceReader::reset(); + + for (auto &f : fp) { if (f) { gzclose(f); } f = nullptr; } - for (auto &s : seq) { - kseq_destroy(s); - s = nullptr; - } - if (f_umi && f_umi->is_open()) { f_umi->close(); } @@ -2612,15 +2626,24 @@ void SequenceReader::reset() { } current_file = 0; - state = false; - readbatch_id = -1; + for (auto &s : seq) { + kseq_destroy(s); + s = nullptr; + } } +void FastqSequenceReader::reserveNfiles(int n) { + fp.resize(nfiles); + l.resize(nfiles, 0); + nl.resize(nfiles, 0); + seq.resize(nfiles, nullptr); +} // returns true if there is more left to read from the files -bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector > &seqs, +bool FastqSequenceReader::fetchSequences(char *buf, const int limit, std::vector > &seqs, std::vector > &names, std::vector > &quals, + std::vector& flags, std::vector &umis, int& read_id, bool full) { @@ -2634,6 +2657,7 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector> umi; umis.emplace_back(std::move(umi)); } + + flags.push_back(numreads++); } else { return true; // read it next time } @@ -2732,27 +2758,146 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector= files.size()); -} - // move constructor -SequenceReader::SequenceReader(SequenceReader&& o) : +#if 1 +FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) : + nfiles(o.nfiles), + numreads(o.numreads), fp(std::move(o.fp)), - seq(std::move(o.seq)), l(std::move(o.l)), nl(std::move(o.nl)), paired(o.paired), - nfiles(o.nfiles), files(std::move(o.files)), umi_files(std::move(o.umi_files)), f_umi(std::move(o.f_umi)), - current_file(o.current_file), - state(o.state) { + current_file(o.current_file) { + + o.fp.resize(nfiles); + o.l.resize(nfiles, 0); + o.nl.resize(nfiles, 0); + o.state = false; +} +#endif +#if 0 +FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) { + fp = (std::move(o.fp)); + l = (std::move(o.l)); + nl = (std::move(o.nl)); + paired = (o.paired); + nfiles = (o.nfiles); + files = (std::move(o.files)), + umi_files = (std::move(o.umi_files)); + f_umi = (std::move(o.f_umi)); + current_file= (o.current_file); + seq = (std::move(o.seq)); o.fp.resize(nfiles); - o.seq.resize(nfiles, nullptr); o.l.resize(nfiles, 0); o.nl.resize(nfiles, 0); o.state = false; + o.seq.resize(nfiles, nullptr); +} +#endif +const std::string BamSequenceReader::seq_enc = "=ACMGRSVTWYHKDBN"; + +BamSequenceReader::~BamSequenceReader() { + if (fp) { + bgzf_close(fp); + } + if (head) { + bam_hdr_destroy(head); + } + if (rec) { + bam_destroy1(rec); + } +} + +bool BamSequenceReader::empty() { + return !state; +} + +void BamSequenceReader::reset() { + SequenceReader::reset(); +} + +void BamSequenceReader::reserveNfiles(int n) { +} + +// returns true if there is more left to read from the files +bool BamSequenceReader::fetchSequences(char *buf, const int limit, std::vector > &seqs, + std::vector > &names, + std::vector > &quals, + std::vector& flags, + std::vector &umis, int& read_id, + bool full) { + + readbatch_id += 1; // increase the batch id + read_id = readbatch_id; // copy now because we are inside a lock + seqs.clear(); + flags.clear(); + + int bufpos = 0; + while (true) { + if (!state) { // should we open a file + return false; + } + + // the file is open and we have read into seq1 and seq2 + if (err >= 0) { + if (!(rec->core.flag & BAM_FSECONDARY)) { // record is primary alignment + int bufadd = l_seq + l_bc + l_umi + 2; + // fits into the buffer + if (bufpos+bufadd < limit) { + char *pi; + + pi = buf + bufpos; + memcpy(pi, bc, l_bc); + memcpy(pi + l_bc, umi, l_umi + 1); + seqs.emplace_back(pi, l_bc + l_umi); + bufpos += l_bc + l_umi + 1; + + pi = buf + bufpos; + int len = (l_seq + 1) / 2; + if (l_seq % 2) --len; + int j = 0; + for (int i = 0; i < len; ++i, ++eseq) { + buf[bufpos++] = seq_enc[*eseq >> 4]; + buf[bufpos++] = seq_enc[*eseq & 0x0F]; + } + if (l_seq % 2) { + buf[bufpos++] = seq_enc[*eseq >> 4]; + } + buf[bufpos++] = '\0'; + seqs.emplace_back(pi, l_seq); + + } else { + return true; // read it next time + } + } + + // read for the next one + err = bam_read1(fp, rec); + if (err < 0) { + return true; + } + eseq = bam_get_seq(rec); + l_seq = rec->core.l_qseq; + + bc = bam_aux2Z(bam_aux_get(rec, "CR")); + l_bc = 0; + for (char *pbc = bc; *pbc != '\0'; ++pbc) { + ++l_bc; + } + + umi = bam_aux2Z(bam_aux_get(rec, "UR")); + l_umi = 0; + for (char *pumi = umi; *pumi != '\0'; ++pumi) { + ++l_umi; + } + + } else { + state = false; // haven't opened file yet + } + } } + diff --git a/src/ProcessReads.h b/src/ProcessReads.h index 011d5394..bb475a04 100644 --- a/src/ProcessReads.h +++ b/src/ProcessReads.h @@ -23,6 +23,7 @@ #include "GeneModel.h" #include "BUSData.h" #include "BUSTools.h" +#include #ifndef KSEQ_INIT_READY @@ -41,9 +42,35 @@ class SequenceReader { public: SequenceReader(const ProgramOptions& opt) : - paired(!opt.single_end), files(opt.files), - f_umi(new std::ifstream{}), - current_file(0), state(false), readbatch_id(-1) { + readbatch_id(-1) {}; + SequenceReader() : state(false), readbatch_id(-1) {}; + virtual ~SequenceReader() {} +// SequenceReader(SequenceReader&& o); + + virtual bool empty() = 0; + virtual void reset(); + virtual void reserveNfiles(int n) = 0; + virtual bool fetchSequences(char *buf, const int limit, std::vector>& seqs, + std::vector>& names, + std::vector>& quals, + std::vector& flags, + std::vector& umis, int &readbatch_id, + bool full=false) = 0; + + +public: + bool state; // is the file open + int readbatch_id = -1; +}; + +class FastqSequenceReader : public SequenceReader { +public: + + FastqSequenceReader(const ProgramOptions& opt) : SequenceReader(opt), + current_file(0), paired(!opt.single_end), files(opt.files), + f_umi(new std::ifstream{}) { + SequenceReader::state = false; + if (opt.bus_mode) { nfiles = opt.busOptions.nfiles; } else { @@ -51,46 +78,106 @@ class SequenceReader { } reserveNfiles(nfiles); } - SequenceReader() : + FastqSequenceReader() : SequenceReader(), paired(false), f_umi(new std::ifstream{}), - current_file(0), state(false), readbatch_id(-1) {} - SequenceReader(SequenceReader&& o); - - bool empty(); + current_file(0) {}; + FastqSequenceReader(FastqSequenceReader &&o); + ~FastqSequenceReader(); + + bool empty(); void reset(); void reserveNfiles(int n); - ~SequenceReader(); - bool fetchSequences(char *buf, const int limit, std::vector>& seqs, std::vector>& names, std::vector>& quals, + std::vector& flags, std::vector& umis, int &readbatch_id, bool full=false); public: int nfiles = 1; + uint32_t numreads = 0; std::vector fp; - //gzFile fp1 = 0, fp2 = 0; - std::vector seq; std::vector l; std::vector nl; - //kseq_t *seq1 = 0, *seq2 = 0; - //int l1,l2,nl1,nl2; bool paired; std::vector files; std::vector umi_files; std::unique_ptr f_umi; int current_file; - bool state; // is the file open - int readbatch_id = -1; + std::vector seq; +}; + +class BamSequenceReader : public SequenceReader { +public: + + BamSequenceReader(const ProgramOptions& opt) : + SequenceReader(opt) { + SequenceReader::state = true; + + fp = bgzf_open(opt.files[0].c_str(), "rb"); + head = bam_hdr_read(fp); + rec = bam_init1(); + + err = bam_read1(fp, rec); + eseq = bam_get_seq(rec); + l_seq = rec->core.l_qseq; + + bc = bam_aux2Z(bam_aux_get(rec, "CR")); + l_bc = 0; + for (char *pbc = bc; *pbc != '\0'; ++pbc) { + ++l_bc; + } + + umi = bam_aux2Z(bam_aux_get(rec, "UR")); + l_umi = 0; + for (char *pumi = umi; *pumi != '\0'; ++pumi) { + ++l_umi; + } + } + BamSequenceReader() : SequenceReader() {}; + BamSequenceReader(BamSequenceReader &&o); + ~BamSequenceReader(); + + bool empty(); + void reset(); + void reserveNfiles(int n); + bool fetchSequences(char *buf, const int limit, std::vector>& seqs, + std::vector>& names, + std::vector>& quals, + std::vector& flags, + std::vector& umis, int &readbatch_id, + bool full=false); + +public: + BGZF *fp; + bam_hdr_t *head; + bam1_t *rec; + uint8_t *eseq; + int32_t l_seq; + char *bc; + int l_bc; + char *umi; + int l_umi; + int err; + char seq[256]; // Is there a better limit? + +private: + static const std::string seq_enc; }; class MasterProcessor { public: MasterProcessor (KmerIndex &index, const ProgramOptions& opt, MinCollector &tc, const Transcriptome& model) - : tc(tc), index(index), model(model), bamfp(nullptr), bamfps(nullptr), bamh(nullptr), opt(opt), SR(opt), numreads(0) + : tc(tc), index(index), model(model), bamfp(nullptr), bamfps(nullptr), bamh(nullptr), opt(opt), numreads(0) ,nummapped(0), num_umi(0), bufsize(1ULL<<23), tlencount(0), biasCount(0), maxBiasCount((opt.bias) ? 1000000 : 0), last_pseudobatch_id (-1) { + if (opt.bam) { + SR = new BamSequenceReader(opt); + } else { + SR = new FastqSequenceReader(opt); + } + if (opt.batch_mode) { memset(&bus_bc_len[0],0,33); memset(&bus_umi_len[0],0,33); @@ -136,13 +223,14 @@ class MasterProcessor { bam_hdr_destroy(bamh); bamh = nullptr; } + delete SR; } std::mutex reader_lock; std::mutex writer_lock; - SequenceReader SR; + SequenceReader *SR; MinCollector& tc; KmerIndex& index; const Transcriptome& model; @@ -205,7 +293,7 @@ class ReadProcessor { std::vector, std::string>> new_ec_umi; const KmerIndex& index; MasterProcessor& mp; - SequenceReader batchSR; + FastqSequenceReader batchSR; int64_t numreads; int id; int local_id; @@ -215,6 +303,7 @@ class ReadProcessor { std::vector> seqs; std::vector> names; std::vector> quals; + std::vector flags; std::vector umis; std::vector> newEcs; std::vector flens; @@ -237,6 +326,8 @@ class BUSProcessor { size_t bufsize; bool paired; + bool bam; + bool num; const MinCollector& tc; const KmerIndex& index; MasterProcessor& mp; @@ -251,6 +342,7 @@ class BUSProcessor { std::vector> seqs; std::vector> names; std::vector> quals; + std::vector flags; std::vector> newEcs; std::vector flens; @@ -279,7 +371,7 @@ class AlnProcessor { const KmerIndex& index; const EMAlgorithm& em; MasterProcessor& mp; - SequenceReader batchSR; + FastqSequenceReader batchSR; int64_t numreads; int id; PseudoAlignmentBatch pseudobatch; @@ -291,6 +383,7 @@ class AlnProcessor { std::vector> seqs; std::vector> names; std::vector> quals; + std::vector flags; std::vector umis; void operator()(); diff --git a/src/common.h b/src/common.h index 85f9e280..950c5cd5 100644 --- a/src/common.h +++ b/src/common.h @@ -5,7 +5,7 @@ #include #include - +#include #ifdef _WIN64 typedef unsigned int uint; @@ -67,6 +67,8 @@ struct ProgramOptions { bool bus_mode; BUSOptions busOptions; bool pseudo_quant; + bool bam; + bool num; std::string batch_file_name; std::vector> batch_files; std::vector batch_ids; @@ -107,6 +109,8 @@ ProgramOptions() : batch_mode(false), bus_mode(false), pseudo_quant(false), + bam(false), + num(false), plaintext(false), write_index(false), single_end(false), diff --git a/src/main.cpp b/src/main.cpp index e0fccfd1..c061c658 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -542,13 +542,15 @@ void ListSingleCellTechnologies() { } void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) { - const char *opt_string = "i:o:x:t:l"; + const char *opt_string = "i:o:x:t:lbn"; static struct option long_options[] = { {"index", required_argument, 0, 'i'}, {"output-dir", required_argument, 0, 'o'}, {"technology", required_argument, 0, 'x'}, {"list", no_argument, 0, 'l'}, {"threads", required_argument, 0, 't'}, + {"bam", no_argument, 0, 'b'}, + {"num", no_argument, 0, 'n'}, {0,0,0,0} }; @@ -586,6 +588,14 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) { stringstream(optarg) >> opt.threads; break; } + case 'b': { + opt.bam = true; + break; + } + case 'n': { + opt.num = true; + break; + } default: break; } } @@ -814,96 +824,183 @@ bool CheckOptionsBus(ProgramOptions& opt) { ret = false; } else { auto& busopt = opt.busOptions; - - if (opt.technology == "10XV2") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string - busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26] - busopt.bc.push_back(BUSOptionSubstr(0,0,16)); - } else if (opt.technology == "10XV3") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(1,0,0); - busopt.umi = BUSOptionSubstr(0,16,28); - busopt.bc.push_back(BUSOptionSubstr(0,0,16)); - } else if (opt.technology == "10XV1") { - busopt.nfiles = 3; - busopt.seq = BUSOptionSubstr(0,0,0); - busopt.umi = BUSOptionSubstr(1,0,0); - busopt.bc.push_back(BUSOptionSubstr(2,0,0)); - } else if (opt.technology == "SURECELL") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(1,0,0); - busopt.umi = BUSOptionSubstr(0,51,59); - busopt.bc.push_back(BUSOptionSubstr(0,0,6)); - busopt.bc.push_back(BUSOptionSubstr(0,21,27)); - busopt.bc.push_back(BUSOptionSubstr(0,42,48)); - } else if (opt.technology == "DROPSEQ") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(1,0,0); - busopt.umi = BUSOptionSubstr(0,12,20); - busopt.bc.push_back(BUSOptionSubstr(0,0,12)); - } else if (opt.technology == "INDROPSV1") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(1,0,0); - busopt.umi = BUSOptionSubstr(0,42,48); - busopt.bc.push_back(BUSOptionSubstr(0,0,11)); - busopt.bc.push_back(BUSOptionSubstr(0,30,38)); - } else if (opt.technology == "INDROPSV2") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(0,0,0); - busopt.umi = BUSOptionSubstr(1,42,48); - busopt.bc.push_back(BUSOptionSubstr(1,0,11)); - busopt.bc.push_back(BUSOptionSubstr(1,30,38)); - } else if (opt.technology == "INDROPSV3") { - busopt.nfiles = 3; - busopt.seq = BUSOptionSubstr(2,0,0); - busopt.umi = BUSOptionSubstr(1,8,14); - busopt.bc.push_back(BUSOptionSubstr(0,0,8)); - busopt.bc.push_back(BUSOptionSubstr(1,0,8)); - } else if (opt.technology == "CELSEQ") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(1,0,0); - busopt.umi = BUSOptionSubstr(0,8,12); - busopt.bc.push_back(BUSOptionSubstr(0,0,8)); - } else if (opt.technology == "CELSEQ2") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(1,0,0); - busopt.umi = BUSOptionSubstr(0,0,6); - busopt.bc.push_back(BUSOptionSubstr(0,6,12)); - } else if (opt.technology == "SCRBSEQ") { - busopt.nfiles = 2; - busopt.seq = BUSOptionSubstr(1,0,0); - busopt.umi = BUSOptionSubstr(0,6,16); - busopt.bc.push_back(BUSOptionSubstr(0,0,6)); - } else { - vector files; - vector values; - vector bcValues; - vector errorList; - bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues); - if(!invalid) { - busopt.nfiles = files.size(); - for(int i = 0; i < bcValues.size(); i++) { - busopt.bc.push_back(bcValues[i]); + + if (opt.bam) { // Note: only 10xV2 has been tested + busopt.nfiles = 1; + if (opt.technology == "10XV2") { + busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string + busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26] + busopt.bc.push_back(BUSOptionSubstr(0,0,16)); + } else if (opt.technology == "10XV3") { + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,16,28); + busopt.bc.push_back(BUSOptionSubstr(0,0,16)); +// } else if (opt.technology == "10XV1") { + + } else if (opt.technology == "SURECELL") { + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,18,26); + busopt.bc.push_back(BUSOptionSubstr(0,0,18)); + } else if (opt.technology == "DROPSEQ") { + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,12,20); + busopt.bc.push_back(BUSOptionSubstr(0,0,12)); + } else if (opt.technology == "INDROPSV1") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,42,48); + busopt.bc.push_back(BUSOptionSubstr(0,0,11)); + busopt.bc.push_back(BUSOptionSubstr(0,30,38)); + } else if (opt.technology == "INDROPSV2") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(0,0,0); + busopt.umi = BUSOptionSubstr(1,42,48); + busopt.bc.push_back(BUSOptionSubstr(1,0,11)); + busopt.bc.push_back(BUSOptionSubstr(1,30,38)); + } else if (opt.technology == "INDROPSV3") { + busopt.nfiles = 3; + busopt.seq = BUSOptionSubstr(2,0,0); + busopt.umi = BUSOptionSubstr(1,8,14); + busopt.bc.push_back(BUSOptionSubstr(0,0,8)); + busopt.bc.push_back(BUSOptionSubstr(1,0,8)); + } else if (opt.technology == "CELSEQ") { + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,8,12); + busopt.bc.push_back(BUSOptionSubstr(0,0,8)); + } else if (opt.technology == "CELSEQ2") { + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,0,6); + busopt.bc.push_back(BUSOptionSubstr(0,6,12)); + } else if (opt.technology == "SCRBSEQ") { + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,6,16); + busopt.bc.push_back(BUSOptionSubstr(0,0,6)); + } else if (opt.technology == "INDROPSV3") { + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,0,6); + busopt.bc.push_back(BUSOptionSubstr(0,6,16)); + } else { + vector files; + vector values; + vector bcValues; + vector errorList; + bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues); + if(!invalid) { + busopt.nfiles = files.size(); + for(int i = 0; i < bcValues.size(); i++) { + busopt.bc.push_back(bcValues[i]); + } + busopt.umi = values[0]; + busopt.seq = values[1]; + } else { + for(int j = 0; j < errorList.size(); j++) { + cerr << errorList[j] << endl; + } + cerr << "Unable to create technology: " << opt.technology << endl; + ret = false; } - busopt.umi = values[0]; - busopt.seq = values[1]; + } + } else { + if (opt.technology == "10XV2") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string + busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26] + busopt.bc.push_back(BUSOptionSubstr(0,0,16)); + } else if (opt.technology == "10XV3") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,16,28); + busopt.bc.push_back(BUSOptionSubstr(0,0,16)); + } else if (opt.technology == "10XV1") { + busopt.nfiles = 3; + busopt.seq = BUSOptionSubstr(0,0,0); + busopt.umi = BUSOptionSubstr(1,0,0); + busopt.bc.push_back(BUSOptionSubstr(2,0,0)); + } else if (opt.technology == "SURECELL") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,51,59); + busopt.bc.push_back(BUSOptionSubstr(0,0,6)); + busopt.bc.push_back(BUSOptionSubstr(0,21,27)); + busopt.bc.push_back(BUSOptionSubstr(0,42,48)); + } else if (opt.technology == "DROPSEQ") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,12,20); + busopt.bc.push_back(BUSOptionSubstr(0,0,12)); + } else if (opt.technology == "INDROPSV1") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,42,48); + busopt.bc.push_back(BUSOptionSubstr(0,0,11)); + busopt.bc.push_back(BUSOptionSubstr(0,30,38)); + } else if (opt.technology == "INDROPSV2") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(0,0,0); + busopt.umi = BUSOptionSubstr(1,42,48); + busopt.bc.push_back(BUSOptionSubstr(1,0,11)); + busopt.bc.push_back(BUSOptionSubstr(1,30,38)); + } else if (opt.technology == "INDROPSV3") { + busopt.nfiles = 3; + busopt.seq = BUSOptionSubstr(2,0,0); + busopt.umi = BUSOptionSubstr(1,8,14); + busopt.bc.push_back(BUSOptionSubstr(0,0,8)); + busopt.bc.push_back(BUSOptionSubstr(1,0,8)); + } else if (opt.technology == "CELSEQ") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,8,12); + busopt.bc.push_back(BUSOptionSubstr(0,0,8)); + } else if (opt.technology == "CELSEQ2") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,0,6); + busopt.bc.push_back(BUSOptionSubstr(0,6,12)); + } else if (opt.technology == "SCRBSEQ") { + busopt.nfiles = 2; + busopt.seq = BUSOptionSubstr(1,0,0); + busopt.umi = BUSOptionSubstr(0,6,16); + busopt.bc.push_back(BUSOptionSubstr(0,0,6)); + } else if (opt.technology == "INDROPSV3") { + busopt.nfiles = 3; + busopt.seq = BUSOptionSubstr(2,0,0); + busopt.umi = BUSOptionSubstr(1,8,14); + busopt.bc.push_back(BUSOptionSubstr(0,0,8)); + busopt.bc.push_back(BUSOptionSubstr(1,0,8)); } else { - for(int j = 0; j < errorList.size(); j++) { - cerr << errorList[j] << endl; + vector files; + vector values; + vector bcValues; + vector errorList; + bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues); + if(!invalid) { + busopt.nfiles = files.size(); + for(int i = 0; i < bcValues.size(); i++) { + busopt.bc.push_back(bcValues[i]); + } + busopt.umi = values[0]; + busopt.seq = values[1]; + } else { + for(int j = 0; j < errorList.size(); j++) { + cerr << errorList[j] << endl; + } + cerr << "Unable to create technology: " << opt.technology << endl; + ret = false; } - cerr << "Unable to create technology: " << opt.technology << endl; - ret = false; } } } - if (ret && opt.files.size() % opt.busOptions.nfiles != 0) { + if (ret && !opt.bam && opt.files.size() % opt.busOptions.nfiles != 0) { cerr << "Error: Number of files (" << opt.files.size() << ") does not match number of input files required by " << "technology " << opt.technology << " (" << opt.busOptions.nfiles << ")" << endl; ret = false; } + if (opt.bam && opt.num) { + cerr << "Warning: --bam option was used, so --num option will be ignored" << endl; + } + return ret; } @@ -1549,7 +1646,9 @@ void usageBus() { << "-x, --technology=STRING Single-cell technology used " << endl << endl << "Optional arguments:" << endl << "-l, --list List all single-cell technologies supported" << endl - << "-t, --threads=INT Number of threads to use (default: 1)" << endl; + << "-t, --threads=INT Number of threads to use (default: 1)" << endl + << "-b, --bam Input file is a BAM file" << endl + << "-n, --num Output number of read in flag column (incompatible with --bam)" << endl; } void usageIndex() {