diff --git a/README.md b/README.md
index 4f7153a..3fbb3b1 100644
--- a/README.md
+++ b/README.md
@@ -15,29 +15,36 @@ other tools such as [Sailfish](https://github.com/kingsfordgroup/sailfish) and
 
 QAPA consists of both Python (2.7+ or 3.5+) and R scripts.
 
-1. Install [R](https://www.r-project.org/) and the R packages optparse, dplyr,
-   data.table, and stringr. In the R console, the packages can be installed in
-   one line:
-
-       install.packages(c("optparse", "dplyr", "data.table", "stringr"))
-
-2. Install the latest development version of
+1. Install the latest development version of
    [bedtools](https://github.com/arq5x/bedtools2). *Note: do not use 2.26.0
    stable release as there is a
    [bug](https://github.com/arq5x/bedtools2/issues/435) with the groupBy tool*.
 
-3. Install the latest QAPA Python/R source code from GitHub. QAPA requires the
-   Python packages pandas, numpy, pybedtools, and biopython. These will be
-   automatically installed, if necessary.
+2. Install the latest development version (or the latest
+   [release](https://github.com/morrislab/qapa/releases/latest)) of QAPA from
+   GitHub. QAPA requires the Python packages pandas, numpy, pybedtools, and
+   biopython. These will be automatically installed, if necessary.
 
        git clone https://github.com/morrislab/qapa.git
        cd qapa
       python setup.py install
 
-       # To test if installation is working:
-       cd
-       which qapa
-       qapa -h
+3. Install [R](https://www.r-project.org/) and the R packages optparse, dplyr,
+   data.table, and stringr. In the R console, the packages can be installed in
+   one line:
+
+       install.packages(c("optparse", "dplyr", "data.table", "stringr"))
+
+   Alternatively, run the provided `install_packages.R` helper script from
+   the command line:
+
+       Rscript scripts/install_packages.R
+
+4. To test if installation is working:
+
+       cd           # change out of the qapa source directory
+       which qapa   # should return path of qapa executable
+       qapa -h      # should display help message
 
 # Usage
 
@@ -86,11 +93,15 @@ The following data sources are required:
 
         mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select * from wgEncodeGencodeBasicVM9" mm10 > gencode.basic.txt
 
-    Note that the `-N` option (suppress column headings) is not used here.
+    Alternatively, if you are starting from a GTF/GFF file, you can convert
+    it to genePred format using the UCSC tool
+    [`gtfToGenePred`](http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred):
+
+        gtfToGenePred -genePredExt custom_genes.gtf custom_genes.genePred
 
 **B. Poly(A) site annotation**
 
-Two options are available.
+As of v1.2.0, this step is optional. Otherwise, two options are available:
 
 Option 1: standard approach (as described in the [paper](#citation))
 
@@ -109,11 +120,11 @@ Option 1: standard approach (as described in the [paper](#citation))
            txEnd, name2, score, strand from wgEncodeGencodePolyaVM9 where
            name2 = 'polyA_site'" -N mm10 > gencode.polyA_sites.bed
 
-Option 2: use custom BED track (*new in v1.1.0*)
+Option 2: use custom BED track
 
 1. Custom BED track of poly(A) sites
 
-    Alternatively a custom BED file of poly(A) can be used to annotate 3' UTRs.
+    A custom BED file of poly(A) sites can be used to annotate 3' UTRs.
     Each entry must contain the start (0-based) and end coordinate of a poly(A)
     site.
 
@@ -127,14 +138,24 @@ A reference genome in FASTA format is required for extracting sequences from
 
 To extract 3' UTRs from annotation, run:
 
-    qapa build --db ensembl_identifiers.txt -g gencode.polyA_sites.bed
-    -p clusters.mm10.bed gencode.basic.txt > output_utrs.bed
+    qapa build --db ensembl_identifiers.txt -g gencode.polyA_sites.bed -p clusters.mm10.bed
+      gencode.basic.txt > output_utrs.bed
 
-Or when using a custom BED file:
+If using a custom BED file, replace the `-g` and `-p` options with `-o`:
 
     qapa build --db ensembl_identifiers.txt -o custom_sites.bed gencode.basic.txt > output_utrs.bed
 
+If using a custom genePred file converted from GTF, include the `-H`
+option:
+
+    qapa build -H --db ensembl_identifiers.txt -o custom_sites.bed
+      custom_genes.genePred > output_utrs.bed
+
+If bypassing the poly(A) annotation step, include the `-N` option:
+
+    qapa build -N --db ensembl_identifiers.txt gencode.basic.txt > output_utrs.bed
+
 Results will be saved in the file `output_utrs.bed` (default is STDOUT).
 
 To extract sequences from the resulting BED file, use the `fasta` sub-command
diff --git a/qapa/annotate.py b/qapa/annotate.py
index 1b54af8..dcd2d74 100644
--- a/qapa/annotate.py
+++ b/qapa/annotate.py
@@ -9,7 +9,6 @@ from pybedtools import featurefuncs
 import re
 
 
-
 def extend_feature(feature, length=24):
     """Extend the 3' end by length
     """
@@ -117,7 +116,7 @@ def main(args, input_filename, fout=sys.stdout):
         custom_mode = True
         custom = pybedtools.BedTool(args.other)
         sites = sort_bed(custom)
-    else:
+    elif not args.no_annotation:
         pas_filter = re.compile("(DS|TE)$")
         gencode = pybedtools.BedTool(args.gencode_polya)\
             .filter(lambda x: x.name == 'polyA_site')\
@@ -147,12 +146,16 @@ def main(args, input_filename, fout=sys.stdout):
     # - Restore BED format with cut()
     # - Use custom function to resolve overlapping features from groupby
-    overlap_utrs = utrs.intersect(sites, s=True, wa=True, wb=True)\
-        .each(restore_feature)\
-        .each(update_3prime,
-              min_intermediate_pas=args.intermediate_polyasite,
-              custom=custom_mode)\
-        .saveas()
+    if args.no_annotation:
+        overlap_utrs = utrs.each(restore_feature)\
+                           .saveas()
+    else:
+        overlap_utrs = utrs.intersect(sites, s=True, wa=True, wb=True)\
+            .each(restore_feature)\
+            .each(update_3prime,
+                  min_intermediate_pas=args.intermediate_polyasite,
+                  custom=custom_mode)\
+            .saveas()
 
     overlap_utrs = sort_bed(overlap_utrs)\
         .groupby(g=[1, 2, 3, 6, 7, 8, 9], c=[4, 5, 10, 11],
                  o=['collapse'] * 4, delim="|")\
diff --git a/qapa/collapse.py b/qapa/collapse.py
index a93f461..d693a74 100644
--- a/qapa/collapse.py
+++ b/qapa/collapse.py
@@ -11,7 +11,7 @@
 
 class Interval:
 
-    def __init__(self, l):
+    def __init__(self, l, sp=None):
         #l = line.rstrip().split("\t")
         self.chrom = l[0]
         self.start = int(l[1])
@@ -24,6 +24,7 @@ def __init__(self, l):
         self.end2 = int(l[7])
         self.gene_id = l[8]
         #self.utr_id = l[9]
+        self.species = self._guess_species(sp)
 
     def is_forward(self):
         return self.strand == "+"
@@ -45,24 +46,36 @@ def set_score(self):
         else:
             self.score = self.start2 - self.start
 
-    def finalize(self):
+    def finalize(self, species=None):
         '''Print the interval using the highest peak'''
         if self.is_forward():
             utr_co = [self.end2, self.end]
         else:
             utr_co = [self.start, self.start2]
-        new_name = [self.name, self._guess_species(), self.chrom, self.start,
-                    self.end, self.strand, 'utr'] + utr_co
+        new_name = [self.name, self.species, self.chrom,
+                    self.start, self.end, self.strand, 'utr'] + utr_co
         new_name = "_".join([str(x) for x in new_name])
         self.set_score()
         return [self.chrom, self.start,
                self.end, new_name, self.score, self.strand, self.gene_id]
 
-    def _guess_species(self):
+    def set_gene(self, conn):
+        '''Query for Ensembl Gene ID'''
+        query = 'select gene_id from ensembl_id where transcript_id = ?'
+        cur = conn.execute(query, (self.name,))
+        r = cur.fetchone()
+        if r:
+            self.gene_id = r[0]
+        else:
+            self.gene_id = None
+
+    def _guess_species(self, species=None):
         if re.match(r'ENST\d+', self.name):
             return 'hg19'
         elif re.match(r'ENSMUST\d+', self.name):
             return 'mm10'
+        elif species is not None:
+            return species
         warnings.warn('Could not guess species from gene name!', Warning)
         return 'unk'
 
@@ -121,12 +134,8 @@ def merge_bed(args, inputfile):
     overlap_diff_genes = set()
 
     print("Iterating and merging intervals by 3' end", file=sys.stderr)
-    # for line in input:
     for index, line in df.iterrows():
-        if re.match(r'^(?!chr)', line[0]):
-            continue
-
-        my_interval = Interval(line)
+        my_interval = Interval(line, args.species)
 
         if prev_interval is None:
             prev_interval = my_interval
diff --git a/qapa/extract.py b/qapa/extract.py
index dad6665..a9c01d1 100644
--- a/qapa/extract.py
+++ b/qapa/extract.py
@@ -8,8 +8,13 @@
 
 class Row:
 
-    def __init__(self, row):
+    def __init__(self, row, no_header=False):
         l = row.rstrip().split("\t")
+        if no_header:
+            # add a dummy column in front of list to represent bin column in
+            # UCSC genePred tables
+            l.insert(0, "dummy")
+
         self.name = l[1]
         self.chrom = l[2]
         self.strand = l[3]
@@ -75,7 +80,7 @@ def get_3utr_length(self):
         return self.utr3[1] - self.utr3[0]
 
     def is_on_random_chromosome(self):
-        return not re.match(r'chr[0-9XY]+$', self.chrom)
+        return not re.match(r'^(chr)*[0-9XY]+$', self.chrom)
 
     def get_block_sizes(self, n):
         sizes = [0] * n
@@ -120,18 +125,19 @@ def main(args, fout=sys.stdout):
     n = 0
     for row in fileinput.input(args.annotation_file[0],
                                openhook=fileinput.hook_compressed):
-        n = n + 1
-
-        if fileinput.isfirstline():
+
+        if fileinput.isfirstline() and not args.no_header:
             continue
-
+        n = n + 1
+
         if re.match(r"^#", row):
             c = c + 1
             continue
 
-        rowobj = Row(row)
+        rowobj = Row(row, args.no_header)
 
-        if rowobj.is_on_random_chromosome():
+        if not args.no_skip_random_chromosomes and \
+                rowobj.is_on_random_chromosome():
             c = c + 1
             continue
 
@@ -167,8 +173,8 @@ def main(args, fout=sys.stdout):
     fileinput.close()
     # conn.close()
     if float(c) / float(n) > 0.75:
-        print("Warning: More than 75% of entries skipped. Are you using the "
-              "correct database?", file=sys.stderr)
+        print("Warning: %d/%d entries (%0.2f%%) were skipped. Are you using the "
+              "correct database?" % (c, n, 100 * float(c) / float(n)), file=sys.stderr)
 
 
 if __name__ == '__main__':
diff --git a/qapa/fasta.py b/qapa/fasta.py
index 615ac2a..d900a64 100644
--- a/qapa/fasta.py
+++ b/qapa/fasta.py
@@ -7,7 +7,8 @@
 
 def get_sequences(bed_file, genome):
     bed = pybedtools.BedTool(bed_file)
-    return bed.sequence(genome, s=True, name=True, fullHeader=True, split=True)
+    return bed.sequence(genome, s=True, name=True, fullHeader=False,
+                        split=True)
 
 
 def filter_sequences(fasta_file, min_length=100, fout=sys.stdout):
diff --git a/qapa/qapa.py b/qapa/qapa.py
index 2416578..22779d6 100644
--- a/qapa/qapa.py
+++ b/qapa/qapa.py
@@ -2,6 +2,7 @@
 import sys
 import os
 import os.path
+import re
 import argparse
 import tempfile
 
@@ -69,7 +70,7 @@ def getoptions(args=None):
                           help="Ensembl gene identifier table")
     required.add_argument('-g', '--gencode_polya', dest="gencode_polya",
                           help="GENCODE poly(A) site track")
-    required.add_argument('-p', '--polyasite', dest="polyasite",
+    required.add_argument('-p', '--polyasite', dest="polyasite",
                           help="PolyAsite database")
     optional.add_argument('-m', '--min_polyasite', dest="min_polyasite",
                           type=int, default=3,
@@ -94,8 +95,24 @@ def getoptions(args=None):
                           help="Use this option to annotate 3' UTRs with a "
                           "custom BED file of poly(A) sites. This option "
                           "cannot be used in conjunction with -g and -p.")
+    optional.add_argument("-C", "--no_skip_random_chromosomes", default=False,
+                          action="store_true",
+                          help="Disable skipping of chromosomes that don't "
+                          "match the regular expression pattern "
+                          "'^(chr)*[0-9XY]+$'")
     optional.add_argument("-s", "--save", action='store_true',
-                          help="don't automatically delete intermediate files")
+                          help="Don't automatically delete intermediate files")
+    optional.add_argument("-H", "--no_header", action='store_true',
+                          help="Annotation table (genePred) has no header. "
+                          "Use this option if your input table was created "
+                          "using gtfToGenePred -genePredExt.")
+    optional.add_argument("-N", "--no_annotation", action='store_true',
+                          help="Skip annotation step. Use this option if you "
+                          "only have a gene model annotation file to build "
+                          "a 3' UTR library.")
+    optional.add_argument("--species", type=str,
+                          help="Set species. Useful if not using 'hg19' or "
+                          "'mm10', otherwise 'unk' is used.")
 
     build_parser.set_defaults(func=build)
     build_parser._action_groups.append(optional)
@@ -163,12 +180,21 @@ def getoptions(args=None):
                             args.other, args.annotation_file[0]],
                            build_parser)
         if args.other is None and \
-                (args.gencode_polya is None or args.polyasite is None):
+                (args.gencode_polya is None or args.polyasite is None) and \
+                not args.no_annotation:
             parser.error("Missing arguments: -g and/or -p")
         if args.other and (args.gencode_polya or args.polyasite):
             eprint("Option -o (custom BED) will be used for build phase and "
                    "-g and -p will be ignored")
+
+        if args.no_annotation:
+            eprint("Annotation step will be skipped")
+
+        if not (args.species is None or \
+                re.match(r'^[a-zA-Z0-9]+$', args.species)):
+            parser.error("Species must be alphanumeric.")
+
     elif args.subcommand == 'fasta':
         _check_input_files([args.bed_file[0], args.genome], fasta_parser)
     elif args.subcommand == 'quant':
@@ -238,7 +264,7 @@ def quant(args):
         intermediate_name = args.save
 
     try:
-        cmd = "create_merged_data.R --ensembl {} -f {} -F {} {} > {}".format(
+        cmd = "create_merged_data.R --db {} -f {} -F {} {} > {}".format(
             args.db, args.field, args.format, " ".join(args.quant_files),
             intermediate_name)
         # eprint(cmd)
diff --git a/qapa/version.py b/qapa/version.py
index b3ddbc4..58d478a 100644
--- a/qapa/version.py
+++ b/qapa/version.py
@@ -1 +1 @@
-__version__ = '1.1.1'
+__version__ = '1.2.0'
diff --git a/scripts/compute_pau.R b/scripts/compute_pau.R
index b8846c1..59e28b3 100755
--- a/scripts/compute_pau.R
+++ b/scripts/compute_pau.R
@@ -31,10 +31,14 @@ suppressPackageStartupMessages(library(data.table))
 calculate <- function(dt, qscore) {
   # Calculate UTR Index given a data.table
   #
-  # Return as data frame
+  # Return as data table or NULL if dt is empty
   write("Calculating Poly(A) Usage", stderr())
   write(paste("\t", nrow(dt), "rows,", length(unique(dt$Gene)), "genes"),
         stderr())
+  if (nrow(dt) == 0) {
+    return(NULL)
+  }
+
   dt[, c("APA_ID", "pau") := list(update_apa_id(APA_ID, UTR3.Start, UTR3.End),
                                   calc_pau(value)
@@ -102,8 +106,6 @@ create_apa_id <- function(d) {
 
 update_apa_id <- function(x, start, end) {
   # Update APA_ID with proximal, distal or single status - added as another suffix
-  # x <- apa_id(rep(x, length(start)))
-
   if (length(x) == 1) {
     x <- paste(x, "S", sep = "_")
   } else {
@@ -136,7 +138,6 @@ if (file == "-") {
 m <- data.table(read.csv(file, sep="\t", header=T, check.names=FALSE, stringsAsFactors=FALSE))
 setkey(m, Gene)
 m[, APA_ID := apa_id(Gene)]
-# m <- as.data.frame(m)
 
 # Calculate PAU values ####
 write("Melting data frame", stderr())
@@ -174,6 +175,12 @@ pau <- n_sites[pau]
 if (opt$options$expr) {
   # Append a suffix (e.g. .TPM) to the end to distinguish from the pau columns
   write("\nAdding input expression values", stderr())
+  if (nrow(dt.plus) == 0) {
+    dt.plus <- NULL
+  }
+  if (nrow(dt.minus) == 0) {
+    dt.minus <- NULL
+  }
   expr <- rbind(dt.plus, dt.minus)
   expr[,sample := paste0(sample, ".TPM")]
   expr <- dcast(expr, Transcript + Gene + Gene_Name + Chr + LastExon.Start +
diff --git a/scripts/create_merged_data.R b/scripts/create_merged_data.R
index 61167e4..7fc1cba 100755
--- a/scripts/create_merged_data.R
+++ b/scripts/create_merged_data.R
@@ -5,7 +5,7 @@ suppressPackageStartupMessages(library(optparse))
 
 args <- commandArgs(TRUE)
 option.list <- list(
-  make_option(c("-e", "--ensembl"), type="character", default=NULL,
+  make_option(c("--db"), type="character", default=NULL,
              help="Ensembl identifiers database file.
              Required unless -m is specified. [%default]"),
   make_option(c("-f", "--field"), type="character",
@@ -39,7 +39,7 @@ if (length(opt$args) < 1) {
 
 if (opt$options$merge_only) {
   write("Merge-only mode enabled.", stderr())
-} else if (is.null(opt$options$ensembl)) {
+} else if (is.null(opt$options$db)) {
   stop("Ensembl identifiers database is required.")
 }
 
@@ -119,7 +125,13 @@ format_multi_ensembl_ids <- function(ids) {
   # ENSMUST00000111043_ENSMUSG00000048482,ENSMUST00000111044_ENSMUSG00000048482_mm9_chr1
   # becomes
   # ENSMUST00000111043,ENSMUST00000111044_ENSMUSG00000048482_mm9_chr1
-  split_ids <- str_match(ids, "^(ENS.+)_((hg19|mm10).+)")
+  # Test regex: https://regex101.com/r/zuDsy1/1
+  split_ids <- str_match(ids, "^(([^_]+_[^_,]+)(,[^_]+_[^_,]+)*)_(([^_]+).+)")
+
+  if (is.na(split_ids[1])) {
+    stop("Unable to format Ensembl ID by regex")
+  }
+
   # Separate multiple Transcript_Gene name
   ens <- strsplit(split_ids[,2], ",")
   # Split transcript and gene names, then re-arrange to combine transcripts and genes
@@ -129,24 +135,24 @@ format_multi_ensembl_ids <- function(ids) {
     paste(., collapse="_")
   })
   stopifnot(length(ens) == length(ids))
-  apply(cbind(ens, split_ids[,3]), 1, paste, collapse="_")
+  apply(cbind(ens, split_ids[,5]), 1, paste, collapse="_")
 }
 
 separate_ensembl_field <- function(df) {
-  tx_pattern <- "^ENS(MUS)*T.*_(hg\\d+|mm\\d+)_chr[0-9XY]+_\\d+_\\d+_[-+]_utr_\\d+_\\d+"
+  tx_pattern <- "^([^_]+_[^_,]+)(,[^_]+_[^_,]+)*_[^_]+_(chr)*\\w+_\\d+_\\d+_[-+]_utr_\\d+_\\d+"
   if (grepl(tx_pattern, df$Transcript[1], perl = TRUE)) {
     # Format Ensembl Transcript and Ensembl Gene IDs if there are multiple
-    # Remove hg19 info
     # Remove utr tag
     df[, Transcript := str_extract(Transcript, tx_pattern) %>%
          format_multi_ensembl_ids() %>%
-         str_replace("_(hg\\d+|mm\\d+)", "") %>%
+         #str_replace("_(hg\\d+|mm\\d+|unk)", "") %>%
          str_replace("_utr", "")]
     # Split by underscore
-    df[, c("Transcript", "Gene", "Chr", "LastExon.Start", "LastExon.End",
-           "Strand", "UTR3.Start", "UTR3.End") :=
+    df[, c("Transcript", "Gene", "Species", "Chr", "LastExon.Start",
+           "LastExon.End", "Strand", "UTR3.Start", "UTR3.End") :=
          tstrsplit(Transcript, "_", fixed=TRUE)]
+    df[, Species := NULL]
 
     df[, ':=' (
       LastExon.Start = as.numeric(as.character(LastExon.Start)),
@@ -155,16 +161,15 @@ separate_ensembl_field <- function(df) {
       UTR3.End = as.numeric(as.character(UTR3.End))
     )]
     df[, Length := abs(UTR3.End - UTR3.Start)]
-  } else if (grepl("ENS(MUS)*T\\d+", df$Transcript[1])) {
-    df
   } else {
-    stop("Unable to find Ensembl IDs by regex")
+    warning("Unable to find Ensembl IDs by regex")
+    df
   }
 }
 
 #### Add Ensembl Gene ID column ####
 extract_one_transcript <- function(ids) {
-  str_extract(ids, "ENS(MUS)*T\\d+")
+  str_extract(ids, "^[^,]+")
 }
 
 add_ensembl_metadata <- function(df, dbfile, all_genes = FALSE,
@@ -176,27 +181,22 @@ add_ensembl_metadata <- function(df, dbfile, all_genes = FALSE,
   db <- read.table(dbfile, header = TRUE, sep = "\t",
                    stringsAsFactors=FALSE) %>%
     data.table()
 
-  if (all(grepl("ENS(MUS)*T\\d+.*", df$Transcript[1:min(nrow(df), 1000)], perl=TRUE))) {
-    df[, tid := extract_one_transcript(Transcript)]
-
-    if (!all_genes) {
+  df[, tid := extract_one_transcript(Transcript)]
+
+  if (!all_genes) {
       db <- db[Gene.type == "protein_coding"]
-    }
-    gid <- db[, .(tid=Transcript.stable.ID,
-                  Gene=Gene.stable.ID,
-                  Gene_Name=Gene.name)] %>%
+  }
+  gid <- db[, .(tid=Transcript.stable.ID,
+                Gene=Gene.stable.ID,
+                Gene_Name=Gene.name)] %>%
       unique()
-    if ("Gene" %in% colnames(df)) {
+  if ("Gene" %in% colnames(df)) {
      df[, Gene := NULL] # Remove Gene column so it doesn't conflict with gid results
-    }
-    setkey(df, tid)
-    setkey(gid, tid)
-    df <- gid[df]
-    df[, tid := NULL]
-  } else {
-    # should never enter this condition if input is properly prepared
-    stop("Can't identify gene ID column type")
   }
+  setkey(df, tid)
+  setkey(gid, tid)
+  df <- gid[df]
+  df[, tid := NULL]
 
   c <- length(which(!is.na(df$Gene_Name)))
   pc <- paste0("(", round(c / nrow(df)*100, digits=7), "%)")
@@ -222,10 +222,10 @@ merged_data <- join_iterative(opt$args, by = c("Transcript", "Length"),
                               format = opt$options$format)
 
 # Remove random chromosomes
-if (!opt$options$merge_only && all(grepl("chr[0-9XY]+", merged_data$Transcript))) {
-  write("Removing random chromosomes", stderr())
-  merged_data <- merged_data[grep("chr[0-9XY]+_\\d+_\\d+", Transcript),]
-}
+# if (!opt$options$merge_only && all(grepl("chr[0-9XY]+", merged_data$Transcript))) {
+#   write("Removing random chromosomes", stderr())
+#   merged_data <- merged_data[grep("chr[0-9XY]+_\\d+_\\d+", Transcript),]
+# }
 
 if (nrow(merged_data) < 100) {
   warning("Less than 100 rows in final table.")
@@ -235,7 +235,7 @@ if (!opt$options$merge_only) {
   write("Separating Ensembl IDs", stderr())
   separate_ensembl_field(merged_data)
   write("Adding Ensembl metadata", stderr())
-  merged_data <- add_ensembl_metadata(merged_data, opt$options$ensembl,
+  merged_data <- add_ensembl_metadata(merged_data, opt$options$db,
                                       opt$options$all_genes,
                                       opt$options$non_standard)
 }
diff --git a/scripts/install_packages.R b/scripts/install_packages.R
new file mode 100755
index 0000000..92fa8c9
--- /dev/null
+++ b/scripts/install_packages.R
@@ -0,0 +1,25 @@
+#!/usr/bin/env Rscript
+
+writeLines(paste("Using", R.Version()$version.string))
+
+local.lib <- .libPaths()[1]
+pkgs <- c("stringr", "dplyr", "data.table", "optparse")
+
+for (p in pkgs) {
+  if (!p %in% installed.packages(lib.loc = local.lib)) {
+    install.packages(p, dependencies=TRUE,
+                     repos='http://cran.us.r-project.org', lib=local.lib)
+  }
+}
+
+writeLines("\n-------------------------------------")
+writeLines("INSTALL SUMMARY")
+writeLines("-------------------------------------")
+check <- setNames(logical(length(pkgs)), pkgs)
+for (p in pkgs) {
+  check[p] <- p %in% installed.packages(lib.loc = local.lib)
+  writeLines(sprintf("%s: %s", p, ifelse(check[p] == TRUE, "OK", "FAIL")))
+}
+writeLines("-------------------------------------\n")
+status <- as.integer(!all(check))
+q(status=status)
diff --git a/setup.py b/setup.py
index 3e8d6e1..21e1342 100644
--- a/setup.py
+++ b/setup.py
@@ -4,8 +4,7 @@
 from distutils.core import setup
 
 if sys.version_info[:2] < (2, 7):
-    sys.stderr.write("At least Python 2.7, or Python 3.5 or later "
-                     "is required\n")
+    sys.stderr.write("Python 2.7 or later is required\n")
     sys.exit(1)
 elif sys.version_info[0] == 3 and sys.version_info[:2] < (3, 5):
     sys.stderr.write("At least Python 3.5 or later is required.\n")
@@ -44,7 +43,6 @@
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.5',
-        'Topic :: Scientific/Engineering :: Bio-Informatics',
     ],
     zip_safe=False
 )
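Reviewer note (not part of the patch): a quick usage sketch combining the new `build` options introduced above (`-H`, `-N`, `--species`). The file names are placeholders and this exact invocation does not appear in the README; the three options are declared independently in `getoptions()`, so combining them should be valid, but treat this as an illustration rather than documented usage.

    # Convert a custom GTF to a headerless genePred table with the UCSC
    # gtfToGenePred tool, as referenced in the README
    gtfToGenePred -genePredExt custom_genes.gtf custom_genes.genePred

    # Build a 3' UTR library from it: skip poly(A) site annotation (-N),
    # read the headerless table (-H), and tag UTR names with a non-default
    # species label (placeholder "rn6") instead of the fallback 'unk'
    qapa build -N -H --species rn6 --db ensembl_identifiers.txt \
        custom_genes.genePred > output_utrs.bed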