morrislab · kcha · Jun 18, 2018 · May 18, 2018 · Jun 14, 2018 · Jun 14, 2018
diff --git a/README.md b/README.md
@@ -15,29 +15,36 @@ other tools such as [Sailfish](https://github.com/kingsfordgroup/sailfish) and
 
 QAPA consists of both Python (2.7+ or 3.5+) and R scripts.
 
-1. Install [R](https://www.r-project.org/) and the R packages optparse, dplyr,
-   data.table, and stringr. In the R console, the packages can be installed in
-   one line:
-
-        install.packages(c("optparse", "dplyr", "data.table", "stringr"))
-
-2. Install the latest development version of
+1. Install the latest development version of
    [bedtools](https://github.com/arq5x/bedtools2). *Note: do not use 2.26.0
    stable release as there is a
    [bug](https://github.com/arq5x/bedtools2/issues/435) with the groupBy tool*.
 
-3. Install the latest QAPA Python/R source code from GitHub. QAPA requires the
-   Python packages pandas, numpy, pybedtools, and biopython. These will be
-   automatically installed, if necessary.
+2. Install the latest development version (or the latest
+   [release](https://github.com/morrislab/qapa/releases/latest)) of QAPA from
+   GitHub . QAPA requires the Python packages pandas, numpy, pybedtools, and
+   biopython. These will be automatically installed, if necessary.
 
         git clone https://github.com/morrislab/qapa.git
         cd qapa
         python setup.py install
 
-        # To test if installation is working:
-        cd
-        which qapa
-        qapa -h
+3. Install [R](https://www.r-project.org/) and the R packages optparse, dplyr,
+   data.table, and stringr. In the R console, the packages can be installed in
+   one line:
+
+        install.packages(c("optparse", "dplyr", "data.table", "stringr"))
+
+   Alternatively, run the provided `install_packages.R` helper script from
+   command line:
+
+        Rscript scripts/install_packages.R
+
+4. To test if installation is working:
+
+        cd          # change to root directory
+        which qapa  # should return path of qapa executable
+        qapa -h     # should display help message
 
 # Usage
 
@@ -86,11 +93,15 @@ The following data sources are required:
         mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select * from
             wgEncodeGencodeBasicVM9" mm10 > gencode.basic.txt
 
-   Note that the `-N` option (suppress column headings) is not used here.
+   Alternatively, if you are starting from a GTF/GFF file, you can convert
+   it to genePred format using the UCSC tool
+   [`gtfToGenePred`](http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred):
+
+        gtfToGenePred -genePredExt custom_genes.gtf custom_genes.genedPred
 
 **B. Poly(A) site annotation**
 
-Two options are available.
+As of v1.2.0, this step is optional. Otherwise, two options are available:
 
 Option 1: standard approach (as described in the [paper](#citation))
 
@@ -109,11 +120,11 @@ Option 1: standard approach (as described in the [paper](#citation))
         txEnd, name2, score, strand from wgEncodeGencodePolyaVM9 where name2 =
         'polyA_site'" -N mm10 > gencode.polyA_sites.bed
 
-Option 2: use custom BED track (*new in v1.1.0*)
+Option 2: use custom BED track
 
 1. Custom BED track of poly(A) sites
 
-    Alternatively a custom BED file of poly(A) can be used to annotate 3' UTRs.
+    A custom BED file of poly(A) can be used to annotate 3' UTRs.
     Each entry must contain the start (0-based) and end coordinate of a poly(A)
     site.
 
@@ -127,14 +138,24 @@ A reference genome in FASTA format is required for extracting sequences from
 
 To extract 3' UTRs from annotation, run:
 
-    qapa build --db ensembl_identifiers.txt -g gencode.polyA_sites.bed
-        -p clusters.mm10.bed gencode.basic.txt > output_utrs.bed
+    qapa build --db ensembl_identifiers.txt -g gencode.polyA_sites.bed -p clusters.mm10.bed 
+        gencode.basic.txt > output_utrs.bed
 
-Or when using a custom BED file:
+If using a custom BED file, replace the `-g` and `-p` options with `-o`:
 
     qapa build --db ensembl_identifiers.txt -o custom_sites.bed
         gencode.basic.txt > output_utrs.bed
 
+If using a custom genePred file converted from GTF, include the `-H`
+option:
+
+    qapa build -H --db ensembl_identifiers.txt -o custom_sites.bed
+        custom_genes.genePred > output_utrs.bed
+
+If bypassing the poly(A) annotation step, include the `-N` option:
+
+    qapa build -N --db ensembl_identifiers.txt gencode.basic.txt > output.utrs.bed
+
 Results will be saved in the file `output_utrs.bed` (default is STDOUT).
 
 To extract sequences from the resulting BED file, use the `fasta` sub-command

diff --git a/qapa/annotate.py b/qapa/annotate.py
@@ -9,7 +9,6 @@
 from pybedtools import featurefuncs
 import re
 
-
 def extend_feature(feature, length=24):
     """Extend the 3' end by length
     """
@@ -117,7 +116,7 @@ def main(args, input_filename, fout=sys.stdout):
         custom_mode = True
         custom = pybedtools.BedTool(args.other)
         sites = sort_bed(custom)
-    else:
+    elif not args.no_annotation:
         pas_filter = re.compile("(DS|TE)$")
         gencode = pybedtools.BedTool(args.gencode_polya)\
             .filter(lambda x: x.name == 'polyA_site')\
@@ -147,12 +146,16 @@ def main(args, input_filename, fout=sys.stdout):
     #   - Restore BED format with cut()
     #   - Use custom function to resolve overlapping features from groupby
 
-    overlap_utrs = utrs.intersect(sites, s=True, wa=True, wb=True)\
-                       .each(restore_feature)\
-                       .each(update_3prime,
-                             min_intermediate_pas=args.intermediate_polyasite,
-                             custom=custom_mode)\
-                       .saveas()
+    if args.no_annotation:
+        overlap_utrs = utrs.each(restore_feature)\
+                           .saveas()
+    else: 
+        overlap_utrs = utrs.intersect(sites, s=True, wa=True, wb=True)\
+                           .each(restore_feature)\
+                           .each(update_3prime,
+                                 min_intermediate_pas=args.intermediate_polyasite,
+                                 custom=custom_mode)\
+                           .saveas()
     overlap_utrs = sort_bed(overlap_utrs)\
         .groupby(g=[1, 2, 3, 6, 7, 8, 9], c=[4, 5, 10, 11],
                  o=['collapse'] * 4, delim="|")\

diff --git a/qapa/collapse.py b/qapa/collapse.py
@@ -11,7 +11,7 @@
 
 
 class Interval:
-    def __init__(self, l):
+    def __init__(self, l, sp=None):
         #l = line.rstrip().split("\t")
         self.chrom = l[0]
         self.start = int(l[1])
@@ -24,6 +24,7 @@ def __init__(self, l):
         self.end2 = int(l[7])
         self.gene_id = l[8]
         #self.utr_id = l[9]
+        self.species = self._guess_species(sp)
 
     def is_forward(self):
         return self.strand == "+"
@@ -45,24 +46,36 @@ def set_score(self):
         else:
             self.score = self.start2 - self.start
 
-    def finalize(self):
+    def finalize(self, species=None):
         '''Print the interval using the highest peak'''
         if self.is_forward():
             utr_co = [self.end2, self.end]
         else:
             utr_co = [self.start, self.start2]
-        new_name = [self.name, self._guess_species(), self.chrom, self.start,
-                    self.end, self.strand, 'utr'] + utr_co
+        new_name = [self.name, self.species, self.chrom, 
+                    self.start, self.end, self.strand, 'utr'] + utr_co
         new_name = "_".join([str(x) for x in new_name])
         self.set_score()
         return [self.chrom, self.start, self.end, new_name, self.score,
                 self.strand, self.gene_id]
 
-    def _guess_species(self):
+    def set_gene(self, conn):
+        '''Query for Ensembl Gene ID'''
+        query = 'select gene_id from ensembl_id where transcript_id = ?'
+        cur = conn.execute(query, (self.name,))
+        r = cur.fetchone()
+        if r:
+            self.gene_id = r[0]
+        else:
+            self.gene_id = None
+
+    def _guess_species(self, species=None):
         if re.match(r'ENST\d+', self.name):
             return 'hg19'
         elif re.match(r'ENSMUST\d+', self.name):
             return 'mm10'
+        elif species is not None:
+            return species
         warnings.warn('Could not guess species from gene name!', Warning)
         return 'unk'
 
@@ -121,12 +134,8 @@ def merge_bed(args, inputfile):
     overlap_diff_genes = set()
 
     print("Iterating and merging intervals by 3' end", file=sys.stderr)
-    # for line in input:
     for index, line in df.iterrows():
-        if re.match(r'^(?!chr)', line[0]):
-            continue
-
-        my_interval = Interval(line)
+        my_interval = Interval(line, args.species)
 
         if prev_interval is None:
             prev_interval = my_interval

diff --git a/qapa/extract.py b/qapa/extract.py
@@ -8,8 +8,13 @@
 
 
 class Row:
-    def __init__(self, row):
+    def __init__(self, row, no_header=False):
         l = row.rstrip().split("\t")
+        if no_header:
+            # add a dummy column in front of list to represent bin column in
+            # UCSC genePred tables
+            l.insert(0, "dummy")
+
         self.name = l[1]
         self.chrom = l[2]
         self.strand = l[3]
@@ -75,7 +80,7 @@ def get_3utr_length(self):
         return self.utr3[1] - self.utr3[0]
 
     def is_on_random_chromosome(self):
-        return not re.match(r'chr[0-9XY]+$', self.chrom)
+        return not re.match(r'^(chr)*[0-9XY]+$', self.chrom)
 
     def get_block_sizes(self, n):
         sizes = [0] * n
@@ -120,18 +125,19 @@ def main(args, fout=sys.stdout):
     n = 0
     for row in fileinput.input(args.annotation_file[0],
                                openhook=fileinput.hook_compressed):
-        n = n + 1
-
-        if fileinput.isfirstline():
+
+        if fileinput.isfirstline() and not args.no_header:
             continue
-
+        n = n + 1
+
         if re.match(r"^#", row):
             c = c + 1
             continue
 
-        rowobj = Row(row)
+        rowobj = Row(row, args.no_header)
 
-        if rowobj.is_on_random_chromosome():
+        if not args.no_skip_random_chromosomes and \
+            rowobj.is_on_random_chromosome():
             c = c + 1
             continue
 
@@ -167,8 +173,8 @@ def main(args, fout=sys.stdout):
     fileinput.close()
     # conn.close()
     if float(c) / float(n) > 0.75:
-        print("Warning: More than 75% of entries skipped. Are you using the "
-              "correct database?", file=sys.stderr)
+        print("Warning: %d/%d (%0.2f%%) were skipped. Are you using the "
+              "correct database?" % (c, n, float(c)/float(n)), file=sys.stderr)
 
 
 if __name__ == '__main__':

diff --git a/qapa/fasta.py b/qapa/fasta.py
@@ -7,7 +7,8 @@
 
 def get_sequences(bed_file, genome):
     bed = pybedtools.BedTool(bed_file)
-    return bed.sequence(genome, s=True, name=True, fullHeader=True, split=True)
+    return bed.sequence(genome, s=True, name=True, fullHeader=False,
+                        split=True)
 
 
 def filter_sequences(fasta_file, min_length=100, fout=sys.stdout):

diff --git a/qapa/qapa.py b/qapa/qapa.py
@@ -2,6 +2,7 @@
 import sys
 import os
 import os.path
+import re
 import argparse
 import tempfile
 
@@ -69,7 +70,7 @@ def getoptions(args=None):
                           help="Ensembl gene identifier table")
     required.add_argument('-g', '--gencode_polya', dest="gencode_polya",
                           help="GENCODE poly(A) site track")
-    required.add_argument('-p', '--polyasite', dest="polyasite", 
+    required.add_argument('-p', '--polyasite', dest="polyasite",
                           help="PolyAsite database")
     optional.add_argument('-m', '--min_polyasite', dest="min_polyasite",
                           type=int, default=3,
@@ -94,8 +95,24 @@ def getoptions(args=None):
                           help="Use this option to annotate 3' UTRs with a "
                           "custom BED file of poly(A) sites. This option "
                           "cannot be used in conjunction with -g and -p.")
+    optional.add_argument("-C", "--no_skip_random_chromosomes", default=True,
+                          action="store_true",
+                          help="Disable skiping of chromosomes that don't "
+                          "match the regular expression pattern "
+                          " '^(chr)*[0-9XY]+$'")
     optional.add_argument("-s", "--save", action='store_true',
-                          help="don't automatically delete intermediate files")
+                          help="Don't automatically delete intermediate files")
+    optional.add_argument("-H", "--no_header", action='store_true',
+                          help="Annotation table (genePred) has no header. "
+                          "Use this option if your input table was created "
+                          "using gtfToGenePred -genePredExt.")
+    optional.add_argument("-N", "--no_annotation", action='store_true',
+                          help="Skip annotation step. Use this option if you "
+                          "only have a gene model annotation file to build "
+                          "a 3' UTR library.")
+    optional.add_argument("--species", type=str,
+                          help="Set species. Useful if not using 'hg19' or "
+                          "'mm10', otherwise 'unk' is used.")
     build_parser.set_defaults(func=build)
     build_parser._action_groups.append(optional)
 
@@ -163,12 +180,21 @@ def getoptions(args=None):
                             args.other, args.annotation_file[0]], build_parser)
 
         if args.other is None and \
-            (args.gencode_polya is None or args.polyasite is None):
+            (args.gencode_polya is None or args.polyasite is None) and \
+            not args.no_annotation:
                 parser.error("Missing arguments: -g and/or -p")
 
         if args.other and (args.gencode_polya or args.polyasite):
             eprint("Option -o (custom BED) will be used for build phase and "
                    "-g and -p will be ignored")
+
+        if args.no_annotation:
+            eprint("Annotation step will be skipped")
+
+        if not (args.species is None or \
+                re.match(r'^[a-zA-Z0-9]+$', args.species)):
+            parser.error("Species must be alphanumeric.")
+
     elif args.subcommand == 'fasta':
         _check_input_files([args.bed_file[0], args.genome], fasta_parser)
     elif args.subcommand == 'quant':
@@ -238,7 +264,7 @@ def quant(args):
         intermediate_name = args.save
 
     try:
-        cmd = "create_merged_data.R --ensembl {} -f {} -F {} {} > {}".format(
+        cmd = "create_merged_data.R --db {} -f {} -F {} {} > {}".format(
             args.db, args.field, args.format, " ".join(args.quant_files),
             intermediate_name)
         # eprint(cmd)

diff --git a/qapa/version.py b/qapa/version.py
@@ -1 +1 @@
-__version__ = '1.1.1'
+__version__ = '1.2.0'