diff --git a/src/scripts/borzoi_test_apa_folds_polaydb.py b/src/scripts/borzoi_test_apa_folds_polaydb.py
index 0ff4c15..423bb41 100755
--- a/src/scripts/borzoi_test_apa_folds_polaydb.py
+++ b/src/scripts/borzoi_test_apa_folds_polaydb.py
@@ -21,6 +21,8 @@
 """
 borzoi_test_apa_folds_polaydb.py
+
+Measure accuracy at polyadenylation-level for multiple model replicates.
 """
 
 ################################################################################
@@ -29,13 +31,6 @@
 def main():
     usage = "usage: %prog [options] ..."
     parser = OptionParser(usage)
-    parser.add_option(
-        "-a",
-        "--alt",
-        dest="alternative",
-        default="two-sided",
-        help="Statistical test alternative [Default: %default]",
-    )
     parser.add_option(
         "-c",
         dest="crosses",
@@ -50,13 +45,6 @@ def main():
         type="int",
         help="Dataset index [Default:%default]",
     )
-    parser.add_option(
-        "--d_ref",
-        dest="dataset_ref_i",
-        default=None,
-        type="int",
-        help="Reference Dataset index [Default:%default]",
-    )
     parser.add_option(
         "-e",
         dest="conda_env",
@@ -67,26 +55,19 @@ def main():
         "-f",
         dest="fold_subset",
         default=None,
-        help="Run a subset of folds (encoded as comma-separated string) [Default:%default]",
-    )
-    parser.add_option("-g", dest="apa_file", default="polyadb_human_v3.csv.gz")
-    parser.add_option(
-        "--label_exp",
-        dest="label_exp",
-        default="Experiment",
-        help="Experiment label [Default: %default]",
+        type="int",
+        help="Run a subset of folds [Default:%default]",
     )
     parser.add_option(
-        "--label_ref",
-        dest="label_ref",
-        default="Reference",
-        help="Reference label [Default: %default]",
+        "--f_list",
+        dest="fold_subset_list",
+        default=None,
+        help="Run a subset of folds (encoded as comma-separated string) [Default:%default]",
     )
     parser.add_option(
-        "-m",
-        dest="metric",
-        default="pearsonr",
-        help="Train/test metric [Default: Pearsonr or AUPRC]",
+        "-g",
+        dest="apa_file",
+        default="polyadb_human_v3.csv.gz"
     )
     parser.add_option(
         "--name",
@@ -101,14 +82,9 @@ def main():
         help="Output experiment directory [Default: %default]",
     )
     parser.add_option(
-        "-p", dest="out_stem", default=None, help="Output plot stem [Default: %default]"
-    )
-    parser.add_option("-q", dest="queue", default="geforce")
-    parser.add_option(
-        "-r",
-        dest="ref_dir",
-        default=None,
-        help="Reference directory for statistical tests",
+        "-q",
+        dest="queue",
+        default="geforce"
     )
     parser.add_option(
         "--rc",
@@ -124,13 +100,6 @@ def main():
         type="str",
         help="Ensemble prediction shifts [Default: %default]",
     )
-    parser.add_option(
-        "--status",
-        dest="status",
-        default=False,
-        action="store_true",
-        help="Update metric status; do not run jobs [Default: %default]",
-    )
     parser.add_option(
         "-t",
         dest="targets_file",
@@ -138,6 +107,13 @@ def main():
         type="str",
         help="File specifying target indexes and labels in table format",
     )
+    parser.add_option(
+        "-u",
+        dest="untransform_old",
+        default=False,
+        action="store_true",
+        help="Untransform old models [Default: %default]",
+    )
     (options, args) = parser.parse_args()
 
     if len(args) < 2:
@@ -161,12 +137,16 @@ def main():
 
     # count folds
     num_folds = len([dkey for dkey in data_stats if dkey.startswith("fold")])
-
-    fold_index = [fold_i for fold_i in range(num_folds)]
 
     # subset folds
     if options.fold_subset is not None:
-        fold_index = [int(fold_str) for fold_str in options.fold_subset.split(",")]
+        num_folds = min(options.fold_subset, num_folds)
+
+    fold_index = [fold_i for fold_i in range(num_folds)]
+
+    # subset folds (list)
+    if options.fold_subset_list is not None:
+        fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")]
 
     if options.queue == "standard":
         num_cpu = 4
@@ -192,7 +172,7 @@ def main():
             model_file = "%s/train/model%d_best.h5" % (it_dir, options.dataset_i)
 
             # check if done
-            acc_file = "%s/acc.txt" % out_dir
+            acc_file = "%s/apa_preds_polyadb.tsv.gz" % out_dir
             if os.path.isfile(acc_file):
                 # print('%s already generated.' % acc_file)
                 pass
@@ -209,6 +189,8 @@ def main():
                     cmd += " --shifts %s" % options.shifts
                 if options.targets_file is not None:
                     cmd += " -t %s" % options.targets_file
+                if options.untransform_old:
+                    cmd += " -u"
                 cmd += " %s" % params_file
                 cmd += " %s" % model_file
                 cmd += " %s/data%d" % (it_dir, head_i)
diff --git a/src/scripts/borzoi_test_apa_polaydb.py b/src/scripts/borzoi_test_apa_polaydb.py
index 2f26efa..eecf01c 100755
--- a/src/scripts/borzoi_test_apa_polaydb.py
+++ b/src/scripts/borzoi_test_apa_polaydb.py
@@ -33,11 +33,6 @@
 Measure accuracy at polyadenylation-level.
 """
 
-
-def eprint(*args, **kwargs):
-    print(*args, file=sys.stderr, **kwargs)
-
-
 ################################################################################
 # main
 ################################################################################
@@ -89,6 +84,13 @@ def main():
         default=None,
         help="TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]",
     )
+    parser.add_option(
+        "-u",
+        dest="untransform_old",
+        default=False,
+        action="store_true",
+        help="Untransform old models [Default: %default]",
+    )
     (options, args) = parser.parse_args()
 
     if len(args) != 4:
@@ -132,9 +134,6 @@ def main():
     num_targets = targets_df.shape[0]
     num_targets_strand = targets_strand_df.shape[0]
 
-    # save sqrt'd tracks
-    sqrt_mask = np.array([ss.find("sqrt") != -1 for ss in targets_strand_df.sum_stat])
-
     # read model parameters
     with open(params_file) as params_open:
         params = json.load(params_open)
@@ -188,9 +187,6 @@ def main():
     # filter for 3' UTR polyA sites only
     apa_df = apa_df.query("site_type == '3\\' most exon'").copy().reset_index(drop=True)
 
-    eprint("len(apa_df) = " + str(len(apa_df)))
-    print("len(apa_df) = " + str(len(apa_df)))
-
     apa_df["start_hg38"] = apa_df["position_hg38"]
     apa_df["end_hg38"] = apa_df["position_hg38"] + 1
 
@@ -207,6 +203,11 @@ def main():
     apa_pr = pr.PyRanges(
         apa_df[["Chromosome", "Start", "End", "pas_id", "cut_mode", "pas_strand"]]
     )
+
+    # get strands
+    pas_strand_dict = {}
+    for _, row in apa_df.iterrows() :
+        pas_strand_dict[row['pas_id']] = row['pas_strand']
 
     #######################################################
     # intersect APA sites w/ preds, targets
@@ -214,9 +215,6 @@ def main():
     # intersect seqs, APA sites
     seqs_apa_pr = seqs_pr.join(apa_pr)
 
-    eprint("len(seqs_apa_pr.df) = " + str(len(seqs_apa_pr.df)))
-    print("len(seqs_apa_pr.df) = " + str(len(seqs_apa_pr.df)))
-
     # hash preds/targets by pas_id
     apa_preds_dict = {}
     apa_targets_dict = {}
@@ -228,8 +226,7 @@ def main():
         y = y.numpy()[..., targets_df.index]
 
         t0 = time.time()
-        eprint("Sequence %d..." % si)
-        print("Sequence %d..." % si, end="")
+        print("Sequence %d..." % si, end="", flush=True)
 
         for bsi in range(x.shape[0]):
             seq = seqs_df.iloc[si + bsi]
@@ -276,14 +273,11 @@ def main():
                     apa_preds_dict.setdefault(pas_id, []).append(yhb)
                     apa_targets_dict.setdefault(pas_id, []).append(yb)
                 else:
-                    eprint("(Warning: len(yb) <= 0)")
+                    print("(Warning: len(yb) <= 0)", flush=True)
 
         # advance sequence table index
         si += x.shape[0]
-        eprint("DONE in %ds." % (time.time() - t0))
-        print("DONE in %ds." % (time.time() - t0))
-
-        eprint("len(apa_preds_dict) = " + str(len(apa_preds_dict)))
+        print("DONE in %ds." % (time.time() - t0), flush=True)
 
         if si % 128 == 0:
             gc.collect()
@@ -300,14 +294,22 @@ def main():
         apa_targets_gi = np.concatenate(apa_targets_dict[pas_id], axis=0).astype(
             "float32"
         )
-
-        # undo scale
-        apa_preds_gi /= np.expand_dims(targets_strand_df.scale, axis=0)
-        apa_targets_gi /= np.expand_dims(targets_strand_df.scale, axis=0)
-
-        # undo sqrt
-        apa_preds_gi[:, sqrt_mask] = apa_preds_gi[:, sqrt_mask] ** (4 / 3)
-        apa_targets_gi[:, sqrt_mask] = apa_targets_gi[:, sqrt_mask] ** (4 / 3)
+
+        # slice strand
+        if pas_strand_dict[pas_id] == "+":
+            pas_strand_mask = (targets_df.strand != "-").to_numpy()
+        else:
+            pas_strand_mask = (targets_df.strand != "+").to_numpy()
+        apa_preds_gi = apa_preds_gi[:, pas_strand_mask]
+        apa_targets_gi = apa_targets_gi[:, pas_strand_mask]
+
+        # untransform
+        if options.untransform_old:
+            apa_preds_gi = dataset.untransform_preds1(apa_preds_gi, targets_strand_df, unscale=True, unclip=False)
+            apa_targets_gi = dataset.untransform_preds1(apa_targets_gi, targets_strand_df, unscale=True, unclip=False)
+        else:
+            apa_preds_gi = dataset.untransform_preds(apa_preds_gi, targets_strand_df, unscale=True, unclip=False)
+            apa_targets_gi = dataset.untransform_preds(apa_targets_gi, targets_strand_df, unscale=True, unclip=False)
 
         # mean coverage
         apa_preds_gi = apa_preds_gi.mean(axis=0)
diff --git a/src/scripts/borzoi_test_exons_folds.py b/src/scripts/borzoi_test_exons_folds.py
index 02c70d3..7091465 100755
--- a/src/scripts/borzoi_test_exons_folds.py
+++ b/src/scripts/borzoi_test_exons_folds.py
@@ -94,6 +94,12 @@ def main():
         type="int",
         help="Run a subset of folds [Default:%default]",
     )
+    parser.add_option(
+        "--f_list",
+        dest="fold_subset_list",
+        default=None,
+        help="Run a subset of folds (encoded as comma-separated string) [Default:%default]",
+    )
     parser.add_option(
         "-g",
         dest="exons_gff",
@@ -195,6 +201,12 @@ def main():
     # subset folds
     if options.fold_subset is not None:
         num_folds = min(options.fold_subset, num_folds)
+
+    fold_index = [fold_i for fold_i in range(num_folds)]
+
+    # subset folds (list)
+    if options.fold_subset_list is not None:
+        fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")]
 
     if options.queue == "standard":
         num_cpu = 4
@@ -209,7 +221,7 @@ def main():
 
     jobs = []
     for ci in range(options.crosses):
-        for fi in range(num_folds):
+        for fi in fold_index:
             it_dir = "%s/f%dc%d" % (options.exp_dir, fi, ci)
 
             if options.dataset_i is None:
diff --git a/src/scripts/borzoi_test_genes.py b/src/scripts/borzoi_test_genes.py
index e36fa54..bfdb2aa 100755
--- a/src/scripts/borzoi_test_genes.py
+++ b/src/scripts/borzoi_test_genes.py
@@ -117,6 +117,13 @@ def main():
         action="store_true",
         help="Untransform old models [Default: %default]",
    )
+    parser.add_option(
+        "--store_span",
+        dest="store_span",
+        default=False,
+        action="store_true",
+        help="Store predicted/measured gene span coverage profiles [Default: %default]",
+    )
     (options, args) = parser.parse_args()
 
     if len(args) != 4:
@@ -323,16 +330,21 @@ def main():
                 preds_log = np.log2(gene_preds_gi[:, ti] + 1)
                 targets_log = np.log2(gene_targets_gi[:, ti] + 1)
                 gene_corr_gi[ti] = pearsonr(preds_log, targets_log)[0]
-                # gene_corr_gi[ti] = pearsonr(gene_preds_gi[:,ti], gene_targets_gi[:,ti])[0]
             else:
                 gene_corr_gi[ti] = np.nan
         gene_within.append(gene_corr_gi)
         gene_wvar.append(gene_targets_gi.var(axis=0))
 
-        # TEMP: save gene preds/targets
-        # os.makedirs('%s/gene_within' % options.out_dir, exist_ok=True)
-        # np.save('%s/gene_within/%s_preds.npy' % (options.out_dir, gene_id), gene_preds_gi.astype('float16'))
-        # np.save('%s/gene_within/%s_targets.npy' % (options.out_dir, gene_id), gene_targets_gi.astype('float16'))
+        # optionally store raw coverage profiles for gene span
+        if options.store_span:
+            hash_code = str(gene_id.split(".")[0][-1]) # last digit of gene id
+
+            os.makedirs('%s/gene_within' % options.out_dir, exist_ok=True)
+            os.makedirs('%s/gene_within/%s' % (options.out_dir, hash_code), exist_ok=True)
+            os.makedirs('%s/gene_within/%s/preds' % (options.out_dir, hash_code), exist_ok=True)
+            os.makedirs('%s/gene_within/%s/targets' % (options.out_dir, hash_code), exist_ok=True)
+            np.save('%s/gene_within/%s/preds/%s_preds.npy' % (options.out_dir, hash_code, gene_id), gene_preds_gi.astype('float16'))
+            np.save('%s/gene_within/%s/targets/%s_targets.npy' % (options.out_dir, hash_code, gene_id), gene_targets_gi.astype('float16'))
 
         # mean coverage
         gene_preds_gi = gene_preds_gi.mean(axis=0) / float(pool_width)
diff --git a/src/scripts/borzoi_test_genes_folds.py b/src/scripts/borzoi_test_genes_folds.py
index 13196ba..b558205 100755
--- a/src/scripts/borzoi_test_genes_folds.py
+++ b/src/scripts/borzoi_test_genes_folds.py
@@ -28,9 +28,9 @@
 import slurm
 
 """
-borzoi_test_folds.py
+borzoi_test_genes_folds.py
 
-Train Borzoi model replicates using given parameters and data.
+Measure accuracy at gene-level for multiple model replicates.
 """
 
 ################################################################################
@@ -89,6 +89,13 @@ def main():
         action="store_true",
         help="Untransform old models [Default: %default]",
     )
+    parser.add_option(
+        "--store_span",
+        dest="store_span",
+        default=False,
+        action="store_true",
+        help="Store predicted/measured gene span coverage profiles [Default: %default]",
+    )
 
     # folds
     parser.add_option(
@@ -129,6 +136,13 @@ def main():
         "-f",
         dest="fold_subset",
         default=None,
+        type="int",
+        help="Run a subset of folds [Default:%default]",
+    )
+    parser.add_option(
+        "--f_list",
+        dest="fold_subset_list",
+        default=None,
         help="Run a subset of folds (encoded as comma-separated string) [Default:%default]",
     )
     parser.add_option(
@@ -167,9 +181,16 @@ def main():
         help="Output experiment directory [Default: %default]",
     )
     parser.add_option(
-        "-p", dest="out_stem", default=None, help="Output plot stem [Default: %default]"
+        "-p",
+        dest="out_stem",
+        default=None,
+        help="Output plot stem [Default: %default]"
+    )
+    parser.add_option(
+        "-q",
+        dest="queue",
+        default="geforce"
     )
-    parser.add_option("-q", dest="queue", default="geforce")
     parser.add_option(
         "-s",
         dest="sub_dir",
@@ -182,13 +203,6 @@ def main():
         default=None,
         help="Reference directory for statistical tests",
     )
-    parser.add_option(
-        "--status",
-        dest="status",
-        default=False,
-        action="store_true",
-        help="Update metric status; do not run jobs [Default: %default]",
-    )
 
     (options, args) = parser.parse_args()
 
@@ -213,12 +227,16 @@ def main():
 
     # count folds
     num_folds = len([dkey for dkey in data_stats if dkey.startswith("fold")])
-
-    fold_index = [fold_i for fold_i in range(num_folds)]
 
     # subset folds
     if options.fold_subset is not None:
-        fold_index = [int(fold_str) for fold_str in options.fold_subset.split(",")]
+        num_folds = min(options.fold_subset, num_folds)
+
+    fold_index = [fold_i for fold_i in range(num_folds)]
+
+    # subset folds (list)
+    if options.fold_subset_list is not None:
+        fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")]
 
     if options.queue == "standard":
         num_cpu = 8
@@ -253,7 +271,7 @@ def main():
                 cmd = ". /home/drk/anaconda3/etc/profile.d/conda.sh;"
                 cmd += " conda activate %s;" % options.conda_env
                 cmd += " time borzoi_test_genes.py"
-                # cmd += ' --head %d' % head_i
+                cmd += ' --head %d' % head_i
                 cmd += " -o %s" % out_dir
                 if options.rc:
                     cmd += " --rc"
@@ -265,6 +283,8 @@ def main():
                    cmd += ' --pseudo_qtl %.2f' % options.pseudo_qtl
                 if options.untransform_old:
                     cmd += ' -u'
+                if options.store_span:
+                    cmd += ' --store_span'
                 if options.span:
                     cmd += " --span"
                 job_mem = 240000
diff --git a/src/scripts/borzoi_test_tss_folds_gencode.py b/src/scripts/borzoi_test_tss_folds_gencode.py
index 0cdc500..1b65130 100644
--- a/src/scripts/borzoi_test_tss_folds_gencode.py
+++ b/src/scripts/borzoi_test_tss_folds_gencode.py
@@ -13,25 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =========================================================================
-from optparse import OptionParser, OptionGroup
-import glob
+from optparse import OptionParser
 import json
 import os
-import pdb
-import sys
-
-from natsort import natsorted
-import numpy as np
-import pandas as pd
-from scipy.stats import wilcoxon, ttest_rel
-import matplotlib.pyplot as plt
-import seaborn as sns
 
 import slurm
 
 """
 borzoi_test_tss_folds_gencode.py
 
+Measure accuracy at TSS-level for multiple model replicates.
 """
 
 ################################################################################
@@ -40,13 +31,6 @@
 def main():
     usage = 'usage: %prog [options] ...'
     parser = OptionParser(usage)
-    parser.add_option(
-        '-a',
-        '--alt',
-        dest='alternative',
-        default='two-sided',
-        help='Statistical test alternative [Default: %default]',
-    )
     parser.add_option(
         '-c',
         dest='crosses',
@@ -61,13 +45,6 @@ def main():
         type='int',
         help='Dataset index [Default:%default]',
    )
-    parser.add_option(
-        '--d_ref',
-        dest='dataset_ref_i',
-        default=None,
-        type='int',
-        help='Reference Dataset index [Default:%default]',
-    )
     parser.add_option(
         '-e',
         dest='conda_env',
@@ -78,6 +55,13 @@ def main():
         '-f',
         dest='fold_subset',
         default=None,
+        type='int',
+        help='Run a subset of folds [Default:%default]',
+    )
+    parser.add_option(
+        '--f_list',
+        dest='fold_subset_list',
+        default=None,
         help='Run a subset of folds (encoded as comma-separated string) [Default:%default]',
     )
     parser.add_option(
         '-g',
         dest='tss_file',
         default='/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_tss2.bed',
     )
-    parser.add_option(
-        '--label_exp',
-        dest='label_exp',
-        default='Experiment',
-        help='Experiment label [Default: %default]',
-    )
-    parser.add_option(
-        '--label_ref',
-        dest='label_ref',
-        default='Reference',
-        help='Reference label [Default: %default]',
-    )
-    parser.add_option(
-        '-m',
-        dest='metric',
-        default='pearsonr',
-        help='Train/test metric [Default: Pearsonr or AUPRC]',
-    )
     parser.add_option(
         '--name',
         dest='name',
@@ -115,23 +81,11 @@ def main():
         default=None,
         help='Output experiment directory [Default: %default]',
     )
-    parser.add_option(
-        '-p',
-        dest='out_stem',
-        default=None,
-        help='Output plot stem [Default: %default]',
-    )
     parser.add_option(
         '-q',
         dest='queue',
         default='geforce',
     )
-    parser.add_option(
-        '-r',
-        dest='ref_dir',
-        default=None,
-        help='Reference directory for statistical tests',
-    )
     parser.add_option(
         '--rc',
         dest='rc',
@@ -160,13 +114,6 @@ def main():
         action='store_true',
         help='Store max instead of avg bin value in local window [Default: %default]',
     )
-    parser.add_option(
-        '--status',
-        dest='status',
-        default=False,
-        action='store_true',
-        help='Update metric status; do not run jobs [Default: %default]',
-    )
     parser.add_option(
         '-t',
         dest='targets_file',
@@ -174,6 +121,13 @@ def main():
         type='str',
         help='File specifying target indexes and labels in table format',
     )
+    parser.add_option(
+        '-u',
+        dest='untransform_old',
+        default=False,
+        action='store_true',
+        help='Untransform old models [Default: %default]',
+    )
     (options, args) = parser.parse_args()
 
     if len(args) < 2:
@@ -197,12 +151,16 @@ def main():
 
     # count folds
     num_folds = len([dkey for dkey in data_stats if dkey.startswith("fold")])
-
-    fold_index = [fold_i for fold_i in range(num_folds)]
 
     # subset folds
     if options.fold_subset is not None:
-        fold_index = [int(fold_str) for fold_str in options.fold_subset.split(",")]
+        num_folds = min(options.fold_subset, num_folds)
+
+    fold_index = [fold_i for fold_i in range(num_folds)]
+
+    # subset folds (list)
+    if options.fold_subset_list is not None:
+        fold_index = [int(fold_str) for fold_str in options.fold_subset_list.split(",")]
 
     if options.queue == 'standard':
         num_cpu = 4
@@ -236,12 +194,12 @@ def main():
             model_file = '%s/train/model%d_best.h5' % (it_dir, options.dataset_i)
 
             # check if done
-            acc_file = '%s/acc.txt' % out_dir
+            acc_file = '%s/tss_preds_gencode.tsv.gz' % out_dir
             if os.path.isfile(acc_file):
                 # print('%s already generated.' % acc_file)
                 pass
             else:
-                # basenji test
+                # evaluate
                 cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                 cmd += ' conda activate %s;' % options.conda_env
                 cmd += ' time borzoi_test_tss_gencode.py'
@@ -257,6 +215,8 @@ def main():
                     cmd += ' --maxcov'
                 if options.targets_file is not None:
                     cmd += ' -t %s' % options.targets_file
+                if options.untransform_old:
+                    cmd += ' -u'
                 cmd += ' %s' % params_file
                 cmd += ' %s' % model_file
                 cmd += ' %s/data%d' % (it_dir, head_i)
diff --git a/src/scripts/borzoi_test_tss_gencode.py b/src/scripts/borzoi_test_tss_gencode.py
index c5cad57..3e88a02 100644
--- a/src/scripts/borzoi_test_tss_gencode.py
+++ b/src/scripts/borzoi_test_tss_gencode.py
@@ -13,31 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =========================================================================
-
 from optparse import OptionParser
 import gc
 import json
-import pdb
 import os
 import time
 import sys
 
-import h5py
-#from intervaltree import IntervalTree
 import numpy as np
 import pandas as pd
 import pyranges as pr
-from scipy.stats import pearsonr
-from sklearn.metrics import explained_variance_score
-import tensorflow as tf
-#from tqdm import tqdm
-
-from basenji import bed
-from basenji import dataset
-from basenji import seqnn
-from basenji import trainer
-#import pygene
-#from qnorm import quantile_normalize
+
+from baskerville import dataset
+from baskerville import seqnn
 
 '''
 borzoi_test_tss_gencode.py
@@ -45,9 +33,6 @@
 Measure accuracy at TSS-level.
 '''
 
-def eprint(*args, **kwargs):
-    print(*args, file=sys.stderr, **kwargs)
-
 ################################################################################
 # main
 ################################################################################
@@ -113,6 +98,13 @@ def main():
         default=None,
         help='TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]',
     )
+    parser.add_option(
+        "-u",
+        dest="untransform_old",
+        default=False,
+        action="store_true",
+        help="Untransform old models [Default: %default]",
+    )
     (options, args) = parser.parse_args()
 
     if len(args) != 4:
@@ -154,9 +146,6 @@ def main():
     num_targets = targets_df.shape[0]
     num_targets_strand = targets_strand_df.shape[0]
 
-    # save sqrt'd tracks
-    sqrt_mask = np.array([ss.find('sqrt') != -1 for ss in targets_strand_df.sum_stat])
-
     # read model parameters
     with open(params_file) as params_open:
         params = json.load(params_open)
@@ -203,6 +192,11 @@ def main():
 
     tss_df = pd.read_csv(tss_file, sep='\t', names=['Chromosome', 'Start', 'End', 'tss_id', 'feat1', 'tss_strand'])
     tss_pr = pr.PyRanges(tss_df)
+
+    # get strands
+    tss_strand_dict = {}
+    for _, row in tss_df.iterrows() :
+        tss_strand_dict[row['tss_id']] = row['tss_strand']
 
     #######################################################
     # intersect TSS sites w/ preds, targets
@@ -210,9 +204,6 @@ def main():
     # intersect seqs, TSS sites
     seqs_tss_pr = seqs_pr.join(tss_pr)
 
-    eprint("len(seqs_tss_pr.df) = " + str(len(seqs_tss_pr.df)))
-    print("len(seqs_tss_pr.df) = " + str(len(seqs_tss_pr.df)))
-
     # hash preds/targets by tss_id
     tss_preds_dict = {}
     tss_targets_dict = {}
@@ -221,11 +212,10 @@ def main():
     for x, y in eval_data.dataset:
         # predict only if gene overlaps
         yh = None
-        y = y.numpy()[...,targets_df.index]
+        y = y.numpy()[..., targets_df.index]
 
         t0 = time.time()
-        eprint('Sequence %d...' % si)
-        print('Sequence %d...' % si, end='')
+        print('Sequence %d...' % si, end='', flush=True)
 
         for bsi in range(x.shape[0]):
             seq = seqs_df.iloc[si+bsi]
@@ -263,26 +253,22 @@ def main():
                     yh = seqnn_model(x)
 
                 # slice gene region
-                yhb = yh[bsi,bin_start:bin_end].astype('float16')
-                yb = y[bsi,bin_start:bin_end].astype('float16')
+                yhb = yh[bsi, bin_start:bin_end].astype('float16')
+                yb = y[bsi, bin_start:bin_end].astype('float16')
 
                 if len(yb) > 0:
-                    tss_preds_dict.setdefault(tss_id,[]).append(yhb)
-                    tss_targets_dict.setdefault(tss_id,[]).append(yb)
+                    tss_preds_dict.setdefault(tss_id, []).append(yhb)
+                    tss_targets_dict.setdefault(tss_id, []).append(yb)
                 else:
-                    eprint("(Warning: len(yb) <= 0)")
+                    print("(Warning: len(yb) <= 0)", flush=True)
 
         # advance sequence table index
         si += x.shape[0]
-        eprint('DONE in %ds.' % (time.time()-t0))
-        print('DONE in %ds.' % (time.time()-t0))
-
-        eprint("len(tss_preds_dict) = " + str(len(tss_preds_dict)))
+        print('DONE in %ds.' % (time.time() - t0), flush=True)
 
         if si % 128 == 0:
             gc.collect()
-
     #######################################################
     # aggregate TSS bin values into arrays
 
@@ -292,15 +278,25 @@ def main():
 
     for tss_id in tss_ids:
         tss_preds_gi = np.concatenate(tss_preds_dict[tss_id], axis=0).astype('float32')
-        tss_targets_gi = np.concatenate(tss_targets_dict[tss_id], axis=0).astype('float32')
-
-        # undo scale
-        tss_preds_gi /= np.expand_dims(targets_strand_df.scale, axis=0)
-        tss_targets_gi /= np.expand_dims(targets_strand_df.scale, axis=0)
-
-        # undo sqrt
-        tss_preds_gi[:,sqrt_mask] = tss_preds_gi[:,sqrt_mask]**(4/3)
-        tss_targets_gi[:,sqrt_mask] = tss_targets_gi[:,sqrt_mask]**(4/3)
+        tss_targets_gi = np.concatenate(tss_targets_dict[tss_id], axis=0).astype(
+            'float32'
+        )
+
+        # slice strand
+        if tss_strand_dict[tss_id] == "+":
+            tss_strand_mask = (targets_df.strand != "-").to_numpy()
+        else:
+            tss_strand_mask = (targets_df.strand != "+").to_numpy()
+        tss_preds_gi = tss_preds_gi[:, tss_strand_mask]
+        tss_targets_gi = tss_targets_gi[:, tss_strand_mask]
+
+        # untransform
+        if options.untransform_old:
+            tss_preds_gi = dataset.untransform_preds1(tss_preds_gi, targets_strand_df, unscale=True, unclip=False)
+            tss_targets_gi = dataset.untransform_preds1(tss_targets_gi, targets_strand_df, unscale=True, unclip=False)
+        else:
+            tss_preds_gi = dataset.untransform_preds(tss_preds_gi, targets_strand_df, unscale=True, unclip=False)
+            tss_targets_gi = dataset.untransform_preds(tss_targets_gi, targets_strand_df, unscale=True, unclip=False)
 
         # mean (or max) coverage
         tss_preds_gi = tss_preds_gi.max(axis=0) if options.maxcov else tss_preds_gi.mean(axis=0)
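
Note: the per-site aggregation added to both borzoi_test_apa_polaydb.py and borzoi_test_tss_gencode.py follows one pattern: mask target columns by the site's strand, untransform (the scripts call baskerville.dataset.untransform_preds or untransform_preds1 with unscale=True, unclip=False), then average across bins. Below is a minimal, self-contained sketch of that pattern; the toy targets table, array shapes, and the stubbed-out untransform step are illustrative assumptions, not part of the diff.

import numpy as np
import pandas as pd

def site_strand_mask(targets_df, site_strand):
    # keep same-strand and unstranded targets; drop the opposite strand
    if site_strand == "+":
        return (targets_df.strand != "-").to_numpy()
    return (targets_df.strand != "+").to_numpy()

# toy targets table: one '+', one '-', one unstranded track (assumption)
targets_df = pd.DataFrame({"strand": ["+", "-", "."]})

# toy per-site coverage of shape (bins, targets) (assumption)
site_preds = np.arange(12, dtype="float32").reshape(4, 3)

mask = site_strand_mask(targets_df, "+")
site_preds = site_preds[:, mask]    # '+' and '.' tracks remain
# the real scripts untransform here via baskerville.dataset.untransform_preds(1)
site_cov = site_preds.mean(axis=0)  # one aggregated value per remaining track
print(site_cov)                     # [4.5 6.5]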