-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathFDR_all_genes.py
73 lines (59 loc) · 2.06 KB
/
FDR_all_genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import argparse
import os
import gzip as gz
from statsmodels.stats.multitest import multipletests
def chunks(seq, num):
avg = len(seq) / float(num)
out = []
last = 0.0
while last < len(seq):
out.append(seq[int(last):int(last + avg)])
last += avg
return out
def choose_blocks_by_fdr_bh(pvals, labels, alpha=0.05):
rejected_list, corrected_p_vals, _, _ = multipletests(pvals, alpha=alpha, method='fdr_bh')
index = 0
for rejected in rejected_list:
if not rejected:
break
index += 1
if index > 0:
return pvals[0:index], labels[0:index]
else:
return [], []
def process_res_file(in_file):
gene_p_val_list = []
seen_genes = set()
with gz.open(in_file, 'r') as f:
for line in f:
line = line.decode().strip()
tokens = line.split("\t")
gene_name = tokens[0]
p_val = tokens[3]
if gene_name not in seen_genes:
gene_p_val_list.append((gene_name, float(p_val), float(tokens[2])))
seen_genes.add(gene_name)
gene_p_val_list.sort(key=lambda x: x[1])
gene_names = [a for a, b, _ in gene_p_val_list]
p_vals = [b for a, b, _ in gene_p_val_list]
correlations = [c for a,b,c in gene_p_val_list]
res_p_vals, res_gene_names = choose_blocks_by_fdr_bh(p_vals, gene_names)
print(res_gene_names)
print(len(res_gene_names))
out_dir = "/cs/cbio/jon/projects/PyCharmProjects/AchillesPrediction/gene_files_significant"
gene_chunks = chunks(res_gene_names, 17)
file_idx = 0
for chunk in gene_chunks:
out_file = os.path.join(out_dir, f"gene_file_{file_idx}.txt")
with open(out_file, 'w') as f_out:
for g in chunk:
f_out.write(f"{g}\n")
file_idx += 1
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--res_file',
default='res/all_genes.txt.gz')
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
process_res_file(args.res_file)