forked from thiesgehrmann/proteny
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
296 lines (208 loc) · 11.4 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
dd = '/home/nfs/thiesgehrmann/groups/w/phd/data';
from ibidas import *
###############################################################################
# Each organism requires two files:
# Genes file:
# |<chrid>\t<start>\t<end>\t<strand>\t<geneid>\t<transcriptid>\t<exonnumber>
# Sequence file:
# |><chrid1>
# |<sequence>
# |><chrid2>
# |<sequence>
###############################################################################
__gene_slice_names__ = ( 'chrid', 'start', 'end', 'strand', 'geneid', 'transcriptid', 'exonid' );
__genome_slice_names__ = ( 'chrid', 'sequence');
def example(org, chr, odir=None):
(name, genes, genome) = org(odir = odir);
genes = genes[_.f0 == chr];
genome = genome[_.f0 == chr];
return (name, genes.Copy(), genome.Copy());
#edef
###############################################################################
def read_prepared(name, genes_file, genome_file):
genes = Read(genes_file) / __gene_slice_names__;
genes = genes.Cast(bytes, int, int, bytes, bytes, bytes, int);
genome = Read(genome_file) / __genome_slice_names__;
genome = genome.Cast(bytes, bytes);
return (name, genes.Copy(), genome.Copy());
#edef
###############################################################################
def agabi2(odir = None):
genome_a = Read(Fetch('http://genome.jgi-psf.org/Agabi_varbisH97_2/download/Abisporus_varbisporusH97.v2.maskedAssembly.gz'), format='fasta');
genome_b = Read(Fetch('http://genome.jgi-psf.org/Agabi_varbisH97_2/download/Abisporus_var_bisporus.mitochondrion.scaffolds.fasta.gz'));
genome = genome_a | Stack | genome_b;
genes = Read(Fetch('http://genome.jgi-psf.org/Agabi_varbisH97_2/download/Abisporus_varbisporusH97.v2.FilteredModels3.gff.gz'), format='tsv');
genes = genes[_.f2 == 'CDS'];
genes = genes.To(_.f8, Do=_.Each(lambda x:[ y.strip().split(' ')[1] for y in x.split(';')[1:]]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[1])).Detect();
genes = genes.To(_.f0, Do=_.Each(lambda x: x.split('_')[1]).Cast(str));
genome = genome.To(_.f0, Do=_.Each(lambda x: x.split('_')[1]).Cast(str));
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Copy();
genome = genome.Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.fasta' % (odir));
#fi
return ("agabi2", genes, genome);
#edef
###############################################################################
def agabi3(odir = None):
#genome = Read(Fetch('http://genome.jgi-psf.org/Agabi_varbisH97_3/download/Agabi_varbisH97_3_AssemblyScaffolds.fasta.gz'), format='fasta');
#genes = Read(Fetch('http://genome.jgi-psf.org/Agabi_varbisH97_3/download/Agabi_varbisH97_3_GeneCatalog_genes_20120316.gff.gz'));
genome = Read("%s/2b-agabi3/Agabi_varbisH97_3_AssemblyScaffolds.fasta.gz" % dd);
genes = Read("%s/2b-agabi3/Agabi_varbisH97_3_GeneCatalog_genes_20120316.gff.gz" % dd, format='tsv');
genes = genes[_.f2 == 'CDS'];
genes = genes.To(_.f8, Do=_.Each(lambda x:[ y.strip().split(' ')[1] for y in x.split(';')[1:]]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[1])).Detect();
genes = genes.To(_.f0, Do=_.Each(lambda x: x.split('_')[1]).Cast(str));
genome = genome.To(_.f0, Do=_.Each(lambda x: x.split('_')[1]).Cast(str));
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Copy();
genome = genome.Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.fasta' % (odir));
#fi
return ("agabi3", genes, genome);
#edef
###############################################################################
def schco2(odir = None):
genome = Read(Fetch('http://genome.jgi-psf.org/Schco2/download/Schco2_AssemblyScaffolds.fasta.gz'));
genes = Read(Fetch('http://genome.jgi-psf.org/Schco2/download/Schco2_GeneCatalog_genes_20110923.gff.gz'), format='tsv');
genes = genes[_.f2 == 'CDS'];
genes = genes.To(_.f8, Do=_.Each(lambda x:[ y.strip().split(' ')[1] for y in x.split(';')[1:]]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[1])).Detect();
genes = genes.To(_.f0, Do=_.Each(lambda x: x.split('_')[1]).Cast(str));
genome = genome.To(_.f0, Do=_.Each(lambda x: x.split('_')[1]).Cast(str));
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Copy();
genome = genome.Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.fasta' % (odir));
#fi
return ("schco2", genes, genome);
#edef
###############################################################################
def schco3(odir = None):
genome = Read(Fetch('http://genome.jgi-psf.org/Schco3/download/Schco3_AssemblyScaffolds.fasta.gz'));
genes = Read(Fetch('http://genome.jgi-psf.org/Schco3/download/Schco3_GeneCatalog_genes_20130812.gff.gz'), format='tsv');
genes = genes[_.f2 == 'CDS'];
genes = genes.To(_.f8, Do=_.Each(lambda x:[ y.strip().split(' ')[1] for y in x.split(';')[1:]]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[1])).Detect();
genes = genes.To(_.f0, Do=_.Each(lambda x: x.split('_')[1]).Cast(str));
genome = genome.To(_.f0, Do=_.Each(lambda x: x.split('_')[1]).Cast(str));
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Copy();
genome = genome.Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.fasta' % (odir));
#fi
return ("schco3", genes, genome);
#edef
###############################################################################
def human(odir = "%s/human" % dd):
genome = Read('%s/human/Homo_sapiens.GRCh37.73.dna.fa' % (dd), sep=[' ']);
genes = Read('%s/human/Homo_sapiens.GRCh37.73.gtf' % (dd));
genes = genes.To(_.f8, Do=_.Each(lambda x:[ y.strip().replace('"', '').split(' ')[1] for y in x.split(';')[:-1]]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[1]),_.f8.Each(lambda x: x[2]));
genome = genome.Get(_.f0, _.seq);
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Copy();
genome = genome.Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.tsv' % (odir));
#fi
return ("human", genes, genome);
#edef
###############################################################################
def mouse(odir = "%s/mus_musculus" % dd):
genome = Read('%s/mus_musculus/Mus_musculus.GRCm38.73.dna.fa' % (dd), sep=[' ']);
genes = Read('%s/mus_musculus/Mus_musculus.GRCm38.73.gtf' % (dd));
genes = genes.To(_.f8, Do=_.Each(lambda x:[ y.strip().replace('"', '').split(' ')[1] for y in x.split(';')[:-1]]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[1]),_.f8.Each(lambda x: x[2]));
genome = genome.Get(_.f0, _.seq);
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Copy();
genome = genome.Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.tsv' % (odir));
#fi
return ("mouse", genes, genome);
#edef
###############################################################################
# Not available publicly yet...
def aniger_n402(odir = None):
genome = Read('/tudelft.net/staff-groups/ewi/insy/DBL/marchulsman/projects/n402_sequence/assembly/n402_atcc.unpadded.fasta', sep=[]);
genes = Read('/tudelft.net/staff-groups/ewi/insy/DBL/marchulsman/projects/n402_sequence/annotations/results/n402_annotations.gff', format='tsv');
#<chrid>\t<start>\t<end>\t<strand>\t<geneid>\t<transcriptid>\t<exonnumber>
genes = genes[_.f2 == 'exon'];
genes = genes.To(_.f8, Do=_.Each(lambda x: x.split(';')));
genes = genes.To(_.f0, Do=_.Each(lambda x: x.split('_')[0]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[1].split('=')[1].split('_')[0]), _.f8.Each(lambda x: int(x[0].split('=')[1].split('_')[1][4:])+1 ));
genes = genes / ("f0", "f1", "f2", "f3", "f4", "f5");
genes = genes.Get(_.f0, _.f1, _.f2, _.f3, _.f4, _.f4, _.f5);
genome = genome.To(_.f0, Do=_.Each(lambda x: x.split('_')[0]));
genes = genes.To(_.f0, Do=_.Each(lambda x: x[-6:]).Cast(int).Cast(str));
genome = genome.To(_.f0, Do=_.Each(lambda x: x[-6:]).Cast(int).Cast(str));
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Detect().Copy();
genome = genome.Detect().Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.tsv' % (odir));
#fi
return ("n402", genes, genome);
#edef
###############################################################################
def aniger_513_88(odir = None):
genome = Read(Fetch('http://www.aspergillusgenome.org/download/sequence/A_niger_CBS_513_88/current/A_niger_CBS_513_88_current_chromosomes.fasta.gz'));
genes = Read(Fetch('http://www.aspergillusgenome.org/download/gff/A_niger_CBS_513_88/A_niger_CBS_513_88_current_features.gff'), format='tsv');
genes = genes[_.f2 == 'exon'];
genes = genes.To(_.f8, Do=_.Each(lambda x: x.split(';')));
genes = genes.To(_.f0, Do=_.Each(lambda x: x.split('_')[0]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[1].split('=')[1].split('-')[0]), _.f8.Each(lambda x: x[0].split('-')[2][1:]))
genes = genes / ("f0", "f1", "f2", "f3", "f4", "f5");
genes = genes.Get(_.f0, _.f1, _.f2, _.f3, _.f4, _.f4, _.f5);
genome = genome.To(_.f0, Do=_.Each(lambda x: x.split('_')[0]));
genes = genes.To(_.f0, Do=_.Each(lambda x: x[-2:]).Cast(int).Cast(str));
genome = genome.To(_.f0, Do=_.Each(lambda x: x[-2:]).Cast(int).Cast(str));
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Detect().Copy();
genome = genome.Detect().Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.tsv' % (odir));
#fi
return ("513.88", genes, genome);
#edef
###############################################################################
def pleos2(odir = None):
genome = Read(Fetch("http://genome.jgi.doe.gov/PleosPC15_2/download/PleosPC15_2_Assembly_scaffolds.fasta.gz"));
genes = Read(Fetch("http://genome.jgi.doe.gov/PleosPC15_2/download/PleosPC15_2_GeneModels_FilteredModels1.gff.gz"), format='tsv');
genes = genes[_.f2 == 'CDS'];
genes = genes.To(_.f8, Do=_.Each(lambda x:[ y.strip().split(' ')[1] for y in x.split(';')[1:]]));
genes = genes.Get(_.f0, _.f3, _.f4, _.f6, _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[0]), _.f8.Each(lambda x: x[1])).Detect();
genes = genes / __gene_slice_names__;
genome = genome / __genome_slice_names__;
genes = genes.Copy();
genome = genome.Copy();
if not(odir == None):
Export(genes, '%s/proteny_genes.tsv' % (odir));
Export(genome, '%s/proteny_sequences.fasta' % (odir));
#fi
return ("pleos2", genes, genome);
#edef
###############################################################################