-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgeosoft_extractor.py
executable file
·458 lines (400 loc) · 22.8 KB
/
geosoft_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
#! /bin/python
# Script Name: geosoft_extractor.py
# Author: Festus Nyasimi
# Date: 17 Jan 2020
"""
The Script extracts metadata file a GEO soft file and creates a json file.
Written in Python 3.7
Usage: python geosoft_extractor.py <geosoft file>
"""
# import required modules
import shutil
import difflib
import subprocess
import os
import sys
import wrangle_functions
# Check of all arguments have been provided
if len(sys.argv) == 2:
# Input file
infile = os.getcwd() + "/" + sys.argv[1]
# Check if the paths exist
if os.path.isfile(infile):
pass
else:
print("The file does not exist!!!")
print(__doc__)
exit()
else:
print(__doc__)
exit()
# Make directories
if os.path.exists("Jsonfiles"):
pass
else:
os.makedirs("Processed_Data/Undownloadable_data")
os.makedirs("Processed_Data/Undetermined_targets")
os.makedirs("Jsonfiles")
os.makedirs("links_tmp")
# Metadata header
if os.path.isfile("metadata.tsv"):
pass
else:
metafile = open("metadata.tsv", "w")
metafile.write("Expt Accession\tTarget\tTissue\tCell line\tCell type\tGeneric peak name\n")
metafile.close()
# Terms which define control samples
control_terms = ["none", "input", "igg", "genomic", "genomic DNA", "n/a"]
# Open the file for data extraction
with open(infile, "r") as myfile:
sample_name = ""
sample_title = ""
sample_target = ""
targets = []
controls = []
All_samples = {} # A dict containing each sample GSM and title
Control_samples = {} # A dict containing each control sample GSM and title
for line in myfile:
# Get samples
if line.startswith("^SAMPLE"):
sample_name = line[10:].rstrip()
sample_target = ""
if line.startswith("!Sample_title"):
sample_title = line[16:].rstrip()
# Search for transcription factor target antibodies
if sample_target == "":
if line.startswith("!Sample_characteristics_ch1 = chip antibody:") or line.startswith("!Sample_characteristics_ch1 = chip-antibody:") or line.startswith("!Sample_characteristics_ch1 = chip_antibody:"):
sample_target = line[45:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = tchip antibody:"):
sample_target = line[46:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = purification target:"):
sample_target = line[51:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = antibody:"):
sample_target = line[40:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = factor:"):
sample_target = line[38:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = ip"):
sample_target = line[34:].split(":")[-1].strip()
if line.startswith("!Sample_characteristics_ch1 = chip epitope:") or line.startswith("!Sample_characteristics_ch1 = factor chip:") or line.startswith("!Sample_characteristics_ch1 = ip antibody:") or line.startswith("!Sample_characteristics_ch1 = chip_epitope:") or line.startswith("!Sample_characteristics_ch1 = chip target:") or line.startswith("!Sample_characteristics_ch1 = chip antigen:"):
sample_target = line[43:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip antibodies:") or line.startswith("!Sample_characteristics_ch1 = antibody target:") or line.startswith("!Sample_characteristics_ch1 = chip (antibody):"):
sample_target = line[47:].strip()
if line.startswith("!Sample_characteristics_ch1 = antibodies:"):
sample_target = line[42:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip ab:") or line.startswith("!Sample_characteristics_ch1 = epitope:") or line.startswith("!Sample_characteristics_ch1 = target:") or line.startswith("!Sample_characteristics_ch1 = chip-ab:"):
sample_target = line[38:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip antibody catalog") or line.startswith("!Sample_characteristics_ch1 = chip_antibody_catalog"):
sample_target = line[53:].strip()
if line.startswith("!Sample_characteristics_ch1 = antibody (catalog)"):
sample_target = line[50:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip antibody:") or line.startswith("!Sample_characteristics_ch1 = chip antibody:") or line.startswith("!Sample_characteristics_ch1 = chip anitbody:"):
sample_target = line[45:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip-seq antibody:") or line.startswith("!Sample_characteristics_ch1 = chip seq antibody:"):
sample_target = line[49:].strip()
if line.startswith("!Sample_characteristics_ch1 = ChIP:"):
sample_target = line[35:].strip()
if line.startswith("!Sample_characteristics_ch1 = antibody used:") or line.startswith("!Sample_characteristics_ch1 = ChIP Antibody:") or line.startswith("!Sample_characteristics_ch1 = beads/antibody:"):
sample_target = line[45:].strip()
if line.startswith("!Sample_characteristics_ch1 = antibody/details:") or line.startswith("!Sample_characteristics_ch1 = antibody source") or line.startswith("!Sample_characteristics_ch1 = antibody/enzyme:") or line.startswith("!Sample_characteristics_ch1 = antibody/capture:"):
sample_target = line[47:].strip()
if line.startswith("!Sample_data_row_count"):
# Append all samples and their titles
All_samples[sample_name] = sample_title
# Shorten the target name
sample_target = sample_target.split("(")[0].strip()
# Search for controls through chip target and title
if sample_target.casefold() in control_terms:
controls.append(sample_name)
Control_samples[sample_name] = sample_title
else:
tag = []
for word in control_terms:
# check if control terms exist in the target
if word in sample_target.casefold():
controls.append(sample_name)
Control_samples[sample_name] = sample_title
# check if control terms exist in the title
elif word in sample_title.casefold():
controls.append(sample_name)
Control_samples[sample_name] = sample_title
# if they dont exist in both then append a false
else:
tag.append(False)
if len(tag) == 6:
if "control" in sample_title.casefold() or "control".casefold() == sample_target or "control" == sample_target.casefold():
controls.append(sample_name)
Control_samples[sample_name] = sample_title
else:
pass
# Finding all targets in the experiment
if sample_target.casefold() not in control_terms:
if sample_target.casefold() != "." and sample_target.casefold() != "" and sample_target.casefold() != "no" and sample_target.casefold() != "na" and sample_target.casefold() != "-" and "n/a" not in sample_target.casefold() and "none" not in sample_target.casefold() and "igg" not in sample_target.casefold():
targets.append(sample_target)
# Get unique targets only
targets = list(set(targets))
remove = [] #Remove more non targets
for term in ["input","control", 'no antibody', 'genomic', 'immunoglobulin', 'igg']:
for target in targets:
if term.casefold() in target or term == target.casefold() or term in target.casefold():
remove.append(target)
remove = list(set(remove))
for item in remove:
targets.remove(item)
#print(targets)
# Remove controls from All samples
for item in Control_samples:
del All_samples[item]
#print(All_samples)
#print(Control_samples)
## MATCH SAMPLES AND CONTROLS
matches = {}
if len(Control_samples)== 0:
pass
elif len(controls) == 1:
for sgsm,stitle in All_samples.items():
#print(stitle + " : " + Control_samples.get(controls[0]))
matches[sgsm]=controls[0]
else:
for sgsm,stitle in All_samples.items():
n = difflib.SequenceMatcher(None, stitle, Control_samples.get(controls[0])).ratio()
best = controls[0]
for ctl in controls[1:]:
# Use Sequence matcher to match header of sample and control
test = difflib.SequenceMatcher(None, stitle, Control_samples.get(ctl)).ratio()
if float(test) > float(n):
n = test
best = ctl
#print(stitle + " : " + Control_samples.get(best))
matches[sgsm]=best
#print(matches)
## Loop through targets to create json files
if len(targets) == 0:
# Move file to another folder targets_undetermined
myfile.close()
shutil.move(infile,"Processed_Data/Undetermined_targets/"+infile.split('/')[-1])
print("No target found")
exit()
else:
stopper = False
for target in targets:
#print(infile.split('/')[-1].split('_')[0] + "_" +target.replace(" ","_").replace(",","") + ".json")
outname = infile.split('/')[-1].split('_')[0] + "_" +target.replace(" ","_").replace(",","").replace("/","").replace("'", "").replace('"','').replace("&","").replace(".","_").replace("[","").replace("]","").replace(";","").replace(":","").replace("__","_")
# Ensure the links file is empty
links = open("links_tmp/" + outname + ".txt", "w")
links.close()
outfile = open("Jsonfiles/" + outname + ".json", "w")
outfile.write("{\n")
expt_samples = []
cell_line = "-"
tissue = "-"
cell_type = "-"
# Extract GSM samples for that specific target
myfile.seek(0)
for line in myfile:
# Get title and description
# Get the title of the experiment
if line.startswith("!Series_title"):
title = line[16:].rstrip()
description = ""
# Get summary which will be used as description
if line.startswith("!Series_summary"):
if description == "":
description = line[18:].rstrip()
if line.startswith("^SAMPLE"):
sample_name = line[10:].rstrip()
sample_target = ""
if line.startswith("!Sample_characteristics_ch1 = cell type:"):
cell_type = line[41:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = cell line:"):
cell_line = line[41:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = tissue:"):
tissue = line[38:].rstrip()
# Search for transcription factor target antibodies
if sample_target == "":
if line.startswith("!Sample_characteristics_ch1 = chip antibody:") or line.startswith("!Sample_characteristics_ch1 = chip-antibody:") or line.startswith("!Sample_characteristics_ch1 = chip_antibody:"):
sample_target = line[45:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = tchip antibody:"):
sample_target = line[46:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = purification target:"):
sample_target = line[51:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = antibody:"):
sample_target = line[40:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = factor:"):
sample_target = line[38:].rstrip()
if line.startswith("!Sample_characteristics_ch1 = ip"):
sample_target = line[34:].split(":")[-1].strip()
if line.startswith("!Sample_characteristics_ch1 = chip epitope:") or line.startswith("!Sample_characteristics_ch1 = factor chip:") or line.startswith("!Sample_characteristics_ch1 = ip antibody:") or line.startswith("!Sample_characteristics_ch1 = chip_epitope:") or line.startswith("!Sample_characteristics_ch1 = chip target:") or line.startswith("!Sample_characteristics_ch1 = chip antigen:"):
sample_target = line[43:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip antibodies:") or line.startswith("!Sample_characteristics_ch1 = antibody target:") or line.startswith("!Sample_characteristics_ch1 = chip (antibody):"):
sample_target = line[47:].strip()
if line.startswith("!Sample_characteristics_ch1 = antibodies:"):
sample_target = line[42:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip ab:") or line.startswith("!Sample_characteristics_ch1 = epitope:") or line.startswith("!Sample_characteristics_ch1 = target:") or line.startswith("!Sample_characteristics_ch1 = chip-ab:"):
sample_target = line[38:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip antibody catalog") or line.startswith("!Sample_characteristics_ch1 = chip_antibody_catalog"):
sample_target = line[53:].strip()
if line.startswith("!Sample_characteristics_ch1 = antibody (catalog)"):
sample_target = line[50:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip antibody:") or line.startswith("!Sample_characteristics_ch1 = chip antibody:") or line.startswith("!Sample_characteristics_ch1 = chip anitbody:"):
sample_target = line[45:].strip()
if line.startswith("!Sample_characteristics_ch1 = chip-seq antibody:") or line.startswith("!Sample_characteristics_ch1 = chip seq antibody:"):
sample_target = line[49:].strip()
if line.startswith("!Sample_characteristics_ch1 = ChIP:"):
sample_target = line[35:].strip()
if line.startswith("!Sample_characteristics_ch1 = antibody used:") or line.startswith("!Sample_characteristics_ch1 = ChIP Antibody:") or line.startswith("!Sample_characteristics_ch1 = beads/antibody:"):
sample_target = line[45:].strip()
if line.startswith("!Sample_characteristics_ch1 = antibody/details:") or line.startswith("!Sample_characteristics_ch1 = antibody source") or line.startswith("!Sample_characteristics_ch1 = antibody/enzyme:") or line.startswith("!Sample_characteristics_ch1 = antibody/capture:"):
sample_target = line[47:].strip()
if line.startswith("!Sample_data_row_count"):
sample_target = sample_target.split("(")[0].strip()
if target == sample_target:
expt_samples.append(sample_name)
#print(sample_name)
genome_path = os.path.abspath(os.getcwd()) + "/Data/genome"
# Print constant variables
outfile.write(' "chip.title" : "' + title + '",\n\n')
outfile.write(' "chip.description" : "' + description + '",\n\n')
outfile.write(' "chip.pipeline_type" : "tf",\n')
outfile.write(' "chip.aligner" : "bowtie2",\n')
outfile.write(' "chip.align_only" : false,' + '\n\n')
outfile.write(' "chip.genome_tsv" : "%s/hg38/hg38.tsv",'%(genome_path) + '\n\n')
# Check the experiment samples are in controls and remove them
for control in controls:
if control in expt_samples:
expt_samples = list(filter(lambda x: x != control, expt_samples))
# SAMPLES
# Use only 10 samples due to memory constrains in the pipeline
scounter = 0
matched_controls = []
samp_ends =[]
for sample in expt_samples[:10]:
if len(matches) != 0:
ctl = matches.get(sample)
matched_controls.append(ctl)
scounter += 1
#print(sample + "::::"+ ctl)
# Get the endedness of the sample
# Retry downloading if it fails on first instance
ender = ""
c = 0
while ender == "" and c < 3:
ender = os.popen('check_endedness.sh ' + sample + " " + outname).read()
c += 1
# Confirm data is available
if ender == "":
stopper = True
else:
pass
#Break out of the loop if data is unavailable
if stopper:
break
# Use the output to create the fastqs arrays in json for replicates
chip_type = "fastqs"
wrangle_functions.array_creator(chip_type, scounter, ender, outfile)
# Get the endness array
wrangle_functions.ends_array(samp_ends, ender)
# Download the data files
outfile.write('\n')
### If Data is unavailable stop further execution
if stopper:
#Remove the file
# Move the soft file to undownloaded_data
outfile.close()
os.remove("Jsonfiles/"+ outname + ".json")
shutil.move(infile,"Processed_Data/Undownloadable_data/"+infile.split('/')[-1])
print("Unable to download file data")
break
# print the endedness
s_type = "paired"
wrangle_functions.ends_writer(samp_ends, s_type, outfile)
# CONTROLS
cont_ends =[]
# Check if all experiments have different controls or same controls
if len(expt_samples[:10]) == len(list(set(matched_controls))):
outfile.write(' "chip.always_use_pooled_ctl" : false,' + '\n\n')
ccounter = 0
for contrl in matched_controls:
ccounter += 1
# Retry downloading if it fails on first instance
ender = ""
c = 0
while ender == "" and c < 3:
ender = os.popen('check_endedness.sh ' + contrl + " " + outname).read()
c += 1
chip_type = "ctl_fastqs"
if ender == "":
stopper = True
else:
pass
#Break out of the loop if data is unavailable
if stopper:
break
# Use the Array creator function
wrangle_functions.array_creator(chip_type, ccounter, ender, outfile)
# Get the endness array
wrangle_functions.ends_array(cont_ends, ender)
# Download data
elif len(expt_samples[:10]) > len(list(set(matched_controls))) and len(list(set(matched_controls))) != 0:
if len(list(set(matched_controls))) == 1:
outfile.write(' "chip.always_use_pooled_ctl" : false,' + '\n\n')
else:
outfile.write(' "chip.always_use_pooled_ctl" : true,' + '\n\n')
ccounter = 0
for contrl in list(set(matched_controls)):
ccounter += 1
# Retry downloading if it fails on first instance
ender = ""
c = 0
while ender == "" and c < 3:
ender = os.popen('check_endedness.sh ' + contrl + " " + outname).read()
c += 1
chip_type = "ctl_fastqs"
if ender == "":
stopper = True
else:
pass
#Break out of the loop if data is unavailable
if stopper:
break
# Use the Array creator function
wrangle_functions.array_creator(chip_type, ccounter, ender, outfile)
# Get the endness array
wrangle_functions.ends_array(cont_ends, ender)
# Download the data
else:
pass
# if Data for controls unavailable
if stopper:
#Remove the file
# Move the soft file to undownloaded_data
outfile.close()
os.remove("Jsonfiles/"+ outname + ".json")
shutil.move(infile,"Processed_Data/Undownloadable_data/"+infile.split('/')[-1])
print("Unable to download controls file data")
break
outfile.write('\n')
# print the endedness
c_type = "ctl_paired"
wrangle_functions.ends_writer(cont_ends, c_type, outfile)
outfile.write('\n')
# Allocate more memory
if len(expt_samples[:10]) <= 4:
outfile.write(' "chip.align_mem_mb" : 40000,\n')
outfile.write(' "chip.call_peak_mem_mb" : 32000\n')
else:
outfile.write(' "chip.align_mem_mb" : 80000,\n')
outfile.write(' "chip.call_peak_mem_mb" : 64000\n')
outfile.write('\n')
outfile.write('}\n')
outfile.close()
# Confirm if metadata for expt exists and write out the metadata
append_expt = True
metafile = open("metadata.tsv", "a+")
metafile.seek(0)
for line in metafile:
if line.split("\t")[5].strip() == outname:
append_expt = False
if append_expt:
metafile.write(infile.split('/')[-1].split('_')[0] + "\t" + target + "\t" + tissue + "\t" + cell_line + "\t" + cell_type + "\t" + outname + "\n")
metafile.close()
myfile.close()