-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmmseqs_hmmtop.py
417 lines (317 loc) · 15.1 KB
/
mmseqs_hmmtop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
import subprocess
import os
import copy
from subprocess import call
#Perform mmseqs_createdb (convert Fasta files into mmseqs's dbtype/database structure)
def runCreateDB(fastaFileName):
createDB_cmd = ['mmseqs', 'createdb', fastaFileName, fastaFileName + 'DB']
if os.path.isfile(fastaFileName + 'DB') == False:
subprocess.run(createDB_cmd, check=True)
#Perform mmseqs_search (calculate alignment from query and target databases)
def runSearch(query, target, result):
search_cmd = ['mmseqs', 'search', query, target, result, 'tmp', '-a', '--gap-open', 'aa:9,nucl:5', '--gap-extend', 'aa:1,nucl:2']
if os.path.isfile(result + ".dbtype") == False:
subprocess.run(search_cmd, check = True)
#Perform mmseqs_convertalis (convert alignment result from 'Search' module to BLAST tab)
def runConvertalis(query, target, result):
column_labels = 'query,target,alnlen,evalue,bits,pident,qstart,qend,qlen,qcov,tstart,tend,tlen,tcov,qaln,taln,qseq,tseq'
convertalis_cmd = ['mmseqs', 'convertalis', query, target, result, result + ".tab", "--format-mode", "4","--format-output", column_labels]
if os.path.isfile(result + ".tab") == False:
subprocess.run(convertalis_cmd, check = True)
#Convert BLAST tab file generated by mmseqs Convertalis into python dictionary object
def blastDict(result):
#Convert BLAST tab file into dictionary
#Open the tab file for reading
with open(result + ".tab", 'r') as f:
# Read the header line and split it by tabs
header = f.readline().strip().split('\t')
# Initialize an empty dictionary
mmseqs_dict = {}
# Read through each line of the file
for line in f:
# Split the line by tabs
fields = line.strip().split('\t')
fields[2] = int(fields[2]) #alnlen
fields[3] = float(fields[3]) #evalue
fields[4] = int(fields[4]) #bits
fields[5] = float(fields[5]) #pident
fields[6] = int(fields[6]) #qstart
fields[7] = int(fields[7]) #qend
fields[8] = int(fields[8]) #qlen
fields[9] = float(fields[9]) #qcov
fields[10] = int(fields[10]) #tstart
fields[11] = int(fields[11]) #tend
fields[12] = int(fields[12]) #tlen
fields[13] = float(fields[13]) #tcov
# Create a dictionary using the header as the keys and the fields as the values
blast_result = {header[i]: fields[i] for i in range(len(header))}
# Use the first field as the key to the nested dictionary
key = blast_result.pop(header[0])
# Add the blast result to the nested dictionary
mmseqs_dict[key] = blast_result
mmseqs_dict = dict(sorted(mmseqs_dict.items()))
return mmseqs_dict
#Create dictionary of just targets to remove duplicate target accessions
def target_OnlyDict(mmseqsDict):
target_only = {}
for key in mmseqsDict:
target_only[mmseqsDict[key]['target']] = mmseqsDict[key]['tseq']
target_only = dict(sorted(target_only.items()))
return target_only
#Create fasta file used for Hmmtop's TMS calculation from python dictionaries (query and target's accessions and aligned sequences)
def fastaConversion(result, mmseqsDict, targetDict):
#Convert Result From mmseqs alignment to fasta format
if os.path.isfile(result + '.faa') == False: #check if file already exists
call('touch ' + result+ '.faa', shell = True)
ofile = open(result + '.faa', "w") #open file for writing
faaLnCount = 0
for key in mmseqsDict: #write in query accessions and sequences
faaLnCount += 1 #keep track of number of iterations in mmseqs_dict in creation of the .faa file
ofile.write(">" + key + "\n" + mmseqsDict[key]['qseq'] + "\n")
tfaaLnCount = 0
for key in targetDict: #write in target accesssions and sequences
tfaaLnCount += 1
ofile.write(">" + key + "\n" + targetDict[key] + "\n")
ofile.close()
print("number of sequences contained the .faa file is:" + str(faaLnCount + tfaaLnCount)) #2020 -> 1010 sequences
print("Converted result file from mmseqs search to FASTA format as: " + result + ".faa" + " for use in hmmtop")
faaSizeCmd = ['wc', '-l', result + '.faa']
subprocess.run(faaSizeCmd, check = True) #2006 -> 1003 sequences
#Run Hmmtop on a fasta file (storing accessions and aligned sequences)
def runHmmtop(fastaFile, hmmResult):
#Perform hmmtop
hmmtop_cmd = ['hmmtop', '-if=' + fastaFile, '-of=' + hmmResult +'.hmmtop', '-sf=FAS', '-pi=spred', '-is=pseudo']
if os.path.isfile(hmmResult + '.hmmtop') == False:
subprocess.run(hmmtop_cmd)
#Command that pulll substrate data from tcdb website
def getSubstrateData(fileName):
substrate_cmd = ['wget', '-O', fileName + '.tsv', 'https://tcdb.org/cgi-bin/substrates/getSubstrates.py']
if os.path.isfile(fileName + '.tsv') == False:
subprocess.run(substrate_cmd, check = True)
#Record results of Hmmtop in python dictionary (accession as key; protein length, number of TMS, and list of TMS as values)
def hmmtopDict(hmmResult):
# Open the .hmmtop file for reading
with open( hmmResult + '.hmmtop', 'r') as f:
# Initialize an empty dictionary
hmmtop_dict = {}
# Manually create names for fields
fieldNames = ["protlen", "ntms","tms"]
# Read through each line of the file
for l in f:
# Split the line by spaces
fields = l.strip().split()
# empty list for individual line
hmmtop_result = {}
# create list to store pairs of tms
tms_list = []
tempTMSPos = 0
for i in range (len(fields)):
if fields[i] == 'IN' or fields[i] == 'OUT':
tempTMSPos = i
for i in range(tempTMSPos + 2, len(fields), 2):
tms_pairs = [int(fields[i]), int(fields[i+1])]
tms_list.append(tms_pairs)
hmmtop_result = {fieldNames[0]:int(fields[1]), fieldNames[1]:int(fields[4]), fieldNames[2]:tms_list}
hmmtop_dict[fields[2]] = hmmtop_result
return hmmtop_dict
#Record substrates data in tsv file in python dictionary
def substrateDict(fileName):
with open(fileName + '.tsv', 'r') as f:
# Initialize an empty dictionary
substrate_dict = {}
# Read through each line of the file
for line in f:
# Split the line by tabs
fields = line.strip().split('\t')
chebi = fields[1].split('|')
element = []
for i in chebi:
rawPair = i.split(';')
pair = [int(rawPair[0].split(":")[1]), rawPair[1]]
element.append(pair)
substrate_dict[fields[0]] = element
return substrate_dict
#Calculate number of TMS overlaps in each pair of aligned sequences
def overlapDict(mmseqsDict, hmmtopDict, minRes):
overlap_dict = {}
def oneToZeroIndexed(coords):
newcoords = copy.deepcopy(coords)
for i in newcoords:
i[0] = i[0] - 1
i[1] = i[1] - 1
return newcoords
#Assume coords have been already corrected to 0-based indices
def realRMScoords (coords, alnStart, alnEnd):
newcoords = copy.deepcopy(coords)
#Remove TMSs that are not in the alignment region
#using the equation:
# relTMCScoord = hmmtopCoord - alnStart
returnTMS = []
for i in newcoords:
i[0] = i[0] - alnStart
i[1] = i[1] - alnStart
if i [1] <= 0:
continue
elif i[0] >= alnEnd - alnStart:
continue
else:
returnTMS.append(i)
return returnTMS
#Helper function that returns true if given starting index is within the range of a TMS, otherwise returns false
def findTMS(givenIndex, TMSList):
for i in TMSList:
if i[0] <= givenIndex and i[1] >= givenIndex:
return True
return False
#Helper function that remove a TMS from the list if given Index is within the range of a TMS
def removeTMS(givenIndex, TMSList):
for i in TMSList:
if i[0] <= givenIndex and i[1] >= givenIndex:
TMSList.remove(i) #CHECK SYNTAX
return
return
def overlapScore (qTMSWithin, tTMSWithin, qaln, taln):
overlaps = 0
qTracking = 0
tTracking = 0
#copy two lists of TMSWithin, SYNTAX MIGHT NOT BE CORRECT!!!!
tempQWithin = qTMSWithin
tempTWithin = tTMSWithin
residueCount = 0
for i in range (0, len(qaln)): #aligned region has the same length for query and target (counting the gaps)
#if encounter gaps, actual indices ("Tracking") don't increment, and no need to compare TMS
if qaln[i] == "-":
qTracking = qTracking - 1
if taln[i] == "-":
tTracking = tTracking - 1
#identical Residue Position Found
if findTMS(qTracking, tempQWithin) == True and qaln[i] != "-" and taln[i] != "-":
if findTMS(tTracking, tempTWithin) == True :
residueCount = residueCount + 1
else:
residueCount = 0 #making sure next residue count doesn't inherit previous count
elif findTMS(qTracking, tempQWithin) == False :
residueCount = 0 #making sure next residue count doesn't inherit previous count
if residueCount == minRes:
overlaps = overlaps + 1
residueCount = 0 #making sure next residue count doesn't inherit previous count
#If one pair TMS is already counted as overlapped, they can't be used for future comparison
removeTMS(qTracking, tempQWithin)
removeTMS(tTracking, tempTWithin)
#tracking actual indices
qTracking = qTracking + 1
tTracking = tTracking + 1
#print(tempQWithin)
#print(tempTWithin)
return overlaps
for key in mmseqsDict:
qaln = mmseqsDict[key]['qaln']
taln = mmseqsDict[key]['taln']
tAccession = mmseqsDict[key]['target']
# - 1 to convert from 1-indexing to 0-indexing
qalnStartPos = mmseqsDict[key]['qstart'] - 1
qalnEndPos = mmseqsDict[key]['qend'] - 1
talnStartPos = mmseqsDict[key]['tstart'] - 1
talnEndPos = mmseqsDict[key]['tend'] - 1
rawqTMS = hmmtopDict[key]['tms']
rawtTMS = hmmtopDict[tAccession]['tms']
qTMS = oneToZeroIndexed(rawqTMS)
tTMS = oneToZeroIndexed(rawtTMS)
qTMSWithin = realRMScoords(qTMS, qalnStartPos, qalnEndPos)
tTMSWithin = realRMScoords(tTMS, talnStartPos, talnEndPos)
overlaps = overlapScore(qTMSWithin, tTMSWithin, qaln, taln)
targetAndOverlap = {}
targetAndOverlap['target'] = tAccession
targetAndOverlap['alignedTMS'] = overlaps
overlap_dict[key] = targetAndOverlap
return overlap_dict
'''
#For Testing
tempMmseqs = {}
tempData = {}
tempData['qaln'] = 'MGFDIGGDIGKPLKDAFDKFGADIKMTFLTVLNWMK--WISIG------ILIVISVI-------LICKIIKVLFQCGKCLLSCFGFCKK'
tempData['taln'] = 'MGFSINFD---PIINKFREFQTNINHNINEQLDKLKMVWINLGSHIKYWFIIIISILTILFILFLLIKITKLILNCKKIFSCCCNVCCK'
tempData['target'] = '1.A.100.1.1-B2X7D9'
tempData['qstart'] = 1
tempData['qend'] = 74
tempData['tstart'] = 22
tempData['tend'] = 107
tempMmseqs['1.A.95.2.5-YP_009361958'] = tempData
tempHmmtop = {}
tempQhmmData = {}
tempQhmmData['tms'] = [[26, 49], [54,72]]
tempThmmData = {}
tempThmmData['tms'] = [[68, 92]]
tempHmmtop['1.A.95.2.5-YP_009361958'] = tempQhmmData
tempHmmtop['1.A.100.1.1-B2X7D9'] = tempThmmData
overlap_dict = overlapDict(tempMmseqs, tempHmmtop, minRes)
'''
def main():
qFasta_str = input("Enter query fasta file you wish to convert to mmseqs-database type:")
tFasta_str = input("Enter target fasta file you wish to convert to mmseqs-database type:")
runCreateDB(qFasta_str)
runCreateDB(tFasta_str)
#Ask for names of query and target databases, and Ask to name the result database
query_str = input("Enter query database: ")
target_str = input("Enter target database: ")
result_str = input("Name the result database from mmseqs search: ")
runSearch(query_str, target_str, result_str)
runConvertalis(query_str, target_str, result_str)
mmseqs_dict = blastDict(result_str)
# Print the resulting dictionary for checking
################################
'''
eValueRangeDict = {}
for key in mmseqs_dict:
#if mmseqs_dict[key]['evalue'] >= 1e-15 and mmseqs_dict[key]['evalue'] <= 1e-10:
#if mmseqs_dict[key]['evalue'] > 1e-10 and mmseqs_dict[key]['evalue'] <= 1e-7:
if mmseqs_dict[key]['evalue'] > 1e-7 and mmseqs_dict[key]['evalue'] <= 1e-4:
eValueRangeDict[key] = mmseqs_dict[key]
eValueRangeDict = dict(sorted(eValueRangeDict.items()))
print(len(eValueRangeDict))
'''
################################
#print(mmseqs_dict)
print("length of mmseqs_dict is:" + str(len(mmseqs_dict)))
target_only_dict = target_OnlyDict(mmseqs_dict)
print("length of target dictionary is:" + str(len(target_only_dict)))
fastaConversion(result_str, mmseqs_dict, target_only_dict)
faa_str = input('Enter file name to run hmmtop on: ')
hmmResult_str = input('Name the result file from hmmtop (name only, without .hmmtop): ')
runHmmtop(faa_str, hmmResult_str)
#Download Substrate Data
substrate_str = input('Name the file to store substrate data (without .tsv): ')
getSubstrateData(substrate_str)
hmmtop_dict = hmmtopDict(hmmResult_str)
# Print the resulting dictionary
print(hmmtop_dict)
##########################################
#for key in eValueRangeDict:
#print(hmmtop_dict[key])
##########################################
print("length of the hmmtop_dict is:" + str(len(hmmtop_dict))) #1002
substrate_dict = substrateDict(substrate_str)
#print substrate dictionary
#print(substrate_dict)
#print(len(substrate_dict))
print ("-- Calculating TMS overlap based on dictionaries generated by previously-ran mmseqs search and hmmtop --")
minRes = 8 #minimum number of identical residues for a pair of TMS to be considered overlapping
yn = input('Do you wish to change the minimum number of identical residues for TMS to be considered overlapping? -- default: 8 ; type y or n: ')
if (yn == 'y'):
minRes = input('Enter the desired minimum residues number:')
overlap_dict = overlapDict(mmseqs_dict, hmmtop_dict, minRes)
#print(overlap_dict)
#print(len(overlap_dict))
#######################################
'''
Muelsyse = 0
for key in eValueRangeDict:
#print(key + ' : ' + str(overlap_dict[key]))
Muelsyse = Muelsyse + 1
#swTest_cmd = ['bash', 'swTest.ssh', key, eValueRangeDict[key]['target']]
print(str(Muelsyse) + '. ' + key + ' : ' + str(overlap_dict[key]))
#subprocess.run(swTest_cmd, check = True)
print(Muelsyse)
'''
#######################################
# main()