-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimuAnalysis2.py
236 lines (210 loc) · 11.6 KB
/
simuAnalysis2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import os
import re
import sys
import MDAnalysis as MDA
import MDAnalysis.analysis.psa
import numpy as np
import matplotlib.pyplot as plt
import math
import glob
import pandas as pd
import subprocess
set_output_directory = "/home_e/dor/PycharmProjects/pythonProject/graphs/"
def distance_2dots (point1,point2):
"""
Calculates the distance between two points in 3D space.
:param point1: X,Y,Z coordinates for the first point (Numpy array)
:param point2: X,Y,Z coordinates for the second point (Numpy array)
:return: Distance between two points (Float).
"""
if point1.ndim == 2:
dis = math.sqrt((point1[0,0]-point2[0,0])**2+(point1[0,1]-point2[:,1])**2+(point1[0,2]-point2[0,2])**2)
return dis
if point1.ndim == 1:
dis = math.sqrt((point1[0]-point2[0])**2+(point1[1]-point2[1])**2+(point1[2]-point2[2])**2)
return dis
def Rg_per_frame (universe):
"""
Calculates the Rg of the protein for the exact time frame. The function includes only C alpha atoms,
for improved calculation efficiency.
:param universe: Universe class, generated by MDAnalysis
:return: Rg from the center of mass.
"""
center = universe.select_atoms("protein").center_of_mass()
all_atom = universe.select_atoms("name CA")
radi_sum = 0
for i in all_atom.positions:
radi_sum+=distance_2dots(center,i)
return radi_sum/len(all_atom)
def Rg_t_lapse (universe,start_step=None,end_step=None):
"""
Calculates Rg for a simulation time period.The function includes only C alpha atoms,
for improved calculation efficiency. If time period id not specified, than the entire simulation will be analyzed.
:param universe: Universe class, generated by MDAnalysis
:param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
:param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
:return: numpy.array with time steps of the simulation, and Rg for each step(simulation length,2).
"""
simulation_trajectory = universe.trajectory[start_step:end_step]
data = np.zeros((len(simulation_trajectory),2))
for steps,time in zip(simulation_trajectory,range(len(simulation_trajectory))):
data[time,0] = universe.trajectory.time
data[time,1] = Rg_per_frame(universe)
return data
def resi_distance (resi1, resi2,universe=None,start_step=None,end_step=None):
"""
Calculates the distance between two residues for a simulation time period.The function includes only C alpha atoms,
for improved calculation efficiency. If time period id not specified, than the entire simulation will be analyzed.
:param resi1: Location of the first residue on the chain (integer)
:param resi2: Location of the second residue on the chain (integer)
:param universe: Universe class, generated by MDAnalysis.
:param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
:param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
:return: numpy.array with time steps of the simulation, and the distance between two of interest for each step(simulation length,2).
"""
residue_1 = universe.select_atoms("resid "+str(resi1)+"-"+str(resi1) +" and name CA")
residue_2 = universe.select_atoms("resid "+str(resi2)+"-"+str(resi2) +" and name CA")
simulation_trajectory = universe.trajectory[start_step:end_step]
data = np.zeros((len(simulation_trajectory),2))
for steps,time in zip(simulation_trajectory,range(len(simulation_trajectory))):
data[time,0] = universe.trajectory.time
data[time,1] = distance_2dots(residue_1.positions,residue_2.positions)
return data
def pdbAnalysis (input_directory,resi1,resi2,start_step=None,end_step=None):
"""
Analyzes the RG and residues distances for all the .pbd files (from coarse grain simulations) in the given directory.
:param input_directory:
:param resi1: Residue index on the peptide, for calculating distance with resi2. (index start from 1 not 0) (integer)
:param resi2:
:param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
:param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
:return: Data Frame with each calculation (Rg and residue distance) for each repeat as a column.
"""
all_pdbs_files = glob.glob(input_directory+"/*.pdb")
all_data = pd.DataFrame()
first = True
for i in range(len(all_pdbs_files)):
universe = MDA.Universe(all_pdbs_files[i])
if first:
all_data = pd.DataFrame(Rg_t_lapse(universe, start_step, end_step),columns=['time',"Rg_repeat_" + str(i+1)])
all_data["resi_distance_repeat_1"] = resi_distance(resi1, resi2, universe, start_step, end_step)[:,1]
first = False
else:
all_data["resi_distance_repeat_" + str(i+1)] = resi_distance(resi1, resi2, universe, start_step, end_step)[:,1]
all_data["Rg_repeat_" + str(i+1)] = Rg_t_lapse(universe, start_step, end_step)[:,1]
return all_data
def xtcAnalysis(directory, resi1, resi2, start_step=None, end_step=None):
"""
Analyzes the RG and residues distances for all the .xtc files (from atomistic simulations) in the given directory.
Directory must include a topology file (either .gro or .pdb)
:param directory:
:param resi1: Residue index on the peptide, for calculating distance with resi2. (index start from 1 not 0) (integer)
:param resi2:
:param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
:param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
:return: Data Frame with each calculation (Rg and residue distance) for each repeat as a column.
"""
all_xtc_files = glob.glob(directory + "/*.xtc")
all_topo_files = glob.glob(directory + "/*.gro")
if len(all_topo_files) == 0:
all_topo_files = glob.glob(directory + "/*.pdb")
resi1, resi2 = (resi1+1, resi2+1)
all_data = pd.DataFrame()
first = True
for i in range(len(all_xtc_files)):
universe = MDA.Universe(all_topo_files[0],all_xtc_files[i])
if first:
all_data = pd.DataFrame(Rg_t_lapse(universe, start_step, end_step),
columns=['time', "Rg_repeat_" + str(i + 1)])
all_data["resi_distance_repeat_1"] = resi_distance(resi1, resi2, universe, start_step, end_step)[:, 1]
first = False
else:
all_data["resi_distance_repeat_" + str(i + 1)] = resi_distance(resi1, resi2, universe, start_step,
end_step)[:, 1]
all_data["Rg_repeat_" + str(i + 1)] = Rg_t_lapse(universe, start_step, end_step)[:, 1]
return all_data
def merge_continue_and_start(directory_start,directory_continue,output_directory,resi1, resi2,end_time, start_step=None, end_step=None):
segment = re.search(r".*(seg[ABC]).*", directory_start).group(1)
start_data = xtcAnalysis(directory_start,resi1, resi2, start_step)
continue_data = xtcAnalysis(directory_continue,resi1, resi2, start_step=0,end_step=end_step)
continue_data["time"]= np.array(continue_data["time"])+end_time
complete_data = pd.concat([start_data,continue_data],ignore_index=True)
pd.DataFrame.to_csv(complete_data, output_directory+"/"+segment + ".csv")
def save_xtcAnalysis(input_directory,segment,outputdir=None,start_step=1000,end_step=None):
segment,input_directory = (str(segment),str(input_directory))
segments = {"segA":(1,151),"segB":(1,159),"segC":(1,128)}
resi1, resi2 = segments[segment]
if outputdir != None:
output_directory = outputdir+"/"
else: output_directory = set_output_directory
os.makedirs(output_directory, exist_ok=True)
data = xtcAnalysis(input_directory,resi1,resi2,start_step,end_step)
pd.DataFrame.to_csv(data, output_directory +segment+ ".csv")
FRET_measures(output_directory +segment+ ".csv")
def iterPDBAnalysis(directory,start_step=None,end_step=None):
"""
Iterates over a tree directory, no matter its depth size, and applies PDBAnalysis on sub-directories with
.pdb files in it.
:param directory: Any tree directory with coarse-grain output as leaf directories. Must not include dirs without
coarse-grain output at its end.
:param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
:param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
:return: A new exact tree directory, with the PDBAnalysis output as .csv files, in the leaf directories.
The directory is saved in a default dir ('set_output_directory')
"""
segs = {"segA": (1, 151), "segB": (1, 159), "segC": (1, 128)}
output_dir = set_output_directory+"coarse/"
directory_without_main = re.search("(.*/).*/",directory).group(1)
directories_toiter = [directory]
while directories_toiter:
dir = directories_toiter.pop()
if "output" in [x for x in os.walk(dir)][0][1]:
print("PDBAnalysis on:",dir+"output/traj/")
segment = re.search(".*(seg[ABC]).*", dir).group(1)
down_tree = re.search(directory_without_main+"(.*)",dir).group(1)
data = pdbAnalysis(dir + "output/Traj/", segs[segment][0], segs[segment][1], start_step, end_step)
os.makedirs(output_dir+down_tree,exist_ok=True)
pd.DataFrame.to_csv(data,output_dir+down_tree+"file.csv")
else:
for subdir in [x for x in os.walk(dir)][0][1]:
directories_toiter.append(dir+subdir+"/")
def zip2pdb(directory):
"""
Iterates over a tree directory, no matter its depth size, unzips every output zip file, and converts .dat file
.pbd file. Converting .dat to .pdb every X frames, specified by 'skip_frame'.
:param directory: Any tree directory with coarse-grain output as leaf directories. Must not include dirs without
coarse-grain output at its end.
:return:
"""
unzip_command = "bunzip2 *"
directories_toiter = [directory]
skip_frame = "2" # number, which specifies how many frames to skip when converting to pdb
while directories_toiter:
dir = directories_toiter.pop()
if "output" in [x for x in os.walk(dir)][0][1]:
wd = dir+"output/Traj/"
print("zip2pdb on:", wd)
subprocess.Popen(unzip_command, cwd=wd, shell=True).wait()
for file in glob.glob(dir+"output/Traj/*.dat"):
subprocess.Popen(["/home_e/dor/scripts/TrajToPDB/TrajToPDB.pl",file,skip_frame]).wait()
else:
for subdir in [x for x in os.walk(dir)][0][1]:
directories_toiter.append(dir+subdir+"/")
def calc_FRET(data,segment):
N = {"segA":151,"segB":159,"segC":128}
exponent = {"segA":0.48,"segB":0.43,"segC":0.44}
marker_length = 9
scaler = (N[segment]/(N[segment]+marker_length))**exponent[segment]
scaled_toinclude_markers = data*scaler
R_zero = 54
FRET_measures= (R_zero**6)/(scaled_toinclude_markers**6+R_zero**6)
return FRET_measures
def FRET_measures(csv_file):
segment = re.search(".*(seg[ABC]).*",csv_file).group(1)
df = pd.read_csv(csv_file)
for col in df.columns:
if "resi_distance" in col:
reapeat_num = re.search(".*(\d).*",col).group(1)
print(col,reapeat_num)
df["smFRET_"+str(reapeat_num)] = calc_FRET(df[col],segment)
pd.DataFrame.to_csv(df, csv_file)