simuAnalysis2.py

import os
import re
import sys
import MDAnalysis as MDA
import MDAnalysis.analysis.psa
import numpy as np
import matplotlib.pyplot as plt
import math
import glob
import pandas as pd
import subprocess


set_output_directory = "/home_e/dor/PycharmProjects/pythonProject/graphs/"

def distance_2dots (point1,point2):
    """
    Calculates the distance between two points in 3D space.
    :param point1: X,Y,Z coordinates for the first point (Numpy array)
    :param point2: X,Y,Z coordinates for the second point (Numpy array)
    :return: Distance between two points (Float).
    """
    if point1.ndim == 2:
        dis = math.sqrt((point1[0,0]-point2[0,0])**2+(point1[0,1]-point2[:,1])**2+(point1[0,2]-point2[0,2])**2)
        return dis
    if point1.ndim == 1:
        dis = math.sqrt((point1[0]-point2[0])**2+(point1[1]-point2[1])**2+(point1[2]-point2[2])**2)
        return dis


def Rg_per_frame (universe):
    """
    Calculates the Rg of the protein for the exact time frame. The function includes only C alpha atoms,
    for improved calculation efficiency.
    :param universe: Universe class, generated by MDAnalysis
    :return: Rg from the center of mass.
    """
    center = universe.select_atoms("protein").center_of_mass()
    all_atom = universe.select_atoms("name CA")
    radi_sum = 0
    for i in all_atom.positions:
        radi_sum+=distance_2dots(center,i)
    return radi_sum/len(all_atom)


def Rg_t_lapse (universe,start_step=None,end_step=None):
    """
    Calculates Rg for a simulation time period.The function includes only C alpha atoms,
    for improved calculation efficiency. If time period id not specified, than the entire simulation will be analyzed.
    :param universe: Universe class, generated by MDAnalysis
    :param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
    :param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
    :return: numpy.array with time steps of the simulation, and Rg for each step(simulation length,2).
    """
    simulation_trajectory = universe.trajectory[start_step:end_step]
    data = np.zeros((len(simulation_trajectory),2))
    for steps,time in zip(simulation_trajectory,range(len(simulation_trajectory))):
        data[time,0] = universe.trajectory.time
        data[time,1] = Rg_per_frame(universe)
    return data


def resi_distance (resi1, resi2,universe=None,start_step=None,end_step=None):
    """
    Calculates the distance between two residues for a simulation time period.The function includes only C alpha atoms,
    for improved calculation efficiency. If time period id not specified, than the entire simulation will be analyzed.
    :param resi1: Location of the first residue on the chain (integer)
    :param resi2: Location of the second residue on the chain (integer)
    :param universe: Universe class, generated by MDAnalysis.
    :param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
    :param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
    :return: numpy.array with time steps of the simulation, and the distance between two of interest for each step(simulation length,2).
    """
    residue_1 = universe.select_atoms("resid "+str(resi1)+"-"+str(resi1) +" and name CA")
    residue_2 = universe.select_atoms("resid "+str(resi2)+"-"+str(resi2) +" and name CA")
    simulation_trajectory = universe.trajectory[start_step:end_step]
    data = np.zeros((len(simulation_trajectory),2))
    for steps,time in zip(simulation_trajectory,range(len(simulation_trajectory))):
        data[time,0] = universe.trajectory.time
        data[time,1] = distance_2dots(residue_1.positions,residue_2.positions)
    return data


def pdbAnalysis (input_directory,resi1,resi2,start_step=None,end_step=None):
    """
    Analyzes the RG and residues distances for all the .pbd files (from coarse grain simulations) in the given directory.
    :param input_directory:
    :param resi1: Residue index on the peptide, for calculating distance with resi2. (index start from 1 not 0) (integer)
    :param resi2:
    :param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
    :param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
    :return: Data Frame with each calculation (Rg and residue distance) for each repeat as a column.
    """
    all_pdbs_files = glob.glob(input_directory+"/*.pdb")
    all_data = pd.DataFrame()
    first = True
    for i in range(len(all_pdbs_files)):
        universe = MDA.Universe(all_pdbs_files[i])
        if first:
            all_data = pd.DataFrame(Rg_t_lapse(universe, start_step, end_step),columns=['time',"Rg_repeat_" + str(i+1)])
            all_data["resi_distance_repeat_1"] = resi_distance(resi1, resi2, universe, start_step, end_step)[:,1]
            first = False
        else:
            all_data["resi_distance_repeat_" + str(i+1)] = resi_distance(resi1, resi2, universe, start_step, end_step)[:,1]
            all_data["Rg_repeat_" + str(i+1)] = Rg_t_lapse(universe, start_step, end_step)[:,1]
    return all_data


def xtcAnalysis(directory, resi1, resi2, start_step=None, end_step=None):
    """
    Analyzes the RG and residues distances for all the .xtc files (from atomistic simulations) in the given directory.
    Directory must include a topology file (either .gro or .pdb)
    :param directory:
    :param resi1: Residue index on the peptide, for calculating distance with resi2. (index start from 1 not 0) (integer)
    :param resi2:
    :param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
    :param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
    :return: Data Frame with each calculation (Rg and residue distance) for each repeat as a column.
    """
    all_xtc_files = glob.glob(directory + "/*.xtc")
    all_topo_files = glob.glob(directory + "/*.gro")
    if len(all_topo_files) == 0:
        all_topo_files = glob.glob(directory + "/*.pdb")
    resi1, resi2 = (resi1+1, resi2+1)
    all_data = pd.DataFrame()
    first = True
    for i in range(len(all_xtc_files)):
        universe = MDA.Universe(all_topo_files[0],all_xtc_files[i])
        if first:
            all_data = pd.DataFrame(Rg_t_lapse(universe, start_step, end_step),
                                    columns=['time', "Rg_repeat_" + str(i + 1)])
            all_data["resi_distance_repeat_1"] = resi_distance(resi1, resi2, universe, start_step, end_step)[:, 1]
            first = False
        else:
            all_data["resi_distance_repeat_" + str(i + 1)] = resi_distance(resi1, resi2, universe, start_step,
                                                                           end_step)[:, 1]
            all_data["Rg_repeat_" + str(i + 1)] = Rg_t_lapse(universe, start_step, end_step)[:, 1]
    return all_data


def merge_continue_and_start(directory_start,directory_continue,output_directory,resi1, resi2,end_time, start_step=None, end_step=None):
    segment = re.search(r".*(seg[ABC]).*", directory_start).group(1)
    start_data = xtcAnalysis(directory_start,resi1, resi2, start_step)
    continue_data = xtcAnalysis(directory_continue,resi1, resi2, start_step=0,end_step=end_step)
    continue_data["time"]= np.array(continue_data["time"])+end_time
    complete_data = pd.concat([start_data,continue_data],ignore_index=True)
    pd.DataFrame.to_csv(complete_data, output_directory+"/"+segment + ".csv")


def save_xtcAnalysis(input_directory,segment,outputdir=None,start_step=1000,end_step=None):
    segment,input_directory = (str(segment),str(input_directory))
    segments = {"segA":(1,151),"segB":(1,159),"segC":(1,128)}
    resi1, resi2 = segments[segment]
    if outputdir != None:
        output_directory = outputdir+"/"
    else: output_directory = set_output_directory
    os.makedirs(output_directory, exist_ok=True)
    data = xtcAnalysis(input_directory,resi1,resi2,start_step,end_step)
    pd.DataFrame.to_csv(data, output_directory +segment+ ".csv")
    FRET_measures(output_directory +segment+ ".csv")


def iterPDBAnalysis(directory,start_step=None,end_step=None):
    """
    Iterates over a tree directory, no matter its depth size, and applies PDBAnalysis on sub-directories with
    .pdb files in it.
    :param directory: Any tree directory with coarse-grain output as leaf directories. Must not include dirs without
    coarse-grain output at its end.
    :param start_step: Frame number to start analysing. Do not receive the time in the simulation. (integer)
    :param end_step: Frame number to end analysing. Do not receive the time in the simulation. (integer)
    :return: A new exact tree directory, with the PDBAnalysis output as .csv files, in the leaf directories.
    The directory is saved in a default dir ('set_output_directory')
    """
    segs = {"segA": (1, 151), "segB": (1, 159), "segC": (1, 128)}
    output_dir = set_output_directory+"coarse/"
    directory_without_main = re.search("(.*/).*/",directory).group(1)
    directories_toiter = [directory]
    while directories_toiter:
        dir = directories_toiter.pop()
        if "output" in [x for x in os.walk(dir)][0][1]:
            print("PDBAnalysis on:",dir+"output/traj/")
            segment = re.search(".*(seg[ABC]).*", dir).group(1)
            down_tree = re.search(directory_without_main+"(.*)",dir).group(1)
            data = pdbAnalysis(dir + "output/Traj/", segs[segment][0], segs[segment][1], start_step, end_step)
            os.makedirs(output_dir+down_tree,exist_ok=True)
            pd.DataFrame.to_csv(data,output_dir+down_tree+"file.csv")
        else:
            for subdir in [x for x in os.walk(dir)][0][1]:
                directories_toiter.append(dir+subdir+"/")


def zip2pdb(directory):
    """
    Iterates over a tree directory, no matter its depth size, unzips every output zip file, and converts .dat file
    .pbd file. Converting .dat to .pdb every X frames, specified by 'skip_frame'.
    :param directory: Any tree directory with coarse-grain output as leaf directories. Must not include dirs without
    coarse-grain output at its end.
    :return:
    """
    unzip_command = "bunzip2 *"
    directories_toiter = [directory]
    skip_frame = "2"        # number, which specifies how many frames to skip when converting to pdb
    while directories_toiter:
        dir = directories_toiter.pop()
        if "output" in [x for x in os.walk(dir)][0][1]:
            wd = dir+"output/Traj/"
            print("zip2pdb on:", wd)
            subprocess.Popen(unzip_command, cwd=wd, shell=True).wait()
            for file in glob.glob(dir+"output/Traj/*.dat"):
                subprocess.Popen(["/home_e/dor/scripts/TrajToPDB/TrajToPDB.pl",file,skip_frame]).wait()
        else:
            for subdir in [x for x in os.walk(dir)][0][1]:
                directories_toiter.append(dir+subdir+"/")


def calc_FRET(data,segment):
    N = {"segA":151,"segB":159,"segC":128}
    exponent = {"segA":0.48,"segB":0.43,"segC":0.44}
    marker_length = 9
    scaler = (N[segment]/(N[segment]+marker_length))**exponent[segment]
    scaled_toinclude_markers = data*scaler
    R_zero = 54
    FRET_measures= (R_zero**6)/(scaled_toinclude_markers**6+R_zero**6)
    return FRET_measures


def FRET_measures(csv_file):
    segment = re.search(".*(seg[ABC]).*",csv_file).group(1)
    df = pd.read_csv(csv_file)
    for col in df.columns:
        if "resi_distance" in col:
            reapeat_num = re.search(".*(\d).*",col).group(1)
            print(col,reapeat_num)
            df["smFRET_"+str(reapeat_num)] = calc_FRET(df[col],segment)
    pd.DataFrame.to_csv(df, csv_file)