-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataprocessing.py
79 lines (61 loc) · 2.17 KB
/
dataprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import json
from pandas.io.json import json_normalize
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import random
# %%
###########################
#Load data
###########################
def load_data(path):
    """Read a JSON Lines file into a pandas DataFrame.

    Args:
        path: Location of the input; each line is parsed as one JSON record.

    Returns:
        The DataFrame produced by ``pd.read_json``.
    """
    return pd.read_json(path, typ='frame', lines=True)
# %%
###########################
# Return unique DNAs
###########################
def uniSnpDnaDf(a):
    """Collect the distinct SNP DNA strings found in the three SNP columns.

    Args:
        a: DataFrame containing the columns 'snpMother', 'snpSelf' and
            'snpFather'.

    Returns:
        A single-column DataFrame ('snpDNA') holding each distinct non-missing
        value once, indexed 0..n-1.
    """
    # Flatten the three columns into one 1-D array ('K' = follow the array's
    # memory layout) and keep the first occurrence of every value.
    flat = a[['snpMother', 'snpSelf', 'snpFather']].values.ravel('K')
    uDF = pd.DataFrame(pd.unique(flat), columns=['snpDNA'])
    # Dropping missing values leaves gaps in the index; renumber it so that
    # label-based lookups such as uDF['snpDNA'][0] keep working.
    return uDF.dropna(axis=0).reset_index(drop=True)
# %%
def main_f(path):
    """Load the records at ``path`` and return their unique SNP DNA strings.

    Prints the loaded frame's columns and its record count along the way.

    Args:
        path: Input file location, forwarded to ``load_data``
            (e.g. '../../data/DeepIntegrate_Data_HOH_2019.txt').

    Returns:
        The DataFrame produced by ``uniSnpDnaDf`` for the loaded records.
    """
    records = load_data(path)
    print("Columns of Dataframe - ", records.columns)
    print("Number of records initially - ", len(records))
    return uniSnpDnaDf(records)
###########################
# Return the one-hot-encoder
###########################
def noiseIntro(l, noisePercentage):
    """Overwrite randomly chosen positions of a sequence with the symbol '-'.

    Args:
        l: Mutable sequence (e.g. a list of characters); modified in place.
        noisePercentage: Despite the name, this is the NUMBER of distinct
            positions to overwrite, not a percentage. Non-integer values are
            truncated toward zero.

    Returns:
        The same object ``l``, after modification.

    Raises:
        ValueError: If the count is negative or larger than ``len(l)``
            (raised by ``random.sample``).
    """
    # random.sample needs an integer count; callers may pass len * 0.2.
    noise_count = int(noisePercentage)
    # Draw positions from the sequence's own length rather than a fixed
    # 13714, so sequences of any length are handled.
    for pos in random.sample(range(len(l)), noise_count):
        l[pos] = '-'
    return l
def ohe_enc(udf, ls, flag, noisePercentage):
    """One-hot encode every sequence in ``udf['snpDNA']`` into ``ls``.

    Args:
        udf: DataFrame with a 'snpDNA' column of strings.
        ls: Pre-allocated array; slot ``ls[i]`` receives the transposed
            encoding of the i-th sequence. Written in place.
        flag: When equal to 1, noise is injected into each sequence (via
            ``noiseIntro``) before it is encoded.
        noisePercentage: Forwarded to ``noiseIntro`` as its noise argument.

    Returns:
        The same ``ls`` object, now filled.
    """
    ohe = OneHotEncoder(drop='first')
    # Iterate positionally so every record is encoded (previously record 0 was
    # encoded on every pass) and gaps in the index labels do not matter.
    for i, sequence in enumerate(udf['snpDNA']):
        one_recd = list(sequence)  # string -> list of single characters
        if flag == 1:
            noiseIntro(one_recd, noisePercentage)
        a = np.array(one_recd).reshape(-1, 1)  # column vector, one symbol per row
        # NOTE(review): the encoder is refit for each record, so which symbol
        # maps to which row depends on the symbols present in that record;
        # drop='first' removes the first category. Confirm this is intended.
        eohe = ohe.fit_transform(a).toarray()
        # Transposed to (categories kept, sequence length); this must match the
        # shape of ls[i] or the assignment fails.
        ls[i] = eohe.T
    return ls
# %%
def UniqueNoisyDNASequences(path):
    """Build noisy/clean one-hot encoded arrays of the unique DNA sequences.

    Args:
        path: Input file location, forwarded to ``main_f``.

    Returns:
        A tuple ``(DNAListX, DNAListY)`` of arrays shaped
        (number of sequences, 8, sequence length): X is encoded after noise
        injection, Y is encoded from the untouched sequences.

    Raises:
        IndexError: If no unique sequence is found (nothing to measure).
    """
    uDF = main_f(path)
    totalDNASequence = len(uDF['snpDNA'])  # 1314
    # Positional access: the first row may not carry index label 0.
    DNALength = len(uDF['snpDNA'].iloc[0])  # 13714
    encodingLen = 8
    # Number of positions to corrupt (20% of the sequence length); must be an
    # integer because it is used as a sample size.
    noisePercentage = int(DNALength * .2)  # 2742
    shape = (totalDNASequence, encodingLen, DNALength)
    # ohe_enc fills and returns the buffer it is given, so each pass needs its
    # own buffer; sharing one made the clean pass overwrite the noisy result.
    noisyX = ohe_enc(uDF, np.ones(shape), 1, noisePercentage)
    orgY = ohe_enc(uDF, np.ones(shape), 0, noisePercentage)
    DNAListX = np.asarray(noisyX)
    DNAListY = np.asarray(orgY)
    return DNAListX, DNAListY