-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
40 lines (29 loc) · 1.2 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from Bio import SeqIO
import pandas as pd
# Default FASTA inputs: the same paths the function has always read from.
_TARGET_FASTA = "dataset/ncRNA_datasets/hsa_ncRNA.fa"
_NON_TARGET_FASTA = "dataset/VERT_test_dataset/hsa_real.fa"


def _sample_labeled_sequences(fasta_path, label, n_rows):
    """Read a FASTA file and return ``n_rows`` randomly sampled, labeled rows.

    Args:
        fasta_path: Path of the FASTA file to parse.
        label: Value stored in the ``target`` column of every row.
        n_rows: Number of rows to draw (without replacement).

    Returns:
        A DataFrame with columns ``Sequence`` and ``target``, or ``None`` when
        the file contains fewer than ``n_rows`` records.
    """
    sequences = [str(record.seq) for record in SeqIO.parse(fasta_path, "fasta")]
    # The scalar label is broadcast to every row by the DataFrame constructor.
    frame = pd.DataFrame(data={'Sequence': sequences, 'target': label})
    if len(frame) < n_rows:
        return None
    return frame.sample(n=n_rows)


def make_dataframe(n_samples, target_fasta=_TARGET_FASTA,
                   non_target_fasta=_NON_TARGET_FASTA):
    """Build a shuffled, class-balanced DataFrame of labeled sequences.

    Half of the rows are drawn at random from ``target_fasta`` (labeled
    ``target == 1``) and half from ``non_target_fasta`` (labeled
    ``target == 0``). The combined rows are then shuffled and re-indexed.

    Args:
        n_samples: Total number of rows requested. Each file contributes
            ``int(n_samples / 2)`` rows, so an odd request yields one row
            fewer than asked for.
        target_fasta: FASTA file whose sequences get ``target == 1``.
        non_target_fasta: FASTA file whose sequences get ``target == 0``.

    Returns:
        A DataFrame with columns ``Sequence`` and ``target``. If either file
        holds fewer records than required, a message is printed and the
        integer ``0`` is returned instead (callers must check for this).
    """
    rows_per_class = int(n_samples / 2)

    parts = []
    # Target file first: the original order of reads and random draws is kept,
    # and a shortage in the first file still returns before the second is read.
    for fasta_path, label in ((target_fasta, 1), (non_target_fasta, 0)):
        part = _sample_labeled_sequences(fasta_path, label, rows_per_class)
        if part is None:
            print('Numero de samples maior do que oque há no banco de dados')
            return 0
        parts.append(part)

    dataframe = pd.concat(parts, ignore_index=True, sort=False)
    # frac=1 draws every row once, i.e. a full shuffle of the combined data.
    return dataframe.sample(frac=1).reset_index(drop=True)
def make_csv(file_name, samples):
    """Build the sampled dataset and write it to ``<file_name>.csv``.

    Args:
        file_name: Output path without extension; ``'.csv'`` is appended.
        samples: Total number of rows requested, forwarded to
            ``make_dataframe`` as ``n_samples``.

    Raises:
        ValueError: If ``make_dataframe`` could not build the dataset. It
            signals that by returning ``0`` instead of a DataFrame when a
            source file has too few sequences.
    """
    dataframe = make_dataframe(n_samples=samples)
    # make_dataframe returns the integer 0 on failure; calling .to_csv on it
    # would raise an opaque AttributeError, so fail with a clear message and
    # before any file is written.
    if not isinstance(dataframe, pd.DataFrame):
        raise ValueError(
            f"cannot write {file_name}.csv: not enough sequences available "
            f"to build a dataset of {samples} samples"
        )
    dataframe.to_csv(file_name + '.csv', index=False)