forked from hoondy/esprnn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreproc_fa2npy.py
78 lines (55 loc) · 1.89 KB
/
preproc_fa2npy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/python
__author__ = "Donghoon Lee"
__copyright__ = "Copyright 2019"
__credits__ = ["Donghoon Lee"]
__license__ = "GPL"
__version__ = "1.0.1"
__maintainer__ = "Donghoon Lee"
__email__ = "[email protected]"
import argparse
from Bio import SeqIO
from Bio.Alphabet import IUPAC
import numpy as np
parser = argparse.ArgumentParser(description='FA to NPY')
parser.add_argument('-i','--input', help='input file',required=True)
parser.add_argument('-p','--prefix',help='output prefix', required=True)
args = parser.parse_args()
def readFasta(fastaFile):
list_id=[]
list_seq = []
for fa in SeqIO.parse(fastaFile,"fasta",alphabet=IUPAC.unambiguous_dna): # reads one fasta record at a time
list_id.append(fa.id) # id
list_seq.append(str(fa.seq.upper())) # sequence
with open(args.prefix+'.txt','w') as o:
for x in list_id:
o.write(x+'\n')
return list_seq
def dna_onehot_encoding(list_seq):
# Input:
# list_seq: list of sequences
### initialize
# sequence to long string
# str_seq = ""
# for s in list_seq:
# str_seq=str_seq+s
# nucleotide mapping
# nt = set(str_seq)
# nt_idx = {c: i for i, c in enumerate(nt)}
nt_idx = {'A': 0, 'C': 1, 'T': 2, 'G': 3}
print("NT Mapping:",nt_idx)
maxlen = max(map(len, list_seq)) # apply len to all seq, find max length
X = np.zeros((len(list_seq), maxlen, len(nt_idx)), dtype=np.uint8) # set array of zeros with dim: n_sample, seq_length, 4 NT
### one-hot encoding
for i, seq in enumerate(list_seq):
for j, nuc in enumerate(seq):
if nuc in nt_idx.keys():
X[i,j,nt_idx[nuc]] = 1
return X
def fa2npy(fa_file, npy_file):
X=dna_onehot_encoding(readFasta(fa_file))
print(X.shape)
print(X)
np.save(npy_file,X)
print("File",npy_file,"Saved")
print("Done")
fa2npy(args.input, args.prefix+'.npy')