-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencode_data.py
65 lines (58 loc) · 1.97 KB
/
encode_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import jellyfish
import argparse
import os
from shutil import copyfile as cp
# Encode corpus files with word-group ids.
#
# For every requested file name ``<name>`` the script reads
# ``<inpath>/<name>.<source>`` and writes, into ``<outpath>``:
#   * ``<name>.<algo>``          - each line replaced by its words' group ids
#   * ``<name>.<source><algo>``  - each original line followed by those ids
# It then copies the untouched source file (and, for every file except
# ``test``, the target-language file) next to the generated ones.


def _build_parser():
    """Return the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description='Annotate corpus files with the group id of every word.')
    parser.add_argument('--source', '-s', type=str, required=True,
                        help='Source language')
    parser.add_argument('--target', '-t', type=str, help='Target language')
    parser.add_argument('--groupingfile', '-m', type=str, required=True,
                        help='Mapping file')
    # A path is text; the previous ``type=int`` rejected every real directory.
    parser.add_argument('--inpath', '-i', type=str, required=True,
                        help='Input path')
    parser.add_argument('--filenames', '-f', type=str,
                        default='train,valid,test',
                        help='Input filenames comma separated')
    parser.add_argument('--outpath', '-o', type=str, required=True,
                        help='Output path')
    parser.add_argument('--algo', '-a', type=str, default='algo1',
                        help='Algorithm format for file extension')
    return parser


def _load_groups(groupingfile):
    """Read the ``word : groupid`` mapping file into a dict.

    Blank lines are skipped.  A line without the ``' : '`` separator aborts
    the run with a message naming the offending line.
    """
    groups = {}
    with open(groupingfile, 'r', encoding='utf-8') as mapping:
        for lineno, raw in enumerate(mapping, start=1):
            entry = raw.strip()
            if not entry:
                continue
            parts = entry.split(' : ')
            if len(parts) < 2:
                raise SystemExit(
                    "Malformed mapping at {}:{}: {!r}".format(
                        groupingfile, lineno, entry))
            groups[parts[0]] = parts[1]
    return groups


def _encode_file(groups, inpath, outpath, filename, src, algo):
    """Write the group-id file and the concatenated file for one input file.

    Exits with an error message if a word has no entry in ``groups``.
    """
    source_path = os.path.join(inpath, filename + '.' + src)
    grouping_path = os.path.join(outpath, filename + '.' + algo)
    concat_path = os.path.join(outpath, filename + '.' + src + algo)
    # Open the input first so a missing input does not leave empty outputs
    # behind; ``with`` closes all three files even on the error exits below.
    with open(source_path, 'r', encoding='utf-8') as source_file, \
            open(grouping_path, 'w', encoding='utf-8') as grouping_file, \
            open(concat_path, 'w', encoding='utf-8') as concat_file:
        for i, line in enumerate(source_file):
            line = line.strip()
            try:
                group_ids = ' '.join(groups[word] for word in line.split())
            except KeyError as err:
                raise SystemExit(
                    "Group not found for " + err.args[0] + ".") from None
            try:
                grouping_file.write(group_ids + '\n')
                concat_file.write(line + ' ' + group_ids + '\n')
            except (OSError, UnicodeError) as err:
                # Keep the line index in the report, as the original did.
                raise SystemExit("Line {}: {}".format(i, err)) from err


def main(argv=None):
    """Run the encoder.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    src = args.source
    tgt = args.target
    inpath = args.inpath
    outpath = args.outpath
    algo = args.algo
    filenames = args.filenames.split(',')

    # The target language is only needed for the copy step of non-test files.
    if tgt is None and any(name != 'test' for name in filenames):
        parser.error('--target/-t is required unless only "test" is processed')

    groups = _load_groups(args.groupingfile)

    for filename in filenames:
        print(src + " : " + algo + " : " + filename)
        _encode_file(groups, inpath, outpath, filename, src, algo)
        # os.path.join makes the copy paths agree with the paths used for
        # reading/writing, with or without a trailing separator on the dirs.
        cp(os.path.join(inpath, filename + '.' + src),
           os.path.join(outpath, filename + '.' + src))
        if filename != 'test':
            cp(os.path.join(inpath, filename + '.' + tgt),
               os.path.join(outpath, filename + '.' + tgt))
    print("DONE!")


if __name__ == '__main__':
    main()