-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnormwix.py
executable file
·90 lines (77 loc) · 3.09 KB
/
normwix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
# Copyright (C) 2016.
# Author: Jesús Manuel Mager Hois
# e-mail: <[email protected]>
# Project website: http://turing.iimas.unam.mx/wix/
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
import re
def normwix(text):
    """Normalize Wixárika (Huichol) text to one lowercase spelling.

    The text is lowercased and then rewritten by an ordered sequence of
    regular-expression substitutions: acute-accent marks become
    apostrophes, "v" becomes "w", "ch" becomes "ts", "c"/"qu" become "k",
    digits are removed, "rr" becomes "x", runs of spaces are collapsed,
    "ü"/"ï" become "+", a single leading space is dropped, "s" becomes
    "ts" unless preceded by "t" or "[", accented vowels lose their
    accent, and runs of one repeated letter (or "+") are collapsed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Order matters: every rule sees the output of the previous one.
    rules = (
        (r"´", "'"),
        # NOTE: apostrophes are kept; the original carried a disabled rule
        # that would have removed them.
        (r"v", "w"),
        # "ch" must be handled before the c/qu rule.  Otherwise its "c" has
        # already been turned into "k" and "ch" can never match.
        (r"ch", "ts"),
        (r"(c|qu)", "k"),
        (r"[0-9]+", ""),
        (r"rr", "x"),
        (r" +", " "),
        (r"[üï]", "+"),
        # No MULTILINE flag: only a space at the very start of the text.
        (r"^ ", ""),
        # Leave the "s" of an existing "ts" (and one right after "[") alone.
        (r"(?<!t|\[)s", "ts"),
        (r"[áàä]", "a"),
        (r"[éèë]", "e"),
        (r"[íì]", "i"),
        (r"[óòö]", "o"),
        (r"[úù]", "u"),
        (r"([a-z+])\1+", r"\1"),
    )

    text = text.lower()
    for pattern, replacement in rules:
        # The IGNORECASE flag is kept exactly as in the original rules.
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text
def tokenizewix(text):
    """Separate punctuation marks from neighbouring text with spaces.

    A space is inserted before every punctuation mark that is not already
    preceded by whitespace, and after every punctuation mark that is not
    already followed by whitespace.  A mark at the very start or end of
    the text also receives the extra space on its outer side.

    Args:
        text: The string to tokenize.

    Returns:
        The string with punctuation padded by spaces.
    """
    # Same character set as the original class: . | , - " : ; ¿ ? ¡ !
    punct = r'[.|,\-":;¿?¡!]'

    # Pad on the left where the mark touches the preceding character.
    text = re.sub(r"(?<!\s)(" + punct + r")", r" \1", text)
    # Pad on the right only where no whitespace follows.  This must be a
    # lookahead: a lookbehind placed after the group would test the mark
    # itself, always succeed, and double existing spaces.
    text = re.sub(r"(" + punct + r")(?!\s)", r"\1 ", text)
    return text
def _print_usage():
    """Print the command-line help text."""
    print("normwix.py normalize and tokenize text in wixárika (huichol) ")
    print("language. It has GPL licence, so feel free to share it.")
    print(" normwix.py [-a|-n|-t|-p|-h] inputfile [outputfile]")
    print(" -a all: normalize and tokenize")
    print(" -n normalize")
    print(" -t tokenize")
    print(" -p print output")
    print(" -h this help")


def _main(argv):
    """Run the command-line interface.

    Args:
        argv: Full argument vector, program name first (as in sys.argv).

    The first argument is treated as the option string when it starts
    with "-"; otherwise "all" (normalize and tokenize) is assumed and the
    first argument is the input file.  The result is written to the
    output file when one is given and "-p" is not set, and printed
    otherwise.
    """
    args = argv[1:]
    if not args:
        _print_usage()
        sys.exit()

    if args[0].startswith("-"):
        op, paths = args[0], args[1:]
    else:
        # No option given: default to "all".
        op, paths = "a", args

    wants_help = "h" in op
    if wants_help or not paths:
        _print_usage()
        # A missing input file is a usage error; asking for help is not.
        sys.exit(0 if wants_help else 2)

    infile = paths[0]
    outfile = None
    if "p" not in op and len(paths) > 1:
        outfile = paths[1]

    # Read the input completely before touching the output file, so that a
    # failed read (or infile == outfile) cannot truncate data first.
    with open(infile, "r") as source:
        text = source.read()

    if ("n" in op) or ("a" in op):
        text = normwix(text)
    if ("t" in op) or ("a" in op):
        text = tokenizewix(text)

    if outfile is None:
        print(text)
    else:
        with open(outfile, "w") as sink:
            sink.write(text)


if __name__ == "__main__":
    _main(sys.argv)