-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathremove.py
37 lines (35 loc) · 1.09 KB
/
remove.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import logging
import os.path
import six
import sys
import re
import codecs
#reload(sys)
#sys.setdefaultencoding('utf8')
# Runs of whitespace, digits and ASCII/Spanish punctuation (first alternative)
# or CJK/typographic punctuation (second alternative) are each collapsed to a
# single space.  The pattern is a unicode literal so it works on both Python 2
# and Python 3 (calling ``.decode`` on a ``str`` literal fails on Python 3).
# NOTE: ';->' and the guillemet pair are *ranges* inside the first character
# class, so a literal ASCII hyphen is NOT stripped.  The evaluated pattern is
# deliberately identical to the one this script has always used.
_PUNCT_RE = re.compile(
    u"[\\s+\\.\\!\\/_,;-><¿#&«-»=|1234567890¡?():$%^*(+\"']+"
    u"|[+——!,。?、【】《》“”‘’~@#¥%……&*()'']+"
)


def clean_line(line):
    """Return *line* lower-cased with punctuation/digit runs replaced by a space.

    Each maximal run matched by one alternative of ``_PUNCT_RE`` becomes a
    single space.  The line's own newline is whitespace and is therefore
    replaced as well, so exactly one ``"\\n"`` is appended afterwards to keep
    one output line per input line.

    :param line: one line of unicode text, with or without a line terminator.
    :returns: the cleaned, lower-cased text, always ending in ``"\\n"``.
    """
    return (_PUNCT_RE.sub(u" ", line) + u"\n").lower()


def main(argv=None):
    """Clean a UTF-8 text file line by line and write the result.

    :param argv: argument vector ``[program, input_path, output_path]``;
        defaults to ``sys.argv``.
    :raises SystemExit: with status 1 when the argument count is not 3.
    :raises IOError: (``OSError`` on Python 3) when a file cannot be opened.
    """
    if argv is None:
        argv = sys.argv

    program = os.path.basename(argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(argv))

    if len(argv) != 3:
        print("Using: python remove.py xxx.txt xxxx.txt")
        sys.exit(1)
    in_path, out_path = argv[1:3]

    # Open the input first so a missing input file cannot truncate an
    # existing output file; ``with`` closes both handles even on error.
    with codecs.open(in_path, 'r', encoding='utf-8') as source, \
            codecs.open(out_path, 'w', encoding='utf-8') as sink:
        # Iterate lazily instead of readlines() so large corpora are not
        # loaded into memory at once.
        for count, line in enumerate(source, 1):
            sink.write(clean_line(line))
            if count % 10000 == 0:
                logger.info("Saved %d articles", count)

    logger.info("Finished removed words!")


if __name__ == '__main__':
    main()