-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcheck_years.py
58 lines (44 loc) · 1.51 KB
/
check_years.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import re
from bz2 import BZ2File as bzopen
from collections import defaultdict
def finde_ausdruck(ausdruck, text):
    """Locate the first occurrence of a regular expression in a string.

    Args:
        ausdruck: Regular expression to search for (anything accepted by
            ``re.search`` as its pattern argument).
        text: String to search in.

    Returns:
        A ``(begin, end)`` tuple with the start and end offsets of the first
        match, or ``(None, None)`` when the expression does not occur in
        ``text``.
    """
    match = re.search(ausdruck, text)
    if match is None:
        return None, None
    # span() yields (start, end) of the whole match in one call.
    return match.span()
# Candidate inputs. A mask containing "%" is expanded with the file index;
# a mask without "%" names one file that is reused for every index.
# filename_mask = "IT_Ethik_deep_learning_text_normalized"
# filename_mask = "norm/%.2d.txt"
filename_mask = "output/AA/wiki_%.2d.bz2"
num_range = 44

# A space-free token, a four-digit number starting with 1 (1000-1999) and
# another space-free token, separated by single spaces.
regex = "([^ ]+ 1[0-9]{3} [^ ]+)"

pres = defaultdict(int)   # token directly before the number -> occurrences
posts = defaultdict(int)  # token directly after the number -> occurrences

for x in range(num_range):
    filename = filename_mask % x if "%" in filename_mask else filename_mask
    print('Filename:', filename)
    with bzopen(filename) as bzin:
        for line in bzin:
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError:
                # Report the raw, undecodable bytes and continue with the
                # next line instead of aborting the whole run.
                print("Error in line:", line)
                continue

            # Remove the line terminator; otherwise a bare newline after
            # "<word> <year> " would be matched as the following token.
            line = line.rstrip('\r\n')

            # Only the first match of each line is counted.
            begin, end = finde_ausdruck(regex, line)
            if end is None:
                continue

            # The regex delimits its three parts by the space character only,
            # so splitting on ' ' (not on arbitrary whitespace) always yields
            # exactly three parts, even if a token contains a tab or NBSP.
            pre, _year, post = line[begin:end].split(' ')
            pres[pre] += 1
            posts[post] += 1

# Most frequent tokens first.
pres_sorted = sorted(pres.items(), key=lambda k: k[1], reverse=True)
posts_sorted = sorted(posts.items(), key=lambda k: k[1], reverse=True)

# Print the 200 most frequent tokens of each kind, joined by '"),("'.
print('pres:')
print('"),("'.join([elem[0] for elem in pres_sorted[:200]]))
print('posts:')
print('"),("'.join([elem[0] for elem in posts_sorted[:200]]))