-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathahocorasick_example.py
48 lines (43 loc) · 1.26 KB
/
ahocorasick_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import math
import dataset
import ahocorasick
# Example usage of ahocorasick: report every dictionary word found in a text.

# `words` is used as a mapping (it is iterated via .items()); with proba=True
# the values are presumably per-word probabilities -- confirm against
# dataset.load_freq_words. The second return value is not needed here.
words, _ = dataset.load_freq_words(proba=True)

# Register each word with a (word, proba) payload so that a match yields both
# the matched word and its stored value.
# NOTE: an earlier version also computed a per-word log-probability here, but
# the result was never stored or read, so that dead computation was removed.
am = ahocorasick.Automaton()
for word, proba in words.items():
    am.add_word(word, (word, proba))
# Finalize the automaton once all words are added, before searching with it.
am.make_automaton()

text = "黑天鹅和灰犀牛是两个突发性事件"
# Each hit is (end_idx, payload); per the sample output recorded in this file,
# end_idx is the index of the last character of the matched word in `text`.
for end_idx, (word, proba) in am.iter(text):
    print(end_idx, word, proba)
"""
0 黑 0.0001879472409333384
1 黑天 7.820042779627218e-07
1 天 0.0005986325939749099
2 黑天鹅 4.9915166678471605e-08
2 天鹅 9.184390668838775e-06
2 鹅 1.1680149002762355e-05
3 和 0.009247866122464898
4 灰 4.780209128908297e-05
5 灰犀 0.0
5 犀 2.2129057227455743e-06
6 灰犀牛 4.9915166678471605e-08
6 犀牛 2.645503833958995e-06
6 牛 0.0001054707471916105
7 是 0.013260646202080588
8 两 0.0007156337446692474
9 两个 0.0004816314432805725
9 个 0.0020887500648273227
10 突 3.261124222993478e-05
11 突发 1.484144289239889e-05
11 发 0.00028019046895515394
12 突发性 1.2146023891761423e-06
12 发性 1.1646872224976707e-07
12 性 0.0002303917709989321
13 性事 2.49575833392358e-07
13 事 0.0007448840323428318
14 事件 0.00012608571102981926
14 件 0.00010785003680328432
"""