-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhierarchical_clustering.py
88 lines (70 loc) · 2.95 KB
/
hierarchical_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from utils import *
from rhyme_metric import *
class HierarchicalClustering:
    """Filesystem locations used when clustering one named dataset."""

    def __init__(self, dataset_name) -> None:
        """Derive the input, output and reports paths from the dataset name.

        Args:
            dataset_name: Name appended to both ``input/`` and ``output/``.
        """
        input_dir = '/'.join(('input', dataset_name))
        output_dir = '/'.join(('output', dataset_name))

        self.dataset_name = dataset_name
        self.input_path = input_dir
        self.output_path = output_dir
        # Reports live in a fixed sub-directory of the output location.
        self.reports_path = '/'.join((output_dir, 'reports'))
def cluster_level_nodes(input_path='input'):
    """Build the level-0 clusters from the vocabulary file and save them.

    Reads ``<input_path>/vocab.txt``, takes the first space-separated token
    of each line as a word, groups the words with ``merge_nearest_words`` and
    writes the result to ``hierarchical_clusters0.json``.

    Args:
        input_path: Directory that contains ``vocab.txt``. The previous body
            read ``self.input_path`` although this function has no ``self``,
            so every call raised ``NameError``; the directory is now passed
            explicitly.
            NOTE(review): the default ``'input'`` is a placeholder -- confirm
            the intended directory (``HierarchicalClustering`` uses
            ``'input/' + dataset_name``).
    """
    content = read_file(input_path + '/vocab.txt')

    words = []
    for line in content.split("\n"):
        token = line.split(' ')[0]
        # A trailing newline or blank line would otherwise add an empty
        # "word" to the clustering input.
        if token.strip():
            words.append(token)
    del content  # the raw text is no longer needed while clustering runs

    clusters = merge_nearest_words(words)
    json_write(clusters, 'hierarchical_clusters0.json')
def hierarchical_level(i):
    """Build clustering level ``i + 1`` from the saved clusters of level ``i``.

    Loads ``hierarchical_clusters<i>.json``, reduces every cluster to a single
    pattern by folding its words together with ``rhyme_pattern``, groups those
    patterns with ``merge_nearest_words`` and writes the result to
    ``hierarchical_clusters<i + 1>.json``.

    Args:
        i: Index of the level to read; the output file is numbered ``i + 1``.

    Raises:
        IndexError: If a loaded cluster is empty (it has no first word to
            start the pattern from).
    """
    clusters_simple = json_read('hierarchical_clusters' + str(i) + '.json')

    patterns = []
    for cluster_words in clusters_simple:
        # Seed with the first word, then fold in the remaining words one by
        # one; a single-word cluster keeps that word as its pattern.
        pattern = cluster_words[0]
        for word in cluster_words[1:]:
            pattern = rhyme_pattern(pattern, word)
        patterns.append(pattern)

    upper_level_clusters = merge_nearest_words(patterns)
    json_write(upper_level_clusters, 'hierarchical_clusters' + str(i + 1) + '.json')
def merge_nearest_words(words):
    """Greedily group each word with its best-scoring remaining neighbours.

    Words are visited in input order. Each word that has not already been
    absorbed into a group becomes a seed: it is scored against every word
    still pending with ``rhyme_similarity``, and all pending words tied for
    the highest score (which must be greater than 0) join its group. A seed
    with no positively scored neighbour forms a group on its own.

    Args:
        words: List of items to group; it is copied, not modified.

    Returns:
        A list of groups, each a list whose last element is the seed.
    """
    pending = words.copy()
    groups = []

    for seed in words:
        if seed not in pending:
            # Already absorbed into an earlier group.
            continue
        pending.remove(seed)

        best_score = 0
        nearest = []
        for candidate in pending:
            score = rhyme_similarity(seed, candidate)
            if score > best_score:
                # A strictly better score discards the previous ties.
                best_score = score
                nearest = [candidate]
            elif (score == best_score) and (score > 0):
                nearest.append(candidate)

        for member in nearest:
            pending.remove(member)

        nearest.append(seed)
        groups.append(nearest)

        # Progress report every tenth group.
        done = len(groups)
        if (done % 10) == 0:
            print("Cluster" + str(done) + ' Remained words:' + str(len(pending)))

    return groups
# Script entry, executed whenever this module is run or imported.
# The level-0 step is disabled; hierarchical_level(0) reads
# hierarchical_clusters0.json, the file cluster_level_nodes() writes.
# cluster_level_nodes()
# Build hierarchical_clusters1.json from hierarchical_clusters0.json.
hierarchical_level(0)