forked from tedhchen/twitter_network_polarization_tools
words_related_to_topic.py
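"""Find words related to a seed word in a collection of Finnish tweets.

The script reads raw tweet JSON files from a data directory, cleans and
tokenizes the tweet texts (URL removal, lower-casing, stopword removal,
optional lemmatization via uralicNLP), and then computes, for every word
co-occurring with the seed word, the share of its total occurrences that
fall in tweets containing the seed word. The resulting normalized
co-occurrence scores are written to 'cooccurence_result_try.json'.
"""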
import json
import os
import re
from collections import Counter

from uralicNLP import uralicApi


def load_tweet_file(path):
    with open(path, "r") as read_file:
        tweets = json.load(read_file)
    return tweets


def load_stopwords():
    with open("stopwords.txt", "r") as read_file:
        list_of_words = read_file.readlines()
    stopwords = [w.strip("\n") for w in list_of_words]
    return stopwords


def remove_url(txt):
    text = re.sub(r"http\S+", "", txt)
    return text


def process_text(tweets, sw, LEM):
    processed_tweets = []
    for tweet in tweets:
        # For retweets, use the text of the original (retweeted) status
        if "retweeted_status" in tweet:
            raw_text = tweet["retweeted_status"]["text"]
        else:
            raw_text = tweet["text"]
        # Remove URLs
        text = remove_url(raw_text)
        # Lower case and split
        tokens = text.lower().split()
        # Remove stopwords
        tokens = [w for w in tokens if w not in sw]
        # Lemmatize (uralicApi.lemmatize returns a list of candidate lemmas per word)
        if LEM:
            tokens = [uralicApi.lemmatize(w, "fin") for w in tokens]
        processed_tweets.append(tokens)
    return processed_tweets


def word_frequency(all_tweets):
    flat_list = [item for sublist in all_tweets for item in sublist]
    word_counts = Counter(flat_list)
    return dict(word_counts)


def flatten(l):
    # Flatten each tweet's list of lemma lists into a single list of lemmas
    flattened = []
    for tweet in l:
        flat_list = [item for sublist in tweet for item in sublist]
        flattened.append(flat_list)
    return flattened


def main():
    # Path to the directory of raw tweet JSON files
    data_dir = "/run/user/1282311/gvfs/smb-share:server=data.triton.aalto.fi,share=scratch/cs/networks/ecanet/elections/raw_tweets/all_data"
    # Seed word to compute co-occurrences against
    seed_word = "yle"
    # Whether to lemmatize tokens with uralicNLP
    LEM = False
    ##########################################################################################################

    stopwords_fin = load_stopwords()
    files = [f for f in os.listdir(data_dir) if f.endswith('.json')]
    # Only process the first four files
    files = files[0:4]
    f_count = len(files)

    freq_dict_all = dict()
    selected_structs = []
    for f in files:
        with open(os.path.join(data_dir, f), "r") as read_file:
            data = json.load(read_file)
        all_tweets_processed = process_text(data, stopwords_fin, LEM)
        if LEM:
            all_tweets_processed = flatten(all_tweets_processed)
        # Accumulate word frequencies over all files
        freq_dict_batch = word_frequency(all_tweets_processed)
        freq_dict_all = Counter(freq_dict_all) + Counter(freq_dict_batch)
        # Keep the tweets that contain the seed word (plain or as a hashtag)
        for tweet in all_tweets_processed:
            if (seed_word in tweet) or (("#" + seed_word) in tweet):
                selected_structs.append(tweet)
        f_count -= 1
        print("Finished a file. Files remaining:", f_count)

    # Flatten the selected tweets and count how often each word co-occurs with the seed word
    flattened_structs = [item for sublist in selected_structs for item in sublist]
    struct_counts = list(Counter(flattened_structs).items())

    # Normalized occurrences: share of a word's total occurrences that fall in tweets with the seed word
    n_occ = dict()
    for s in struct_counts:
        if freq_dict_all[s[0]] > 50:  # Minimum total-frequency threshold; adjust here
            n_occ[s[0]] = (s[1] / freq_dict_all[s[0]], freq_dict_all[s[0]])

    # Save the normalized co-occurrence scores for the words that appear most often with the seed word
    sorted_dict = sorted(n_occ.items(), key=lambda kv: kv[1])
    with open('cooccurence_result_try.json', 'w', encoding='utf8') as fp:
        json.dump(sorted_dict, fp, ensure_ascii=False)


if __name__ == '__main__':
    main()
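
# Usage (a sketch, assuming stopwords.txt sits in the working directory and
# data_dir points at a directory of tweet JSON files):
#   python words_related_to_topic.py
# The output JSON is a list of [word, [normalized_score, total_count]] pairs,
# sorted in ascending order of the normalized co-occurrence score.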