# -*- coding: utf-8 -*-
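"""Simple collaborative-filtering recommender for Habrahabr favorites.

Scrapes users' favorite posts, builds per-user preference sets, and recommends
unseen posts scored by Jaccard similarity between users and normalized by post
popularity (see give_recommendations).
"""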
from __future__ import print_function
from bs4 import BeautifulSoup
import urllib3
import locale
import sys
sys.path.append("src/")  # make the project's helper modules importable
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
import re
import csv
# my functions
from network import get_title
from draw import jaccard_index
def extract_recommender_info(article):
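    """Extract the numeric post id from a favorites-page article block; None if not found."""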
    rowData = article.find(class_="post_title")['href']
    matched = re.search(r"\d\d\d\d+", rowData)
    if not matched:
        return None
    post_id = matched.group(0)
    return int(post_id)
def get_fav_by_username(username):
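    """Scrape the ids of all posts a Habrahabr user has favorited, walking the paginated favorites list."""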
    http = urllib3.PoolManager()
    url = "http://habrahabr.ru/users/" + username + "/favorites/page"
    MAX_PAGES = 250
    divclass = "post"
    i = 1
    post_ids = []
    while i <= MAX_PAGES:
        page_url = url + str(i)
        try:
            response = http.request('GET', page_url, headers={"cache-control": "no-cache"})
        except urllib3.exceptions.HTTPError:
            print("ERROR: " + page_url)
            return None
        html = response.data.decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        articles = soup.find_all(class_=divclass)
        if articles:
            for article in articles:
                post_id = extract_recommender_info(article)
                if post_id:
                    post_ids.append(post_id)
            i += 1
        else:
            break
    return post_ids
def extract_author_commenters(post_id):
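    """Return the set of usernames (the author plus all commenters) seen on a post page."""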
    usernames = set()
    http = urllib3.PoolManager()
    url = "http://habrahabr.ru/post/" + str(post_id)
    try:
        response = http.request('GET', url, headers={"cache-control": "no-cache"})
        html = response.data.decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")
        author = soup.find(class_="author").a.text
        usernames.add(author)
        commentators = soup.find_all(class_="username")
        for user in commentators:
            usernames.add(user.text)
        return usernames
    except Exception:
        # the post may be missing or have an unexpected layout; treat it as empty
        return set()
def get_all_user_names():
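    """Crawl a range of post ids and checkpoint every author/commenter username found."""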
    # first posts from 2013
    log = open("username_log.txt", "w")
    first_post_id = 165001
    last_post_id = 223000
    users = set()
    for post_id in range(first_post_id, last_post_id):
        print(post_id, end="", file=log)
        newUsers = extract_author_commenters(post_id)
        if newUsers:
            print(" OK", file=log)
            # checkpoint the full user set after every post that yields names
            users_file = open("users_file.txt", "w")
            users |= newUsers
            users_file.write(str(users))
            users_file.close()
        else:
            print(" Empty", file=log)
        log.flush()
    log.close()
def get_all_users_fav():
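    """Download favorites for every known user, skipping users already processed or known to be empty."""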
    fav_filename = "data/recommender/user_fav.csv"
    empty_users_file = open("empty_users.txt", "a+")
    empty_users_file.seek(0)  # "a+" positions at the end of the file; rewind before reading
    empty_users = empty_users_file.read().splitlines()
    users = eval(open("data/recommender/user_list.txt").read())  # file holds a Python literal with the user names
    user_fav_csv = open(fav_filename, "a+")
    downloaded_users = read_all_usernames(fav_filename)
    for user in users:
        if user not in downloaded_users and user not in empty_users:
            print("Processing: " + user)
            favs = get_fav_by_username(user)
            if favs:
                print(user + " OK")
                print(user + ',' + ' '.join([str(e) for e in favs]), file=user_fav_csv)
                user_fav_csv.flush()
            else:
                print(user + " EMPTY")
                print(user, file=empty_users_file)
                empty_users_file.flush()
    user_fav_csv.close()
    empty_users_file.close()
def read_all_usernames(filename):
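    """Return the usernames already present in a 'username,post_ids' CSV file."""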
    usernames = []
    data = open(filename).readlines()
    for line in data:
        username, post_ids = line.split(',')
        usernames.append(username)
    return usernames
def read_preferences():
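    """Load each user's favorite post ids from user_favorites.csv into a dict of sets."""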
    filename = "./data/recommender/user_favorites.csv"
    data = open(filename).read().splitlines()
    header = True
    preferences = {}
    for line in data:
        if header:  # skip the column header line
            header = False
            continue
        username, str_post_ids = line.split(',')
        post_ids = [int(e) for e in str_post_ids.split(' ')]
        preferences[username] = set(post_ids)
    return preferences
def read_weigths():
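    """Load per-post favorite counts (post popularity weights) from post_counts.csv."""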
    count_file = open("./data/recommender/post_counts.csv", "r")
    count_reader = csv.reader(count_file)
    counts = {}
    for row in count_reader:
        counts[int(row[0])] = int(row[1])
    count_file.close()
    return counts
def get_all_posts(preferences):
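    """Return the union of all post ids that appear in any user's preferences."""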
    all_posts = set()
    for user, pref in preferences.items():
        all_posts |= pref
    return all_posts
def compute_weigths():
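    """Count, for every post, how many users favorited it and write the counts to post_counts.csv."""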
    preferences = read_preferences()
    all_posts = get_all_posts(preferences)
    count_file = open("./data/recommender/post_counts.csv", "w")
    count_writer = csv.writer(count_file)
    for post in all_posts:
        print(post)
        # count how many users have favorited this post
        count = 0
        for user, pref in preferences.items():
            if post in pref:
                count += 1
        count_writer.writerow([post, count])
        count_file.flush()
    count_file.close()
def transform_all_csv():
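    """Copy the first two columns (post id, title) of all.csv into titles.csv."""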
    file_in = open("data/recommender/all.csv", "r")
    csv_reader = csv.reader(file_in)
    file_out = open("data/recommender/titles.csv", "w")
    csv_writer = csv.writer(file_out)
    for row in csv_reader:
        csv_writer.writerow([row[0], row[1]])
    file_in.close()
    file_out.close()
def read_titles():
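    """Load the post id -> title mapping from titles.csv."""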
    titles = {}
    title_file = open("data/recommender/titles.csv", "r")
    title_reader = csv.reader(title_file)
    for row in title_reader:
        titles[int(row[0])] = row[1]
    title_file.close()
    return titles
def download_new_titles():
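    """Fetch and append titles for favorited posts above the hard-coded id cutoff."""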
    preferences = read_preferences()
    all_posts = get_all_posts(preferences)
    file_out = open("data/recommender/titles.csv", "a")
    csv_writer = csv.writer(file_out)
    for post in all_posts:
        if post > 218345:  # hard-coded cutoff: only fetch titles not downloaded earlier
            title = get_title(post)
            csv_writer.writerow([post, title])
            file_out.flush()
    file_out.close()
def get_titles(ids):
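    """Pair each post id with its title, using the cached titles dict and falling back to a live fetch."""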
    answer = []
    for post_id in ids:
        if post_id in titles:
            answer.append((post_id, titles[post_id]))
        else:
            answer.append((post_id, get_title(post_id)))
    return answer
# these module-level loads must run for give_recommendations to work
print("reading preferences")
preferences = read_preferences()
print("reading weights")
weights = read_weigths()
print("reading titles")
titles = read_titles()
def give_recommendations(username):
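    """Recommend posts for username: score each unseen post by the summed Jaccard
    similarity of the users who favorited it, normalized by post popularity."""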
    preference = preferences[username]
    rank = {}
    for user_other, preference_other in preferences.items():
        if username != user_other:
            similarity = jaccard_index(preference, preference_other)
            if not similarity:
                continue
            for post in preference_other:
                if post not in preference:
                    rank.setdefault(post, 0)
                    rank[post] += similarity
    # normalize by post popularity and convert to a sortable (score, post_id) list
    post_list = [(similarity / weights[post], post) for post, similarity in rank.items()]
    post_list.sort(reverse=True)
    print(post_list[1:10])
    recommendation_ids = [post_id for similarity, post_id in post_list]
    first_answer = get_titles(recommendation_ids[:3])
    other_answer = recommendation_ids[4:21]
    return (first_answer, other_answer)
print("giving recommendation")
first, other = give_recommendations("Zelenyikot")