import csv
import os
import re
from itertools import islice

import matplotlib.pyplot as plt
from github import Github, GithubException
from pypinyin import lazy_pinyin
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from wordcloud import WordCloud

# Ignore terms that appear in more than half of the bios (max_df=0.5) or in
# fewer than two bios (min_df=2), and drop common English stop words.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
def preprocessing(string, lowercase=True):
    '''
    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    '''
    # Keep English letters, digits, selected punctuation, and whitespace.
    string = re.sub(r'[^A-Za-z0-9(),.:;!?\-\'&]', ' ', string)
    # Split common English contractions into separate tokens.
    string = re.sub(r'\'s', ' \'s', string)
    string = re.sub(r'\'m', ' \'m', string)
    string = re.sub(r'\'ve', ' \'ve', string)
    string = re.sub(r'n\'t', ' n\'t', string)
    string = re.sub(r'\'re', ' \'re', string)
    string = re.sub(r'\'d', ' \'d', string)
    string = re.sub(r'\'ll', ' \'ll', string)
    # Space out punctuation, mapping colons to commas and semicolons to periods.
    string = re.sub(r'\(', ' ( ', string)
    string = re.sub(r'\)', ' ) ', string)
    string = re.sub(r',+', ' , ', string)
    string = re.sub(r':+', ' , ', string)
    string = re.sub(r'\.+', ' . ', string)
    string = re.sub(r';+', ' . ', string)
    string = re.sub(r'!+', ' ! ', string)
    string = re.sub(r'\?+', ' ? ', string)
    # Collapse runs of spaces and trim leading/trailing whitespace.
    string = re.sub(r' +', ' ', string).strip()
    if lowercase:
        return string.lower()
    return string
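
# A quick sanity check of the behaviour above (traced by hand, not from the
# original source):
#   preprocessing("I've worked on ML (NLP).")  ->  "i 've worked on ml ( nlp ) ."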
def getBioData():
    # following_pagerank.csv is expected to have a header row, with the
    # GitHub login in the second column of every following row.
    with open("following_pagerank.csv", encoding='utf-8', errors='ignore') as f:
        csvFile = csv.reader(f)
        user_list = [row[1] for row in islice(csvFile, 1, None)]
    bio_list = []
    # Never hard-code a personal access token; read it from the environment
    # instead (GITHUB_TOKEN is an assumed variable name).
    GIT = Github(os.environ["GITHUB_TOKEN"])
    # The original code fetched users in two slices, [0:199] and [200:399],
    # which silently skipped the users at indices 199 and 399; a single slice
    # covers the same 400 users without the gaps.
    for userid in user_list[:400]:
        try:
            print(userid)
            user = GIT.get_user(userid)
            if user.bio is not None:
                bio_list.append(preprocessing(user.bio))
        except GithubException as exc:
            print("error fetching %s: %s" % (userid, exc))
    print("bio list:")
    print(bio_list)
    return bio_list
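
# Fetching hundreds of profiles can exhaust the GitHub API quota. A minimal
# sketch for checking the remaining budget before a batch, using PyGithub's
# get_rate_limit(); the 100-request threshold mentioned below is an
# arbitrary assumption, not part of the original script.
def remainingCoreRequests(git):
    # The core resource covers ordinary REST calls such as get_user().
    return git.get_rate_limit().core.remaining
# Usage sketch: if remainingCoreRequests(GIT) drops below 100, pause before
# fetching more users.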
def calTfIdf():
    bio_list = getBioData()
    # X is a sparse matrix of shape (number of bios, vocabulary size).
    X = vectorizer.fit_transform(bio_list)
    print('tf-idf matrix shape: %s' % (X.shape,))
    return X
def kmeansClustering():
    X = calTfIdf()
    best_silhouette = -1
    # Try k from 20 to 30 and keep the model with the highest silhouette score.
    # n_init and random_state are pinned for reproducibility (the values are
    # arbitrary choices, not from the original script).
    for k in range(20, 31):
        km = KMeans(n_clusters=k, max_iter=100, n_init=10, random_state=42)
        kmc = km.fit_predict(X)
        silhouette_coefficient = silhouette_score(X, kmc)
        print('k: %d, silhouette coefficient: %.9f' % (k, silhouette_coefficient))
        if silhouette_coefficient > best_silhouette:
            best_k = k
            best_silhouette = silhouette_coefficient
            best_km = km
    print('The best number of clusters is %d' % best_k)
    return best_km, best_k
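
# Silhouette scores lie in [-1, 1]: values near 1 indicate compact, well
# separated clusters, values near 0 indicate overlapping clusters, so the
# loop above keeps the k whose labelling is best separated.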
def wordCloud():
    best_km, best_k = kmeansClustering()
    # Sort each centroid's term weights in descending order.
    order_centroids = best_km.cluster_centers_.argsort()[:, ::-1]
    # get_feature_names() was removed in recent scikit-learn releases;
    # get_feature_names_out() is the replacement. lazy_pinyin transliterates
    # any Chinese terms; since preprocessing() strips non-ASCII characters,
    # it is effectively a pass-through here.
    terms = lazy_pinyin(list(vectorizer.get_feature_names_out()))
    for i in range(best_k):
        cluster_item = ''
        print('Cluster %d:' % i, end='')
        # Collect the ten highest-weighted terms of this cluster.
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
            cluster_item = cluster_item + ' ' + terms[ind]
        print()
        wordcloud = WordCloud().generate(cluster_item)
        wordcloud.to_file('wordcloud%d.png' % i)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
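
# Running this module end to end reads following_pagerank.csv, fetches up to
# 400 bios from the GitHub API, clusters their TF-IDF vectors, and writes one
# wordcloud<i>.png per cluster, showing each figure as it is generated.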
if __name__ == '__main__':
    wordCloud()