-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathapi_harvest.py
153 lines (115 loc) · 3.21 KB
/
api_harvest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import urllib
import urllib2
import socket
import base64
import time
import subprocess
import threading
import Queue
import shutil
import git
import gc
# Dependency check: try every required module and report each one that is
# missing, so the user sees the full list in a single run before we exit.
halt = False
try:
    import argparse
except ImportError:
    print('Missing needed module: easy_install argparse')
    halt = True
try:
    import psutil
except ImportError:
    print('Missing needed module: easy_install psutil')
    halt = True
try:
    import simplejson as json
except ImportError:
    print('Missing needed module: easy_install simplejson')
    halt = True
try:
    # NOTE(review): the plain `import git` earlier in this file runs first and
    # will already have raised if GitPython is absent, so this guard can only
    # fire if that import is removed or guarded too.
    from git import *
except ImportError:
    print('Missing needed module: easy_install gitpython')
    halt = True
if halt:
    # A missing dependency is a failure: exit with a non-zero status.
    sys.exit(1)

# Default timeout, in seconds, applied to sockets created after this point.
socket.setdefaulttimeout(120)

# Remaining GitHub API requests. This is only the starting value; get_repos()
# overwrites it from the X-RateLimit-Remaining header of each response.
rate_limit_left = 5000
def get_repos(last_seen):
    """Fetch one page of the public repository listing from the GitHub API.

    Requests ``https://api.github.com/repositories?since=<last_seen>`` and
    updates the module-level ``rate_limit_left`` from the response's
    ``X-RateLimit-Remaining`` header.

    Args:
        last_seen: Value interpolated into the ``since`` query parameter
            (the caller passes a repository id).

    Returns:
        The raw response body exactly as returned by ``read()``.

    Raises:
        ValueError: If the rate-limit header is present but not an integer.
        Any error raised by ``urllib2.urlopen`` or ``read()`` propagates.
    """
    global rate_limit_left
    url = 'https://api.github.com/repositories?since=%s' % (last_seen,)
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1')
    page = urllib2.urlopen(req)
    try:
        page_content = page.read()
        remaining = page.info().get('X-RateLimit-Remaining')
    finally:
        # Release the connection even when reading the body fails.
        page.close()
    if remaining is not None:
        # Header values are strings; store an int so that numeric comparisons
        # such as ``rate_limit_left > 5`` behave as intended. If the header is
        # absent the previous value is kept.
        rate_limit_left = int(remaining)
    return page_content
def setup():
parser = argparse.ArgumentParser()
#parser.add_argument('-u', '--username', action='store', dest='username', required=True, help='github username')
#parser.add_argument('-p', '--password', action='store', dest='password', required=True, help='github password')
parser.add_argument('-o', '--output', action='store', dest='output', required=True, help='base path for output')
global args
args = parser.parse_args()
def worker(itm):
    """Clone a single repository below the configured output directory.

    Cloning is best-effort: a failure is reported on stderr and the function
    returns normally, so one bad repository does not abort the run.

    Args:
        itm: Mapping with a ``'clone_url'`` entry (URL passed to git clone)
            and a ``'path'`` entry (destination relative to ``args.output``).
    """
    try:
        o_path = os.path.join(args.output, itm['path'])
        res = git.Git().clone(itm['clone_url'], o_path)
        print('%s\n' % res)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit are not swallowed, and say what failed
        # instead of silently ignoring it.
        sys.stderr.write('clone failed for %r: %s\n' % (itm, exc))
def _clone_items(repos):
    """Build the work items (clone URL plus relative path) passed to worker()."""
    return [
        {
            'clone_url': 'git://github.com/%s.git' % repo['full_name'],
            'path': repo['full_name'],
        }
        for repo in repos
    ]


def _clone_batch(items, max_active=10):
    """Run worker() on each item in its own thread and wait for all of them."""
    threads = []
    for item in items:
        # Hold back new clones while more than max_active threads are alive.
        # Sleep between checks instead of spinning at full CPU.
        while threading.activeCount() > max_active:
            time.sleep(0.1)
        t = threading.Thread(target=worker, args=(item,))
        t.daemon = True
        t.start()
        threads.append(t)
    for t in threads:
        t.join()


def _write_progress(remaining, last_seen):
    """Overwrite logfile.txt with the current rate limit and resume point."""
    with open('logfile.txt', 'w') as log:
        log.write('Current rate_limit_left: %s Current last_seen: %s...' % (remaining, last_seen))


def main():
    """Harvest public GitHub repositories page by page until the rate limit is low.

    Parses the command line, then repeatedly fetches a page of repositories
    with get_repos(), clones each one in a worker thread, and records
    progress in ``logfile.txt``. The loop ends when ``rate_limit_left`` drops
    to 5 or below, or when the API returns an empty page. The process then
    exits via ``sys.exit()``.
    """
    setup()
    # Repository id passed as the ``since`` parameter of the first request.
    last_seen = 441803
    # int() guards against rate_limit_left being stored as a header string,
    # which would make the comparison with 5 meaningless.
    while int(rate_limit_left) > 5:
        jsonrepos = json.loads(get_repos(last_seen))
        if not jsonrepos:
            # An empty page has no last element to resume from; stop here
            # rather than failing on jsonrepos[-1].
            print('No more repositories returned after last_seen: %s' % last_seen)
            break
        _clone_batch(_clone_items(jsonrepos))
        last_seen = jsonrepos[-1]['id']
        _write_progress(rate_limit_left, last_seen)
        # Drop the parsed page before fetching the next one.
        del jsonrepos
        gc.collect()
    else:
        print('I hit my rate limit last_seen: %s' % last_seen)
    sys.exit()
# Start harvesting only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()