cfscrapper.py
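# Usage (a sketch inferred from the sys.argv and os.chdir handling below): run the
# script with a Codeforces handle as its only argument, from a directory that already
# contains CodeForces/csvdatabase; the CodeForces/Code/<handle> folder is created on
# demand. The handle below comes from the commented-out example in the script:
#
#   python cfscrapper.py Balance_Breaker
#
# Accepted submissions are saved under CodeForces/Code/<handle>/ and the list of
# solved problems is written to CodeForces/csvdatabase/<handle>.csv.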
import requests
from bs4 import BeautifulSoup
import sys
import os
import pyprind
import pandas as pd
import glob
from os.path import basename
# basename() can now be called directly
path = os.getcwd()
# user = "Balance_Breaker"
user = sys.argv[1]

# Skip the scrape entirely if a CSV for this handle already exists.
already_scraped = False
os.chdir(path + "/CodeForces/csvdatabase")
for file in glob.glob("*.csv"):
    if basename(file) == user + '.csv':
        print('already created csv file')
        already_scraped = True
os.chdir(path)

if not already_scraped:
    # Read the pagination widget on the first submissions page to find out
    # how many pages of submissions the user has.
    ur = "http://codeforces.com/submissions/" + user + "/page/1"
    source_code = requests.get(ur)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "lxml")
    page = soup.find_all('div', {'class': 'pagination'})[1]
    page = page.find_all('span')
    pages = page[-1].findAll('a')
    pagenos = pages[0].text
    num = int(pagenos)

    # Count the rows on the last page so the progress bar has an exact total
    # (the code assumes every full page holds 50 submissions).
    url = 'http://codeforces.com/submissions/' + user + '/page/%d' % num
    code = requests.get(url)
    plain_text = code.text
    soup = BeautifulSoup(plain_text, "lxml")
    section = soup.findAll('table', {'class': "status-frame-datatable"})
    links = section[0].find_all('tr')
    lastrownos = len(links)

    problist = []
    cnt = (num - 1) * 50 + lastrownos - 1
    pbar = pyprind.ProgBar(cnt)

    # Walk every submissions page and download the source of each accepted run.
    for j in range(1, num + 1):
        url = 'http://codeforces.com/submissions/' + user + '/page/%d' % j
        code = requests.get(url)
        plain_text = code.text
        soup = BeautifulSoup(plain_text, "lxml")
        section = soup.findAll('table', {'class': "status-frame-datatable"})
        links = section[0].find_all('tr')
        for i in range(1, len(links)):  # row 0 is the table header
            pbar.update()
            row = links[i]
            sub_id = row['data-submission-id']
            data = row.find_all('td')
            verdict = data[5]
            span = verdict.findAll('span', {'class': 'verdict-accepted'})
            if len(span) > 0:
                # Pick a file extension from the language column.
                lang = data[4].text
                if lang.find('C++') >= 0:
                    lan = 'cpp'
                elif lang.find('Java') >= 0:
                    lan = 'java'
                elif lang.find('C') >= 0:
                    lan = 'c'
                else:
                    lan = 'py'
                # Split the problem link into path segments; segments 3 and 4
                # give the contest number and problem index.
                val = data[3].find_all('a')[0]
                parts = val['href'].split('/')
                contest = parts[3]
                prt = parts[4]
                # Fetch the submission page and pull the program source out of it.
                url = 'http://codeforces.com/contest/' + contest + '/submission/' + sub_id
                code = requests.get(url).text
                soup1 = BeautifulSoup(code, "lxml")
                source = soup1.findAll('pre', {'class': 'program-source'})
                try:
                    problem_code = source[0].text
                except IndexError:
                    continue
                name = contest + prt + "." + lan
                problist.append(contest + prt)
                newpath = './CodeForces/Code/' + user
                if not os.path.exists(newpath):
                    os.makedirs(newpath)
                with open(newpath + '/' + name, 'w') as f:
                    f.write(problem_code)

    # Record which problems were saved so the next run can skip this handle.
    df = pd.DataFrame(problist)
    df.to_csv('./CodeForces/csvdatabase/' + user + '.csv', index=False)