-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgscholar.py
executable file
·221 lines (193 loc) · 7.28 KB
/
gscholar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python
# gscholar - Get bibtex entries from Google Scholar
# Copyright (C) 2011 Bastian Venthur <venthur at debian org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Library to query Google Scholar.
Call the method query with a string which contains the full search string.
Query will return a list of bibtex items.
"""
import urllib2
import re
import hashlib
import random
import sys
import os
import subprocess
import optparse
import logging
from htmlentitydefs import name2codepoint
# Fake google id (looks like it is a 16 elements hex): the first 16 hex
# digits of the md5 digest of a random number, created once per import.
google_id = hashlib.md5(str(random.random())).hexdigest()[:16]
# Base url; query() appends the request path to it.
GOOGLE_SCHOLAR_URL = "http://scholar.google.com"
# The cookie normally looks like:
# 'Cookie' : 'GSP=ID=%s:CF=4' % google_id }
# where CF is the format (e.g. bibtex). Since we don't know the format yet, we
# have to append it later (query() adds the ":CF=<format>" part).
HEADERS = {'User-Agent' : 'Mozilla/5.0',
'Cookie' : 'GSP=ID=%s' % google_id }
# Output formats. The number is the value query() sends as the CF field of
# the cookie; get_links() uses it to pick the matching export links.
FORMAT_BIBTEX = 4
FORMAT_ENDNOTE = 3
FORMAT_REFMAN = 2
FORMAT_WENXIANWANG = 5
def query(searchstr, outformat, allresults=False):
    """Query Google Scholar and return the matching citation entries.

    searchstr  -- the full search string; it is URL-quoted before it is sent.
    outformat  -- one of the FORMAT_* constants. It is sent as the CF field
                  of the cookie and selects which export links are followed.
    allresults -- if false (the default) only the first export link of the
                  result page is followed, otherwise all of them.

    Returns a list with the raw response body of every followed link; the
    list is empty if the result page contains no export link. Errors raised
    by urllib2 while fetching are not caught here.
    """
    logging.debug("Query: %s" % searchstr)
    url = GOOGLE_SCHOLAR_URL + '/scholar?q=' + urllib2.quote(searchstr)
    # Work on a copy: appending the format to the shared HEADERS dict would
    # make the cookie grow (":CF=4:CF=4...") with every call.
    header = dict(HEADERS)
    header['Cookie'] = header['Cookie'] + ":CF=%d" % outformat
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        response.close()
    # decode() returns a new object, so the result has to be kept; bytes
    # outside of ascii are dropped.
    html = html.decode('ascii', 'ignore')
    # grab the links
    links = get_links(html, outformat)
    if not allresults:
        links = links[:1]
    # follow the links to get the entries
    result = []
    for link in links:
        request = urllib2.Request(GOOGLE_SCHOLAR_URL + link, headers=header)
        response = urllib2.urlopen(request)
        try:
            bib = response.read()
        finally:
            response.close()
        # Report via logging instead of printing: the caller decides what
        # is written to stdout.
        logging.debug("Received entry:\n%s" % bib)
        result.append(bib)
    return result
def get_links(html, outformat):
    """Return a list of reference links from the html.

    html      -- markup of a result page.
    outformat -- one of FORMAT_BIBTEX, FORMAT_ENDNOTE, FORMAT_REFMAN or
                 FORMAT_WENXIANWANG; it selects the kind of link to collect.

    Returns the href values of the matching links in the order they appear,
    with named html entities (e.g. "&amp;") replaced by their characters.
    Raises ValueError if outformat is none of the known formats.
    """
    # file extension used in the export links of each format
    extensions = {
        FORMAT_BIBTEX: 'bib',
        FORMAT_ENDNOTE: 'enw',
        FORMAT_REFMAN: 'ris',
        FORMAT_WENXIANWANG: 'ral',
    }
    if outformat not in extensions:
        raise ValueError("Unknown output format: %r" % (outformat,))
    refre = re.compile(r'<a href="(/scholar\.%s\?[^>]*)">'
                       % extensions[outformat])
    reflist = refre.findall(html)
    # unescape html entities; the pattern is the same for every link, so it
    # is compiled only once
    entityre = re.compile('&(%s);' % '|'.join(name2codepoint))
    return [entityre.sub(lambda m: unichr(name2codepoint[m.group(1)]), s)
            for s in reflist]
def convert_pdf_to_txt(pdf):
    """Convert a pdf file to text and return the text.

    The external program pdftotext is called for the conversion, so it has
    to be installed.
    """
    # "-" as output file makes pdftotext write to stdout
    command = ["pdftotext", "-q", pdf, "-"]
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    output, _ = process.communicate()
    return output
def pdflookup(pdf, allresults, outformat):
    """Look a pdf up on google scholar and return the found entries.

    The first 20 words of the text extracted from the pdf are used as the
    search string; allresults and outformat are handed on to query().
    """
    text = convert_pdf_to_txt(pdf)
    # every non alphanumeric character becomes a blank, split() then takes
    # care of runs of blanks and of leading or trailing ones
    text = re.sub(r"\W", " ", text)
    gsquery = " ".join(text.split()[:20])
    return query(gsquery, outformat, allresults)
def _get_bib_element(bibitem, element):
"""Return element from bibitem or None."""
lst = [i.strip() for i in bibitem.split("\n")]
for i in lst:
if i.startswith(element):
value = i.split("=", 1)[-1]
value = value.strip()
while value.endswith(','):
value = value[:-1]
while value.startswith('{') or value.startswith('"'):
value = value[1:-1]
return value
return None
def rename_file(pdf, bibitem):
    """Attempt to rename pdf according to bibitem.

    pdf     -- path of the file to rename.
    bibitem -- bibtex entry the new name is taken from.

    The new name is "<year> - <author> - <title>.pdf", where author is the
    part of the author field before the first comma and fields missing in
    bibitem are left out. The file stays in its directory. The user is asked
    on stdin for confirmation and the file is only renamed on the answer
    "y"; errors raised by os.rename are not caught here.
    """
    year = _get_bib_element(bibitem, "year")
    author = _get_bib_element(bibitem, "author")
    if author:
        author = author.split(",")[0]
    title = _get_bib_element(bibitem, "title")
    parts = [part for part in (year, author, title) if part]
    filename = " - ".join(parts) + ".pdf"
    # A path separator inside a field (e.g. a title with "/") would be taken
    # as a directory by os.rename.
    for separator in (os.sep, os.altsep):
        if separator:
            filename = filename.replace(separator, "-")
    # Build the path from the directory instead of replacing text in pdf: a
    # replace would also hit directory names which contain the old name.
    newfile = os.path.join(os.path.dirname(pdf), filename)
    print("")
    print("Will rename:")
    print("")
    print(" %s" % pdf)
    print("")
    print("to")
    print("")
    print(" %s" % newfile)
    print("")
    print("Proceed? [y/N]")
    answer = raw_input()
    if answer == 'y':
        print("Renaming %s to %s" % (pdf, newfile))
        os.rename(pdf, newfile)
    else:
        print("Aborting.")
if __name__ == "__main__":
    # Command line front end: the single argument is looked up as a pdf if
    # it is the path of an existing file, as a search string otherwise.
    usage = 'Usage: %prog [options] {pdf | "search terms"}'
    parser = optparse.OptionParser(usage)
    # The defaults of the flags have to be the boolean False: the string
    # "False" is true in a boolean context.
    parser.add_option("-a", "--all", action="store_true", dest="all",
                      default=False, help="show all bibtex results")
    parser.add_option("-d", "--debug", action="store_true", dest="debug",
                      default=False, help="show debugging output")
    parser.add_option("-r", "--rename", action="store_true", dest="rename",
                      default=False, help="rename file (asks before doing it)")
    parser.add_option("-f", "--outputformat", dest='output',
                      default="bibtex", help="Output format. Available formats are: bibtex, endnote, refman, wenxianwang [default: %default]")
    (options, args) = parser.parse_args()
    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
    formats = {
        'bibtex': FORMAT_BIBTEX,
        'endnote': FORMAT_ENDNOTE,
        'refman': FORMAT_REFMAN,
        'wenxianwang': FORMAT_WENXIANWANG,
    }
    # parser.error() prints the message and exits, no sys.exit() is needed
    if options.output not in formats:
        parser.error("Unknown output format: %s" % options.output)
    outformat = formats[options.output]
    if not args:
        parser.error("No argument given, nothing to do.")
    if len(args) > 1:
        parser.error("Too many arguments given, expected exactly one.")
    target = args[0]
    pdfmode = False
    if os.path.exists(target):
        logging.debug("File exist, assuming you want me to lookup the pdf: %s." % target)
        pdfmode = True
        biblist = pdflookup(target, options.all, outformat)
    else:
        logging.debug("Assuming you want me to lookup the query: %s." % target)
        biblist = query(target, outformat, options.all)
    if not biblist:
        print("No results found, try again with a different query!")
        sys.exit(1)
    if options.all:
        logging.debug("All results:")
        for entry in biblist:
            print(entry)
    else:
        logging.debug("First result:")
        print(biblist[0])
    if options.rename:
        if not pdfmode:
            print("You asked me to rename the pdf but didn't tell me which file to rename, aborting.")
            sys.exit(1)
        else:
            rename_file(target, biblist[0])