Skip to content
This repository has been archived by the owner on May 30, 2021. It is now read-only.

use Cookie for effective bibtex extraction #87 #106

Merged
merged 3 commits into from
Jun 4, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ before_install:
- if ! go get code.google.com/p/go.tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi

install:
- go get github.com/docopt/docopt-go
- go get github.com/PuerkitoBio/goquery
- go get github.com/Sirupsen/logrus
- ./build

script:
- $HOME/gopath/bin/goveralls -repotoken $COVERALLS_TOKEN
Expand Down
18 changes: 18 additions & 0 deletions Godeps/Godeps.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 17 additions & 22 deletions article.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,29 @@ package goscholar

import (
"encoding/json"
"fmt"
log "github.com/Sirupsen/logrus"
"github.com/k0kubun/pp"
"strconv"
"strings"
)

// Article stores the parsed results from Google Scholar.
type Article struct {
Title *Title `json:"title"`
Year string `json:"year"`
ClusterId string `json:"cluster_id"`
NumCite string `json:"num_cite"`
NumVer string `json:"num_ver"`
InfoId string `json:"info_id"`
Link *Link `json:"link"`
Title *Title `json:"title"`
Year string `json:"year"`
ClusterId string `json:"cluster_id"`
NumCite string `json:"num_cite"`
NumVer string `json:"num_ver"`
InfoId string `json:"info_id"`
Link *Link `json:"link"`
BibTeX string `json:"bibtex"`
Author []string `json:"author"`
Journal string `json:"journal"`
Booktitle string `json:"booktitle"`
Volume string `json:"volume"`
Number string `json:"number"`
Pages string `json:"pages"`
Publisher string `json:"publisher"`
}

// Title is an attribute of Article.
Expand Down Expand Up @@ -46,20 +54,7 @@ func newArticle() *Article {

// String provides a pretty print.
func (a *Article) String() string {
ret := "[Title]\n"
ret += fmt.Sprintf(" Name: %v\n", a.Title.Name)
ret += fmt.Sprintf(" Url: %v\n", a.Title.Url)
ret += fmt.Sprintf("[Year]\n %v\n", a.Year)
ret += fmt.Sprintf("[ClusterId]\n %v\n", a.ClusterId)
ret += fmt.Sprintf("[NumCite]\n %v\n", a.NumCite)
ret += fmt.Sprintf("[NumVer]\n %v\n", a.NumVer)
ret += fmt.Sprintf("[InfoId]\n %v\n", a.InfoId)
ret += "[Link]\n"
ret += fmt.Sprintf(" Name: %v\n", a.Link.Name)
ret += fmt.Sprintf(" Url: %v\n", a.Link.Url)
ret += fmt.Sprintf(" Format: %v", a.Link.Format)

return ret
return pp.Sprint(a)
}

// Json provides JSON formatted Article.
Expand Down
28 changes: 9 additions & 19 deletions article_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ func init() {
Url: "http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_Martens10.pdf",
Format: "PDF",
},
BibTeX: "@inproceedings{martens2010deep, title={Deep learning via Hessian-free optimization}, author={Martens, James}, booktitle={Proceedings of the 27th International Conference on Machine Learning (ICML-10)}, pages={735--742}, year={2010}}",
Author: []string{"Martens, James"},
Journal: "",
Booktitle: "Proceedings of the 27th International Conference on Machine Learning (ICML-10)",
Volume: "",
Number: "",
Pages: "735--742",
Publisher: "",
}
}

Expand Down Expand Up @@ -68,30 +76,12 @@ func TestNewArticle(t *testing.T) {

func ExampleString() {
fmt.Println(article)
// Output:
// [Title]
// Name: Deep learning via Hessian-free optimization
// Url: http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_Martens10.pdf
// [Year]
// 2010
// [ClusterId]
// 15502119379559163003
// [NumCite]
// 260
// [NumVer]
// 9
// [InfoId]
// e6RSJHGXItcJ
// [Link]
// Name: wustl.edu
// Url: http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_Martens10.pdf
// Format: PDF
}

func ExampleJson() {
fmt.Println(article.Json())
// Output:
// {"title":{"name":"Deep learning via Hessian-free optimization","url":"http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_Martens10.pdf"},"year":"2010","cluster_id":"15502119379559163003","num_cite":"260","num_ver":"9","info_id":"e6RSJHGXItcJ","link":{"name":"wustl.edu","url":"http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_Martens10.pdf","format":"PDF"}}
// {"title":{"name":"Deep learning via Hessian-free optimization","url":"http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_Martens10.pdf"},"year":"2010","cluster_id":"15502119379559163003","num_cite":"260","num_ver":"9","info_id":"e6RSJHGXItcJ","link":{"name":"wustl.edu","url":"http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_Martens10.pdf","format":"PDF"},"bibtex":"@inproceedings{martens2010deep, title={Deep learning via Hessian-free optimization}, author={Martens, James}, booktitle={Proceedings of the 27th International Conference on Machine Learning (ICML-10)}, pages={735--742}, year={2010}}","author":["Martens, James"],"journal":"","booktitle":"Proceedings of the 27th International Conference on Machine Learning (ICML-10)","volume":"","number":"","pages":"735--742","publisher":""}
}

func TestIsValid(t *testing.T) {
Expand Down
40 changes: 40 additions & 0 deletions bibtex.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package goscholar

import (
"bytes"
"errors"
"fmt"
"net/http"
"strings"
)

// getBibTeX downloads the BibTeX entry served at url and returns it flattened
// onto a single line.
//
// The request is sent with the cookie "GSP=CF=4". Any response status other
// than 200 is reported as an error, as is a failure while reading the body.
// On success every newline is removed from the body and each pair of
// consecutive spaces is replaced by a single space, so an entry whose fields
// are indented by two spaces ends up separated by single spaces.
func getBibTeX(url string) (bibtex string, err error) {
	// TODO: add logging
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}

	req.AddCookie(&http.Cookie{Name: "GSP", Value: "CF=4"})

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	// Release the connection on every return path, including the error ones.
	defer res.Body.Close()

	// e.g., 403 Forbidden occurs when we remove "GSP=CF=4"
	if res.StatusCode != http.StatusOK {
		return "", errors.New(fmt.Sprintf("Bad status code: %v", res.StatusCode))
	}

	// Read the whole body; a truncated read must not be returned as a valid
	// entry.
	bufbody := new(bytes.Buffer)
	if _, err := bufbody.ReadFrom(res.Body); err != nil {
		return "", err
	}
	body := bufbody.String()

	// Drop newlines, then collapse the redundant (doubled) spaces left behind
	// by the per-line indentation.
	body = strings.Replace(body, "\n", "", -1)
	body = strings.Replace(body, "  ", " ", -1)

	return body, nil
}
13 changes: 13 additions & 0 deletions bibtex_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package goscholar

import (
"fmt"
)

// ExampleGetBibTeX fetches the BibTeX entry for a fixed Google Scholar info id
// and prints it. It performs a real HTTP request, so it needs network access.
func ExampleGetBibTeX() {
	url := "https://scholar.google.co.jp/scholar.bib?q=info:e6RSJHGXItcJ:scholar.google.com/&output=citation&hl=en&ct=citation"
	bibtex, err := getBibTeX(url)
	if err != nil {
		// Print the cause so a failed fetch shows up in the example's output
		// diff instead of as an unexplained empty line.
		fmt.Println(err)
		return
	}
	fmt.Println(bibtex)
	// Output:
	// @inproceedings{martens2010deep, title={Deep learning via Hessian-free optimization}, author={Martens, James}, booktitle={Proceedings of the 27th International Conference on Machine Learning (ICML-10)}, pages={735--742}, year={2010}}
}
10 changes: 6 additions & 4 deletions build
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@ export GOFMTPATH="./ cmd/goscholar"

eval $(go env)

if [ "--dev" = "$1" ]; then
gofmt -s -w -l $GOFMTPATH
$GOPATH/bin/godep save
fi

if ! [ -e $GOPATH/bin/godep ]; then
go get github.com/tools/godep
fi

$GOPATH/bin/godep restore

if [ "--fmt" = "$1" ]; then
gofmt -s -w -l $GOFMTPATH
fi
go get github.com/docopt/docopt-go # docopt-go is only used in CLI

go build
go build -o goscholar cmd/goscholar/main.go
39 changes: 39 additions & 0 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import (
"fmt"
"github.com/PuerkitoBio/goquery"
log "github.com/Sirupsen/logrus"
"github.com/sotetsuk/gobibtex"
"regexp"
"strings"
"time"
)

// ParseDocument sends the pointers of parsed Articles to the given channel.
Expand Down Expand Up @@ -35,6 +37,43 @@ func ParseSelection(s *goquery.Selection) (a *Article, err error) {
a.ClusterId, a.NumCite, a.NumVer, a.InfoId = parseBottom(s)
a.Link = parseSideBar(s)

if a.InfoId != "" {
time.Sleep(1.0 * time.Second) // TODO: make 1.0 parameter
a.BibTeX, err = getBibTeX(generateBibTeXLink(a.InfoId))
if err != nil {
return nil, err
}
}

if a.BibTeX != "" {
bibmap, err := gobibtex.Decode(a.BibTeX)
if err != nil {
return nil, err
}

if author, ok := bibmap["author"]; ok {
a.Author = author.([]string)
}
if journal, ok := bibmap["journal"]; ok {
a.Journal = journal.(string)
}
if booktitle, ok := bibmap["booktitle"]; ok {
a.Booktitle = booktitle.(string)
}
if volume, ok := bibmap["volume"]; ok {
a.Volume = volume.(string)
}
if number, ok := bibmap["number"]; ok {
a.Number = number.(string)
}
if pages, ok := bibmap["pages"]; ok {
a.Pages = pages.(string)
}
if publisher, ok := bibmap["publisher"]; ok {
a.Publisher = publisher.(string)
}
}

if !a.isValid() {
return nil, errors.New(fmt.Sprintf("\"%v\" is not a valid article", a.Title.Name))
}
Expand Down
1 change: 1 addition & 0 deletions parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func init() {
test_case2 := "./testdata/parse_test_case2.html"
doc2, err2 = loadDummyHtml(test_case2) // In actual case, use Fetch(url)

// TODO: Update test for bibtex information
// set a1Expected
a1Expected = &Article{
Title: &Title{
Expand Down
4 changes: 4 additions & 0 deletions text.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,7 @@ func enclosedInDoubleQuotation(s string) bool {
return false
}
}

// generateBibTeXLink builds the URL of the BibTeX citation export for the
// article identified by infoId. The result is the package-level scholar_url
// followed by the "scholar.bib" query for that id; infoId is inserted as-is,
// without any escaping.
func generateBibTeXLink(infoId string) (bibtexLink string) {
	const (
		queryPrefix = "scholar.bib?q=info:"
		querySuffix = ":scholar.google.com/&output=citation&hl=en&ct=citation"
	)

	bibtexLink = scholar_url + queryPrefix + infoId + querySuffix
	return bibtexLink
}
9 changes: 9 additions & 0 deletions text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,12 @@ func TestEnclosedInDoubleQuotation(t *testing.T) {
t.Error("%v is enclosed in double quotation")
}
}

// TestGenerateBibTeXLink checks that generateBibTeXLink produces the expected
// citation-export URL for a sample info id.
func TestGenerateBibTeXLink(t *testing.T) {
	id := "b2pGeL14LLMJ"
	want := "https://scholar.google.co.jp/scholar.bib?q=info:b2pGeL14LLMJ:scholar.google.com/&output=citation&hl=en&ct=citation"

	got := generateBibTeXLink(id)
	if got == want {
		return
	}
	t.Error(testErr{expected: want, actual: got})
}