Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

/contests/[contestID] のクローラーのテストを追加 #84

Merged
merged 10 commits into from
Jul 26, 2024
28 changes: 20 additions & 8 deletions crawler/common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ import (
// htmlMap holds named HTML fixtures for crawler tests, keyed by fixture name
// (presumably file names under the test data directory — see testHTMLMap).
type htmlMap map[string]string

func (m htmlMap) Get(key string) string {
if key == "no-html" {
return "no html"
if key == "not-a-html" {
return "no a html"
}
return m[key]
}
Expand Down Expand Up @@ -48,6 +48,12 @@ func testHTMLMap(t *testing.T, target string) htmlMap {
return m
}

// requestWant bundles the expected components of a captured HTTP request —
// URL path, query parameters, and form body — for assertion in request tests.
type requestWant struct {
path string
query url.Values
body url.Values
}

// mockRequestRoundTripper is an http.RoundTripper test double that records
// the last request passed through it so tests can inspect what was sent.
type mockRequestRoundTripper struct {
request *http.Request
}
Expand All @@ -60,20 +66,20 @@ func (m *mockRequestRoundTripper) RoundTrip(req *http.Request) (*http.Response,
}, nil
}

type captureFunc func() (method string, query, body *url.Values)
type captureFunc func() (method, path string, query, body url.Values)

func (m *mockRequestRoundTripper) lastCaputure() (string, *url.Values, *url.Values) {
q := m.request.URL.Query()
var body *url.Values
func (m *mockRequestRoundTripper) lastCaputure() (string, string, url.Values, url.Values) {
query := m.request.URL.Query()
body := url.Values{}
if m.request.Body != nil {
b, _ := io.ReadAll(m.request.Body)
if bt, err := url.ParseQuery(string(b)); err == nil {
body = &bt
body = bt
} else {
panic(errors.Wrapf(err, "cannot parse request body: %s", string(b)))
}
}
return m.request.Method, &q, body
return m.request.Method, m.request.URL.Path, query, body
}

func mockRequestClient() (*http.Client, captureFunc) {
Expand All @@ -84,6 +90,12 @@ func mockRequestClient() (*http.Client, captureFunc) {
return c, m.lastCaputure
}

// mockHTTPResponse describes the canned response a test case wants the mock
// transport to produce: an HTTP status code, the fixture file supplying the
// body, and whether the request should time out instead of responding.
type mockHTTPResponse struct {
status int
bodyFile string
timeout bool
}

type mockResponseRoundTripper struct {
status int
body string
Expand Down
5 changes: 3 additions & 2 deletions crawler/contest.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/meian/atgo/crawler/requests"
"github.com/meian/atgo/crawler/responses"
"github.com/meian/atgo/logs"
"github.com/meian/atgo/timezone"
"github.com/meian/atgo/url"
"github.com/pkg/errors"
)
Expand Down Expand Up @@ -86,11 +87,11 @@ func (c *Contest) parseTimes(ctx context.Context, doc *goquery.Document) (time.T
With("startAt", tt.Eq(0).Text()).
With("endAt", tt.Eq(1).Text()).
Debug("find result")
startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(0).Text(), time.Local)
startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(0).Text(), timezone.Tokyo)
if err != nil {
return time.Time{}, 0, errors.New("failed to parse start time")
}
endAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(1).Text(), time.Local)
endAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(1).Text(), timezone.Tokyo)
if err != nil {
return time.Time{}, 0, errors.New("failed to parse end time")
}
Expand Down
3 changes: 2 additions & 1 deletion crawler/contest_archive.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/meian/atgo/crawler/requests"
"github.com/meian/atgo/crawler/responses"
"github.com/meian/atgo/logs"
"github.com/meian/atgo/timezone"
"github.com/meian/atgo/url"
"github.com/meian/atgo/util"
"github.com/pkg/errors"
Expand Down Expand Up @@ -130,7 +131,7 @@ func (c *ContestArchive) parseContest(ctx context.Context, tr *goquery.Selection
if tdTime.Length() == 0 {
return responses.ContestArchive_Contest{}, errors.New("no time is found")
}
startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tdTime.Text(), time.Local)
startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tdTime.Text(), timezone.Tokyo)
if err != nil {
logger.Error(err.Error())
return responses.ContestArchive_Contest{}, errors.New("failed to parse start time")
Expand Down
100 changes: 100 additions & 0 deletions crawler/contest_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package crawler_test

import (
"context"
"net/http"
"net/url"
"testing"
"time"

"github.com/meian/atgo/crawler"
"github.com/meian/atgo/crawler/requests"
"github.com/meian/atgo/crawler/responses"
"github.com/meian/atgo/timezone"
"github.com/stretchr/testify/assert"
)

func TestContest_Do_Request(t *testing.T) {
req := &requests.Contest{
ContestID: "abc123",
}
want := requestWant{
path: "/contests/abc123",
query: url.Values{},
body: url.Values{},
}

assert := assert.New(t)
client, cFunc := mockRequestClient()
_, _ = crawler.NewContest(client).Do(context.Background(), req)
method, path, query, body := cFunc()
assert.Equal(http.MethodGet, method)
assert.Equal(want.path, path)
assert.Equal(want.query, query)
assert.Equal(want.body, body)
}

// TestContest_Do_Response drives the contest crawler against canned HTTP
// responses and checks the parsed result on success as well as the error
// paths: non-200 status, a body that is not HTML, and a request timeout.
func TestContest_Do_Response(t *testing.T) {
	fixtures := testHTMLMap(t, "contest")

	type want struct {
		err bool
		res *responses.Contest
	}
	cases := []struct {
		name    string
		httpRes mockHTTPResponse
		want    want
	}{
		{
			name:    "success",
			httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "success.html"},
			want: want{
				res: &responses.Contest{
					ID:         "abc234",
					Title:      "AtCoder Beginner Contest 234",
					StartAt:    time.Date(2022, 1, 8, 21, 0, 0, 0, timezone.Tokyo),
					Duration:   1*time.Hour + 40*time.Minute,
					TargetRate: " - 1999",
				},
			},
		},
		{
			name:    "not found",
			httpRes: mockHTTPResponse{status: http.StatusNotFound},
			want:    want{err: true},
		},
		{
			name:    "not a html response",
			httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "not-a-html"},
			want:    want{err: true},
		},
		{
			name:    "timeout",
			httpRes: mockHTTPResponse{timeout: true},
			want:    want{err: true},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert := assert.New(t)
			ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
			defer cancel()

			client := mockResponseClient(tc.httpRes.status, fixtures.Get(tc.httpRes.bodyFile), tc.httpRes.timeout)
			res, err := crawler.NewContest(client).Do(ctx, &requests.Contest{ContestID: "abc234"})

			if tc.want.err {
				if assert.Error(err) {
					t.Logf("error: %v", err)
				}
				return
			}
			assert.NoError(err)
			if !assert.NotNil(res) {
				return
			}
			assert.Equal(tc.want.res, res)
		})
	}
}
44 changes: 43 additions & 1 deletion crawler/crawler.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
package crawler

import (
"bytes"
"context"
"fmt"
"io"
"log/slog"
"net/http"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/meian/atgo/io"
"github.com/meian/atgo/logs"
"github.com/meian/atgo/url"
"github.com/pkg/errors"
Expand Down Expand Up @@ -141,6 +142,15 @@ func (c Crawler) doHTTPRequest(ctx context.Context, method, contentType, url str
func (c Crawler) documentFromReader(ctx context.Context, respBody io.Reader) (*goquery.Document, error) {
logger := logs.FromContext(ctx)
logger.Debug("parsing document from response")

respBody, err := io.WithReadAction(respBody, func(r io.Reader) error {
return c.validHTML(ctx, r)
})
if err != nil {
logger.Error(err.Error())
return nil, errors.New("response is not a valid HTML")
}

doc, err := goquery.NewDocumentFromReader(respBody)
if err != nil {
logger.Error(err.Error())
Expand All @@ -151,6 +161,38 @@ func (c Crawler) documentFromReader(ctx context.Context, respBody io.Reader) (*g
return doc, nil
}

// validHTML reads from reader and reports whether the stream looks like an
// HTML document, i.e. it contains "<html", "<head" and "<body" in that order.
// It returns nil as soon as all three tags have been seen; it returns an
// error when the stream ends before that, or when a read fails.
func (Crawler) validHTML(ctx context.Context, reader io.Reader) error {
	logger := logs.FromContext(ctx)

	// tags[0] is always the next tag expected; the slice shrinks as tags are found.
	tags := [][]byte{[]byte("<html"), []byte("<head"), []byte("<body")}
	buf := make([]byte, 4096)
	var content []byte

	for {
		n, err := reader.Read(buf)
		// Per the io.Reader contract, Read may return data together with an
		// error (including io.EOF), so consume buf[:n] before inspecting err.
		if n > 0 {
			content = append(content, buf[:n]...)
			for {
				pos := bytes.Index(content, tags[0])
				if pos < 0 {
					// The next tag may be split across reads; keep the
					// accumulated tail and read more.
					break
				}
				// Drop everything before the match so later searches only
				// scan forward, then advance to the next expected tag.
				content = content[pos:]
				tags = tags[1:]
				if len(tags) == 0 {
					return nil
				}
			}
		}
		if err != nil {
			if err == io.EOF {
				return errors.New("failed to find HTML tags")
			}
			logger.Error(err.Error())
			return errors.New("failed to read response body")
		}
	}
}

func (c Crawler) LoggedIn(ctx context.Context, doc *goquery.Document) bool {
selector := fmt.Sprintf("a[href='%s']", url.SettingsPath)
return doc.Find(selector).Length() > 0
Expand Down
61 changes: 29 additions & 32 deletions crawler/login_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/meian/atgo/crawler"
"github.com/meian/atgo/crawler/requests"
"github.com/meian/atgo/crawler/responses"
"github.com/stretchr/testify/assert"
)

Expand All @@ -19,66 +20,62 @@ func TestLogin_Do_Request(t *testing.T) {
CSRFToken: "token",
Continue: "ctn",
}
want := struct {
query *url.Values
body *url.Values
}{
query: &url.Values{"continue": {"ctn"}},
body: &url.Values{"username": {"user"}, "password": {"pass"}, "csrf_token": {"token"}},
want := requestWant{
path: "/login",
query: url.Values{"continue": {"ctn"}},
body: url.Values{"username": {"user"}, "password": {"pass"}, "csrf_token": {"token"}},
}

assert := assert.New(t)
client, cFunc := mockRequestClient()
_, _ = crawler.NewLogin(client).Do(context.Background(), req)
method, query, body := cFunc()
method, path, query, body := cFunc()
assert.Equal(http.MethodPost, method)
assert.Equal(want.path, path)
assert.Equal(want.query, query)
assert.Equal(want.body, body)
}

func TestLogin_Do_Response(t *testing.T) {
type res struct {
status int
bodyFile string
timeout bool
}
m := testHTMLMap(t, "login")

type want struct {
err bool
loggedIn bool
err bool
res *responses.Login
}
tests := []struct {
name string
res res
want want
name string
httpRes mockHTTPResponse
want want
}{
{
name: "success",
res: res{status: http.StatusOK, bodyFile: "logged-in.html"},
want: want{loggedIn: true},
name: "success",
httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "success.html"},
want: want{res: &responses.Login{LoggedIn: true}},
},
{
name: "forbidden",
res: res{status: http.StatusForbidden},
want: want{err: true},
name: "forbidden",
httpRes: mockHTTPResponse{status: http.StatusForbidden},
want: want{err: true},
},
{
name: "no html",
res: res{status: http.StatusOK, bodyFile: "no-html"},
want: want{err: false, loggedIn: false},
name: "not a html response",
httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "not-a-html"},
want: want{err: true},
},
{
name: "timeout",
res: res{timeout: true},
want: want{err: true},
name: "timeout",
httpRes: mockHTTPResponse{timeout: true},
want: want{err: true},
},
}
m := testHTMLMap(t, "login")

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert := assert.New(t)
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
client := mockResponseClient(tt.res.status, m.Get(tt.res.bodyFile), tt.res.timeout)
client := mockResponseClient(tt.httpRes.status, m.Get(tt.httpRes.bodyFile), tt.httpRes.timeout)
req := &requests.Login{Username: "user", Password: "pass"}
res, err := crawler.NewLogin(client).Do(ctx, req)
if tt.want.err {
Expand All @@ -91,7 +88,7 @@ func TestLogin_Do_Response(t *testing.T) {
if !assert.NotNil(res) {
return
}
assert.Equal(tt.want.loggedIn, res.LoggedIn)
assert.Equal(tt.want.res, res)
})
}
}
Loading