diff --git a/crawler/common_test.go b/crawler/common_test.go
index f6a0925..c5cb347 100644
--- a/crawler/common_test.go
+++ b/crawler/common_test.go
@@ -17,8 +17,8 @@ import (
type htmlMap map[string]string
func (m htmlMap) Get(key string) string {
- if key == "no-html" {
- return "no html"
+ if key == "not-a-html" {
+ return "not a html"
}
return m[key]
}
@@ -48,6 +48,12 @@ func testHTMLMap(t *testing.T, target string) htmlMap {
return m
}
+type requestWant struct {
+ path string
+ query url.Values
+ body url.Values
+}
+
type mockRequestRoundTripper struct {
request *http.Request
}
@@ -60,20 +66,20 @@ func (m *mockRequestRoundTripper) RoundTrip(req *http.Request) (*http.Response,
}, nil
}
-type captureFunc func() (method string, query, body *url.Values)
+type captureFunc func() (method, path string, query, body url.Values)
-func (m *mockRequestRoundTripper) lastCaputure() (string, *url.Values, *url.Values) {
- q := m.request.URL.Query()
- var body *url.Values
+func (m *mockRequestRoundTripper) lastCaputure() (string, string, url.Values, url.Values) {
+ query := m.request.URL.Query()
+ body := url.Values{}
if m.request.Body != nil {
b, _ := io.ReadAll(m.request.Body)
if bt, err := url.ParseQuery(string(b)); err == nil {
- body = &bt
+ body = bt
} else {
panic(errors.Wrapf(err, "cannot parse request body: %s", string(b)))
}
}
- return m.request.Method, &q, body
+ return m.request.Method, m.request.URL.Path, query, body
}
func mockRequestClient() (*http.Client, captureFunc) {
@@ -84,6 +90,12 @@ func mockRequestClient() (*http.Client, captureFunc) {
return c, m.lastCaputure
}
+type mockHTTPResponse struct {
+ status int
+ bodyFile string
+ timeout bool
+}
+
type mockResponseRoundTripper struct {
status int
body string
diff --git a/crawler/contest.go b/crawler/contest.go
index 9ca925f..0ff17b4 100644
--- a/crawler/contest.go
+++ b/crawler/contest.go
@@ -11,6 +11,7 @@ import (
"github.com/meian/atgo/crawler/requests"
"github.com/meian/atgo/crawler/responses"
"github.com/meian/atgo/logs"
+ "github.com/meian/atgo/timezone"
"github.com/meian/atgo/url"
"github.com/pkg/errors"
)
@@ -86,11 +87,11 @@ func (c *Contest) parseTimes(ctx context.Context, doc *goquery.Document) (time.T
With("startAt", tt.Eq(0).Text()).
With("endAt", tt.Eq(1).Text()).
Debug("find result")
- startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(0).Text(), time.Local)
+ startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(0).Text(), timezone.Tokyo)
if err != nil {
return time.Time{}, 0, errors.New("failed to parse start time")
}
- endAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(1).Text(), time.Local)
+ endAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(1).Text(), timezone.Tokyo)
if err != nil {
return time.Time{}, 0, errors.New("failed to parse end time")
}
diff --git a/crawler/contest_archive.go b/crawler/contest_archive.go
index 2087de0..005733c 100644
--- a/crawler/contest_archive.go
+++ b/crawler/contest_archive.go
@@ -12,6 +12,7 @@ import (
"github.com/meian/atgo/crawler/requests"
"github.com/meian/atgo/crawler/responses"
"github.com/meian/atgo/logs"
+ "github.com/meian/atgo/timezone"
"github.com/meian/atgo/url"
"github.com/meian/atgo/util"
"github.com/pkg/errors"
@@ -130,7 +131,7 @@ func (c *ContestArchive) parseContest(ctx context.Context, tr *goquery.Selection
if tdTime.Length() == 0 {
return responses.ContestArchive_Contest{}, errors.New("no time is found")
}
- startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tdTime.Text(), time.Local)
+ startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tdTime.Text(), timezone.Tokyo)
if err != nil {
logger.Error(err.Error())
return responses.ContestArchive_Contest{}, errors.New("failed to parse start time")
diff --git a/crawler/contest_test.go b/crawler/contest_test.go
new file mode 100644
index 0000000..aa0e5c0
--- /dev/null
+++ b/crawler/contest_test.go
@@ -0,0 +1,100 @@
+package crawler_test
+
+import (
+ "context"
+ "net/http"
+ "net/url"
+ "testing"
+ "time"
+
+ "github.com/meian/atgo/crawler"
+ "github.com/meian/atgo/crawler/requests"
+ "github.com/meian/atgo/crawler/responses"
+ "github.com/meian/atgo/timezone"
+ "github.com/stretchr/testify/assert"
+)
+
+func TestContest_Do_Request(t *testing.T) {
+ req := &requests.Contest{
+ ContestID: "abc123",
+ }
+ want := requestWant{
+ path: "/contests/abc123",
+ query: url.Values{},
+ body: url.Values{},
+ }
+
+ assert := assert.New(t)
+ client, cFunc := mockRequestClient()
+ _, _ = crawler.NewContest(client).Do(context.Background(), req)
+ method, path, query, body := cFunc()
+ assert.Equal(http.MethodGet, method)
+ assert.Equal(want.path, path)
+ assert.Equal(want.query, query)
+ assert.Equal(want.body, body)
+}
+
+func TestContest_Do_Response(t *testing.T) {
+ m := testHTMLMap(t, "contest")
+
+ type want struct {
+ err bool
+ res *responses.Contest
+ }
+ tests := []struct {
+ name string
+ httpRes mockHTTPResponse
+ want want
+ }{
+ {
+ name: "success",
+ httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "success.html"},
+ want: want{
+ res: &responses.Contest{
+ ID: "abc234",
+ Title: "AtCoder Beginner Contest 234",
+ StartAt: time.Date(2022, 1, 8, 21, 0, 0, 0, timezone.Tokyo),
+ Duration: 1*time.Hour + 40*time.Minute,
+ TargetRate: " - 1999",
+ },
+ },
+ },
+ {
+ name: "not found",
+ httpRes: mockHTTPResponse{status: http.StatusNotFound},
+ want: want{err: true},
+ },
+ {
+ name: "not a html response",
+ httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "not-a-html"},
+ want: want{err: true},
+ },
+ {
+ name: "timeout",
+ httpRes: mockHTTPResponse{timeout: true},
+ want: want{err: true},
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ assert := assert.New(t)
+ ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+ defer cancel()
+ client := mockResponseClient(tt.httpRes.status, m.Get(tt.httpRes.bodyFile), tt.httpRes.timeout)
+ req := &requests.Contest{ContestID: "abc234"}
+ res, err := crawler.NewContest(client).Do(ctx, req)
+ if tt.want.err {
+ if assert.Error(err) {
+ t.Logf("error: %v", err)
+ }
+ return
+ }
+ assert.NoError(err)
+ if !assert.NotNil(res) {
+ return
+ }
+ assert.Equal(tt.want.res, res)
+ })
+ }
+}
diff --git a/crawler/crawler.go b/crawler/crawler.go
index a4689c5..8758e9e 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -1,14 +1,15 @@
package crawler
import (
+ "bytes"
"context"
"fmt"
- "io"
"log/slog"
"net/http"
"strings"
"github.com/PuerkitoBio/goquery"
+ "github.com/meian/atgo/io"
"github.com/meian/atgo/logs"
"github.com/meian/atgo/url"
"github.com/pkg/errors"
@@ -141,6 +142,15 @@ func (c Crawler) doHTTPRequest(ctx context.Context, method, contentType, url str
func (c Crawler) documentFromReader(ctx context.Context, respBody io.Reader) (*goquery.Document, error) {
logger := logs.FromContext(ctx)
logger.Debug("parsing document from response")
+
+ respBody, err := io.WithReadAction(respBody, func(r io.Reader) error {
+ return c.validHTML(ctx, r)
+ })
+ if err != nil {
+ logger.Error(err.Error())
+ return nil, errors.New("response is not a valid HTML")
+ }
+
doc, err := goquery.NewDocumentFromReader(respBody)
if err != nil {
logger.Error(err.Error())
@@ -151,6 +161,38 @@ func (c Crawler) documentFromReader(ctx context.Context, respBody io.Reader) (*g
return doc, nil
}
+func (Crawler) validHTML(ctx context.Context, reader io.Reader) error {
+ logger := logs.FromContext(ctx)
+
+ tags := [][]byte{[]byte(" 0
diff --git a/crawler/login_test.go b/crawler/login_test.go
index 82ef4c3..15d1196 100644
--- a/crawler/login_test.go
+++ b/crawler/login_test.go
@@ -9,6 +9,7 @@ import (
"github.com/meian/atgo/crawler"
"github.com/meian/atgo/crawler/requests"
+ "github.com/meian/atgo/crawler/responses"
"github.com/stretchr/testify/assert"
)
@@ -19,66 +20,62 @@ func TestLogin_Do_Request(t *testing.T) {
CSRFToken: "token",
Continue: "ctn",
}
- want := struct {
- query *url.Values
- body *url.Values
- }{
- query: &url.Values{"continue": {"ctn"}},
- body: &url.Values{"username": {"user"}, "password": {"pass"}, "csrf_token": {"token"}},
+ want := requestWant{
+ path: "/login",
+ query: url.Values{"continue": {"ctn"}},
+ body: url.Values{"username": {"user"}, "password": {"pass"}, "csrf_token": {"token"}},
}
assert := assert.New(t)
client, cFunc := mockRequestClient()
_, _ = crawler.NewLogin(client).Do(context.Background(), req)
- method, query, body := cFunc()
+ method, path, query, body := cFunc()
assert.Equal(http.MethodPost, method)
+ assert.Equal(want.path, path)
assert.Equal(want.query, query)
assert.Equal(want.body, body)
}
func TestLogin_Do_Response(t *testing.T) {
- type res struct {
- status int
- bodyFile string
- timeout bool
- }
+ m := testHTMLMap(t, "login")
+
type want struct {
- err bool
- loggedIn bool
+ err bool
+ res *responses.Login
}
tests := []struct {
- name string
- res res
- want want
+ name string
+ httpRes mockHTTPResponse
+ want want
}{
{
- name: "success",
- res: res{status: http.StatusOK, bodyFile: "logged-in.html"},
- want: want{loggedIn: true},
+ name: "success",
+ httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "success.html"},
+ want: want{res: &responses.Login{LoggedIn: true}},
},
{
- name: "forbidden",
- res: res{status: http.StatusForbidden},
- want: want{err: true},
+ name: "forbidden",
+ httpRes: mockHTTPResponse{status: http.StatusForbidden},
+ want: want{err: true},
},
{
- name: "no html",
- res: res{status: http.StatusOK, bodyFile: "no-html"},
- want: want{err: false, loggedIn: false},
+ name: "not a html response",
+ httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "not-a-html"},
+ want: want{err: true},
},
{
- name: "timeout",
- res: res{timeout: true},
- want: want{err: true},
+ name: "timeout",
+ httpRes: mockHTTPResponse{timeout: true},
+ want: want{err: true},
},
}
- m := testHTMLMap(t, "login")
+
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert := assert.New(t)
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
- client := mockResponseClient(tt.res.status, m.Get(tt.res.bodyFile), tt.res.timeout)
+ client := mockResponseClient(tt.httpRes.status, m.Get(tt.httpRes.bodyFile), tt.httpRes.timeout)
req := &requests.Login{Username: "user", Password: "pass"}
res, err := crawler.NewLogin(client).Do(ctx, req)
if tt.want.err {
@@ -91,7 +88,7 @@ func TestLogin_Do_Response(t *testing.T) {
if !assert.NotNil(res) {
return
}
- assert.Equal(tt.want.loggedIn, res.LoggedIn)
+ assert.Equal(tt.want.res, res)
})
}
}
diff --git a/crawler/testdata/contest/success.html b/crawler/testdata/contest/success.html
new file mode 100644
index 0000000..c4a3f51
--- /dev/null
+++ b/crawler/testdata/contest/success.html
@@ -0,0 +1,505 @@
+
+
+
+
+
+
+
+ AtCoder Beginner Contest 234 - AtCoder
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
AtCoder Beginner Contest 234 has begun.
+
+
+
+
+
+
+
+
+
+
+
AtCoder Beginner Contest 234 has ended.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Can Participate: All
+ Rated Range: - 1999
+ Penalty: 5 minutes
+
+
+
+
+
+ コンテスト情報
+
+
+ コンテスト時間: 100 分
+ レーティング更新対象: 0 - 1999
+
+
+
+ 配点
+
+
+
+
+
+
+ 問題
+ 点数
+
+
+
+
+ A
+ 100
+
+
+ B
+ 200
+
+
+ C
+ 300
+
+
+ D
+ 400
+
+
+ E
+ 500
+
+
+ F
+ 500
+
+
+ G
+ 600
+
+
+ Ex
+ 600
+
+
+
+
+
+
+
+ ルール
+
+
+ コンテスト中に問題に正解すると点数を獲得できます。
+ 順位は総合得点で決定します。
+ 同点の場合は提出時間の早い人が上の順位になります。
+ 誤答を提出するたびにペナルティが加算されます。このコンテストのペナルティは5分です。詳細は画面下部の「ルール」をご覧ください。
+
+
+ このコンテストは full-feedback 形式のコンテストです。コンテスト中に提出された結果だけで順位が決定します。
+
+
+
+ 便利情報
+
+
+
+
+ Contest Information
+
+
+ Duration: 100 minutes
+ Rated Range: 0 - 1999
+
+
+ Point Values
+
+
+
+
+
+
+ Task
+ Score
+
+
+
+
+ A
+ 100
+
+
+ B
+ 200
+
+
+ C
+ 300
+
+
+ D
+ 400
+
+
+ E
+ 500
+
+
+ F
+ 500
+
+
+ G
+ 600
+
+
+ Ex
+ 600
+
+
+
+
+
+
+ Contest Rules
+ This contest is full-feedback (solutions are judged during the contest).
+
+ When you solve a problem, you get a score assigned to it.
+ Competitors are ranked first by total scores, then by penalties.
+ The penalties are computed as (the time you spend to get your current score) + (5 minutes) * (the number of incorrect attempts).
+
+
+ Useful Links
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Page Top
+
+
+
+
+
diff --git a/crawler/testdata/login/logged-in.html b/crawler/testdata/login/logged-in.html
deleted file mode 100644
index a9122e8..0000000
--- a/crawler/testdata/login/logged-in.html
+++ /dev/null
@@ -1,647 +0,0 @@
-
-
-
-
-
-
-
-
-
- AtCoder:競技プログラミングコンテストを開催する国内最大のサイト
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- ×
- ようこそ、kitamin さん。
-
-
-
-
-
-
-
-
-
-
-
解けた!を 世界に届けたい。
-
AtCoderは、世界最高峰の競技プログラミングサイトです。 リアルタイムのオンラインコンテストで競い合うことや、 5,000以上の過去問にいつでもチャレンジすることができます。
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
お知らせ
-
-
-
-
-
-
- 投稿日時:
-
-
-
-
-
-
-
<a href="https://atcoder.jp/contests/ahc035">ALGO ARTIS プログラミングコンテスト2024 夏(AtCoder Heuristic Contest 035)</a> が開催されます。
-
-- コンテストページ: https://atcoder.jp/contests/ahc035
-- 開始時刻: <a href='http://www.timeanddate.com/worldclock/fixedtime.html?iso=20240721T1500&p1=248' target='blank'><time class='fixtime fixtime-full'>2024-07-21 15:00:00+0900</time></a>
-- コンテスト時間: 4 時間
-- Writer:<img src="https://img.atcoder.jp/assets/icon/crown_gold.png"><a href="https://atcoder.jp/users/terry_u16?contestType=heuristic" class="username"><span class="user-red">terry_u16</span></a>
-- レーティング変化:All(ヒューリスティックレーティング)
-
-このコンテストは <a href="https://atcoder.jp/posts/1163">AWTF 2025</a> の選考対象です。
-
-皆様、是非ご参加ください!
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
AtCoderJobs
-
AtCoderのレーティングを用いた人材採用サービスです。
-
-
-
-
-
-
-
-
-
-
-
検定
-
-
-
-
-
-
-
-
-
-
- アルゴリズム実技検定についてはこちらから!
-
- 過去問公開中
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/crawler/testdata/login/success.html b/crawler/testdata/login/success.html
new file mode 100644
index 0000000..fa67f95
--- /dev/null
+++ b/crawler/testdata/login/success.html
@@ -0,0 +1,383 @@
+
+
+
+
+
+
+
+
+ AtCoder
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Contest
+
+
+
+
+
+
+
+
+
Permanent Contests
+
+
+
+
+
+
+
+ Contest Name
+
+
+
+
+
+
+
+
+
+
+
+
+
Upcoming Contests
+
+
+
+
+
+
+ Start Time
+ Contest Name
+
+
+
+
+
+
+
+
+
+
+
+
+
Recent Contests
+
+
+
+
+
+
+ Start Time
+ Contest Name
+
+
+
+
+
+
+
+
+
+
+
+ Detail
+
+
+
+
+
+
+
+
+
+ Ranking: Heuristic
+
+
+
+
+
+
+ View all
+
+
+ GP30
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Page Top
+
+
+
+
+
+
diff --git a/io/read_action.go b/io/read_action.go
new file mode 100644
index 0000000..bf377de
--- /dev/null
+++ b/io/read_action.go
@@ -0,0 +1,18 @@
+package io
+
+import (
+ "bytes"
+ "io"
+)
+
+func WithReadAction(r io.Reader, f func(io.Reader) error) (io.Reader, error) {
+ var buf bytes.Buffer
+ tee := io.TeeReader(r, &buf)
+ if err := f(tee); err != nil {
+ return nil, err
+ }
+ if _, err := io.Copy(io.Discard, tee); err != nil {
+ return nil, err
+ }
+ return &buf, nil
+}
diff --git a/io/types.go b/io/types.go
index 747420a..69dccb1 100644
--- a/io/types.go
+++ b/io/types.go
@@ -6,6 +6,7 @@ type Writer = io.Writer
type Reader = io.Reader
var Discard = io.Discard
+var EOF = io.EOF
func Copy(dst Writer, src Reader) (written int64, err error) {
return io.Copy(dst, src)
diff --git a/timezone/timezone.go b/timezone/timezone.go
new file mode 100644
index 0000000..f1f532b
--- /dev/null
+++ b/timezone/timezone.go
@@ -0,0 +1,18 @@
+package timezone
+
+import (
+ "log/slog"
+ "time"
+)
+
+var Tokyo *time.Location
+
+func init() {
+ var err error
+ Tokyo, err = time.LoadLocation("Asia/Tokyo")
+ if err != nil {
+ slog.With("err", err).Error("Failed to load Asia/Tokyo timezone")
+ Tokyo = time.FixedZone("Asia/Tokyo", 9*60*60)
+ slog.Info("Using fixed timezone for Asia/Tokyo")
+ }
+}