diff --git a/crawler/common_test.go b/crawler/common_test.go index f6a0925..c5cb347 100644 --- a/crawler/common_test.go +++ b/crawler/common_test.go @@ -17,8 +17,8 @@ import ( type htmlMap map[string]string func (m htmlMap) Get(key string) string { - if key == "no-html" { - return "no html" + if key == "not-a-html" { + return "no a html" } return m[key] } @@ -48,6 +48,12 @@ func testHTMLMap(t *testing.T, target string) htmlMap { return m } +type requestWant struct { + path string + query url.Values + body url.Values +} + type mockRequestRoundTripper struct { request *http.Request } @@ -60,20 +66,20 @@ func (m *mockRequestRoundTripper) RoundTrip(req *http.Request) (*http.Response, }, nil } -type captureFunc func() (method string, query, body *url.Values) +type captureFunc func() (method, path string, query, body url.Values) -func (m *mockRequestRoundTripper) lastCaputure() (string, *url.Values, *url.Values) { - q := m.request.URL.Query() - var body *url.Values +func (m *mockRequestRoundTripper) lastCaputure() (string, string, url.Values, url.Values) { + query := m.request.URL.Query() + body := url.Values{} if m.request.Body != nil { b, _ := io.ReadAll(m.request.Body) if bt, err := url.ParseQuery(string(b)); err == nil { - body = &bt + body = bt } else { panic(errors.Wrapf(err, "cannot parse request body: %s", string(b))) } } - return m.request.Method, &q, body + return m.request.Method, m.request.URL.Path, query, body } func mockRequestClient() (*http.Client, captureFunc) { @@ -84,6 +90,12 @@ func mockRequestClient() (*http.Client, captureFunc) { return c, m.lastCaputure } +type mockHTTPResponse struct { + status int + bodyFile string + timeout bool +} + type mockResponseRoundTripper struct { status int body string diff --git a/crawler/contest.go b/crawler/contest.go index 9ca925f..0ff17b4 100644 --- a/crawler/contest.go +++ b/crawler/contest.go @@ -11,6 +11,7 @@ import ( "github.com/meian/atgo/crawler/requests" "github.com/meian/atgo/crawler/responses" "github.com/meian/atgo/logs" + "github.com/meian/atgo/timezone" "github.com/meian/atgo/url" "github.com/pkg/errors" ) @@ -86,11 +87,11 @@ func (c *Contest) parseTimes(ctx context.Context, doc *goquery.Document) (time.T With("startAt", tt.Eq(0).Text()). With("endAt", tt.Eq(1).Text()). Debug("find result") - startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(0).Text(), time.Local) + startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(0).Text(), timezone.Tokyo) if err != nil { return time.Time{}, 0, errors.New("failed to parse start time") } - endAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(1).Text(), time.Local) + endAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tt.Eq(1).Text(), timezone.Tokyo) if err != nil { return time.Time{}, 0, errors.New("failed to parse end time") } diff --git a/crawler/contest_archive.go b/crawler/contest_archive.go index 2087de0..005733c 100644 --- a/crawler/contest_archive.go +++ b/crawler/contest_archive.go @@ -12,6 +12,7 @@ import ( "github.com/meian/atgo/crawler/requests" "github.com/meian/atgo/crawler/responses" "github.com/meian/atgo/logs" + "github.com/meian/atgo/timezone" "github.com/meian/atgo/url" "github.com/meian/atgo/util" "github.com/pkg/errors" @@ -130,7 +131,7 @@ func (c *ContestArchive) parseContest(ctx context.Context, tr *goquery.Selection if tdTime.Length() == 0 { return responses.ContestArchive_Contest{}, errors.New("no time is found") } - startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tdTime.Text(), time.Local) + startAt, err := time.ParseInLocation("2006-01-02 15:04:05-0700", tdTime.Text(), timezone.Tokyo) if err != nil { logger.Error(err.Error()) return responses.ContestArchive_Contest{}, errors.New("failed to parse start time") diff --git a/crawler/contest_test.go b/crawler/contest_test.go new file mode 100644 index 0000000..aa0e5c0 --- /dev/null +++ b/crawler/contest_test.go @@ -0,0 +1,100 @@ +package crawler_test + +import ( + "context" + "net/http" + "net/url" + "testing" + "time" + + "github.com/meian/atgo/crawler" + "github.com/meian/atgo/crawler/requests" + "github.com/meian/atgo/crawler/responses" + "github.com/meian/atgo/timezone" + "github.com/stretchr/testify/assert" +) + +func TestContest_Do_Request(t *testing.T) { + req := &requests.Contest{ + ContestID: "abc123", + } + want := requestWant{ + path: "/contests/abc123", + query: url.Values{}, + body: url.Values{}, + } + + assert := assert.New(t) + client, cFunc := mockRequestClient() + _, _ = crawler.NewContest(client).Do(context.Background(), req) + method, path, query, body := cFunc() + assert.Equal(http.MethodGet, method) + assert.Equal(want.path, path) + assert.Equal(want.query, query) + assert.Equal(want.body, body) +} + +func TestContest_Do_Response(t *testing.T) { + m := testHTMLMap(t, "contest") + + type want struct { + err bool + res *responses.Contest + } + tests := []struct { + name string + httpRes mockHTTPResponse + want want + }{ + { + name: "success", + httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "success.html"}, + want: want{ + res: &responses.Contest{ + ID: "abc234", + Title: "AtCoder Beginner Contest 234", + StartAt: time.Date(2022, 1, 8, 21, 0, 0, 0, timezone.Tokyo), + Duration: 1*time.Hour + 40*time.Minute, + TargetRate: " - 1999", + }, + }, + }, + { + name: "not found", + httpRes: mockHTTPResponse{status: http.StatusNotFound}, + want: want{err: true}, + }, + { + name: "not a html response", + httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "not-a-html"}, + want: want{err: true}, + }, + { + name: "timeout", + httpRes: mockHTTPResponse{timeout: true}, + want: want{err: true}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert := assert.New(t) + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + client := mockResponseClient(tt.httpRes.status, m.Get(tt.httpRes.bodyFile), tt.httpRes.timeout) + req := &requests.Contest{ContestID: "abc234"} + res, err := crawler.NewContest(client).Do(ctx, req) + if tt.want.err { + if assert.Error(err) { + t.Logf("error: %v", err) + } + return + } + assert.NoError(err) + if !assert.NotNil(res) { + return + } + assert.Equal(tt.want.res, res) + }) + } +} diff --git a/crawler/crawler.go b/crawler/crawler.go index a4689c5..8758e9e 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -1,14 +1,15 @@ package crawler import ( + "bytes" "context" "fmt" - "io" "log/slog" "net/http" "strings" "github.com/PuerkitoBio/goquery" + "github.com/meian/atgo/io" "github.com/meian/atgo/logs" "github.com/meian/atgo/url" "github.com/pkg/errors" @@ -141,6 +142,15 @@ func (c Crawler) doHTTPRequest(ctx context.Context, method, contentType, url str func (c Crawler) documentFromReader(ctx context.Context, respBody io.Reader) (*goquery.Document, error) { logger := logs.FromContext(ctx) logger.Debug("parsing document from response") + + respBody, err := io.WithReadAction(respBody, func(r io.Reader) error { + return c.validHTML(ctx, r) + }) + if err != nil { + logger.Error(err.Error()) + return nil, errors.New("response is not a valid HTML") + } + doc, err := goquery.NewDocumentFromReader(respBody) if err != nil { logger.Error(err.Error()) @@ -151,6 +161,38 @@ func (c Crawler) documentFromReader(ctx context.Context, respBody io.Reader) (*g return doc, nil } +func (Crawler) validHTML(ctx context.Context, reader io.Reader) error { + logger := logs.FromContext(ctx) + + tags := [][]byte{[]byte(" 0 diff --git a/crawler/login_test.go b/crawler/login_test.go index 82ef4c3..15d1196 100644 --- a/crawler/login_test.go +++ b/crawler/login_test.go @@ -9,6 +9,7 @@ import ( "github.com/meian/atgo/crawler" "github.com/meian/atgo/crawler/requests" + "github.com/meian/atgo/crawler/responses" "github.com/stretchr/testify/assert" ) @@ -19,66 +20,62 @@ func TestLogin_Do_Request(t *testing.T) { CSRFToken: "token", Continue: "ctn", } - want := struct { - query *url.Values - body *url.Values - }{ - query: &url.Values{"continue": {"ctn"}}, - body: &url.Values{"username": {"user"}, "password": {"pass"}, "csrf_token": {"token"}}, + want := requestWant{ + path: "/login", + query: url.Values{"continue": {"ctn"}}, + body: url.Values{"username": {"user"}, "password": {"pass"}, "csrf_token": {"token"}}, } assert := assert.New(t) client, cFunc := mockRequestClient() _, _ = crawler.NewLogin(client).Do(context.Background(), req) - method, query, body := cFunc() + method, path, query, body := cFunc() assert.Equal(http.MethodPost, method) + assert.Equal(want.path, path) assert.Equal(want.query, query) assert.Equal(want.body, body) } func TestLogin_Do_Response(t *testing.T) { - type res struct { - status int - bodyFile string - timeout bool - } + m := testHTMLMap(t, "login") + type want struct { - err bool - loggedIn bool + err bool + res *responses.Login } tests := []struct { - name string - res res - want want + name string + httpRes mockHTTPResponse + want want }{ { - name: "success", - res: res{status: http.StatusOK, bodyFile: "logged-in.html"}, - want: want{loggedIn: true}, + name: "success", + httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "success.html"}, + want: want{res: &responses.Login{LoggedIn: true}}, }, { - name: "forbidden", - res: res{status: http.StatusForbidden}, - want: want{err: true}, + name: "forbidden", + httpRes: mockHTTPResponse{status: http.StatusForbidden}, + want: want{err: true}, }, { - name: "no html", - res: res{status: http.StatusOK, bodyFile: "no-html"}, - want: want{err: false, loggedIn: false}, + name: "not a html response", + httpRes: mockHTTPResponse{status: http.StatusOK, bodyFile: "not-a-html"}, + want: want{err: true}, }, { - name: "timeout", - res: res{timeout: true}, - want: want{err: true}, + name: "timeout", + httpRes: mockHTTPResponse{timeout: true}, + want: want{err: true}, }, } - m := testHTMLMap(t, "login") + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { assert := assert.New(t) ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) defer cancel() - client := mockResponseClient(tt.res.status, m.Get(tt.res.bodyFile), tt.res.timeout) + client := mockResponseClient(tt.httpRes.status, m.Get(tt.httpRes.bodyFile), tt.httpRes.timeout) req := &requests.Login{Username: "user", Password: "pass"} res, err := crawler.NewLogin(client).Do(ctx, req) if tt.want.err { @@ -91,7 +88,7 @@ func TestLogin_Do_Response(t *testing.T) { if !assert.NotNil(res) { return } - assert.Equal(tt.want.loggedIn, res.LoggedIn) + assert.Equal(tt.want.res, res) }) } } diff --git a/crawler/testdata/contest/success.html b/crawler/testdata/contest/success.html new file mode 100644 index 0000000..c4a3f51 --- /dev/null +++ b/crawler/testdata/contest/success.html @@ -0,0 +1,505 @@ + + + + + + + + AtCoder Beginner Contest 234 - AtCoder + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ +
+
+ + + +
+
+
+ + + Contest Duration: + - (local time) + (100 minutes) + + + Back to Home +
+ +
+
+ +
+

AtCoder Beginner Contest 234

+ +

+ Virtual Participation +

+ + + + + + + + + + + +
+

+ Can Participate: All + Rated Range: - 1999 + Penalty: 5 minutes + +

+ +
+ +

コンテスト情報

+
+
    +
  • コンテスト時間: 100 分
  • +
  • レーティング更新対象: 0 - 1999
  • +
+
+ +

配点

+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
問題点数
A100
B200
C300
D400
E500
F500
G600
Ex600
+
+
+
+ +

ルール

+
+
    +
  1. コンテスト中に問題に正解すると点数を獲得できます。
  2. +
  3. 順位は総合得点で決定します。
  4. +
  5. 同点の場合は提出時間の早い人が上の順位になります。
  6. +
  7. 誤答を提出するたびにペナルティが加算されます。このコンテストのペナルティは5分です。詳細は画面下部の「ルール」をご覧ください。
  8. +
+

+ このコンテストは full-feedback 形式のコンテストです。コンテスト中に提出された結果だけで順位が決定します。 +

+
+ +

便利情報

+ +
+ + +

Contest Information

+ +
    +
  • Duration: 100 minutes
  • +
  • Rated Range: 0 - 1999
  • +
+ +

Point Values

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TaskScore
A100
B200
C300
D400
E500
F500
G600
Ex600
+
+
+ +

Contest Rules

+ This contest is full-feedback (solutions are judged during the contest). +
+ When you solve a problem, you get a score assigned to it. + Competitors are ranked first by total scores, then by penalties. + The penalties are computed as (the time you spend to get your current score) + (5 minutes) * (the number of incorrect attempts). +
+ +

Useful Links

+ + +
+
+ + + + +
+
+ + + + + +
+ + + +
+ + + + + + +
+ + + + +
+
+
+ +
+ +
+

+ + + + + + diff --git a/crawler/testdata/login/logged-in.html b/crawler/testdata/login/logged-in.html deleted file mode 100644 index a9122e8..0000000 --- a/crawler/testdata/login/logged-in.html +++ /dev/null @@ -1,647 +0,0 @@ - - - - - - - - - - AtCoder:競技プログラミングコンテストを開催する国内最大のサイト - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - -
- -
-
- -
- - - - -
- - - -
-
-
- -

解けた!を
世界に届けたい。

-
AtCoderは、世界最高峰の競技プログラミングサイトです。
リアルタイムのオンラインコンテストで競い合うことや、
5,000以上の過去問にいつでもチャレンジすることができます。
-
-
- -
- -
-
-
-

コンテスト

-
-
-
-
-
-
-

最新コンテスト

-
- -
    - -
- - -
-
- - -
-
-
-

ランキング

-
-
- -
-
- 最終更新日時: - - - - -
- -
-
- 最終更新日時: - - - - -
- -
-
-
-
-
- -
-
-
-

お知らせ

-
-
- - - -
- 投稿日時: - - -
-
-
<a href="https://atcoder.jp/contests/ahc035">ALGO ARTIS プログラミングコンテスト2024 夏(AtCoder Heuristic Contest 035)</a> が開催されます。 - -- コンテストページ: https://atcoder.jp/contests/ahc035 -- 開始時刻: <a href='http://www.timeanddate.com/worldclock/fixedtime.html?iso=20240721T1500&p1=248' target='blank'><time class='fixtime fixtime-full'>2024-07-21 15:00:00+0900</time></a> -- コンテスト時間: 4 時間 -- Writer:<img src="https://img.atcoder.jp/assets/icon/crown_gold.png"><a href="https://atcoder.jp/users/terry_u16?contestType=heuristic" class="username"><span class="user-red">terry_u16</span></a> -- レーティング変化:All(ヒューリスティックレーティング) - -このコンテストは <a href="https://atcoder.jp/posts/1163">AWTF 2025</a> の選考対象です。 - -皆様、是非ご参加ください! -
- -
- - - -
-
-
-
-
- - -
-
-
-

AtCoderJobs

-

AtCoderのレーティングを用いた人材採用サービスです。

-
-
-
- - - - - - - - - -
-
- -
-
- - -
-
-
-

検定

-
- -
-
-
-
- -
-
-

- アルゴリズム実技検定についてはこちらから! -
- 過去問公開中 -

-
- -
-
-
-
- -
- - - - - - - - -
- - - -
- - - - - - -
- - - - -
-
-
- - - - - - - - - - - diff --git a/crawler/testdata/login/success.html b/crawler/testdata/login/success.html new file mode 100644 index 0000000..fa67f95 --- /dev/null +++ b/crawler/testdata/login/success.html @@ -0,0 +1,383 @@ + + + + + + + + + AtCoder + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ +
+
+ + + +
+
+
+
+
+

+ Contest + +

+
+
+ + + +
+

Permanent Contests

+ + +
+ + + + + + + + + + +
Contest Name
+
+
+ + +
+
+

Upcoming Contests

+ + +
+ + + + + + + + + + +
Start TimeContest Name
+
+
+ + +
+
+

Recent Contests

+ + +
+ + + + + + + + + + +
Start TimeContest Name
+
+
+ +
+

+ Detail + + +

+
+
+
+
+

+ Ranking: Algorithm + +

+
+
+
+

Last Updated:

+ + + + + + + + + + + +
UserRating
+
+

+ View all + + + Hall of Fame + + + GP30 +

+
+
+
+
+

+ Ranking: Heuristic + +

+
+
+
+

Last Updated:

+ + + + + + + + + + + +
UserRating
+
+

+ View all + + + GP30 +

+
+
+
+
+ +

Post Archive

+
+
+ + + + +
+ +
+ + + + + + +
+ + + + +
+
+
+ +
+ +
+

+ + + + + + + diff --git a/io/read_action.go b/io/read_action.go new file mode 100644 index 0000000..bf377de --- /dev/null +++ b/io/read_action.go @@ -0,0 +1,18 @@ +package io + +import ( + "bytes" + "io" +) + +func WithReadAction(r io.Reader, f func(io.Reader) error) (io.Reader, error) { + var buf bytes.Buffer + tee := io.TeeReader(r, &buf) + if err := f(tee); err != nil { + return nil, err + } + if _, err := io.Copy(io.Discard, tee); err != nil { + return nil, err + } + return &buf, nil +} diff --git a/io/types.go b/io/types.go index 747420a..69dccb1 100644 --- a/io/types.go +++ b/io/types.go @@ -6,6 +6,7 @@ type Writer = io.Writer type Reader = io.Reader var Discard = io.Discard +var EOF = io.EOF func Copy(dst Writer, src Reader) (written int64, err error) { return io.Copy(dst, src) diff --git a/timezone/timezone.go b/timezone/timezone.go new file mode 100644 index 0000000..f1f532b --- /dev/null +++ b/timezone/timezone.go @@ -0,0 +1,18 @@ +package timezone + +import ( + "log/slog" + "time" +) + +var Tokyo *time.Location + +func init() { + var err error + Tokyo, err = time.LoadLocation("Asia/Tokyo") + if err != nil { + slog.With("err", err).Error("Failed to load Asia/Tokyo timezone") + Tokyo = time.FixedZone("Asia/Tokyo", 9*60*60) + slog.Info("Using fixed timezone for Asia/Tokyo") + } +}