Skip to content

Commit

Permalink
fixup! #78 Fix raise error for not a html
Browse files Browse the repository at this point in the history
  • Loading branch information
meian committed Jul 26, 2024
1 parent a8c5100 commit 8d69aa9
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 18 deletions.
53 changes: 35 additions & 18 deletions crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import (
"bytes"
"context"
"fmt"
"io"
"log/slog"
"net/http"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/meian/atgo/io"
"github.com/meian/atgo/logs"
"github.com/meian/atgo/url"
"github.com/pkg/errors"
Expand Down Expand Up @@ -142,15 +142,15 @@ func (c Crawler) doHTTPRequest(ctx context.Context, method, contentType, url str
func (c Crawler) documentFromReader(ctx context.Context, respBody io.Reader) (*goquery.Document, error) {
logger := logs.FromContext(ctx)
logger.Debug("parsing document from response")
body, err := io.ReadAll(respBody)

respBody, err := io.WithReadAction(respBody, func(r io.Reader) error {
return c.validHTML(ctx, r)
})
if err != nil {
logger.Error(err.Error())
return nil, errors.New("failed to read response body")
}
if !c.validHTML(string(body)) {
return nil, errors.New("response is not a valid HTML")
}
respBody = bytes.NewReader(body)

doc, err := goquery.NewDocumentFromReader(respBody)
if err != nil {
logger.Error(err.Error())
Expand All @@ -161,19 +161,36 @@ func (c Crawler) documentFromReader(ctx context.Context, respBody io.Reader) (*g
return doc, nil
}

func (Crawler) validHTML(body string) bool {
htmlPos := strings.Index(body, "<html")
if htmlPos < 0 {
return false
}
body = body[htmlPos:]
headPos := strings.Index(body, "<head")
if headPos < 0 {
return false
func (Crawler) validHTML(ctx context.Context, reader io.Reader) error {
logger := logs.FromContext(ctx)

tags := [][]byte{[]byte("<html"), []byte("<head"), []byte("<body")}
buf := make([]byte, 4096)
var content []byte

for {
n, err := reader.Read(buf)
if err != nil {
if err == io.EOF {
return errors.New("failed to find HTML tags")
}
logger.Error(err.Error())
return errors.New("failed to read response body")
}
content = append(content, buf[:n]...)

for {
pos := bytes.Index(content, tags[0])
if pos < 0 {
break
}
content = content[pos:]
tags = tags[1:]
if len(tags) == 0 {
return nil
}
}
}
body = body[headPos:]
bodyPos := strings.Index(body, "<body")
return bodyPos >= 0
}

func (c Crawler) LoggedIn(ctx context.Context, doc *goquery.Document) bool {
Expand Down
16 changes: 16 additions & 0 deletions io/read_action.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package io

import (
"bytes"
"io"
)

// WithReadAction lets f inspect the contents of r without consuming them.
// Every byte f reads is captured into an internal buffer; after f returns,
// the rest of r is drained into the same buffer, and a new Reader replaying
// the complete original stream is returned.
// If f fails, or draining the remainder of r fails, the error is returned
// and the stream must be considered unusable.
func WithReadAction(r io.Reader, f func(io.Reader) error) (io.Reader, error) {
	var buf bytes.Buffer
	// Tee every byte f reads into buf so it can be replayed later.
	tee := io.TeeReader(r, &buf)
	if err := f(tee); err != nil {
		return nil, err
	}
	// Drain whatever f left unread so buf holds the full stream.
	// The drain error must not be ignored: swallowing it would hand the
	// caller a silently truncated replay of the response body.
	if _, err := io.Copy(io.Discard, tee); err != nil {
		return nil, err
	}
	return &buf, nil
}
1 change: 1 addition & 0 deletions io/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ type Writer = io.Writer
type Reader = io.Reader

var Discard = io.Discard
var EOF = io.EOF

func Copy(dst Writer, src Reader) (written int64, err error) {
return io.Copy(dst, src)
Expand Down

0 comments on commit 8d69aa9

Please sign in to comment.