Skip to content

Commit

Permalink
add context support to LoadURL* func
Browse files Browse the repository at this point in the history
  • Loading branch information
sndnvaps committed Dec 2, 2021
1 parent 0a60b4a commit f99bb02
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 16 deletions.
31 changes: 25 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,39 @@ htmlquery is an XPath query package for HTML, lets you extract data or evaluate
Installation
====

> $ go get github.com/antchfx/htmlquery
> $ go get github.com/Aiicy/htmlquery
Getting Started
====

#### Load HTML document from URL.

```go
doc, err := htmlquery.LoadURL("http://example.com/")
ctx := context.Background()
ctx, cancel := context.WithTimeout(ctx, time.Second)
defer cancel()
doc, err := htmlquery.LoadURL(ctx,"http://example.com/")
```

#### Load HTML document from URL with custom headers.

```go
ctx := context.Background()
ctx, cancel := context.WithTimeout(ctx, time.Second)
defer cancel()
header := map[string]string {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}
doc,err := htmlquery.LoadURLWithHeader("http://example.com/",header)
doc,err := htmlquery.LoadURLWithHeader(ctx,"http://example.com/",header)
```

#### Load HTML document from URL through a proxy.

```go

doc,err := htmlquery.LoadURLWithProxy("http://example.com/","http://proxyip:proxyport")
ctx := context.Background()
ctx, cancel := context.WithTimeout(ctx, time.Second)
defer cancel()
doc,err := htmlquery.LoadURLWithProxy(ctx,"http://example.com/","http://proxyip:proxyport")
```

#### Load HTML document from string.
Expand Down Expand Up @@ -77,8 +85,19 @@ Quick Tutorial
===

```go
package main
import (
"fmt"
"context"

"github.com/Aiicy/htmlquery"
)

func main() {
doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
ctx := context.Background()
ctx, cancel := context.WithTimeout(ctx, time.Second)
defer cancel()
doc, err := htmlquery.LoadURL(ctx,"https://www.bing.com/search?q=golang")
if err != nil {
panic(err)
}
Expand Down
39 changes: 30 additions & 9 deletions query.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package htmlquery

import (
"bytes"
"context"
"fmt"
"io"
"net/http"
Expand Down Expand Up @@ -67,22 +68,32 @@ func FindEach(top *html.Node, expr string, cb func(int, *html.Node)) error {
}

// LoadURL loads the HTML document from the specified URL.
func LoadURL(url string) (*html.Node, error) {
resp, err := http.Get(url)
func LoadURL(ctx context.Context, url string) (*html.Node, error) {
Ctx, cancel := context.WithCancel(ctx)
defer cancel()
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return nil, err
}
defer resp.Body.Close()

r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
req = req.WithContext(Ctx)
res, err := http.DefaultClient.Do(req)
if err != nil {
return nil, err
}
defer res.Body.Close()

r, err := charset.NewReader(res.Body, res.Header.Get("Content-Type"))
if err != nil {
return nil, err
}
return html.Parse(r)
}

// LoadURLWithHeader loads the HTML document from the specified URL with the given HTTP headers.
func LoadURLWithHeader(link string, headers map[string]string) (*html.Node, error) {
func LoadURLWithHeader(ctx context.Context, link string, headers map[string]string) (*html.Node, error) {
Ctx, cancel := context.WithCancel(ctx)
defer cancel()
client := &http.Client{}
request, err := http.NewRequest("GET", link, nil)
for k, v := range headers {
Expand All @@ -92,6 +103,7 @@ func LoadURLWithHeader(link string, headers map[string]string) (*html.Node, erro
return nil, err
}

request = request.WithContext(Ctx)
resp, err := client.Do(request)
if err != nil {
return nil, err
Expand All @@ -106,21 +118,30 @@ func LoadURLWithHeader(link string, headers map[string]string) (*html.Node, erro
}

// LoadURLWithProxy loads the HTML document from the specified URL with Proxy.
func LoadURLWithProxy(link string, proxy string) (*html.Node, error) {
func LoadURLWithProxy(ctx context.Context, link string, proxy string) (*html.Node, error) {
Ctx, cancel := context.WithCancel(ctx)
defer cancel()

proxyUrl, err := url.Parse(proxy) //proxy = http://proxyIp:proxyPort
Client := &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(proxyUrl),
},
}

resp, err := Client.Get(link)
req, err := http.NewRequest(http.MethodGet, link, nil)
if err != nil {
return nil, err
}
defer resp.Body.Close()

r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
req = req.WithContext(Ctx)
res, err := Client.Do(req)
if err != nil {
return nil, err
}
defer res.Body.Close()

r, err := charset.NewReader(res.Body, res.Header.Get("Content-Type"))
if err != nil {
return nil, err
}
Expand Down
8 changes: 7 additions & 1 deletion query_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package htmlquery

import (
"context"
"strings"
"testing"
"time"

"github.com/antchfx/xpath"
"golang.org/x/net/html"
Expand Down Expand Up @@ -40,7 +42,11 @@ const htmlSample = `<!DOCTYPE html><html lang="en-US">
var testDoc, _ = loadHTML(htmlSample)

func TestHttpLoad(t *testing.T) {
doc, err := LoadURL("http://www.bing.com")
ctx := context.Background()
ctx, cancel := context.WithTimeout(ctx, time.Second)
defer cancel()

doc, err := LoadURL(ctx, "http://www.bing.com")
if err != nil {
t.Fatal(err)
}
Expand Down

0 comments on commit f99bb02

Please sign in to comment.