Skip to content

Commit

Permalink
feat: Improves proxy pool and adds configs to re-use browser page
Browse files Browse the repository at this point in the history
  • Loading branch information
gosom committed Jan 19, 2025
1 parent 0313772 commit b2bc749
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 27 deletions.
81 changes: 55 additions & 26 deletions adapters/fetchers/jshttp/jshttp.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (

var _ scrapemate.HTTPFetcher = (*jsFetch)(nil)

func New(headless, disableImages bool, rotator scrapemate.ProxyRotator) (scrapemate.HTTPFetcher, error) {
func New(headless, disableImages bool, rotator scrapemate.ProxyRotator, poolSize, pageReuseLimit, browserReuseLimit int) (scrapemate.HTTPFetcher, error) {
opts := []*playwright.RunOptions{
{
Browsers: []string{"chromium"},
Expand All @@ -20,46 +20,58 @@ func New(headless, disableImages bool, rotator scrapemate.ProxyRotator) (scrapem
return nil, err
}

const poolSize = 10

pw, err := playwright.Run()
if err != nil {
return nil, err
}

ans := jsFetch{
pw: pw,
headless: headless,
disableImages: disableImages,
pool: make(chan *browser, poolSize),
rotator: rotator,
pw: pw,
headless: headless,
disableImages: disableImages,
pool: make(chan *browser, poolSize),
rotator: rotator,
pageReuseLimit: pageReuseLimit,
browserReuseLimit: browserReuseLimit,
}

for i := 0; i < poolSize; i++ {
b, err := newBrowser(pw, headless, disableImages, rotator)
if err != nil {
_ = ans.Close()
return nil, err
}

ans.pool <- b
}

return &ans, nil
}

type jsFetch struct {
pw *playwright.Playwright
headless bool
disableImages bool
pool chan *browser
rotator scrapemate.ProxyRotator
pw *playwright.Playwright
headless bool
disableImages bool
pool chan *browser
rotator scrapemate.ProxyRotator
pageReuseLimit int
browserReuseLimit int
}

func (o *jsFetch) GetBrowser(ctx context.Context) (*browser, error) {
select {
case <-ctx.Done():
return nil, ctx.Err()
case ans := <-o.pool:
return ans, nil
default:
ans, err := newBrowser(o.pw, o.headless, o.disableImages, o.rotator)
if err != nil {
return nil, err
if ans.browser.IsConnected() && (o.browserReuseLimit <= 0 || ans.browserUsage < o.browserReuseLimit) {
return ans, nil
}

return ans, nil
ans.browser.Close()
default:
}

return newBrowser(o.pw, o.headless, o.disableImages, o.rotator)
}

func (o *jsFetch) Close() error {
Expand All @@ -75,6 +87,12 @@ func (o *jsFetch) Close() error {
}

func (o *jsFetch) PutBrowser(ctx context.Context, b *browser) {
if !b.browser.IsConnected() {
b.Close()

return
}

select {
case <-ctx.Done():
b.Close()
Expand Down Expand Up @@ -117,21 +135,32 @@ func (o *jsFetch) Fetch(ctx context.Context, job scrapemate.IJob) scrapemate.Res
Error: err,
}
}
}

// match the browser default timeout to the job timeout
if job.GetTimeout() > 0 {
page.SetDefaultTimeout(float64(job.GetTimeout().Milliseconds()))
}
// match the browser default timeout to the job timeout
if job.GetTimeout() > 0 {
page.SetDefaultTimeout(float64(job.GetTimeout().Milliseconds()))
}

defer page.Close()
browser.page0Usage++
browser.browserUsage++

defer func() {
if o.pageReuseLimit == 0 || browser.page0Usage >= o.pageReuseLimit {
_ = page.Close()

browser.page0Usage = 0
}
}()

return job.BrowserActions(ctx, page)
}

type browser struct {
browser playwright.Browser
ctx playwright.BrowserContext
browser playwright.Browser
ctx playwright.BrowserContext
page0Usage int
browserUsage int
}

func (o *browser) Close() {
Expand Down
26 changes: 26 additions & 0 deletions scrapemateapp/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,24 @@ func NewConfig(writers []scrapemate.ResultWriter, options ...func(*Config) error
return &cfg, nil
}

// WithBrowserReuseLimit returns a Config option that caps how many times
// a single browser instance may be reused before it is discarded.
// Only applicable when the JavaScript renderer is in use; a value of 0
// means the browser is reused indefinitely.
func WithBrowserReuseLimit(limit int) func(*Config) error {
	return func(cfg *Config) error {
		cfg.BrowserReuseLimit = limit
		return nil
	}
}

// WithPageReuseLimit returns a Config option that caps how many times
// a single browser page may be reused before it is closed.
// Only applicable when the JavaScript renderer is in use; a value of 0
// means the page is not reused.
func WithPageReuseLimit(limit int) func(*Config) error {
	return func(cfg *Config) error {
		cfg.PageReuseLimit = limit
		return nil
	}
}

// WithConcurrency sets the concurrency of the app.
func WithConcurrency(concurrency int) func(*Config) error {
return func(o *Config) error {
Expand Down Expand Up @@ -167,6 +185,14 @@ type Config struct {
ExitOnInactivityDuration time.Duration
// Proxies are the proxies to use for the app.
Proxies []string
// BrowserReuseLimit is the limit of browser reuse.
// Only applicable when using JavaScript renderer.
// By default it is 0, which means the browser will be reused indefinitely.
BrowserReuseLimit int
// PageReuseLimit is the limit of page reuse.
// Only applicable when using JavaScript renderer.
// By default it is 0, which means the page will not be reused.
PageReuseLimit int
}

func (o *Config) validate() error {
Expand Down
9 changes: 8 additions & 1 deletion scrapemateapp/scrapemateapp.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,14 @@ func (app *ScrapemateApp) getFetcher() (scrapemate.HTTPFetcher, error) {

switch app.cfg.UseJS {
case true:
httpFetcher, err = jsfetcher.New(!app.cfg.JSOpts.Headfull, app.cfg.JSOpts.DisableImages, rotator)
httpFetcher, err = jsfetcher.New(
!app.cfg.JSOpts.Headfull,
app.cfg.JSOpts.DisableImages,
rotator,
app.cfg.Concurrency,
app.cfg.PageReuseLimit,
app.cfg.BrowserReuseLimit,
)
if err != nil {
return nil, err
}
Expand Down

0 comments on commit b2bc749

Please sign in to comment.