Skip to content

Commit

Permalink
feat: Adds support to customize playwright's user agent
Browse files Browse the repository at this point in the history
  • Loading branch information
gosom committed Jan 25, 2025
1 parent b2bc749 commit 7ca931c
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 19 deletions.
44 changes: 33 additions & 11 deletions adapters/fetchers/jshttp/jshttp.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,17 @@ import (

var _ scrapemate.HTTPFetcher = (*jsFetch)(nil)

func New(headless, disableImages bool, rotator scrapemate.ProxyRotator, poolSize, pageReuseLimit, browserReuseLimit int) (scrapemate.HTTPFetcher, error) {
// JSFetcherOptions groups the settings New accepts when building the
// Playwright-backed fetcher.
type JSFetcherOptions struct {
// Headless is forwarded to the browser launch options' Headless flag.
Headless bool
// DisableImages is handed to newBrowser, which branches on it while
// assembling the browser launch options.
DisableImages bool
// Rotator is the proxy rotator handed to every browser the fetcher creates.
Rotator scrapemate.ProxyRotator
// PoolSize is both the capacity of the browser pool channel and the number
// of browsers New launches up front.
PoolSize int
// PageReuseLimit is stored on the fetcher as pageReuseLimit.
// NOTE(review): presumably caps how often a page is reused — confirm where it is consumed.
PageReuseLimit int
// BrowserReuseLimit is stored on the fetcher as browserReuseLimit.
// NOTE(review): presumably caps how often a browser is reused — confirm where it is consumed.
BrowserReuseLimit int
// UserAgent overrides the user agent of new browser contexts. When empty,
// newBrowser falls back to its built-in default Chrome user agent string.
UserAgent string
}

func New(params JSFetcherOptions) (scrapemate.HTTPFetcher, error) {
opts := []*playwright.RunOptions{
{
Browsers: []string{"chromium"},
Expand All @@ -27,16 +37,17 @@ func New(headless, disableImages bool, rotator scrapemate.ProxyRotator, poolSize

ans := jsFetch{
pw: pw,
headless: headless,
disableImages: disableImages,
pool: make(chan *browser, poolSize),
rotator: rotator,
pageReuseLimit: pageReuseLimit,
browserReuseLimit: browserReuseLimit,
headless: params.Headless,
disableImages: params.DisableImages,
pool: make(chan *browser, params.PoolSize),
rotator: params.Rotator,
pageReuseLimit: params.PageReuseLimit,
browserReuseLimit: params.BrowserReuseLimit,
ua: params.UserAgent,
}

for i := 0; i < poolSize; i++ {
b, err := newBrowser(pw, headless, disableImages, rotator)
for i := 0; i < params.PoolSize; i++ {
b, err := newBrowser(pw, params.Headless, params.DisableImages, params.Rotator, params.UserAgent)
if err != nil {
_ = ans.Close()
return nil, err
Expand All @@ -56,6 +67,7 @@ type jsFetch struct {
rotator scrapemate.ProxyRotator
pageReuseLimit int
browserReuseLimit int
ua string
}

func (o *jsFetch) GetBrowser(ctx context.Context) (*browser, error) {
Expand All @@ -71,7 +83,7 @@ func (o *jsFetch) GetBrowser(ctx context.Context) (*browser, error) {
default:
}

return newBrowser(o.pw, o.headless, o.disableImages, o.rotator)
return newBrowser(o.pw, o.headless, o.disableImages, o.rotator, o.ua)
}

func (o *jsFetch) Close() error {
Expand Down Expand Up @@ -168,7 +180,7 @@ func (o *browser) Close() {
_ = o.browser.Close()
}

func newBrowser(pw *playwright.Playwright, headless, disableImages bool, rotator scrapemate.ProxyRotator) (*browser, error) {
func newBrowser(pw *playwright.Playwright, headless, disableImages bool, rotator scrapemate.ProxyRotator, ua string) (*browser, error) {
opts := playwright.BrowserTypeLaunchOptions{
Headless: playwright.Bool(headless),
Args: []string{
Expand All @@ -189,6 +201,7 @@ func newBrowser(pw *playwright.Playwright, headless, disableImages bool, rotator
`--disable-default-apps`,
`--disable-notifications`,
`--disable-webgl`,
`--disable-blink-features=AutomationControlled`,
},
}
if disableImages {
Expand All @@ -204,6 +217,15 @@ func newBrowser(pw *playwright.Playwright, headless, disableImages bool, rotator
const defaultWidth, defaultHeight = 1920, 1080

bctx, err := br.NewContext(playwright.BrowserNewContextOptions{
UserAgent: func() *string {
if ua == "" {
defaultUA := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

return &defaultUA
}

return &ua
}(),
Viewport: &playwright.Size{
Width: defaultWidth,
Height: defaultHeight,
Expand Down
9 changes: 9 additions & 0 deletions scrapemateapp/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,20 @@ func Headfull() func(*jsOptions) {
}
}

// DisableImages returns a JS option that switches on the DisableImages flag
// of the options it is applied to.
func DisableImages() func(*jsOptions) {
	apply := func(opts *jsOptions) {
		opts.DisableImages = true
	}

	return apply
}

// WithUA returns a JS option that records ua as the browser's user agent
// in the UA field of the options it is applied to.
func WithUA(ua string) func(*jsOptions) {
	apply := func(opts *jsOptions) {
		opts.UA = ua
	}

	return apply
}

// WithExitOnInactivity sets the duration after which the app will exit if there are no more jobs to run.
func WithExitOnInactivity(duration time.Duration) func(*Config) error {
return func(o *Config) error {
Expand All @@ -146,6 +154,7 @@ type jsOptions struct {
// By default, the browser is run in headless mode.
Headfull bool
DisableImages bool
UA string
}

type Config struct {
Expand Down
19 changes: 11 additions & 8 deletions scrapemateapp/scrapemateapp.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,17 @@ func (app *ScrapemateApp) getFetcher() (scrapemate.HTTPFetcher, error) {

switch app.cfg.UseJS {
case true:
httpFetcher, err = jsfetcher.New(
!app.cfg.JSOpts.Headfull,
app.cfg.JSOpts.DisableImages,
rotator,
app.cfg.Concurrency,
app.cfg.PageReuseLimit,
app.cfg.BrowserReuseLimit,
)
jsParams := jsfetcher.JSFetcherOptions{
Headless: !app.cfg.JSOpts.Headfull,
DisableImages: app.cfg.JSOpts.DisableImages,
Rotator: rotator,
PoolSize: app.cfg.Concurrency,
PageReuseLimit: app.cfg.PageReuseLimit,
BrowserReuseLimit: app.cfg.BrowserReuseLimit,
UserAgent: app.cfg.JSOpts.UA,
}

httpFetcher, err = jsfetcher.New(jsParams)
if err != nil {
return nil, err
}
Expand Down

0 comments on commit 7ca931c

Please sign in to comment.