Skip to content

Commit

Permalink
Added support of ignoring page resources (MontFerret#592)
Browse files Browse the repository at this point in the history
* Added support of ignoring page resources

* Updated pipeline scripts

* Updated comments
  • Loading branch information
ziflex authored and bundleman committed Apr 5, 2021
1 parent 1fedc3e commit cc5f10c
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
sudo curl -o /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
export CLASSPATH=".:/usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar:$CLASSPATH"
mkdir $HOME/antlr-bin
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar "$@"' > $HOME/antlr-bin/antlr
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-4.9-complete.jar "$@"' > $HOME/antlr-bin/antlr
echo -e '#!/bin/bash\njava org.antlr.v4.gui.TestRig "$@"' > $HOME/antlr-bin/grun
chmod +x $HOME/antlr-bin/*
export PATH=$PATH:$HOME/antlr-bin
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
sudo curl -o /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
export CLASSPATH=".:/usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar:$CLASSPATH"
mkdir $HOME/antlr-bin
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar "$@"' > $HOME/antlr-bin/antlr
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-4.9-complete.jar "$@"' > $HOME/antlr-bin/antlr
echo -e '#!/bin/bash\njava org.antlr.v4.gui.TestRig "$@"' > $HOME/antlr-bin/grun
chmod +x $HOME/antlr-bin/*
sudo ln -s $HOME/antlr-bin/antlr /usr/local/bin/antlr
Expand Down
13 changes: 13 additions & 0 deletions examples/disable-images.fql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Opens the page with the CDP driver while disabling image resources:
// the single filter below targets every URL ("*") of type "image".
LET doc = DOCUMENT("https://www.gettyimages.com/", {
    driver: "cdp",
    disable: {
        resources: [
            {
                url: "*",
                type: "image"
            }
        ]
    }
})

// The example only demonstrates page loading, so nothing is returned.
RETURN NONE
1 change: 0 additions & 1 deletion pkg/drivers/cdp/events/loop.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ func (loop *Loop) RemoveListener(eventID ID, listenerID ListenerID) {

// run starts running an event loop.
// It constantly iterates over each event source.
// Additionally to that, on each iteration it checks the command channel in order to perform add/remove listener/source operations.
func (loop *Loop) run(ctx context.Context) {
sources := loop.sources
size := sources.Size()
Expand Down
1 change: 1 addition & 0 deletions pkg/drivers/cdp/network/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ import "github.com/MontFerret/ferret/pkg/drivers/cdp/events"
// eventFrameLoad is the event ID created for the "frame_load" event.
var eventFrameLoad = events.New("frame_load")

// responseReceived is the event ID created for the "response_received" event.
var responseReceived = events.New("response_received")

// requestPaused is the event ID created for the "request_paused" event.
var requestPaused = events.New("request_paused")
75 changes: 75 additions & 0 deletions pkg/drivers/cdp/network/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"sync"

"github.com/mafredri/cdp"
"github.com/mafredri/cdp/protocol/fetch"
"github.com/mafredri/cdp/protocol/network"
"github.com/mafredri/cdp/protocol/page"
"github.com/mafredri/cdp/rpcc"
Expand Down Expand Up @@ -37,13 +38,15 @@ type (
eventLoop *events.Loop
cancel context.CancelFunc
responseListenerID events.ListenerID
filterListenerID events.ListenerID
response *sync.Map
}
)

func New(
logger *zerolog.Logger,
client *cdp.Client,
options Options,
) (*Manager, error) {
ctx, cancel := context.WithCancel(context.Background())

Expand All @@ -55,6 +58,20 @@ func New(
m.cancel = cancel
m.response = new(sync.Map)

if len(options.Cookies) > 0 {
for url, cookies := range options.Cookies {
if err := m.setCookiesInternal(ctx, url, cookies); err != nil {
return nil, err
}
}
}

if len(options.Headers) > 0 {
if err := m.setHeadersInternal(ctx, options.Headers); err != nil {
return nil, err
}
}

var err error

closers := make([]io.Closer, 0, 10)
Expand Down Expand Up @@ -87,6 +104,32 @@ func New(

m.responseListenerID = m.eventLoop.AddListener(responseReceived, m.onResponse)

if len(options.Filter.Patterns) > 0 {
el2 := events.NewLoop()

err = m.client.Fetch.Enable(ctx, toFetchArgs(options.Filter.Patterns))

if err != nil {
return nil, err
}

requestPausedStream, err := m.client.Fetch.RequestPaused(ctx)

if err != nil {
return nil, err
}

el2.AddSource(events.NewSource(requestPaused, requestPausedStream, func(stream rpcc.Stream) (interface{}, error) {
return stream.(fetch.RequestPausedClient).Recv()
}))

m.filterListenerID = el2.AddListener(requestPaused, m.onRequestPaused)

// run in a separate loop in order to get higher priority
// TODO: Consider adding support of event priorities to EventLoop
el2.Run(ctx)
}

m.eventLoop.Run(ctx)

return m, nil
Expand Down Expand Up @@ -128,6 +171,10 @@ func (m *Manager) SetCookies(ctx context.Context, url string, cookies drivers.HT
m.mu.Lock()
defer m.mu.Unlock()

return m.setCookiesInternal(ctx, url, cookies)
}

func (m *Manager) setCookiesInternal(ctx context.Context, url string, cookies drivers.HTTPCookies) error {
if len(cookies) == 0 {
return nil
}
Expand Down Expand Up @@ -176,6 +223,10 @@ func (m *Manager) SetHeaders(ctx context.Context, headers drivers.HTTPHeaders) e
m.mu.Lock()
defer m.mu.Unlock()

return m.setHeadersInternal(ctx, headers)
}

func (m *Manager) setHeadersInternal(ctx context.Context, headers drivers.HTTPHeaders) error {
if len(headers) == 0 {
return nil
}
Expand Down Expand Up @@ -431,3 +482,27 @@ func (m *Manager) onResponse(_ context.Context, message interface{}) (out bool)

return
}

// onRequestPaused is the listener for paused requests. It fails every paused
// request with the "blocked by client" error reason.
//
// Messages that are not a *fetch.RequestPausedReply are ignored. A failure to
// terminate the request is logged together with the request's resource type
// and URL; it is not propagated.
//
// The result is always true.
// NOTE(review): presumably true tells the event loop to keep this listener
// registered - confirm against events.Loop.
func (m *Manager) onRequestPaused(ctx context.Context, message interface{}) (out bool) {
	paused, isPaused := message.(*fetch.RequestPausedReply)
	if !isPaused {
		return true
	}

	args := &fetch.FailRequestArgs{
		RequestID:   paused.RequestID,
		ErrorReason: network.ErrorReasonBlockedByClient,
	}

	if err := m.client.Fetch.FailRequest(ctx, args); err != nil {
		m.logger.
			Err(err).
			Str("resourceType", paused.ResourceType.String()).
			Str("url", paused.Request.URL).
			Msg("failed to terminate a request")
	}

	return true
}
37 changes: 37 additions & 0 deletions pkg/drivers/cdp/network/options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package network

import (
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/mafredri/cdp/protocol/fetch"
)

// Cookies groups HTTP cookies by a string key.
// NOTE(review): the key is presumably the URL the cookies belong to - confirm
// against the callers that populate it.
type Cookies map[string]drivers.HTTPCookies

// Filter holds the resource filters used to select page requests.
type Filter struct {
	// Patterns lists the resource filters; each one carries a URL pattern
	// and a resource type alias.
	Patterns []drivers.ResourceFilter
}

// Options bundles the settings of the network manager.
type Options struct {
	// Cookies are the cookies to set, grouped by key.
	Cookies Cookies
	// Headers are the HTTP headers to set.
	Headers drivers.HTTPHeaders
	// Filter selects the resources to filter out.
	Filter Filter
}

// toFetchArgs converts resource filters into the arguments used to enable the
// CDP Fetch domain. It produces exactly one request pattern per filter, in the
// same order as the input.
//
// Each pattern points at its own copies of the URL and resource type. Taking
// the address of a field of the range variable instead would, on Go versions
// before 1.22 (where the range variable is shared by all iterations), make
// every pattern reference the URL of the last filter.
//
// NOTE(review): a filter whose type alias is unknown is passed along with
// whatever toResourceType returns for it; verify how the browser treats an
// unset resource type.
func toFetchArgs(filterPatterns []drivers.ResourceFilter) *fetch.EnableArgs {
	patterns := make([]fetch.RequestPattern, 0, len(filterPatterns))

	for _, filter := range filterPatterns {
		// Per-iteration locals: their addresses stay unique to this pattern.
		urlPattern := filter.URL
		resourceType := toResourceType(filter.Type)

		patterns = append(patterns, fetch.RequestPattern{
			URLPattern:   &urlPattern,
			ResourceType: &resourceType,
		})
	}

	return &fetch.EnableArgs{
		Patterns: patterns,
	}
}
37 changes: 37 additions & 0 deletions pkg/drivers/cdp/network/resources.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package network

import "github.com/mafredri/cdp/protocol/network"

// resourceTypeMapping translates resource type aliases into CDP network
// resource types. Several aliases may refer to the same resource type
// (e.g. "stylesheet" and "css"). Keys are matched exactly as written.
var resourceTypeMapping = map[string]network.ResourceType{
	// Documents and their static assets.
	"document":   network.ResourceTypeDocument,
	"stylesheet": network.ResourceTypeStylesheet,
	"css":        network.ResourceTypeStylesheet,
	"script":     network.ResourceTypeScript,
	"js":         network.ResourceTypeScript,
	"image":      network.ResourceTypeImage,
	"media":      network.ResourceTypeMedia,
	"font":       network.ResourceTypeFont,
	"texttrack":  network.ResourceTypeTextTrack,
	"manifest":   network.ResourceTypeManifest,

	// Requests issued by the page at runtime.
	"xhr":         network.ResourceTypeXHR,
	"ajax":        network.ResourceTypeXHR,
	"fetch":       network.ResourceTypeFetch,
	"eventsource": network.ResourceTypeEventSource,
	"websocket":   network.ResourceTypeWebSocket,
	"ping":        network.ResourceTypePing,

	// Everything else.
	"sxg":                network.ResourceTypeSignedExchange,
	"cspViolationReport": network.ResourceTypeCSPViolationReport,
	"other":              network.ResourceTypeOther,
}

// toResourceType resolves a resource type alias through resourceTypeMapping.
// It returns network.ResourceTypeNotSet when the alias is not in the mapping.
func toResourceType(alias string) network.ResourceType {
	if resourceType, known := resourceTypeMapping[alias]; known {
		return resourceType
	}

	return network.ResourceTypeNotSet
}
19 changes: 11 additions & 8 deletions pkg/drivers/cdp/page.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,22 @@ func LoadHTMLPage(
}
}()

netManager, err := net.New(logger, client)

if err != nil {
return nil, err
netOpts := net.Options{
Headers: params.Headers,
}

err = netManager.SetCookies(ctx, params.URL, params.Cookies)
if len(params.Cookies) > 0 {
netOpts.Cookies = make(map[string]drivers.HTTPCookies)
netOpts.Cookies[params.URL] = params.Cookies
}

if err != nil {
return nil, err
if params.Disable != nil {
if len(params.Disable.Resources) > 0 {
netOpts.Filter.Patterns = params.Disable.Resources
}
}

err = netManager.SetHeaders(ctx, params.Headers)
netManager, err := net.New(logger, client, netOpts)

if err != nil {
return nil, err
Expand Down

0 comments on commit cc5f10c

Please sign in to comment.