Configurable scraper user agent string (#409)
* Add debug scrape option.

Co-authored-by: HiddenPants255 <>
WithoutPants authored Mar 20, 2020
1 parent ff49536 commit abf2b49
Showing 10 changed files with 122 additions and 11 deletions.
1 change: 1 addition & 0 deletions graphql/documents/data/config.graphql
@@ -11,6 +11,7 @@ fragment ConfigGeneralData on ConfigGeneralResult {
logLevel
logAccess
excludes
+scraperUserAgent
}

fragment ConfigInterfaceData on ConfigInterfaceResult {
4 changes: 4 additions & 0 deletions graphql/schema/types/config.graphql
@@ -32,6 +32,8 @@ input ConfigGeneralInput {
logAccess: Boolean!
"""Array of file regexp to exclude from Scan"""
excludes: [String!]
"""Scraper user agent string"""
scraperUserAgent: String
}

type ConfigGeneralResult {
@@ -59,6 +61,8 @@ type ConfigGeneralResult {
logAccess: Boolean!
"""Array of file regexp to exclude from Scan"""
excludes: [String!]!
"""Scraper user agent string"""
scraperUserAgent: String
}

input ConfigInterfaceInput {
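The new field rides along with the existing configuration query, so any GraphQL client can read it back. A minimal sketch in Go, assuming a local stash instance on its default port 9999 with the GraphQL endpoint at /graphql and a top-level configuration query (stash defaults at the time, not introduced by this commit):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Ask only for the field added by this commit.
	payload, _ := json.Marshal(map[string]string{
		"query": "{ configuration { general { scraperUserAgent } } }",
	})
	resp, err := http.Post("http://localhost:9999/graphql", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out map[string]interface{}
	json.NewDecoder(resp.Body).Decode(&out)
	fmt.Println(out) // e.g. map[data:map[configuration:map[general:map[scraperUserAgent:<nil>]]]]
}
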
4 changes: 4 additions & 0 deletions pkg/api/resolver_mutation_configure.go
@@ -76,6 +76,10 @@ func (r *mutationResolver) ConfigureGeneral(ctx context.Context, input models.Co
config.Set(config.Exclude, input.Excludes)
}

+if input.ScraperUserAgent != nil {
+config.Set(config.ScraperUserAgent, input.ScraperUserAgent)
+}

if err := config.Write(); err != nil {
return makeConfigGeneralResult(), err
}
4 changes: 3 additions & 1 deletion pkg/api/resolver_query_configuration.go
@@ -33,6 +33,8 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
maxTranscodeSize := config.GetMaxTranscodeSize()
maxStreamingTranscodeSize := config.GetMaxStreamingTranscodeSize()

+scraperUserAgent := config.GetScraperUserAgent()

return &models.ConfigGeneralResult{
Stashes: config.GetStashPaths(),
DatabasePath: config.GetDatabasePath(),
@@ -46,6 +48,7 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
LogLevel: config.GetLogLevel(),
LogAccess: config.GetLogAccess(),
Excludes: config.GetExcludes(),
+ScraperUserAgent: &scraperUserAgent,
}
}

@@ -59,7 +62,6 @@ func makeConfigInterfaceResult() *models.ConfigInterfaceResult {
cssEnabled := config.GetCSSEnabled()
language := config.GetLanguage()

-
return &models.ConfigInterfaceResult{
SoundOnPreview: &soundOnPreview,
WallShowTitle: &wallShowTitle,
9 changes: 8 additions & 1 deletion pkg/manager/config/config.go
@@ -22,7 +22,6 @@ const Password = "password"

const Database = "database"

-const ScrapersPath = "scrapers_path"
const Exclude = "exclude"

const MaxTranscodeSize = "max_transcode_size"
@@ -32,6 +31,10 @@ const Host = "host"
const Port = "port"
const ExternalHost = "external_host"

+// scraping options
+const ScrapersPath = "scrapers_path"
+const ScraperUserAgent = "scraper_user_agent"

// i18n
const Language = "language"

@@ -115,6 +118,10 @@ func GetScrapersPath() string {
return viper.GetString(ScrapersPath)
}

+func GetScraperUserAgent() string {
+return viper.GetString(ScraperUserAgent)
+}

func GetHost() string {
return viper.GetString(Host)
}
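GetScraperUserAgent is a plain viper lookup, so an unset key reads back as the empty string; the scraper call sites below only set the User-Agent header when the value is non-empty. A self-contained sketch of that zero-value behavior (the key name is the constant defined above; the sample value is made up):

package main

import (
	"fmt"

	"github.com/spf13/viper"
)

func main() {
	const scraperUserAgent = "scraper_user_agent"

	// Unset key: viper returns the string zero value.
	fmt.Printf("unset: %q\n", viper.GetString(scraperUserAgent)) // unset: ""

	// After config.Set(...) in the mutation resolver, the same key reads back.
	viper.Set(scraperUserAgent, "Mozilla/5.0 (example)")
	fmt.Printf("set: %q\n", viper.GetString(scraperUserAgent))
}
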
9 changes: 7 additions & 2 deletions pkg/scraper/config.go
@@ -139,6 +139,10 @@ func (c *scrapeSceneByURLConfig) resolveFn() {
}
}

+type scraperDebugOptions struct {
+PrintHTML bool `yaml:"printHTML"`
+}

type scraperConfig struct {
ID string
Name string `yaml:"name"`
@@ -148,8 +152,9 @@ type scraperConfig struct {
SceneByFragment *sceneByFragmentConfig `yaml:"sceneByFragment"`
SceneByURL []*scrapeSceneByURLConfig `yaml:"sceneByURL"`

-StashServer *stashServer `yaml:"stashServer"`
-XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
+DebugOptions *scraperDebugOptions `yaml:"debug"`
+StashServer *stashServer `yaml:"stashServer"`
+XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
}

func loadScraperFromYAML(path string) (*scraperConfig, error) {
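The debug block is declared per scraper definition file. A sketch of how the yaml tags above map onto a document, using trimmed-down local copies of the unexported structs (the real loadScraperFromYAML also reads the file from disk and derives the scraper ID; gopkg.in/yaml.v2 is assumed here, matching the tag syntax):

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Local stand-ins mirroring scraperDebugOptions / scraperConfig above.
type debugOptions struct {
	PrintHTML bool `yaml:"printHTML"`
}

type scraperDef struct {
	Name  string        `yaml:"name"`
	Debug *debugOptions `yaml:"debug"`
}

func main() {
	doc := []byte("name: Example\ndebug:\n  printHTML: true\n")

	var c scraperDef
	if err := yaml.Unmarshal(doc, &c); err != nil {
		panic(err)
	}
	fmt.Println(c.Name, c.Debug.PrintHTML) // Example true
}
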
13 changes: 12 additions & 1 deletion pkg/scraper/image.go
@@ -6,6 +6,7 @@ import (
"strings"
"time"

"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/utils"
)
Expand Down Expand Up @@ -52,8 +53,18 @@ func getImage(url string) (*string, error) {
Timeout: imageGetTimeout,
}

+req, err := http.NewRequest("GET", url, nil)
+if err != nil {
+return nil, err
+}

+userAgent := config.GetScraperUserAgent()
+if userAgent != "" {
+req.Header.Set("User-Agent", userAgent)
+}

// assume is a URL for now
-resp, err := client.Get(url)
+resp, err := client.Do(req)
if err != nil {
return nil, err
}
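The switch from client.Get to an explicit http.NewRequest / client.Do pair is what makes the header injectable; when the config value is empty the header is left untouched and Go sends its stdlib default (Go-http-client/1.1). A self-contained sketch of the same pattern against a throwaway test server:

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
	"time"
)

func main() {
	// Echo whatever User-Agent the client sent.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, r.UserAgent())
	}))
	defer srv.Close()

	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", srv.URL, nil)
	if err != nil {
		panic(err)
	}

	// Mirrors getImage: only override when a value is configured.
	userAgent := "Mozilla/5.0 (example)" // stand-in for config.GetScraperUserAgent()
	if userAgent != "" {
		req.Header.Set("User-Agent", userAgent)
	}

	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := ioutil.ReadAll(resp.Body)
	fmt.Println(string(body)) // Mozilla/5.0 (example)
}
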
52 changes: 48 additions & 4 deletions pkg/scraper/xpath.go
@@ -1,7 +1,9 @@
package scraper

import (
"bytes"
"errors"
"net/http"
"net/url"
"reflect"
"regexp"
@@ -10,11 +12,17 @@ import (

"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"

"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)

+// Timeout for the scrape http request. Includes transfer time. May want to make this
+// configurable at some point.
+const scrapeGetTimeout = time.Second * 30

type commonXPathConfig map[string]string

func (c commonXPathConfig) applyCommon(src string) string {
@@ -197,7 +205,7 @@ func (c xpathScraperAttrConfig) applySubScraper(value string) string {
return value
}

-doc, err := htmlquery.LoadURL(value)
+doc, err := loadURL(value, nil)

if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
@@ -504,14 +512,50 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults {
return r
}

+func loadURL(url string, c *scraperConfig) (*html.Node, error) {
+client := &http.Client{
+Timeout: scrapeGetTimeout,
+}
+req, err := http.NewRequest("GET", url, nil)
+if err != nil {
+return nil, err
+}

+userAgent := config.GetScraperUserAgent()
+if userAgent != "" {
+req.Header.Set("User-Agent", userAgent)
+}

+resp, err := client.Do(req)
+if err != nil {
+return nil, err
+}
+defer resp.Body.Close()

+r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
+if err != nil {
+return nil, err
+}

+ret, err := html.Parse(r)

+if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
+var b bytes.Buffer
+html.Render(&b, ret)
+logger.Infof("loadURL (%s) response: \n%s", url, b.String())
+}

+return ret, err
+}

func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]

if scraper == nil {
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}

-doc, err := htmlquery.LoadURL(url)
+doc, err := loadURL(url, c.scraperConfig)

if err != nil {
return nil, err
@@ -527,7 +571,7 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene,
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}

-doc, err := htmlquery.LoadURL(url)
+doc, err := loadURL(url, c.scraperConfig)

if err != nil {
return nil, err
@@ -551,7 +595,7 @@ func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.Scra
u := c.QueryURL
u = strings.Replace(u, placeholder, escapedName, -1)

-doc, err := htmlquery.LoadURL(u)
+doc, err := loadURL(u, c.scraperConfig)

if err != nil {
return nil, err
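Besides injecting the user agent and the printHTML debug hook, loadURL differs from htmlquery.LoadURL in that it routes the body through charset.NewReader, which picks a decoder from the Content-Type header (or by sniffing the document) and hands UTF-8 to the parser. A minimal sketch of that decoding step on a Latin-1 page:

package main

import (
	"bytes"
	"fmt"

	"golang.org/x/net/html"
	"golang.org/x/net/html/charset"
)

func main() {
	// "café" encoded as Latin-1: the lone 0xE9 byte is not valid UTF-8.
	raw := []byte("<html><body>caf\xe9</body></html>")

	// charset.NewReader picks the decoder from the Content-Type hint
	// (or from sniffing the content) and yields UTF-8.
	r, err := charset.NewReader(bytes.NewReader(raw), "text/html; charset=iso-8859-1")
	if err != nil {
		panic(err)
	}

	doc, err := html.Parse(r)
	if err != nil {
		panic(err)
	}

	var b bytes.Buffer
	html.Render(&b, doc) // same render call the debug printHTML option uses
	fmt.Println(b.String()) // body text is now valid UTF-8: "café"
}
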
21 changes: 20 additions & 1 deletion ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx
@@ -29,6 +29,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
const [logLevel, setLogLevel] = useState<string>("Info");
const [logAccess, setLogAccess] = useState<boolean>(true);
const [excludes, setExcludes] = useState<string[]>([]);
+const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);

const { data, error, loading } = StashService.useConfiguration();

@@ -44,7 +45,8 @@ export const SettingsConfigurationPanel: React.FC = () => {
logOut,
logLevel,
logAccess,
-excludes
+excludes,
+scraperUserAgent
});

useEffect(() => {
@@ -66,6 +68,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
setLogLevel(conf.general.logLevel);
setLogAccess(conf.general.logAccess);
setExcludes(conf.general.excludes);
+setScraperUserAgent(conf.general.scraperUserAgent ?? undefined);
}
}, [data, error]);

@@ -289,6 +292,22 @@ export const SettingsConfigurationPanel: React.FC = () => {

<hr />

<Form.Group id="generated-path">
<h6>Scraping</h6>
<Form.Control
className="col col-sm-6 text-input"
defaultValue={scraperUserAgent}
onChange={(e: React.FormEvent<HTMLInputElement>) =>
setScraperUserAgent(e.currentTarget.value)
}
/>
<Form.Text className="text-muted">
User-Agent string used during scrape http requests
</Form.Text>
</Form.Group>

<hr />

<Form.Group>
<h4>Authentication</h4>
<Form.Group id="username">
16 changes: 15 additions & 1 deletion ui/v2/src/components/Settings/SettingsConfigurationPanel.tsx
@@ -32,6 +32,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
const [logLevel, setLogLevel] = useState<string>("Info");
const [logAccess, setLogAccess] = useState<boolean>(true);
const [excludes, setExcludes] = useState<(string)[]>([]);
+const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);

const { data, error, loading } = StashService.useConfiguration();

@@ -48,7 +49,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
logLevel,
logAccess,
excludes,
-
+scraperUserAgent,
});

useEffect(() => {
Expand All @@ -67,6 +68,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
setLogLevel(conf.general.logLevel);
setLogAccess(conf.general.logAccess);
setExcludes(conf.general.excludes);
+setScraperUserAgent(conf.general.scraperUserAgent);
}
}, [data, error]);

@@ -229,6 +231,18 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
</FormGroup>
<Divider />

<FormGroup>
<H4>Scraping</H4>
<FormGroup
label="Scraper User-Agent string"
helperText="User-Agent string used during scrape http requests"
>
<InputGroup value={scraperUserAgent} onChange={(e: any) => setScraperUserAgent(e.target.value)} />
</FormGroup>
</FormGroup>

<Divider />

<FormGroup>
<H4>Authentication</H4>
<FormGroup
