Configurable scraper user agent string #409

Merged · 6 commits · Mar 20, 2020
1 change: 1 addition & 0 deletions graphql/documents/data/config.graphql
@@ -11,6 +11,7 @@ fragment ConfigGeneralData on ConfigGeneralResult {
logLevel
logAccess
excludes
scraperUserAgent
}

fragment ConfigInterfaceData on ConfigInterfaceResult {
4 changes: 4 additions & 0 deletions graphql/schema/types/config.graphql
@@ -32,6 +32,8 @@ input ConfigGeneralInput {
logAccess: Boolean!
"""Array of file regexp to exclude from Scan"""
excludes: [String!]
"""Scraper user agent string"""
scraperUserAgent: String
}

type ConfigGeneralResult {
@@ -59,6 +61,8 @@ type ConfigGeneralResult {
logAccess: Boolean!
"""Array of file regexp to exclude from Scan"""
excludes: [String!]!
"""Scraper user agent string"""
scraperUserAgent: String
}

input ConfigInterfaceInput {
4 changes: 4 additions & 0 deletions pkg/api/resolver_mutation_configure.go
@@ -76,6 +76,10 @@ func (r *mutationResolver) ConfigureGeneral(ctx context.Context, input models.Co
config.Set(config.Exclude, input.Excludes)
}

if input.ScraperUserAgent != nil {
config.Set(config.ScraperUserAgent, input.ScraperUserAgent)
}

if err := config.Write(); err != nil {
return makeConfigGeneralResult(), err
}
4 changes: 3 additions & 1 deletion pkg/api/resolver_query_configuration.go
@@ -33,6 +33,8 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
maxTranscodeSize := config.GetMaxTranscodeSize()
maxStreamingTranscodeSize := config.GetMaxStreamingTranscodeSize()

scraperUserAgent := config.GetScraperUserAgent()

return &models.ConfigGeneralResult{
Stashes: config.GetStashPaths(),
DatabasePath: config.GetDatabasePath(),
@@ -46,6 +48,7 @@ func makeConfigGeneralResult() *models.ConfigGeneralResult {
LogLevel: config.GetLogLevel(),
LogAccess: config.GetLogAccess(),
Excludes: config.GetExcludes(),
ScraperUserAgent: &scraperUserAgent,
}
}

@@ -59,7 +62,6 @@ func makeConfigInterfaceResult() *models.ConfigInterfaceResult {
cssEnabled := config.GetCSSEnabled()
language := config.GetLanguage()


return &models.ConfigInterfaceResult{
SoundOnPreview: &soundOnPreview,
WallShowTitle: &wallShowTitle,
9 changes: 8 additions & 1 deletion pkg/manager/config/config.go
@@ -22,7 +22,6 @@ const Password = "password"

const Database = "database"

const ScrapersPath = "scrapers_path"
const Exclude = "exclude"

const MaxTranscodeSize = "max_transcode_size"
@@ -32,6 +31,10 @@ const Host = "host"
const Port = "port"
const ExternalHost = "external_host"

// scraping options
const ScrapersPath = "scrapers_path"
const ScraperUserAgent = "scraper_user_agent"

// i18n
const Language = "language"

@@ -115,6 +118,10 @@ func GetScrapersPath() string {
return viper.GetString(ScrapersPath)
}

func GetScraperUserAgent() string {
return viper.GetString(ScraperUserAgent)
}

func GetHost() string {
return viper.GetString(Host)
}
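For reference, a minimal sketch of how the new key round-trips through viper (the library this config package already wraps). The constant mirrors the one added above; the user-agent value is made up:

```go
package main

import (
	"fmt"

	"github.com/spf13/viper"
)

// Mirrors the constant added in pkg/manager/config/config.go.
const ScraperUserAgent = "scraper_user_agent"

func main() {
	// Unset keys yield the empty string, which callers treat as
	// "no custom User-Agent".
	fmt.Printf("unset: %q\n", viper.GetString(ScraperUserAgent))

	viper.Set(ScraperUserAgent, "Mozilla/5.0 (compatible; stash)")
	fmt.Printf("set:   %q\n", viper.GetString(ScraperUserAgent))
}
```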
9 changes: 7 additions & 2 deletions pkg/scraper/config.go
@@ -139,6 +139,10 @@ func (c *scrapeSceneByURLConfig) resolveFn() {
}
}

type scraperDebugOptions struct {
PrintHTML bool `yaml:"printHTML"`
}

type scraperConfig struct {
ID string
Name string `yaml:"name"`
@@ -148,8 +152,9 @@ type scraperConfig struct {
SceneByFragment *sceneByFragmentConfig `yaml:"sceneByFragment"`
SceneByURL []*scrapeSceneByURLConfig `yaml:"sceneByURL"`

StashServer *stashServer `yaml:"stashServer"`
XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
DebugOptions *scraperDebugOptions `yaml:"debug"`
StashServer *stashServer `yaml:"stashServer"`
XPathScrapers xpathScrapers `yaml:"xPathScrapers"`
}

func loadScraperFromYAML(path string) (*scraperConfig, error) {
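The new `debug.printHTML` flag is read from scraper YAML files. A self-contained sketch of the expected YAML shape, using stripped-down mirrors of the structs above (the scraper name is made up, and yaml.v2 is an assumption; any YAML library honouring `yaml` struct tags behaves the same):

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Stripped-down mirrors of the structs added in pkg/scraper/config.go.
type scraperDebugOptions struct {
	PrintHTML bool `yaml:"printHTML"`
}

type scraperConfig struct {
	Name         string               `yaml:"name"`
	DebugOptions *scraperDebugOptions `yaml:"debug"`
}

func main() {
	src := []byte(`
name: ExampleScraper
debug:
  printHTML: true
`)

	var c scraperConfig
	if err := yaml.Unmarshal(src, &c); err != nil {
		panic(err)
	}
	fmt.Println(c.DebugOptions.PrintHTML) // true
}
```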
13 changes: 12 additions & 1 deletion pkg/scraper/image.go
@@ -6,6 +6,7 @@ import (
"strings"
"time"

"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/utils"
)
@@ -52,8 +53,18 @@ func getImage(url string) (*string, error) {
Timeout: imageGetTimeout,
}

req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}

userAgent := config.GetScraperUserAgent()
if userAgent != "" {
req.Header.Set("User-Agent", userAgent)
}

// assume is a URL for now
resp, err := client.Get(url)
resp, err := client.Do(req)
if err != nil {
return nil, err
}
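The change above replaces `client.Get(url)` with an explicit request so the header can be attached before sending. The same pattern in isolation, as a runnable sketch (`getScraperUserAgent` is a stand-in for `config.GetScraperUserAgent`, and the URL is a placeholder):

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

// Stand-in for config.GetScraperUserAgent; empty means "keep Go's default".
func getScraperUserAgent() string { return "stash-scraper/0.1" }

func newScrapeRequest(url string) (*http.Request, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	// Only override the User-Agent when one is configured, mirroring the
	// empty-string check in the diff.
	if ua := getScraperUserAgent(); ua != "" {
		req.Header.Set("User-Agent", ua)
	}
	return req, nil
}

func main() {
	client := &http.Client{Timeout: 30 * time.Second}

	req, err := newScrapeRequest("https://example.com/")
	if err != nil {
		panic(err)
	}

	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}
```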
52 changes: 48 additions & 4 deletions pkg/scraper/xpath.go
@@ -1,7 +1,9 @@
package scraper

import (
"bytes"
"errors"
"net/http"
"net/url"
"reflect"
"regexp"
@@ -10,11 +12,17 @@ import (

"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"

"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/manager/config"
"github.com/stashapp/stash/pkg/models"
)

// Timeout for the scrape http request. Includes transfer time. May want to make this
// configurable at some point.
const scrapeGetTimeout = time.Second * 30

type commonXPathConfig map[string]string

func (c commonXPathConfig) applyCommon(src string) string {
@@ -197,7 +205,7 @@ func (c xpathScraperAttrConfig) applySubScraper(value string) string {
return value
}

doc, err := htmlquery.LoadURL(value)
doc, err := loadURL(value, nil)

if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
@@ -504,14 +512,50 @@ func (r xPathResults) setKey(index int, key string, value string) xPathResults {
return r
}

func loadURL(url string, c *scraperConfig) (*html.Node, error) {
client := &http.Client{
Timeout: scrapeGetTimeout,
}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}

userAgent := config.GetScraperUserAgent()
if userAgent != "" {
req.Header.Set("User-Agent", userAgent)
}

resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()

r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
if err != nil {
return nil, err
}

ret, err := html.Parse(r)

if err == nil && c != nil && c.DebugOptions != nil && c.DebugOptions.PrintHTML {
var b bytes.Buffer
html.Render(&b, ret)
logger.Infof("loadURL (%s) response: \n%s", url, b.String())
}

return ret, err
}

func scrapePerformerURLXpath(c scraperTypeConfig, url string) (*models.ScrapedPerformer, error) {
scraper := c.scraperConfig.XPathScrapers[c.Scraper]

if scraper == nil {
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}

doc, err := htmlquery.LoadURL(url)
doc, err := loadURL(url, c.scraperConfig)

if err != nil {
return nil, err
@@ -527,7 +571,7 @@ func scrapeSceneURLXPath(c scraperTypeConfig, url string) (*models.ScrapedScene,
return nil, errors.New("xpath scraper with name " + c.Scraper + " not found in config")
}

doc, err := htmlquery.LoadURL(url)
doc, err := loadURL(url, c.scraperConfig)

if err != nil {
return nil, err
@@ -551,7 +595,7 @@ func scrapePerformerNamesXPath(c scraperTypeConfig, name string) ([]*models.Scra
u := c.QueryURL
u = strings.Replace(u, placeholder, escapedName, -1)

doc, err := htmlquery.LoadURL(u)
doc, err := loadURL(u, c.scraperConfig)

if err != nil {
return nil, err
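To see how the new `loadURL` slots in where `htmlquery.LoadURL` used to be, here is a hypothetical helper (not part of the PR) that would sit in package scraper alongside the code above, reusing its existing `htmlquery` and `errors` imports:

```go
// pageTitle is an illustration only: fetch a page through loadURL (which
// honours the configured User-Agent and debug options) and extract its title.
func pageTitle(url string, c *scraperConfig) (string, error) {
	doc, err := loadURL(url, c)
	if err != nil {
		return "", err
	}

	n := htmlquery.FindOne(doc, "//title")
	if n == nil {
		return "", errors.New("no <title> element found")
	}
	return htmlquery.InnerText(n), nil
}
```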
21 changes: 20 additions & 1 deletion ui/v2.5/src/components/Settings/SettingsConfigurationPanel.tsx
@@ -29,6 +29,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
const [logLevel, setLogLevel] = useState<string>("Info");
const [logAccess, setLogAccess] = useState<boolean>(true);
const [excludes, setExcludes] = useState<string[]>([]);
const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);

const { data, error, loading } = StashService.useConfiguration();

@@ -44,7 +45,8 @@ export const SettingsConfigurationPanel: React.FC = () => {
logOut,
logLevel,
logAccess,
excludes
excludes,
scraperUserAgent
});

useEffect(() => {
@@ -66,6 +68,7 @@ export const SettingsConfigurationPanel: React.FC = () => {
setLogLevel(conf.general.logLevel);
setLogAccess(conf.general.logAccess);
setExcludes(conf.general.excludes);
setScraperUserAgent(conf.general.scraperUserAgent ?? undefined);
}
}, [data, error]);

@@ -289,6 +292,22 @@ export const SettingsConfigurationPanel: React.FC = () => {

<hr />

<Form.Group id="generated-path">
<h6>Scraping</h6>
<Form.Control
className="col col-sm-6 text-input"
defaultValue={scraperUserAgent}
onChange={(e: React.FormEvent<HTMLInputElement>) =>
setScraperUserAgent(e.currentTarget.value)
}
/>
<Form.Text className="text-muted">
User-Agent string used during scrape http requests
</Form.Text>
</Form.Group>

<hr />

<Form.Group>
<h4>Authentication</h4>
<Form.Group id="username">
16 changes: 15 additions & 1 deletion ui/v2/src/components/Settings/SettingsConfigurationPanel.tsx
@@ -32,6 +32,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
const [logLevel, setLogLevel] = useState<string>("Info");
const [logAccess, setLogAccess] = useState<boolean>(true);
const [excludes, setExcludes] = useState<(string)[]>([]);
const [scraperUserAgent, setScraperUserAgent] = useState<string | undefined>(undefined);

const { data, error, loading } = StashService.useConfiguration();

@@ -48,7 +49,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
logLevel,
logAccess,
excludes,

scraperUserAgent,
});

useEffect(() => {
@@ -67,6 +68,7 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
setLogLevel(conf.general.logLevel);
setLogAccess(conf.general.logAccess);
setExcludes(conf.general.excludes);
setScraperUserAgent(conf.general.scraperUserAgent);
}
}, [data, error]);

@@ -229,6 +231,18 @@ export const SettingsConfigurationPanel: FunctionComponent<IProps> = (props: IPr
</FormGroup>
<Divider />

<FormGroup>
<H4>Scraping</H4>
<FormGroup
label="Scraper User-Agent string"
helperText="User-Agent string used during scrape http requests"
>
<InputGroup value={scraperUserAgent} onChange={(e: any) => setScraperUserAgent(e.target.value)} />
</FormGroup>
</FormGroup>

<Divider />

<FormGroup>
<H4>Authentication</H4>
<FormGroup