Skip to content

Commit

Permalink
Fix scraping multiple URLs (#5677)
Browse files Browse the repository at this point in the history
* Hack fix for scraping URLs field
* Rewrite apply function using known value types
  • Loading branch information
WithoutPants authored Feb 25, 2025
1 parent 587fd9e commit 1e05766
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 46 deletions.
145 changes: 103 additions & 42 deletions pkg/scraper/mapped.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,17 @@ func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
return ret
}

func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig) mappedResults {
// isMultiFunc reports whether the values found for the given attribute key
// should be stored together as one multi-value ([]string) entry instead of
// being spread across one result per value. A nil isMultiFunc is treated by
// process as "no key is multi-valued".
type isMultiFunc func(key string) bool

func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig, isMulti isMultiFunc) mappedResults {
var ret mappedResults

for k, attrConfig := range s {

if attrConfig.Fixed != "" {
// TODO - not sure if this needs to set _all_ indexes for the key
const i = 0
ret = ret.setKey(i, k, attrConfig.Fixed)
ret = ret.setSingleValue(i, k, attrConfig.Fixed)
} else {
selector := attrConfig.Selector
selector = s.applyCommon(common, selector)
Expand All @@ -63,8 +65,15 @@ func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonM

if len(found) > 0 {
result := s.postProcess(ctx, q, attrConfig, found)
for i, text := range result {
ret = ret.setKey(i, k, text)

// HACK - if the key is URLs, then we need to set the value as a multi-value
isMulti := isMulti != nil && isMulti(k)
if isMulti {
ret = ret.setMultiValue(0, k, result)
} else {
for i, text := range result {
ret = ret.setSingleValue(i, k, text)
}
}
}
}
Expand Down Expand Up @@ -845,37 +854,82 @@ type mappedScraper struct {
Movie *mappedMovieScraperConfig `yaml:"movie"`
}

type mappedResult map[string]string
type mappedResult map[string]interface{}
type mappedResults []mappedResult

func (r mappedResult) apply(dest interface{}) {
destVal := reflect.ValueOf(dest)

// dest should be a pointer
destVal = destVal.Elem()
destVal := reflect.ValueOf(dest).Elem()

// all fields are either string pointers or string slices
for key, value := range r {
field := destVal.FieldByName(key)

if field.IsValid() {
var reflectValue reflect.Value
if field.Kind() == reflect.Ptr {
// need to copy the value, otherwise everything is set to the
// same pointer
localValue := value
reflectValue = reflect.ValueOf(&localValue)
if err := mapFieldValue(destVal, key, value); err != nil {
logger.Errorf("Error mapping field %s in %T: %v", key, dest, err)
}
}
}

// mapFieldValue sets the struct field named key on destVal from value.
//
// Supported conversions:
//   - string   -> string, *string, or []string (as a one-element slice)
//   - []string -> []string
//   - any other type -> the field type, or a pointer to the field's element
//     type, when reflect reports the value's type as convertible
//
// destVal must be a struct value, and it must be addressable (for example
// the Elem of a pointer to the struct) for its fields to be settable.
//
// An error is returned when the field does not exist, cannot be set, or when
// value (including a nil value) cannot be converted to the field's type.
func mapFieldValue(destVal reflect.Value, key string, value interface{}) error {
	field := destVal.FieldByName(key)

	// FieldByName yields the zero Value for an unknown field, and calling
	// Type on a zero Value panics, so validate before inspecting the field.
	if !field.IsValid() || !field.CanSet() {
		return fmt.Errorf("field does not exist or cannot be set")
	}

	fieldType := field.Type()
	isStringSlice := fieldType.Kind() == reflect.Slice && fieldType.Elem().Kind() == reflect.String

	switch v := value.(type) {
	case string:
		switch {
		case fieldType.Kind() == reflect.String:
			field.SetString(v)
		case fieldType.Kind() == reflect.Pointer && fieldType.Elem().Kind() == reflect.String:
			// allocate a fresh string for every field so that no two fields
			// end up sharing the same pointer
			ptr := reflect.New(fieldType.Elem())
			ptr.Elem().SetString(v)
			field.Set(ptr)
		case isStringSlice:
			field.Set(reflect.ValueOf([]string{v}))
		default:
			return fmt.Errorf("cannot convert %T to %s", value, fieldType)
		}
	case []string:
		// only string slice fields can receive a multi-value
		if !isStringSlice {
			return fmt.Errorf("cannot convert %T to %s", value, fieldType)
		}
		field.Set(reflect.ValueOf(v))
	default:
		// fallback to reflection
		reflectValue := reflect.ValueOf(value)

		// reflect.ValueOf(nil) is the zero Value, whose Type method panics
		if !reflectValue.IsValid() {
			return fmt.Errorf("cannot convert %T to %s", value, fieldType)
		}

		reflectValueType := reflectValue.Type()

		switch {
		case reflectValueType.ConvertibleTo(fieldType):
			field.Set(reflectValue.Convert(fieldType))
		case fieldType.Kind() == reflect.Pointer && reflectValueType.ConvertibleTo(fieldType.Elem()):
			ptr := reflect.New(fieldType.Elem())
			ptr.Elem().Set(reflectValue.Convert(fieldType.Elem()))
			field.Set(ptr)
		default:
			return fmt.Errorf("cannot convert %T to %s", value, fieldType)
		}
	}

	return nil
}

// setSingleValue stores value under key in the result at position index and
// returns the (possibly grown) results slice.
//
// At most one new result is appended per call, so index must not exceed
// len(r); callers are expected to fill indexes in ascending order.
func (r mappedResults) setSingleValue(index int, key string, value string) mappedResults {
	results := r
	if len(results) <= index {
		// first value seen for this index: create its result map
		results = append(results, mappedResult{})
	}

	logger.Debugf(`[%d][%s] = %s`, index, key, value)

	entry := results[index]
	entry[key] = value
	return results
}

func (r mappedResults) setKey(index int, key string, value string) mappedResults {
func (r mappedResults) setMultiValue(index int, key string, value []string) mappedResults {
if index >= len(r) {
r = append(r, make(mappedResult))
}
Expand All @@ -885,6 +939,10 @@ func (r mappedResults) setKey(index int, key string, value string) mappedResults
return r
}

// urlsIsMulti reports whether key names the URLs attribute. The comparison
// is exact and case-sensitive.
func urlsIsMulti(key string) bool {
	const multiValueKey = "URLs"
	return multiValueKey == key
}

func (s mappedScraper) scrapePerformer(ctx context.Context, q mappedQuery) (*models.ScrapedPerformer, error) {
var ret models.ScrapedPerformer

Expand All @@ -895,12 +953,12 @@ func (s mappedScraper) scrapePerformer(ctx context.Context, q mappedQuery) (*mod

performerTagsMap := performerMap.Tags

results := performerMap.process(ctx, q, s.Common)
results := performerMap.process(ctx, q, s.Common, urlsIsMulti)

// now apply the tags
if performerTagsMap != nil {
logger.Debug(`Processing performer tags:`)
tagResults := performerTagsMap.process(ctx, q, s.Common)
tagResults := performerTagsMap.process(ctx, q, s.Common, nil)

for _, p := range tagResults {
tag := &models.ScrapedTag{}
Expand Down Expand Up @@ -928,7 +986,8 @@ func (s mappedScraper) scrapePerformers(ctx context.Context, q mappedQuery) ([]*
return nil, nil
}

results := performerMap.process(ctx, q, s.Common)
// isMulti is nil because multi-values are always stored on result 0, which would behave incorrectly when scraping multiple performers
results := performerMap.process(ctx, q, s.Common, nil)
for _, r := range results {
var p models.ScrapedPerformer
r.apply(&p)
Expand Down Expand Up @@ -957,7 +1016,7 @@ func (s mappedScraper) processSceneRelationships(ctx context.Context, q mappedQu

if sceneStudioMap != nil {
logger.Debug(`Processing scene studio:`)
studioResults := sceneStudioMap.process(ctx, q, s.Common)
studioResults := sceneStudioMap.process(ctx, q, s.Common, nil)

if len(studioResults) > 0 && resultIndex < len(studioResults) {
studio := &models.ScrapedStudio{}
Expand All @@ -981,14 +1040,15 @@ func (s mappedScraper) processPerformers(ctx context.Context, performersMap mapp
// now apply the performers and tags
if performersMap.mappedConfig != nil {
logger.Debug(`Processing performers:`)
performerResults := performersMap.process(ctx, q, s.Common)
// isMulti is nil because multi-values are always stored on result 0, which would behave incorrectly when scraping multiple performers
performerResults := performersMap.process(ctx, q, s.Common, nil)

scenePerformerTagsMap := performersMap.Tags

// process performer tags once
var performerTagResults mappedResults
if scenePerformerTagsMap != nil {
performerTagResults = scenePerformerTagsMap.process(ctx, q, s.Common)
performerTagResults = scenePerformerTagsMap.process(ctx, q, s.Common, nil)
}

for _, p := range performerResults {
Expand All @@ -1011,7 +1071,7 @@ func (s mappedScraper) processPerformers(ctx context.Context, performersMap mapp
func processRelationships[T any](ctx context.Context, s mappedScraper, relationshipMap mappedConfig, q mappedQuery) []*T {
var ret []*T

results := relationshipMap.process(ctx, q, s.Common)
results := relationshipMap.process(ctx, q, s.Common, nil)

for _, p := range results {
var value T
Expand All @@ -1032,7 +1092,8 @@ func (s mappedScraper) scrapeScenes(ctx context.Context, q mappedQuery) ([]*Scra
}

logger.Debug(`Processing scenes:`)
results := sceneMap.process(ctx, q, s.Common)
// isMulti is nil (urlsIsMulti is not used) because it would behave incorrectly when scraping multiple scenes
results := sceneMap.process(ctx, q, s.Common, nil)
for i, r := range results {
logger.Debug(`Processing scene:`)

Expand All @@ -1054,7 +1115,7 @@ func (s mappedScraper) scrapeScene(ctx context.Context, q mappedQuery) (*Scraped
sceneMap := sceneScraperConfig.mappedConfig

logger.Debug(`Processing scene:`)
results := sceneMap.process(ctx, q, s.Common)
results := sceneMap.process(ctx, q, s.Common, urlsIsMulti)

var ret ScrapedScene
if len(results) > 0 {
Expand Down Expand Up @@ -1087,7 +1148,7 @@ func (s mappedScraper) scrapeImage(ctx context.Context, q mappedQuery) (*Scraped
imageStudioMap := imageScraperConfig.Studio

logger.Debug(`Processing image:`)
results := imageMap.process(ctx, q, s.Common)
results := imageMap.process(ctx, q, s.Common, urlsIsMulti)

// now apply the performers and tags
if imagePerformersMap != nil {
Expand All @@ -1102,7 +1163,7 @@ func (s mappedScraper) scrapeImage(ctx context.Context, q mappedQuery) (*Scraped

if imageStudioMap != nil {
logger.Debug(`Processing image studio:`)
studioResults := imageStudioMap.process(ctx, q, s.Common)
studioResults := imageStudioMap.process(ctx, q, s.Common, nil)

if len(studioResults) > 0 {
studio := &models.ScrapedStudio{}
Expand Down Expand Up @@ -1138,12 +1199,12 @@ func (s mappedScraper) scrapeGallery(ctx context.Context, q mappedQuery) (*Scrap
galleryStudioMap := galleryScraperConfig.Studio

logger.Debug(`Processing gallery:`)
results := galleryMap.process(ctx, q, s.Common)
results := galleryMap.process(ctx, q, s.Common, urlsIsMulti)

// now apply the performers and tags
if galleryPerformersMap != nil {
logger.Debug(`Processing gallery performers:`)
performerResults := galleryPerformersMap.process(ctx, q, s.Common)
performerResults := galleryPerformersMap.process(ctx, q, s.Common, urlsIsMulti)

for _, p := range performerResults {
performer := &models.ScrapedPerformer{}
Expand All @@ -1154,7 +1215,7 @@ func (s mappedScraper) scrapeGallery(ctx context.Context, q mappedQuery) (*Scrap

if galleryTagsMap != nil {
logger.Debug(`Processing gallery tags:`)
tagResults := galleryTagsMap.process(ctx, q, s.Common)
tagResults := galleryTagsMap.process(ctx, q, s.Common, nil)

for _, p := range tagResults {
tag := &models.ScrapedTag{}
Expand All @@ -1165,7 +1226,7 @@ func (s mappedScraper) scrapeGallery(ctx context.Context, q mappedQuery) (*Scrap

if galleryStudioMap != nil {
logger.Debug(`Processing gallery studio:`)
studioResults := galleryStudioMap.process(ctx, q, s.Common)
studioResults := galleryStudioMap.process(ctx, q, s.Common, nil)

if len(studioResults) > 0 {
studio := &models.ScrapedStudio{}
Expand Down Expand Up @@ -1199,11 +1260,11 @@ func (s mappedScraper) scrapeGroup(ctx context.Context, q mappedQuery) (*models.
movieStudioMap := movieScraperConfig.Studio
movieTagsMap := movieScraperConfig.Tags

results := movieMap.process(ctx, q, s.Common)
results := movieMap.process(ctx, q, s.Common, urlsIsMulti)

if movieStudioMap != nil {
logger.Debug(`Processing movie studio:`)
studioResults := movieStudioMap.process(ctx, q, s.Common)
studioResults := movieStudioMap.process(ctx, q, s.Common, nil)

if len(studioResults) > 0 {
studio := &models.ScrapedStudio{}
Expand All @@ -1215,7 +1276,7 @@ func (s mappedScraper) scrapeGroup(ctx context.Context, q mappedQuery) (*models.
// now apply the tags
if movieTagsMap != nil {
logger.Debug(`Processing movie tags:`)
tagResults := movieTagsMap.process(ctx, q, s.Common)
tagResults := movieTagsMap.process(ctx, q, s.Common, nil)

for _, p := range tagResults {
tag := &models.ScrapedTag{}
Expand Down
23 changes: 19 additions & 4 deletions pkg/scraper/xpath_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const htmlDoc1 = `
</td>
<td class="paramvalue">
<a href="/html/m_links/Mia_Malkova/">Mia Malkova</a>&nbsp;
<a href="/html/m_links/Mia_Malkova/second_url">Mia Malkova</a>&nbsp;
</td>
</tr>
<tr>
Expand Down Expand Up @@ -206,6 +207,8 @@ func makeXPathConfig() mappedPerformerScraperConfig {
}

config.mappedConfig["Name"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a`)
config.mappedConfig["URL"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a/@href`)
config.mappedConfig["URLs"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a/@href`)
config.mappedConfig["Ethnicity"] = makeSimpleAttrConfig(makeCommonXPath("Ethnicity:"))
config.mappedConfig["Aliases"] = makeSimpleAttrConfig(makeCommonXPath("Aliases:"))
config.mappedConfig["EyeColor"] = makeSimpleAttrConfig(makeCommonXPath("Eye Color:"))
Expand Down Expand Up @@ -321,6 +324,8 @@ func TestScrapePerformerXPath(t *testing.T) {
}

const performerName = "Mia Malkova"
const url = "/html/m_links/Mia_Malkova/"
const secondURL = "/html/m_links/Mia_Malkova/second_url"
const ethnicity = "Caucasian"
const country = "United States"
const birthdate = "1992-07-01"
Expand All @@ -338,6 +343,16 @@ func TestScrapePerformerXPath(t *testing.T) {
const weight = "57" // 126 lb

verifyField(t, performerName, performer.Name, "Name")
verifyField(t, url, performer.URL, "URL")

// #5294 - test multiple URLs
if len(performer.URLs) != 2 {
t.Errorf("Expected 2 URLs, got %d", len(performer.URLs))
} else {
verifyField(t, url, &performer.URLs[0], "URLs[0]")
verifyField(t, secondURL, &performer.URLs[1], "URLs[1]")
}

verifyField(t, gender, performer.Gender, "Gender")
verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity")
verifyField(t, country, performer.Country, "Country")
Expand Down Expand Up @@ -569,7 +584,7 @@ func makeSceneXPathConfig() mappedScraper {

performerConfig := make(mappedConfig)
performerConfig["Name"] = makeSimpleAttrConfig(`$performerElem/@data-mxptext`)
performerConfig["URL"] = makeSimpleAttrConfig(`$performerElem/@href`)
performerConfig["URLs"] = makeSimpleAttrConfig(`$performerElem/@href`)
config.Performers.mappedConfig = performerConfig

studioConfig := make(mappedConfig)
Expand Down Expand Up @@ -653,8 +668,8 @@ func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []strin
}
if i < len(actualPerformers) {
actualName = *actualPerformers[i].Name
if actualPerformers[i].URL != nil {
actualURL = *actualPerformers[i].URL
if len(actualPerformers[i].URLs) == 1 {
actualURL = actualPerformers[i].URLs[0]
}
}

Expand Down Expand Up @@ -805,7 +820,7 @@ func TestLoadInvalidXPath(t *testing.T) {
doc: doc,
}

config.process(context.Background(), q, nil)
config.process(context.Background(), q, nil, nil)
}

type mockGlobalConfig struct{}
Expand Down

0 comments on commit 1e05766

Please sign in to comment.