scraper: Rebuild of Reality Lovers (#1864)
The API appears to have been abandoned and now returns 403 errors. The new scraper relies on HTML for its data, and everything works. Title and cover are both grabbed from the index page, which seems to be the most accurate and useful data source for them, and are handed to the scene scraper via the request context (a sketch of that pattern follows the change summary below). On a single-scene scrape we fall back to the gallery and the URL for these fields.
pops64 authored Jan 17, 2025
1 parent 4fe225b commit cf51ef2
Showing 1 changed file with 104 additions and 102 deletions.
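Below is a minimal sketch of the context-passing pattern the rebuilt scraper relies on: the index-page collector stashes each scene's title and cover URL in a colly context, and the scene-page handler reads them back from the request. Only the gocolly API is assumed here; the URL and CSS selectors are illustrative placeholders, not the site's real markup.

package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	siteCollector := colly.NewCollector()
	sceneCollector := colly.NewCollector()

	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
		// Values stored on the index page travel with the request context.
		title := e.Request.Ctx.GetAny("title").(string)
		cover := e.Request.Ctx.GetAny("coverURL").(string)
		fmt.Println(title, cover)
	})

	// div.card, p.title and the child img/a attributes are hypothetical selectors.
	siteCollector.OnHTML(`div.card`, func(e *colly.HTMLElement) {
		ctx := colly.NewContext()
		ctx.Put("title", e.ChildText("p.title"))
		ctx.Put("coverURL", e.ChildAttr("img", "src"))
		sceneCollector.Request("GET", e.Request.AbsoluteURL(e.ChildAttr("a", "href")), nil, ctx, nil)
	})

	siteCollector.Visit("https://example.com/videos/page1")
}

Carrying the values in the context avoids re-fetching the index page from the scene handler and keeps the two collectors decoupled.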
206 changes: 104 additions & 102 deletions pkg/scrape/realitylovers.go
@@ -1,146 +1,148 @@
 package scrape
 
 import (
-	"fmt"
 	"regexp"
 	"strings"
-	"time"
 
-	"github.com/go-resty/resty/v2"
 	"github.com/gocolly/colly/v2"
 	"github.com/mozillazg/go-slugify"
+	"github.com/nleeper/goment"
 	"github.com/thoas/go-funk"
-	"github.com/tidwall/gjson"
 	"github.com/xbapps/xbvr/pkg/models"
 )
 
 func RealityLoversSite(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, singleSceneURL string, scraperID string, siteID string, domain string, singeScrapeAdditionalInfo string, limitScraping bool) error {
 	defer wg.Done()
 	logScrapeStart(scraperID, siteID)
 
-	sceneCollector := createCollector("realitylovers.com", "engine.realitylovers.com", "tsvirtuallovers.com", "engine.tsvirtuallovers.com")
+	sceneCollector := createCollector(domain)
+	siteCollector := createCollector(domain)
 
-	sceneCollector.OnResponse(func(r *colly.Response) {
-		if r.StatusCode != 200 {
-			return
-		}
-		json := gjson.ParseBytes(r.Body)
+	// These cookies are needed for age verification.
+	siteCollector.OnRequest(func(r *colly.Request) {
+		r.Headers.Set("Cookie", "agreedToDisclaimer=true")
+	})
 
+	sceneCollector.OnRequest(func(r *colly.Request) {
+		r.Headers.Set("Cookie", "agreedToDisclaimer=true")
+	})
+
+	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
 		sc := models.ScrapedScene{}
 		sc.ScraperID = scraperID
 		sc.SceneType = "VR"
 		sc.Studio = "RealityLovers"
 		sc.Site = siteID
-		sc.HomepageURL = r.Request.Ctx.Get("sceneURL")
-
-		// Scene ID
-		sc.SiteID = json.Get("contentId").String()
-		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
+		sc.SiteID = ""
+		sc.HomepageURL, _ = strings.CutSuffix(e.Request.URL.String(), "/")
 
-		sc.Title = json.Get("title").String()
-		sc.Synopsis = json.Get("description").String()
+		// Cover URL
+		coverURL := e.Request.Ctx.GetAny("coverURL").(string)
+		sc.Covers = append(sc.Covers, coverURL)
 
-		covers := json.Get("mainImages.0.imgSrcSet").String()
-		sc.Covers = append(sc.Covers, strings.Fields(covers)[0])
+		// Gallery
+		e.ForEach(`div.owl-carousel div.item`, func(id int, e *colly.HTMLElement) {
+			sc.Gallery = append(sc.Gallery, e.ChildAttr("img", "src"))
+		})
 
-		sc.Released = json.Get("releaseDate").String()
+		// In case we scrape a single scene, use one of the gallery images for the cover
+		if singleSceneURL != "" {
+			sc.Covers = append(sc.Covers, sc.Gallery[0])
+		}
 
 		// Cast
 		sc.ActorDetails = make(map[string]models.ActorDetails)
-		json.Get("starring").ForEach(func(_, star gjson.Result) bool {
-			name := star.Get("name").String()
-			sc.Cast = append(sc.Cast, name)
-			sc.ActorDetails[name] = models.ActorDetails{Source: sc.ScraperID + " scrape", ProfileUrl: "https://" + domain + "/" + star.Get("uri").String()}
-			return true
-		})
-
-		// Gallery
-		json.Get("screenshots").ForEach(func(_, screenshot gjson.Result) bool {
-			imgset := screenshot.Get("galleryImgSrcSet").String()
-			images := strings.Split(imgset, ",")
-			selectedImage := ""
-			for _, image := range images {
-				parts := strings.Fields(image)
-				if selectedImage == "" {
-					selectedImage = parts[0]
-				}
-				if parts[1] == "1920w" {
-					selectedImage = parts[0]
-				}
-			}
-			sc.Gallery = append(sc.Gallery, selectedImage)
-			return true
-		})
-
-		// Tags
-		json.Get("categories").ForEach(func(_, category gjson.Result) bool {
-			sc.Tags = append(sc.Tags, category.Get("name").String())
-			return true
-		})
-
-		sc.TrailerType = "url"
-		sc.TrailerSrc = json.Get("trailerUrl").String()
+		e.ForEach(`table.video-description-list tbody`, func(id int, e *colly.HTMLElement) {
+			// Cast
+			e.ForEach(`tr:nth-child(1) a`, func(id int, e *colly.HTMLElement) {
+				if strings.TrimSpace(e.Text) != "" {
+					sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
+					sc.ActorDetails[strings.TrimSpace(e.Text)] = models.ActorDetails{Source: sc.ScraperID + " scrape", ProfileUrl: e.Request.AbsoluteURL(e.Attr("href"))}
				}
+			})
+
+			// Tags
+			e.ForEach(`tr:nth-child(2) a`, func(id int, e *colly.HTMLElement) {
+				tag := strings.TrimSpace(e.Text)
+
+				// Standardize the resolution tags
+				tag, _ = strings.CutSuffix(strings.ToLower(tag), " vr porn")
+				tag, _ = strings.CutSuffix(tag, " ts")
+				sc.Tags = append(sc.Tags, tag)
+			})
+
+			// Date
+			tmpDate, _ := goment.New(strings.TrimSpace(e.ChildText(`tr:nth-child(3) td:last-child`)), "MMMM DD, YYYY")
+			sc.Released = tmpDate.Format("YYYY-MM-DD")
+		})
+
+		// Synopsis
+		sc.Synopsis = strings.TrimSpace(e.ChildText("div.accordion-body"))
+
+		tmp := strings.Split(sc.HomepageURL, "/")
+
+		// Title
+		sc.Title = e.Request.Ctx.GetAny("title").(string)
+
+		// Fall back in case of single-scene scraping
+		if sc.Title == "" {
+			sc.Title = strings.ReplaceAll(tmp[len(tmp)-1], "-", " ")
+		}
+
+		// Scene ID
+		sc.SiteID = tmp[len(tmp)-2]
 
-		out <- sc
+		if sc.SiteID != "" {
+			sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
+
+			// save only if we got a SceneID
+			out <- sc
+		}
 	})
 
-	// Request scenes via REST API
-	if singleSceneURL == "" {
-		page := 0
-		for {
-			url := fmt.Sprintf("https://engine.%s/content/videos?max=12&page=%v&pornstar=&category=&perspective=&sort=NEWEST", domain, page)
-			log.Infoln("visiting", url)
-			r, err := resty.New().R().
-				SetHeader("User-Agent", UserAgent).
-				Get(url)
-
-			if err != nil {
-				log.Errorf("Error fetching BaberoticaVR feed: %s", err)
-				logScrapeFinished(scraperID, siteID)
-				return nil
-			}
-
-			scenecnt := 0
-			if err == nil || r.StatusCode() == 200 {
-				result := gjson.Get(r.String(), "contents")
-				result.ForEach(func(key, value gjson.Result) bool {
-					scenecnt++
-					sceneURL := "https://" + domain + "/" + value.Get("videoUri").String()
-					sceneID := value.Get("id").String()
-					if !funk.ContainsString(knownScenes, sceneURL) {
-						ctx := colly.NewContext()
-						ctx.Put("sceneURL", sceneURL)
-						sceneCollector.Request("GET", "https://engine."+domain+"/content/videoDetail?contentId="+sceneID, nil, ctx, nil)
-					}
-					return true
-				})
-			}
-			if err != nil {
-				log.Errorf("Error visiting %s %s", url, err)
-			}
-			if r.StatusCode() != 200 {
-				log.Errorf("Return code visiting %s %v", url, r.StatusCode())
-			}
-
-			if scenecnt < 12 {
-				break
-			}
-			page++
-			if limitScraping {
-				break
-			}
-			// have seen instances of status 404, so make sure we don't spam with calls
-			time.Sleep(time.Second)
-		}
-	} else {
-		re := regexp.MustCompile(`.com\/vd\/(\d+)\/`)
-		match := re.FindStringSubmatch(singleSceneURL)
-		if len(match) >= 2 {
-			ctx := colly.NewContext()
-			ctx.Put("sceneURL", singleSceneURL)
-			sceneCollector.Request("GET", "https://engine."+domain+"/content/videoDetail?contentId="+match[1], nil, ctx, nil)
-		}
-	}
+	siteCollector.OnHTML(`a.page-link[aria-label="Next"]:not(.disabled)`, func(e *colly.HTMLElement) {
+		if !limitScraping {
+			pageURL := e.Request.AbsoluteURL(e.Attr("href"))
+			siteCollector.Visit(pageURL)
+		}
+	})
+
+	siteCollector.OnHTML(`div#gridView`, func(e *colly.HTMLElement) {
+		e.ForEach("div.video-grid-view", func(id int, e *colly.HTMLElement) {
+			re := regexp.MustCompile(`.+[jJ][pP][gG]`)
+			tmp := strings.Split(e.ChildAttr("img", "srcset"), ",")
+			r := re.FindStringSubmatch(tmp[len(tmp)-1])
+			coverURL := ""
+
+			if len(r) > 0 {
+				coverURL = strings.TrimSpace(r[0])
+			} else {
+				log.Warnln("Couldn't Find Cover Img in srcset:", tmp)
+			}
+
+			title := e.ChildText("p.card-title")
+
+			sceneURL := e.Request.AbsoluteURL(e.ChildAttr("a", "href"))
+
+			// If the scene exists in the database, there's no need to scrape
+			if !funk.ContainsString(knownScenes, sceneURL) {
+				ctx := colly.NewContext()
+				ctx.Put("coverURL", coverURL)
+				ctx.Put("title", title)
+				sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
+			}
+		})
+	})
+
+	if singleSceneURL != "" {
+		ctx := colly.NewContext()
+		ctx.Put("coverURL", "")
+		ctx.Put("title", "")
+		sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)
+	} else {
+		siteCollector.Visit("https://" + domain + "/videos/page1")
+	}
 
 	if updateSite {
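One note on the single-scene path the commit message describes: with no index page to supply the title, the handler falls back to parsing the scene URL itself. A minimal sketch of that fallback, assuming scene URLs keep the /vd/<id>/<slug> shape implied by the old scraper's regex:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// The /vd/<id>/<slug> URL shape is an assumption carried over from the
	// old scraper's regex, not confirmed site behavior.
	url := strings.TrimSuffix("https://realitylovers.com/vd/12345/example-scene-title/", "/")
	parts := strings.Split(url, "/")
	title := strings.ReplaceAll(parts[len(parts)-1], "-", " ") // "example scene title"
	siteID := parts[len(parts)-2]                              // "12345"
	fmt.Println(title, siteID)
}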
