scraper: Rebuild of Reality Lovers (#1864)
The API appears to have been abandoned and now returns 403 errors. The new scraper relies on HTML for its data, and everything works. Title and cover are both grabbed from the index page, which seems to be the most accurate and useful data source for them, and are handed to the scene scraper via the request context (a sketch of that pattern follows the change summary below). On a single-scene scrape we fall back to the gallery and the URL for these fields.
pops64 authored Jan 17, 2025
1 parent 4fe225b commit cf51ef2
Showing 1 changed file with 104 additions and 102 deletions.
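Below is a minimal sketch of the context-passing pattern the rebuilt scraper relies on: the index-page collector stashes each scene's title and cover URL in a colly context, and the scene-page handler reads them back from the request. Only the gocolly API is assumed here; the URL and CSS selectors are illustrative placeholders, not the site's real markup.

package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	siteCollector := colly.NewCollector()
	sceneCollector := colly.NewCollector()

	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
		// Values stored on the index page travel with the request context.
		title := e.Request.Ctx.GetAny("title").(string)
		cover := e.Request.Ctx.GetAny("coverURL").(string)
		fmt.Println(title, cover)
	})

	// div.card, p.title and the child img/a attributes are hypothetical selectors.
	siteCollector.OnHTML(`div.card`, func(e *colly.HTMLElement) {
		ctx := colly.NewContext()
		ctx.Put("title", e.ChildText("p.title"))
		ctx.Put("coverURL", e.ChildAttr("img", "src"))
		sceneCollector.Request("GET", e.Request.AbsoluteURL(e.ChildAttr("a", "href")), nil, ctx, nil)
	})

	siteCollector.Visit("https://example.com/videos/page1")
}

Carrying the values in the context avoids re-fetching the index page from the scene handler and keeps the two collectors decoupled.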
206 changes: 104 additions & 102 deletions pkg/scrape/realitylovers.go
@@ -1,146 +1,148 @@
 package scrape
 
 import (
-	"fmt"
 	"regexp"
 	"strings"
-	"time"
 
-	"github.com/go-resty/resty/v2"
 	"github.com/gocolly/colly/v2"
 	"github.com/mozillazg/go-slugify"
+	"github.com/nleeper/goment"
 	"github.com/thoas/go-funk"
-	"github.com/tidwall/gjson"
 	"github.com/xbapps/xbvr/pkg/models"
 )
 
 func RealityLoversSite(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, singleSceneURL string, scraperID string, siteID string, domain string, singeScrapeAdditionalInfo string, limitScraping bool) error {
 	defer wg.Done()
 	logScrapeStart(scraperID, siteID)
 
-	sceneCollector := createCollector("realitylovers.com", "engine.realitylovers.com", "tsvirtuallovers.com", "engine.tsvirtuallovers.com")
+	sceneCollector := createCollector(domain)
+	siteCollector := createCollector(domain)
 
-	sceneCollector.OnResponse(func(r *colly.Response) {
-		if r.StatusCode != 200 {
-			return
-		}
-		json := gjson.ParseBytes(r.Body)
+	// These cookies are needed for age verification.
+	siteCollector.OnRequest(func(r *colly.Request) {
+		r.Headers.Set("Cookie", "agreedToDisclaimer=true")
+	})
 
+	sceneCollector.OnRequest(func(r *colly.Request) {
+		r.Headers.Set("Cookie", "agreedToDisclaimer=true")
+	})
+
+	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
 		sc := models.ScrapedScene{}
 		sc.ScraperID = scraperID
 		sc.SceneType = "VR"
 		sc.Studio = "RealityLovers"
 		sc.Site = siteID
-		sc.HomepageURL = r.Request.Ctx.Get("sceneURL")
-
-		// Scene ID
-		sc.SiteID = json.Get("contentId").String()
-		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
+		sc.SiteID = ""
+		sc.HomepageURL, _ = strings.CutSuffix(e.Request.URL.String(), "/")
 
-		sc.Title = json.Get("title").String()
-		sc.Synopsis = json.Get("description").String()
+		// Cover URL
+		coverURL := e.Request.Ctx.GetAny("coverURL").(string)
+		sc.Covers = append(sc.Covers, coverURL)
 
-		covers := json.Get("mainImages.0.imgSrcSet").String()
-		sc.Covers = append(sc.Covers, strings.Fields(covers)[0])
+		// Gallery
+		e.ForEach(`div.owl-carousel div.item`, func(id int, e *colly.HTMLElement) {
+			sc.Gallery = append(sc.Gallery, e.ChildAttr("img", "src"))
+		})
 
-		sc.Released = json.Get("releaseDate").String()
+		// In case we scrape a single scene, use one of the gallery images for the cover
+		if singleSceneURL != "" {
+			sc.Covers = append(sc.Covers, sc.Gallery[0])
+		}
 
 		// Cast
 		sc.ActorDetails = make(map[string]models.ActorDetails)
-		json.Get("starring").ForEach(func(_, star gjson.Result) bool {
-			name := star.Get("name").String()
-			sc.Cast = append(sc.Cast, name)
-			sc.ActorDetails[name] = models.ActorDetails{Source: sc.ScraperID + " scrape", ProfileUrl: "https://" + domain + "/" + star.Get("uri").String()}
-			return true
-		})
-
-		// Gallery
-		json.Get("screenshots").ForEach(func(_, screenshot gjson.Result) bool {
-			imgset := screenshot.Get("galleryImgSrcSet").String()
-			images := strings.Split(imgset, ",")
-			selectedImage := ""
-			for _, image := range images {
-				parts := strings.Fields(image)
-				if selectedImage == "" {
-					selectedImage = parts[0]
-				}
-				if parts[1] == "1920w" {
-					selectedImage = parts[0]
-				}
-			}
-			sc.Gallery = append(sc.Gallery, selectedImage)
-			return true
-		})
-
-		// Tags
-		json.Get("categories").ForEach(func(_, category gjson.Result) bool {
-			sc.Tags = append(sc.Tags, category.Get("name").String())
-			return true
-		})
-
-		sc.TrailerType = "url"
-		sc.TrailerSrc = json.Get("trailerUrl").String()
+		e.ForEach(`table.video-description-list tbody`, func(id int, e *colly.HTMLElement) {
+			// Cast
+			e.ForEach(`tr:nth-child(1) a`, func(id int, e *colly.HTMLElement) {
+				if strings.TrimSpace(e.Text) != "" {
+					sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
+					sc.ActorDetails[strings.TrimSpace(e.Text)] = models.ActorDetails{Source: sc.ScraperID + " scrape", ProfileUrl: e.Request.AbsoluteURL(e.Attr("href"))}
				}
+			})
+
+			// Tags
+			e.ForEach(`tr:nth-child(2) a`, func(id int, e *colly.HTMLElement) {
+				tag := strings.TrimSpace(e.Text)
+
+				// Standardize the resolution tags
+				tag, _ = strings.CutSuffix(strings.ToLower(tag), " vr porn")
+				tag, _ = strings.CutSuffix(tag, " ts")
+				sc.Tags = append(sc.Tags, tag)
+			})
+
+			// Date
+			tmpDate, _ := goment.New(strings.TrimSpace(e.ChildText(`tr:nth-child(3) td:last-child`)), "MMMM DD, YYYY")
+			sc.Released = tmpDate.Format("YYYY-MM-DD")
+		})
+
+		// Synopsis
+		sc.Synopsis = strings.TrimSpace(e.ChildText("div.accordion-body"))
+
+		tmp := strings.Split(sc.HomepageURL, "/")
+
+		// Title
+		sc.Title = e.Request.Ctx.GetAny("title").(string)
+
+		// Fall back in case of single-scene scraping
+		if sc.Title == "" {
+			sc.Title = strings.ReplaceAll(tmp[len(tmp)-1], "-", " ")
+		}
+
+		// Scene ID
+		sc.SiteID = tmp[len(tmp)-2]
 
-		out <- sc
+		if sc.SiteID != "" {
+			sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
+
+			// save only if we got a SceneID
+			out <- sc
+		}
 	})
 
-	// Request scenes via REST API
-	if singleSceneURL == "" {
-		page := 0
-		for {
-			url := fmt.Sprintf("https://engine.%s/content/videos?max=12&page=%v&pornstar=&category=&perspective=&sort=NEWEST", domain, page)
-			log.Infoln("visiting", url)
-			r, err := resty.New().R().
-				SetHeader("User-Agent", UserAgent).
-				Get(url)
-
-			if err != nil {
-				log.Errorf("Error fetching BaberoticaVR feed: %s", err)
-				logScrapeFinished(scraperID, siteID)
-				return nil
-			}
-
-			scenecnt := 0
-			if err == nil || r.StatusCode() == 200 {
-				result := gjson.Get(r.String(), "contents")
-				result.ForEach(func(key, value gjson.Result) bool {
-					scenecnt++
-					sceneURL := "https://" + domain + "/" + value.Get("videoUri").String()
-					sceneID := value.Get("id").String()
-					if !funk.ContainsString(knownScenes, sceneURL) {
-						ctx := colly.NewContext()
-						ctx.Put("sceneURL", sceneURL)
-						sceneCollector.Request("GET", "https://engine."+domain+"/content/videoDetail?contentId="+sceneID, nil, ctx, nil)
-					}
-					return true
-				})
-			}
-			if err != nil {
-				log.Errorf("Error visiting %s %s", url, err)
-			}
-			if r.StatusCode() != 200 {
-				log.Errorf("Return code visiting %s %v", url, r.StatusCode())
-			}
-
-			if scenecnt < 12 {
-				break
-			}
-			page++
-			if limitScraping {
-				break
-			}
-			// have seen instances of status 404, so make sure we don't spam with calls
-			time.Sleep(time.Second)
-		}
-	} else {
-		re := regexp.MustCompile(`.com\/vd\/(\d+)\/`)
-		match := re.FindStringSubmatch(singleSceneURL)
-		if len(match) >= 2 {
-			ctx := colly.NewContext()
-			ctx.Put("sceneURL", singleSceneURL)
-			sceneCollector.Request("GET", "https://engine."+domain+"/content/videoDetail?contentId="+match[1], nil, ctx, nil)
-		}
-	}
+	siteCollector.OnHTML(`a.page-link[aria-label="Next"]:not(.disabled)`, func(e *colly.HTMLElement) {
+		if !limitScraping {
+			pageURL := e.Request.AbsoluteURL(e.Attr("href"))
+			siteCollector.Visit(pageURL)
+		}
+	})
+
+	siteCollector.OnHTML(`div#gridView`, func(e *colly.HTMLElement) {
+		e.ForEach("div.video-grid-view", func(id int, e *colly.HTMLElement) {
+			re := regexp.MustCompile(`.+[jJ][pP][gG]`)
+			tmp := strings.Split(e.ChildAttr("img", "srcset"), ",")
+			r := re.FindStringSubmatch(tmp[len(tmp)-1])
+			coverURL := ""
+
+			if len(r) > 0 {
+				coverURL = strings.TrimSpace(r[0])
+			} else {
+				log.Warnln("Couldn't Find Cover Img in srcset:", tmp)
+			}
+
+			title := e.ChildText("p.card-title")
+
+			sceneURL := e.Request.AbsoluteURL(e.ChildAttr("a", "href"))
+
+			// If the scene exists in the database, there's no need to scrape
+			if !funk.ContainsString(knownScenes, sceneURL) {
+				ctx := colly.NewContext()
+				ctx.Put("coverURL", coverURL)
+				ctx.Put("title", title)
+				sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
+			}
+		})
+	})
+
+	if singleSceneURL != "" {
+		ctx := colly.NewContext()
+		ctx.Put("coverURL", "")
+		ctx.Put("title", "")
+		sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)
+	} else {
+		siteCollector.Visit("https://" + domain + "/videos/page1")
+	}
 
 	if updateSite {
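One note on the single-scene path the commit message describes: with no index page to supply the title, the handler falls back to parsing the scene URL itself. A minimal sketch of that fallback, assuming scene URLs keep the /vd/<id>/<slug> shape implied by the old scraper's regex:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// The /vd/<id>/<slug> URL shape is an assumption carried over from the
	// old scraper's regex, not confirmed site behavior.
	url := strings.TrimSuffix("https://realitylovers.com/vd/12345/example-scene-title/", "/")
	parts := strings.Split(url, "/")
	title := strings.ReplaceAll(parts[len(parts)-1], "-", " ") // "example scene title"
	siteID := parts[len(parts)-2]                              // "12345"
	fmt.Println(title, siteID)
}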
