scrapers: Added more JAV scrapers (javland, javlibrary, javbus) (xbapps#1100)

* javr: enable multiple scraper options, and add javlibrary and javbus * javr: add jav.land as well * Update javdatabase.go Add `javdatabase` as a tag * Update javbus.go Always add `javbus` as a tag * Update javland.go Always add 'jav.land' as a tag * Update javlibrary.go Always add javlibrary as a tag * Update javtags.go Changed both of the skip/re-map lists to tab separation. Moved "solo/solo work/solowork" to the "drop" list; I forgot what the R18 tag was, but unless I am mistaken, it was a tag they (and FANZA by extension) automatically add(ed) to titles that aren't part of an overarching "series". It's meaningless to us since we don't scrape R18/DMM's "series" listings, nor would we have a way to filter for them in XBVR. Changed "kiss kiss" to the tag to be retained, as that was the tag R18 used, and most users would already have plenty of it in their databases. Probably best to maintain continuity with the old R18 tags whenever possible if this is to be done. * Update javtags.go Similarly, `suntan` was the tag R18 used; I have 12 entries in my library that pre-date the manifests I started writing myself. * javr: more error-correcting code, and less code-reuse between scrapers Co-authored-by: vt-idiot <[email protected]>
balckpixie · Jan 17, 2023 · 2db2ecf · 2db2ecf
1 parent dacefb9
commit 2db2ecf
Show file tree

Hide file tree

Showing 8 changed files with 581 additions and 49 deletions.
diff --git a/pkg/api/tasks.go b/pkg/api/tasks.go
@@ -11,7 +11,8 @@ import (
 )
 
 type RequestScrapeJAVR struct {
-	Query string `json:"q"`
+	Scraper string `json:"s"` // JSON key "s"; presumably names the JAV scraper to run (forwarded to tasks.ScrapeJAVR) - TODO confirm
+	Query   string `json:"q"` // JSON key "q"; presumably the scene ID query, with no scrape started when empty - TODO confirm
 }
 
 type RequestScrapeTPDB struct {
@@ -165,7 +166,7 @@ func (i TaskResource) scrapeJAVR(req *restful.Request, resp *restful.Response) {
 	}
 
 	if r.Query != "" {
-		go tasks.ScrapeJAVR(r.Query)
+		go tasks.ScrapeJAVR(r.Query, r.Scraper)
 	}
 }
 

diff --git a/pkg/scrape/javbus.go b/pkg/scrape/javbus.go
@@ -0,0 +1,100 @@
+package scrape
+
+import (
+	"regexp"
+	"strings"
+
+	"github.com/gocolly/colly"
+	"github.com/xbapps/xbvr/pkg/models"
+)
+
+// ScrapeJavBus scrapes scene metadata from www.javbus.com and appends each
+// scene for which an ID could be extracted to out.
+//
+// queryString holds one or more comma-separated scene IDs (e.g. "VRKM-821").
+// Every ID is trimmed and upper-cased before it is requested; blank entries
+// are skipped.
+func ScrapeJavBus(out *[]models.ScrapedScene, queryString string) {
+	sceneCollector := createCollector("www.javbus.com")
+
+	// Compile the patterns once per call rather than once per matched element.
+	idRegex := regexp.MustCompile(`^([A-Za-z0-9]+)-([0-9]+)$`)
+	dateRegex := regexp.MustCompile(`\d\d\d\d-\d\d-\d\d`)
+
+	sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) {
+		sc := models.ScrapedScene{}
+		sc.SceneType = "VR"
+
+		// Always add 'javr' and 'javbus' as tags
+		sc.Tags = append(sc.Tags, `javr`, `javbus`)
+
+		html.ForEach(`div.row.movie div.info > p`, func(_ int, p *colly.HTMLElement) {
+			switch p.ChildText(`span.header`) {
+			case `Studio:`:
+				sc.Studio = p.ChildText(`a`)
+
+			case `ID:`:
+				// Title, SceneID and SiteID all use the 'VRKM-821' format;
+				// Site is the part before the dash.
+				p.ForEach("span", func(_ int, span *colly.HTMLElement) {
+					// Trim first: the pattern is anchored at both ends, so
+					// surrounding whitespace would otherwise defeat the match.
+					match := idRegex.FindStringSubmatch(strings.TrimSpace(span.Text))
+					if match == nil {
+						return
+					}
+					dvdID := match[1] + "-" + match[2]
+					sc.Title = dvdID
+					sc.SceneID = dvdID
+					sc.SiteID = dvdID
+					sc.Site = match[1]
+				})
+
+			case `Release Date:`:
+				// The paragraph text also contains the label, so pick out
+				// just the YYYY-MM-DD part.
+				if released := dateRegex.FindString(p.Text); released != "" {
+					sc.Released = released
+				}
+			}
+		})
+
+		// Tags
+		html.ForEach("div.row.movie span.genre > label > a", func(_ int, anchor *colly.HTMLElement) {
+			if !strings.Contains(anchor.Attr("href"), "javbus.com/en/genre/") {
+				return
+			}
+			// ProcessJavrTag returns "" for tags that should not be kept.
+			if tag := ProcessJavrTag(anchor.Text); tag != "" {
+				sc.Tags = append(sc.Tags, tag)
+			}
+		})
+
+		// Cast
+		html.ForEach("div.row.movie div.star-name > a", func(_ int, anchor *colly.HTMLElement) {
+			if !strings.Contains(anchor.Attr("href"), "javbus.com/en/star/") {
+				return
+			}
+			// Store trimmed names only, and never an empty one.
+			if name := strings.TrimSpace(anchor.Text); name != "" {
+				sc.Cast = append(sc.Cast, name)
+			}
+		})
+
+		// Screenshots
+		html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
+			linkHref := anchor.Attr(`href`)
+			if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) {
+				sc.Gallery = append(sc.Gallery, linkHref)
+			}
+		})
+
+		// Apply post-processing for error-correcting code
+		PostProcessJavScene(&sc, "")
+
+		// Pages without a recognisable scene ID are dropped.
+		if sc.SceneID != "" {
+			*out = append(*out, sc)
+		}
+	})
+
+	// Allow comma-separated scene IDs; tolerate spaces around the commas and
+	// skip blank entries so no request is made for an empty ID.
+	for _, id := range strings.Split(queryString, ",") {
+		id = strings.TrimSpace(id)
+		if id == "" {
+			continue
+		}
+		// Best effort: the Visit error is not inspected here.
+		sceneCollector.Visit("https://www.javbus.com/en/" + strings.ToUpper(id) + "/")
+	}
+
+	sceneCollector.Wait()
+}
diff --git a/pkg/scrape/javdatabase.go b/pkg/scrape/javdatabase.go
@@ -1,7 +1,6 @@
 package scrape
 
 import (
-	"regexp"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -10,7 +9,7 @@ import (
 	"github.com/xbapps/xbvr/pkg/models"
 )
 
-func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString string) {
+func ScrapeJavDB(out *[]models.ScrapedScene, queryString string) {
 	sceneCollector := createCollector("www.javdatabase.com")
 
 	sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) {
@@ -21,14 +20,8 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
 		// Always add 'javr' as a tag
 		sc.Tags = append(sc.Tags, `javr`)
 
-		// Skipping some very generic and useless tags
-		skiptags := map[string]bool{
-			"featured actress":       true,
-			"vr exclusive":           true,
-			"high-quality vr":        true,
-			"hi-def":                 true,
-			"exclusive distribution": true,
-		}
+		// Always add 'javdatabase' as a tag
+		sc.Tags = append(sc.Tags, `javdatabase`)
 
 		// Cast
 		html.ForEach("h2.subhead", func(id int, h2 *colly.HTMLElement) {
@@ -37,8 +30,9 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
 				parent := dom.Parent()
 				if parent != nil {
 					parent.Find("a").Each(func(i int, anchor *goquery.Selection) {
-						if anchor.Text() != "" {
-							sc.Cast = append(sc.Cast, anchor.Text())
+						href, exists := anchor.Attr("href")
+						if exists && strings.Contains(href, "javdatabase.com/idols/") && anchor.Text() != "" {
+							sc.Cast = append(sc.Cast, strings.TrimSpace(anchor.Text()))
 						}
 					})
 				}
@@ -82,9 +76,9 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
 					href := anchor.Attr("href")
 					if strings.Contains(href, "javdatabase.com/genres/") {
 						// Tags
-						tag := strings.ToLower(anchor.Text)
+						tag := ProcessJavrTag(anchor.Text)
 
-						if !skiptags[tag] {
+						if tag != "" {
 							sc.Tags = append(sc.Tags, tag)
 						}
 					}
@@ -96,8 +90,6 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
 
 			} else if label == `Content ID:` {
 				contentId = tr.ChildText(`td.tablevalue`)
-				sc.HomepageURL = `https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=` + contentId + `/`
-				sc.Covers = append(sc.Covers, `https://pics.dmm.co.jp/digital/video/`+contentId+`/`+contentId+`pl.jpg`)
 			}
 		})
 
@@ -113,22 +105,12 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
 			}
 		})
 
-		// Some specific postprocessing for error-correcting 3DSVR scenes
-		if len(contentId) > 0 && sc.Site == "DSVR" {
-			r := regexp.MustCompile("13dsvr0(\\d{4})")
-			match := r.FindStringSubmatch(contentId)
-			if match != nil && len(match) > 1 {
-				// Found a 3DSVR scene that is being wrongly categorized as DSVR
-				log.Println("Applying DSVR->3DSVR workaround")
-				sid := match[1]
-				sc.Site = "3DSVR"
-				sc.SceneID = "3DSVR-" + sid
-				sc.Title = sc.SceneID
-				sc.SiteID = sc.SceneID
-			}
-		}
+		// Apply post-processing for error-correcting code
+		PostProcessJavScene(&sc, contentId)
 
-		*out = append(*out, sc)
+		if sc.SceneID != "" {
+			*out = append(*out, sc)
+		}
 	})
 
 	// Allow comma-separated scene id's

diff --git a/pkg/scrape/javland.go b/pkg/scrape/javland.go
@@ -0,0 +1,117 @@
+package scrape
+
+import (
+	"strings"
+
+	"github.com/gocolly/colly"
+	"github.com/nleeper/goment"
+	"github.com/xbapps/xbvr/pkg/models"
+)
+
+// ScrapeJavLand scrapes scene metadata from jav.land and appends each scene
+// for which a DVD ID was found to out.
+//
+// queryString holds one or more comma-separated scene IDs (e.g. "vrkm-821").
+// Every ID is trimmed and lower-cased before it is used as the search key;
+// blank entries are skipped.
+func ScrapeJavLand(out *[]models.ScrapedScene, queryString string) {
+	sceneCollector := createCollector("jav.land")
+
+	sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) {
+		sc := models.ScrapedScene{}
+		sc.SceneType = "VR"
+		contentID := ""
+
+		// Always add 'javr' and 'jav.land' as tags
+		sc.Tags = append(sc.Tags, `javr`, `jav.land`)
+
+		html.ForEach(`table.videotextlist tr`, func(_ int, tr *colly.HTMLElement) {
+			// Only rows with exactly two cells are label/value pairs.
+			tds := tr.DOM.Children()
+			if tds.Length() != 2 {
+				return
+			}
+			// Trim both cells so stray whitespace in the markup neither
+			// defeats the label match nor leaks into the stored values.
+			label := strings.TrimSpace(tds.First().Text())
+			value := strings.TrimSpace(tds.Last().Text())
+
+			switch label {
+			case `Maker:`:
+				// Studio
+				sc.Studio = value
+
+			case `DVD ID:`:
+				// Title, SceneID and SiteID all use the 'VRKM-821' format
+				dvdID := strings.ToUpper(value)
+				sc.Title = dvdID
+				sc.SceneID = dvdID
+				sc.SiteID = dvdID
+
+				// Set 'Site' to the first part of the ID (e.g. `VRKM` for `vrkm-821`)
+				sc.Site = strings.SplitN(dvdID, `-`, 2)[0]
+
+			case `Release Date:`:
+				// Only record the date when it actually parses; previously the
+				// parse error was discarded and the result formatted regardless.
+				if tmpDate, err := goment.New(value, "YYYY-MM-DD"); err == nil {
+					sc.Released = tmpDate.Format("YYYY-MM-DD")
+				}
+
+			case `Genre(s):`:
+				// Tags
+				tr.ForEach("span.genre > a", func(_ int, anchor *colly.HTMLElement) {
+					if !strings.Contains(anchor.Attr("href"), "/genre/") {
+						return
+					}
+					// ProcessJavrTag returns "" for tags that should not be kept.
+					if tag := ProcessJavrTag(anchor.Text); tag != "" {
+						sc.Tags = append(sc.Tags, tag)
+					}
+				})
+
+			case `Cast:`:
+				// Cast
+				tr.ForEach("span.star > a", func(_ int, anchor *colly.HTMLElement) {
+					if !strings.Contains(anchor.Attr("href"), "/star/") {
+						return
+					}
+					// Store trimmed names only, and never an empty one.
+					if name := strings.TrimSpace(anchor.Text); name != "" {
+						sc.Cast = append(sc.Cast, name)
+					}
+				})
+
+			case `Content ID:`:
+				contentID = value
+			}
+		})
+
+		// Screenshots
+		html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
+			linkHref := anchor.Attr(`href`)
+			// Rewrite links on the pics.vpdmm.cc host to pics.dmm.co.jp before filtering.
+			if strings.HasPrefix(linkHref, "https://pics.vpdmm.cc/") && strings.HasSuffix(linkHref, `.jpg`) {
+				linkHref = strings.Replace(linkHref, "https://pics.vpdmm.cc/", "https://pics.dmm.co.jp/", 1)
+			}
+			if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) {
+				sc.Gallery = append(sc.Gallery, linkHref)
+			}
+		})
+
+		// Synopsis: the page title without the site suffix, trimmed so that no
+		// dangling whitespace is left behind where the suffix was removed.
+		title := html.DOM.Find("title")
+		if title != nil && title.Length() == 1 {
+			descr := strings.ReplaceAll(title.Text(), "- JAV.Land", "")
+			sc.Synopsis = strings.TrimSpace(descr)
+		}
+
+		// Apply post-processing for error-correcting code
+		PostProcessJavScene(&sc, contentID)
+
+		// Pages without a DVD ID (e.g. no search hit) are dropped.
+		if sc.SceneID != "" {
+			*out = append(*out, sc)
+		}
+	})
+
+	// Allow comma-separated scene IDs; tolerate spaces around the commas and
+	// skip blank entries so no search is issued for an empty key.
+	for _, id := range strings.Split(queryString, ",") {
+		id = strings.TrimSpace(id)
+		if id == "" {
+			continue
+		}
+		// Best effort: the Visit error is not inspected here.
+		sceneCollector.Visit("https://jav.land/en/id_search.php?keys=" + strings.ToLower(id))
+	}
+
+	sceneCollector.Wait()
+}