Skip to content

Commit

Permalink
scrapers: Added more JAV scrapers (javland, javlibrary, javbus) (xbap…
Browse files Browse the repository at this point in the history
…ps#1100)

* javr: enable multiple scraper options, and add javlibrary and javbus

* javr: add jav.land as well

* Update javdatabase.go

Add `javdatabase` as a tag

* Update javbus.go

Always add `javbus` as a tag

* Update javland.go

Always add 'jav.land' as a tag

* Update javlibrary.go

Always add javlibrary as a tag

* Update javtags.go

Changed both of the skip/re-map lists to tab separation.

Moved "solo/solo work/solowork" to the "drop" list, I forgot what the R18 tag was, but unless I am mistaken, it was a tag they (and FANZA by extension) automatically add(ed) to titles that aren't part of an overarching "series". It's meaningless to us since we don't scrape R18/DMM's "series" listings, nor would we have a way to filter for them in XBVR.

Changed "kiss kiss" to be the tag that is retained, as that was the tag R18 used and most users will already have plenty of scenes tagged with it in their databases. It is probably best to maintain continuity with the old R18 tags whenever possible if this is to be done.

* Update javtags.go

Similarly, `suntan` was the tag R18 used; I have 12 entries in my library that pre-date the manifests I started writing myself.

* javr: more error-correcting code, and less code-reuse between scrapers

Co-authored-by: vt-idiot <[email protected]>
  • Loading branch information
thebrnd and vt-idiot authored Jan 17, 2023
1 parent dacefb9 commit 2db2ecf
Show file tree
Hide file tree
Showing 8 changed files with 581 additions and 49 deletions.
5 changes: 3 additions & 2 deletions pkg/api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ import (
)

type RequestScrapeJAVR struct {
Query string `json:"q"`
Scraper string `json:"s"`
Query string `json:"q"`
}

type RequestScrapeTPDB struct {
Expand Down Expand Up @@ -165,7 +166,7 @@ func (i TaskResource) scrapeJAVR(req *restful.Request, resp *restful.Response) {
}

if r.Query != "" {
go tasks.ScrapeJAVR(r.Query)
go tasks.ScrapeJAVR(r.Query, r.Scraper)
}
}

Expand Down
100 changes: 100 additions & 0 deletions pkg/scrape/javbus.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package scrape

import (
"regexp"
"strings"

"github.com/gocolly/colly"
"github.com/xbapps/xbvr/pkg/models"
)

var (
	// javBusIDRegex matches a DVD ID such as "VRKM-821", capturing the
	// site prefix and the numeric part. Compiled once instead of per element.
	javBusIDRegex = regexp.MustCompile("^([A-Za-z0-9]+)-([0-9]+)$")

	// javBusDateRegex captures the first YYYY-MM-DD date found in a string.
	javBusDateRegex = regexp.MustCompile("(\\d\\d\\d\\d-\\d\\d-\\d\\d)")
)

// ScrapeJavBus looks up one or more scene IDs on www.javbus.com and appends
// a ScrapedScene to out for every page that yields a scene ID.
//
// queryString is a comma-separated list of IDs (e.g. "vrkm-821, 3dsvr-0001").
// Each ID is trimmed and upper-cased before being requested; empty entries
// are skipped so that stray commas do not trigger a request for "/en//".
func ScrapeJavBus(out *[]models.ScrapedScene, queryString string) {
	sceneCollector := createCollector("www.javbus.com")

	sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) {
		sc := models.ScrapedScene{}
		sc.SceneType = "VR"

		// Always add 'javr' and 'javbus' as tags
		sc.Tags = append(sc.Tags, `javr`, `javbus`)

		html.ForEach(`div.row.movie div.info > p`, func(id int, p *colly.HTMLElement) {
			switch p.ChildText(`span.header`) {
			case `Studio:`:
				// Studio
				sc.Studio = p.ChildText(`a`)

			case `ID:`:
				// Title, SceneID and SiteID all like 'VRKM-821' format
				p.ForEach("span", func(_ int, span *colly.HTMLElement) {
					// The pattern is anchored, so strip surrounding whitespace first.
					match := javBusIDRegex.FindStringSubmatch(strings.TrimSpace(span.Text))
					if match == nil {
						return
					}
					dvdID := match[1] + "-" + match[2]
					sc.Title = dvdID
					sc.SceneID = dvdID
					sc.SiteID = dvdID
					sc.Site = match[1]
				})

			case `Release Date:`:
				// Release date
				if match := javBusDateRegex.FindStringSubmatch(p.Text); match != nil {
					sc.Released = match[1]
				}
			}
		})

		// Tags
		html.ForEach("div.row.movie span.genre > label > a", func(id int, anchor *colly.HTMLElement) {
			if !strings.Contains(anchor.Attr("href"), "javbus.com/en/genre/") {
				return
			}
			if tag := ProcessJavrTag(anchor.Text); tag != "" {
				sc.Tags = append(sc.Tags, tag)
			}
		})

		// Cast
		html.ForEach("div.row.movie div.star-name > a", func(id int, anchor *colly.HTMLElement) {
			if !strings.Contains(anchor.Attr("href"), "javbus.com/en/star/") {
				return
			}
			// Trim and skip blank names, matching the javdatabase scraper.
			if name := strings.TrimSpace(anchor.Text); name != "" {
				sc.Cast = append(sc.Cast, name)
			}
		})

		// Screenshots
		html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
			linkHref := anchor.Attr(`href`)
			if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) {
				sc.Gallery = append(sc.Gallery, linkHref)
			}
		})

		// Apply post-processing for error-correcting code; no content ID
		// is collected from this site, so an empty one is passed.
		PostProcessJavScene(&sc, "")

		// Pages without a recognisable ID (e.g. not-found pages) are dropped.
		if sc.SceneID != "" {
			*out = append(*out, sc)
		}
	})

	// Allow comma-separated scene id's
	for _, v := range strings.Split(queryString, ",") {
		sceneID := strings.ToUpper(strings.TrimSpace(v))
		if sceneID == "" {
			continue
		}
		// Best effort: a failed visit simply contributes no scene.
		sceneCollector.Visit("https://www.javbus.com/en/" + sceneID + "/")
	}

	sceneCollector.Wait()
}
44 changes: 13 additions & 31 deletions pkg/scrape/javdatabase.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package scrape

import (
"regexp"
"strings"

"github.com/PuerkitoBio/goquery"
Expand All @@ -10,7 +9,7 @@ import (
"github.com/xbapps/xbvr/pkg/models"
)

func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString string) {
func ScrapeJavDB(out *[]models.ScrapedScene, queryString string) {
sceneCollector := createCollector("www.javdatabase.com")

sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) {
Expand All @@ -21,14 +20,8 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
// Always add 'javr' as a tag
sc.Tags = append(sc.Tags, `javr`)

// Skipping some very generic and useless tags
skiptags := map[string]bool{
"featured actress": true,
"vr exclusive": true,
"high-quality vr": true,
"hi-def": true,
"exclusive distribution": true,
}
// Always add 'javdatabase' as a tag
sc.Tags = append(sc.Tags, `javdatabase`)

// Cast
html.ForEach("h2.subhead", func(id int, h2 *colly.HTMLElement) {
Expand All @@ -37,8 +30,9 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
parent := dom.Parent()
if parent != nil {
parent.Find("a").Each(func(i int, anchor *goquery.Selection) {
if anchor.Text() != "" {
sc.Cast = append(sc.Cast, anchor.Text())
href, exists := anchor.Attr("href")
if exists && strings.Contains(href, "javdatabase.com/idols/") && anchor.Text() != "" {
sc.Cast = append(sc.Cast, strings.TrimSpace(anchor.Text()))
}
})
}
Expand Down Expand Up @@ -82,9 +76,9 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
href := anchor.Attr("href")
if strings.Contains(href, "javdatabase.com/genres/") {
// Tags
tag := strings.ToLower(anchor.Text)
tag := ProcessJavrTag(anchor.Text)

if !skiptags[tag] {
if tag != "" {
sc.Tags = append(sc.Tags, tag)
}
}
Expand All @@ -96,8 +90,6 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s

} else if label == `Content ID:` {
contentId = tr.ChildText(`td.tablevalue`)
sc.HomepageURL = `https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=` + contentId + `/`
sc.Covers = append(sc.Covers, `https://pics.dmm.co.jp/digital/video/`+contentId+`/`+contentId+`pl.jpg`)
}
})

Expand All @@ -113,22 +105,12 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s
}
})

// Some specific postprocessing for error-correcting 3DSVR scenes
if len(contentId) > 0 && sc.Site == "DSVR" {
r := regexp.MustCompile("13dsvr0(\\d{4})")
match := r.FindStringSubmatch(contentId)
if match != nil && len(match) > 1 {
// Found a 3DSVR scene that is being wrongly categorized as DSVR
log.Println("Applying DSVR->3DSVR workaround")
sid := match[1]
sc.Site = "3DSVR"
sc.SceneID = "3DSVR-" + sid
sc.Title = sc.SceneID
sc.SiteID = sc.SceneID
}
}
// Apply post-processing for error-correcting code
PostProcessJavScene(&sc, contentId)

*out = append(*out, sc)
if sc.SceneID != "" {
*out = append(*out, sc)
}
})

// Allow comma-separated scene id's
Expand Down
117 changes: 117 additions & 0 deletions pkg/scrape/javland.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
package scrape

import (
"strings"

"github.com/gocolly/colly"
"github.com/nleeper/goment"
"github.com/xbapps/xbvr/pkg/models"
)

// ScrapeJavLand looks up one or more scene IDs on jav.land and appends a
// ScrapedScene to out for every result page that yields a DVD ID.
//
// queryString is a comma-separated list of IDs (e.g. "vrkm-821, 3dsvr-0001").
// Each ID is trimmed and lower-cased before being searched; empty entries
// are skipped so that stray commas do not trigger an empty search.
func ScrapeJavLand(out *[]models.ScrapedScene, queryString string) {
	sceneCollector := createCollector("jav.land")

	sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) {
		sc := models.ScrapedScene{}
		sc.SceneType = "VR"
		contentId := ""

		// Always add 'javr' and 'jav.land' as tags
		sc.Tags = append(sc.Tags, `javr`, `jav.land`)

		html.ForEach(`table.videotextlist tr`, func(id int, tr *colly.HTMLElement) {
			tds := tr.DOM.Children()
			if tds.Length() != 2 {
				return
			}
			// Raw cell text may carry surrounding whitespace; trim it once
			// so label comparisons and stored values are clean.
			label := strings.TrimSpace(tds.First().Text())
			value := strings.TrimSpace(tds.Last().Text())

			switch label {
			case `Maker:`:
				// Studio
				sc.Studio = value

			case `DVD ID:`:
				// Title, SceneID and SiteID all like 'VRKM-821' format
				dvdID := strings.ToUpper(value)
				sc.Title = dvdID
				sc.SceneID = dvdID
				sc.SiteID = dvdID

				// Set 'Site' to first part of the ID (e.g. `VRKM` for `vrkm-821`).
				// Split with a non-empty separator always returns at least one element.
				sc.Site = strings.Split(dvdID, `-`)[0]

			case `Release Date:`:
				// Release date; only store it when the value actually parses,
				// instead of formatting the result of a failed parse.
				if tmpDate, err := goment.New(value, "YYYY-MM-DD"); err == nil {
					sc.Released = tmpDate.Format("YYYY-MM-DD")
				}

			case `Genre(s):`:
				// Tags
				tr.ForEach("span.genre > a", func(id int, anchor *colly.HTMLElement) {
					if !strings.Contains(anchor.Attr("href"), "/genre/") {
						return
					}
					if tag := ProcessJavrTag(anchor.Text); tag != "" {
						sc.Tags = append(sc.Tags, tag)
					}
				})

			case `Cast:`:
				// Cast
				tr.ForEach("span.star > a", func(id int, anchor *colly.HTMLElement) {
					if !strings.Contains(anchor.Attr("href"), "/star/") {
						return
					}
					// Trim and skip blank names, matching the javdatabase scraper.
					if name := strings.TrimSpace(anchor.Text); name != "" {
						sc.Cast = append(sc.Cast, name)
					}
				})

			case `Content ID:`:
				contentId = value
			}
		})

		// Screenshots
		html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
			linkHref := anchor.Attr(`href`)
			// Rewrite links on the pics.vpdmm.cc host to the pics.dmm.co.jp host.
			if strings.HasPrefix(linkHref, "https://pics.vpdmm.cc/") && strings.HasSuffix(linkHref, `.jpg`) {
				linkHref = strings.Replace(linkHref, "https://pics.vpdmm.cc/", "https://pics.dmm.co.jp/", 1)
			}
			if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) {
				sc.Gallery = append(sc.Gallery, linkHref)
			}
		})

		// Synopsis: the page title with the site suffix removed.
		if title := html.DOM.Find("title"); title.Length() == 1 {
			descr := strings.ReplaceAll(title.Text(), "- JAV.Land", "")
			sc.Synopsis = strings.TrimSpace(descr)
		}

		// Apply post-processing for error-correcting code
		PostProcessJavScene(&sc, contentId)

		// Pages without a DVD ID (e.g. empty search results) are dropped.
		if sc.SceneID != "" {
			*out = append(*out, sc)
		}
	})

	// Allow comma-separated scene id's
	for _, v := range strings.Split(queryString, ",") {
		sceneID := strings.ToLower(strings.TrimSpace(v))
		if sceneID == "" {
			continue
		}
		// Best effort: a failed visit simply contributes no scene.
		sceneCollector.Visit("https://jav.land/en/id_search.php?keys=" + sceneID)
	}

	sceneCollector.Wait()
}
Loading

0 comments on commit 2db2ecf

Please sign in to comment.