forked from xbapps/xbvr
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
scrapers: Added more JAV scrapers (javland, javlibrary, javbus) (xbapps#1100)
* javr: enable multiple scraper options, and add javlibrary and javbus * javr: add jav.land as well * Update javdatabase.go Add `javdatabase` as a tag * Update javbus.go Always add `javbus` as a tag * Update javland.go Always add 'jav.land' as a tag * Update javlibrary.go Always add javlibrary as a tag * Update javtags.go Changed both of the skip/re-map lists to tab separation. Moved "solo/solo work/solowork" to the "drop" list, I forgot what the R18 tag was, but unless I am mistaken, it was a tag they (and FANZA by extension) automatically add(ed) to titles that aren't part of an overarching "series". It's meaningless to us since we don't scrape R18/DMM's "series" listings, nor would we have a way to filter for them in XBVR. Changed "kiss kiss" to the tag to be retained, as that was the tag R18 used, and most users would already have plenty of in their databases. Probably best to maintain continuity with the old R18 tags whenever possible if this is to be done. * Update javtags.go Similarly, `suntan` was the tag R18 used, I have 12 entries in my library that pre-date the manifests I started writing myself. * javr: more error-correcting code, and less code-reuse between scrapers Co-authored-by: vt-idiot <[email protected]>
- Loading branch information
Showing
8 changed files
with
581 additions
and
49 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
package scrape | ||
|
||
import ( | ||
"regexp" | ||
"strings" | ||
|
||
"github.com/gocolly/colly" | ||
"github.com/xbapps/xbvr/pkg/models" | ||
) | ||
|
||
func ScrapeJavBus(out *[]models.ScrapedScene, queryString string) { | ||
sceneCollector := createCollector("www.javbus.com") | ||
|
||
sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) { | ||
sc := models.ScrapedScene{} | ||
sc.SceneType = "VR" | ||
|
||
// Always add 'javr' as a tag | ||
sc.Tags = append(sc.Tags, `javr`) | ||
|
||
// Always add 'javbus' as a tag | ||
sc.Tags = append(sc.Tags, `javbus`) | ||
|
||
html.ForEach(`div.row.movie div.info > p`, func(id int, p *colly.HTMLElement) { | ||
label := p.ChildText(`span.header`) | ||
|
||
if label == `Studio:` { | ||
// Studio | ||
sc.Studio = p.ChildText(`a`) | ||
|
||
} else if label == `ID:` { | ||
// Title, SceneID and SiteID all like 'VRKM-821' format | ||
idRegex := regexp.MustCompile("^([A-Za-z0-9]+)-([0-9]+)$") | ||
p.ForEach("span", func(_ int, span *colly.HTMLElement) { | ||
match := idRegex.FindStringSubmatch(span.Text) | ||
if match != nil && len(match) > 2 { | ||
dvdId := match[1] + "-" + match[2] | ||
sc.Title = dvdId | ||
sc.SceneID = dvdId | ||
sc.SiteID = dvdId | ||
sc.Site = match[1] | ||
} | ||
}) | ||
|
||
} else if label == `Release Date:` { | ||
// Release date | ||
dateStr := p.Text | ||
dateRegex := regexp.MustCompile("(\\d\\d\\d\\d-\\d\\d-\\d\\d)") | ||
match := dateRegex.FindStringSubmatch(dateStr) | ||
if match != nil && len(match) > 1 { | ||
sc.Released = match[1] | ||
} | ||
} | ||
}) | ||
|
||
// Tags | ||
html.ForEach("div.row.movie span.genre > label > a", func(id int, anchor *colly.HTMLElement) { | ||
href := anchor.Attr("href") | ||
if strings.Contains(href, "javbus.com/en/genre/") { | ||
// Tags | ||
tag := ProcessJavrTag(anchor.Text) | ||
|
||
if tag != "" { | ||
sc.Tags = append(sc.Tags, tag) | ||
} | ||
} | ||
}) | ||
|
||
// Cast | ||
html.ForEach("div.row.movie div.star-name > a", func(id int, anchor *colly.HTMLElement) { | ||
href := anchor.Attr("href") | ||
if strings.Contains(href, "javbus.com/en/star/") { | ||
sc.Cast = append(sc.Cast, anchor.Text) | ||
} | ||
}) | ||
|
||
// Screenshots | ||
html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) { | ||
linkHref := anchor.Attr(`href`) | ||
if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) { | ||
sc.Gallery = append(sc.Gallery, linkHref) | ||
} | ||
}) | ||
|
||
// Apply post-processing for error-correcting code | ||
PostProcessJavScene(&sc, "") | ||
|
||
if sc.SceneID != "" { | ||
*out = append(*out, sc) | ||
} | ||
}) | ||
|
||
// Allow comma-separated scene id's | ||
scenes := strings.Split(queryString, ",") | ||
for _, v := range scenes { | ||
sceneCollector.Visit("https://www.javbus.com/en/" + strings.ToUpper(v) + "/") | ||
} | ||
|
||
sceneCollector.Wait() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
package scrape | ||
|
||
import ( | ||
"strings" | ||
|
||
"github.com/gocolly/colly" | ||
"github.com/nleeper/goment" | ||
"github.com/xbapps/xbvr/pkg/models" | ||
) | ||
|
||
func ScrapeJavLand(out *[]models.ScrapedScene, queryString string) { | ||
sceneCollector := createCollector("jav.land") | ||
|
||
sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) { | ||
sc := models.ScrapedScene{} | ||
sc.SceneType = "VR" | ||
contentId := "" | ||
|
||
// Always add 'javr' as a tag | ||
sc.Tags = append(sc.Tags, `javr`) | ||
|
||
// Always add 'jav.land' as a tag | ||
sc.Tags = append(sc.Tags, `jav.land`) | ||
|
||
html.ForEach(`table.videotextlist tr`, func(id int, tr *colly.HTMLElement) { | ||
tds := tr.DOM.Children() | ||
if tds.Length() != 2 { | ||
return | ||
} | ||
label := tds.First().Text() | ||
value := tds.Last().Text() | ||
|
||
if label == `Maker:` { | ||
// Studio | ||
sc.Studio = value | ||
|
||
} else if label == `DVD ID:` { | ||
// Title, SceneID and SiteID all like 'VRKM-821' format | ||
dvdId := strings.ToUpper(value) | ||
sc.Title = dvdId | ||
sc.SceneID = dvdId | ||
sc.SiteID = dvdId | ||
|
||
// Set 'Site' to first part of the ID (e.g. `VRKM for `vrkm-821`) | ||
siteParts := strings.Split(dvdId, `-`) | ||
if len(siteParts) > 0 { | ||
sc.Site = siteParts[0] | ||
} | ||
|
||
} else if label == `Release Date:` { | ||
// Release date | ||
tmpDate, _ := goment.New(strings.TrimSpace(value), "YYYY-MM-DD") | ||
sc.Released = tmpDate.Format("YYYY-MM-DD") | ||
|
||
} else if label == `Genre(s):` { | ||
// Tags | ||
tr.ForEach("span.genre > a", func(id int, anchor *colly.HTMLElement) { | ||
href := anchor.Attr("href") | ||
if strings.Contains(href, "/genre/") { | ||
// Tags | ||
tag := ProcessJavrTag(anchor.Text) | ||
|
||
if tag != "" { | ||
sc.Tags = append(sc.Tags, tag) | ||
} | ||
} | ||
}) | ||
|
||
} else if label == `Cast:` { | ||
// Tags | ||
tr.ForEach("span.star > a", func(id int, anchor *colly.HTMLElement) { | ||
href := anchor.Attr("href") | ||
if strings.Contains(href, "/star/") { | ||
sc.Cast = append(sc.Cast, anchor.Text) | ||
} | ||
}) | ||
|
||
} else if label == `Content ID:` { | ||
contentId = value | ||
} | ||
}) | ||
|
||
// Screenshots | ||
html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) { | ||
linkHref := anchor.Attr(`href`) | ||
if strings.HasPrefix(linkHref, "https://pics.vpdmm.cc/") && strings.HasSuffix(linkHref, `.jpg`) { | ||
linkHref = strings.Replace(linkHref, "https://pics.vpdmm.cc/", "https://pics.dmm.co.jp/", 1) | ||
} | ||
if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) { | ||
sc.Gallery = append(sc.Gallery, linkHref) | ||
} | ||
}) | ||
|
||
// Synopsis | ||
title := html.DOM.Find("title") | ||
if title != nil && title.Length() == 1 { | ||
descr := title.Text() | ||
descr = strings.ReplaceAll(descr, "- JAV.Land", "") | ||
sc.Synopsis = descr | ||
} | ||
|
||
// Apply post-processing for error-correcting code | ||
PostProcessJavScene(&sc, contentId) | ||
|
||
if sc.SceneID != "" { | ||
*out = append(*out, sc) | ||
} | ||
}) | ||
|
||
// Allow comma-separated scene id's | ||
scenes := strings.Split(queryString, ",") | ||
for _, v := range scenes { | ||
sceneCollector.Visit("https://jav.land/en/id_search.php?keys=" + strings.ToLower(v)) | ||
} | ||
|
||
sceneCollector.Wait() | ||
} |
Oops, something went wrong.