diff --git a/pkg/api/tasks.go b/pkg/api/tasks.go index cdda36f6e..b4de85007 100644 --- a/pkg/api/tasks.go +++ b/pkg/api/tasks.go @@ -11,7 +11,8 @@ import ( ) type RequestScrapeJAVR struct { - Query string `json:"q"` + Scraper string `json:"s"` + Query string `json:"q"` } type RequestScrapeTPDB struct { @@ -165,7 +166,7 @@ func (i TaskResource) scrapeJAVR(req *restful.Request, resp *restful.Response) { } if r.Query != "" { - go tasks.ScrapeJAVR(r.Query) + go tasks.ScrapeJAVR(r.Query, r.Scraper) } } diff --git a/pkg/scrape/javbus.go b/pkg/scrape/javbus.go new file mode 100644 index 000000000..b4666271a --- /dev/null +++ b/pkg/scrape/javbus.go @@ -0,0 +1,100 @@ +package scrape + +import ( + "regexp" + "strings" + + "github.com/gocolly/colly" + "github.com/xbapps/xbvr/pkg/models" +) + +func ScrapeJavBus(out *[]models.ScrapedScene, queryString string) { + sceneCollector := createCollector("www.javbus.com") + + sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) { + sc := models.ScrapedScene{} + sc.SceneType = "VR" + + // Always add 'javr' as a tag + sc.Tags = append(sc.Tags, `javr`) + + // Always add 'javbus' as a tag + sc.Tags = append(sc.Tags, `javbus`) + + html.ForEach(`div.row.movie div.info > p`, func(id int, p *colly.HTMLElement) { + label := p.ChildText(`span.header`) + + if label == `Studio:` { + // Studio + sc.Studio = p.ChildText(`a`) + + } else if label == `ID:` { + // Title, SceneID and SiteID all like 'VRKM-821' format + idRegex := regexp.MustCompile("^([A-Za-z0-9]+)-([0-9]+)$") + p.ForEach("span", func(_ int, span *colly.HTMLElement) { + match := idRegex.FindStringSubmatch(span.Text) + if match != nil && len(match) > 2 { + dvdId := match[1] + "-" + match[2] + sc.Title = dvdId + sc.SceneID = dvdId + sc.SiteID = dvdId + sc.Site = match[1] + } + }) + + } else if label == `Release Date:` { + // Release date + dateStr := p.Text + dateRegex := regexp.MustCompile("(\\d\\d\\d\\d-\\d\\d-\\d\\d)") + match := dateRegex.FindStringSubmatch(dateStr) + if match 
!= nil && len(match) > 1 { + sc.Released = match[1] + } + } + }) + + // Tags + html.ForEach("div.row.movie span.genre > label > a", func(id int, anchor *colly.HTMLElement) { + href := anchor.Attr("href") + if strings.Contains(href, "javbus.com/en/genre/") { + // Tags + tag := ProcessJavrTag(anchor.Text) + + if tag != "" { + sc.Tags = append(sc.Tags, tag) + } + } + }) + + // Cast + html.ForEach("div.row.movie div.star-name > a", func(id int, anchor *colly.HTMLElement) { + href := anchor.Attr("href") + if strings.Contains(href, "javbus.com/en/star/") { + sc.Cast = append(sc.Cast, anchor.Text) + } + }) + + // Screenshots + html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) { + linkHref := anchor.Attr(`href`) + if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) { + sc.Gallery = append(sc.Gallery, linkHref) + } + }) + + // Apply post-processing for error-correcting code + PostProcessJavScene(&sc, "") + + if sc.SceneID != "" { + *out = append(*out, sc) + } + }) + + // Allow comma-separated scene id's + scenes := strings.Split(queryString, ",") + for _, v := range scenes { + sceneCollector.Visit("https://www.javbus.com/en/" + strings.ToUpper(v) + "/") + } + + sceneCollector.Wait() +} diff --git a/pkg/scrape/javdatabase.go b/pkg/scrape/javdatabase.go index ceeb4d97e..d1e349b9c 100644 --- a/pkg/scrape/javdatabase.go +++ b/pkg/scrape/javdatabase.go @@ -1,7 +1,6 @@ package scrape import ( - "regexp" "strings" "github.com/PuerkitoBio/goquery" @@ -10,7 +9,7 @@ import ( "github.com/xbapps/xbvr/pkg/models" ) -func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString string) { +func ScrapeJavDB(out *[]models.ScrapedScene, queryString string) { sceneCollector := createCollector("www.javdatabase.com") sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) { @@ -21,14 +20,8 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s // Always add 'javr' as a tag 
sc.Tags = append(sc.Tags, `javr`) - // Skipping some very generic and useless tags - skiptags := map[string]bool{ - "featured actress": true, - "vr exclusive": true, - "high-quality vr": true, - "hi-def": true, - "exclusive distribution": true, - } + // Always add 'javdatabase' as a tag + sc.Tags = append(sc.Tags, `javdatabase`) // Cast html.ForEach("h2.subhead", func(id int, h2 *colly.HTMLElement) { @@ -37,8 +30,9 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s parent := dom.Parent() if parent != nil { parent.Find("a").Each(func(i int, anchor *goquery.Selection) { - if anchor.Text() != "" { - sc.Cast = append(sc.Cast, anchor.Text()) + href, exists := anchor.Attr("href") + if exists && strings.Contains(href, "javdatabase.com/idols/") && anchor.Text() != "" { + sc.Cast = append(sc.Cast, strings.TrimSpace(anchor.Text())) } }) } @@ -82,9 +76,9 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s href := anchor.Attr("href") if strings.Contains(href, "javdatabase.com/genres/") { // Tags - tag := strings.ToLower(anchor.Text) + tag := ProcessJavrTag(anchor.Text) - if !skiptags[tag] { + if tag != "" { sc.Tags = append(sc.Tags, tag) } } @@ -96,8 +90,6 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s } else if label == `Content ID:` { contentId = tr.ChildText(`td.tablevalue`) - sc.HomepageURL = `https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=` + contentId + `/` - sc.Covers = append(sc.Covers, `https://pics.dmm.co.jp/digital/video/`+contentId+`/`+contentId+`pl.jpg`) } }) @@ -113,22 +105,12 @@ func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString s } }) - // Some specific postprocessing for error-correcting 3DSVR scenes - if len(contentId) > 0 && sc.Site == "DSVR" { - r := regexp.MustCompile("13dsvr0(\\d{4})") - match := r.FindStringSubmatch(contentId) - if match != nil && len(match) > 1 { - // Found a 3DSVR scene that is being wrongly categorized 
as DSVR - log.Println("Applying DSVR->3DSVR workaround") - sid := match[1] - sc.Site = "3DSVR" - sc.SceneID = "3DSVR-" + sid - sc.Title = sc.SceneID - sc.SiteID = sc.SceneID - } - } + // Apply post-processing for error-correcting code + PostProcessJavScene(&sc, contentId) - *out = append(*out, sc) + if sc.SceneID != "" { + *out = append(*out, sc) + } }) // Allow comma-separated scene id's diff --git a/pkg/scrape/javland.go b/pkg/scrape/javland.go new file mode 100644 index 000000000..8c0b472b2 --- /dev/null +++ b/pkg/scrape/javland.go @@ -0,0 +1,117 @@ +package scrape + +import ( + "strings" + + "github.com/gocolly/colly" + "github.com/nleeper/goment" + "github.com/xbapps/xbvr/pkg/models" +) + +func ScrapeJavLand(out *[]models.ScrapedScene, queryString string) { + sceneCollector := createCollector("jav.land") + + sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) { + sc := models.ScrapedScene{} + sc.SceneType = "VR" + contentId := "" + + // Always add 'javr' as a tag + sc.Tags = append(sc.Tags, `javr`) + + // Always add 'jav.land' as a tag + sc.Tags = append(sc.Tags, `jav.land`) + + html.ForEach(`table.videotextlist tr`, func(id int, tr *colly.HTMLElement) { + tds := tr.DOM.Children() + if tds.Length() != 2 { + return + } + label := tds.First().Text() + value := tds.Last().Text() + + if label == `Maker:` { + // Studio + sc.Studio = value + + } else if label == `DVD ID:` { + // Title, SceneID and SiteID all like 'VRKM-821' format + dvdId := strings.ToUpper(value) + sc.Title = dvdId + sc.SceneID = dvdId + sc.SiteID = dvdId + + // Set 'Site' to first part of the ID (e.g. 
`VRKM for `vrkm-821`) + siteParts := strings.Split(dvdId, `-`) + if len(siteParts) > 0 { + sc.Site = siteParts[0] + } + + } else if label == `Release Date:` { + // Release date + tmpDate, _ := goment.New(strings.TrimSpace(value), "YYYY-MM-DD") + sc.Released = tmpDate.Format("YYYY-MM-DD") + + } else if label == `Genre(s):` { + // Tags + tr.ForEach("span.genre > a", func(id int, anchor *colly.HTMLElement) { + href := anchor.Attr("href") + if strings.Contains(href, "/genre/") { + // Tags + tag := ProcessJavrTag(anchor.Text) + + if tag != "" { + sc.Tags = append(sc.Tags, tag) + } + } + }) + + } else if label == `Cast:` { + // Tags + tr.ForEach("span.star > a", func(id int, anchor *colly.HTMLElement) { + href := anchor.Attr("href") + if strings.Contains(href, "/star/") { + sc.Cast = append(sc.Cast, anchor.Text) + } + }) + + } else if label == `Content ID:` { + contentId = value + } + }) + + // Screenshots + html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) { + linkHref := anchor.Attr(`href`) + if strings.HasPrefix(linkHref, "https://pics.vpdmm.cc/") && strings.HasSuffix(linkHref, `.jpg`) { + linkHref = strings.Replace(linkHref, "https://pics.vpdmm.cc/", "https://pics.dmm.co.jp/", 1) + } + if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) { + sc.Gallery = append(sc.Gallery, linkHref) + } + }) + + // Synopsis + title := html.DOM.Find("title") + if title != nil && title.Length() == 1 { + descr := title.Text() + descr = strings.ReplaceAll(descr, "- JAV.Land", "") + sc.Synopsis = descr + } + + // Apply post-processing for error-correcting code + PostProcessJavScene(&sc, contentId) + + if sc.SceneID != "" { + *out = append(*out, sc) + } + }) + + // Allow comma-separated scene id's + scenes := strings.Split(queryString, ",") + for _, v := range scenes { + sceneCollector.Visit("https://jav.land/en/id_search.php?keys=" + strings.ToLower(v)) + } + + sceneCollector.Wait() +} diff --git 
a/pkg/scrape/javlibrary.go b/pkg/scrape/javlibrary.go new file mode 100644 index 000000000..5a6d00bac --- /dev/null +++ b/pkg/scrape/javlibrary.go @@ -0,0 +1,162 @@ +package scrape + +import ( + "net/url" + "regexp" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/gocolly/colly" + "github.com/nleeper/goment" + "github.com/xbapps/xbvr/pkg/models" +) + +func ScrapeJavLibrary(out *[]models.ScrapedScene, queryString string) { + sceneCollector := createCollector("www.javlibrary.com") + + sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) { + // This html page might be the redirected video details page, or the search results, + // find out which by looking inside the DOM + boxTitle := e.DOM.Find("div.boxtitle") + if boxTitle != nil { + r := regexp.MustCompile("\"([^\"]+)\" ID Search Result") + match := r.FindStringSubmatch(boxTitle.Text()) + if match != nil && len(match) > 1 { + // Found a search results page (the query is scraped text, so log it as an argument, never as the format string) + searchQuery := strings.ToLower(match[1]) + log.Printf("Search results page found for %s", searchQuery) + + // Try to find exact match in the results + videos := e.DOM.Find("div.videos div.video a") + videos.Each(func(_ int, el *goquery.Selection) { + sel := el.Find("div.id") + if sel != nil { + if strings.ToLower(sel.Text()) == searchQuery { + href, exists := el.Attr("href") + if exists { + // Found matching search result, visit it + baseURL := e.Request.URL + hrefURL, err := url.Parse(href) + if err == nil { + linkURL := baseURL.ResolveReference(hrefURL) + sceneCollector.Visit(linkURL.String()) + } + } + } + } + }) + + return // end parsing html search results page + } + } + + // Begin parsing scene details + sc := models.ScrapedScene{} + sc.SceneType = "VR" + + // Tags + // Always add 'javr' as a tag + sc.Tags = append(sc.Tags, `javr`) + + // Always add 'javlibrary' as a tag + sc.Tags = append(sc.Tags, `javlibrary`) + + // ID + videoIdSel := e.DOM.Find("div#video_id td.text") + if videoIdSel != nil { + dvdId := 
strings.ToUpper(videoIdSel.Text()) + sc.Title = dvdId + sc.SiteID = dvdId + sc.SceneID = dvdId + + // Set 'Site' to first part of the ID (e.g. `VRKM for `vrkm-821`) + siteParts := strings.Split(dvdId, `-`) + if len(siteParts) > 0 { + sc.Site = siteParts[0] + } + } + + // Cover image + coverImg := e.DOM.Find("img#video_jacket_img") + if coverImg != nil { + src, exists := coverImg.Attr("src") + if exists { + if strings.HasPrefix(src, "//") { + // include protocol in image urls + src = "https:" + src + } + sc.Covers = append(sc.Covers, src) + } + } + + // Gallery + previewDiv := e.DOM.Find("div.previewthumbs") + if previewDiv != nil { + imgEls := previewDiv.Find("img") + imgEls.Each(func(_ int, s *goquery.Selection) { + src, exists := s.Attr("src") + if exists { + if strings.HasPrefix(src, "//") { + // include protocol in image urls + src = "https:" + src + } + + // Replace low-res version with higher-res version for specific pics.dmm.co.jp images + m := regexp.MustCompile("//pics.dmm.co.jp/digital/video/([^/]+)/(.+[0-9])-([0-9]+).jpg") + res := m.ReplaceAllString(src, "//pics.dmm.co.jp/digital/video/${1}/${2}jp-${3}.jpg") + sc.Gallery = append(sc.Gallery, res) + } + }) + } + + // Release date + videoDateTd := e.DOM.Find("div#video_date td.text") + if videoDateTd != nil { + dateStr := videoDateTd.Text() + tmpDate, _ := goment.New(strings.TrimSpace(dateStr), "YYYY-MM-DD") + sc.Released = tmpDate.Format("YYYY-MM-DD") + } + + // Cast + videoCastSel := e.DOM.Find("span.star") + videoCastSel.Each(func(_ int, s *goquery.Selection) { + sc.Cast = append(sc.Cast, strings.TrimSpace(s.Text())) + }) + + // Genre + videoGenreSel := e.DOM.Find("span.genre") + videoGenreSel.Each(func(_ int, s *goquery.Selection) { + tag := ProcessJavrTag(s.Text()) + if tag != "" { + sc.Tags = append(sc.Tags, tag) + } + }) + + // Description + videoTitleSel := e.DOM.Find("div#video_title h3") + if videoTitleSel != nil { + sc.Synopsis = videoTitleSel.Text() + } + + // Studio + videoStudioSel := 
e.DOM.Find("span.maker") + if videoStudioSel != nil { + sc.Studio = videoStudioSel.Text() + } + + // Apply post-processing for error-correcting code + PostProcessJavScene(&sc, "") + + if sc.SceneID != "" { + *out = append(*out, sc) + } + }) + + // Allow comma-separated scene id's + scenes := strings.Split(queryString, ",") + for _, v := range scenes { + sceneCollector.Visit("https://www.javlibrary.com/en/vl_searchbyid.php?keyword=" + strings.ToLower(v)) + } + + sceneCollector.Wait() +} diff --git a/pkg/scrape/javutil.go b/pkg/scrape/javutil.go new file mode 100644 index 000000000..8ed9da2d5 --- /dev/null +++ b/pkg/scrape/javutil.go @@ -0,0 +1,163 @@ +package scrape + +import ( + "fmt" + "regexp" + "strconv" + "strings" + + "github.com/xbapps/xbvr/pkg/models" +) + +/* ProcessJavrTag returns an empty string if this tag is to be skipped, or the + * normalized (trimmed, lowercased, possibly remapped) tag if it should be included. + */ +func ProcessJavrTag(tag string) string { + taglower := strings.TrimSpace(strings.ToLower(tag)) + + // Skipping some very generic and useless tags + skiptags := map[string]bool{ + "featured actress": true, + "vr exclusive": true, + "high quality vr": true, + "high-quality vr": true, + "vr": true, + "vr only": true, + "hi-def": true, + "exclusive distribution": true, + "single work": true, + "solo work": true, + "solowork": true, + } + if skiptags[taglower] { + return "" + } + + // Map some tags to normalize so different sources match + // TODO: this mapping is totally incomplete and needs help from community to fill + maptags := map[string]string{ + "blow": "blowjob", + "blow job": "blowjob", + "kiss": "kiss kiss", + "kiss / kiss": "kiss kiss", + "prostitute": "club hostess & sex worker", + "prostitutes": "club hostess & sex worker", + "sun tan": "suntan", + } + if maptags[taglower] != "" { + return maptags[taglower] + } + + // Leave out tags containing characters outside the allowed ASCII set (e.g. Japanese text) + matched, err := regexp.Match("[^a-z0-9_\\- /&()\\+]", []byte(taglower)) + if matched == true || err != nil { + return "" + } + + // keep 
tag as-is (but lowercase) + return taglower +} + +func determineContentId(sc *models.ScrapedScene) string { + contentId := "" + contentIdRegex := regexp.MustCompile("//pics.dmm.co.jp/digital/video/([^/]+)/") + + // obtain from cover + for i := range sc.Covers { + href := sc.Covers[i] + match := contentIdRegex.FindStringSubmatch(href) + if match != nil && len(match) > 1 { + contentId = match[1] + log.Println("Found content ID from cover image: " + contentId) + break + } + } + + // obtain from gallery + if len(contentId) == 0 { + for i := range sc.Gallery { + href := sc.Gallery[i] + match := contentIdRegex.FindStringSubmatch(href) + if match != nil && len(match) > 1 { + contentId = match[1] + log.Println("Found content ID from gallery image: " + contentId) + break + } + } + } + + // last resort: build from dvd id + if len(contentId) == 0 { + // Guess contentId based on dvdId, as javbus simply doesn't have it otherwise. + // 3DSVR-0878 and FSDSS-335 are examples of scenes that really has no contentId there + parts := strings.Split(sc.SceneID, `-`) + if len(parts) == 2 { + site := strings.ToLower(parts[0]) + numstr := parts[1] + i, _ := strconv.ParseInt(numstr, 10, 32) + nameMap := map[string]bool{ + "3dsvr": true, + "fsdss": true, + } + if nameMap[site] == true { + site = "1" + site + } + contentId = fmt.Sprintf("%s%05d", site, i) + log.Println("Fallback content ID from dvd ID: " + contentId) + } + } + + return contentId +} + +func PostProcessJavScene(sc *models.ScrapedScene, contentId string) { + if sc.SceneID == "" { + log.Println("Scene not found.") + return + } + + if len(contentId) == 0 { + contentId = determineContentId(sc) + } + + // Set Homepage URL + if sc.HomepageURL == "" { + sc.HomepageURL = `https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=` + contentId + `/` + } + + // Set Cover URL + if len(sc.Covers) == 0 { + sc.Covers = append(sc.Covers, `https://pics.dmm.co.jp/digital/video/`+contentId+`/`+contentId+`pl.jpg`) + } + + // Fallback gallery images if 
needed (guess the six DMM preview image URLs jp-1..jp-6 from the content ID) + if len(sc.Gallery) == 0 { + for i := 1; i < 7; i++ { + url := fmt.Sprintf("https://pics.dmm.co.jp/digital/video/%s/%sjp-%d.jpg", contentId, contentId, i) + sc.Gallery = append(sc.Gallery, url) + } + } + + // Trim excess whitespace + if sc.Studio != "" { + sc.Studio = strings.TrimSpace(sc.Studio) + } + if sc.Synopsis != "" { + sc.Synopsis = strings.TrimSpace(sc.Synopsis) + } + + // Some specific postprocessing for error-correcting 3DSVR scenes + if len(contentId) > 0 && sc.Site == "DSVR" { + r := regexp.MustCompile("13dsvr0(\\d{4})") + match := r.FindStringSubmatch(contentId) + if match != nil && len(match) > 1 { + // Found a 3DSVR scene that is being wrongly categorized as DSVR + log.Println("Applying DSVR->3DSVR workaround") + sid := match[1] + sc.Site = "3DSVR" + sc.SceneID = "3DSVR-" + sid + sc.Title = sc.SceneID + sc.SiteID = sc.SceneID + } + } +} diff --git a/pkg/tasks/content.go b/pkg/tasks/content.go index c4cf1765e..188db6b5c 100644 --- a/pkg/tasks/content.go +++ b/pkg/tasks/content.go @@ -268,7 +268,7 @@ func Scrape(toScrape string) { } } -func ScrapeJAVR(queryString string) { +func ScrapeJAVR(queryString string, scraper string) { if !models.CheckLock("scrape") { models.CreateLock("scrape") defer models.RemoveLock("scrape") @@ -276,22 +276,22 @@ func ScrapeJAVR(queryString string) { tlog := log.WithField("task", "scrape") tlog.Infof("Scraping started at %s", t0.Format("Mon Jan _2 15:04:05 2006")) - // Get all known scenes - var scenes []models.Scene - db, _ := models.GetDB() - db.Find(&scenes) - db.Close() - - var knownScenes []string - for i := range scenes { - knownScenes = append(knownScenes, scenes[i].SceneURL) - } - // Start scraping var collectedScenes []models.ScrapedScene - tlog.Infof("Scraping JavDB") - scrape.ScrapeJavDB(knownScenes, &collectedScenes, queryString) + if scraper == "javlibrary" { + tlog.Infof("Scraping JavLibrary") + scrape.ScrapeJavLibrary(&collectedScenes, queryString) + } else if scraper == "javbus" { + 
tlog.Infof("Scraping JavBus") + scrape.ScrapeJavBus(&collectedScenes, queryString) + } else if scraper == "javland" { + tlog.Infof("Scraping JavLand") + scrape.ScrapeJavLand(&collectedScenes, queryString) + } else { + tlog.Infof("Scraping JavDB") + scrape.ScrapeJavDB(&collectedScenes, queryString) + } if len(collectedScenes) > 0 { db, _ := models.GetDB() diff --git a/ui/src/views/options/sections/OptionsSceneCreate.vue b/ui/src/views/options/sections/OptionsSceneCreate.vue index 155db8d1b..92a565ec7 100644 --- a/ui/src/views/options/sections/OptionsSceneCreate.vue +++ b/ui/src/views/options/sections/OptionsSceneCreate.vue @@ -1,9 +1,15 @@