diff --git a/go.mod b/go.mod index 88c83f744..81c2442b6 100644 --- a/go.mod +++ b/go.mod @@ -15,6 +15,7 @@ require ( github.com/araddon/dateparse v0.0.0-20200409225146-d820a6159ab1 github.com/avast/retry-go v3.0.0+incompatible github.com/blevesearch/bleve v1.0.14 + github.com/bregydoc/gtranslate v0.0.0-20200913051839-1bd07f6c1fc5 github.com/creasty/defaults v1.5.1 github.com/darwayne/go-timecode v1.1.0 github.com/djherbis/times v1.2.0 @@ -77,6 +78,7 @@ require ( golang.org/x/net v0.0.0-20210226172049-e18ecbb05110 golang.org/x/oauth2 v0.0.0-20210220000619-9bb904979d93 golang.org/x/sys v0.0.0-20210228012217-479acdf4ea46 + golang.org/x/text v0.3.4 gopkg.in/cheggaaa/pb.v1 v1.0.28 gopkg.in/gormigrate.v1 v1.6.0 gopkg.in/resty.v1 v1.12.0 diff --git a/pkg/models/model_tag.go b/pkg/models/model_tag.go index dfa73c82c..1f20adcfa 100644 --- a/pkg/models/model_tag.go +++ b/pkg/models/model_tag.go @@ -85,7 +85,7 @@ func ConvertTag(t string) string { return "blowjob" } - if funk.Contains([]string{"boobs job", "titty fucking", "titjob"}, t) { + if funk.Contains([]string{"boobs job", "titty fucking", "tittyfuck", "titjob"}, t) { return "titty fuck" } @@ -297,7 +297,7 @@ func ConvertTag(t string) string { return "parody" } - if funk.Contains([]string{"fingering"}, t) { + if funk.Contains([]string{"fingering", "masterbation"}, t) { return "masturbation" } diff --git a/pkg/scrape/caribbeancom.go b/pkg/scrape/caribbeancom.go new file mode 100644 index 000000000..ffbcd4dfb --- /dev/null +++ b/pkg/scrape/caribbeancom.go @@ -0,0 +1,130 @@ +package scrape + +import ( + "strconv" + "strings" + "sync" + + "github.com/bregydoc/gtranslate" + "github.com/gocolly/colly" + "github.com/mozillazg/go-slugify" + "github.com/thoas/go-funk" + "github.com/tidwall/gjson" + "github.com/xbapps/xbvr/pkg/models" + "golang.org/x/text/language" +) + +func CariVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error { + defer wg.Done() + scraperID := "caribbeancomvr" + siteID := "CaribbeanCom VR" + logScrapeStart(scraperID, siteID) + + sceneCollector := createCollector("en.caribbeancom.com", "www.caribbeancom.com") + siteCollector := createCollector("en.caribbeancom.com", "www.caribbeancom.com") + sceneCollectorJap := cloneCollector(sceneCollector) + + sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) { + + sc := models.ScrapedScene{} + sc.SceneType = "VR" + sc.Studio = "Caribbeancom" + sc.Site = siteID + sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0] + + // Scene ID - get from JavaScript + e.ForEach(`script`, func(id int, e *colly.HTMLElement) { + if !strings.Contains(e.Text, "movie_seq") { + return + } + jsonData := e.Text[strings.Index(e.Text, "{") : len(e.Text)-3] + movSeq := gjson.Get(jsonData, "movie_seq").String() + if movSeq == "" { + return + } + sc.SiteID = movSeq + sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID + }) + + // Title + e.ForEach(`h1[itemprop=name]`, func(id int, e *colly.HTMLElement) { + sc.Title = strings.TrimSpace(strings.Replace(e.Text, "[VR] ", "", 1)) + }) + + // Cover + coverURL := strings.Replace(strings.Replace(sc.HomepageURL, "eng/", "", 1), "index.html", "images/poster_en.jpg", 1) + if len(coverURL) > 0 { + sc.Covers = append(sc.Covers, coverURL) + } + + // Filename 011421-001-carib-2160p.mp4 + sc.Filenames = append(sc.Filenames, strings.Split(coverURL, "/")[4]+"-carib-2160p.mp4") + + // Gallery + e.ForEach(`div.movie-gallery a.fancy-gallery`, func(id int, e *colly.HTMLElement) { + if strings.Compare(e.Attr(`data-is_sample`), "0") == 0 { + return + } + sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(e.Attr("href"))) + }) + + // Cast & Tags + e.ForEach(`div.movie-info a.spec__tag`, func(id int, e *colly.HTMLElement) { + if strings.Compare(e.Attr(`itemprop`), "actor") == 0 { + sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text)) + } else { + if (strings.Compare(e.Attr(`itemprop`), "genre") == 0) || (strings.Compare(e.Attr(`itemprop`), "url") == 0) { + sc.Tags = append(sc.Tags, strings.TrimSpace(e.Text)) + } + } + }) + + // Release Date + e.ForEach(`div.movie-info span`, func(id int, e *colly.HTMLElement) { + if e.Attr(`itemprop`) == "uploadDate" { + sc.Released = strings.TrimSpace(strings.Replace(e.Text, "/", "-", -1)) + } + // Duration + if e.Attr(`itemprop`) == "duration" { + tmpDuration := strings.Split(strings.Trim(e.Attr(`content`), "TS"), "M")[0] + sc.Duration, _ = strconv.Atoi(strings.Split(tmpDuration, "H")[1]) + } + }) + + sceneURLJap := strings.Replace(strings.Replace(sc.HomepageURL, "eng/", "", 1), "en.", "www.", 1) + ctx := colly.NewContext() + ctx.Put("scene", sc) + + sceneCollectorJap.Request("GET", sceneURLJap, nil, ctx, nil) + }) + + // Synopsis - Pull from Japanese site & translate + sceneCollectorJap.OnHTML(`html`, func(e *colly.HTMLElement) { + sc := e.Request.Ctx.GetAny("scene").(models.ScrapedScene) + e.ForEach(`p[itemprop=description]`, func(id int, e *colly.HTMLElement) { + sc.Synopsis, _ = gtranslate.Translate(strings.TrimSpace(e.Text), language.Japanese, language.English) + }) + + out <- sc + }) + + siteCollector.OnHTML(`div.media-thum a`, func(e *colly.HTMLElement) { + sceneURL := e.Request.AbsoluteURL(e.Attr("href")) + // If scene exists in database, there's no need to scrape + if !funk.ContainsString(knownScenes, sceneURL) { + sceneCollector.Visit(sceneURL) + } + }) + + siteCollector.Visit("https://en.caribbeancom.com/eng/listpages/vr1.htm") + + if updateSite { + updateSiteLastUpdate(scraperID) + } + logScrapeFinished(scraperID, siteID) + return nil +} + +func init() { + registerScraper("caribbeancomvr", "CaribbeanCom VR", "https://mcdn.vrporn.com/files/20191217194900/baimudan-vr-porn-studio-logo-vrporn.com-virtual-reality-porn.jpg", CariVR) +}