From e7185d1b33e8a41b9175119d464f46cffde4f3fd Mon Sep 17 00:00:00 2001
From: jrebey <55519905+jrebey@users.noreply.github.com>
Date: Sat, 9 Nov 2019 13:15:09 -0500
Subject: [PATCH] Add VRP Films scraper (#190)

---
 pkg/models/model_tag.go |  31 +++++---
 pkg/scrape/vrpfilms.go  | 164 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 184 insertions(+), 11 deletions(-)
 create mode 100644 pkg/scrape/vrpfilms.go

diff --git a/pkg/models/model_tag.go b/pkg/models/model_tag.go
index fce4701b3..421ba6bb6 100644
--- a/pkg/models/model_tag.go
+++ b/pkg/models/model_tag.go
@@ -22,13 +22,14 @@ func (t *Tag) Save() error {
 }
 
 func ConvertTag(t string) string {
-	t = strings.ToLower(t)
+	t = strings.TrimSpace(strings.ToLower(t))
 
 	if funk.Contains([]string{"180", "60fps", "60 fps", "5k", "5k+", "big dick", "big cocks",
-		"axaxqxrrysrwqua", "girl-boy", "virtual reality",
-		"virtual reality porn", "vr porn", "180 vr porn", "xxxsex vr",
-		"xxx vr porn", "VRconk", "sex onbed",
-	},t) {
+		"axaxqxrrysrwqua", "girl-boy", "virtual reality", "sex", "new",
+		"virtual reality porn", "vr porn", "180 vr porn", "xxxsex vr",
+		"xxx vr porn", "vrconk", "sex onbed", "pornstars", "vr", "vrp",
+		"bg", "coming soon", "vr 1080p porn",
+	}, t) {
 		return ""
 	}
 
@@ -60,7 +61,7 @@ func ConvertTag(t string) string {
 		return "threesome fmm"
 	}
 
-	if funk.Contains([]string{"big boobs"}, t) {
+	if funk.Contains([]string{"big boobs", "big tits porn"}, t) {
 		return "big tits"
 	}
 
@@ -124,7 +125,7 @@ func ConvertTag(t string) string {
 		return "latina"
 	}
 
-	if funk.Contains([]string{"lesbian love", "lesbians"}, t) {
+	if funk.Contains([]string{"lesbian love", "lesbians", "girlgirl", "girl-on-girl"}, t) {
 		return "lesbian"
 	}
 
@@ -152,7 +153,7 @@ func ConvertTag(t string) string {
 		return "squirting"
 	}
 
-	if funk.Contains([]string{"teens"}, t) {
+	if funk.Contains([]string{"teens", "18"}, t) {
 		return "teen"
 	}
 
@@ -200,11 +201,11 @@ func ConvertTag(t string) string {
 		return "no tattoos"
 	}
 
-	if funk.Contains([]string{"tattoo", "tatoos"}, t) {
+	if funk.Contains([]string{"tattoo", "tatoos", "tattoo(s)"}, t) {
 		return "tattoos"
 	}
 
-	if funk.Contains([]string{"piercing", "pirced pussy"}, t) {
+	if funk.Contains([]string{"piercing", "pirced pussy", "pierced navel"}, t) {
 		return "piercings"
 	}
 
@@ -272,7 +273,7 @@ func ConvertTag(t string) string {
 		return "dp"
 	}
 
-	if funk.Contains([]string{"pov fucking"}, t) {
+	if funk.Contains([]string{"pov fucking", "pov vr"}, t) {
 		return "pov"
 	}
 
@@ -280,5 +281,13 @@ func ConvertTag(t string) string {
 		return "parody"
 	}
 
+	if funk.Contains([]string{"fingering"}, t) {
+		return "masturbation"
+	}
+
+	if funk.Contains([]string{"solo models"}, t) {
+		return "solo"
+	}
+
 	return t
 }
diff --git a/pkg/scrape/vrpfilms.go b/pkg/scrape/vrpfilms.go
new file mode 100644
index 000000000..48aace93b
--- /dev/null
+++ b/pkg/scrape/vrpfilms.go
@@ -0,0 +1,164 @@
+package scrape
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"sync"
+
+	"github.com/gocolly/colly"
+	"github.com/mozillazg/go-slugify"
+	"github.com/nleeper/goment"
+	"github.com/thoas/go-funk"
+	"github.com/xbapps/xbvr/pkg/models"
+)
+
+// VRPFilms scrapes vrpfilms.com. It walks the paginated movie listing, visits
+// every linked page that is not in knownScenes (skipping "/join" links) and
+// sends one models.ScrapedScene per visited page on out. It calls wg.Done on
+// return, records the update time when updateSite is set, and returns nil.
+func VRPFilms(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene) error {
+	defer wg.Done()
+	logScrapeStart("vrpfilms", "VRP Films")
+
+	siteCollector := colly.NewCollector(
+		colly.AllowedDomains("vrpfilms.com", "www.vrpfilms.com"),
+		colly.CacheDir(siteCacheDir),
+		colly.UserAgent(userAgent),
+	)
+
+	sceneCollector := colly.NewCollector(
+		colly.AllowedDomains("vrpfilms.com", "www.vrpfilms.com"),
+		colly.CacheDir(sceneCacheDir),
+		colly.UserAgent(userAgent),
+	)
+
+	siteCollector.OnRequest(func(r *colly.Request) {
+		log.Println("visiting", r.URL.String())
+	})
+
+	sceneCollector.OnRequest(func(r *colly.Request) {
+		log.Println("visiting", r.URL.String())
+	})
+
+	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
+		sc := models.ScrapedScene{}
+		sc.SceneType = "VR"
+		sc.Studio = "VRP Films"
+		sc.Site = "VRP Films"
+		sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]
+
+		// Scene ID - get from download link. It's the closest thing they have to a scene id
+		sc.SiteID = e.ChildAttr(`a.member-download`, "data-main-product-id")
+		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
+
+		sc.Title = strings.TrimSpace(e.ChildText(`span.breadcrumb_last`))
+		coverURL := e.ChildAttr(`meta[property="og:image"]`, "content")
+		sc.Covers = append(sc.Covers, coverURL)
+
+		// No release date anywhere, but we can approximate based on the wordpress date of the
+		// cover image. It's at least better than nothing.
+		//
+		// https://vrpfilms.com/wp-content/uploads/2019/10/No-Boys-Just-Toys-Banner-1600x800.jpg
+		//
+		// Only set the date when the URL has that shape and parses; a missing or
+		// unexpected og:image must not panic the scraper.
+		urlParts := strings.Split(coverURL, "/")
+		if len(urlParts) > 6 {
+			tmpDate := fmt.Sprintf("%s-%s-01", urlParts[5], urlParts[6])
+			if date, err := goment.New(tmpDate, "YYYY-MM-DD"); err == nil {
+				sc.Released = date.Format("YYYY-MM-DD")
+			}
+		}
+
+		sc.Gallery = e.ChildAttrs(`.movies-gallery a`, "href")
+
+		unfilteredTags := []string{}
+		e.ForEach(`.detail p`, func(id int, e *colly.HTMLElement) {
+			if strings.Contains(e.Text, "Featuring:") {
+				// Featuring: Amber Jayne, Selvaggia
+				tmpCast := strings.Split(e.Text, ":")[1]
+				cast := strings.Split(strings.TrimSpace(tmpCast), ",")
+				funk.ForEach(cast, func(c string) {
+					sc.Cast = append(sc.Cast, strings.TrimSpace(c))
+				})
+
+			}
+
+			if strings.Contains(e.Text, "Length:") {
+				// Length: 35 Minutes
+				tmpDuration := strings.TrimSpace(strings.Split(e.Text, ":")[1])
+				duration, err := strconv.Atoi(strings.Split(tmpDuration, " ")[0])
+				if err == nil {
+					sc.Duration = duration
+				}
+			}
+
+			if strings.Contains(e.Text, "Tags:") {
+				tmpTags := strings.Split(e.Text, ":")[1]
+				tags := strings.Split(strings.TrimSpace(tmpTags), ",")
+				funk.ForEach(tags, func(t string) {
+					unfilteredTags = append(unfilteredTags, strings.TrimSpace(t))
+				})
+			}
+		})
+
+		// It pains me to have to do this
+		garbageTags := []string{"pussy", "polly pons", "little cindy",
+			"bass ass handy women", "hot",
+			"estate agent sex pov", "real estate sex vr",
+			"sandy's superstar escorts", "wet and wild",
+		}
+		sc.Tags = funk.FilterString(unfilteredTags, func(t string) bool {
+			lt := strings.ToLower(t)
+			if funk.ContainsString(garbageTags, lt) {
+				return false
+			}
+
+			var badTag bool
+			funk.ForEach(sc.Cast, func(c string) {
+				if strings.ToLower(c) == lt {
+					badTag = true
+				}
+			})
+
+			if badTag {
+				return false
+			}
+
+			if strings.ToLower(sc.Title) == lt {
+				return false
+			}
+			return true
+		})
+
+		out <- sc
+	})
+
+	siteCollector.OnHTML(`article a`, func(e *colly.HTMLElement) {
+		sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
+
+		if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/join") {
+			sceneCollector.Visit(sceneURL)
+		}
+	})
+
+	siteCollector.OnHTML(`a.page-numbers`, func(e *colly.HTMLElement) {
+		pageURL := e.Request.AbsoluteURL(e.Attr("href"))
+		if !strings.Contains(pageURL, "/join") {
+			siteCollector.Visit(pageURL)
+		}
+	})
+
+	siteCollector.Visit("https://vrpfilms.com/vrp-movies")
+
+	if updateSite {
+		updateSiteLastUpdate("vrpfilms")
+	}
+	logScrapeFinished("vrpfilms", "VRP Films")
+	return nil
+}
+
+func init() {
+	registerScraper("vrpfilms", "VRP Films", VRPFilms)
+}