From 3088cb69beaf61876d4966226bbbbaa417d10060 Mon Sep 17 00:00:00 2001 From: pops64 Date: Sat, 5 Oct 2024 15:32:13 -0400 Subject: [PATCH 1/5] WIP Full Site Scrape Only Currently scrapes the whole site. Needs logic to prevent rescrapes of scenes already processed. Needs logic to handle single scene scrapes. This is a completely different request body than the whole-site scrape. The JSON is in the same format for single scenes so whole site logic can be reused for extracting data. Tags need filtering as original series and adult time original are redundant tags. Code needs to be cleaned and formatted. --- pkg/scrape/upclosevr.go | 277 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 pkg/scrape/upclosevr.go diff --git a/pkg/scrape/upclosevr.go b/pkg/scrape/upclosevr.go new file mode 100644 index 000000000..288ca81c1 --- /dev/null +++ b/pkg/scrape/upclosevr.go @@ -0,0 +1,277 @@ +package scrape + +import ( + // "encoding/json" + "regexp" + "strconv" + "strings" + "sync" + // "net/http" + // "io" + // "fmt" + + "github.com/gocolly/colly/v2" + "github.com/mozillazg/go-slugify" + // "github.com/thoas/go-funk" + "github.com/tidwall/gjson" + "github.com/xbapps/xbvr/pkg/models" + "github.com/go-resty/resty/v2" + "github.com/nleeper/goment" + +) + +func UpCloseVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, singleSceneURL string, singeScrapeAdditionalInfo string, limitScraping bool) error { + // this scraper is non-standard in that it gathers info via an api rather than scraping html pages + defer wg.Done() + scraperID := "upclosevr" + siteID := "UpCloseVR" + logScrapeStart(scraperID, siteID) + // nextApiUrl := "" + + siteCollector := createCollector("www.upclosevr.com") + // apiCollector := createCollector("site-api.project1service.com") + // offset := 0 + + // apiCollector.OnResponse(func(r *colly.Response) { + // sceneListJson := gjson.ParseBytes(r.Body) + + // processScene := func(scene 
gjson.Result) { + // sc := models.ScrapedScene{} + // sc.ScraperID = scraperID + // sc.SceneType = "VR" + // sc.Studio = "BangBros" + // sc.Site = siteID + // id := strconv.Itoa(int(scene.Get("id").Int())) + // sc.SceneID = "bvr-" + id + + // sc.Title = scene.Get("title").String() + // sc.HomepageURL = "https://virtualporn.com/video/" + id + "/" + slugify.Slugify(strings.ReplaceAll(sc.Title, "'", "")) + // sc.MembersUrl = "https://site-ma.virtualporn.com/scene/" + id + "/" + slugify.Slugify(strings.ReplaceAll(sc.Title, "'", "")) + // sc.Synopsis = scene.Get("description").String() + // dateParts := strings.Split(scene.Get("dateReleased").String(), "T") + // sc.Released = dateParts[0] + + // scene.Get("images.poster").ForEach(func(key, imgGroup gjson.Result) bool { + // if key.String() == "0" { + // imgurl := imgGroup.Get("xl.urls.webp").String() + // if imgurl != "" { + // sc.Covers = append(sc.Covers, imgurl) + // } + + // } else { + // imgurl := imgGroup.Get("xl.urls.webp").String() + // if imgurl != "" { + // if len(sc.Covers) == 0 { + // sc.Covers = append(sc.Covers, imgurl) + // } else { + // sc.Gallery = append(sc.Gallery, imgurl) + // } + // } + // } + // return true + // }) + + // // Cast + // sc.ActorDetails = make(map[string]models.ActorDetails) + // scene.Get("actors").ForEach(func(key, actor gjson.Result) bool { + // name := actor.Get("name").String() + // if actor.Get("gender").String() == "female" { + // sc.Cast = append(sc.Cast, name) + // } + // sc.ActorDetails[actor.Get("name").String()] = models.ActorDetails{Source: scraperID + " scrape", ProfileUrl: "https://virtualporn.com/model/" + strconv.Itoa(int(actor.Get("id").Int())) + "/" + slugify.Slugify(name)} + // return true + // }) + + // // Tags + // scene.Get("tags").ForEach(func(key, tag gjson.Result) bool { + // if tag.Get("isVisible").Bool() { + // sc.Tags = append(sc.Tags, tag.Get("name").String()) + // } + // return true + // }) + + // // trailer & filename details + // sc.TrailerType = 
"urls" + // var trailers []models.VideoSource + // scene.Get("children").ForEach(func(key, child gjson.Result) bool { + // child.Get("videos.full.files").ForEach(func(key, file gjson.Result) bool { + // quality := file.Get("format").String() + // url := file.Get("urls.view").String() + // filename := file.Get("urls.download").String() + // if url != "" { + // trailers = append(trailers, models.VideoSource{URL: url, Quality: quality}) + // } + // pos := strings.Index(filename, "?filename=") + // if pos != -1 { + // sc.Filenames = append(sc.Filenames, filename[pos+10:]) + // } + // return true + // }) + // return true + // }) + // trailerJson, _ := json.Marshal(models.VideoSourceResponse{VideoSources: trailers}) + // sc.TrailerSrc = string(trailerJson) + + // out <- sc + + // } + // total := int(sceneListJson.Get("meta.total").Int()) + // scenes := sceneListJson.Get("result") + // if strings.Contains(r.Request.URL.RawQuery, "offset=") { + // scenes.ForEach(func(key, scene gjson.Result) bool { + // // check if we have the scene already + // matches := funk.Filter(knownScenes, func(s string) bool { + // return strings.Contains(s, scene.Get("id").String()) + // }) + // if funk.IsEmpty(matches) { + // processScene(scene) + // } + // return true + // }) + // } else { + // processScene(scenes) + // } + + // offset += 24 + // if offset < total { + // if !limitScraping { + // apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset)) + // } + // } + // }) + + siteCollector.OnHTML(`script`, func(e *colly.HTMLElement) { + re := regexp.MustCompile(`"apiKey":"(.+)"}},"site`) + apiKey := re.FindStringSubmatch(e.Text) + re = regexp.MustCompile(`"applicationID":"(.+)","apiKey`) + applicationID := re.FindStringSubmatch(e.Text) + + if len(apiKey) > 0 && len(applicationID) > 0{ + var data = 
strings.NewReader(`{"requests":[{"indexName":"all_scenes_latest_desc","params":"analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aupclosevr%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&clickAnalytics=true&facetingAfterDistinct=true&facets=%5B%22categories.name%22%5D&filters=(upcoming%3A'0')%20AND%20availableOnSite%3Aupclosevr&highlightPostTag=__%2Fais-highlight__&highlightPreTag=__ais-highlight__&hitsPerPage=60&maxValuesPerFacet=1000&page=0&query=&tagFilters="}]}`) + resp, _ := resty.New().R(). + SetHeader("Origin", "https://www.upclosevr.com"). + SetHeader("Referer", "https://www.upclosevr.com/"). + SetHeader("User-Agent", UserAgent). + SetHeader("x-algolia-api-key", apiKey[1]). + SetHeader("x-algolia-application-id", applicationID[1]). + SetBody(data). + Post("https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.22.1)%3B%20Browser%3B%20instantsearch.js%20(4.64.3)%3B%20react%20(18.2.0)%3B%20react-instantsearch%20(7.5.5)%3B%20react-instantsearch-core%20(7.5.5)%3B%20JS%20Helper%20(3.16.2)") + + // Convert the resp into a json string for gjson usability + jsonString := resp.String() + + // Determine the amount of Hits in the response to know array length. Index result of results.0.Hits is unreliable + nbScenes := int(gjson.Get(jsonString, "results.0.nbHits").Int()) + + for i:=0; i
", " ", -1)) + + // Title + sc.Title = strings.TrimSpace(gjson.Get(jsonString, queryStr + `.title`).String()) + log.Infoln(`Scraping ` + sc.Title) + + sc.ActorDetails = make(map[string]models.ActorDetails) + for i, name := range gjson.Get(jsonString, queryStr + `.female_actors.#.name`).Array(){ + sc.Cast = append(sc.Cast, name.String()) + + actorQuery := queryStr + `.female_actors.` + strconv.Itoa(i) + + sc.ActorDetails[name.String()] = models.ActorDetails{ + Source: scraperID + " scrape", + ProfileUrl: `https://www.upclosevr.com/en/pornstar/view/` + gjson.Get(jsonString, actorQuery + `.url_name`).String() + `/` + gjson.Get(jsonString, actorQuery + `.actor_id`).String(), + } + } + + for _, name := range gjson.Get(jsonString, queryStr + `.categories.#.name`).Array(){ + sc.Tags = append(sc.Tags, name.String()) + } + + //Duration + sc.Duration = int(gjson.Get(jsonString, queryStr + `.length`).Int()) / 60 + + out <- sc + } + + // client := &http.Client{} + // var data = strings.NewReader(`{"requests":[{"indexName":"all_scenes_latest_desc","params":"analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aupclosevr%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&clickAnalytics=true&facetingAfterDistinct=true&facets=%5B%22categories.name%22%5D&filters=(upcoming%3A'0')%20AND%20availableOnSite%3Aupclosevr&highlightPostTag=__%2Fais-highlight__&highlightPreTag=__ais-highlight__&hitsPerPage=60&maxValuesPerFacet=1000&page=0&query=&tagFilters="}]}`) + // req, err := http.NewRequest("POST", "https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.22.1)%3B%20Browser%3B%20instantsearch.js%20(4.64.3)%3B%20react%20(18.2.0)%3B%20react-instantsearch%20(7.5.5)%3B%20react-instantsearch-core%20(7.5.5)%3B%20JS%20Helper%20(3.16.2)", data) + // if err != nil { + // log.Fatal(err) + // } + // req.Header.Set("Origin", "https://www.upclosevr.com") + // req.Header.Set("Referer", 
"https://www.upclosevr.com/") + // req.Header.Set("User-Agent", UserAgent) + // req.Header.Set("x-algolia-api-key", apiKey[1]) + // req.Header.Set("x-algolia-application-id", applicationID[1]) + // resp, err := client.Do(req) + // if err != nil { + // log.Fatal(err) + // } + // defer resp.Body.Close() + // var j interface{} + // err = json.NewDecoder(resp.Body).Decode(&j) + // if err != nil { + // log.Fatal(err) + // } + // log.Infoln(j.results.hits[0]) + } + + // if len(matches) > 1 { + // instanceJson := gjson.ParseBytes([]byte(matches[1])) + // token := instanceJson.Get("jwt").String() + // // set up api requests to use the token in the Instance Header + // apiCollector.OnRequest(func(r *colly.Request) { + // r.Headers.Set("Instance", token) + // }) + // apiCollector.Visit(nextApiUrl) + // } + }) + if singleSceneURL != "" { + // ctx := colly.NewContext() + // ctx.Put("dur", "") + // ctx.Put("date", "") + // urlParts := strings.Split(singleSceneURL, "/") + // id := urlParts[len(urlParts)-2] + // offset = 9999 // do read more pages, we only need 1 + // nextApiUrl = "https://site-api.project1service.com/v2/releases/" + id + // siteCollector.Visit("https://virtualporn.com/videos") + + } else { + // call virtualporn.com, this is just to get the instance token to use the api for this session + // nextApiUrl = "https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset) + siteCollector.Visit("https://www.upclosevr.com/en/videos") + } + + if updateSite { + updateSiteLastUpdate(scraperID) + } + logScrapeFinished(scraperID, siteID) + return nil +} + +func init() { + registerScraper("upclosevr", "Up Close VR", "https://static01-cms-fame.gammacdn.com/upclosevr/m/3ixx4xg65im880g8/UpClose-VR_Favicon_114x114.png", "upclosevr.com", UpCloseVR) +} \ No newline at end of file From a08eda3b4f7bdb4dca5f007636323c3c6d5b6df3 Mon Sep 17 00:00:00 2001 From: pops64 Date: Sat, 5 Oct 2024 18:58:31 -0400 Subject: [PATCH 2/5] Final Works. 
Tested both single scene and full site. All data available is retrieved. There is a bug when scraping single scene sites that the pop up doesn't show to save it. Unsure if it is bug in my XBVR or something in my code. --- pkg/scrape/upclosevr.go | 308 +++++++++++++--------------------------- 1 file changed, 95 insertions(+), 213 deletions(-) diff --git a/pkg/scrape/upclosevr.go b/pkg/scrape/upclosevr.go index 288ca81c1..a800370c1 100644 --- a/pkg/scrape/upclosevr.go +++ b/pkg/scrape/upclosevr.go @@ -10,14 +10,13 @@ import ( // "io" // "fmt" + "github.com/go-resty/resty/v2" "github.com/gocolly/colly/v2" "github.com/mozillazg/go-slugify" - // "github.com/thoas/go-funk" + "github.com/nleeper/goment" + "github.com/thoas/go-funk" "github.com/tidwall/gjson" "github.com/xbapps/xbvr/pkg/models" - "github.com/go-resty/resty/v2" - "github.com/nleeper/goment" - ) func UpCloseVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, singleSceneURL string, singeScrapeAdditionalInfo string, limitScraping bool) error { @@ -26,244 +25,127 @@ func UpCloseVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out ch scraperID := "upclosevr" siteID := "UpCloseVR" logScrapeStart(scraperID, siteID) - // nextApiUrl := "" siteCollector := createCollector("www.upclosevr.com") - // apiCollector := createCollector("site-api.project1service.com") - // offset := 0 - // apiCollector.OnResponse(func(r *colly.Response) { - // sceneListJson := gjson.ParseBytes(r.Body) - - // processScene := func(scene gjson.Result) { - // sc := models.ScrapedScene{} - // sc.ScraperID = scraperID - // sc.SceneType = "VR" - // sc.Studio = "BangBros" - // sc.Site = siteID - // id := strconv.Itoa(int(scene.Get("id").Int())) - // sc.SceneID = "bvr-" + id + siteCollector.OnHTML(`script`, func(e *colly.HTMLElement) { + re := regexp.MustCompile(`"apiKey":"(.+)"}},"site`) + apiKey := re.FindStringSubmatch(e.Text) + re = regexp.MustCompile(`"applicationID":"(.+)","apiKey`) + 
applicationID := re.FindStringSubmatch(e.Text) - // sc.Title = scene.Get("title").String() - // sc.HomepageURL = "https://virtualporn.com/video/" + id + "/" + slugify.Slugify(strings.ReplaceAll(sc.Title, "'", "")) - // sc.MembersUrl = "https://site-ma.virtualporn.com/scene/" + id + "/" + slugify.Slugify(strings.ReplaceAll(sc.Title, "'", "")) - // sc.Synopsis = scene.Get("description").String() - // dateParts := strings.Split(scene.Get("dateReleased").String(), "T") - // sc.Released = dateParts[0] + if len(apiKey) > 0 && len(applicationID) > 0 { + pageTotal := 1 + client := resty.New() - // scene.Get("images.poster").ForEach(func(key, imgGroup gjson.Result) bool { - // if key.String() == "0" { - // imgurl := imgGroup.Get("xl.urls.webp").String() - // if imgurl != "" { - // sc.Covers = append(sc.Covers, imgurl) - // } + for page := 0; page < pageTotal; page++ { - // } else { - // imgurl := imgGroup.Get("xl.urls.webp").String() - // if imgurl != "" { - // if len(sc.Covers) == 0 { - // sc.Covers = append(sc.Covers, imgurl) - // } else { - // sc.Gallery = append(sc.Gallery, imgurl) - // } - // } - // } - // return true - // }) + var payload = strings.NewReader("") + if singleSceneURL != "" { + tmp := strings.Split(singleSceneURL, "/") + sceneID := tmp[len(tmp)-1] + payload.Reset(`{"requests":[{"indexName":"all_scenes","params":"clickAnalytics=true&facetFilters=%5B%5B%22availableOnSite%3Aupclosevr%22%5D%2C%5B%22clip_id%3A` + sceneID + `%22%5D%5D&facets=%5B%5D&hitsPerPage=1&tagFilters="},{"indexName":"all_scenes","params":"analytics=false&clickAnalytics=false&facetFilters=%5B%5B%22clip_id%3A251717%22%5D%5D&facets=availableOnSite&hitsPerPage=0&page=0"},{"indexName":"all_scenes","params":"analytics=false&clickAnalytics=false&facetFilters=%5B%5B%22availableOnSite%3Aupclosevr%22%5D%5D&facets=clip_id&hitsPerPage=0&page=0"}]}`) + } else { + 
payload.Reset(`{"requests":[{"indexName":"all_scenes_latest_desc","params":"analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aupclosevr%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&clickAnalytics=true&facetingAfterDistinct=true&facets=%5B%22categories.name%22%5D&filters=(upcoming%3A'0')%20AND%20availableOnSite%3Aupclosevr&highlightPostTag=__%2Fais-highlight__&highlightPreTag=__ais-highlight__&hitsPerPage=60&maxValuesPerFacet=1000&page=` + strconv.Itoa(page) + `&query=&tagFilters="}]}`) + } + resp, err := client.R(). + SetHeader("Origin", "https://www.upclosevr.com"). + SetHeader("Referer", "https://www.upclosevr.com/"). + SetHeader("User-Agent", UserAgent). + SetHeader("x-algolia-api-key", apiKey[1]). + SetHeader("x-algolia-application-id", applicationID[1]). + SetBody(payload). + Post("https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.22.1)%3B%20Browser%3B%20instantsearch.js%20(4.64.3)%3B%20react%20(18.2.0)%3B%20react-instantsearch%20(7.5.5)%3B%20react-instantsearch-core%20(7.5.5)%3B%20JS%20Helper%20(3.16.2)") + + if err != nil { + log.Errorln("UpCloseVR encourtned an error on the API Call", err) + return + } - // // Cast - // sc.ActorDetails = make(map[string]models.ActorDetails) - // scene.Get("actors").ForEach(func(key, actor gjson.Result) bool { - // name := actor.Get("name").String() - // if actor.Get("gender").String() == "female" { - // sc.Cast = append(sc.Cast, name) - // } - // sc.ActorDetails[actor.Get("name").String()] = models.ActorDetails{Source: scraperID + " scrape", ProfileUrl: "https://virtualporn.com/model/" + strconv.Itoa(int(actor.Get("id").Int())) + "/" + slugify.Slugify(name)} - // return true - // }) + // Convert the resp into a json string for gjson usability + jsonString := resp.String() - // // Tags - // scene.Get("tags").ForEach(func(key, tag gjson.Result) bool { - // if tag.Get("isVisible").Bool() { - // sc.Tags = 
append(sc.Tags, tag.Get("name").String()) - // } - // return true - // }) + // Check to see if there are multiple pages of results + if pageTotal == 1 && singleSceneURL == "" && !limitScraping { + pageTotal = int(gjson.Get(jsonString, "results.0.nbPages").Int()) + } - // // trailer & filename details - // sc.TrailerType = "urls" - // var trailers []models.VideoSource - // scene.Get("children").ForEach(func(key, child gjson.Result) bool { - // child.Get("videos.full.files").ForEach(func(key, file gjson.Result) bool { - // quality := file.Get("format").String() - // url := file.Get("urls.view").String() - // filename := file.Get("urls.download").String() - // if url != "" { - // trailers = append(trailers, models.VideoSource{URL: url, Quality: quality}) - // } - // pos := strings.Index(filename, "?filename=") - // if pos != -1 { - // sc.Filenames = append(sc.Filenames, filename[pos+10:]) - // } - // return true - // }) - // return true - // }) - // trailerJson, _ := json.Marshal(models.VideoSourceResponse{VideoSources: trailers}) - // sc.TrailerSrc = string(trailerJson) + // Make sure we are getting valid response. 
If the hits array is zero something went wrong + if len(gjson.Get(jsonString, "results.0.hits").Array()) == 0 { + log.Errorln("No Results found for UpCloseVR message:", gjson.Get(jsonString, "message").String(), "response code:", gjson.Get(jsonString, "status").String()) + } - // out <- sc + // iterate over each hit result + for i, _ := range gjson.Get(jsonString, "results.0.hits").Array() { + queryStr := `results.0.hits.` + strconv.Itoa(i) - // } - // total := int(sceneListJson.Get("meta.total").Int()) - // scenes := sceneListJson.Get("result") - // if strings.Contains(r.Request.URL.RawQuery, "offset=") { - // scenes.ForEach(func(key, scene gjson.Result) bool { - // // check if we have the scene already - // matches := funk.Filter(knownScenes, func(s string) bool { - // return strings.Contains(s, scene.Get("id").String()) - // }) - // if funk.IsEmpty(matches) { - // processScene(scene) - // } - // return true - // }) - // } else { - // processScene(scenes) - // } + // Check to make sure we don't update scenes we have already collectoed + sceneID := gjson.Get(jsonString, queryStr+`.clip_id`).String() + sceneURL := `https://www.upclosevr.com/en/video/upclosevr/` + gjson.Get(jsonString, queryStr+`.url_title`).String() + `/` + sceneID + if !funk.ContainsString(knownScenes, sceneURL) || singleSceneURL != "" { - // offset += 24 - // if offset < total { - // if !limitScraping { - // apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset)) - // } - // } - // }) + sc := models.ScrapedScene{} - siteCollector.OnHTML(`script`, func(e *colly.HTMLElement) { - re := regexp.MustCompile(`"apiKey":"(.+)"}},"site`) - apiKey := re.FindStringSubmatch(e.Text) - re = regexp.MustCompile(`"applicationID":"(.+)","apiKey`) - applicationID := re.FindStringSubmatch(e.Text) + sc.ScraperID = scraperID + sc.SceneType = "VR" + sc.Studio = siteID + sc.Site = siteID + sc.SiteID = sceneID + sc.HomepageURL = sceneURL - if len(apiKey) > 
0 && len(applicationID) > 0{ - var data = strings.NewReader(`{"requests":[{"indexName":"all_scenes_latest_desc","params":"analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aupclosevr%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&clickAnalytics=true&facetingAfterDistinct=true&facets=%5B%22categories.name%22%5D&filters=(upcoming%3A'0')%20AND%20availableOnSite%3Aupclosevr&highlightPostTag=__%2Fais-highlight__&highlightPreTag=__ais-highlight__&hitsPerPage=60&maxValuesPerFacet=1000&page=0&query=&tagFilters="}]}`) - resp, _ := resty.New().R(). - SetHeader("Origin", "https://www.upclosevr.com"). - SetHeader("Referer", "https://www.upclosevr.com/"). - SetHeader("User-Agent", UserAgent). - SetHeader("x-algolia-api-key", apiKey[1]). - SetHeader("x-algolia-application-id", applicationID[1]). - SetBody(data). - Post("https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.22.1)%3B%20Browser%3B%20instantsearch.js%20(4.64.3)%3B%20react%20(18.2.0)%3B%20react-instantsearch%20(7.5.5)%3B%20react-instantsearch-core%20(7.5.5)%3B%20JS%20Helper%20(3.16.2)") - - // Convert the resp into a json string for gjson usability - jsonString := resp.String() + // Scene ID + sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID - // Determine the amount of Hits in the response to know array length. Index result of results.0.Hits is unreliable - nbScenes := int(gjson.Get(jsonString, "results.0.nbHits").Int()) - - for i:=0; i
", " ", -1)) - // Scene ID - sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID + // Title + sc.Title = strings.TrimSpace(gjson.Get(jsonString, queryStr+`.title`).String()) + log.Infoln(`Scraping ` + sc.Title) - // Date - tmpDate, _ := goment.New(gjson.Get(jsonString, queryStr + `.release_date`).String(), "YYYY-MM-DD") - sc.Released = tmpDate.Format("YYYY-MM-DD") + // Cast - Females Only can be update to include males if wanted + sc.ActorDetails = make(map[string]models.ActorDetails) + for i, name := range gjson.Get(jsonString, queryStr+`.female_actors.#.name`).Array() { + sc.Cast = append(sc.Cast, name.String()) - // Cover - sc.Covers = append(sc.Covers, `https://transform.gammacdn.com/movies/` + gjson.Get(jsonString, queryStr + `.pictures.1920x1080`).String()) + actorQuery := queryStr + `.female_actors.` + strconv.Itoa(i) - // Synopsis - sc.Synopsis = strings.TrimSpace(strings.Replace(gjson.Get(jsonString, queryStr + `.description`).String(), "

", " ", -1)) + sc.ActorDetails[name.String()] = models.ActorDetails{ + Source: scraperID + " scrape", + ProfileUrl: `https://www.upclosevr.com/en/pornstar/view/` + gjson.Get(jsonString, actorQuery+`.url_name`).String() + `/` + gjson.Get(jsonString, actorQuery+`.actor_id`).String(), + } + } - // Title - sc.Title = strings.TrimSpace(gjson.Get(jsonString, queryStr + `.title`).String()) - log.Infoln(`Scraping ` + sc.Title) + // Junk Tags we don't want to add to scene data + skiptags := map[string]bool{ + "Original Series": true, + "Adult Time Original": true, // Everything gets tagged 3D on SLR, even mono 360 + } - sc.ActorDetails = make(map[string]models.ActorDetails) - for i, name := range gjson.Get(jsonString, queryStr + `.female_actors.#.name`).Array(){ - sc.Cast = append(sc.Cast, name.String()) + for _, name := range gjson.Get(jsonString, queryStr+`.categories.#.name`).Array() { + if !skiptags[name.String()] { + sc.Tags = append(sc.Tags, name.String()) + } + } - actorQuery := queryStr + `.female_actors.` + strconv.Itoa(i) - - sc.ActorDetails[name.String()] = models.ActorDetails{ - Source: scraperID + " scrape", - ProfileUrl: `https://www.upclosevr.com/en/pornstar/view/` + gjson.Get(jsonString, actorQuery + `.url_name`).String() + `/` + gjson.Get(jsonString, actorQuery + `.actor_id`).String(), - } - } + // Duration is in total seconds + sc.Duration = int(gjson.Get(jsonString, queryStr+`.length`).Int()) / 60 - for _, name := range gjson.Get(jsonString, queryStr + `.categories.#.name`).Array(){ - sc.Tags = append(sc.Tags, name.String()) + out <- sc + } } - - //Duration - sc.Duration = int(gjson.Get(jsonString, queryStr + `.length`).Int()) / 60 - - out <- sc } - - // client := &http.Client{} - // var data = 
strings.NewReader(`{"requests":[{"indexName":"all_scenes_latest_desc","params":"analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aupclosevr%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&clickAnalytics=true&facetingAfterDistinct=true&facets=%5B%22categories.name%22%5D&filters=(upcoming%3A'0')%20AND%20availableOnSite%3Aupclosevr&highlightPostTag=__%2Fais-highlight__&highlightPreTag=__ais-highlight__&hitsPerPage=60&maxValuesPerFacet=1000&page=0&query=&tagFilters="}]}`) - // req, err := http.NewRequest("POST", "https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.22.1)%3B%20Browser%3B%20instantsearch.js%20(4.64.3)%3B%20react%20(18.2.0)%3B%20react-instantsearch%20(7.5.5)%3B%20react-instantsearch-core%20(7.5.5)%3B%20JS%20Helper%20(3.16.2)", data) - // if err != nil { - // log.Fatal(err) - // } - // req.Header.Set("Origin", "https://www.upclosevr.com") - // req.Header.Set("Referer", "https://www.upclosevr.com/") - // req.Header.Set("User-Agent", UserAgent) - // req.Header.Set("x-algolia-api-key", apiKey[1]) - // req.Header.Set("x-algolia-application-id", applicationID[1]) - // resp, err := client.Do(req) - // if err != nil { - // log.Fatal(err) - // } - // defer resp.Body.Close() - // var j interface{} - // err = json.NewDecoder(resp.Body).Decode(&j) - // if err != nil { - // log.Fatal(err) - // } - // log.Infoln(j.results.hits[0]) } - - // if len(matches) > 1 { - // instanceJson := gjson.ParseBytes([]byte(matches[1])) - // token := instanceJson.Get("jwt").String() - // // set up api requests to use the token in the Instance Header - // apiCollector.OnRequest(func(r *colly.Request) { - // r.Headers.Set("Instance", token) - // }) - // apiCollector.Visit(nextApiUrl) - // } }) - if singleSceneURL != "" { - // ctx := colly.NewContext() - // ctx.Put("dur", "") - // ctx.Put("date", "") - // urlParts := strings.Split(singleSceneURL, "/") - // id := 
urlParts[len(urlParts)-2] - // offset = 9999 // do read more pages, we only need 1 - // nextApiUrl = "https://site-api.project1service.com/v2/releases/" + id - // siteCollector.Visit("https://virtualporn.com/videos") - } else { - // call virtualporn.com, this is just to get the instance token to use the api for this session - // nextApiUrl = "https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset) - siteCollector.Visit("https://www.upclosevr.com/en/videos") - } + siteCollector.Visit("https://www.upclosevr.com/en/videos") if updateSite { updateSiteLastUpdate(scraperID) @@ -274,4 +156,4 @@ func UpCloseVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out ch func init() { registerScraper("upclosevr", "Up Close VR", "https://static01-cms-fame.gammacdn.com/upclosevr/m/3ixx4xg65im880g8/UpClose-VR_Favicon_114x114.png", "upclosevr.com", UpCloseVR) -} \ No newline at end of file +} From fe16c528f60a98a538279cab59634cfb39e68d28 Mon Sep 17 00:00:00 2001 From: pops64 Date: Sat, 5 Oct 2024 18:59:48 -0400 Subject: [PATCH 3/5] Remove Junk lines --- pkg/scrape/upclosevr.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/scrape/upclosevr.go b/pkg/scrape/upclosevr.go index a800370c1..74aa69b91 100644 --- a/pkg/scrape/upclosevr.go +++ b/pkg/scrape/upclosevr.go @@ -1,14 +1,10 @@ package scrape import ( - // "encoding/json" "regexp" "strconv" "strings" "sync" - // "net/http" - // "io" - // "fmt" "github.com/go-resty/resty/v2" "github.com/gocolly/colly/v2" From e729fdd33f7a39b711beb41b234e0bde2bab140a Mon Sep 17 00:00:00 2001 From: pops64 Date: Sun, 6 Oct 2024 07:21:01 -0400 Subject: [PATCH 4/5] More clean up --- pkg/scrape/upclosevr.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/scrape/upclosevr.go b/pkg/scrape/upclosevr.go index 74aa69b91..312ee9934 100644 --- a/pkg/scrape/upclosevr.go +++ b/pkg/scrape/upclosevr.go @@ -75,7 +75,7 @@ func UpCloseVR(wg *sync.WaitGroup, updateSite bool, 
knownScenes []string, out ch for i, _ := range gjson.Get(jsonString, "results.0.hits").Array() { queryStr := `results.0.hits.` + strconv.Itoa(i) - // Check to make sure we don't update scenes we have already collectoed + // Check to make sure we don't update scenes we have already collected sceneID := gjson.Get(jsonString, queryStr+`.clip_id`).String() sceneURL := `https://www.upclosevr.com/en/video/upclosevr/` + gjson.Get(jsonString, queryStr+`.url_title`).String() + `/` + sceneID if !funk.ContainsString(knownScenes, sceneURL) || singleSceneURL != "" { @@ -122,9 +122,10 @@ func UpCloseVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out ch // Junk Tags we don't want to add to scene data skiptags := map[string]bool{ "Original Series": true, - "Adult Time Original": true, // Everything gets tagged 3D on SLR, even mono 360 + "Adult Time Original": true, } + // Tags for _, name := range gjson.Get(jsonString, queryStr+`.categories.#.name`).Array() { if !skiptags[name.String()] { sc.Tags = append(sc.Tags, name.String()) From 79f68ed9889a0dae4aafe5fff38bb5fd99711719 Mon Sep 17 00:00:00 2001 From: pops64 Date: Sun, 6 Oct 2024 12:11:44 -0400 Subject: [PATCH 5/5] Code Clean Up --- pkg/scrape/upclosevr.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pkg/scrape/upclosevr.go b/pkg/scrape/upclosevr.go index 312ee9934..4f0e73804 100644 --- a/pkg/scrape/upclosevr.go +++ b/pkg/scrape/upclosevr.go @@ -25,10 +25,10 @@ func UpCloseVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out ch siteCollector := createCollector("www.upclosevr.com") siteCollector.OnHTML(`script`, func(e *colly.HTMLElement) { - re := regexp.MustCompile(`"apiKey":"(.+)"}},"site`) - apiKey := re.FindStringSubmatch(e.Text) - re = regexp.MustCompile(`"applicationID":"(.+)","apiKey`) - applicationID := re.FindStringSubmatch(e.Text) + apiKeyRegex := regexp.MustCompile(`"apiKey":"(.+)"}},"site`) + applicationIDRegex := 
regexp.MustCompile(`"applicationID":"(.+)","apiKey`) + apiKey := apiKeyRegex.FindStringSubmatch(e.Text) + applicationID := applicationIDRegex.FindStringSubmatch(e.Text) if len(apiKey) > 0 && len(applicationID) > 0 { pageTotal := 1 @@ -36,14 +36,16 @@ func UpCloseVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out ch for page := 0; page < pageTotal; page++ { - var payload = strings.NewReader("") + var payloadStr string if singleSceneURL != "" { tmp := strings.Split(singleSceneURL, "/") sceneID := tmp[len(tmp)-1] - payload.Reset(`{"requests":[{"indexName":"all_scenes","params":"clickAnalytics=true&facetFilters=%5B%5B%22availableOnSite%3Aupclosevr%22%5D%2C%5B%22clip_id%3A` + sceneID + `%22%5D%5D&facets=%5B%5D&hitsPerPage=1&tagFilters="},{"indexName":"all_scenes","params":"analytics=false&clickAnalytics=false&facetFilters=%5B%5B%22clip_id%3A251717%22%5D%5D&facets=availableOnSite&hitsPerPage=0&page=0"},{"indexName":"all_scenes","params":"analytics=false&clickAnalytics=false&facetFilters=%5B%5B%22availableOnSite%3Aupclosevr%22%5D%5D&facets=clip_id&hitsPerPage=0&page=0"}]}`) + payloadStr = `{"requests":[{"indexName":"all_scenes","params":"clickAnalytics=true&facetFilters=%5B%5B%22availableOnSite%3Aupclosevr%22%5D%2C%5B%22clip_id%3A` + sceneID + `%22%5D%5D&facets=%5B%5D&hitsPerPage=1&tagFilters="}]}` } else { - payload.Reset(`{"requests":[{"indexName":"all_scenes_latest_desc","params":"analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aupclosevr%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&clickAnalytics=true&facetingAfterDistinct=true&facets=%5B%22categories.name%22%5D&filters=(upcoming%3A'0')%20AND%20availableOnSite%3Aupclosevr&highlightPostTag=__%2Fais-highlight__&highlightPreTag=__ais-highlight__&hitsPerPage=60&maxValuesPerFacet=1000&page=` + strconv.Itoa(page) + `&query=&tagFilters="}]}`) + payloadStr = 
`{"requests":[{"indexName":"all_scenes_latest_desc","params":"analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aupclosevr%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&clickAnalytics=true&facetingAfterDistinct=true&facets=%5B%22categories.name%22%5D&filters=(upcoming%3A'0')%20AND%20availableOnSite%3Aupclosevr&highlightPostTag=__%2Fais-highlight__&highlightPreTag=__ais-highlight__&hitsPerPage=60&maxValuesPerFacet=1000&page=` + strconv.Itoa(page) + `&query=&tagFilters="}]}` } + + var payload = strings.NewReader(payloadStr) resp, err := client.R(). SetHeader("Origin", "https://www.upclosevr.com"). SetHeader("Referer", "https://www.upclosevr.com/").