proxyscrape returns None #34

Open
VReunov opened this issue Aug 28, 2021 · 8 comments

Comments

@VReunov

VReunov commented Aug 28, 2021

>>> import proxyscrape
>>> collector = proxyscrape.create_collector('default', 'http')
>>> proxy = collector.get_proxy({'country': 'united states'})
>>> print(proxy)
None

Python 3.9.6
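
One quick way to narrow this down (a sketch; it only assumes the collector's refresh_proxies(force=True) and unfiltered get_proxy() calls behave as the README describes): force a refresh, then ask for a proxy with no filter at all.

>>> collector.refresh_proxies(force=True)                      # force a fresh scrape of the sources
>>> print(collector.get_proxy())                               # no filter at all
>>> print(collector.get_proxy({'country': 'united states'}))   # the original filtered call

If even the unfiltered call prints None, the sources returned nothing; if it prints a proxy, only the 'united states' filter is coming back empty.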

@akguthal

akguthal commented Sep 1, 2021

I'm also having this issue

@Mark7888

Mark7888 commented Sep 1, 2021

same here

@chikko80

Did you guys find a solution?

@yoarch

yoarch commented Nov 10, 2022

Still the same error.

@8xu

8xu commented Jul 7, 2023

Having the same issue. I think this project is not maintained anymore.

@ydeng11

ydeng11 commented Mar 7, 2024

Same issue. I think it is dead.
I forked it and my fork is working now: proxyscrape

@zero-stroke

zero-stroke commented Jun 23, 2024

> Same issue. I think it is dead. I forked it and my fork is working now: proxyscrape

Your version works, but I don't know if I'm missing something; almost no free proxies work :( even though I made a multithreaded script to mass-verify them:

"""
python version >= 3.11
"""
import concurrent.futures
import datetime
import json
import os
import subprocess
import time
from enum import auto, StrEnum
from threading import Lock

import proxyscrape
import requests
from proxyscrape import Proxy
from requests.exceptions import ProxyError, ReadTimeout, SSLError, ConnectionError as ConnectionError_

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}


sleeptime = 1
ip_location_url = "https://www.iplocation.net/"
ipify_url = "https://api.ipify.org/?format=json"
timeout = 15

try:
    base_ip_address: str = requests.get(url=ipify_url, headers=headers, timeout=timeout).text
except requests.RequestException:
    print("Error getting base ip")
    base_ip_address = ""  # fallback so the ip-change check in validate_proxies always has a value to compare against


class ProxyType(StrEnum):
    http = auto()
    https = auto()
    socks4 = auto()
    socks5 = auto()


def get_api_proxies(proxy_type) -> list[str]:
    proxy_response = requests.get(
        f"https://api.proxyscrape.com/?request=displayproxies"
        f"&proxytype={proxy_type}"
        "&timeout=10000"
        "&country=all"
        "&ssl=all"
        "&anonymity=all")

    all_proxies: list[str] = proxy_response.text.split("\r\n")[:-1]

    for proxy in all_proxies:
        print(proxy)
    return all_proxies


def check_proxy(raw_proxy, proxy_type, selenium_check):
    print(f"\nProxy: {raw_proxy}")
    host, port = raw_proxy.split(":")  # 1.94.31.35:8888
    proxy = Proxy(host=host, port=port, code='us', country='', anonymous='T', type=proxy_type, source='')
    if validate_proxies(proxy, selenium_check):
        return proxy
    return None


def validate_proxies_multithreaded(all_proxies, proxy_type, selenium_check) -> list:
    good_proxies = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for proxy in executor.map(check_proxy, all_proxies, [proxy_type] * len(all_proxies),
                                  [selenium_check] * len(all_proxies)):
            if proxy is not None:
                good_proxies.append(proxy)
    return good_proxies


def test_proxies(selenium_check: bool, use_api: bool) -> list[Proxy]:
    """
        Quickly filter through list of proxies and find only the ones that have valid 200 response codes.
    Then verify that the ip address does indeed change.
    Then try with selenium.

    Example Proxy from proxyscrape collector:
    Proxy(host='1.2.7.9',port='32',code='us', country='iran', anonymous=T, type='https', source='sslproxies')

    Parameters
    ----------
    selenium_check : bool
        To check using selenium or not.
    use_api : bool


    Returns
    -------

    """
    good_proxies: list[Proxy] = []

    if use_api:
        proxy_type = ProxyType.socks4  # socks5 will only yield a handful
        all_proxies: list[str] = get_api_proxies(proxy_type)
        print(f"Looping through {len(all_proxies)} proxies")

        good_proxies = validate_proxies_multithreaded(all_proxies, proxy_type, selenium_check)

        # for raw_proxy in all_proxies:
        #     print(f"\nProxy: {raw_proxy}")
        #     host, port = raw_proxy.split(":")  # 1.94.31.35:8888
        #     proxy: Proxy = Proxy(host=host, port=port, code='us', country='', anonymous='T', type=proxy_type, source='')
        #     if validate_proxies(proxy, selenium_check):
        #         good_proxies.append(proxy)

        print(f"Percentage of good proxies: {len(good_proxies) / len(all_proxies)}")
    else:
        quality_proxy_types = (ProxyType.https, ProxyType.socks4, ProxyType.socks5)
        collector = proxyscrape.create_collector('default', quality_proxy_types)
        num_valid_proxies = 5
        max_workers = 25

        good_proxies_lock = Lock()

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = set()

            while len(good_proxies) < num_valid_proxies:
                while len(futures) < max_workers:
                    proxy = collector.get_proxy()
                    if proxy is None:  # get_proxy() can return None (the problem this issue is about)
                        raise RuntimeError("collector returned no proxies; the underlying sources may be empty")
                    print(f"\nProxy: {proxy}")
                    futures.add(executor.submit(validate_proxies, proxy, selenium_check))

                done, futures = concurrent.futures.wait(
                    futures,
                    return_when=concurrent.futures.FIRST_COMPLETED
                )

                for future in done:
                    result = future.result()
                    if result:
                        with good_proxies_lock:
                            good_proxies.append(result)

    date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")  # e.g. '2024-06-22_17-34'; no ':' so the name is valid on Windows
    filename = f"good_proxies_{date_time}.json"
    with open(filename, 'w') as f:
        json.dump([proxy._asdict() for proxy in good_proxies], f, indent=4)

    print(f"Number of good proxies: {len(good_proxies)}")
    return good_proxies


def validate_proxies(proxy: Proxy, selenium_check: bool) -> Proxy | None:
    proxies = {
        "http": f"{proxy.type}://{proxy.host}:{proxy.port}",
        "https": f"{proxy.type}://{proxy.host}:{proxy.port}"
    }

    # ----- Check with requests -----
    try:
        response: requests.Response = requests.get(ip_location_url, headers=headers, proxies=proxies, timeout=7)
    except ProxyError:
        print("Proxy error")
        return None
    except SSLError:
        print("SLLError")
        return None
    except ConnectionError_:
        print("ConnectionError")
        return None
    except ReadTimeout:
        print("ReadTimeout error")
        return None
    if response.status_code != 200:
        print("Site rejected proxy")
        return None
    else:
        print("PASSED ip location check")

    try:
        response = requests.get(ipify_url, headers=headers, proxies=proxies, timeout=timeout)
    except (ProxyError, SSLError, ConnectionError_, ReadTimeout):
        print("Proxy failed during ip change check")
        return None
    if response.text == base_ip_address:
        print("Didn't change ip")
        return None
    else:
        print("PASSED ip change check")

    # ----- Check with selenium -----
    if selenium_check:  # Likely not necessary, extra check if you want it and have the setup. 
        from data_agg_constants import adblock_path
        from selenium import webdriver
        from selenium.common import TimeoutException
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.wait import WebDriverWait

        import undetected_chromedriver as uc

        chrome_126_path = f"C:/Users/{os.getlogin()}/Downloads/chrome-win64 (2)/chrome-win64/chrome.exe"
        options = webdriver.ChromeOptions()
        options.binary_location = chrome_126_path
        options.add_extension(adblock_path)

        options.add_argument(f'--proxy-server=http://{proxy.host}:{proxy.port}')

        driver = uc.Chrome(service=Service(), options=options)  # noqa
        driver.set_page_load_timeout(timeout)

        try:
            driver.get(ip_location_url)
        except Exception:  # TimeoutException or any other load failure
            print(f"Loading failed with proxy {proxy.host}:{proxy.port}, trying a new proxy...")
            try:
                subprocess.run(["taskkill", "/F", "/IM", "chrome.exe"], check=True)
            except subprocess.CalledProcessError as err:
                if err.returncode == 128:  # ERROR_INVALID_HANDLE (0x80000003L)
                    print("Chrome.exe is not running.")
                else:
                    print(f"Error occurred: {err}")
            time.sleep(sleeptime)
            return None  # a failed load must not fall through to the success path below

        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        print("PASSED: Page loaded successfully")
        time.sleep(40)
        driver.quit()

    return proxy  # without the selenium check, the proxy has already passed both request checks


def main():
    good_proxies = test_proxies(selenium_check=False, use_api=True)
    print(f"Good proxies: {good_proxies}")


if __name__ == '__main__':
    main()

Oh, and I couldn't open an issue on it for some reason, so I posted here lol
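
For reference, the good_proxies*.json file the script writes can be loaded back and used with requests like this (a sketch; it only assumes the host/port/type fields that _asdict() writes above, and the filename is a made-up example):

import json
import requests

# hypothetical example filename -- use whatever name test_proxies() printed
with open("good_proxies_2024-06-22_17-34.json") as f:
    saved = json.load(f)

p = saved[0]
proxies = {scheme: f"{p['type']}://{p['host']}:{p['port']}" for scheme in ("http", "https")}
print(requests.get("https://api.ipify.org/?format=json", proxies=proxies, timeout=15).text)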

@ydeng11

ydeng11 commented Jun 26, 2024

I think these sources are not working anymore; these free proxies are basically clickbait.
The websites all have the same structure, which leads me to think they come from the same source/owner.
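
For what it's worth, a source can be sanity-checked directly by hitting the same endpoint zero-stroke's script queries and counting what comes back (a sketch reusing that URL and parameters; an empty or tiny list would back up the "dead sources" theory):

import requests

url = ("https://api.proxyscrape.com/?request=displayproxies"
       "&proxytype=http&timeout=10000&country=all&ssl=all&anonymity=all")
listed = [p for p in requests.get(url, timeout=15).text.splitlines() if p.strip()]
print(f"{len(listed)} proxies listed")  # how many the source advertises, not how many actually work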
