From 217013181da06abe8d71d9db70e809ea4ebd8236 Mon Sep 17 00:00:00 2001
From: Federico Minutoli
Date: Fri, 10 May 2024 21:09:48 +0200
Subject: [PATCH] feat(proxy-rotation): add parse (IP address) or search (from broker) functionality for proxy rotation

the broker has been made fully configurable for anonymity level, admissible locations, scheme and max shape not to waste resources, unlike the original `free-proxy` package.

other options have been explored (e.g., `proxybroker`, `proxybroker2`) due to their built-in proxy server and rotation capabilities, but the former is no longer maintained, and the latter has issues with any python version outside of python 3.9
---
 scrapegraphai/utils/__init__.py       |   6 +-
 scrapegraphai/utils/proxy_rotation.py | 234 ++++++++++++++++++++++++--
 tests/utils/test_proxy_rotation.py    | 121 +++++++++++++
 3 files changed, 342 insertions(+), 19 deletions(-)
 create mode 100644 tests/utils/test_proxy_rotation.py

diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
index 0aee7839..74c70f84 100644
--- a/scrapegraphai/utils/__init__.py
+++ b/scrapegraphai/utils/__init__.py
@@ -1,8 +1,10 @@
 """
 __init__.py file for utils folder
 """
-from .save_audio_from_bytes import save_audio_from_bytes
+
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
-from .proxy_rotation import proxy_generator
+from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers
+from .save_audio_from_bytes import save_audio_from_bytes
+from .sys_dynamic_import import dynamic_import, srcfile_import
diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py
index 576a91e4..0ca204e0 100644
--- a/scrapegraphai/utils/proxy_rotation.py
+++ b/scrapegraphai/utils/proxy_rotation.py
@@ -1,34 +1,234 @@
 """
 Module for rotating proxies
 """
+
+import ipaddress
+import random
+from typing import List, Optional, Set, TypedDict
+
+import requests
+from fp.errors import FreeProxyException
 from fp.fp import FreeProxy
 
 
-def proxy_generator(num_ips: int) -> list:
-    """
-    Generates a specified number of proxy IP addresses using the FreeProxy library.
+class ProxyBrokerCriteria(TypedDict, total=False):
+    """proxy broker criteria"""
+
+    anonymous: bool
+    countryset: Set[str]
+    secure: bool
+    timeout: float
+    search_outside_if_empty: bool
+
+
+class ProxySettings(TypedDict, total=False):
+    """proxy settings"""
+
+    server: str
+    bypass: str
+    username: str
+    password: str
+
+
+class Proxy(ProxySettings):
+    """proxy server information"""
+
+    criteria: ProxyBrokerCriteria
+
+
+def search_proxy_servers(
+    anonymous: bool = True,
+    countryset: Optional[Set[str]] = None,
+    secure: bool = False,
+    timeout: float = 5.0,
+    max_shape: int = 5,
+    search_outside_if_empty: bool = True,
+) -> List[str]:
+    """search for proxy servers that match the specified broker criteria
 
     Args:
-        num_ips (int): The number of proxy IPs to generate and rotate through.
+        anonymous: whether proxy servers should have minimum level-1 anonymity.
+        countryset: admissible proxy servers locations.
+        secure: whether proxy servers should support HTTP or HTTPS; defaults to HTTP.
+        timeout: The maximum timeout for proxy responses; defaults to 5.0 seconds.
+        max_shape: The maximum number of proxy servers to return; defaults to 5.
+        search_outside_if_empty: whether countryset should be extended if empty.
 
     Returns:
-        list: A list of proxy IP addresses.
+        A list of proxy server URLs matching the criteria.
 
     Example:
-        >>> proxy_generator(5)
+        >>> search_proxy_servers(
+        ...     anonymous=True,
+        ...     countryset={"GB", "US"},
+        ...     secure=True,
+        ...     timeout=1.0,
+        ...     max_shape=2,
+        ... )
         [
-            '192.168.1.1:8080',
-            '103.10.63.135:8080',
-            '176.9.75.42:8080',
-            '37.57.216.2:8080',
-            '113.20.31.250:8080'
+            "http://103.10.63.135:8080",
+            "http://113.20.31.250:8080",
         ]
-
-    This function fetches fresh proxies and indexes them, making it easy to manage multiple proxy configurations.
     """
+    proxybroker = FreeProxy(
+        anonym=anonymous,
+        country_id=countryset,
+        elite=True,
+        https=secure,
+        timeout=timeout,
+    )
 
-    res = []
+    def search_all(proxybroker: FreeProxy, k: int, search_outside: bool) -> List[str]:
+        candidateset = proxybroker.get_proxy_list(search_outside)
+        random.shuffle(candidateset)
 
-    for i in range(0, num_ips):
-        res.append(FreeProxy().get())
-    return res
+        positive = set()
+
+        for address in candidateset:
+            setting = {proxybroker.schema: f"http://{address}"}
+
+            try:
+                server = proxybroker._FreeProxy__check_if_proxy_is_working(setting)
+
+                if not server:
+                    continue
+
+                positive.add(server)
+
+                if len(positive) < k:
+                    continue
+
+                return list(positive)
+
+            except requests.exceptions.RequestException:
+                continue
+
+        n = len(positive)
+
+        if n < k and search_outside:
+            proxybroker.country_id = None
+
+            try:
+                negative = set(search_all(proxybroker, k - n, False))
+            except FreeProxyException:
+                negative = set()
+
+            positive = positive | negative
+
+        if not positive:
+            raise FreeProxyException("missing proxy servers for criteria")
+
+        return list(positive)
+
+    return search_all(proxybroker, max_shape, search_outside_if_empty)
+
+
+def _parse_proxy(proxy: ProxySettings) -> ProxySettings:
+    """parses a proxy configuration with known server
+
+    Args:
+        proxy: The proxy configuration to parse.
+
+    Returns:
+        A 'playwright' compliant proxy configuration.
+    """
+    assert "server" in proxy, "missing server in the proxy configuration"
+
+    authorization = [x in proxy for x in ("username", "password")]
+
+    message = "username and password must be provided in pairs or not at all"
+
+    assert all(authorization) or not any(authorization), message
+
+    parsed = {"server": proxy["server"]}
+
+    if proxy.get("bypass"):
+        parsed["bypass"] = proxy["bypass"]
+
+    if all(authorization):
+        parsed["username"] = proxy["username"]
+        parsed["password"] = proxy["password"]
+
+    return parsed
+
+
+def _search_proxy(proxy: Proxy) -> ProxySettings:
+    """searches for a proxy server matching the specified broker criteria
+
+    Args:
+        proxy: The proxy configuration to search for.
+
+    Returns:
+        A 'playwright' compliant proxy configuration.
+    """
+    server = search_proxy_servers(max_shape=1, **proxy.get("criteria", {}))[0]
+
+    return {"server": server}
+
+
+def is_ipv4_address(address: str) -> bool:
+    """If a proxy address conforms to an IPv4 address"""
+    try:
+        ipaddress.IPv4Address(address)
+        return True
+    except ipaddress.AddressValueError:
+        return False
+
+
+def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
+    """parses a proxy configuration or searches for a new one matching
+    the specified broker criteria
+
+    Args:
+        proxy: The proxy configuration to parse or search for.
+
+    Returns:
+        A 'playwright' compliant proxy configuration.
+
+    Notes:
+        - If the proxy server is an IP address, it is assumed to be
+          a proxy server address.
+        - If the proxy server is 'broker', a proxy server is searched for
+          based on the provided broker criteria.
+
+    Example:
+        >>> proxy = {
+        ...     "server": "broker",
+        ...     "criteria": {
+        ...         "anonymous": True,
+        ...         "countryset": {"GB", "US"},
+        ...         "secure": True,
+        ...         "timeout": 5.0,
+        ...         "search_outside_if_empty": False
+        ...     }
+        ... }
+
+        >>> parse_or_search_proxy(proxy)
+        {
+            "server": "",
+        }
+
+    Example:
+        >>> proxy = {
+        ...     "server": "192.168.1.1:8080",
+        ...     "username": "",
+        ...     "password": ""
+        ... }
+
+        >>> parse_or_search_proxy(proxy)
+        {
+            "server": "192.168.1.1:8080",
+            "username": "",
+            "password": ""
+        }
+    """
+    assert "server" in proxy, "missing server in the proxy configuration"
+
+    server_address = proxy["server"].split(":", maxsplit=1)[0]
+
+    if is_ipv4_address(server_address):
+        return _parse_proxy(proxy)
+
+    assert proxy["server"] == "broker", "unknown proxy server"
+
+    return _search_proxy(proxy)
diff --git a/tests/utils/test_proxy_rotation.py b/tests/utils/test_proxy_rotation.py
new file mode 100644
index 00000000..8acbdb30
--- /dev/null
+++ b/tests/utils/test_proxy_rotation.py
@@ -0,0 +1,121 @@
+import pytest
+from fp.errors import FreeProxyException
+
+from scrapegraphai.utils.proxy_rotation import (
+    Proxy,
+    _parse_proxy,
+    _search_proxy,
+    is_ipv4_address,
+    parse_or_search_proxy,
+    search_proxy_servers,
+)
+
+
+def test_search_proxy_servers_success():
+    servers = search_proxy_servers(
+        anonymous=True,
+        countryset={"US"},
+        secure=False,
+        timeout=10.0,
+        max_shape=2,
+        search_outside_if_empty=True,
+    )
+
+    assert isinstance(servers, list)
+    assert all(isinstance(server, str) for server in servers)
+
+
+def test_search_proxy_servers_exception():
+    with pytest.raises(FreeProxyException):
+        search_proxy_servers(
+            anonymous=True,
+            countryset={"XX"},
+            secure=True,
+            timeout=1.0,
+            max_shape=2,
+            search_outside_if_empty=False,
+        )
+
+
+def test_parse_proxy_success():
+    proxy = {
+        "server": "192.168.1.1:8080",
+        "username": "user",
+        "password": "pass",
+        "bypass": "*.local",
+    }
+
+    parsed_proxy = _parse_proxy(proxy)
+    assert parsed_proxy == proxy
+
+
+def test_parse_proxy_exception():
+    invalid_proxy = {"server": "192.168.1.1:8080", "username": "user"}
+
+    with pytest.raises(AssertionError) as error_info:
+        _parse_proxy(invalid_proxy)
+
+    assert "username and password must be provided in pairs" in str(error_info.value)
+
+
+def test_search_proxy_success():
+    proxy = Proxy(criteria={"anonymous": True, "countryset": {"US"}})
+    found_proxy = _search_proxy(proxy)
+
+    assert isinstance(found_proxy, dict)
+    assert "server" in found_proxy
+
+
+def test_is_ipv4_address():
+    assert is_ipv4_address("192.168.1.1") is True
+    assert is_ipv4_address("999.999.999.999") is False
+    assert is_ipv4_address("no-address") is False
+
+
+def test_parse_or_search_proxy_success():
+    proxy = {
+        "server": "192.168.1.1:8080",
+        "username": "username",
+        "password": "password",
+    }
+
+    parsed_proxy = parse_or_search_proxy(proxy)
+    assert parsed_proxy == proxy
+
+    proxy_broker = {
+        "server": "broker",
+        "criteria": {
+            "anonymous": True,
+            "countryset": {"US"},
+            "secure": True,
+            "timeout": 10.0,
+        },
+    }
+
+    found_proxy = parse_or_search_proxy(proxy_broker)
+
+    assert isinstance(found_proxy, dict)
+    assert "server" in found_proxy
+
+
+def test_parse_or_search_proxy_exception():
+    proxy = {
+        "username": "username",
+        "password": "password",
+    }
+
+    with pytest.raises(AssertionError) as error_info:
+        parse_or_search_proxy(proxy)
+
+    assert "missing server in the proxy configuration" in str(error_info.value)
+
+
+def test_parse_or_search_proxy_unknown_server():
+    proxy = {
+        "server": "unknown",
+    }
+
+    with pytest.raises(AssertionError) as error_info:
+        parse_or_search_proxy(proxy)
+
+    assert "unknown proxy server" in str(error_info.value)