diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
index 0aee7839..74c70f84 100644
--- a/scrapegraphai/utils/__init__.py
+++ b/scrapegraphai/utils/__init__.py
@@ -1,8 +1,10 @@
 """
 __init__.py file for utils folder
 """
-from .save_audio_from_bytes import save_audio_from_bytes
+
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
-from .proxy_rotation import proxy_generator
+from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers
+from .save_audio_from_bytes import save_audio_from_bytes
+from .sys_dynamic_import import dynamic_import, srcfile_import
diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py
index 576a91e4..0ca204e0 100644
--- a/scrapegraphai/utils/proxy_rotation.py
+++ b/scrapegraphai/utils/proxy_rotation.py
@@ -1,34 +1,234 @@
 """
 Module for rotating proxies
 """
+
+import ipaddress
+import random
+from typing import List, Optional, Set, TypedDict
+
+import requests
+from fp.errors import FreeProxyException
 from fp.fp import FreeProxy
 
 
-def proxy_generator(num_ips: int) -> list:
-    """
-    Generates a specified number of proxy IP addresses using the FreeProxy library.
+class ProxyBrokerCriteria(TypedDict, total=False):
+    """proxy broker criteria"""
+
+    anonymous: bool
+    countryset: Set[str]
+    secure: bool
+    timeout: float
+    search_outside_if_empty: bool
+
+
+class ProxySettings(TypedDict, total=False):
+    """proxy settings"""
+
+    server: str
+    bypass: str
+    username: str
+    password: str
+
+
+class Proxy(ProxySettings):
+    """proxy server information"""
+
+    criteria: ProxyBrokerCriteria
+
+
+def search_proxy_servers(
+    anonymous: bool = True,
+    countryset: Optional[Set[str]] = None,
+    secure: bool = False,
+    timeout: float = 5.0,
+    max_shape: int = 5,
+    search_outside_if_empty: bool = True,
+) -> List[str]:
+    """search for proxy servers that match the specified broker criteria
 
     Args:
-        num_ips (int): The number of proxy IPs to generate and rotate through.
+        anonymous: whether proxy servers should have minimum level-1 anonymity.
+        countryset: admissible proxy servers locations.
+        secure: whether proxy servers should support HTTP or HTTPS; defaults to HTTP.
+        timeout: The maximum timeout for proxy responses; defaults to 5.0 seconds.
+        max_shape: The maximum number of proxy servers to return; defaults to 5.
+        search_outside_if_empty: whether countryset should be extended if empty.
 
     Returns:
-        list: A list of proxy IP addresses.
+        A list of proxy server URLs matching the criteria.
 
     Example:
-        >>> proxy_generator(5)
+        >>> search_proxy_servers(
+        ...     anonymous=True,
+        ...     countryset={"GB", "US"},
+        ...     secure=True,
+        ...     timeout=1.0,
+        ...     max_shape=2,
+        ... )
         [
-            '192.168.1.1:8080',
-            '103.10.63.135:8080',
-            '176.9.75.42:8080',
-            '37.57.216.2:8080',
-            '113.20.31.250:8080'
+            "http://103.10.63.135:8080",
+            "http://113.20.31.250:8080",
         ]
-
-    This function fetches fresh proxies and indexes them, making it easy to manage multiple proxy configurations.
     """
+    proxybroker = FreeProxy(
+        anonym=anonymous,
+        country_id=countryset,
+        elite=True,
+        https=secure,
+        timeout=timeout,
+    )
 
-    res = []
+    def search_all(proxybroker: FreeProxy, k: int, search_outside: bool) -> List[str]:
+        candidateset = proxybroker.get_proxy_list(search_outside)
+        random.shuffle(candidateset)
 
-    for i in range(0, num_ips):
-        res.append(FreeProxy().get())
-    return res
+        positive = set()
+
+        for address in candidateset:
+            setting = {proxybroker.schema: f"http://{address}"}
+
+            try:
+                server = proxybroker._FreeProxy__check_if_proxy_is_working(setting)
+
+                if not server:
+                    continue
+
+                positive.add(server)
+
+                if len(positive) < k:
+                    continue
+
+                return list(positive)
+
+            except requests.exceptions.RequestException:
+                continue
+
+        n = len(positive)
+
+        if n < k and search_outside:
+            proxybroker.country_id = None
+
+            try:
+                negative = set(search_all(proxybroker, k - n, False))
+            except FreeProxyException:
+                negative = set()
+
+            positive = positive | negative
+
+        if not positive:
+            raise FreeProxyException("missing proxy servers for criteria")
+
+        return list(positive)
+
+    return search_all(proxybroker, max_shape, search_outside_if_empty)
+
+
+def _parse_proxy(proxy: ProxySettings) -> ProxySettings:
+    """parses a proxy configuration with known server
+
+    Args:
+        proxy: The proxy configuration to parse.
+
+    Returns:
+        A 'playwright' compliant proxy configuration.
+    """
+    assert "server" in proxy, "missing server in the proxy configuration"
+
+    authorization = [x in proxy for x in ("username", "password")]
+
+    message = "username and password must be provided in pairs or not at all"
+
+    assert all(authorization) or not any(authorization), message
+
+    parsed = {"server": proxy["server"]}
+
+    if proxy.get("bypass"):
+        parsed["bypass"] = proxy["bypass"]
+
+    if all(authorization):
+        parsed["username"] = proxy["username"]
+        parsed["password"] = proxy["password"]
+
+    return parsed
+
+
+def _search_proxy(proxy: Proxy) -> ProxySettings:
+    """searches for a proxy server matching the specified broker criteria
+
+    Args:
+        proxy: The proxy configuration to search for.
+
+    Returns:
+        A 'playwright' compliant proxy configuration.
+    """
+    server = search_proxy_servers(max_shape=1, **proxy.get("criteria", {}))[0]
+
+    return {"server": server}
+
+
+def is_ipv4_address(address: str) -> bool:
+    """If a proxy address conforms to an IPv4 address"""
+    try:
+        ipaddress.IPv4Address(address)
+        return True
+    except ipaddress.AddressValueError:
+        return False
+
+
+def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
+    """parses a proxy configuration or searches for a new one matching
+    the specified broker criteria
+
+    Args:
+        proxy: The proxy configuration to parse or search for.
+
+    Returns:
+        A 'playwright' compliant proxy configuration.
+
+    Notes:
+        - If the proxy server is an IP address, it is assumed to be
+        a proxy server address.
+        - If the proxy server is 'broker', a proxy server is searched for
+        based on the provided broker criteria.
+
+    Example:
+        >>> proxy = {
+        ...     "server": "broker",
+        ...     "criteria": {
+        ...         "anonymous": True,
+        ...         "countryset": {"GB", "US"},
+        ...         "secure": True,
+        ...         "timeout": 5.0,
+        ...         "search_outside_if_empty": False
+        ...     }
+        ... }
+
+        >>> parse_or_search_proxy(proxy)
+        {
+            "server": "",
+        }
+
+    Example:
+        >>> proxy = {
+        ...     "server": "192.168.1.1:8080",
+        ...     "username": "",
+        ...     "password": ""
+        ... }
+
+        >>> parse_or_search_proxy(proxy)
+        {
+            "server": "192.168.1.1:8080",
+            "username": "",
+            "password": ""
+        }
+    """
+    assert "server" in proxy, "missing server in the proxy configuration"
+
+    server_address = proxy["server"].split(":", maxsplit=1)[0]
+
+    if is_ipv4_address(server_address):
+        return _parse_proxy(proxy)
+
+    assert proxy["server"] == "broker", "unknown proxy server"
+
+    return _search_proxy(proxy)
diff --git a/tests/utils/test_proxy_rotation.py b/tests/utils/test_proxy_rotation.py
new file mode 100644
index 00000000..8acbdb30
--- /dev/null
+++ b/tests/utils/test_proxy_rotation.py
@@ -0,0 +1,121 @@
+import pytest
+from fp.errors import FreeProxyException
+
+from scrapegraphai.utils.proxy_rotation import (
+    Proxy,
+    _parse_proxy,
+    _search_proxy,
+    is_ipv4_address,
+    parse_or_search_proxy,
+    search_proxy_servers,
+)
+
+
+def test_search_proxy_servers_success():
+    servers = search_proxy_servers(
+        anonymous=True,
+        countryset={"US"},
+        secure=False,
+        timeout=10.0,
+        max_shape=2,
+        search_outside_if_empty=True,
+    )
+
+    assert isinstance(servers, list)
+    assert all(isinstance(server, str) for server in servers)
+
+
+def test_search_proxy_servers_exception():
+    with pytest.raises(FreeProxyException):
+        search_proxy_servers(
+            anonymous=True,
+            countryset={"XX"},
+            secure=True,
+            timeout=1.0,
+            max_shape=2,
+            search_outside_if_empty=False,
+        )
+
+
+def test_parse_proxy_success():
+    proxy = {
+        "server": "192.168.1.1:8080",
+        "username": "user",
+        "password": "pass",
+        "bypass": "*.local",
+    }
+
+    parsed_proxy = _parse_proxy(proxy)
+    assert parsed_proxy == proxy
+
+
+def test_parse_proxy_exception():
+    invalid_proxy = {"server": "192.168.1.1:8080", "username": "user"}
+
+    with pytest.raises(AssertionError) as error_info:
+        _parse_proxy(invalid_proxy)
+
+    assert "username and password must be provided in pairs" in str(error_info.value)
+
+
+def test_search_proxy_success():
+    proxy = Proxy(criteria={"anonymous": True, "countryset": {"US"}})
+    found_proxy = _search_proxy(proxy)
+
+    assert isinstance(found_proxy, dict)
+    assert "server" in found_proxy
+
+
+def test_is_ipv4_address():
+    assert is_ipv4_address("192.168.1.1") is True
+    assert is_ipv4_address("999.999.999.999") is False
+    assert is_ipv4_address("no-address") is False
+
+
+def test_parse_or_search_proxy_success():
+    proxy = {
+        "server": "192.168.1.1:8080",
+        "username": "username",
+        "password": "password",
+    }
+
+    parsed_proxy = parse_or_search_proxy(proxy)
+    assert parsed_proxy == proxy
+
+    proxy_broker = {
+        "server": "broker",
+        "criteria": {
+            "anonymous": True,
+            "countryset": {"US"},
+            "secure": True,
+            "timeout": 10.0,
+        },
+    }
+
+    found_proxy = parse_or_search_proxy(proxy_broker)
+
+    assert isinstance(found_proxy, dict)
+    assert "server" in found_proxy
+
+
+def test_parse_or_search_proxy_exception():
+    proxy = {
+        "username": "username",
+        "password": "password",
+    }
+
+    with pytest.raises(AssertionError) as error_info:
+        parse_or_search_proxy(proxy)
+
+    assert "missing server in the proxy configuration" in str(error_info.value)
+
+
+def test_parse_or_search_proxy_unknown_server():
+    proxy = {
+        "server": "unknown",
+    }
+
+    with pytest.raises(AssertionError) as error_info:
+        parse_or_search_proxy(proxy)
+
+    assert "unknown proxy server" in str(error_info.value)