feat(proxy-rotation): add parse (IP address) or search (from broker) functionality for proxy rotation

the broker has been made fully configurable for anonymity level, admissible locations, scheme (HTTP or HTTPS) and max shape (the maximum number of proxy servers returned), so as not to waste resources, unlike the original `free-proxy` package.

other options have been explored (e.g., `proxybroker`, `proxybroker2`) due to their built-in proxy server and rotation capabilities, but the former is no longer maintained, and the latter has issues with any Python version other than Python 3.9.
This commit is contained in:
Federico Minutoli 2024-05-10 21:09:48 +02:00
parent db2234bf5d
commit 217013181d
3 changed files with 342 additions and 19 deletions

View File

@ -1,8 +1,10 @@
"""
__init__.py file for utils folder
"""
from .save_audio_from_bytes import save_audio_from_bytes
from .convert_to_csv import convert_to_csv
from .convert_to_json import convert_to_json
from .prettify_exec_info import prettify_exec_info
from .proxy_rotation import proxy_generator
from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers
from .save_audio_from_bytes import save_audio_from_bytes
from .sys_dynamic_import import dynamic_import, srcfile_import

View File

@ -1,34 +1,234 @@
"""
Module for rotating proxies
"""
import ipaddress
import random
from typing import List, Optional, Set, TypedDict
import requests
from fp.errors import FreeProxyException
from fp.fp import FreeProxy
def proxy_generator(num_ips: int) -> list:
    """
    Generates a specified number of proxy IP addresses using the FreeProxy library.

    Args:
        num_ips (int): The number of proxy IPs to generate and rotate through.

    Returns:
        list: A list of proxy IP addresses, one ``FreeProxy().get()`` result
        per requested entry.

    Example (illustrative output only):
        >>> proxy_generator(5)
        [
            '192.168.1.1:8080',
            '103.10.63.135:8080',
            '176.9.75.42:8080',
            '37.57.216.2:8080',
            '113.20.31.250:8080'
        ]
    """
    # Every entry comes from an independent FreeProxy lookup.
    # NOTE(review): nothing here de-duplicates the results - confirm whether
    # callers rely on distinct addresses.
    return [FreeProxy().get() for _ in range(num_ips)]
class ProxyBrokerCriteria(TypedDict, total=False):
    """Optional filters used when a proxy server is looked up via the broker.

    Every key is optional; the keys share their names with the keyword
    arguments of ``search_proxy_servers``.
    """

    # whether proxy servers should have minimum level-1 anonymity
    anonymous: bool
    # admissible proxy server locations
    countryset: Set[str]
    # whether proxy servers should support HTTPS rather than HTTP
    secure: bool
    # maximum timeout for proxy responses, in seconds
    timeout: float
    # whether the search may go beyond `countryset` when too few servers match
    search_outside_if_empty: bool
class ProxySettings(TypedDict, total=False):
    """Proxy settings in the shape handed over to 'playwright'.

    All keys are optional at the type level.
    """

    # proxy server, e.g. "192.168.1.1:8080"
    server: str
    # NOTE(review): presumably the hosts that should bypass the proxy - confirm
    bypass: str
    # credentials; the parser expects them in pairs or not at all
    username: str
    password: str
class Proxy(ProxySettings, total=False):
    """proxy server information

    Extends the proxy settings with the broker criteria used when the proxy
    server has to be searched for instead of being given explicitly.
    """

    # Declared optional (total=False): configurations with an explicit server
    # carry no criteria, and `_search_proxy` reads the key with a `{}` default.
    criteria: ProxyBrokerCriteria
def search_proxy_servers(
    anonymous: bool = True,
    countryset: Optional[Set[str]] = None,
    secure: bool = False,
    timeout: float = 5.0,
    max_shape: int = 5,
    search_outside_if_empty: bool = True,
) -> List[str]:
    """search for proxy servers that match the specified broker criteria

    Args:
        anonymous: whether proxy servers should have minimum level-1 anonymity.
        countryset: admissible proxy servers locations.
        secure: whether proxy servers should support HTTPS instead of HTTP;
            defaults to HTTP.
        timeout: The maximum timeout for proxy responses; defaults to 5.0 seconds.
        max_shape: The maximum number of proxy servers to return; defaults to 5.
        search_outside_if_empty: whether the search should be repeated without
            the country restriction when fewer than `max_shape` servers are found.

    Returns:
        A list of proxy server URLs matching the criteria.

    Raises:
        FreeProxyException: if no working proxy server matches the criteria.

    Example (illustrative output only):
        >>> search_proxy_servers(
        ...     anonymous=True,
        ...     countryset={"GB", "US"},
        ...     secure=True,
        ...     timeout=1.0,
        ...     max_shape=2,
        ... )
        [
            "http://103.10.63.135:8080",
            "http://113.20.31.250:8080",
        ]
    """
    proxybroker = FreeProxy(
        anonym=anonymous,
        country_id=countryset,
        elite=True,
        https=secure,
        timeout=timeout,
    )

    def search_all(proxybroker: FreeProxy, k: int, search_outside: bool) -> List[str]:
        """Collect up to `k` working servers, optionally widening the search."""
        candidateset = proxybroker.get_proxy_list(search_outside)
        random.shuffle(candidateset)

        positive = set()

        for address in candidateset:
            setting = {proxybroker.schema: f"http://{address}"}

            # Only the connectivity check is expected to raise a request
            # error, so the `try` body is limited to that single call.
            # NOTE: this is a name-mangled private method of FreeProxy and may
            # change between `free-proxy` releases.
            try:
                server = proxybroker._FreeProxy__check_if_proxy_is_working(setting)
            except requests.exceptions.RequestException:
                continue

            if not server:
                continue

            positive.add(server)

            # Stop as soon as enough working servers have been collected.
            if len(positive) >= k:
                return list(positive)

        n = len(positive)

        if n < k and search_outside:
            # Drop the country restriction and look for the missing servers
            # once more; the nested call cannot widen the search again.
            proxybroker.country_id = None

            try:
                negative = set(search_all(proxybroker, k - n, False))
            except FreeProxyException:
                negative = set()

            positive = positive | negative

        if not positive:
            raise FreeProxyException("missing proxy servers for criteria")

        return list(positive)

    return search_all(proxybroker, max_shape, search_outside_if_empty)
def _parse_proxy(proxy: ProxySettings) -> ProxySettings:
    """parses a proxy configuration with known server

    Args:
        proxy: The proxy configuration to parse.

    Returns:
        A 'playwright' compliant proxy configuration holding `server` and,
        when provided, `bypass` and the `username`/`password` pair.

    Raises:
        AssertionError: if `server` is missing, or if only one of `username`
            and `password` is provided.
    """
    # Raised explicitly rather than through `assert` so the validation is not
    # stripped when Python runs with `-O`; the exception type is unchanged.
    if "server" not in proxy:
        raise AssertionError("missing server in the proxy configuration")

    authorization = [key in proxy for key in ("username", "password")]

    if any(authorization) and not all(authorization):
        raise AssertionError(
            "username and password must be provided in pairs or not at all"
        )

    parsed = {"server": proxy["server"]}

    # An empty/falsy bypass value is treated as "not provided".
    if proxy.get("bypass"):
        parsed["bypass"] = proxy["bypass"]

    if all(authorization):
        parsed["username"] = proxy["username"]
        parsed["password"] = proxy["password"]

    return parsed
def _search_proxy(proxy: Proxy) -> ProxySettings:
    """Look up a single proxy server through the broker.

    Args:
        proxy: The proxy configuration whose optional `criteria` drive the search.

    Returns:
        A 'playwright' compliant proxy configuration with only `server` set.
    """
    # Missing criteria fall back to the defaults of `search_proxy_servers`.
    criteria = proxy.get("criteria", {})
    servers = search_proxy_servers(max_shape=1, **criteria)

    return {"server": servers[0]}
def is_ipv4_address(address: str) -> bool:
    """Tell whether a proxy address is a well-formed IPv4 address."""
    try:
        ipaddress.IPv4Address(address)
    except ipaddress.AddressValueError:
        return False
    else:
        return True
def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
    """parses a proxy configuration or searches for a new one matching
    the specified broker criteria

    Args:
        proxy: The proxy configuration to parse or search for.

    Returns:
        A 'playwright' compliant proxy configuration.

    Raises:
        AssertionError: if `server` is missing, or if it is neither an IPv4
            address (optionally followed by `:port`) nor the literal 'broker'.

    Notes:
        - If the proxy server is an IPv4 address, it is assumed to be
          a proxy server address.
        - If the proxy server is 'broker', a proxy server is searched for
          based on the provided broker criteria.

    Example:
        >>> proxy = {
        ...     "server": "broker",
        ...     "criteria": {
        ...         "anonymous": True,
        ...         "countryset": {"GB", "US"},
        ...         "secure": True,
        ...         "timeout": 5.0,
        ...         "search_outside_if_empty": False
        ...     }
        ... }

        >>> parse_or_search_proxy(proxy)
        {
            "server": "<proxy-server-matching-criteria>",
        }

    Example:
        >>> proxy = {
        ...     "server": "192.168.1.1:8080",
        ...     "username": "<username>",
        ...     "password": "<password>"
        ... }

        >>> parse_or_search_proxy(proxy)
        {
            "server": "192.168.1.1:8080",
            "username": "<username>",
            "password": "<password>"
        }
    """
    # Raised explicitly rather than through `assert`: with `python -O` a
    # stripped assert would let any unknown server trigger a broker search.
    if "server" not in proxy:
        raise AssertionError("missing server in the proxy configuration")

    # Keep only the host part, dropping an optional ":port" suffix.
    server_address = proxy["server"].split(":", maxsplit=1)[0]

    if is_ipv4_address(server_address):
        return _parse_proxy(proxy)

    if proxy["server"] != "broker":
        raise AssertionError("unknown proxy server")

    return _search_proxy(proxy)

View File

@ -0,0 +1,121 @@
import pytest
from fp.errors import FreeProxyException
from scrapegraphai.utils.proxy_rotation import (
Proxy,
_parse_proxy,
_search_proxy,
is_ipv4_address,
parse_or_search_proxy,
search_proxy_servers,
)
def test_search_proxy_servers_success():
    """A broker search returns between one and `max_shape` server strings.

    NOTE(review): this appears to exercise the real proxy broker, so it may be
    slow or flaky without network access - confirm.
    """
    max_shape = 2

    servers = search_proxy_servers(
        anonymous=True,
        countryset={"US"},
        secure=False,
        timeout=10.0,
        max_shape=max_shape,
        search_outside_if_empty=True,
    )

    assert isinstance(servers, list)
    assert all(isinstance(server, str) for server in servers)
    # The search raises when nothing is found and stops at `max_shape`.
    assert 0 < len(servers) <= max_shape
def test_search_proxy_servers_exception():
    """Country code "XX" with no fallback search must raise FreeProxyException."""
    criteria = dict(
        anonymous=True,
        countryset={"XX"},
        secure=True,
        timeout=1.0,
        max_shape=2,
        search_outside_if_empty=False,
    )

    with pytest.raises(FreeProxyException):
        search_proxy_servers(**criteria)
def test_parse_proxy_success():
    """A complete configuration is returned unchanged by the parser."""
    settings = dict(
        server="192.168.1.1:8080",
        username="user",
        password="pass",
        bypass="*.local",
    )

    assert _parse_proxy(settings) == settings
def test_parse_proxy_exception():
    """A username without a password is rejected with an AssertionError."""
    unpaired = {"server": "192.168.1.1:8080", "username": "user"}
    expected = "username and password must be provided in pairs"

    with pytest.raises(AssertionError, match=expected):
        _parse_proxy(unpaired)
def test_search_proxy_success():
    """Searching by criteria yields a dict that carries a `server` entry."""
    criteria = {"anonymous": True, "countryset": {"US"}}

    result = _search_proxy(Proxy(criteria=criteria))

    assert isinstance(result, dict)
    assert "server" in result
def test_is_ipv4_address():
    """Valid dotted quads are accepted; out-of-range and free text are not."""
    expectations = {
        "192.168.1.1": True,
        "999.999.999.999": False,
        "no-address": False,
    }

    for candidate, expected in expectations.items():
        assert is_ipv4_address(candidate) is expected
def test_parse_or_search_proxy_success():
    """Both the explicit-address path and the broker path return settings."""
    # Explicit IPv4 server: the configuration comes back unchanged.
    explicit = dict(
        server="192.168.1.1:8080",
        username="username",
        password="password",
    )
    assert parse_or_search_proxy(explicit) == explicit

    # 'broker' server: a server is searched for using the given criteria.
    criteria = dict(
        anonymous=True,
        countryset={"US"},
        secure=True,
        timeout=10.0,
    )
    searched = parse_or_search_proxy({"server": "broker", "criteria": criteria})

    assert isinstance(searched, dict)
    assert "server" in searched
def test_parse_or_search_proxy_exception():
    """A configuration without `server` is rejected with an AssertionError."""
    credentials_only = dict(username="username", password="password")
    expected = "missing server in the proxy configuration"

    with pytest.raises(AssertionError, match=expected):
        parse_or_search_proxy(credentials_only)
def test_parse_or_search_proxy_unknown_server():
    """A server that is neither an IPv4 address nor 'broker' is rejected."""
    unknown = dict(server="unknown")

    with pytest.raises(AssertionError, match="unknown proxy server"):
        parse_or_search_proxy(unknown)