mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-17 21:08:25 +08:00
feat(proxy-rotation): add parse (IP address) or search (from broker) functionality for proxy rotation
The broker has been made fully configurable for anonymity level, admissible locations, scheme, and max shape so as not to waste resources, unlike the original `free-proxy` package. Other options have been explored (e.g., `proxybroker`, `proxybroker2`) because of their built-in proxy server and rotation capabilities, but the former is no longer maintained, and the latter has issues with any Python version other than Python 3.9.
This commit is contained in:
parent
db2234bf5d
commit
217013181d
@ -1,8 +1,10 @@
|
||||
"""
|
||||
__init__.py file for utils folder
|
||||
"""
|
||||
from .save_audio_from_bytes import save_audio_from_bytes
|
||||
|
||||
from .convert_to_csv import convert_to_csv
|
||||
from .convert_to_json import convert_to_json
|
||||
from .prettify_exec_info import prettify_exec_info
|
||||
from .proxy_rotation import proxy_generator
|
||||
from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers
|
||||
from .save_audio_from_bytes import save_audio_from_bytes
|
||||
from .sys_dynamic_import import dynamic_import, srcfile_import
|
||||
|
||||
@ -1,34 +1,234 @@
|
||||
"""
|
||||
Module for rotating proxies
|
||||
"""
|
||||
|
||||
import ipaddress
|
||||
import random
|
||||
from typing import List, Optional, Set, TypedDict
|
||||
|
||||
import requests
|
||||
from fp.errors import FreeProxyException
|
||||
from fp.fp import FreeProxy
|
||||
|
||||
|
||||
def proxy_generator(num_ips: int) -> list:
|
||||
"""
|
||||
Generates a specified number of proxy IP addresses using the FreeProxy library.
|
||||
class ProxyBrokerCriteria(TypedDict, total=False):
    """Filters applied when a proxy server is looked up through the broker.

    All keys are optional (``total=False``). ``_search_proxy`` unpacks this
    mapping as keyword arguments of ``search_proxy_servers``, so each key
    mirrors the parameter of the same name on that function.
    """

    # Whether proxy servers should have minimum level-1 anonymity.
    anonymous: bool
    # Admissible proxy server locations.
    countryset: Set[str]
    # Whether proxy servers should support HTTPS rather than HTTP.
    secure: bool
    # Maximum timeout for proxy responses.
    timeout: float
    # Whether the search may be extended when ``countryset`` yields too little.
    search_outside_if_empty: bool
|
||||
|
||||
|
||||
class ProxySettings(TypedDict, total=False):
    """Proxy configuration in the shape produced by ``_parse_proxy``.

    All keys are optional at the type level (``total=False``); the parsing
    helpers in this module require ``server`` at runtime.
    """

    # Proxy server address, or the literal "broker" to request a broker search.
    server: str
    # Copied through by ``_parse_proxy`` only when truthy.
    # NOTE(review): presumably hosts that should skip the proxy - confirm.
    bypass: str
    # Credentials; ``_parse_proxy`` requires both or neither.
    username: str
    password: str
|
||||
|
||||
|
||||
class Proxy(ProxySettings):
    """Proxy server information: the plain settings plus broker criteria.

    Extends ``ProxySettings`` with the filters that ``_search_proxy`` hands
    to the broker when ``server`` is "broker".
    """

    # Broker filters; read by ``_search_proxy`` with an empty-dict default.
    criteria: ProxyBrokerCriteria
|
||||
|
||||
|
||||
def search_proxy_servers(
    anonymous: bool = True,
    countryset: Optional[Set[str]] = None,
    secure: bool = False,
    timeout: float = 5.0,
    max_shape: int = 5,
    search_outside_if_empty: bool = True,
) -> List[str]:
    """Search for working proxy servers that match the given broker criteria.

    Candidates are fetched from the ``FreeProxy`` broker, shuffled, and probed
    one by one until ``max_shape`` working servers have been collected.

    Args:
        anonymous: whether proxy servers should have minimum level-1 anonymity.
        countryset: admissible proxy server locations.
        secure: whether proxy servers should support HTTPS instead of HTTP;
            defaults to HTTP.
        timeout: the maximum timeout for proxy responses; defaults to 5.0
            seconds.
        max_shape: the maximum number of proxy servers to return; defaults
            to 5.
        search_outside_if_empty: whether the search should be retried without
            the ``countryset`` restriction when fewer than ``max_shape``
            servers were found.

    Returns:
        A list of proxy server URLs matching the criteria.

    Raises:
        FreeProxyException: if no working proxy server could be found.
            Errors raised by the broker while fetching the candidate list
            are not handled here and propagate as well.

    Example:
        >>> search_proxy_servers(
        ...     anonymous=True,
        ...     countryset={"GB", "US"},
        ...     secure=True,
        ...     timeout=1.0,
        ...     max_shape=2,
        ... )
        [
            "http://103.10.63.135:8080",
            "http://113.20.31.250:8080",
        ]
    """
    # NOTE(review): `elite=True` is hard-coded regardless of `anonymous`;
    # confirm this is the intended interaction with the broker's filters.
    proxybroker = FreeProxy(
        anonym=anonymous,
        country_id=countryset,
        elite=True,
        https=secure,
        timeout=timeout,
    )

    def search_all(proxybroker: FreeProxy, k: int, search_outside: bool) -> List[str]:
        """Collect up to ``k`` working servers, optionally widening the search."""
        candidateset = proxybroker.get_proxy_list(search_outside)
        # Shuffle so that repeated calls do not always probe the same servers.
        random.shuffle(candidateset)

        positive = set()

        for address in candidateset:
            setting = {proxybroker.schema: f"http://{address}"}

            # The broker exposes no public per-proxy check, hence the
            # name-mangled private method. Only this call can raise a request
            # error, so it is the only statement inside the `try`.
            try:
                server = proxybroker._FreeProxy__check_if_proxy_is_working(setting)
            except requests.exceptions.RequestException:
                continue

            if not server:
                continue

            positive.add(server)

            if len(positive) >= k:
                return list(positive)

        n = len(positive)

        if n < k and search_outside:
            # Drop the location filter and look for the missing servers only.
            proxybroker.country_id = None

            try:
                negative = set(search_all(proxybroker, k - n, False))
            except FreeProxyException:
                # Best effort: keep whatever the restricted search found.
                negative = set()

            positive = positive | negative

        if not positive:
            raise FreeProxyException("missing proxy servers for criteria")

        return list(positive)

    return search_all(proxybroker, max_shape, search_outside_if_empty)
|
||||
|
||||
|
||||
def _parse_proxy(proxy: ProxySettings) -> ProxySettings:
    """Parse a proxy configuration whose server is already known.

    Only the recognised keys are copied to the result; any other key present
    in ``proxy`` is dropped.

    Args:
        proxy: The proxy configuration to parse.

    Returns:
        A 'playwright' compliant proxy configuration holding ``server``,
        ``bypass`` when it is set to a truthy value, and ``username`` /
        ``password`` when both are present.

    Raises:
        AssertionError: if ``server`` is missing, or if only one of
            ``username`` / ``password`` is provided.
    """
    # Validation is raised explicitly instead of using `assert` statements so
    # that it is not stripped when Python runs with -O. AssertionError is kept
    # because existing callers and tests catch that type.
    if "server" not in proxy:
        raise AssertionError("missing server in the proxy configuration")

    authorization = [key in proxy for key in ("username", "password")]

    if any(authorization) and not all(authorization):
        raise AssertionError(
            "username and password must be provided in pairs or not at all"
        )

    parsed = {"server": proxy["server"]}

    if proxy.get("bypass"):
        parsed["bypass"] = proxy["bypass"]

    if all(authorization):
        parsed["username"] = proxy["username"]
        parsed["password"] = proxy["password"]

    return parsed
|
||||
|
||||
|
||||
def _search_proxy(proxy: Proxy) -> ProxySettings:
    """Look up a single proxy server through the broker.

    Args:
        proxy: The proxy configuration whose ``criteria`` (if any) are
            forwarded to ``search_proxy_servers`` as keyword arguments.

    Returns:
        A 'playwright' compliant proxy configuration containing only the
        ``server`` key, set to the first server returned by the search.
    """
    criteria = proxy.get("criteria", {})
    # One server is enough, so the search is capped at a single result.
    servers = search_proxy_servers(max_shape=1, **criteria)
    first_server = servers[0]

    return {"server": first_server}
|
||||
|
||||
|
||||
def is_ipv4_address(address: str) -> bool:
    """Tell whether ``address`` parses as an IPv4 address.

    Args:
        address: The candidate address, without port or netmask.

    Returns:
        True when ``ipaddress.IPv4Address`` accepts the value, False when it
        rejects it with ``AddressValueError``.
    """
    try:
        ipaddress.IPv4Address(address)
    except ipaddress.AddressValueError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
    """Parse a proxy configuration, or search for a server through the broker.

    Args:
        proxy: The proxy configuration to parse or search for.

    Returns:
        A 'playwright' compliant proxy configuration.

    Raises:
        AssertionError: if ``server`` is missing from the configuration, or
            if it is neither an IPv4 address (optionally followed by
            ``:port``) nor the literal ``"broker"``.

    Notes:
        - If the proxy server is an IP address, it is assumed to be
          a proxy server address.
        - If the proxy server is 'broker', a proxy server is searched for
          based on the provided broker criteria.

    Example:
        >>> proxy = {
        ...     "server": "broker",
        ...     "criteria": {
        ...         "anonymous": True,
        ...         "countryset": {"GB", "US"},
        ...         "secure": True,
        ...         "timeout": 5.0,
        ...         "search_outside_if_empty": False
        ...     }
        ... }

        >>> parse_or_search_proxy(proxy)
        {
            "server": "<proxy-server-matching-criteria>",
        }

    Example:
        >>> proxy = {
        ...     "server": "192.168.1.1:8080",
        ...     "username": "<username>",
        ...     "password": "<password>"
        ... }

        >>> parse_or_search_proxy(proxy)
        {
            "server": "192.168.1.1:8080",
            "username": "<username>",
            "password": "<password>"
        }
    """
    # Raised explicitly instead of via `assert` so the checks still run under
    # `python -O`; otherwise an unknown server would silently fall through to
    # a broker search. AssertionError is kept for existing callers and tests.
    if "server" not in proxy:
        raise AssertionError("missing server in the proxy configuration")

    # Keep only the host part: everything before the first ':' (the port).
    server_address = proxy["server"].split(":", maxsplit=1)[0]

    if is_ipv4_address(server_address):
        return _parse_proxy(proxy)

    if proxy["server"] != "broker":
        raise AssertionError("unknown proxy server")

    return _search_proxy(proxy)
|
||||
|
||||
121
tests/utils/test_proxy_rotation.py
Normal file
121
tests/utils/test_proxy_rotation.py
Normal file
@ -0,0 +1,121 @@
|
||||
import pytest
|
||||
from fp.errors import FreeProxyException
|
||||
|
||||
from scrapegraphai.utils.proxy_rotation import (
|
||||
Proxy,
|
||||
_parse_proxy,
|
||||
_search_proxy,
|
||||
is_ipv4_address,
|
||||
parse_or_search_proxy,
|
||||
search_proxy_servers,
|
||||
)
|
||||
|
||||
|
||||
def test_search_proxy_servers_success():
    """The broker search returns a list whose items are all strings."""
    criteria = {
        "anonymous": True,
        "countryset": {"US"},
        "secure": False,
        "timeout": 10.0,
        "max_shape": 2,
        "search_outside_if_empty": True,
    }

    found = search_proxy_servers(**criteria)

    assert isinstance(found, list)
    assert all(isinstance(item, str) for item in found)
|
||||
|
||||
|
||||
def test_search_proxy_servers_exception():
    """Searching the "XX" country set without fallback raises FreeProxyException."""
    criteria = {
        "anonymous": True,
        "countryset": {"XX"},
        "secure": True,
        "timeout": 1.0,
        "max_shape": 2,
        "search_outside_if_empty": False,
    }

    with pytest.raises(FreeProxyException):
        search_proxy_servers(**criteria)
|
||||
|
||||
|
||||
def test_parse_proxy_success():
    """A complete configuration is returned unchanged by ``_parse_proxy``."""
    settings = dict(
        server="192.168.1.1:8080",
        username="user",
        password="pass",
        bypass="*.local",
    )

    assert _parse_proxy(settings) == settings
|
||||
|
||||
|
||||
def test_parse_proxy_exception():
    """A username without a password is rejected with an AssertionError."""
    unpaired_credentials = {"server": "192.168.1.1:8080", "username": "user"}

    with pytest.raises(AssertionError) as raised:
        _parse_proxy(unpaired_credentials)

    reason = str(raised.value)
    assert "username and password must be provided in pairs" in reason
|
||||
|
||||
|
||||
def test_search_proxy_success():
    """``_search_proxy`` returns a dict that contains a ``server`` entry."""
    criteria = {"anonymous": True, "countryset": {"US"}}
    request = Proxy(criteria=criteria)

    result = _search_proxy(request)

    assert isinstance(result, dict)
    assert "server" in result
|
||||
|
||||
|
||||
def test_is_ipv4_address():
    """Valid dotted quads are accepted; out-of-range or textual values are not."""
    accepted = is_ipv4_address("192.168.1.1")
    assert accepted is True

    out_of_range = is_ipv4_address("999.999.999.999")
    assert out_of_range is False

    not_an_address = is_ipv4_address("no-address")
    assert not_an_address is False
|
||||
|
||||
|
||||
def test_parse_or_search_proxy_success():
    """Both routes work: a known IPv4 server is parsed, "broker" is searched."""
    # Route 1: an explicit IPv4 server is parsed and returned as given.
    known_server = dict(
        server="192.168.1.1:8080",
        username="username",
        password="password",
    )

    assert parse_or_search_proxy(known_server) == known_server

    # Route 2: the "broker" server triggers a search with the given criteria.
    broker_criteria = dict(
        anonymous=True,
        countryset={"US"},
        secure=True,
        timeout=10.0,
    )
    broker_request = {"server": "broker", "criteria": broker_criteria}

    result = parse_or_search_proxy(broker_request)

    assert isinstance(result, dict)
    assert "server" in result
|
||||
|
||||
|
||||
def test_parse_or_search_proxy_exception():
    """A configuration without ``server`` is rejected with an AssertionError."""
    credentials_only = dict(username="username", password="password")

    with pytest.raises(AssertionError) as raised:
        parse_or_search_proxy(credentials_only)

    reason = str(raised.value)
    assert "missing server in the proxy configuration" in reason
|
||||
|
||||
|
||||
def test_parse_or_search_proxy_unknown_server():
    """A server that is neither an IPv4 address nor "broker" is rejected."""
    unsupported = dict(server="unknown")

    with pytest.raises(AssertionError) as raised:
        parse_or_search_proxy(unsupported)

    reason = str(raised.value)
    assert "unknown proxy server" in reason
|
||||
Loading…
Reference in New Issue
Block a user