mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
feat: add new proxy rotation function
This commit is contained in:
parent
44bc9196ef
commit
f6077d1f98
@ -7,6 +7,7 @@ from langchain_community.document_loaders import AsyncHtmlLoader
|
||||
from langchain_core.documents import Document
|
||||
from .base_node import BaseNode
|
||||
from ..utils.remover import remover
|
||||
from ..utils.proxy_rotation import proxy_rotation
|
||||
|
||||
|
||||
class FetchNode(BaseNode):
|
||||
@ -37,13 +38,16 @@ class FetchNode(BaseNode):
|
||||
to succeed.
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
|
||||
def __init__(self, input: str, output: List[str], num_prox: int = 1,
             node_name: str = "Fetch"):
    """
    Initializes the FetchNode and records how many proxies to rotate through.

    Arguments:
        input (str): forwarded unchanged to BaseNode.__init__
        output (List[str]): forwarded unchanged to BaseNode.__init__
        num_prox (int): number of proxies to rotate through when fetching
            a URL; proxy rotation is only used when this is greater
            than 1, so the default of 1 leaves it disabled
        node_name (str): name of the node
    """
    # "node" and the trailing 1 are passed through exactly as before.
    # NOTE(review): their meaning is defined by BaseNode - confirm there.
    super().__init__(node_name, "node", input, output, 1)
    self.num_prox = num_prox
|
||||
|
||||
def execute(self, state):
|
||||
"""
|
||||
@ -78,7 +82,11 @@ class FetchNode(BaseNode):
|
||||
|
||||
# if it is a URL
|
||||
else:
|
||||
loader = AsyncHtmlLoader(source)
|
||||
if self.num_prox > 1:
|
||||
loader = AsyncHtmlLoader(
|
||||
source, proxies=proxy_rotation(self.num_prox))
|
||||
else:
|
||||
loader = AsyncHtmlLoader(source)
|
||||
document = loader.load()
|
||||
compressed_document = [
|
||||
Document(page_content=remover(str(document)))]
|
||||
|
||||
@ -5,3 +5,4 @@ from .save_audio_from_bytes import save_audio_from_bytes
|
||||
from .convert_to_csv import convert_to_csv
|
||||
from .convert_to_json import convert_to_json
|
||||
from .prettify_exec_info import prettify_exec_info
|
||||
from .proxy_rotation import proxy_rotation
|
||||
|
||||
32
scrapegraphai/utils/proxy_rotation.py
Normal file
32
scrapegraphai/utils/proxy_rotation.py
Normal file
@ -0,0 +1,32 @@
|
||||
"""
|
||||
Module for rotating proxies
|
||||
"""
|
||||
from fp.fp import FreeProxy
|
||||
|
||||
|
||||
def proxy_rotation(num_ips: int):
    """
    Collect ``num_ips`` proxy addresses using the FreeProxy library.

    Every entry comes from its own ``FreeProxy(rand=True).get()`` call.
    ``rand=True`` shuffles the candidate list before it is probed; with the
    library default (``rand=False``) each call walks the same list in the
    same order and tends to return the same first working proxy, which
    defeats the purpose of rotating.

    Args:
        num_ips (int): The number of proxy addresses to collect. Zero or a
            negative value yields an empty dictionary.

    Returns:
        dict: The values returned by ``FreeProxy.get()``, keyed by their
            position in the rotation (``0`` to ``num_ips - 1``). Entries are
            not guaranteed to be distinct.

    Raises:
        Any exception raised by ``FreeProxy.get()`` propagates unchanged.
        NOTE(review): the library is expected to raise when no working
        proxy can be found - confirm against its documentation.

    Example (addresses are illustrative only):
        >>> proxy_rotation(3)  # doctest: +SKIP
        {
            0: 'http://103.10.63.135:8080',
            1: 'http://176.9.75.42:8080',
            2: 'http://37.57.216.2:8080'
        }
    """
    return {
        position: FreeProxy(rand=True).get()
        for position in range(num_ips)
    }
|
||||
Loading…
Reference in New Issue
Block a user