mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
fix: robot node and proxyes
This commit is contained in:
parent
4bc727412f
commit
adbc08f27b
@ -10,7 +10,7 @@ from scrapegraphai.utils import prettify_exec_info
|
|||||||
graph_config = {
|
graph_config = {
|
||||||
"llm": {
|
"llm": {
|
||||||
"model": "ollama/mistral",
|
"model": "ollama/mistral",
|
||||||
"temperature": 1,
|
"temperature": 0,
|
||||||
"format": "json", # Ollama needs the format to be specified explicitly
|
"format": "json", # Ollama needs the format to be specified explicitly
|
||||||
# "model_tokens": 2000, # set context length arbitrarily,
|
# "model_tokens": 2000, # set context length arbitrarily,
|
||||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||||
|
|||||||
0
examples/single_node/fetch_node.py
Normal file
0
examples/single_node/fetch_node.py
Normal file
@ -2,22 +2,16 @@
|
|||||||
Example of custom graph using existing nodes
|
Example of custom graph using existing nodes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
from scrapegraphai.models import Ollama
|
||||||
from dotenv import load_dotenv
|
|
||||||
from scrapegraphai.models import OpenAI
|
|
||||||
from scrapegraphai.nodes import RobotsNode
|
from scrapegraphai.nodes import RobotsNode
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
# ************************************************
|
# ************************************************
|
||||||
# Define the configuration for the graph
|
# Define the configuration for the graph
|
||||||
# ************************************************
|
# ************************************************
|
||||||
|
|
||||||
openai_key = os.getenv("OPENAI_APIKEY")
|
|
||||||
|
|
||||||
graph_config = {
|
graph_config = {
|
||||||
"llm": {
|
"llm": {
|
||||||
"api_key": openai_key,
|
"model": "ollama/llama3",
|
||||||
"model": "gpt-3.5-turbo",
|
|
||||||
"temperature": 0,
|
"temperature": 0,
|
||||||
"streaming": True
|
"streaming": True
|
||||||
},
|
},
|
||||||
@ -27,7 +21,7 @@ graph_config = {
|
|||||||
# Define the node
|
# Define the node
|
||||||
# ************************************************
|
# ************************************************
|
||||||
|
|
||||||
llm_model = OpenAI(graph_config["llm"])
|
llm_model = Ollama(graph_config["llm"])
|
||||||
|
|
||||||
robots_node = RobotsNode(
|
robots_node = RobotsNode(
|
||||||
input="url",
|
input="url",
|
||||||
|
|||||||
@ -7,7 +7,6 @@ from langchain_community.document_loaders import AsyncHtmlLoader
|
|||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from .base_node import BaseNode
|
from .base_node import BaseNode
|
||||||
from ..utils.remover import remover
|
from ..utils.remover import remover
|
||||||
from ..utils.proxy_generator import proxy_generator
|
|
||||||
|
|
||||||
|
|
||||||
class FetchNode(BaseNode):
|
class FetchNode(BaseNode):
|
||||||
@ -38,8 +37,7 @@ class FetchNode(BaseNode):
|
|||||||
to succeed.
|
to succeed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, input: str, output: List[str], num_prox: int = True,
|
def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
|
||||||
node_name: str = "Fetch"):
|
|
||||||
"""
|
"""
|
||||||
Initializes the FetchHTMLNode with a node name and node type.
|
Initializes the FetchHTMLNode with a node name and node type.
|
||||||
Arguments:
|
Arguments:
|
||||||
@ -47,7 +45,6 @@ class FetchNode(BaseNode):
|
|||||||
prox_rotation (bool): if you wamt to rotate proxies
|
prox_rotation (bool): if you wamt to rotate proxies
|
||||||
"""
|
"""
|
||||||
super().__init__(node_name, "node", input, output, 1)
|
super().__init__(node_name, "node", input, output, 1)
|
||||||
self.num_prox = num_prox
|
|
||||||
|
|
||||||
def execute(self, state):
|
def execute(self, state):
|
||||||
"""
|
"""
|
||||||
@ -80,13 +77,13 @@ class FetchNode(BaseNode):
|
|||||||
"source": "local_dir"
|
"source": "local_dir"
|
||||||
})]
|
})]
|
||||||
|
|
||||||
# if it is a URL
|
|
||||||
else:
|
else:
|
||||||
if self.num_prox > 1:
|
if self.node_config.get("endpoint") is not None:
|
||||||
loader = AsyncHtmlLoader(
|
loader = AsyncHtmlLoader(
|
||||||
source, proxies=proxy_generator(self.num_prox))
|
source, proxies={"http": self.node_config["endpoint"]})
|
||||||
else:
|
else:
|
||||||
loader = AsyncHtmlLoader(source)
|
loader = AsyncHtmlLoader(source)
|
||||||
|
|
||||||
document = loader.load()
|
document = loader.load()
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
Document(page_content=remover(str(document)))]
|
Document(page_content=remover(str(document)))]
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Module for checking if a website is scrapepable or not
|
Module for checking if a website is scrapepable or not
|
||||||
"""
|
"""
|
||||||
from typing import List
|
from typing import List
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@ -12,7 +12,7 @@ from ..helpers import robots_dictionary
|
|||||||
|
|
||||||
class RobotsNode(BaseNode):
|
class RobotsNode(BaseNode):
|
||||||
"""
|
"""
|
||||||
A node responsible for checking if a website is scrapepable or not.
|
A node responsible for checking if a website is scrapepable or not.
|
||||||
It uses the AsyncHtmlLoader for asynchronous
|
It uses the AsyncHtmlLoader for asynchronous
|
||||||
document loading.
|
document loading.
|
||||||
|
|
||||||
@ -59,7 +59,7 @@ class RobotsNode(BaseNode):
|
|||||||
node_config (dict): Configuration parameters for the node.
|
node_config (dict): Configuration parameters for the node.
|
||||||
force_scraping (bool): A flag indicating whether scraping should be enforced even
|
force_scraping (bool): A flag indicating whether scraping should be enforced even
|
||||||
if disallowed by robots.txt. Defaults to True.
|
if disallowed by robots.txt. Defaults to True.
|
||||||
node_name (str, optional): The unique identifier name for the node.
|
node_name (str, optional): The unique identifier name for the node.
|
||||||
Defaults to "Robots".
|
Defaults to "Robots".
|
||||||
"""
|
"""
|
||||||
super().__init__(node_name, "node", input, output, 1)
|
super().__init__(node_name, "node", input, output, 1)
|
||||||
@ -112,11 +112,12 @@ class RobotsNode(BaseNode):
|
|||||||
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||||
loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
|
loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
|
||||||
document = loader.load()
|
document = loader.load()
|
||||||
model = self.llm_model.model_name
|
if "ollama" in self.llm_model.model:
|
||||||
|
self.llm_model.model = self.llm_model.model.split("/")[-1]
|
||||||
if "ollama" in model:
|
model = self.llm_model.model.split("/")[-1]
|
||||||
model = model.split("/", maxsplit=1)[-1]
|
|
||||||
|
|
||||||
|
else:
|
||||||
|
model = self.llm_model.model_name
|
||||||
try:
|
try:
|
||||||
agent = robots_dictionary[model]
|
agent = robots_dictionary[model]
|
||||||
|
|
||||||
|
|||||||
@ -5,4 +5,4 @@ from .save_audio_from_bytes import save_audio_from_bytes
|
|||||||
from .convert_to_csv import convert_to_csv
|
from .convert_to_csv import convert_to_csv
|
||||||
from .convert_to_json import convert_to_json
|
from .convert_to_json import convert_to_json
|
||||||
from .prettify_exec_info import prettify_exec_info
|
from .prettify_exec_info import prettify_exec_info
|
||||||
from .proxy_generator import proxy_generator
|
from .proxy_rotation import proxy_generator
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user