From adbc08f27bc0966822f054f3af0e1f94fc0b87f5 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Sat, 27 Apr 2024 19:07:37 +0200 Subject: [PATCH] fix: robot node and proxyes --- .../local_models/Ollama/smart_scraper_ollama.py | 2 +- examples/single_node/fetch_node.py | 0 examples/single_node/robot_node.py | 12 +++--------- scrapegraphai/nodes/fetch_node.py | 11 ++++------- scrapegraphai/nodes/robots_node.py | 15 ++++++++------- scrapegraphai/utils/__init__.py | 2 +- 6 files changed, 17 insertions(+), 25 deletions(-) create mode 100644 examples/single_node/fetch_node.py diff --git a/examples/local_models/Ollama/smart_scraper_ollama.py b/examples/local_models/Ollama/smart_scraper_ollama.py index 77879227..d710b986 100644 --- a/examples/local_models/Ollama/smart_scraper_ollama.py +++ b/examples/local_models/Ollama/smart_scraper_ollama.py @@ -10,7 +10,7 @@ from scrapegraphai.utils import prettify_exec_info graph_config = { "llm": { "model": "ollama/mistral", - "temperature": 1, + "temperature": 0, "format": "json", # Ollama needs the format to be specified explicitly # "model_tokens": 2000, # set context length arbitrarily, "base_url": "http://localhost:11434", # set ollama URL arbitrarily diff --git a/examples/single_node/fetch_node.py b/examples/single_node/fetch_node.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/single_node/robot_node.py b/examples/single_node/robot_node.py index 55795f87..8aa26446 100644 --- a/examples/single_node/robot_node.py +++ b/examples/single_node/robot_node.py @@ -2,22 +2,16 @@ Example of custom graph using existing nodes """ -import os -from dotenv import load_dotenv -from scrapegraphai.models import OpenAI +from scrapegraphai.models import Ollama from scrapegraphai.nodes import RobotsNode -load_dotenv() # ************************************************ # Define the configuration for the graph # ************************************************ -openai_key = os.getenv("OPENAI_APIKEY") - graph_config = { "llm": { - "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "ollama/llama3", "temperature": 0, "streaming": True }, @@ -27,7 +21,7 @@ graph_config = { # Define the node # ************************************************ -llm_model = OpenAI(graph_config["llm"]) +llm_model = Ollama(graph_config["llm"]) robots_node = RobotsNode( input="url", diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 2564d44d..51fc7c30 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -7,7 +7,6 @@ from langchain_community.document_loaders import AsyncHtmlLoader from langchain_core.documents import Document from .base_node import BaseNode from ..utils.remover import remover -from ..utils.proxy_generator import proxy_generator class FetchNode(BaseNode): @@ -38,8 +37,7 @@ class FetchNode(BaseNode): to succeed. """ - def __init__(self, input: str, output: List[str], num_prox: int = True, - node_name: str = "Fetch"): + def __init__(self, input: str, output: List[str], node_name: str = "Fetch"): """ Initializes the FetchHTMLNode with a node name and node type. Arguments: @@ -47,7 +45,6 @@ class FetchNode(BaseNode): prox_rotation (bool): if you wamt to rotate proxies """ super().__init__(node_name, "node", input, output, 1) - self.num_prox = num_prox def execute(self, state): """ @@ -80,13 +77,13 @@ class FetchNode(BaseNode): "source": "local_dir" })] - # if it is a URL else: - if self.num_prox > 1: + if self.node_config.get("endpoint") is not None: loader = AsyncHtmlLoader( - source, proxies=proxy_generator(self.num_prox)) + source, proxies={"http": self.node_config["endpoint"]}) else: loader = AsyncHtmlLoader(source) + document = loader.load() compressed_document = [ Document(page_content=remover(str(document)))] diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index c9235067..a492421d 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -1,5 +1,5 @@ """ -Module for checking if a website is scrapepable or not +Module for checking if a website is scrapepable or not """ from typing import List from urllib.parse import urlparse @@ -12,7 +12,7 @@ from ..helpers import robots_dictionary class RobotsNode(BaseNode): """ - A node responsible for checking if a website is scrapepable or not. + A node responsible for checking if a website is scrapepable or not. It uses the AsyncHtmlLoader for asynchronous document loading. @@ -59,7 +59,7 @@ class RobotsNode(BaseNode): node_config (dict): Configuration parameters for the node. force_scraping (bool): A flag indicating whether scraping should be enforced even if disallowed by robots.txt. Defaults to True. - node_name (str, optional): The unique identifier name for the node. + node_name (str, optional): The unique identifier name for the node. Defaults to "Robots". """ super().__init__(node_name, "node", input, output, 1) @@ -112,11 +112,12 @@ class RobotsNode(BaseNode): base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" loader = AsyncHtmlLoader(f"{base_url}/robots.txt") document = loader.load() - model = self.llm_model.model_name - - if "ollama" in model: - model = model.split("/", maxsplit=1)[-1] + if "ollama" in self.llm_model.model: + self.llm_model.model = self.llm_model.model.split("/")[-1] + model = self.llm_model.model.split("/")[-1] + else: + model = self.llm_model.model_name try: agent = robots_dictionary[model] diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 3fd1d884..0aee7839 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -5,4 +5,4 @@ from .save_audio_from_bytes import save_audio_from_bytes from .convert_to_csv import convert_to_csv from .convert_to_json import convert_to_json from .prettify_exec_info import prettify_exec_info -from .proxy_generator import proxy_generator +from .proxy_rotation import proxy_generator