From adbc08f27bc0966822f054f3af0e1f94fc0b87f5 Mon Sep 17 00:00:00 2001
From: VinciGit00 <mvincig11@gmail.com>
Date: Sat, 27 Apr 2024 19:07:37 +0200
Subject: [PATCH] fix: robot node and proxies

---
 .../local_models/Ollama/smart_scraper_ollama.py   |  2 +-
 examples/single_node/fetch_node.py                |  0
 examples/single_node/robot_node.py                | 12 +++---------
 scrapegraphai/nodes/fetch_node.py                 | 11 ++++-------
 scrapegraphai/nodes/robots_node.py                | 15 ++++++++-------
 scrapegraphai/utils/__init__.py                   |  2 +-
 6 files changed, 17 insertions(+), 25 deletions(-)
 create mode 100644 examples/single_node/fetch_node.py

diff --git a/examples/local_models/Ollama/smart_scraper_ollama.py b/examples/local_models/Ollama/smart_scraper_ollama.py
index 77879227..d710b986 100644
--- a/examples/local_models/Ollama/smart_scraper_ollama.py
+++ b/examples/local_models/Ollama/smart_scraper_ollama.py
@@ -10,7 +10,7 @@ from scrapegraphai.utils import prettify_exec_info
 graph_config = {
     "llm": {
         "model": "ollama/mistral",
-        "temperature": 1,
+        "temperature": 0,
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "model_tokens": 2000, # set context length arbitrarily,
         "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
diff --git a/examples/single_node/fetch_node.py b/examples/single_node/fetch_node.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/single_node/robot_node.py b/examples/single_node/robot_node.py
index 55795f87..8aa26446 100644
--- a/examples/single_node/robot_node.py
+++ b/examples/single_node/robot_node.py
@@ -2,22 +2,16 @@
 Example of custom graph using existing nodes
 """
 
-import os
-from dotenv import load_dotenv
-from scrapegraphai.models import OpenAI
+from scrapegraphai.models import Ollama
 from scrapegraphai.nodes import RobotsNode
-load_dotenv()
 
 # ************************************************
 # Define the configuration for the graph
 # ************************************************
 
-openai_key = os.getenv("OPENAI_APIKEY")
-
 graph_config = {
     "llm": {
-        "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
+        "model": "ollama/llama3",
         "temperature": 0,
         "streaming": True
     },
@@ -27,7 +21,7 @@ graph_config = {
 # Define the node
 # ************************************************
 
-llm_model = OpenAI(graph_config["llm"])
+llm_model = Ollama(graph_config["llm"])
 
 robots_node = RobotsNode(
     input="url",
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 2564d44d..51fc7c30 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -7,7 +7,6 @@ from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain_core.documents import Document
 from .base_node import BaseNode
 from ..utils.remover import remover
-from ..utils.proxy_generator import proxy_generator
 
 
 class FetchNode(BaseNode):
@@ -38,8 +37,7 @@ class FetchNode(BaseNode):
                         to succeed.
     """
 
-    def __init__(self, input: str, output: List[str], num_prox: int = True,
-                 node_name: str = "Fetch"):
+    def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
         """
         Initializes the FetchHTMLNode with a node name and node type.
         Arguments:
@@ -47,7 +45,6 @@ class FetchNode(BaseNode):
             prox_rotation (bool): if you wamt to rotate proxies
         """
         super().__init__(node_name, "node", input, output, 1)
-        self.num_prox = num_prox
 
     def execute(self, state):
         """
@@ -80,13 +77,13 @@ class FetchNode(BaseNode):
                 "source": "local_dir"
             })]
 
-        # if it is a URL
         else:
-            if self.num_prox > 1:
+            if self.node_config.get("endpoint") is not None:
                 loader = AsyncHtmlLoader(
-                    source, proxies=proxy_generator(self.num_prox))
+                    source, proxies={"http": self.node_config["endpoint"]})
             else:
                 loader = AsyncHtmlLoader(source)
+
             document = loader.load()
             compressed_document = [
                 Document(page_content=remover(str(document)))]
diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py
index c9235067..a492421d 100644
--- a/scrapegraphai/nodes/robots_node.py
+++ b/scrapegraphai/nodes/robots_node.py
@@ -1,5 +1,5 @@
 """
-Module for checking if a website is scrapepable or not 
+Module for checking if a website is scrapepable or not
 """
 from typing import List
 from urllib.parse import urlparse
@@ -12,7 +12,7 @@ from ..helpers import robots_dictionary
 
 class RobotsNode(BaseNode):
     """
-    A node responsible for checking if a website is scrapepable or not. 
+    A node responsible for checking if a website is scrapepable or not.
     It uses the AsyncHtmlLoader for asynchronous
     document loading.
 
@@ -59,7 +59,7 @@ class RobotsNode(BaseNode):
             node_config (dict): Configuration parameters for the node.
             force_scraping (bool): A flag indicating whether scraping should be enforced even
                                    if disallowed by robots.txt. Defaults to True.
-            node_name (str, optional): The unique identifier name for the node. 
+            node_name (str, optional): The unique identifier name for the node.
                                        Defaults to "Robots".
         """
         super().__init__(node_name, "node", input, output, 1)
@@ -112,11 +112,12 @@ class RobotsNode(BaseNode):
             base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
             loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
             document = loader.load()
-            model = self.llm_model.model_name
-
-            if "ollama" in model:
-                model = model.split("/", maxsplit=1)[-1]
+            if "ollama" in self.llm_model.model:
+                self.llm_model.model = self.llm_model.model.split("/")[-1]
+                model = self.llm_model.model.split("/")[-1]
 
+            else:
+                model = self.llm_model.model_name
             try:
                 agent = robots_dictionary[model]
 
diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
index 3fd1d884..0aee7839 100644
--- a/scrapegraphai/utils/__init__.py
+++ b/scrapegraphai/utils/__init__.py
@@ -5,4 +5,4 @@ from .save_audio_from_bytes import save_audio_from_bytes
 from .convert_to_csv import convert_to_csv
 from .convert_to_json import convert_to_json
 from .prettify_exec_info import prettify_exec_info
-from .proxy_generator import proxy_generator
+from .proxy_rotation import proxy_generator