From 3585cd81e5577fa22ba4374d680b8673e58bb993 Mon Sep 17 00:00:00 2001
From: Perinim <perinim.98@gmail.com>
Date: Sun, 17 Mar 2024 12:02:35 +0100
Subject: [PATCH] refactor fetch node

---
 scrapegraphai/nodes/__init__.py   |  1 +
 scrapegraphai/nodes/base_node.py  |  2 +-
 scrapegraphai/nodes/fetch_node.py | 78 +++++++++++++++++++++++++++++++
 scrapegraphai/nodes/parse_node.py |  2 +-
 scrapegraphai/utils/test_node.py  | 18 ++++---
 5 files changed, 93 insertions(+), 8 deletions(-)
 create mode 100644 scrapegraphai/nodes/fetch_node.py

diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index d4af797c..24817b94 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -2,6 +2,7 @@
 __init__.py file for node folder 
 """
 from .fetch_html_node import FetchHTMLNode
+from .fetch_node import FetchNode
 from .conditional_node import ConditionalNode
 from .get_probable_tags_node import GetProbableTagsNode
 from .generate_answer_node import GenerateAnswerNode
diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py
index daddb394..43e43fa4 100644
--- a/scrapegraphai/nodes/base_node.py
+++ b/scrapegraphai/nodes/base_node.py
@@ -62,7 +62,7 @@ class BaseNode(ABC):
         self.node_type = node_type
 
     @abstractmethod
-    def execute(self, state: dict):
+    def execute(self, state: dict) -> dict:
         """
         Execute the node's logic and return the updated state.
         Args:
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
new file mode 100644
index 00000000..e28e239d
--- /dev/null
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -0,0 +1,78 @@
+""" 
+Module defining FetchNode, the node that fetches HTML content
+"""
+
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_core.documents import Document
+from .base_node import BaseNode
+from typing import List
+
+class FetchNode(BaseNode):
+    """
+    A node responsible for fetching the HTML content of a specified URL and updating
+    the graph's state with this content. It uses the AsyncHtmlLoader for asynchronous
+    document loading.
+
+    This node acts as a starting point in many scraping workflows, preparing the state
+    with the necessary HTML content for further processing by subsequent nodes in the graph.
+
+    Attributes:
+        node_name (str): The unique identifier name for the node.
+        node_type (str): The type of the node, defaulting to "node". This categorization
+                         helps in determining the node's role and behavior within the graph.
+                         The "node" type is used for standard operational nodes.
+
+    Args:
+        input (str): Expression naming the state key(s) to read the source from,
+                     e.g. "url | local_dir".
+        output (List[str]): State keys to write; the first one receives the result.
+        node_name (str, optional): Name of the node. Defaults to "FetchNode".
+
+    Methods:
+        execute(state): Loads the HTML for the source stored under the first resolved
+                        input key and stores the loaded document(s) in the state under
+                        the first output key. The input key must be present in the
+                        state for the operation to succeed.
+    """
+
+    def __init__(self, input: str, output: List[str], node_name: str = "FetchNode"):
+        """
+        Initializes the FetchNode with an input expression, output keys and a node name.
+        Arguments:
+            node_name (str): name of the node
+        """
+        super().__init__(node_name, "node", input, output, 1)
+
+    def execute(self, state):
+        """
+        Executes the node's logic to fetch HTML content from a specified URL and
+        update the state with this content.
+
+        Args:
+            state (dict): The current state of the graph, expected to contain the input key.
+
+        Returns:
+            dict: The same state, updated so the first output key holds the loaded document(s).
+
+        Raises:
+            KeyError: If a resolved input key is not found in the state, indicating that the
+                      necessary information to perform the operation is missing.
+        """
+        print(f"--- Executing {self.node_name} Node ---")
+    
+        # Interpret input keys based on the provided input expression
+        input_keys = self.get_input_keys(state)
+        
+        # Fetching data from the state based on the input keys
+        input_data = [state[key] for key in input_keys]
+
+        loader = AsyncHtmlLoader(input_data[0])
+        document = loader.load()
+        # metadata = document[0].metadata
+        # document = remover(str(document[0]))
+
+        # state["document"] = [
+        #     Document(page_content=document, metadata=metadata)]
+        state.update({self.output[0]: document})
+
+        return state
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
index 62082bdc..c8ac24b4 100644
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@@ -38,7 +38,7 @@ class ParseNode(BaseNode):
             node_name (str): name of the node
             node_type (str, optional): type of the node
         """
-        super().__init__(node_name, "node", input, output, 2)
+        super().__init__(node_name, "node", input, output, 1)
 
     def execute(self,  state):
         """
diff --git a/scrapegraphai/utils/test_node.py b/scrapegraphai/utils/test_node.py
index 97ab43e1..a0f20763 100644
--- a/scrapegraphai/utils/test_node.py
+++ b/scrapegraphai/utils/test_node.py
@@ -1,15 +1,21 @@
-from scrapegraphai.nodes import FetchHTMLNode, ParseNode, RAGNode, GenerateAnswerNode
+from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
 
 state = {
-    "user_prompt": None,
-    "url": None,
-    "doc": None,
+    "user_prompt": "List me all the projects",
+    "url": "https://perinim.github.io/projects/",
 }
 
+fetch_node = FetchNode(
+    input="url | local_dir",
+    output=["doc"],
+    node_name="fetch_html"
+    )
+
+updated_state = fetch_node.execute(state)
 parse_node = ParseNode(
-    input="doc & url",
+    input="doc",
     output=["parsed_doc"],
     node_name="parse_document"
     )
 
-parse_node.execute(state)
\ No newline at end of file
+parse_node.execute(updated_state)
\ No newline at end of file