refactor fetch node

2026-06-23 21:00:30 +08:00 · 2024-03-17 12:02:35 +01:00 · 2024-03-17 12:02:35 +01:00 · 3585cd81e5
commit 3585cd81e5
parent 20d7b69008
5 changed files with 93 additions and 8 deletions
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -2,6 +2,7 @@
 __init__.py file for node folder 
 """
 from .fetch_html_node import FetchHTMLNode
+from .fetch_node import FetchNode
 from .conditional_node import ConditionalNode
 from .get_probable_tags_node import GetProbableTagsNode
 from .generate_answer_node import GenerateAnswerNode
--- a/scrapegraphai/nodes/base_node.py
+++ b/scrapegraphai/nodes/base_node.py
@@ -62,7 +62,7 @@ class BaseNode(ABC):
        self.node_type = node_type

    @abstractmethod
-    def execute(self, state: dict):
+    def execute(self, state: dict) -> dict:
        """
        Execute the node's logic and return the updated state.
        Args:
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -0,0 +1,78 @@
+""" 
+Module for fetching the HTML node
+"""
+
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_core.documents import Document
+from .base_node import BaseNode
+from typing import List
+
+class FetchNode(BaseNode):
+    """
+    A node responsible for fetching the HTML content of a specified URL and updating
+    the graph's state with this content. It uses the AsyncHtmlLoader for asynchronous
+    document loading.
+
+    This node acts as a starting point in many scraping workflows, preparing the state
+    with the necessary HTML content for further processing by subsequent nodes in the graph.
+
+    Attributes:
+        node_name (str): The unique identifier name for the node.
+        node_type (str): The type of the node, defaulting to "node". This categorization
+                         helps in determining the node's role and behavior within the graph.
+                         The "node" type is used for standard operational nodes.
+
+    Args:
+        node_name (str): The unique identifier name for the node. This name is used to
+                         reference the node within the graph.
+        node_type (str, optional): The type of the node, limited to "node" or
+                                   "conditional_node". Defaults to "node".
+
+    Methods:
+        execute(state): Fetches the HTML content for the URL specified in the state and
+                        updates the state with this content under the 'document' key.
+                        The 'url' key must be present in the state for the operation
+                        to succeed.
+    """
+
+    def __init__(self, input: str, output: List[str], node_name: str = "FetchNode"):
+        """
+        Initializes the FetchHTMLNode with a node name and node type.
+        Arguments:
+            node_name (str): name of the node
+        """
+        super().__init__(node_name, "node", input, output, 1)
+
+    def execute(self, state):
+        """
+        Executes the node's logic to fetch HTML content from a specified URL and
+        update the state with this content.
+
+        Args:
+            state (dict): The current state of the graph, expected to contain a 'url' key.
+
+        Returns:
+            dict: The updated state with a new 'document' key containing the fetched HTML content.
+
+        Raises:
+            KeyError: If the 'url' key is not found in the state, indicating that the
+                      necessary information to perform the operation is missing.
+        """
+        print(f"--- Executing {self.node_name} Node ---")
+    
+        # Interpret input keys based on the provided input expression
+        input_keys = self.get_input_keys(state)
+        
+        # Fetching data from the state based on the input keys
+        input_data = [state[key] for key in input_keys]
+
+        loader = AsyncHtmlLoader(input_data[0])
+        document = loader.load()
+        # metadata = document[0].metadata
+        # document = remover(str(document[0]))
+
+        # state["document"] = [
+        #     Document(page_content=document, metadata=metadata)]
+        state.update({self.output[0]: document})
+
+        return state
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@@ -38,7 +38,7 @@ class ParseNode(BaseNode):
            node_name (str): name of the node
            node_type (str, optional): type of the node
        """
-        super().__init__(node_name, "node", input, output, 2)
+        super().__init__(node_name, "node", input, output, 1)

    def execute(self,  state):
        """
--- a/scrapegraphai/utils/test_node.py
+++ b/scrapegraphai/utils/test_node.py
@@ -1,15 +1,21 @@
-from scrapegraphai.nodes import FetchHTMLNode, ParseNode, RAGNode, GenerateAnswerNode
+from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode

 state = {
-    "user_prompt": None,
-    "url": None,
-    "doc": None,
+    "user_prompt": "List me all the projects",
+    "url": "https://perinim.github.io/projects/",  # source that the fetch node below loads
 }

+fetch_node = FetchNode(
+    input="url | local_dir",  # NOTE(review): only "url" is in state; presumably "|" means either key -- confirm
+    output=["doc"],  # state key that receives the fetch result
+    node_name="fetch_html"
+    )
+
+updated_state = fetch_node.execute(state)  # execute() updates state in place and returns it
 parse_node = ParseNode(
-    input="doc & url",
+    input="doc",  # the parse node is now configured with the fetched "doc" only
     output=["parsed_doc"],
     node_name="parse_document"
     )

-parse_node.execute(state)
+parse_node.execute(updated_state)  # run the parse node on the state produced by the fetch step