From 3585cd81e5577fa22ba4374d680b8673e58bb993 Mon Sep 17 00:00:00 2001 From: Perinim Date: Sun, 17 Mar 2024 12:02:35 +0100 Subject: [PATCH] refactor fetch node --- scrapegraphai/nodes/__init__.py | 1 + scrapegraphai/nodes/base_node.py | 2 +- scrapegraphai/nodes/fetch_node.py | 78 +++++++++++++++++++++++++++++++ scrapegraphai/nodes/parse_node.py | 2 +- scrapegraphai/utils/test_node.py | 18 ++++--- 5 files changed, 93 insertions(+), 8 deletions(-) create mode 100644 scrapegraphai/nodes/fetch_node.py diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index d4af797c..24817b94 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -2,6 +2,7 @@ __init__.py file for node folder """ from .fetch_html_node import FetchHTMLNode +from .fetch_node import FetchNode from .conditional_node import ConditionalNode from .get_probable_tags_node import GetProbableTagsNode from .generate_answer_node import GenerateAnswerNode diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index daddb394..43e43fa4 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -62,7 +62,7 @@ class BaseNode(ABC): self.node_type = node_type @abstractmethod - def execute(self, state: dict): + def execute(self, state: dict) -> dict: """ Execute the node's logic and return the updated state. Args: diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py new file mode 100644 index 00000000..e28e239d --- /dev/null +++ b/scrapegraphai/nodes/fetch_node.py @@ -0,0 +1,78 @@ +""" +Module for fetching the HTML node +""" + +from langchain_community.document_loaders import AsyncHtmlLoader +from langchain_core.documents import Document +from .base_node import BaseNode +from typing import List + +class FetchNode(BaseNode): + """ + A node responsible for fetching the HTML content of a specified URL and updating + the graph's state with this content. 
It uses the AsyncHtmlLoader for asynchronous + document loading. + + This node acts as a starting point in many scraping workflows, preparing the state + with the necessary HTML content for further processing by subsequent nodes in the graph. + + Attributes: + node_name (str): The unique identifier name for the node. + node_type (str): The type of the node, defaulting to "node". This categorization + helps in determining the node's role and behavior within the graph. + The "node" type is used for standard operational nodes. + + Args: + node_name (str): The unique identifier name for the node. This name is used to + reference the node within the graph. + node_type (str, optional): The type of the node, limited to "node" or + "conditional_node". Defaults to "node". + + Methods: + execute(state): Fetches the HTML content for the URL specified in the state and + updates the state with this content under the first output key. + The 'url' key must be present in the state for the operation + to succeed. + """ + + def __init__(self, input: str, output: List[str], node_name: str = "FetchNode"): + """ + Initializes the FetchNode with an input expression, output keys and a node name. + Arguments: + node_name (str): name of the node + """ + super().__init__(node_name, "node", input, output, 1) + + def execute(self, state): + """ + Executes the node's logic to fetch HTML content from a specified URL and + update the state with this content. + + Args: + state (dict): The current state of the graph, expected to contain a 'url' key. + + Returns: + dict: The updated state with the loaded document stored under the first output key. + + Raises: + KeyError: If the 'url' key is not found in the state, indicating that the + necessary information to perform the operation is missing. 
+ """ + print(f"--- Executing {self.node_name} Node ---") + + # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) + + # Fetching data from the state based on the input keys + input_data = [state[key] for key in input_keys] + + loader = AsyncHtmlLoader(input_data[0]) + document = loader.load() + # metadata = document[0].metadata + # document = remover(str(document[0])) + + # state["document"] = [ + # Document(page_content=document, metadata=metadata)] + state.update({self.output[0]: document}) + + return state diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 62082bdc..c8ac24b4 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -38,7 +38,7 @@ class ParseNode(BaseNode): node_name (str): name of the node node_type (str, optional): type of the node """ - super().__init__(node_name, "node", input, output, 2) + super().__init__(node_name, "node", input, output, 1) def execute(self, state): """ diff --git a/scrapegraphai/utils/test_node.py b/scrapegraphai/utils/test_node.py index 97ab43e1..a0f20763 100644 --- a/scrapegraphai/utils/test_node.py +++ b/scrapegraphai/utils/test_node.py @@ -1,15 +1,21 @@ -from scrapegraphai.nodes import FetchHTMLNode, ParseNode, RAGNode, GenerateAnswerNode +from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode state = { - "user_prompt": None, - "url": None, - "doc": None, + "user_prompt": "List me all the projects", + "url": "https://perinim.github.io/projects/", } +fetch_node = FetchNode( + input="url | local_dir", + output=["doc"], + node_name="fetch_html" + ) + +updated_state = fetch_node.execute(state) parse_node = ParseNode( - input="doc & url", + input="doc", output=["parsed_doc"], node_name="parse_document" ) -parse_node.execute(state) \ No newline at end of file +parse_node.execute(updated_state) \ No newline at end of file