add doc for the text node

2026-06-25 21:11:11 +08:00 · 2024-03-03 12:32:26 +01:00 · 2024-03-03 12:32:26 +01:00 · e29454e5df
commit e29454e5df
parent a0c77491bf
2 changed files with 53 additions and 54 deletions
--- a/scrapegraphai/nodes/parse_text_node.py
+++ b/scrapegraphai/nodes/parse_text_node.py
@@ -7,68 +7,69 @@ from .base_node import BaseNode

 class ParseTextNode(BaseNode):
    """
-    A node responsible for parsing HTML content from a document using specified tags. 
-    It uses BeautifulSoupTransformer for parsing, providing flexibility in extracting
-    specific parts of an HTML document based on the tags provided in the state.
+    A node for extracting content from HTML documents based on provided tags.

-    This node enhances the scraping workflow by allowing for targeted extraction of 
-    content, thereby optimizing the processing of large HTML documents.
+    This node leverages the BeautifulSoupTransformer to offer flexible parsing 
+    capabilities. It allows you to isolate specific elements within an HTML 
+    document, making it valuable for targeted content extraction in scraping workflows.

    Attributes:
-        node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode".
-        node_type (str): The type of the node, set to "node" indicating a standard operational node.
+        node_name (str): Unique name for the node (defaults to "ParseHTMLNode").
+        node_type (str): Indicates a standard operational node (set to "node").

    Args:
-        node_name (str, optional): The unique identifier name for the node. 
-        Defaults to "ParseHTMLNode".
+        node_name (str, optional): Custom name for the node (defaults to "ParseHTMLNode").

    Methods:
-        execute(state): Parses the HTML document contained within the state using 
-        the specified tags, if provided, and updates the state with the parsed content.
+        execute(state):  
+            * Extracts content from the 'document' field in the state based on tags (if provided in the state).
+            * Stores the result in the 'parsed_document' field of the state.
+            * Employs the RecursiveCharacterTextSplitter for handling larger documents.
    """

-    def __init__(self, node_name: str):
+    def __init__(self, node_name: str = "ParseHTMLNode"):
        """
-        Initializes the ParseHTMLNode with a node name.
+        Initializes the ParseTextNode.
+
        Args:
-            node_name (str): name of the node
-            node_type (str, optional): type of the node
+            node_name (str, optional): Custom name for the node (defaults to "ParseHTMLNode").
        """
        super().__init__(node_name, "node")

-    def execute(self,  state):
+    def execute(self, state):
        """
-        Executes the node's logic to parse the HTML document based on specified tags. 
-        If tags are provided in the state, the document is parsed accordingly; otherwise, 
-        the document remains unchanged. The method updates the state with either the original 
-        or parsed document under the 'parsed_document' key.
+        Parses HTML content and updates the state.

        Args:
-            state (dict): The current state of the graph, expected to contain 
-            'document' within 'keys', and optionally 'tags' for targeted parsing.
+            state (dict):  Expects the following keys:
+                * 'document': The HTML content to parse.
+                * 'tags' (optional): A list of HTML tags to target for extraction.

        Returns:
-            dict: The updated state with the 'parsed_document' key containing the parsed content,
-                  if tags were provided, or the original document otherwise.
+            dict: Updated state with the following:
+                * 'document_chunks': The original document split into chunks
+                  by RecursiveCharacterTextSplitter (chunk_size=4000, chunk_overlap=0).
+                Note: 'parsed_document' is not set by this method yet; the
+                tag-based parsing step is still a placeholder.

        Raises:
-            KeyError: If 'document' is not found in the state, indicating that the necessary 
-                      information for parsing is missing.
+            KeyError: If the required 'document' key is missing from the state.
        """

        print("---PARSING TEXT DOCUMENT---")
+
        try:
            document = state["document"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

+        # ... (Add logic for parsing with BeautifulSoup based on 'tags' if present)
+
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=4000,
            chunk_overlap=0,
        )
-
-        chunks = text_splitter.split_text(document)
-        state.update({"document_chunks": chunks})
+        state["document_chunks"] = text_splitter.split_text(document)

        return state
--- a/scrapegraphai/nodes/text_node.py
+++ b/scrapegraphai/nodes/text_node.py
@@ -1,4 +1,4 @@
-""" 
+""" 
 Module for TextNode
 """
 from .base_node import BaseNode
@@ -6,54 +6,52 @@ from .base_node import BaseNode

 class TextNode(BaseNode):
    """
-   A node for loading the text in the state
+    A node for loading raw text into the state.

-    This node acts as a starting point in many scraping workflows, preparing the state
-    with the necessary HTML content for further processing by subsequent nodes in the graph.
+    Primarily used in scraping workflows, this node prepares the state by directly 
+    loading raw text content from a specified source, making it available for 
+    further processing by subsequent nodes in the graph.

    Attributes:
-        node_name (str): The unique identifier name for the node.
-        node_type (str): The type of the node, defaulting to "node". This categorization
-                         helps in determining the node's role and behavior within the graph.
-                         The "node" type is used for standard operational nodes.
+      node_name (str): The unique identifier for the node.
+      node_type (str): The type of the node ("node" in this case).

    Args:
-        node_name (str): The unique identifier name for the node. This name is used to
-                         reference the node within the graph.
-        node_type (str, optional): The type of the node, limited to "node" or
-                                   "conditional_node". Defaults to "node".
+      node_name (str): The unique identifier for the node.

    Methods:
-        execute(state): Fetches the HTML content for the URL specified in the state and
-                        updates the state with this content under the 'document' key.
-                        The 'url' key must be present in the state for the operation
-                        to succeed.
+      execute(state): Copies the value found under the state's 'url' key into
+          the 'document' key as-is (no fetching is performed). Raises KeyError
+          if the 'url' key is missing from the state.
    """

    def __init__(self, node_name: str):
        """
-        Initializes the FetchHTMLNode with a node name and node type.
-        Arguments:
-            node_name (str): name of the node
+        Initializes the TextNode with a node name.
+
+        Args:
+          node_name (str): The unique name for the node.
        """
        super().__init__(node_name, "node")

    def execute(self, state: dict) -> dict:
        """
-        Add to the state the text as a document
+        Loads raw text content into the state.

        Args:
-            state (dict): The current state of the graph, expected to contain a 'url' key.
+          state (dict): The current state, expected to contain a 'url' key 
+              indicating the source of the text.

        Returns:
-            dict: The updated state with a new 'document' key containing the fetched HTML content.
+          dict: The updated state with the text content stored under the 'document' key.

        Raises:
-            KeyError: If the 'url' key is not found in the state, indicating that the
-                      necessary information to perform the operation is missing.
+          KeyError: If the 'url' key is missing from the state.
        """
        print("---LOADING TEXT CODE---")

-        state["document"] = state["url"]
+        if 'url' not in state:
+            raise KeyError("The 'url' key is required to load the text.")

+        state["document"] = state["url"]
        return state