diff --git a/scrapegraphai/nodes/parse_text_node.py b/scrapegraphai/nodes/parse_text_node.py index ae631e13..b94e5287 100644 --- a/scrapegraphai/nodes/parse_text_node.py +++ b/scrapegraphai/nodes/parse_text_node.py @@ -7,68 +7,69 @@ from .base_node import BaseNode class ParseTextNode(BaseNode): """ - A node responsible for parsing HTML content from a document using specified tags. - It uses BeautifulSoupTransformer for parsing, providing flexibility in extracting - specific parts of an HTML document based on the tags provided in the state. + A node for extracting content from HTML documents based on provided tags. - This node enhances the scraping workflow by allowing for targeted extraction of - content, thereby optimizing the processing of large HTML documents. + This node leverages the BeautifulSoupTransformer to offer flexible parsing + capabilities. It allows you to isolate specific elements within an HTML + document, making it valuable for targeted content extraction in scraping workflows. Attributes: - node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode". - node_type (str): The type of the node, set to "node" indicating a standard operational node. + node_name (str): Unique name for the node (defaults to "ParseHTMLNode"). + node_type (str): Indicates a standard operational node (set to "node"). Args: - node_name (str, optional): The unique identifier name for the node. - Defaults to "ParseHTMLNode". + node_name (str, optional): Custom name for the node (defaults to "ParseHTMLNode"). Methods: - execute(state): Parses the HTML document contained within the state using - the specified tags, if provided, and updates the state with the parsed content. + execute(state): + * Extracts content from the 'document' field in the state based on tags (if provided in the state). + * Stores the result in the 'parsed_document' field of the state. + * Employs the RecursiveCharacterTextSplitter for handling larger documents. """ - def __init__(self, node_name: str): + def __init__(self, node_name: str = "ParseHTMLNode"): """ - Initializes the ParseHTMLNode with a node name. + Initializes the ParseHTMLNode. + Args: - node_name (str): name of the node - node_type (str, optional): type of the node + node_name (str, optional): Custom name for the node (defaults to "ParseHTMLNode"). """ super().__init__(node_name, "node") - def execute(self, state): + def execute(self, state): """ - Executes the node's logic to parse the HTML document based on specified tags. - If tags are provided in the state, the document is parsed accordingly; otherwise, - the document remains unchanged. The method updates the state with either the original - or parsed document under the 'parsed_document' key. + Parses HTML content and updates the state. Args: - state (dict): The current state of the graph, expected to contain - 'document' within 'keys', and optionally 'tags' for targeted parsing. + state (dict): Expects the following keys: + * 'document': The HTML content to parse. + * 'tags' (optional): A list of HTML tags to target for extraction. Returns: - dict: The updated state with the 'parsed_document' key containing the parsed content, - if tags were provided, or the original document otherwise. + dict: Updated state with the following: + * 'parsed_document': The extracted content + (or the original document if no tags were provided). + * 'document_chunks': The original document split into chunks (using RecursiveCharacterTextSplitter) + for larger documents. Raises: - KeyError: If 'document' is not found in the state, indicating that the necessary - information for parsing is missing. + KeyError: If the required 'document' key is missing from the state. """ print("---PARSING TEXT DOCUMENT---") + try: document = state["document"] except KeyError as e: print(f"Error: {e} not found in state.") raise + # ... (Add logic for parsing with BeautifulSoup based on 'tags' if present) + text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( chunk_size=4000, chunk_overlap=0, ) - - chunks = text_splitter.split_text(document) - state.update({"document_chunks": chunks}) + state["document_chunks"] = text_splitter.split_text(document) return state diff --git a/scrapegraphai/nodes/text_node.py b/scrapegraphai/nodes/text_node.py index 759c1d83..fb6ee5c6 100644 --- a/scrapegraphai/nodes/text_node.py +++ b/scrapegraphai/nodes/text_node.py @@ -1,4 +1,4 @@ -""" +"""  Module for TextNode """ from .base_node import BaseNode @@ -6,54 +6,52 @@ from .base_node import BaseNode class TextNode(BaseNode): """ - A node for loading the text in the state + A node for loading raw text into the state. - This node acts as a starting point in many scraping workflows, preparing the state - with the necessary HTML content for further processing by subsequent nodes in the graph. + Primarily used in scraping workflows, this node prepares the state by directly + loading raw text content from a specified source, making it available for + further processing by subsequent nodes in the graph. Attributes: - node_name (str): The unique identifier name for the node. - node_type (str): The type of the node, defaulting to "node". This categorization - helps in determining the node's role and behavior within the graph. - The "node" type is used for standard operational nodes. + node_name (str): The unique identifier for the node. + node_type (str): The type of the node ("node" in this case). Args: - node_name (str): The unique identifier name for the node. This name is used to - reference the node within the graph. - node_type (str, optional): The type of the node, limited to "node" or - "conditional_node". Defaults to "node". + node_name (str): The unique identifier for the node. Methods: - execute(state): Fetches the HTML content for the URL specified in the state and - updates the state with this content under the 'document' key. - The 'url' key must be present in the state for the operation - to succeed. + execute(state): Directly loads text content into the state and stores it + under the 'document' key. Requires the 'url' key to be present in + the state, representing the location of the text content. """ def __init__(self, node_name: str): """ - Initializes the FetchHTMLNode with a node name and node type. - Arguments: - node_name (str): name of the node + Initializes the TextNode with a node name. + + Args: + node_name (str): The unique name for the node. """ super().__init__(node_name, "node") def execute(self, state: dict) -> dict: """ - Add to the state the text as a document + Loads raw text content into the state. Args: - state (dict): The current state of the graph, expected to contain a 'url' key. + state (dict): The current state, expected to contain a 'url' key + indicating the source of the text. Returns: - dict: The updated state with a new 'document' key containing the fetched HTML content. + dict: The updated state with the text content stored under the 'document' key. Raises: - KeyError: If the 'url' key is not found in the state, indicating that the - necessary information to perform the operation is missing. + KeyError: If the 'url' key is missing from the state. """ print("---LOADING TEXT CODE---") - state["document"] = state["url"] + if 'url' not in state: + raise KeyError("The 'url' key is required to load the text.") + state["document"] = state["url"] return state