diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index f00fd869..38a5f3ea 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -1,6 +1,7 @@ """ -Module for creating the basic node +Module for defining BaseNode, an abstract base class for nodes in a graph-based workflow. """ + from abc import ABC, abstractmethod from typing import Optional, List import re @@ -8,50 +9,43 @@ import re class BaseNode(ABC): """ - An abstract base class for nodes in a graph-based workflow. Each node is - intended to perform a specific action when executed as part of the graph's - processing flow. + An abstract base class for nodes in a graph-based workflow, designed to perform specific actions when executed. Attributes: - node_name (str): A unique identifier for the node. - node_type (str): Specifies the node's type, which influences how the - node interacts within the graph. Valid values are - "node" for standard nodes and "conditional_node" for - nodes that determine the flow based on conditions. + node_name (str): The unique identifier name for the node. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + min_input_len (int): Minimum required number of input keys. + node_config (Optional[dict]): Additional configuration for the node. - Methods: - execute(state): An abstract method that subclasses must implement. This - method should contain the logic that the node executes - when it is reached in the graph's flow. It takes the - graph's current state as input and returns the updated - state after execution. - - Args: - node_name (str): The unique identifier name for the node. This name is - used to reference the node within the graph. - node_type (str): The type of the node, limited to "node" or - "conditional_node". This categorization helps in - determining the node's role and behavior within the - graph. 
- - Raises: - ValueError: If the provided `node_type` is not one of the allowed - values ("node" or "conditional_node"), a ValueError is - raised to indicate the incorrect usage. + Example: + >>> class MyNode(BaseNode): + ... def execute(self, state): + ... # Implementation of node logic here + ... return state + ... + >>> my_node = MyNode("ExampleNode", "node", "input_spec", ["output_spec"]) + >>> my_node.execute({'key': 'value'}) + {'key': 'value'} """ def __init__(self, node_name: str, node_type: str, input: str, output: List[str], min_input_len: int = 1, node_config: Optional[dict] = None): """ - Initialize the node with a unique identifier and a specified node type. + Initialize the instance with the node's name, type, input/output specifications, and configuration details. Args: - node_name (str): The unique identifier name for the node. - node_type (str): The type of the node, limited to "node" or "conditional_node". + node_name (str): Name for identifying the node. + node_type (str): Type of the node; must be 'node' or 'conditional_node'. + input (str): Expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + min_input_len (int, optional): Minimum required number of input keys; defaults to 1. + node_config (Optional[dict], optional): Additional configuration for the node; defaults to None. Raises: - ValueError: If node_type is not "node" or "conditional_node". + ValueError: If `node_type` is not one of the allowed types. """ + self.node_name = node_name self.input = input self.output = output @@ -66,17 +60,31 @@ class BaseNode(ABC): @abstractmethod def execute(self, state: dict) -> dict: """ - Execute the node's logic and return the updated state. + Execute the node's logic based on the current state and update it accordingly. + Args: state (dict): The current state of the graph. - :return: The updated state after executing this node. 
+ + Returns: + dict: The updated state after executing the node's logic. """ + pass def get_input_keys(self, state: dict) -> List[str]: - """Use the _parse_input_keys method to identify which state keys are - needed based on the input attribute """ + Determines the necessary state keys based on the input specification. + + Args: + state (dict): The current state of the graph used to parse input keys. + + Returns: + List[str]: A list of input keys required for node operation. + + Raises: + ValueError: If an error occurs while parsing the input keys. + """ + try: input_keys = self._parse_input_keys(state, self.input) self._validate_input_keys(input_keys) @@ -86,6 +94,16 @@ class BaseNode(ABC): f"Error parsing input keys for {self.node_name}: {str(e)}") def _validate_input_keys(self, input_keys): + """ + Validates if the provided input keys meet the minimum length requirement. + + Args: + input_keys (List[str]): The list of input keys to validate. + + Raises: + ValueError: If the number of input keys is less than the minimum required. + """ + if len(input_keys) < self.min_input_len: raise ValueError( f"""{self.node_name} requires at least {self.min_input_len} input keys, @@ -93,8 +111,8 @@ class BaseNode(ABC): def _parse_input_keys(self, state: dict, expression: str) -> List[str]: """ - Parses the input keys expression and identifies the corresponding keys - from the state that match the expression logic. + Parses the input keys expression to extract relevant keys from the state based on logical conditions. + The expression can contain AND (&), OR (|), and parentheses to group conditions. Args: state (dict): The current state of the graph. @@ -102,7 +120,11 @@ class BaseNode(ABC): Returns: List[str]: A list of key names that match the input keys expression logic. + + Raises: + ValueError: If the expression is invalid or if no state keys match the expression. 
""" + # Check for empty expression if not expression: raise ValueError("Empty expression.") @@ -142,9 +164,12 @@ class BaseNode(ABC): "Missing or unbalanced parentheses in expression.") # Helper function to evaluate an expression without parentheses - def evaluate_simple_expression(exp): + def evaluate_simple_expression(exp: str) -> List[str]: + """Evaluate an expression without parentheses.""" + # Split the expression by the OR operator and process each segment for or_segment in exp.split('|'): + # Check if all elements in an AND segment are in state and_segment = or_segment.split('&') if all(elem.strip() in state for elem in and_segment): @@ -152,13 +177,17 @@ class BaseNode(ABC): return [] # Helper function to evaluate expressions with parentheses - def evaluate_expression(expression): + def evaluate_expression(expression: str) -> List[str]: + """Evaluate an expression with parentheses.""" + while '(' in expression: start = expression.rfind('(') end = expression.find(')', start) sub_exp = expression[start + 1:end] + # Replace the evaluated part with a placeholder and then evaluate it sub_result = evaluate_simple_expression(sub_exp) + # For simplicity in handling, join sub-results with OR to reprocess them later expression = expression[:start] + \ '|'.join(sub_result) + expression[end+1:] diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 7a34536e..0b58141b 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -12,38 +12,28 @@ from ..utils.remover import remover class FetchNode(BaseNode): """ A node responsible for fetching the HTML content of a specified URL and updating - the graph's state with this content. It uses the AsyncHtmlLoader for asynchronous - document loading. + the graph's state with this content. It uses the AsyncChromiumLoader to fetch the + content asynchronously. 
This node acts as a starting point in many scraping workflows, preparing the state with the necessary HTML content for further processing by subsequent nodes in the graph. Attributes: - node_name (str): The unique identifier name for the node. - node_type (str): The type of the node, defaulting to "node". This categorization - helps in determining the node's role and behavior within the graph. - The "node" type is used for standard operational nodes. - + headless (bool): A flag indicating whether the browser should run in headless mode. + verbose (bool): A flag indicating whether to print verbose output during execution. + Args: - node_name (str): The unique identifier name for the node. This name is used to - reference the node within the graph. - node_type (str, optional): The type of the node, limited to "node" or - "conditional_node". Defaults to "node". + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (Optional[dict]): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Fetch". Methods: - execute(state): Fetches the HTML content for the URL specified in the state and - updates the state with this content under the 'document' key. - The 'url' key must be present in the state for the operation - to succeed. + execute(state): Fetches the HTML content for the URL specified in the state + and updates the state with the fetched content under the specified output key. """ def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"): - """ - Initializes the FetchHTMLNode with a node name and node type. - Arguments: - node_name (str): name of the node - prox_rotation (bool): if you wamt to rotate proxies - """ super().__init__(node_name, "node", input, output, 1) self.headless = True if node_config is None else node_config.get("headless", True)