mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
docs: base and fetch node
This commit is contained in:
parent
5ae67f5d3b
commit
e9817963c8
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for creating the basic node
|
||||
Module for defining BaseNode, an abstract base class for nodes in a graph-based workflow.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, List
|
||||
import re
|
||||
@ -8,50 +9,43 @@ import re
|
||||
|
||||
class BaseNode(ABC):
|
||||
"""
|
||||
An abstract base class for nodes in a graph-based workflow. Each node is
|
||||
intended to perform a specific action when executed as part of the graph's
|
||||
processing flow.
|
||||
An abstract base class for nodes in a graph-based workflow, designed to perform specific actions when executed.
|
||||
|
||||
Attributes:
|
||||
node_name (str): A unique identifier for the node.
|
||||
node_type (str): Specifies the node's type, which influences how the
|
||||
node interacts within the graph. Valid values are
|
||||
"node" for standard nodes and "conditional_node" for
|
||||
nodes that determine the flow based on conditions.
|
||||
node_name (str): The unique identifier name for the node.
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
min_input_len (int): Minimum required number of input keys.
|
||||
node_config (Optional[dict]): Additional configuration for the node.
|
||||
|
||||
Methods:
|
||||
execute(state): An abstract method that subclasses must implement. This
|
||||
method should contain the logic that the node executes
|
||||
when it is reached in the graph's flow. It takes the
|
||||
graph's current state as input and returns the updated
|
||||
state after execution.
|
||||
|
||||
Args:
|
||||
node_name (str): The unique identifier name for the node. This name is
|
||||
used to reference the node within the graph.
|
||||
node_type (str): The type of the node, limited to "node" or
|
||||
"conditional_node". This categorization helps in
|
||||
determining the node's role and behavior within the
|
||||
graph.
|
||||
|
||||
Raises:
|
||||
ValueError: If the provided `node_type` is not one of the allowed
|
||||
values ("node" or "conditional_node"), a ValueError is
|
||||
raised to indicate the incorrect usage.
|
||||
Example:
|
||||
>>> class MyNode(BaseNode):
|
||||
... def execute(self, state):
|
||||
... # Implementation of node logic here
|
||||
... return state
|
||||
...
|
||||
>>> my_node = MyNode("ExampleNode", "node", "input_spec", ["output_spec"])
|
||||
>>> my_node.execute({'key': 'value'})
|
||||
{'key': 'value'}
|
||||
"""
|
||||
|
||||
def __init__(self, node_name: str, node_type: str, input: str, output: List[str],
|
||||
min_input_len: int = 1, node_config: Optional[dict] = None):
|
||||
"""
|
||||
Initialize the node with a unique identifier and a specified node type.
|
||||
Initialize the instance with the node's name, type, input/output specifications, and configuration details.
|
||||
|
||||
Args:
|
||||
node_name (str): The unique identifier name for the node.
|
||||
node_type (str): The type of the node, limited to "node" or "conditional_node".
|
||||
node_name (str): Name for identifying the node.
|
||||
node_type (str): Type of the node; must be 'node' or 'conditional_node'.
|
||||
input (str): Expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
min_input_len (int, optional): Minimum required number of input keys; defaults to 1.
|
||||
node_config (Optional[dict], optional): Additional configuration for the node; defaults to None.
|
||||
|
||||
Raises:
|
||||
ValueError: If node_type is not "node" or "conditional_node".
|
||||
ValueError: If `node_type` is not one of the allowed types.
|
||||
"""
|
||||
|
||||
self.node_name = node_name
|
||||
self.input = input
|
||||
self.output = output
|
||||
@ -66,17 +60,31 @@ class BaseNode(ABC):
|
||||
@abstractmethod
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Execute the node's logic and return the updated state.
|
||||
Execute the node's logic based on the current state and update it accordingly.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph.
|
||||
:return: The updated state after executing this node.
|
||||
|
||||
Returns:
|
||||
dict: The updated state after executing the node's logic.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
def get_input_keys(self, state: dict) -> List[str]:
|
||||
"""Use the _parse_input_keys method to identify which state keys are
|
||||
needed based on the input attribute
|
||||
"""
|
||||
Determines the necessary state keys based on the input specification.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph used to parse input keys.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of input keys required for node operation.
|
||||
|
||||
Raises:
|
||||
ValueError: If an error occurs while parsing the input keys.
|
||||
"""
|
||||
|
||||
try:
|
||||
input_keys = self._parse_input_keys(state, self.input)
|
||||
self._validate_input_keys(input_keys)
|
||||
@ -86,6 +94,16 @@ class BaseNode(ABC):
|
||||
f"Error parsing input keys for {self.node_name}: {str(e)}")
|
||||
|
||||
def _validate_input_keys(self, input_keys):
|
||||
"""
|
||||
Validates if the provided input keys meet the minimum length requirement.
|
||||
|
||||
Args:
|
||||
input_keys (List[str]): The list of input keys to validate.
|
||||
|
||||
Raises:
|
||||
ValueError: If the number of input keys is less than the minimum required.
|
||||
"""
|
||||
|
||||
if len(input_keys) < self.min_input_len:
|
||||
raise ValueError(
|
||||
f"""{self.node_name} requires at least {self.min_input_len} input keys,
|
||||
@ -93,8 +111,8 @@ class BaseNode(ABC):
|
||||
|
||||
def _parse_input_keys(self, state: dict, expression: str) -> List[str]:
|
||||
"""
|
||||
Parses the input keys expression and identifies the corresponding keys
|
||||
from the state that match the expression logic.
|
||||
Parses the input keys expression to extract relevant keys from the state based on logical conditions.
|
||||
The expression can contain AND (&), OR (|), and parentheses to group conditions.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph.
|
||||
@ -102,7 +120,11 @@ class BaseNode(ABC):
|
||||
|
||||
Returns:
|
||||
List[str]: A list of key names that match the input keys expression logic.
|
||||
|
||||
Raises:
|
||||
ValueError: If the expression is invalid or if no state keys match the expression.
|
||||
"""
|
||||
|
||||
# Check for empty expression
|
||||
if not expression:
|
||||
raise ValueError("Empty expression.")
|
||||
@ -142,9 +164,12 @@ class BaseNode(ABC):
|
||||
"Missing or unbalanced parentheses in expression.")
|
||||
|
||||
# Helper function to evaluate an expression without parentheses
|
||||
def evaluate_simple_expression(exp):
|
||||
def evaluate_simple_expression(exp: str) -> List[str]:
|
||||
"""Evaluate an expression without parentheses."""
|
||||
|
||||
# Split the expression by the OR operator and process each segment
|
||||
for or_segment in exp.split('|'):
|
||||
|
||||
# Check if all elements in an AND segment are in state
|
||||
and_segment = or_segment.split('&')
|
||||
if all(elem.strip() in state for elem in and_segment):
|
||||
@ -152,13 +177,17 @@ class BaseNode(ABC):
|
||||
return []
|
||||
|
||||
# Helper function to evaluate expressions with parentheses
|
||||
def evaluate_expression(expression):
|
||||
def evaluate_expression(expression: str) -> List[str]:
|
||||
"""Evaluate an expression with parentheses."""
|
||||
|
||||
while '(' in expression:
|
||||
start = expression.rfind('(')
|
||||
end = expression.find(')', start)
|
||||
sub_exp = expression[start + 1:end]
|
||||
|
||||
# Replace the evaluated part with a placeholder and then evaluate it
|
||||
sub_result = evaluate_simple_expression(sub_exp)
|
||||
|
||||
# For simplicity in handling, join sub-results with OR to reprocess them later
|
||||
expression = expression[:start] + \
|
||||
'|'.join(sub_result) + expression[end+1:]
|
||||
|
||||
@ -12,38 +12,28 @@ from ..utils.remover import remover
|
||||
class FetchNode(BaseNode):
|
||||
"""
|
||||
A node responsible for fetching the HTML content of a specified URL and updating
|
||||
the graph's state with this content. It uses the AsyncHtmlLoader for asynchronous
|
||||
document loading.
|
||||
the graph's state with this content. It uses the AsyncChromiumLoader to fetch the
|
||||
content asynchronously.
|
||||
|
||||
This node acts as a starting point in many scraping workflows, preparing the state
|
||||
with the necessary HTML content for further processing by subsequent nodes in the graph.
|
||||
|
||||
Attributes:
|
||||
node_name (str): The unique identifier name for the node.
|
||||
node_type (str): The type of the node, defaulting to "node". This categorization
|
||||
helps in determining the node's role and behavior within the graph.
|
||||
The "node" type is used for standard operational nodes.
|
||||
|
||||
headless (bool): A flag indicating whether the browser should run in headless mode.
|
||||
verbose (bool): A flag indicating whether to print verbose output during execution.
|
||||
|
||||
Args:
|
||||
node_name (str): The unique identifier name for the node. This name is used to
|
||||
reference the node within the graph.
|
||||
node_type (str, optional): The type of the node, limited to "node" or
|
||||
"conditional_node". Defaults to "node".
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (Optional[dict]): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "Fetch".
|
||||
|
||||
Methods:
|
||||
execute(state): Fetches the HTML content for the URL specified in the state and
|
||||
updates the state with this content under the 'document' key.
|
||||
The 'url' key must be present in the state for the operation
|
||||
to succeed.
|
||||
execute(state): Fetches the HTML content for the URL specified in the state
|
||||
and updates the state with the fetched content under the specified output key.
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
|
||||
"""
|
||||
Initializes the FetchNode with a node name and node type.
|
||||
Arguments:
|
||||
node_name (str): name of the node
|
||||
prox_rotation (bool): if you want to rotate proxies
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 1)
|
||||
|
||||
self.headless = True if node_config is None else node_config.get("headless", True)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user