mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
add doc for the text node
This commit is contained in:
parent
a0c77491bf
commit
e29454e5df
@ -7,68 +7,69 @@ from .base_node import BaseNode
|
||||
|
||||
class ParseTextNode(BaseNode):
|
||||
"""
|
||||
A node responsible for parsing HTML content from a document using specified tags.
|
||||
It uses BeautifulSoupTransformer for parsing, providing flexibility in extracting
|
||||
specific parts of an HTML document based on the tags provided in the state.
|
||||
A node for extracting content from HTML documents based on provided tags.
|
||||
|
||||
This node enhances the scraping workflow by allowing for targeted extraction of
|
||||
content, thereby optimizing the processing of large HTML documents.
|
||||
This node leverages the BeautifulSoupTransformer to offer flexible parsing
|
||||
capabilities. It allows you to isolate specific elements within an HTML
|
||||
document, making it valuable for targeted content extraction in scraping workflows.
|
||||
|
||||
Attributes:
|
||||
node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode".
|
||||
node_type (str): The type of the node, set to "node" indicating a standard operational node.
|
||||
node_name (str): Unique name for the node (defaults to "ParseHTMLNode").
|
||||
node_type (str): Indicates a standard operational node (set to "node").
|
||||
|
||||
Args:
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
Defaults to "ParseHTMLNode".
|
||||
node_name (str, optional): Custom name for the node (defaults to "ParseHTMLNode").
|
||||
|
||||
Methods:
|
||||
execute(state): Parses the HTML document contained within the state using
|
||||
the specified tags, if provided, and updates the state with the parsed content.
|
||||
execute(state):
|
||||
* Extracts content from the 'document' field in the state based on tags (if provided in the state).
|
||||
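* Is intended to store the result in the 'parsed_document' field of the state (the tag-based parsing step is still a placeholder in this revision; only 'document_chunks' is written).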
|
||||
* Employs the RecursiveCharacterTextSplitter for handling larger documents.
|
||||
"""
|
||||
|
||||
def __init__(self, node_name: str):
|
||||
def __init__(self, node_name: str = "ParseHTMLNode"):
|
||||
"""
|
||||
Initializes the ParseHTMLNode with a node name.
|
||||
Initializes the ParseTextNode.
|
||||
|
||||
Args:
|
||||
node_name (str): name of the node
|
||||
node_type (str, optional): type of the node
|
||||
node_name (str, optional): Custom name for the node (defaults to "ParseHTMLNode").
|
||||
"""
|
||||
super().__init__(node_name, "node")
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state):
|
||||
"""
|
||||
Executes the node's logic to parse the HTML document based on specified tags.
|
||||
If tags are provided in the state, the document is parsed accordingly; otherwise,
|
||||
the document remains unchanged. The method updates the state with either the original
|
||||
or parsed document under the 'parsed_document' key.
|
||||
Parses HTML content and updates the state.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain
|
||||
'document' within 'keys', and optionally 'tags' for targeted parsing.
|
||||
state (dict): Expects the following keys:
|
||||
* 'document': The HTML content to parse.
|
||||
* 'tags' (optional): A list of HTML tags to target for extraction.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with the 'parsed_document' key containing the parsed content,
|
||||
if tags were provided, or the original document otherwise.
|
||||
dict: Updated state with the following:
|
||||
* 'parsed_document': The extracted content
|
||||
(or the original document if no tags were provided).
|
||||
* 'document_chunks': The original document split into chunks (using RecursiveCharacterTextSplitter)
|
||||
for larger documents.
|
||||
|
||||
Raises:
|
||||
KeyError: If 'document' is not found in the state, indicating that the necessary
|
||||
information for parsing is missing.
|
||||
KeyError: If the required 'document' key is missing from the state.
|
||||
"""
|
||||
|
||||
print("---PARSING TEXT DOCUMENT---")
|
||||
|
||||
try:
|
||||
document = state["document"]
|
||||
except KeyError as e:
|
||||
print(f"Error: {e} not found in state.")
|
||||
raise
|
||||
|
||||
# ... (Add logic for parsing with BeautifulSoup based on 'tags' if present)
|
||||
|
||||
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
||||
chunk_size=4000,
|
||||
chunk_overlap=0,
|
||||
)
|
||||
|
||||
chunks = text_splitter.split_text(document)
|
||||
state.update({"document_chunks": chunks})
|
||||
state["document_chunks"] = text_splitter.split_text(document)
|
||||
|
||||
return state
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""
|
||||
"""
|
||||
Module for TextNode
|
||||
"""
|
||||
from .base_node import BaseNode
|
||||
@ -6,54 +6,52 @@ from .base_node import BaseNode
|
||||
|
||||
class TextNode(BaseNode):
|
||||
"""
|
||||
A node for loading the text in the state
|
||||
A node for loading raw text into the state.
|
||||
|
||||
This node acts as a starting point in many scraping workflows, preparing the state
|
||||
with the necessary HTML content for further processing by subsequent nodes in the graph.
|
||||
Primarily used in scraping workflows, this node prepares the state by directly
|
||||
loading raw text content from a specified source, making it available for
|
||||
further processing by subsequent nodes in the graph.
|
||||
|
||||
Attributes:
|
||||
node_name (str): The unique identifier name for the node.
|
||||
node_type (str): The type of the node, defaulting to "node". This categorization
|
||||
helps in determining the node's role and behavior within the graph.
|
||||
The "node" type is used for standard operational nodes.
|
||||
node_name (str): The unique identifier for the node.
|
||||
node_type (str): The type of the node ("node" in this case).
|
||||
|
||||
Args:
|
||||
node_name (str): The unique identifier name for the node. This name is used to
|
||||
reference the node within the graph.
|
||||
node_type (str, optional): The type of the node, limited to "node" or
|
||||
"conditional_node". Defaults to "node".
|
||||
node_name (str): The unique identifier for the node.
|
||||
|
||||
Methods:
|
||||
execute(state): Fetches the HTML content for the URL specified in the state and
|
||||
updates the state with this content under the 'document' key.
|
||||
The 'url' key must be present in the state for the operation
|
||||
to succeed.
|
||||
execute(state): Directly loads text content into the state and stores it
|
||||
under the 'document' key. Requires the 'url' key to be present in
|
||||
the state; its value is treated as the text itself and copied verbatim (nothing is fetched).
|
||||
"""
|
||||
|
||||
def __init__(self, node_name: str):
|
||||
"""
|
||||
Initializes the FetchHTMLNode with a node name and node type.
|
||||
Arguments:
|
||||
node_name (str): name of the node
|
||||
Initializes the TextNode with a node name.
|
||||
|
||||
Args:
|
||||
node_name (str): The unique name for the node.
|
||||
"""
|
||||
super().__init__(node_name, "node")
|
||||
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Add to the state the text as a document
|
||||
Loads raw text content into the state.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain a 'url' key.
|
||||
state (dict): The current state, expected to contain a 'url' key
|
||||
holding the raw text to load (the value is used as-is, not fetched).
|
||||
|
||||
Returns:
|
||||
dict: The updated state with a new 'document' key containing the fetched HTML content.
|
||||
dict: The updated state with the text content stored under the 'document' key.
|
||||
|
||||
Raises:
|
||||
KeyError: If the 'url' key is not found in the state, indicating that the
|
||||
necessary information to perform the operation is missing.
|
||||
KeyError: If the 'url' key is missing from the state.
|
||||
"""
|
||||
print("---LOADING TEXT CODE---")
|
||||
|
||||
state["document"] = state["url"]
|
||||
if 'url' not in state:
|
||||
raise KeyError("The 'url' key is required to load the text.")
|
||||
|
||||
state["document"] = state["url"]
|
||||
return state
|
||||
|
||||
Loading…
Reference in New Issue
Block a user