mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
refactor fetch node
This commit is contained in:
parent
20d7b69008
commit
3585cd81e5
@ -2,6 +2,7 @@
|
||||
__init__.py file for node folder
|
||||
"""
|
||||
from .fetch_html_node import FetchHTMLNode
|
||||
from .fetch_node import FetchNode
|
||||
from .conditional_node import ConditionalNode
|
||||
from .get_probable_tags_node import GetProbableTagsNode
|
||||
from .generate_answer_node import GenerateAnswerNode
|
||||
|
||||
@ -62,7 +62,7 @@ class BaseNode(ABC):
|
||||
self.node_type = node_type
|
||||
|
||||
@abstractmethod
|
||||
def execute(self, state: dict):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Execute the node's logic and return the updated state.
|
||||
Args:
|
||||
|
||||
78
scrapegraphai/nodes/fetch_node.py
Normal file
78
scrapegraphai/nodes/fetch_node.py
Normal file
@ -0,0 +1,78 @@
|
||||
"""
|
||||
Module defining FetchNode, the node that fetches the HTML content of a source
|
||||
"""
|
||||
|
||||
from langchain_community.document_loaders import AsyncHtmlLoader
|
||||
from langchain_core.documents import Document
|
||||
from .base_node import BaseNode
|
||||
from typing import List
|
||||
|
||||
class FetchNode(BaseNode):
    """
    A node responsible for fetching the HTML content of a source and updating
    the graph's state with this content. It uses the AsyncHtmlLoader for
    document loading.

    This node acts as a starting point in many scraping workflows, preparing the state
    with the necessary HTML content for further processing by subsequent nodes in the graph.

    Attributes:
        node_name (str): The unique identifier name for the node.
        node_type (str): The type of the node; this class always passes "node"
                         to the base class.

    Args:
        input (str): Expression naming the state key(s) this node reads
                     (for example "url | local_dir"). It is forwarded to
                     BaseNode and resolved through ``get_input_keys``.
        output (List[str]): State keys this node writes. Only the first entry
                            is used by ``execute``.
        node_name (str, optional): The unique identifier name for the node.
                                   Defaults to "FetchNode".

    Methods:
        execute(state): Loads the document for the first resolved input value
                        and stores it in the state under ``output[0]``.
    """

    def __init__(self, input: str, output: List[str], node_name: str = "FetchNode"):
        """
        Initializes the FetchNode.

        Arguments:
            input (str): expression describing which state keys to read
            output (List[str]): state keys to write; the first one receives
                                the loaded document
            node_name (str): name of the node
        """
        # NOTE(review): the trailing 1 is presumably the minimum number of
        # input keys expected by BaseNode - confirm against BaseNode.__init__.
        super().__init__(node_name, "node", input, output, 1)

    def execute(self, state: dict) -> dict:
        """
        Executes the node's logic to fetch HTML content from the source found
        in the state and update the state with this content.

        Args:
            state (dict): The current state of the graph. It must contain the
                          key(s) selected by the node's input expression.

        Returns:
            dict: The same state object, updated in place with the loaded
                  document stored under the first output key.

        Raises:
            KeyError: If a key returned by ``get_input_keys`` is missing from
                      the state.
            IndexError: If no input key is resolved, so there is no source to
                        load (assuming ``get_input_keys`` returns an empty
                        collection instead of raising - TODO confirm).
        """
        print(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        # Only the first resolved input is used as the source to load.
        loader = AsyncHtmlLoader(input_data[0])
        document = loader.load()

        # The loader's result is stored as returned, without post-processing.
        state.update({self.output[0]: document})

        return state
|
||||
@ -38,7 +38,7 @@ class ParseNode(BaseNode):
|
||||
node_name (str): name of the node
|
||||
node_type (str, optional): type of the node
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 2)
|
||||
super().__init__(node_name, "node", input, output, 1)
|
||||
|
||||
def execute(self, state):
|
||||
"""
|
||||
|
||||
@ -1,15 +1,21 @@
|
||||
from scrapegraphai.nodes import FetchHTMLNode, ParseNode, RAGNode, GenerateAnswerNode
|
||||
from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
|
||||
|
||||
state = {
|
||||
"user_prompt": None,
|
||||
"url": None,
|
||||
"doc": None,
|
||||
"user_prompt": "List me all the projects",
|
||||
"url": "https://perinim.github.io/projects/",
|
||||
}
|
||||
|
||||
fetch_node = FetchNode(
|
||||
input="url | local_dir",
|
||||
output=["doc"],
|
||||
node_name="fetch_html"
|
||||
)
|
||||
|
||||
updated_state = fetch_node.execute(state)
|
||||
parse_node = ParseNode(
|
||||
input="doc & url",
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
node_name="parse_document"
|
||||
)
|
||||
|
||||
parse_node.execute(state)
|
||||
parse_node.execute(updated_state)
|
||||
Loading…
Reference in New Issue
Block a user