mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
docs: refactor nodes docstrings
This commit is contained in:
parent
e9817963c8
commit
1409797475
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Module for defining BaseNode, an abstract base class for nodes in a graph-based workflow.
|
||||
BaseNode Module
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Module for fetching the HTML node
|
||||
FetchNode Module
|
||||
"""
|
||||
|
||||
from typing import List, Optional
|
||||
@ -27,10 +27,6 @@ class FetchNode(BaseNode):
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (Optional[dict]): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "Fetch".
|
||||
|
||||
Methods:
|
||||
execute(state): Fetches the HTML content for the URL specified in the state
|
||||
and updates the state with the fetched content under the specified output key.
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
|
||||
@ -45,13 +41,14 @@ class FetchNode(BaseNode):
|
||||
update the state with this content.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain a 'url' key.
|
||||
state (dict): The current state of the graph. The input keys will be used
|
||||
to fetch the correct data types from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with a new 'document' key containing the fetched HTML content.
|
||||
dict: The updated state with a new output key containing the fetched HTML content.
|
||||
|
||||
Raises:
|
||||
KeyError: If the 'url' key is not found in the state, indicating that the
|
||||
KeyError: If the input key is not found in the state, indicating that the
|
||||
necessary information to perform the operation is missing.
|
||||
"""
|
||||
if self.verbose:
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for generating the answer node
|
||||
GenerateAnswerNode Module
|
||||
"""
|
||||
|
||||
# Imports from standard library
|
||||
from typing import List
|
||||
from tqdm import tqdm
|
||||
@ -16,57 +17,43 @@ from .base_node import BaseNode
|
||||
|
||||
class GenerateAnswerNode(BaseNode):
|
||||
"""
|
||||
A node that generates an answer using a language model (LLM) based on the user's input
|
||||
A node that generates an answer using a large language model (LLM) based on the user's input
|
||||
and the content extracted from a webpage. It constructs a prompt from the user's input
|
||||
and the scraped content, feeds it to the LLM, and parses the LLM's response to produce
|
||||
an answer.
|
||||
|
||||
Attributes:
|
||||
llm: An instance of a language model client, configured for generating answers.
|
||||
node_name (str): The unique identifier name for the node, defaulting
|
||||
to "GenerateAnswerNode".
|
||||
node_type (str): The type of the node, set to "node" indicating a
|
||||
standard operational node.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Args:
|
||||
llm: An instance of the language model client (e.g., ChatOpenAI) used
|
||||
for generating answers.
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
Defaults to "GenerateAnswerNode".
|
||||
|
||||
Methods:
|
||||
execute(state): Processes the input and document from the state to generate an answer,
|
||||
updating the state with the generated answer under the 'answer' key.
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: dict,
|
||||
node_name: str = "GenerateAnswer"):
|
||||
"""
|
||||
Initializes the GenerateAnswerNode with a language model client and a node name.
|
||||
Args:
|
||||
llm: An instance of the OpenAIImageToText class.
|
||||
node_name (str): name of the node
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 2, node_config)
|
||||
|
||||
self.llm_model = node_config["llm"]
|
||||
self.verbose = True if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Generates an answer by constructing a prompt from the user's input and the scraped
|
||||
content, querying the language model, and parsing its response.
|
||||
|
||||
The method updates the state with the generated answer under the 'answer' key.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain 'user_input',
|
||||
and optionally 'parsed_document' or 'relevant_chunks' within 'keys'.
|
||||
state (dict): The current state of the graph. The input keys will be used
|
||||
to fetch the correct data from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with the 'answer' key containing the generated answer.
|
||||
dict: The updated state with the output key containing the generated answer.
|
||||
|
||||
Raises:
|
||||
KeyError: If 'user_input' or 'document' is not found in the state, indicating
|
||||
KeyError: If the input keys are not found in the state, indicating
|
||||
that the necessary information for generating an answer is missing.
|
||||
"""
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for generating the answer node
|
||||
GenerateScraperNode Module
|
||||
"""
|
||||
|
||||
# Imports from standard library
|
||||
from typing import List
|
||||
from tqdm import tqdm
|
||||
@ -16,58 +17,46 @@ from .base_node import BaseNode
|
||||
|
||||
class GenerateScraperNode(BaseNode):
|
||||
"""
|
||||
A node that generates an answer using a language model (LLM) based on the user's input
|
||||
and the content extracted from a webpage. It constructs a prompt from the user's input
|
||||
and the scraped content, feeds it to the LLM, and parses the LLM's response to produce
|
||||
an answer.
|
||||
Generates a python script for scraping a website using the specified library.
|
||||
It takes the user's prompt and the scraped content as input and generates a python script
|
||||
that extracts the information requested by the user.
|
||||
|
||||
Attributes:
|
||||
llm: An instance of a language model client, configured for generating answers.
|
||||
node_name (str): The unique identifier name for the node, defaulting
|
||||
to "GenerateScraperNode".
|
||||
node_type (str): The type of the node, set to "node" indicating a
|
||||
standard operational node.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
library (str): The python library to use for scraping the website.
|
||||
source (str): The website to scrape.
|
||||
|
||||
Args:
|
||||
llm: An instance of the language model client (e.g., ChatOpenAI) used
|
||||
for generating answers.
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
Defaults to "GenerateScraperNode".
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
library (str): The python library to use for scraping the website.
|
||||
website (str): The website to scrape.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".
|
||||
|
||||
Methods:
|
||||
execute(state): Processes the input and document from the state to generate an answer,
|
||||
updating the state with the generated answer under the 'answer' key.
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: dict,
|
||||
library: str, website: str, node_name: str = "GenerateAnswer"):
|
||||
"""
|
||||
Initializes the GenerateScraperNode with a language model client and a node name.
|
||||
Args:
|
||||
llm (OpenAIImageToText): An instance of the OpenAIImageToText class.
|
||||
node_name (str): name of the node
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 2, node_config)
|
||||
|
||||
self.llm_model = node_config["llm"]
|
||||
self.library = library
|
||||
self.source = website
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Generates an answer by constructing a prompt from the user's input and the scraped
|
||||
content, querying the language model, and parsing its response.
|
||||
|
||||
The method updates the state with the generated answer under the 'answer' key.
|
||||
Generates a python script for scraping a website using the specified library.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain 'user_input',
|
||||
and optionally 'parsed_document' or 'relevant_chunks' within 'keys'.
|
||||
state (dict): The current state of the graph. The input keys will be used
|
||||
to fetch the correct data from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with the 'answer' key containing the generated answer.
|
||||
dict: The updated state with the output key containing the generated answer.
|
||||
|
||||
Raises:
|
||||
KeyError: If 'user_input' or 'document' is not found in the state, indicating
|
||||
KeyError: If input keys are not found in the state, indicating
|
||||
that the necessary information for generating an answer is missing.
|
||||
"""
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for probable tags
|
||||
GetProbableTagsNode Module
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from langchain.output_parsers import CommaSeparatedListOutputParser
|
||||
from langchain.prompts import PromptTemplate
|
||||
@ -15,47 +16,36 @@ class GetProbableTagsNode(BaseNode):
|
||||
list of probable tags.
|
||||
|
||||
Attributes:
|
||||
llm: An instance of a language model client, configured for generating tag predictions.
|
||||
node_name (str): The unique identifier name for the node,
|
||||
defaulting to "GetProbableTagsNode".
|
||||
node_type (str): The type of the node, set to "node" indicating a standard operational node.
|
||||
llm_model: An instance of the language model client used for tag predictions.
|
||||
|
||||
Args:
|
||||
llm: An instance of the language model client (e.g., ChatOpenAI) used for tag predictions.
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
Defaults to "GetProbableTagsNode".
|
||||
|
||||
Methods:
|
||||
execute(state): Processes the user's input and the URL from the state to generate a list of
|
||||
probable HTML tags, updating the state with these tags under the 'tags' key.
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
model_config (dict): Additional configuration for the language model.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "GetProbableTags".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], model_config: dict,
|
||||
node_name: str = "GetProbableTags"):
|
||||
"""
|
||||
Initializes the GetProbableTagsNode with a language model client and a node name.
|
||||
Args:
|
||||
llm (OpenAIImageToText): An instance of the OpenAIImageToText class.
|
||||
node_name (str): name of the node
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 2, model_config)
|
||||
|
||||
self.llm_model = model_config["llm_model"]
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Generates a list of probable HTML tags based on the user's input and updates the state
|
||||
with this list. The method constructs a prompt for the language model, submits it, and
|
||||
parses the output to identify probable tags.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain 'user_input', 'url',
|
||||
and optionally 'document' within 'keys'.
|
||||
state (dict): The current state of the graph. The input keys will be used to fetch the
|
||||
correct data types from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with the 'tags' key containing a list of probable HTML tags.
|
||||
dict: The updated state with the output key containing a list of probable HTML tags.
|
||||
|
||||
Raises:
|
||||
KeyError: If 'user_input' or 'url' is not found in the state, indicating that the
|
||||
KeyError: If input keys are not found in the state, indicating that the
|
||||
necessary information for generating tag predictions is missing.
|
||||
"""
|
||||
|
||||
|
||||
@ -1,45 +1,44 @@
|
||||
"""
|
||||
Module for the ImageToTextNode class.
|
||||
ImageToTextNode Module
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from .base_node import BaseNode
|
||||
|
||||
|
||||
class ImageToTextNode(BaseNode):
|
||||
"""
|
||||
A class representing a node that processes an image and returns the text description.
|
||||
Retrieve an image from a URL and convert it to text using an ImageToText model.
|
||||
|
||||
Attributes:
|
||||
llm_model (OpenAIImageToText): An instance of the OpenAIImageToText class.
|
||||
llm_model: An instance of the language model client used for image-to-text conversion.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Methods:
|
||||
execute(state, url): Execute the node's logic and return the updated state.
|
||||
Args:
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "ImageToText".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: dict,
|
||||
node_name: str = "ImageToText"):
|
||||
"""
|
||||
Initializes an instance of the ImageToTextNode class.
|
||||
|
||||
Args:
|
||||
input (str): The input for the node.
|
||||
output (List[str]): The output of the node.
|
||||
node_config (dict): Configuration for the model.
|
||||
node_name (str): Name of the node.
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 1, node_config)
|
||||
|
||||
self.llm_model = node_config["llm_model"]
|
||||
self.verbose = True if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Execute the node's logic and return the updated state.
|
||||
Generate text from an image using an image-to-text model. The method retrieves the image
|
||||
from the URL provided in the state.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph.
|
||||
state (dict): The current state of the graph. The input keys will be used to fetch the
|
||||
correct data types from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state after executing this node.
|
||||
dict: The updated state with the output key containing the text extracted from the image.
|
||||
"""
|
||||
|
||||
if self.verbose:
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for parsing the HTML node
|
||||
ParseNode Module
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_community.document_transformers import Html2TextTransformer
|
||||
@ -10,56 +11,40 @@ from .base_node import BaseNode
|
||||
class ParseNode(BaseNode):
|
||||
"""
|
||||
A node responsible for parsing HTML content from a document.
|
||||
It uses BeautifulSoupTransformer for parsing, providing flexibility in extracting
|
||||
specific parts of an HTML document.
|
||||
The parsed content is split into chunks for further processing.
|
||||
|
||||
This node enhances the scraping workflow by allowing for targeted extraction of
|
||||
content, thereby optimizing the processing of large HTML documents.
|
||||
|
||||
Attributes:
|
||||
node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode".
|
||||
node_type (str): The type of the node, set to "node" indicating a standard operational node.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Args:
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
Defaults to "ParseHTMLNode".
|
||||
|
||||
Methods:
|
||||
execute(state): Parses the HTML document contained within the state using
|
||||
the specified tags, if provided, and updates the state with the parsed content.
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "Parse".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "Parse"):
|
||||
"""
|
||||
Initializes the ParseNode with a node name.
|
||||
Args:
|
||||
doc_type (str): type of the input document
|
||||
chunks_size (int): size of the chunks to split the document
|
||||
node_name (str): name of the node
|
||||
node_type (str, optional): type of the node
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 1, node_config)
|
||||
|
||||
self.verbose = True if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Executes the node's logic to parse the HTML document based on specified tags.
|
||||
If tags are provided in the state, the document is parsed accordingly; otherwise,
|
||||
the document remains unchanged. The method updates the state with either the original
|
||||
or parsed document under the 'parsed_document' key.
|
||||
Executes the node's logic to parse the HTML document content and split it into chunks.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain
|
||||
'document' within 'keys', and optionally 'tags' for targeted parsing.
|
||||
state (dict): The current state of the graph. The input keys will be used to fetch the
|
||||
correct data from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with the 'parsed_document' key containing the parsed content,
|
||||
if tags were provided, or the original document otherwise.
|
||||
dict: The updated state with the output key containing the parsed content chunks.
|
||||
|
||||
Raises:
|
||||
KeyError: If 'document' is not found in the state, indicating that the necessary
|
||||
information for parsing is missing.
|
||||
KeyError: If the input keys are not found in the state, indicating that the
|
||||
necessary information for parsing the content is missing.
|
||||
"""
|
||||
|
||||
if self.verbose:
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Module for parsing the HTML node
|
||||
RAGNode Module
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
@ -18,46 +18,44 @@ from .base_node import BaseNode
|
||||
class RAGNode(BaseNode):
|
||||
"""
|
||||
A node responsible for compressing the input tokens and storing the document
|
||||
in a vector database for retrieval.
|
||||
in a vector database for retrieval. Relevant chunks are stored in the state.
|
||||
|
||||
It allows scraping of big documents without exceeding the token limit of the language model.
|
||||
|
||||
Attributes:
|
||||
node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode".
|
||||
node_type (str): The type of the node, set to "node" indicating a standard operational node.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client, configured for generating embeddings.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Args:
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
Defaults to "ParseHTMLNode".
|
||||
|
||||
Methods:
|
||||
execute(state): Parses the HTML document contained within the state using
|
||||
the specified tags, if provided, and updates the state with the parsed content.
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "RAG".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "RAG"):
|
||||
"""
|
||||
Initializes the RAGNode with a node name.
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 2, node_config)
|
||||
|
||||
self.llm_model = node_config["llm"]
|
||||
self.embedder_model = node_config.get("embedder_model", None)
|
||||
self.verbose = True if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Executes the node's logic to implement RAG (Retrieval-Augmented Generation)
|
||||
Executes the node's logic to implement RAG (Retrieval-Augmented Generation).
|
||||
The method updates the state with relevant chunks of the document.
|
||||
|
||||
Args:
|
||||
state (dict): The state containing the 'document' key with the HTML content
|
||||
state (dict): The current state of the graph. The input keys will be used to fetch the
|
||||
correct data from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state containing the 'relevant_chunks' key with the relevant chunks.
|
||||
dict: The updated state with the output key containing the relevant chunks of the document.
|
||||
|
||||
Raises:
|
||||
KeyError: If 'document' is not found in the state, indicating that the necessary
|
||||
information for parsing is missing.
|
||||
KeyError: If the input keys are not found in the state, indicating that the
|
||||
necessary information for compressing the content is missing.
|
||||
"""
|
||||
|
||||
if self.verbose:
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for checking if a website is scrapeable or not
|
||||
RobotsNode Module
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from urllib.parse import urlparse
|
||||
from langchain_community.document_loaders import AsyncHtmlLoader
|
||||
@ -12,75 +13,53 @@ from ..helpers import robots_dictionary
|
||||
|
||||
class RobotsNode(BaseNode):
|
||||
"""
|
||||
A node responsible for checking if a website is scrapeable or not.
|
||||
It uses the AsyncHtmlLoader for asynchronous
|
||||
document loading.
|
||||
A node responsible for checking if a website is scrapeable or not based on the robots.txt file.
|
||||
It uses a language model to determine if the website allows scraping of the provided path.
|
||||
|
||||
This node acts as a starting point in many scraping workflows, preparing the state
|
||||
with the necessary HTML content for further processing by subsequent nodes in the graph.
|
||||
|
||||
Attributes:
|
||||
This node acts as a starting point in many scraping workflows, preparing the state
|
||||
with the necessary HTML content for further processing by subsequent nodes in the graph.
|
||||
|
||||
Attributes:
|
||||
node_name (str): The unique identifier name for the node.
|
||||
node_type (str): The type of the node, defaulting to "node". This categorization
|
||||
helps in determining the node's role and behavior within the graph.
|
||||
The "node" type is used for standard operational nodes.
|
||||
llm_model: An instance of the language model client used for checking scrapeability.
|
||||
force_scraping (bool): A flag indicating whether scraping should be enforced even
|
||||
if disallowed by robots.txt.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Args:
|
||||
node_name (str): The unique identifier name for the node. This name is used to
|
||||
reference the node within the graph.
|
||||
node_type (str, optional): The type of the node, limited to "node" or
|
||||
"conditional_node". Defaults to "node".
|
||||
node_config (dict): Configuration parameters for the node.
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
force_scraping (bool): A flag indicating whether scraping should be enforced even
|
||||
if disallowed by robots.txt. Defaults to True.
|
||||
input (str): Input expression defining how to interpret the incoming data.
|
||||
output (List[str]): List of output keys where the results will be stored.
|
||||
|
||||
Methods:
|
||||
execute(state): Fetches the HTML content for the URL specified in the state and
|
||||
updates the state with this content under the 'document' key.
|
||||
The 'url' key must be present in the state for the operation
|
||||
to succeed.
|
||||
if disallowed by robots.txt. Defaults to True.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "Robots".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: dict, force_scraping=True,
|
||||
node_name: str = "Robots"):
|
||||
"""
|
||||
Initializes the RobotsNode with a node name, input/output expressions
|
||||
and node configuration.
|
||||
|
||||
Arguments:
|
||||
input (str): Input expression defining how to interpret the incoming data.
|
||||
output (List[str]): List of output keys where the results will be stored.
|
||||
node_config (dict): Configuration parameters for the node.
|
||||
force_scraping (bool): A flag indicating whether scraping should be enforced even
|
||||
if disallowed by robots.txt. Defaults to True.
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
Defaults to "Robots".
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 1)
|
||||
|
||||
self.llm_model = node_config["llm"]
|
||||
self.force_scraping = force_scraping
|
||||
self.verbose = True if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Executes the node's logic to fetch HTML content from a specified URL and
|
||||
update the state with this content.
|
||||
Checks if a website is scrapeable based on the robots.txt file and updates the state
|
||||
with the scrapeability status. The method constructs a prompt for the language model,
|
||||
submits it, and parses the output to determine if scraping is allowed.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain a 'url' key.
|
||||
state (dict): The current state of the graph. The input keys will be used to fetch the correct data from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with a new 'document' key containing the fetched HTML content.
|
||||
dict: The updated state with the output key containing the scrapeability status.
|
||||
|
||||
Raises:
|
||||
KeyError: If the 'url' key is not found in the state, indicating that the
|
||||
necessary information to perform the operation is missing.
|
||||
KeyError: If the input keys are not found in the state, indicating that the
|
||||
necessary information for checking scrapeability is missing.
|
||||
KeyError: If the large language model is not found in the robots_dictionary.
|
||||
ValueError: If the website is not scrapeable based on the robots.txt file and
|
||||
scraping is not enforced.
|
||||
"""
|
||||
|
||||
if self.verbose:
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for generating the answer node
|
||||
SearchInternetNode Module
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from langchain.output_parsers import CommaSeparatedListOutputParser
|
||||
from langchain.prompts import PromptTemplate
|
||||
@ -10,63 +11,46 @@ from .base_node import BaseNode
|
||||
|
||||
class SearchInternetNode(BaseNode):
|
||||
"""
|
||||
A node that generates an answer by querying a language model (LLM) based on the user's input
|
||||
and the content extracted from a webpage. It constructs a prompt from the user's input
|
||||
and the scraped content, feeds it to the LLM, and parses the LLM's response to produce
|
||||
an answer.
|
||||
A node that generates a search query based on the user's input and searches the internet
|
||||
for relevant information. The node constructs a prompt for the language model, submits it,
|
||||
and processes the output to generate a search query. It then uses the search query to find
|
||||
relevant information on the internet and updates the state with the generated answer.
|
||||
|
||||
Attributes:
|
||||
node_name (str): The unique identifier name for the node.
|
||||
node_type (str): The type of the node, set to "node" indicating a standard operational node.
|
||||
input (str): The user input used to construct the prompt.
|
||||
output (List[str]): The keys in the state dictionary
|
||||
where the generated answer will be stored.
|
||||
model_config (dict): Configuration parameters for the language model client.
|
||||
llm_model: An instance of the language model client used for generating search queries.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Args:
|
||||
input (str): The user input used to construct the prompt.
|
||||
output (List[str]): The keys in the state dictionary where the
|
||||
generated answer will be stored.
|
||||
model_config (dict): Configuration parameters for the language model client.
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
|
||||
Methods:
|
||||
execute(state): Processes the input and document from the state to generate an answer,
|
||||
updating the state with the generated answer under the 'answer' key.
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "SearchInternet".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: dict,
|
||||
node_name: str = "SearchInternet"):
|
||||
"""
|
||||
Initializes the SearchInternetNode with input, output, model configuration, and a node name.
|
||||
Args:
|
||||
input (str): The user input used to construct the prompt.
|
||||
output (List[str]): The keys in the state dictionary where the
|
||||
generated answer will be stored.
|
||||
model_config (dict): Configuration parameters for the language model client.
|
||||
node_name (str): The unique identifier name for the node.
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 1, node_config)
|
||||
|
||||
self.llm_model = node_config["llm"]
|
||||
self.verbose = True if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Generates an answer by constructing a prompt from the user's input and the scraped
|
||||
content, querying the language model, and parsing its response.
|
||||
|
||||
The method updates the state with the generated answer under the 'answer' key.
|
||||
The method updates the state with the generated answer.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain 'user_input',
|
||||
and optionally 'parsed_document' or 'relevant_chunks' within 'keys'.
|
||||
state (dict): The current state of the graph. The input keys will be used to fetch the
|
||||
correct data types from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with the 'answer' key containing the generated answer.
|
||||
dict: The updated state with the output key containing the generated answer.
|
||||
|
||||
Raises:
|
||||
KeyError: If 'user_input' or 'document' is not found in the state, indicating
|
||||
that the necessary information for generating an answer is missing.
|
||||
KeyError: If the input keys are not found in the state, indicating that the
|
||||
necessary information for generating the answer is missing.
|
||||
"""
|
||||
|
||||
if self.verbose:
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for generating the answer node
|
||||
SearchLinkNode Module
|
||||
"""
|
||||
|
||||
# Imports from standard library
|
||||
from typing import List
|
||||
from tqdm import tqdm
|
||||
@ -18,58 +19,42 @@ from .base_node import BaseNode
|
||||
|
||||
class SearchLinkNode(BaseNode):
|
||||
"""
|
||||
A node that generates an answer using a language model (LLM) based on the user's input
|
||||
and the content extracted from a webpage. It constructs a prompt from the user's input
|
||||
and the scraped content, feeds it to the LLM, and parses the LLM's response to produce
|
||||
an answer.
|
||||
A node that looks for all the links in a web page and returns them.
|
||||
It initially tries to extract the links using classical methods; if that fails, it uses the LLM to extract the links.
|
||||
|
||||
Attributes:
|
||||
llm: An instance of a language model client, configured for generating answers.
|
||||
node_name (str): The unique identifier name for the node, defaulting
|
||||
to "GenerateAnswerNode".
|
||||
node_type (str): The type of the node, set to "node" indicating a
|
||||
standard operational node.
|
||||
llm_model: An instance of the language model client used to extract the links when classical methods fail.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Args:
|
||||
llm: An instance of the language model client (e.g., ChatOpenAI) used
|
||||
for generating answers.
|
||||
node_name (str, optional): The unique identifier name for the node.
|
||||
Defaults to "GenerateAnswerNode".
|
||||
|
||||
Methods:
|
||||
execute(state): Processes the input and document from the state to generate an answer,
|
||||
updating the state with the generated answer under the 'answer' key.
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "GenerateLinks".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str], node_config: dict,
|
||||
node_name: str = "GenerateLinks"):
|
||||
"""
|
||||
Initializes the SearchLinkNode with input, output, node configuration, and a node name.
|
||||
Args:
|
||||
node_config (dict): Additional configuration for the node; the language model client is read from its "llm" key.
|
||||
node_name (str): name of the node
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 1, node_config)
|
||||
|
||||
self.llm_model = node_config["llm"]
|
||||
self.verbose = True if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Generates an answer by constructing a prompt from the user's input and the scraped
|
||||
content, querying the language model, and parsing its response.
|
||||
|
||||
The method updates the state with the generated answer under the 'answer' key.
|
||||
Generates a list of links by extracting them from the provided HTML content.
|
||||
First, it tries to extract the links using classical methods; if that fails, it uses the LLM to extract the links.
|
||||
|
||||
Args:
|
||||
state (dict): The current state of the graph, expected to contain 'user_input',
|
||||
and optionally 'parsed_document' or 'relevant_chunks' within 'keys'.
|
||||
state (dict): The current state of the graph. The input keys will be used to fetch the
|
||||
correct data types from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with the 'answer' key containing the generated answer.
|
||||
dict: The updated state with the output key containing the list of links.
|
||||
|
||||
Raises:
|
||||
KeyError: If 'user_input' or 'document' is not found in the state, indicating
|
||||
that the necessary information for generating an answer is missing.
|
||||
KeyError: If the input keys are not found in the state, indicating that the
|
||||
necessary information for extracting the links is missing.
|
||||
"""
|
||||
|
||||
if self.verbose:
|
||||
@ -90,7 +75,7 @@ class SearchLinkNode(BaseNode):
|
||||
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print("error on using classical methods. Using LLM for getting the links")
|
||||
print("Error extracting links using classical methods. Using LLM to extract links.")
|
||||
|
||||
output_parser = JsonOutputParser()
|
||||
|
||||
|
||||
@ -1,39 +1,47 @@
|
||||
"""
|
||||
TextToSpeechNode Module
|
||||
"""
|
||||
|
||||
"""
|
||||
Module for parsing the text to voice
|
||||
"""
|
||||
from typing import List
|
||||
from .base_node import BaseNode
|
||||
|
||||
|
||||
class TextToSpeechNode(BaseNode):
|
||||
"""
|
||||
A class representing a node that processes text and returns the voice.
|
||||
Converts text to speech using the specified text-to-speech model.
|
||||
|
||||
Attributes:
|
||||
llm (OpenAITextToSpeech): An instance of the OpenAITextToSpeech class.
|
||||
tts_model: An instance of the text-to-speech model client.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Methods:
|
||||
execute(state, text): Execute the node's logic and return the updated state.
|
||||
Args:
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "TextToSpeech".
|
||||
"""
|
||||
|
||||
def __init__(self, input: str, output: List[str],
|
||||
node_config: dict, node_name: str = "TextToSpeech"):
|
||||
"""
|
||||
Initializes an instance of the TextToSpeechNode class.
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 1, node_config)
|
||||
|
||||
self.tts_model = node_config["tts_model"]
|
||||
self.verbose = True if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
def execute(self, state):
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Execute the node's logic and return the updated state.
|
||||
Args:
|
||||
state (dict): The current state of the graph.
|
||||
text (str): The text to convert to speech.
|
||||
Converts text to speech using the specified text-to-speech model.
|
||||
|
||||
:return: The updated state after executing this node.
|
||||
Args:
|
||||
state (dict): The current state of the graph. The input keys will be used to fetch the
|
||||
correct data types from the state.
|
||||
|
||||
Returns:
|
||||
dict: The updated state with the output key containing the audio generated from the text.
|
||||
|
||||
Raises:
|
||||
KeyError: If the input keys are not found in the state, indicating that the
|
||||
necessary information for generating the audio is missing.
|
||||
"""
|
||||
|
||||
if self.verbose:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user