diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index 38a5f3ea..79bc305c 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -1,5 +1,5 @@ """ -Module for defining BaseNode, an abstract base class for nodes in a graph-based workflow. +BaseNode Module """ from abc import ABC, abstractmethod diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 0b58141b..f873654d 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -1,5 +1,5 @@ """ -Module for fetching the HTML node +FetchNode Module """ from typing import List, Optional @@ -27,10 +27,6 @@ class FetchNode(BaseNode): output (List[str]): List of output keys to be updated in the state. node_config (Optional[dict]): Additional configuration for the node. node_name (str): The unique identifier name for the node, defaulting to "Fetch". - - Methods: - execute(state): Fetches the HTML content for the URL specified in the state - and updates the state with the fetched content under the specified output key. """ def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"): @@ -45,13 +41,14 @@ class FetchNode(BaseNode): update the state with this content. Args: - state (dict): The current state of the graph, expected to contain a 'url' key. + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data types from the state. Returns: - dict: The updated state with a new 'document' key containing the fetched HTML content. + dict: The updated state with a new output key containing the fetched HTML content. Raises: - KeyError: If the 'url' key is not found in the state, indicating that the + KeyError: If the input key is not found in the state, indicating that the necessary information to perform the operation is missing. 
""" if self.verbose: diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index e4047356..e9b4dd40 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,6 +1,7 @@ """ -Module for generating the answer node +GenerateAnswerNode Module """ + # Imports from standard library from typing import List from tqdm import tqdm @@ -16,57 +17,43 @@ from .base_node import BaseNode class GenerateAnswerNode(BaseNode): """ - A node that generates an answer using a language model (LLM) based on the user's input + A node that generates an answer using a large language model (LLM) based on the user's input and the content extracted from a webpage. It constructs a prompt from the user's input and the scraped content, feeds it to the LLM, and parses the LLM's response to produce an answer. Attributes: - llm: An instance of a language model client, configured for generating answers. - node_name (str): The unique identifier name for the node, defaulting - to "GenerateAnswerNode". - node_type (str): The type of the node, set to "node" indicating a - standard operational node. + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - llm: An instance of the language model client (e.g., ChatOpenAI) used - for generating answers. - node_name (str, optional): The unique identifier name for the node. - Defaults to "GenerateAnswerNode". - - Methods: - execute(state): Processes the input and document from the state to generate an answer, - updating the state with the generated answer under the 'answer' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. 
+ node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "GenerateAnswer"): - """ - Initializes the GenerateAnswerNode with a language model client and a node name. - Args: - llm: An instance of the OpenAIImageToText class. - node_name (str): name of the node - """ super().__init__(node_name, "node", input, output, 2, node_config) + self.llm_model = node_config["llm"] self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ Generates an answer by constructing a prompt from the user's input and the scraped content, querying the language model, and parsing its response. - The method updates the state with the generated answer under the 'answer' key. - Args: - state (dict): The current state of the graph, expected to contain 'user_input', - and optionally 'parsed_document' or 'relevant_chunks' within 'keys'. + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. Returns: - dict: The updated state with the 'answer' key containing the generated answer. + dict: The updated state with the output key containing the generated answer. Raises: - KeyError: If 'user_input' or 'document' is not found in the state, indicating + KeyError: If the input keys are not found in the state, indicating that the necessary information for generating an answer is missing. 
""" diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index d60ff6db..9c80fc19 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -1,6 +1,7 @@ """ -Module for generating the answer node +GenerateScraperNode Module """ + # Imports from standard library from typing import List from tqdm import tqdm @@ -16,58 +17,46 @@ from .base_node import BaseNode class GenerateScraperNode(BaseNode): """ - A node that generates an answer using a language model (LLM) based on the user's input - and the content extracted from a webpage. It constructs a prompt from the user's input - and the scraped content, feeds it to the LLM, and parses the LLM's response to produce - an answer. + Generates a python script for scraping a website using the specified library. + It takes the user's prompt and the scraped content as input and generates a python script + that extracts the information requested by the user. Attributes: - llm: An instance of a language model client, configured for generating answers. - node_name (str): The unique identifier name for the node, defaulting - to "GenerateScraperNode". - node_type (str): The type of the node, set to "node" indicating a - standard operational node. + llm_model: An instance of a language model client, configured for generating answers. + library (str): The python library to use for scraping the website. + source (str): The website to scrape. Args: - llm: An instance of the language model client (e.g., ChatOpenAI) used - for generating answers. - node_name (str, optional): The unique identifier name for the node. - Defaults to "GenerateScraperNode". + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + library (str): The python library to use for scraping the website. 
+ website (str): The website to scrape. + node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer". - Methods: - execute(state): Processes the input and document from the state to generate an answer, - updating the state with the generated answer under the 'answer' key. """ def __init__(self, input: str, output: List[str], node_config: dict, library: str, website: str, node_name: str = "GenerateAnswer"): - """ - Initializes the GenerateScraperNode with a language model client and a node name. - Args: - llm (OpenAIImageToText): An instance of the OpenAIImageToText class. - node_name (str): name of the node - """ super().__init__(node_name, "node", input, output, 2, node_config) + self.llm_model = node_config["llm"] self.library = library self.source = website - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Generates an answer by constructing a prompt from the user's input and the scraped - content, querying the language model, and parsing its response. - - The method updates the state with the generated answer under the 'answer' key. + Generates a python script for scraping a website using the specified library. Args: - state (dict): The current state of the graph, expected to contain 'user_input', - and optionally 'parsed_document' or 'relevant_chunks' within 'keys'. + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. Returns: - dict: The updated state with the 'answer' key containing the generated answer. + dict: The updated state with the output key containing the generated answer. Raises: - KeyError: If 'user_input' or 'document' is not found in the state, indicating + KeyError: If input keys are not found in the state, indicating that the necessary information for generating an answer is missing. 
""" diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index 20688143..11977c62 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -1,6 +1,7 @@ """ -Module for proobable tags +GetProbableTagsNode Module """ + from typing import List from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate @@ -15,47 +16,36 @@ class GetProbableTagsNode(BaseNode): list of probable tags. Attributes: - llm: An instance of a language model client, configured for generating tag predictions. - node_name (str): The unique identifier name for the node, - defaulting to "GetProbableTagsNode". - node_type (str): The type of the node, set to "node" indicating a standard operational node. + llm_model: An instance of the language model client used for tag predictions. Args: - llm: An instance of the language model client (e.g., ChatOpenAI) used for tag predictions. - node_name (str, optional): The unique identifier name for the node. - Defaults to "GetProbableTagsNode". - - Methods: - execute(state): Processes the user's input and the URL from the state to generate a list of - probable HTML tags, updating the state with these tags under the 'tags' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + model_config (dict): Additional configuration for the language model. + node_name (str): The unique identifier name for the node, defaulting to "GetProbableTags". """ def __init__(self, input: str, output: List[str], model_config: dict, node_name: str = "GetProbableTags"): - """ - Initializes the GetProbableTagsNode with a language model client and a node name. - Args: - llm (OpenAIImageToText): An instance of the OpenAIImageToText class. 
- node_name (str): name of the node - """ super().__init__(node_name, "node", input, output, 2, model_config) + self.llm_model = model_config["llm_model"] - def execute(self, state): + def execute(self, state: dict) -> dict: """ Generates a list of probable HTML tags based on the user's input and updates the state with this list. The method constructs a prompt for the language model, submits it, and parses the output to identify probable tags. Args: - state (dict): The current state of the graph, expected to contain 'user_input', 'url', - and optionally 'document' within 'keys'. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. Returns: - dict: The updated state with the 'tags' key containing a list of probable HTML tags. + dict: The updated state with the output key containing a list of probable HTML tags. Raises: - KeyError: If 'user_input' or 'url' is not found in the state, indicating that the + KeyError: If input keys are not found in the state, indicating that the necessary information for generating tag predictions is missing. """ diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index fff877df..d9d4f1cc 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -1,45 +1,44 @@ """ -Module for the ImageToTextNode class. +ImageToTextNode Module """ + from typing import List from .base_node import BaseNode class ImageToTextNode(BaseNode): """ - A class representing a node that processes an image and returns the text description. + Retrieve an image from a URL and convert it to text using an ImageToText model. Attributes: - llm_model (OpenAIImageToText): An instance of the OpenAIImageToText class. + llm_model: An instance of the language model client used for image-to-text conversion. + verbose (bool): A flag indicating whether to show print statements during execution. 
- Methods: - execute(state, url): Execute the node's logic and return the updated state. + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "ImageToText". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "ImageToText"): - """ - Initializes an instance of the ImageToTextNode class. - - Args: - input (str): The input for the node. - output (List[str]): The output of the node. - node_config (dict): Configuration for the model. - node_name (str): Name of the node. - """ super().__init__(node_name, "node", input, output, 1, node_config) + self.llm_model = node_config["llm_model"] self.verbose = True if node_config is None else node_config.get("verbose", False) def execute(self, state: dict) -> dict: """ - Execute the node's logic and return the updated state. + Generate text from an image using an image-to-text model. The method retrieves the image + from the URL provided in the state. Args: - state (dict): The current state of the graph. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. Returns: - dict: The updated state after executing this node. + dict: The updated state with the output key containing the text extracted from the image. 
""" if self.verbose: diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 789ce057..b552ece4 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -1,6 +1,7 @@ """ -Module for parsing the HTML node +ParseNode Module """ + from typing import List from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.document_transformers import Html2TextTransformer @@ -10,56 +11,40 @@ from .base_node import BaseNode class ParseNode(BaseNode): """ A node responsible for parsing HTML content from a document. - It uses BeautifulSoupTransformer for parsing, providing flexibility in extracting - specific parts of an HTML document. + The parsed content is split into chunks for further processing. This node enhances the scraping workflow by allowing for targeted extraction of content, thereby optimizing the processing of large HTML documents. Attributes: - node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode". - node_type (str): The type of the node, set to "node" indicating a standard operational node. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - node_name (str, optional): The unique identifier name for the node. - Defaults to "ParseHTMLNode". - - Methods: - execute(state): Parses the HTML document contained within the state using - the specified tags, if provided, and updates the state with the parsed content. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "Parse"): - """ - Initializes the ParseHTMLNode with a node name. 
- Args: - doc_type (str): type of the input document - chunks_size (int): size of the chunks to split the document - node_name (str): name of the node - node_type (str, optional): type of the node - """ super().__init__(node_name, "node", input, output, 1, node_config) self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Executes the node's logic to parse the HTML document based on specified tags. - If tags are provided in the state, the document is parsed accordingly; otherwise, - the document remains unchanged. The method updates the state with either the original - or parsed document under the 'parsed_document' key. + Executes the node's logic to parse the HTML document content and split it into chunks. Args: - state (dict): The current state of the graph, expected to contain - 'document' within 'keys', and optionally 'tags' for targeted parsing. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. Returns: - dict: The updated state with the 'parsed_document' key containing the parsed content, - if tags were provided, or the original document otherwise. + dict: The updated state with the output key containing the parsed content chunks. Raises: - KeyError: If 'document' is not found in the state, indicating that the necessary - information for parsing is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for parsing the content is missing. 
""" if self.verbose: diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index d10f50c6..e7dc233c 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -1,5 +1,5 @@ """ -Module for parsing the HTML node +RAGNode Module """ from typing import List @@ -18,46 +18,44 @@ from .base_node import BaseNode class RAGNode(BaseNode): """ A node responsible for compressing the input tokens and storing the document - in a vector database for retrieval. + in a vector database for retrieval. Relevant chunks are stored in the state. It allows scraping of big documents without exceeding the token limit of the language model. Attributes: - node_name (str): The unique identifier name for the node, defaulting to "ParseHTMLNode". - node_type (str): The type of the node, set to "node" indicating a standard operational node. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - node_name (str, optional): The unique identifier name for the node. - Defaults to "ParseHTMLNode". - - Methods: - execute(state): Parses the HTML document contained within the state using - the specified tags, if provided, and updates the state with the parsed content. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "Parse". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "RAG"): - """ - Initializes the ParseHTMLNode with a node name. 
- """ super().__init__(node_name, "node", input, output, 2, node_config) + self.llm_model = node_config["llm"] self.embedder_model = node_config.get("embedder_model", None) self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Executes the node's logic to implement RAG (Retrieval-Augmented Generation) + Executes the node's logic to implement RAG (Retrieval-Augmented Generation). The method updates the state with relevant chunks of the document. Args: - state (dict): The state containing the 'document' key with the HTML content + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data from the state. Returns: - dict: The updated state containing the 'relevant_chunks' key with the relevant chunks. + dict: The updated state with the output key containing the relevant chunks of the document. Raises: - KeyError: If 'document' is not found in the state, indicating that the necessary - information for parsing is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for compressing the content is missing. """ if self.verbose: diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 3df9603d..001de62d 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -1,6 +1,7 @@ """ -Module for checking if a website is scrapepable or not +RobotsNode Module """ + from typing import List from urllib.parse import urlparse from langchain_community.document_loaders import AsyncHtmlLoader @@ -12,75 +13,53 @@ from ..helpers import robots_dictionary class RobotsNode(BaseNode): """ - A node responsible for checking if a website is scrapepable or not. - It uses the AsyncHtmlLoader for asynchronous - document loading. + A node responsible for checking if a website is scrapeable or not based on the robots.txt file. 
+ It uses a language model to determine if the website allows scraping of the provided path. This node acts as a starting point in many scraping workflows, preparing the state with the necessary HTML content for further processing by subsequent nodes in the graph. Attributes: - This node acts as a starting point in many scraping workflows, preparing the state - with the necessary HTML content for further processing by subsequent nodes in the graph. - - Attributes: - node_name (str): The unique identifier name for the node. - node_type (str): The type of the node, defaulting to "node". This categorization - helps in determining the node's role and behavior within the graph. - The "node" type is used for standard operational nodes. + llm_model: An instance of the language model client used for checking scrapeability. + force_scraping (bool): A flag indicating whether scraping should be enforced even + if disallowed by robots.txt. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - node_name (str): The unique identifier name for the node. This name is used to - reference the node within the graph. - node_type (str, optional): The type of the node, limited to "node" or - "conditional_node". Defaults to "node". - node_config (dict): Configuration parameters for the node. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. force_scraping (bool): A flag indicating whether scraping should be enforced even - if disallowed by robots.txt. Defaults to True. - input (str): Input expression defining how to interpret the incoming data. - output (List[str]): List of output keys where the results will be stored. - - Methods: - execute(state): Fetches the HTML content for the URL specified in the state and - updates the state with this content under the 'document' key. 
- The 'url' key must be present in the state for the operation - to succeed. + if disallowed by robots.txt. Defaults to True. + node_name (str): The unique identifier name for the node, defaulting to "Robots". """ def __init__(self, input: str, output: List[str], node_config: dict, force_scraping=True, node_name: str = "Robots"): - """ - Initializes the RobotsNode with a node name, input/output expressions - and node configuration. - - Arguments: - input (str): Input expression defining how to interpret the incoming data. - output (List[str]): List of output keys where the results will be stored. - node_config (dict): Configuration parameters for the node. - force_scraping (bool): A flag indicating whether scraping should be enforced even - if disallowed by robots.txt. Defaults to True. - node_name (str, optional): The unique identifier name for the node. - Defaults to "Robots". - """ super().__init__(node_name, "node", input, output, 1) + self.llm_model = node_config["llm"] self.force_scraping = force_scraping self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Executes the node's logic to fetch HTML content from a specified URL and - update the state with this content. + Checks if a website is scrapeable based on the robots.txt file and updates the state + with the scrapeability status. The method constructs a prompt for the language model, + submits it, and parses the output to determine if scraping is allowed. Args: - state (dict): The current state of the graph, expected to contain a 'url' key. + state (dict): The current state of the graph. The input keys will be used to fetch the correct data from the state. Returns: - dict: The updated state with a new 'document' key containing the fetched HTML content. + dict: The updated state with the output key containing the scrapeability status. 
Raises: - KeyError: If the 'url' key is not found in the state, indicating that the - necessary information to perform the operation is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for checking scrapeability is missing. + KeyError: If the large language model is not found in the robots_dictionary. + ValueError: If the website is not scrapeable based on the robots.txt file and + scraping is not enforced. """ if self.verbose: diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 91dfa427..00cf9211 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -1,6 +1,7 @@ """ -Module for generating the answer node +SearchInternetNode Module """ + from typing import List from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate @@ -10,63 +11,46 @@ from .base_node import BaseNode class SearchInternetNode(BaseNode): """ - A node that generates an answer by querying a language model (LLM) based on the user's input - and the content extracted from a webpage. It constructs a prompt from the user's input - and the scraped content, feeds it to the LLM, and parses the LLM's response to produce - an answer. + A node that generates a search query based on the user's input and searches the internet + for relevant information. The node constructs a prompt for the language model, submits it, + and processes the output to generate a search query. It then uses the search query to find + relevant information on the internet and updates the state with the generated answer. Attributes: - node_name (str): The unique identifier name for the node. - node_type (str): The type of the node, set to "node" indicating a standard operational node. - input (str): The user input used to construct the prompt. 
- output (List[str]): The keys in the state dictionary - where the generated answer will be stored. - model_config (dict): Configuration parameters for the language model client. + llm_model: An instance of the language model client used for generating search queries. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - input (str): The user input used to construct the prompt. - output (List[str]): The keys in the state dictionary where the - generated answer will be stored. - model_config (dict): Configuration parameters for the language model client. - node_name (str, optional): The unique identifier name for the node. - - Methods: - execute(state): Processes the input and document from the state to generate an answer, - updating the state with the generated answer under the 'answer' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "SearchInternet". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "SearchInternet"): - """ - Initializes the SearchInternetNode with input, output, model configuration, and a node name. - Args: - input (str): The user input used to construct the prompt. - output (List[str]): The keys in the state dictionary where the - generated answer will be stored. - model_config (dict): Configuration parameters for the language model client. - node_name (str): The unique identifier name for the node. 
- """ super().__init__(node_name, "node", input, output, 1, node_config) + self.llm_model = node_config["llm"] self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ Generates an answer by constructing a prompt from the user's input and the scraped content, querying the language model, and parsing its response. - The method updates the state with the generated answer under the 'answer' key. + The method updates the state with the generated answer. Args: - state (dict): The current state of the graph, expected to contain 'user_input', - and optionally 'parsed_document' or 'relevant_chunks' within 'keys'. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. Returns: - dict: The updated state with the 'answer' key containing the generated answer. + dict: The updated state with the output key containing the generated answer. Raises: - KeyError: If 'user_input' or 'document' is not found in the state, indicating - that the necessary information for generating an answer is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for generating the answer is missing. """ if self.verbose: diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 5d7cfca9..7f766b5b 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -1,6 +1,7 @@ """ -Module for generating the answer node +SearchLinkNode Module """ + # Imports from standard library from typing import List from tqdm import tqdm @@ -18,58 +19,42 @@ from .base_node import BaseNode class SearchLinkNode(BaseNode): """ - A node that generates an answer using a language model (LLM) based on the user's input - and the content extracted from a webpage. 
It constructs a prompt from the user's input - and the scraped content, feeds it to the LLM, and parses the LLM's response to produce - an answer. + A node that looks for all the links in a web page and returns them. + It initially tries to extract the links using classical methods; if that fails, it uses the LLM to extract the links. Attributes: - llm: An instance of a language model client, configured for generating answers. - node_name (str): The unique identifier name for the node, defaulting - to "GenerateAnswerNode". - node_type (str): The type of the node, set to "node" indicating a - standard operational node. + llm_model: An instance of the language model client used for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. Args: - llm: An instance of the language model client (e.g., ChatOpenAI) used - for generating answers. - node_name (str, optional): The unique identifier name for the node. - Defaults to "GenerateAnswerNode". - - Methods: - execute(state): Processes the input and document from the state to generate an answer, - updating the state with the generated answer under the 'answer' key. + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "GenerateLinks". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "GenerateLinks"): - """ - Initializes the GenerateAnswerNode with a language model client and a node name. - Args: - llm: An instance of the OpenAIImageToText class. 
- node_name (str): name of the node - """ super().__init__(node_name, "node", input, output, 1, node_config) + self.llm_model = node_config["llm"] self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Generates an answer by constructing a prompt from the user's input and the scraped - content, querying the language model, and parsing its response. - - The method updates the state with the generated answer under the 'answer' key. + Generates a list of links by extracting them from the provided HTML content. + First, it tries to extract the links using classical methods; if that fails, it uses the LLM to extract the links. Args: - state (dict): The current state of the graph, expected to contain 'user_input', - and optionally 'parsed_document' or 'relevant_chunks' within 'keys'. + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. Returns: - dict: The updated state with the 'answer' key containing the generated answer. + dict: The updated state with the output key containing the list of links. Raises: - KeyError: If 'user_input' or 'document' is not found in the state, indicating - that the necessary information for generating an answer is missing. + KeyError: If the input keys are not found in the state, indicating that the + necessary information for extracting the links is missing. """ if self.verbose: @@ -90,7 +75,7 @@ class SearchLinkNode(BaseNode): except Exception as e: if self.verbose: - print("error on using classical methods. Using LLM for getting the links") + print("Error extracting links using classical methods.
Using LLM to extract links.") output_parser = JsonOutputParser() diff --git a/scrapegraphai/nodes/text_to_speech_node.py b/scrapegraphai/nodes/text_to_speech_node.py index 5a5c0b48..53da713a 100644 --- a/scrapegraphai/nodes/text_to_speech_node.py +++ b/scrapegraphai/nodes/text_to_speech_node.py @@ -1,39 +1,47 @@ +""" +TextToSpeechNode Module +""" -""" -Module for parsing the text to voice -""" from typing import List from .base_node import BaseNode class TextToSpeechNode(BaseNode): """ - A class representing a node that processes text and returns the voice. + Converts text to speech using the specified text-to-speech model. Attributes: - llm (OpenAITextToSpeech): An instance of the OpenAITextToSpeech class. + tts_model: An instance of the text-to-speech model client. + verbose (bool): A flag indicating whether to show print statements during execution. - Methods: - execute(state, text): Execute the node's logic and return the updated state. + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "TextToSpeech". """ def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "TextToSpeech"): - """ - Initializes an instance of the TextToSpeechNode class. - """ super().__init__(node_name, "node", input, output, 1, node_config) + self.tts_model = node_config["tts_model"] self.verbose = True if node_config is None else node_config.get("verbose", False) - def execute(self, state): + def execute(self, state: dict) -> dict: """ - Execute the node's logic and return the updated state. - Args: - state (dict): The current state of the graph. - text (str): The text to convert to speech. + Converts text to speech using the specified text-to-speech model. - :return: The updated state after executing this node. 
+ Args: + state (dict): The current state of the graph. The input keys will be used to fetch the + correct data types from the state. + + Returns: + dict: The updated state with the output key containing the audio generated from the text. + + Raises: + KeyError: If the input keys are not found in the state, indicating that the + necessary information for generating the audio is missing. """ if self.verbose: