diff --git a/scrapegraphai/builders/graph_builder.py b/scrapegraphai/builders/graph_builder.py index 8001bc3b..7280c50b 100644 --- a/scrapegraphai/builders/graph_builder.py +++ b/scrapegraphai/builders/graph_builder.py @@ -1,5 +1,5 @@ """ -Module for making the graph building +GraphBuilder Module """ from langchain_core.prompts import ChatPromptTemplate diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index d943a4dc..32762c0a 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -1,6 +1,7 @@ """ __init__.py file for graphs folder """ + from .base_graph import BaseGraph from .smart_scraper_graph import SmartScraperGraph from .speech_graph import SpeechGraph diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 5adf8ba6..0632742f 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -1,6 +1,7 @@ """ -Module having abstract class for creating all the graphs +AbstractGraph Module """ + from abc import ABC, abstractmethod from typing import Optional from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq @@ -9,13 +10,34 @@ from ..helpers import models_tokens class AbstractGraph(ABC): """ - Abstract class representing a generic graph-based tool. + Scaffolding class for creating a graph representation and executing it. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + prompt (str): The prompt for the graph. 
+ config (dict): Configuration parameters for the graph. + source (str, optional): The source of the graph. + + Example: + >>> class MyGraph(AbstractGraph): + ... def _create_graph(self): + ... # Implementation of graph creation here + ... return graph + ... + >>> my_graph = MyGraph("Example Graph", {"llm": {"model": "gpt-3.5-turbo"}}, "example_source") + >>> result = my_graph.run() """ def __init__(self, prompt: str, config: dict, source: Optional[str] = None): - """ - Initializes the AbstractGraph with a prompt, file source, and configuration. - """ + self.prompt = prompt self.source = source self.config = config @@ -32,10 +54,20 @@ class AbstractGraph(ABC): self.final_state = None self.execution_info = None - def _create_llm(self, llm_config: dict): + def _create_llm(self, llm_config: dict) -> object: """ - Creates an instance of the language model (OpenAI or Gemini) based on configuration. + Create a large language model instance based on the configuration provided. + + Args: + llm_config (dict): Configuration parameters for the language model. + + Returns: + object: An instance of the language model client. + + Raises: + KeyError: If the model is not supported. """ + llm_defaults = { "temperature": 0, "streaming": False @@ -104,8 +136,15 @@ class AbstractGraph(ABC): def get_state(self, key=None) -> dict: """"" - Obtain the current state + Get the final state of the graph. + + Args: + key (str, optional): The key of the final state to retrieve. + + Returns: + dict: The final state of the graph. """ + if key is not None: return self.final_state[key] return self.final_state @@ -113,7 +152,11 @@ class AbstractGraph(ABC): def get_execution_info(self): """ Returns the execution information of the graph. + + Returns: + dict: The execution information of the graph. 
""" + return self.execution_info @abstractmethod diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 855085ca..5d6c55a1 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the base graphs - """ +BaseGraph Module +""" + import time import warnings from langchain_community.callbacks import get_openai_callback @@ -16,21 +17,33 @@ class BaseGraph: key-value pair corresponds to the from-node and to-node relationship. entry_point (str): The name of the entry point node from which the graph execution begins. - Methods: - execute(initial_state): Executes the graph's nodes starting from the entry point and - traverses the graph based on the provided initial state. - Args: nodes (iterable): An iterable of node instances that will be part of the graph. edges (iterable): An iterable of tuples where each tuple represents a directed edge in the graph, defined by a pair of nodes (from_node, to_node). entry_point (BaseNode): The node instance that represents the entry point of the graph. + + Raises: + Warning: If the entry point node is not the first node in the list. + + Example: + >>> BaseGraph( + ... nodes=[ + ... fetch_node, + ... parse_node, + ... rag_node, + ... generate_answer_node, + ... ], + ... edges=[ + ... (fetch_node, parse_node), + ... (parse_node, rag_node), + ... (rag_node, generate_answer_node) + ... ], + ... entry_point=fetch_node + ... ) """ def __init__(self, nodes: list, edges: list, entry_point: str): - """ - Initializes the graph with nodes, edges, and the entry point. - """ self.nodes = nodes self.edges = self._create_edges({e for e in edges}) @@ -51,6 +64,7 @@ class BaseGraph: Returns: dict: A dictionary of edges with the from-node as keys and to-node as values. 
""" + edge_dict = {} for from_node, to_node in edges: edge_dict[from_node.node_name] = to_node.node_name @@ -66,8 +80,10 @@ class BaseGraph: initial_state (dict): The initial state to pass to the entry point node. Returns: - dict: The state after execution has completed, which may have been altered by the nodes. + Tuple[dict, list]: A tuple containing the final state of the execution and a list + of execution information for each node. """ + current_node_name = self.nodes[0] state = initial_state diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 851ba8de..0e231a5c 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the smart scraper +JSONScraperGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( FetchNode, @@ -13,22 +14,44 @@ from .abstract_graph import AbstractGraph class JSONScraperGraph(AbstractGraph): """ - SmartScraper is a comprehensive web scraping tool that automates the process of extracting - information from web pages using a natural language model to interpret and answer prompts. + JSONScraperGraph defines a scraping pipeline for JSON files. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + + Example: + >>> json_scraper = JSONScraperGraph( + ... 
"List me all the attractions in Chioggia.", + ... "data/chioggia.json", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = json_scraper.run() """ def __init__(self, prompt: str, source: str, config: dict): - """ - Initializes the JsonScraperGraph with a prompt, source, and configuration. - """ super().__init__(prompt, config, source) self.input_key = "json" if source.endswith("json") else "json_dir" - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. """ + fetch_node = FetchNode( input="json_dir", output=["doc"], @@ -81,7 +104,11 @@ class JSONScraperGraph(AbstractGraph): def run(self) -> str: """ Executes the web scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. """ + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index 1a64512e..418ec0e6 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the smart scraper +ScriptCreatorGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( FetchNode, @@ -13,24 +14,47 @@ from .abstract_graph import AbstractGraph class ScriptCreatorGraph(AbstractGraph): """ - SmartScraper is a comprehensive web scraping tool that automates the process of extracting - information from web pages using a natural language model to interpret and answer prompts. + ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. 
+ llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + model_token (int): The token limit for the language model. + library (str): The library used for web scraping. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + + Example: + >>> script_creator = ScriptCreatorGraph( + ... "List me all the attractions in Chioggia.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = script_creator.run() """ def __init__(self, prompt: str, source: str, config: dict): - """ - Initializes the ScriptCreatorGraph with a prompt, source, and configuration. - """ - self.library = config['library'] - super().__init__(prompt, config, source) self.input_key = "url" if source.startswith("http") else "local_dir" + self.library = config['library'] - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. """ + fetch_node = FetchNode( input="url | local_dir", output=["doc"], @@ -76,7 +100,11 @@ class ScriptCreatorGraph(AbstractGraph): def run(self) -> str: """ Executes the web scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. 
""" + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 4cc179bb..41548a77 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -1,6 +1,7 @@ """ -Module for making the search on the intenet +SearchGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( SearchInternetNode, @@ -14,13 +15,37 @@ from .abstract_graph import AbstractGraph class SearchGraph(AbstractGraph): """ - Module for searching info on the internet + SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt. + It only requires a user prompt to search the internet and generate an answer. + + Attributes: + prompt (str): The user prompt to search the internet. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + headless (bool): A flag to run the browser in headless mode. + verbose (bool): A flag to display the execution information. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The user prompt to search the internet. + config (dict): Configuration parameters for the graph. + + Example: + >>> search_graph = SearchGraph( + ... "What is Chioggia famous for?", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = search_graph.run() """ - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ Creates the graph of nodes representing the workflow for web scraping and searching. + + Returns: + BaseGraph: A graph instance representing the web scraping and searching workflow. """ + search_internet_node = SearchInternetNode( input="user_prompt", output=["url"], @@ -83,7 +108,11 @@ class SearchGraph(AbstractGraph): def run(self) -> str: """ Executes the web scraping and searching process. 
+ + Returns: + str: The answer to the prompt. """ + inputs = {"user_prompt": self.prompt} self.final_state, self.execution_info = self.graph.execute(inputs) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 77fd09ee..ad984c61 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the smart scraper +SmartScraperGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( FetchNode, @@ -13,22 +14,44 @@ from .abstract_graph import AbstractGraph class SmartScraperGraph(AbstractGraph): """ - SmartScraper is a comprehensive web scraping tool that automates the process of extracting - information from web pages using a natural language model to interpret and answer prompts. + SmartScraper is a scraping pipeline that automates the process of extracting information from web pages + using a natural language model to interpret and answer prompts. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + + Example: + >>> smart_scraper = SmartScraperGraph( + ... "List me all the attractions in Chioggia.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... 
) + >>> result = smart_scraper.run() + ) """ def __init__(self, prompt: str, source: str, config: dict): - """ - Initializes the SmartScraperGraph with a prompt, source, and configuration. - """ super().__init__(prompt, config, source) self.input_key = "url" if source.startswith("http") else "local_dir" - - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. """ fetch_node = FetchNode( input="url | local_dir", @@ -81,8 +104,12 @@ class SmartScraperGraph(AbstractGraph): def run(self) -> str: """ - Executes the web scraping process and returns the answer to the prompt. + Executes the scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. """ + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 7a2524e9..3edadfd0 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -1,6 +1,7 @@ """ -Module for converting text to speach +SpeechGraph Module """ + from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes from ..models import OpenAITextToSpeech from .base_graph import BaseGraph @@ -16,22 +17,43 @@ from .abstract_graph import AbstractGraph class SpeechGraph(AbstractGraph): """ - SpeechSummaryGraph is a tool that automates the process of extracting and summarizing - information from web pages, then converting that summary into spoken word via an MP3 file. + SpeechGraph is a scraping pipeline that scrapes the web, provides an answer to a given prompt, and generates an audio file. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. 
+ config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + + Example: + >>> speech_graph = SpeechGraph( + ... "List me all the attractions in Chioggia and generate an audio summary.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "gpt-3.5-turbo"}} """ def __init__(self, prompt: str, source: str, config: dict): - """ - Initializes the SmartScraperGraph with a prompt, source, and configuration. - """ super().__init__(prompt, config, source) self.input_key = "url" if source.startswith("http") else "local_dir" - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ - Creates the graph of nodes representing the workflow for web scraping and summarization. + Creates the graph of nodes representing the workflow for web scraping and audio generation. + + Returns: + BaseGraph: A graph instance representing the web scraping and audio generation workflow. """ + fetch_node = FetchNode( input="url | local_dir", output=["doc"], @@ -93,8 +115,12 @@ class SpeechGraph(AbstractGraph): def run(self) -> str: """ - Executes the web scraping, summarization, and text-to-speech process. + Executes the scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. 
""" + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) @@ -105,4 +131,4 @@ class SpeechGraph(AbstractGraph): "output_path", "output.mp3")) print(f"Audio saved to {self.config.get('output_path', 'output.mp3')}") - return self.final_state + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 659de51c..83aba049 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -1,6 +1,7 @@ """ -Module for creating the smart scraper +XMLScraperGraph Module """ + from .base_graph import BaseGraph from ..nodes import ( FetchNode, @@ -13,22 +14,46 @@ from .abstract_graph import AbstractGraph class XMLScraperGraph(AbstractGraph): """ - SmartScraper is a comprehensive web scraping tool that automates the process of extracting - information from web pages using a natural language model to interpret and answer prompts. + XMLScraperGraph is a scraping pipeline that extracts information from XML files using a natural + language model to interpret and answer prompts. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + model_token (int): The token limit for the language model. + + Args: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + + Example: + >>> xml_scraper = XMLScraperGraph( + ... 
"List me all the attractions in Chioggia.", + ... "data/chioggia.xml", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = xml_scraper.run() """ def __init__(self, prompt: str, source: str, config: dict): - """ - Initializes the XmlScraperGraph with a prompt, source, and configuration. - """ super().__init__(prompt, config, source) self.input_key = "xml" if source.endswith("xml") else "xml_dir" - def _create_graph(self): + def _create_graph(self) -> BaseGraph: """ Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. """ + fetch_node = FetchNode( input="xml_dir", output=["doc"], @@ -81,7 +106,11 @@ class XMLScraperGraph(AbstractGraph): def run(self) -> str: """ Executes the web scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. """ + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) diff --git a/scrapegraphai/helpers/__init__.py b/scrapegraphai/helpers/__init__.py index 4565e2d9..23bc0154 100644 --- a/scrapegraphai/helpers/__init__.py +++ b/scrapegraphai/helpers/__init__.py @@ -1,7 +1,7 @@ """ __init__.py for th e helpers folder - """ + from .nodes_metadata import nodes_metadata from .schemas import graph_schema from .models_tokens import models_tokens diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 6b9ed637..dc756e95 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -1,6 +1,7 @@ """ Models token """ + models_tokens = { "openai": { "gpt-3.5-turbo-0125": 16385, diff --git a/scrapegraphai/helpers/robots.py b/scrapegraphai/helpers/robots.py index e89d203d..de49a98c 100644 --- a/scrapegraphai/helpers/robots.py +++ b/scrapegraphai/helpers/robots.py @@ -1,7 +1,7 @@ - """ Module for mapping the models in ai agents """ + 
robots_dictionary = { "gpt-3.5-turbo": ["GPTBot", "ChatGPT-user"], "gpt-4-turbo": ["GPTBot", "ChatGPT-user"], diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index 79bc305c..f3329320 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -17,7 +17,18 @@ class BaseNode(ABC): output (List[str]): List of min_input_len (int): Minimum required number of input keys. node_config (Optional[dict]): Additional configuration for the node. + + Args: + node_name (str): Name for identifying the node. + node_type (str): Type of the node; must be 'node' or 'conditional_node'. + input (str): Expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + min_input_len (int, optional): Minimum required number of input keys; defaults to 1. + node_config (Optional[dict], optional): Additional configuration for the node; defaults to None. + Raises: + ValueError: If `node_type` is not one of the allowed types. + Example: >>> class MyNode(BaseNode): ... def execute(self, state): @@ -31,20 +42,6 @@ class BaseNode(ABC): def __init__(self, node_name: str, node_type: str, input: str, output: List[str], min_input_len: int = 1, node_config: Optional[dict] = None): - """ - Initialize the instance with the node's name, type, input/output specifications, and configuration details. - - Args: - node_name (str): Name for identifying the node. - node_type (str): Type of the node; must be 'node' or 'conditional_node'. - input (str): Expression defining the input keys needed from the state. - output (List[str]): List of output keys to be updated in the state. - min_input_len (int, optional): Minimum required number of input keys; defaults to 1. - node_config (Optional[dict], optional): Additional configuration for the node; defaults to None. - - Raises: - ValueError: If `node_type` is not one of the allowed types. - """ self.node_name = node_name self.input = input