docs: graphs and helpers docstrings

2026-06-23 21:00:30 +08:00 · 2024-05-02 00:23:38 +02:00 · 2024-05-02 00:23:38 +02:00 · 0631985e61
commit 0631985e61
parent 18c20eb03d
14 changed files with 304 additions and 80 deletions
--- a/scrapegraphai/builders/graph_builder.py
+++ b/scrapegraphai/builders/graph_builder.py
@ -1,5 +1,5 @@
 """ 
-Module for making the graph building
+GraphBuilder Module
 """
 from langchain_core.prompts import ChatPromptTemplate
--- a/scrapegraphai/graphs/init.py
+++ b/scrapegraphai/graphs/init.py
@ -1,6 +1,7 @@
 """ 
 __init__.py file for graphs folder
 """
 from .base_graph import BaseGraph
 from .smart_scraper_graph import SmartScraperGraph
 from .speech_graph import SpeechGraph
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@ -1,6 +1,7 @@
 """
-Module having abstract class for creating all the graphs
+AbstractGraph Module
 """
 from abc import ABC, abstractmethod
 from typing import Optional
 from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq
@ -9,13 +10,34 @@ from ..helpers import models_tokens
 class AbstractGraph(ABC):
    """
-    Abstract class representing a generic graph-based tool.
+    Scaffolding class for creating a graph representation and executing it.
    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
    Args:
        prompt (str): The prompt for the graph.
        config (dict): Configuration parameters for the graph.
        source (str, optional): The source of the graph.
    Example:
        >>> class MyGraph(AbstractGraph):
        ...     def _create_graph(self):
        ...         # Implementation of graph creation here
        ...         return graph
        ...
        >>> my_graph = MyGraph("Example Graph", {"llm": {"model": "gpt-3.5-turbo"}}, "example_source")
        >>> result = my_graph.run()
    """
    def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
        """
        Initializes the AbstractGraph with a prompt, file source, and configuration.
        """
        self.prompt = prompt
        self.source = source
        self.config = config
@ -32,10 +54,20 @@ class AbstractGraph(ABC):
        self.final_state = None
        self.execution_info = None
-    def _create_llm(self, llm_config: dict):
+    def _create_llm(self, llm_config: dict) -> object:
        """
-        Creates an instance of the language model (OpenAI or Gemini) based on configuration.
+        Create a large language model instance based on the configuration provided.
        Args:
            llm_config (dict): Configuration parameters for the language model.
        Returns:
            object: An instance of the language model client.
        Raises:
            KeyError: If the model is not supported.
        """
        llm_defaults = {
            "temperature": 0,
            "streaming": False
@ -104,8 +136,15 @@ class AbstractGraph(ABC):
    def get_state(self, key=None) -> dict:
        """
-        Obtain the current state
+        Get the final state of the graph.
        Args:
            key (str, optional): The key of the final state to retrieve.
        Returns:
            dict: The final state of the graph.
        """
        if key is not None:
            return self.final_state[key]
        return self.final_state
@ -113,7 +152,11 @@ class AbstractGraph(ABC):
    def get_execution_info(self):
        """
        Returns the execution information of the graph.
        Returns:
            dict: The execution information of the graph.
        """
        return self.execution_info
    @abstractmethod
--- a/scrapegraphai/graphs/base_graph.py
+++ b/scrapegraphai/graphs/base_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the base graphs
+BaseGraph Module
- """
+"""
 import time
 import warnings
 from langchain_community.callbacks import get_openai_callback
@ -16,21 +17,33 @@ class BaseGraph:
                      key-value pair corresponds to the from-node and to-node relationship.
        entry_point (str): The name of the entry point node from which the graph execution begins.
    Methods:
        execute(initial_state): Executes the graph's nodes starting from the entry point and
                                traverses the graph based on the provided initial state.
    Args:
        nodes (iterable): An iterable of node instances that will be part of the graph.
        edges (iterable): An iterable of tuples where each tuple represents a directed edge
                          in the graph, defined by a pair of nodes (from_node, to_node).
        entry_point (BaseNode): The node instance that represents the entry point of the graph.
    Raises:
        Warning: If the entry point node is not the first node in the list.
    Example:
        >>> BaseGraph(
        ...    nodes=[
        ...        fetch_node,
        ...        parse_node,
        ...        rag_node,
        ...        generate_answer_node,
        ...    ],
        ...    edges=[
        ...        (fetch_node, parse_node),
        ...        (parse_node, rag_node),
        ...        (rag_node, generate_answer_node)
        ...    ],
        ...    entry_point=fetch_node
        ... )
    """
    def __init__(self, nodes: list, edges: list, entry_point: str):
        """
        Initializes the graph with nodes, edges, and the entry point.
        """
        self.nodes = nodes
        self.edges = self._create_edges({e for e in edges})
@ -51,6 +64,7 @@ class BaseGraph:
        Returns:
            dict: A dictionary of edges with the from-node as keys and to-node as values.
        """
        edge_dict = {}
        for from_node, to_node in edges:
            edge_dict[from_node.node_name] = to_node.node_name
@ -66,8 +80,10 @@ class BaseGraph:
            initial_state (dict): The initial state to pass to the entry point node.
        Returns:
-            dict: The state after execution has completed, which may have been altered by the nodes.
+            Tuple[dict, list]: A tuple containing the final state of the execution and a list
                               of execution information for each node.
        """
        current_node_name = self.nodes[0]
        state = initial_state
--- a/scrapegraphai/graphs/json_scraper_graph.py
+++ b/scrapegraphai/graphs/json_scraper_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+JSONScraperGraph Module
 """
 from .base_graph import BaseGraph
 from ..nodes import (
    FetchNode,
@ -13,22 +14,44 @@ from .abstract_graph import AbstractGraph
 class JSONScraperGraph(AbstractGraph):
    """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
+    JSONScraperGraph defines a scraping pipeline for JSON files.
-    information from web pages using a natural language model to interpret and answer prompts.
+
    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
    Example:
        >>> json_scraper = JSONScraperGraph(
        ...     "List me all the attractions in Chioggia.",
        ...     "data/chioggia.json",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = json_scraper.run()
    """
    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the JSONScraperGraph with a prompt, source, and configuration.
        """
        super().__init__(prompt, config, source)
        self.input_key = "json" if source.endswith("json") else "json_dir"
-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.
        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="json_dir",
            output=["doc"],
@ -81,7 +104,11 @@ class JSONScraperGraph(AbstractGraph):
    def run(self) -> str:
        """
        Executes the web scraping process and returns the answer to the prompt.
        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
--- a/scrapegraphai/graphs/script_creator_graph.py
+++ b/scrapegraphai/graphs/script_creator_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+ScriptCreatorGraph Module
 """
 from .base_graph import BaseGraph
 from ..nodes import (
    FetchNode,
@ -13,24 +14,47 @@ from .abstract_graph import AbstractGraph
 class ScriptCreatorGraph(AbstractGraph):
    """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
+    ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts.
-    information from web pages using a natural language model to interpret and answer prompts.
+
    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
        model_token (int): The token limit for the language model.
        library (str): The library used for web scraping.
    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
    Example:
        >>> script_creator = ScriptCreatorGraph(
        ...     "List me all the attractions in Chioggia.",
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = script_creator.run()
    """
    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the ScriptCreatorGraph with a prompt, source, and configuration.
        """
        self.library = config['library']
        super().__init__(prompt, config, source)
        self.input_key = "url" if source.startswith("http") else "local_dir"
        self.library = config['library']
-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.
        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
@ -76,7 +100,11 @@ class ScriptCreatorGraph(AbstractGraph):
    def run(self) -> str:
        """
        Executes the web scraping process and returns the answer to the prompt.
        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
--- a/scrapegraphai/graphs/search_graph.py
+++ b/scrapegraphai/graphs/search_graph.py
@ -1,6 +1,7 @@
 """ 
-Module for making the search on the intenet
+SearchGraph Module
 """
 from .base_graph import BaseGraph
 from ..nodes import (
    SearchInternetNode,
@ -14,13 +15,37 @@ from .abstract_graph import AbstractGraph
 class SearchGraph(AbstractGraph):
    """ 
-    Module for searching info on the internet
+    SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt.
    It only requires a user prompt to search the internet and generate an answer.
    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.
    Args:
        prompt (str): The user prompt to search the internet.
        config (dict): Configuration parameters for the graph.
    Example:
        >>> search_graph = SearchGraph(
        ...     "What is Chioggia famous for?",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """
-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.
        Returns:
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """
        search_internet_node = SearchInternetNode(
            input="user_prompt",
            output=["url"],
@ -83,7 +108,11 @@ class SearchGraph(AbstractGraph):
    def run(self) -> str:
        """
        Executes the web scraping and searching process.
        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt}
        self.final_state, self.execution_info = self.graph.execute(inputs)
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+SmartScraperGraph Module
 """
 from .base_graph import BaseGraph
 from ..nodes import (
    FetchNode,
@ -13,22 +14,44 @@ from .abstract_graph import AbstractGraph
 class SmartScraperGraph(AbstractGraph):
    """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
+    SmartScraper is a scraping pipeline that automates the process of extracting information from web pages
-    information from web pages using a natural language model to interpret and answer prompts.
+    using a natural language model to interpret and answer prompts.
    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
    Example:
        >>> smart_scraper = SmartScraperGraph(
        ...     "List me all the attractions in Chioggia.",
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = smart_scraper.run()
    """
    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the SmartScraperGraph with a prompt, source, and configuration.
        """
        super().__init__(prompt, config, source)
        self.input_key = "url" if source.startswith("http") else "local_dir"
-
-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.
        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
@ -81,8 +104,12 @@ class SmartScraperGraph(AbstractGraph):
    def run(self) -> str:
        """
-        Executes the web scraping process and returns the answer to the prompt.
+        Executes the scraping process and returns the answer to the prompt.
        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
--- a/scrapegraphai/graphs/speech_graph.py
+++ b/scrapegraphai/graphs/speech_graph.py
@ -1,6 +1,7 @@
 """ 
-Module for converting text to speach
+SpeechGraph Module
 """
 from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes
 from ..models import OpenAITextToSpeech
 from .base_graph import BaseGraph
@ -16,22 +17,43 @@ from .abstract_graph import AbstractGraph
 class SpeechGraph(AbstractGraph):
    """
-    SpeechSummaryGraph is a tool that automates the process of extracting and summarizing
+    SpeechGraph is a scraping pipeline that scrapes the web, provides an answer to a given prompt, and generates an audio file.
-    information from web pages, then converting that summary into spoken word via an MP3 file.
+
    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
        model_token (int): The token limit for the language model.
    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
    Example:
        >>> speech_graph = SpeechGraph(
        ...     "List me all the attractions in Chioggia and generate an audio summary.",
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = speech_graph.run()
    """
    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the SpeechGraph with a prompt, source, and configuration.
        """
        super().__init__(prompt, config, source)
        self.input_key = "url" if source.startswith("http") else "local_dir"
-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
-        Creates the graph of nodes representing the workflow for web scraping and summarization.
+        Creates the graph of nodes representing the workflow for web scraping and audio generation.
        Returns:
            BaseGraph: A graph instance representing the web scraping and audio generation workflow.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
@ -93,8 +115,12 @@ class SpeechGraph(AbstractGraph):
    def run(self) -> str:
        """
-        Executes the web scraping, summarization, and text-to-speech process.
+        Executes the scraping process and returns the answer to the prompt.
        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
@ -105,4 +131,4 @@ class SpeechGraph(AbstractGraph):
            "output_path", "output.mp3"))
        print(f"Audio saved to {self.config.get('output_path', 'output.mp3')}")
-        return self.final_state
+        return self.final_state.get("answer", "No answer found.")
--- a/scrapegraphai/graphs/xml_scraper_graph.py
+++ b/scrapegraphai/graphs/xml_scraper_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+XMLScraperGraph Module
 """
 from .base_graph import BaseGraph
 from ..nodes import (
    FetchNode,
@ -13,22 +14,46 @@ from .abstract_graph import AbstractGraph
 class XMLScraperGraph(AbstractGraph):
    """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
+    XMLScraperGraph is a scraping pipeline that extracts information from XML files using a natural
-    information from web pages using a natural language model to interpret and answer prompts.
+    language model to interpret and answer prompts.
    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
        model_token (int): The token limit for the language model.
    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
    Example:
        >>> xml_scraper = XMLScraperGraph(
        ...     "List me all the attractions in Chioggia.",
        ...     "data/chioggia.xml",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = xml_scraper.run()
    """
    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initializes the XMLScraperGraph with a prompt, source, and configuration.
        """
        super().__init__(prompt, config, source)
        self.input_key = "xml" if source.endswith("xml") else "xml_dir"
-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.
        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="xml_dir",
            output=["doc"],
@ -81,7 +106,11 @@ class XMLScraperGraph(AbstractGraph):
    def run(self) -> str:
        """
        Executes the web scraping process and returns the answer to the prompt.
        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
--- a/scrapegraphai/helpers/init.py
+++ b/scrapegraphai/helpers/init.py
@ -1,7 +1,7 @@
 """ 
 __init__.py for the helpers folder
 """
 from .nodes_metadata import nodes_metadata
 from .schemas import graph_schema
 from .models_tokens import models_tokens
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@ -1,6 +1,7 @@
 """
 Token limits for each supported model
 """
 models_tokens = {
    "openai": {
        "gpt-3.5-turbo-0125": 16385,
--- a/scrapegraphai/helpers/robots.py
+++ b/scrapegraphai/helpers/robots.py
@ -1,7 +1,7 @@
 """ 
 Module for mapping model names to the names of their AI agents (robots)
 """
 robots_dictionary = {
    "gpt-3.5-turbo": ["GPTBot", "ChatGPT-user"],
    "gpt-4-turbo": ["GPTBot", "ChatGPT-user"],
--- a/scrapegraphai/nodes/base_node.py
+++ b/scrapegraphai/nodes/base_node.py
@ -17,7 +17,18 @@ class BaseNode(ABC):
        output (List[str]): List of output keys to be updated in the state.
        min_input_len (int): Minimum required number of input keys.
        node_config (Optional[dict]): Additional configuration for the node.
    Args:
        node_name (str): Name for identifying the node.
        node_type (str): Type of the node; must be 'node' or 'conditional_node'.
        input (str): Expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        min_input_len (int, optional): Minimum required number of input keys; defaults to 1.
        node_config (Optional[dict], optional): Additional configuration for the node; defaults to None.
    Raises:
        ValueError: If `node_type` is not one of the allowed types.
    Example:
        >>> class MyNode(BaseNode):
        ...     def execute(self, state):
@ -31,20 +42,6 @@ class BaseNode(ABC):
    def __init__(self, node_name: str, node_type: str, input: str, output: List[str],
                 min_input_len: int = 1, node_config: Optional[dict] = None):
        """
        Initialize the instance with the node's name, type, input/output specifications, and configuration details.
        Args:
            node_name (str): Name for identifying the node.
            node_type (str): Type of the node; must be 'node' or 'conditional_node'.
            input (str): Expression defining the input keys needed from the state.
            output (List[str]): List of output keys to be updated in the state.
            min_input_len (int, optional): Minimum required number of input keys; defaults to 1.
            node_config (Optional[dict], optional): Additional configuration for the node; defaults to None.
        Raises:
            ValueError: If `node_type` is not one of the allowed types.
        """
        self.node_name = node_name
        self.input = input