docs: graphs and helpers docstrings

2026-06-23 21:00:30 +08:00 · 2024-05-02 00:23:38 +02:00 · 2024-05-02 00:23:38 +02:00 · 0631985e61
commit 0631985e61
parent 18c20eb03d
14 changed files with 304 additions and 80 deletions
--- a/scrapegraphai/builders/graph_builder.py
+++ b/scrapegraphai/builders/graph_builder.py
@ -1,5 +1,5 @@
 """ 
-Module for making the graph building
+GraphBuilder Module
 """

 from langchain_core.prompts import ChatPromptTemplate
--- a/scrapegraphai/graphs/init.py
+++ b/scrapegraphai/graphs/init.py
@ -1,6 +1,7 @@
 """ 
 __init__.py file for graphs folder
 """
+
 from .base_graph import BaseGraph
 from .smart_scraper_graph import SmartScraperGraph
 from .speech_graph import SpeechGraph
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@ -1,6 +1,7 @@
 """
-Module having abstract class for creating all the graphs
+AbstractGraph Module
 """
+
 from abc import ABC, abstractmethod
 from typing import Optional
 from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq
@ -9,13 +10,34 @@ from ..helpers import models_tokens

 class AbstractGraph(ABC):
    """
-    Abstract class representing a generic graph-based tool.
+    Scaffolding class for creating a graph representation and executing it.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        config (dict): Configuration parameters for the graph.
+        source (str, optional): The source of the graph.
+
+    Example:
+        >>> class MyGraph(AbstractGraph):
+        ...     def _create_graph(self):
+        ...         # Implementation of graph creation here
+        ...         return graph
+        ...
+        >>> my_graph = MyGraph("Example Graph", {"llm": {"model": "gpt-3.5-turbo"}}, "example_source")
+        >>> result = my_graph.run()
    """

    def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
-        """
-        Initializes the AbstractGraph with a prompt, file source, and configuration.
-        """
+
        self.prompt = prompt
        self.source = source
        self.config = config
@ -32,10 +54,20 @@ class AbstractGraph(ABC):
        self.final_state = None
        self.execution_info = None

-    def _create_llm(self, llm_config: dict):
+    def _create_llm(self, llm_config: dict) -> object:
        """
-        Creates an instance of the language model (OpenAI or Gemini) based on configuration.
+        Create a large language model instance based on the configuration provided.
+
+        Args:
+            llm_config (dict): Configuration parameters for the language model.
+
+        Returns:
+            object: An instance of the language model client.
+
+        Raises:
+            KeyError: If the model is not supported.
        """
+
        llm_defaults = {
            "temperature": 0,
            "streaming": False
@ -104,8 +136,15 @@ class AbstractGraph(ABC):

    def get_state(self, key=None) -> dict:
        """""
-        Obtain the current state
+        Get the final state of the graph.
+
+        Args:
+            key (str, optional): The key of the final state to retrieve.
+
+        Returns:
+            dict: The final state of the graph.
        """
+
        if key is not None:
            return self.final_state[key]
        return self.final_state
@ -113,7 +152,11 @@ class AbstractGraph(ABC):
    def get_execution_info(self):
        """
        Returns the execution information of the graph.
+
+        Returns:
+            dict: The execution information of the graph.
        """
+        
        return self.execution_info

    @abstractmethod
--- a/scrapegraphai/graphs/base_graph.py
+++ b/scrapegraphai/graphs/base_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the base graphs
- """
+BaseGraph Module
+"""
+
 import time
 import warnings
 from langchain_community.callbacks import get_openai_callback
@ -16,21 +17,33 @@ class BaseGraph:
                      key-value pair corresponds to the from-node and to-node relationship.
        entry_point (str): The name of the entry point node from which the graph execution begins.

-    Methods:
-        execute(initial_state): Executes the graph's nodes starting from the entry point and
-                                traverses the graph based on the provided initial state.
-
    Args:
        nodes (iterable): An iterable of node instances that will be part of the graph.
        edges (iterable): An iterable of tuples where each tuple represents a directed edge
                          in the graph, defined by a pair of nodes (from_node, to_node).
        entry_point (BaseNode): The node instance that represents the entry point of the graph.
+
+    Raises:
+        Warning: If the entry point node is not the first node in the list.
+
+    Example:
+        >>> BaseGraph(
+        ...    nodes=[
+        ...        fetch_node,
+        ...        parse_node,
+        ...        rag_node,
+        ...        generate_answer_node,
+        ...    ],
+        ...    edges=[
+        ...        (fetch_node, parse_node),
+        ...        (parse_node, rag_node),
+        ...        (rag_node, generate_answer_node)
+        ...    ],
+        ...    entry_point=fetch_node
+        ... )
    """

    def __init__(self, nodes: list, edges: list, entry_point: str):
-        """
-        Initializes the graph with nodes, edges, and the entry point.
-        """

        self.nodes = nodes
        self.edges = self._create_edges({e for e in edges})
@ -51,6 +64,7 @@ class BaseGraph:
        Returns:
            dict: A dictionary of edges with the from-node as keys and to-node as values.
        """
+
        edge_dict = {}
        for from_node, to_node in edges:
            edge_dict[from_node.node_name] = to_node.node_name
@ -66,8 +80,10 @@ class BaseGraph:
            initial_state (dict): The initial state to pass to the entry point node.

        Returns:
-            dict: The state after execution has completed, which may have been altered by the nodes.
+            Tuple[dict, list]: A tuple containing the final state of the execution and a list
+                               of execution information for each node.
        """
+        
        current_node_name = self.nodes[0]
        state = initial_state

--- a/scrapegraphai/graphs/json_scraper_graph.py
+++ b/scrapegraphai/graphs/json_scraper_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+JSONScraperGraph Module
 """
+
 from .base_graph import BaseGraph
 from ..nodes import (
    FetchNode,
@ -13,22 +14,44 @@ from .abstract_graph import AbstractGraph

 class JSONScraperGraph(AbstractGraph):
    """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
-    information from web pages using a natural language model to interpret and answer prompts.
+    JSONScraperGraph defines a scraping pipeline for JSON files.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> json_scraper = JSONScraperGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "data/chioggia.json",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = json_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict):
-        """
-        Initializes the JsonScraperGraph with a prompt, source, and configuration.
-        """
        super().__init__(prompt, config, source)

        self.input_key = "json" if source.endswith("json") else "json_dir"

-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.
+        
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
        """
+
        fetch_node = FetchNode(
            input="json_dir",
            output=["doc"],
@ -81,7 +104,11 @@ class JSONScraperGraph(AbstractGraph):
    def run(self) -> str:
        """
        Executes the web scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
        """
+
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

--- a/scrapegraphai/graphs/script_creator_graph.py
+++ b/scrapegraphai/graphs/script_creator_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+ScriptCreatorGraph Module
 """
+
 from .base_graph import BaseGraph
 from ..nodes import (
    FetchNode,
@ -13,24 +14,47 @@ from .abstract_graph import AbstractGraph

 class ScriptCreatorGraph(AbstractGraph):
    """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
-    information from web pages using a natural language model to interpret and answer prompts.
+    ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+        model_token (int): The token limit for the language model.
+        library (str): The library used for web scraping.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> script_creator = ScriptCreatorGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = script_creator.run()
    """

    def __init__(self, prompt: str, source: str, config: dict):
-        """
-        Initializes the ScriptCreatorGraph with a prompt, source, and configuration.
-        """
-        self.library = config['library']
-
        super().__init__(prompt, config, source)

        self.input_key = "url" if source.startswith("http") else "local_dir"
+        self.library = config['library']

-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
        """
+
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
@ -76,7 +100,11 @@ class ScriptCreatorGraph(AbstractGraph):
    def run(self) -> str:
        """
        Executes the web scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
        """
+        
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

--- a/scrapegraphai/graphs/search_graph.py
+++ b/scrapegraphai/graphs/search_graph.py
@ -1,6 +1,7 @@
 """ 
-Module for making the search on the intenet
+SearchGraph Module
 """
+
 from .base_graph import BaseGraph
 from ..nodes import (
    SearchInternetNode,
@ -14,13 +15,37 @@ from .abstract_graph import AbstractGraph

 class SearchGraph(AbstractGraph):
    """ 
-    Module for searching info on the internet
+    SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt.
+    It only requires a user prompt to search the internet and generate an answer.
+
+    Attributes:
+        prompt (str): The user prompt to search the internet.
+        llm_model (dict): The configuration for the language model.
+        embedder_model (dict): The configuration for the embedder model.
+        headless (bool): A flag to run the browser in headless mode.
+        verbose (bool): A flag to display the execution information.
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The user prompt to search the internet.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> search_graph = SearchGraph(
+        ...     "What is Chioggia famous for?",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = search_graph.run()
    """

-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """
+
        search_internet_node = SearchInternetNode(
            input="user_prompt",
            output=["url"],
@ -83,7 +108,11 @@ class SearchGraph(AbstractGraph):
    def run(self) -> str:
        """
        Executes the web scraping and searching process.
+        
+        Returns:
+            str: The answer to the prompt.
        """
+        
        inputs = {"user_prompt": self.prompt}
        self.final_state, self.execution_info = self.graph.execute(inputs)

--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+SmartScraperGraph Module
 """
+
 from .base_graph import BaseGraph
 from ..nodes import (
    FetchNode,
@ -13,22 +14,44 @@ from .abstract_graph import AbstractGraph

 class SmartScraperGraph(AbstractGraph):
    """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
-    information from web pages using a natural language model to interpret and answer prompts.
+    SmartScraper is a scraping pipeline that automates the process of extracting information from web pages
+    using a natural language model to interpret and answer prompts.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> smart_scraper = SmartScraperGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = smart_scraper.run()
+
    """

    def __init__(self, prompt: str, source: str, config: dict):
-        """
-        Initializes the SmartScraperGraph with a prompt, source, and configuration.
-        """
        super().__init__(prompt, config, source)

        self.input_key = "url" if source.startswith("http") else "local_dir"
        
-
-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
@ -81,8 +104,12 @@ class SmartScraperGraph(AbstractGraph):

    def run(self) -> str:
        """
-        Executes the web scraping process and returns the answer to the prompt.
+        Executes the scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
        """
+
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

--- a/scrapegraphai/graphs/speech_graph.py
+++ b/scrapegraphai/graphs/speech_graph.py
@ -1,6 +1,7 @@
 """ 
-Module for converting text to speach
+SpeechGraph Module
 """
+
 from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes
 from ..models import OpenAITextToSpeech
 from .base_graph import BaseGraph
@ -16,22 +17,43 @@ from .abstract_graph import AbstractGraph

 class SpeechGraph(AbstractGraph):
    """
-    SpeechSummaryGraph is a tool that automates the process of extracting and summarizing
-    information from web pages, then converting that summary into spoken word via an MP3 file.
+    SpeechGraph is a scraping pipeline that scrapes the web, provides an answer to a given prompt, and generates an audio file.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> speech_graph = SpeechGraph(
+        ...     "List me all the attractions in Chioggia and generate an audio summary.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}})
    """

    def __init__(self, prompt: str, source: str, config: dict):
-        """
-        Initializes the SmartScraperGraph with a prompt, source, and configuration.
-        """
        super().__init__(prompt, config, source)

        self.input_key = "url" if source.startswith("http") else "local_dir"

-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
-        Creates the graph of nodes representing the workflow for web scraping and summarization.
+        Creates the graph of nodes representing the workflow for web scraping and audio generation.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping and audio generation workflow.
        """
+
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
@ -93,8 +115,12 @@ class SpeechGraph(AbstractGraph):

    def run(self) -> str:
        """
-        Executes the web scraping, summarization, and text-to-speech process.
+        Executes the scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
        """
+        
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

@ -105,4 +131,4 @@ class SpeechGraph(AbstractGraph):
            "output_path", "output.mp3"))
        print(f"Audio saved to {self.config.get('output_path', 'output.mp3')}")

-        return self.final_state
+        return self.final_state.get("answer", "No answer found.")
--- a/scrapegraphai/graphs/xml_scraper_graph.py
+++ b/scrapegraphai/graphs/xml_scraper_graph.py
@ -1,6 +1,7 @@
 """
-Module for creating the smart scraper
+XMLScraperGraph Module
 """
+
 from .base_graph import BaseGraph
 from ..nodes import (
    FetchNode,
@ -13,22 +14,46 @@ from .abstract_graph import AbstractGraph

 class XMLScraperGraph(AbstractGraph):
    """
-    SmartScraper is a comprehensive web scraping tool that automates the process of extracting
-    information from web pages using a natural language model to interpret and answer prompts.
+    XMLScraperGraph is a scraping pipeline that extracts information from XML files using a natural
+    language model to interpret and answer prompts.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> xml_scraper = XMLScraperGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "data/chioggia.xml",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = xml_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict):
-        """
-        Initializes the XmlScraperGraph with a prompt, source, and configuration.
-        """
        super().__init__(prompt, config, source)

        self.input_key = "xml" if source.endswith("xml") else "xml_dir"

-    def _create_graph(self):
+    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.
+        
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
        """
+
        fetch_node = FetchNode(
            input="xml_dir",
            output=["doc"],
@ -81,7 +106,11 @@ class XMLScraperGraph(AbstractGraph):
    def run(self) -> str:
        """
        Executes the web scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
        """
+        
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

--- a/scrapegraphai/helpers/init.py
+++ b/scrapegraphai/helpers/init.py
@ -1,7 +1,7 @@
 """ 
 __init__.py for th e helpers folder
-
 """
+
 from .nodes_metadata import nodes_metadata
 from .schemas import graph_schema
 from .models_tokens import models_tokens
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@ -1,6 +1,7 @@
 """
 Models token 
 """
+
 models_tokens = {
    "openai": {
        "gpt-3.5-turbo-0125": 16385,
--- a/scrapegraphai/helpers/robots.py
+++ b/scrapegraphai/helpers/robots.py
@ -1,7 +1,7 @@
-
 """ 
 Module for mapping the models in ai agents
 """
+
 robots_dictionary = {
    "gpt-3.5-turbo": ["GPTBot", "ChatGPT-user"],
    "gpt-4-turbo": ["GPTBot", "ChatGPT-user"],
--- a/scrapegraphai/nodes/base_node.py
+++ b/scrapegraphai/nodes/base_node.py
@ -17,7 +17,18 @@ class BaseNode(ABC):
        output (List[str]): List of 
        min_input_len (int): Minimum required number of input keys.
        node_config (Optional[dict]): Additional configuration for the node.
+    
+    Args:
+        node_name (str): Name for identifying the node.
+        node_type (str): Type of the node; must be 'node' or 'conditional_node'.
+        input (str): Expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        min_input_len (int, optional): Minimum required number of input keys; defaults to 1.
+        node_config (Optional[dict], optional): Additional configuration for the node; defaults to None.

+    Raises:
+        ValueError: If `node_type` is not one of the allowed types.
+    
    Example:
        >>> class MyNode(BaseNode):
        ...     def execute(self, state):
@ -31,20 +42,6 @@ class BaseNode(ABC):

    def __init__(self, node_name: str, node_type: str, input: str, output: List[str],
                 min_input_len: int = 1, node_config: Optional[dict] = None):
-        """
-        Initialize the instance with the node's name, type, input/output specifications, and configuration details.
-
-        Args:
-            node_name (str): Name for identifying the node.
-            node_type (str): Type of the node; must be 'node' or 'conditional_node'.
-            input (str): Expression defining the input keys needed from the state.
-            output (List[str]): List of output keys to be updated in the state.
-            min_input_len (int, optional): Minimum required number of input keys; defaults to 1.
-            node_config (Optional[dict], optional): Additional configuration for the node; defaults to None.
-
-        Raises:
-            ValueError: If `node_type` is not one of the allowed types.
-        """

        self.node_name = node_name
        self.input = input