docs: update utils docstrings

2026-06-23 21:00:30 +08:00 · 2024-05-01 12:35:12 +02:00 · 2024-05-01 12:35:12 +02:00 · cf038b33ea
commit cf038b33ea
parent 96975b2e36
9 changed files with 127 additions and 59 deletions
--- a/scrapegraphai/utils/convert_to_csv.py
+++ b/scrapegraphai/utils/convert_to_csv.py
@ -6,20 +6,27 @@ import sys
 import pandas as pd


-def convert_to_csv(data: dict, filename: str, position: str = None):
+def convert_to_csv(data: dict, filename: str, position: str = None) -> None:
    """
-    Converts a dictionary to a CSV file and saves it.
+    Converts a dictionary to a CSV file and saves it at a specified location.

    Args:
-    data (dict): Data to be converted to CSV.
-    position (str): Optional path where the file should be saved. If not provided,
-    the directory of the caller script will be used.
+        data (dict): The data to be converted into CSV format.
+        filename (str): The name of the output CSV file, without the '.csv' extension.
+        position (str, optional): The directory where the CSV file should be saved. Defaults to the directory of the caller script if not provided.

+    Returns:
+        None: The function does not return anything.
+        
    Raises:
-    FileNotFoundError: If the specified directory does not exist.
-    PermissionError: If the program lacks write permission for the directory.
-    TypeError: If the input data is not a dictionary.
-    Exception: For other potential errors during DataFrame creation or CSV saving.
+        FileNotFoundError: If the specified directory does not exist.
+        PermissionError: If write permissions are lacking for the directory.
+        TypeError: If `data` is not a dictionary.
+        Exception: For other issues that may arise during the creation or saving of the CSV file.
+
+    Example:
+        >>> convert_to_csv({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
+        Saves a CSV file named 'output.csv' at '/path/to/save'.
    """

    if ".csv" in filename:
--- a/scrapegraphai/utils/convert_to_json.py
+++ b/scrapegraphai/utils/convert_to_json.py
@ -6,23 +6,33 @@ import os
 import sys


-def convert_to_json(data: dict, filename: str, position: str = None):
+def convert_to_json(data: dict, filename: str, position: str = None) -> None:
    """
-    Convert data to JSON format and save it to a file.
+    Converts a dictionary to a JSON file and saves it at a specified location.

    Args:
-    data (dict): Data to save.
-    filename (str): Name of the file to save without .json extension.
-    position (str): Directory where the file should be saved. If None, 
-    the directory of the caller script will be used.
+        data (dict): The data to be converted into JSON format.
+        filename (str): The name of the output JSON file, without the '.json' extension.
+        position (str, optional): The directory where the JSON file should be saved. Defaults to the directory of the caller script if not provided.

+    Returns:
+        None: The function does not return anything.
+        
    Raises:
-    ValueError: If filename contains '.json'.
-    FileNotFoundError: If the specified directory does not exist.
-    PermissionError: If the program does not have permission to write to the directory.
+        ValueError: If 'filename' contains '.json'.
+        FileNotFoundError: If the specified directory does not exist.
+        PermissionError: If write permissions are lacking for the directory.
+
+    Example:
+        >>> convert_to_json({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
+        Saves a JSON file named 'output.json' at '/path/to/save'.
+
+    Notes:
+        This function automatically ensures the directory exists before attempting to write the file. If the directory does not exist, it will attempt to create it.
    """
+
    if ".json" in filename:
-        filename = filename.replace(".json", "")  # Remove .csv extension
+        filename = filename.replace(".json", "")  # Remove .json extension

  # Get the directory of the caller script
    if position is None:
--- a/scrapegraphai/utils/parse_state_keys.py
+++ b/scrapegraphai/utils/parse_state_keys.py
@ -4,12 +4,30 @@ Parse_state_key module
 import re


-def parse_expression(expression, state: dict):
-    """ 
-    Function for parsing the expressions
-    Args:
-        state (dict): state to elaborate
+def parse_expression(expression, state: dict) -> list:
    """
+    Parses a complex boolean expression involving state keys.
+
+    Args:
+        expression (str): The boolean expression to parse.
+        state (dict): Dictionary of state keys used to evaluate the expression.
+
+    Raises:
+        ValueError: If the expression is empty, has adjacent state keys without operators, uses operators incorrectly,
+                    has unbalanced parentheses, or if no state keys match the expression.
+
+    Returns:
+        list: A list of state keys that match the boolean expression, ensuring each key appears only once.
+
+    Example:
+        >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)",
+        ...                  {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None})
+        ['user_input', 'relevant_chunks', 'parsed_document', 'document']
+
+    This function evaluates the expression to determine the logical inclusion of state keys based on provided boolean logic.
+    It checks for syntax errors such as unbalanced parentheses, incorrect adjacency of operators, and empty expressions.
+    """
+
    # Check for empty expression
    if not expression:
        raise ValueError("Empty expression.")
--- a/scrapegraphai/utils/prettify_exec_info.py
+++ b/scrapegraphai/utils/prettify_exec_info.py
@ -7,13 +7,17 @@ import pandas as pd

 def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame:
    """
-    Transform the execution information of the graph into a DataFrame for better visualization.
+    Transforms the execution information of a graph into a DataFrame for enhanced visualization.

    Args:
-    - complete_result (list[dict]): The complete execution information of the graph.
+        complete_result (list[dict]): The complete execution information of the graph.

    Returns:
-    - pd.DataFrame: The execution information of the graph in a DataFrame.
+        pd.DataFrame: A DataFrame that organizes the execution information for better readability and analysis.
+
+    Example:
+        >>> prettify_exec_info([{'node': 'A', 'status': 'success'}, {'node': 'B', 'status': 'failure'}])
+        DataFrame with columns 'node' and 'status' showing execution results for each node.
    """

    df_nodes = pd.DataFrame(complete_result)
--- a/scrapegraphai/utils/proxy_rotation.py
+++ b/scrapegraphai/utils/proxy_rotation.py
@ -4,26 +4,29 @@ Module for rotating proxies
 from fp.fp import FreeProxy


-def proxy_generator(num_ips: int):
+def proxy_generator(num_ips: int) -> list:
    """
-    Rotates through a specified number of proxy IPs using the FreeProxy library.
+    Generates a specified number of proxy IP addresses using the FreeProxy library.

    Args:
-        num_ips (int): The number of proxy IPs to rotate through.
+        num_ips (int): The number of proxy IPs to generate and rotate through.

    Returns:
-        dict: A dictionary containing the rotated proxy IPs, indexed by their position in rotation.
+        list: A list of proxy IP addresses.

    Example:
        >>> proxy_generator(5)
-        {
-            0: '192.168.1.1:8080',
-            1: '103.10.63.135:8080',
-            2: '176.9.75.42:8080',
-            3: '37.57.216.2:8080',
-            4: '113.20.31.250:8080'
-        }
+        [
+            '192.168.1.1:8080',
+            '103.10.63.135:8080',
+            '176.9.75.42:8080',
+            '37.57.216.2:8080',
+            '113.20.31.250:8080'
+        ]
+
+    This function fetches fresh proxies and collects them in a list, making it easy to manage multiple proxy configurations.
    """
+
    res = []

    for i in range(0, num_ips):
--- a/scrapegraphai/utils/remover.py
+++ b/scrapegraphai/utils/remover.py
@ -7,15 +7,20 @@ from minify_html import minify

 def remover(html_content: str) -> str:
    """
-    This function processes HTML content, removes unnecessary tags 
-    (including style tags), minifies the HTML, and retrieves the 
-    title and body content.
+    Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.

-    Parameters:
-        html_content (str): The HTML content to parse
+    Args:
+        html_content (str): The HTML content to be processed.

    Returns:
-        str: The parsed title followed by the minified body content
+        str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so.
+
+    Example:
+        >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
+        >>> remover(html_content)
+        'Title: Example, Body: <body><p>Hello World!</p></body>'
+
+    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
    """

    soup = BeautifulSoup(html_content, 'html.parser')
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@ -8,16 +8,25 @@ from googlesearch import search


 def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
-    """ 
-    Function that given a query it finds it on the intenet
+    """
+    Searches the web for a given query using specified search engine options.
+
    Args:
-        query (str): query to search on internet
-        search_engine (str, optional): type of browser, it could be DuckDuckGo or Google,
-            default: Google
-        max_results (int, optional): maximum number of results
+        query (str): The search query to find on the internet.
+        search_engine (str, optional): The search engine to use, either 'Google' or 'DuckDuckGo'. Defaults to 'Google'.
+        max_results (int, optional): The maximum number of search results to return.

    Returns:
-        List[str]: List of strings of web link
+        List[str]: A list of URLs as strings that are the search results.
+
+    Raises:
+        ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'.
+
+    Example:
+        >>> search_on_web("example query", search_engine="Google", max_results=5)
+        ['http://example.com', 'http://example.org', ...]
+
+    This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs.
    """

    if search_engine == "Google":
--- a/scrapegraphai/utils/save_audio_from_bytes.py
+++ b/scrapegraphai/utils/save_audio_from_bytes.py
@ -7,12 +7,18 @@ from typing import Union

 def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -> None:
    """
-    Saves the byte response as an audio file.
+    Saves the byte response as an audio file to the specified path.

    Args:
-        byte_response (bytes): The byte response containing the generated speech.
-        output_path (str or Path): The file path where the generated speech should be saved.
+        byte_response (bytes): The byte array containing audio data.
+        output_path (Union[str, Path]): The destination file path where the audio file will be saved.
+
+    Example:
+        >>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3')
+
+    This function writes the byte array containing audio data to a file, saving it as an audio file.
    """
+
    if not isinstance(output_path, Path):
        output_path = Path(output_path)

--- a/scrapegraphai/utils/token_calculator.py
+++ b/scrapegraphai/utils/token_calculator.py
@ -8,15 +8,21 @@ from ..helpers.models_tokens import models_tokens

 def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
    """
-    It creates a list of strings to create max dimension tokenizable elements
+    Truncates text into chunks that are small enough to be processed by the specified LLM.

    Args:
-        text (str): The input text to be truncated into tokenizable elements.
-        model (str): The name of the language model to be used.
-        encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
+        text (str): The input text to be truncated.
+        model (str): The name of the LLM, used to determine the maximum token limit.
+        encoding_name (str): The encoding strategy used to encode the text before truncation.

    Returns:
-        List[str]: A list of tokenizable elements created from the input text.
+        List[str]: A list of text chunks, each within the token limit of the specified model.
+
+    Example:
+        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "cl100k_base")
+        ["This is a sample text", "for truncation."]
+
+    This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit.
    """

    encoding = tiktoken.get_encoding(encoding_name)