docs: update utils docstrings

2026-06-23 21:00:30 +08:00 · 2024-05-01 12:35:12 +02:00 · 2024-05-01 12:35:12 +02:00 · cf038b33ea
commit cf038b33ea
parent 96975b2e36
9 changed files with 127 additions and 59 deletions
--- a/scrapegraphai/utils/convert_to_csv.py
+++ b/scrapegraphai/utils/convert_to_csv.py
@@ -6,20 +6,27 @@ import sys
 import pandas as pd
-def convert_to_csv(data: dict, filename: str, position: str = None):
+def convert_to_csv(data: dict, filename: str, position: str = None) -> None:
    """
-    Converts a dictionary to a CSV file and saves it.
+    Converts a dictionary to a CSV file and saves it at a specified location.
    Args:
-    data (dict): Data to be converted to CSV.
+        data (dict): The data to be converted into CSV format.
-    position (str): Optional path where the file should be saved. If not provided,
+        filename (str): The name of the output CSV file, without the '.csv' extension.
-    the directory of the caller script will be used.
+        position (str, optional): The file path where the CSV should be saved. Defaults to the directory of the caller script if not provided.
    Returns:
        None: The function does not return anything.
    Raises:
-    FileNotFoundError: If the specified directory does not exist.
+        FileNotFoundError: If the specified directory does not exist.
-    PermissionError: If the program lacks write permission for the directory.
+        PermissionError: If write permissions are lacking for the directory.
-    TypeError: If the input data is not a dictionary.
+        TypeError: If `data` is not a dictionary.
-    Exception: For other potential errors during DataFrame creation or CSV saving.
+        Exception: For other issues that may arise during the creation or saving of the CSV file.
    Example:
        >>> convert_to_csv({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
        Saves a CSV file named 'output.csv' at '/path/to/save'.
    """
    if ".csv" in filename:
--- a/scrapegraphai/utils/convert_to_json.py
+++ b/scrapegraphai/utils/convert_to_json.py
@@ -6,23 +6,33 @@ import os
 import sys
-def convert_to_json(data: dict, filename: str, position: str = None):
+def convert_to_json(data: dict, filename: str, position: str = None) -> None:
    """
-    Convert data to JSON format and save it to a file.
+    Converts a dictionary to a JSON file and saves it at a specified location.
    Args:
-    data (dict): Data to save.
+        data (dict): The data to be converted into JSON format.
-    filename (str): Name of the file to save without .json extension.
+        filename (str): The name of the output JSON file, without the '.json' extension.
-    position (str): Directory where the file should be saved. If None, 
+        position (str, optional): The file path where the JSON file should be saved. Defaults to the directory of the caller script if not provided.
-    the directory of the caller script will be used.
    Returns:
        None: The function does not return anything.
    Raises:
-    ValueError: If filename contains '.json'.
+        ValueError: If 'filename' contains '.json'.
-    FileNotFoundError: If the specified directory does not exist.
+        FileNotFoundError: If the specified directory does not exist.
-    PermissionError: If the program does not have permission to write to the directory.
+        PermissionError: If write permissions are lacking for the directory.
    Example:
        >>> convert_to_json({'id': [1, 2], 'value': [10, 20]}, 'output', '/path/to/save')
        Saves a JSON file named 'output.json' at '/path/to/save'.
    Notes:
        This function automatically ensures the directory exists before attempting to write the file. If the directory does not exist, it will attempt to create it.
    """
    if ".json" in filename:
-        filename = filename.replace(".json", "")  # Remove .csv extension
+        filename = filename.replace(".json", "")  # Remove .json extension
  # Get the directory of the caller script
    if position is None:
--- a/scrapegraphai/utils/parse_state_keys.py
+++ b/scrapegraphai/utils/parse_state_keys.py
@@ -4,12 +4,30 @@ Parse_state_key module
 import re
-def parse_expression(expression, state: dict):
+def parse_expression(expression, state: dict) -> list:
-    """ 
-    Function for parsing the expressions
-    Args:
-        state (dict): state to elaborate
    """
+    Parses a complex boolean expression involving state keys.
+    Args:
+        expression (str): The boolean expression to parse.
+        state (dict): Dictionary of state keys used to evaluate the expression.
+    Raises:
+        ValueError: If the expression is empty, has adjacent state keys without operators, invalid operator usage,
+                    unbalanced parentheses, or if no state keys match the expression.
+    Returns:
+        list: A list of state keys that match the boolean expression, ensuring each key appears only once.
+    Example:
+        >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)", 
+                            {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None})
+        ['user_input', 'relevant_chunks', 'parsed_document', 'document']
+    This function evaluates the expression to determine the logical inclusion of state keys based on provided boolean logic.
+    It checks for syntax errors such as unbalanced parentheses, incorrect adjacency of operators, and empty expressions.
+    """
    # Check for empty expression
    if not expression:
        raise ValueError("Empty expression.")
--- a/scrapegraphai/utils/prettify_exec_info.py
+++ b/scrapegraphai/utils/prettify_exec_info.py
@@ -7,13 +7,17 @@ import pandas as pd
 def prettify_exec_info(complete_result: list[dict]) -> pd.DataFrame:
    """
-    Transform the execution information of the graph into a DataFrame for better visualization.
+    Transforms the execution information of a graph into a DataFrame for enhanced visualization.
    Args:
-    - complete_result (list[dict]): The complete execution information of the graph.
+        complete_result (list[dict]): The complete execution information of the graph.
    Returns:
-    - pd.DataFrame: The execution information of the graph in a DataFrame.
+        pd.DataFrame: A DataFrame that organizes the execution information for better readability and analysis.
    Example:
        >>> prettify_exec_info([{'node': 'A', 'status': 'success'}, {'node': 'B', 'status': 'failure'}])
        DataFrame with columns 'node' and 'status' showing execution results for each node.
    """
    df_nodes = pd.DataFrame(complete_result)
--- a/scrapegraphai/utils/proxy_rotation.py
+++ b/scrapegraphai/utils/proxy_rotation.py
@@ -4,26 +4,29 @@ Module for rotating proxies
 from fp.fp import FreeProxy
-def proxy_generator(num_ips: int):
+def proxy_generator(num_ips: int) -> list:
    """
-    Rotates through a specified number of proxy IPs using the FreeProxy library.
+    Generates a specified number of proxy IP addresses using the FreeProxy library.
    Args:
-        num_ips (int): The number of proxy IPs to rotate through.
+        num_ips (int): The number of proxy IPs to generate and rotate through.
    Returns:
-        dict: A dictionary containing the rotated proxy IPs, indexed by their position in rotation.
+        list: A list of proxy IP addresses.
    Example:
        >>> proxy_generator(5)
-        {
+        [
-            0: '192.168.1.1:8080',
+            '192.168.1.1:8080',
-            1: '103.10.63.135:8080',
+            '103.10.63.135:8080',
-            2: '176.9.75.42:8080',
+            '176.9.75.42:8080',
-            3: '37.57.216.2:8080',
+            '37.57.216.2:8080',
-            4: '113.20.31.250:8080'
+            '113.20.31.250:8080'
-        }
+        ]
    This function fetches fresh proxies and indexes them, making it easy to manage multiple proxy configurations.
    """
    res = []
    for i in range(0, num_ips):
--- a/scrapegraphai/utils/remover.py
+++ b/scrapegraphai/utils/remover.py
@@ -7,15 +7,20 @@ from minify_html import minify
 def remover(html_content: str) -> str:
    """
-    This function processes HTML content, removes unnecessary tags 
+    Processes HTML content by removing unnecessary tags (including style tags), minifying the HTML, and extracting the title and body content.
-    (including style tags), minifies the HTML, and retrieves the 
-    title and body content.
-    Parameters:
+    Args:
-        html_content (str): The HTML content to parse
+        html_content (str): The HTML content to be processed.
    Returns:
-        str: The parsed title followed by the minified body content
+        str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so.
    Example:
        >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
        >>> remover(html_content)
        'Title: Example, Body: <body><p>Hello World!</p></body>'
    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -8,16 +8,25 @@ from googlesearch import search
 def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
-    """ 
+    """
-    Function that given a query it finds it on the intenet
+    Searches the web for a given query using the specified search engine.
    Args:
-        query (str): query to search on internet
+        query (str): The search query to find on the internet.
-        search_engine (str, optional): type of browser, it could be DuckDuckGo or Google,
+        search_engine (str, optional): Specifies the search engine to use, options include 'Google' or 'DuckDuckGo'. Default is 'Google'.
-            default: Google
+        max_results (int, optional): The maximum number of search results to return.
-        max_results (int, optional): maximum number of results
    Returns:
-        List[str]: List of strings of web link
+        List[str]: A list of URLs as strings that are the search results.
    Raises:
        ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'.
    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs.
    """
    if search_engine == "Google":
--- a/scrapegraphai/utils/save_audio_from_bytes.py
+++ b/scrapegraphai/utils/save_audio_from_bytes.py
@@ -7,12 +7,18 @@ from typing import Union
 def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -> None:
    """
-    Saves the byte response as an audio file.
+    Saves the byte response as an audio file to the specified path.
    Args:
-        byte_response (bytes): The byte response containing the generated speech.
+        byte_response (bytes): The byte array containing audio data.
-        output_path (str or Path): The file path where the generated speech should be saved.
+        output_path (Union[str, Path]): The destination file path where the audio file will be saved.
    Example:
        >>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3')
    This function writes the byte array containing audio data to a file, saving it as an audio file.
    """
    if not isinstance(output_path, Path):
        output_path = Path(output_path)
--- a/scrapegraphai/utils/token_calculator.py
+++ b/scrapegraphai/utils/token_calculator.py
@@ -8,15 +8,21 @@ from ..helpers.models_tokens import models_tokens
 def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
    """
-    It creates a list of strings to create max dimension tokenizable elements
+    Truncates text into chunks that are small enough to be processed by the specified LLM model.
    Args:
-        text (str): The input text to be truncated into tokenizable elements.
+        text (str): The input text to be truncated.
-        model (str): The name of the language model to be used.
+        model (str): The name of the LLM used to determine the maximum token limit.
-        encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
+        encoding_name (str): The encoding strategy used to encode the text before truncation.
    Returns:
-        List[str]: A list of tokenizable elements created from the input text.
+        List[str]: A list of text chunks, each within the token limit of the specified model.
    Example:
        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
        ["This is a sample text", "for truncation."]
    This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit.
    """
    encoding = tiktoken.get_encoding(encoding_name)