diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py
index 01448a5b..4f0952ae 100644
--- a/examples/openai/smart_scraper_openai.py
+++ b/examples/openai/smart_scraper_openai.py
@@ -19,7 +19,7 @@ openai_key = os.getenv("OPENAI_APIKEY")
 graph_config = {
     "llm": {
         "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
+        "model": "gpt-4o",
     },
     "verbose": True,
     "headless": False,
@@ -30,7 +30,7 @@ graph_config = {
 # ************************************************
 
 smart_scraper_graph = SmartScraperGraph(
-    prompt="List me all the projects with their description.",
+    prompt="List me all the projects with their description",
     # also accepts a string with the already downloaded HTML code
     source="https://perinim.github.io/projects/",
     config=graph_config
diff --git a/examples/single_node/image2text_node.py b/examples/single_node/image2text_node.py
new file mode 100644
index 00000000..8fc20991
--- /dev/null
+++ b/examples/single_node/image2text_node.py
@@ -0,0 +1,51 @@
+"""
+Example of ImageToTextNode
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.nodes import ImageToTextNode
+from scrapegraphai.models import OpenAIImageToText
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": openai_key,
+        "model": "gpt-4o",
+        "temperature": 0,
+    },
+}
+
+# ************************************************
+# Define the node
+# ************************************************
+
+llm_model = OpenAIImageToText(graph_config["llm"])
+
+image_to_text_node = ImageToTextNode(
+    input="img_url",
+    output=["img_desc"],
+    node_config={
+        "llm_model": llm_model,
+        "headless": False
+    }
+)
+
+# ************************************************
+# Test the node
+# ************************************************
+
+state = {
+    "img_url": "https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/assets/scrapegraphai_logo.png?raw=true"
+}
+
+result = image_to_text_node.execute(state)
+
+print(result)
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index ec83e1fb..f8881d75 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -18,6 +18,7 @@ models_tokens = {
         "gpt-4-0613": 8192,
         "gpt-4-32k": 32768,
         "gpt-4-32k-0613": 32768,
+        "gpt-4o": 128000,
     },
     "azure": {
         "gpt-3.5-turbo": 4096,
diff --git a/scrapegraphai/nodes/image_descriptor_node.py b/scrapegraphai/nodes/image_descriptor_node.py
new file mode 100644
index 00000000..5149b795
--- /dev/null
+++ b/scrapegraphai/nodes/image_descriptor_node.py
@@ -0,0 +1,82 @@
+"""
+ImageDescriptorNode Module
+"""
+
+from typing import List, Optional
+from .base_node import BaseNode
+
+
+class ImageDescriptorNode(BaseNode):
+    """
+    Retrieve images from a list of URLs and return a description of the images using an image-to-text model.
+
+    Attributes:
+        llm_model: An instance of the language model client used for image-to-text conversion.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        max_images (int): Maximum number of URLs described per execution, defaulting to 5.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node; must contain "llm_model".
+        node_name (str): The unique identifier name for the node, defaulting to "ImageDescriptor".
+
+    Raises:
+        ValueError: If node_config is not provided.
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "ImageDescriptor",
+    ):
+        super().__init__(node_name, "node", input, output, 1, node_config)
+
+        # The signature keeps node_config optional for compatibility, but the node
+        # cannot work without a model, so fail with a clear message instead of an
+        # opaque "'NoneType' object is not subscriptable" error.
+        if node_config is None:
+            raise ValueError(
+                "ImageDescriptorNode requires a node_config containing 'llm_model'"
+            )
+
+        self.llm_model = node_config["llm_model"]
+        self.verbose = node_config.get("verbose", False)
+        self.max_images = node_config.get("max_images", 5)
+
+    def execute(self, state: dict) -> dict:
+        """
+        Generate a text description for each image URL using the image-to-text model.
+
+        Args:
+            state (dict): The current state of the graph. The input keys will be used to fetch the
+                            correct data types from the state.
+
+        Returns:
+            dict: The updated state with the first output key holding the list of descriptions
+                            (at most max_images entries). The state is returned unchanged when
+                            there are no URLs.
+        """
+
+        if self.verbose:
+            print(f"--- Executing {self.node_name} Node ---")
+
+        input_keys = self.get_input_keys(state)
+        input_data = [state[key] for key in input_keys]
+        urls = input_data[0]
+
+        # Nothing to describe: leave the state untouched.
+        if not urls:
+            return state
+
+        # A single URL may arrive as a bare string; wrap it so it is treated as one
+        # image instead of being sliced and iterated character by character.
+        if isinstance(urls, str):
+            urls = [urls]
+
+        img_desc = [self.llm_model.run(url) for url in urls[:self.max_images]]
+
+        state.update({self.output[0]: img_desc})
+        return state