From 99dc8497d85289759286a973e4aecc3f924d3ada Mon Sep 17 00:00:00 2001
From: Marco Perini <perinim.98@gmail.com>
Date: Sat, 20 Jul 2024 20:02:26 +0200
Subject: [PATCH] docs(gpt-4o-mini): add gpt-4o-mini model, fix chromium lazy
 loading, add documentation and telemetry metrics

---
 docs/source/scrapers/graph_config.rst      |  1 +
 docs/source/scrapers/telemetry.rst         |  5 ++++
 scrapegraphai/docloaders/chromium.py       |  2 +-
 scrapegraphai/graphs/base_graph.py         | 35 +++++++++++++++++++++-
 scrapegraphai/helpers/models_tokens.py     |  4 ++-
 scrapegraphai/nodes/graph_iterator_node.py |  3 +-
 scrapegraphai/telemetry/telemetry.py       |  7 ++++-
 7 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/docs/source/scrapers/graph_config.rst b/docs/source/scrapers/graph_config.rst
index 9e1d49e0..e16ccae7 100644
--- a/docs/source/scrapers/graph_config.rst
+++ b/docs/source/scrapers/graph_config.rst
@@ -14,7 +14,8 @@ Some interesting ones are:
 - `burr_kwargs`: A dictionary with additional parameters to enable `Burr` graphical user interface.
 - `max_images`: The maximum number of images to be analyzed. Useful in `OmniScraperGraph` and `OmniSearchGraph`.
 - `cache_path`: The path where the cache files will be saved. If already exists, the cache will be loaded from this path.
+- `additional_info`: Add additional text to default prompts defined in the graphs.
 
 .. _Burr:
 
 Burr Integration
diff --git a/docs/source/scrapers/telemetry.rst b/docs/source/scrapers/telemetry.rst
index f5e9f27c..a80eb3b6 100644
--- a/docs/source/scrapers/telemetry.rst
+++ b/docs/source/scrapers/telemetry.rst
@@ -27,8 +27,13 @@ Additionally, the following properties are collected:
        "llm_model": llm_model_name,
        "embedder_model": embedder_model_name,
        "source_type": source_type,
+       "source": source,
        "execution_time": execution_time,
+       "prompt": prompt,
+       "schema": schema,
        "error_node": error_node_name,
+       "exception": exception,
+       "response": response,
        "total_tokens": total_tokens,
    }
 
diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 579933e6..474c22de 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -82,7 +82,7 @@ class ChromiumLoader(BaseLoader):
                 context = await browser.new_context()
                 await Malenia.apply_stealth(context)
                 page = await context.new_page()
-                await page.goto(url)
+                await page.goto(url, wait_until="domcontentloaded")
                 await page.wait_for_load_state(self.load_state)
                 results = await page.content()  # Simply get the HTML content
                 logger.info("Content scraped")
diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py
index df2845fa..e6df2dec 100644
--- a/scrapegraphai/graphs/base_graph.py
+++ b/scrapegraphai/graphs/base_graph.py
@@ -106,18 +106,32 @@ class BaseGraph:
         source_type = None
         llm_model = None
         embedder_model = None
+        source = []
+        prompt = None
+        schema = None
 
         while current_node_name:
             curr_time = time.time()
             current_node = next(node for node in self.nodes if node.node_name == current_node_name)
 
+            
             # check if there is a "source" key in the node config
             if current_node.__class__.__name__ == "FetchNode":
                 # get the second key name of the state dictionary
                 source_type = list(state.keys())[1]
+                if state.get("user_prompt", None):
+                    prompt = state["user_prompt"] if type(state["user_prompt"]) == str else None
                 # quick fix for local_dir source type
                 if source_type == "local_dir":
                     source_type = "html_dir"
+                elif source_type == "url":
+                    if type(state[source_type]) == list:
+                        # keep only the entries of the url list that are strings
+                        for url in state[source_type]:
+                            if type(url) == str:
+                                source.append(url)
+                    elif type(state[source_type]) == str:
+                        source.append(state[source_type])
 
             # check if there is an "llm_model" variable in the class
             if hasattr(current_node, "llm_model") and llm_model is None:
@@ -135,6 +149,16 @@ class BaseGraph:
                 elif hasattr(embedder_model, "model"):
                     embedder_model = embedder_model.model
 
+            if hasattr(current_node, "node_config"):
+                if type(current_node.node_config) is dict:
+                    if current_node.node_config.get("schema", None) and schema is None:
+                        if type(current_node.node_config["schema"]) is not dict:
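+                            # not a dict: try its .schema() method to get one (NOTE(review): a schema that is already a dict is skipped and never recorded)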
+                            try:
+                                schema = current_node.node_config["schema"].schema()
+                            except Exception as e:
+                                schema = None
+
             with get_openai_callback() as cb:
                 try:
                     result = current_node.execute(state)
@@ -144,11 +168,15 @@ class BaseGraph:
                     graph_execution_time = time.time() - start_time
                     log_graph_execution(
                         graph_name=self.graph_name,
+                        source=source,
+                        prompt=prompt,
+                        schema=schema,
                         llm_model=llm_model,
                         embedder_model=embedder_model,
                         source_type=source_type,
                         execution_time=graph_execution_time,
-                        error_node=error_node
+                        error_node=error_node,
+                        exception=str(e)
                     )
                     raise e
                 node_exec_time = time.time() - curr_time
@@ -191,11 +219,16 @@ class BaseGraph:
 
         # Log the graph execution telemetry
         graph_execution_time = time.time() - start_time
+        response = state.get("answer", None) if source_type == "url" else None
         log_graph_execution(
             graph_name=self.graph_name,
+            source=source,
+            prompt=prompt,
+            schema=schema,
             llm_model=llm_model,
             embedder_model=embedder_model,
             source_type=source_type,
+            response=response,
             execution_time=graph_execution_time,
             total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,
         )
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index 7becbf8b..ab96c46a 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -16,9 +16,10 @@ models_tokens = {
         "gpt-4-32k": 32768,
         "gpt-4-32k-0613": 32768,
         "gpt-4o": 128000,
+        "gpt-4o-mini":128000,
     },
     "azure": {
-          "gpt-3.5-turbo-0125": 16385,
+        "gpt-3.5-turbo-0125": 16385,
         "gpt-3.5": 4096,
         "gpt-3.5-turbo": 16385,
         "gpt-3.5-turbo-1106": 16385,
@@ -34,6 +35,7 @@ models_tokens = {
         "gpt-4-32k": 32768,
         "gpt-4-32k-0613": 32768,
         "gpt-4o": 128000,
+        "gpt-4o-mini":128000,
     },
     "gemini": {
         "gemini-pro": 128000,
diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py
index 7e0872e3..061be77a 100644
--- a/scrapegraphai/nodes/graph_iterator_node.py
+++ b/scrapegraphai/nodes/graph_iterator_node.py
@@ -126,7 +126,8 @@ class GraphIteratorNode(BaseNode):
         for url in urls:
             instance = copy.copy(graph_instance)
             instance.source = url
-
+            if url.startswith("http"):
+                instance.input_key = "url"
             participants.append(instance)
 
         futures = [_async_run(graph) for graph in participants]
diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py
index 20d38186..d1c8a367 100644
--- a/scrapegraphai/telemetry/telemetry.py
+++ b/scrapegraphai/telemetry/telemetry.py
@@ -156,14 +156,19 @@ def log_event(event: str, properties: Dict[str, any]):
         send_event_json(event_json)
 
 
-def log_graph_execution(graph_name: str, llm_model: str, embedder_model: str, source_type: str, execution_time: float, error_node: str = None, total_tokens: int = None):
+def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
     properties = {
         "graph_name": graph_name,
+        "source": source,
+        "prompt": prompt,
+        "schema": schema,
         "llm_model": llm_model,
         "embedder_model": embedder_model,
         "source_type": source_type,
+        "response": response,
         "execution_time": execution_time,
         "error_node": error_node,
+        "exception": exception,
         "total_tokens": total_tokens,
     }
     log_event("graph_execution", properties)