fix: script generator and add new benchmarks

2026-06-23 21:00:30 +08:00 · 2024-04-30 11:51:04 +02:00 · 2024-04-30 11:51:04 +02:00 · e3d0194dc9
commit e3d0194dc9
parent 7e81f7c03f
7 changed files with 149 additions and 33 deletions
--- a/examples/benchmarks/GenerateScraper/Readme.md
+++ b/examples/benchmarks/GenerateScraper/Readme.md
@ -1,4 +1,5 @@
 # Local models
+# Local models
 The two websites benchmark are:
 - Example 1:  https://perinim.github.io/projects
 - Example 2: https://www.wired.com (at 17/4/2024)
@ -9,14 +10,12 @@ The time is measured in seconds

 The model run for this benchmark is Mistral on Ollama with nomic-embed-text

-In particular, is tested with ScriptCreatorGraph
-
 | Hardware               | Model                                   | Example 1 | Example 2 |
 | ---------------------- | --------------------------------------- | --------- | --------- |
 | Macbook 14' m1 pro     | Mistral on Ollama with nomic-embed-text | 30.54s    | 35.76s    |
-| Macbook m2 max         | Mistral on Ollama with nomic-embed-text | 18,46s    | 19.59     |
-| Macbook 14' m1 pro<br> | Llama3 on Ollama with nomic-embed-text  | 27.82s    | 29.98s    |
-| Macbook m2 max<br>     | Llama3 on Ollama with nomic-embed-text  | 20.83s    | 12.29s    |
+| Macbook m2 max         | Mistral on Ollama with nomic-embed-text |           |           |
+| Macbook 14' m1 pro<br> | Llama3 on Ollama with nomic-embed-text  | 27.82s    | 29.986s   |
+| Macbook m2 max<br>     | Llama3 on Ollama with nomic-embed-text  |           |           |


 **Note**: the examples on Docker were not run on devices other than the Macbook because the performance is too slow (10 times slower than Ollama).
@ -25,17 +24,20 @@ In particular, is tested with ScriptCreatorGraph
 **URL**: https://perinim.github.io/projects
 **Task**: List me all the projects with their description.

-| Name                | Execution time | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
-| ------------------- | ---------------| ------------ | ------------- | ----------------- | ------------------- | -------------- |
-| gpt-3.5-turbo       | 4.50s          | 1897         | 1802          | 95                | 1                   | 0.002893       |
-| gpt-4-turbo         | 7.88s          | 1920         | 1802          | 118               | 1                   | 0.02156        |
+| Name                        | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
+| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
+| gpt-3.5-turbo               | 24.21                    | 1892         | 1802          | 90                | 1                   | 0.002883       |
+| gpt-4-turbo-preview         | 6.614                    | 1936         | 1802          | 134               | 1                   | 0.02204        |
+| Groq with nomic-embed-text  | 6.71                     | 2201         | 2024          | 177               | 1                   | 0              |

 ### Example 2: Wired
 **URL**: https://www.wired.com
 **Task**: List me all the articles with their description.

-| Name                | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
-| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
-| gpt-3.5-turbo       |   Error (text too long)  |      -       |      -        |         -         |           -         |        -       |
-| gpt-4-turbo         |   Error (TPM limit reach)|      -       |      -        |         -         |           -         |        -       |
+| Name                        | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
+| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
+| gpt-3.5-turbo               |                          |              |               |                   |                     |                |
+| gpt-4-turbo-preview         |                          |              |               |                   |                     |                |
+| Groq with nomic-embed-text  |                          |              |               |                   |                     |                |
+

--- a/examples/benchmarks/GenerateScraper/benchmark_groq.py
+++ b/examples/benchmarks/GenerateScraper/benchmark_groq.py
@ -0,0 +1,61 @@
+""" 
+Basic example of script generation pipeline using ScriptCreatorGraph from text
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Read the text file
+# ************************************************
+files = ["inputs/example_1.txt", "inputs/example_2.txt"]
+tasks = ["List me all the projects with their description.",
+         "List me all the articles with their description."]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "groq/gemma-7b-it",
+        "api_key": groq_key,
+        "temperature": 0
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    },
+    "headless": False,
+    "library": "beautifoulsoup"
+}
+
+
+# ************************************************
+# Create the ScriptCreatorGraph instance and run it
+# ************************************************
+
+for i in range(0, 2):
+    with open(files[i], 'r', encoding="utf-8") as file:
+        text = file.read()
+
+    smart_scraper_graph = ScriptCreatorGraph(
+        prompt=tasks[i],
+        source=text,
+        config=graph_config
+    )
+
+    result = smart_scraper_graph.run()
+    print(result)
+    # ************************************************
+    # Get graph execution info
+    # ************************************************
+
+    graph_exec_info = smart_scraper_graph.get_execution_info()
+    print(prettify_exec_info(graph_exec_info))
--- a/examples/benchmarks/GenerateScraper/benchmark_llama3.py
+++ b/examples/benchmarks/GenerateScraper/benchmark_llama3.py
@ -2,11 +2,8 @@
 Basic example of script generation pipeline using ScriptCreatorGraph from text
 """

-import os
-from dotenv import load_dotenv
 from scrapegraphai.graphs import ScriptCreatorGraph
 from scrapegraphai.utils import prettify_exec_info
-load_dotenv()

 # ************************************************
 # Read the text file
@ -19,8 +16,6 @@ tasks = ["List me all the projects with their description.",
 # Define the configuration for the graph
 # ************************************************

-openai_key = os.getenv("GPT4_KEY")
-

 graph_config = {
    "llm": {
--- a/examples/benchmarks/SmartScraper/Readme.md
+++ b/examples/benchmarks/SmartScraper/Readme.md
@ -5,37 +5,37 @@ The two websites benchmark are:

 Both are stored locally as .txt files, so that the benchmark does not depend on the internet connection

-In particular, is tested with SmartScraper
-
-| Hardware           | Moodel                                  | Example 1 | Example 2 |
+| Hardware           | Model                                   | Example 1 | Example 2 |
 | ------------------ | --------------------------------------- | --------- | --------- |
 | Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 11.60s    | 26.61s    |
 | Macbook m2 max     | Mistral on Ollama with nomic-embed-text | 8.05s     | 12.17s    |
-| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text  | 29.871s   | 35.32s    |
+| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text  | 29.87s    | 35.32s    |
 | Macbook m2 max     | Llama3 on Ollama with nomic-embed-text  | 18.36s    | 78.32s    |

-
 **Note**: the examples on Docker were not run on devices other than the Macbook because the performance is too slow (10 times slower than Ollama). Indeed, the results are the following:

 | Hardware           | Example 1 | Example 2 |
 | ------------------ | --------- | --------- |
-| Macbook 14' m1 pro | 139.89s   | Too long  |
+| Macbook 14' m1 pro | 139.89    | Too long  |
 # Performance on APIs services
 ### Example 1: personal portfolio 
 **URL**: https://perinim.github.io/projects
 **Task**: List me all the projects with their description.

-| Name                | Execution time | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
-| ------------------- | ---------------| ------------ | ------------- | ----------------- | ------------------- | -------------- |
-| gpt-3.5-turbo       | 5.58s          | 445          | 272           | 173               | 1                   | 0.000754       |
-| gpt-4-turbo         | 9.76s          | 445          | 272           | 173               | 1                   | 0.00791        |
+| Name                        | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
+| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
+| gpt-3.5-turbo               | 25.22                    | 445          | 272           | 173               | 1                   | 0.000754       |
+| gpt-4-turbo-preview         | 9.53                     | 449          | 272           | 177               | 1                   | 0.00803        |
+| Groq with nomic-embed-text  | 1.99                     | 474          | 284           | 190               | 1                   | 0              |

 ### Example 2: Wired
 **URL**: https://www.wired.com
 **Task**: List me all the articles with their description.

-| Name                | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
-| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
-| gpt-3.5-turbo       | 6.50                     | 2442         | 2199          | 243               | 1                   | 0.003784       |
-| gpt-4-turbo         | 76.07                    | 3521         | 2199          | 1322              | 1                   | 0.06165        |
+| Name                        | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
+| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
+| gpt-3.5-turbo               | 25.89                    | 445          | 272           | 173               | 1                   | 0.000754       |
+| gpt-4-turbo-preview         | 64.70                    | 3573         | 2199          | 1374              | 1                   | 0.06321        |
+| Groq with nomic-embed-text  | 3.82                     | 2459         | 2192          | 267               | 1                   | 0              |
+

--- a/examples/benchmarks/SmartScraper/benchmark_groq.py
+++ b/examples/benchmarks/SmartScraper/benchmark_groq.py
@ -0,0 +1,57 @@
+""" 
+Basic example of scraping pipeline using SmartScraper from text
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+files = ["inputs/example_1.txt", "inputs/example_2.txt"]
+tasks = ["List me all the projects with their description.",
+         "List me all the articles with their description."]
+
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "groq/gemma-7b-it",
+        "api_key": groq_key,
+        "temperature": 0
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    },
+    "headless": False
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+for i in range(0, 2):
+    with open(files[i], 'r', encoding="utf-8") as file:
+        text = file.read()
+
+    smart_scraper_graph = SmartScraperGraph(
+        prompt=tasks[i],
+        source=text,
+        config=graph_config
+    )
+
+    result = smart_scraper_graph.run()
+    print(result)
+    # ************************************************
+    # Get graph execution info
+    # ************************************************
+
+    graph_exec_info = smart_scraper_graph.get_execution_info()
+    print(prettify_exec_info(graph_exec_info))
--- a/examples/benchmarks/SmartScraper/benchmark_llama3.py
+++ b/examples/benchmarks/SmartScraper/benchmark_llama3.py
@ -2,7 +2,6 @@
 Basic example of scraping pipeline using SmartScraper from text
 """

-import os
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info

--- a/scrapegraphai/graphs/script_creator_graph.py
+++ b/scrapegraphai/graphs/script_creator_graph.py
@ -34,6 +34,8 @@ class ScriptCreatorGraph(AbstractGraph):
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
+            node_config={
+                "headless": True if self.config is None else self.config.get("headless", True)}
        )
        parse_node = ParseNode(
            input="doc",