From e3d0194dc93b20dc254fc48bba11559bf8a3a185 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Tue, 30 Apr 2024 11:51:04 +0200 Subject: [PATCH] fix: script generator and add new benchmarks --- examples/benchmarks/GenerateScraper/Readme.md | 28 +++++---- .../GenerateScraper/benchmark_groq.py | 61 +++++++++++++++++++ .../GenerateScraper/benchmark_llama3.py | 5 -- examples/benchmarks/SmartScraper/Readme.md | 28 ++++----- .../benchmarks/SmartScraper/benchmark_groq.py | 57 +++++++++++++++++ .../SmartScraper/benchmark_llama3.py | 1 - scrapegraphai/graphs/script_creator_graph.py | 2 + 7 files changed, 149 insertions(+), 33 deletions(-) create mode 100644 examples/benchmarks/GenerateScraper/benchmark_groq.py create mode 100644 examples/benchmarks/SmartScraper/benchmark_groq.py diff --git a/examples/benchmarks/GenerateScraper/Readme.md b/examples/benchmarks/GenerateScraper/Readme.md index 7da6245c..79201d22 100644 --- a/examples/benchmarks/GenerateScraper/Readme.md +++ b/examples/benchmarks/GenerateScraper/Readme.md @@ -1,4 +1,5 @@ # Local models +# Local models The two websites benchmark are: - Example 1: https://perinim.github.io/projects - Example 2: https://www.wired.com (at 17/4/2024) @@ -9,14 +10,12 @@ The time is measured in seconds The model runned for this benchmark is Mistral on Ollama with nomic-embed-text -In particular, is tested with ScriptCreatorGraph - | Hardware | Model | Example 1 | Example 2 | | ---------------------- | --------------------------------------- | --------- | --------- | | Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 30.54s | 35.76s | -| Macbook m2 max | Mistral on Ollama with nomic-embed-text | 18,46s | 19.59 | -| Macbook 14' m1 pro
| Llama3 on Ollama with nomic-embed-text | 27.82s | 29.98s | -| Macbook m2 max
| Llama3 on Ollama with nomic-embed-text | 20.83s | 12.29s | +| Macbook m2 max | Mistral on Ollama with nomic-embed-text | | | +| Macbook 14' m1 pro
| Llama3 on Ollama with nomic-embed-text | 27.82s | 29.986s | +| Macbook m2 max
| Llama3 on Ollama with nomic-embed-text | | | **Note**: the examples on Docker are not runned on other devices than the Macbook because the performance are to slow (10 times slower than Ollama). @@ -25,17 +24,20 @@ In particular, is tested with ScriptCreatorGraph **URL**: https://perinim.github.io/projects **Task**: List me all the projects with their description. -| Name | Execution time | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| ------------------- | ---------------| ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | 4.50s | 1897 | 1802 | 95 | 1 | 0.002893 | -| gpt-4-turbo | 7.88s | 1920 | 1802 | 118 | 1 | 0.02156 | +| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | +| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | +| gpt-3.5-turbo | 24.21 | 1892 | 1802 | 90 | 1 | 0.002883 | +| gpt-4-turbo-preview | 6.614 | 1936 | 1802 | 134 | 1 | 0.02204 | +| Groq with nomic-embed-text | 6.71 | 2201 | 2024 | 177 | 1 | 0 | ### Example 2: Wired **URL**: https://www.wired.com **Task**: List me all the articles with their description. 
-| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | Error (text too long) | - | - | - | - | - | -| gpt-4-turbo | Error (TPM limit reach)| - | - | - | - | - | +| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | +| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | +| gpt-3.5-turbo | | | | | | | +| gpt-4-turbo-preview | | | | | | | +| Groq with nomic-embed-text | | | | | | | + diff --git a/examples/benchmarks/GenerateScraper/benchmark_groq.py b/examples/benchmarks/GenerateScraper/benchmark_groq.py new file mode 100644 index 00000000..bef4e8b6 --- /dev/null +++ b/examples/benchmarks/GenerateScraper/benchmark_groq.py @@ -0,0 +1,61 @@ +""" +Basic example of scraping pipeline using ScriptCreatorGraph from text +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the text file +# ************************************************ +files = ["inputs/example_1.txt", "inputs/example_2.txt"] +tasks = ["List me all the projects with their description.", + "List me all the articles with their description."] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", 
# set ollama URL arbitrarily + }, + "headless": False, + "library": "beautifoulsoup" +} + + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +for i in range(0, 2): + with open(files[i], 'r', encoding="utf-8") as file: + text = file.read() + + smart_scraper_graph = ScriptCreatorGraph( + prompt=tasks[i], + source=text, + config=graph_config + ) + + result = smart_scraper_graph.run() + print(result) + # ************************************************ + # Get graph execution info + # ************************************************ + + graph_exec_info = smart_scraper_graph.get_execution_info() + print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/GenerateScraper/benchmark_llama3.py b/examples/benchmarks/GenerateScraper/benchmark_llama3.py index 46717acc..a80b2e71 100644 --- a/examples/benchmarks/GenerateScraper/benchmark_llama3.py +++ b/examples/benchmarks/GenerateScraper/benchmark_llama3.py @@ -2,11 +2,8 @@ Basic example of scraping pipeline using SmartScraper from text """ -import os -from dotenv import load_dotenv from scrapegraphai.graphs import ScriptCreatorGraph from scrapegraphai.utils import prettify_exec_info -load_dotenv() # ************************************************ # Read the text file @@ -19,8 +16,6 @@ tasks = ["List me all the projects with their description.", # Define the configuration for the graph # ************************************************ -openai_key = os.getenv("GPT4_KEY") - graph_config = { "llm": { diff --git a/examples/benchmarks/SmartScraper/Readme.md b/examples/benchmarks/SmartScraper/Readme.md index 833ac680..9166dfec 100644 --- a/examples/benchmarks/SmartScraper/Readme.md +++ b/examples/benchmarks/SmartScraper/Readme.md @@ -5,37 +5,37 @@ The two websites benchmark are: Both are strored locally as txt file in .txt format because in this way we do not have to think about the internet connection -In 
particular, is tested with SmartScraper - -| Hardware | Moodel | Example 1 | Example 2 | +| Hardware | Model | Example 1 | Example 2 | | ------------------ | --------------------------------------- | --------- | --------- | | Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 11.60s | 26.61s | | Macbook m2 max | Mistral on Ollama with nomic-embed-text | 8.05s | 12.17s | -| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text | 29.871s | 35.32s | +| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text | 29.87s | 35.32s | | Macbook m2 max | Llama3 on Ollama with nomic-embed-text | 18.36s | 78.32s | - **Note**: the examples on Docker are not runned on other devices than the Macbook because the performance are to slow (10 times slower than Ollama). Indeed the results are the following: | Hardware | Example 1 | Example 2 | | ------------------ | --------- | --------- | -| Macbook 14' m1 pro | 139.89s | Too long | +| Macbook 14' m1 pro | 139.89 | Too long | # Performance on APIs services ### Example 1: personal portfolio **URL**: https://perinim.github.io/projects **Task**: List me all the projects with their description. 
-| Name | Execution time | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| ------------------- | ---------------| ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | 5.58s | 445 | 272 | 173 | 1 | 0.000754 | -| gpt-4-turbo | 9.76s | 445 | 272 | 173 | 1 | 0.00791 | +| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | +| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | +| gpt-3.5-turbo | 25.22 | 445 | 272 | 173 | 1 | 0.000754 | +| gpt-4-turbo-preview | 9.53 | 449 | 272 | 177 | 1 | 0.00803 | +| Groq with nomic-embed-text | 1.99 | 474 | 284 | 190 | 1 | 0 | ### Example 2: Wired **URL**: https://www.wired.com **Task**: List me all the articles with their description. -| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | -| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | -| gpt-3.5-turbo | 6.50 | 2442 | 2199 | 243 | 1 | 0.003784 | -| gpt-4-turbo | 76.07 | 3521 | 2199 | 1322 | 1 | 0.06165 | +| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | +| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | +| gpt-3.5-turbo | 25.89 | 445 | 272 | 173 | 1 | 0.000754 | +| gpt-4-turbo-preview | 64.70 | 3573 | 2199 | 1374 | 1 | 0.06321 | +| Groq with nomic-embed-text | 3.82 | 2459 | 2192 | 267 | 1 | 0 | + diff --git a/examples/benchmarks/SmartScraper/benchmark_groq.py b/examples/benchmarks/SmartScraper/benchmark_groq.py new file mode 100644 index 00000000..e769ee52 --- /dev/null +++ 
b/examples/benchmarks/SmartScraper/benchmark_groq.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +files = ["inputs/example_1.txt", "inputs/example_2.txt"] +tasks = ["List me all the projects with their description.", + "List me all the articles with their description."] + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "headless": False +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +for i in range(0, 2): + with open(files[i], 'r', encoding="utf-8") as file: + text = file.read() + + smart_scraper_graph = SmartScraperGraph( + prompt=tasks[i], + source=text, + config=graph_config + ) + + result = smart_scraper_graph.run() + print(result) + # ************************************************ + # Get graph execution info + # ************************************************ + + graph_exec_info = smart_scraper_graph.get_execution_info() + print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/SmartScraper/benchmark_llama3.py b/examples/benchmarks/SmartScraper/benchmark_llama3.py index 45e898dc..2b182f20 100644 --- a/examples/benchmarks/SmartScraper/benchmark_llama3.py +++ b/examples/benchmarks/SmartScraper/benchmark_llama3.py @@ -2,7 +2,6 @@ Basic example of scraping pipeline using SmartScraper from text """ -import os from 
scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index fa86eeb4..1a64512e 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -34,6 +34,8 @@ class ScriptCreatorGraph(AbstractGraph): fetch_node = FetchNode( input="url | local_dir", output=["doc"], + node_config={ + "headless": True if self.config is None else self.config.get("headless", True)} ) parse_node = ParseNode( input="doc",