From d3e2eb6ea5729477651a700bded20297ceb07857 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Sat, 20 Apr 2024 22:49:25 +0200 Subject: [PATCH] add new benchmarks --- .../{ => GenerateScraper}/.env.example | 0 examples/benchmarks/GenerateScraper/Readme.md | 41 ++ .../GenerateScraper/benchmark_docker.py | 0 .../GenerateScraper/benchmark_ollama.py | 62 +++ .../GenerateScraper/benchmark_openai_gpt35.py | 53 +++ .../GenerateScraper/benchmark_openai_gpt4.py | 53 +++ .../inputs/example_1.txt | 0 .../inputs/example_2.txt | 0 examples/benchmarks/SmartScraper/.env.example | 1 + examples/benchmarks/SmartScraper/Readme.md | 2 + .../{ => SmartScraper}/benchmark_docker.py | 0 .../{ => SmartScraper}/benchmark_ollama.py | 0 .../benchmark_openai_gpt35.py | 0 .../benchmark_openai_gpt4.py | 0 .../SmartScraper/inputs/example_1.txt | 105 +++++ .../SmartScraper/inputs/example_2.txt | 400 ++++++++++++++++++ examples/benchmarks/readme.md | 6 +- 17 files changed, 721 insertions(+), 2 deletions(-) rename examples/benchmarks/{ => GenerateScraper}/.env.example (100%) create mode 100644 examples/benchmarks/GenerateScraper/Readme.md create mode 100644 examples/benchmarks/GenerateScraper/benchmark_docker.py create mode 100644 examples/benchmarks/GenerateScraper/benchmark_ollama.py create mode 100644 examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py create mode 100644 examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py rename examples/benchmarks/{ => GenerateScraper}/inputs/example_1.txt (100%) rename examples/benchmarks/{ => GenerateScraper}/inputs/example_2.txt (100%) create mode 100644 examples/benchmarks/SmartScraper/.env.example create mode 100644 examples/benchmarks/SmartScraper/Readme.md rename examples/benchmarks/{ => SmartScraper}/benchmark_docker.py (100%) rename examples/benchmarks/{ => SmartScraper}/benchmark_ollama.py (100%) rename examples/benchmarks/{ => SmartScraper}/benchmark_openai_gpt35.py (100%) rename examples/benchmarks/{ => 
SmartScraper}/benchmark_openai_gpt4.py (100%) create mode 100644 examples/benchmarks/SmartScraper/inputs/example_1.txt create mode 100644 examples/benchmarks/SmartScraper/inputs/example_2.txt diff --git a/examples/benchmarks/.env.example b/examples/benchmarks/GenerateScraper/.env.example similarity index 100% rename from examples/benchmarks/.env.example rename to examples/benchmarks/GenerateScraper/.env.example diff --git a/examples/benchmarks/GenerateScraper/Readme.md b/examples/benchmarks/GenerateScraper/Readme.md new file mode 100644 index 00000000..ec897dc9 --- /dev/null +++ b/examples/benchmarks/GenerateScraper/Readme.md @@ -0,0 +1,41 @@ +# Local models +The two benchmarked websites are: +- Example 1: https://perinim.github.io/projects +- Example 2: https://www.wired.com (at 17/4/2024) + +Both are stored locally as .txt files, so the benchmarks do not depend on the internet connection + +The time is measured in seconds + +The model run for this benchmark is Mistral on Ollama with nomic-embed-text + +| Hardware | Example 1 | Example 2 | +| ------------------ | --------- | --------- | +| Macbook 14' m1 pro | 30.54 | 35.76 | +| Macbook m2 max | | | + + +**Note**: the examples on Docker were not run on devices other than the Macbook because the performance is too slow (10 times slower than Ollama). The results are the following: + +| Hardware | Example 1 | Example 2 | +| ------------------ | --------- | --------- | +| Macbook 14' m1 pro | | | +# Performance on API services +### Example 1: personal portfolio +**URL**: https://perinim.github.io/projects +**Task**: List me all the projects with their description. 
+ +| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | +| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | +| gpt-3.5-turbo | 24.215268 | 1892 | 1802 | 90 | 1 | 0.002883 | +| gpt-4-turbo-preview | 6.614 | 1936 | 1802 | 134 | 1 | 0.02204 | + +### Example 2: Wired +**URL**: https://www.wired.com +**Task**: List me all the articles with their description. + +| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD | +| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- | +| gpt-3.5-turbo | | | | | | | +| gpt-4-turbo-preview | | | | | | | + diff --git a/examples/benchmarks/GenerateScraper/benchmark_docker.py b/examples/benchmarks/GenerateScraper/benchmark_docker.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/benchmarks/GenerateScraper/benchmark_ollama.py b/examples/benchmarks/GenerateScraper/benchmark_ollama.py new file mode 100644 index 00000000..87219eb4 --- /dev/null +++ b/examples/benchmarks/GenerateScraper/benchmark_ollama.py @@ -0,0 +1,62 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the text file +# ************************************************ +files = ["inputs/example_1.txt", "inputs/example_2.txt"] +tasks = ["List me all the projects with their description.", + "List me all the articles with their description."] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + 
+openai_key = os.getenv("GPT4_KEY") + + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + # "model_tokens": 2000, # set context length arbitrarily, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "library": "beautifoulsoup" +} + + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +for i in range(0, 2): + with open(files[i], 'r', encoding="utf-8") as file: + text = file.read() + + smart_scraper_graph = ScriptCreatorGraph( + prompt=tasks[i], + source=text, + config=graph_config + ) + + result = smart_scraper_graph.run() + print(result) + # ************************************************ + # Get graph execution info + # ************************************************ + + graph_exec_info = smart_scraper_graph.get_execution_info() + print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py new file mode 100644 index 00000000..a395d2c5 --- /dev/null +++ b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt35.py @@ -0,0 +1,53 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the text file +# ************************************************ +files = ["inputs/example_1.txt", "inputs/example_2.txt"] +tasks = ["List me all the projects with their description.", + "List me all the articles with their description."] + +# ************************************************ +# Define the 
configuration for the graph +# ************************************************ + +openai_key = os.getenv("GPT35_KEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-3.5-turbo", + }, + "library": "beautifoulsoup" +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +for i in range(0, 2): + with open(files[i], 'r', encoding="utf-8") as file: + text = file.read() + + smart_scraper_graph = ScriptCreatorGraph( + prompt=tasks[i], + source=text, + config=graph_config + ) + + result = smart_scraper_graph.run() + print(result) + # ************************************************ + # Get graph execution info + # ************************************************ + + graph_exec_info = smart_scraper_graph.get_execution_info() + print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py new file mode 100644 index 00000000..998bd809 --- /dev/null +++ b/examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py @@ -0,0 +1,53 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the text file +# ************************************************ +files = ["inputs/example_1.txt", "inputs/example_2.txt"] +tasks = ["List me all the projects with their description.", + "List me all the articles with their description."] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("GPT4_KEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": 
"gpt-4-turbo-preview", + }, + "library": "beautifoulsoup" +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +for i in range(0, 2): + with open(files[i], 'r', encoding="utf-8") as file: + text = file.read() + + smart_scraper_graph = ScriptCreatorGraph( + prompt=tasks[i], + source=text, + config=graph_config + ) + + result = smart_scraper_graph.run() + print(result) + # ************************************************ + # Get graph execution info + # ************************************************ + + graph_exec_info = smart_scraper_graph.get_execution_info() + print(prettify_exec_info(graph_exec_info)) diff --git a/examples/benchmarks/inputs/example_1.txt b/examples/benchmarks/GenerateScraper/inputs/example_1.txt similarity index 100% rename from examples/benchmarks/inputs/example_1.txt rename to examples/benchmarks/GenerateScraper/inputs/example_1.txt diff --git a/examples/benchmarks/inputs/example_2.txt b/examples/benchmarks/GenerateScraper/inputs/example_2.txt similarity index 100% rename from examples/benchmarks/inputs/example_2.txt rename to examples/benchmarks/GenerateScraper/inputs/example_2.txt diff --git a/examples/benchmarks/SmartScraper/.env.example b/examples/benchmarks/SmartScraper/.env.example new file mode 100644 index 00000000..12c1491c --- /dev/null +++ b/examples/benchmarks/SmartScraper/.env.example @@ -0,0 +1 @@ +OPENAI_APIKEY="your openai api key" \ No newline at end of file diff --git a/examples/benchmarks/SmartScraper/Readme.md b/examples/benchmarks/SmartScraper/Readme.md new file mode 100644 index 00000000..9d2ae5d0 --- /dev/null +++ b/examples/benchmarks/SmartScraper/Readme.md @@ -0,0 +1,2 @@ +This folder contains all the scripts used for benchmarks +Remember if you use openai to set the keys or if you use Ollama/Docker to set the setup \ No newline at end of file diff --git a/examples/benchmarks/benchmark_docker.py 
b/examples/benchmarks/SmartScraper/benchmark_docker.py similarity index 100% rename from examples/benchmarks/benchmark_docker.py rename to examples/benchmarks/SmartScraper/benchmark_docker.py diff --git a/examples/benchmarks/benchmark_ollama.py b/examples/benchmarks/SmartScraper/benchmark_ollama.py similarity index 100% rename from examples/benchmarks/benchmark_ollama.py rename to examples/benchmarks/SmartScraper/benchmark_ollama.py diff --git a/examples/benchmarks/benchmark_openai_gpt35.py b/examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py similarity index 100% rename from examples/benchmarks/benchmark_openai_gpt35.py rename to examples/benchmarks/SmartScraper/benchmark_openai_gpt35.py diff --git a/examples/benchmarks/benchmark_openai_gpt4.py b/examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py similarity index 100% rename from examples/benchmarks/benchmark_openai_gpt4.py rename to examples/benchmarks/SmartScraper/benchmark_openai_gpt4.py diff --git a/examples/benchmarks/SmartScraper/inputs/example_1.txt b/examples/benchmarks/SmartScraper/inputs/example_1.txt new file mode 100644 index 00000000..78f814ae --- /dev/null +++ b/examples/benchmarks/SmartScraper/inputs/example_1.txt @@ -0,0 +1,105 @@ + +
+ + +
+
+
+
+
+
+

Projects

+

+
+
+ +
+
+
+ +
+ \ No newline at end of file diff --git a/examples/benchmarks/SmartScraper/inputs/example_2.txt b/examples/benchmarks/SmartScraper/inputs/example_2.txt new file mode 100644 index 00000000..b7810eed --- /dev/null +++ b/examples/benchmarks/SmartScraper/inputs/example_2.txt @@ -0,0 +1,400 @@ +WIRED - The Latest in Technology, Science, Culture and Business | WIRED
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
+ WIRED - The Latest in Technology, Science, Culture and Business | WIRED +
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
+WIRED - The Latest in Technology, Science, Culture and Business | WIRED
Skip to main content

WIRED

Book Excerpt

They Experimented on Themselves in Secret. What They Discovered Helped Win a War

The untold, top-secret story of the British researchers who found the key to keeping humans alive underwater—and helped make D-Day a success.
Business

Microsoft in the age of Satya Nadella

Originally published February 2015: More than five years before Microsoft invested its first $1 billion in OpenAI, its engineers were hard at work on something that they believed would transform consumer computing, and it wasn’t artificial intelligence.
\ No newline at end of file diff --git a/examples/benchmarks/readme.md b/examples/benchmarks/readme.md index 9d2ae5d0..ca672ad0 100644 --- a/examples/benchmarks/readme.md +++ b/examples/benchmarks/readme.md @@ -1,2 +1,4 @@ -This folder contains all the scripts used for benchmarks -Remember if you use openai to set the keys or if you use Ollama/Docker to set the setup \ No newline at end of file +These 2 subfolders contain all the scripts and performance documents for the 2 graphs used for the scrapers. +In particular: +* __GenerateScraper__: contains the benchmarks for the GenerateScraper class +* __SmartScraper__: contains the benchmarks for the SmartScraper class \ No newline at end of file