mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
fix: script generator and add new benchmarks
This commit is contained in:
parent
7e81f7c03f
commit
e3d0194dc9
@ -1,4 +1,5 @@
|
||||
# Local models
|
||||
# Local models
|
||||
The two benchmark websites are:
|
||||
- Example 1: https://perinim.github.io/projects
|
||||
- Example 2: https://www.wired.com (at 17/4/2024)
|
||||
@ -9,14 +10,12 @@ The time is measured in seconds
|
||||
|
||||
The model run for this benchmark is Mistral on Ollama with nomic-embed-text
|
||||
|
||||
In particular, it is tested with ScriptCreatorGraph
|
||||
|
||||
| Hardware | Model | Example 1 | Example 2 |
|
||||
| ---------------------- | --------------------------------------- | --------- | --------- |
|
||||
| Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 30.54s | 35.76s |
|
||||
| Macbook m2 max | Mistral on Ollama with nomic-embed-text | 18.46s | 19.59s |
|
||||
| Macbook 14' m1 pro<br> | Llama3 on Ollama with nomic-embed-text | 27.82s | 29.98s |
|
||||
| Macbook m2 max<br> | Llama3 on Ollama with nomic-embed-text | 20.83s | 12.29s |
|
||||
| Macbook m2 max | Mistral on Ollama with nomic-embed-text | | |
|
||||
| Macbook 14' m1 pro<br> | Llama3 on Ollama with nomic-embed-text | 27.82s | 29.986s |
|
||||
| Macbook m2 max<br> | Llama3 on Ollama with nomic-embed-text | | |
|
||||
|
||||
|
||||
**Note**: the examples on Docker were not run on devices other than the Macbook because the performance is too slow (10 times slower than Ollama).
|
||||
@ -25,17 +24,20 @@ In particular, is tested with ScriptCreatorGraph
|
||||
**URL**: https://perinim.github.io/projects
|
||||
**Task**: List me all the projects with their description.
|
||||
|
||||
| Name | Execution time | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| ------------------- | ---------------| ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | 4.50s | 1897 | 1802 | 95 | 1 | 0.002893 |
|
||||
| gpt-4-turbo | 7.88s | 1920 | 1802 | 118 | 1 | 0.02156 |
|
||||
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | 24.21 | 1892 | 1802 | 90 | 1 | 0.002883 |
|
||||
| gpt-4-turbo-preview | 6.614 | 1936 | 1802 | 134 | 1 | 0.02204 |
|
||||
| Groq with nomic-embed-text  | 6.71 | 2201 | 2024 | 177 | 1 | 0 |
|
||||
|
||||
### Example 2: Wired
|
||||
**URL**: https://www.wired.com
|
||||
**Task**: List me all the articles with their description.
|
||||
|
||||
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | Error (text too long) | - | - | - | - | - |
|
||||
| gpt-4-turbo | Error (TPM limit reach)| - | - | - | - | - |
|
||||
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | | | | | | |
|
||||
| gpt-4-turbo-preview | | | | | | |
|
||||
| Groq with nomic-embed-text  | | | | | | |
|
||||
|
||||
|
||||
|
||||
61
examples/benchmarks/GenerateScraper/benchmark_groq.py
Normal file
61
examples/benchmarks/GenerateScraper/benchmark_groq.py
Normal file
@ -0,0 +1,61 @@
|
||||
"""
|
||||
Basic example of a script-generation pipeline using ScriptCreatorGraph from text
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import ScriptCreatorGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Read the text file
|
||||
# ************************************************
|
||||
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
|
||||
tasks = ["List me all the projects with their description.",
|
||||
"List me all the articles with their description."]
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
groq_key = os.getenv("GROQ_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "groq/gemma-7b-it",
|
||||
"api_key": groq_key,
|
||||
"temperature": 0
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"headless": False,
|
||||
"library": "beautifoulsoup"
|
||||
}
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Create the ScriptCreatorGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
for i in range(0, 2):
|
||||
with open(files[i], 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
smart_scraper_graph = ScriptCreatorGraph(
|
||||
prompt=tasks[i],
|
||||
source=text,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -2,11 +2,8 @@
|
||||
Basic example of scraping pipeline using SmartScraper from text
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import ScriptCreatorGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Read the text file
|
||||
@ -19,8 +16,6 @@ tasks = ["List me all the projects with their description.",
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
openai_key = os.getenv("GPT4_KEY")
|
||||
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
|
||||
@ -5,37 +5,37 @@ The two websites benchmark are:
|
||||
|
||||
Both are stored locally as .txt files, so that we do not have to think about the internet connection
|
||||
|
||||
In particular, it is tested with SmartScraper
|
||||
|
||||
| Hardware | Moodel | Example 1 | Example 2 |
|
||||
| Hardware | Model | Example 1 | Example 2 |
|
||||
| ------------------ | --------------------------------------- | --------- | --------- |
|
||||
| Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 11.60s | 26.61s |
|
||||
| Macbook m2 max | Mistral on Ollama with nomic-embed-text | 8.05s | 12.17s |
|
||||
| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text | 29.871s | 35.32s |
|
||||
| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text | 29.87s | 35.32s |
|
||||
| Macbook m2 max | Llama3 on Ollama with nomic-embed-text | 18.36s | 78.32s |
|
||||
|
||||
|
||||
**Note**: the examples on Docker were not run on devices other than the Macbook because the performance is too slow (10 times slower than Ollama). Indeed, the results are the following:
|
||||
|
||||
| Hardware | Example 1 | Example 2 |
|
||||
| ------------------ | --------- | --------- |
|
||||
| Macbook 14' m1 pro | 139.89s | Too long |
|
||||
| Macbook 14' m1 pro | 139.89 | Too long |
|
||||
# Performance on APIs services
|
||||
### Example 1: personal portfolio
|
||||
**URL**: https://perinim.github.io/projects
|
||||
**Task**: List me all the projects with their description.
|
||||
|
||||
| Name | Execution time | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| ------------------- | ---------------| ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | 5.58s | 445 | 272 | 173 | 1 | 0.000754 |
|
||||
| gpt-4-turbo | 9.76s | 445 | 272 | 173 | 1 | 0.00791 |
|
||||
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | 25.22 | 445 | 272 | 173 | 1 | 0.000754 |
|
||||
| gpt-4-turbo-preview | 9.53 | 449 | 272 | 177 | 1 | 0.00803 |
|
||||
| Groq with nomic-embed-text  | 1.99 | 474 | 284 | 190 | 1 | 0 |
|
||||
|
||||
### Example 2: Wired
|
||||
**URL**: https://www.wired.com
|
||||
**Task**: List me all the articles with their description.
|
||||
|
||||
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | 6.50 | 2442 | 2199 | 243 | 1 | 0.003784 |
|
||||
| gpt-4-turbo | 76.07 | 3521 | 2199 | 1322 | 1 | 0.06165 |
|
||||
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| --------------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | 25.89 | 445 | 272 | 173 | 1 | 0.000754 |
|
||||
| gpt-4-turbo-preview | 64.70 | 3573 | 2199 | 1374 | 1 | 0.06321 |
|
||||
| Groq with nomic-embed-text  | 3.82 | 2459 | 2192 | 267 | 1 | 0 |
|
||||
|
||||
|
||||
|
||||
57
examples/benchmarks/SmartScraper/benchmark_groq.py
Normal file
57
examples/benchmarks/SmartScraper/benchmark_groq.py
Normal file
@ -0,0 +1,57 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using SmartScraper from text
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
load_dotenv()
|
||||
|
||||
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
|
||||
tasks = ["List me all the projects with their description.",
|
||||
"List me all the articles with their description."]
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
groq_key = os.getenv("GROQ_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "groq/gemma-7b-it",
|
||||
"api_key": groq_key,
|
||||
"temperature": 0
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"headless": False
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SmartScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
for i in range(0, 2):
|
||||
with open(files[i], 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt=tasks[i],
|
||||
source=text,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -2,7 +2,6 @@
|
||||
Basic example of scraping pipeline using SmartScraper from text
|
||||
"""
|
||||
|
||||
import os
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
|
||||
@ -34,6 +34,8 @@ class ScriptCreatorGraph(AbstractGraph):
|
||||
fetch_node = FetchNode(
|
||||
input="url | local_dir",
|
||||
output=["doc"],
|
||||
node_config={
|
||||
"headless": True if self.config is None else self.config.get("headless", True)}
|
||||
)
|
||||
parse_node = ParseNode(
|
||||
input="doc",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user