mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
Merge pull request #72 from VinciGit00/refactoring_nodes_openai
add new benchmarks
This commit is contained in:
commit
fe0b6d4566
41
examples/benchmarks/GenerateScraper/Readme.md
Normal file
41
examples/benchmarks/GenerateScraper/Readme.md
Normal file
@ -0,0 +1,41 @@
|
||||
# Local models
|
||||
The two benchmark websites are:
|
||||
- Example 1: https://perinim.github.io/projects
|
||||
- Example 2: https://www.wired.com (as of 17/4/2024)
|
||||
|
||||
Both are stored locally as .txt files so that the benchmark does not depend on the internet connection
|
||||
|
||||
The time is measured in seconds
|
||||
|
||||
The model run for this benchmark is Mistral on Ollama, with nomic-embed-text for the embeddings
|
||||
|
||||
| Hardware | Example 1 | Example 2 |
|
||||
| ------------------ | --------- | --------- |
|
||||
| Macbook 14" m1 pro | 30.54 | 35.76 |
|
||||
| Macbook m2 max | | |
|
||||
|
||||
|
||||
**Note**: the Docker examples were not run on devices other than the Macbook because the performance is too slow (10 times slower than Ollama). The results are the following:
|
||||
|
||||
| Hardware | Example 1 | Example 2 |
|
||||
| ------------------ | --------- | --------- |
|
||||
| Macbook 14" m1 pro | | |
|
||||
# Performance on API services
|
||||
### Example 1: personal portfolio
|
||||
**URL**: https://perinim.github.io/projects
|
||||
**Task**: List me all the projects with their description.
|
||||
|
||||
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | 24.215268 | 1892 | 1802 | 90 | 1 | 0.002883 |
|
||||
| gpt-4-turbo-preview | 6.614 | 1936 | 1802 | 134 | 1 | 0.02204 |
|
||||
|
||||
### Example 2: Wired
|
||||
**URL**: https://www.wired.com
|
||||
**Task**: List me all the articles with their description.
|
||||
|
||||
| Name | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
|
||||
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
|
||||
| gpt-3.5-turbo | | | | | | |
|
||||
| gpt-4-turbo-preview | | | | | | |
|
||||
|
||||
62
examples/benchmarks/GenerateScraper/benchmark_ollama.py
Normal file
62
examples/benchmarks/GenerateScraper/benchmark_ollama.py
Normal file
@ -0,0 +1,62 @@
|
||||
"""
|
||||
Basic example of a script-generation pipeline using ScriptCreatorGraph from text
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import ScriptCreatorGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Read the text file
|
||||
# ************************************************
|
||||
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
|
||||
tasks = ["List me all the projects with their description.",
|
||||
"List me all the articles with their description."]
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
openai_key = os.getenv("GPT4_KEY")
|
||||
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
# "model_tokens": 2000, # set context length arbitrarily,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"library": "beautifoulsoup"
|
||||
}
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Create the ScriptCreatorGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
for i in range(0, 2):
|
||||
with open(files[i], 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
smart_scraper_graph = ScriptCreatorGraph(
|
||||
prompt=tasks[i],
|
||||
source=text,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -0,0 +1,53 @@
|
||||
"""
|
||||
Basic example of a script-generation pipeline using ScriptCreatorGraph from text
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import ScriptCreatorGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Read the text file
|
||||
# ************************************************
|
||||
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
|
||||
tasks = ["List me all the projects with their description.",
|
||||
"List me all the articles with their description."]
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
openai_key = os.getenv("GPT35_KEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": openai_key,
|
||||
"model": "gpt-3.5-turbo",
|
||||
},
|
||||
"library": "beautifoulsoup"
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the ScriptCreatorGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
for i in range(0, 2):
|
||||
with open(files[i], 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
smart_scraper_graph = ScriptCreatorGraph(
|
||||
prompt=tasks[i],
|
||||
source=text,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
53
examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py
Normal file
53
examples/benchmarks/GenerateScraper/benchmark_openai_gpt4.py
Normal file
@ -0,0 +1,53 @@
|
||||
"""
|
||||
Basic example of a script-generation pipeline using ScriptCreatorGraph from text
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import ScriptCreatorGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Read the text file
|
||||
# ************************************************
|
||||
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
|
||||
tasks = ["List me all the projects with their description.",
|
||||
"List me all the articles with their description."]
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
openai_key = os.getenv("GPT4_KEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": openai_key,
|
||||
"model": "gpt-4-turbo-preview",
|
||||
},
|
||||
"library": "beautifoulsoup"
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the ScriptCreatorGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
for i in range(0, 2):
|
||||
with open(files[i], 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
smart_scraper_graph = ScriptCreatorGraph(
|
||||
prompt=tasks[i],
|
||||
source=text,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
1
examples/benchmarks/SmartScraper/.env.example
Normal file
1
examples/benchmarks/SmartScraper/.env.example
Normal file
@ -0,0 +1 @@
|
||||
OPENAI_APIKEY="your openai api key"
|
||||
2
examples/benchmarks/SmartScraper/Readme.md
Normal file
2
examples/benchmarks/SmartScraper/Readme.md
Normal file
@ -0,0 +1,2 @@
|
||||
This folder contains all the scripts used for benchmarks
|
||||
Remember to set the API keys if you use OpenAI, or to complete the local setup if you use Ollama/Docker
|
||||
105
examples/benchmarks/SmartScraper/inputs/example_1.txt
Normal file
105
examples/benchmarks/SmartScraper/inputs/example_1.txt
Normal file
@ -0,0 +1,105 @@
|
||||
<body class="fixed-top-nav " style="padding-top: 57px;">
|
||||
<header>
|
||||
<nav id="navbar" class="navbar navbar-light navbar-expand-sm fixed-top">
|
||||
<div class="container">
|
||||
<a class="navbar-brand title font-weight-lighter" href="/"><span class="font-weight-bold">Marco </span>Perini</a> <button class="navbar-toggler collapsed ml-auto" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation"> <span class="sr-only">Toggle navigation</span> <span class="icon-bar top-bar"></span> <span class="icon-bar middle-bar"></span> <span class="icon-bar bottom-bar"></span> </button>
|
||||
<div class="collapse navbar-collapse text-right" id="navbarNav">
|
||||
<ul class="navbar-nav ml-auto flex-nowrap">
|
||||
<li class="nav-item "> <a class="nav-link" href="/">About</a> </li>
|
||||
<li class="nav-item dropdown active">
|
||||
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Projects<span class="sr-only">(current)</span></a>
|
||||
<div class="dropdown-menu dropdown-menu-right" aria-labelledby="navbarDropdown">
|
||||
<a class="dropdown-item" href="/projects/">Projects</a>
|
||||
<div class="dropdown-divider"></div>
|
||||
<a class="dropdown-item" href="/competitions/">Competitions</a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="nav-item "> <a class="nav-link" href="/cv/">CV</a> </li>
|
||||
<li class="toggle-container"> <button id="light-toggle" title="Change theme"> <i class="fa-solid fa-moon"></i> <i class="fa-solid fa-sun"></i> </button> </li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</nav>
|
||||
<progress id="progress" value="0" max="284" style="top: 57px;">
|
||||
<div class="progress-container"> <span class="progress-bar"></span> </div>
|
||||
</progress>
|
||||
</header>
|
||||
<div class="container mt-5">
|
||||
<div class="post">
|
||||
<header class="post-header">
|
||||
<h1 class="post-title">Projects</h1>
|
||||
<p class="post-description"></p>
|
||||
</header>
|
||||
<article>
|
||||
<div class="projects">
|
||||
<div class="grid" style="position: relative; height: 861.992px;">
|
||||
<div class="grid-sizer"></div>
|
||||
<div class="grid-item" style="position: absolute; left: 0px; top: 0px;">
|
||||
<a href="/projects/rotary-pendulum-rl/">
|
||||
<div class="card hoverable">
|
||||
<figure>
|
||||
<picture> <img src="/assets/img/rotary_pybullet.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
|
||||
</figure>
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">Rotary Pendulum RL</h4>
|
||||
<p class="card-text">Open Source project aimed at controlling a real life rotary pendulum using RL algorithms</p>
|
||||
<div class="row ml-1 mr-1 p-0"> </div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
<div class="grid-sizer"></div>
|
||||
<div class="grid-item" style="position: absolute; left: 260px; top: 0px;">
|
||||
<a href="https://github.com/PeriniM/DQN-SwingUp" rel="external nofollow noopener" target="_blank">
|
||||
<div class="card hoverable">
|
||||
<figure>
|
||||
<picture> <img src="/assets/img/value-policy-heatmaps.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
|
||||
</figure>
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">DQN Implementation from scratch</h4>
|
||||
<p class="card-text">Developed a Deep Q-Network algorithm to train a simple and double pendulum</p>
|
||||
<div class="row ml-1 mr-1 p-0"> </div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
<div class="grid-sizer"></div>
|
||||
<div class="grid-item" style="position: absolute; left: 0px; top: 447.414px;">
|
||||
<a href="https://github.com/PeriniM/Multi-Agents-HAED" rel="external nofollow noopener" target="_blank">
|
||||
<div class="card hoverable">
|
||||
<figure>
|
||||
<picture> <img src="/assets/img/multi_agents_haed.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
|
||||
</figure>
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">Multi Agents HAED</h4>
|
||||
<p class="card-text">University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.</p>
|
||||
<div class="row ml-1 mr-1 p-0"> </div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
<div class="grid-sizer"></div>
|
||||
<div class="grid-item" style="position: absolute; left: 260px; top: 370.172px;">
|
||||
<a href="/projects/wireless-esc-drone/">
|
||||
<div class="card hoverable">
|
||||
<figure>
|
||||
<picture> <img src="/assets/img/wireless_esc.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
|
||||
</figure>
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">Wireless ESC for Modular Drones</h4>
|
||||
<p class="card-text">Modular drone architecture proposal and proof of concept. The project received maximum grade.</p>
|
||||
<div class="row ml-1 mr-1 p-0"> </div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</article>
|
||||
</div>
|
||||
</div>
|
||||
<footer class="fixed-bottom">
|
||||
<div class="container mt-0"> © Copyright 2023 Marco Perini. Powered by <a href="https://jekyllrb.com/" target="_blank" rel="external nofollow noopener">Jekyll</a> with <a href="https://github.com/alshedivat/al-folio" rel="external nofollow noopener" target="_blank">al-folio</a> theme. Hosted by <a href="https://pages.github.com/" target="_blank" rel="external nofollow noopener">GitHub Pages</a>. </div>
|
||||
</footer>
|
||||
<div class="hiddendiv common"></div>
|
||||
</body>
|
||||
400
examples/benchmarks/SmartScraper/inputs/example_2.txt
Normal file
400
examples/benchmarks/SmartScraper/inputs/example_2.txt
Normal file
File diff suppressed because one or more lines are too long
@ -1,2 +1,4 @@
|
||||
This folder contains all the scripts used for benchmarks
|
||||
Remember to set the API keys if you use OpenAI, or to complete the local setup if you use Ollama/Docker
|
||||
These 2 subfolders contain all the scripts and performance documents for the 2 graphs used for the scrapers.
|
||||
In particular:
|
||||
* __GenerateScraper__: contains the benchmarks for GenerateScraper class
|
||||
* __SmartScraper__: contains the benchmarks for the SmartScraper class
|
||||
Loading…
Reference in New Issue
Block a user