fix(examples): local, mixed models and fixed SearchGraph embeddings problem

This commit is contained in:
Marco Perini 2024-05-08 15:36:26 +02:00
parent 186c0d035d
commit 6b71ec1d2b
26 changed files with 53 additions and 871 deletions

View File

@ -1,54 +0,0 @@
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Read the csv file
# ************************************************
text = pd.read_csv("inputs/username.csv")
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
}
}
# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************
csv_scraper_graph = CSVScraperGraph(
prompt="List me all the last names",
source=str(text), # Pass the content of the file, not the file object
config=graph_config
)
result = csv_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -1,120 +0,0 @@
<?xml version="1.0"?>
<catalog>
<book id="bk101">
<author>Gambardella, Matthew</author>
<title>XML Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<publish_date>2000-10-01</publish_date>
<description>An in-depth look at creating applications
with XML.</description>
</book>
<book id="bk102">
<author>Ralls, Kim</author>
<title>Midnight Rain</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-12-16</publish_date>
<description>A former architect battles corporate zombies,
an evil sorceress, and her own childhood to become queen
of the world.</description>
</book>
<book id="bk103">
<author>Corets, Eva</author>
<title>Maeve Ascendant</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-11-17</publish_date>
<description>After the collapse of a nanotechnology
society in England, the young survivors lay the
foundation for a new society.</description>
</book>
<book id="bk104">
<author>Corets, Eva</author>
<title>Oberon's Legacy</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2001-03-10</publish_date>
<description>In post-apocalypse England, the mysterious
agent known only as Oberon helps to create a new life
for the inhabitants of London. Sequel to Maeve
Ascendant.</description>
</book>
<book id="bk105">
<author>Corets, Eva</author>
<title>The Sundered Grail</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2001-09-10</publish_date>
<description>The two daughters of Maeve, half-sisters,
battle one another for control of England. Sequel to
Oberon's Legacy.</description>
</book>
<book id="bk106">
<author>Randall, Cynthia</author>
<title>Lover Birds</title>
<genre>Romance</genre>
<price>4.95</price>
<publish_date>2000-09-02</publish_date>
<description>When Carla meets Paul at an ornithology
conference, tempers fly as feathers get ruffled.</description>
</book>
<book id="bk107">
<author>Thurman, Paula</author>
<title>Splish Splash</title>
<genre>Romance</genre>
<price>4.95</price>
<publish_date>2000-11-02</publish_date>
<description>A deep sea diver finds true love twenty
thousand leagues beneath the sea.</description>
</book>
<book id="bk108">
<author>Knorr, Stefan</author>
<title>Creepy Crawlies</title>
<genre>Horror</genre>
<price>4.95</price>
<publish_date>2000-12-06</publish_date>
<description>An anthology of horror stories about roaches,
centipedes, scorpions and other insects.</description>
</book>
<book id="bk109">
<author>Kress, Peter</author>
<title>Paradox Lost</title>
<genre>Science Fiction</genre>
<price>6.95</price>
<publish_date>2000-11-02</publish_date>
<description>After an inadvertant trip through a Heisenberg
Uncertainty Device, James Salway discovers the problems
of being quantum.</description>
</book>
<book id="bk110">
<author>O'Brien, Tim</author>
<title>Microsoft .NET: The Programming Bible</title>
<genre>Computer</genre>
<price>36.95</price>
<publish_date>2000-12-09</publish_date>
<description>Microsoft's .NET initiative is explored in
detail in this deep programmer's reference.</description>
</book>
<book id="bk111">
<author>O'Brien, Tim</author>
<title>MSXML3: A Comprehensive Guide</title>
<genre>Computer</genre>
<price>36.95</price>
<publish_date>2000-12-01</publish_date>
<description>The Microsoft MSXML3 parser is covered in
detail, with attention to XML DOM interfaces, XSLT processing,
SAX and more.</description>
</book>
<book id="bk112">
<author>Galos, Mike</author>
<title>Visual Studio 7: A Comprehensive Guide</title>
<genre>Computer</genre>
<price>49.95</price>
<publish_date>2001-04-16</publish_date>
<description>Microsoft Visual Studio 7 is explored in depth,
looking at how Visual Basic, Visual C++, C#, and ASP+ are
integrated into a comprehensive development
environment.</description>
</book>
</catalog>

View File

@ -1,182 +0,0 @@
{
"kind":"youtube#searchListResponse",
"etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg",
"nextPageToken":"CAUQAA",
"regionCode":"NL",
"pageInfo":{
"totalResults":1000000,
"resultsPerPage":5
},
"items":[
{
"kind":"youtube#searchResult",
"etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ",
"id":{
"kind":"youtube#video",
"videoId":"TvWDY4Mm5GM"
},
"snippet":{
"publishedAt":"2023-07-24T14:15:01Z",
"channelId":"UCwozCpFp9g9x0wAzuFh0hwQ",
"title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts",
"description":"",
"thumbnails":{
"default":{
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg",
"width":120,
"height":90
},
"medium":{
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg",
"width":320,
"height":180
},
"high":{
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg",
"width":480,
"height":360
}
},
"channelTitle":"FC Motivate",
"liveBroadcastContent":"none",
"publishTime":"2023-07-24T14:15:01Z"
}
},
{
"kind":"youtube#searchResult",
"etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k",
"id":{
"kind":"youtube#video",
"videoId":"aZM_42CcNZ4"
},
"snippet":{
"publishedAt":"2023-07-24T16:09:27Z",
"channelId":"UCM5gMM_HqfKHYIEJ3lstMUA",
"title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰",
"description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...",
"thumbnails":{
"default":{
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg",
"width":120,
"height":90
},
"medium":{
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg",
"width":320,
"height":180
},
"high":{
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg",
"width":480,
"height":360
}
},
"channelTitle":"John Nellis",
"liveBroadcastContent":"none",
"publishTime":"2023-07-24T16:09:27Z"
}
},
{
"kind":"youtube#searchResult",
"etag":"WbBz4oh9I5VaYj91LjeJvffrBVY",
"id":{
"kind":"youtube#video",
"videoId":"wkP3XS3aNAY"
},
"snippet":{
"publishedAt":"2023-07-24T16:00:50Z",
"channelId":"UC4EP1dxFDPup_aFLt0ElsDw",
"title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL",
"description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...",
"thumbnails":{
"default":{
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg",
"width":120,
"height":90
},
"medium":{
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg",
"width":320,
"height":180
},
"high":{
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg",
"width":480,
"height":360
}
},
"channelTitle":"Shoot for Love",
"liveBroadcastContent":"none",
"publishTime":"2023-07-24T16:00:50Z"
}
},
{
"kind":"youtube#searchResult",
"etag":"juxv_FhT_l4qrR05S1QTrb4CGh8",
"id":{
"kind":"youtube#video",
"videoId":"rJkDZ0WvfT8"
},
"snippet":{
"publishedAt":"2023-07-24T10:00:39Z",
"channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ",
"title":"TOP 10 DEFENDERS 2023",
"description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...",
"thumbnails":{
"default":{
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg",
"width":120,
"height":90
},
"medium":{
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg",
"width":320,
"height":180
},
"high":{
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg",
"width":480,
"height":360
}
},
"channelTitle":"Home of Football",
"liveBroadcastContent":"none",
"publishTime":"2023-07-24T10:00:39Z"
}
},
{
"kind":"youtube#searchResult",
"etag":"wtuknXTmI1txoULeH3aWaOuXOow",
"id":{
"kind":"youtube#video",
"videoId":"XH0rtu4U6SE"
},
"snippet":{
"publishedAt":"2023-07-21T16:30:05Z",
"channelId":"UCwozCpFp9g9x0wAzuFh0hwQ",
"title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts",
"description":"",
"thumbnails":{
"default":{
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg",
"width":120,
"height":90
},
"medium":{
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg",
"width":320,
"height":180
},
"high":{
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg",
"width":480,
"height":360
}
},
"channelTitle":"FC Motivate",
"liveBroadcastContent":"none",
"publishTime":"2023-07-21T16:30:05Z"
}
}
]
}

View File

@ -1,105 +0,0 @@
<body class="fixed-top-nav " style="padding-top: 57px;">
<header>
<nav id="navbar" class="navbar navbar-light navbar-expand-sm fixed-top">
<div class="container">
<a class="navbar-brand title font-weight-lighter" href="/"><span class="font-weight-bold">Marco&nbsp;</span>Perini</a> <button class="navbar-toggler collapsed ml-auto" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation"> <span class="sr-only">Toggle navigation</span> <span class="icon-bar top-bar"></span> <span class="icon-bar middle-bar"></span> <span class="icon-bar bottom-bar"></span> </button>
<div class="collapse navbar-collapse text-right" id="navbarNav">
<ul class="navbar-nav ml-auto flex-nowrap">
<li class="nav-item "> <a class="nav-link" href="/">About</a> </li>
<li class="nav-item dropdown active">
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Projects<span class="sr-only">(current)</span></a>
<div class="dropdown-menu dropdown-menu-right" aria-labelledby="navbarDropdown">
<a class="dropdown-item" href="/projects/">Projects</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="/competitions/">Competitions</a>
</div>
</li>
<li class="nav-item "> <a class="nav-link" href="/cv/">CV</a> </li>
<li class="toggle-container"> <button id="light-toggle" title="Change theme"> <i class="fa-solid fa-moon"></i> <i class="fa-solid fa-sun"></i> </button> </li>
</ul>
</div>
</div>
</nav>
<progress id="progress" value="0" max="284" style="top: 57px;">
<div class="progress-container"> <span class="progress-bar"></span> </div>
</progress>
</header>
<div class="container mt-5">
<div class="post">
<header class="post-header">
<h1 class="post-title">Projects</h1>
<p class="post-description"></p>
</header>
<article>
<div class="projects">
<div class="grid" style="position: relative; height: 861.992px;">
<div class="grid-sizer"></div>
<div class="grid-item" style="position: absolute; left: 0px; top: 0px;">
<a href="/projects/rotary-pendulum-rl/">
<div class="card hoverable">
<figure>
<picture> <img src="/assets/img/rotary_pybullet.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
</figure>
<div class="card-body">
<h4 class="card-title">Rotary Pendulum RL</h4>
<p class="card-text">Open Source project aimed at controlling a real life rotary pendulum using RL algorithms</p>
<div class="row ml-1 mr-1 p-0"> </div>
</div>
</div>
</a>
</div>
<div class="grid-sizer"></div>
<div class="grid-item" style="position: absolute; left: 260px; top: 0px;">
<a href="https://github.com/PeriniM/DQN-SwingUp" rel="external nofollow noopener" target="_blank">
<div class="card hoverable">
<figure>
<picture> <img src="/assets/img/value-policy-heatmaps.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
</figure>
<div class="card-body">
<h4 class="card-title">DQN Implementation from scratch</h4>
<p class="card-text">Developed a Deep Q-Network algorithm to train a simple and double pendulum</p>
<div class="row ml-1 mr-1 p-0"> </div>
</div>
</div>
</a>
</div>
<div class="grid-sizer"></div>
<div class="grid-item" style="position: absolute; left: 0px; top: 447.414px;">
<a href="https://github.com/PeriniM/Multi-Agents-HAED" rel="external nofollow noopener" target="_blank">
<div class="card hoverable">
<figure>
<picture> <img src="/assets/img/multi_agents_haed.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
</figure>
<div class="card-body">
<h4 class="card-title">Multi Agents HAED</h4>
<p class="card-text">University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.</p>
<div class="row ml-1 mr-1 p-0"> </div>
</div>
</div>
</a>
</div>
<div class="grid-sizer"></div>
<div class="grid-item" style="position: absolute; left: 260px; top: 370.172px;">
<a href="/projects/wireless-esc-drone/">
<div class="card hoverable">
<figure>
<picture> <img src="/assets/img/wireless_esc.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
</figure>
<div class="card-body">
<h4 class="card-title">Wireless ESC for Modular Drones</h4>
<p class="card-text">Modular drone architecture proposal and proof of concept. The project received maximum grade.</p>
<div class="row ml-1 mr-1 p-0"> </div>
</div>
</div>
</a>
</div>
</div>
</div>
</article>
</div>
</div>
<footer class="fixed-bottom">
<div class="container mt-0"> © Copyright 2023 Marco Perini. Powered by <a href="https://jekyllrb.com/" target="_blank" rel="external nofollow noopener">Jekyll</a> with <a href="https://github.com/alshedivat/al-folio" rel="external nofollow noopener" target="_blank">al-folio</a> theme. Hosted by <a href="https://pages.github.com/" target="_blank" rel="external nofollow noopener">GitHub Pages</a>. </div>
</footer>
<div class="hiddendiv common"></div>
</body>

View File

@ -1,7 +0,0 @@
Username; Identifier;First name;Last name
booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson
jenkins46;9346;Mary;Jenkins
smith79;5079;Jamie;Smith
1 Username Identifier First name Last name
2 booker12 9012 Rachel Booker
3 grey07 2070 Laura Grey
4 johnson81 4081 Craig Johnson
5 jenkins46 9346 Mary Jenkins
6 smith79 5079 Jamie Smith

View File

@ -1,61 +0,0 @@
"""
Basic example of scraping pipeline using JSONScraperGraph from JSON documents
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import JSONScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
load_dotenv()
# ************************************************
# Read the JSON file
# ************************************************
FILE_NAME = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)
with open(file_path, 'r', encoding="utf-8") as file:
text = file.read()
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
}
}
# ************************************************
# Create the JSONScraperGraph instance and run it
# ************************************************
json_scraper_graph = JSONScraperGraph(
prompt="List me all the authors, title and genres of the books",
source=text, # Pass the content of the file, not the file object
config=graph_config
)
result = json_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = json_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -1,57 +0,0 @@
"""
Basic example of scraping pipeline using SmartScraper from text
"""
import os
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Read the text file
# ************************************************
FILE_NAME = "inputs/plain_html_example.txt"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)
# It could be also a http request using the request model
with open(file_path, 'r', encoding="utf-8") as file:
text = file.read()
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
}
}
# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the projects with their description.",
source=text,
config=graph_config
)
result = smart_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -1,56 +0,0 @@
"""
Basic example of scraping pipeline using SmartScraper from XML documents
"""
import os
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Read the XML file
# ************************************************
FILE_NAME = "inputs/books.xml"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)
with open(file_path, 'r', encoding="utf-8") as file:
text = file.read()
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
}
}
# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the authors, title and genres of the books",
source=text, # Pass the content of the file, not the file object
config=graph_config
)
result = smart_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -1,44 +0,0 @@
"""
Basic example of scraping pipeline using ScriptCreatorGraph
"""
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json",
# "model_tokens": 2000, # set context length arbitrarily,
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
},
"library": "beautifoulsoup"
}
# ************************************************
# Create the ScriptCreatorGraph instance and run it
# ************************************************
smart_scraper_graph = ScriptCreatorGraph(
prompt="List me all the news with their description.",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects",
config=graph_config
)
result = smart_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -1,49 +0,0 @@
"""
Example of Search Graph
"""
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
},
"max_results": 5,
"verbose": True,
}
# ************************************************
# Create the SearchGraph instance and run it
# ************************************************
search_graph = SearchGraph(
prompt="List me the best escursions near Trento",
config=graph_config
)
result = search_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
# Save to json and csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -1,43 +0,0 @@
"""
Basic example of scraping pipeline using SmartScraper
"""
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/llama3",
"temperature": 0,
"format": "json",
"model_tokens": 2000, # set context length arbitrarily,
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"base_url": "http://localhost:11434",
}
}
# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the news with their description.",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects",
config=graph_config
)
result = smart_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -1,61 +0,0 @@
"""
Basic example of scraping pipeline using XMLScraperGraph from XML documents
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import XMLScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
load_dotenv()
# ************************************************
# Read the XML file
# ************************************************
FILE_NAME = "inputs/books.xml"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)
with open(file_path, 'r', encoding="utf-8") as file:
text = file.read()
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
}
}
# ************************************************
# Create the XMLScraperGraph instance and run it
# ************************************************
xml_scraper_graph = XMLScraperGraph(
prompt="List me all the authors, title and genres of the books",
source=text, # Pass the content of the file, not the file object
config=graph_config
)
result = xml_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = xml_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -2,15 +2,20 @@
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""
import os
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Read the csv file
# Read the CSV file
# ************************************************
text = pd.read_csv("inputs/username.csv")
FILE_NAME = "inputs/username.csv"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)
text = pd.read_csv(file_path)
# ************************************************
# Define the configuration for the graph
@ -18,7 +23,7 @@ text = pd.read_csv("inputs/username.csv")
graph_config = {
"llm": {
"model": "ollama/mistral",
"model": "ollama/llama3",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
@ -28,7 +33,8 @@ graph_config = {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434",
}
},
"verbose": True,
}
# ************************************************

View File

@ -35,7 +35,8 @@ graph_config = {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434",
}
},
"verbose": True,
}
# ************************************************

View File

@ -34,7 +34,8 @@ graph_config = {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434",
}
},
"verbose": True,
}
# ************************************************
@ -42,7 +43,7 @@ graph_config = {
# ************************************************
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the news with their description.",
prompt="List me all the projects",
source=text,
config=graph_config
)

View File

@ -33,7 +33,8 @@ graph_config = {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
}
},
"verbose": True,
}
# ************************************************

View File

@ -19,7 +19,8 @@ graph_config = {
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"library": "beautifoulsoup"
"library": "beautifoulsoup",
"verbose": True,
}
# ************************************************

View File

@ -11,16 +11,15 @@ from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_i
graph_config = {
"llm": {
"model": "ollama/mistral",
"model": "ollama/llama3",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
# "format": "json", # Ollama needs the format to be specified explicitly
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"max_results": 5,
"verbose": True,

View File

@ -9,17 +9,17 @@ from scrapegraphai.utils import prettify_exec_info
graph_config = {
"llm": {
"model": "ollama/mistral",
"model": "ollama/llama3",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
}
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"verbose": True,
}
# ************************************************
@ -27,7 +27,7 @@ graph_config = {
# ************************************************
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the news with their description.",
prompt="List me all the projects with their description.",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects",
config=graph_config

View File

@ -25,7 +25,7 @@ with open(file_path, 'r', encoding="utf-8") as file:
graph_config = {
"llm": {
"model": "ollama/mistral",
"model": "ollama/llama3",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
@ -35,7 +35,8 @@ graph_config = {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434",
}
},
"verbose": True,
}
# ************************************************

View File

@ -12,18 +12,21 @@ load_dotenv()
# Define the configuration for the graph
# ************************************************
openai_key = os.getenv("OPENAI_APIKEY")
groq_key = os.getenv("GROQ_APIKEY")
graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
"model": "groq/gemma-7b-it",
"api_key": groq_key,
"temperature": 0
},
"embeddings": {
"api_key": openai_key,
"model": "gpt-3.5-turbo",
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"max_results": 2,
"verbose": True,
}
# ************************************************

View File

@ -25,7 +25,8 @@ graph_config = {
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"headless": False
"headless": False,
"verbose": True,
}
# ************************************************

View File

@ -2,6 +2,8 @@
SearchGraph Module
"""
from copy import deepcopy
from .base_graph import BaseGraph
from ..nodes import (
SearchInternetNode,
@ -40,6 +42,8 @@ class SearchGraph(AbstractGraph):
def __init__(self, prompt: str, config: dict):
self.max_results = config.get("max_results", 3)
self.copy_config = deepcopy(config)
super().__init__(prompt, config)
def _create_graph(self) -> BaseGraph:
@ -57,7 +61,7 @@ class SearchGraph(AbstractGraph):
smart_scraper_instance = SmartScraperGraph(
prompt="",
source="",
config=self.config
config=self.copy_config
)
# ************************************************

View File

@ -69,10 +69,13 @@ class SearchInternetNode(BaseNode):
search_template = """
PROMPT:
Given the following user prompt, return a query that can be
You are a search engine and you need to generate a search query based on the user's prompt. \n
Given the following user prompt, return a query that can be
used to search the internet for relevant information. \n
You should return only the query string without any additional sentences. \n
You are taught to reply directly giving the search query. \n
For example, if the user prompt is "What is the capital of France?",
you should return "capital of France". \n
If yuo return something else, you will get a really bad grade. \n
USER PROMPT: {user_prompt}"""
search_prompt = PromptTemplate(