mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
fix(examples): local, mixed models and fixed SearchGraph embeddings problem
This commit is contained in:
parent
186c0d035d
commit
6b71ec1d2b
@ -1,54 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from scrapegraphai.graphs import CSVScraperGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Read the csv file
|
||||
# ************************************************
|
||||
|
||||
text = pd.read_csv("inputs/username.csv")
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the CSVScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
csv_scraper_graph = CSVScraperGraph(
|
||||
prompt="List me all the last names",
|
||||
source=str(text), # Pass the content of the file, not the file object
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = csv_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = csv_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
@ -1,120 +0,0 @@
|
||||
<?xml version="1.0"?>
|
||||
<catalog>
|
||||
<book id="bk101">
|
||||
<author>Gambardella, Matthew</author>
|
||||
<title>XML Developer's Guide</title>
|
||||
<genre>Computer</genre>
|
||||
<price>44.95</price>
|
||||
<publish_date>2000-10-01</publish_date>
|
||||
<description>An in-depth look at creating applications
|
||||
with XML.</description>
|
||||
</book>
|
||||
<book id="bk102">
|
||||
<author>Ralls, Kim</author>
|
||||
<title>Midnight Rain</title>
|
||||
<genre>Fantasy</genre>
|
||||
<price>5.95</price>
|
||||
<publish_date>2000-12-16</publish_date>
|
||||
<description>A former architect battles corporate zombies,
|
||||
an evil sorceress, and her own childhood to become queen
|
||||
of the world.</description>
|
||||
</book>
|
||||
<book id="bk103">
|
||||
<author>Corets, Eva</author>
|
||||
<title>Maeve Ascendant</title>
|
||||
<genre>Fantasy</genre>
|
||||
<price>5.95</price>
|
||||
<publish_date>2000-11-17</publish_date>
|
||||
<description>After the collapse of a nanotechnology
|
||||
society in England, the young survivors lay the
|
||||
foundation for a new society.</description>
|
||||
</book>
|
||||
<book id="bk104">
|
||||
<author>Corets, Eva</author>
|
||||
<title>Oberon's Legacy</title>
|
||||
<genre>Fantasy</genre>
|
||||
<price>5.95</price>
|
||||
<publish_date>2001-03-10</publish_date>
|
||||
<description>In post-apocalypse England, the mysterious
|
||||
agent known only as Oberon helps to create a new life
|
||||
for the inhabitants of London. Sequel to Maeve
|
||||
Ascendant.</description>
|
||||
</book>
|
||||
<book id="bk105">
|
||||
<author>Corets, Eva</author>
|
||||
<title>The Sundered Grail</title>
|
||||
<genre>Fantasy</genre>
|
||||
<price>5.95</price>
|
||||
<publish_date>2001-09-10</publish_date>
|
||||
<description>The two daughters of Maeve, half-sisters,
|
||||
battle one another for control of England. Sequel to
|
||||
Oberon's Legacy.</description>
|
||||
</book>
|
||||
<book id="bk106">
|
||||
<author>Randall, Cynthia</author>
|
||||
<title>Lover Birds</title>
|
||||
<genre>Romance</genre>
|
||||
<price>4.95</price>
|
||||
<publish_date>2000-09-02</publish_date>
|
||||
<description>When Carla meets Paul at an ornithology
|
||||
conference, tempers fly as feathers get ruffled.</description>
|
||||
</book>
|
||||
<book id="bk107">
|
||||
<author>Thurman, Paula</author>
|
||||
<title>Splish Splash</title>
|
||||
<genre>Romance</genre>
|
||||
<price>4.95</price>
|
||||
<publish_date>2000-11-02</publish_date>
|
||||
<description>A deep sea diver finds true love twenty
|
||||
thousand leagues beneath the sea.</description>
|
||||
</book>
|
||||
<book id="bk108">
|
||||
<author>Knorr, Stefan</author>
|
||||
<title>Creepy Crawlies</title>
|
||||
<genre>Horror</genre>
|
||||
<price>4.95</price>
|
||||
<publish_date>2000-12-06</publish_date>
|
||||
<description>An anthology of horror stories about roaches,
|
||||
centipedes, scorpions and other insects.</description>
|
||||
</book>
|
||||
<book id="bk109">
|
||||
<author>Kress, Peter</author>
|
||||
<title>Paradox Lost</title>
|
||||
<genre>Science Fiction</genre>
|
||||
<price>6.95</price>
|
||||
<publish_date>2000-11-02</publish_date>
|
||||
<description>After an inadvertant trip through a Heisenberg
|
||||
Uncertainty Device, James Salway discovers the problems
|
||||
of being quantum.</description>
|
||||
</book>
|
||||
<book id="bk110">
|
||||
<author>O'Brien, Tim</author>
|
||||
<title>Microsoft .NET: The Programming Bible</title>
|
||||
<genre>Computer</genre>
|
||||
<price>36.95</price>
|
||||
<publish_date>2000-12-09</publish_date>
|
||||
<description>Microsoft's .NET initiative is explored in
|
||||
detail in this deep programmer's reference.</description>
|
||||
</book>
|
||||
<book id="bk111">
|
||||
<author>O'Brien, Tim</author>
|
||||
<title>MSXML3: A Comprehensive Guide</title>
|
||||
<genre>Computer</genre>
|
||||
<price>36.95</price>
|
||||
<publish_date>2000-12-01</publish_date>
|
||||
<description>The Microsoft MSXML3 parser is covered in
|
||||
detail, with attention to XML DOM interfaces, XSLT processing,
|
||||
SAX and more.</description>
|
||||
</book>
|
||||
<book id="bk112">
|
||||
<author>Galos, Mike</author>
|
||||
<title>Visual Studio 7: A Comprehensive Guide</title>
|
||||
<genre>Computer</genre>
|
||||
<price>49.95</price>
|
||||
<publish_date>2001-04-16</publish_date>
|
||||
<description>Microsoft Visual Studio 7 is explored in depth,
|
||||
looking at how Visual Basic, Visual C++, C#, and ASP+ are
|
||||
integrated into a comprehensive development
|
||||
environment.</description>
|
||||
</book>
|
||||
</catalog>
|
||||
@ -1,182 +0,0 @@
|
||||
{
|
||||
"kind":"youtube#searchListResponse",
|
||||
"etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg",
|
||||
"nextPageToken":"CAUQAA",
|
||||
"regionCode":"NL",
|
||||
"pageInfo":{
|
||||
"totalResults":1000000,
|
||||
"resultsPerPage":5
|
||||
},
|
||||
"items":[
|
||||
{
|
||||
"kind":"youtube#searchResult",
|
||||
"etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ",
|
||||
"id":{
|
||||
"kind":"youtube#video",
|
||||
"videoId":"TvWDY4Mm5GM"
|
||||
},
|
||||
"snippet":{
|
||||
"publishedAt":"2023-07-24T14:15:01Z",
|
||||
"channelId":"UCwozCpFp9g9x0wAzuFh0hwQ",
|
||||
"title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts",
|
||||
"description":"",
|
||||
"thumbnails":{
|
||||
"default":{
|
||||
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg",
|
||||
"width":120,
|
||||
"height":90
|
||||
},
|
||||
"medium":{
|
||||
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg",
|
||||
"width":320,
|
||||
"height":180
|
||||
},
|
||||
"high":{
|
||||
"url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg",
|
||||
"width":480,
|
||||
"height":360
|
||||
}
|
||||
},
|
||||
"channelTitle":"FC Motivate",
|
||||
"liveBroadcastContent":"none",
|
||||
"publishTime":"2023-07-24T14:15:01Z"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind":"youtube#searchResult",
|
||||
"etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k",
|
||||
"id":{
|
||||
"kind":"youtube#video",
|
||||
"videoId":"aZM_42CcNZ4"
|
||||
},
|
||||
"snippet":{
|
||||
"publishedAt":"2023-07-24T16:09:27Z",
|
||||
"channelId":"UCM5gMM_HqfKHYIEJ3lstMUA",
|
||||
"title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰",
|
||||
"description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...",
|
||||
"thumbnails":{
|
||||
"default":{
|
||||
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg",
|
||||
"width":120,
|
||||
"height":90
|
||||
},
|
||||
"medium":{
|
||||
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg",
|
||||
"width":320,
|
||||
"height":180
|
||||
},
|
||||
"high":{
|
||||
"url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg",
|
||||
"width":480,
|
||||
"height":360
|
||||
}
|
||||
},
|
||||
"channelTitle":"John Nellis",
|
||||
"liveBroadcastContent":"none",
|
||||
"publishTime":"2023-07-24T16:09:27Z"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind":"youtube#searchResult",
|
||||
"etag":"WbBz4oh9I5VaYj91LjeJvffrBVY",
|
||||
"id":{
|
||||
"kind":"youtube#video",
|
||||
"videoId":"wkP3XS3aNAY"
|
||||
},
|
||||
"snippet":{
|
||||
"publishedAt":"2023-07-24T16:00:50Z",
|
||||
"channelId":"UC4EP1dxFDPup_aFLt0ElsDw",
|
||||
"title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL",
|
||||
"description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...",
|
||||
"thumbnails":{
|
||||
"default":{
|
||||
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg",
|
||||
"width":120,
|
||||
"height":90
|
||||
},
|
||||
"medium":{
|
||||
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg",
|
||||
"width":320,
|
||||
"height":180
|
||||
},
|
||||
"high":{
|
||||
"url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg",
|
||||
"width":480,
|
||||
"height":360
|
||||
}
|
||||
},
|
||||
"channelTitle":"Shoot for Love",
|
||||
"liveBroadcastContent":"none",
|
||||
"publishTime":"2023-07-24T16:00:50Z"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind":"youtube#searchResult",
|
||||
"etag":"juxv_FhT_l4qrR05S1QTrb4CGh8",
|
||||
"id":{
|
||||
"kind":"youtube#video",
|
||||
"videoId":"rJkDZ0WvfT8"
|
||||
},
|
||||
"snippet":{
|
||||
"publishedAt":"2023-07-24T10:00:39Z",
|
||||
"channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ",
|
||||
"title":"TOP 10 DEFENDERS 2023",
|
||||
"description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...",
|
||||
"thumbnails":{
|
||||
"default":{
|
||||
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg",
|
||||
"width":120,
|
||||
"height":90
|
||||
},
|
||||
"medium":{
|
||||
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg",
|
||||
"width":320,
|
||||
"height":180
|
||||
},
|
||||
"high":{
|
||||
"url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg",
|
||||
"width":480,
|
||||
"height":360
|
||||
}
|
||||
},
|
||||
"channelTitle":"Home of Football",
|
||||
"liveBroadcastContent":"none",
|
||||
"publishTime":"2023-07-24T10:00:39Z"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind":"youtube#searchResult",
|
||||
"etag":"wtuknXTmI1txoULeH3aWaOuXOow",
|
||||
"id":{
|
||||
"kind":"youtube#video",
|
||||
"videoId":"XH0rtu4U6SE"
|
||||
},
|
||||
"snippet":{
|
||||
"publishedAt":"2023-07-21T16:30:05Z",
|
||||
"channelId":"UCwozCpFp9g9x0wAzuFh0hwQ",
|
||||
"title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts",
|
||||
"description":"",
|
||||
"thumbnails":{
|
||||
"default":{
|
||||
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg",
|
||||
"width":120,
|
||||
"height":90
|
||||
},
|
||||
"medium":{
|
||||
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg",
|
||||
"width":320,
|
||||
"height":180
|
||||
},
|
||||
"high":{
|
||||
"url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg",
|
||||
"width":480,
|
||||
"height":360
|
||||
}
|
||||
},
|
||||
"channelTitle":"FC Motivate",
|
||||
"liveBroadcastContent":"none",
|
||||
"publishTime":"2023-07-21T16:30:05Z"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -1,105 +0,0 @@
|
||||
<body class="fixed-top-nav " style="padding-top: 57px;">
|
||||
<header>
|
||||
<nav id="navbar" class="navbar navbar-light navbar-expand-sm fixed-top">
|
||||
<div class="container">
|
||||
<a class="navbar-brand title font-weight-lighter" href="/"><span class="font-weight-bold">Marco </span>Perini</a> <button class="navbar-toggler collapsed ml-auto" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation"> <span class="sr-only">Toggle navigation</span> <span class="icon-bar top-bar"></span> <span class="icon-bar middle-bar"></span> <span class="icon-bar bottom-bar"></span> </button>
|
||||
<div class="collapse navbar-collapse text-right" id="navbarNav">
|
||||
<ul class="navbar-nav ml-auto flex-nowrap">
|
||||
<li class="nav-item "> <a class="nav-link" href="/">About</a> </li>
|
||||
<li class="nav-item dropdown active">
|
||||
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Projects<span class="sr-only">(current)</span></a>
|
||||
<div class="dropdown-menu dropdown-menu-right" aria-labelledby="navbarDropdown">
|
||||
<a class="dropdown-item" href="/projects/">Projects</a>
|
||||
<div class="dropdown-divider"></div>
|
||||
<a class="dropdown-item" href="/competitions/">Competitions</a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="nav-item "> <a class="nav-link" href="/cv/">CV</a> </li>
|
||||
<li class="toggle-container"> <button id="light-toggle" title="Change theme"> <i class="fa-solid fa-moon"></i> <i class="fa-solid fa-sun"></i> </button> </li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</nav>
|
||||
<progress id="progress" value="0" max="284" style="top: 57px;">
|
||||
<div class="progress-container"> <span class="progress-bar"></span> </div>
|
||||
</progress>
|
||||
</header>
|
||||
<div class="container mt-5">
|
||||
<div class="post">
|
||||
<header class="post-header">
|
||||
<h1 class="post-title">Projects</h1>
|
||||
<p class="post-description"></p>
|
||||
</header>
|
||||
<article>
|
||||
<div class="projects">
|
||||
<div class="grid" style="position: relative; height: 861.992px;">
|
||||
<div class="grid-sizer"></div>
|
||||
<div class="grid-item" style="position: absolute; left: 0px; top: 0px;">
|
||||
<a href="/projects/rotary-pendulum-rl/">
|
||||
<div class="card hoverable">
|
||||
<figure>
|
||||
<picture> <img src="/assets/img/rotary_pybullet.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
|
||||
</figure>
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">Rotary Pendulum RL</h4>
|
||||
<p class="card-text">Open Source project aimed at controlling a real life rotary pendulum using RL algorithms</p>
|
||||
<div class="row ml-1 mr-1 p-0"> </div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
<div class="grid-sizer"></div>
|
||||
<div class="grid-item" style="position: absolute; left: 260px; top: 0px;">
|
||||
<a href="https://github.com/PeriniM/DQN-SwingUp" rel="external nofollow noopener" target="_blank">
|
||||
<div class="card hoverable">
|
||||
<figure>
|
||||
<picture> <img src="/assets/img/value-policy-heatmaps.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
|
||||
</figure>
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">DQN Implementation from scratch</h4>
|
||||
<p class="card-text">Developed a Deep Q-Network algorithm to train a simple and double pendulum</p>
|
||||
<div class="row ml-1 mr-1 p-0"> </div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
<div class="grid-sizer"></div>
|
||||
<div class="grid-item" style="position: absolute; left: 0px; top: 447.414px;">
|
||||
<a href="https://github.com/PeriniM/Multi-Agents-HAED" rel="external nofollow noopener" target="_blank">
|
||||
<div class="card hoverable">
|
||||
<figure>
|
||||
<picture> <img src="/assets/img/multi_agents_haed.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
|
||||
</figure>
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">Multi Agents HAED</h4>
|
||||
<p class="card-text">University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.</p>
|
||||
<div class="row ml-1 mr-1 p-0"> </div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
<div class="grid-sizer"></div>
|
||||
<div class="grid-item" style="position: absolute; left: 260px; top: 370.172px;">
|
||||
<a href="/projects/wireless-esc-drone/">
|
||||
<div class="card hoverable">
|
||||
<figure>
|
||||
<picture> <img src="/assets/img/wireless_esc.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
|
||||
</figure>
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">Wireless ESC for Modular Drones</h4>
|
||||
<p class="card-text">Modular drone architecture proposal and proof of concept. The project received maximum grade.</p>
|
||||
<div class="row ml-1 mr-1 p-0"> </div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</article>
|
||||
</div>
|
||||
</div>
|
||||
<footer class="fixed-bottom">
|
||||
<div class="container mt-0"> © Copyright 2023 Marco Perini. Powered by <a href="https://jekyllrb.com/" target="_blank" rel="external nofollow noopener">Jekyll</a> with <a href="https://github.com/alshedivat/al-folio" rel="external nofollow noopener" target="_blank">al-folio</a> theme. Hosted by <a href="https://pages.github.com/" target="_blank" rel="external nofollow noopener">GitHub Pages</a>. </div>
|
||||
</footer>
|
||||
<div class="hiddendiv common"></div>
|
||||
</body>
|
||||
@ -1,7 +0,0 @@
|
||||
Username; Identifier;First name;Last name
|
||||
booker12;9012;Rachel;Booker
|
||||
grey07;2070;Laura;Grey
|
||||
johnson81;4081;Craig;Johnson
|
||||
jenkins46;9346;Mary;Jenkins
|
||||
smith79;5079;Jamie;Smith
|
||||
|
||||
|
@ -1,61 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using JSONScraperGraph from JSON documents
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import JSONScraperGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Read the JSON file
|
||||
# ************************************************
|
||||
|
||||
FILE_NAME = "inputs/example.json"
|
||||
curr_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(curr_dir, FILE_NAME)
|
||||
|
||||
with open(file_path, 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the JSONScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
json_scraper_graph = JSONScraperGraph(
|
||||
prompt="List me all the authors, title and genres of the books",
|
||||
source=text, # Pass the content of the file, not the file object
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = json_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = json_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
@ -1,57 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using SmartScraper from text
|
||||
"""
|
||||
|
||||
import os
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Read the text file
|
||||
# ************************************************
|
||||
|
||||
FILE_NAME = "inputs/plain_html_example.txt"
|
||||
curr_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(curr_dir, FILE_NAME)
|
||||
|
||||
# It could also be an HTTP request using the requests module
|
||||
with open(file_path, 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SmartScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the projects with their description.",
|
||||
source=text,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -1,56 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using SmartScraper from XML documents
|
||||
"""
|
||||
import os
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Read the XML file
|
||||
# ************************************************
|
||||
|
||||
FILE_NAME = "inputs/books.xml"
|
||||
curr_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(curr_dir, FILE_NAME)
|
||||
|
||||
with open(file_path, 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SmartScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the authors, title and genres of the books",
|
||||
source=text, # Pass the content of the file, not the file object
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -1,44 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using ScriptCreatorGraph
|
||||
"""
|
||||
from scrapegraphai.graphs import ScriptCreatorGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json",
|
||||
# "model_tokens": 2000, # set context length arbitrarily,
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
},
|
||||
"library": "beautifoulsoup"
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the ScriptCreatorGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = ScriptCreatorGraph(
|
||||
prompt="List me all the news with their description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://perinim.github.io/projects",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -1,49 +0,0 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
},
|
||||
"max_results": 5,
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
@ -1,43 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using SmartScraper
|
||||
"""
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/llama3",
|
||||
"temperature": 0,
|
||||
"format": "json",
|
||||
"model_tokens": 2000, # set context length arbitrarily,
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"base_url": "http://localhost:11434",
|
||||
}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SmartScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the news with their description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://perinim.github.io/projects",
|
||||
config=graph_config
|
||||
)
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -1,61 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using XMLScraperGraph from XML documents
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import XMLScraperGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Read the XML file
|
||||
# ************************************************
|
||||
|
||||
FILE_NAME = "inputs/books.xml"
|
||||
curr_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(curr_dir, FILE_NAME)
|
||||
|
||||
with open(file_path, 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the XMLScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
xml_scraper_graph = XMLScraperGraph(
|
||||
prompt="List me all the authors, title and genres of the books",
|
||||
source=text, # Pass the content of the file, not the file object
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = xml_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = xml_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
@ -2,15 +2,20 @@
|
||||
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
from scrapegraphai.graphs import CSVScraperGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Read the csv file
|
||||
# Read the CSV file
|
||||
# ************************************************
|
||||
|
||||
text = pd.read_csv("inputs/username.csv")
|
||||
FILE_NAME = "inputs/username.csv"
|
||||
curr_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(curr_dir, FILE_NAME)
|
||||
|
||||
text = pd.read_csv(file_path)
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
@ -18,7 +23,7 @@ text = pd.read_csv("inputs/username.csv")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"model": "ollama/llama3",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
@ -28,7 +33,8 @@ graph_config = {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434",
|
||||
}
|
||||
},
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -35,7 +35,8 @@ graph_config = {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434",
|
||||
}
|
||||
},
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -34,7 +34,8 @@ graph_config = {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434",
|
||||
}
|
||||
},
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
@ -42,7 +43,7 @@ graph_config = {
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the news with their description.",
|
||||
prompt="List me all the projects",
|
||||
source=text,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
@ -33,7 +33,8 @@ graph_config = {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
}
|
||||
},
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -19,7 +19,8 @@ graph_config = {
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"library": "beautifoulsoup"
|
||||
"library": "beautifoulsoup",
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -11,16 +11,15 @@ from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_i
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"model": "ollama/llama3",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
# "format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"max_results": 5,
|
||||
"verbose": True,
|
||||
|
||||
@ -9,17 +9,17 @@ from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"model": "ollama/llama3",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
}
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
@ -27,7 +27,7 @@ graph_config = {
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the news with their description.",
|
||||
prompt="List me all the projects with their description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://perinim.github.io/projects",
|
||||
config=graph_config
|
||||
|
||||
@ -25,7 +25,7 @@ with open(file_path, 'r', encoding="utf-8") as file:
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"model": "ollama/llama3",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
@ -35,7 +35,8 @@ graph_config = {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434",
|
||||
}
|
||||
},
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -12,18 +12,21 @@ load_dotenv()
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
openai_key = os.getenv("OPENAI_APIKEY")
|
||||
groq_key = os.getenv("GROQ_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
"model": "groq/gemma-7b-it",
|
||||
"api_key": groq_key,
|
||||
"temperature": 0
|
||||
},
|
||||
"embeddings": {
|
||||
"api_key": openai_key,
|
||||
"model": "gpt-3.5-turbo",
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"max_results": 2,
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -25,7 +25,8 @@ graph_config = {
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"headless": False
|
||||
"headless": False,
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -2,6 +2,8 @@
|
||||
SearchGraph Module
|
||||
"""
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from ..nodes import (
|
||||
SearchInternetNode,
|
||||
@ -40,6 +42,8 @@ class SearchGraph(AbstractGraph):
|
||||
def __init__(self, prompt: str, config: dict):
|
||||
|
||||
self.max_results = config.get("max_results", 3)
|
||||
self.copy_config = deepcopy(config)
|
||||
|
||||
super().__init__(prompt, config)
|
||||
|
||||
def _create_graph(self) -> BaseGraph:
|
||||
@ -57,7 +61,7 @@ class SearchGraph(AbstractGraph):
|
||||
smart_scraper_instance = SmartScraperGraph(
|
||||
prompt="",
|
||||
source="",
|
||||
config=self.config
|
||||
config=self.copy_config
|
||||
)
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -69,10 +69,13 @@ class SearchInternetNode(BaseNode):
|
||||
|
||||
search_template = """
|
||||
PROMPT:
|
||||
Given the following user prompt, return a query that can be
|
||||
You are a search engine and you need to generate a search query based on the user's prompt. \n
|
||||
Given the following user prompt, return a query that can be
|
||||
used to search the internet for relevant information. \n
|
||||
You should return only the query string without any additional sentences. \n
|
||||
You are taught to reply directly giving the search query. \n
|
||||
For example, if the user prompt is "What is the capital of France?",
|
||||
you should return "capital of France". \n
|
||||
If you return something else, you will get a really bad grade. \n
|
||||
USER PROMPT: {user_prompt}"""
|
||||
|
||||
search_prompt = PromptTemplate(
|
||||
|
||||
Loading…
Reference in New Issue
Block a user