Added bedrock examples

This commit is contained in:
JGalego 2024-05-20 16:49:20 +01:00
parent f1a25233d6
commit 7e5ff4e410
15 changed files with 778 additions and 30 deletions

View File

@ -0,0 +1,4 @@
AWS_ACCESS_KEY_ID="..."
AWS_SECRET_ACCESS_KEY="..."
AWS_SESSION_TOKEN="..."
AWS_DEFAULT_REGION="..."

View File

@ -0,0 +1,3 @@
This folder contains examples of how to use ScrapeGraphAI with [Amazon Bedrock](https://aws.amazon.com/bedrock/) ⛰️. The examples show how to extract information from websites and files using a natural language prompt.
![ScrapeGraphAI with Amazon Bedrock](scrapegraphai_bedrock.png)

View File

@ -0,0 +1,63 @@
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""
import os
import json
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load variables from a local .env file into the process environment
load_dotenv()

# ************************************************
# Read the CSV file
# ************************************************

FILE_NAME = "inputs/username.csv"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

# The bundled sample separates fields with ";" (and has a space after one of
# the separators in the header row). With the default "," separator pandas
# would load every row as one fused column, so state the delimiter explicitly.
text = pd.read_csv(file_path, sep=";", skipinitialspace=True)

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0,
    },
    "embeddings": {
        "model": "bedrock/cohere.embed-multilingual-v3",
    },
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # Pass the textual rendering of the DataFrame, not the DataFrame object
    source=str(text),
    config=graph_config,
)

result = csv_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save the result to both csv and json (each call receives the name "result")
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -0,0 +1,127 @@
"""
Example of custom graph using existing nodes
"""
import json
from dotenv import load_dotenv
from langchain_aws import BedrockEmbeddings
from scrapegraphai.models import Bedrock
from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerNode,
RobotsNode
)
# Load variables from a local .env file into the process environment
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0,
    },
    "embeddings": {
        "model": "bedrock/cohere.embed-multilingual-v3",
    },
}

# ************************************************
# Define the graph nodes
# ************************************************

# Keep only the part after the last "/" of each configured model name
llm_model_id = graph_config["llm"]["model"].rsplit("/", 1)[-1]
embedder_model_id = graph_config["embeddings"]["model"].rsplit("/", 1)[-1]

llm_model = Bedrock(
    {
        "model_id": llm_model_id,
        "model_kwargs": {"temperature": graph_config["llm"]["temperature"]},
    }
)
embedder = BedrockEmbeddings(model_id=embedder_model_id)

# One node per pipeline stage; each declares the keys it reads and writes
robot_node = RobotsNode(
    input="url",
    output=["is_scrapable"],
    node_config={
        "llm_model": llm_model,
        "force_scraping": True,
        "verbose": True,
    },
)

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc", "link_urls", "img_urls"],
    node_config={
        "verbose": True,
        "headless": True,
    },
)

parse_node = ParseNode(
    input="doc",
    output=["parsed_doc"],
    node_config={
        "chunk_size": 4096,
        "verbose": True,
    },
)

rag_node = RAGNode(
    input="user_prompt & (parsed_doc | doc)",
    output=["relevant_chunks"],
    node_config={
        "llm_model": llm_model,
        "embedder_model": embedder,
        "verbose": True,
    },
)

generate_answer_node = GenerateAnswerNode(
    input="user_prompt & (relevant_chunks | parsed_doc | doc)",
    output=["answer"],
    node_config={
        "llm_model": llm_model,
        "verbose": True,
    },
)

# ************************************************
# Create the graph by defining the connections
# ************************************************

# The pipeline is linear, so each node is connected to the one listed after it
pipeline = [
    robot_node,
    fetch_node,
    parse_node,
    rag_node,
    generate_answer_node,
]

graph = BaseGraph(
    nodes=pipeline,
    edges=list(zip(pipeline, pipeline[1:])),
    entry_point=pipeline[0],
)

# ************************************************
# Execute the graph
# ************************************************

initial_state = {
    "user_prompt": "List me all the articles",
    "url": "https://perinim.github.io/projects",
}
result, execution_info = graph.execute(initial_state)

# Keep only the generated answer, with a fallback message when it is missing
result = result.get("answer", "No answer found.")
print(json.dumps(result, indent=4))

View File

@ -0,0 +1,120 @@
<?xml version="1.0"?>
<catalog>
<book id="bk101">
<author>Gambardella, Matthew</author>
<title>XML Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<publish_date>2000-10-01</publish_date>
<description>An in-depth look at creating applications
with XML.</description>
</book>
<book id="bk102">
<author>Ralls, Kim</author>
<title>Midnight Rain</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-12-16</publish_date>
<description>A former architect battles corporate zombies,
an evil sorceress, and her own childhood to become queen
of the world.</description>
</book>
<book id="bk103">
<author>Corets, Eva</author>
<title>Maeve Ascendant</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-11-17</publish_date>
<description>After the collapse of a nanotechnology
society in England, the young survivors lay the
foundation for a new society.</description>
</book>
<book id="bk104">
<author>Corets, Eva</author>
<title>Oberon's Legacy</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2001-03-10</publish_date>
<description>In post-apocalypse England, the mysterious
agent known only as Oberon helps to create a new life
for the inhabitants of London. Sequel to Maeve
Ascendant.</description>
</book>
<book id="bk105">
<author>Corets, Eva</author>
<title>The Sundered Grail</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2001-09-10</publish_date>
<description>The two daughters of Maeve, half-sisters,
battle one another for control of England. Sequel to
Oberon's Legacy.</description>
</book>
<book id="bk106">
<author>Randall, Cynthia</author>
<title>Lover Birds</title>
<genre>Romance</genre>
<price>4.95</price>
<publish_date>2000-09-02</publish_date>
<description>When Carla meets Paul at an ornithology
conference, tempers fly as feathers get ruffled.</description>
</book>
<book id="bk107">
<author>Thurman, Paula</author>
<title>Splish Splash</title>
<genre>Romance</genre>
<price>4.95</price>
<publish_date>2000-11-02</publish_date>
<description>A deep sea diver finds true love twenty
thousand leagues beneath the sea.</description>
</book>
<book id="bk108">
<author>Knorr, Stefan</author>
<title>Creepy Crawlies</title>
<genre>Horror</genre>
<price>4.95</price>
<publish_date>2000-12-06</publish_date>
<description>An anthology of horror stories about roaches,
centipedes, scorpions and other insects.</description>
</book>
<book id="bk109">
<author>Kress, Peter</author>
<title>Paradox Lost</title>
<genre>Science Fiction</genre>
<price>6.95</price>
<publish_date>2000-11-02</publish_date>
<description>After an inadvertent trip through a Heisenberg
Uncertainty Device, James Salway discovers the problems
of being quantum.</description>
</book>
<book id="bk110">
<author>O'Brien, Tim</author>
<title>Microsoft .NET: The Programming Bible</title>
<genre>Computer</genre>
<price>36.95</price>
<publish_date>2000-12-09</publish_date>
<description>Microsoft's .NET initiative is explored in
detail in this deep programmer's reference.</description>
</book>
<book id="bk111">
<author>O'Brien, Tim</author>
<title>MSXML3: A Comprehensive Guide</title>
<genre>Computer</genre>
<price>36.95</price>
<publish_date>2000-12-01</publish_date>
<description>The Microsoft MSXML3 parser is covered in
detail, with attention to XML DOM interfaces, XSLT processing,
SAX and more.</description>
</book>
<book id="bk112">
<author>Galos, Mike</author>
<title>Visual Studio 7: A Comprehensive Guide</title>
<genre>Computer</genre>
<price>49.95</price>
<publish_date>2001-04-16</publish_date>
<description>Microsoft Visual Studio 7 is explored in depth,
looking at how Visual Basic, Visual C++, C#, and ASP+ are
integrated into a comprehensive development
environment.</description>
</book>
</catalog>

View File

@ -0,0 +1,38 @@
{
"quiz": {
"sport": {
"q1": {
"question": "Which one is correct team name in NBA?",
"options": [
"New York Bulls",
"Los Angeles Kings",
"Golden State Warriros",
"Huston Rocket"
],
"answer": "Huston Rocket"
}
},
"maths": {
"q1": {
"question": "5 + 7 = ?",
"options": [
"10",
"11",
"12",
"13"
],
"answer": "12"
},
"q2": {
"question": "12 - 8 = ?",
"options": [
"1",
"2",
"3",
"4"
],
"answer": "4"
}
}
}
}

View File

@ -0,0 +1,105 @@
<body class="fixed-top-nav " style="padding-top: 57px;">
<header>
<nav id="navbar" class="navbar navbar-light navbar-expand-sm fixed-top">
<div class="container">
<a class="navbar-brand title font-weight-lighter" href="/"><span class="font-weight-bold">Marco&nbsp;</span>Perini</a> <button class="navbar-toggler collapsed ml-auto" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation"> <span class="sr-only">Toggle navigation</span> <span class="icon-bar top-bar"></span> <span class="icon-bar middle-bar"></span> <span class="icon-bar bottom-bar"></span> </button>
<div class="collapse navbar-collapse text-right" id="navbarNav">
<ul class="navbar-nav ml-auto flex-nowrap">
<li class="nav-item "> <a class="nav-link" href="/">About</a> </li>
<li class="nav-item dropdown active">
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Projects<span class="sr-only">(current)</span></a>
<div class="dropdown-menu dropdown-menu-right" aria-labelledby="navbarDropdown">
<a class="dropdown-item" href="/projects/">Projects</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="/competitions/">Competitions</a>
</div>
</li>
<li class="nav-item "> <a class="nav-link" href="/cv/">CV</a> </li>
<li class="toggle-container"> <button id="light-toggle" title="Change theme"> <i class="fa-solid fa-moon"></i> <i class="fa-solid fa-sun"></i> </button> </li>
</ul>
</div>
</div>
</nav>
<progress id="progress" value="0" max="284" style="top: 57px;">
<div class="progress-container"> <span class="progress-bar"></span> </div>
</progress>
</header>
<div class="container mt-5">
<div class="post">
<header class="post-header">
<h1 class="post-title">Projects</h1>
<p class="post-description"></p>
</header>
<article>
<div class="projects">
<div class="grid" style="position: relative; height: 861.992px;">
<div class="grid-sizer"></div>
<div class="grid-item" style="position: absolute; left: 0px; top: 0px;">
<a href="/projects/rotary-pendulum-rl/">
<div class="card hoverable">
<figure>
<picture> <img src="/assets/img/rotary_pybullet.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
</figure>
<div class="card-body">
<h4 class="card-title">Rotary Pendulum RL</h4>
<p class="card-text">Open Source project aimed at controlling a real life rotary pendulum using RL algorithms</p>
<div class="row ml-1 mr-1 p-0"> </div>
</div>
</div>
</a>
</div>
<div class="grid-sizer"></div>
<div class="grid-item" style="position: absolute; left: 260px; top: 0px;">
<a href="https://github.com/PeriniM/DQN-SwingUp" rel="external nofollow noopener" target="_blank">
<div class="card hoverable">
<figure>
<picture> <img src="/assets/img/value-policy-heatmaps.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
</figure>
<div class="card-body">
<h4 class="card-title">DQN Implementation from scratch</h4>
<p class="card-text">Developed a Deep Q-Network algorithm to train a simple and double pendulum</p>
<div class="row ml-1 mr-1 p-0"> </div>
</div>
</div>
</a>
</div>
<div class="grid-sizer"></div>
<div class="grid-item" style="position: absolute; left: 0px; top: 447.414px;">
<a href="https://github.com/PeriniM/Multi-Agents-HAED" rel="external nofollow noopener" target="_blank">
<div class="card hoverable">
<figure>
<picture> <img src="/assets/img/multi_agents_haed.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
</figure>
<div class="card-body">
<h4 class="card-title">Multi Agents HAED</h4>
<p class="card-text">University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.</p>
<div class="row ml-1 mr-1 p-0"> </div>
</div>
</div>
</a>
</div>
<div class="grid-sizer"></div>
<div class="grid-item" style="position: absolute; left: 260px; top: 370.172px;">
<a href="/projects/wireless-esc-drone/">
<div class="card hoverable">
<figure>
<picture> <img src="/assets/img/wireless_esc.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture>
</figure>
<div class="card-body">
<h4 class="card-title">Wireless ESC for Modular Drones</h4>
<p class="card-text">Modular drone architecture proposal and proof of concept. The project received maximum grade.</p>
<div class="row ml-1 mr-1 p-0"> </div>
</div>
</div>
</a>
</div>
</div>
</div>
</article>
</div>
</div>
<footer class="fixed-bottom">
<div class="container mt-0"> © Copyright 2023 Marco Perini. Powered by <a href="https://jekyllrb.com/" target="_blank" rel="external nofollow noopener">Jekyll</a> with <a href="https://github.com/alshedivat/al-folio" rel="external nofollow noopener" target="_blank">al-folio</a> theme. Hosted by <a href="https://pages.github.com/" target="_blank" rel="external nofollow noopener">GitHub Pages</a>. </div>
</footer>
<div class="hiddendiv common"></div>
</body>

View File

@ -0,0 +1,6 @@
Username; Identifier;First name;Last name
booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson
jenkins46;9346;Mary;Jenkins
smith79;5079;Jamie;Smith
1 Username Identifier First name Last name
2 booker12 9012 Rachel Booker
3 grey07 2070 Laura Grey
4 johnson81 4081 Craig Johnson
5 jenkins46 9346 Mary Jenkins
6 smith79 5079 Jamie Smith

View File

@ -0,0 +1,63 @@
"""
Basic example of scraping pipeline using JSONScraperGraph from JSON documents
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import JSONScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load variables from a local .env file into the process environment
load_dotenv()

# ************************************************
# Read the JSON file
# ************************************************

FILE_NAME = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, mode="r", encoding="utf-8") as json_file:
    text = json_file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0,
    },
    "embeddings": {
        "model": "bedrock/cohere.embed-multilingual-v3",
    },
}

# ************************************************
# Create the JSONScraperGraph instance and run it
# ************************************************

json_scraper_graph = JSONScraperGraph(
    prompt="List me all questions and options, no answers.",
    # The graph receives the file's text, not the open file object
    source=text,
    config=graph_config,
)

result = json_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

print(prettify_exec_info(json_scraper_graph.get_execution_info()))

# Save the result as csv first, then as json, both under the name "result"
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -0,0 +1,59 @@
"""
Basic example of scraping pipeline using SmartScraper from text
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
# Load variables from a local .env file into the process environment
load_dotenv()

# ************************************************
# Read the text file
# ************************************************

FILE_NAME = "inputs/plain_html_example.txt"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

# The HTML is read from disk here; it could equally come from an HTTP response
with open(file_path, mode="r", encoding="utf-8") as html_file:
    text = html_file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

llm_settings = {
    "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    "temperature": 0.0,
}
embeddings_settings = {
    "model": "bedrock/cohere.embed-multilingual-v3",
}
graph_config = {
    "llm": llm_settings,
    "embeddings": embeddings_settings,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    source=text,
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

print(prettify_exec_info(smart_scraper_graph.get_execution_info()))

Binary file not shown.

After

Width:  |  Height:  |  Size: 80 KiB

View File

@ -0,0 +1,47 @@
"""
Basic example of scraping pipeline using ScriptCreatorGraph
"""
from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info
# Load variables from a local .env file into the process environment
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0,
    },
    "embeddings": {
        "model": "bedrock/cohere.embed-multilingual-v3",
    },
    "library": "beautifulsoup",
}

# ************************************************
# Create the ScriptCreatorGraph instance and run it
# ************************************************

# The source may also be a string holding already downloaded HTML code
SOURCE_URL = "https://perinim.github.io/projects"

script_creator_graph = ScriptCreatorGraph(
    prompt="List me all the projects with their description.",
    source=SOURCE_URL,
    config=graph_config,
)

result = script_creator_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

print(prettify_exec_info(script_creator_graph.get_execution_info()))

View File

@ -0,0 +1,46 @@
"""
Example of Search Graph
"""
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load variables from a local .env file into the process environment
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0,
    },
    "embeddings": {
        "model": "bedrock/amazon.titan-embed-text-v2:0",
    },
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me Chioggia's famous dishes",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Save the result as csv first, then as json, both under the name "result"
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -1,42 +1,47 @@
"""
Basic example of scraping pipeline using SmartScraper
"""
Smartscraper example on bedrock
"""
import boto3
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
# NOTE(review): this span looks like a diff hunk rendered without +/- markers,
# so lines of two versions of the script are interleaved (`config` and
# `graph_config`, `graph` and `smart_scraper_graph`, one call left unclosed).
# It is not runnable as shown — TODO resolve it to a single version.
# NOTE(review): the `openai_key` / "gpt-4o" entries sit next to "bedrock/..."
# model ids in what is presented as a Bedrock example — confirm which
# configuration is intended.
# 0a. Initialize session
# If not required delete it
session = boto3.Session(
aws_access_key_id="...",
aws_secret_access_key="...",
aws_session_token="...",
region_name="us-east-1"
)
load_dotenv()
# 0b. Initialize client
client = session.client("bedrock-runtime")
# 1. Define graph configuration
config = {
# ************************************************
# Define the configuration for the graph
# ************************************************
openai_key = os.getenv("OPENAI_APIKEY")
graph_config = {
"llm": {
"client": client,
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
"temperature": 0.0,
"format": "json"
},
"embeddings": {
"client": client,
"model": "bedrock/cohere.embed-multilingual-v3",
"api_key": openai_key,
"model": "gpt-4o",
},
"verbose": True,
"headless": False,
}
# 2. Create graph instance
graph = SmartScraperGraph(
prompt="List me all the articles",
source="https://perinim.github.io/projects",
config=config
# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the projects with their description",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects/",
config=graph_config
)
# 3. Scrape away!
print(graph.run())
result = smart_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -0,0 +1,62 @@
"""
Basic example of scraping pipeline using XMLScraperGraph from XML documents
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import XMLScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load variables from a local .env file into the process environment
load_dotenv()

# ************************************************
# Read the XML file
# ************************************************

FILE_NAME = "inputs/books.xml"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, mode="r", encoding="utf-8") as xml_file:
    text = xml_file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0,
    },
    "embeddings": {
        "model": "bedrock/cohere.embed-multilingual-v3",
    },
}

# ************************************************
# Create the XMLScraperGraph instance and run it
# ************************************************

xml_scraper_graph = XMLScraperGraph(
    prompt="List me all the authors, title and genres of the books. Skip the preamble.",
    # The graph receives the file's text, not the open file object
    source=text,
    config=graph_config,
)

result = xml_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

print(prettify_exec_info(xml_scraper_graph.get_execution_info()))

# Save the result as csv first, then as json, both under the name "result"
for export in (convert_to_csv, convert_to_json):
    export(result, "result")