diff --git a/examples/bedrock/.env.example b/examples/bedrock/.env.example new file mode 100644 index 00000000..cd27769e --- /dev/null +++ b/examples/bedrock/.env.example @@ -0,0 +1,4 @@ +AWS_ACCESS_KEY_ID="..." +AWS_SECRET_ACCESS_KEY="..." +AWS_SESSION_TOKEN="..." +AWS_DEFAULT_REGION="..." \ No newline at end of file diff --git a/examples/bedrock/README.md b/examples/bedrock/README.md new file mode 100644 index 00000000..88edd82c --- /dev/null +++ b/examples/bedrock/README.md @@ -0,0 +1,3 @@ +This folder contains examples of how to use ScrapeGraphAI with [Amazon Bedrock](https://aws.amazon.com/bedrock/) ⛰️. The examples show how to extract information from websites and files using a natural language prompt. + +![](scrapegraphai_bedrock.png) \ No newline at end of file diff --git a/examples/bedrock/csv_scraper_bedrock.py b/examples/bedrock/csv_scraper_bedrock.py new file mode 100644 index 00000000..1fe09d0f --- /dev/null +++ b/examples/bedrock/csv_scraper_bedrock.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using CSVScraperGraph from CSV documents +""" + +import os +import json + +from dotenv import load_dotenv + +import pandas as pd + +from scrapegraphai.graphs import CSVScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the CSVScraperGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperGraph( + prompt="List me all the last names", + source=str(text), # Pass the content of the file, not the file object + config=graph_config +) + +result = csv_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/bedrock/custom_graph_bedrock.py b/examples/bedrock/custom_graph_bedrock.py new file mode 100644 index 00000000..d550b46b --- /dev/null +++ b/examples/bedrock/custom_graph_bedrock.py @@ -0,0 +1,127 @@ +""" +Example of custom graph using existing nodes +""" + +import json + +from dotenv import load_dotenv + +from langchain_aws import BedrockEmbeddings +from scrapegraphai.models import Bedrock +from scrapegraphai.graphs import BaseGraph +from scrapegraphai.nodes import ( + FetchNode, + ParseNode, + RAGNode, + GenerateAnswerNode, + RobotsNode +) + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Define the graph nodes +# ************************************************ + +llm_model = Bedrock({ + 'model_id': graph_config["llm"]["model"].split("/")[-1], + 'model_kwargs': { + 'temperature': 0.0 + }}) +embedder = BedrockEmbeddings(model_id=graph_config["embeddings"]["model"].split("/")[-1]) + +# Define the nodes for the graph +robot_node = RobotsNode( + input="url", + output=["is_scrapable"], + node_config={ + "llm_model": llm_model, + "force_scraping": True, + "verbose": True, + } +) + +fetch_node = FetchNode( + input="url | local_dir", + output=["doc", "link_urls", "img_urls"], + node_config={ + "verbose": True, + "headless": True, + } +) + +parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": 4096, + "verbose": True, + } +) + +rag_node = RAGNode( + input="user_prompt & (parsed_doc | doc)", + output=["relevant_chunks"], + node_config={ + "llm_model": llm_model, + "embedder_model": embedder, + "verbose": True, + } +) + +generate_answer_node = GenerateAnswerNode( + input="user_prompt & (relevant_chunks | parsed_doc | doc)", + output=["answer"], + node_config={ + "llm_model": llm_model, + "verbose": True, + } +) + +# ************************************************ +# Create the graph by defining the connections +# ************************************************ + +graph = BaseGraph( + nodes=[ + robot_node, + fetch_node, + parse_node, + rag_node, + generate_answer_node, + ], + edges=[ + (robot_node, fetch_node), + (fetch_node, parse_node), + (parse_node, rag_node), + (rag_node, generate_answer_node) + ], + entry_point=robot_node +) + +# ************************************************ +# Execute the graph +# ************************************************ + +result, execution_info = graph.execute({ + "user_prompt": "List me all the articles", + "url": "https://perinim.github.io/projects" +}) + +# Get the answer from the result +result = result.get("answer", "No answer found.") +print(json.dumps(result, indent=4)) diff --git a/examples/bedrock/inputs/books.xml b/examples/bedrock/inputs/books.xml new file mode 100644 index 00000000..e3d1fe87 --- /dev/null +++ b/examples/bedrock/inputs/books.xml @@ -0,0 +1,120 @@ + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications + with XML. + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + 2000-12-16 + A former architect battles corporate zombies, + an evil sorceress, and her own childhood to become queen + of the world. + + + Corets, Eva + Maeve Ascendant + Fantasy + 5.95 + 2000-11-17 + After the collapse of a nanotechnology + society in England, the young survivors lay the + foundation for a new society. + + + Corets, Eva + Oberon's Legacy + Fantasy + 5.95 + 2001-03-10 + In post-apocalypse England, the mysterious + agent known only as Oberon helps to create a new life + for the inhabitants of London. Sequel to Maeve + Ascendant. + + + Corets, Eva + The Sundered Grail + Fantasy + 5.95 + 2001-09-10 + The two daughters of Maeve, half-sisters, + battle one another for control of England. Sequel to + Oberon's Legacy. + + + Randall, Cynthia + Lover Birds + Romance + 4.95 + 2000-09-02 + When Carla meets Paul at an ornithology + conference, tempers fly as feathers get ruffled. + + + Thurman, Paula + Splish Splash + Romance + 4.95 + 2000-11-02 + A deep sea diver finds true love twenty + thousand leagues beneath the sea. + + + Knorr, Stefan + Creepy Crawlies + Horror + 4.95 + 2000-12-06 + An anthology of horror stories about roaches, + centipedes, scorpions and other insects. + + + Kress, Peter + Paradox Lost + Science Fiction + 6.95 + 2000-11-02 + After an inadvertant trip through a Heisenberg + Uncertainty Device, James Salway discovers the problems + of being quantum. + + + O'Brien, Tim + Microsoft .NET: The Programming Bible + Computer + 36.95 + 2000-12-09 + Microsoft's .NET initiative is explored in + detail in this deep programmer's reference. + + + O'Brien, Tim + MSXML3: A Comprehensive Guide + Computer + 36.95 + 2000-12-01 + The Microsoft MSXML3 parser is covered in + detail, with attention to XML DOM interfaces, XSLT processing, + SAX and more. + + + Galos, Mike + Visual Studio 7: A Comprehensive Guide + Computer + 49.95 + 2001-04-16 + Microsoft Visual Studio 7 is explored in depth, + looking at how Visual Basic, Visual C++, C#, and ASP+ are + integrated into a comprehensive development + environment. + + \ No newline at end of file diff --git a/examples/bedrock/inputs/example.json b/examples/bedrock/inputs/example.json new file mode 100644 index 00000000..d729b76a --- /dev/null +++ b/examples/bedrock/inputs/example.json @@ -0,0 +1,38 @@ +{ + "quiz": { + "sport": { + "q1": { + "question": "Which one is correct team name in NBA?", + "options": [ + "New York Bulls", + "Los Angeles Kings", + "Golden State Warriros", + "Huston Rocket" + ], + "answer": "Huston Rocket" + } + }, + "maths": { + "q1": { + "question": "5 + 7 = ?", + "options": [ + "10", + "11", + "12", + "13" + ], + "answer": "12" + }, + "q2": { + "question": "12 - 8 = ?", + "options": [ + "1", + "2", + "3", + "4" + ], + "answer": "4" + } + } + } +} \ No newline at end of file diff --git a/examples/bedrock/inputs/plain_html_example.txt b/examples/bedrock/inputs/plain_html_example.txt new file mode 100644 index 00000000..78f814ae --- /dev/null +++ b/examples/bedrock/inputs/plain_html_example.txt @@ -0,0 +1,105 @@ + +
+ + +
+
+
+
+
+
+

Projects

+

+
+
+ +
+
+
+ +
+ \ No newline at end of file diff --git a/examples/bedrock/inputs/username.csv b/examples/bedrock/inputs/username.csv new file mode 100644 index 00000000..8c039d7e --- /dev/null +++ b/examples/bedrock/inputs/username.csv @@ -0,0 +1,6 @@ +Username; Identifier;First name;Last name +booker12;9012;Rachel;Booker +grey07;2070;Laura;Grey +johnson81;4081;Craig;Johnson +jenkins46;9346;Mary;Jenkins +smith79;5079;Jamie;Smith \ No newline at end of file diff --git a/examples/bedrock/json_scraper_bedrock.py b/examples/bedrock/json_scraper_bedrock.py new file mode 100644 index 00000000..1b7dde0b --- /dev/null +++ b/examples/bedrock/json_scraper_bedrock.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using JSONScraperGraph from JSON documents +""" + +import os +import json + +from dotenv import load_dotenv + +from scrapegraphai.graphs import JSONScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the JSON file +# ************************************************ + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the JSONScraperGraph instance and run it +# ************************************************ + +json_scraper_graph = JSONScraperGraph( + prompt="List me all questions and options, no answers.", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = json_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = json_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") + diff --git a/examples/bedrock/scrape_plain_text_bedrock.py b/examples/bedrock/scrape_plain_text_bedrock.py new file mode 100644 index 00000000..5cc2067c --- /dev/null +++ b/examples/bedrock/scrape_plain_text_bedrock.py @@ -0,0 +1,59 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" + +import os +import json + +from dotenv import load_dotenv + +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the text file +# ************************************************ + +FILE_NAME = "inputs/plain_html_example.txt" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +# It could be also a http request using the request model +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + source=text, + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/scrapegraphai_bedrock.png b/examples/bedrock/scrapegraphai_bedrock.png new file mode 100644 index 00000000..918cf191 Binary files /dev/null and b/examples/bedrock/scrapegraphai_bedrock.png differ diff --git a/examples/bedrock/script_generator_bedrock.py b/examples/bedrock/script_generator_bedrock.py new file mode 100644 index 00000000..038bfb53 --- /dev/null +++ b/examples/bedrock/script_generator_bedrock.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using ScriptCreatorGraph +""" + +from dotenv import load_dotenv + +from scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + }, + "library": "beautifulsoup" +} + +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +script_creator_graph = ScriptCreatorGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects", + config=graph_config +) + +result = script_creator_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = script_creator_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/bedrock/search_graph_bedrock.py b/examples/bedrock/search_graph_bedrock.py new file mode 100644 index 00000000..79e2c803 --- /dev/null +++ b/examples/bedrock/search_graph_bedrock.py @@ -0,0 +1,46 @@ +""" +Example of Search Graph +""" + +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/amazon.titan-embed-text-v2:0" + } +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me Chioggia's famous dishes", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/bedrock/smart_scraper_bedrock.py b/examples/bedrock/smart_scraper_bedrock.py index fff586f2..4f0952ae 100644 --- a/examples/bedrock/smart_scraper_bedrock.py +++ b/examples/bedrock/smart_scraper_bedrock.py @@ -1,42 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper """ -Smartscraper example on bedrock -""" -import boto3 +import os +from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info -# 0a. Initialize session -# If not required delete it -session = boto3.Session( - aws_access_key_id="...", - aws_secret_access_key="...", - aws_session_token="...", - region_name="us-east-1" -) +load_dotenv() -# 0b. Initialize client -client = session.client("bedrock-runtime") -# 1. Define graph configuration -config = { +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { "llm": { - "client": client, - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0, - "format": "json" - }, - "embeddings": { - "client": client, - "model": "bedrock/cohere.embed-multilingual-v3", + "api_key": openai_key, + "model": "gpt-4o", }, + "verbose": True, + "headless": False, } -# 2. Create graph instance -graph = SmartScraperGraph( - prompt="List me all the articles", - source="https://perinim.github.io/projects", - config=config +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config ) -# 3. Scrape away! -print(graph.run()) +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/xml_scraper_bedrock.py b/examples/bedrock/xml_scraper_bedrock.py new file mode 100644 index 00000000..cb4e24bc --- /dev/null +++ b/examples/bedrock/xml_scraper_bedrock.py @@ -0,0 +1,62 @@ +""" +Basic example of scraping pipeline using XMLScraperGraph from XML documents +""" + +import os +import json + +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the XMLScraperGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperGraph( + prompt="List me all the authors, title and genres of the books. Skip the preamble.", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") +