diff --git a/examples/anthropic/smart_scraper_schema_haiku.py b/examples/anthropic/smart_scraper_schema_haiku.py new file mode 100644 index 00000000..e4f7d5e6 --- /dev/null +++ b/examples/anthropic/smart_scraper_schema_haiku.py @@ -0,0 +1,77 @@ +""" +Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + + +# required environment variables in .env +# HUGGINGFACEHUB_API_TOKEN +# ANTHROPIC_API_KEY +load_dotenv() + +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') +# ************************************************ +# Initialize the model instances +# ************************************************ + + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +schema= """ + { + "Projects": [ + "Project #": + { + "title": "...", + "description": "...", + }, + "Project #": + { + "title": "...", + "description": "...", + } + ] + } +""" + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "claude-3-haiku-20240307", + "max_tokens": 4000}, + "embeddings": {"model_instance": embedder_model_instance} +} + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + schema=schema, + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/smart_scraper_schema_bedrock.py b/examples/bedrock/smart_scraper_schema_bedrock.py new file mode 100644 index 00000000..3bcb8a31 --- /dev/null +++ b/examples/bedrock/smart_scraper_schema_bedrock.py @@ -0,0 +1,67 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +schema= """ + { + "Projects": [ + "Project #": + { + "title": "...", + "description": "...", + }, + "Project #": + { + "title": "...", + "description": "...", + } + ] + } +""" + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-4o", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + schema=schema, + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/smart_scraper_schema_deepseek.py b/examples/deepseek/smart_scraper_schema_deepseek.py new file mode 100644 index 00000000..c83c6e9d --- /dev/null +++ b/examples/deepseek/smart_scraper_schema_deepseek.py @@ -0,0 +1,68 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +schema= """ + { + "Projects": [ + "Project #": + { + "title": "...", + "description": "...", + }, + "Project #": + { + "title": "...", + "description": "...", + } + ] + } +""" + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek-chat", + "openai_api_key": deepseek_key, + "openai_api_base": 'https://api.deepseek.com/v1', + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + schema=schema, + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/gemini/smart_scraper_schema_gemini.py b/examples/gemini/smart_scraper_schema_gemini.py new file mode 100644 index 00000000..157d9542 --- /dev/null +++ b/examples/gemini/smart_scraper_schema_gemini.py @@ -0,0 +1,64 @@ +""" +Basic example of scraping pipeline using SmartScraper with schema +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.utils import prettify_exec_info +from scrapegraphai.graphs import SmartScraperGraph +load_dotenv() + +# ************************************************ +# Define the output schema for the graph +# ************************************************ +schema= """ + { + "Projects": [ + "Project #": + { + "title": "...", + "description": "...", + }, + "Project #": + { + "title": "...", + "description": "...", + } + ] + } +""" + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "gemini-pro", + }, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + # also accepts a string with the already downloaded HTML code + source="https://www.wired.com", + schema=schema, + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/smart_scraper_schema_groq_openai.py b/examples/groq/smart_scraper_schema_groq_openai.py new file mode 100644 index 00000000..321c71b8 --- /dev/null +++ b/examples/groq/smart_scraper_schema_groq_openai.py @@ -0,0 +1,75 @@ +""" +Basic example of scraping pipeline using SmartScraper with schema +""" + +import os, json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +schema= """ + { + "Projects": [ + "Project #": + { + "title": "...", + "description": "...", + }, + "Project #": + { + "title": "...", + "description": "...", + } + ] + } +""" + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +groq_key = os.getenv("GROQ_APIKEY") +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "embeddings": { + "api_key": openai_key, + "model": "openai", + }, + "headless": False +} + + + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + schema=schema, + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py new file mode 100644 index 00000000..255e6e52 --- /dev/null +++ b/examples/local_models/smart_scraper_schema_ollama.py @@ -0,0 +1,55 @@ +""" +Basic example of scraping pipeline using SmartScraper with schema +""" +import json +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +# ************************************************ +# Define the configuration for the graph +# ************************************************ +schema= """ + { + "Projects": [ + "Project #": + { + "title": "...", + "description": "...", + }, + "Project #": + { + "title": "...", + "description": "...", + } + ] + } +""" + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + schema=schema, + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/openai/smart_scraper_schema_openai.py index a4b28fc0..65448821 100644 --- a/examples/openai/smart_scraper_schema_openai.py +++ b/examples/openai/smart_scraper_schema_openai.py @@ -1,5 +1,5 @@ """ -Basic example of scraping pipeline using SmartScraper +Basic example of scraping pipeline using SmartScraper with schema """ import os, json