From 8aa9103f02af92d9e1a780450daa7bb303afc150 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Sun, 24 Nov 2024 10:54:40 +0100
Subject: [PATCH] feat: add api integration

---
Review notes (free text placed after the three-dash line is not part of the
commit message or of any hunk):

- The new example stores the key under graph_config["llm"]["api_key"], while
  the smart_scraper_graph.py hunk of this patch builds the client from
  self.config.get("api_key").
  NOTE(review): self.config is assigned outside this patch -- confirm both
  refer to the same dictionary level.
- The smart_scraper_graph.py hunk returns the smart_scraper(...) result from a
  function whose docstring states "Returns: BaseGraph".
  NOTE(review): confirm that callers of that function accept this value.
- The example calls run() and get_execution_info() on the graph object.
  NOTE(review): neither is defined in this patch -- confirm both still work
  when the "scrapegraphai/smart-scraper" branch is taken.

 examples/scrapegraph-api/smart_scraper_api.py | 44 +++++++++++++++++++
 pyproject.toml                                |  3 +-
 requirements-dev.lock                         |  9 +++-
 requirements.lock                             |  9 +++-
 scrapegraphai/graphs/smart_scraper_graph.py   |  6 +++
 5 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 examples/scrapegraph-api/smart_scraper_api.py

diff --git a/examples/scrapegraph-api/smart_scraper_api.py b/examples/scrapegraph-api/smart_scraper_api.py
new file mode 100644
index 00000000..8a292ee9
--- /dev/null
+++ b/examples/scrapegraph-api/smart_scraper_api.py
@@ -0,0 +1,44 @@
+"""
+Basic example of scraping pipeline using SmartScraper
+"""
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "model": "scrapegraphai/smart-scraper",
+        "api_key": os.getenv("SCRAPEGRAPH_API_KEY")
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="Extract me all the articles",
+    source="https://www.wired.com",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(json.dumps(result, indent=4))
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/pyproject.toml b/pyproject.toml
index 86b4be43..f99e484b 100644
--- 
a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,8 @@ dependencies = [ "transformers>=4.44.2", "googlesearch-python>=1.2.5", "simpleeval>=1.0.0", - "async_timeout>=4.0.3" + "async_timeout>=4.0.3", + "scrapegraph-py>=0.0.3" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock index b2d32e41..7407894f 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -353,7 +353,7 @@ pyasn1==0.6.0 # via rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.8.2 +pydantic==2.10.1 # via burr # via fastapi # via fastapi-pagination @@ -368,7 +368,8 @@ pydantic==2.8.2 # via openai # via pydantic-settings # via qdrant-client -pydantic-core==2.20.1 + # via scrapegraph-py +pydantic-core==2.27.1 # via pydantic pydantic-settings==2.5.2 # via langchain-community @@ -396,6 +397,7 @@ python-dateutil==2.9.0.post0 # via pandas python-dotenv==1.0.1 # via pydantic-settings + # via scrapegraph-py # via scrapegraphai pytz==2024.1 # via pandas @@ -424,6 +426,7 @@ requests==2.32.3 # via langchain-community # via langsmith # via mistral-common + # via scrapegraph-py # via sphinx # via streamlit # via tiktoken @@ -439,6 +442,8 @@ s3transfer==0.10.2 # via boto3 safetensors==0.4.5 # via transformers +scrapegraph-py==0.0.3 + # via scrapegraphai semchunk==2.2.0 # via scrapegraphai sentencepiece==0.2.0 diff --git a/requirements.lock b/requirements.lock index 38be6e68..fd291ce8 100644 --- a/requirements.lock +++ b/requirements.lock @@ -257,7 +257,7 @@ pyasn1==0.6.0 # via rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.8.2 +pydantic==2.10.1 # via google-generativeai # via langchain # via langchain-aws @@ -269,7 +269,8 @@ pydantic==2.8.2 # via openai # via pydantic-settings # via qdrant-client -pydantic-core==2.20.1 + # via scrapegraph-py +pydantic-core==2.27.1 # via pydantic pydantic-settings==2.5.2 # via langchain-community @@ -286,6 +287,7 @@ python-dateutil==2.9.0.post0 # via pandas python-dotenv==1.0.1 # via pydantic-settings + # via scrapegraph-py # via 
scrapegraphai pytz==2024.1 # via pandas @@ -313,6 +315,7 @@ requests==2.32.3 # via langchain-community # via langsmith # via mistral-common + # via scrapegraph-py # via tiktoken # via transformers rpds-py==0.20.0 @@ -324,6 +327,8 @@ s3transfer==0.10.2 # via boto3 safetensors==0.4.5 # via transformers +scrapegraph-py==0.0.3 + # via scrapegraphai semchunk==2.2.0 # via scrapegraphai sentencepiece==0.2.0 diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 594420f5..f6316ec6 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -13,6 +13,7 @@ from ..nodes import ( ConditionalNode ) from ..prompts import REGEN_ADDITIONAL_INFO +from scrapegraph_py import ScrapeGraphClient, smart_scraper class SmartScraperGraph(AbstractGraph): """ @@ -59,6 +60,11 @@ class SmartScraperGraph(AbstractGraph): Returns: BaseGraph: A graph instance representing the web scraping workflow. """ + if self.llm_model == "scrapegraphai/smart-scraper": + client = ScrapeGraphClient(self.config.get("api_key")) + + result = smart_scraper(client, self.source, self.prompt) + return result fetch_node = FetchNode( input="url| local_dir",