feat: add api integration

This commit is contained in:
Marco Vinciguerra 2024-11-24 10:54:40 +01:00
parent 92bb8bb168
commit 8aa9103f02
5 changed files with 66 additions and 5 deletions

View File

@ -0,0 +1,44 @@
"""
Basic example of scraping pipeline using SmartScraper
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
# Load variables from a local .env file (if any) before reading the API key.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

# os.getenv returns None when the variable is unset; stop here with a clear
# message instead of handing a None API key to the graph.
api_key = os.getenv("SCRAPEGRAPH_API_KEY")
if not api_key:
    raise SystemExit(
        "SCRAPEGRAPH_API_KEY is not set: export it or add it to your .env file."
    )

graph_config = {
    "llm": {
        "model": "scrapegraphai/smart-scraper",
        "api_key": api_key,
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="Extract me all the articles",
    source="https://www.wired.com",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -43,7 +43,8 @@ dependencies = [
"transformers>=4.44.2",
"googlesearch-python>=1.2.5",
"simpleeval>=1.0.0",
"async_timeout>=4.0.3"
"async_timeout>=4.0.3",
"scrapegraph-py>=0.0.3"
]
license = "MIT"

View File

@ -353,7 +353,7 @@ pyasn1==0.6.0
# via rsa
pyasn1-modules==0.4.0
# via google-auth
pydantic==2.8.2
pydantic==2.10.1
# via burr
# via fastapi
# via fastapi-pagination
@ -368,7 +368,8 @@ pydantic==2.8.2
# via openai
# via pydantic-settings
# via qdrant-client
pydantic-core==2.20.1
# via scrapegraph-py
pydantic-core==2.27.1
# via pydantic
pydantic-settings==2.5.2
# via langchain-community
@ -396,6 +397,7 @@ python-dateutil==2.9.0.post0
# via pandas
python-dotenv==1.0.1
# via pydantic-settings
# via scrapegraph-py
# via scrapegraphai
pytz==2024.1
# via pandas
@ -424,6 +426,7 @@ requests==2.32.3
# via langchain-community
# via langsmith
# via mistral-common
# via scrapegraph-py
# via sphinx
# via streamlit
# via tiktoken
@ -439,6 +442,8 @@ s3transfer==0.10.2
# via boto3
safetensors==0.4.5
# via transformers
scrapegraph-py==0.0.3
# via scrapegraphai
semchunk==2.2.0
# via scrapegraphai
sentencepiece==0.2.0

View File

@ -257,7 +257,7 @@ pyasn1==0.6.0
# via rsa
pyasn1-modules==0.4.0
# via google-auth
pydantic==2.8.2
pydantic==2.10.1
# via google-generativeai
# via langchain
# via langchain-aws
@ -269,7 +269,8 @@ pydantic==2.8.2
# via openai
# via pydantic-settings
# via qdrant-client
pydantic-core==2.20.1
# via scrapegraph-py
pydantic-core==2.27.1
# via pydantic
pydantic-settings==2.5.2
# via langchain-community
@ -286,6 +287,7 @@ python-dateutil==2.9.0.post0
# via pandas
python-dotenv==1.0.1
# via pydantic-settings
# via scrapegraph-py
# via scrapegraphai
pytz==2024.1
# via pandas
@ -313,6 +315,7 @@ requests==2.32.3
# via langchain-community
# via langsmith
# via mistral-common
# via scrapegraph-py
# via tiktoken
# via transformers
rpds-py==0.20.0
@ -324,6 +327,8 @@ s3transfer==0.10.2
# via boto3
safetensors==0.4.5
# via transformers
scrapegraph-py==0.0.3
# via scrapegraphai
semchunk==2.2.0
# via scrapegraphai
sentencepiece==0.2.0

View File

@ -13,6 +13,7 @@ from ..nodes import (
ConditionalNode
)
from ..prompts import REGEN_ADDITIONAL_INFO
from scrapegraph_py import ScrapeGraphClient, smart_scraper
class SmartScraperGraph(AbstractGraph):
"""
@ -59,6 +60,11 @@ class SmartScraperGraph(AbstractGraph):
Returns:
BaseGraph: A graph instance representing the web scraping workflow.
"""
if self.llm_model == "scrapegraphai/smart-scraper":
client = ScrapeGraphClient(self.config.get("api_key"))
result = smart_scraper(client, self.source, self.prompt)
return result
fetch_node = FetchNode(
input="url| local_dir",