mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-12 21:01:54 +08:00
73 lines
1.8 KiB
Python
73 lines
1.8 KiB
Python
"""
|
|
Basic example of a scraping pipeline using SmartScraperGraph with an output schema and a custom IndexifyNode
|
|
"""
|
|
|
|
import os, json
|
|
from typing import List
|
|
|
|
from dotenv import load_dotenv
|
|
load_dotenv()
|
|
|
|
from pydantic import BaseModel, Field
|
|
from scrapegraphai.graphs import SmartScraperGraph
|
|
from scrapegraphai.integrations import IndexifyNode
|
|
|
|
|
|
# ************************************************
|
|
# Define the output schema for the graph
|
|
# ************************************************
|
|
|
|
class Image(BaseModel):
    """A single image entry of the output schema."""

    # Only the image URL is captured; the Field description is attached to
    # the schema field as metadata.
    url: str = Field(description="The url of the image")
|
|
|
|
class Images(BaseModel):
    """Top-level output schema: a collection of ``Image`` entries."""

    # Every image reported by the scraper, one ``Image`` per entry.
    images: List[Image]
|
|
|
|
# ************************************************
|
|
# Define the configuration for the graph
|
|
# ************************************************
|
|
|
|
# API key read from the environment; ``os.getenv`` yields ``None`` when the
# OPENAI_APIKEY variable is not set.
openai_key = os.getenv("OPENAI_APIKEY")

# Configuration dictionary handed to the graph constructor.
graph_config = {
    # Settings for the language model backing the graph.
    "llm": {
        "api_key": openai_key,
        "model": "openai/gpt-3.5-turbo",
    },
    "verbose": True,
    # NOTE(review): presumably makes the fetching browser run with a visible
    # window -- confirm against the scrapegraphai configuration docs.
    "headless": False,
}
|
|
|
|
# ************************************************
|
|
# Define the custom nodes for the graph
|
|
# ************************************************
|
|
|
|
# Custom node instance that is later appended to the scraping graph.
indexify_node = IndexifyNode(
    # Input expression naming the state keys the node consumes.
    # NOTE(review): presumably "&" means both "answer" and "img_urls" must be
    # present -- confirm against the library's input-expression rules.
    input="answer & img_urls",
    # Name of the state key this node is declared to produce.
    output=["is_indexed"],
    node_config={
        "verbose": True,
    },
)
|
|
|
|
# ************************************************
|
|
# Create the SmartScraperGraph instance
|
|
# ************************************************
|
|
|
|
# Build the scraper graph: the prompt states what to extract, ``source`` is
# the page to scrape, ``schema`` is the pydantic model describing the
# expected output, and ``config`` carries the LLM and runtime settings.
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the images with their url",
    source="https://giphy.com/",
    schema=Images,
    config=graph_config,
)
|
|
|
|
# Add the custom node to the graph
smart_scraper_graph.append_node(indexify_node)
|
|
|
|
# ************************************************
|
|
# Run the SmartScraperGraph
|
|
# ************************************************
|
|
|
|
# Execute the graph and pretty-print its result as indented JSON.
result = smart_scraper_graph.run()
print(json.dumps(result, indent=2))
|