diff --git a/examples/ernie/smart_scraper_schema_ernie.py b/examples/ernie/smart_scraper_schema_ernie.py index 65448821..64a74937 100644 --- a/examples/ernie/smart_scraper_schema_ernie.py +++ b/examples/ernie/smart_scraper_schema_ernie.py @@ -2,32 +2,31 @@ Basic example of scraping pipeline using SmartScraper with schema """ -import os, json +import json +import os +from typing import Dict + from dotenv import load_dotenv +from pydantic import BaseModel + from scrapegraphai.graphs import SmartScraperGraph + load_dotenv() # ************************************************ # Define the output schema for the graph # ************************************************ -schema= """ - { - "Projects": [ - "Project #": - { - "title": "...", - "description": "...", - }, - "Project #": - { - "title": "...", - "description": "...", - } - ] - } -""" + +class Project(BaseModel): + title: str + description: str + + +class Projects(BaseModel): + Projects: Dict[str, Project] + # ************************************************ # Define the configuration for the graph @@ -37,7 +36,7 @@ openai_key = os.getenv("OPENAI_APIKEY") graph_config = { "llm": { - "api_key":openai_key, + "api_key": openai_key, "model": "gpt-3.5-turbo", }, "verbose": True, @@ -51,8 +50,8 @@ graph_config = { smart_scraper_graph = SmartScraperGraph( prompt="List me all the projects with their description", source="https://perinim.github.io/projects/", - schema=schema, - config=graph_config + schema=Projects, + config=graph_config, ) result = smart_scraper_graph.run() diff --git a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py index 1e0c94d6..784079e4 100644 --- a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py +++ b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py @@ -4,6 +4,9 @@ Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key import os from dotenv import load_dotenv +from typing import Dict + +from pydantic import BaseModel from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info from langchain_community.llms import HuggingFaceEndpoint @@ -13,22 +16,12 @@ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings # Define the output schema for the graph # ************************************************ -schema= """ - { - "Projects": [ - "Project #": - { - "title": "...", - "description": "...", - }, - "Project #": - { - "title": "...", - "description": "...", - } - ] - } -""" +class Project(BaseModel): + title: str + description: str + +class Projects(BaseModel): + Projects: Dict[str, Project] ## required environment variable in .env #HUGGINGFACEHUB_API_TOKEN @@ -61,7 +54,7 @@ graph_config = { smart_scraper_graph = SmartScraperGraph( prompt="List me all the projects with their description", source="https://perinim.github.io/projects/", - schema=schema, + schema=Projects, config=graph_config ) result = smart_scraper_graph.run() diff --git a/examples/mixed_models/smart_scraper_schema_groq_openai.py b/examples/mixed_models/smart_scraper_schema_groq_openai.py index 321c71b8..f177cb61 100644 --- a/examples/mixed_models/smart_scraper_schema_groq_openai.py +++ b/examples/mixed_models/smart_scraper_schema_groq_openai.py @@ -2,8 +2,13 @@ Basic example of scraping pipeline using SmartScraper with schema """ -import os, json +import json +import os +from typing import Dict, List + from dotenv import load_dotenv +from pydantic import BaseModel + from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info @@ -13,22 +18,12 @@ load_dotenv() # Define the output schema for the graph # ************************************************ -schema= """ - { - "Projects": [ - "Project #": - { - "title": "...", - "description": "...", - }, - "Project #": - { - "title": "...", - "description": "...", - } - ] - } -""" +class Project(BaseModel): + title: str + description: str + +class Projects(BaseModel): + Projects: Dict[str, Project] # ************************************************ # Define the configuration for the graph @@ -60,7 +55,7 @@ smart_scraper_graph = SmartScraperGraph( prompt="List me all the projects with their description.", # also accepts a string with the already downloaded HTML code source="https://perinim.github.io/projects/", - schema=schema, + schema=Projects, config=graph_config ) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index ed61255c..ef188b27 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -39,7 +39,7 @@ class AbstractGraph(ABC): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. diff --git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index fd15f49a..716e9aca 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -5,6 +5,8 @@ CSVScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional +from pydantic import BaseModel + from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .csv_scraper_graph import CSVScraperGraph @@ -32,7 +34,7 @@ class CSVScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( @@ -42,7 +44,7 @@ class CSVScraperMultiGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index e9e41771..df04c9ce 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -34,7 +34,7 @@ class DeepScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -45,7 +45,7 @@ class DeepScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> deep_scraper = DeepScraperGraph( diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 09a5f02e..4165a194 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -23,7 +23,7 @@ class JSONScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -34,7 +34,7 @@ class JSONScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> json_scraper = JSONScraperGraph( diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index 2824c416..48fd8217 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -33,7 +33,7 @@ class JSONScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index a5eefad2..5b1ad30b 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -29,7 +29,7 @@ class OmniScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -41,7 +41,7 @@ class OmniScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> omni_scraper = OmniScraperGraph( diff --git a/scrapegraphai/graphs/omni_search_graph.py b/scrapegraphai/graphs/omni_search_graph.py index df525949..b6f6df59 100644 --- a/scrapegraphai/graphs/omni_search_graph.py +++ b/scrapegraphai/graphs/omni_search_graph.py @@ -34,7 +34,7 @@ class OmniSearchGraph(AbstractGraph): Args: prompt (str): The user prompt to search the internet. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> omni_search_graph = OmniSearchGraph( diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 41099d8b..89d8018c 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -26,7 +26,7 @@ class PDFScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -38,7 +38,7 @@ class PDFScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> pdf_scraper = PDFScraperGraph( diff --git a/scrapegraphai/graphs/pdf_scraper_multi_graph.py b/scrapegraphai/graphs/pdf_scraper_multi_graph.py index e9b5660b..86b2477f 100644 --- a/scrapegraphai/graphs/pdf_scraper_multi_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_multi_graph.py @@ -34,7 +34,7 @@ class PdfScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index ce3fa319..83bef2ab 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -23,7 +23,7 @@ class ScriptCreatorGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -36,7 +36,7 @@ class ScriptCreatorGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> script_creator = ScriptCreatorGraph( diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py b/scrapegraphai/graphs/script_creator_multi_graph.py index 2b36f4ed..a415a82c 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -5,6 +5,8 @@ ScriptCreatorMultiGraph Module from copy import copy, deepcopy from typing import List, Optional +from pydantic import BaseModel + from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .script_creator_graph import ScriptCreatorGraph @@ -30,7 +32,7 @@ class ScriptCreatorMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> script_graph = ScriptCreatorMultiGraph( ... "What is Chioggia famous for?", @@ -41,7 +43,7 @@ class ScriptCreatorMultiGraph(AbstractGraph): >>> result = script_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 6bece062..7efcccc2 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -33,7 +33,7 @@ class SearchGraph(AbstractGraph): Args: prompt (str): The user prompt to search the internet. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = SearchGraph( diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 9ee0c3cc..cfbfc000 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -26,7 +26,7 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -37,7 +37,7 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> smart_scraper = SmartScraperGraph( diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 996beff1..84e028fc 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -33,7 +33,7 @@ class SmartScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 1058d127..4816a154 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -28,7 +28,7 @@ class SpeechGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. @@ -39,7 +39,7 @@ class SpeechGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> speech_graph = SpeechGraph( diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index dbab0b73..4513422b 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -24,7 +24,7 @@ class XMLScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -36,7 +36,7 @@ class XMLScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. - schema (str): The schema for the graph output. + schema (BaseModel): The schema for the graph output. Example: >>> xml_scraper = XMLScraperGraph( diff --git a/scrapegraphai/graphs/xml_scraper_multi_graph.py b/scrapegraphai/graphs/xml_scraper_multi_graph.py index e1f4423c..da772647 100644 --- a/scrapegraphai/graphs/xml_scraper_multi_graph.py +++ b/scrapegraphai/graphs/xml_scraper_multi_graph.py @@ -34,7 +34,7 @@ class XMLScraperMultiGraph(AbstractGraph): prompt (str): The user prompt to search the internet. source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. - schema (Optional[str]): The schema for the graph output. + schema (Optional[BaseModel]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph(