diff --git a/examples/knowledge_graph/kg_custom_graph.py b/examples/knowledge_graph/kg_custom_graph.py new file mode 100644 index 00000000..b235af17 --- /dev/null +++ b/examples/knowledge_graph/kg_custom_graph.py @@ -0,0 +1,134 @@ +""" +Example of custom graph for creating a knowledge graph +""" + +import os, json +from dotenv import load_dotenv + +from langchain_openai import OpenAIEmbeddings +from scrapegraphai.models import OpenAI +from scrapegraphai.graphs import BaseGraph, SmartScraperGraph +from scrapegraphai.nodes import GraphIteratorNode, MergeAnswersNode, KnowledgeGraphNode + +load_dotenv() + +# ************************************************ +# Define the output schema +# ************************************************ + +schema= """{ + "Job Postings": { + "Company x": [ + { + "title": "...", + "description": "...", + "location": "...", + "date_posted": "..", + "requirements": ["...", "...", "..."] + }, + { + "title": "...", + "description": "...", + "location": "...", + "date_posted": "..", + "requirements": ["...", "...", "..."] + } + ], + "Company y": [ + { + "title": "...", + "description": "...", + "location": "...", + "date_posted": "..", + "requirements": ["...", "...", "..."] + } + ] + } +}""" + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-4o", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Define the graph nodes +# ************************************************ + +llm_model = OpenAI(graph_config["llm"]) +embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key) + +smart_scraper_instance = SmartScraperGraph( + prompt="", + source="", + config=graph_config, +) + +# ************************************************ +# Define the graph nodes +# ************************************************ + +graph_iterator_node = GraphIteratorNode( + input="user_prompt & urls", + output=["results"], + node_config={ + "graph_instance": smart_scraper_instance, + } +) + +merge_answers_node = MergeAnswersNode( + input="user_prompt & results", + output=["answer"], + node_config={ + "llm_model": llm_model, + "schema": schema + } +) + +knowledge_graph_node = KnowledgeGraphNode( + input="user_prompt & answer", + output=["kg"], + node_config={ + "llm_model": llm_model, + } +) + +graph = BaseGraph( + nodes=[ + graph_iterator_node, + merge_answers_node, + knowledge_graph_node + ], + edges=[ + (graph_iterator_node, merge_answers_node), + (merge_answers_node, knowledge_graph_node) + ], + entry_point=graph_iterator_node +) + +# ************************************************ +# Execute the graph +# ************************************************ + +result, execution_info = graph.execute({ + "user_prompt": "List me all the Machine Learning Engineer job postings", + "urls": [ + "https://www.linkedin.com/jobs/machine-learning-engineer-offerte-di-lavoro/?currentJobId=3889037104&originalSubdomain=it", + "https://www.glassdoor.com/Job/italy-machine-learning-engineer-jobs-SRCH_IL.0,5_IN120_KO6,31.html", + "https://it.indeed.com/jobs?q=ML+engineer&vjk=3c2e6d27601ffaaa" + ], +}) + +# get the answer from the result +result = result.get("answer", "No answer found.") +print(json.dumps(result, indent=4)) diff --git a/examples/openai/custom_graph_openai.py b/examples/openai/custom_graph_openai.py index 6e92565b..baaeaa3f 100644 --- a/examples/openai/custom_graph_openai.py +++ b/examples/openai/custom_graph_openai.py @@ -46,7 +46,7 @@ robot_node = RobotsNode( fetch_node = FetchNode( input="url | local_dir", - output=["doc"], + output=["doc", "link_urls", "img_urls"], node_config={ "verbose": True, "headless": True, diff --git a/examples/openai/multiple_search_openai.py b/examples/openai/multiple_search_openai.py deleted file mode 100644 index abc70803..00000000 --- a/examples/openai/multiple_search_openai.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import MultipleSearchGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - - -schema= """{ - "Job Postings": { - "Company x": [ - { - "title": "...", - "description": "...", - "location": "...", - "date_posted": "..", - "requirements": ["...", "...", "..."] - }, - { - "title": "...", - "description": "...", - "location": "...", - "date_posted": "..", - "requirements": ["...", "...", "..."] - } - ], - "Company y": [ - { - "title": "...", - "description": "...", - "location": "...", - "date_posted": "..", - "requirements": ["...", "...", "..."] - } - ] - } -}""" - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "gpt-4o", - }, - "verbose": True, - "headless": False, - "schema": schema, -} - - - -multiple_search_graph = MultipleSearchGraph( - prompt="List me all the projects with their description", - source= [ - "https://www.linkedin.com/jobs/machine-learning-engineer-offerte-di-lavoro/?currentJobId=3889037104&originalSubdomain=it", - "https://www.glassdoor.com/Job/italy-machine-learning-engineer-jobs-SRCH_IL.0,5_IN120_KO6,31.html", - "https://it.indeed.com/jobs?q=ML+engineer&vjk=3c2e6d27601ffaaa" - ], - config=graph_config, -) - -result = multiple_search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = multiple_search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/omni_scraper_openai.py b/examples/openai/omni_scraper_openai.py index 8847fbbc..1d1d86ba 100644 --- a/examples/openai/omni_scraper_openai.py +++ b/examples/openai/omni_scraper_openai.py @@ -19,7 +19,7 @@ openai_key = os.getenv("OPENAI_APIKEY") graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-4-turbo", + "model": "gpt-4o", }, "verbose": True, "headless": True, diff --git a/examples/openai/omni_search_graph_openai.py b/examples/openai/omni_search_graph_openai.py index 66a7cfcc..ed0f8f3c 100644 --- a/examples/openai/omni_search_graph_openai.py +++ b/examples/openai/omni_search_graph_openai.py @@ -20,7 +20,7 @@ graph_config = { "model": "gpt-4o", }, "max_results": 2, - "max_images": 5, + "max_images": 1, "verbose": True, } diff --git a/examples/openai/smart_scraper_multi_openai.py b/examples/openai/smart_scraper_multi_openai.py new file mode 100644 index 00000000..ddfc6239 --- /dev/null +++ b/examples/openai/smart_scraper_multi_openai.py @@ -0,0 +1,41 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os, json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-4o", + }, + "verbose": True, + "headless": False, +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/openai/smart_scraper_schema_openai.py new file mode 100644 index 00000000..a4b28fc0 --- /dev/null +++ b/examples/openai/smart_scraper_schema_openai.py @@ -0,0 +1,59 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os, json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph + +load_dotenv() + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +schema= """ + { + "Projects": [ + "Project #": + { + "title": "...", + "description": "...", + }, + "Project #": + { + "title": "...", + "description": "...", + } + ] + } +""" + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key":openai_key, + "model": "gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + schema=schema, + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) diff --git a/requirements-dev.lock b/requirements-dev.lock index bcfe71ce..84a8a445 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -45,6 +45,10 @@ certifi==2024.2.2 # via requests charset-normalizer==3.3.2 # via requests +colorama==0.4.6 + # via ipython + # via pytest + # via tqdm dataclasses-json==0.6.6 # via langchain # via langchain-community @@ -100,6 +104,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.5.0 # via langchain-groq grpcio==1.63.0 @@ -212,8 +217,6 @@ pandas==2.2.2 # via scrapegraphai parso==0.8.4 # via jedi -pexpect==4.9.0 - # via ipython playwright==1.43.0 # via scrapegraphai pluggy==1.5.0 @@ -230,8 +233,6 @@ protobuf==4.25.3 # via googleapis-common-protos # via grpcio-status # via proto-plus -ptyprocess==0.7.0 - # via pexpect pure-eval==0.2.2 # via stack-data pyasn1==0.6.0 diff --git a/requirements.lock b/requirements.lock index 1176355d..f33598cf 100644 --- a/requirements.lock +++ b/requirements.lock @@ -45,6 +45,9 @@ certifi==2024.2.2 # via requests charset-normalizer==3.3.2 # via requests +colorama==0.4.6 + # via ipython + # via tqdm dataclasses-json==0.6.6 # via langchain # via langchain-community @@ -99,6 +102,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.5.0 # via langchain-groq grpcio==1.63.0 @@ -208,8 +212,6 @@ pandas==2.2.2 # via scrapegraphai parso==0.8.4 # via jedi -pexpect==4.9.0 - # via ipython playwright==1.43.0 # via scrapegraphai prompt-toolkit==3.0.43 @@ -224,8 +226,6 @@ protobuf==4.25.3 # via googleapis-common-protos # via grpcio-status # via proto-plus -ptyprocess==0.7.0 - # via pexpect pure-eval==0.2.2 # via stack-data pyasn1==0.6.0 diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 7d0d2621..994b2e3a 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -15,4 +15,4 @@ from .csv_scraper_graph import CSVScraperGraph from .pdf_scraper_graph import PDFScraperGraph from .omni_scraper_graph import OmniScraperGraph from .omni_search_graph import OmniSearchGraph -from .multiple_search_graph import MultipleSearchGraph +from .smart_scraper_multi_graph import SmartScraperMultiGraph diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index e1cf77f7..1a96aa97 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -7,10 +7,11 @@ from langchain_aws import BedrockEmbeddings from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings -from ..helpers import models_tokens -from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings +from ..helpers import models_tokens +from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek + class AbstractGraph(ABC): """ @@ -19,6 +20,7 @@ class AbstractGraph(ABC): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -29,6 +31,7 @@ class AbstractGraph(ABC): prompt (str): The prompt for the graph. config (dict): Configuration parameters for the graph. source (str, optional): The source of the graph. + schema (str, optional): The schema for the graph output. Example: >>> class MyGraph(AbstractGraph): @@ -40,11 +43,12 @@ class AbstractGraph(ABC): >>> result = my_graph.run() """ - def __init__(self, prompt: str, config: dict, source: Optional[str] = None): + def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[str] = None): self.prompt = prompt self.source = source self.config = config + self.schema = schema self.llm_model = self._create_llm(config["llm"], chat=True) self.embedder_model = self._create_default_embedder(llm_config=config["llm"] ) if "embeddings" not in config else self._create_embedder( @@ -61,14 +65,14 @@ class AbstractGraph(ABC): self.headless = True if config is None else config.get( "headless", True) self.loader_kwargs = config.get("loader_kwargs", {}) - self.schema = config.get("schema", None) - common_params = {"headless": self.headless, - "verbose": self.verbose, - "loader_kwargs": self.loader_kwargs, - "llm_model": self.llm_model, - "embedder_model": self.embedder_model, - "schema": self.schema} + common_params = { + "headless": self.headless, + "verbose": self.verbose, + "loader_kwargs": self.loader_kwargs, + "llm_model": self.llm_model, + "embedder_model": self.embedder_model + } self.set_common_params(common_params, overwrite=False) diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index 59d74e65..6ae8cbcb 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -1,14 +1,18 @@ """ Module for creating the smart scraper """ + +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, RAGNode, GenerateAnswerCSVNode ) -from .abstract_graph import AbstractGraph class CSVScraperGraph(AbstractGraph): @@ -17,11 +21,11 @@ class CSVScraperGraph(AbstractGraph): information from web pages using a natural language model to interpret and answer prompts. """ - def __init__(self, prompt: str, source: str, config: dict): + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): """ Initializes the CSVScraperGraph with a prompt, source, and configuration. """ - super().__init__(prompt, config, source) + super().__init__(prompt, config, source, schema) self.input_key = "csv" if source.endswith("csv") else "csv_dir" @@ -53,6 +57,7 @@ class CSVScraperGraph(AbstractGraph): output=["answer"], node_config={ "llm_model": self.llm_model, + "schema": self.schema, } ) diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index 6d93ccca..b7e73d09 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -2,7 +2,11 @@ DeepScraperGraph Module """ +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, SearchLinkNode, @@ -12,7 +16,6 @@ from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) -from .abstract_graph import AbstractGraph class DeepScraperGraph(AbstractGraph): @@ -30,15 +33,19 @@ class DeepScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. headless (bool): A flag indicating whether to run the graph in headless mode. + Args: prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. + Example: >>> deep_scraper = DeepScraperGraph( ... "List me all the job titles and detailed job description.", @@ -49,8 +56,10 @@ class DeepScraperGraph(AbstractGraph): ) """ - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): + + super().__init__(prompt, config, source, schema) + self.input_key = "url" if source.startswith("http") else "local_dir" def _create_repeated_graph(self) -> BaseGraph: @@ -84,7 +93,8 @@ class DeepScraperGraph(AbstractGraph): input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ - "llm_model": self.llm_model + "llm_model": self.llm_model, + "schema": self.schema } ) search_node = SearchLinkNode( @@ -108,6 +118,7 @@ class DeepScraperGraph(AbstractGraph): output=["answer"], node_config={ "llm_model": self.llm_model, + "schema": self.schema } ) diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index 9a272a03..5b263f70 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -2,14 +2,17 @@ JSONScraperGraph Module """ +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, RAGNode, GenerateAnswerNode ) -from .abstract_graph import AbstractGraph class JSONScraperGraph(AbstractGraph): @@ -20,6 +23,7 @@ class JSONScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -30,6 +34,7 @@ class JSONScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. Example: >>> json_scraper = JSONScraperGraph( @@ -40,8 +45,8 @@ class JSONScraperGraph(AbstractGraph): >>> result = json_scraper.run() """ - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): + super().__init__(prompt, config, source, schema) self.input_key = "json" if source.endswith("json") else "json_dir" @@ -76,7 +81,8 @@ class JSONScraperGraph(AbstractGraph): input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ - "llm_model": self.llm_model + "llm_model": self.llm_model, + "schema": self.schema } ) diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 92aa6cce..7bc5f761 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -2,7 +2,11 @@ OmniScraperGraph Module """ +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, @@ -10,8 +14,8 @@ from ..nodes import ( RAGNode, GenerateAnswerOmniNode ) -from scrapegraphai.models import OpenAIImageToText -from .abstract_graph import AbstractGraph + +from ..models import OpenAIImageToText class OmniScraperGraph(AbstractGraph): @@ -24,6 +28,7 @@ class OmniScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -35,6 +40,7 @@ class OmniScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. Example: >>> omni_scraper = OmniScraperGraph( @@ -46,11 +52,11 @@ class OmniScraperGraph(AbstractGraph): ) """ - def __init__(self, prompt: str, source: str, config: dict): + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): self.max_images = 5 if config is None else config.get("max_images", 5) - super().__init__(prompt, config, source) + super().__init__(prompt, config, source, schema) self.input_key = "url" if source.startswith("http") else "local_dir" @@ -96,7 +102,8 @@ class OmniScraperGraph(AbstractGraph): input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc", output=["answer"], node_config={ - "llm_model": self.llm_model + "llm_model": self.llm_model, + "schema": self.schema } ) diff --git a/scrapegraphai/graphs/omni_search_graph.py b/scrapegraphai/graphs/omni_search_graph.py index c428fc98..10c3c653 100644 --- a/scrapegraphai/graphs/omni_search_graph.py +++ b/scrapegraphai/graphs/omni_search_graph.py @@ -3,15 +3,17 @@ OmniSearchGraph Module """ from copy import copy, deepcopy +from typing import Optional from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .omni_scraper_graph import OmniScraperGraph + from ..nodes import ( SearchInternetNode, GraphIteratorNode, MergeAnswersNode ) -from .abstract_graph import AbstractGraph -from .omni_scraper_graph import OmniScraperGraph class OmniSearchGraph(AbstractGraph): @@ -31,6 +33,7 @@ class OmniSearchGraph(AbstractGraph): Args: prompt (str): The user prompt to search the internet. config (dict): Configuration parameters for the graph. + schema (Optional[str]): The schema for the graph output. Example: >>> omni_search_graph = OmniSearchGraph( @@ -40,7 +43,7 @@ class OmniSearchGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, config: dict): + def __init__(self, prompt: str, config: dict, schema: Optional[str] = None): self.max_results = config.get("max_results", 3) @@ -49,7 +52,7 @@ class OmniSearchGraph(AbstractGraph): else: self.copy_config = deepcopy(config) - super().__init__(prompt, config) + super().__init__(prompt, config, schema) def _create_graph(self) -> BaseGraph: """ @@ -94,6 +97,7 @@ class OmniSearchGraph(AbstractGraph): output=["answer"], node_config={ "llm_model": self.llm_model, + "schema": self.schema } ) diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 58a54ab0..af9fe7d4 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -2,14 +2,17 @@ PDFScraperGraph Module """ +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, RAGNode, GenerateAnswerNode ) -from .abstract_graph import AbstractGraph class PDFScraperGraph(AbstractGraph): @@ -21,6 +24,7 @@ class PDFScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -32,6 +36,7 @@ class PDFScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. Example: >>> pdf_scraper = PDFScraperGraph( @@ -42,8 +47,8 @@ class PDFScraperGraph(AbstractGraph): >>> result = pdf_scraper.run() """ - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): + super().__init__(prompt, config, source, schema) self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir" @@ -79,6 +84,7 @@ class PDFScraperGraph(AbstractGraph): output=["answer"], node_config={ "llm_model": self.llm_model, + "schema": self.schema, } ) diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index 773ab2b0..476c440e 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -2,13 +2,16 @@ ScriptCreatorGraph Module """ +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, GenerateScraperNode ) -from .abstract_graph import AbstractGraph class ScriptCreatorGraph(AbstractGraph): @@ -19,6 +22,7 @@ class ScriptCreatorGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -31,6 +35,7 @@ class ScriptCreatorGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. Example: >>> script_creator = ScriptCreatorGraph( @@ -41,11 +46,11 @@ class ScriptCreatorGraph(AbstractGraph): >>> result = script_creator.run() """ - def __init__(self, prompt: str, source: str, config: dict): + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): self.library = config['library'] - super().__init__(prompt, config, source) + super().__init__(prompt, config, source, schema) self.input_key = "url" if source.startswith("http") else "local_dir" @@ -65,14 +70,16 @@ class ScriptCreatorGraph(AbstractGraph): input="doc", output=["parsed_doc"], node_config={"chunk_size": self.model_token, - "verbose": self.verbose, "parse_html": False } ) generate_scraper_node = GenerateScraperNode( input="user_prompt & (doc)", output=["answer"], - node_config={"llm_model": self.llm_model}, + node_config={ + "llm_model": self.llm_model, + "schema": self.schema, + }, library=self.library, website=self.source ) diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index a9f2824a..c4564a15 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -3,15 +3,17 @@ SearchGraph Module """ from copy import copy, deepcopy +from typing import Optional from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .smart_scraper_graph import SmartScraperGraph + from ..nodes import ( SearchInternetNode, GraphIteratorNode, MergeAnswersNode ) -from .abstract_graph import AbstractGraph -from .smart_scraper_graph import SmartScraperGraph class SearchGraph(AbstractGraph): @@ -30,6 +32,7 @@ class SearchGraph(AbstractGraph): Args: prompt (str): The user prompt to search the internet. config (dict): Configuration parameters for the graph. + schema (Optional[str]): The schema for the graph output. Example: >>> search_graph = SearchGraph( @@ -39,7 +42,7 @@ class SearchGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, config: dict): + def __init__(self, prompt: str, config: dict, schema: Optional[str] = None): self.max_results = config.get("max_results", 3) @@ -48,7 +51,7 @@ class SearchGraph(AbstractGraph): else: self.copy_config = deepcopy(config) - super().__init__(prompt, config) + super().__init__(prompt, config, schema) def _create_graph(self) -> BaseGraph: """ @@ -93,6 +96,7 @@ class SearchGraph(AbstractGraph): output=["answer"], node_config={ "llm_model": self.llm_model, + "schema": self.schema } ) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 8a6d03e2..ee230695 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -2,14 +2,17 @@ SmartScraperGraph Module """ +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, RAGNode, GenerateAnswerNode ) -from .abstract_graph import AbstractGraph class SmartScraperGraph(AbstractGraph): @@ -22,6 +25,7 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -32,6 +36,7 @@ class SmartScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. Example: >>> smart_scraper = SmartScraperGraph( @@ -43,8 +48,8 @@ class SmartScraperGraph(AbstractGraph): ) """ - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): + super().__init__(prompt, config, source, schema) self.input_key = "url" if source.startswith("http") else "local_dir" @@ -82,7 +87,7 @@ class SmartScraperGraph(AbstractGraph): output=["answer"], node_config={ "llm_model": self.llm_model, - "schema": self.config.get("schema", None), + "schema": self.schema, } ) diff --git a/scrapegraphai/graphs/multiple_search_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py similarity index 82% rename from scrapegraphai/graphs/multiple_search_graph.py rename to scrapegraphai/graphs/smart_scraper_multi_graph.py index 95cc1dda..100957b5 100644 --- a/scrapegraphai/graphs/multiple_search_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -1,25 +1,25 @@ """ -MultipleSearchGraph Module +SmartScraperMultiGraph Module """ from copy import copy, deepcopy +from typing import List, Optional from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .smart_scraper_graph import SmartScraperGraph + from ..nodes import ( GraphIteratorNode, MergeAnswersNode, KnowledgeGraphNode ) -from .abstract_graph import AbstractGraph -from .smart_scraper_graph import SmartScraperGraph - -from typing import List, Optional -class MultipleSearchGraph(AbstractGraph): +class SmartScraperMultiGraph(AbstractGraph): """ - MultipleSearchGraph is a scraping pipeline that searches the internet for answers to a given prompt. - It only requires a user prompt to search the internet and generate an answer. + SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + It only requires a user prompt and a list of URLs. Attributes: prompt (str): The user prompt to search the internet. @@ -31,7 +31,9 @@ class MultipleSearchGraph(AbstractGraph): Args: prompt (str): The user prompt to search the internet. + source (List[str]): The source of the graph. config (dict): Configuration parameters for the graph. + schema (Optional[str]): The schema for the graph output. Example: >>> search_graph = MultipleSearchGraph( @@ -41,7 +43,7 @@ class MultipleSearchGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict): + def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None): self.max_results = config.get("max_results", 3) @@ -50,7 +52,7 @@ class MultipleSearchGraph(AbstractGraph): else: self.copy_config = deepcopy(config) - super().__init__(prompt, config, source) + super().__init__(prompt, config, source, schema) def _create_graph(self) -> BaseGraph: """ @@ -87,15 +89,7 @@ class MultipleSearchGraph(AbstractGraph): output=["answer"], node_config={ "llm_model": self.llm_model, - "schema": self.config.get("schema", None), - } - ) - - knowledge_graph_node = KnowledgeGraphNode( - input="user_prompt & answer", - output=["kg"], - node_config={ - "llm_model": self.llm_model, + "schema": self.schema } ) @@ -103,11 +97,9 @@ class MultipleSearchGraph(AbstractGraph): nodes=[ graph_iterator_node, merge_answers_node, - knowledge_graph_node ], edges=[ (graph_iterator_node, merge_answers_node), - (merge_answers_node, knowledge_graph_node) ], entry_point=graph_iterator_node ) diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 80c09537..3e1944b5 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -2,9 +2,11 @@ SpeechGraph Module """ -from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes -from ..models import OpenAITextToSpeech +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, @@ -12,7 +14,9 @@ from ..nodes import ( GenerateAnswerNode, TextToSpeechNode, ) -from .abstract_graph import AbstractGraph + +from ..utils.save_audio_from_bytes import save_audio_from_bytes +from ..models import OpenAITextToSpeech class SpeechGraph(AbstractGraph): @@ -23,6 +27,7 @@ class SpeechGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. @@ -33,6 +38,7 @@ class SpeechGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. Example: >>> speech_graph = SpeechGraph( @@ -41,8 +47,8 @@ class SpeechGraph(AbstractGraph): ... {"llm": {"model": "gpt-3.5-turbo"}} """ - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): + super().__init__(prompt, config, source, schema) self.input_key = "url" if source.startswith("http") else "local_dir" @@ -76,7 +82,8 @@ class SpeechGraph(AbstractGraph): input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ - "llm_model": self.llm_model + "llm_model": self.llm_model, + "schema": self.schema } ) text_to_speech_node = TextToSpeechNode( diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 90d8dc55..1557ecd4 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -2,14 +2,17 @@ XMLScraperGraph Module """ +from typing import Optional + from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + from ..nodes import ( FetchNode, ParseNode, RAGNode, GenerateAnswerNode ) -from .abstract_graph import AbstractGraph class XMLScraperGraph(AbstractGraph): @@ -21,6 +24,7 @@ class XMLScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. embedder_model: An instance of an embedding model client, configured for generating embeddings. @@ -32,6 +36,7 @@ class XMLScraperGraph(AbstractGraph): prompt (str): The prompt for the graph. source (str): The source of the graph. config (dict): Configuration parameters for the graph. + schema (str): The schema for the graph output. Example: >>> xml_scraper = XMLScraperGraph( @@ -42,8 +47,8 @@ class XMLScraperGraph(AbstractGraph): >>> result = xml_scraper.run() """ - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) + def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None): + super().__init__(prompt, config, source, schema) self.input_key = "xml" if source.endswith("xml") else "xml_dir" @@ -78,7 +83,8 @@ class XMLScraperGraph(AbstractGraph): input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ - "llm_model": self.llm_model + "llm_model": self.llm_model, + "schema": self.schema } ) diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py index 33731a9d..894a42f3 100644 --- a/scrapegraphai/nodes/conditional_node.py +++ b/scrapegraphai/nodes/conditional_node.py @@ -1,6 +1,7 @@ """ Module for implementing the conditional node """ + from .base_node import BaseNode diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 12de529c..9a7b1d3b 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -10,10 +10,9 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel -from ..helpers.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv - # Imports from the library from .base_node import BaseNode +from ..helpers.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv class GenerateAnswerCSVNode(BaseNode): diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 701e23d4..06687a41 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -15,6 +15,7 @@ from langchain_core.runnables import RunnableParallel from .base_node import BaseNode from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_with_schema, template_no_chunks_with_schema + class GenerateAnswerNode(BaseNode): """ A node that generates an answer using a large language model (LLM) based on the user's input diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 1e1a98b3..15556ff5 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -15,6 +15,7 @@ from langchain_core.runnables import RunnableParallel from .base_node import BaseNode from ..helpers.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni + class GenerateAnswerOmniNode(BaseNode): """ A node that generates an answer using a large language model (LLM) based on the user's input diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 9bfc546b..fcad5b5a 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -14,6 +14,7 @@ from langchain_core.runnables import RunnableParallel from .base_node import BaseNode from ..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf + class GenerateAnswerPDFNode(BaseNode): """ A node that generates an answer using a language model (LLM) based on the user's input diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index 0ef53418..a0268f21 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -10,7 +10,6 @@ from tqdm.asyncio import tqdm from .base_node import BaseNode - _default_batchsize = 16 diff --git a/scrapegraphai/nodes/knowledge_graph_node.py b/scrapegraphai/nodes/knowledge_graph_node.py index 5e2c8920..7c79f025 100644 --- a/scrapegraphai/nodes/knowledge_graph_node.py +++ b/scrapegraphai/nodes/knowledge_graph_node.py @@ -14,6 +14,7 @@ from langchain_core.output_parsers import JsonOutputParser from .base_node import BaseNode from ..utils import create_graph, create_interactive_graph + class KnowledgeGraphNode(BaseNode): """ A node responsible for generating a knowledge graph from a dictionary. diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 39e40a23..fd18915d 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -3,8 +3,10 @@ ParseNode Module """ from typing import List, Optional + from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.document_transformers import Html2TextTransformer + from .base_node import BaseNode diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 27d97b6e..469fced9 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -3,6 +3,7 @@ RAGNode Module """ from typing import List, Optional + from langchain.docstore.document import Document from langchain.retrievers import ContextualCompressionRetriever from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 7aea6cae..e6a87936 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -4,9 +4,11 @@ RobotsNode Module from typing import List, Optional from urllib.parse import urlparse + from langchain_community.document_loaders import AsyncChromiumLoader from langchain.prompts import PromptTemplate from langchain.output_parsers import CommaSeparatedListOutputParser + from .base_node import BaseNode from ..helpers import robots_dictionary diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 87f8dcb2..1310186e 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -3,8 +3,10 @@ SearchInternetNode Module """ from typing import List, Optional + from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate + from ..utils.research_web import search_on_web from .base_node import BaseNode diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index b15e8d26..cd6fbf22 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -6,7 +6,6 @@ SearchLinkNode Module from typing import List, Optional from tqdm import tqdm - # Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser