diff --git a/examples/ernie/deep_scraper_ernie.py b/examples/ernie/deep_scraper_ernie.py deleted file mode 100644 index b8c6501a..00000000 --- a/examples/ernie/deep_scraper_ernie.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "model": "ernie/ernie-bot-turbo", - "ernie_client_id": "", - "ernie_client_secret": "", - "temperature": 0.1 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434"}, - "verbose": True, - "max_depth": 1 -} - - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/fireworks/deep_scraper_fireworks.py b/examples/fireworks/deep_scraper_fireworks.py deleted file mode 100644 index 86fb1717..00000000 --- a/examples/fireworks/deep_scraper_fireworks.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -fireworks_api_key = os.getenv("FIREWORKS_APIKEY") - -graph_config = { - "llm": { - "api_key": fireworks_api_key, - "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "verbose": True, - "max_depth": 1 -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/deep_scraper_mistral.py b/examples/mistral/deep_scraper_mistral.py deleted file mode 100644 index bf0f6ba4..00000000 --- a/examples/mistral/deep_scraper_mistral.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -mistral_key = os.getenv("MISTRAL_API_KEY") - -graph_config = { - "llm": { - "api_key": mistral_key, - "model": "mistralai/open-mistral-nemo", - }, - "verbose": True, - "max_depth": 1 -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/nemotron/deep_scraper_nemotron.py b/examples/nemotron/deep_scraper_nemotron.py deleted file mode 100644 index 35f54b38..00000000 --- a/examples/nemotron/deep_scraper_nemotron.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -nemotron_key = os.getenv("NEMOTRON_APIKEY") - -graph_config = { - "llm": { - "api_key": nemotron_key, - "model": "nvidia/meta/llama3-70b-instruct", - }, - "verbose": True, - "max_depth": 1 -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/openai/deep_scraper_openai.py b/examples/openai/deep_scraper_openai.py deleted file mode 100644 index b20e164d..00000000 --- a/examples/openai/deep_scraper_openai.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import DeepScraperGraph -from scrapegraphai.utils import prettify_exec_info - -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "openai/gpt-4o", - }, - "verbose": True, - "max_depth": 1 -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -deep_scraper_graph = DeepScraperGraph( - prompt="List me all the job titles and detailed job description.", - # also accepts a string with the already downloaded HTML code - source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - config=graph_config -) - -result = deep_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = deep_scraper_graph.get_execution_info() -print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 9b3f36c3..efd6bd7e 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -5,7 +5,6 @@ __init__.py file for graphs folder from .abstract_graph import AbstractGraph from .base_graph import BaseGraph from .smart_scraper_graph import SmartScraperGraph -from .deep_scraper_graph import DeepScraperGraph from .speech_graph import SpeechGraph from .search_graph import SearchGraph from .script_creator_graph import ScriptCreatorGraph diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py deleted file mode 100644 index f3f2ec2d..00000000 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -DeepScraperGraph Module -""" -from typing import Optional -from pydantic import BaseModel -from .base_graph import BaseGraph -from .abstract_graph import AbstractGraph -from ..nodes import ( - FetchNode, - SearchLinkNode, - ParseNode, - GenerateAnswerNode, - GraphIteratorNode, - MergeAnswersNode -) - -class DeepScraperGraph(AbstractGraph): - """ - [WIP] - - DeepScraper is a scraping pipeline that automates the process of - extracting information from web pages using a natural language model - to interpret and answer prompts. - - Unlike SmartScraper, DeepScraper can navigate to the links within, - the input webpage to fuflfil the task within the prompt. - - Attributes: - prompt (str): The prompt for the graph. - source (str): The source of the graph. - config (dict): Configuration parameters for the graph. - schema (BaseModel): The schema for the graph output. - llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, - configured for generating embeddings. - verbose (bool): A flag indicating whether to show print statements during execution. - headless (bool): A flag indicating whether to run the graph in headless mode. - - Args: - prompt (str): The prompt for the graph. - source (str): The source of the graph. - config (dict): Configuration parameters for the graph. - schema (BaseModel): The schema for the graph output. - - Example: - >>> deep_scraper = DeepScraperGraph( - ... "List me all the job titles and detailed job description.", - ... "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} - ... ) - >>> result = deep_scraper.run() - ) - """ - - def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None): - - super().__init__(prompt, config, source, schema) - - self.input_key = "url" if source.startswith("http") else "local_dir" - - def _create_repeated_graph(self) -> BaseGraph: - """ - Creates the graph that can be repeatedly executed to conduct search on - hyperlinks within the webpage. - - Returns: - BaseGraph: A graph instance representing the web scraping workflow. - """ - fetch_node = FetchNode( - input="url | local_dir", - output=["doc"] - ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token, - "llm_model": self.llm_model - } - ) - - generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": self.llm_model, - "additional_info": self.config.get("additional_info"), - "schema": self.schema - } - ) - - search_node = SearchLinkNode( - input="user_prompt & relevant_chunks", - output=["relevant_links"], - node_config={ - "llm_model": self.llm_model, - } - ) - - graph_iterator_node = GraphIteratorNode( - input="user_prompt & relevant_links", - output=["results"], - node_config={ - "graph_instance": None, - "batchsize": 1 - } - ) - - merge_answers_node = MergeAnswersNode( - input="user_prompt & results", - output=["answer"], - node_config={ - "llm_model": self.llm_model, - "schema": self.schema - } - ) - - return BaseGraph( - nodes=[ - fetch_node, - parse_node, - generate_answer_node, - search_node, - graph_iterator_node, - merge_answers_node - ], - edges=[ - (fetch_node, parse_node), - (search_node, graph_iterator_node), - (graph_iterator_node, merge_answers_node) - ], - entry_point=fetch_node, - graph_name=self.__class__.__name__ - ) - - - def _create_graph(self) -> BaseGraph: - """ - Creates the graph of nodes representing the workflow for web scraping - n-levels deep. - - Returns: - BaseGraph: A graph instance representing the web scraping workflow. - """ - - base_graph = self._create_repeated_graph() - graph_iterator_node = list(filter(lambda x: x.node_name == "GraphIterator", - base_graph.nodes))[0] - graph_iterator_node.node_config["graph_instance"] = self - return base_graph - - def run(self) -> str: - """ - Executes the scraping process and returns the answer to the prompt. - Returns: - str: The answer to the prompt. - """ - - inputs = {"user_prompt": self.prompt, self.input_key: self.source} - self.final_state, self.execution_info = self.graph.execute(inputs) - - return self.final_state.get("answer", "No answer found.")