fix: removed deep scraper

2026-06-28 21:01:55 +08:00 · 2024-09-29 15:40:04 +02:00 · 2024-09-29 15:40:04 +02:00 · 9aa8c889fb
commit 9aa8c889fb
parent 27ae896cb7
7 changed files with 0 additions and 406 deletions
--- a/examples/ernie/deep_scraper_ernie.py
+++ b/examples/ernie/deep_scraper_ernie.py
@@ -1,55 +0,0 @@
-""" 
-Basic example of scraping pipeline using SmartScraper
-"""
-
-import os
-from dotenv import load_dotenv
-from scrapegraphai.graphs import DeepScraperGraph
-from scrapegraphai.utils import prettify_exec_info
-
-load_dotenv()
-
-
-# ************************************************
-# Define the configuration for the graph
-# ************************************************
-
-openai_key = os.getenv("OPENAI_APIKEY")
-
-graph_config = {
-    "llm": {
-            "model": "ernie/ernie-bot-turbo",
-            "ernie_client_id": "<ernie_client_id>",
-            "ernie_client_secret": "<ernie_client_secret>",
-            "temperature": 0.1
-        },
-        "embeddings": {
-            "model": "ollama/nomic-embed-text",
-            "temperature": 0,
-            "base_url": "http://localhost:11434"},
-    "verbose": True,
-    "max_depth": 1
-}
-  
-
-# ************************************************
-# Create the SmartScraperGraph instance and run it
-# ************************************************
-
-deep_scraper_graph = DeepScraperGraph(
-    prompt="List me all the job titles and detailed job description.",
-    # also accepts a string with the already downloaded HTML code
-    source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
-    config=graph_config
-)
-
-result = deep_scraper_graph.run()
-print(result)
-
-# ************************************************
-# Get graph execution info
-# ************************************************
-
-graph_exec_info = deep_scraper_graph.get_execution_info()
-print(deep_scraper_graph.get_state("relevant_links"))
-print(prettify_exec_info(graph_exec_info))
--- a/examples/fireworks/deep_scraper_fireworks.py
+++ b/examples/fireworks/deep_scraper_fireworks.py
@@ -1,47 +0,0 @@
-""" 
-Basic example of scraping pipeline using SmartScraper
-"""
-
-import os
-from dotenv import load_dotenv
-from scrapegraphai.graphs import DeepScraperGraph
-from scrapegraphai.utils import prettify_exec_info
-
-load_dotenv()
-
-# ************************************************
-# Define the configuration for the graph
-# ************************************************
-
-fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
-
-graph_config = {
-    "llm": {
-        "api_key": fireworks_api_key,
-        "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
-    },
-    "verbose": True,
-    "max_depth": 1
-}
-
-# ************************************************
-# Create the SmartScraperGraph instance and run it
-# ************************************************
-
-deep_scraper_graph = DeepScraperGraph(
-    prompt="List me all the job titles and detailed job description.",
-    # also accepts a string with the already downloaded HTML code
-    source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
-    config=graph_config
-)
-
-result = deep_scraper_graph.run()
-print(result)
-
-# ************************************************
-# Get graph execution info
-# ************************************************
-
-graph_exec_info = deep_scraper_graph.get_execution_info()
-print(deep_scraper_graph.get_state("relevant_links"))
-print(prettify_exec_info(graph_exec_info))
--- a/examples/mistral/deep_scraper_mistral.py
+++ b/examples/mistral/deep_scraper_mistral.py
@@ -1,47 +0,0 @@
-""" 
-Basic example of scraping pipeline using SmartScraper
-"""
-
-import os
-from dotenv import load_dotenv
-from scrapegraphai.graphs import DeepScraperGraph
-from scrapegraphai.utils import prettify_exec_info
-
-load_dotenv()
-
-# ************************************************
-# Define the configuration for the graph
-# ************************************************
-
-mistral_key = os.getenv("MISTRAL_API_KEY")
-
-graph_config = {
-    "llm": {
-        "api_key": mistral_key,
-        "model": "mistralai/open-mistral-nemo",
-    },
-    "verbose": True,
-    "max_depth": 1
-}
-
-# ************************************************
-# Create the SmartScraperGraph instance and run it
-# ************************************************
-
-deep_scraper_graph = DeepScraperGraph(
-    prompt="List me all the job titles and detailed job description.",
-    # also accepts a string with the already downloaded HTML code
-    source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
-    config=graph_config
-)
-
-result = deep_scraper_graph.run()
-print(result)
-
-# ************************************************
-# Get graph execution info
-# ************************************************
-
-graph_exec_info = deep_scraper_graph.get_execution_info()
-print(deep_scraper_graph.get_state("relevant_links"))
-print(prettify_exec_info(graph_exec_info))
--- a/examples/nemotron/deep_scraper_nemotron.py
+++ b/examples/nemotron/deep_scraper_nemotron.py
@@ -1,47 +0,0 @@
-""" 
-Basic example of scraping pipeline using SmartScraper
-"""
-
-import os
-from dotenv import load_dotenv
-from scrapegraphai.graphs import DeepScraperGraph
-from scrapegraphai.utils import prettify_exec_info
-
-load_dotenv()
-
-# ************************************************
-# Define the configuration for the graph
-# ************************************************
-
-nemotron_key = os.getenv("NEMOTRON_APIKEY")
-
-graph_config = {
-    "llm": {
-        "api_key": nemotron_key,
-        "model": "nvidia/meta/llama3-70b-instruct",
-    },
-    "verbose": True,
-    "max_depth": 1
-}
-
-# ************************************************
-# Create the SmartScraperGraph instance and run it
-# ************************************************
-
-deep_scraper_graph = DeepScraperGraph(
-    prompt="List me all the job titles and detailed job description.",
-    # also accepts a string with the already downloaded HTML code
-    source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
-    config=graph_config
-)
-
-result = deep_scraper_graph.run()
-print(result)
-
-# ************************************************
-# Get graph execution info
-# ************************************************
-
-graph_exec_info = deep_scraper_graph.get_execution_info()
-print(deep_scraper_graph.get_state("relevant_links"))
-print(prettify_exec_info(graph_exec_info))
--- a/examples/openai/deep_scraper_openai.py
+++ b/examples/openai/deep_scraper_openai.py
@@ -1,47 +0,0 @@
-""" 
-Basic example of scraping pipeline using SmartScraper
-"""
-
-import os
-from dotenv import load_dotenv
-from scrapegraphai.graphs import DeepScraperGraph
-from scrapegraphai.utils import prettify_exec_info
-
-load_dotenv()
-
-# ************************************************
-# Define the configuration for the graph
-# ************************************************
-
-openai_key = os.getenv("OPENAI_APIKEY")
-
-graph_config = {
-    "llm": {
-        "api_key": openai_key,
-        "model": "openai/gpt-4o",
-    },
-    "verbose": True,
-    "max_depth": 1
-}
-
-# ************************************************
-# Create the SmartScraperGraph instance and run it
-# ************************************************
-
-deep_scraper_graph = DeepScraperGraph(
-    prompt="List me all the job titles and detailed job description.",
-    # also accepts a string with the already downloaded HTML code
-    source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
-    config=graph_config
-)
-
-result = deep_scraper_graph.run()
-print(result)
-
-# ************************************************
-# Get graph execution info
-# ************************************************
-
-graph_exec_info = deep_scraper_graph.get_execution_info()
-print(deep_scraper_graph.get_state("relevant_links"))
-print(prettify_exec_info(graph_exec_info))
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -5,7 +5,6 @@ __init__.py file for graphs folder
 from .abstract_graph import AbstractGraph
 from .base_graph import BaseGraph
 from .smart_scraper_graph import SmartScraperGraph
-from .deep_scraper_graph import DeepScraperGraph
 from .speech_graph import SpeechGraph
 from .search_graph import SearchGraph
 from .script_creator_graph import ScriptCreatorGraph
--- a/scrapegraphai/graphs/deep_scraper_graph.py
+++ b/scrapegraphai/graphs/deep_scraper_graph.py
@@ -1,162 +0,0 @@
-"""
-DeepScraperGraph Module
-"""
-from typing import Optional
-from pydantic import BaseModel
-from .base_graph import BaseGraph
-from .abstract_graph import AbstractGraph
-from ..nodes import (
-    FetchNode,
-    SearchLinkNode,
-    ParseNode,
-    GenerateAnswerNode,
-    GraphIteratorNode,
-    MergeAnswersNode
-)
-
-class DeepScraperGraph(AbstractGraph):
-    """
-    [WIP]
-
-    DeepScraper is a scraping pipeline that automates the process of 
-    extracting information from web pages using a natural language model 
-    to interpret and answer prompts.
-
-    Unlike SmartScraper, DeepScraper can navigate to the links within,
-    the input webpage to fuflfil the task within the prompt.
-    
-    Attributes:
-        prompt (str): The prompt for the graph.
-        source (str): The source of the graph.
-        config (dict): Configuration parameters for the graph.
-        schema (BaseModel): The schema for the graph output.
-        llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, 
-        configured for generating embeddings.
-        verbose (bool): A flag indicating whether to show print statements during execution.
-        headless (bool): A flag indicating whether to run the graph in headless mode.
-        
-    Args:
-        prompt (str): The prompt for the graph.
-        source (str): The source of the graph.
-        config (dict): Configuration parameters for the graph.
-        schema (BaseModel): The schema for the graph output.
-
-    Example:
-        >>> deep_scraper = DeepScraperGraph(
-        ...     "List me all the job titles and detailed job description.",
-        ...     "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
-        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
-        ... )
-        >>> result = deep_scraper.run()
-        )
-    """
-
-    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
-
-        super().__init__(prompt, config, source, schema)
-
-        self.input_key = "url" if source.startswith("http") else "local_dir"
-
-    def _create_repeated_graph(self) -> BaseGraph:
-        """
-        Creates the graph that can be repeatedly executed to conduct search on
-        hyperlinks within the webpage.
-
-        Returns:
-            BaseGraph: A graph instance representing the web scraping workflow.
-        """
-        fetch_node = FetchNode(
-            input="url | local_dir",
-            output=["doc"]
-        )
-        parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
-            node_config={
-                "chunk_size": self.model_token,
-                "llm_model": self.llm_model
-            }
-        )
-
-        generate_answer_node = GenerateAnswerNode(
-            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
-            output=["answer"],
-            node_config={
-                "llm_model": self.llm_model,
-                "additional_info": self.config.get("additional_info"),
-                "schema": self.schema
-            }
-        )
-
-        search_node = SearchLinkNode(
-            input="user_prompt & relevant_chunks",
-            output=["relevant_links"],
-            node_config={
-                "llm_model": self.llm_model,
-            }
-        )
-
-        graph_iterator_node = GraphIteratorNode(
-            input="user_prompt & relevant_links",
-            output=["results"],
-            node_config={
-                "graph_instance": None,
-                "batchsize": 1
-            }
-        )
-
-        merge_answers_node = MergeAnswersNode(
-            input="user_prompt & results",
-            output=["answer"],
-            node_config={
-                "llm_model": self.llm_model,
-                "schema": self.schema
-            }
-        )
-
-        return BaseGraph(
-            nodes=[
-                fetch_node,
-                parse_node,
-                generate_answer_node,
-                search_node,
-                graph_iterator_node,
-                merge_answers_node
-            ],
-            edges=[
-                (fetch_node, parse_node),
-                (search_node, graph_iterator_node),
-                (graph_iterator_node, merge_answers_node)
-            ],
-            entry_point=fetch_node,
-            graph_name=self.__class__.__name__
-        )
-
-
-    def _create_graph(self) -> BaseGraph:
-        """
-        Creates the graph of nodes representing the workflow for web scraping
-        n-levels deep.
-
-        Returns:
-            BaseGraph: A graph instance representing the web scraping workflow.
-        """
-
-        base_graph = self._create_repeated_graph()
-        graph_iterator_node = list(filter(lambda x: x.node_name == "GraphIterator", 
-                                          base_graph.nodes))[0]
-        graph_iterator_node.node_config["graph_instance"] = self
-        return base_graph
-
-    def run(self) -> str:
-        """
-        Executes the scraping process and returns the answer to the prompt.
-        Returns:
-            str: The answer to the prompt.
-        """
-
-        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
-        self.final_state, self.execution_info = self.graph.execute(inputs)
-
-        return self.final_state.get("answer", "No answer found.")