From 5587a64d23451a6a216000fe83b2ce1cc8f7141b Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Wed, 15 May 2024 15:51:27 +0200 Subject: [PATCH] fix: removed unused --- scrapegraphai/graphs/__init__.py | 2 - scrapegraphai/graphs/deep_scraper_graph.py | 116 ---------------- scrapegraphai/graphs/turbo_scraper.py | 146 --------------------- 3 files changed, 264 deletions(-) delete mode 100644 scrapegraphai/graphs/deep_scraper_graph.py delete mode 100644 scrapegraphai/graphs/turbo_scraper.py diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 10eb6d8e..fe726128 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -5,7 +5,6 @@ __init__.py file for graphs folder from .abstract_graph import AbstractGraph from .base_graph import BaseGraph from .smart_scraper_graph import SmartScraperGraph -from .deep_scraper_graph import DeepScraperGraph from .speech_graph import SpeechGraph from .search_graph import SearchGraph from .script_creator_graph import ScriptCreatorGraph @@ -15,4 +14,3 @@ from .csv_scraper_graph import CSVScraperGraph from .pdf_scraper_graph import PDFScraperGraph from .omni_scraper_graph import OmniScraperGraph from .omni_search_graph import OmniSearchGraph -from .turbo_scraper import TurboScraperGraph diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py deleted file mode 100644 index 4b4e672b..00000000 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ /dev/null @@ -1,116 +0,0 @@ -""" -DeepScraperGraph Module -""" - -from .base_graph import BaseGraph -from ..nodes import ( - FetchNode, - SearchLinkNode, - ParseNode, - RAGNode, - GenerateAnswerNode -) -from .abstract_graph import AbstractGraph - - -class DeepScraperGraph(AbstractGraph): - """ - [WIP] - - DeepScraper is a scraping pipeline that automates the process of - extracting information from web pages - using a natural language model to interpret and answer prompts. - - Unlike SmartScraper, DeepScraper can navigate to the links within the input webpage, - to fuflfil the task within the prompt. - - - Attributes: - prompt (str): The prompt for the graph. - source (str): The source of the graph. - config (dict): Configuration parameters for the graph. - llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, - configured for generating embeddings. - verbose (bool): A flag indicating whether to show print statements during execution. - headless (bool): A flag indicating whether to run the graph in headless mode. - Args: - prompt (str): The prompt for the graph. - source (str): The source of the graph. - config (dict): Configuration parameters for the graph. - Example: - >>> deep_scraper = DeepScraperGraph( - ... "List me all the job titles and detailed job description.", - ... "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", - ... {"llm": {"model": "gpt-3.5-turbo"}} - ... ) - >>> result = deep_scraper.run() - ) - """ - - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) - - self.input_key = "url" if source.startswith("http") else "local_dir" - - def _create_graph(self) -> BaseGraph: - """ - Creates the graph of nodes representing the workflow for web scraping. - Returns: - BaseGraph: A graph instance representing the web scraping workflow. - """ - fetch_node = FetchNode( - input="url | local_dir", - output=["doc", "link_urls", "img_urls"] - ) - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token - } - ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) - search_node = SearchLinkNode( - input="user_prompt & relevant_chunks", - output=["relevant_links"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) - - return BaseGraph( - nodes=[ - fetch_node, - parse_node, - rag_node, - search_node - ], - edges=[ - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, search_node) - - ], - entry_point=fetch_node - ) - - def run(self) -> str: - """ - Executes the scraping process and returns the answer to the prompt. - Returns: - str: The answer to the prompt. - """ - - inputs = {"user_prompt": self.prompt, self.input_key: self.source} - self.final_state, self.execution_info = self.graph.execute(inputs) - - return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/turbo_scraper.py b/scrapegraphai/graphs/turbo_scraper.py deleted file mode 100644 index 2881fd76..00000000 --- a/scrapegraphai/graphs/turbo_scraper.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -SmartScraperGraph Module -""" - -from .base_graph import BaseGraph -from ..nodes import ( - FetchNode, - ParseNode, - RAGNode, - SearchLinksWithContext, - GraphIteratorNode, - MergeAnswersNode -) -from .search_graph import SearchGraph -from .abstract_graph import AbstractGraph - - -class SmartScraperGraph(AbstractGraph): - """ - SmartScraper is a scraping pipeline that automates the process of - extracting information from web pages - using a natural language model to interpret and answer prompts. - - Attributes: - prompt (str): The prompt for the graph. - source (str): The source of the graph. - config (dict): Configuration parameters for the graph. - llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, - configured for generating embeddings. - verbose (bool): A flag indicating whether to show print statements during execution. - headless (bool): A flag indicating whether to run the graph in headless mode. - - Args: - prompt (str): The prompt for the graph. - source (str): The source of the graph. - config (dict): Configuration parameters for the graph. - - Example: - >>> smart_scraper = SmartScraperGraph( - ... "List me all the attractions in Chioggia.", - ... "https://en.wikipedia.org/wiki/Chioggia", - ... {"llm": {"model": "gpt-3.5-turbo"}} - ... ) - >>> result = smart_scraper.run() - ) - """ - - def __init__(self, prompt: str, source: str, config: dict): - super().__init__(prompt, config, source) - - self.input_key = "url" if source.startswith("http") else "local_dir" - - def _create_graph(self) -> BaseGraph: - """ - Creates the graph of nodes representing the workflow for web scraping. - - Returns: - BaseGraph: A graph instance representing the web scraping workflow. - """ - smart_scraper_graph = SmartScraperGraph( - prompt="", - source="", - config=self.llm_model - ) - fetch_node = FetchNode( - input="url | local_dir", - output=["doc"] - ) - - parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": self.model_token - } - ) - - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) - - search_link_with_context_node = SearchLinksWithContext( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={ - "llm_model": self.llm_model - } - ) - - graph_iterator_node = GraphIteratorNode( - input="user_prompt & urls", - output=["results"], - node_config={ - "graph_instance": smart_scraper_graph, - "verbose": True, - } - ) - - merge_answers_node = MergeAnswersNode( - input="user_prompt & results", - output=["answer"], - node_config={ - "llm_model": self.llm_model, - "verbose": True, - } - ) - - return BaseGraph( - nodes=[ - fetch_node, - parse_node, - rag_node, - search_link_with_context_node, - graph_iterator_node, - merge_answers_node - - ], - edges=[ - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, search_link_with_context_node), - (search_link_with_context_node, graph_iterator_node), - (graph_iterator_node, merge_answers_node), - - ], - entry_point=fetch_node - ) - - def run(self) -> str: - """ - Executes the scraping process and returns the answer to the prompt. - - Returns: - str: The answer to the prompt. - """ - - inputs = {"user_prompt": self.prompt, self.input_key: self.source} - self.final_state, self.execution_info = self.graph.execute(inputs) - - return self.final_state.get("answer", "No answer found.")