From 674e64222e41cfcbae01bf54ec82e4d8efdb469f Mon Sep 17 00:00:00 2001
From: VinciGit00 
Date: Mon, 29 Apr 2024 15:55:21 +0200
Subject: [PATCH] add first new graphs

---
 scrapegraphai/graphs/__init__.py           |  2 +
 scrapegraphai/graphs/json_scraper_graph.py | 77 ++++++++++++++++++++++
 scrapegraphai/graphs/xml_scraper_graph.py  | 77 ++++++++++++++++++++++
 3 files changed, 156 insertions(+)
 create mode 100644 scrapegraphai/graphs/json_scraper_graph.py
 create mode 100644 scrapegraphai/graphs/xml_scraper_graph.py

diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
index a8ee6ac5..b7fbcef7 100644
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -6,3 +6,5 @@ from .smart_scraper_graph import SmartScraperGraph
 from .speech_graph import SpeechGraph
 from .search_graph import SearchGraph
 from .script_creator_graph import ScriptCreatorGraph
+from .xml_scraper_graph import XmlScraperGraph
+from .json_scraper_graph import JsonScraperGraph
diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py
new file mode 100644
index 00000000..7f24da6d
--- /dev/null
+++ b/scrapegraphai/graphs/json_scraper_graph.py
@@ -0,0 +1,77 @@
+"""
+Module defining JsonScraperGraph, a fetch -> parse -> RAG -> answer scraping graph.
+"""
+from .base_graph import BaseGraph
+from ..nodes import (
+    FetchNode,
+    ParseNode,
+    RAGNode,
+    GenerateAnswerNode
+)
+from .abstract_graph import AbstractGraph
+
+
+class JsonScraperGraph(AbstractGraph):
+    """
+    JsonScraperGraph chains FetchNode, ParseNode, RAGNode and GenerateAnswerNode in a BaseGraph
+    and runs it on a source (passed as "url" or "local_dir") to answer the user prompt.
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict):
+        """
+        Initializes the graph; source is keyed "url" if it starts with "http", else "local_dir".
+        """
+        super().__init__(prompt, config, source)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self):
+        """
+        Builds the BaseGraph: fetch_node -> parse_node -> rag_node -> generate_answer_node.
+        """
+        fetch_node = FetchNode(
+            input="url | local_dir",
+            output=["doc"],
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={"chunk_size": self.model_token}
+        )
+        rag_node = RAGNode(
+            input="user_prompt & (parsed_doc | doc)",
+            output=["relevant_chunks"],
+            node_config={
+                "llm": self.llm_model,
+                "embedder_model": self.embedder_model
+            }
+        )
+        generate_answer_node = GenerateAnswerNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={"llm": self.llm_model},
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                rag_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, rag_node),
+                (rag_node, generate_answer_node)
+            ],
+            entry_point=fetch_node
+        )
+
+    def run(self) -> str:
+        """
+        Executes the graph and returns its "answer" state value, or "No answer found." if absent.
+        """
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py
new file mode 100644
index 00000000..5b1e3282
--- /dev/null
+++ b/scrapegraphai/graphs/xml_scraper_graph.py
@@ -0,0 +1,77 @@
+"""
+Module defining XmlScraperGraph, a fetch -> parse -> RAG -> answer scraping graph.
+"""
+from .base_graph import BaseGraph
+from ..nodes import (
+    FetchNode,
+    ParseNode,
+    RAGNode,
+    GenerateAnswerNode
+)
+from .abstract_graph import AbstractGraph
+
+
+class XmlScraperGraph(AbstractGraph):
+    """
+    XmlScraperGraph chains FetchNode, ParseNode, RAGNode and GenerateAnswerNode in a BaseGraph
+    and runs it on a source (passed as "url" or "local_dir") to answer the user prompt.
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict):
+        """
+        Initializes the graph; source is keyed "url" if it starts with "http", else "local_dir".
+        """
+        super().__init__(prompt, config, source)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self):
+        """
+        Builds the BaseGraph: fetch_node -> parse_node -> rag_node -> generate_answer_node.
+        """
+        fetch_node = FetchNode(
+            input="url | local_dir",
+            output=["doc"],
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={"chunk_size": self.model_token}
+        )
+        rag_node = RAGNode(
+            input="user_prompt & (parsed_doc | doc)",
+            output=["relevant_chunks"],
+            node_config={
+                "llm": self.llm_model,
+                "embedder_model": self.embedder_model
+            }
+        )
+        generate_answer_node = GenerateAnswerNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={"llm": self.llm_model},
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                rag_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, rag_node),
+                (rag_node, generate_answer_node)
+            ],
+            entry_point=fetch_node
+        )
+
+    def run(self) -> str:
+        """
+        Executes the graph and returns its "answer" state value, or "No answer found." if absent.
+        """
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")