diff --git a/requirements.txt b/requirements.txt
index 1e6224b4..00259542 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ langchain-aws==0.1.2
 langchain-anthropic==0.1.11
 yahoo-search-py==0.3
 pypdf==4.2.0
+burr[start]
diff --git a/scrapegraphai/graphs/smart_scraper.png b/scrapegraphai/graphs/smart_scraper.png
new file mode 100644
index 00000000..7c2822f7
Binary files /dev/null and b/scrapegraphai/graphs/smart_scraper.png differ
diff --git a/scrapegraphai/graphs/smart_scraper_graph b/scrapegraphai/graphs/smart_scraper_graph
new file mode 100644
index 00000000..99c3658c
--- /dev/null
+++ b/scrapegraphai/graphs/smart_scraper_graph
@@ -0,0 +1,18 @@
+digraph {
+	graph [compound=false concentrate=false rankdir=TB ranksep=0.4]
+	fetch_node [label=fetch_node shape=box style=rounded]
+	parse_node [label=parse_node shape=box style=rounded]
+	input__chunk_size [label="input: chunk_size" shape=oval style=dashed]
+	input__chunk_size -> parse_node
+	rag_node [label=rag_node shape=box style=rounded]
+	input__llm_model [label="input: llm_model" shape=oval style=dashed]
+	input__llm_model -> rag_node
+	input__embedder_model [label="input: embedder_model" shape=oval style=dashed]
+	input__embedder_model -> rag_node
+	generate_answer_node [label=generate_answer_node shape=box style=rounded]
+	input__llm_model [label="input: llm_model" shape=oval style=dashed]
+	input__llm_model -> generate_answer_node
+	fetch_node -> parse_node [style=solid]
+	parse_node -> rag_node [style=solid]
+	rag_node -> generate_answer_node [style=solid]
+}
diff --git a/scrapegraphai/graphs/smart_scraper_graph.png b/scrapegraphai/graphs/smart_scraper_graph.png
new file mode 100644
index 00000000..ff94d915
Binary files /dev/null and b/scrapegraphai/graphs/smart_scraper_graph.png differ
diff --git a/scrapegraphai/graphs/smart_scraper_graph_burr.py b/scrapegraphai/graphs/smart_scraper_graph_burr.py
new file mode 100644
index 00000000..f2c26569
--- /dev/null
+++ b/scrapegraphai/graphs/smart_scraper_graph_burr.py
@@ -0,0 +1,166 @@
+"""
+SmartScraperGraph Module Burr Version
+"""
+from typing import Optional, Tuple
+
+from burr import tracking
+from burr.core import Application, ApplicationBuilder, State, default, when
+from burr.core.action import action
+
+from langchain_community.document_loaders import AsyncChromiumLoader
+from langchain_core.documents import Document
+from ..utils.remover import remover
+
+# Input keys whose value is already the document content.
+RAW_INPUT_KEYS = ("json_dir", "xml_dir", "csv_dir")
+# Every state key that may carry the source, in lookup order.
+INPUT_KEYS = ("url", "local_dir") + RAW_INPUT_KEYS
+
+
+@action(reads=list(INPUT_KEYS), writes=["doc"])
+def fetch_node(
+    state: State,
+    headless: bool = True,
+    verbose: bool = False,
+    node_config: Optional[dict] = None,
+) -> tuple[dict, State]:
+    """
+    Load the source referenced by the state and store it under ``doc``.
+
+    Args:
+        state (State): holds the source under one of ``INPUT_KEYS``.
+        headless (bool): forwarded to ``AsyncChromiumLoader``.
+        verbose (bool): print a progress banner when True.
+        node_config (Optional[dict]): optional settings; an ``endpoint``
+            entry is forwarded to the loader as the HTTP proxy.
+
+    Returns:
+        tuple[dict, State]: the ``{"doc": ...}`` result and the updated state.
+
+    Raises:
+        ValueError: if none of ``INPUT_KEYS`` is set in the state.
+    """
+    if verbose:
+        print("--- Executing Fetch Node ---")
+
+    # This is a plain function, so there is no ``self``: the kind of input is
+    # derived from whichever state key is populated, and the node config
+    # arrives as an optional input.
+    input_key = next(
+        (key for key in INPUT_KEYS if state.get(key) is not None), None)
+    if input_key is None:
+        raise ValueError(f"fetch_node needs one of {INPUT_KEYS} in the state")
+    source = state[input_key]
+
+    if input_key in RAW_INPUT_KEYS:
+        # Structured inputs are wrapped as-is, without cleaning.
+        compressed_document = [Document(page_content=source, metadata={
+            "source": "local_dir"
+        })]
+    elif not source.startswith("http"):
+        # Not a URL: clean the given content directly.
+        compressed_document = [Document(page_content=remover(source), metadata={
+            "source": "local_dir"
+        })]
+    else:
+        loader_kwargs = {"headless": headless}
+        endpoint = (node_config or {}).get("endpoint")
+        if endpoint is not None:
+            loader_kwargs["proxies"] = {"http": endpoint}
+        loader = AsyncChromiumLoader([source], **loader_kwargs)
+
+        document = loader.load()
+        compressed_document = [
+            Document(page_content=remover(str(document[0].page_content)))]
+
+    return {"doc": compressed_document}, state.update(doc=compressed_document)
+
+
+@action(reads=["doc"], writes=["parsed_doc"])
+def parse_node(state: State, chunk_size: int) -> tuple[dict, State]:
+    """Placeholder: returns the state unchanged; ``parsed_doc`` is not set yet."""
+    return {}, state
+
+
+@action(reads=["user_prompt", "parsed_doc", "doc"],
+        writes=["relevant_chunks"])
+def rag_node(state: State, llm_model: object, embedder_model: object) -> tuple[dict, State]:
+    """Placeholder: returns the state unchanged; ``relevant_chunks`` is not set yet."""
+    return {}, state
+
+
+@action(reads=["user_prompt", "relevant_chunks", "parsed_doc", "doc"],
+        writes=["answer"])
+def generate_answer_node(state: State, llm_model: object) -> tuple[dict, State]:
+    """Placeholder: returns the state unchanged; ``answer`` is not set yet."""
+    return {}, state
+
+
+def run(prompt: str, input_key: str, source: str, config: dict) -> str:
+    """
+    Build the Burr application for the scraping pipeline and render its graph.
+
+    Args:
+        prompt (str): user question, stored in the state as ``user_prompt``.
+        input_key (str): state key under which ``source`` is stored.
+        source (str): value stored under ``input_key``.
+        config (dict): must provide ``llm_model``, ``embedder_model`` and
+            ``model_token``.
+
+    Note:
+        Executing the application is still commented out, so nothing is
+        returned yet.
+    """
+    llm_model = config["llm_model"]
+    embedder_model = config["embedder_model"]
+    chunk_size = config["model_token"]
+
+    initial_state = {
+        "user_prompt": prompt,
+        input_key: source
+    }
+    app = (
+        ApplicationBuilder()
+        .with_actions(
+            fetch_node=fetch_node,
+            parse_node=parse_node,
+            rag_node=rag_node,
+            generate_answer_node=generate_answer_node
+        )
+        .with_transitions(
+            ("fetch_node", "parse_node", default),
+            ("parse_node", "rag_node", default),
+            ("rag_node", "generate_answer_node", default)
+        )
+        .with_entrypoint("fetch_node")
+        .with_state(**initial_state)
+        .build()
+    )
+    app.visualize(
+        output_file_path="smart_scraper_graph",
+        include_conditions=False, view=True, format="png"
+    )
+    # The action input is named ``chunk_size`` (see parse_node), so that is the
+    # key to use once execution is enabled.
+    # last_action, result, state = app.run(
+    #     halt_after=["generate_answer_node"],
+    #     inputs={
+    #         "llm_model": llm_model,
+    #         "embedder_model": embedder_model,
+    #         "chunk_size": chunk_size
+    #     }
+    # )
+    # return result.get("answer", "No answer found.")
+
+
+if __name__ == '__main__':
+
+    prompt = "What is the capital of France?"
+    source = "https://en.wikipedia.org/wiki/Paris"
+    input_key = "url"
+    config = {
+        "llm_model": "rag-token",
+        "embedder_model": "foo",
+        "model_token": "bar",
+    }
+    run(prompt, input_key, source, config)
\ No newline at end of file
diff --git a/scrapegraphai/graphs/smart_scraper_graph_hamilton.py b/scrapegraphai/graphs/smart_scraper_graph_hamilton.py
new file mode 100644
index 00000000..ee3bdd88
--- /dev/null
+++ b/scrapegraphai/graphs/smart_scraper_graph_hamilton.py
@@ -0,0 +1,87 @@
+"""
+SmartScraperGraph Module Hamilton Version
+"""
+
+from typing import Tuple
+
+from burr import tracking
+from burr.core import Application, ApplicationBuilder, State, default, when
+from burr.core.action import action
+
+from langchain_community.document_loaders import AsyncChromiumLoader
+from langchain_core.documents import Document
+if __name__ == '__main__':
+    from scrapegraphai.utils.remover import remover
+else:
+    from ..utils.remover import remover
+
+
+def fetch_node(source: str,
+               headless: bool = True
+               ) -> Document:
+    """
+    Return the cleaned content of ``source`` as a single ``Document``.
+
+    Args:
+        source (str): a URL (anything starting with ``http``) or the content
+            itself.
+        headless (bool): forwarded to ``AsyncChromiumLoader``.
+
+    Returns:
+        Document: the content after being passed through ``remover``.
+    """
+    if not source.startswith("http"):
+        # Not a URL: clean the given content directly.
+        return Document(page_content=remover(source), metadata={
+            "source": "local_dir"
+        })
+
+    loader = AsyncChromiumLoader(
+        [source],
+        headless=headless,
+    )
+    document = loader.load()
+    return Document(page_content=remover(str(document[0].page_content)))
+
+
+def parse_node(fetch_node: Document, chunk_size: int) -> list[Document]:
+    """Placeholder: not implemented yet, returns None."""
+    pass
+
+
+def rag_node(parse_node: list[Document]) -> list[Document]:
+    """Placeholder: not implemented yet, returns None."""
+    pass
+
+
+def generate_answer_node(rag_node: list[Document]) -> str:
+    """Placeholder: not implemented yet, returns None."""
+    pass
+
+
+if __name__ == '__main__':
+    from hamilton import driver
+    import __main__ as smart_scraper_graph_hamilton
+    dr = (
+        driver.Builder()
+        .with_modules(smart_scraper_graph_hamilton)
+        .with_config({})
+        .build()
+    )
+    dr.display_all_functions("smart_scraper.png")
+
+    # config = {
+    #     "llm_model": "rag-token",
+    #     "embedder_model": "foo",
+    #     "model_token": "bar",
+    # }
+    #
+    # result = dr.execute(
+    #     ["generate_answer_node"],
+    #     inputs={
+    #         "prompt": "What is the capital of France?",
+    #         "source": "https://en.wikipedia.org/wiki/Paris",
+    #     }
+    # )
+    #
+    # print(result)
\ No newline at end of file