This commit is contained in:
Stefan Krawczyk 2024-05-09 23:52:19 -07:00
parent 7ae50c035e
commit be16fecb55
6 changed files with 206 additions and 0 deletions

View File

@ -19,3 +19,4 @@ langchain-aws==0.1.2
langchain-anthropic==0.1.11
yahoo-search-py==0.3
pypdf==4.2.0
burr[start]

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -0,0 +1,18 @@
// Graph of the four scraper nodes and the external inputs they consume.
// Rounded boxes are nodes; dashed ovals are inputs.
digraph {
graph [compound=false concentrate=false rankdir=TB ranksep=0.4]
// fetch_node has no input ovals attached.
fetch_node [label=fetch_node shape=box style=rounded]
// parse_node takes chunk_size as an input.
parse_node [label=parse_node shape=box style=rounded]
input__chunk_size [label="input: chunk_size" shape=oval style=dashed]
input__chunk_size -> parse_node
// rag_node takes both llm_model and embedder_model as inputs.
rag_node [label=rag_node shape=box style=rounded]
input__llm_model [label="input: llm_model" shape=oval style=dashed]
input__llm_model -> rag_node
input__embedder_model [label="input: embedder_model" shape=oval style=dashed]
input__embedder_model -> rag_node
// generate_answer_node takes llm_model as an input.
generate_answer_node [label=generate_answer_node shape=box style=rounded]
// input__llm_model is declared a second time here with the same attributes
// as its first declaration above.
input__llm_model [label="input: llm_model" shape=oval style=dashed]
input__llm_model -> generate_answer_node
// Solid edges: the linear order fetch -> parse -> rag -> generate_answer.
fetch_node -> parse_node [style=solid]
parse_node -> rag_node [style=solid]
rag_node -> generate_answer_node [style=solid]
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

View File

@ -0,0 +1,117 @@
"""
SmartScraperGraph Module Burr Version
"""
from typing import Optional, Tuple

from burr import tracking
from burr.core import Application, ApplicationBuilder, State, default, when
from burr.core.action import action
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document

from ..utils.remover import remover
@action(reads=["url", "local_dir"], writes=["doc"])
def fetch_node(
    state: State,
    headless: bool = True,
    verbose: bool = False,
    input_type: Optional[str] = None,
    node_config: Optional[dict] = None,
) -> tuple[dict, State]:
    """Fetch the source content and store it in state under ``doc``.

    The source is read from ``state["url"]``, falling back to
    ``state["local_dir"]``.

    Args:
        state: Application state holding ``url`` or ``local_dir``.
        headless: Forwarded to ``AsyncChromiumLoader`` when loading a URL.
        verbose: When true, print a banner before doing any work.
        input_type: Kind of input being processed. For ``"json_dir"``,
            ``"xml_dir"`` and ``"csv_dir"`` the source is stored as-is,
            without being passed through ``remover``.
        node_config: Optional settings; when it holds a non-``None``
            ``"endpoint"`` value, that value is passed to the loader as the
            ``http`` proxy.

    Returns:
        A ``({"doc": documents}, new_state)`` tuple where ``documents`` is a
        one-element list of ``Document``.

    Raises:
        ValueError: If neither ``url`` nor ``local_dir`` is present in state.
    """
    if verbose:
        print("--- Executing Fetch Node ---")

    source = state.get("url", state.get("local_dir"))
    if source is None:
        raise ValueError(
            "fetch_node requires either 'url' or 'local_dir' in state"
        )

    # This function is not a method, so the input kind and the node settings
    # arrive as optional parameters rather than as attributes of ``self``.
    if input_type in ("json_dir", "xml_dir", "csv_dir"):
        compressed_document = [
            Document(page_content=source, metadata={"source": "local_dir"})
        ]
    elif not source.startswith("http"):
        # Anything that is not a URL is handled as local content.
        compressed_document = [
            Document(
                page_content=remover(source),
                metadata={"source": "local_dir"},
            )
        ]
    else:
        endpoint = None
        if node_config is not None:
            endpoint = node_config.get("endpoint")

        if endpoint is not None:
            loader = AsyncChromiumLoader(
                [source],
                proxies={"http": endpoint},
                headless=headless,
            )
        else:
            loader = AsyncChromiumLoader(
                [source],
                headless=headless,
            )

        document = loader.load()
        compressed_document = [
            Document(page_content=remover(str(document[0].page_content)))
        ]

    return {"doc": compressed_document}, state.update(doc=compressed_document)
@action(reads=["doc"], writes=["parsed_doc"])
def parse_node(state: State, chunk_size: int) -> tuple[dict, State]:
    """Placeholder parse step.

    Returns an empty result together with the state it was given; nothing is
    written to ``parsed_doc`` yet and ``chunk_size`` is not used.
    """
    result: dict = {}
    return result, state
@action(
    reads=["user_prompt", "parsed_doc", "doc"],
    writes=["relevant_chunks"],
)
def rag_node(state: State, llm_model: object, embedder_model: object) -> tuple[dict, State]:
    """Placeholder retrieval step.

    Returns an empty result together with the state it was given; nothing is
    written to ``relevant_chunks`` yet and neither model argument is used.
    """
    result: dict = {}
    return result, state
@action(
    reads=["user_prompt", "relevant_chunks", "parsed_doc", "doc"],
    writes=["answer"],
)
def generate_answer_node(state: State, llm_model: object) -> tuple[dict, State]:
    """Placeholder answer-generation step.

    Returns an empty result together with the state it was given; nothing is
    written to ``answer`` yet and ``llm_model`` is not used.
    """
    result: dict = {}
    return result, state
def run(prompt: str, input_key: str, source: str, config: dict) -> str:
    """Assemble the scraper application and render its graph to a PNG.

    Args:
        prompt: Stored in the initial state under ``user_prompt``.
        input_key: State key under which ``source`` is stored.
        source: The value stored under ``input_key``.
        config: Must contain ``llm_model``, ``embedder_model`` and
            ``model_token``.

    Note:
        The execution step is still commented out at the bottom of this
        function, so the function currently falls off the end and returns
        ``None``.
    """
    # These lookups are only consumed by the disabled execution step below,
    # but they still fail fast with KeyError on an incomplete config.
    llm_model = config["llm_model"]
    embedder_model = config["embedder_model"]
    chunk_size = config["model_token"]

    initial_state = {"user_prompt": prompt}
    initial_state[input_key] = source

    builder = ApplicationBuilder()
    builder = builder.with_actions(
        fetch_node=fetch_node,
        parse_node=parse_node,
        rag_node=rag_node,
        generate_answer_node=generate_answer_node,
    )
    # Linear pipeline: fetch -> parse -> rag -> generate_answer.
    builder = builder.with_transitions(
        ("fetch_node", "parse_node", default),
        ("parse_node", "rag_node", default),
        ("rag_node", "generate_answer_node", default),
    )
    builder = builder.with_entrypoint("fetch_node")
    builder = builder.with_state(**initial_state)
    app = builder.build()

    app.visualize(
        output_file_path="smart_scraper_graph",
        include_conditions=False,
        view=True,
        format="png",
    )

    # TODO: enable execution once the nodes are implemented. Note that the
    # inputs below pass "model_token" while parse_node's parameter is named
    # chunk_size.
    # last_action, result, state = app.run(
    # halt_after=["generate_answer_node"],
    # inputs={
    # "llm_model": llm_model,
    # "embedder_model": embedder_model,
    # "model_token": chunk_size
    # }
    # )
    # return result.get("answer", "No answer found.")
if __name__ == '__main__':
    # Demo invocation with placeholder model settings.
    demo_config = {
        "llm_model": "rag-token",
        "embedder_model": "foo",
        "model_token": "bar",
    }
    run(
        prompt="What is the capital of France?",
        input_key="url",
        source="https://en.wikipedia.org/wiki/Paris",
        config=demo_config,
    )

View File

@ -0,0 +1,70 @@
"""
SmartScraperGraph Module Burr Version
"""
from typing import Tuple
from burr import tracking
from burr.core import Application, ApplicationBuilder, State, default, when
from burr.core.action import action
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
if __name__ == '__main__':
from scrapegraphai.utils.remover import remover
else:
from ..utils.remover import remover
def fetch_node(source: str,
               headless: bool = True
               ) -> Document:
    """Turn ``source`` into a single ``Document``.

    A source starting with ``"http"`` is loaded through
    ``AsyncChromiumLoader`` and the first loaded page's content is passed
    through ``remover``. Any other source is passed through ``remover``
    directly and tagged with ``{"source": "local_dir"}`` metadata.
    """
    if source.startswith("http"):
        chromium_loader = AsyncChromiumLoader([source], headless=headless)
        loaded_pages = chromium_loader.load()
        first_page_text = str(loaded_pages[0].page_content)
        return Document(page_content=remover(first_page_text))

    # Not a URL: the source string itself is the content.
    local_metadata = {"source": "local_dir"}
    return Document(page_content=remover(source), metadata=local_metadata)
def parse_node(fetch_node: Document, chunk_size: int) -> list[Document]:
    """Placeholder parse step: not implemented yet, returns ``None``."""
    return None
def rag_node(parse_node: list[Document]) -> list[Document]:
    """Placeholder retrieval step: not implemented yet, returns ``None``."""
    return None
def generate_answer_node(rag_node: list[Document]) -> str:
    """Placeholder answer step: not implemented yet, returns ``None``."""
    return None
if __name__ == '__main__':
    # The running script itself is the module handed to the driver, so no
    # extra functions are defined in this block.
    import __main__ as smart_scraper_graph_hamilton
    from hamilton import driver

    builder = driver.Builder()
    builder = builder.with_modules(smart_scraper_graph_hamilton)
    builder = builder.with_config({})
    dr = builder.build()

    # Render the function graph to an image file.
    dr.display_all_functions("smart_scraper.png")

    # TODO: execution is disabled for now; the draft call is kept below.
    # config = {
    # "llm_model": "rag-token",
    # "embedder_model": "foo",
    # "model_token": "bar",
    # }
    #
    # result = dr.execute(
    # ["generate_answer_node"],
    # inputs={
    # "prompt": "What is the capital of France?",
    # "source": "https://en.wikipedia.org/wiki/Paris",
    # }
    # )
    #
    # print(result)