mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
WIP
This commit is contained in:
parent
7ae50c035e
commit
be16fecb55
@ -19,3 +19,4 @@ langchain-aws==0.1.2
|
||||
langchain-anthropic==0.1.11
|
||||
yahoo-search-py==0.3
|
||||
pypdf==4.2.0
|
||||
burr[start]
|
||||
|
||||
BIN
scrapegraphai/graphs/smart_scraper.png
Normal file
BIN
scrapegraphai/graphs/smart_scraper.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
18
scrapegraphai/graphs/smart_scraper_graph
Normal file
18
scrapegraphai/graphs/smart_scraper_graph
Normal file
@ -0,0 +1,18 @@
|
||||
digraph {
|
||||
graph [compound=false concentrate=false rankdir=TB ranksep=0.4]
|
||||
fetch_node [label=fetch_node shape=box style=rounded]
|
||||
parse_node [label=parse_node shape=box style=rounded]
|
||||
input__chunk_size [label="input: chunk_size" shape=oval style=dashed]
|
||||
input__chunk_size -> parse_node
|
||||
rag_node [label=rag_node shape=box style=rounded]
|
||||
input__llm_model [label="input: llm_model" shape=oval style=dashed]
|
||||
input__llm_model -> rag_node
|
||||
input__embedder_model [label="input: embedder_model" shape=oval style=dashed]
|
||||
input__embedder_model -> rag_node
|
||||
generate_answer_node [label=generate_answer_node shape=box style=rounded]
|
||||
input__llm_model [label="input: llm_model" shape=oval style=dashed]
|
||||
input__llm_model -> generate_answer_node
|
||||
fetch_node -> parse_node [style=solid]
|
||||
parse_node -> rag_node [style=solid]
|
||||
rag_node -> generate_answer_node [style=solid]
|
||||
}
|
||||
BIN
scrapegraphai/graphs/smart_scraper_graph.png
Normal file
BIN
scrapegraphai/graphs/smart_scraper_graph.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 34 KiB |
117
scrapegraphai/graphs/smart_scraper_graph_burr.py
Normal file
117
scrapegraphai/graphs/smart_scraper_graph_burr.py
Normal file
@ -0,0 +1,117 @@
|
||||
"""
|
||||
SmartScraperGraph Module Burr Version
|
||||
"""
|
||||
from typing import Optional, Tuple

from burr import tracking
from burr.core import Application, ApplicationBuilder, State, default, when
from burr.core.action import action
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document

from ..utils.remover import remover
|
||||
|
||||
|
||||
@action(
    reads=["url", "local_dir", "json_dir", "xml_dir", "csv_dir"],
    writes=["doc"],
)
def fetch_node(
    state: State,
    headless: bool = True,
    verbose: bool = False,
    node_config: Optional[dict] = None,
) -> tuple[dict, State]:
    """Fetch the source content and store it in the state under ``doc``.

    The source is taken from the first state key that holds a value, checked
    in the order ``url``, ``local_dir``, ``json_dir``, ``xml_dir``,
    ``csv_dir``.

    Args:
        state: Application state holding the source under one of the keys
            listed above.
        headless: Forwarded to ``AsyncChromiumLoader`` when the source is
            fetched over HTTP.
        verbose: When true, print a marker line before executing.
        node_config: Optional settings. If it contains a non-``None``
            ``"endpoint"`` entry, that value is passed to the loader as the
            HTTP proxy.

    Returns:
        A ``(result, new_state)`` pair; both carry the one-element list of
        ``Document`` objects under the ``doc`` key.

    Raises:
        ValueError: If none of the supported source keys is set in the state.
    """
    if verbose:
        print("--- Executing Fetch Node ---")

    # This is a plain function, not a method: the kind of input is derived
    # from the state key that carries the source instead of from ``self``.
    raw_dir_keys = ("json_dir", "xml_dir", "csv_dir")
    candidate_keys = ("url", "local_dir") + raw_dir_keys
    input_key = next(
        (key for key in candidate_keys if state.get(key) is not None),
        None,
    )
    if input_key is None:
        raise ValueError(
            "fetch_node needs a source under one of the state keys: "
            + ", ".join(candidate_keys)
        )
    source = state.get(input_key)

    if input_key in raw_dir_keys:
        # Structured inputs (JSON/XML/CSV) are passed through untouched.
        compressed_document = [
            Document(page_content=source, metadata={"source": "local_dir"})
        ]
    elif not source.startswith("http"):
        # Anything that is not a URL is treated as local content and cleaned.
        compressed_document = [
            Document(page_content=remover(source), metadata={"source": "local_dir"})
        ]
    else:
        endpoint = node_config.get("endpoint") if node_config is not None else None
        if endpoint is not None:
            loader = AsyncChromiumLoader(
                [source],
                proxies={"http": endpoint},
                headless=headless,
            )
        else:
            loader = AsyncChromiumLoader(
                [source],
                headless=headless,
            )

        document = loader.load()
        compressed_document = [
            Document(page_content=remover(str(document[0].page_content)))
        ]

    return {"doc": compressed_document}, state.update(doc=compressed_document)
|
||||
|
||||
@action(reads=["doc"], writes=["parsed_doc"])
def parse_node(state: State, chunk_size: int) -> tuple[dict, State]:
    """Placeholder parse action: returns an empty result and the state as-is.

    NOTE(review): ``parsed_doc`` is declared as a write but is not produced
    yet, and ``chunk_size`` is currently unused.
    """
    empty_result: dict = {}
    return empty_result, state
|
||||
|
||||
@action(
    reads=["user_prompt", "parsed_doc", "doc"],
    writes=["relevant_chunks"],
)
def rag_node(state: State, llm_model: object, embedder_model: object) -> tuple[dict, State]:
    """Placeholder RAG action: returns an empty result and the state as-is.

    NOTE(review): ``relevant_chunks`` is declared as a write but is not
    produced yet; ``llm_model`` and ``embedder_model`` are currently unused.
    """
    empty_result: dict = {}
    return empty_result, state
|
||||
|
||||
@action(
    reads=["user_prompt", "relevant_chunks", "parsed_doc", "doc"],
    writes=["answer"],
)
def generate_answer_node(state: State, llm_model: object) -> tuple[dict, State]:
    """Placeholder answer action: returns an empty result and the state as-is.

    NOTE(review): ``answer`` is declared as a write but is not produced yet,
    and ``llm_model`` is currently unused.
    """
    empty_result: dict = {}
    return empty_result, state
|
||||
|
||||
def run(prompt: str, input_key: str, source: str, config: dict) -> str:
    """Build the scraper application and render its graph to a PNG.

    Args:
        prompt: Stored in the initial state under ``user_prompt``.
        input_key: State key under which ``source`` is stored.
        source: The value stored under ``input_key``.
        config: Must provide ``llm_model``, ``embedder_model`` and
            ``model_token``; these are read up front even though execution
            is still disabled.

    NOTE(review): execution is commented out below, so nothing is returned
    yet despite the ``str`` annotation.
    """
    # Read in this order so a missing key is reported before any graph work.
    llm_model, embedder_model, chunk_size = (
        config["llm_model"],
        config["embedder_model"],
        config["model_token"],
    )

    initial_state = {"user_prompt": prompt}
    initial_state[input_key] = source

    builder = ApplicationBuilder()
    builder = builder.with_actions(
        fetch_node=fetch_node,
        parse_node=parse_node,
        rag_node=rag_node,
        generate_answer_node=generate_answer_node,
    )
    builder = builder.with_transitions(
        ("fetch_node", "parse_node", default),
        ("parse_node", "rag_node", default),
        ("rag_node", "generate_answer_node", default),
    )
    builder = builder.with_entrypoint("fetch_node")
    builder = builder.with_state(**initial_state)
    app = builder.build()

    app.visualize(
        output_file_path="smart_scraper_graph",
        include_conditions=False,
        view=True,
        format="png",
    )

    # TODO: enable execution once the downstream actions are implemented.
    # NOTE(review): parse_node takes ``chunk_size``, but the draft below
    # passes the value under ``model_token`` - confirm the intended input name.
    # last_action, result, state = app.run(
    #     halt_after=["generate_answer_node"],
    #     inputs={
    #         "llm_model": llm_model,
    #         "embedder_model": embedder_model,
    #         "model_token": chunk_size
    #     }
    # )
    # return result.get("answer", "No answer found.")
|
||||
|
||||
if __name__ == '__main__':
    # Demo invocation with placeholder model settings.
    config = dict(
        llm_model="rag-token",
        embedder_model="foo",
        model_token="bar",
    )
    input_key = "url"
    source = "https://en.wikipedia.org/wiki/Paris"
    prompt = "What is the capital of France?"

    run(prompt, input_key, source, config)
|
||||
70
scrapegraphai/graphs/smart_scraper_graph_hamilton.py
Normal file
70
scrapegraphai/graphs/smart_scraper_graph_hamilton.py
Normal file
@ -0,0 +1,70 @@
|
||||
"""
|
||||
SmartScraperGraph Module Hamilton Version
|
||||
"""
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
from burr import tracking
|
||||
from burr.core import Application, ApplicationBuilder, State, default, when
|
||||
from burr.core.action import action
|
||||
|
||||
from langchain_community.document_loaders import AsyncChromiumLoader
|
||||
from langchain_core.documents import Document
|
||||
if __name__ == '__main__':
|
||||
from scrapegraphai.utils.remover import remover
|
||||
else:
|
||||
from ..utils.remover import remover
|
||||
|
||||
|
||||
def fetch_node(source: str,
               headless: bool = True
               ) -> Document:
    """Return the cleaned content of ``source`` as a single ``Document``.

    A source starting with ``"http"`` is loaded with ``AsyncChromiumLoader``
    (honouring ``headless``); any other source is treated as the content
    itself and tagged with ``{"source": "local_dir"}`` metadata. In both
    cases the text is passed through ``remover`` first.
    """
    if source.startswith("http"):
        pages = AsyncChromiumLoader([source], headless=headless).load()
        first_page_text = str(pages[0].page_content)
        return Document(page_content=remover(first_page_text))

    local_metadata = {"source": "local_dir"}
    return Document(page_content=remover(source), metadata=local_metadata)
|
||||
|
||||
def parse_node(fetch_node: Document, chunk_size: int) -> list[Document]:
    """Placeholder for the parsing step; not implemented yet."""
    return None  # stub: no chunks are produced yet
|
||||
|
||||
def rag_node(parse_node: list[Document]) -> list[Document]:
    """Placeholder for the retrieval step; not implemented yet."""
    return None  # stub: no relevant chunks are selected yet
|
||||
|
||||
def generate_answer_node(rag_node: list[Document]) -> str:
    """Placeholder for the answer-generation step; not implemented yet."""
    return None  # stub: no answer is generated yet
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import __main__ as smart_scraper_graph_hamilton
    from hamilton import driver

    # Build a driver over this very module and render its function graph.
    dr = (
        driver.Builder()
        .with_modules(smart_scraper_graph_hamilton)
        .with_config({})
        .build()
    )
    dr.display_all_functions("smart_scraper.png")

    # TODO: enable execution once the downstream nodes are implemented.
    # config = {
    #     "llm_model": "rag-token",
    #     "embedder_model": "foo",
    #     "model_token": "bar",
    # }
    #
    # result = dr.execute(
    #     ["generate_answer_node"],
    #     inputs={
    #         "prompt": "What is the capital of France?",
    #         "source": "https://en.wikipedia.org/wiki/Paris",
    #     }
    # )
    #
    # print(result)
|
||||
Loading…
Reference in New Issue
Block a user