mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
feat: add Parse_Node
This commit is contained in:
parent
79b8326b5b
commit
e6c7940a57
@@ -11,6 +11,7 @@ from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
ParseNode,
|
||||
RAGNode,
|
||||
GenerateAnswerPDFNode
|
||||
)
|
||||
@@ -66,6 +67,15 @@ class PDFScraperGraph(AbstractGraph):
|
||||
output=["doc"],
|
||||
)
|
||||
|
||||
parse_node = ParseNode(
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
node_config={
|
||||
"parse_html": False,
|
||||
"chunk_size": self.model_token
|
||||
}
|
||||
)
|
||||
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & (parsed_doc | doc)",
|
||||
output=["relevant_chunks"],
|
||||
@@ -86,11 +96,13 @@ class PDFScraperGraph(AbstractGraph):
|
||||
return BaseGraph(
|
||||
nodes=[
|
||||
fetch_node,
|
||||
parse_node,
|
||||
rag_node,
|
||||
generate_answer_node_pdf,
|
||||
],
|
||||
edges=[
|
||||
(fetch_node, rag_node),
|
||||
(fetch_node, parse_node),
|
||||
(parse_node, rag_node),
|
||||
(rag_node, generate_answer_node_pdf)
|
||||
],
|
||||
entry_point=fetch_node
|
||||
|
||||
@@ -3,8 +3,8 @@ SmartScraperGraph Module
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
import logging
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
@@ -70,6 +70,7 @@ class SmartScraperGraph(AbstractGraph):
|
||||
}
|
||||
)
|
||||
logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
|
||||
|
||||
parse_node = ParseNode(
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
|
||||
@@ -70,12 +70,20 @@ class ParseNode(BaseNode):
|
||||
docs_transformed = input_data[0]
|
||||
if self.parse_html:
|
||||
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
|
||||
docs_transformed = docs_transformed[0]
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
chunk_size= self.node_config.get("chunk_size", 4096),
|
||||
token_counter=lambda x: len(x.split()),
|
||||
memoize=False)
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
chunk_size= self.node_config.get("chunk_size", 4096),
|
||||
token_counter=lambda x: len(x.split()),
|
||||
memoize=False)
|
||||
else:
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
chunks = chunk(text=docs_transformed,
|
||||
chunk_size= self.node_config.get("chunk_size", 4096),
|
||||
token_counter=lambda x: len(x.split()),
|
||||
memoize=False)
|
||||
|
||||
state.update({self.output[0]: chunks})
|
||||
|
||||
return state
|
||||
|
||||
Loading…
Reference in New Issue
Block a user