feat: add Parse_Node

This commit is contained in:
Marco Vinciguerra 2024-06-12 12:29:14 +02:00
parent 79b8326b5b
commit e6c7940a57
3 changed files with 28 additions and 7 deletions

View File

@@ -11,6 +11,7 @@ from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerPDFNode
)
@@ -66,6 +67,15 @@ class PDFScraperGraph(AbstractGraph):
output=["doc"],
)
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={
"parse_html": False,
"chunk_size": self.model_token
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
@@ -86,11 +96,13 @@ class PDFScraperGraph(AbstractGraph):
return BaseGraph(
nodes=[
fetch_node,
parse_node,
rag_node,
generate_answer_node_pdf,
],
edges=[
(fetch_node, rag_node),
(fetch_node, parse_node),
(parse_node, rag_node),
(rag_node, generate_answer_node_pdf)
],
entry_point=fetch_node

View File

@@ -3,8 +3,8 @@ SmartScraperGraph Module
"""
from typing import Optional
import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
@@ -70,6 +70,7 @@ class SmartScraperGraph(AbstractGraph):
}
)
logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],

View File

@@ -70,12 +70,20 @@ class ParseNode(BaseNode):
docs_transformed = input_data[0]
if self.parse_html:
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
docs_transformed = docs_transformed[0]
docs_transformed = docs_transformed[0]
chunks = chunk(text=docs_transformed.page_content,
chunk_size= self.node_config.get("chunk_size", 4096),
token_counter=lambda x: len(x.split()),
memoize=False)
chunks = chunk(text=docs_transformed.page_content,
chunk_size= self.node_config.get("chunk_size", 4096),
token_counter=lambda x: len(x.split()),
memoize=False)
else:
docs_transformed = docs_transformed[0]
chunks = chunk(text=docs_transformed,
chunk_size= self.node_config.get("chunk_size", 4096),
token_counter=lambda x: len(x.split()),
memoize=False)
state.update({self.output[0]: chunks})
return state