fix(pdf): correctly read .pdf files

This commit is contained in:
Marco Perini 2024-06-14 15:20:30 +02:00
parent 91c5b5af43
commit 203de83405
4 changed files with 13 additions and 3 deletions

View File

@@ -32,7 +32,7 @@ source = """
pdf_scraper_graph = PDFScraperGraph(
prompt="Summarize the text and find the main topics",
source="a.pdf",
source="Laureaconanniaccademici.pdf",
config=graph_config,
)
result = pdf_scraper_graph.run()

View File

@@ -95,8 +95,10 @@ class FetchNode(BaseNode):
state.update({self.output[0]: compressed_document})
return state
# handling for pdf
# handling pdf
elif input_keys[0] == "pdf":
# TODO: fix bytes content issue
loader = PyPDFLoader(source)
compressed_document = loader.load()
state.update({self.output[0]: compressed_document})

View File

@@ -5,6 +5,7 @@ ParseNode Module
from typing import List, Optional
from semchunk import chunk
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from ..utils.logging import get_logger
from .base_node import BaseNode
@@ -79,10 +80,17 @@ class ParseNode(BaseNode):
else:
docs_transformed = docs_transformed[0]
chunks = chunk(text=docs_transformed,
if type(docs_transformed) == Document:
chunks = chunk(text=docs_transformed.page_content,
chunk_size= self.node_config.get("chunk_size", 4096),
token_counter=lambda x: len(x.split()),
memoize=False)
else:
chunks = chunk(text=docs_transformed,
chunk_size= self.node_config.get("chunk_size", 4096),
token_counter=lambda x: len(x.split()),
memoize=False)
state.update({self.output[0]: chunks})