mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
fix(pdf): correctly read .pdf files
This commit is contained in:
parent
91c5b5af43
commit
203de83405
@ -32,7 +32,7 @@ source = """
|
||||
|
||||
pdf_scraper_graph = PDFScraperGraph(
|
||||
prompt="Summarize the text and find the main topics",
|
||||
source="a.pdf",
|
||||
source="Laureaconanniaccademici.pdf",
|
||||
config=graph_config,
|
||||
)
|
||||
result = pdf_scraper_graph.run()
|
||||
@ -95,8 +95,10 @@ class FetchNode(BaseNode):
|
||||
|
||||
state.update({self.output[0]: compressed_document})
|
||||
return state
|
||||
# handling for pdf
|
||||
# handling pdf
|
||||
elif input_keys[0] == "pdf":
|
||||
|
||||
# TODO: fix bytes content issue
|
||||
loader = PyPDFLoader(source)
|
||||
compressed_document = loader.load()
|
||||
state.update({self.output[0]: compressed_document})
|
||||
|
||||
@ -5,6 +5,7 @@ ParseNode Module
|
||||
from typing import List, Optional
|
||||
from semchunk import chunk
|
||||
from langchain_community.document_transformers import Html2TextTransformer
|
||||
from langchain_core.documents import Document
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
|
||||
@ -79,10 +80,17 @@ class ParseNode(BaseNode):
|
||||
else:
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
chunks = chunk(text=docs_transformed,
|
||||
if type(docs_transformed) == Document:
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
chunk_size= self.node_config.get("chunk_size", 4096),
|
||||
token_counter=lambda x: len(x.split()),
|
||||
memoize=False)
|
||||
else:
|
||||
|
||||
chunks = chunk(text=docs_transformed,
|
||||
chunk_size= self.node_config.get("chunk_size", 4096),
|
||||
token_counter=lambda x: len(x.split()),
|
||||
memoize=False)
|
||||
|
||||
state.update({self.output[0]: chunks})
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user