mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
fix(pdf): correctly read .pdf files
This commit is contained in:
parent
91c5b5af43
commit
203de83405
@ -32,7 +32,7 @@ source = """
|
|||||||
|
|
||||||
pdf_scraper_graph = PDFScraperGraph(
|
pdf_scraper_graph = PDFScraperGraph(
|
||||||
prompt="Summarize the text and find the main topics",
|
prompt="Summarize the text and find the main topics",
|
||||||
source="a.pdf",
|
source="Laureaconanniaccademici.pdf",
|
||||||
config=graph_config,
|
config=graph_config,
|
||||||
)
|
)
|
||||||
result = pdf_scraper_graph.run()
|
result = pdf_scraper_graph.run()
|
||||||
@ -95,8 +95,10 @@ class FetchNode(BaseNode):
|
|||||||
|
|
||||||
state.update({self.output[0]: compressed_document})
|
state.update({self.output[0]: compressed_document})
|
||||||
return state
|
return state
|
||||||
# handling for pdf
|
# handling pdf
|
||||||
elif input_keys[0] == "pdf":
|
elif input_keys[0] == "pdf":
|
||||||
|
|
||||||
|
# TODO: fix bytes content issue
|
||||||
loader = PyPDFLoader(source)
|
loader = PyPDFLoader(source)
|
||||||
compressed_document = loader.load()
|
compressed_document = loader.load()
|
||||||
state.update({self.output[0]: compressed_document})
|
state.update({self.output[0]: compressed_document})
|
||||||
|
|||||||
@ -5,6 +5,7 @@ ParseNode Module
|
|||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from semchunk import chunk
|
from semchunk import chunk
|
||||||
from langchain_community.document_transformers import Html2TextTransformer
|
from langchain_community.document_transformers import Html2TextTransformer
|
||||||
|
from langchain_core.documents import Document
|
||||||
from ..utils.logging import get_logger
|
from ..utils.logging import get_logger
|
||||||
from .base_node import BaseNode
|
from .base_node import BaseNode
|
||||||
|
|
||||||
@ -79,10 +80,17 @@ class ParseNode(BaseNode):
|
|||||||
else:
|
else:
|
||||||
docs_transformed = docs_transformed[0]
|
docs_transformed = docs_transformed[0]
|
||||||
|
|
||||||
chunks = chunk(text=docs_transformed,
|
if type(docs_transformed) == Document:
|
||||||
|
chunks = chunk(text=docs_transformed.page_content,
|
||||||
chunk_size= self.node_config.get("chunk_size", 4096),
|
chunk_size= self.node_config.get("chunk_size", 4096),
|
||||||
token_counter=lambda x: len(x.split()),
|
token_counter=lambda x: len(x.split()),
|
||||||
memoize=False)
|
memoize=False)
|
||||||
|
else:
|
||||||
|
|
||||||
|
chunks = chunk(text=docs_transformed,
|
||||||
|
chunk_size= self.node_config.get("chunk_size", 4096),
|
||||||
|
token_counter=lambda x: len(x.split()),
|
||||||
|
memoize=False)
|
||||||
|
|
||||||
state.update({self.output[0]: chunks})
|
state.update({self.output[0]: chunks})
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user