From 203de834051ea1d6443841921f3aa3e6adbd9174 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Fri, 14 Jun 2024 15:20:30 +0200 Subject: [PATCH] fix(pdf): correctly read .pdf files --- ...ni_search_graph_openai.py => omni_search_openai.py} | 0 ...f_scraper_graph_openai.py => pdf_scraper_openai.py} | 2 +- scrapegraphai/nodes/fetch_node.py | 4 +++- scrapegraphai/nodes/parse_node.py | 10 +++++++++- 4 files changed, 13 insertions(+), 3 deletions(-) rename examples/openai/{omni_search_graph_openai.py => omni_search_openai.py} (100%) rename examples/openai/{pdf_scraper_graph_openai.py => pdf_scraper_openai.py} (97%) diff --git a/examples/openai/omni_search_graph_openai.py b/examples/openai/omni_search_openai.py similarity index 100% rename from examples/openai/omni_search_graph_openai.py rename to examples/openai/omni_search_openai.py diff --git a/examples/openai/pdf_scraper_graph_openai.py b/examples/openai/pdf_scraper_openai.py similarity index 97% rename from examples/openai/pdf_scraper_graph_openai.py rename to examples/openai/pdf_scraper_openai.py index 59f36a9d..6267baea 100644 --- a/examples/openai/pdf_scraper_graph_openai.py +++ b/examples/openai/pdf_scraper_openai.py @@ -32,7 +32,7 @@ source = """ pdf_scraper_graph = PDFScraperGraph( prompt="Summarize the text and find the main topics", - source="a.pdf", + source="Laureaconanniaccademici.pdf", config=graph_config, ) result = pdf_scraper_graph.run() diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index dbdd9925..df12a26f 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -95,8 +95,10 @@ class FetchNode(BaseNode): state.update({self.output[0]: compressed_document}) return state - # handling for pdf + # handling pdf elif input_keys[0] == "pdf": + + # TODO: fix bytes content issue loader = PyPDFLoader(source) compressed_document = loader.load() state.update({self.output[0]: compressed_document}) diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 5585ae80..9c24edb6 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -5,6 +5,7 @@ ParseNode Module from typing import List, Optional from semchunk import chunk from langchain_community.document_transformers import Html2TextTransformer +from langchain_core.documents import Document from ..utils.logging import get_logger from .base_node import BaseNode @@ -79,10 +80,17 @@ class ParseNode(BaseNode): else: docs_transformed = docs_transformed[0] - chunks = chunk(text=docs_transformed, + if type(docs_transformed) == Document: + chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096), token_counter=lambda x: len(x.split()), memoize=False) + else: + + chunks = chunk(text=docs_transformed, + chunk_size= self.node_config.get("chunk_size", 4096), + token_counter=lambda x: len(x.split()), + memoize=False) state.update({self.output[0]: chunks})