From f10f3b1438e0c625b7f2fa52faeb5a6c12116113 Mon Sep 17 00:00:00 2001 From: Shubham Kamboj Date: Thu, 9 May 2024 21:55:05 +0530 Subject: [PATCH] feat: Add support for passing pdf path as source --- requirements.txt | 1 + scrapegraphai/graphs/pdf_scraper_graph.py | 13 +++---------- scrapegraphai/nodes/fetch_node.py | 11 ++++++++++- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/requirements.txt b/requirements.txt index ce5754b5..1e6224b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ playwright==1.43.0 langchain-aws==0.1.2 langchain-anthropic==0.1.11 yahoo-search-py==0.3 +pypdf==4.2.0 diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 2f62f509..4eb42b37 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -56,36 +56,29 @@ class PDFScraperGraph(AbstractGraph): """ fetch_node = FetchNode( - input="pdf_dir", + input='pdf', output=["doc"], - node_config={ - "headless": self.headless, - "verbose": self.verbose - } ) parse_node = ParseNode( input="doc", output=["parsed_doc"], node_config={ "chunk_size": self.model_token, - "verbose": self.verbose } ) rag_node = RAGNode( input="user_prompt & (parsed_doc | doc)", output=["relevant_chunks"], node_config={ - "llm": self.llm_model, + "llm_model": self.llm_model, "embedder_model": self.embedder_model, - "verbose": self.verbose } ) generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], node_config={ - "llm": self.llm_model, - "verbose": self.verbose + "llm_model": self.llm_model, } ) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index bcd207f3..c900b0a2 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -5,6 +5,7 @@ FetchNode Module from typing import List, Optional from langchain_community.document_loaders import AsyncChromiumLoader from langchain_core.documents import Document +from langchain_community.document_loaders import PyPDFLoader from .base_node import BaseNode from ..utils.remover import remover @@ -56,7 +57,6 @@ class FetchNode(BaseNode): # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] @@ -66,6 +66,15 @@ class FetchNode(BaseNode): "source": "local_dir" })] # if it is a local directory + + # handling for pdf + elif self.input == "pdf": + loader = PyPDFLoader(source) + compressed_document = loader.load() + + elif self.input == "pdf_dir": + pass + elif not source.startswith("http"): compressed_document = [Document(page_content=remover(source), metadata={ "source": "local_dir"