feat: Add support for passing pdf path as source

This commit is contained in:
Shubham Kamboj 2024-05-09 21:55:05 +05:30
parent 590aab792d
commit f10f3b1438
3 changed files with 14 additions and 11 deletions

View File

@@ -18,3 +18,4 @@ playwright==1.43.0
langchain-aws==0.1.2
langchain-anthropic==0.1.11
yahoo-search-py==0.3
pypdf==4.2.0

View File

@@ -56,36 +56,29 @@ class PDFScraperGraph(AbstractGraph):
"""
fetch_node = FetchNode(
input="pdf_dir",
input='pdf',
output=["doc"],
node_config={
"headless": self.headless,
"verbose": self.verbose
}
)
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={
"chunk_size": self.model_token,
"verbose": self.verbose
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm": self.llm_model,
"llm_model": self.llm_model,
"embedder_model": self.embedder_model,
"verbose": self.verbose
}
)
generate_answer_node = GenerateAnswerNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm": self.llm_model,
"verbose": self.verbose
"llm_model": self.llm_model,
}
)

View File

@@ -5,6 +5,7 @@ FetchNode Module
from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from .base_node import BaseNode
from ..utils.remover import remover
@@ -56,7 +57,6 @@ class FetchNode(BaseNode):
# Interpret input keys based on the provided input expression
input_keys = self.get_input_keys(state)
# Fetching data from the state based on the input keys
input_data = [state[key] for key in input_keys]
@@ -66,6 +66,15 @@ class FetchNode(BaseNode):
"source": "local_dir"
})]
# if it is a local directory
# handling for pdf
elif self.input == "pdf":
loader = PyPDFLoader(source)
compressed_document = loader.load()
elif self.input == "pdf_dir":
pass
elif not source.startswith("http"):
compressed_document = [Document(page_content=remover(source), metadata={
"source": "local_dir"