mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
feat: Add support for passing pdf path as source
This commit is contained in:
parent
590aab792d
commit
f10f3b1438
@@ -18,3 +18,4 @@ playwright==1.43.0
|
||||
langchain-aws==0.1.2
|
||||
langchain-anthropic==0.1.11
|
||||
yahoo-search-py==0.3
|
||||
pypdf==4.2.0
|
||||
|
||||
@@ -56,36 +56,29 @@ class PDFScraperGraph(AbstractGraph):
|
||||
"""
|
||||
|
||||
fetch_node = FetchNode(
|
||||
input="pdf_dir",
|
||||
input='pdf',
|
||||
output=["doc"],
|
||||
node_config={
|
||||
"headless": self.headless,
|
||||
"verbose": self.verbose
|
||||
}
|
||||
)
|
||||
parse_node = ParseNode(
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
node_config={
|
||||
"chunk_size": self.model_token,
|
||||
"verbose": self.verbose
|
||||
}
|
||||
)
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & (parsed_doc | doc)",
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm": self.llm_model,
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model,
|
||||
"verbose": self.verbose
|
||||
}
|
||||
)
|
||||
generate_answer_node = GenerateAnswerNode(
|
||||
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
||||
output=["answer"],
|
||||
node_config={
|
||||
"llm": self.llm_model,
|
||||
"verbose": self.verbose
|
||||
"llm_model": self.llm_model,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ FetchNode Module
|
||||
from typing import List, Optional
|
||||
from langchain_community.document_loaders import AsyncChromiumLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
from .base_node import BaseNode
|
||||
from ..utils.remover import remover
|
||||
|
||||
@@ -56,7 +57,6 @@ class FetchNode(BaseNode):
|
||||
|
||||
# Interpret input keys based on the provided input expression
|
||||
input_keys = self.get_input_keys(state)
|
||||
|
||||
# Fetching data from the state based on the input keys
|
||||
input_data = [state[key] for key in input_keys]
|
||||
|
||||
@@ -66,6 +66,15 @@ class FetchNode(BaseNode):
|
||||
"source": "local_dir"
|
||||
})]
|
||||
# if it is a local directory
|
||||
|
||||
# handling for pdf
|
||||
elif self.input == "pdf":
|
||||
loader = PyPDFLoader(source)
|
||||
compressed_document = loader.load()
|
||||
|
||||
elif self.input == "pdf_dir":
|
||||
pass
|
||||
|
||||
elif not source.startswith("http"):
|
||||
compressed_document = [Document(page_content=remover(source), metadata={
|
||||
"source": "local_dir"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user