Merge pull request #351 from VinciGit00/faiss_integration

Faiss integration
This commit is contained in:
Marco Perini 2024-06-11 23:15:23 +02:00 committed by GitHub
commit 589da1d695
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 29 additions and 6 deletions

4
.gitignore vendored
View File

@ -23,6 +23,7 @@ docs/source/_static/
venv/
.venv/
.vscode/
.conda/
# exclude pdf, mp3
*.pdf
@ -38,3 +39,6 @@ lib/
*.html
.idea
# extras
cache/
run_smart_scraper.py

View File

@ -13,6 +13,7 @@ Some interesting ones are:
- `loader_kwargs`: A dictionary with additional parameters to be passed to the `Loader` class, such as `proxy`.
- `burr_kwargs`: A dictionary with additional parameters to enable `Burr` graphical user interface.
- `max_images`: The maximum number of images to be analyzed. Useful in `OmniScraperGraph` and `OmniSearchGraph`.
- `cache_path`: The path where the cache files will be saved. If the path already exists, the cache will be loaded from it instead.
.. _Burr:

View File

@ -1,4 +1,4 @@
sphinx==7.1.2
furo==2024.5.6
pytest==8.0.0
burr[start]==0.19.1
burr[start]==0.22.1

View File

@ -78,6 +78,7 @@ class AbstractGraph(ABC):
self.headless = True if config is None else config.get(
"headless", True)
self.loader_kwargs = config.get("loader_kwargs", {})
self.cache_path = config.get("cache_path", False)
# Create the graph
self.graph = self._create_graph()
@ -93,15 +94,13 @@ class AbstractGraph(ABC):
else:
set_verbosity_warning()
self.headless = True if config is None else config.get("headless", True)
self.loader_kwargs = config.get("loader_kwargs", {})
common_params = {
"headless": self.headless,
"verbose": self.verbose,
"loader_kwargs": self.loader_kwargs,
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
"embedder_model": self.embedder_model,
"cache_path": self.cache_path,
}
self.set_common_params(common_params, overwrite=False)

View File

@ -3,6 +3,7 @@ RAGNode Module
"""
from typing import List, Optional
import os
from langchain.docstore.document import Document
from langchain.retrievers import ContextualCompressionRetriever
@ -50,6 +51,7 @@ class RAGNode(BaseNode):
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
self.cache_path = node_config.get("cache_path", False)
def execute(self, state: dict) -> dict:
"""
@ -98,7 +100,24 @@ class RAGNode(BaseNode):
)
embeddings = self.embedder_model
retriever = FAISS.from_documents(chunked_docs, embeddings).as_retriever()
folder_name = self.node_config.get("cache_path", "cache")
if self.node_config.get("cache_path", False) and not os.path.exists(folder_name):
index = FAISS.from_documents(chunked_docs, embeddings)
os.makedirs(folder_name)
index.save_local(folder_name)
self.logger.info("--- (indexes saved to cache) ---")
elif self.node_config.get("cache_path", False) and os.path.exists(folder_name):
index = FAISS.load_local(folder_path=folder_name,
embeddings=embeddings,
allow_dangerous_deserialization=True)
self.logger.info("--- (indexes loaded from cache) ---")
else:
index = FAISS.from_documents(chunked_docs, embeddings)
retriever = index.as_retriever()
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
# similarity_threshold could be set, now k=20