mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
Merge pull request #351 from VinciGit00/faiss_integration
Faiss integration
This commit is contained in:
commit
589da1d695
4
.gitignore
vendored
4
.gitignore
vendored
@ -23,6 +23,7 @@ docs/source/_static/
|
||||
venv/
|
||||
.venv/
|
||||
.vscode/
|
||||
.conda/
|
||||
|
||||
# exclude pdf, mp3
|
||||
*.pdf
|
||||
@ -38,3 +39,6 @@ lib/
|
||||
*.html
|
||||
.idea
|
||||
|
||||
# extras
|
||||
cache/
|
||||
run_smart_scraper.py
|
||||
|
||||
@ -13,6 +13,7 @@ Some interesting ones are:
|
||||
- `loader_kwargs`: A dictionary with additional parameters to be passed to the `Loader` class, such as `proxy`.
|
||||
- `burr_kwargs`: A dictionary with additional parameters to enable the `Burr` graphical user interface.
|
||||
- `max_images`: The maximum number of images to be analyzed. Useful in `OmniScraperGraph` and `OmniSearchGraph`.
|
||||
- `cache_path`: The path where the cache files will be saved. If the path already exists, the cache will be loaded from it instead of being rebuilt.
|
||||
|
||||
.. _Burr:
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
sphinx==7.1.2
|
||||
furo==2024.5.6
|
||||
pytest==8.0.0
|
||||
burr[start]==0.19.1
|
||||
burr[start]==0.22.1
|
||||
@ -78,6 +78,7 @@ class AbstractGraph(ABC):
|
||||
self.headless = True if config is None else config.get(
|
||||
"headless", True)
|
||||
self.loader_kwargs = config.get("loader_kwargs", {})
|
||||
self.cache_path = config.get("cache_path", False)
|
||||
|
||||
# Create the graph
|
||||
self.graph = self._create_graph()
|
||||
@ -93,15 +94,13 @@ class AbstractGraph(ABC):
|
||||
else:
|
||||
set_verbosity_warning()
|
||||
|
||||
self.headless = True if config is None else config.get("headless", True)
|
||||
self.loader_kwargs = config.get("loader_kwargs", {})
|
||||
|
||||
common_params = {
|
||||
"headless": self.headless,
|
||||
"verbose": self.verbose,
|
||||
"loader_kwargs": self.loader_kwargs,
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
"embedder_model": self.embedder_model,
|
||||
"cache_path": self.cache_path,
|
||||
}
|
||||
|
||||
self.set_common_params(common_params, overwrite=False)
|
||||
|
||||
@ -3,6 +3,7 @@ RAGNode Module
|
||||
"""
|
||||
|
||||
from typing import List, Optional
|
||||
import os
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.retrievers import ContextualCompressionRetriever
|
||||
@ -50,6 +51,7 @@ class RAGNode(BaseNode):
|
||||
self.verbose = (
|
||||
False if node_config is None else node_config.get("verbose", False)
|
||||
)
|
||||
self.cache_path = node_config.get("cache_path", False)
|
||||
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
@ -98,7 +100,24 @@ class RAGNode(BaseNode):
|
||||
)
|
||||
embeddings = self.embedder_model
|
||||
|
||||
retriever = FAISS.from_documents(chunked_docs, embeddings).as_retriever()
|
||||
folder_name = self.node_config.get("cache_path", "cache")
|
||||
|
||||
if self.node_config.get("cache_path", False) and not os.path.exists(folder_name):
|
||||
index = FAISS.from_documents(chunked_docs, embeddings)
|
||||
os.makedirs(folder_name)
|
||||
index.save_local(folder_name)
|
||||
self.logger.info("--- (indexes saved to cache) ---")
|
||||
|
||||
elif self.node_config.get("cache_path", False) and os.path.exists(folder_name):
|
||||
index = FAISS.load_local(folder_path=folder_name,
|
||||
embeddings=embeddings,
|
||||
allow_dangerous_deserialization=True)
|
||||
self.logger.info("--- (indexes loaded from cache) ---")
|
||||
|
||||
else:
|
||||
index = FAISS.from_documents(chunked_docs, embeddings)
|
||||
|
||||
retriever = index.as_retriever()
|
||||
|
||||
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
|
||||
# similarity_threshold could be set, now k=20
|
||||
|
||||
Loading…
Reference in New Issue
Block a user