diff --git a/README.md b/README.md
index cf437203..51bc3fa9 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,9 @@ Additional dependecies can be added while installing the library:
- More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
-
-This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
-```bash
-pip install scrapegraphai[other-language-models]
+ This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
+ ```bash
+ pip install scrapegraphai[other-language-models]
```
- Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz.
@@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models]
pip install scrapegraphai[more-browser-options]
```
-- faiss Options: this group includes faiss integration
+- Qdrant Options: this group includes Qdrant integration for RAGNode and DeepScraperGraph.
```bash
- pip install scrapegraphai[faiss-cpu]
+ pip install scrapegraphai[qdrant]
```
-
-### Installing "More Browser Options"
-
-This group includes an ocr scraper for websites
-```bash
-pip install scrapegraphai[screenshot_scraper]
-```
-
## 💻 Usage
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
diff --git a/pyproject.toml b/pyproject.toml
index 26b1fdb7..dde97395 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,8 +100,9 @@ screenshot_scraper = [
]
# Group 5: Faiss CPU
-faiss-cpu = [
- "faiss-cpu>=1.8.0",
+qdrant = [
+ "qdrant-client>=1.11.3",
+ "fastembed>=0.3.6"
]
[build-system]
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index ec16c48e..e5fafb87 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -28,3 +28,6 @@ from .html_analyzer_node import HtmlAnalyzerNode
from .generate_code_node import GenerateCodeNode
from .search_node_with_context import SearchLinksWithContext
from .reasoning_node import ReasoningNode
+from .fetch_node_level_k import FetchNodelevelK
+from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
+from .description_node import DescriptionNode
diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py
new file mode 100644
index 00000000..49ab941f
--- /dev/null
+++ b/scrapegraphai/nodes/description_node.py
@@ -0,0 +1,42 @@
+"""
+DescriptionNode Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class DescriptionNode(BaseNode):
+ """
+ A node responsible for compressing the input tokens and storing the document
+ in a vector database for retrieval. Relevant chunks are stored in the state.
+
+ It allows scraping of big documents without exceeding the token limit of the language model.
+
+ Attributes:
+ llm_model: An instance of a language model client, configured for generating answers.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+ node_name (str): The unique identifier name for the node, defaulting to "RAG".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "RAG",
+ ):
+ super().__init__(node_name, "node", input, output, 2, node_config)
+
+ self.llm_model = node_config["llm_model"]
+ self.embedder_model = node_config.get("embedder_model", None)
+ self.verbose = (
+ False if node_config is None else node_config.get("verbose", False)
+ )
+ self.cache_path = node_config.get("cache_path", False)
+
+ def execute(self, state: dict) -> dict:
+ pass
diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
new file mode 100644
index 00000000..18a0d435
--- /dev/null
+++ b/scrapegraphai/nodes/fetch_node_level_k.py
@@ -0,0 +1,42 @@
+"""
+FetchNodelevelK Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class FetchNodelevelK(BaseNode):
+ """
+ A node responsible for compressing the input tokens and storing the document
+ in a vector database for retrieval. Relevant chunks are stored in the state.
+
+ It allows scraping of big documents without exceeding the token limit of the language model.
+
+ Attributes:
+ llm_model: An instance of a language model client, configured for generating answers.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+ node_name (str): The unique identifier name for the node, defaulting to "RAG".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "RAG",
+ ):
+ super().__init__(node_name, "node", input, output, 2, node_config)
+
+ self.llm_model = node_config["llm_model"]
+ self.embedder_model = node_config.get("embedder_model", None)
+ self.verbose = (
+ False if node_config is None else node_config.get("verbose", False)
+ )
+ self.cache_path = node_config.get("cache_path", False)
+
+ def execute(self, state: dict) -> dict:
+ pass
diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py
new file mode 100644
index 00000000..1d4cdb4d
--- /dev/null
+++ b/scrapegraphai/nodes/generate_answer_node_k_level.py
@@ -0,0 +1,50 @@
+"""
+GenerateAnswerNodeKLevel Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class GenerateAnswerNodeKLevel(BaseNode):
+ """
+ A node that queries the vector database client stored in the state under
+ "vectorial_db" (currently a hard-coded demo query) and stores the result under "answer".
+
+ It allows scraping of big documents without exceeding the token limit of the language model.
+
+ Attributes:
+ llm_model: An instance of a language model client, configured for generating answers.
+ verbose (bool): A flag indicating whether to show print statements during execution.
+
+ Args:
+ input (str): Boolean expression defining the input keys needed from the state.
+ output (List[str]): List of output keys to be updated in the state.
+ node_config (dict): Additional configuration for the node.
+ node_name (str): The unique identifier name for the node, defaulting to "GANLK".
+ """
+
+ def __init__(
+ self,
+ input: str,
+ output: List[str],
+ node_config: Optional[dict] = None,
+ node_name: str = "GANLK",
+ ):
+ super().__init__(node_name, "node", input, output, 2, node_config)
+
+ self.llm_model = node_config["llm_model"]
+ self.embedder_model = node_config.get("embedder_model", None)
+ self.verbose = (
+ False if node_config is None else node_config.get("verbose", False)
+ )
+
+ def execute(self, state: dict) -> dict:
+ client = state["vectorial_db"]
+
+ answer = client.query(
+ collection_name="demo_collection",
+ query_text="This is a query document"
+ )
+
+ state["answer"] = answer
+
+ return state
diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index cc72aaf4..746b10a5 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -26,7 +26,6 @@ from ..utils import (transform_schema,
from .base_node import BaseNode
from jsonschema import validate, ValidationError
-
class GenerateCodeNode(BaseNode):
"""
A node that generates Python code for a function that extracts data
@@ -96,7 +95,7 @@ class GenerateCodeNode(BaseNode):
Raises:
KeyError: If the input keys are not found in the state, indicating
that the necessary information for generating an answer is missing.
- RuntimeError: If the maximum number of iterations is
+ RuntimeError: If the maximum number of iterations is
reached without obtaining the desired code.
"""
@@ -170,7 +169,7 @@ class GenerateCodeNode(BaseNode):
self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
state = self.semantic_comparison_loop(state)
if state["errors"]["semantic"]:
- continue
+ continue
break
if state["iteration"] == self.max_iterations["overall"] and \
@@ -195,9 +194,9 @@ class GenerateCodeNode(BaseNode):
state["errors"]["syntax"] = [syntax_message]
self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
analysis = syntax_focused_analysis(state, self.llm_model)
- self.logger.info(f"""--- (Regenerating Code
+ self.logger.info(f"""--- (Regenerating Code
to fix the Error) ---""")
- state["generated_code"] = syntax_focused_code_generation(state,
+ state["generated_code"] = syntax_focused_code_generation(state,
analysis, self.llm_model)
state["generated_code"] = extract_code(state["generated_code"])
return state
@@ -217,14 +216,14 @@ class GenerateCodeNode(BaseNode):
self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
analysis = execution_focused_analysis(state, self.llm_model)
self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
- state["generated_code"] = execution_focused_code_generation(state,
+ state["generated_code"] = execution_focused_code_generation(state,
analysis, self.llm_model)
state["generated_code"] = extract_code(state["generated_code"])
return state
def validation_reasoning_loop(self, state: dict) -> dict:
for _ in range(self.max_iterations["validation"]):
- validation, errors = self.validate_dict(state["execution_result"],
+ validation, errors = self.validate_dict(state["execution_result"],
self.output_schema.schema())
if validation:
state["errors"]["validation"] = []
@@ -240,7 +239,7 @@ class GenerateCodeNode(BaseNode):
def semantic_comparison_loop(self, state: dict) -> dict:
for _ in range(self.max_iterations["semantic"]):
- comparison_result = self.semantic_comparison(state["execution_result"],
+ comparison_result = self.semantic_comparison(state["execution_result"],
state["reference_answer"])
if comparison_result["are_semantically_equivalent"]:
state["errors"]["semantic"] = []
@@ -342,7 +341,7 @@ class GenerateCodeNode(BaseNode):
if not extract_data:
raise NameError("Function 'extract_data' not found in the generated code.")
- result = extract_data(self.raw_html)
+ result = extract_data(self.raw_html)
return True, result
except Exception as e:
return False, f"Error during execution: {str(e)}"
@@ -357,5 +356,5 @@ class GenerateCodeNode(BaseNode):
validate(instance=data, schema=schema)
return True, None
except ValidationError as e:
- errors = e.errors()
+ errors = [e.message]
return False, errors
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
index 1174beee..c92e40f0 100644
--- a/scrapegraphai/nodes/rag_node.py
+++ b/scrapegraphai/nodes/rag_node.py
@@ -1,29 +1,9 @@
"""
RAGNode Module
"""
-import os
-import sys
from typing import List, Optional
-from langchain.docstore.document import Document
-from langchain.retrievers import ContextualCompressionRetriever
-from langchain.retrievers.document_compressors import (
- DocumentCompressorPipeline,
- EmbeddingsFilter,
-)
-from langchain_community.document_transformers import EmbeddingsRedundantFilter
-from langchain_community.vectorstores import FAISS
-from langchain_community.chat_models import ChatOllama
-from langchain_community.embeddings import OllamaEmbeddings
-from langchain_aws import BedrockEmbeddings, ChatBedrock
-from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
-from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI
-from ..utils.logging import get_logger
from .base_node import BaseNode
-from ..helpers import models_tokens
-from ..models import DeepSeek
-
-optional_modules = {"langchain_anthropic", "langchain_fireworks",
- "langchain_groq", "langchain_google_vertexai"}
+from qdrant_client import QdrantClient
class RAGNode(BaseNode):
"""
@@ -34,7 +14,6 @@ class RAGNode(BaseNode):
Attributes:
llm_model: An instance of a language model client, configured for generating answers.
- embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
Args:
@@ -58,125 +37,31 @@ class RAGNode(BaseNode):
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
- self.cache_path = node_config.get("cache_path", False)
def execute(self, state: dict) -> dict:
- # Execution logic
- pass
- def _create_default_embedder(self, llm_config=None) -> object:
- """
- Create an embedding model instance based on the chosen llm model.
-
- Returns:
- object: An instance of the embedding model client.
-
- Raises:
- ValueError: If the model is not supported.
- """
-
- if isinstance(self.llm_model, ChatGoogleGenerativeAI):
- return GoogleGenerativeAIEmbeddings(
- google_api_key=llm_config["api_key"], model="models/embedding-001"
- )
- if isinstance(self.llm_model, ChatOpenAI):
- return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key,
- base_url=self.llm_model.openai_api_base)
- elif isinstance(self.llm_model, DeepSeek):
- return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
- elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
- return self.llm_model
- elif isinstance(self.llm_model, AzureChatOpenAI):
- return AzureOpenAIEmbeddings()
- elif isinstance(self.llm_model, ChatOllama):
- params = self.llm_model._lc_kwargs
- params.pop("streaming", None)
- params.pop("temperature", None)
- return OllamaEmbeddings(**params)
- elif isinstance(self.llm_model, ChatBedrock):
- return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id)
- elif all(key in sys.modules for key in optional_modules):
- if isinstance(self.llm_model, ChatFireworks):
- from langchain_fireworks import FireworksEmbeddings
- return FireworksEmbeddings(model=self.llm_model.model_name)
- if isinstance(self.llm_model, ChatNVIDIA):
- from langchain_nvidia import NVIDIAEmbeddings
- return NVIDIAEmbeddings(model=self.llm_model.model_name)
- if isinstance(self.llm_model, ChatHuggingFace):
- from langchain_huggingface import HuggingFaceEmbeddings
- return HuggingFaceEmbeddings(model=self.llm_model.model)
- if isinstance(self.llm_model, ChatVertexAI):
- from langchain_vertexai import VertexAIEmbeddings
- return VertexAIEmbeddings()
+ if self.node_config.get("client_type") == "memory":
+ client = QdrantClient(":memory:")
+ elif self.node_config.get("client_type") == "local_db":
+ client = QdrantClient(path="path/to/db")
+ elif self.node_config.get("client_type") == "image":
+ client = QdrantClient(url="http://localhost:6333")
else:
- raise ValueError("Embedding Model missing or not supported")
+ raise ValueError("client_type provided not correct")
- def _create_embedder(self, embedder_config: dict) -> object:
- """
- Create an embedding model instance based on the configuration provided.
+ docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"]
+ metadata = [
+ {"source": "Langchain-docs"},
+ {"source": "Linkedin-docs"},
+ ]
+ ids = [42, 2]
- Args:
- embedder_config (dict): Configuration parameters for the embedding model.
+ client.add(
+ collection_name="demo_collection",
+ documents=docs,
+ metadata=metadata,
+ ids=ids
+ )
- Returns:
- object: An instance of the embedding model client.
-
- Raises:
- KeyError: If the model is not supported.
- """
- embedder_params = {**embedder_config}
- if "model_instance" in embedder_config:
- return embedder_params["model_instance"]
- if "openai" in embedder_params["model"]:
- return OpenAIEmbeddings(api_key=embedder_params["api_key"])
- if "azure" in embedder_params["model"]:
- return AzureOpenAIEmbeddings()
- if "ollama" in embedder_params["model"]:
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["ollama"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return OllamaEmbeddings(**embedder_params)
- if "gemini" in embedder_params["model"]:
- try:
- models_tokens["gemini"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return GoogleGenerativeAIEmbeddings(model=embedder_params["model"])
- if "bedrock" in embedder_params["model"]:
- embedder_params["model"] = embedder_params["model"].split("/")[-1]
- client = embedder_params.get("client", None)
- try:
- models_tokens["bedrock"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return BedrockEmbeddings(client=client, model_id=embedder_params["model"])
- if all(key in sys.modules for key in optional_modules):
- if "hugging_face" in embedder_params["model"]:
- from langchain_huggingface import HuggingFaceEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["hugging_face"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return HuggingFaceEmbeddings(model=embedder_params["model"])
- elif "fireworks" in embedder_params["model"]:
- from langchain_fireworks import FireworksEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["fireworks"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return FireworksEmbeddings(model=embedder_params["model"])
- elif "nvidia" in embedder_params["model"]:
- from langchain_nvidia import NVIDIAEmbeddings
- embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
- try:
- models_tokens["nvidia"][embedder_params["model"]]
- except KeyError as exc:
- raise KeyError("Model not supported") from exc
- return NVIDIAEmbeddings(model=embedder_params["model"],
- nvidia_api_key=embedder_params["api_key"])
-
- raise ValueError("Model provided by the configuration not supported")
+ state["vectorial_db"] = client
+ return state