add empty nodes

2026-06-28 21:01:55 +08:00 · 2024-09-30 11:52:14 +02:00 · 2024-09-30 11:52:14 +02:00 · ea27b2499e
commit ea27b2499e
parent d14fb54548
8 changed files with 176 additions and 163 deletions
--- a/README.md
+++ b/README.md
@ -38,10 +38,9 @@ Additional dependecies can be added while installing the library:

 - <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.

-
-This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
-```bash
-pip install scrapegraphai[other-language-models]
+  This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
+  ```bash
+  pip install scrapegraphai[other-language-models]
  ```
 - <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.

@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models]
  pip install scrapegraphai[more-browser-options]
  ```

- <b>faiss Options</b>: this group includes faiss integration
+- <b>qdrant Options</b>: this group includes qdrant integration for RAGNode and DeepScraperGraph.

  ```bash
-  pip install scrapegraphai[faiss-cpu]
+  pip install scrapegraphai[qdrant]
  ```

 </details>


-
-### Installing "More Browser Options"
-
-This group includes an ocr scraper for websites
-```bash
-pip install scrapegraphai[screenshot_scraper]
-```
-
 ## 💻 Usage
 There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).

--- a/pyproject.toml
+++ b/pyproject.toml
@ -100,8 +100,9 @@ screenshot_scraper = [
 ]

 # Group 5: Faiss CPU
-faiss-cpu = [
-    "faiss-cpu>=1.8.0",
+qdrant = [
+    "qdrant-client>=1.11.3",
+    "fastembed>=0.3.6"
 ]

 [build-system]
--- a/scrapegraphai/nodes/init.py
+++ b/scrapegraphai/nodes/init.py
@ -28,3 +28,6 @@ from .html_analyzer_node import HtmlAnalyzerNode
 from .generate_code_node import GenerateCodeNode
 from .search_node_with_context import SearchLinksWithContext
 from .reasoning_node import ReasoningNode
+from .fetch_node_level_k import FetchNodelevelK
+from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
+from .description_node import DescriptionNode
--- a/scrapegraphai/nodes/description_node.py
+++ b/scrapegraphai/nodes/description_node.py
@ -0,0 +1,42 @@
+"""
+DescriptionNode Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class DescriptionNode(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "RAG".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "RAG",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+        self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        pass
--- a/scrapegraphai/nodes/fetch_node_level_k.py
+++ b/scrapegraphai/nodes/fetch_node_level_k.py
@ -0,0 +1,42 @@
+"""
+FetchNodelevelK Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class FetchNodelevelK(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "RAG".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "RAG",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+        self.cache_path = node_config.get("cache_path", False)
+
+    def execute(self, state: dict) -> dict:
+        pass
--- a/scrapegraphai/nodes/generate_answer_node_k_level.py
+++ b/scrapegraphai/nodes/generate_answer_node_k_level.py
@ -0,0 +1,50 @@
+"""
+GenerateAnswerNodeKLevel Module
+"""
+from typing import List, Optional
+from .base_node import BaseNode
+
+class GenerateAnswerNodeKLevel(BaseNode):
+    """
+    A node responsible for compressing the input tokens and storing the document
+    in a vector database for retrieval. Relevant chunks are stored in the state.
+
+    It allows scraping of big documents without exceeding the token limit of the language model.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "GANLK".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "GANLK",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+        self.embedder_model = node_config.get("embedder_model", None)
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+
+    def execute(self, state: dict) -> dict:
+        client = state["vectorial_db"]
+
+        answer = client.query(
+            collection_name="demo_collection",
+            query_text="This is a query document"
+        )
+
+        state["answer"] = answer
+
+        return state
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@ -26,7 +26,6 @@ from ..utils import (transform_schema,
 from .base_node import BaseNode
 from jsonschema import validate, ValidationError

-
 class GenerateCodeNode(BaseNode):
    """
    A node that generates Python code for a function that extracts data
@ -96,7 +95,7 @@ class GenerateCodeNode(BaseNode):
        Raises:
            KeyError: If the input keys are not found in the state, indicating
                      that the necessary information for generating an answer is missing.
-            RuntimeError: If the maximum number of iterations is 
+            RuntimeError: If the maximum number of iterations is
            reached without obtaining the desired code.
        """

@ -170,7 +169,7 @@ class GenerateCodeNode(BaseNode):
            self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
            state = self.semantic_comparison_loop(state)
            if state["errors"]["semantic"]:
-                continue      
+                continue
            break

        if state["iteration"] == self.max_iterations["overall"] and \
@ -195,9 +194,9 @@ class GenerateCodeNode(BaseNode):
            state["errors"]["syntax"] = [syntax_message]
            self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
            analysis = syntax_focused_analysis(state, self.llm_model)
-            self.logger.info(f"""--- (Regenerating Code 
+            self.logger.info(f"""--- (Regenerating Code
                             to fix the Error) ---""")
-            state["generated_code"] = syntax_focused_code_generation(state, 
+            state["generated_code"] = syntax_focused_code_generation(state,
                                                                     analysis, self.llm_model)
            state["generated_code"] = extract_code(state["generated_code"])
        return state
@ -217,14 +216,14 @@ class GenerateCodeNode(BaseNode):
            self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
            analysis = execution_focused_analysis(state, self.llm_model)
            self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
-            state["generated_code"] = execution_focused_code_generation(state, 
+            state["generated_code"] = execution_focused_code_generation(state,
                                                                        analysis, self.llm_model)
            state["generated_code"] = extract_code(state["generated_code"])
        return state

    def validation_reasoning_loop(self, state: dict) -> dict:
        for _ in range(self.max_iterations["validation"]):
-            validation, errors = self.validate_dict(state["execution_result"], 
+            validation, errors = self.validate_dict(state["execution_result"],
                                                    self.output_schema.schema())
            if validation:
                state["errors"]["validation"] = []
@ -240,7 +239,7 @@ class GenerateCodeNode(BaseNode):

    def semantic_comparison_loop(self, state: dict) -> dict:
        for _ in range(self.max_iterations["semantic"]):
-            comparison_result = self.semantic_comparison(state["execution_result"], 
+            comparison_result = self.semantic_comparison(state["execution_result"],
                                                         state["reference_answer"])
            if comparison_result["are_semantically_equivalent"]:
                state["errors"]["semantic"] = []
@ -342,7 +341,7 @@ class GenerateCodeNode(BaseNode):
            if not extract_data:
                raise NameError("Function 'extract_data' not found in the generated code.")

-            result = extract_data(self.raw_html)            
+            result = extract_data(self.raw_html)
            return True, result
        except Exception as e:
            return False, f"Error during execution: {str(e)}"
@ -357,5 +356,5 @@ class GenerateCodeNode(BaseNode):
            validate(instance=data, schema=schema)
            return True, None
        except ValidationError as e:
-            errors = e.errors()
+            errors = [e.message]
            return False, errors
--- a/scrapegraphai/nodes/rag_node.py
+++ b/scrapegraphai/nodes/rag_node.py
@ -1,29 +1,9 @@
 """
 RAGNode Module
 """
-import os
-import sys
 from typing import List, Optional
-from langchain.docstore.document import Document
-from langchain.retrievers import ContextualCompressionRetriever
-from langchain.retrievers.document_compressors import (
-    DocumentCompressorPipeline,
-    EmbeddingsFilter,
-)
-from langchain_community.document_transformers import EmbeddingsRedundantFilter
-from langchain_community.vectorstores import FAISS
-from langchain_community.chat_models import ChatOllama
-from langchain_community.embeddings import OllamaEmbeddings
-from langchain_aws import BedrockEmbeddings, ChatBedrock
-from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
-from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI
-from ..utils.logging import get_logger
 from .base_node import BaseNode
-from ..helpers import models_tokens
-from ..models import DeepSeek
-
-optional_modules = {"langchain_anthropic", "langchain_fireworks",
-                    "langchain_groq", "langchain_google_vertexai"}
+from qdrant_client import QdrantClient

 class RAGNode(BaseNode):
    """
@ -34,7 +14,6 @@ class RAGNode(BaseNode):

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
@ -58,125 +37,31 @@ class RAGNode(BaseNode):
        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )
-        self.cache_path = node_config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
-        # Execution logic
-        pass

-    def _create_default_embedder(self, llm_config=None) -> object:
-        """
-        Create an embedding model instance based on the chosen llm model.
-
-        Returns:
-            object: An instance of the embedding model client.
-
-        Raises:
-            ValueError: If the model is not supported.
-        """
-
-        if isinstance(self.llm_model, ChatGoogleGenerativeAI):
-            return GoogleGenerativeAIEmbeddings(
-                google_api_key=llm_config["api_key"], model="models/embedding-001"
-            )
-        if isinstance(self.llm_model, ChatOpenAI):
-            return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key,
-                                    base_url=self.llm_model.openai_api_base)
-        elif isinstance(self.llm_model, DeepSeek):
-            return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
-        elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
-            return self.llm_model
-        elif isinstance(self.llm_model, AzureChatOpenAI):
-            return AzureOpenAIEmbeddings()
-        elif isinstance(self.llm_model, ChatOllama):
-            params = self.llm_model._lc_kwargs
-            params.pop("streaming", None)
-            params.pop("temperature", None)
-            return OllamaEmbeddings(**params)
-        elif isinstance(self.llm_model, ChatBedrock):
-            return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id)
-        elif all(key in sys.modules for key in optional_modules):
-            if isinstance(self.llm_model, ChatFireworks):
-                from langchain_fireworks import FireworksEmbeddings
-                return FireworksEmbeddings(model=self.llm_model.model_name)
-            if isinstance(self.llm_model, ChatNVIDIA):
-                from langchain_nvidia import NVIDIAEmbeddings
-                return NVIDIAEmbeddings(model=self.llm_model.model_name)
-            if isinstance(self.llm_model, ChatHuggingFace):
-                from langchain_huggingface import HuggingFaceEmbeddings
-                return HuggingFaceEmbeddings(model=self.llm_model.model)
-            if isinstance(self.llm_model, ChatVertexAI):
-                from langchain_vertexai import VertexAIEmbeddings
-                return VertexAIEmbeddings()
+        if self.node_config.get("client_type") == "memory":
+            client = QdrantClient(":memory:")
+        elif self.node_config.get("client_type") == "local_db":
+            client = QdrantClient(path="path/to/db")
+        elif self.node_config.get("client_type") == "image":
+            client = QdrantClient(url="http://localhost:6333")
        else:
-            raise ValueError("Embedding Model missing or not supported")
+            raise ValueError("client_type provided not correct")

-    def _create_embedder(self, embedder_config: dict) -> object:
-        """
-        Create an embedding model instance based on the configuration provided.
+        docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"]
+        metadata = [
+            {"source": "Langchain-docs"},
+            {"source": "Linkedin-docs"},
+        ]
+        ids = [42, 2]

-        Args:
-            embedder_config (dict): Configuration parameters for the embedding model.
+        client.add(
+            collection_name="demo_collection",
+            documents=docs,
+            metadata=metadata,
+            ids=ids
+        )

-        Returns:
-            object: An instance of the embedding model client.
-
-        Raises:
-            KeyError: If the model is not supported.
-        """
-        embedder_params = {**embedder_config}
-        if "model_instance" in embedder_config:
-            return embedder_params["model_instance"]
-        if "openai" in embedder_params["model"]:
-            return OpenAIEmbeddings(api_key=embedder_params["api_key"])
-        if "azure" in embedder_params["model"]:
-            return AzureOpenAIEmbeddings()
-        if "ollama" in embedder_params["model"]:
-            embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
-            try:
-                models_tokens["ollama"][embedder_params["model"]]
-            except KeyError as exc:
-                raise KeyError("Model not supported") from exc
-            return OllamaEmbeddings(**embedder_params)
-        if "gemini" in embedder_params["model"]:
-            try:
-                models_tokens["gemini"][embedder_params["model"]]
-            except KeyError as exc:
-                raise KeyError("Model not supported") from exc
-            return GoogleGenerativeAIEmbeddings(model=embedder_params["model"])
-        if "bedrock" in embedder_params["model"]:
-            embedder_params["model"] = embedder_params["model"].split("/")[-1]
-            client = embedder_params.get("client", None)
-            try:
-                models_tokens["bedrock"][embedder_params["model"]]
-            except KeyError as exc:
-                raise KeyError("Model not supported") from exc
-            return BedrockEmbeddings(client=client, model_id=embedder_params["model"])
-        if all(key in sys.modules for key in optional_modules):
-            if "hugging_face" in embedder_params["model"]:
-                from langchain_huggingface import HuggingFaceEmbeddings
-                embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
-                try:
-                    models_tokens["hugging_face"][embedder_params["model"]]
-                except KeyError as exc:
-                    raise KeyError("Model not supported") from exc
-                return HuggingFaceEmbeddings(model=embedder_params["model"])
-            elif "fireworks" in embedder_params["model"]:
-                from langchain_fireworks import FireworksEmbeddings
-                embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
-                try:
-                    models_tokens["fireworks"][embedder_params["model"]]
-                except KeyError as exc:
-                    raise KeyError("Model not supported") from exc
-                return FireworksEmbeddings(model=embedder_params["model"])
-            elif "nvidia" in embedder_params["model"]:
-                from langchain_nvidia import NVIDIAEmbeddings
-                embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
-                try:
-                    models_tokens["nvidia"][embedder_params["model"]]
-                except KeyError as exc:
-                    raise KeyError("Model not supported") from exc
-                return NVIDIAEmbeddings(model=embedder_params["model"],
-                                        nvidia_api_key=embedder_params["api_key"])
-
-        raise ValueError("Model provided by the configuration not supported")
+        state["vectorial_db"] = client
+        return state