From ea27b2499ef5dccc46aab8bc7cdc987cfc6e6c20 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 30 Sep 2024 11:52:14 +0200 Subject: [PATCH] add empty nodes --- README.md | 19 +-- pyproject.toml | 5 +- scrapegraphai/nodes/__init__.py | 3 + scrapegraphai/nodes/description_node.py | 42 +++++ scrapegraphai/nodes/fetch_node_level_k.py | 42 +++++ .../nodes/generate_answer_node_k_level.py | 50 ++++++ scrapegraphai/nodes/generate_code_node.py | 19 +-- scrapegraphai/nodes/rag_node.py | 159 +++--------------- 8 files changed, 176 insertions(+), 163 deletions(-) create mode 100644 scrapegraphai/nodes/description_node.py create mode 100644 scrapegraphai/nodes/fetch_node_level_k.py create mode 100644 scrapegraphai/nodes/generate_answer_node_k_level.py diff --git a/README.md b/README.md index cf437203..51bc3fa9 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,9 @@ Additional dependecies can be added while installing the library: - More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints. - -This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints. -```bash -pip install scrapegraphai[other-language-models] + This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints. + ```bash + pip install scrapegraphai[other-language-models] ``` - Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz. @@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models] pip install scrapegraphai[more-browser-options] ``` -- faiss Options: this group includes faiss integration +- qdrant Options: this group includes qdrant integration for RAGNode and DeepScraperGraph. 
```bash - pip install scrapegraphai[faiss-cpu] + pip install scrapegraphai[qdrant] ``` - -### Installing "More Browser Options" - -This group includes an ocr scraper for websites -```bash -pip install scrapegraphai[screenshot_scraper] -``` - ## 💻 Usage There are multiple standard scraping pipelines that can be used to extract information from a website (or local file). diff --git a/pyproject.toml b/pyproject.toml index 26b1fdb7..dde97395 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,8 +100,9 @@ screenshot_scraper = [ ] # Group 5: Faiss CPU -faiss-cpu = [ - "faiss-cpu>=1.8.0", +qdrant = [ + "qdrant-client>=1.11.3", + "fastembed>=0.3.6" ] [build-system] diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index ec16c48e..e5fafb87 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -28,3 +28,6 @@ from .html_analyzer_node import HtmlAnalyzerNode from .generate_code_node import GenerateCodeNode from .search_node_with_context import SearchLinksWithContext from .reasoning_node import ReasoningNode +from .fetch_node_level_k import FetchNodelevelK +from .generate_answer_node_k_level import GenerateAnswerNodeKLevel +from .description_node import DescriptionNode diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py new file mode 100644 index 00000000..49ab941f --- /dev/null +++ b/scrapegraphai/nodes/description_node.py @@ -0,0 +1,42 @@ +""" +DescriptionNode Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class DescriptionNode(BaseNode): + """ + A node responsible for compressing the input tokens and storing the document + in a vector database for retrieval. Relevant chunks are stored in the state. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. 
+ verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "RAG". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "RAG", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py new file mode 100644 index 00000000..18a0d435 --- /dev/null +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -0,0 +1,42 @@ +""" +FetchNodelevelK Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class FetchNodelevelK(BaseNode): + """ + A node responsible for compressing the input tokens and storing the document + in a vector database for retrieval. Relevant chunks are stored in the state. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. 
+ node_name (str): The unique identifier name for the node, defaulting to "RAG". + """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "RAG", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + self.cache_path = node_config.get("cache_path", False) + + def execute(self, state: dict) -> dict: + pass diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py new file mode 100644 index 00000000..1d4cdb4d --- /dev/null +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -0,0 +1,50 @@ +""" +GenerateAnswerNodeKLevel Module +""" +from typing import List, Optional +from .base_node import BaseNode + +class GenerateAnswerNodeKLevel(BaseNode): + """ + A node responsible for compressing the input tokens and storing the document + in a vector database for retrieval. Relevant chunks are stored in the state. + + It allows scraping of big documents without exceeding the token limit of the language model. + + Attributes: + llm_model: An instance of a language model client, configured for generating answers. + verbose (bool): A flag indicating whether to show print statements during execution. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (dict): Additional configuration for the node. + node_name (str): The unique identifier name for the node, defaulting to "GANLK". 
+ """ + + def __init__( + self, + input: str, + output: List[str], + node_config: Optional[dict] = None, + node_name: str = "GANLK", + ): + super().__init__(node_name, "node", input, output, 2, node_config) + + self.llm_model = node_config["llm_model"] + self.embedder_model = node_config.get("embedder_model", None) + self.verbose = ( + False if node_config is None else node_config.get("verbose", False) + ) + + def execute(self, state: dict) -> dict: + client = state["vectorial_db"] + + answer = client.query( + collection_name="demo_collection", + query_text="This is a query document" + ) + + state["answer"] = answer + + return state diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index cc72aaf4..746b10a5 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -26,7 +26,6 @@ from ..utils import (transform_schema, from .base_node import BaseNode from jsonschema import validate, ValidationError - class GenerateCodeNode(BaseNode): """ A node that generates Python code for a function that extracts data @@ -96,7 +95,7 @@ class GenerateCodeNode(BaseNode): Raises: KeyError: If the input keys are not found in the state, indicating that the necessary information for generating an answer is missing. - RuntimeError: If the maximum number of iterations is + RuntimeError: If the maximum number of iterations is reached without obtaining the desired code. 
""" @@ -170,7 +169,7 @@ class GenerateCodeNode(BaseNode): self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---") state = self.semantic_comparison_loop(state) if state["errors"]["semantic"]: - continue + continue break if state["iteration"] == self.max_iterations["overall"] and \ @@ -195,9 +194,9 @@ class GenerateCodeNode(BaseNode): state["errors"]["syntax"] = [syntax_message] self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---") analysis = syntax_focused_analysis(state, self.llm_model) - self.logger.info(f"""--- (Regenerating Code + self.logger.info(f"""--- (Regenerating Code to fix the Error) ---""") - state["generated_code"] = syntax_focused_code_generation(state, + state["generated_code"] = syntax_focused_code_generation(state, analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state @@ -217,14 +216,14 @@ class GenerateCodeNode(BaseNode): self.logger.info(f"--- (Code Execution Error: {execution_result}) ---") analysis = execution_focused_analysis(state, self.llm_model) self.logger.info(f"--- (Regenerating Code to fix the Error) ---") - state["generated_code"] = execution_focused_code_generation(state, + state["generated_code"] = execution_focused_code_generation(state, analysis, self.llm_model) state["generated_code"] = extract_code(state["generated_code"]) return state def validation_reasoning_loop(self, state: dict) -> dict: for _ in range(self.max_iterations["validation"]): - validation, errors = self.validate_dict(state["execution_result"], + validation, errors = self.validate_dict(state["execution_result"], self.output_schema.schema()) if validation: state["errors"]["validation"] = [] @@ -240,7 +239,7 @@ class GenerateCodeNode(BaseNode): def semantic_comparison_loop(self, state: dict) -> dict: for _ in range(self.max_iterations["semantic"]): - comparison_result = self.semantic_comparison(state["execution_result"], + comparison_result = 
self.semantic_comparison(state["execution_result"], state["reference_answer"]) if comparison_result["are_semantically_equivalent"]: state["errors"]["semantic"] = [] @@ -342,7 +341,7 @@ class GenerateCodeNode(BaseNode): if not extract_data: raise NameError("Function 'extract_data' not found in the generated code.") - result = extract_data(self.raw_html) + result = extract_data(self.raw_html) return True, result except Exception as e: return False, f"Error during execution: {str(e)}" @@ -357,5 +356,5 @@ class GenerateCodeNode(BaseNode): validate(instance=data, schema=schema) return True, None except ValidationError as e: - errors = e.errors() + errors = [e.message] return False, errors diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 1174beee..c92e40f0 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -1,29 +1,9 @@ """ RAGNode Module """ -import os -import sys from typing import List, Optional -from langchain.docstore.document import Document -from langchain.retrievers import ContextualCompressionRetriever -from langchain.retrievers.document_compressors import ( - DocumentCompressorPipeline, - EmbeddingsFilter, -) -from langchain_community.document_transformers import EmbeddingsRedundantFilter -from langchain_community.vectorstores import FAISS -from langchain_community.chat_models import ChatOllama -from langchain_community.embeddings import OllamaEmbeddings -from langchain_aws import BedrockEmbeddings, ChatBedrock -from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI -from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI -from ..utils.logging import get_logger from .base_node import BaseNode -from ..helpers import models_tokens -from ..models import DeepSeek - -optional_modules = {"langchain_anthropic", "langchain_fireworks", - "langchain_groq", "langchain_google_vertexai"} +from qdrant_client import QdrantClient 
class RAGNode(BaseNode): """ @@ -34,7 +14,6 @@ class RAGNode(BaseNode): Attributes: llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. Args: @@ -58,125 +37,31 @@ class RAGNode(BaseNode): self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.cache_path = node_config.get("cache_path", False) def execute(self, state: dict) -> dict: - # Execution logic - pass - def _create_default_embedder(self, llm_config=None) -> object: - """ - Create an embedding model instance based on the chosen llm model. - - Returns: - object: An instance of the embedding model client. - - Raises: - ValueError: If the model is not supported. - """ - - if isinstance(self.llm_model, ChatGoogleGenerativeAI): - return GoogleGenerativeAIEmbeddings( - google_api_key=llm_config["api_key"], model="models/embedding-001" - ) - if isinstance(self.llm_model, ChatOpenAI): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, - base_url=self.llm_model.openai_api_base) - elif isinstance(self.llm_model, DeepSeek): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - elif isinstance(self.llm_model, AzureOpenAIEmbeddings): - return self.llm_model - elif isinstance(self.llm_model, AzureChatOpenAI): - return AzureOpenAIEmbeddings() - elif isinstance(self.llm_model, ChatOllama): - params = self.llm_model._lc_kwargs - params.pop("streaming", None) - params.pop("temperature", None) - return OllamaEmbeddings(**params) - elif isinstance(self.llm_model, ChatBedrock): - return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) - elif all(key in sys.modules for key in optional_modules): - if isinstance(self.llm_model, ChatFireworks): - from langchain_fireworks import FireworksEmbeddings - return 
FireworksEmbeddings(model=self.llm_model.model_name) - if isinstance(self.llm_model, ChatNVIDIA): - from langchain_nvidia import NVIDIAEmbeddings - return NVIDIAEmbeddings(model=self.llm_model.model_name) - if isinstance(self.llm_model, ChatHuggingFace): - from langchain_huggingface import HuggingFaceEmbeddings - return HuggingFaceEmbeddings(model=self.llm_model.model) - if isinstance(self.llm_model, ChatVertexAI): - from langchain_vertexai import VertexAIEmbeddings - return VertexAIEmbeddings() + if self.node_config.get("client_type") == "memory": + client = QdrantClient(":memory:") + elif self.node_config.get("client_type") == "local_db": + client = QdrantClient(path="path/to/db") + elif self.node_config.get("client_type") == "image": + client = QdrantClient(url="http://localhost:6333") else: - raise ValueError("Embedding Model missing or not supported") + raise ValueError("client_type provided not correct") - def _create_embedder(self, embedder_config: dict) -> object: - """ - Create an embedding model instance based on the configuration provided. + docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"] + metadata = [ + {"source": "Langchain-docs"}, + {"source": "Linkedin-docs"}, + ] + ids = [42, 2] - Args: - embedder_config (dict): Configuration parameters for the embedding model. + client.add( + collection_name="demo_collection", + documents=docs, + metadata=metadata, + ids=ids + ) - Returns: - object: An instance of the embedding model client. - - Raises: - KeyError: If the model is not supported. 
- """ - embedder_params = {**embedder_config} - if "model_instance" in embedder_config: - return embedder_params["model_instance"] - if "openai" in embedder_params["model"]: - return OpenAIEmbeddings(api_key=embedder_params["api_key"]) - if "azure" in embedder_params["model"]: - return AzureOpenAIEmbeddings() - if "ollama" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["ollama"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OllamaEmbeddings(**embedder_params) - if "gemini" in embedder_params["model"]: - try: - models_tokens["gemini"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) - if "bedrock" in embedder_params["model"]: - embedder_params["model"] = embedder_params["model"].split("/")[-1] - client = embedder_params.get("client", None) - try: - models_tokens["bedrock"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) - if all(key in sys.modules for key in optional_modules): - if "hugging_face" in embedder_params["model"]: - from langchain_huggingface import HuggingFaceEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["hugging_face"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return HuggingFaceEmbeddings(model=embedder_params["model"]) - elif "fireworks" in embedder_params["model"]: - from langchain_fireworks import FireworksEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["fireworks"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - 
return FireworksEmbeddings(model=embedder_params["model"]) - elif "nvidia" in embedder_params["model"]: - from langchain_nvidia import NVIDIAEmbeddings - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["nvidia"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return NVIDIAEmbeddings(model=embedder_params["model"], - nvidia_api_key=embedder_params["api_key"]) - - raise ValueError("Model provided by the configuration not supported") + state["vectorial_db"] = client + return state