add empty nodes

This commit is contained in:
Marco Vinciguerra 2024-09-30 11:52:14 +02:00
parent d14fb54548
commit ea27b2499e
8 changed files with 176 additions and 163 deletions

View File

@ -38,10 +38,9 @@ Additional dependencies can be added while installing the library:
- <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
```bash
pip install scrapegraphai[other-language-models]
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
```bash
pip install scrapegraphai[other-language-models]
```
- <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.
@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models]
pip install scrapegraphai[more-browser-options]
```
- <b>faiss Options</b>: this group includes faiss integration
- <b>qdrant Options</b>: this group includes qdrant integration for RAGNode and DeepScraperGraph.
```bash
pip install scrapegraphai[faiss-cpu]
pip install scrapegraphai[qdrant]
```
</details>
### Installing "More Browser Options"
This group includes an ocr scraper for websites
```bash
pip install scrapegraphai[screenshot_scraper]
```
## 💻 Usage
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).

View File

@ -100,8 +100,9 @@ screenshot_scraper = [
]
# Group 5: Faiss CPU
faiss-cpu = [
"faiss-cpu>=1.8.0",
qdrant = [
"qdrant-client>=1.11.3",
"fastembed>=0.3.6"
]
[build-system]

View File

@ -28,3 +28,6 @@ from .html_analyzer_node import HtmlAnalyzerNode
from .generate_code_node import GenerateCodeNode
from .search_node_with_context import SearchLinksWithContext
from .reasoning_node import ReasoningNode
from .fetch_node_level_k import FetchNodelevelK
from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
from .description_node import DescriptionNode

View File

@ -0,0 +1,42 @@
"""
DescriptionNode Module
"""
from typing import List, Optional
from .base_node import BaseNode
class DescriptionNode(BaseNode):
    """
    Skeleton node: the constructor stores the configured models and flags,
    while `execute` is not implemented yet and passes the state through.

    NOTE(review): the previous docstring described token compression and
    vector-database storage, which nothing in this class does; the intended
    behavior of this node still has to be defined.

    Attributes:
        llm_model: The language model client taken from `node_config["llm_model"]`
            (None when no configuration is given).
        embedder_model: The embedding model client taken from
            `node_config["embedder_model"]`, or None when absent.
        verbose (bool): A flag indicating whether to show print statements during execution.
        cache_path: Value of `node_config["cache_path"]`, or False when absent.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "RAG".

    Raises:
        KeyError: If `node_config` is a dict that has no "llm_model" entry.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        if node_config is None:
            # The signature allows omitting the configuration; previously this
            # path crashed with TypeError on the first subscript. Fall back to
            # the same defaults the individual lookups use.
            self.llm_model = None
            node_config = {}
        else:
            # A supplied configuration must still provide the language model.
            self.llm_model = node_config["llm_model"]

        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)
        self.cache_path = node_config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
        """
        Placeholder: no processing is implemented yet.

        Args:
            state (dict): The current graph state.

        Returns:
            dict: The same state object, unchanged, so the declared return
            type holds instead of implicitly returning None.
        """
        return state

View File

@ -0,0 +1,42 @@
"""
FetchNodelevelK Module
"""
from typing import List, Optional
from .base_node import BaseNode
class FetchNodelevelK(BaseNode):
    """
    Skeleton node: the constructor stores the configured models and flags,
    while `execute` is not implemented yet and passes the state through.

    NOTE(review): the previous docstring described token compression and
    vector-database storage, which nothing in this class does; presumably
    this node is meant to fetch pages up to some depth k — confirm and
    document once the logic exists.

    Attributes:
        llm_model: The language model client taken from `node_config["llm_model"]`
            (None when no configuration is given).
        embedder_model: The embedding model client taken from
            `node_config["embedder_model"]`, or None when absent.
        verbose (bool): A flag indicating whether to show print statements during execution.
        cache_path: Value of `node_config["cache_path"]`, or False when absent.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "RAG".

    Raises:
        KeyError: If `node_config` is a dict that has no "llm_model" entry.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        if node_config is None:
            # The signature allows omitting the configuration; previously this
            # path crashed with TypeError on the first subscript. Fall back to
            # the same defaults the individual lookups use.
            self.llm_model = None
            node_config = {}
        else:
            # A supplied configuration must still provide the language model.
            self.llm_model = node_config["llm_model"]

        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)
        self.cache_path = node_config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
        """
        Placeholder: no processing is implemented yet.

        Args:
            state (dict): The current graph state.

        Returns:
            dict: The same state object, unchanged, so the declared return
            type holds instead of implicitly returning None.
        """
        return state

View File

@ -0,0 +1,50 @@
"""
GenerateAnswerNodeKLevel Module
"""
from typing import List, Optional
from .base_node import BaseNode
class GenerateAnswerNodeKLevel(BaseNode):
    """
    A node that queries the vector store client found in the state and
    stores the query result in the state under "answer".

    Attributes:
        llm_model: The language model client taken from `node_config["llm_model"]`
            (None when no configuration is given).
        embedder_model: The embedding model client taken from
            `node_config["embedder_model"]`, or None when absent.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "GANLK".

    Raises:
        KeyError: If `node_config` is a dict that has no "llm_model" entry.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "GANLK",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        if node_config is None:
            # The signature allows omitting the configuration; previously this
            # path crashed with TypeError on the first subscript. Fall back to
            # the same defaults the individual lookups use.
            self.llm_model = None
            node_config = {}
        else:
            # A supplied configuration must still provide the language model.
            self.llm_model = node_config["llm_model"]

        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """
        Query the vector store and record the result in the state.

        Args:
            state (dict): The current graph state; must contain a
                "vectorial_db" entry exposing
                `query(collection_name=..., query_text=...)`.

        Returns:
            dict: The same state object with the "answer" key set to the
            value returned by the query.

        Raises:
            KeyError: If "vectorial_db" is missing from the state.
        """
        client = state["vectorial_db"]

        # NOTE(review): collection name and query text are hard-coded
        # placeholders; presumably the query should come from the user
        # prompt in the state — confirm before relying on the answer.
        answer = client.query(
            collection_name="demo_collection",
            query_text="This is a query document"
        )

        state["answer"] = answer
        return state

View File

@ -26,7 +26,6 @@ from ..utils import (transform_schema,
from .base_node import BaseNode
from jsonschema import validate, ValidationError
class GenerateCodeNode(BaseNode):
"""
A node that generates Python code for a function that extracts data
@ -96,7 +95,7 @@ class GenerateCodeNode(BaseNode):
Raises:
KeyError: If the input keys are not found in the state, indicating
that the necessary information for generating an answer is missing.
RuntimeError: If the maximum number of iterations is
RuntimeError: If the maximum number of iterations is
reached without obtaining the desired code.
"""
@ -170,7 +169,7 @@ class GenerateCodeNode(BaseNode):
self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
state = self.semantic_comparison_loop(state)
if state["errors"]["semantic"]:
continue
continue
break
if state["iteration"] == self.max_iterations["overall"] and \
@ -195,9 +194,9 @@ class GenerateCodeNode(BaseNode):
state["errors"]["syntax"] = [syntax_message]
self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
analysis = syntax_focused_analysis(state, self.llm_model)
self.logger.info(f"""--- (Regenerating Code
self.logger.info(f"""--- (Regenerating Code
to fix the Error) ---""")
state["generated_code"] = syntax_focused_code_generation(state,
state["generated_code"] = syntax_focused_code_generation(state,
analysis, self.llm_model)
state["generated_code"] = extract_code(state["generated_code"])
return state
@ -217,14 +216,14 @@ class GenerateCodeNode(BaseNode):
self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
analysis = execution_focused_analysis(state, self.llm_model)
self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
state["generated_code"] = execution_focused_code_generation(state,
state["generated_code"] = execution_focused_code_generation(state,
analysis, self.llm_model)
state["generated_code"] = extract_code(state["generated_code"])
return state
def validation_reasoning_loop(self, state: dict) -> dict:
for _ in range(self.max_iterations["validation"]):
validation, errors = self.validate_dict(state["execution_result"],
validation, errors = self.validate_dict(state["execution_result"],
self.output_schema.schema())
if validation:
state["errors"]["validation"] = []
@ -240,7 +239,7 @@ class GenerateCodeNode(BaseNode):
def semantic_comparison_loop(self, state: dict) -> dict:
for _ in range(self.max_iterations["semantic"]):
comparison_result = self.semantic_comparison(state["execution_result"],
comparison_result = self.semantic_comparison(state["execution_result"],
state["reference_answer"])
if comparison_result["are_semantically_equivalent"]:
state["errors"]["semantic"] = []
@ -342,7 +341,7 @@ class GenerateCodeNode(BaseNode):
if not extract_data:
raise NameError("Function 'extract_data' not found in the generated code.")
result = extract_data(self.raw_html)
result = extract_data(self.raw_html)
return True, result
except Exception as e:
return False, f"Error during execution: {str(e)}"
@ -357,5 +356,5 @@ class GenerateCodeNode(BaseNode):
validate(instance=data, schema=schema)
return True, None
except ValidationError as e:
errors = e.errors()
errors = [e.message]
return False, errors

View File

@ -1,29 +1,9 @@
"""
RAGNode Module
"""
import os
import sys
from typing import List, Optional
from langchain.docstore.document import Document
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import (
DocumentCompressorPipeline,
EmbeddingsFilter,
)
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_community.vectorstores import FAISS
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_aws import BedrockEmbeddings, ChatBedrock
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI
from ..utils.logging import get_logger
from .base_node import BaseNode
from ..helpers import models_tokens
from ..models import DeepSeek
optional_modules = {"langchain_anthropic", "langchain_fireworks",
"langchain_groq", "langchain_google_vertexai"}
from qdrant_client import QdrantClient
class RAGNode(BaseNode):
"""
@ -34,7 +14,6 @@ class RAGNode(BaseNode):
Attributes:
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
Args:
@ -58,125 +37,31 @@ class RAGNode(BaseNode):
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
self.cache_path = node_config.get("cache_path", False)
def execute(self, state: dict) -> dict:
# Execution logic
pass
def _create_default_embedder(self, llm_config=None) -> object:
"""
Create an embedding model instance based on the chosen llm model.
Returns:
object: An instance of the embedding model client.
Raises:
ValueError: If the model is not supported.
"""
if isinstance(self.llm_model, ChatGoogleGenerativeAI):
return GoogleGenerativeAIEmbeddings(
google_api_key=llm_config["api_key"], model="models/embedding-001"
)
if isinstance(self.llm_model, ChatOpenAI):
return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key,
base_url=self.llm_model.openai_api_base)
elif isinstance(self.llm_model, DeepSeek):
return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
return self.llm_model
elif isinstance(self.llm_model, AzureChatOpenAI):
return AzureOpenAIEmbeddings()
elif isinstance(self.llm_model, ChatOllama):
params = self.llm_model._lc_kwargs
params.pop("streaming", None)
params.pop("temperature", None)
return OllamaEmbeddings(**params)
elif isinstance(self.llm_model, ChatBedrock):
return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id)
elif all(key in sys.modules for key in optional_modules):
if isinstance(self.llm_model, ChatFireworks):
from langchain_fireworks import FireworksEmbeddings
return FireworksEmbeddings(model=self.llm_model.model_name)
if isinstance(self.llm_model, ChatNVIDIA):
from langchain_nvidia import NVIDIAEmbeddings
return NVIDIAEmbeddings(model=self.llm_model.model_name)
if isinstance(self.llm_model, ChatHuggingFace):
from langchain_huggingface import HuggingFaceEmbeddings
return HuggingFaceEmbeddings(model=self.llm_model.model)
if isinstance(self.llm_model, ChatVertexAI):
from langchain_vertexai import VertexAIEmbeddings
return VertexAIEmbeddings()
if self.node_config.get("client_type") == "memory":
client = QdrantClient(":memory:")
elif self.node_config.get("client_type") == "local_db":
client = QdrantClient(path="path/to/db")
elif self.node_config.get("client_type") == "image":
client = QdrantClient(url="http://localhost:6333")
else:
raise ValueError("Embedding Model missing or not supported")
raise ValueError("client_type provided not correct")
def _create_embedder(self, embedder_config: dict) -> object:
"""
Create an embedding model instance based on the configuration provided.
docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"]
metadata = [
{"source": "Langchain-docs"},
{"source": "Linkedin-docs"},
]
ids = [42, 2]
Args:
embedder_config (dict): Configuration parameters for the embedding model.
client.add(
collection_name="demo_collection",
documents=docs,
metadata=metadata,
ids=ids
)
Returns:
object: An instance of the embedding model client.
Raises:
KeyError: If the model is not supported.
"""
embedder_params = {**embedder_config}
if "model_instance" in embedder_config:
return embedder_params["model_instance"]
if "openai" in embedder_params["model"]:
return OpenAIEmbeddings(api_key=embedder_params["api_key"])
if "azure" in embedder_params["model"]:
return AzureOpenAIEmbeddings()
if "ollama" in embedder_params["model"]:
embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
try:
models_tokens["ollama"][embedder_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
return OllamaEmbeddings(**embedder_params)
if "gemini" in embedder_params["model"]:
try:
models_tokens["gemini"][embedder_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
return GoogleGenerativeAIEmbeddings(model=embedder_params["model"])
if "bedrock" in embedder_params["model"]:
embedder_params["model"] = embedder_params["model"].split("/")[-1]
client = embedder_params.get("client", None)
try:
models_tokens["bedrock"][embedder_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
return BedrockEmbeddings(client=client, model_id=embedder_params["model"])
if all(key in sys.modules for key in optional_modules):
if "hugging_face" in embedder_params["model"]:
from langchain_huggingface import HuggingFaceEmbeddings
embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
try:
models_tokens["hugging_face"][embedder_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
return HuggingFaceEmbeddings(model=embedder_params["model"])
elif "fireworks" in embedder_params["model"]:
from langchain_fireworks import FireworksEmbeddings
embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
try:
models_tokens["fireworks"][embedder_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
return FireworksEmbeddings(model=embedder_params["model"])
elif "nvidia" in embedder_params["model"]:
from langchain_nvidia import NVIDIAEmbeddings
embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
try:
models_tokens["nvidia"][embedder_params["model"]]
except KeyError as exc:
raise KeyError("Model not supported") from exc
return NVIDIAEmbeddings(model=embedder_params["model"],
nvidia_api_key=embedder_params["api_key"])
raise ValueError("Model provided by the configuration not supported")
state["vectorial_db"] = client
return state