mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
add empty nodes
This commit is contained in:
parent
d14fb54548
commit
ea27b2499e
19
README.md
19
README.md
@ -38,10 +38,9 @@ Additional dependencies can be added while installing the library:
|
||||
|
||||
- <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
|
||||
|
||||
|
||||
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
|
||||
```bash
|
||||
pip install scrapegraphai[other-language-models]
|
||||
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
|
||||
```bash
|
||||
pip install scrapegraphai[other-language-models]
|
||||
```
|
||||
- <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.
|
||||
|
||||
@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models]
|
||||
pip install scrapegraphai[more-browser-options]
|
||||
```
|
||||
|
||||
- <b>faiss Options</b>: this group includes faiss integration
|
||||
- <b>qdrant Options</b>: this group includes qdrant integration for RAGNode and DeepScraperGraph.
|
||||
|
||||
```bash
|
||||
pip install scrapegraphai[faiss-cpu]
|
||||
pip install scrapegraphai[qdrant]
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
### Installing "More Browser Options"
|
||||
|
||||
This group includes an OCR scraper for websites.
|
||||
```bash
|
||||
pip install scrapegraphai[screenshot_scraper]
|
||||
```
|
||||
|
||||
## 💻 Usage
|
||||
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
|
||||
|
||||
|
||||
@ -100,8 +100,9 @@ screenshot_scraper = [
|
||||
]
|
||||
|
||||
# Group 5: Faiss CPU
|
||||
faiss-cpu = [
|
||||
"faiss-cpu>=1.8.0",
|
||||
qdrant = [
|
||||
"qdrant-client>=1.11.3",
|
||||
"fastembed>=0.3.6"
|
||||
]
|
||||
|
||||
[build-system]
|
||||
|
||||
@ -28,3 +28,6 @@ from .html_analyzer_node import HtmlAnalyzerNode
|
||||
from .generate_code_node import GenerateCodeNode
|
||||
from .search_node_with_context import SearchLinksWithContext
|
||||
from .reasoning_node import ReasoningNode
|
||||
from .fetch_node_level_k import FetchNodelevelK
|
||||
from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
|
||||
from .description_node import DescriptionNode
|
||||
|
||||
42
scrapegraphai/nodes/description_node.py
Normal file
42
scrapegraphai/nodes/description_node.py
Normal file
@ -0,0 +1,42 @@
|
||||
"""
|
||||
DescriptionNode Module
|
||||
"""
|
||||
from typing import List, Optional
|
||||
from .base_node import BaseNode
|
||||
|
||||
class DescriptionNode(BaseNode):
    """
    Scaffold node that stores its model and cache settings from ``node_config``.

    The execution logic is not implemented in this revision: ``execute``
    hands the state back unchanged.
    NOTE(review): the name suggests this node will generate descriptions
    with the language model -- confirm once the logic is added.

    Attributes:
        llm_model: The value of ``node_config["llm_model"]``.
        embedder_model: The value of ``node_config["embedder_model"]``,
            or ``None`` when the key is absent.
        verbose (bool): The value of ``node_config["verbose"]``,
            ``False`` when absent.
        cache_path: The value of ``node_config["cache_path"]``,
            ``False`` when absent.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node; must
            contain the ``"llm_model"`` key.
        node_name (str): The unique identifier name for the node, defaulting to "RAG".

    Raises:
        KeyError: If ``node_config`` is ``None`` or lacks ``"llm_model"``.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # The signature allows node_config=None; normalise it once so every
        # lookup below behaves the same way instead of failing on None with
        # an unrelated TypeError.
        config = {} if node_config is None else node_config

        self.llm_model = config["llm_model"]
        self.embedder_model = config.get("embedder_model", None)
        self.verbose = config.get("verbose", False)
        self.cache_path = config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
        """
        Placeholder: no processing is implemented yet.

        Args:
            state (dict): The current state of the graph.

        Returns:
            dict: The same ``state`` object, unmodified, so the declared
            return type is honoured.
        """
        return state
|
||||
42
scrapegraphai/nodes/fetch_node_level_k.py
Normal file
42
scrapegraphai/nodes/fetch_node_level_k.py
Normal file
@ -0,0 +1,42 @@
|
||||
"""
|
||||
FetchNodelevelK Module
|
||||
"""
|
||||
from typing import List, Optional
|
||||
from .base_node import BaseNode
|
||||
|
||||
class FetchNodelevelK(BaseNode):
    """
    Scaffold node that stores its model and cache settings from ``node_config``.

    The execution logic is not implemented in this revision: ``execute``
    hands the state back unchanged.
    NOTE(review): the name suggests this node will fetch pages up to a
    depth of k links -- confirm once the logic is added.

    Attributes:
        llm_model: The value of ``node_config["llm_model"]``.
        embedder_model: The value of ``node_config["embedder_model"]``,
            or ``None`` when the key is absent.
        verbose (bool): The value of ``node_config["verbose"]``,
            ``False`` when absent.
        cache_path: The value of ``node_config["cache_path"]``,
            ``False`` when absent.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node; must
            contain the ``"llm_model"`` key.
        node_name (str): The unique identifier name for the node, defaulting to "RAG".

    Raises:
        KeyError: If ``node_config`` is ``None`` or lacks ``"llm_model"``.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # The signature allows node_config=None; normalise it once so every
        # lookup below behaves the same way instead of failing on None with
        # an unrelated TypeError.
        config = {} if node_config is None else node_config

        self.llm_model = config["llm_model"]
        self.embedder_model = config.get("embedder_model", None)
        self.verbose = config.get("verbose", False)
        self.cache_path = config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
        """
        Placeholder: no processing is implemented yet.

        Args:
            state (dict): The current state of the graph.

        Returns:
            dict: The same ``state`` object, unmodified, so the declared
            return type is honoured.
        """
        return state
|
||||
50
scrapegraphai/nodes/generate_answer_node_k_level.py
Normal file
50
scrapegraphai/nodes/generate_answer_node_k_level.py
Normal file
@ -0,0 +1,50 @@
|
||||
"""
|
||||
GenerateAnswerNodeKLevel Module
|
||||
"""
|
||||
from typing import List, Optional
|
||||
from .base_node import BaseNode
|
||||
|
||||
class GenerateAnswerNodeKLevel(BaseNode):
    """
    Node that queries the vector database client held in the state and
    stores the query result as the answer.

    Attributes:
        llm_model: The value of ``node_config["llm_model"]``.
        embedder_model: The value of ``node_config["embedder_model"]``,
            or ``None`` when the key is absent.
        verbose (bool): The value of ``node_config["verbose"]``,
            ``False`` when absent.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node; must
            contain the ``"llm_model"`` key.
        node_name (str): The unique identifier name for the node, defaulting to "GANLK".

    Raises:
        KeyError: If ``node_config`` is ``None`` or lacks ``"llm_model"``.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "GANLK",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # The signature allows node_config=None; normalise it once so every
        # lookup below behaves the same way instead of failing on None with
        # an unrelated TypeError.
        config = {} if node_config is None else node_config

        self.llm_model = config["llm_model"]
        self.embedder_model = config.get("embedder_model", None)
        self.verbose = config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """
        Query the client stored under ``state["vectorial_db"]`` and write
        the result to ``state["answer"]``.

        Args:
            state (dict): The current state of the graph; must contain the
                ``"vectorial_db"`` key.

        Returns:
            dict: The same ``state`` object with ``"answer"`` set.

        Raises:
            KeyError: If ``"vectorial_db"`` is not present in the state.
        """
        client = state["vectorial_db"]

        # NOTE(review): the collection name and query text are hard-coded
        # demo values; presumably they should come from the state or the
        # node configuration -- confirm before relying on this node.
        answer = client.query(
            collection_name="demo_collection",
            query_text="This is a query document",
        )

        state["answer"] = answer

        return state
|
||||
@ -26,7 +26,6 @@ from ..utils import (transform_schema,
|
||||
from .base_node import BaseNode
|
||||
from jsonschema import validate, ValidationError
|
||||
|
||||
|
||||
class GenerateCodeNode(BaseNode):
|
||||
"""
|
||||
A node that generates Python code for a function that extracts data
|
||||
@ -96,7 +95,7 @@ class GenerateCodeNode(BaseNode):
|
||||
Raises:
|
||||
KeyError: If the input keys are not found in the state, indicating
|
||||
that the necessary information for generating an answer is missing.
|
||||
RuntimeError: If the maximum number of iterations is
|
||||
RuntimeError: If the maximum number of iterations is
|
||||
reached without obtaining the desired code.
|
||||
"""
|
||||
|
||||
@ -170,7 +169,7 @@ class GenerateCodeNode(BaseNode):
|
||||
self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
|
||||
state = self.semantic_comparison_loop(state)
|
||||
if state["errors"]["semantic"]:
|
||||
continue
|
||||
continue
|
||||
break
|
||||
|
||||
if state["iteration"] == self.max_iterations["overall"] and \
|
||||
@ -195,9 +194,9 @@ class GenerateCodeNode(BaseNode):
|
||||
state["errors"]["syntax"] = [syntax_message]
|
||||
self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
|
||||
analysis = syntax_focused_analysis(state, self.llm_model)
|
||||
self.logger.info(f"""--- (Regenerating Code
|
||||
self.logger.info(f"""--- (Regenerating Code
|
||||
to fix the Error) ---""")
|
||||
state["generated_code"] = syntax_focused_code_generation(state,
|
||||
state["generated_code"] = syntax_focused_code_generation(state,
|
||||
analysis, self.llm_model)
|
||||
state["generated_code"] = extract_code(state["generated_code"])
|
||||
return state
|
||||
@ -217,14 +216,14 @@ class GenerateCodeNode(BaseNode):
|
||||
self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
|
||||
analysis = execution_focused_analysis(state, self.llm_model)
|
||||
self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
|
||||
state["generated_code"] = execution_focused_code_generation(state,
|
||||
state["generated_code"] = execution_focused_code_generation(state,
|
||||
analysis, self.llm_model)
|
||||
state["generated_code"] = extract_code(state["generated_code"])
|
||||
return state
|
||||
|
||||
def validation_reasoning_loop(self, state: dict) -> dict:
|
||||
for _ in range(self.max_iterations["validation"]):
|
||||
validation, errors = self.validate_dict(state["execution_result"],
|
||||
validation, errors = self.validate_dict(state["execution_result"],
|
||||
self.output_schema.schema())
|
||||
if validation:
|
||||
state["errors"]["validation"] = []
|
||||
@ -240,7 +239,7 @@ class GenerateCodeNode(BaseNode):
|
||||
|
||||
def semantic_comparison_loop(self, state: dict) -> dict:
|
||||
for _ in range(self.max_iterations["semantic"]):
|
||||
comparison_result = self.semantic_comparison(state["execution_result"],
|
||||
comparison_result = self.semantic_comparison(state["execution_result"],
|
||||
state["reference_answer"])
|
||||
if comparison_result["are_semantically_equivalent"]:
|
||||
state["errors"]["semantic"] = []
|
||||
@ -342,7 +341,7 @@ class GenerateCodeNode(BaseNode):
|
||||
if not extract_data:
|
||||
raise NameError("Function 'extract_data' not found in the generated code.")
|
||||
|
||||
result = extract_data(self.raw_html)
|
||||
result = extract_data(self.raw_html)
|
||||
return True, result
|
||||
except Exception as e:
|
||||
return False, f"Error during execution: {str(e)}"
|
||||
@ -357,5 +356,5 @@ class GenerateCodeNode(BaseNode):
|
||||
validate(instance=data, schema=schema)
|
||||
return True, None
|
||||
except ValidationError as e:
|
||||
errors = e.errors()
|
||||
errors = [e.message]
|
||||
return False, errors
|
||||
|
||||
@ -1,29 +1,9 @@
|
||||
"""
|
||||
RAGNode Module
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
from typing import List, Optional
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.retrievers import ContextualCompressionRetriever
|
||||
from langchain.retrievers.document_compressors import (
|
||||
DocumentCompressorPipeline,
|
||||
EmbeddingsFilter,
|
||||
)
|
||||
from langchain_community.document_transformers import EmbeddingsRedundantFilter
|
||||
from langchain_community.vectorstores import FAISS
|
||||
from langchain_community.chat_models import ChatOllama
|
||||
from langchain_community.embeddings import OllamaEmbeddings
|
||||
from langchain_aws import BedrockEmbeddings, ChatBedrock
|
||||
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
|
||||
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
from ..helpers import models_tokens
|
||||
from ..models import DeepSeek
|
||||
|
||||
optional_modules = {"langchain_anthropic", "langchain_fireworks",
|
||||
"langchain_groq", "langchain_google_vertexai"}
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
class RAGNode(BaseNode):
|
||||
"""
|
||||
@ -34,7 +14,6 @@ class RAGNode(BaseNode):
|
||||
|
||||
Attributes:
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client, configured for generating embeddings.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
|
||||
Args:
|
||||
@ -58,125 +37,31 @@ class RAGNode(BaseNode):
|
||||
self.verbose = (
|
||||
False if node_config is None else node_config.get("verbose", False)
|
||||
)
|
||||
self.cache_path = node_config.get("cache_path", False)
|
||||
|
||||
def execute(self, state: dict) -> dict:
|
||||
# Execution logic
|
||||
pass
|
||||
|
||||
def _create_default_embedder(self, llm_config=None) -> object:
|
||||
"""
|
||||
Create an embedding model instance based on the chosen llm model.
|
||||
|
||||
Returns:
|
||||
object: An instance of the embedding model client.
|
||||
|
||||
Raises:
|
||||
ValueError: If the model is not supported.
|
||||
"""
|
||||
|
||||
if isinstance(self.llm_model, ChatGoogleGenerativeAI):
|
||||
return GoogleGenerativeAIEmbeddings(
|
||||
google_api_key=llm_config["api_key"], model="models/embedding-001"
|
||||
)
|
||||
if isinstance(self.llm_model, ChatOpenAI):
|
||||
return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key,
|
||||
base_url=self.llm_model.openai_api_base)
|
||||
elif isinstance(self.llm_model, DeepSeek):
|
||||
return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
|
||||
elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
|
||||
return self.llm_model
|
||||
elif isinstance(self.llm_model, AzureChatOpenAI):
|
||||
return AzureOpenAIEmbeddings()
|
||||
elif isinstance(self.llm_model, ChatOllama):
|
||||
params = self.llm_model._lc_kwargs
|
||||
params.pop("streaming", None)
|
||||
params.pop("temperature", None)
|
||||
return OllamaEmbeddings(**params)
|
||||
elif isinstance(self.llm_model, ChatBedrock):
|
||||
return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id)
|
||||
elif all(key in sys.modules for key in optional_modules):
|
||||
if isinstance(self.llm_model, ChatFireworks):
|
||||
from langchain_fireworks import FireworksEmbeddings
|
||||
return FireworksEmbeddings(model=self.llm_model.model_name)
|
||||
if isinstance(self.llm_model, ChatNVIDIA):
|
||||
from langchain_nvidia import NVIDIAEmbeddings
|
||||
return NVIDIAEmbeddings(model=self.llm_model.model_name)
|
||||
if isinstance(self.llm_model, ChatHuggingFace):
|
||||
from langchain_huggingface import HuggingFaceEmbeddings
|
||||
return HuggingFaceEmbeddings(model=self.llm_model.model)
|
||||
if isinstance(self.llm_model, ChatVertexAI):
|
||||
from langchain_vertexai import VertexAIEmbeddings
|
||||
return VertexAIEmbeddings()
|
||||
if self.node_config.get("client_type") == "memory":
|
||||
client = QdrantClient(":memory:")
|
||||
elif self.node_config.get("client_type") == "local_db":
|
||||
client = QdrantClient(path="path/to/db")
|
||||
elif self.node_config.get("client_type") == "image":
|
||||
client = QdrantClient(url="http://localhost:6333")
|
||||
else:
|
||||
raise ValueError("Embedding Model missing or not supported")
|
||||
raise ValueError("client_type provided not correct")
|
||||
|
||||
def _create_embedder(self, embedder_config: dict) -> object:
|
||||
"""
|
||||
Create an embedding model instance based on the configuration provided.
|
||||
docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"]
|
||||
metadata = [
|
||||
{"source": "Langchain-docs"},
|
||||
{"source": "Linkedin-docs"},
|
||||
]
|
||||
ids = [42, 2]
|
||||
|
||||
Args:
|
||||
embedder_config (dict): Configuration parameters for the embedding model.
|
||||
client.add(
|
||||
collection_name="demo_collection",
|
||||
documents=docs,
|
||||
metadata=metadata,
|
||||
ids=ids
|
||||
)
|
||||
|
||||
Returns:
|
||||
object: An instance of the embedding model client.
|
||||
|
||||
Raises:
|
||||
KeyError: If the model is not supported.
|
||||
"""
|
||||
embedder_params = {**embedder_config}
|
||||
if "model_instance" in embedder_config:
|
||||
return embedder_params["model_instance"]
|
||||
if "openai" in embedder_params["model"]:
|
||||
return OpenAIEmbeddings(api_key=embedder_params["api_key"])
|
||||
if "azure" in embedder_params["model"]:
|
||||
return AzureOpenAIEmbeddings()
|
||||
if "ollama" in embedder_params["model"]:
|
||||
embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
|
||||
try:
|
||||
models_tokens["ollama"][embedder_params["model"]]
|
||||
except KeyError as exc:
|
||||
raise KeyError("Model not supported") from exc
|
||||
return OllamaEmbeddings(**embedder_params)
|
||||
if "gemini" in embedder_params["model"]:
|
||||
try:
|
||||
models_tokens["gemini"][embedder_params["model"]]
|
||||
except KeyError as exc:
|
||||
raise KeyError("Model not supported") from exc
|
||||
return GoogleGenerativeAIEmbeddings(model=embedder_params["model"])
|
||||
if "bedrock" in embedder_params["model"]:
|
||||
embedder_params["model"] = embedder_params["model"].split("/")[-1]
|
||||
client = embedder_params.get("client", None)
|
||||
try:
|
||||
models_tokens["bedrock"][embedder_params["model"]]
|
||||
except KeyError as exc:
|
||||
raise KeyError("Model not supported") from exc
|
||||
return BedrockEmbeddings(client=client, model_id=embedder_params["model"])
|
||||
if all(key in sys.modules for key in optional_modules):
|
||||
if "hugging_face" in embedder_params["model"]:
|
||||
from langchain_huggingface import HuggingFaceEmbeddings
|
||||
embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
|
||||
try:
|
||||
models_tokens["hugging_face"][embedder_params["model"]]
|
||||
except KeyError as exc:
|
||||
raise KeyError("Model not supported") from exc
|
||||
return HuggingFaceEmbeddings(model=embedder_params["model"])
|
||||
elif "fireworks" in embedder_params["model"]:
|
||||
from langchain_fireworks import FireworksEmbeddings
|
||||
embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
|
||||
try:
|
||||
models_tokens["fireworks"][embedder_params["model"]]
|
||||
except KeyError as exc:
|
||||
raise KeyError("Model not supported") from exc
|
||||
return FireworksEmbeddings(model=embedder_params["model"])
|
||||
elif "nvidia" in embedder_params["model"]:
|
||||
from langchain_nvidia import NVIDIAEmbeddings
|
||||
embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:])
|
||||
try:
|
||||
models_tokens["nvidia"][embedder_params["model"]]
|
||||
except KeyError as exc:
|
||||
raise KeyError("Model not supported") from exc
|
||||
return NVIDIAEmbeddings(model=embedder_params["model"],
|
||||
nvidia_api_key=embedder_params["api_key"])
|
||||
|
||||
raise ValueError("Model provided by the configuration not supported")
|
||||
state["vectorial_db"] = client
|
||||
return state
|
||||
|
||||
Loading…
Reference in New Issue
Block a user