diff --git a/pyproject.toml b/pyproject.toml index cb3e6be2..53972f17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,12 +14,12 @@ authors = [ ] dependencies = [ - "langchain==0.2.14", + "langchain>=0.2.14", "langchain-fireworks>=0.1.3", "langchain_community>=0.2.9", "langchain-google-genai>=1.0.7", "langchain-google-vertexai>=1.0.7", - "langchain-openai==0.1.22", + "langchain-openai>=0.1.22", "langchain-groq>=0.1.3", "langchain-aws>=0.1.3", "langchain-anthropic>=0.1.11", diff --git a/requirements-dev.lock b/requirements-dev.lock index 44cc64de..64af8ee8 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -179,6 +179,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.lock b/requirements.lock index 1812ab21..1d80e1bf 100644 --- a/requirements.lock +++ b/requirements.lock @@ -133,6 +133,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.txt b/requirements.txt index 754eab61..21c2fd3b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -langchain>=0.2.10 +langchain>=0.2.14 langchain-fireworks>=0.1.3 langchain_community>=0.2.9 langchain-google-genai>=1.0.7 langchain-google-vertexai>=1.0.7 -langchain-openai>=0.1.17 +langchain-openai>=0.1.22 langchain-groq>=0.1.3 langchain-aws>=0.1.3 langchain-anthropic>=0.1.11 diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index db7f8518..8c536bad 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -6,7 +6,6 @@ from typing import List, Optional from semchunk import chunk from langchain_community.document_transformers import Html2TextTransformer from langchain_core.documents import Document -from ..utils.logging import get_logger from .base_node import BaseNode class ParseNode(BaseNode): @@ -79,16 +78,18 @@ class ParseNode(BaseNode): else: docs_transformed = docs_transformed[0] + # Adapt the chunk size, leaving room for the reply, the prompt and the schema + chunk_size = self.node_config.get("chunk_size", 4096) + chunk_size = min(chunk_size - 500, int(chunk_size * 0.9)) + if isinstance(docs_transformed, Document): - chunks = chunk(text=docs_transformed.page_content, - chunk_size=self.node_config.get("chunk_size", 4096)-250, + chunk_size=chunk_size, token_counter=lambda text: len(text.split()), memoize=False) else: - chunks = chunk(text=docs_transformed, - chunk_size=self.node_config.get("chunk_size", 4096)-250, + chunk_size=chunk_size, token_counter=lambda text: len(text.split()), memoize=False) diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py index c5263efe..c5e5fbbb 100644 --- a/scrapegraphai/utils/token_calculator.py +++ b/scrapegraphai/utils/token_calculator.py @@ -1,5 +1,5 @@ """ -Module for truncatinh in chunks the messages +Module for truncating in chunks the messages """ from typing import List import tiktoken @@ -27,7 +27,7 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str] """ encoding = tiktoken.get_encoding(encoding_name) - max_tokens = models_tokens[model] - 500 + max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9)) encoded_text = encoding.encode(text) chunks = [encoded_text[i:i + max_tokens]