mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
fix(ParseNode): leave room for LLM reply in context window
This commit is contained in:
parent
ebdb74967d
commit
683bf57d89
@ -14,12 +14,12 @@ authors = [
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
"langchain==0.2.14",
|
||||
"langchain>=0.2.14",
|
||||
"langchain-fireworks>=0.1.3",
|
||||
"langchain_community>=0.2.9",
|
||||
"langchain-google-genai>=1.0.7",
|
||||
"langchain-google-vertexai>=1.0.7",
|
||||
"langchain-openai==0.1.22",
|
||||
"langchain-openai>=0.1.22",
|
||||
"langchain-groq>=0.1.3",
|
||||
"langchain-aws>=0.1.3",
|
||||
"langchain-anthropic>=0.1.11",
|
||||
|
||||
@ -179,6 +179,7 @@ graphviz==0.20.3
|
||||
# via scrapegraphai
|
||||
greenlet==3.0.3
|
||||
# via playwright
|
||||
# via sqlalchemy
|
||||
groq==0.9.0
|
||||
# via langchain-groq
|
||||
grpc-google-iam-v1==0.13.1
|
||||
|
||||
@ -133,6 +133,7 @@ graphviz==0.20.3
|
||||
# via scrapegraphai
|
||||
greenlet==3.0.3
|
||||
# via playwright
|
||||
# via sqlalchemy
|
||||
groq==0.9.0
|
||||
# via langchain-groq
|
||||
grpc-google-iam-v1==0.13.1
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
langchain>=0.2.10
|
||||
langchain>=0.2.14
|
||||
langchain-fireworks>=0.1.3
|
||||
langchain_community>=0.2.9
|
||||
langchain-google-genai>=1.0.7
|
||||
langchain-google-vertexai>=1.0.7
|
||||
langchain-openai>=0.1.17
|
||||
langchain-openai>=0.1.22
|
||||
langchain-groq>=0.1.3
|
||||
langchain-aws>=0.1.3
|
||||
langchain-anthropic>=0.1.11
|
||||
|
||||
@ -6,7 +6,6 @@ from typing import List, Optional
|
||||
from semchunk import chunk
|
||||
from langchain_community.document_transformers import Html2TextTransformer
|
||||
from langchain_core.documents import Document
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
|
||||
class ParseNode(BaseNode):
|
||||
@ -79,16 +78,18 @@ class ParseNode(BaseNode):
|
||||
else:
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
# Adapt the chunk size, leaving room for the reply, the prompt and the schema
|
||||
chunk_size = self.node_config.get("chunk_size", 4096)
|
||||
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
|
||||
|
||||
if isinstance(docs_transformed, Document):
|
||||
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||
chunk_size=chunk_size,
|
||||
token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
else:
|
||||
|
||||
chunks = chunk(text=docs_transformed,
|
||||
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||
chunk_size=chunk_size,
|
||||
token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Module for truncatinh in chunks the messages
|
||||
Module for truncating in chunks the messages
|
||||
"""
|
||||
from typing import List
|
||||
import tiktoken
|
||||
@ -27,7 +27,7 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]
|
||||
"""
|
||||
|
||||
encoding = tiktoken.get_encoding(encoding_name)
|
||||
max_tokens = models_tokens[model] - 500
|
||||
max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9))
|
||||
encoded_text = encoding.encode(text)
|
||||
|
||||
chunks = [encoded_text[i:i + max_tokens]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user