Mirror of https://github.com/VinciGit00/Scrapegraph-ai.git (synced 2026-07-01 21:00:48 +08:00)
fix(ParseNode): leave room for LLM reply in context window
This commit is contained in:
Parent: ebdb74967d
Commit: 683bf57d89
@ -14,12 +14,12 @@ authors = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"langchain==0.2.14",
|
"langchain>=0.2.14",
|
||||||
"langchain-fireworks>=0.1.3",
|
"langchain-fireworks>=0.1.3",
|
||||||
"langchain_community>=0.2.9",
|
"langchain_community>=0.2.9",
|
||||||
"langchain-google-genai>=1.0.7",
|
"langchain-google-genai>=1.0.7",
|
||||||
"langchain-google-vertexai>=1.0.7",
|
"langchain-google-vertexai>=1.0.7",
|
||||||
"langchain-openai==0.1.22",
|
"langchain-openai>=0.1.22",
|
||||||
"langchain-groq>=0.1.3",
|
"langchain-groq>=0.1.3",
|
||||||
"langchain-aws>=0.1.3",
|
"langchain-aws>=0.1.3",
|
||||||
"langchain-anthropic>=0.1.11",
|
"langchain-anthropic>=0.1.11",
|
||||||
|
|||||||
@ -179,6 +179,7 @@ graphviz==0.20.3
|
|||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
greenlet==3.0.3
|
greenlet==3.0.3
|
||||||
# via playwright
|
# via playwright
|
||||||
|
# via sqlalchemy
|
||||||
groq==0.9.0
|
groq==0.9.0
|
||||||
# via langchain-groq
|
# via langchain-groq
|
||||||
grpc-google-iam-v1==0.13.1
|
grpc-google-iam-v1==0.13.1
|
||||||
|
|||||||
@ -133,6 +133,7 @@ graphviz==0.20.3
|
|||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
greenlet==3.0.3
|
greenlet==3.0.3
|
||||||
# via playwright
|
# via playwright
|
||||||
|
# via sqlalchemy
|
||||||
groq==0.9.0
|
groq==0.9.0
|
||||||
# via langchain-groq
|
# via langchain-groq
|
||||||
grpc-google-iam-v1==0.13.1
|
grpc-google-iam-v1==0.13.1
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
langchain>=0.2.10
|
langchain>=0.2.14
|
||||||
langchain-fireworks>=0.1.3
|
langchain-fireworks>=0.1.3
|
||||||
langchain_community>=0.2.9
|
langchain_community>=0.2.9
|
||||||
langchain-google-genai>=1.0.7
|
langchain-google-genai>=1.0.7
|
||||||
langchain-google-vertexai>=1.0.7
|
langchain-google-vertexai>=1.0.7
|
||||||
langchain-openai>=0.1.17
|
langchain-openai>=0.1.22
|
||||||
langchain-groq>=0.1.3
|
langchain-groq>=0.1.3
|
||||||
langchain-aws>=0.1.3
|
langchain-aws>=0.1.3
|
||||||
langchain-anthropic>=0.1.11
|
langchain-anthropic>=0.1.11
|
||||||
|
|||||||
@ -6,7 +6,6 @@ from typing import List, Optional
|
|||||||
from semchunk import chunk
|
from semchunk import chunk
|
||||||
from langchain_community.document_transformers import Html2TextTransformer
|
from langchain_community.document_transformers import Html2TextTransformer
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from ..utils.logging import get_logger
|
|
||||||
from .base_node import BaseNode
|
from .base_node import BaseNode
|
||||||
|
|
||||||
class ParseNode(BaseNode):
|
class ParseNode(BaseNode):
|
||||||
@ -79,16 +78,18 @@ class ParseNode(BaseNode):
|
|||||||
else:
|
else:
|
||||||
docs_transformed = docs_transformed[0]
|
docs_transformed = docs_transformed[0]
|
||||||
|
|
||||||
if isinstance(docs_transformed, Document):
|
# Adapt the chunk size, leaving room for the reply, the prompt and the schema
|
||||||
|
chunk_size = self.node_config.get("chunk_size", 4096)
|
||||||
|
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
|
||||||
|
|
||||||
|
if isinstance(docs_transformed, Document):
|
||||||
chunks = chunk(text=docs_transformed.page_content,
|
chunks = chunk(text=docs_transformed.page_content,
|
||||||
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
chunk_size=chunk_size,
|
||||||
token_counter=lambda text: len(text.split()),
|
token_counter=lambda text: len(text.split()),
|
||||||
memoize=False)
|
memoize=False)
|
||||||
else:
|
else:
|
||||||
|
|
||||||
chunks = chunk(text=docs_transformed,
|
chunks = chunk(text=docs_transformed,
|
||||||
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
chunk_size=chunk_size,
|
||||||
token_counter=lambda text: len(text.split()),
|
token_counter=lambda text: len(text.split()),
|
||||||
memoize=False)
|
memoize=False)
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Module for truncatinh in chunks the messages
|
Module for truncating in chunks the messages
|
||||||
"""
|
"""
|
||||||
from typing import List
|
from typing import List
|
||||||
import tiktoken
|
import tiktoken
|
||||||
@ -27,7 +27,7 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
encoding = tiktoken.get_encoding(encoding_name)
|
encoding = tiktoken.get_encoding(encoding_name)
|
||||||
max_tokens = models_tokens[model] - 500
|
max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9))
|
||||||
encoded_text = encoding.encode(text)
|
encoded_text = encoding.encode(text)
|
||||||
|
|
||||||
chunks = [encoded_text[i:i + max_tokens]
|
chunks = [encoded_text[i:i + max_tokens]
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user