fix(chunking): count tokens from words instead of characters

closes #513
This commit is contained in:
Federico Aguzzi 2024-08-08 10:47:08 +02:00
parent 7f1f7503f7
commit 5ec2de9e1a

View File

@@ -74,22 +74,22 @@ class ParseNode(BaseNode):
 docs_transformed = docs_transformed[0]
 chunks = chunk(text=docs_transformed.page_content,
-chunk_size= self.node_config.get("chunk_size", 4096)-250,
-token_counter= lambda x: len(x),
+chunk_size=self.node_config.get("chunk_size", 4096)-250,
+token_counter=lambda text: len(text.split()),
 memoize=False)
 else:
 docs_transformed = docs_transformed[0]
 if isinstance(docs_transformed, Document):
 chunks = chunk(text=docs_transformed.page_content,
-chunk_size= self.node_config.get("chunk_size", 4096)-250,
-token_counter= lambda x: len(x),
+chunk_size=self.node_config.get("chunk_size", 4096)-250,
+token_counter=lambda text: len(text.split()),
 memoize=False)
 else:
 chunks = chunk(text=docs_transformed,
-chunk_size= self.node_config.get("chunk_size", 4096)-250,
-token_counter= lambda x: len(x),
+chunk_size=self.node_config.get("chunk_size", 4096)-250,
+token_counter=lambda text: len(text.split()),
 memoize=False)
 state.update({self.output[0]: chunks})