From 5ec2de9e1a14def5596738b6cdf769f5039a246d Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Thu, 8 Aug 2024 10:47:08 +0200 Subject: [PATCH] fix(chunking): count tokens from words instead of characters closes #513 --- scrapegraphai/nodes/parse_node.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index d1bb87bd..59471de1 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -74,22 +74,22 @@ class ParseNode(BaseNode): docs_transformed = docs_transformed[0] chunks = chunk(text=docs_transformed.page_content, - chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter= lambda x: len(x), + chunk_size=self.node_config.get("chunk_size", 4096)-250, + token_counter=lambda text: len(text.split()), memoize=False) else: docs_transformed = docs_transformed[0] if isinstance(docs_transformed, Document): chunks = chunk(text=docs_transformed.page_content, - chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter= lambda x: len(x), + chunk_size=self.node_config.get("chunk_size", 4096)-250, + token_counter=lambda text: len(text.split()), memoize=False) else: chunks = chunk(text=docs_transformed, - chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter= lambda x: len(x), + chunk_size=self.node_config.get("chunk_size", 4096)-250, + token_counter=lambda text: len(text.split()), memoize=False) state.update({self.output[0]: chunks})