mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
fix(chunking): count tokens from words instead of characters
closes #513
This commit is contained in:
parent
7f1f7503f7
commit
5ec2de9e1a
@@ -74,22 +74,22 @@ class ParseNode(BaseNode):
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
- chunk_size= self.node_config.get("chunk_size", 4096)-250,
|
||||
- token_counter= lambda x: len(x),
|
||||
+ chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||
+ token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
else:
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
if isinstance(docs_transformed, Document):
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
- chunk_size= self.node_config.get("chunk_size", 4096)-250,
|
||||
- token_counter= lambda x: len(x),
|
||||
+ chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||
+ token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
else:
|
||||
|
||||
chunks = chunk(text=docs_transformed,
|
||||
- chunk_size= self.node_config.get("chunk_size", 4096)-250,
|
||||
- token_counter= lambda x: len(x),
|
||||
+ chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||
+ token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
|
||||
state.update({self.output[0]: chunks})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user