mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
fix(chunking): count tokens from words instead of characters
closes #513
This commit is contained in:
parent
7f1f7503f7
commit
5ec2de9e1a
@@ -74,22 +74,22 @@ class ParseNode(BaseNode):
|
|||||||
docs_transformed = docs_transformed[0]
|
docs_transformed = docs_transformed[0]
|
||||||
|
|
||||||
chunks = chunk(text=docs_transformed.page_content,
|
chunks = chunk(text=docs_transformed.page_content,
|
||||||
chunk_size= self.node_config.get("chunk_size", 4096)-250,
|
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||||
token_counter= lambda x: len(x),
|
token_counter=lambda text: len(text.split()),
|
||||||
memoize=False)
|
memoize=False)
|
||||||
else:
|
else:
|
||||||
docs_transformed = docs_transformed[0]
|
docs_transformed = docs_transformed[0]
|
||||||
|
|
||||||
if isinstance(docs_transformed, Document):
|
if isinstance(docs_transformed, Document):
|
||||||
chunks = chunk(text=docs_transformed.page_content,
|
chunks = chunk(text=docs_transformed.page_content,
|
||||||
chunk_size= self.node_config.get("chunk_size", 4096)-250,
|
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||||
token_counter= lambda x: len(x),
|
token_counter=lambda text: len(text.split()),
|
||||||
memoize=False)
|
memoize=False)
|
||||||
else:
|
else:
|
||||||
|
|
||||||
chunks = chunk(text=docs_transformed,
|
chunks = chunk(text=docs_transformed,
|
||||||
chunk_size= self.node_config.get("chunk_size", 4096)-250,
|
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||||
token_counter= lambda x: len(x),
|
token_counter=lambda text: len(text.split()),
|
||||||
memoize=False)
|
memoize=False)
|
||||||
|
|
||||||
state.update({self.output[0]: chunks})
|
state.update({self.output[0]: chunks})
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user