feat: removed semchunk and used tiktoken

This commit is contained in:
Marco Vinciguerra 2024-09-10 14:03:52 +02:00
parent 380174d490
commit 1a7f21fbf3
6 changed files with 48 additions and 38 deletions

View File

@ -29,7 +29,6 @@ dependencies = [
"playwright>=1.43.0",
"undetected-playwright>=0.3.0",
"google>=3.0.0",
"semchunk>=1.0.1",
"langchain-ollama>=0.1.3",
]

View File

@ -71,7 +71,6 @@ cycler==0.12.1
dataclasses-json==0.6.7
# via langchain-community
dill==0.3.8
# via multiprocess
# via pylint
distro==1.9.0
# via openai
@ -236,13 +235,9 @@ mdurl==0.1.2
# via markdown-it-py
minify-html==0.15.0
# via scrapegraphai
mpire==2.10.2
# via semchunk
multidict==6.0.5
# via aiohttp
# via yarl
multiprocess==0.70.16
# via mpire
mypy-extensions==1.0.0
# via typing-inspect
narwhals==1.3.0
@ -325,7 +320,6 @@ pyee==11.1.0
# via playwright
pygments==2.18.0
# via furo
# via mpire
# via rich
# via sphinx
pylint==3.2.6
@ -373,8 +367,6 @@ rsa==4.9
# via google-auth
s3transfer==0.10.2
# via boto3
semchunk==2.2.0
# via scrapegraphai
sf-hamilton==1.73.1
# via burr
six==1.16.0
@ -436,10 +428,8 @@ tornado==6.4.1
tqdm==4.66.5
# via google-generativeai
# via huggingface-hub
# via mpire
# via openai
# via scrapegraphai
# via semchunk
typing-extensions==4.12.2
# via altair
# via anyio

View File

@ -41,8 +41,6 @@ charset-normalizer==3.3.2
# via requests
dataclasses-json==0.6.7
# via langchain-community
dill==0.3.8
# via multiprocess
distro==1.9.0
# via openai
exceptiongroup==1.2.2
@ -155,13 +153,9 @@ marshmallow==3.21.3
# via dataclasses-json
minify-html==0.15.0
# via scrapegraphai
mpire==2.10.2
# via semchunk
multidict==6.0.5
# via aiohttp
# via yarl
multiprocess==0.70.16
# via mpire
mypy-extensions==1.0.0
# via typing-inspect
numpy==1.26.4
@ -211,8 +205,6 @@ pydantic-core==2.20.1
# via pydantic
pyee==11.1.0
# via playwright
pygments==2.18.0
# via mpire
pyparsing==3.1.2
# via httplib2
python-dateutil==2.9.0.post0
@ -241,8 +233,6 @@ rsa==4.9
# via google-auth
s3transfer==0.10.2
# via boto3
semchunk==2.2.0
# via scrapegraphai
six==1.16.0
# via python-dateutil
sniffio==1.3.1
@ -266,10 +256,8 @@ tokenizers==0.19.1
tqdm==4.66.4
# via google-generativeai
# via huggingface-hub
# via mpire
# via openai
# via scrapegraphai
# via semchunk
typing-extensions==4.12.2
# via anyio
# via google-generativeai

View File

@ -2,11 +2,10 @@
ParseNode Module
"""
from typing import List, Optional
from semchunk import chunk
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from .base_node import BaseNode
from tokenizer import num_tokens_calculus
from ..utils.split_text_into_chunks import split_text_into_chunks
class ParseNode(BaseNode):
"""
@ -69,10 +68,8 @@ class ParseNode(BaseNode):
docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
docs_transformed = docs_transformed[0]
chunks = chunk(text=docs_transformed.page_content,
chunk_size=self.node_config.get("chunk_size", 4096)-250,
token_counter=lambda text: len(text.split()),
memoize=False)
chunks = split_text_into_chunks(text=docs_transformed.page_content,
chunk_size=self.node_config.get("chunk_size", 4096)-250)
else:
docs_transformed = docs_transformed[0]
@ -80,15 +77,11 @@ class ParseNode(BaseNode):
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
if isinstance(docs_transformed, Document):
chunks = chunk(text=docs_transformed.page_content,
chunk_size=chunk_size,
token_counter=lambda text: len(text.split()),
memoize=False)
chunks = split_text_into_chunks(text=docs_transformed.page_content,
chunk_size=chunk_size)
else:
chunks = chunk(text=docs_transformed,
chunk_size=chunk_size,
token_counter=lambda text: len(text.split()),
memoize=False)
chunks = split_text_into_chunks(text=docs_transformed,
chunk_size=chunk_size)
state.update({self.output[0]: chunks})

View File

@ -1,7 +1,6 @@
"""
__init__.py file for utils folder
"""
from .convert_to_csv import convert_to_csv
from .convert_to_json import convert_to_json
from .prettify_exec_info import prettify_exec_info
@ -17,3 +16,4 @@ from .screenshot_scraping.screenshot_preparation import (take_screenshot,
crop_image)
from .screenshot_scraping.text_detection import detect_text
from .tokenizer import num_tokens_calculus
from .split_text_into_chunks import split_text_into_chunks

View File

@ -0,0 +1,40 @@
"""
split_text_into_chunks module
"""
from typing import Callable, List, Optional
from .tokenizer import num_tokens_calculus # Import the new tokenizing function
def split_text_into_chunks(text: str, chunk_size: int, *,
                           token_counter: Optional[Callable[[str], int]] = None) -> List[str]:
    """
    Splits the text into chunks based on the number of tokens.

    If the whole text fits within ``chunk_size`` it is returned untouched as a
    single-element list. Otherwise the text is split on whitespace and the
    words are greedily packed into chunks, each chunk being re-joined with a
    single space (so the original whitespace/newlines are NOT preserved in
    that case).

    The size of a chunk is estimated as the sum of the per-word token counts,
    which may differ slightly from the token count of the joined chunk.
    A single word whose token count is larger than ``chunk_size`` cannot be
    split further and is emitted as its own (oversized) chunk.

    Args:
        text (str): The text to split.
        chunk_size (int): The maximum number of tokens per chunk.
        token_counter (Callable[[str], int], optional): Function returning the
            number of tokens of a string. Defaults to ``num_tokens_calculus``.

    Returns:
        List[str]: A list of text chunks. No empty chunk is ever produced
        by the word-packing path.
    """
    # Resolve the counter lazily so a caller-supplied one fully replaces the default.
    count_tokens = num_tokens_calculus if token_counter is None else token_counter

    # Fast path: nothing to split, keep the text byte-for-byte.
    if count_tokens(text) <= chunk_size:
        return [text]

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_tokens = count_tokens(word)

        # Flush only when there is something to flush: without the
        # `current_chunk` guard an oversized first word would emit an
        # empty-string chunk.
        if current_chunk and current_length + word_tokens > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(word)
        current_length += word_tokens

    # Emit the trailing, partially filled chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks