diff --git a/pyproject.toml b/pyproject.toml index 74f425b9..05dc7078 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ dependencies = [ "playwright>=1.43.0", "undetected-playwright>=0.3.0", "google>=3.0.0", - "semchunk>=1.0.1", "langchain-ollama>=0.1.3", ] diff --git a/requirements-dev.lock b/requirements-dev.lock index f68580cc..fd04d800 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -71,7 +71,6 @@ cycler==0.12.1 dataclasses-json==0.6.7 # via langchain-community dill==0.3.8 - # via multiprocess # via pylint distro==1.9.0 # via openai @@ -236,13 +235,9 @@ mdurl==0.1.2 # via markdown-it-py minify-html==0.15.0 # via scrapegraphai -mpire==2.10.2 - # via semchunk multidict==6.0.5 # via aiohttp # via yarl -multiprocess==0.70.16 - # via mpire mypy-extensions==1.0.0 # via typing-inspect narwhals==1.3.0 @@ -325,7 +320,6 @@ pyee==11.1.0 # via playwright pygments==2.18.0 # via furo - # via mpire # via rich # via sphinx pylint==3.2.6 @@ -373,8 +367,6 @@ rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 -semchunk==2.2.0 - # via scrapegraphai sf-hamilton==1.73.1 # via burr six==1.16.0 @@ -436,10 +428,8 @@ tornado==6.4.1 tqdm==4.66.5 # via google-generativeai # via huggingface-hub - # via mpire # via openai # via scrapegraphai - # via semchunk typing-extensions==4.12.2 # via altair # via anyio diff --git a/requirements.lock b/requirements.lock index e0852c04..b34c9290 100644 --- a/requirements.lock +++ b/requirements.lock @@ -41,8 +41,6 @@ charset-normalizer==3.3.2 # via requests dataclasses-json==0.6.7 # via langchain-community -dill==0.3.8 - # via multiprocess distro==1.9.0 # via openai exceptiongroup==1.2.2 @@ -155,13 +153,9 @@ marshmallow==3.21.3 # via dataclasses-json minify-html==0.15.0 # via scrapegraphai -mpire==2.10.2 - # via semchunk multidict==6.0.5 # via aiohttp # via yarl -multiprocess==0.70.16 - # via mpire mypy-extensions==1.0.0 # via typing-inspect numpy==1.26.4 @@ -211,8 +205,6 @@ pydantic-core==2.20.1 # via pydantic 
pyee==11.1.0 # via playwright -pygments==2.18.0 - # via mpire pyparsing==3.1.2 # via httplib2 python-dateutil==2.9.0.post0 @@ -241,8 +233,6 @@ rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 -semchunk==2.2.0 - # via scrapegraphai six==1.16.0 # via python-dateutil sniffio==1.3.1 @@ -266,10 +256,8 @@ tokenizers==0.19.1 tqdm==4.66.4 # via google-generativeai # via huggingface-hub - # via mpire # via openai # via scrapegraphai - # via semchunk typing-extensions==4.12.2 # via anyio # via google-generativeai diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 4a8638e7..65c2b58d 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -2,11 +2,10 @@ ParseNode Module """ from typing import List, Optional -from semchunk import chunk from langchain_community.document_transformers import Html2TextTransformer from langchain_core.documents import Document from .base_node import BaseNode -from tokenizer import num_tokens_calculus +from ..utils.split_text_into_chunks import split_text_into_chunks class ParseNode(BaseNode): """ @@ -69,10 +68,8 @@ class ParseNode(BaseNode): docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0]) docs_transformed = docs_transformed[0] - chunks = chunk(text=docs_transformed.page_content, - chunk_size=self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda text: len(text.split()), - memoize=False) + chunks = split_text_into_chunks(text=docs_transformed.page_content, + chunk_size=self.node_config.get("chunk_size", 4096)-250) else: docs_transformed = docs_transformed[0] @@ -80,15 +77,11 @@ class ParseNode(BaseNode): chunk_size = min(chunk_size - 500, int(chunk_size * 0.9)) if isinstance(docs_transformed, Document): - chunks = chunk(text=docs_transformed.page_content, - chunk_size=chunk_size, - token_counter=lambda text: len(text.split()), - memoize=False) + chunks = split_text_into_chunks(text=docs_transformed.page_content, 
def split_text_into_chunks(text: str, chunk_size: int, *, token_counter=None) -> List[str]:
    """
    Splits the text into chunks based on the number of tokens.

    The text is returned untouched (as a single chunk) when it already fits
    into ``chunk_size``. Otherwise it is split on whitespace and the words are
    greedily packed into chunks; words inside a chunk are re-joined with a
    single space, so the original whitespace (newlines, tabs, repeated spaces)
    is not preserved in that case.

    Args:
        text (str): The text to split.
        chunk_size (int): The maximum number of tokens per chunk.
        token_counter (Callable[[str], int], optional): Function returning the
            number of tokens of a string. Defaults to ``num_tokens_calculus``.

    Returns:
        List[str]: A list of text chunks. No chunk is ever an empty string
        unless ``text`` itself is empty and fits into ``chunk_size``.

    Notes:
        - A single word whose token count exceeds ``chunk_size`` is not split
          further; it is emitted as its own (oversized) chunk.
        - Chunk length is tracked as the sum of per-word token counts, which
          is only an approximation of the token count of the joined chunk.
    """
    if token_counter is None:
        # Resolved lazily so the default tokenizer is only needed when the
        # caller does not supply its own counter.
        token_counter = num_tokens_calculus

    if token_counter(text) <= chunk_size:
        return [text]

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_tokens = token_counter(word)
        # Flush only a non-empty chunk: when the very first word (or a word
        # right after a flush) is larger than chunk_size there is nothing to
        # emit yet, and flushing would produce an empty-string chunk.
        if current_chunk and current_length + word_tokens > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks