mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
feat: removed semchunk and used tiktoken
This commit is contained in:
parent
380174d490
commit
1a7f21fbf3
@ -29,7 +29,6 @@ dependencies = [
|
||||
"playwright>=1.43.0",
|
||||
"undetected-playwright>=0.3.0",
|
||||
"google>=3.0.0",
|
||||
"semchunk>=1.0.1",
|
||||
"langchain-ollama>=0.1.3",
|
||||
]
|
||||
|
||||
|
||||
@ -71,7 +71,6 @@ cycler==0.12.1
|
||||
dataclasses-json==0.6.7
|
||||
# via langchain-community
|
||||
dill==0.3.8
|
||||
# via multiprocess
|
||||
# via pylint
|
||||
distro==1.9.0
|
||||
# via openai
|
||||
@ -236,13 +235,9 @@ mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
minify-html==0.15.0
|
||||
# via scrapegraphai
|
||||
mpire==2.10.2
|
||||
# via semchunk
|
||||
multidict==6.0.5
|
||||
# via aiohttp
|
||||
# via yarl
|
||||
multiprocess==0.70.16
|
||||
# via mpire
|
||||
mypy-extensions==1.0.0
|
||||
# via typing-inspect
|
||||
narwhals==1.3.0
|
||||
@ -325,7 +320,6 @@ pyee==11.1.0
|
||||
# via playwright
|
||||
pygments==2.18.0
|
||||
# via furo
|
||||
# via mpire
|
||||
# via rich
|
||||
# via sphinx
|
||||
pylint==3.2.6
|
||||
@ -373,8 +367,6 @@ rsa==4.9
|
||||
# via google-auth
|
||||
s3transfer==0.10.2
|
||||
# via boto3
|
||||
semchunk==2.2.0
|
||||
# via scrapegraphai
|
||||
sf-hamilton==1.73.1
|
||||
# via burr
|
||||
six==1.16.0
|
||||
@ -436,10 +428,8 @@ tornado==6.4.1
|
||||
tqdm==4.66.5
|
||||
# via google-generativeai
|
||||
# via huggingface-hub
|
||||
# via mpire
|
||||
# via openai
|
||||
# via scrapegraphai
|
||||
# via semchunk
|
||||
typing-extensions==4.12.2
|
||||
# via altair
|
||||
# via anyio
|
||||
|
||||
@ -41,8 +41,6 @@ charset-normalizer==3.3.2
|
||||
# via requests
|
||||
dataclasses-json==0.6.7
|
||||
# via langchain-community
|
||||
dill==0.3.8
|
||||
# via multiprocess
|
||||
distro==1.9.0
|
||||
# via openai
|
||||
exceptiongroup==1.2.2
|
||||
@ -155,13 +153,9 @@ marshmallow==3.21.3
|
||||
# via dataclasses-json
|
||||
minify-html==0.15.0
|
||||
# via scrapegraphai
|
||||
mpire==2.10.2
|
||||
# via semchunk
|
||||
multidict==6.0.5
|
||||
# via aiohttp
|
||||
# via yarl
|
||||
multiprocess==0.70.16
|
||||
# via mpire
|
||||
mypy-extensions==1.0.0
|
||||
# via typing-inspect
|
||||
numpy==1.26.4
|
||||
@ -211,8 +205,6 @@ pydantic-core==2.20.1
|
||||
# via pydantic
|
||||
pyee==11.1.0
|
||||
# via playwright
|
||||
pygments==2.18.0
|
||||
# via mpire
|
||||
pyparsing==3.1.2
|
||||
# via httplib2
|
||||
python-dateutil==2.9.0.post0
|
||||
@ -241,8 +233,6 @@ rsa==4.9
|
||||
# via google-auth
|
||||
s3transfer==0.10.2
|
||||
# via boto3
|
||||
semchunk==2.2.0
|
||||
# via scrapegraphai
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.1
|
||||
@ -266,10 +256,8 @@ tokenizers==0.19.1
|
||||
tqdm==4.66.4
|
||||
# via google-generativeai
|
||||
# via huggingface-hub
|
||||
# via mpire
|
||||
# via openai
|
||||
# via scrapegraphai
|
||||
# via semchunk
|
||||
typing-extensions==4.12.2
|
||||
# via anyio
|
||||
# via google-generativeai
|
||||
|
||||
@ -2,11 +2,10 @@
|
||||
ParseNode Module
|
||||
"""
|
||||
from typing import List, Optional
|
||||
from semchunk import chunk
|
||||
from langchain_community.document_transformers import Html2TextTransformer
|
||||
from langchain_core.documents import Document
|
||||
from .base_node import BaseNode
|
||||
from tokenizer import num_tokens_calculus
|
||||
from ..utils.split_text_into_chunks import split_text_into_chunks
|
||||
|
||||
class ParseNode(BaseNode):
|
||||
"""
|
||||
@ -69,10 +68,8 @@ class ParseNode(BaseNode):
|
||||
docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||
token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
chunks = split_text_into_chunks(text=docs_transformed.page_content,
|
||||
chunk_size=self.node_config.get("chunk_size", 4096)-250)
|
||||
else:
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
@ -80,15 +77,11 @@ class ParseNode(BaseNode):
|
||||
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
|
||||
|
||||
if isinstance(docs_transformed, Document):
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
chunk_size=chunk_size,
|
||||
token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
chunks = split_text_into_chunks(text=docs_transformed.page_content,
|
||||
chunk_size=chunk_size)
|
||||
else:
|
||||
chunks = chunk(text=docs_transformed,
|
||||
chunk_size=chunk_size,
|
||||
token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
chunks = split_text_into_chunks(text=docs_transformed,
|
||||
chunk_size=chunk_size)
|
||||
|
||||
state.update({self.output[0]: chunks})
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
"""
|
||||
__init__.py file for utils folder
|
||||
"""
|
||||
|
||||
from .convert_to_csv import convert_to_csv
|
||||
from .convert_to_json import convert_to_json
|
||||
from .prettify_exec_info import prettify_exec_info
|
||||
@ -17,3 +16,4 @@ from .screenshot_scraping.screenshot_preparation import (take_screenshot,
|
||||
crop_image)
|
||||
from .screenshot_scraping.text_detection import detect_text
|
||||
from .tokenizer import num_tokens_calculus
|
||||
from .split_text_into_chunks import split_text_into_chunks
|
||||
|
||||
40
scrapegraphai/utils/split_text_into_chunks.py
Normal file
40
scrapegraphai/utils/split_text_into_chunks.py
Normal file
@ -0,0 +1,40 @@
|
||||
"""
|
||||
split_text_into_chunks module
|
||||
"""
|
||||
from typing import Callable, List, Optional
|
||||
from .tokenizer import num_tokens_calculus # Import the new tokenizing function
|
||||
|
||||
def split_text_into_chunks(
    text: str,
    chunk_size: int,
    *,
    token_counter: Optional[Callable[[str], int]] = None,
) -> List[str]:
    """
    Splits the text into chunks based on the number of tokens.

    The text is returned unchanged as a single chunk when its total token
    count fits within ``chunk_size``. Otherwise it is split on whitespace and
    the words are greedily packed into chunks, each word being measured
    individually with the token counter. Words inside a chunk are re-joined
    with a single space, so the original whitespace (newlines, tabs, runs of
    spaces) is not preserved in the multi-chunk case.

    A single word whose own token count exceeds ``chunk_size`` is never split;
    it is emitted as a chunk of its own. No empty chunks are produced.

    Args:
        text (str): The text to split.
        chunk_size (int): The maximum number of tokens per chunk.
        token_counter (Optional[Callable[[str], int]]): Function returning the
            number of tokens in a string. Defaults to the module-level
            ``num_tokens_calculus``.

    Returns:
        List[str]: A list of text chunks.
    """
    # Resolve the default lazily so callers may inject their own counter.
    count_tokens = num_tokens_calculus if token_counter is None else token_counter

    # Fast path: the whole text already fits, keep it byte-for-byte intact.
    if count_tokens(text) <= chunk_size:
        return [text]

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for word in text.split():
        word_tokens = count_tokens(word)
        # Flush only when something has been accumulated; otherwise an
        # oversized first word would produce a spurious empty chunk.
        if current_chunk and current_length + word_tokens > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_tokens
        else:
            current_chunk.append(word)
            current_length += word_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
|
||||
Loading…
Reference in New Issue
Block a user