From a18471688f0b79f06fb7078b01b68eeddc88eae4 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 19 Oct 2024 07:18:56 +0200 Subject: [PATCH 1/3] fix: removed tokenizer --- scrapegraphai/utils/tokenizer.py | 8 -------- scrapegraphai/utils/tokenizers/tokenizer_ollama.py | 7 +------ 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/scrapegraphai/utils/tokenizer.py b/scrapegraphai/utils/tokenizer.py index 8d5577fd..f6650672 100644 --- a/scrapegraphai/utils/tokenizer.py +++ b/scrapegraphai/utils/tokenizer.py @@ -6,7 +6,6 @@ from langchain_openai import ChatOpenAI from langchain_ollama import ChatOllama from langchain_mistralai import ChatMistralAI from langchain_core.language_models.chat_models import BaseChatModel -from transformers import GPT2TokenizerFast def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int: """ @@ -24,13 +23,6 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int: from .tokenizers.tokenizer_ollama import num_tokens_ollama num_tokens_fn = num_tokens_ollama - elif isinstance(llm_model, GPT2TokenizerFast): - def num_tokens_gpt2(text: str, model: BaseChatModel) -> int: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - tokens = tokenizer.encode(text) - return len(tokens) - num_tokens_fn = num_tokens_gpt2 - else: from .tokenizers.tokenizer_openai import num_tokens_openai num_tokens_fn = num_tokens_openai diff --git a/scrapegraphai/utils/tokenizers/tokenizer_ollama.py b/scrapegraphai/utils/tokenizers/tokenizer_ollama.py index feb59e6b..a981e25c 100644 --- a/scrapegraphai/utils/tokenizers/tokenizer_ollama.py +++ b/scrapegraphai/utils/tokenizers/tokenizer_ollama.py @@ -3,7 +3,6 @@ Tokenization utilities for Ollama models """ from langchain_core.language_models.chat_models import BaseChatModel from ..logging import get_logger -from transformers import GPT2TokenizerFast def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int: """ @@ -22,12 +21,8 @@ def num_tokens_ollama(text: str, 
llm_model:BaseChatModel) -> int: logger.debug(f"Counting tokens for text of {len(text)} characters") - if isinstance(llm_model, GPT2TokenizerFast): - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - tokens = tokenizer.encode(text) - return len(tokens) - # Use langchain token count implementation # NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507 tokens = llm_model.get_num_tokens(text) return tokens + From ec9ef2bcda9aa81f66b943829fcdb22fe265976e Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sat, 19 Oct 2024 05:20:39 +0000 Subject: [PATCH 2/3] ci(release): 1.26.7 [skip ci] ## [1.26.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6...v1.26.7) (2024-10-19) ### Bug Fixes * removed tokenizer ([a184716](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a18471688f0b79f06fb7078b01b68eeddc88eae4)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09629cf1..d807e233 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.26.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6...v1.26.7) (2024-10-19) + + +### Bug Fixes + +* removed tokenizer ([a184716](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a18471688f0b79f06fb7078b01b68eeddc88eae4)) + ## [1.26.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6) (2024-10-18) diff --git a/pyproject.toml b/pyproject.toml index fec68549..11b98499 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.26.6" +version = "1.26.7" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ From 11ae717623a3600d58c32cd0cae5d75265f7d366 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 21 Oct 2024 11:16:29 +0200 Subject: [PATCH 3/3] add new doc --- docs/source/introduction/overview.rst | 41 ++++++++++++++++++- docs/source/modules/modules.rst | 3 ++ .../scrapegraphai.helpers.models_tokens.rst | 28 +++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 docs/source/modules/scrapegraphai.helpers.models_tokens.rst diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst index 506770a5..a37bbacc 100644 --- a/docs/source/introduction/overview.rst +++ b/docs/source/introduction/overview.rst @@ -22,6 +22,45 @@ This flexibility ensures that scrapers remain functional even when website layou We support many LLMs including **GPT, Gemini, Groq, Azure, Hugging Face** etc. as well as local models which can run on your machine using **Ollama**. +AI Models and Token Limits +========================== + +ScrapeGraphAI supports a wide range of AI models from various providers. Each model has a specific token limit, which is important to consider when designing your scraping pipelines. 
Here's an overview of the supported models and their token limits: + +OpenAI Models +------------- +- GPT-3.5 Turbo (16,385 tokens) +- GPT-4 (8,192 tokens) +- GPT-4 Turbo Preview (128,000 tokens) + +Azure OpenAI Models +------------------- +- GPT-3.5 Turbo (16,385 tokens) +- GPT-4 (8,192 tokens) +- GPT-4 Turbo Preview (128,000 tokens) + +Google AI Models +---------------- +- Gemini Pro (128,000 tokens) +- Gemini 1.5 Pro (128,000 tokens) + +Anthropic Models +---------------- +- Claude Instant (100,000 tokens) +- Claude 2 (200,000 tokens) +- Claude 3 (200,000 tokens) + +Mistral AI Models +----------------- +- Mistral Large (128,000 tokens) +- Open Mistral 7B (32,000 tokens) +- Open Mixtral 8x7B (32,000 tokens) + +For a complete list of supported models and their token limits, please refer to the API documentation. + +Understanding token limits is crucial for optimizing your scraping tasks. Larger token limits allow for processing more text in a single API call, which can be beneficial for scraping lengthy web pages or documents. + + Library Diagram =============== @@ -95,4 +134,4 @@ Sponsors .. 
image:: ../../assets/transparent_stat.png :width: 15% :alt: Stat Proxies - :target: https://dashboard.statproxies.com/?refferal=scrapegraph \ No newline at end of file + :target: https://dashboard.statproxies.com/?refferal=scrapegraph diff --git a/docs/source/modules/modules.rst b/docs/source/modules/modules.rst index eaa8b0f6..7551ea96 100644 --- a/docs/source/modules/modules.rst +++ b/docs/source/modules/modules.rst @@ -5,3 +5,6 @@ scrapegraphai :maxdepth: 4 scrapegraphai + + scrapegraphai.helpers.models_tokens + diff --git a/docs/source/modules/scrapegraphai.helpers.models_tokens.rst b/docs/source/modules/scrapegraphai.helpers.models_tokens.rst new file mode 100644 index 00000000..173e1bc3 --- /dev/null +++ b/docs/source/modules/scrapegraphai.helpers.models_tokens.rst @@ -0,0 +1,28 @@ +scrapegraphai.helpers.models_tokens module +========================================== + +.. automodule:: scrapegraphai.helpers.models_tokens + :members: + :undoc-members: + :show-inheritance: + +This module contains a comprehensive dictionary of AI models and their corresponding token limits. The `models_tokens` dictionary is organized by provider (e.g., OpenAI, Azure OpenAI, Google AI, etc.) and includes various models with their maximum token counts. + +Example usage: + +.. code-block:: python + + from scrapegraphai.helpers.models_tokens import models_tokens + + # Get the token limit for GPT-4 + gpt4_limit = models_tokens['openai']['gpt-4'] + print(f"GPT-4 token limit: {gpt4_limit}") + + # Check the token limit for a specific model + model_name = "gpt-3.5-turbo" + if model_name in models_tokens['openai']: + print(f"{model_name} token limit: {models_tokens['openai'][model_name]}") + else: + print(f"{model_name} not found in the models list") + +This information is crucial for users to understand the capabilities and limitations of different AI models when designing their scraping pipelines. \ No newline at end of file