diff --git a/CHANGELOG.md b/CHANGELOG.md index 346cf772..b908800e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,8 @@ * implement ScrapeGraph class for only web scraping automation ([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254)) * Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4)) +======= +## [1.26.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6...v1.26.7) (2024-10-19) ### Bug Fixes @@ -70,6 +72,8 @@ * add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3)) +* removed tokenizer ([a184716](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a18471688f0b79f06fb7078b01b68eeddc88eae4)) + ## [1.26.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6) (2024-10-18) ## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14) @@ -79,7 +83,6 @@ * remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918)) * refactoring of gpt2 tokenizer ([44c3f9c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/44c3f9c98939c44caa86dc582242819a7c6a0f80)) ->>>>>>> main ## [1.26.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.4...v1.26.5) (2024-10-13) diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst index 506770a5..a37bbacc 100644 --- a/docs/source/introduction/overview.rst +++ b/docs/source/introduction/overview.rst @@ -22,6 +22,45 @@ This flexibility ensures that scrapers remain 
functional even when website layou We support many LLMs including **GPT, Gemini, Groq, Azure, Hugging Face** etc. as well as local models which can run on your machine using **Ollama**. +AI Models and Token Limits +========================== + +ScrapeGraphAI supports a wide range of AI models from various providers. Each model has a specific token limit, which is important to consider when designing your scraping pipelines. Here's an overview of the supported models and their token limits: + +OpenAI Models +------------- +- GPT-3.5 Turbo (16,385 tokens) +- GPT-4 (8,192 tokens) +- GPT-4 Turbo Preview (128,000 tokens) + +Azure OpenAI Models +------------------- +- GPT-3.5 Turbo (16,385 tokens) +- GPT-4 (8,192 tokens) +- GPT-4 Turbo Preview (128,000 tokens) + +Google AI Models +---------------- +- Gemini Pro (128,000 tokens) +- Gemini 1.5 Pro (128,000 tokens) + +Anthropic Models +---------------- +- Claude Instant (100,000 tokens) +- Claude 2 (200,000 tokens) +- Claude 3 (200,000 tokens) + +Mistral AI Models +----------------- +- Mistral Large (128,000 tokens) +- Open Mistral 7B (32,000 tokens) +- Open Mixtral 8x7B (32,000 tokens) + +For a complete list of supported models and their token limits, please refer to the API documentation. + +Understanding token limits is crucial for optimizing your scraping tasks. Larger token limits allow for processing more text in a single API call, which can be beneficial for scraping lengthy web pages or documents. + + Library Diagram =============== @@ -95,4 +134,4 @@ Sponsors .. 
image:: ../../assets/transparent_stat.png :width: 15% :alt: Stat Proxies - :target: https://dashboard.statproxies.com/?refferal=scrapegraph \ No newline at end of file + :target: https://dashboard.statproxies.com/?refferal=scrapegraph diff --git a/docs/source/modules/modules.rst b/docs/source/modules/modules.rst index eaa8b0f6..7551ea96 100644 --- a/docs/source/modules/modules.rst +++ b/docs/source/modules/modules.rst @@ -5,3 +5,6 @@ scrapegraphai :maxdepth: 4 scrapegraphai + + scrapegraphai.helpers.models_tokens + diff --git a/docs/source/modules/scrapegraphai.helpers.models_tokens.rst b/docs/source/modules/scrapegraphai.helpers.models_tokens.rst new file mode 100644 index 00000000..173e1bc3 --- /dev/null +++ b/docs/source/modules/scrapegraphai.helpers.models_tokens.rst @@ -0,0 +1,28 @@ +scrapegraphai.helpers.models_tokens module +========================================== + +.. automodule:: scrapegraphai.helpers.models_tokens + :members: + :undoc-members: + :show-inheritance: + +This module contains a comprehensive dictionary of AI models and their corresponding token limits. The ``models_tokens`` dictionary is organized by provider (e.g., OpenAI, Azure OpenAI, Google AI, etc.) and includes various models with their maximum token counts. + +Example usage: + +.. code-block:: python + + from scrapegraphai.helpers.models_tokens import models_tokens + + # Get the token limit for GPT-4 + gpt4_limit = models_tokens['openai']['gpt-4'] + print(f"GPT-4 token limit: {gpt4_limit}") + + # Check the token limit for a specific model + model_name = "gpt-3.5-turbo" + if model_name in models_tokens['openai']: + print(f"{model_name} token limit: {models_tokens['openai'][model_name]}") + else: + print(f"{model_name} not found in the models list") + +This information is crucial for users to understand the capabilities and limitations of different AI models when designing their scraping pipelines. 
\ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 0fab27b6..553c574c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ name = "scrapegraphai" version = "1.27.0b7" + description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, diff --git a/scrapegraphai/utils/tokenizer.py b/scrapegraphai/utils/tokenizer.py index 8d5577fd..f6650672 100644 --- a/scrapegraphai/utils/tokenizer.py +++ b/scrapegraphai/utils/tokenizer.py @@ -6,7 +6,6 @@ from langchain_openai import ChatOpenAI from langchain_ollama import ChatOllama from langchain_mistralai import ChatMistralAI from langchain_core.language_models.chat_models import BaseChatModel -from transformers import GPT2TokenizerFast def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int: """ @@ -24,13 +23,6 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int: from .tokenizers.tokenizer_ollama import num_tokens_ollama num_tokens_fn = num_tokens_ollama - elif isinstance(llm_model, GPT2TokenizerFast): - def num_tokens_gpt2(text: str, model: BaseChatModel) -> int: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - tokens = tokenizer.encode(text) - return len(tokens) - num_tokens_fn = num_tokens_gpt2 - else: from .tokenizers.tokenizer_openai import num_tokens_openai num_tokens_fn = num_tokens_openai diff --git a/scrapegraphai/utils/tokenizers/tokenizer_ollama.py b/scrapegraphai/utils/tokenizers/tokenizer_ollama.py index feb59e6b..a981e25c 100644 --- a/scrapegraphai/utils/tokenizers/tokenizer_ollama.py +++ b/scrapegraphai/utils/tokenizers/tokenizer_ollama.py @@ -3,7 +3,6 @@ Tokenization utilities for Ollama models """ from langchain_core.language_models.chat_models import BaseChatModel from ..logging import get_logger -from transformers import GPT2TokenizerFast def num_tokens_ollama(text: str, llm_model:BaseChatModel) 
-> int: """ @@ -22,12 +21,8 @@ def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int: logger.debug(f"Counting tokens for text of {len(text)} characters") - if isinstance(llm_model, GPT2TokenizerFast): - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - tokens = tokenizer.encode(text) - return len(tokens) - # Use langchain token count implementation # NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507 tokens = llm_model.get_num_tokens(text) return tokens +