mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
Merge branch 'main' into pre/beta
This commit is contained in:
commit
066e77dbe7
@ -33,6 +33,8 @@
|
||||
|
||||
* implement ScrapeGraph class for only web scraping automation ([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254))
|
||||
* Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs, merges the content first, and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4))
|
||||
=======
|
||||
## [1.26.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.6...v1.26.7) (2024-10-19)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
@ -70,6 +72,8 @@
|
||||
* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3))
|
||||
|
||||
|
||||
* removed tokenizer ([a184716](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a18471688f0b79f06fb7078b01b68eeddc88eae4))
|
||||
|
||||
## [1.26.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6) (2024-10-18)
|
||||
|
||||
## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14)
|
||||
@ -79,7 +83,6 @@
|
||||
* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918))
|
||||
|
||||
* refactoring of gpt2 tokenizer ([44c3f9c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/44c3f9c98939c44caa86dc582242819a7c6a0f80))
|
||||
>>>>>>> main
|
||||
|
||||
## [1.26.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.4...v1.26.5) (2024-10-13)
|
||||
|
||||
|
||||
@ -22,6 +22,45 @@ This flexibility ensures that scrapers remain functional even when website layou
|
||||
We support many LLMs including **GPT, Gemini, Groq, Azure, Hugging Face** etc.
|
||||
as well as local models which can run on your machine using **Ollama**.
|
||||
|
||||
AI Models and Token Limits
|
||||
==========================
|
||||
|
||||
ScrapeGraphAI supports a wide range of AI models from various providers. Each model has a specific token limit, which is important to consider when designing your scraping pipelines. Here's an overview of the supported models and their token limits:
|
||||
|
||||
OpenAI Models
|
||||
-------------
|
||||
- GPT-3.5 Turbo (16,385 tokens)
|
||||
- GPT-4 (8,192 tokens)
|
||||
- GPT-4 Turbo Preview (128,000 tokens)
|
||||
|
||||
Azure OpenAI Models
|
||||
-------------------
|
||||
- GPT-3.5 Turbo (16,385 tokens)
|
||||
- GPT-4 (8,192 tokens)
|
||||
- GPT-4 Turbo Preview (128,000 tokens)
|
||||
|
||||
Google AI Models
|
||||
----------------
|
||||
- Gemini Pro (128,000 tokens)
|
||||
- Gemini 1.5 Pro (128,000 tokens)
|
||||
|
||||
Anthropic Models
|
||||
----------------
|
||||
- Claude Instant (100,000 tokens)
|
||||
- Claude 2 (200,000 tokens)
|
||||
- Claude 3 (200,000 tokens)
|
||||
|
||||
Mistral AI Models
|
||||
-----------------
|
||||
- Mistral Large (128,000 tokens)
|
||||
- Open Mistral 7B (32,000 tokens)
|
||||
- Open Mixtral 8x7B (32,000 tokens)
|
||||
|
||||
For a complete list of supported models and their token limits, please refer to the API documentation.
|
||||
|
||||
Understanding token limits is crucial for optimizing your scraping tasks. Larger token limits allow for processing more text in a single API call, which can be beneficial for scraping lengthy web pages or documents.
|
||||
|
||||
|
||||
Library Diagram
|
||||
===============
|
||||
|
||||
@ -95,4 +134,4 @@ Sponsors
|
||||
.. image:: ../../assets/transparent_stat.png
|
||||
:width: 15%
|
||||
:alt: Stat Proxies
|
||||
:target: https://dashboard.statproxies.com/?refferal=scrapegraph
|
||||
:target: https://dashboard.statproxies.com/?refferal=scrapegraph
|
||||
|
||||
@ -5,3 +5,6 @@ scrapegraphai
|
||||
:maxdepth: 4
|
||||
|
||||
scrapegraphai
|
||||
|
||||
scrapegraphai.helpers.models_tokens
|
||||
|
||||
|
||||
28
docs/source/modules/scrapegraphai.helpers.models_tokens.rst
Normal file
28
docs/source/modules/scrapegraphai.helpers.models_tokens.rst
Normal file
@ -0,0 +1,28 @@
|
||||
scrapegraphai.helpers.models_tokens module
|
||||
==========================================
|
||||
|
||||
.. automodule:: scrapegraphai.helpers.models_tokens
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
This module contains a comprehensive dictionary of AI models and their corresponding token limits. The ``models_tokens`` dictionary is organized by provider (e.g., OpenAI, Azure OpenAI, Google AI, etc.) and includes various models with their maximum token counts.
|
||||
|
||||
Example usage:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from scrapegraphai.helpers.models_tokens import models_tokens
|
||||
|
||||
# Get the token limit for GPT-4
|
||||
gpt4_limit = models_tokens['openai']['gpt-4']
|
||||
print(f"GPT-4 token limit: {gpt4_limit}")
|
||||
|
||||
# Check the token limit for a specific model
|
||||
model_name = "gpt-3.5-turbo"
|
||||
if model_name in models_tokens['openai']:
|
||||
print(f"{model_name} token limit: {models_tokens['openai'][model_name]}")
|
||||
else:
|
||||
print(f"{model_name} not found in the models list")
|
||||
|
||||
This information is crucial for users to understand the capabilities and limitations of different AI models when designing their scraping pipelines.
|
||||
@ -4,6 +4,7 @@ name = "scrapegraphai"
|
||||
version = "1.27.0b7"
|
||||
|
||||
|
||||
|
||||
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
|
||||
authors = [
|
||||
{ name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },
|
||||
|
||||
@ -6,7 +6,6 @@ from langchain_openai import ChatOpenAI
|
||||
from langchain_ollama import ChatOllama
|
||||
from langchain_mistralai import ChatMistralAI
|
||||
from langchain_core.language_models.chat_models import BaseChatModel
|
||||
from transformers import GPT2TokenizerFast
|
||||
|
||||
def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
|
||||
"""
|
||||
@ -24,13 +23,6 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
|
||||
from .tokenizers.tokenizer_ollama import num_tokens_ollama
|
||||
num_tokens_fn = num_tokens_ollama
|
||||
|
||||
elif isinstance(llm_model, GPT2TokenizerFast):
|
||||
def num_tokens_gpt2(text: str, model: BaseChatModel) -> int:
|
||||
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
||||
tokens = tokenizer.encode(text)
|
||||
return len(tokens)
|
||||
num_tokens_fn = num_tokens_gpt2
|
||||
|
||||
else:
|
||||
from .tokenizers.tokenizer_openai import num_tokens_openai
|
||||
num_tokens_fn = num_tokens_openai
|
||||
|
||||
@ -3,7 +3,6 @@ Tokenization utilities for Ollama models
|
||||
"""
|
||||
from langchain_core.language_models.chat_models import BaseChatModel
|
||||
from ..logging import get_logger
|
||||
from transformers import GPT2TokenizerFast
|
||||
|
||||
def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
|
||||
"""
|
||||
@ -22,12 +21,8 @@ def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
|
||||
|
||||
logger.debug(f"Counting tokens for text of {len(text)} characters")
|
||||
|
||||
if isinstance(llm_model, GPT2TokenizerFast):
|
||||
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
||||
tokens = tokenizer.encode(text)
|
||||
return len(tokens)
|
||||
|
||||
# Use langchain token count implementation
|
||||
# NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507
|
||||
tokens = llm_model.get_num_tokens(text)
|
||||
return tokens
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user