From cddc9a3c8aabd2e7e2f3b5726aeb79379797f0fb Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Sat, 3 Feb 2024 13:41:02 +0100 Subject: [PATCH] add the opportunity to create chunks --- README.md | 6 ++-- amazscraper/class_creator.py | 7 +++++ amazscraper/class_generator.py | 14 ++++----- amazscraper/getter.py | 52 ++++++++++++++++++++++++++++++++-- amazscraper/pydantic_class.py | 2 +- amazscraper/request.py | 25 ++++++++++++++++ requirements.txt | 2 +- 7 files changed, 93 insertions(+), 15 deletions(-) create mode 100644 amazscraper/request.py diff --git a/README.md b/README.md index bfee31d9..d40561bc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🤖 AmazScraper +# 🤖 YOSO-ai This repo is a Python open source library for making a faster scraping using AI and without any knowledge about the HTML code. @@ -15,7 +15,7 @@ Official documentation page: [https://amazscraper.readthedocs.io/en/latest/index Try out AmazScraper in your browser: -[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/VinciGit00/AmazScraper) +[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://obscure-trout-74p9vqwv75wcxwq6.github.dev) # 🔧 Quick Setup @@ -182,7 +182,7 @@ Given the following input { "title": "title", "type": "str", - "description": "Title of the items" + "description": "Title of the news" } ] diff --git a/amazscraper/class_creator.py b/amazscraper/class_creator.py index 0439ccd6..25862333 100644 --- a/amazscraper/class_creator.py +++ b/amazscraper/class_creator.py @@ -8,6 +8,13 @@ class _Response(BaseModel): def create_class(data_dict: dict): ''' This function creates a class at runtime using the values from the list. 
+ Parameters: + data_dict + dict { + "title": str + "type": str, + "description": str + }: dictionary for describing the prompt ''' for elem in data_dict: global base_script diff --git a/amazscraper/class_generator.py b/amazscraper/class_generator.py index 22f54e4e..2a75bfb1 100644 --- a/amazscraper/class_generator.py +++ b/amazscraper/class_generator.py @@ -1,26 +1,26 @@ from dotenv import load_dotenv from .pydantic_class import _Response -from .class_creator import create_class +from .class_creator import create_class #in future to remove from langchain_openai import ChatOpenAI from langchain.prompts import PromptTemplate from langchain_core.pydantic_v1 import Field from langchain.output_parsers import PydanticOutputParser class Generator: - def __init__(self, values:list, api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo"): + def __init__(self, values:list[dict], api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo")->dict: """ Initializes the Generator object. Parameters: - - values (list): A list of values used for class creation. - - temperature_param (float): A parameter controlling the randomness of the language model's output. - - model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All + values (list): A list of values used for class creation. + temperature_param (float): A parameter controlling the randomness of the language model's output. + model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All the possible models are avaible at the following link: https://platform.openai.com/docs/models Returns: - - result_dict (dict): The result of the language model invocation, converted to a dictionary. + result_dict (dict): The result of the language model invocation, converted to a dictionary. 
""" - create_class(values) + create_class(values) #in future to remove self.parser = PydanticOutputParser(pydantic_object=_Response) diff --git a/amazscraper/getter.py b/amazscraper/getter.py index 1699e8b6..8f46be25 100644 --- a/amazscraper/getter.py +++ b/amazscraper/getter.py @@ -1,8 +1,53 @@ import requests -from bs4 import BeautifulSoup +import tiktoken +from typing import List HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', -'Accept-Language': 'en-US'} + 'Accept-Language': 'en-US'} + +models_tokens = { + "gpt-3.5-turbo-0125": 16385, + "gpt-3.5-turbo": 4096, + "gpt-3.5-turbo-1106": 16385, + "gpt-3.5-turbo-instruct": 4096, + "gpt-4-0125-preview": 128000, + "gpt-4-turbo-preview": 128000, + "gpt-4-1106-preview": 128000, + "gpt-4-vision-preview": 128000, + "gpt-4": 8192, + "gpt-4-0613": 8192, + "gpt-4-32k": 32768, + "gpt-4-32k-0613": 32768, +} + +EMBEDDING_ENCODING = 'cl100k_base' + +DEFAULT_MESSAGE_LENGTH = 100 + + +def truncate_text_tokens(text: str, model: str, encoding_name: str = EMBEDDING_ENCODING) -> List[str]: + """ + Split the text into a list of strings, each holding at most the model's maximum number of tokens + + Parameters: + text (str): text to scrape + model (str): The name of the language model to be used; it must be a key of models_tokens. 
All + the possible models are available at the following link: https://platform.openai.com/docs/models + encoding_name (str): name of the tiktoken encoding used to tokenize the text + + Returns: + List[str]: chunks of the text to send in the requests + """ + encoding = tiktoken.get_encoding(encoding_name) + max_tokens = models_tokens[model] + encoded_text = encoding.encode(text) + + chunks = [encoded_text[i:i + max_tokens] for i in range(0, len(encoded_text), max_tokens)] + + result = [encoding.decode(chunk) for chunk in chunks] + + return result + def get_function(link:str, param = HEADERS) -> str: """ @@ -46,4 +91,5 @@ def remover(file:str) -> str: if isBody == True: res = res + elem - return res + + return res.replace("\n", "") diff --git a/amazscraper/pydantic_class.py b/amazscraper/pydantic_class.py index 8566aa09..e9b554d1 100644 --- a/amazscraper/pydantic_class.py +++ b/amazscraper/pydantic_class.py @@ -2,4 +2,4 @@ from langchain_core.pydantic_v1 import BaseModel, Field class _Response(BaseModel): - title_website: str = Field(description='Give me the website name') + title: str = Field(description='Title of the news') diff --git a/amazscraper/request.py b/amazscraper/request.py new file mode 100644 index 00000000..9d06fe88 --- /dev/null +++ b/amazscraper/request.py @@ -0,0 +1,25 @@ +from amazscraper.getter import remover +from .class_creator import create_class +from amazscraper.class_generator import Generator + + +def send_request(values:list[dict]) ->dict: + """ + Parameters: + values (list[dict]): settings of the request. 
+ Format: + [ + dict { + "title": str, + "type": str, + "description": str + } + ] + + Returns: + dict: the result of the request to OpenAI + """ + res = {} + create_class("TOADD") + + return res \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1735ae88..f4f700fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,8 +5,8 @@ langchain_openai==0.0.5 python-dotenv==1.0.1 Requests==2.31.0 pytest==8.0.0 - wheel==0.42.0 +tiktoken==0.5.2 twine==4.0.2 sphinx==7.1.2 sphinx-rtd-theme==2.0.0 \ No newline at end of file