add the opportunity to create chunks

This commit is contained in:
VinciGit00 2024-02-03 13:41:02 +01:00
parent 0bc7e61072
commit cddc9a3c8a
7 changed files with 93 additions and 15 deletions

View File

@ -1,4 +1,4 @@
# 🤖 AmazScraper
# 🤖 YOSO-ai
This repo is an open-source Python library for faster scraping using AI, without requiring any knowledge of the page's HTML code.
@ -15,7 +15,7 @@ Official documentation page: [https://amazscraper.readthedocs.io/en/latest/index
Try out AmazScraper in your browser:
[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/VinciGit00/AmazScraper)
[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://obscure-trout-74p9vqwv75wcxwq6.github.dev)
# 🔧 Quick Setup
@ -182,7 +182,7 @@ Given the following input
{
"title": "title",
"type": "str",
"description": "Title of the items"
"description": "Title of the news"
}
]

View File

@ -8,6 +8,13 @@ class _Response(BaseModel):
def create_class(data_dict: dict):
'''
This function creates a class at runtime using the values from the list.
Parameters:
data_dict
dict {
"title": str
"type": str,
"description": str
}: dictionary for describing the prompt
'''
for elem in data_dict:
global base_script

View File

@ -1,26 +1,26 @@
from dotenv import load_dotenv
from .pydantic_class import _Response
from .class_creator import create_class
from .class_creator import create_class #in future to remove
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import Field
from langchain.output_parsers import PydanticOutputParser
class Generator:
def __init__(self, values:list, api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo"):
def __init__(self, values:list[dict], api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo")->dict:
"""
Initializes the Generator object.
Parameters:
- values (list): A list of values used for class creation.
- temperature_param (float): A parameter controlling the randomness of the language model's output.
- model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All
values (list): A list of values used for class creation.
temperature_param (float): A parameter controlling the randomness of the language model's output.
model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All
the possible models are avaible at the following link: https://platform.openai.com/docs/models
Returns:
- result_dict (dict): The result of the language model invocation, converted to a dictionary.
result_dict (dict): The result of the language model invocation, converted to a dictionary.
"""
create_class(values)
create_class(values) #in future to remove
self.parser = PydanticOutputParser(pydantic_object=_Response)

View File

@ -1,8 +1,53 @@
import requests
from bs4 import BeautifulSoup
import tiktoken
from typing import List
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Accept-Language': 'en-US'}
'Accept-Language': 'en-US'}
# Token limit for each supported OpenAI model name. truncate_text_tokens
# looks the requested model up here and uses the value as its chunk size.
models_tokens = {
"gpt-3.5-turbo-0125": 16385,
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-1106": 16385,
"gpt-3.5-turbo-instruct": 4096,
"gpt-4-0125-preview": 128000,
"gpt-4-turbo-preview": 128000,
"gpt-4-1106-preview": 128000,
"gpt-4-vision-preview": 128000,
"gpt-4": 8192,
"gpt-4-0613": 8192,
"gpt-4-32k": 32768,
"gpt-4-32k-0613": 32768,
}
# Default tiktoken encoding name used by truncate_text_tokens.
EMBEDDING_ENCODING = 'cl100k_base'
# NOTE(review): not referenced anywhere in the visible code; presumably a
# default size for a message -- confirm its unit and where it is used.
DEFAULT_MESSAGE_LENGTH = 100
def truncate_text_tokens(text: str, model: str, encoding_name: str = EMBEDDING_ENCODING) -> List[str]:
    """
    Split a text into consecutive chunks that each fit the token limit of a model.

    Parameters:
        text (str): text to scrape
        model (str): The name of the language model to be used. It must be one of
            the keys of `models_tokens`. All the possible models are available at
            the following link: https://platform.openai.com/docs/models
        encoding_name (str): name of the tiktoken encoding used to tokenize the
            text (default: EMBEDDING_ENCODING)
    Returns:
        List[str]: the chunks, in their original order, to send with the requests.
            The list is empty when the text produces no tokens.
    Raises:
        KeyError: if `model` is not listed in `models_tokens`.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    try:
        max_tokens = models_tokens[model]
    except KeyError:
        # Same exception type as the plain lookup, but tell the caller what is accepted.
        supported = ", ".join(sorted(models_tokens))
        raise KeyError(f"Unknown model '{model}'. Supported models: {supported}") from None

    # Scraped pages may contain literals such as "<|endoftext|>". By default tiktoken
    # raises ValueError on them; disabling the check encodes them as ordinary text.
    encoded_text = encoding.encode(text, disallowed_special=())

    # NOTE(review): every chunk is as large as the model's whole token limit, so any
    # prompt text added around a chunk would exceed it -- confirm with the caller.
    return [
        encoding.decode(encoded_text[start:start + max_tokens])
        for start in range(0, len(encoded_text), max_tokens)
    ]
def get_function(link:str, param = HEADERS) -> str:
"""
@ -46,4 +91,5 @@ def remover(file:str) -> str:
if isBody == True:
res = res + elem
return res
return res.replace("\n", "")

View File

@ -2,4 +2,4 @@
from langchain_core.pydantic_v1 import BaseModel, Field
class _Response(BaseModel):
title_website: str = Field(description='Give me the website name')
title: str = Field(description='Title of the news')

25
amazscraper/request.py Normal file
View File

@ -0,0 +1,25 @@
from amazscraper.getter import remover
from .class_creator import create_class
from amazscraper.class_generator import Generator
def send_request(values:list[dict]) ->dict:
    """
    Create the response class described by `values` and return the request result.

    Param:
        values (list[dict]): settings of the request.
        Format:
        [
            {
                "title": str,
                "type": str,
                "description": str
            }
        ]
    Return:
        dict: the result of the request to openai. The request itself is not
        performed in this function yet, so the dictionary is currently always empty.
    """
    # Forward the caller's field descriptions instead of a placeholder string,
    # the same way Generator.__init__ calls create_class.
    create_class(values)

    res = {}
    return res

View File

@ -5,8 +5,8 @@ langchain_openai==0.0.5
python-dotenv==1.0.1
Requests==2.31.0
pytest==8.0.0
wheel==0.42.0
tiktoken==0.5.2
twine==4.0.2
sphinx==7.1.2
sphinx-rtd-theme==2.0.0