From cddc9a3c8aabd2e7e2f3b5726aeb79379797f0fb Mon Sep 17 00:00:00 2001
From: VinciGit00 <mvincig11@gmail.com>
Date: Sat, 3 Feb 2024 13:41:02 +0100
Subject: [PATCH] add the opportunity to create chunks

---
 README.md                      |  6 ++--
 amazscraper/class_creator.py   |  7 +++++
 amazscraper/class_generator.py | 14 ++++-----
 amazscraper/getter.py          | 52 ++++++++++++++++++++++++++++++++--
 amazscraper/pydantic_class.py  |  2 +-
 amazscraper/request.py         | 25 ++++++++++++++++
 requirements.txt               |  2 +-
 7 files changed, 93 insertions(+), 15 deletions(-)
 create mode 100644 amazscraper/request.py

diff --git a/README.md b/README.md
index bfee31d9..d40561bc 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# 🤖 AmazScraper
+# 🤖 YOSO-ai
 
 This repo is a Python open source library for making a faster scraping using AI and without any knowledge about the HTML code.
 
@@ -15,7 +15,7 @@ Official documentation page: [https://amazscraper.readthedocs.io/en/latest/index
 
 Try out AmazScraper in your browser:
 
-[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/VinciGit00/AmazScraper)
+[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://obscure-trout-74p9vqwv75wcxwq6.github.dev)
 
 # 🔧 Quick Setup
 
@@ -182,7 +182,7 @@ Given the following input
         {
             "title": "title",
             "type": "str",
-            "description": "Title of the items"
+            "description": "Title of the news"
         }
     ]
 
diff --git a/amazscraper/class_creator.py b/amazscraper/class_creator.py
index 0439ccd6..25862333 100644
--- a/amazscraper/class_creator.py
+++ b/amazscraper/class_creator.py
@@ -8,6 +8,13 @@ class _Response(BaseModel):
 def create_class(data_dict: dict):
     '''
     This function creates a class at runtime using the values from the list.
+    Parameters:
+        data_dict 
+            dict {
+                "title": str
+                "type": str,
+                "description": str
+            }: dictionary for describing the prompt
     '''
     for elem in data_dict:
         global base_script
diff --git a/amazscraper/class_generator.py b/amazscraper/class_generator.py
index 22f54e4e..2a75bfb1 100644
--- a/amazscraper/class_generator.py
+++ b/amazscraper/class_generator.py
@@ -1,26 +1,26 @@
 from dotenv import load_dotenv
 from .pydantic_class import _Response
-from .class_creator import create_class
+from .class_creator import create_class #in future to remove
 from langchain_openai import ChatOpenAI
 from langchain.prompts import PromptTemplate
 from langchain_core.pydantic_v1 import Field
 from langchain.output_parsers import PydanticOutputParser
 
 class Generator:
-    def __init__(self, values:list, api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo"):
+    def __init__(self, values:list[dict], api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo")->dict:
         """
         Initializes the Generator object.
 
         Parameters:
-        - values (list): A list of values used for class creation.
-        - temperature_param (float): A parameter controlling the randomness of the language model's output.
-        - model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All
+            values (list): A list of values used for class creation.
+            temperature_param (float): A parameter controlling the randomness of the language model's output.
+            model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All
           the possible models are avaible at the following link: https://platform.openai.com/docs/models
 
         Returns:
-        - result_dict (dict): The result of the language model invocation, converted to a dictionary.
+            result_dict (dict): The result of the language model invocation, converted to a dictionary.
         """
-        create_class(values)
+        create_class(values) #in future to remove
 
         self.parser = PydanticOutputParser(pydantic_object=_Response)
 
diff --git a/amazscraper/getter.py b/amazscraper/getter.py
index 1699e8b6..8f46be25 100644
--- a/amazscraper/getter.py
+++ b/amazscraper/getter.py
@@ -1,8 +1,53 @@
 import requests
-from bs4 import BeautifulSoup
+import tiktoken
+from typing import List
 
 HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
-'Accept-Language': 'en-US'}
+           'Accept-Language': 'en-US'}
+
+models_tokens = {
+    "gpt-3.5-turbo-0125": 16385,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-1106": 16385,
+    "gpt-3.5-turbo-instruct": 4096,
+    "gpt-4-0125-preview": 128000,
+    "gpt-4-turbo-preview": 128000,
+    "gpt-4-1106-preview": 128000,
+    "gpt-4-vision-preview": 128000,
+    "gpt-4": 8192,
+    "gpt-4-0613": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-4-32k-0613": 32768,
+}
+
+EMBEDDING_ENCODING = 'cl100k_base'
+
+DEFAULT_MESSAGE_LENGTH = 100
+
+
+def truncate_text_tokens(text: str, model: str, encoding_name: str = EMBEDDING_ENCODING) -> List[str]:
+    """
+    Splits the text into a list of strings, each holding at most the model's maximum number of tokens
+
+    Parameters:
+    text (str): text to scrape
+    model (str): The name of the language model to be used; it must be a key of models_tokens. All
+    the possible models are available at the following link: https://platform.openai.com/docs/models
+    encoding_name (str): name of the tiktoken encoding used to tokenize the text (default: 'cl100k_base')
+
+    Returns:
+    List[str]: the decoded text chunks, in order, to send as separate requests
+    """
+    encoding = tiktoken.get_encoding(encoding_name)
+    max_tokens = models_tokens[model]
+    encoded_text = encoding.encode(text)
+    
+    chunks = [encoded_text[i:i + max_tokens] for i in range(0, len(encoded_text), max_tokens)]
+    
+    result = [encoding.decode(chunk) for chunk in chunks]
+    
+    return result
+
 
 def get_function(link:str, param = HEADERS) -> str:
     """
@@ -46,4 +91,5 @@ def remover(file:str) -> str:
 
         if isBody == True:
             res = res + elem
-    return res
+
+    return res.replace("\n", "")
diff --git a/amazscraper/pydantic_class.py b/amazscraper/pydantic_class.py
index 8566aa09..e9b554d1 100644
--- a/amazscraper/pydantic_class.py
+++ b/amazscraper/pydantic_class.py
@@ -2,4 +2,4 @@
 from langchain_core.pydantic_v1 import BaseModel, Field
 
 class _Response(BaseModel):
-    title_website: str = Field(description='Give me the website name')
+    title: str = Field(description='Title of the news')
diff --git a/amazscraper/request.py b/amazscraper/request.py
new file mode 100644
index 00000000..9d06fe88
--- /dev/null
+++ b/amazscraper/request.py
@@ -0,0 +1,25 @@
+from amazscraper.getter import remover
+from .class_creator import create_class
+from amazscraper.class_generator import Generator
+
+
+def send_request(values:list[dict]) ->dict: 
+    """
+    Param:
+        values (list[dict]): settings of the request. 
+        Format: 
+        [
+          dict {
+                "title": str
+                "type": str,
+                "description": str
+            }
+        ]
+      
+    Return:
+        dict: the result of the request to openai
+    """
+    res =  {} 
+    create_class("TOADD")
+
+    return res
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 1735ae88..f4f700fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,8 +5,8 @@ langchain_openai==0.0.5
 python-dotenv==1.0.1
 Requests==2.31.0
 pytest==8.0.0
-
 wheel==0.42.0
+tiktoken==0.5.2
 twine==4.0.2
 sphinx==7.1.2
 sphinx-rtd-theme==2.0.0
\ No newline at end of file