add the opportunity to create chunks

2026-06-23 21:00:30 +08:00 · 2024-02-03 13:41:02 +01:00 · 2024-02-03 13:41:02 +01:00 · cddc9a3c8a
commit cddc9a3c8a
parent 0bc7e61072
7 changed files with 93 additions and 15 deletions
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-# 🤖 AmazScraper
+# 🤖 YOSO-ai

 This repo is an open-source Python library for faster scraping using AI, without requiring any knowledge of the HTML code.

@ -15,7 +15,7 @@ Official documentation page: [https://amazscraper.readthedocs.io/en/latest/index

 Try out AmazScraper in your browser:

-[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/VinciGit00/AmazScraper)
+[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://obscure-trout-74p9vqwv75wcxwq6.github.dev)

 # 🔧 Quick Setup

@ -182,7 +182,7 @@ Given the following input
        {
            "title": "title",
            "type": "str",
-            "description": "Title of the items"
+            "description": "Title of the news"
        }
    ]

--- a/amazscraper/class_creator.py
+++ b/amazscraper/class_creator.py
@ -8,6 +8,13 @@ class _Response(BaseModel):
 def create_class(data_dict: dict):
    '''
    This function creates a class at runtime using the values from the list.
+    Parameters:
+        data_dict 
+            dict {
+                "title": str
+                "type": str,
+                "description": str
+            }: dictionary describing the prompt
    '''
    for elem in data_dict:
        global base_script
--- a/amazscraper/class_generator.py
+++ b/amazscraper/class_generator.py
@ -1,26 +1,26 @@
 from dotenv import load_dotenv
 from .pydantic_class import _Response
-from .class_creator import create_class
+from .class_creator import create_class #in future to remove
 from langchain_openai import ChatOpenAI
 from langchain.prompts import PromptTemplate
 from langchain_core.pydantic_v1 import Field
 from langchain.output_parsers import PydanticOutputParser

 class Generator:
-    def __init__(self, values:list, api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo"):
+    def __init__(self, values:list[dict], api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo")->dict:
        """
        Initializes the Generator object.

        Parameters:
-        - values (list): A list of values used for class creation.
-        - temperature_param (float): A parameter controlling the randomness of the language model's output.
-        - model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All
+            values (list): A list of values used for class creation.
+            temperature_param (float): A parameter controlling the randomness of the language model's output.
+            model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All
           the possible models are available at the following link: https://platform.openai.com/docs/models

        Returns:
-        - result_dict (dict): The result of the language model invocation, converted to a dictionary.
+            result_dict (dict): The result of the language model invocation, converted to a dictionary.
        """
-        create_class(values)
+        create_class(values) #in future to remove

        self.parser = PydanticOutputParser(pydantic_object=_Response)

--- a/amazscraper/getter.py
+++ b/amazscraper/getter.py
@ -1,8 +1,53 @@
 import requests
-from bs4 import BeautifulSoup
+import tiktoken
+from typing import List

 HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
-'Accept-Language': 'en-US'}
+           'Accept-Language': 'en-US'}
+
+models_tokens = {
+    "gpt-3.5-turbo-0125": 16385,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-1106": 16385,
+    "gpt-3.5-turbo-instruct": 4096,
+    "gpt-4-0125-preview": 128000,
+    "gpt-4-turbo-preview": 128000,
+    "gpt-4-1106-preview": 128000,
+    "gpt-4-vision-preview": 128000,
+    "gpt-4": 8192,
+    "gpt-4-0613": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-4-32k-0613": 32768,
+}
+
+EMBEDDING_ENCODING = 'cl100k_base'
+
+DEFAULT_MESSAGE_LENGTH = 100
+
+
+def truncate_text_tokens(text: str, model: str, encoding_name: str = EMBEDDING_ENCODING) -> List[str]:
+    """
+    Split the text into a list of strings, each holding at most the model's maximum number of tokens
+
+    Parameters:
+    text (str): text to scrape
+    model (str): The name of the language model whose token limit sizes each chunk (must be a key of models_tokens). All
+    the possible models are available at the following link: https://platform.openai.com/docs/models
+    encoding_name (str): name of the tiktoken encoding used to tokenize the text (default: 'cl100k_base')
+
+    Returns:
+    List[str]: the text chunks to send in the requests
+    """
+    encoding = tiktoken.get_encoding(encoding_name)
+    max_tokens = models_tokens[model]
+    encoded_text = encoding.encode(text)
+    
+    chunks = [encoded_text[i:i + max_tokens] for i in range(0, len(encoded_text), max_tokens)]
+    
+    result = [encoding.decode(chunk) for chunk in chunks]
+    
+    return result
+

 def get_function(link:str, param = HEADERS) -> str:
    """
@ -46,4 +91,5 @@ def remover(file:str) -> str:

        if isBody == True:
            res = res + elem
-    return res
+
+    return res.replace("\n", "")
--- a/amazscraper/pydantic_class.py
+++ b/amazscraper/pydantic_class.py
@ -2,4 +2,4 @@
 from langchain_core.pydantic_v1 import BaseModel, Field

 class _Response(BaseModel):
-    title_website: str = Field(description='Give me the website name')
+    title: str = Field(description='Title of the news')
--- a/amazscraper/request.py
+++ b/amazscraper/request.py
@ -0,0 +1,25 @@
+from amazscraper.getter import remover
+from .class_creator import create_class
+from amazscraper.class_generator import Generator
+
+
+def send_request(values:list[dict]) ->dict: 
+    """
+    Param:
+        values (list[dict]): settings of the request. 
+        Format: 
+        [
+          dict {
+                "title": str
+                "type": str,
+                "description": str
+            }
+        ]
+      
+    Return:
+        dict: the result of the request to openai
+    """
+    res =  {} 
+    create_class("TOADD")
+
+    return res
--- a/requirements.txt
+++ b/requirements.txt
@ -5,8 +5,8 @@ langchain_openai==0.0.5
 python-dotenv==1.0.1
 Requests==2.31.0
 pytest==8.0.0
-
 wheel==0.42.0
+tiktoken==0.5.2
 twine==4.0.2
 sphinx==7.1.2
 sphinx-rtd-theme==2.0.0