add generation schema

2026-06-23 21:00:30 +08:00 · 2024-02-11 13:59:47 +01:00 · 2024-02-11 13:59:47 +01:00 · bb2f488236
commit bb2f488236
parent 1e7ca9f3fb
10 changed files with 114 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -28,6 +28,7 @@ source ./venv/bin/activate
 ```

 3.
+
 ```bash
 pip install -r requirements.txt
 # if you want to install it as a library
@ -61,7 +62,7 @@ python -m yoso-ai.examples.html_scraping
 ```python
 import os
 from dotenv import load_dotenv
-from yosoai import get_function, send_request
+from yosoai import _get_function, send_request

 load_dotenv()

@ -89,7 +90,7 @@ def main():
    mockup_world_url = "https://sport.sky.it/nba?gr=www"

    # Invoke send_request function
-    result = send_request(openai_key, get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
+    result = send_request(openai_key, _get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')

    # Print or process the result as needed
    print("Result:", result)
--- a/examples/value_scraping.py
+++ b/examples/value_scraping.py
@ -1,6 +1,6 @@
 import os
 from dotenv import load_dotenv
-from yosoai import get_function, send_request
+from yosoai import _get_function, send_request

 load_dotenv()

@ -28,7 +28,7 @@ def main():
    mockup_world_url = "https://sport.sky.it/nba?gr=www"

    # Invoke send_request function
-    result = send_request(openai_key, get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
+    result = send_request(openai_key, _get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')

    # Print or process the result as needed
    print("Result:", result)
--- a/requirements.txt
+++ b/requirements.txt
@ -4,6 +4,6 @@ langchain_core==0.1.22
 langchain_openai==0.0.5
 pytest==8.0.0
 python-dotenv==1.0.1
-setuptools==65.5.1
+setuptools==63.2.0
 tiktoken==0.6.0
 tqdm==4.66.1
--- a/setup.py
+++ b/setup.py
@ -1,4 +1,4 @@
-# Always prefer setuptools over distutils
+# Always prefer setuptools over distutils
 from setuptools import setup, find_packages

 # Function to read the contents of a requirements file
--- a/yosoai/__init__.py
+++ b/yosoai/__init__.py
@ -1,5 +1,5 @@
 from .class_creator import create_class
 from .class_generator import Generator
-from .getter import  get_function
+from .getter import  _get_function
 from .token_calculator import truncate_text_tokens
 from .request import send_request
--- a/yosoai/dictionaries.py
+++ b/yosoai/dictionaries.py
@ -0,0 +1,36 @@
+# Example JSON-schema-style dictionary (a person profile with a list of
+# projects). json_getter stringifies it and passes it to the model as the
+# sample of the output format it should produce.
+schema_example= { 
+    "properties": { 
+        "person_name": {"type": "string"}, 
+        "person_surname": {"type": "string"}, 
+        "profession": {"type": "string"}, 
+        "hobbies": {"type": "string"}, 
+        "projects": { 
+            "type": "array", 
+            "items": { 
+                "type": "object", 
+                "properties": { 
+                    "project_name": {"type": "string"}, 
+                    "project_description": {"type": "string"}, 
+                    "url": {"type": "string"} 
+                }, 
+                "required": ["project_name", "project_description", "url"], 
+            }, 
+        }, 
+    }, 
+    "required": ["person_name", "person_surname", "profession", "hobbies", "projects"], 
+}
+
+# Token count per OpenAI model name; imported by token_calculator.
+# NOTE(review): presumably each value is the model's context window size in
+# tokens -- verify against the OpenAI model documentation.
+models_tokens = {
+    "gpt-3.5-turbo-0125": 16385,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-1106": 16385,
+    "gpt-3.5-turbo-instruct": 4096,
+    "gpt-4-0125-preview": 128000,
+    "gpt-4-turbo-preview": 128000,
+    "gpt-4-1106-preview": 128000,
+    "gpt-4-vision-preview": 128000,
+    "gpt-4": 8192,
+    "gpt-4-0613": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-4-32k-0613": 32768,
+}
--- a/yosoai/getter.py
+++ b/yosoai/getter.py
@ -1,6 +1,6 @@
 from langchain_community.document_loaders import AsyncHtmlLoader 

-def get_function(link:str) -> str:
+def _get_function(link:str) -> str:
    """
    It sends a GET request to the specified link with optional headers.

--- a/yosoai/json_getter.py
+++ b/yosoai/json_getter.py
@ -0,0 +1,65 @@
+import tiktoken
+from tqdm import tqdm
+from typing import List
+from .getter import _get_function
+from langchain_openai import ChatOpenAI
+from .dictionaries import schema_example
+from langchain.prompts import PromptTemplate
+from .token_calculator import truncate_text_tokens
+from langchain_core.output_parsers import JsonOutputParser
+
+EMBEDDING_ENCODING = 'cl100k_base'
+
+def _getJson(key: str, link: str, model_name: str, encoding_name_chunk: str = EMBEDDING_ENCODING) -> "List[dict] | dict":
+    """
+    Ask an OpenAI chat model to infer a JSON schema for the page at ``link``.
+
+    The page HTML is fetched with ``_get_function``, split into chunks with
+    ``truncate_text_tokens`` and every chunk is sent to the model together
+    with ``schema_example``. When more than one chunk schema comes back, a
+    second prompt asks the model to merge them into one.
+
+    Args:
+        key (str): openai key
+        link (str): link to analyze
+        model_name (str): The name of the openai language model to be used,
+            both when chunking the HTML and for the chat requests.
+        encoding_name_chunk (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
+    Returns:
+        The parsed model output: the merged schema when the page needed
+        several chunks, otherwise the list of per-chunk schemas (zero or one
+        item).
+    """
+    # Pass the requested model through; previously the default model was
+    # always used even though the chunks were produced for ``model_name``.
+    model = ChatOpenAI(temperature=0, openai_api_key=key, model=model_name)
+    parser = JsonOutputParser()
+
+    html = _get_function(link)
+    chunks = truncate_text_tokens(html, model=model_name, encoding_name=encoding_name_chunk)
+
+    # The prompt and the chain do not depend on the chunk, so build them once.
+    chunk_prompt = PromptTemplate(
+        template="You are a website scraper and you want to extract information in a schema like the example provided. Write a dictionary where the key is the section and the value is the type.\n{format_instructions}\n{query}\n. Example: {example}",
+        input_variables=["query"],
+        partial_variables={
+            "format_instructions": parser.get_format_instructions(),
+            "example": str(schema_example),
+        },
+    )
+    chunk_chain = chunk_prompt | model | parser
+
+    result = []
+
+    # The context manager closes the progress bar even if a request raises.
+    with tqdm(total=len(chunks), desc="Sending chunks") as progress_bar:
+        for chunk in chunks:
+            result.append(chunk_chain.invoke({"query": chunk}))
+            progress_bar.update(1)
+
+    if len(result) > 1:
+        # The stray "}" after {format_instructions} made the template an
+        # invalid format string; it is removed here.
+        merge_prompt = PromptTemplate(
+            template="You are a website scraper and you have to merge the given schemas without repetitions.\n{format_instructions}\n. Example: {to_merge}",
+            input_variables=["to_merge"],
+            partial_variables={"format_instructions": parser.get_format_instructions()},
+        )
+        merge_chain = merge_prompt | model | parser
+
+        # The template variable is ``to_merge``, so the input must use that key.
+        result = merge_chain.invoke({"to_merge": str(result)})
+
+    return result
--- a/yosoai/request.py
+++ b/yosoai/request.py
@ -2,15 +2,15 @@ import time
 from tqdm import tqdm 
 from typing import List
 from tqdm import tqdm  
-from .class_generator import Generator
 from .remover import remover
+from .class_generator import Generator
 from .class_creator import create_class
 from .token_calculator import truncate_text_tokens

 EMBEDDING_ENCODING = 'cl100k_base'

 LAST_REQUEST_TIME = 0
-REQUEST_INTERVAL = 20  # Adjust as needed, represents the interval in seconds between requests
+REQUEST_INTERVAL = 20  

 def send_request(key: str, text:str, values:list[dict], model:str, temperature:float = 0.0, encoding_name: str = EMBEDDING_ENCODING) -> List[dict]:
    """
@ -23,7 +23,7 @@ def send_request(key: str, text:str, values:list[dict], model:str, temperature:f
                            - "title" (str): The title of the field.
                            - "type" (str): The type of the field.
                            - "description" (str): The description of the field.
-        model (str): The name of the language model to be used.
+        model (str): The name of the openai language model to be used.
        temperature (float): A parameter controlling the randomness of the language model's output (default: 0).
        encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
    Returns:
--- a/yosoai/token_calculator.py
+++ b/yosoai/token_calculator.py
@ -1,20 +1,6 @@
 import tiktoken
 from typing import List
-
-models_tokens = {
-    "gpt-3.5-turbo-0125": 16385,
-    "gpt-3.5-turbo": 4096,
-    "gpt-3.5-turbo-1106": 16385,
-    "gpt-3.5-turbo-instruct": 4096,
-    "gpt-4-0125-preview": 128000,
-    "gpt-4-turbo-preview": 128000,
-    "gpt-4-1106-preview": 128000,
-    "gpt-4-vision-preview": 128000,
-    "gpt-4": 8192,
-    "gpt-4-0613": 8192,
-    "gpt-4-32k": 32768,
-    "gpt-4-32k-0613": 32768,
-}
+from .dictionaries import models_tokens 

 def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
    """