add generation schema

This commit is contained in:
VinciGit00 2024-02-11 13:59:47 +01:00
parent 1e7ca9f3fb
commit bb2f488236
10 changed files with 114 additions and 26 deletions

View File

@ -28,6 +28,7 @@ source ./venv/bin/activate
```
3.
```bash
pip install -r requirements.txt
# if you want to install it as a library
@ -61,7 +62,7 @@ python -m yoso-ai.examples.html_scraping
```python
import os
from dotenv import load_dotenv
from yosoai import get_function, send_request
from yosoai import _get_function, send_request
load_dotenv()
@ -89,7 +90,7 @@ def main():
mockup_world_url = "https://sport.sky.it/nba?gr=www"
# Invoke send_request function
result = send_request(openai_key, get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
result = send_request(openai_key, _get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
# Print or process the result as needed
print("Result:", result)

View File

@ -1,6 +1,6 @@
import os
from dotenv import load_dotenv
from yosoai import get_function, send_request
from yosoai import _get_function, send_request
load_dotenv()
@ -28,7 +28,7 @@ def main():
mockup_world_url = "https://sport.sky.it/nba?gr=www"
# Invoke send_request function
result = send_request(openai_key, get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
result = send_request(openai_key, _get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
# Print or process the result as needed
print("Result:", result)

View File

@ -4,6 +4,6 @@ langchain_core==0.1.22
langchain_openai==0.0.5
pytest==8.0.0
python-dotenv==1.0.1
setuptools==65.5.1
setuptools==63.2.0
tiktoken==0.6.0
tqdm==4.66.1

View File

@ -1,4 +1,4 @@
# Always prefer setuptools over distutils
# Always prefer setuptools over distutils
from setuptools import setup, find_packages
# Function to read the contents of a requirements file

View File

@ -1,5 +1,5 @@
from .class_creator import create_class
from .class_generator import Generator
from .getter import get_function
from .getter import _get_function
from .token_calculator import truncate_text_tokens
from .request import send_request

36
yosoai/dictionaries.py Normal file
View File

@ -0,0 +1,36 @@
# Schema of a single entry in the "projects" array: every project carries a
# name, a description and a URL, all of them mandatory strings.
_project_item_schema = {
    "type": "object",
    "properties": {
        "project_name": {"type": "string"},
        "project_description": {"type": "string"},
        "url": {"type": "string"},
    },
    "required": ["project_name", "project_description", "url"],
}

# Sample schema describing a person and their projects. It is stringified and
# embedded in the schema-generation prompt as the example the model should
# imitate, so the key order below is kept stable on purpose.
schema_example = {
    "properties": {
        "person_name": {"type": "string"},
        "person_surname": {"type": "string"},
        "profession": {"type": "string"},
        "hobbies": {"type": "string"},
        "projects": {
            "type": "array",
            "items": _project_item_schema,
        },
    },
    "required": [
        "person_name",
        "person_surname",
        "profession",
        "hobbies",
        "projects",
    ],
}
# Token limit associated with each supported OpenAI model name.
# NOTE(review): the figures look like context-window sizes; confirm them
# against the current OpenAI model documentation before relying on them.
models_tokens = {
    # GPT-3.5 family
    "gpt-3.5-turbo-0125": 16_385,
    "gpt-3.5-turbo": 4_096,
    "gpt-3.5-turbo-1106": 16_385,
    "gpt-3.5-turbo-instruct": 4_096,
    # GPT-4 preview models
    "gpt-4-0125-preview": 128_000,
    "gpt-4-turbo-preview": 128_000,
    "gpt-4-1106-preview": 128_000,
    "gpt-4-vision-preview": 128_000,
    # GPT-4 base and 32k models
    "gpt-4": 8_192,
    "gpt-4-0613": 8_192,
    "gpt-4-32k": 32_768,
    "gpt-4-32k-0613": 32_768,
}

View File

@ -1,6 +1,6 @@
from langchain_community.document_loaders import AsyncHtmlLoader
def get_function(link:str) -> str:
def _get_function(link:str) -> str:
"""
It sends a GET request to the specified link with optional headers.

65
yosoai/json_getter.py Normal file
View File

@ -0,0 +1,65 @@
import tiktoken
from tqdm import tqdm
from typing import List
from .getter import _get_function
from langchain_openai import ChatOpenAI
from .dictionaries import schema_example
from langchain.prompts import PromptTemplate
from .token_calculator import truncate_text_tokens
from langchain_core.output_parsers import JsonOutputParser
EMBEDDING_ENCODING = 'cl100k_base'


def _getJson(key: str, link: str, model_name: str, encoding_name_chunk: str = EMBEDDING_ENCODING) -> "list | dict":
    """
    Function that creates a JSON schema given a link.

    The page behind ``link`` is downloaded, split into chunks with
    ``truncate_text_tokens`` and each chunk is sent to the model, which is
    asked to describe it as a schema shaped like ``schema_example``. When more
    than one chunk schema is produced, a second request asks the model to
    merge them without repetitions.

    Args:
        key (str): openai key
        link (str): link to analyze
        model_name (str): The name of the openai language model to be used,
            both for sizing the chunks and for the requests themselves.
        encoding_name_chunk (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).

    Returns:
        The schema of the website. With zero or one chunk this is the list of
        per-chunk schemas (empty, or holding a single parsed schema); with
        several chunks it is the parsed JSON of the merge request
        (presumably a dict -- it is whatever JSON the model returns).
    """
    # Use the requested model for the requests too: the chunks are sized for
    # `model_name`, so sending them to the default model could overflow it.
    model = ChatOpenAI(temperature=0, openai_api_key=key, model=model_name)
    parser = JsonOutputParser()
    format_instructions = parser.get_format_instructions()

    html = _get_function(link)
    chunks = truncate_text_tokens(html, model=model_name, encoding_name=encoding_name_chunk)

    # The extraction prompt does not depend on the chunk, so build the chain once.
    extraction_prompt = PromptTemplate(
        template="You are a website scraper and you want to extract information in a schema like the example provided. Write a dictionary where the key is the section and the value is the type.\n{format_instructions}\n{query}\n. Example: {example}",
        input_variables=["query"],
        partial_variables={
            "format_instructions": format_instructions,
            "example": str(schema_example),
        },
    )
    extraction_chain = extraction_prompt | model | parser

    result = []
    # The context manager closes the progress bar even if a request raises.
    with tqdm(total=len(chunks), desc="Sending chunks") as progress_bar:
        for chunk in chunks:
            result.append(extraction_chain.invoke({"query": chunk}))
            progress_bar.update(1)

    if len(result) > 1:
        # The stray closing brace after {format_instructions} was removed: it
        # made the template an invalid format string.
        merge_prompt = PromptTemplate(
            template="You are a website scraper and you have to merge the given schemas without repetitions.\n{format_instructions}\n. Example: {to_merge}",
            input_variables=["to_merge"],
            partial_variables={"format_instructions": format_instructions},
        )
        merge_chain = merge_prompt | model | parser
        # The input key must match the template variable ("to_merge").
        result = merge_chain.invoke({"to_merge": str(result)})

    return result

View File

@ -2,15 +2,15 @@ import time
from tqdm import tqdm
from typing import List
from tqdm import tqdm
from .class_generator import Generator
from .remover import remover
from .class_generator import Generator
from .class_creator import create_class
from .token_calculator import truncate_text_tokens
EMBEDDING_ENCODING = 'cl100k_base'
LAST_REQUEST_TIME = 0
REQUEST_INTERVAL = 20 # Adjust as needed, represents the interval in seconds between requests
REQUEST_INTERVAL = 20
def send_request(key: str, text:str, values:list[dict], model:str, temperature:float = 0.0, encoding_name: str = EMBEDDING_ENCODING) -> List[dict]:
"""
@ -23,7 +23,7 @@ def send_request(key: str, text:str, values:list[dict], model:str, temperature:f
- "title" (str): The title of the field.
- "type" (str): The type of the field.
- "description" (str): The description of the field.
model (str): The name of the language model to be used.
model (str): The name of the openai language model to be used.
temperature (float): A parameter controlling the randomness of the language model's output (default: 0).
encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
Returns:

View File

@ -1,20 +1,6 @@
import tiktoken
from typing import List
models_tokens = {
"gpt-3.5-turbo-0125": 16385,
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-1106": 16385,
"gpt-3.5-turbo-instruct": 4096,
"gpt-4-0125-preview": 128000,
"gpt-4-turbo-preview": 128000,
"gpt-4-1106-preview": 128000,
"gpt-4-vision-preview": 128000,
"gpt-4": 8192,
"gpt-4-0613": 8192,
"gpt-4-32k": 32768,
"gpt-4-32k-0613": 32768,
}
from .dictionaries import models_tokens
def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
"""