diff --git a/README.md b/README.md index f74762e9..847dd5d6 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ source ./venv/bin/activate ``` 3. + ```bash pip install -r requirements.txt # if you want to install it as a library @@ -61,7 +62,7 @@ python -m yoso-ai.examples.html_scraping ```python import os from dotenv import load_dotenv -from yosoai import get_function, send_request +from yosoai import _get_function, send_request load_dotenv() @@ -89,7 +90,7 @@ def main(): mockup_world_url = "https://sport.sky.it/nba?gr=www" # Invoke send_request function - result = send_request(openai_key, get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base') + result = send_request(openai_key, _get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base') # Print or process the result as needed print("Result:", result) diff --git a/examples/value_scraping.py b/examples/value_scraping.py index 041ead84..38128322 100644 --- a/examples/value_scraping.py +++ b/examples/value_scraping.py @@ -1,6 +1,6 @@ import os from dotenv import load_dotenv -from yosoai import get_function, send_request +from yosoai import _get_function, send_request load_dotenv() @@ -28,7 +28,7 @@ def main(): mockup_world_url = "https://sport.sky.it/nba?gr=www" # Invoke send_request function - result = send_request(openai_key, get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base') + result = send_request(openai_key, _get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base') # Print or process the result as needed print("Result:", result) diff --git a/requirements.txt b/requirements.txt index a502bcb5..200869de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,6 @@ langchain_core==0.1.22 langchain_openai==0.0.5 pytest==8.0.0 python-dotenv==1.0.1 -setuptools==65.5.1 +setuptools==63.2.0 tiktoken==0.6.0 tqdm==4.66.1 diff --git 
a/setup.py b/setup.py index e8c3f177..ab5bf3b6 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -# Always prefer setuptools over distutils +# Always prefer setuptools over distutils from setuptools import setup, find_packages # Function to read the contents of a requirements file diff --git a/yosoai/__init__.py b/yosoai/__init__.py index ce203663..cec705a8 100644 --- a/yosoai/__init__.py +++ b/yosoai/__init__.py @@ -1,5 +1,5 @@ from .class_creator import create_class from .class_generator import Generator -from .getter import get_function +from .getter import _get_function from .token_calculator import truncate_text_tokens from .request import send_request \ No newline at end of file diff --git a/yosoai/dictionaries.py b/yosoai/dictionaries.py new file mode 100644 index 00000000..125c0f65 --- /dev/null +++ b/yosoai/dictionaries.py @@ -0,0 +1,36 @@ +schema_example= { + "properties": { + "person_name": {"type": "string"}, + "person_surname": {"type": "string"}, + "profession": {"type": "string"}, + "hobbies": {"type": "string"}, + "projects": { + "type": "array", + "items": { + "type": "object", + "properties": { + "project_name": {"type": "string"}, + "project_description": {"type": "string"}, + "url": {"type": "string"} + }, + "required": ["project_name", "project_description", "url"], + }, + }, + }, + "required": ["person_name", "person_surname", "profession", "hobbies", "projects"], +} + +models_tokens = { + "gpt-3.5-turbo-0125": 16385, + "gpt-3.5-turbo": 4096, + "gpt-3.5-turbo-1106": 16385, + "gpt-3.5-turbo-instruct": 4096, + "gpt-4-0125-preview": 128000, + "gpt-4-turbo-preview": 128000, + "gpt-4-1106-preview": 128000, + "gpt-4-vision-preview": 128000, + "gpt-4": 8192, + "gpt-4-0613": 8192, + "gpt-4-32k": 32768, + "gpt-4-32k-0613": 32768, +} \ No newline at end of file diff --git a/yosoai/getter.py b/yosoai/getter.py index 1f1215de..7c3bc714 100644 --- a/yosoai/getter.py +++ b/yosoai/getter.py @@ -1,6 +1,6 @@ from langchain_community.document_loaders 
import AsyncHtmlLoader -def get_function(link:str) -> str: +def _get_function(link:str) -> str: """ It sends a GET request to the specified link with optional headers. diff --git a/yosoai/json_getter.py b/yosoai/json_getter.py new file mode 100644 index 00000000..2b497b09 --- /dev/null +++ b/yosoai/json_getter.py @@ -0,0 +1,65 @@ +import tiktoken +from tqdm import tqdm +from typing import List +from .getter import _get_function +from langchain_openai import ChatOpenAI +from .dictionaries import schema_example +from langchain.prompts import PromptTemplate +from .token_calculator import truncate_text_tokens +from langchain_core.output_parsers import JsonOutputParser + +EMBEDDING_ENCODING = 'cl100k_base' + +def _getJson(key: str, link: str, model_name:str, encoding_name_chunk: str = EMBEDDING_ENCODING) -> str: + """ + Function that creates a JSON schema given a link + Args: + key (str): openai key + link (str): link to analyze + model_name (str): The name of the openai language model to be used. + encoding_name_chunk (str): The name of the encoding to be used (default: EMBEDDING_ENCODING). + Returns: + str: the HTML schema of the website + """ + model = ChatOpenAI(model=model_name, temperature=0, openai_api_key=key) + parser = JsonOutputParser() + + html = _get_function(link) + + chunks = truncate_text_tokens(html, model=model_name, encoding_name=encoding_name_chunk) + + progress_bar = tqdm(total=len(chunks), desc="Sending chunks") + + result = [] + + for chunk in chunks: + prompt = PromptTemplate( + template="You are a website scraper and you want to extract information in a schema like the example provided. Write a dictionary where the key is the section and the value is the type.\n{format_instructions}\n{query}\n. 
Example: {example}", + input_variables=["query"], + partial_variables={ + "format_instructions": parser.get_format_instructions(), + "example": str(schema_example), + }, + ) + + chain = prompt | model | parser + + result.append(chain.invoke({"query": chunk})) + + progress_bar.update(1) + + progress_bar.close() + + if(len(result)>1): + prompt = PromptTemplate( + template="You are a website scraper and you have to merge the given schemas without repetitions.\n{format_instructions}\n. Example: {to_merge}", + input_variables=["to_merge"], + partial_variables={"format_instructions": parser.get_format_instructions() + }, + ) + + chain = prompt | model | parser + + result = chain.invoke({"to_merge": str(result)}) + + return result diff --git a/yosoai/request.py b/yosoai/request.py index 31276849..805f713e 100644 --- a/yosoai/request.py +++ b/yosoai/request.py @@ -2,15 +2,15 @@ import time from tqdm import tqdm from typing import List from tqdm import tqdm -from .class_generator import Generator from .remover import remover +from .class_generator import Generator from .class_creator import create_class from .token_calculator import truncate_text_tokens EMBEDDING_ENCODING = 'cl100k_base' LAST_REQUEST_TIME = 0 -REQUEST_INTERVAL = 20 # Adjust as needed, represents the interval in seconds between requests +REQUEST_INTERVAL = 20 def send_request(key: str, text:str, values:list[dict], model:str, temperature:float = 0.0, encoding_name: str = EMBEDDING_ENCODING) -> List[dict]: """ @@ -23,7 +23,7 @@ def send_request(key: str, text:str, values:list[dict], model:str, temperature:f - "title" (str): The title of the field. - "type" (str): The type of the field. - "description" (str): The description of the field. - model (str): The name of the language model to be used. + model (str): The name of the openai language model to be used. temperature (float): A parameter controlling the randomness of the language model's output (default: 0). 
encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING). Returns: diff --git a/yosoai/token_calculator.py b/yosoai/token_calculator.py index 00d1eb8c..cb15415a 100644 --- a/yosoai/token_calculator.py +++ b/yosoai/token_calculator.py @@ -1,20 +1,6 @@ import tiktoken from typing import List - -models_tokens = { - "gpt-3.5-turbo-0125": 16385, - "gpt-3.5-turbo": 4096, - "gpt-3.5-turbo-1106": 16385, - "gpt-3.5-turbo-instruct": 4096, - "gpt-4-0125-preview": 128000, - "gpt-4-turbo-preview": 128000, - "gpt-4-1106-preview": 128000, - "gpt-4-vision-preview": 128000, - "gpt-4": 8192, - "gpt-4-0613": 8192, - "gpt-4-32k": 32768, - "gpt-4-32k-0613": 32768, -} +from .dictionaries import models_tokens def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]: """