mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
add generation schema
This commit is contained in:
parent
1e7ca9f3fb
commit
bb2f488236
@ -28,6 +28,7 @@ source ./venv/bin/activate
|
||||
```
|
||||
|
||||
3.
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
# if you want to install it as a library
|
||||
@ -61,7 +62,7 @@ python -m yoso-ai.examples.html_scraping
|
||||
```python
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from yosoai import get_function, send_request
|
||||
from yosoai import _get_function, send_request
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -89,7 +90,7 @@ def main():
|
||||
mockup_world_url = "https://sport.sky.it/nba?gr=www"
|
||||
|
||||
# Invoke send_request function
|
||||
result = send_request(openai_key, get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
|
||||
result = send_request(openai_key, _get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
|
||||
|
||||
# Print or process the result as needed
|
||||
print("Result:", result)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from yosoai import get_function, send_request
|
||||
from yosoai import _get_function, send_request
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -28,7 +28,7 @@ def main():
|
||||
mockup_world_url = "https://sport.sky.it/nba?gr=www"
|
||||
|
||||
# Invoke send_request function
|
||||
result = send_request(openai_key, get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
|
||||
result = send_request(openai_key, _get_function(mockup_world_url), request_settings, selected_model, temperature_value, 'cl100k_base')
|
||||
|
||||
# Print or process the result as needed
|
||||
print("Result:", result)
|
||||
|
||||
@ -4,6 +4,6 @@ langchain_core==0.1.22
|
||||
langchain_openai==0.0.5
|
||||
pytest==8.0.0
|
||||
python-dotenv==1.0.1
|
||||
setuptools==65.5.1
|
||||
setuptools==63.2.0
|
||||
tiktoken==0.6.0
|
||||
tqdm==4.66.1
|
||||
|
||||
2
setup.py
2
setup.py
@ -1,4 +1,4 @@
|
||||
# Always prefer setuptools over distutils
|
||||
# Always prefer setuptools over distutils
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
# Function to read the contents of a requirements file
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
from .class_creator import create_class
|
||||
from .class_generator import Generator
|
||||
from .getter import get_function
|
||||
from .getter import _get_function
|
||||
from .token_calculator import truncate_text_tokens
|
||||
from .request import send_request
|
||||
36
yosoai/dictionaries.py
Normal file
36
yosoai/dictionaries.py
Normal file
@ -0,0 +1,36 @@
|
||||
# Example schema describing a personal page: a few string fields plus a
# list of project objects. json_getter stringifies this dictionary and embeds
# it in the prompt as the example of the output shape the model should follow.
schema_example = {
    "properties": {
        "person_name": {"type": "string"},
        "person_surname": {"type": "string"},
        "profession": {"type": "string"},
        "hobbies": {"type": "string"},
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "project_name": {"type": "string"},
                    "project_description": {"type": "string"},
                    "url": {"type": "string"}
                },
                "required": ["project_name", "project_description", "url"],
            },
        },
    },
    "required": ["person_name", "person_surname", "profession", "hobbies", "projects"],
}
|
||||
|
||||
# Per-model token counts keyed by OpenAI model name; imported by
# token_calculator.
# NOTE(review): presumably the maximum context size used when splitting text
# into chunks - confirm against truncate_text_tokens.
models_tokens = {
    "gpt-3.5-turbo-0125": 16385,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-1106": 16385,
    "gpt-3.5-turbo-instruct": 4096,
    "gpt-4-0125-preview": 128000,
    "gpt-4-turbo-preview": 128000,
    "gpt-4-1106-preview": 128000,
    "gpt-4-vision-preview": 128000,
    "gpt-4": 8192,
    "gpt-4-0613": 8192,
    "gpt-4-32k": 32768,
    "gpt-4-32k-0613": 32768,
}
|
||||
@ -1,6 +1,6 @@
|
||||
from langchain_community.document_loaders import AsyncHtmlLoader
|
||||
|
||||
def get_function(link:str) -> str:
|
||||
def _get_function(link:str) -> str:
|
||||
"""
|
||||
It sends a GET request to the specified link with optional headers.
|
||||
|
||||
|
||||
65
yosoai/json_getter.py
Normal file
65
yosoai/json_getter.py
Normal file
@ -0,0 +1,65 @@
|
||||
import tiktoken
|
||||
from tqdm import tqdm
|
||||
from typing import List
|
||||
from .getter import _get_function
|
||||
from langchain_openai import ChatOpenAI
|
||||
from .dictionaries import schema_example
|
||||
from langchain.prompts import PromptTemplate
|
||||
from .token_calculator import truncate_text_tokens
|
||||
from langchain_core.output_parsers import JsonOutputParser
|
||||
|
||||
EMBEDDING_ENCODING = 'cl100k_base'
|
||||
|
||||
def _getJson(key: str, link: str, model_name: str, encoding_name_chunk: str = EMBEDDING_ENCODING) -> "dict | list":
    """
    Function that creates a JSON schema given a link.

    The page behind ``link`` is downloaded and split into chunks; the model is
    asked for one schema per chunk and, when several schemas come back, a
    second request merges them into a single one.

    Args:
        key (str): openai key
        link (str): link to analyze
        model_name (str): The name of the openai language model to be used,
            both for chunking the page and for the requests themselves.
        encoding_name_chunk (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
    Returns:
        The parsed JSON output of the merge request when more than one chunk
        was processed. Otherwise the list of per-chunk results is returned
        unchanged (empty, or holding the single schema).
    """
    # Use the requested model for the requests too: the chunks below are
    # sized for model_name, so the library default model must not be used.
    model = ChatOpenAI(temperature=0, openai_api_key=key, model=model_name)
    parser = JsonOutputParser()
    format_instructions = parser.get_format_instructions()

    html = _get_function(link)

    chunks = truncate_text_tokens(html, model=model_name, encoding_name=encoding_name_chunk)

    # The prompt is identical for every chunk, so build the chain once.
    extraction_prompt = PromptTemplate(
        template="You are a website scraper and you want to extract information in a schema like the example provided. Write a dictionary where the key is the section and the value is the type.\n{format_instructions}\n{query}\n. Example: {example}",
        input_variables=["query"],
        partial_variables={
            "format_instructions": format_instructions,
            "example": str(schema_example),
        },
    )
    extraction_chain = extraction_prompt | model | parser

    result = [
        extraction_chain.invoke({"query": chunk})
        for chunk in tqdm(chunks, desc="Sending chunks")
    ]

    if len(result) > 1:
        # Template braces must be balanced and the invoke key must match the
        # declared input variable, otherwise formatting the prompt fails.
        merge_prompt = PromptTemplate(
            template="You are a website scraper and you have to merge the given schemas without repetitions.\n{format_instructions}\n. Schemas to merge: {to_merge}",
            input_variables=["to_merge"],
            partial_variables={"format_instructions": format_instructions},
        )
        merge_chain = merge_prompt | model | parser

        result = merge_chain.invoke({"to_merge": str(result)})

    return result
|
||||
@ -2,15 +2,15 @@ import time
|
||||
from tqdm import tqdm
|
||||
from typing import List
|
||||
from tqdm import tqdm
|
||||
from .class_generator import Generator
|
||||
from .remover import remover
|
||||
from .class_generator import Generator
|
||||
from .class_creator import create_class
|
||||
from .token_calculator import truncate_text_tokens
|
||||
|
||||
EMBEDDING_ENCODING = 'cl100k_base'
|
||||
|
||||
LAST_REQUEST_TIME = 0
|
||||
REQUEST_INTERVAL = 20 # Adjust as needed, represents the interval in seconds between requests
|
||||
REQUEST_INTERVAL = 20
|
||||
|
||||
def send_request(key: str, text:str, values:list[dict], model:str, temperature:float = 0.0, encoding_name: str = EMBEDDING_ENCODING) -> List[dict]:
|
||||
"""
|
||||
@ -23,7 +23,7 @@ def send_request(key: str, text:str, values:list[dict], model:str, temperature:f
|
||||
- "title" (str): The title of the field.
|
||||
- "type" (str): The type of the field.
|
||||
- "description" (str): The description of the field.
|
||||
model (str): The name of the language model to be used.
|
||||
model (str): The name of the openai language model to be used.
|
||||
temperature (float): A parameter controlling the randomness of the language model's output (default: 0).
|
||||
encoding_name (str): The name of the encoding to be used (default: EMBEDDING_ENCODING).
|
||||
Returns:
|
||||
|
||||
@ -1,20 +1,6 @@
|
||||
import tiktoken
|
||||
from typing import List
|
||||
|
||||
models_tokens = {
|
||||
"gpt-3.5-turbo-0125": 16385,
|
||||
"gpt-3.5-turbo": 4096,
|
||||
"gpt-3.5-turbo-1106": 16385,
|
||||
"gpt-3.5-turbo-instruct": 4096,
|
||||
"gpt-4-0125-preview": 128000,
|
||||
"gpt-4-turbo-preview": 128000,
|
||||
"gpt-4-1106-preview": 128000,
|
||||
"gpt-4-vision-preview": 128000,
|
||||
"gpt-4": 8192,
|
||||
"gpt-4-0613": 8192,
|
||||
"gpt-4-32k": 32768,
|
||||
"gpt-4-32k-0613": 32768,
|
||||
}
|
||||
from .dictionaries import models_tokens
|
||||
|
||||
def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
|
||||
"""
|
||||
|
||||
Loading…
Reference in New Issue
Block a user