mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
add the opportunity to create chunks
This commit is contained in:
parent
0bc7e61072
commit
cddc9a3c8a
@ -1,4 +1,4 @@
|
||||
# 🤖 AmazScraper
|
||||
# 🤖 YOSO-ai
|
||||
|
||||
This repo is an open-source Python library for faster web scraping using AI, without requiring any knowledge of the page's HTML code.
|
||||
|
||||
@ -15,7 +15,7 @@ Official documentation page: [https://amazscraper.readthedocs.io/en/latest/index
|
||||
|
||||
Try out AmazScraper in your browser:
|
||||
|
||||
[Open in GitHub Codespaces](https://codespaces.new/VinciGit00/AmazScraper)
|
||||
[Open in GitHub Codespaces](https://obscure-trout-74p9vqwv75wcxwq6.github.dev)
|
||||
|
||||
# 🔧 Quick Setup
|
||||
|
||||
@ -182,7 +182,7 @@ Given the following input
|
||||
{
|
||||
"title": "title",
|
||||
"type": "str",
|
||||
"description": "Title of the items"
|
||||
"description": "Title of the news"
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@ -8,6 +8,13 @@ class _Response(BaseModel):
|
||||
def create_class(data_dict: dict):
|
||||
'''
|
||||
This function creates a class at runtime using the values from the list.
|
||||
Parameters:
|
||||
data_dict
|
||||
dict {
|
||||
"title": str
|
||||
"type": str,
|
||||
"description": str
|
||||
}: dictionary for describing the prompt
|
||||
'''
|
||||
for elem in data_dict:
|
||||
global base_script
|
||||
|
||||
@ -1,26 +1,26 @@
|
||||
from dotenv import load_dotenv
|
||||
from .pydantic_class import _Response
|
||||
from .class_creator import create_class
|
||||
from .class_creator import create_class #in future to remove
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain.prompts import PromptTemplate
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain.output_parsers import PydanticOutputParser
|
||||
|
||||
class Generator:
|
||||
def __init__(self, values:list, api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo"):
|
||||
def __init__(self, values:list[dict], api_key:str, temperature_param:float = 0, model_name:str = "gpt-3.5-turbo")->dict:
|
||||
"""
|
||||
Initializes the Generator object.
|
||||
|
||||
Parameters:
|
||||
- values (list): A list of values used for class creation.
|
||||
- temperature_param (float): A parameter controlling the randomness of the language model's output.
|
||||
- model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All
|
||||
values (list): A list of values used for class creation.
|
||||
temperature_param (float): A parameter controlling the randomness of the language model's output.
|
||||
model_name (str): The name of the language model to be used (default: "gpt-3.5-turbo"). All
|
||||
the possible models are available at the following link: https://platform.openai.com/docs/models
|
||||
|
||||
Returns:
|
||||
- result_dict (dict): The result of the language model invocation, converted to a dictionary.
|
||||
result_dict (dict): The result of the language model invocation, converted to a dictionary.
|
||||
"""
|
||||
create_class(values)
|
||||
create_class(values) #in future to remove
|
||||
|
||||
self.parser = PydanticOutputParser(pydantic_object=_Response)
|
||||
|
||||
|
||||
@ -1,8 +1,53 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import tiktoken
|
||||
from typing import List
|
||||
|
||||
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
|
||||
'Accept-Language': 'en-US'}
|
||||
'Accept-Language': 'en-US'}
|
||||
|
||||
models_tokens = {
|
||||
"gpt-3.5-turbo-0125": 16385,
|
||||
"gpt-3.5-turbo": 4096,
|
||||
"gpt-3.5-turbo-1106": 16385,
|
||||
"gpt-3.5-turbo-instruct": 4096,
|
||||
"gpt-4-0125-preview": 128000,
|
||||
"gpt-4-turbo-preview": 128000,
|
||||
"gpt-4-1106-preview": 128000,
|
||||
"gpt-4-vision-preview": 128000,
|
||||
"gpt-4": 8192,
|
||||
"gpt-4-0613": 8192,
|
||||
"gpt-4-32k": 32768,
|
||||
"gpt-4-32k-0613": 32768,
|
||||
}
|
||||
|
||||
EMBEDDING_ENCODING = 'cl100k_base'
|
||||
|
||||
DEFAULT_MESSAGE_LENGTH = 100
|
||||
|
||||
|
||||
def truncate_text_tokens(text: str, model: str, encoding_name: str = EMBEDDING_ENCODING) -> List[str]:
    """
    Split ``text`` into pieces that each fit in the token limit of ``model``.

    The text is tokenized with the requested tiktoken encoding, the token
    sequence is cut into consecutive slices of at most ``models_tokens[model]``
    tokens, and every slice is decoded back to a string.

    Parameters:
        text (str): text to split.
        model (str): name of the language model; must be a key of
            ``models_tokens``. All the possible models are available at the
            following link: https://platform.openai.com/docs/models
        encoding_name (str): name of the tiktoken encoding used to tokenize
            ``text`` (default: ``EMBEDDING_ENCODING``).

    Returns:
        List[str]: the chunks, in their original order. An empty ``text``
        yields an empty list.

    Raises:
        KeyError: if ``model`` is not listed in ``models_tokens``.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    try:
        max_tokens = models_tokens[model]
    except KeyError:
        # Same exception type as the plain lookup, but with an actionable message.
        raise KeyError(
            f"Unknown model {model!r}; supported models: {sorted(models_tokens)}"
        ) from None

    encoded_text = encoding.encode(text)

    # NOTE(review): each chunk may use the model's whole token limit, leaving
    # no room for a prompt or completion — confirm this is what callers expect.
    # NOTE(review): slices are cut on token boundaries, so a character encoded
    # across two tokens may not decode cleanly at a chunk edge — verify against
    # tiktoken's decode behaviour.
    return [
        encoding.decode(encoded_text[start:start + max_tokens])
        for start in range(0, len(encoded_text), max_tokens)
    ]
|
||||
|
||||
|
||||
def get_function(link:str, param = HEADERS) -> str:
|
||||
"""
|
||||
@ -46,4 +91,5 @@ def remover(file:str) -> str:
|
||||
|
||||
if isBody == True:
|
||||
res = res + elem
|
||||
return res
|
||||
|
||||
return res.replace("\n", "")
|
||||
|
||||
@ -2,4 +2,4 @@
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field
|
||||
|
||||
class _Response(BaseModel):
|
||||
title_website: str = Field(description='Give me the website name')
|
||||
title: str = Field(description='Title of the news')
|
||||
|
||||
25
amazscraper/request.py
Normal file
25
amazscraper/request.py
Normal file
@ -0,0 +1,25 @@
|
||||
from amazscraper.getter import remover
|
||||
from .class_creator import create_class
|
||||
from amazscraper.class_generator import Generator
|
||||
|
||||
|
||||
def send_request(values:list[dict]) ->dict:
    """
    Send a request described by ``values``.

    NOTE(review): this is currently a stub. ``values`` is not used, the
    function calls ``create_class`` with the placeholder string "TOADD",
    and it always returns an empty dict.

    Param:
        values (list[dict]): settings of the request.
            Format:
                [
                    dict {
                        "title": str,
                        "type": str,
                        "description": str
                    }
                ]

    Return:
        dict: the result of the request to openai (empty for now).
    """
    # Placeholder call; the real argument is still to be added.
    create_class("TOADD")

    return {}
|
||||
@ -5,8 +5,8 @@ langchain_openai==0.0.5
|
||||
python-dotenv==1.0.1
|
||||
Requests==2.31.0
|
||||
pytest==8.0.0
|
||||
|
||||
wheel==0.42.0
|
||||
tiktoken==0.5.2
|
||||
twine==4.0.2
|
||||
sphinx==7.1.2
|
||||
sphinx-rtd-theme==2.0.0
|
||||
Loading…
Reference in New Issue
Block a user