refactoring of remover function

This commit is contained in:
VinciGit00 2024-02-07 20:54:02 +01:00
parent d3ec995e94
commit a884d8f261
2 changed files with 11 additions and 14 deletions

View File

@ -17,27 +17,27 @@ def get_function(link:str, param = HEADERS) -> str:
response = requests.get(url=link, headers=param)
return str(response.content)
def remover(file:str, only_body:bool = False) -> str:
def remover(file: str, only_body: bool = False) -> str:
"""
This function processes the HTML file and removes all the unnecessary tags
Parameters:
file (str): the file to parse
only_body (bool): whether to parse only the body content or the entire file
Returns:
str: the parsed file
"""
res = ""
if only_body == True:
if only_body:
isBody = True
else:
isBody = False
for elem in file.splitlines():
if "<title>" in elem:
res = res + elem
res += elem
if "<body>" in elem:
isBody = True
@ -48,7 +48,7 @@ def remover(file:str, only_body:bool = False) -> str:
if "<script>" in elem:
continue
if isBody == True:
res = res + elem
if isBody:
res += elem
return res.replace("\n", "")
return res.replace("\\n", "")

View File

@ -36,26 +36,23 @@ def send_request(key: str, text: str, values: list[dict], model: str, temperatur
create_class(values)
time.sleep(2) # TODO: implement asynchronous waiting
# text = remover(text)
text = remover(text)
messages = truncate_text_tokens(text, model, encoding_name)
total_messages = len(messages)
processed_messages = 0
pool = Pool(processes=2) # Limit the number of processes to 3
pool = Pool(processes=2)
with tqdm(total=total_messages) as pbar:
for result in pool.imap_unordered(process_message, [(key, temperature, model, encoding_name, message) for message in messages]):
res.append(result)
processed_messages += 1
pbar.update(1) # Update the progress bar
progress = processed_messages / total_messages * 100
pbar.update(1)
# Wait for 20 seconds between requests to respect the limit
if processed_messages % 2 == 0:
time.sleep(20)
pool.close()
pool.join()
return res
pool.join()