mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
refactoring of remover function
This commit is contained in:
parent
d3ec995e94
commit
a884d8f261
@ -17,27 +17,27 @@ def get_function(link:str, param = HEADERS) -> str:
|
||||
response = requests.get(url=link, headers=param)
|
||||
return str(response.content)
|
||||
|
||||
def remover(file:str, only_body:bool = False) -> str:
|
||||
def remover(file: str, only_body: bool = False) -> str:
|
||||
"""
|
||||
This function processes the HTML file and removes all unnecessary tags
|
||||
|
||||
Parameters:
|
||||
file (str): the file to parse
|
||||
only_body (bool): whether to parse only the body content or the entire file
|
||||
|
||||
Returns:
|
||||
str: the parsed file
|
||||
"""
|
||||
|
||||
res = ""
|
||||
|
||||
if only_body == True:
|
||||
if only_body:
|
||||
isBody = True
|
||||
else:
|
||||
isBody = False
|
||||
|
||||
for elem in file.splitlines():
|
||||
if "<title>" in elem:
|
||||
res = res + elem
|
||||
res += elem
|
||||
|
||||
if "<body>" in elem:
|
||||
isBody = True
|
||||
@ -48,7 +48,7 @@ def remover(file:str, only_body:bool = False) -> str:
|
||||
if "<script>" in elem:
|
||||
continue
|
||||
|
||||
if isBody == True:
|
||||
res = res + elem
|
||||
if isBody:
|
||||
res += elem
|
||||
|
||||
return res.replace("\n", "")
|
||||
return res.replace("\\n", "")
|
||||
|
||||
@ -36,26 +36,23 @@ def send_request(key: str, text: str, values: list[dict], model: str, temperatur
|
||||
create_class(values)
|
||||
time.sleep(2) # TODO: implement asynchronous waiting
|
||||
|
||||
# text = remover(text)
|
||||
text = remover(text)
|
||||
|
||||
messages = truncate_text_tokens(text, model, encoding_name)
|
||||
total_messages = len(messages)
|
||||
processed_messages = 0
|
||||
|
||||
pool = Pool(processes=2) # Limit the number of processes to 3
|
||||
pool = Pool(processes=2)
|
||||
|
||||
with tqdm(total=total_messages) as pbar:
|
||||
for result in pool.imap_unordered(process_message, [(key, temperature, model, encoding_name, message) for message in messages]):
|
||||
res.append(result)
|
||||
processed_messages += 1
|
||||
pbar.update(1) # Update the progress bar
|
||||
progress = processed_messages / total_messages * 100
|
||||
pbar.update(1)
|
||||
|
||||
# Wait for 20 seconds between requests to respect the limit
|
||||
if processed_messages % 2 == 0:
|
||||
time.sleep(20)
|
||||
|
||||
pool.close()
|
||||
pool.join()
|
||||
|
||||
return res
|
||||
pool.join()
|
||||
Loading…
Reference in New Issue
Block a user