Update cleanup_html.py

Remove redundant lines in Links extraction
This commit is contained in:
seyf97 2024-06-04 13:49:00 +03:00 committed by GitHub
parent 28d874e4e1
commit acece72c28
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
tag.extract()
# Links extraction
links = soup.find_all('a')
link_urls = []
for link in links:
if 'href' in link.attrs:
link_urls.append(urljoin(base_url, link['href']))
link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
# Images extraction
images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
# return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)
# throw an error if no body content is found
raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")