Merge branch 'pre/beta' of https://github.com/VinciGit00/Scrapegraph-ai into pre/beta

This commit is contained in:
Marco Vinciguerra 2024-06-04 18:41:46 +02:00
commit 58cd523bc5

View File

@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
tag.extract()
# Links extraction
-links = soup.find_all('a')
-link_urls = []
-for link in links:
-if 'href' in link.attrs:
-link_urls.append(urljoin(base_url, link['href']))
+link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
# Images extraction
images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
# return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)
# throw an error if no body content is found
-raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
+raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")