Update cleanup_html.py

Simplify links extraction: replace the explicit loop over `<a>` tags with a single list comprehension using `soup.find_all('a', href=True)`
2026-06-23 21:00:30 +08:00 · 2024-06-04 13:49:00 +03:00 · 2024-06-04 13:49:00 +03:00 · acece72c28
commit acece72c28
parent 28d874e4e1
1 changed file with 2 additions and 6 deletions
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
        tag.extract()

    # Links extraction
-    links = soup.find_all('a')
-    link_urls = []
-    for link in links:
-        if 'href' in link.attrs:
-            link_urls.append(urljoin(base_url, link['href']))
+    link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]

    # Images extraction
    images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
        # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)

    # throw an error if no body content is found
-    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
+    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")