mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
Merge branch 'pre/beta' of https://github.com/VinciGit00/Scrapegraph-ai into pre/beta
This commit is contained in:
commit
58cd523bc5
@@ -35,11 +35,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         tag.extract()

     # Links extraction
-    links = soup.find_all('a')
-    link_urls = []
-    for link in links:
-        if 'href' in link.attrs:
-            link_urls.append(urljoin(base_url, link['href']))
+    link_urls = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]

     # Images extraction
     images = soup.find_all('img')
@@ -62,4 +58,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         # return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)

     # throw an error if no body content is found
-    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
+    raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")

Loading…
Reference in New Issue
Block a user