mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
Merge branch 'deepScrape' of github.com:mayurdb/Scrapegraph-ai into deepScrape
This commit is contained in:
commit
dd29c16cbe
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
|
||||
from minify_html import minify
|
||||
from urllib.parse import urljoin
|
||||
|
||||
|
||||
def cleanup_html(html_content: str, base_url: str) -> str:
|
||||
"""
|
||||
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
|
||||
@@ -47,5 +48,4 @@ def cleanup_html(html_content: str, base_url: str) -> str:
|
||||
minimized_body = minify(str(body_content))
|
||||
return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls)
|
||||
|
||||
|
||||
return "Title: " + title + ", Body: No body content found" + ", Links: " + str(link_urls)
|
||||
return "Title: " + title + ", Body: No body content found" + ", Links: " + str(link_urls)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user