import pytest
from bs4 import BeautifulSoup

# Import the functions to be tested
from scrapegraphai.utils.cleanup_html import (
    cleanup_html,
    extract_from_script_tags,
    minify_html,
    reduce_html,
)


def test_extract_from_script_tags():
    """Test extracting JSON and dynamic data from script tags."""
    # NOTE(review): the fixture markup was lost from this file and has been
    # reconstructed from the assertions below (a JSON object literal with
    # "key": "value" and a global named globalVar set to "hello"). Confirm it
    # matches what extract_from_script_tags is expected to recognise.
    html = """
    <html>
        <head>
            <script>var jsonData = {"key": "value"};</script>
            <script>window.globalVar = "hello";</script>
        </head>
        <body></body>
    </html>
    """
    soup = BeautifulSoup(html, "html.parser")
    result = extract_from_script_tags(soup)
    assert "JSON data from script:" in result
    assert '"key": "value"' in result
    assert 'Dynamic data - globalVar: "hello"' in result


def test_cleanup_html_success():
    """Test cleanup_html with valid HTML containing title, body, links, images, and scripts."""
    # NOTE(review): markup reconstructed from the assertions below: a title of
    # "Test Title", a relative link to /page, a relative image image.jpg and a
    # script holding a JSON object. Only the visible text ("Hello World!",
    # "Link") survived in the damaged original.
    html = """
    <html>
        <head>
            <title>Test Title</title>
        </head>
        <body>
            <p>Hello World!</p>
            <a href="/page">Link</a>
            <img src="image.jpg">
            <script>var jsonData = {"key": "value"};</script>
        </body>
    </html>
    """
    base_url = "http://example.com"
    title, minimized_body, link_urls, image_urls, script_content = cleanup_html(
        html, base_url
    )
    assert title == "Test Title"
    # The damaged original compared against two empty strings, which is always
    # true. NOTE(review): the body tags are the presumed original operands —
    # confirm the minifier keeps both of them.
    assert "<body>" in minimized_body and "</body>" in minimized_body
    # Check the link is properly joined
    assert "http://example.com/page" in link_urls
    # Check the image is properly joined
    assert "http://example.com/image.jpg" in image_urls
    # Check that we got some output from the script extraction
    assert "JSON data from script" in script_content


def test_cleanup_html_no_body():
    """Test cleanup_html raises ValueError when no <body> tag is present."""
    # NOTE(review): the original fixture and assertion were lost from this
    # file; both are reconstructed from the test name and docstring. The
    # document deliberately has a title and visible text but no <body> element.
    html = (
        "<html><head><title>Test Title</title></head>"
        "<p>Hello World!</p></html>"
    )
    base_url = "http://example.com"
    with pytest.raises(ValueError):
        cleanup_html(html, base_url)


""" minified = minify_html(raw_html) # There should be no comment and no unnecessary spaces between tags assert "Some text