"""Tests for the HTML helpers in ``scrapegraphai.utils.cleanup_html``."""

import pytest
from bs4 import BeautifulSoup

# Import the functions to be tested
from scrapegraphai.utils.cleanup_html import (
    cleanup_html,
    extract_from_script_tags,
    minify_html,
    reduce_html,
)

# NOTE(review): the inline HTML fixtures in this module were rebuilt from what
# each test asserts (the markup had been stripped from the file). Confirm them
# against the current cleanup_html implementation before relying on them.


def test_extract_from_script_tags():
    """Test extracting JSON and dynamic data from script tags."""
    # One script carries a JSON object literal, the other a window-level
    # assignment; each script is kept on a single line.
    html = """
    <html>
      <head>
        <script>var jsonData = {"key": "value"};</script>
        <script>window.globalVar = "hello";</script>
      </head>
      <body></body>
    </html>
    """
    soup = BeautifulSoup(html, "html.parser")
    result = extract_from_script_tags(soup)
    assert "JSON data from script:" in result
    assert '"key": "value"' in result
    assert 'Dynamic data - globalVar: "hello"' in result


def test_cleanup_html_success():
    """Test cleanup_html with valid HTML containing title, body, links, images, and scripts."""
    html = """
    <html>
      <head>
        <title>Test Title</title>
      </head>
      <body>
        <p>Hello World!</p>
        <a href="/page">Link</a>
        <img src="image.jpg"/>
        <script>var info = {"num": 123};</script>
      </body>
    </html>
    """
    base_url = "http://example.com"
    title, minimized_body, link_urls, image_urls, script_content = cleanup_html(
        html, base_url
    )
    assert title == "Test Title"
    # The minified output must still be wrapped in the body element.
    assert "<body>" in minimized_body and "</body>" in minimized_body
    # Check the link is properly joined
    assert "http://example.com/page" in link_urls
    # Check the image is properly joined
    assert "http://example.com/image.jpg" in image_urls
    # Check that we got some output from the script extraction
    assert "JSON data from script" in script_content


def test_cleanup_html_no_body():
    """Test cleanup_html raises ValueError when no <body> tag is present."""
    html = "<html><head><title>No Body</title></head></html>"
    base_url = "http://example.com"
    with pytest.raises(ValueError) as excinfo:
        cleanup_html(html, base_url)
    # Checked after the ``with`` block: statements following the raising call
    # inside the block would never execute.
    assert "No HTML body content found" in str(excinfo.value)


def test_minify_html():
    """Test minify_html function to remove comments and unnecessary whitespace."""
    raw_html = """
    <html>
      <!-- This is a comment -->
      <body>
        <p>Hello World!</p>
      </body>
    </html>
    """
    minified = minify_html(raw_html)
    # There should be no comment and no unnecessary spaces between tags
    assert "<!--" not in minified
    assert "-->" not in minified
    assert "> <" not in minified
    # The visible text itself must survive minification.
    assert "Hello World!" in minified


def test_reduce_html_reduction_1():
    """Test reduce_html at reduction level 1 (unwanted attributes are removed)."""
    raw_html = """
    <html>
      <body>
        <div class="keep" data-extra="should_remove" style="color:red;">
          <p>Some text</p>
        </div>
      </body>
    </html>
    """
    reduced = reduce_html(raw_html, 1)
    # Ensure that unwanted attributes are removed (data-extra and style are gone, class remains)
    assert "data-extra" not in reduced
    assert "style=" not in reduced
    assert 'class="keep"' in reduced


def test_reduce_html_reduction_2():
    """Test reduce_html at reduction level 2 (further reducing text content and decomposing style tags)."""
    full_text = "Long text with more than twenty characters. Extra content."
    raw_html = f"""
    <html>
      <head>
        <style>.unused {{ color: blue; }}</style>
      </head>
      <body>
        <p>{full_text}</p>
      </body>
    </html>
    """
    reduced = reduce_html(raw_html, 2)
    # For level 2, text is expected to be truncated to its first 20 characters.
    # NOTE(review): the previous expected literal "Long text with more t" was
    # 21 characters long, one more than that limit. Asserting on the stripped
    # 20-character prefix holds whether or not a trailing space is kept;
    # confirm the exact limit against reduce_html.
    kept_prefix = full_text[:20].rstrip()
    assert kept_prefix in reduced
    # Everything well past the limit must be gone.
    assert "Extra content" not in reduced
    # Confirm that style tags contents are completely removed
    assert ".unused" not in reduced


def test_reduce_html_no_body():
    """Test reduce_html returns specific message when no <body> tag is present."""
    raw_html = "<html><head><title>No Body</title></head></html>"
    reduced = reduce_html(raw_html, 2)
    assert reduced == "No <body> tag found in the HTML"