This commit is contained in:
EURAC\marperini 2024-04-25 11:06:54 +02:00
commit c5f9fcaabe

View File

@ -1,15 +1,18 @@
"""
Module for creating the tree
"""
import time
from bs4 import BeautifulSoup, NavigableString
from graphviz import Digraph
from langchain_community.document_loaders import AsyncHtmlLoader
import json
from bs4 import BeautifulSoup, NavigableString, Comment
import time
def tag_structure(tag, exclude=None):
def tag_structure(tag, exclude=None) -> dict:
"""
Recursively get a tag's structure, including its attributes, children, and textual content,
with an option to exclude specific tags. Text is treated as separate nodes.
:param tag: BeautifulSoup tag object
:param exclude: List of tag names to exclude from the structure
:return: A dict with the tag's name, attributes, children, and text nodes
@ -26,7 +29,7 @@ def tag_structure(tag, exclude=None):
text_node = {'text': {
'content': text_content,
'children': []
}
}
}
return text_node
else:
@ -62,19 +65,23 @@ def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
if parent:
graph.edge(parent, node_name)
# Recursively process the children nodes
add_nodes_edges(graph, content['children'], parent=node_name, include_scripts=include_scripts)
add_nodes_edges(
graph, content['children'], parent=node_name, include_scripts=include_scripts)
elif isinstance(structure, list):
for item in structure:
add_nodes_edges(graph, item, parent, include_scripts=include_scripts)
add_nodes_edges(graph, item, parent,
include_scripts=include_scripts)
elif isinstance(structure, str) and parent:
# Adding text node with limited length to keep the visualization clean
text_label = (structure[:30] + '..') if len(structure) > 30 else structure
text_label = (structure[:30] +
'..') if len(structure) > 30 else structure
text_node_name = f"text_{id(structure)}"
graph.node(text_node_name, label=text_label, shape="plaintext")
graph.edge(parent, text_node_name)
def has_text_content(structure):
if isinstance(structure, str) and structure.strip():
# If it's a string with non-whitespace characters, it's text content
@ -92,6 +99,7 @@ def has_text_content(structure):
return True
return False
def add_text_nodes_only(graph, structure, parent=None):
"""
Recursively traverse the structured HTML dictionary and create graph nodes and edges
@ -103,10 +111,11 @@ def add_text_nodes_only(graph, structure, parent=None):
"""
if isinstance(structure, dict):
for tag, content in structure.items():
if 'text' in content:
# Content is a text node
text_label = (content['text'][:30] + '...') if len(content['text']) > 30 else content['text']
text_label = (
content['text'][:30] + '...') if len(content['text']) > 30 else content['text']
text_node_name = f"text_{id(content)}"
graph.node(text_node_name, label=text_label, shape="plaintext")
if parent:
@ -130,8 +139,10 @@ curr_time = time.time()
soup = BeautifulSoup(html_content, 'html.parser')
# Generate the structured HTML and report how long it took
html_structure = tag_structure(soup.find('html'), exclude=['head', 'style', 'script'])
print(f"Time taken to generate structured HTML: {time.time() - curr_time:.2f} seconds")
html_structure = tag_structure(soup.find('html'), exclude=[
'head', 'style', 'script'])
print(
f"Time taken to generate structured HTML: {time.time() - curr_time:.2f} seconds")
# print(json.dumps(html_structure, indent=2))
# Create a Digraph object
@ -142,4 +153,4 @@ dot.attr(rankdir='LR') # Left to Right, change to 'TB' for Top to Bottom
# add_nodes_edges(dot, html_structure, include_scripts=False)
add_text_nodes_only(dot, html_structure)
# Render the graph to a file and view it
dot.render('html_structure', view=True, format='png')
dot.render('html_structure', view=True, format='png')