From f232717bf8fd09865a1c255935a51e7b27cbde00 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Thu, 25 Apr 2024 09:19:44 +0200 Subject: [PATCH] REFACTORING --- scrapegraphai/utils/asdt.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/scrapegraphai/utils/asdt.py b/scrapegraphai/utils/asdt.py index 92359922..973de6c6 100644 --- a/scrapegraphai/utils/asdt.py +++ b/scrapegraphai/utils/asdt.py @@ -1,15 +1,18 @@ +""" +Module for creating the tree +""" +import time from bs4 import BeautifulSoup, NavigableString from graphviz import Digraph from langchain_community.document_loaders import AsyncHtmlLoader -import json from bs4 import BeautifulSoup, NavigableString, Comment -import time -def tag_structure(tag, exclude=None): + +def tag_structure(tag, exclude=None) -> dict: """ Recursively get a tag's structure, including its attributes, children, and textual content, with an option to exclude specific tags. Text is treated as separate nodes. - + :param tag: BeautifulSoup tag object :param exclude: List of tag names to exclude from the structure :return: A dict with the tag's name, attributes, children, and text nodes @@ -26,7 +29,7 @@ def tag_structure(tag, exclude=None): text_node = {'text': { 'content': text_content, 'children': [] - } + } } return text_node else: @@ -62,19 +65,23 @@ def add_nodes_edges(graph, structure, parent=None, include_scripts=True): if parent: graph.edge(parent, node_name) # Recursively process the children nodes - add_nodes_edges(graph, content['children'], parent=node_name, include_scripts=include_scripts) + add_nodes_edges( + graph, content['children'], parent=node_name, include_scripts=include_scripts) elif isinstance(structure, list): for item in structure: - add_nodes_edges(graph, item, parent, include_scripts=include_scripts) + add_nodes_edges(graph, item, parent, + include_scripts=include_scripts) elif isinstance(structure, str) and parent: # Adding text node with limited length to keep the visualization clean - text_label = (structure[:30] + '..') if len(structure) > 30 else structure + text_label = (structure[:30] + + '..') if len(structure) > 30 else structure text_node_name = f"text_{id(structure)}" graph.node(text_node_name, label=text_label, shape="plaintext") graph.edge(parent, text_node_name) + def has_text_content(structure): if isinstance(structure, str) and structure.strip(): # If it's a string with non-whitespace characters, it's text content @@ -92,6 +99,7 @@ def has_text_content(structure): return True return False + def add_text_nodes_only(graph, structure, parent=None): """ Recursively traverse the structured HTML dictionary and create graph nodes and edges @@ -103,10 +111,11 @@ def add_text_nodes_only(graph, structure, parent=None): """ if isinstance(structure, dict): for tag, content in structure.items(): - + if 'text' in content: # Content is a text node - text_label = (content['text'][:30] + '...') if len(content['text']) > 30 else content['text'] + text_label = ( + content['text'][:30] + '...') if len(content['text']) > 30 else content['text'] text_node_name = f"text_{id(content)}" graph.node(text_node_name, label=text_label, shape="plaintext") if parent: @@ -130,8 +139,10 @@ curr_time = time.time() soup = BeautifulSoup(html_content, 'html.parser') # Generate and print structured HTML -html_structure = tag_structure(soup.find('html'), exclude=['head', 'style', 'script']) -print(f"Time taken to generate structured HTML: {time.time() - curr_time:.2f} seconds") +html_structure = tag_structure(soup.find('html'), exclude=[ + 'head', 'style', 'script']) +print( + f"Time taken to generate structured HTML: {time.time() - curr_time:.2f} seconds") # print(json.dumps(html_structure, indent=2)) # Create a Digraph object @@ -142,4 +153,4 @@ dot.attr(rankdir='LR') # Left to Right, change to 'TB' for Top to Bottom # add_nodes_edges(dot, html_structure, include_scripts=False) add_text_nodes_only(dot, html_structure) # Render the graph to a file and view it -dot.render('html_structure', view=True, format='png') \ No newline at end of file +dot.render('html_structure', view=True, format='png')