structural and textual hashing

2026-06-25 21:11:11 +08:00 · 2024-04-25 11:56:41 +02:00 · 2024-04-25 11:56:41 +02:00 · dd99ac595e
commit dd99ac595e
parent c5f9fcaabe
1 changed files with 87 additions and 6 deletions
--- a/scrapegraphai/utils/aaa.py
+++ b/scrapegraphai/utils/aaa.py
@ -3,6 +3,25 @@ from bs4.element import Tag, NavigableString, Comment
 from langchain_community.document_loaders import AsyncHtmlLoader
 import time

+def hash_subtree_structure(node):
+    """Return a hash describing the shape of the subtree rooted at ``node``.
+
+    A leaf contributes the hash of the one-element tuple ``(node.value,)``.
+    Any other node contributes the hash of ``(node.value, child_hashes)``,
+    where ``child_hashes`` is the tuple of its children's structure hashes
+    in child order. Only ``value`` and the child layout are consulted, so
+    attributes (including text content) do not affect the result.
+
+    The value comes from the built-in ``hash``; when ``node.value`` is a
+    string it is only comparable within a single interpreter run.
+    """
+    if not node.is_leaf:
+        # Hash the children first, then fold them in together with the tag.
+        nested = tuple(map(hash_subtree_structure, node.children))
+        return hash((node.value, nested))
+    return hash((node.value,))
+
+def hash_subtree_content(node):
+    """Return a hash of the text found in the subtree rooted at ``node``.
+
+    The text collected by ``get_all_text`` is lower-cased and stripped of
+    leading/trailing whitespace before hashing, so subtrees whose text
+    differs only in case or outer whitespace get the same hash.
+
+    The result is the built-in ``hash`` of a string, which is salted per
+    interpreter run; compare these values only within one process.
+    """
+    raw_text = get_all_text(node)
+    normalized = raw_text.lower().strip()
+    return hash(normalized)
+
+def get_all_text(node):
+    """Concatenate the text held by ``node`` and all of its descendants.
+
+    A node contributes its ``'content'`` attribute only when its ``value``
+    is ``'text'``; every other node contributes an empty string. The
+    pieces are joined depth-first in child order with no separator.
+    """
+    if node.value == 'text':
+        gathered = node.attributes.get('content', '')
+    else:
+        gathered = ''
+    for descendant in node.children:
+        gathered += get_all_text(descendant)
+    return gathered
+
 class TreeNode:
    def __init__(self, value=None, attributes=None, children=None, parent=None, depth=0):
        self.value = value
@ -10,16 +29,23 @@ class TreeNode:
        self.children = children if children is not None else []
        self.parent = parent
        self.depth = depth
-        self.leads_to_text = False  # Initialize the flag as False
+        self.leads_to_text = False
        self.root_path = self._compute_root_path()
        self.closest_fork_path = self._compute_fork_path()
+        self.structure_hash = None
+        self.content_hash = None

    def add_child(self, child_node):
+        """Attach ``child_node`` beneath this node and refresh derived state.
+
+        Sets the child's ``parent`` and ``depth`` (one deeper than this
+        node), appends it to ``children``, recomputes the child's paths,
+        then updates this node's ``leads_to_text`` flag and hashes.
+        """
        child_node.parent = self
        child_node.depth = self.depth + 1
        self.children.append(child_node)
        child_node.update_paths()
-        self.update_leads_to_text()  # Update this node if the child leads to text
+        self.update_leads_to_text()
+        self.update_hashes()  # Update hashes when the structure changes
+
+    def update_hashes(self):
+        """Recompute the hashes of this node and of every ancestor.
+
+        ``structure_hash`` and ``content_hash`` summarise a whole subtree,
+        so a change below this node also invalidates the hashes of all
+        nodes above it. Refreshing only ``self`` leaves ancestors stale
+        whenever descendants are attached after the node itself has been
+        added to its parent, so the update walks up the ``parent`` chain.
+
+        Each step re-hashes a full subtree; this favours correctness over
+        speed and can be costly on very large trees.
+        """
+        node = self
+        while node is not None:
+            node.structure_hash = hash_subtree_structure(node)
+            node.content_hash = hash_subtree_content(node)
+            node = node.parent

    def update_paths(self):
        self.root_path = self._compute_root_path()
@ -59,7 +85,7 @@ class TreeNode:
        for child in self.children:
            subtrees.extend(child.get_subtrees())
        return subtrees
-    
+
    def __repr__(self):
        return f"TreeNode(value={self.value}, leads_to_text={self.leads_to_text}, depth={self.depth}, root_path={self.root_path}, closest_fork_path={self.closest_fork_path})"

@ -110,9 +136,49 @@ class DOMTree(Tree):
                tree_node.add_child(new_node)
                self.build_dom_tree(child, new_node)

+def index_subtrees(subtrees):
+    """Group subtrees by the hashes stored on their root nodes.
+
+    Returns a pair ``(structure_index, content_index)`` of ``defaultdict``
+    objects. Each maps a hash value (``root.structure_hash`` or
+    ``root.content_hash`` respectively) to the list of subtrees carrying
+    that value, in the order they were encountered.
+    """
+    from collections import defaultdict
+
+    by_structure, by_content = defaultdict(list), defaultdict(list)
+    for candidate in subtrees:
+        root = candidate.root
+        by_structure[root.structure_hash].append(candidate)
+        by_content[root.content_hash].append(candidate)
+    return by_structure, by_content
+
+def find_matching_subtrees(index):
+    """Return every pair of subtrees that share a key in ``index``.
+
+    ``index`` maps a hash value to a list of subtrees (as built by
+    ``index_subtrees``). For each list, all unordered pairs are emitted as
+    ``(earlier, later)`` tuples following list order; lists with fewer
+    than two entries contribute nothing. The result is a flat list.
+    """
+    from itertools import combinations
+
+    matches = []
+    # Only the grouped lists matter; the hash keys themselves are unused.
+    for group in index.values():
+        matches.extend(combinations(group, 2))
+    return matches
+
+def print_subtree_details(subtree):
+    """Build a one-line summary of a subtree for side-by-side comparison.
+
+    Despite the name, nothing is printed: the summary is returned. Each
+    node visited by ``subtree.traverse`` is rendered as
+    ``"<value>: <content attribute or empty>"`` and the entries are joined
+    with ``" | "`` in visit order.
+    """
+    described = []
+
+    def record(node):
+        described.append(f"{node.value}: {node.attributes.get('content', '')}")
+
+    subtree.traverse(record)
+    return " | ".join(described)
+
+def print_matches_side_by_side(matches):
+    """Print each matched pair of subtrees, one summary line per subtree.
+
+    ``matches`` is an iterable of two-element pairs. For every pair the
+    summaries produced by ``print_subtree_details`` are written to stdout
+    under a "Match Pair:" heading, followed by a 100-dash separator.
+    """
+    divider = "\n" + "-"*100 + "\n"
+    for first, second in matches:
+        first_details = print_subtree_details(first)
+        second_details = print_subtree_details(second)
+        print("Match Pair:")
+        print("Subtree 1:", first_details)
+        print("Subtree 2:", second_details)
+        print(divider)
+
 # Usage example:

-loader = AsyncHtmlLoader('https://github.com/PeriniM')
+loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
 document = loader.load()
 html_content = document[0].page_content

@ -121,11 +187,26 @@ curr_time = time.time()
 dom_tree = DOMTree(html_content)
 subtrees = dom_tree.get_subtrees()  # Retrieve subtrees rooted at fork nodes

+# Group the subtrees by the structure and content hashes of their roots
+structure_index, content_index = index_subtrees(subtrees)
+
+# Find matches based on structure
+structure_matches = find_matching_subtrees(structure_index)
+print("Structure-based matches found:", len(structure_matches))
+
+# Print structure-based matches side by side
+print_matches_side_by_side(structure_matches)
+
+# Repeat the search and side-by-side printout for content-based matches
+content_matches = find_matching_subtrees(content_index)
+print("Content-based matches found:", len(content_matches))
+print_matches_side_by_side(content_matches)
+
+# NOTE(review): curr_time appears to be taken before the DOM tree is built;
+# because this print now runs after indexing and printing the matches, the
+# reported duration presumably includes that work too -- confirm and
+# consider moving it directly after the tree construction.
 print(f"Time taken to build DOM tree: {time.time() - curr_time:.2f} seconds")

 # Optionally, traverse each subtree
-for subtree in subtrees:
-    print("Subtree rooted at:", subtree.root.value)
+# for subtree in subtrees:
+#     print("Subtree rooted at:", subtree.root.value)
    # subtree.traverse(lambda node: print(node))
 # Traverse the DOMTree and print each node
 # dom_tree.traverse(lambda node: print(node))