mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
structural and textual hashing
This commit is contained in:
parent
c5f9fcaabe
commit
dd99ac595e
@ -3,6 +3,25 @@ from bs4.element import Tag, NavigableString, Comment
|
||||
from langchain_community.document_loaders import AsyncHtmlLoader
|
||||
import time
|
||||
|
||||
def hash_subtree_structure(node):
    """Return a hash describing the shape of the subtree rooted at ``node``.

    A leaf contributes only its ``value``; any other node contributes its
    ``value`` together with the ordered tuple of its children's structure
    hashes, so child order matters.

    Note: the result comes from the built-in ``hash`` and is only meant to
    be compared with other hashes produced in the same interpreter run.
    """
    if node.is_leaf:
        signature = (node.value,)
    else:
        nested = tuple(map(hash_subtree_structure, node.children))
        signature = (node.value, nested)
    return hash(signature)
|
||||
|
||||
def hash_subtree_content(node):
    """Return a hash of the text found in the subtree rooted at ``node``.

    The text is collected with ``get_all_text``, lower-cased, and stripped
    of leading/trailing whitespace before hashing, so subtrees whose text
    differs only in case or outer whitespace get the same hash.
    """
    return hash(get_all_text(node).lower().strip())
|
||||
|
||||
def get_all_text(node):
    """Return the concatenated text of ``node`` and all of its descendants.

    Only nodes whose ``value`` is ``'text'`` contribute, via the
    ``'content'`` entry of their ``attributes`` (empty string if absent).
    Pieces are emitted in document (pre-order) order.

    The walk uses an explicit stack instead of recursion so very deep
    trees cannot hit the interpreter's recursion limit, and the pieces are
    joined once at the end rather than concatenated repeatedly.
    """
    pieces = []
    pending = [node]
    while pending:
        current = pending.pop()
        if current.value == 'text':
            pieces.append(current.attributes.get('content', ''))
        # Push children reversed so the first child is popped first.
        pending.extend(reversed(current.children))
    return "".join(pieces)
|
||||
|
||||
class TreeNode:
|
||||
def __init__(self, value=None, attributes=None, children=None, parent=None, depth=0):
|
||||
self.value = value
|
||||
@ -10,16 +29,23 @@ class TreeNode:
|
||||
self.children = children if children is not None else []
|
||||
self.parent = parent
|
||||
self.depth = depth
|
||||
self.leads_to_text = False # Initialize the flag as False
|
||||
self.leads_to_text = False
|
||||
self.root_path = self._compute_root_path()
|
||||
self.closest_fork_path = self._compute_fork_path()
|
||||
self.structure_hash = None
|
||||
self.content_hash = None
|
||||
|
||||
def add_child(self, child_node):
    """Attach ``child_node`` as the last child of this node.

    Sets the child's ``parent`` and ``depth`` (one more than this node's),
    appends it to ``self.children``, asks the child to recompute its paths,
    then calls ``update_leads_to_text()`` and ``update_hashes()`` on this
    node.
    """
    child_node.parent = self
    child_node.depth = self.depth + 1
    self.children.append(child_node)
    child_node.update_paths()
    # Called once per attachment; a second back-to-back call is redundant.
    self.update_leads_to_text()
    # The subtree under this node just changed, so its hashes are recomputed.
    self.update_hashes()
|
||||
|
||||
def update_hashes(self):
    """Recompute ``structure_hash`` and ``content_hash`` for this node.

    Only this node's two attributes are written; the hashes stored on
    ancestors are not touched here.
    NOTE(review): if descendants are added after this node was attached to
    its parent, the parent's stored hashes may no longer reflect the final
    subtree -- confirm whether callers rely on them.
    """
    hashers = (
        ("structure_hash", hash_subtree_structure),
        ("content_hash", hash_subtree_content),
    )
    for attribute, hasher in hashers:
        setattr(self, attribute, hasher(self))
|
||||
|
||||
def update_paths(self):
|
||||
self.root_path = self._compute_root_path()
|
||||
@ -59,7 +85,7 @@ class TreeNode:
|
||||
for child in self.children:
|
||||
subtrees.extend(child.get_subtrees())
|
||||
return subtrees
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return f"TreeNode(value={self.value}, leads_to_text={self.leads_to_text}, depth={self.depth}, root_path={self.root_path}, closest_fork_path={self.closest_fork_path})"
|
||||
|
||||
@ -110,9 +136,49 @@ class DOMTree(Tree):
|
||||
tree_node.add_child(new_node)
|
||||
self.build_dom_tree(child, new_node)
|
||||
|
||||
def index_subtrees(subtrees):
    """Group subtrees by the hashes stored on their root nodes.

    Returns a pair ``(structure_index, content_index)`` of
    ``defaultdict(list)`` objects. Each maps a root's ``structure_hash``
    (respectively ``content_hash``) to the subtrees sharing it, in input
    order.
    """
    from collections import defaultdict

    by_structure, by_content = defaultdict(list), defaultdict(list)

    for tree in subtrees:
        structure_key, content_key = tree.root.structure_hash, tree.root.content_hash
        by_structure[structure_key].append(tree)
        by_content[content_key].append(tree)

    return by_structure, by_content
|
||||
|
||||
def find_matching_subtrees(index):
    """Return every unordered pair of subtrees that share a key in ``index``.

    ``index`` maps a hash to a list of subtrees (as built by
    ``index_subtrees``). For each list, all 2-element combinations are
    emitted as tuples ``(earlier, later)`` in list order; lists with fewer
    than two entries contribute nothing.
    """
    from itertools import combinations

    matches = []
    # The keys themselves are irrelevant here; only the grouped subtrees matter.
    for subtrees in index.values():
        matches.extend(combinations(subtrees, 2))
    return matches
|
||||
|
||||
def print_subtree_details(subtree):
    """Return a one-line, ``" | "``-separated summary of a subtree's nodes.

    Despite the name, nothing is printed here. Each node visited by
    ``subtree.traverse`` is rendered as ``"<value>: <content>"``, where
    content is the ``'content'`` attribute or an empty string.
    """
    described = []

    def describe(node):
        described.append(f"{node.value}: {node.attributes.get('content', '')}")

    subtree.traverse(describe)
    return " | ".join(described)
|
||||
|
||||
def print_matches_side_by_side(matches):
    """Print each matched pair of subtrees followed by a separator rule.

    ``matches`` is an iterable of 2-item pairs. Both members are summarised
    with ``print_subtree_details`` before anything is printed for that pair.
    """
    separator = "\n" + "-"*100 + "\n"
    for first, second in matches:
        first_summary = print_subtree_details(first)
        second_summary = print_subtree_details(second)
        print("Match Pair:")
        print("Subtree 1:", first_summary)
        print("Subtree 2:", second_summary)
        print(separator)
|
||||
|
||||
# Usage example: load one page and keep the HTML of the first returned document.
# Only one loader is built; a loader that is overwritten before use does nothing.
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content
|
||||
|
||||
@ -121,11 +187,26 @@ curr_time = time.time()
|
||||
dom_tree = DOMTree(html_content)
subtrees = dom_tree.get_subtrees()  # Retrieve subtrees rooted at fork nodes

# Take the timing reading now, before indexing/matching/printing, so the
# figure reported below covers only building the tree and its subtrees.
build_seconds = time.time() - curr_time

# Index subtrees by structure and content
structure_index, content_index = index_subtrees(subtrees)

# Find matches based on structure
structure_matches = find_matching_subtrees(structure_index)
print("Structure-based matches found:", len(structure_matches))

# Print structure-based matches side by side
print_matches_side_by_side(structure_matches)

# Same report for content-based matches
content_matches = find_matching_subtrees(content_index)
print("Content-based matches found:", len(content_matches))
print_matches_side_by_side(content_matches)

print(f"Time taken to build DOM tree: {build_seconds:.2f} seconds")

# Optionally, traverse each subtree
for subtree in subtrees:
    print("Subtree rooted at:", subtree.root.value)

# To dump every node, pass a printing callback to subtree.traverse(...) for a
# single subtree, or to dom_tree.traverse(...) for the whole DOM tree.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user