From 1dd9b95a25ca8982db3fe8e00a227d14057eca96 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Mon, 4 Aug 2025 12:42:32 -0400 Subject: [PATCH] Fix edge case for empty tags --- surya/recognition/util.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/surya/recognition/util.py b/surya/recognition/util.py index 51d4a4b..f196dd1 100644 --- a/surya/recognition/util.py +++ b/surya/recognition/util.py @@ -47,8 +47,16 @@ def filter_blacklist_tags(text_chars: List[TextChar]) -> List[TextChar]: if char == ">": full_tag = ''.join(c.text for c in char_buffer) inner = full_tag[1:-1].strip() # remove < > - tag_name_candidate = inner.strip("/").split()[0] # remove '/' and any attributes - + inner = inner.strip("/") # remove '/' + + # Possible that it is just an empty <> + if not inner: + filtered_chars.extend(char_buffer) + in_tag = False + char_buffer = [] + continue + + tag_name_candidate = inner.split()[0] # remove any attributes if tag_name_candidate in BLACKLIST_TAGS: # Discard tag pass