fix: improved links extraction for parse_node, resolves #822

This commit is contained in:
Michele_Zenoni 2024-11-24 14:44:48 +01:00
parent b98dd39150
commit 7da7bfe338

View File

@ -27,6 +27,8 @@ class ParseNode(BaseNode):
node_config (dict): Additional configuration for the node. node_config (dict): Additional configuration for the node.
node_name (str): The unique identifier name for the node, defaulting to "Parse". node_name (str): The unique identifier name for the node, defaulting to "Parse".
""" """
# Absolute-URL matcher, used by the link extraction loop and by _is_valid_url().
# Capturing groups: (1) an optional "www." prefix; (2) 1-256 host characters,
# a dot, a 1-6 character suffix, a word boundary and any trailing
# path/query characters.
# NOTE(review): the leading "[http[s]?:\/\/]?" is not an optional "http(s)://"
# group. It parses as an optional single character from the set
# {h, t, p, [, s}, then a mandatory literal "://", then an optional literal "]".
# The scheme therefore sits outside both capturing groups, so findall() does
# not return it -- confirm this is intended.
url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
# Relative-link matcher: a literal "(" followed by one captured group holding
# a path that starts with "/" and runs up to the next parenthesis or whitespace.
relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")
def __init__( def __init__(
self, self,
@ -123,12 +125,26 @@ class ParseNode(BaseNode):
return [], [] return [], []
image_extensions = default_filters.filter_dict["img_exts"] image_extensions = default_filters.filter_dict["img_exts"]
image_extension_seq = '|'.join(image_extensions).replace('.','') url = ""
url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))') all_urls = set()
all_urls = url_pattern.findall(text) for group in ParseNode.url_pattern.findall(text):
for el in group:
if el != '':
url += el
all_urls.add(url)
url = ""
url = ""
for group in ParseNode.relative_url_pattern.findall(text):
for el in group:
if el not in ['', '[', ']', '(', ')', '{', '}']:
url += el
all_urls.add(urljoin(source, url))
url = ""
all_urls = list(all_urls)
all_urls = self._clean_urls(all_urls) all_urls = self._clean_urls(all_urls)
if not source.startswith("http"): if not source.startswith("http"):
all_urls = [url for url in all_urls if url.startswith("http")] all_urls = [url for url in all_urls if url.startswith("http")]
else: else:
@ -151,9 +167,32 @@ class ParseNode(BaseNode):
""" """
cleaned_urls = [] cleaned_urls = []
for url in urls: for url in urls:
url = re.sub(r'.*?\]\(', '', url) if not ParseNode._is_valid_url(url):
url = url.rstrip(').') url = re.sub(r'.*?\]\(', '', url)
url = re.sub(r'.*?\[\(', '', url)
cleaned_urls.append(url) url = re.sub(r'.*?\[\)', '', url)
url = re.sub(r'.*?\]\)', '', url)
url = re.sub(r'.*?\)\[', '', url)
url = re.sub(r'.*?\)\[', '', url)
url = re.sub(r'.*?\(\]', '', url)
url = re.sub(r'.*?\)\]', '', url)
url = url.rstrip(').-')
if len(url) > 0:
cleaned_urls.append(url)
return cleaned_urls return cleaned_urls
@staticmethod
def _is_valid_url(url: str) -> bool:
    """
    Tell whether a string, taken as a whole, matches ``ParseNode.url_pattern``.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the entire string matches the pattern, False otherwise.
    """
    # fullmatch() only succeeds when the pattern spans the complete string.
    whole_match = ParseNode.url_pattern.fullmatch(url)
    return whole_match is not None