fix: improved links extraction for parse_node, resolves #822

2026-07-01 21:00:48 +08:00 · 2024-11-24 14:44:48 +01:00 · 2024-11-24 14:44:48 +01:00 · 7da7bfe338
commit 7da7bfe338
parent b98dd39150
1 changed files with 47 additions and 8 deletions
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@ -27,6 +27,8 @@ class ParseNode(BaseNode):
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "Parse".
    """
    url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
    relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")
    def __init__(
        self,
@ -123,12 +125,26 @@ class ParseNode(BaseNode):
            return [], []
        image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.','')
+        url = ""
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
+        all_urls = set()
-        all_urls = url_pattern.findall(text)
+        for group in ParseNode.url_pattern.findall(text):
            for el in group:
                if el != '':
                    url += el
            all_urls.add(url)
            url = ""  
        url = ""
        for group in ParseNode.relative_url_pattern.findall(text):
            for el in group:
                if el not in ['', '[', ']', '(', ')', '{', '}']:
                    url += el
            all_urls.add(urljoin(source, url))
            url = ""
        all_urls = list(all_urls)
        all_urls = self._clean_urls(all_urls)
        if not source.startswith("http"):
            all_urls = [url for url in all_urls if url.startswith("http")]
        else:
@ -151,9 +167,32 @@ class ParseNode(BaseNode):
        """
        cleaned_urls = []
        for url in urls:
-            url = re.sub(r'.*?\]\(', '', url)
+            if not ParseNode._is_valid_url(url):
-            url = url.rstrip(').')
+                url = re.sub(r'.*?\]\(', '', url)
-
+                url = re.sub(r'.*?\[\(', '', url)
-            cleaned_urls.append(url)
+                url = re.sub(r'.*?\[\)', '', url)
                url = re.sub(r'.*?\]\)', '', url)
                url = re.sub(r'.*?\)\[', '', url)
                url = re.sub(r'.*?\)\[', '', url)
                url = re.sub(r'.*?\(\]', '', url)
                url = re.sub(r'.*?\)\]', '', url)
            url = url.rstrip(').-')
            if len(url) > 0:
                cleaned_urls.append(url)
        return cleaned_urls    
    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """
        CHecks if the URL format is valid.
        Args:
            url (str): The URL to check.
        Returns:
            bool: True if the URL format is valid, False otherwise
        """
        if re.fullmatch(ParseNode.url_pattern, url) is not None:
            return True
        return False