mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
fix: improved links extraction for parse_node, resolves #822
This commit is contained in:
parent
b98dd39150
commit
7da7bfe338
@ -27,6 +27,8 @@ class ParseNode(BaseNode):
|
|||||||
node_config (dict): Additional configuration for the node.
|
node_config (dict): Additional configuration for the node.
|
||||||
node_name (str): The unique identifier name for the node, defaulting to "Parse".
|
node_name (str): The unique identifier name for the node, defaulting to "Parse".
|
||||||
"""
|
"""
|
||||||
|
url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
|
||||||
|
relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -123,12 +125,26 @@ class ParseNode(BaseNode):
|
|||||||
return [], []
|
return [], []
|
||||||
|
|
||||||
image_extensions = default_filters.filter_dict["img_exts"]
|
image_extensions = default_filters.filter_dict["img_exts"]
|
||||||
image_extension_seq = '|'.join(image_extensions).replace('.','')
|
url = ""
|
||||||
url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
|
all_urls = set()
|
||||||
|
|
||||||
all_urls = url_pattern.findall(text)
|
for group in ParseNode.url_pattern.findall(text):
|
||||||
|
for el in group:
|
||||||
|
if el != '':
|
||||||
|
url += el
|
||||||
|
all_urls.add(url)
|
||||||
|
url = ""
|
||||||
|
|
||||||
|
url = ""
|
||||||
|
for group in ParseNode.relative_url_pattern.findall(text):
|
||||||
|
for el in group:
|
||||||
|
if el not in ['', '[', ']', '(', ')', '{', '}']:
|
||||||
|
url += el
|
||||||
|
all_urls.add(urljoin(source, url))
|
||||||
|
url = ""
|
||||||
|
|
||||||
|
all_urls = list(all_urls)
|
||||||
all_urls = self._clean_urls(all_urls)
|
all_urls = self._clean_urls(all_urls)
|
||||||
|
|
||||||
if not source.startswith("http"):
|
if not source.startswith("http"):
|
||||||
all_urls = [url for url in all_urls if url.startswith("http")]
|
all_urls = [url for url in all_urls if url.startswith("http")]
|
||||||
else:
|
else:
|
||||||
@ -151,9 +167,32 @@ class ParseNode(BaseNode):
|
|||||||
"""
|
"""
|
||||||
cleaned_urls = []
|
cleaned_urls = []
|
||||||
for url in urls:
|
for url in urls:
|
||||||
url = re.sub(r'.*?\]\(', '', url)
|
if not ParseNode._is_valid_url(url):
|
||||||
url = url.rstrip(').')
|
url = re.sub(r'.*?\]\(', '', url)
|
||||||
|
url = re.sub(r'.*?\[\(', '', url)
|
||||||
|
url = re.sub(r'.*?\[\)', '', url)
|
||||||
|
url = re.sub(r'.*?\]\)', '', url)
|
||||||
|
url = re.sub(r'.*?\)\[', '', url)
|
||||||
|
url = re.sub(r'.*?\)\[', '', url)
|
||||||
|
url = re.sub(r'.*?\(\]', '', url)
|
||||||
|
url = re.sub(r'.*?\)\]', '', url)
|
||||||
|
url = url.rstrip(').-')
|
||||||
|
if len(url) > 0:
|
||||||
|
cleaned_urls.append(url)
|
||||||
|
|
||||||
|
return cleaned_urls
|
||||||
|
|
||||||
cleaned_urls.append(url)
|
@staticmethod
|
||||||
|
def _is_valid_url(url: str) -> bool:
|
||||||
|
"""
|
||||||
|
Checks if the URL format is valid.
|
||||||
|
|
||||||
return cleaned_urls
|
Args:
|
||||||
|
url (str): The URL to check.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if the URL format is valid, False otherwise
|
||||||
|
"""
|
||||||
|
if re.fullmatch(ParseNode.url_pattern, url) is not None:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user