From c3d1b7c200e6fd065bd5aea79b90ca3db4d42b16 Mon Sep 17 00:00:00 2001
From: Lorenzo Paleari <100212108+LorenzoPaleari@users.noreply.github.com>
Date: Fri, 13 Sep 2024 01:47:39 +0200
Subject: [PATCH] fix: OmniScraperGraph working. Added URL scraping capability
 to ParseNode

---
 scrapegraphai/nodes/parse_node.py | 65 ++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
index 240daf1f..1f919926 100644
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@@ -1,11 +1,14 @@
 """
 ParseNode Module
 """
-from typing import List, Optional
+import re
+from typing import List, Optional, Tuple
+from urllib.parse import urljoin
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
 from ..utils.split_text_into_chunks import split_text_into_chunks
+from ..helpers import default_filters
 
 class ParseNode(BaseNode):
     """
@@ -40,6 +43,9 @@ class ParseNode(BaseNode):
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
+        self.parse_urls = (
+            False if node_config is None else node_config.get("parse_urls", False)
+        )
 
         self.llm_model = node_config.get("llm_model")
         self.chunk_size = node_config.get("chunk_size")
@@ -66,16 +72,21 @@ class ParseNode(BaseNode):
 
         input_data = [state[key] for key in input_keys]
         docs_transformed = input_data[0]
+        source = input_data[1] if self.parse_urls else None
 
         if self.parse_html:
             docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
+            link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
+
             chunks = split_text_into_chunks(text=docs_transformed.page_content,
                                             chunk_size=self.chunk_size-250, model=self.llm_model)
         else:
             docs_transformed = docs_transformed[0]
 
+            link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
+
             chunk_size = self.chunk_size
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
@@ -89,5 +100,57 @@ class ParseNode(BaseNode):
                                                 model=self.llm_model)
 
         state.update({self.output[0]: chunks})
+        if self.parse_urls:
+            state.update({self.output[1]: link_urls})
+            state.update({self.output[2]: img_urls})
 
         return state
+
+    def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
+        """
+        Extracts link and image URLs from the given text.
+
+        Args:
+            text (str): The text to extract URLs from.
+            source (str): Origin of the text; URLs are resolved against it when it
+                starts with "http", otherwise only URLs starting with "http" are kept.
+
+        Returns:
+            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
+        """
+        if not self.parse_urls:
+            return [], []
+
+        image_extensions = tuple(default_filters.filter_dict["img_exts"])
+        # Strip only the leading dot and escape the rest so every extension stays literal.
+        ext_alternatives = '|'.join(re.escape(ext.lstrip('.')) for ext in image_extensions)
+        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + ext_alternatives + '))')
+        all_urls = self._clean_urls(url_pattern.findall(text))
+        if not source.startswith("http"):
+            all_urls = [url for url in all_urls if url.startswith("http")]
+        else:
+            all_urls = [urljoin(source, url) for url in all_urls]
+
+        # endswith accepts a tuple, avoiding the quadratic `url not in images` lookup.
+        images = [url for url in all_urls if url.endswith(image_extensions)]
+        links = [url for url in all_urls if not url.endswith(image_extensions)]
+        return links, images
+    
+    def _clean_urls(self, urls: List[str]) -> List[str]:
+        """
+        Strips markdown link syntax and trailing punctuation from raw URL matches.
+
+        Args:
+            urls (List[str]): The raw URL strings to clean.
+
+        Returns:
+            List[str]: The cleaned URLs, in the same order as the input.
+        """
+        # Matches everything up to and including a "](" of a markdown link.
+        markdown_prefix = re.compile(r'.*?\]\(')
+
+        def strip_wrapping(raw: str) -> str:
+            # rstrip removes any trailing run of ")" and "." characters.
+            return markdown_prefix.sub('', raw).rstrip(').')
+
+        return [strip_wrapping(raw) for raw in urls]
\ No newline at end of file