Merge pull request #662 from LorenzoPaleari/580-omni-scraper-not-working

580 - Omni Scraper not working
2026-06-28 21:01:55 +08:00 · 2024-09-13 08:58:42 +02:00 · 2024-09-13 08:58:42 +02:00 · da827a76fc
commit da827a76fc
parent 88b2c469ae c3d1b7c200
1 changed files with 64 additions and 1 deletions
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@ -1,11 +1,14 @@
 """
 ParseNode Module
 """
-from typing import List, Optional
+import re
+from typing import List, Optional, Tuple
+from urllib.parse import urljoin
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
 from ..utils.split_text_into_chunks import split_text_into_chunks
+from ..helpers import default_filters

 class ParseNode(BaseNode):
    """
@ -40,6 +43,9 @@ class ParseNode(BaseNode):
        self.parse_html = (
            True if node_config is None else node_config.get("parse_html", True)
        )
+        self.parse_urls = (
+            False if node_config is None else node_config.get("parse_urls", False)
+        )

        self.llm_model = node_config.get("llm_model")
        self.chunk_size = node_config.get("chunk_size")
@ -66,16 +72,21 @@ class ParseNode(BaseNode):

        input_data = [state[key] for key in input_keys]
        docs_transformed = input_data[0]
+        source = input_data[1] if self.parse_urls else None

        if self.parse_html:
            docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
            docs_transformed = docs_transformed[0]

+            link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
+
            chunks = split_text_into_chunks(text=docs_transformed.page_content,
                                            chunk_size=self.chunk_size-250, model=self.llm_model)
        else:
            docs_transformed = docs_transformed[0]

+            link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
+
            chunk_size = self.chunk_size
            chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

@ -89,5 +100,57 @@ class ParseNode(BaseNode):
                                                model=self.llm_model)

        state.update({self.output[0]: chunks})
+        if self.parse_urls:
+            state.update({self.output[1]: link_urls})
+            state.update({self.output[2]: img_urls})

        return state
+
+    def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
+        """
+        Extracts link URLs and image URLs from the given text.
+
+        Candidates are absolute ``http(s)://`` URLs plus any whitespace-free
+        token that ends in one of the extensions listed in
+        ``default_filters.filter_dict["img_exts"]``. Each candidate is passed
+        through ``_clean_urls`` before being classified.
+
+        Args:
+            text (str): The text to extract URLs from.
+            source (str): The origin of the text. When it starts with "http",
+                every candidate is resolved against it with ``urljoin``.
+                Otherwise (including ``None`` or an empty string) only
+                candidates that already start with "http" are kept, because
+                there is no base to resolve relative paths against.
+
+        Returns:
+            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs
+            and image URLs, in that order. Both lists are empty when
+            ``self.parse_urls`` is falsy.
+        """
+        if not self.parse_urls:
+            return [], []
+
+        # A tuple lets str.endswith() test every extension in a single call.
+        image_extensions = tuple(default_filters.filter_dict["img_exts"])
+        # Drop only the leading dot and escape the remainder, so an extension
+        # containing regex metacharacters (or an inner dot) is matched literally.
+        extension_alternatives = '|'.join(
+            re.escape(ext.lstrip('.')) for ext in image_extensions
+        )
+        url_pattern = re.compile(
+            r'(https?://[^\s]+|\S+\.(?:' + extension_alternatives + '))'
+        )
+
+        all_urls = self._clean_urls(url_pattern.findall(text))
+
+        if source and source.startswith("http"):
+            # Remote source: resolve relative candidates against it.
+            all_urls = [urljoin(source, url) for url in all_urls]
+        else:
+            # Local or missing source: relative candidates cannot be resolved.
+            all_urls = [url for url in all_urls if url.startswith("http")]
+
+        # Single pass: a URL is an image exactly when it ends with a known
+        # image extension; everything else is a link.
+        links: List[str] = []
+        images: List[str] = []
+        for url in all_urls:
+            if url.endswith(image_extensions):
+                images.append(url)
+            else:
+                links.append(url)
+
+        return links, images
+    
+    def _clean_urls(self, urls: List[str]) -> List[str]:
+        """
+        Strips markdown link syntax from the URLs extracted from the text.
+
+        For every entry, each lazily matched run that ends in ``](`` is
+        removed (for example the ``[label](`` prefix of a markdown link),
+        then any trailing ``)`` and ``.`` characters are stripped.
+
+        Args:
+            urls (List[str]): The list of URLs to clean.
+
+        Returns:
+            List[str]: The cleaned URLs, in the same order as the input.
+        """
+        strip_markdown_prefix = re.compile(r'.*?\]\(').sub
+        return [strip_markdown_prefix('', raw).rstrip(').') for raw in urls]