From c3d1b7c200e6fd065bd5aea79b90ca3db4d42b16 Mon Sep 17 00:00:00 2001 From: Lorenzo Paleari <100212108+LorenzoPaleari@users.noreply.github.com> Date: Fri, 13 Sep 2024 01:47:39 +0200 Subject: [PATCH] fix: OmniScraerGraph working. Added url scraping capability to ParseNode --- scrapegraphai/nodes/parse_node.py | 65 ++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 240daf1f..1f919926 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -1,11 +1,14 @@ """ ParseNode Module """ -from typing import List, Optional +import re +from typing import List, Optional, Tuple +from urllib.parse import urljoin from langchain_community.document_transformers import Html2TextTransformer from langchain_core.documents import Document from .base_node import BaseNode from ..utils.split_text_into_chunks import split_text_into_chunks +from ..helpers import default_filters class ParseNode(BaseNode): """ @@ -40,6 +43,9 @@ class ParseNode(BaseNode): self.parse_html = ( True if node_config is None else node_config.get("parse_html", True) ) + self.parse_urls = ( + False if node_config is None else node_config.get("parse_urls", False) + ) self.llm_model = node_config.get("llm_model") self.chunk_size = node_config.get("chunk_size") @@ -66,16 +72,21 @@ class ParseNode(BaseNode): input_data = [state[key] for key in input_keys] docs_transformed = input_data[0] + source = input_data[1] if self.parse_urls else None if self.parse_html: docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0]) docs_transformed = docs_transformed[0] + link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) + chunks = split_text_into_chunks(text=docs_transformed.page_content, chunk_size=self.chunk_size-250, model=self.llm_model) else: docs_transformed = docs_transformed[0] + link_urls, img_urls = 
def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
    """
    Extracts link URLs and image URLs from the given text.

    Args:
        text (str): The text to extract URLs from.
        source (str): Where the text came from. When it starts with "http",
            every candidate is resolved against it with ``urljoin``; otherwise
            (including when it is empty or None) only candidates that already
            start with "http" are kept.

    Returns:
        Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
        Both lists are empty when ``self.parse_urls`` is falsy.
    """
    if not self.parse_urls:
        return [], []

    image_extensions = default_filters.filter_dict["img_exts"]
    # The extensions are joined into a regex alternation with their dots removed,
    # because the pattern itself supplies the literal dot before the alternation.
    image_extension_seq = '|'.join(image_extensions).replace('.', '')
    url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')

    all_urls = self._clean_urls(url_pattern.findall(text))

    if source and source.startswith("http"):
        # Remote source: relative candidates can be resolved against it.
        all_urls = [urljoin(source, url) for url in all_urls]
    else:
        # Local or unknown source: there is no base to resolve relative candidates against.
        all_urls = [url for url in all_urls if url.startswith("http")]

    # str.endswith accepts a tuple, so one call checks every extension.
    image_suffixes = tuple(image_extensions)
    images = [url for url in all_urls if url.endswith(image_suffixes)]
    links = [url for url in all_urls if not url.endswith(image_suffixes)]

    return links, images


def _clean_urls(self, urls: List[str]) -> List[str]:
    """
    Cleans the URLs extracted from the text.

    Removes everything up to and including a markdown "](" marker, then strips
    any trailing run of ")" and "." characters. Candidates that end up empty
    are dropped.

    Args:
        urls (List[str]): The list of URLs to clean.

    Returns:
        List[str]: The cleaned, non-empty URLs in their original order.
    """
    cleaned_urls = []
    for url in urls:
        # "[label](target" -> "target"
        url = re.sub(r'.*?\]\(', '', url)
        url = url.rstrip(').')

        # An empty string would later be resolved by urljoin to the source page itself.
        if url:
            cleaned_urls.append(url)

    return cleaned_urls