Merge pull request #662 from LorenzoPaleari/580-omni-scraper-not-working

580 - Omni Scraper not working
This commit is contained in:
Marco Vinciguerra 2024-09-13 08:58:42 +02:00 committed by GitHub
commit da827a76fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,11 +1,14 @@
"""
ParseNode Module
"""
from typing import List, Optional
import re
from typing import List, Optional, Tuple
from urllib.parse import urljoin
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from .base_node import BaseNode
from ..utils.split_text_into_chunks import split_text_into_chunks
from ..helpers import default_filters
class ParseNode(BaseNode):
"""
@ -40,6 +43,9 @@ class ParseNode(BaseNode):
self.parse_html = (
True if node_config is None else node_config.get("parse_html", True)
)
self.parse_urls = (
False if node_config is None else node_config.get("parse_urls", False)
)
self.llm_model = node_config.get("llm_model")
self.chunk_size = node_config.get("chunk_size")
@ -66,16 +72,21 @@ class ParseNode(BaseNode):
input_data = [state[key] for key in input_keys]
docs_transformed = input_data[0]
source = input_data[1] if self.parse_urls else None
if self.parse_html:
docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
docs_transformed = docs_transformed[0]
link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
chunks = split_text_into_chunks(text=docs_transformed.page_content,
chunk_size=self.chunk_size-250, model=self.llm_model)
else:
docs_transformed = docs_transformed[0]
link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
chunk_size = self.chunk_size
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
@ -89,5 +100,57 @@ class ParseNode(BaseNode):
model=self.llm_model)
state.update({self.output[0]: chunks})
if self.parse_urls:
state.update({self.output[1]: link_urls})
state.update({self.output[2]: img_urls})
return state
def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
"""
Extracts URLs from the given text.
Args:
text (str): The text to extract URLs from.
Returns:
Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
"""
if not self.parse_urls:
return [], []
image_extensions = default_filters.filter_dict["img_exts"]
image_extension_seq = '|'.join(image_extensions).replace('.','')
url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
all_urls = url_pattern.findall(text)
all_urls = self._clean_urls(all_urls)
if not source.startswith("http"):
all_urls = [url for url in all_urls if url.startswith("http")]
else:
all_urls = [urljoin(source, url) for url in all_urls]
images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
links = [url for url in all_urls if url not in images]
return links, images
def _clean_urls(self, urls: List[str]) -> List[str]:
"""
Cleans the URLs extracted from the text.
Args:
urls (List[str]): The list of URLs to clean.
Returns:
List[str]: The cleaned URLs.
"""
cleaned_urls = []
for url in urls:
url = re.sub(r'.*?\]\(', '', url)
url = url.rstrip(').')
cleaned_urls.append(url)
return cleaned_urls