From 864aa91326c360992326e04811d272e55eac8355 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Fri, 10 May 2024 15:11:54 +0200 Subject: [PATCH] feat: revert fetch_node --- scrapegraphai/nodes/fetch_node.py | 23 ++++--------------- scrapegraphai/utils/__init__.py | 1 + .../utils/{cleanup_html.py => remover.py} | 11 ++++----- 3 files changed, 9 insertions(+), 26 deletions(-) rename scrapegraphai/utils/{cleanup_html.py => remover.py} (78%) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index eeb2d0b4..3eabc66f 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -8,9 +8,7 @@ from langchain_community.document_loaders import AsyncChromiumLoader from langchain_core.documents import Document from langchain_community.document_loaders import PyPDFLoader from .base_node import BaseNode -from ..utils.cleanup_html import cleanup_html -import requests -from bs4 import BeautifulSoup +from ..utils.remover import remover class FetchNode(BaseNode): @@ -36,7 +34,6 @@ class FetchNode(BaseNode): def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"): super().__init__(node_name, "node", input, output, 1) - self.headless = True if node_config is None else node_config.get( "headless", True) self.verbose = False if node_config is None else node_config.get( @@ -97,22 +94,10 @@ class FetchNode(BaseNode): pass elif not source.startswith("http"): - compressed_document = [Document(page_content=cleanup_html(source), metadata={ + compressed_document = [Document(page_content=remover(source), metadata={ "source": "local_dir" })] - elif self.useSoup: - response = requests.get(source) - if response.status_code == 200: - soup = BeautifulSoup(response.text, 'html.parser') - links = soup.find_all('a') - link_urls = [] - for link in links: - if 'href' in link.attrs: - link_urls.append(link['href']) - compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))] - else: - print(f"Failed to retrieve contents from the webpage at url: {url}") else: if self.node_config is not None and self.node_config.get("endpoint") is not None: @@ -129,7 +114,7 @@ class FetchNode(BaseNode): document = loader.load() compressed_document = [ - Document(page_content=cleanup_html(str(document[0].page_content)))] + Document(page_content=remover(str(document[0].page_content)))] state.update({self.output[0]: compressed_document}) - return state + return state \ No newline at end of file diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 0aee7839..218506f3 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -6,3 +6,4 @@ from .convert_to_csv import convert_to_csv from .convert_to_json import convert_to_json from .prettify_exec_info import prettify_exec_info from .proxy_rotation import proxy_generator +from .remover import remover diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/remover.py similarity index 78% rename from scrapegraphai/utils/cleanup_html.py rename to scrapegraphai/utils/remover.py index aab1db65..c5a0507b 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/remover.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup from minify_html import minify -def cleanup_html(html_content: str, urls: list = []) -> str: +def remover(html_content: str) -> str: """ Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. @@ -17,7 +17,7 @@ def cleanup_html(html_content: str, urls: list = []) -> str: Example: >>> html_content = "Example

Hello World!

" - >>> cleanup_html(html_content) + >>> remover(html_content) 'Title: Example, Body:

Hello World!

' This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized. @@ -35,12 +35,9 @@ def cleanup_html(html_content: str, urls: list = []) -> str: # Body Extraction (if it exists) body_content = soup.find('body') - urls_content = "" - if urls: - urls_content = f", URLs in page: {urls}" if body_content: # Minify the HTML within the body tag minimized_body = minify(str(body_content)) - return "Title: " + title + ", Body: " + minimized_body + urls_content + return "Title: " + title + ", Body: " + minimized_body - return "Title: " + title + ", Body: No body content found" + urls_content + return "Title: " + title + ", Body: No body content found" \ No newline at end of file