From f8ce3d5916eab926275d59d4d48b0d89ec9cd43f Mon Sep 17 00:00:00 2001 From: mayurdb Date: Fri, 10 May 2024 13:28:53 +0530 Subject: [PATCH] fix: Augment the information getting fetched from a webpage --- scrapegraphai/nodes/fetch_node.py | 21 ++++++++++++++++--- .../utils/{remover.py => cleanup_html.py} | 11 ++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) rename scrapegraphai/utils/{remover.py => cleanup_html.py} (78%) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index bcd207f3..2667f0be 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -6,7 +6,9 @@ from typing import List, Optional from langchain_community.document_loaders import AsyncChromiumLoader from langchain_core.documents import Document from .base_node import BaseNode -from ..utils.remover import remover +from ..utils.cleanup_html import cleanup_html +import requests +from bs4 import BeautifulSoup class FetchNode(BaseNode): @@ -32,6 +34,7 @@ class FetchNode(BaseNode): def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"): super().__init__(node_name, "node", input, output, 1) + self.useSoup = True if node_config is None else node_config.get("useSoup", True) self.headless = True if node_config is None else node_config.get("headless", True) self.verbose = False if node_config is None else node_config.get("verbose", False) @@ -67,10 +70,22 @@ class FetchNode(BaseNode): })] # if it is a local directory elif not source.startswith("http"): - compressed_document = [Document(page_content=remover(source), metadata={ + compressed_document = [Document(page_content=cleanup_html(source), metadata={ "source": "local_dir" })] + elif self.useSoup: + response = requests.get(source) + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + links = soup.find_all('a') + link_urls = [] + for link in links: + if 'href' in link.attrs: + link_urls.append(link['href']) + compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))] + else: + print(f"Failed to retrieve contents from the webpage at url: {url}") else: if self.node_config is not None and self.node_config.get("endpoint") is not None: @@ -87,7 +102,7 @@ class FetchNode(BaseNode): document = loader.load() compressed_document = [ - Document(page_content=remover(str(document[0].page_content)))] + Document(page_content=cleanup_html(str(document[0].page_content)))] state.update({self.output[0]: compressed_document}) return state diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/cleanup_html.py similarity index 78% rename from scrapegraphai/utils/remover.py rename to scrapegraphai/utils/cleanup_html.py index 5e203249..aab1db65 100644 --- a/scrapegraphai/utils/remover.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup from minify_html import minify -def remover(html_content: str) -> str: +def cleanup_html(html_content: str, urls: list = []) -> str: """ Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. @@ -17,7 +17,7 @@ def remover(html_content: str) -> str: Example: >>> html_content = "Example

Hello World!

" - >>> remover(html_content) + >>> cleanup_html(html_content) 'Title: Example, Body:

Hello World!

' This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized. @@ -35,9 +35,12 @@ def remover(html_content: str) -> str: # Body Extraction (if it exists) body_content = soup.find('body') + urls_content = "" + if urls: + urls_content = f", URLs in page: {urls}" if body_content: # Minify the HTML within the body tag minimized_body = minify(str(body_content)) - return "Title: " + title + ", Body: " + minimized_body + return "Title: " + title + ", Body: " + minimized_body + urls_content - return "Title: " + title + ", Body: No body content found" + return "Title: " + title + ", Body: No body content found" + urls_content