fix: Augment the information fetched from a webpage

This commit is contained in:
mayurdb 2024-05-10 13:28:53 +05:30
parent 0ca52b1da6
commit f8ce3d5916
2 changed files with 25 additions and 7 deletions

View File

@ -6,7 +6,9 @@ from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document from langchain_core.documents import Document
from .base_node import BaseNode from .base_node import BaseNode
from ..utils.remover import remover from ..utils.cleanup_html import cleanup_html
import requests
from bs4 import BeautifulSoup
class FetchNode(BaseNode): class FetchNode(BaseNode):
@ -32,6 +34,7 @@ class FetchNode(BaseNode):
def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"): def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"):
super().__init__(node_name, "node", input, output, 1) super().__init__(node_name, "node", input, output, 1)
self.useSoup = True if node_config is None else node_config.get("useSoup", True)
self.headless = True if node_config is None else node_config.get("headless", True) self.headless = True if node_config is None else node_config.get("headless", True)
self.verbose = False if node_config is None else node_config.get("verbose", False) self.verbose = False if node_config is None else node_config.get("verbose", False)
@ -67,10 +70,22 @@ class FetchNode(BaseNode):
})] })]
# if it is a local directory # if it is a local directory
elif not source.startswith("http"): elif not source.startswith("http"):
compressed_document = [Document(page_content=remover(source), metadata={ compressed_document = [Document(page_content=cleanup_html(source), metadata={
"source": "local_dir" "source": "local_dir"
})] })]
elif self.useSoup:
response = requests.get(source)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
link_urls = []
for link in links:
if 'href' in link.attrs:
link_urls.append(link['href'])
compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
else:
print(f"Failed to retrieve contents from the webpage at url: {url}")
else: else:
if self.node_config is not None and self.node_config.get("endpoint") is not None: if self.node_config is not None and self.node_config.get("endpoint") is not None:
@ -87,7 +102,7 @@ class FetchNode(BaseNode):
document = loader.load() document = loader.load()
compressed_document = [ compressed_document = [
Document(page_content=remover(str(document[0].page_content)))] Document(page_content=cleanup_html(str(document[0].page_content)))]
state.update({self.output[0]: compressed_document}) state.update({self.output[0]: compressed_document})
return state return state

View File

@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
from minify_html import minify from minify_html import minify
def remover(html_content: str) -> str: def cleanup_html(html_content: str, urls: list = []) -> str:
""" """
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
@ -17,7 +17,7 @@ def remover(html_content: str) -> str:
Example: Example:
>>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>" >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
>>> remover(html_content) >>> cleanup_html(html_content)
'Title: Example, Body: <body><p>Hello World!</p></body>' 'Title: Example, Body: <body><p>Hello World!</p></body>'
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized. This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
@ -35,9 +35,12 @@ def remover(html_content: str) -> str:
# Body Extraction (if it exists) # Body Extraction (if it exists)
body_content = soup.find('body') body_content = soup.find('body')
urls_content = ""
if urls:
urls_content = f", URLs in page: {urls}"
if body_content: if body_content:
# Minify the HTML within the body tag # Minify the HTML within the body tag
minimized_body = minify(str(body_content)) minimized_body = minify(str(body_content))
return "Title: " + title + ", Body: " + minimized_body return "Title: " + title + ", Body: " + minimized_body + urls_content
return "Title: " + title + ", Body: No body content found" return "Title: " + title + ", Body: No body content found" + urls_content