feat: revert fetch_node

This commit is contained in:
Marco Perini 2024-05-10 15:11:54 +02:00
parent 63c0dd9372
commit 864aa91326
3 changed files with 9 additions and 26 deletions

View File

@ -8,9 +8,7 @@ from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader from langchain_community.document_loaders import PyPDFLoader
from .base_node import BaseNode from .base_node import BaseNode
from ..utils.cleanup_html import cleanup_html from ..utils.remover import remover
import requests
from bs4 import BeautifulSoup
class FetchNode(BaseNode): class FetchNode(BaseNode):
@ -36,7 +34,6 @@ class FetchNode(BaseNode):
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"): def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
super().__init__(node_name, "node", input, output, 1) super().__init__(node_name, "node", input, output, 1)
self.headless = True if node_config is None else node_config.get( self.headless = True if node_config is None else node_config.get(
"headless", True) "headless", True)
self.verbose = False if node_config is None else node_config.get( self.verbose = False if node_config is None else node_config.get(
@ -97,22 +94,10 @@ class FetchNode(BaseNode):
pass pass
elif not source.startswith("http"): elif not source.startswith("http"):
compressed_document = [Document(page_content=cleanup_html(source), metadata={ compressed_document = [Document(page_content=remover(source), metadata={
"source": "local_dir" "source": "local_dir"
})] })]
elif self.useSoup:
response = requests.get(source)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
link_urls = []
for link in links:
if 'href' in link.attrs:
link_urls.append(link['href'])
compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
else:
print(f"Failed to retrieve contents from the webpage at url: {url}")
else: else:
if self.node_config is not None and self.node_config.get("endpoint") is not None: if self.node_config is not None and self.node_config.get("endpoint") is not None:
@ -129,7 +114,7 @@ class FetchNode(BaseNode):
document = loader.load() document = loader.load()
compressed_document = [ compressed_document = [
Document(page_content=cleanup_html(str(document[0].page_content)))] Document(page_content=remover(str(document[0].page_content)))]
state.update({self.output[0]: compressed_document}) state.update({self.output[0]: compressed_document})
return state return state

View File

@ -6,3 +6,4 @@ from .convert_to_csv import convert_to_csv
from .convert_to_json import convert_to_json from .convert_to_json import convert_to_json
from .prettify_exec_info import prettify_exec_info from .prettify_exec_info import prettify_exec_info
from .proxy_rotation import proxy_generator from .proxy_rotation import proxy_generator
from .remover import remover

View File

@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
from minify_html import minify from minify_html import minify
def cleanup_html(html_content: str, urls: list = []) -> str: def remover(html_content: str) -> str:
""" """
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
@ -17,7 +17,7 @@ def cleanup_html(html_content: str, urls: list = []) -> str:
Example: Example:
>>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>" >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
>>> cleanup_html(html_content) >>> remover(html_content)
'Title: Example, Body: <body><p>Hello World!</p></body>' 'Title: Example, Body: <body><p>Hello World!</p></body>'
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized. This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
@ -35,12 +35,9 @@ def cleanup_html(html_content: str, urls: list = []) -> str:
# Body Extraction (if it exists) # Body Extraction (if it exists)
body_content = soup.find('body') body_content = soup.find('body')
urls_content = ""
if urls:
urls_content = f", URLs in page: {urls}"
if body_content: if body_content:
# Minify the HTML within the body tag # Minify the HTML within the body tag
minimized_body = minify(str(body_content)) minimized_body = minify(str(body_content))
return "Title: " + title + ", Body: " + minimized_body + urls_content return "Title: " + title + ", Body: " + minimized_body
return "Title: " + title + ", Body: No body content found" + urls_content return "Title: " + title + ", Body: No body content found"