mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
feat: revert fetch_node
This commit is contained in:
parent
63c0dd9372
commit
864aa91326
@ -8,9 +8,7 @@ from langchain_community.document_loaders import AsyncChromiumLoader
|
|||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_community.document_loaders import PyPDFLoader
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
from .base_node import BaseNode
|
from .base_node import BaseNode
|
||||||
from ..utils.cleanup_html import cleanup_html
|
from ..utils.remover import remover
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
|
|
||||||
class FetchNode(BaseNode):
|
class FetchNode(BaseNode):
|
||||||
@ -36,7 +34,6 @@ class FetchNode(BaseNode):
|
|||||||
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
|
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
|
||||||
super().__init__(node_name, "node", input, output, 1)
|
super().__init__(node_name, "node", input, output, 1)
|
||||||
|
|
||||||
|
|
||||||
self.headless = True if node_config is None else node_config.get(
|
self.headless = True if node_config is None else node_config.get(
|
||||||
"headless", True)
|
"headless", True)
|
||||||
self.verbose = False if node_config is None else node_config.get(
|
self.verbose = False if node_config is None else node_config.get(
|
||||||
@ -97,22 +94,10 @@ class FetchNode(BaseNode):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
elif not source.startswith("http"):
|
elif not source.startswith("http"):
|
||||||
compressed_document = [Document(page_content=cleanup_html(source), metadata={
|
compressed_document = [Document(page_content=remover(source), metadata={
|
||||||
"source": "local_dir"
|
"source": "local_dir"
|
||||||
})]
|
})]
|
||||||
|
|
||||||
elif self.useSoup:
|
|
||||||
response = requests.get(source)
|
|
||||||
if response.status_code == 200:
|
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
|
||||||
links = soup.find_all('a')
|
|
||||||
link_urls = []
|
|
||||||
for link in links:
|
|
||||||
if 'href' in link.attrs:
|
|
||||||
link_urls.append(link['href'])
|
|
||||||
compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
|
|
||||||
else:
|
|
||||||
print(f"Failed to retrieve contents from the webpage at url: {url}")
|
|
||||||
else:
|
else:
|
||||||
if self.node_config is not None and self.node_config.get("endpoint") is not None:
|
if self.node_config is not None and self.node_config.get("endpoint") is not None:
|
||||||
|
|
||||||
@ -129,7 +114,7 @@ class FetchNode(BaseNode):
|
|||||||
|
|
||||||
document = loader.load()
|
document = loader.load()
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
Document(page_content=cleanup_html(str(document[0].page_content)))]
|
Document(page_content=remover(str(document[0].page_content)))]
|
||||||
|
|
||||||
state.update({self.output[0]: compressed_document})
|
state.update({self.output[0]: compressed_document})
|
||||||
return state
|
return state
|
||||||
@ -6,3 +6,4 @@ from .convert_to_csv import convert_to_csv
|
|||||||
from .convert_to_json import convert_to_json
|
from .convert_to_json import convert_to_json
|
||||||
from .prettify_exec_info import prettify_exec_info
|
from .prettify_exec_info import prettify_exec_info
|
||||||
from .proxy_rotation import proxy_generator
|
from .proxy_rotation import proxy_generator
|
||||||
|
from .remover import remover
|
||||||
|
|||||||
@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
|||||||
from minify_html import minify
|
from minify_html import minify
|
||||||
|
|
||||||
|
|
||||||
def cleanup_html(html_content: str, urls: list = []) -> str:
|
def remover(html_content: str) -> str:
|
||||||
"""
|
"""
|
||||||
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
|
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
|
||||||
|
|
||||||
@ -17,7 +17,7 @@ def cleanup_html(html_content: str, urls: list = []) -> str:
|
|||||||
|
|
||||||
Example:
|
Example:
|
||||||
>>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
|
>>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
|
||||||
>>> cleanup_html(html_content)
|
>>> remover(html_content)
|
||||||
'Title: Example, Body: <body><p>Hello World!</p></body>'
|
'Title: Example, Body: <body><p>Hello World!</p></body>'
|
||||||
|
|
||||||
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
|
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
|
||||||
@ -35,12 +35,9 @@ def cleanup_html(html_content: str, urls: list = []) -> str:
|
|||||||
|
|
||||||
# Body Extraction (if it exists)
|
# Body Extraction (if it exists)
|
||||||
body_content = soup.find('body')
|
body_content = soup.find('body')
|
||||||
urls_content = ""
|
|
||||||
if urls:
|
|
||||||
urls_content = f", URLs in page: {urls}"
|
|
||||||
if body_content:
|
if body_content:
|
||||||
# Minify the HTML within the body tag
|
# Minify the HTML within the body tag
|
||||||
minimized_body = minify(str(body_content))
|
minimized_body = minify(str(body_content))
|
||||||
return "Title: " + title + ", Body: " + minimized_body + urls_content
|
return "Title: " + title + ", Body: " + minimized_body
|
||||||
|
|
||||||
return "Title: " + title + ", Body: No body content found" + urls_content
|
return "Title: " + title + ", Body: No body content found"
|
||||||
Loading…
Reference in New Issue
Block a user