docs: better logging

This commit is contained in:
Marco Perini 2024-06-13 18:13:47 +02:00
parent a6757aca57
commit 283b61fafc
5 changed files with 8 additions and 14 deletions

View File

@@ -21,7 +21,7 @@ graph_config = {
"api_key": openai_key,
"model": "gpt-3.5-turbo",
},
"verbose": False,
"verbose": True,
"headless": False,
}

View File

@@ -66,10 +66,9 @@ class SmartScraperGraph(AbstractGraph):
output=["doc", "link_urls", "img_urls"],
node_config={
"loader_kwargs": self.config.get("loader_kwargs", {}),
"headless": self.config.get("headless", True) # Ensure headless flag is passed
}
)
logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],

View File

@@ -51,6 +51,8 @@ class SmartScraperMultiGraph(AbstractGraph):
self.copy_config = copy(config)
else:
self.copy_config = deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)
@@ -70,6 +72,7 @@ class SmartScraperMultiGraph(AbstractGraph):
prompt="",
source="",
config=self.copy_config,
schema=self.copy_schema
)
# ************************************************

View File

@@ -131,7 +131,7 @@ class FetchNode(BaseNode):
pass
elif not source.startswith("http"):
self.logger.info(f"Fetching local HTML content from: {source}")
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
if not source.strip():
raise ValueError("No HTML body content found in the local source.")
title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
@@ -141,7 +141,7 @@ class FetchNode(BaseNode):
]
elif self.useSoup:
self.logger.info(f"Fetching HTML content using requests from: {source}")
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
response = requests.get(source)
if response.status_code == 200:
if not response.text.strip():
@@ -157,7 +157,7 @@ class FetchNode(BaseNode):
)
else:
self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
loader_kwargs = {}
if self.node_config is not None:

View File

@@ -5,7 +5,6 @@ from bs4 import BeautifulSoup
from minify_html import minify
from urllib.parse import urljoin
def cleanup_html(html_content: str, base_url: str) -> str:
"""
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
@@ -24,12 +23,6 @@ def cleanup_html(html_content: str, base_url: str) -> str:
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
"""
import logging
logging.basicConfig(level=logging.DEBUG)
# Add logging to capture the HTML content before parsing
logging.debug(f'HTML content before parsing: {html_content}')
soup = BeautifulSoup(html_content, 'html.parser')
# Title Extraction
@@ -62,6 +55,5 @@ def cleanup_html(html_content: str, base_url: str) -> str:
return title, minimized_body, link_urls, image_urls
else:
logging.error(f'No body content found in HTML: {html_content}')
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")