Mirror of https://github.com/VinciGit00/Scrapegraph-ai.git (synced 2026-06-23 21:00:30 +08:00)
docs: better logging
This commit is contained in:
parent a6757aca57
commit 283b61fafc
@@ -21,7 +21,7 @@ graph_config = {
|
||||
"api_key": openai_key,
|
||||
"model": "gpt-3.5-turbo",
|
||||
},
|
||||
"verbose": False,
|
||||
"verbose": True,
|
||||
"headless": False,
|
||||
}
|
||||
|
||||
|
||||
@@ -66,10 +66,9 @@ class SmartScraperGraph(AbstractGraph):
|
||||
output=["doc", "link_urls", "img_urls"],
|
||||
node_config={
|
||||
"loader_kwargs": self.config.get("loader_kwargs", {}),
|
||||
"headless": self.config.get("headless", True) # Ensure headless flag is passed
|
||||
}
|
||||
)
|
||||
logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
|
||||
|
||||
parse_node = ParseNode(
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
|
||||
@@ -51,6 +51,8 @@ class SmartScraperMultiGraph(AbstractGraph):
|
||||
self.copy_config = copy(config)
|
||||
else:
|
||||
self.copy_config = deepcopy(config)
|
||||
|
||||
self.copy_schema = deepcopy(schema)
|
||||
|
||||
super().__init__(prompt, config, source, schema)
|
||||
|
||||
@@ -70,6 +72,7 @@ class SmartScraperMultiGraph(AbstractGraph):
|
||||
prompt="",
|
||||
source="",
|
||||
config=self.copy_config,
|
||||
schema=self.copy_schema
|
||||
)
|
||||
|
||||
# ************************************************
|
||||
|
||||
@@ -131,7 +131,7 @@ class FetchNode(BaseNode):
|
||||
pass
|
||||
|
||||
elif not source.startswith("http"):
|
||||
self.logger.info(f"Fetching local HTML content from: {source}")
|
||||
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
||||
if not source.strip():
|
||||
raise ValueError("No HTML body content found in the local source.")
|
||||
title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
|
||||
@@ -141,7 +141,7 @@ class FetchNode(BaseNode):
|
||||
]
|
||||
|
||||
elif self.useSoup:
|
||||
self.logger.info(f"Fetching HTML content using requests from: {source}")
|
||||
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
||||
response = requests.get(source)
|
||||
if response.status_code == 200:
|
||||
if not response.text.strip():
|
||||
@@ -157,7 +157,7 @@ class FetchNode(BaseNode):
|
||||
)
|
||||
|
||||
else:
|
||||
self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
|
||||
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
||||
loader_kwargs = {}
|
||||
|
||||
if self.node_config is not None:
|
||||
|
||||
@@ -5,7 +5,6 @@ from bs4 import BeautifulSoup
|
||||
from minify_html import minify
|
||||
from urllib.parse import urljoin
|
||||
|
||||
|
||||
def cleanup_html(html_content: str, base_url: str) -> str:
|
||||
"""
|
||||
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
|
||||
@@ -24,12 +23,6 @@ def cleanup_html(html_content: str, base_url: str) -> str:
|
||||
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
|
||||
"""
|
||||
|
||||
import logging
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
# Add logging to capture the HTML content before parsing
|
||||
logging.debug(f'HTML content before parsing: {html_content}')
|
||||
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Title Extraction
|
||||
@@ -62,6 +55,5 @@ def cleanup_html(html_content: str, base_url: str) -> str:
|
||||
return title, minimized_body, link_urls, image_urls
|
||||
|
||||
else:
|
||||
logging.error(f'No body content found in HTML: {html_content}')
|
||||
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user