From 283b61fafcc805e7f866e1acf68ffd6581ace1a9 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Thu, 13 Jun 2024 18:13:47 +0200 Subject: [PATCH] docs: better logging --- examples/openai/smart_scraper_openai.py | 2 +- scrapegraphai/graphs/smart_scraper_graph.py | 3 +-- scrapegraphai/graphs/smart_scraper_multi_graph.py | 3 +++ scrapegraphai/nodes/fetch_node.py | 6 +++--- scrapegraphai/utils/cleanup_html.py | 8 -------- 5 files changed, 8 insertions(+), 14 deletions(-) diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py index dcee0972..e353fd9b 100644 --- a/examples/openai/smart_scraper_openai.py +++ b/examples/openai/smart_scraper_openai.py @@ -21,7 +21,7 @@ graph_config = { "api_key": openai_key, "model": "gpt-3.5-turbo", }, - "verbose": False, + "verbose": True, "headless": False, } diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 85b292c3..ad0b1df8 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -66,10 +66,9 @@ class SmartScraperGraph(AbstractGraph): output=["doc", "link_urls", "img_urls"], node_config={ "loader_kwargs": self.config.get("loader_kwargs", {}), - "headless": self.config.get("headless", True) # Ensure headless flag is passed } ) - logging.info("FetchNode configured with headless: %s", self.config.get("headless", True)) + parse_node = ParseNode( input="doc", output=["parsed_doc"], diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 6c1093ef..70fd570a 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -51,6 +51,8 @@ class SmartScraperMultiGraph(AbstractGraph): self.copy_config = copy(config) else: self.copy_config = deepcopy(config) + + self.copy_schema = deepcopy(schema) super().__init__(prompt, config, source, schema) @@ -70,6 +72,7 @@ class SmartScraperMultiGraph(AbstractGraph): prompt="", source="", config=self.copy_config, + schema=self.copy_schema ) # ************************************************ diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index dbdd9925..2ce060d1 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -131,7 +131,7 @@ class FetchNode(BaseNode): pass elif not source.startswith("http"): - self.logger.info(f"Fetching local HTML content from: {source}") + self.logger.info(f"--- (Fetching HTML from: {source}) ---") if not source.strip(): raise ValueError("No HTML body content found in the local source.") title, minimized_body, link_urls, image_urls = cleanup_html(source, source) @@ -141,7 +141,7 @@ class FetchNode(BaseNode): ] elif self.useSoup: - self.logger.info(f"Fetching HTML content using requests from: {source}") + self.logger.info(f"--- (Fetching HTML from: {source}) ---") response = requests.get(source) if response.status_code == 200: if not response.text.strip(): @@ -157,7 +157,7 @@ class FetchNode(BaseNode): ) else: - self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}") + self.logger.info(f"--- (Fetching HTML from: {source}) ---") loader_kwargs = {} if self.node_config is not None: diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index d3b4dd48..3dac0efb 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -5,7 +5,6 @@ from bs4 import BeautifulSoup from minify_html import minify from urllib.parse import urljoin - def cleanup_html(html_content: str, base_url: str) -> str: """ Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. @@ -24,12 +23,6 @@ def cleanup_html(html_content: str, base_url: str) -> str: This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized. """ - import logging - logging.basicConfig(level=logging.DEBUG) - - # Add logging to capture the HTML content before parsing - logging.debug(f'HTML content before parsing: {html_content}') - soup = BeautifulSoup(html_content, 'html.parser') # Title Extraction @@ -62,6 +55,5 @@ def cleanup_html(html_content: str, base_url: str) -> str: return title, minimized_body, link_urls, image_urls else: - logging.error(f'No body content found in HTML: {html_content}') raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")