From 283b61fafcc805e7f866e1acf68ffd6581ace1a9 Mon Sep 17 00:00:00 2001
From: Marco Perini <perinim.98@gmail.com>
Date: Thu, 13 Jun 2024 18:13:47 +0200
Subject: [PATCH] docs: better logging

---
 examples/openai/smart_scraper_openai.py           | 2 +-
 scrapegraphai/graphs/smart_scraper_graph.py       | 3 +--
 scrapegraphai/graphs/smart_scraper_multi_graph.py | 3 +++
 scrapegraphai/nodes/fetch_node.py                 | 6 +++---
 scrapegraphai/utils/cleanup_html.py               | 8 --------
 5 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py
index dcee0972..e353fd9b 100644
--- a/examples/openai/smart_scraper_openai.py
+++ b/examples/openai/smart_scraper_openai.py
@@ -21,7 +21,7 @@ graph_config = {
         "api_key": openai_key,
         "model": "gpt-3.5-turbo",
     },
-    "verbose": False,
+    "verbose": True,
     "headless": False,
 }
 
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index 85b292c3..ad0b1df8 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -66,10 +66,9 @@ class SmartScraperGraph(AbstractGraph):
             output=["doc", "link_urls", "img_urls"],
             node_config={
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
-                "headless": self.config.get("headless", True)  # Ensure headless flag is passed
             }
         )
-        logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
+
         parse_node = ParseNode(
             input="doc",
             output=["parsed_doc"],
diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py
index 6c1093ef..70fd570a 100644
--- a/scrapegraphai/graphs/smart_scraper_multi_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py
@@ -51,6 +51,8 @@ class SmartScraperMultiGraph(AbstractGraph):
             self.copy_config = copy(config)
         else:
             self.copy_config = deepcopy(config)
+        
+        self.copy_schema = deepcopy(schema)
 
         super().__init__(prompt, config, source, schema)
 
@@ -70,6 +72,7 @@ class SmartScraperMultiGraph(AbstractGraph):
             prompt="",
             source="",
             config=self.copy_config,
+            schema=self.copy_schema
         )
 
         # ************************************************
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index dbdd9925..2ce060d1 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -131,7 +131,7 @@ class FetchNode(BaseNode):
             pass
 
         elif not source.startswith("http"):
-            self.logger.info(f"Fetching local HTML content from: {source}")
+            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
             if not source.strip():
                 raise ValueError("No HTML body content found in the local source.")
             title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
@@ -141,7 +141,7 @@ class FetchNode(BaseNode):
             ]
 
         elif self.useSoup:
-            self.logger.info(f"Fetching HTML content using requests from: {source}")
+            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
             response = requests.get(source)
             if response.status_code == 200:
                 if not response.text.strip():
@@ -157,7 +157,7 @@ class FetchNode(BaseNode):
                 )
 
         else:
-            self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
+            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
             loader_kwargs = {}
 
             if self.node_config is not None:
diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index d3b4dd48..3dac0efb 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -5,7 +5,6 @@ from bs4 import BeautifulSoup
 from minify_html import minify
 from urllib.parse import urljoin
 
-
 def cleanup_html(html_content: str, base_url: str) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
@@ -24,12 +23,6 @@ def cleanup_html(html_content: str, base_url: str) -> str:
     This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
     """
 
-    import logging
-    logging.basicConfig(level=logging.DEBUG)
-
-    # Add logging to capture the HTML content before parsing
-    logging.debug(f'HTML content before parsing: {html_content}')
-
     soup = BeautifulSoup(html_content, 'html.parser')
 
     # Title Extraction
@@ -62,6 +55,5 @@ def cleanup_html(html_content: str, base_url: str) -> str:
         return title, minimized_body, link_urls, image_urls
 
     else:
-        logging.error(f'No body content found in HTML: {html_content}')
         raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")