docs: better logging

2026-06-23 21:00:30 +08:00 · 2024-06-13 18:13:47 +02:00 · 2024-06-13 18:13:47 +02:00 · 283b61fafc
commit 283b61fafc
parent a6757aca57
5 changed files with 8 additions and 14 deletions
--- a/examples/openai/smart_scraper_openai.py
+++ b/examples/openai/smart_scraper_openai.py
@@ -21,7 +21,7 @@ graph_config = {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
-    "verbose": False,
+    "verbose": True,
    "headless": False,
 }

--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -66,10 +66,9 @@ class SmartScraperGraph(AbstractGraph):
            output=["doc", "link_urls", "img_urls"],
            node_config={
                "loader_kwargs": self.config.get("loader_kwargs", {}),
-                "headless": self.config.get("headless", True)  # Ensure headless flag is passed
            }
        )
-        logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
+
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
--- a/scrapegraphai/graphs/smart_scraper_multi_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py
@@ -51,6 +51,8 @@ class SmartScraperMultiGraph(AbstractGraph):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)
+        
+        self.copy_schema = deepcopy(schema)

        super().__init__(prompt, config, source, schema)

@@ -70,6 +72,7 @@ class SmartScraperMultiGraph(AbstractGraph):
            prompt="",
            source="",
            config=self.copy_config,
+            schema=self.copy_schema
        )

        # ************************************************
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -131,7 +131,7 @@ class FetchNode(BaseNode):
            pass

        elif not source.startswith("http"):
-            self.logger.info(f"Fetching local HTML content from: {source}")
+            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
            if not source.strip():
                raise ValueError("No HTML body content found in the local source.")
            title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
@@ -141,7 +141,7 @@ class FetchNode(BaseNode):
            ]

        elif self.useSoup:
-            self.logger.info(f"Fetching HTML content using requests from: {source}")
+            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
            response = requests.get(source)
            if response.status_code == 200:
                if not response.text.strip():
@@ -157,7 +157,7 @@ class FetchNode(BaseNode):
                )

        else:
-            self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
+            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
            loader_kwargs = {}

            if self.node_config is not None:
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -5,7 +5,6 @@ from bs4 import BeautifulSoup
 from minify_html import minify
 from urllib.parse import urljoin

-
 def cleanup_html(html_content: str, base_url: str) -> str:
    """
    Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
@@ -24,12 +23,6 @@ def cleanup_html(html_content: str, base_url: str) -> str:
    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
    """

-    import logging
-    logging.basicConfig(level=logging.DEBUG)
-
-    # Add logging to capture the HTML content before parsing
-    logging.debug(f'HTML content before parsing: {html_content}')
-
    soup = BeautifulSoup(html_content, 'html.parser')

    # Title Extraction
@@ -62,6 +55,5 @@ def cleanup_html(html_content: str, base_url: str) -> str:
        return title, minimized_body, link_urls, image_urls

    else:
-        logging.error(f'No body content found in HTML: {html_content}')
        raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")