diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 1edefdbd..39463057 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -1,4 +1,4 @@ -""" +"""" FetchNode Module """ @@ -13,7 +13,7 @@ from langchain_core.documents import Document from ..docloaders import ChromiumLoader from .base_node import BaseNode from ..utils.cleanup_html import cleanup_html - +from ..utils.logging import get_logger class FetchNode(BaseNode): """ @@ -74,7 +74,8 @@ class FetchNode(BaseNode): necessary information to perform the operation is missing. """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("fetch node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) @@ -128,7 +129,7 @@ class FetchNode(BaseNode): cleanedup_html = cleanup_html(response.text, source) compressed_document = [Document(page_content=cleanedup_html)] else: - print(f"Failed to retrieve contents from the webpage at url: {source}") + logger.warning(f"Failed to retrieve contents from the webpage at url: {source}") else: loader_kwargs = {} @@ -144,4 +145,4 @@ class FetchNode(BaseNode): ] state.update({self.output[0]: compressed_document}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 53f7121b..f3f5b7ec 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -9,6 +9,7 @@ from tqdm import tqdm from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from ..utils.logging import get_logger # Imports from the library from .base_node import BaseNode @@ -72,7 +73,8 @@ class GenerateAnswerCSVNode(BaseNode): """ if 
self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("generate_answer csv node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index f554f8d9..beeac15a 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -10,6 +10,7 @@ from tqdm import tqdm from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from ..utils.logging import get_logger # Imports from the library from .base_node import BaseNode @@ -59,7 +60,8 @@ class GenerateAnswerNode(BaseNode): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("generate answer node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 31839d22..4a42df23 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -9,6 +9,7 @@ from tqdm import tqdm from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from ..utils.logging import get_logger # Imports from the library from .base_node import BaseNode @@ -72,7 +73,8 @@ class GenerateAnswerPDFNode(BaseNode): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("generate answer pdf node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = 
self.get_input_keys(state) diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 804635de..a6a8dc00 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -10,6 +10,7 @@ from tqdm import tqdm from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnableParallel +from ..utils.logging import get_logger # Imports from the library from .base_node import BaseNode @@ -63,7 +64,8 @@ class GenerateScraperNode(BaseNode): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("generate scraper node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index e970c285..b0c2b41d 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -6,7 +6,7 @@ from typing import List, Optional from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate from .base_node import BaseNode - +from ..utils.logging import get_logger class GetProbableTagsNode(BaseNode): """ @@ -25,11 +25,12 @@ class GetProbableTagsNode(BaseNode): node_name (str): The unique identifier name for the node, defaulting to "GetProbableTags". 
""" - def __init__(self, input: str, output: List[str], model_config: dict, + def __init__(self, input: str, output: List[str], node_config: dict, node_name: str = "GetProbableTags"): - super().__init__(node_name, "node", input, output, 2, model_config) + super().__init__(node_name, "node", input, output, 2, node_config) - self.llm_model = model_config["llm_model"] + self.llm_model = node_config["llm_model"] + self.verbose = False if node_config is None else node_config.get("verbose", False) def execute(self, state: dict) -> dict: """ @@ -49,7 +50,9 @@ class GetProbableTagsNode(BaseNode): necessary information for generating tag predictions is missing. """ - print(f"--- Executing {self.node_name} Node ---") + if self.verbose: + logger = get_logger("get probable tags node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index 8a71319a..b6c7690e 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -5,7 +5,7 @@ GraphIterator Module import asyncio import copy from typing import List, Optional - +from ..utils.logging import get_logger from tqdm.asyncio import tqdm from .base_node import BaseNode diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index 27f09016..07ef3be7 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -4,6 +4,7 @@ ImageToTextNode Module from typing import List, Optional from .base_node import BaseNode +from ..utils.logging import get_logger class ImageToTextNode(BaseNode): @@ -42,7 +43,8 @@ class ImageToTextNode(BaseNode): """ if self.verbose: - print("---GENERATING TEXT FROM IMAGE---") + logger = get_logger("image to text node") + logger.info(f"--- Executing {self.node_name} Node ---") 
input_keys = self.get_input_keys(state) input_data = [state[key] for key in input_keys] diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index 63ed6afa..da115005 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -5,7 +5,7 @@ MergeAnswersNode Module # Imports from standard library from typing import List, Optional from tqdm import tqdm - +from ..utils.logging import get_logger # Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -54,7 +54,8 @@ class MergeAnswersNode(BaseNode): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("fetch node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 2cd7eb33..436cddc4 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -6,7 +6,7 @@ from typing import List, Optional from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.document_transformers import Html2TextTransformer from .base_node import BaseNode - +from ..utils.logging import get_logger class ParseNode(BaseNode): """ @@ -49,7 +49,8 @@ class ParseNode(BaseNode): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("fetch node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 27d97b6e..fdcdd8e8 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -8,6 +8,7 @@ from langchain.retrievers import 
ContextualCompressionRetriever from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS +from ..utils.logging import get_logger from .base_node import BaseNode @@ -55,9 +56,10 @@ class RAGNode(BaseNode): KeyError: If the input keys are not found in the state, indicating that the necessary information for compressing the content is missing. """ + logger = get_logger("rag node") if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) @@ -80,7 +82,7 @@ class RAGNode(BaseNode): chunked_docs.append(doc) if self.verbose: - print("--- (updated chunks metadata) ---") + logger.info("--- (updated chunks metadata) ---") # check if embedder_model is provided, if not use llm_model self.embedder_model = self.embedder_model if self.embedder_model else self.llm_model diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 7aea6cae..ab0c7919 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -9,7 +9,7 @@ from langchain.prompts import PromptTemplate from langchain.output_parsers import CommaSeparatedListOutputParser from .base_node import BaseNode from ..helpers import robots_dictionary - +from ..utils.logging import get_logger class RobotsNode(BaseNode): """ @@ -61,9 +61,10 @@ class RobotsNode(BaseNode): ValueError: If the website is not scrapeable based on the robots.txt file and scraping is not enforced. 
""" + logger = get_logger("robots node") if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) @@ -121,17 +122,17 @@ class RobotsNode(BaseNode): if "no" in is_scrapable: if self.verbose: - print("\033[31m(Scraping this website is not allowed)\033[0m") + logger.warning("\033[31m(Scraping this website is not allowed)\033[0m") if not self.force_scraping: raise ValueError( 'The website you selected is not scrapable') else: if self.verbose: - print("\033[33m(WARNING: Scraping this website is not allowed but you decided to force it)\033[0m") + logger.warning("\033[33m(WARNING: Scraping this website is not allowed but you decided to force it)\033[0m") else: if self.verbose: - print("\033[32m(Scraping this website is allowed)\033[0m") + logger.warning("\033[32m(Scraping this website is allowed)\033[0m") state.update({self.output[0]: is_scrapable}) return state diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 87f8dcb2..e2443a25 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -7,7 +7,7 @@ from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate from ..utils.research_web import search_on_web from .base_node import BaseNode - +from ..utils.logging import get_logger class SearchInternetNode(BaseNode): """ @@ -54,9 +54,10 @@ class SearchInternetNode(BaseNode): KeyError: If the input keys are not found in the state, indicating that the necessary information for generating the answer is missing. 
""" + logger = get_logger("search interne node") if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger.info(f"--- Executing {self.node_name} Node ---") input_keys = self.get_input_keys(state) @@ -88,7 +89,8 @@ class SearchInternetNode(BaseNode): search_query = search_answer.invoke({"user_prompt": user_prompt})[0] if self.verbose: - print(f"Search Query: {search_query}") + logger.info(f"Search Query: {search_query}") + answer = search_on_web( query=search_query, max_results=self.max_results) diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index bf64b5d9..93c60e4a 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -5,7 +5,7 @@ SearchLinkNode Module # Imports from standard library from typing import List, Optional from tqdm import tqdm - +from ..utils.logging import get_logger # Imports from Langchain from langchain.prompts import PromptTemplate @@ -59,7 +59,8 @@ class SearchLinkNode(BaseNode): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("search link node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input expression input_keys = self.get_input_keys(state) diff --git a/scrapegraphai/nodes/text_to_speech_node.py b/scrapegraphai/nodes/text_to_speech_node.py index d9fe7ca4..06ed8d5f 100644 --- a/scrapegraphai/nodes/text_to_speech_node.py +++ b/scrapegraphai/nodes/text_to_speech_node.py @@ -4,7 +4,7 @@ TextToSpeechNode Module from typing import List, Optional from .base_node import BaseNode - +from ..utils.logging import get_logger class TextToSpeechNode(BaseNode): """ @@ -45,7 +45,8 @@ class TextToSpeechNode(BaseNode): """ if self.verbose: - print(f"--- Executing {self.node_name} Node ---") + logger = get_logger("text to speach node") + logger.info(f"--- Executing {self.node_name} Node ---") # Interpret input keys based on the provided input 
"""A centralized logging system for any library

source code inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/utils/logging.py
"""
import logging
import os
import sys
import threading
from functools import lru_cache
from typing import Optional

# Restrict what ``from .logging import *`` exports. Without this list a
# star-import would also re-export the stdlib modules imported above
# (including ``logging`` itself) into the importing namespace.
__all__ = [
    "get_logger",
    "get_verbosity",
    "set_verbosity",
    "set_verbosity_debug",
    "set_verbosity_info",
    "set_verbosity_warning",
    "set_verbosity_error",
    "set_verbosity_fatal",
    "set_handler",
    "set_default_handler",
    "unset_handler",
    "unset_default_handler",
    "set_propagation",
    "unset_propagation",
    "set_formatting",
    "unset_formatting",
    "warning_once",
]

# Top-level package name of this module; used as the library root logger name.
_library_name = __name__.split(".", maxsplit=1)[0]

# Handler installed lazily on the library root logger (None until first use).
_default_handler = None
_default_logging_level = logging.WARNING

# Guards the one-time installation of the default handler.
_semaphore = threading.Lock()


def _get_library_root_logger() -> logging.Logger:
    """Return the logger named after the library's top-level package."""
    return logging.getLogger(_library_name)


def _set_library_root_logger() -> None:
    """Install the default stderr handler on the library root logger once.

    Subsequent calls are no-ops. The root logger is set to the default
    level (WARNING) and propagation to the Python root logger is disabled.
    """
    global _default_handler

    with _semaphore:
        if _default_handler:
            return

        # https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176
        # The fallback must happen BEFORE the handler is created, otherwise
        # the handler captures ``None`` as its stream.
        if sys.stderr is None:
            # Deliberately left open: it stands in for sys.stderr from here on.
            sys.stderr = open(os.devnull, "w")

        handler = logging.StreamHandler()  # sys.stderr as stream
        handler.flush = sys.stderr.flush

        library_root_logger = _get_library_root_logger()
        library_root_logger.addHandler(handler)
        library_root_logger.setLevel(_default_logging_level)
        library_root_logger.propagate = False

        _default_handler = handler


def get_logger(name: Optional[str] = None) -> logging.Logger:
    """Return a logger that lives under the library root logger.

    Args:
        name: Logger name. ``None`` (or an empty string) returns the library
            root logger itself. A name that is not already inside the
            library namespace is prefixed with ``"<library>."`` so that the
            verbosity, handlers and formatting configured through this
            module apply to it.

    Returns:
        The requested ``logging.Logger``.
    """
    _set_library_root_logger()

    if not name or name == _library_name:
        return _get_library_root_logger()

    # A free-form name such as "fetch node" would otherwise be a sibling of
    # the library root logger and ignore everything configured on it.
    if not name.startswith(_library_name + "."):
        name = f"{_library_name}.{name}"
    return logging.getLogger(name)


def get_verbosity() -> int:
    """Return the effective level of the library root logger."""
    _set_library_root_logger()
    return _get_library_root_logger().getEffectiveLevel()


def set_verbosity(verbosity: int) -> None:
    """Set the level of the library root logger to ``verbosity``."""
    _set_library_root_logger()
    _get_library_root_logger().setLevel(verbosity)


def set_verbosity_debug() -> None:
    """Set the library verbosity to ``logging.DEBUG``."""
    set_verbosity(logging.DEBUG)


def set_verbosity_info() -> None:
    """Set the library verbosity to ``logging.INFO``."""
    set_verbosity(logging.INFO)


def set_verbosity_warning() -> None:
    """Set the library verbosity to ``logging.WARNING``."""
    set_verbosity(logging.WARNING)


def set_verbosity_error() -> None:
    """Set the library verbosity to ``logging.ERROR``."""
    set_verbosity(logging.ERROR)


def set_verbosity_fatal() -> None:
    """Set the library verbosity to ``logging.FATAL``."""
    set_verbosity(logging.FATAL)


def set_handler(handler: logging.Handler) -> None:
    """Attach ``handler`` to the library root logger."""
    _set_library_root_logger()

    assert handler is not None

    _get_library_root_logger().addHandler(handler)


def set_default_handler() -> None:
    """Attach the default stderr handler to the library root logger."""
    # Initialise first: ``_default_handler`` is None until the lazy setup has
    # run, and reading it earlier would hand ``None`` to ``set_handler``.
    _set_library_root_logger()
    set_handler(_default_handler)


def unset_handler(handler: logging.Handler) -> None:
    """Detach ``handler`` from the library root logger."""
    _set_library_root_logger()

    assert handler is not None

    _get_library_root_logger().removeHandler(handler)


def unset_default_handler() -> None:
    """Detach the default stderr handler from the library root logger."""
    # Same ordering concern as in ``set_default_handler``.
    _set_library_root_logger()
    unset_handler(_default_handler)


def set_propagation() -> None:
    """Let library log records propagate to the Python root logger."""
    # Initialise first, otherwise the later lazy setup would reset
    # ``propagate`` back to False.
    _set_library_root_logger()
    _get_library_root_logger().propagate = True


def unset_propagation() -> None:
    """Stop library log records from propagating to the Python root logger."""
    _set_library_root_logger()
    _get_library_root_logger().propagate = False


def set_formatting() -> None:
    """sets formatting for all handlers bound to the root logger

    ```
    [levelname|filename|line number] time >> message
    ```
    """
    # Initialise first so the default handler exists and receives the formatter.
    _set_library_root_logger()

    formatter = logging.Formatter(
        "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s"
    )

    for handler in _get_library_root_logger().handlers:
        handler.setFormatter(formatter)


def unset_formatting() -> None:
    """Remove the formatter from every handler of the library root logger."""
    _set_library_root_logger()

    for handler in _get_library_root_logger().handlers:
        handler.setFormatter(None)


@lru_cache(None)
def warning_once(self, *args, **kwargs):
    """emits warning logs with the same message only once

    The cache key is the logger together with all arguments, so every
    argument must be hashable. The cache is unbounded.
    """
    self.warning(*args, **kwargs)


# Expose the helper as a method on every ``logging.Logger`` instance.
logging.Logger.warning_once = warning_once