From db54d694334209f047c950e2f6ac2c02e2da1d39 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 4 Oct 2024 09:54:54 +0200 Subject: [PATCH] refactoring of code for pylint integration --- scrapegraphai/nodes/description_node.py | 1 - scrapegraphai/nodes/fetch_node_level_k.py | 180 ++++++++++++---------- scrapegraphai/nodes/parse_node_depth_k.py | 17 +- 3 files changed, 108 insertions(+), 90 deletions(-) diff --git a/scrapegraphai/nodes/description_node.py b/scrapegraphai/nodes/description_node.py index 60c56cec..4201a61d 100644 --- a/scrapegraphai/nodes/description_node.py +++ b/scrapegraphai/nodes/description_node.py @@ -34,7 +34,6 @@ class DescriptionNode(BaseNode): node_name: str = "DESCRIPTION", ): super().__init__(node_name, "node", input, output, 2, node_config) - self.llm_model = node_config["llm_model"] self.verbose = ( False if node_config is None else node_config.get("verbose", False) diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py index 5cdd6571..d321b33c 100644 --- a/scrapegraphai/nodes/fetch_node_level_k.py +++ b/scrapegraphai/nodes/fetch_node_level_k.py @@ -1,6 +1,3 @@ -""" -FetchNodeLevelK Module -""" from typing import List, Optional from .base_node import BaseNode from ..docloaders import ChromiumLoader @@ -18,14 +15,21 @@ class FetchNodeLevelK(BaseNode): (with proxy protection). Attributes: - llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An optional model for embedding the fetched content. verbose (bool): A flag indicating whether to show print statements during execution. + cache_path (str): Path to cache fetched content. + headless (bool): Whether to run the Chromium browser in headless mode. + loader_kwargs (dict): Additional arguments for the content loader. + browser_base (dict): Optional configuration for the browser base API. + depth (int): Maximum depth of hyperlink graph traversal. + only_inside_links (bool): Whether to fetch only internal links. + min_input_len (int): Minimum required length of input data. Args: input (str): Boolean expression defining the input keys needed from the state. output (List[str]): List of output keys to be updated in the state. node_config (dict): Additional configuration for the node. - node_name (str): The unique identifier name for the node, defaulting to "Parse". + node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK". """ def __init__( @@ -35,81 +39,68 @@ class FetchNodeLevelK(BaseNode): node_config: Optional[dict] = None, node_name: str = "FetchLevelK", ): + """ + Initializes the FetchNodeLevelK instance. + + Args: + input (str): Boolean expression defining the input keys needed from the state. + output (List[str]): List of output keys to be updated in the state. + node_config (Optional[dict]): Additional configuration for the node. + node_name (str): The name of the node (default is "FetchLevelK"). + """ super().__init__(node_name, "node", input, output, 2, node_config) - + self.embedder_model = node_config.get("embedder_model", None) - - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) - + self.verbose = node_config.get("verbose", False) if node_config else False self.cache_path = node_config.get("cache_path", False) - - self.headless = ( - True if node_config is None else node_config.get("headless", True) - ) - - self.loader_kwargs = ( - {} if node_config is None else node_config.get("loader_kwargs", {}) - ) - - self.browser_base = ( - None if node_config is None else node_config.get("browser_base", None) - ) - - self.depth = ( - 1 if node_config is None else node_config.get("depth", 1) - ) - - self.only_inside_links = ( - False if node_config is None else node_config.get("only_inside_links", False) - ) - + self.headless = node_config.get("headless", True) if node_config else True + self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {} + self.browser_base = node_config.get("browser_base", None) + self.depth = node_config.get("depth", 1) if node_config else 1 + self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False self.min_input_len = 1 def execute(self, state: dict) -> dict: """ - Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links - and update the graph's state with the content. + Executes the node's logic to fetch the HTML content of a specified URL and its sub-links + recursively, then updates the graph's state with the fetched content. Args: - state (dict): The current state of the graph. The input keys will be used - to fetch the correct data types from the state. + state (dict): The current state of the graph. Returns: dict: The updated state with a new output key containing the fetched HTML content. Raises: - KeyError: If the input key is not found in the state, indicating that the - necessary information to perform the operation is missing. + KeyError: If the input key is not found in the state. """ - self.logger.info(f"--- Executing {self.node_name} Node ---") - - # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - source = input_data[0] - - documents = [{"source": source}] - - loader_kwargs = {} - if self.node_config is not None: - loader_kwargs = self.node_config.get("loader_kwargs", {}) - + documents = [{"source": source}] + loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {} + for _ in range(self.depth): documents = self.obtain_content(documents, loader_kwargs) - + filtered_documents = [doc for doc in documents if 'document' in doc] - state.update({self.output[0]: filtered_documents}) - return state - + def fetch_content(self, source: str, loader_kwargs) -> Optional[str]: + """ + Fetches the HTML content of a given source URL. + + Args: + source (str): The URL to fetch content from. + loader_kwargs (dict): Additional arguments for the content loader. + + Returns: + Optional[str]: The fetched HTML content or None if fetching failed. + """ self.logger.info(f"--- (Fetching HTML from: {source}) ---") if self.browser_base is not None: @@ -119,26 +110,40 @@ class FetchNodeLevelK(BaseNode): raise ImportError("""The browserbase module is not installed. Please install it using `pip install browserbase`.""") - data = browser_base_fetch(self.browser_base.get("api_key"), - self.browser_base.get("project_id"), [source]) - - document = [Document(page_content=content, - metadata={"source": source}) for content in data] - + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) + document = [Document(page_content=content, metadata={"source": source}) for content in data] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) - document = loader.load() - return document - + def extract_links(self, html_content: str) -> list: + """ + Extracts all hyperlinks from the HTML content. + + Args: + html_content (str): The HTML content to extract links from. + + Returns: + list: A list of extracted hyperlinks. + """ soup = BeautifulSoup(html_content, 'html.parser') links = [link['href'] for link in soup.find_all('a', href=True)] self.logger.info(f"Extracted {len(links)} links.") return links - + def get_full_links(self, base_url: str, links: list) -> list: + """ + Converts relative URLs to full URLs based on the base URL. + + Args: + base_url (str): The base URL for resolving relative links. + links (list): A list of links to convert. + + Returns: + list: A list of full URLs. + """ full_links = [] for link in links: if self.only_inside_links and link.startswith("http"): @@ -146,36 +151,55 @@ class FetchNodeLevelK(BaseNode): full_link = link if link.startswith("http") else urljoin(base_url, link) full_links.append(full_link) return full_links - + def obtain_content(self, documents: List, loader_kwargs) -> List: + """ + Iterates through documents, fetching and updating content recursively. + + Args: + documents (List): A list of documents containing the source URLs. + loader_kwargs (dict): Additional arguments for the content loader. + + Returns: + List: The updated list of documents with fetched content. + """ new_documents = [] for doc in documents: source = doc['source'] if 'document' not in doc: document = self.fetch_content(source, loader_kwargs) - + if not document or not document[0].page_content.strip(): self.logger.warning(f"Failed to fetch content for {source}") documents.remove(doc) continue - - #doc['document'] = document[0].page_content + doc['document'] = document - links = self.extract_links(doc['document'][0].page_content) full_links = self.get_full_links(source, links) - - # Check if the links are already present in other documents + for link in full_links: - # Check if any document is from the same link if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents): - # Add the document new_documents.append({"source": link}) - + documents.extend(new_documents) return documents - - def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict: + + def process_links(self, base_url: str, links: list, + loader_kwargs, depth: int, current_depth: int = 1) -> dict: + """ + Processes a list of links recursively up to a given depth. + + Args: + base_url (str): The base URL for resolving relative links. + links (list): A list of links to process. + loader_kwargs (dict): Additional arguments for the content loader. + depth (int): The maximum depth for recursion. + current_depth (int): The current depth of recursion (default is 1). + + Returns: + dict: A dictionary containing processed link content. + """ content_dict = {} for idx, link in enumerate(links, start=1): full_link = link if link.startswith("http") else urljoin(base_url, link) @@ -184,7 +208,7 @@ class FetchNodeLevelK(BaseNode): if current_depth < depth: new_links = self.extract_links(link_content) - content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1)) + content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1)) else: self.logger.warning(f"Failed to fetch content for {full_link}") - return content_dict \ No newline at end of file + return content_dict diff --git a/scrapegraphai/nodes/parse_node_depth_k.py b/scrapegraphai/nodes/parse_node_depth_k.py index 7b7ab194..6427b051 100644 --- a/scrapegraphai/nodes/parse_node_depth_k.py +++ b/scrapegraphai/nodes/parse_node_depth_k.py @@ -1,11 +1,9 @@ """ ParseNodeDepthK Module """ -import re -from typing import List, Optional, Tuple -from .base_node import BaseNode -from ..utils.convert_to_md import convert_to_md +from typing import List, Optional from langchain_community.document_transformers import Html2TextTransformer +from .base_node import BaseNode class ParseNodeDepthK(BaseNode): """ @@ -54,19 +52,16 @@ class ParseNodeDepthK(BaseNode): """ self.logger.info(f"--- Executing {self.node_name} Node ---") - - # Interpret input keys based on the provided input expression + input_keys = self.get_input_keys(state) - # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] documents = input_data[0] - + for doc in documents: document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"]) - #document_md = convert_to_md(doc["document"]) doc["document"] = document_md[0].page_content - + state.update({self.output[0]: documents}) - + return state