mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
refactoring of code for pylint integration
This commit is contained in:
parent
c91975e0c8
commit
db54d69433
@ -34,7 +34,6 @@ class DescriptionNode(BaseNode):
|
||||
node_name: str = "DESCRIPTION",
|
||||
):
|
||||
super().__init__(node_name, "node", input, output, 2, node_config)
|
||||
|
||||
self.llm_model = node_config["llm_model"]
|
||||
self.verbose = (
|
||||
False if node_config is None else node_config.get("verbose", False)
|
||||
|
||||
@ -1,6 +1,3 @@
|
||||
"""
|
||||
FetchNodeLevelK Module
|
||||
"""
|
||||
from typing import List, Optional
|
||||
from .base_node import BaseNode
|
||||
from ..docloaders import ChromiumLoader
|
||||
@ -18,14 +15,21 @@ class FetchNodeLevelK(BaseNode):
|
||||
(with proxy protection).
|
||||
|
||||
Attributes:
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An optional model for embedding the fetched content.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
cache_path (str): Path to cache fetched content.
|
||||
headless (bool): Whether to run the Chromium browser in headless mode.
|
||||
loader_kwargs (dict): Additional arguments for the content loader.
|
||||
browser_base (dict): Optional configuration for the browser base API.
|
||||
depth (int): Maximum depth of hyperlink graph traversal.
|
||||
only_inside_links (bool): Whether to fetch only internal links.
|
||||
min_input_len (int): Minimum required length of input data.
|
||||
|
||||
Args:
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (dict): Additional configuration for the node.
|
||||
node_name (str): The unique identifier name for the node, defaulting to "Parse".
|
||||
node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK".
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@ -35,81 +39,68 @@ class FetchNodeLevelK(BaseNode):
|
||||
node_config: Optional[dict] = None,
|
||||
node_name: str = "FetchLevelK",
|
||||
):
|
||||
"""
|
||||
Initializes the FetchNodeLevelK instance.
|
||||
|
||||
Args:
|
||||
input (str): Boolean expression defining the input keys needed from the state.
|
||||
output (List[str]): List of output keys to be updated in the state.
|
||||
node_config (Optional[dict]): Additional configuration for the node.
|
||||
node_name (str): The name of the node (default is "FetchLevelK").
|
||||
"""
|
||||
super().__init__(node_name, "node", input, output, 2, node_config)
|
||||
|
||||
|
||||
self.embedder_model = node_config.get("embedder_model", None)
|
||||
|
||||
self.verbose = (
|
||||
False if node_config is None else node_config.get("verbose", False)
|
||||
)
|
||||
|
||||
self.verbose = node_config.get("verbose", False) if node_config else False
|
||||
self.cache_path = node_config.get("cache_path", False)
|
||||
|
||||
self.headless = (
|
||||
True if node_config is None else node_config.get("headless", True)
|
||||
)
|
||||
|
||||
self.loader_kwargs = (
|
||||
{} if node_config is None else node_config.get("loader_kwargs", {})
|
||||
)
|
||||
|
||||
self.browser_base = (
|
||||
None if node_config is None else node_config.get("browser_base", None)
|
||||
)
|
||||
|
||||
self.depth = (
|
||||
1 if node_config is None else node_config.get("depth", 1)
|
||||
)
|
||||
|
||||
self.only_inside_links = (
|
||||
False if node_config is None else node_config.get("only_inside_links", False)
|
||||
)
|
||||
|
||||
self.headless = node_config.get("headless", True) if node_config else True
|
||||
self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {}
|
||||
self.browser_base = node_config.get("browser_base", None)
|
||||
self.depth = node_config.get("depth", 1) if node_config else 1
|
||||
self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False
|
||||
self.min_input_len = 1
|
||||
|
||||
def execute(self, state: dict) -> dict:
    """
    Fetches the content of a source URL and of the links discovered from it,
    then stores the fetched documents in the graph's state.

    The source is the first value resolved from the node's input keys. The
    crawl is performed by calling ``obtain_content`` ``self.depth`` times;
    only entries that ended up with a ``'document'`` key are kept.

    Args:
        state (dict): The current state of the graph.

    Returns:
        dict: The same state object, updated so that ``self.output[0]`` maps
        to the list of fetched documents.

    Raises:
        KeyError: If an input key is not found in the state.
    """
    self.logger.info(f"--- Executing {self.node_name} Node ---")

    input_keys = self.get_input_keys(state)
    source = [state[key] for key in input_keys][0]

    # Start the crawl with a single, not-yet-fetched entry for the source.
    documents = [{"source": source}]
    loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {}

    # Each pass fetches the pending entries and appends newly found links.
    for _ in range(self.depth):
        documents = self.obtain_content(documents, loader_kwargs)

    # Entries discovered on the last pass were never fetched; drop them.
    filtered_documents = [doc for doc in documents if 'document' in doc]

    state.update({self.output[0]: filtered_documents})
    return state
|
||||
|
||||
|
||||
def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
|
||||
"""
|
||||
Fetches the HTML content of a given source URL.
|
||||
|
||||
Args:
|
||||
source (str): The URL to fetch content from.
|
||||
loader_kwargs (dict): Additional arguments for the content loader.
|
||||
|
||||
Returns:
|
||||
Optional[str]: The fetched HTML content or None if fetching failed.
|
||||
"""
|
||||
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
||||
|
||||
if self.browser_base is not None:
|
||||
@ -119,26 +110,40 @@ class FetchNodeLevelK(BaseNode):
|
||||
raise ImportError("""The browserbase module is not installed.
|
||||
Please install it using `pip install browserbase`.""")
|
||||
|
||||
data = browser_base_fetch(self.browser_base.get("api_key"),
|
||||
self.browser_base.get("project_id"), [source])
|
||||
|
||||
document = [Document(page_content=content,
|
||||
metadata={"source": source}) for content in data]
|
||||
|
||||
data = browser_base_fetch(self.browser_base.get("api_key"),
|
||||
self.browser_base.get("project_id"), [source])
|
||||
document = [Document(page_content=content, metadata={"source": source}) for content in data]
|
||||
else:
|
||||
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
|
||||
|
||||
document = loader.load()
|
||||
|
||||
return document
|
||||
|
||||
|
||||
def extract_links(self, html_content: str) -> list:
    """
    Collects the ``href`` value of every anchor tag in the given HTML.

    Args:
        html_content (str): The HTML markup to scan.

    Returns:
        list: The ``href`` values in document order, exactly as written in
        the markup (relative links are not resolved here).
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    hrefs = []
    # href=True restricts the search to anchors that carry an href attribute.
    for anchor in parsed.find_all('a', href=True):
        hrefs.append(anchor['href'])

    self.logger.info(f"Extracted {len(hrefs)} links.")
    return hrefs
|
||||
|
||||
|
||||
def get_full_links(self, base_url: str, links: list) -> list:
|
||||
"""
|
||||
Converts relative URLs to full URLs based on the base URL.
|
||||
|
||||
Args:
|
||||
base_url (str): The base URL for resolving relative links.
|
||||
links (list): A list of links to convert.
|
||||
|
||||
Returns:
|
||||
list: A list of full URLs.
|
||||
"""
|
||||
full_links = []
|
||||
for link in links:
|
||||
if self.only_inside_links and link.startswith("http"):
|
||||
@ -146,36 +151,55 @@ class FetchNodeLevelK(BaseNode):
|
||||
full_link = link if link.startswith("http") else urljoin(base_url, link)
|
||||
full_links.append(full_link)
|
||||
return full_links
|
||||
|
||||
|
||||
def obtain_content(self, documents: List, loader_kwargs) -> List:
    """
    Performs one crawl pass over the documents.

    Every entry without a ``'document'`` key is fetched. Entries whose fetch
    fails or yields only whitespace are dropped. For every remaining entry
    the links of its page are extracted, and each link not yet known is
    appended as a new, unfetched ``{"source": link}`` entry.

    Args:
        documents (List): Entries with a ``'source'`` URL and, once fetched,
            a ``'document'`` holding the loaded content.
        loader_kwargs (dict): Additional arguments for the content loader.

    Returns:
        List: The same list object, updated in place: failed entries removed
        and newly discovered sources appended at the end.
    """
    # Set-based lookup replaces a linear scan of both lists for every link.
    known_sources = {doc['source'] for doc in documents}
    kept_documents = []
    new_documents = []

    # Survivors are collected separately instead of calling
    # documents.remove() mid-iteration, which made the loop skip the entry
    # following every failed fetch.
    for doc in documents:
        source = doc['source']

        if 'document' not in doc:
            document = self.fetch_content(source, loader_kwargs)

            if not document or not document[0].page_content.strip():
                self.logger.warning(f"Failed to fetch content for {source}")
                # Forget the source so a later page linking to it can
                # re-queue it, as happened once the entry had been removed.
                known_sources.discard(source)
                continue

            doc['document'] = document

        kept_documents.append(doc)

        links = self.extract_links(doc['document'][0].page_content)
        for link in self.get_full_links(source, links):
            if link not in known_sources:
                known_sources.add(link)
                new_documents.append({"source": link})

    # Keep the in-place contract: callers holding the list see the update.
    documents[:] = kept_documents + new_documents
    return documents
|
||||
|
||||
def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict:
|
||||
|
||||
def process_links(self, base_url: str, links: list,
|
||||
loader_kwargs, depth: int, current_depth: int = 1) -> dict:
|
||||
"""
|
||||
Processes a list of links recursively up to a given depth.
|
||||
|
||||
Args:
|
||||
base_url (str): The base URL for resolving relative links.
|
||||
links (list): A list of links to process.
|
||||
loader_kwargs (dict): Additional arguments for the content loader.
|
||||
depth (int): The maximum depth for recursion.
|
||||
current_depth (int): The current depth of recursion (default is 1).
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing processed link content.
|
||||
"""
|
||||
content_dict = {}
|
||||
for idx, link in enumerate(links, start=1):
|
||||
full_link = link if link.startswith("http") else urljoin(base_url, link)
|
||||
@ -184,7 +208,7 @@ class FetchNodeLevelK(BaseNode):
|
||||
|
||||
if current_depth < depth:
|
||||
new_links = self.extract_links(link_content)
|
||||
content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1))
|
||||
content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1))
|
||||
else:
|
||||
self.logger.warning(f"Failed to fetch content for {full_link}")
|
||||
return content_dict
|
||||
return content_dict
|
||||
|
||||
@ -1,11 +1,9 @@
|
||||
"""
|
||||
ParseNodeDepthK Module
|
||||
"""
|
||||
import re
|
||||
from typing import List, Optional, Tuple
|
||||
from .base_node import BaseNode
|
||||
from ..utils.convert_to_md import convert_to_md
|
||||
from typing import List, Optional
|
||||
from langchain_community.document_transformers import Html2TextTransformer
|
||||
from .base_node import BaseNode
|
||||
|
||||
class ParseNodeDepthK(BaseNode):
|
||||
"""
|
||||
@ -54,19 +52,16 @@ class ParseNodeDepthK(BaseNode):
|
||||
"""
|
||||
|
||||
self.logger.info(f"--- Executing {self.node_name} Node ---")
|
||||
|
||||
# Interpret input keys based on the provided input expression
|
||||
|
||||
input_keys = self.get_input_keys(state)
|
||||
# Fetching data from the state based on the input keys
|
||||
input_data = [state[key] for key in input_keys]
|
||||
|
||||
documents = input_data[0]
|
||||
|
||||
|
||||
for doc in documents:
|
||||
document_md = Html2TextTransformer(ignore_links=True).transform_documents(doc["document"])
|
||||
#document_md = convert_to_md(doc["document"])
|
||||
doc["document"] = document_md[0].page_content
|
||||
|
||||
|
||||
state.update({self.output[0]: documents})
|
||||
|
||||
|
||||
return state
|
||||
|
||||
Loading…
Reference in New Issue
Block a user