diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index d1b59500..26fc44c4 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -86,7 +86,8 @@ class BaseNode(ABC): Args: param (dict): The dictionary to update node_config with. - overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None. + overwrite (bool): Flag indicating if the values of node_config + should be overwritten if their value is not None. """ for key, val in params.items(): if hasattr(self, key) and not overwrite: @@ -133,7 +134,8 @@ class BaseNode(ABC): def _parse_input_keys(self, state: dict, expression: str) -> List[str]: """ - Parses the input keys expression to extract relevant keys from the state based on logical conditions. + Parses the input keys expression to extract + relevant keys from the state based on logical conditions. The expression can contain AND (&), OR (|), and parentheses to group conditions. 
Args: diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 4971ddb3..11cbb5fb 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -133,7 +133,7 @@ class FetchNode(BaseNode): state.update({self.output[0]: compressed_document}) return state elif input_keys[0] == "json": - f = open(source) + f = open(source, encoding="utf-8") compressed_document = [ Document(page_content=str(json.load(f)), metadata={"source": "json"}) ] @@ -181,12 +181,11 @@ class FetchNode(BaseNode): if not response.text.strip(): raise ValueError("No HTML body content found in the response.") - parsed_content = response - if not self.cut: parsed_content = cleanup_html(response, source) - if (isinstance(self.llm_model, ChatOpenAI) and not self.script_creator) or (self.force and not self.script_creator): + if (isinstance(self.llm_model, ChatOpenAI) + and not self.script_creator) or (self.force and not self.script_creator): parsed_content = convert_to_md(source, input_data[0]) compressed_document = [Document(page_content=parsed_content)] else: @@ -205,7 +204,8 @@ class FetchNode(BaseNode): data = browser_base_fetch(self.browser_base.get("api_key"), self.browser_base.get("project_id"), [source]) - document = [Document(page_content=content, metadata={"source": source}) for content in data] + document = [Document(page_content=content, + metadata={"source": source}) for content in data] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) document = loader.load() @@ -215,10 +215,8 @@ class FetchNode(BaseNode): parsed_content = document[0].page_content if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: - parsed_content = convert_to_md(document[0].page_content, input_data[0]) - compressed_document = [ Document(page_content=parsed_content, metadata={"source": "html file"}) ] diff --git 
a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 6ce19ef2..a91dae3f 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -3,18 +3,12 @@ gg Module for generating the answer node """ -# Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode from ..helpers.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 12ae6f0f..9c530688 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,7 +1,6 @@ """ GenerateAnswerNode Module """ -import asyncio from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -9,7 +8,6 @@ from langchain_core.runnables import RunnableParallel from langchain_openai import ChatOpenAI from langchain_community.chat_models import ChatOllama from tqdm import tqdm -from langchain_openai import ChatOpenAI from ..utils.logging import get_logger from .base_node import BaseNode from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md @@ -130,7 +128,6 @@ class GenerateAnswerNode(BaseNode): partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions}) - # Add chain to dictionary with dynamic name chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model | output_parser diff --git 
a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 98be26dd..93e96f4e 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -113,7 +113,7 @@ class GenerateAnswerOmniNode(BaseNode): chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) - + state.update({self.output[0]: answer}) return state @@ -148,4 +148,4 @@ class GenerateAnswerOmniNode(BaseNode): answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 47f14e86..4cef7ae9 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -2,18 +2,13 @@ Module for generating the answer node """ -# Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm from langchain_community.chat_models import ChatOllama from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode from ..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 393f5e90..733898bd 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -83,7 +83,6 @@ class GenerateScraperNode(BaseNode): user_prompt = input_data[0] doc = input_data[1] - # schema to be used for output parsing if self.node_config.get("schema", None) is not None: 
output_schema = JsonOutputParser(pydantic_object=self.node_config["schema"]) else: @@ -130,7 +129,6 @@ class GenerateScraperNode(BaseNode): ) map_chain = prompt | self.llm_model | StrOutputParser() - # Chain answer = map_chain.invoke({"question": user_prompt}) state.update({self.output[0]: answer}) diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index f31633c0..38c2ba15 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -1,7 +1,6 @@ """ GetProbableTagsNode Module """ - from typing import List, Optional from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index 061be77a..6ce4bdaf 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -5,13 +5,11 @@ GraphIterator Module import asyncio import copy from typing import List, Optional - from tqdm.asyncio import tqdm - from ..utils.logging import get_logger from .base_node import BaseNode -_default_batchsize = 16 +DEFAULT_BATCHSIZE = 16 class GraphIteratorNode(BaseNode): @@ -51,13 +49,15 @@ class GraphIteratorNode(BaseNode): the correct data from the state. Returns: - dict: The updated state with the output key containing the results of the graph instances. + dict: The updated state with the output key + containing the results of the graph instances. Raises: - KeyError: If the input keys are not found in the state, indicating that the - necessary information for running the graph instances is missing. + KeyError: If the input keys are not found in the state, + indicating that the necessary information for running + the graph instances is missing. 
""" - batchsize = self.node_config.get("batchsize", _default_batchsize) + batchsize = self.node_config.get("batchsize", DEFAULT_BATCHSIZE) self.logger.info( f"--- Executing {self.node_name} Node with batchsize {batchsize} ---" diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index 7e7507a9..c1a69390 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -3,14 +3,14 @@ ImageToTextNode Module """ from typing import List, Optional - from ..utils.logging import get_logger from .base_node import BaseNode class ImageToTextNode(BaseNode): """ - Retrieve images from a list of URLs and return a description of the images using an image-to-text model. + Retrieve images from a list of URLs and return a description of + the images using an image-to-text model. Attributes: llm_model: An instance of the language model client used for image-to-text conversion. diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index 0efd8ec8..548b7c04 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -2,18 +2,10 @@ MergeAnswersNode Module """ -# Imports from standard library from typing import List, Optional -from tqdm import tqdm - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser -from tqdm import tqdm - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/merge_generated_scripts.py b/scrapegraphai/nodes/merge_generated_scripts.py index cfda3960..8c8eaecd 100644 --- a/scrapegraphai/nodes/merge_generated_scripts.py +++ b/scrapegraphai/nodes/merge_generated_scripts.py @@ -5,15 +5,9 @@ MergeAnswersNode Module # Imports from standard library from typing import List, Optional from tqdm import tqdm - -# Imports from Langchain from 
langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser, StrOutputParser -from tqdm import tqdm - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index b5418717..d1bb87bd 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -75,23 +75,23 @@ class ParseNode(BaseNode): chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: len(x), memoize=False) else: docs_transformed = docs_transformed[0] - if type(docs_transformed) == Document: + if isinstance(docs_transformed, Document): chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: len(x), memoize=False) else: - + chunks = chunk(text=docs_transformed, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: len(x), memoize=False) - + state.update({self.output[0]: chunks}) return state diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 66231600..7fa2fe6b 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -4,15 +4,9 @@ RobotsNode Module from typing import List, Optional from urllib.parse import urlparse - from langchain_community.document_loaders import AsyncChromiumLoader from langchain.prompts import PromptTemplate from langchain.output_parsers import CommaSeparatedListOutputParser - -from langchain.output_parsers import CommaSeparatedListOutputParser -from langchain.prompts import PromptTemplate -from langchain_community.document_loaders import AsyncChromiumLoader - from ..helpers import robots_dictionary from ..utils.logging import 
get_logger from .base_node import BaseNode @@ -146,4 +140,4 @@ class RobotsNode(BaseNode): self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m") state.update({self.output[0]: is_scrapable}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 7588b995..61b11995 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -1,9 +1,7 @@ """ SearchInternetNode Module """ - from typing import List, Optional - from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate from langchain_community.chat_models import ChatOllama diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index b3d289d9..6fbe51dd 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -2,19 +2,13 @@ SearchLinkNode Module """ -# Imports from standard library from typing import List, Optional import re from tqdm import tqdm - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/search_node_with_context.py b/scrapegraphai/nodes/search_node_with_context.py index 62de184a..678e44ae 100644 --- a/scrapegraphai/nodes/search_node_with_context.py +++ b/scrapegraphai/nodes/search_node_with_context.py @@ -67,7 +67,6 @@ class SearchLinksWithContext(BaseNode): # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - user_prompt = input_data[0] doc = input_data[1] output_parser = CommaSeparatedListOutputParser() diff --git a/scrapegraphai/nodes/text_to_speech_node.py 
b/scrapegraphai/nodes/text_to_speech_node.py index 59e3fb8b..e8e43cb5 100644 --- a/scrapegraphai/nodes/text_to_speech_node.py +++ b/scrapegraphai/nodes/text_to_speech_node.py @@ -1,13 +1,10 @@ """ TextToSpeechNode Module """ - from typing import List, Optional - from ..utils.logging import get_logger from .base_node import BaseNode - class TextToSpeechNode(BaseNode): """ Converts text to speech using the specified text-to-speech model. diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index 6f1a2334..74478bcc 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -1,8 +1,8 @@ """ convert_to_md modul """ -import html2text from urllib.parse import urlparse +import html2text def convert_to_md(html: str, url: str = None) -> str: """ Convert HTML to Markdown. diff --git a/scrapegraphai/utils/logging.py b/scrapegraphai/utils/logging.py index 2684d0b1..afb63c52 100644 --- a/scrapegraphai/utils/logging.py +++ b/scrapegraphai/utils/logging.py @@ -12,7 +12,7 @@ from typing import Optional _library_name = __name__.split(".", maxsplit=1)[0] -_default_handler = None +DEFAULT_HANDLER = None _default_logging_level = logging.WARNING _semaphore = threading.Lock() @@ -23,22 +23,22 @@ def _get_library_root_logger() -> logging.Logger: def _set_library_root_logger() -> None: - global _default_handler + global DEFAULT_HANDLER with _semaphore: - if _default_handler: + if DEFAULT_HANDLER: return - _default_handler = logging.StreamHandler() # sys.stderr as stream + DEFAULT_HANDLER = logging.StreamHandler() # sys.stderr as stream # https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176 if sys.stderr is None: - sys.stderr = open(os.devnull, "w") + sys.stderr = open(os.devnull, "w", encoding="utf-8") - _default_handler.flush = sys.stderr.flush + DEFAULT_HANDLER.flush = sys.stderr.flush library_root_logger = _get_library_root_logger() - library_root_logger.addHandler(_default_handler) + 
library_root_logger.addHandler(DEFAULT_HANDLER) library_root_logger.setLevel(_default_logging_level) library_root_logger.propagate = False @@ -86,8 +86,8 @@ def set_handler(handler: logging.Handler) -> None: _get_library_root_logger().addHandler(handler) -def set_default_handler() -> None: - set_handler(_default_handler) +def setDEFAULT_HANDLER() -> None: + set_handler(DEFAULT_HANDLER) def unset_handler(handler: logging.Handler) -> None: @@ -98,8 +98,8 @@ def unset_handler(handler: logging.Handler) -> None: _get_library_root_logger().removeHandler(handler) -def unset_default_handler() -> None: - unset_handler(_default_handler) +def unsetDEFAULT_HANDLER() -> None: + unset_handler(DEFAULT_HANDLER) def set_propagation() -> None: diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py index 85712ef6..107397e9 100644 --- a/scrapegraphai/utils/parse_state_keys.py +++ b/scrapegraphai/utils/parse_state_keys.py @@ -13,19 +13,22 @@ def parse_expression(expression, state: dict) -> list: state (dict): Dictionary of state keys used to evaluate the expression. Raises: - ValueError: If the expression is empty, has adjacent state keys without operators, invalid operator usage, - unbalanced parentheses, or if no state keys match the expression. + ValueError: If the expression is empty, has adjacent state keys without operators, + invalid operator usage, unbalanced parentheses, or if no state keys match the expression. Returns: - list: A list of state keys that match the boolean expression, ensuring each key appears only once. + list: A list of state keys that match the boolean expression, + ensuring each key appears only once. 
Example: >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)", {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None}) ['user_input', 'relevant_chunks', 'parsed_document', 'document'] - This function evaluates the expression to determine the logical inclusion of state keys based on provided boolean logic. - It checks for syntax errors such as unbalanced parentheses, incorrect adjacency of operators, and empty expressions. + This function evaluates the expression to determine the + logical inclusion of state keys based on provided boolean logic. + It checks for syntax errors such as unbalanced parentheses, + incorrect adjacency of operators, and empty expressions. """ # Check for empty expression diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py index 07e04d0f..6f6019e9 100644 --- a/scrapegraphai/utils/proxy_rotation.py +++ b/scrapegraphai/utils/proxy_rotation.py @@ -6,7 +6,6 @@ import ipaddress import random import re from typing import List, Optional, Set, TypedDict - import requests from fp.errors import FreeProxyException from fp.fp import FreeProxy diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 101693e4..fe7902d3 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,3 +1,6 @@ +""" +Research_web module +""" import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults @@ -5,13 +8,15 @@ from googlesearch import search as google_search import requests from bs4 import BeautifulSoup -def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]: +def search_on_web(query: str, search_engine: str = "Google", + max_results: int = 10, port: int = 8080) -> List[str]: """ Searches the web for a given query using specified search engine options. 
Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. + search_engine (str, optional): Specifies the search engine to use, + options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. @@ -25,19 +30,19 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = >>> search_on_web("example query", search_engine="Google", max_results=5) ['http://example.com', 'http://example.org', ...] """ - + if search_engine.lower() == "google": res = [] for url in google_search(query, stop=max_results): res.append(url) return res - + elif search_engine.lower() == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) links = re.findall(r'https?://[^\s,\]]+', res) return links - + elif search_engine.lower() == "bing": headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" @@ -46,24 +51,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = response = requests.get(search_url, headers=headers) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - + search_results = [] for result in soup.find_all('li', class_='b_algo', limit=max_results): link = result.find('a')['href'] search_results.append(link) return search_results - + elif search_engine.lower() == "searxng": url = f"http://localhost:{port}" params = {"q": query, "format": "json"} - + # Send the GET request to the server response = requests.get(url, params=params) - + # Parse the response and limit to the specified max_results data = response.json() limited_results = 
data["results"][:max_results] return limited_results - + else: raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG") diff --git a/scrapegraphai/utils/sys_dynamic_import.py b/scrapegraphai/utils/sys_dynamic_import.py index 30f75d15..19b0d29a 100644 --- a/scrapegraphai/utils/sys_dynamic_import.py +++ b/scrapegraphai/utils/sys_dynamic_import.py @@ -5,7 +5,7 @@ source code inspired by https://gist.github.com/DiTo97/46f4b733396b8d7a8f1d4d22d import sys import typing - +import importlib.util # noqa: F401 if typing.TYPE_CHECKING: import types @@ -24,9 +24,6 @@ def srcfile_import(modpath: str, modname: str) -> "types.ModuleType": Raises: ImportError: If the module cannot be imported from the srcfile """ - import importlib.util # noqa: F401 - - # spec = importlib.util.spec_from_file_location(modname, modpath) if spec is None: diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py index 5b23fdf4..c5263efe 100644 --- a/scrapegraphai/utils/token_calculator.py +++ b/scrapegraphai/utils/token_calculator.py @@ -22,7 +22,8 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str] >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING") ["This is a sample text", "for truncation."] - This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit. + This function ensures that each chunk of text can be tokenized + by the specified model without exceeding the model's token limit. """ encoding = tiktoken.get_encoding(encoding_name)