# Substitutions applied in order by minify_html. Order matters: whitespace
# adjacent to angle brackets is removed before the general whitespace
# collapse, and the '=' tightening runs last on the already-collapsed text.
_MINIFY_RULES = (
    # HTML comments; DOTALL lets a comment span several lines.
    (re.compile(r'<!--.*?-->', re.DOTALL), ''),
    # Whitespace between two adjacent tags.
    (re.compile(r'>\s+<'), '><'),
    # Whitespace just before a closing angle bracket.
    (re.compile(r'\s+>'), '>'),
    # Whitespace just after an opening angle bracket.
    (re.compile(r'<\s+'), '<'),
    # Any remaining whitespace run becomes a single space.
    (re.compile(r'\s+'), ' '),
    # Whitespace around '='.
    (re.compile(r'\s*=\s*'), '='),
)


def minify_html(html):
    """
    Minify an HTML string with a fixed sequence of regex substitutions.

    The passes, in order: strip HTML comments, drop whitespace between
    tags, drop whitespace next to angle brackets, collapse every remaining
    whitespace run to a single space, and remove whitespace around '='.

    Args:
        html (str): The HTML markup to minify.

    Returns:
        str: The minified markup with leading/trailing whitespace stripped.

    Note:
        The '=' rule is applied to the whole string, not only to tag
        attributes, so text content such as "a = b" becomes "a=b".
    """
    for pattern, replacement in _MINIFY_RULES:
        html = pattern.sub(replacement, html)

    return html.strip()
def safe_deepcopy(obj: Any) -> Any:
    """
    Create a deep copy of an object, with fallbacks for objects that
    `copy.deepcopy` cannot handle.

    The strategy is:
      1. Try `copy.deepcopy(obj)`.
      2. If that fails and `obj` is a dict, list, set, tuple or frozenset,
         rebuild the container and copy each element with `safe_deepcopy`,
         so one uncopyable element does not prevent copying its siblings.
      3. If `is_boto3_client(obj)` is true, return the object itself.
      4. Otherwise fall back to a shallow `copy.copy(obj)`.

    Args:
        obj (Any): Object to copy.

    Returns:
        Any: A deep copy when possible; otherwise a container rebuilt from
        individually copied elements, the original boto3 client, or a
        shallow copy.

    Raises:
        DeepCopyError: If the object can be neither deep- nor shallow-copied.
    """
    try:
        return copy.deepcopy(obj)
    except Exception as deep_err:
        # deepcopy failed somewhere inside obj; degrade step by step
        # instead of giving up on the whole structure.
        if isinstance(obj, dict):
            return {key: safe_deepcopy(value) for key, value in obj.items()}

        if isinstance(obj, (list, set)):
            return type(obj)(safe_deepcopy(item) for item in obj)

        if isinstance(obj, tuple):
            return tuple(safe_deepcopy(item) for item in obj)

        if isinstance(obj, frozenset):
            return frozenset(safe_deepcopy(item) for item in obj)

        if is_boto3_client(obj):
            # Returned as-is rather than copied.
            return obj

        try:
            return copy.copy(obj)
        except Exception:
            # Chain to the deepcopy failure: it is the root cause.
            raise DeepCopyError(
                f"Cannot deep copy object of type {type(obj)}"
            ) from deep_err
str): + raise ValueError("Query must be a non-empty string") + + search_engine = search_engine.lower() + valid_engines = {"google", "duckduckgo", "bing", "searxng"} + if search_engine not in valid_engines: + raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}") + + # Format proxy once + formatted_proxy = None + if proxy: + formatted_proxy = format_proxy(proxy) + + try: + results = [] + if search_engine == "google": + results = list(google_search(query, num_results=max_results, proxy=formatted_proxy)) + + elif search_engine == "duckduckgo": + research = DuckDuckGoSearchResults(max_results=max_results) + res = research.run(query) + results = re.findall(r'https?://[^\s,\]]+', res) + + elif search_engine == "bing": + results = _search_bing(query, max_results, timeout, formatted_proxy) + + elif search_engine == "searxng": + results = _search_searxng(query, max_results, port, timeout) + + return filter_pdf_links(results) + + except requests.Timeout: + raise TimeoutError(f"Search request timed out after {timeout} seconds") + except requests.RequestException as e: + raise RuntimeError(f"Search request failed: {str(e)}") + +def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]: + """Helper function for Bing search""" + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + search_url = f"https://www.bing.com/search?q={query}" + + proxies = {"http": proxy, "https": proxy} if proxy else None + response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)] + +def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]: + """Helper function for SearXNG search""" + url = 
f"http://localhost:{port}" + params = { + "q": query, + "format": "json", + "engines": "google,duckduckgo,brave,qwant,bing" + } + response = requests.get(url, params=params, timeout=timeout) + response.raise_for_status() + return [result['url'] for result in response.json().get("results", [])[:max_results]] + +def format_proxy(proxy): + if isinstance(proxy, dict): + server = proxy.get('server') + username = proxy.get('username') + password = proxy.get('password') + + if all([username, password, server]): + proxy_url = f"http://{username}:{password}@{server}" + return proxy_url + else: + raise ValueError("Proxy dictionary is missing required fields.") + elif isinstance(proxy, str): + return proxy # "https://username:password@ip:port" + else: + raise TypeError("Proxy should be a dictionary or a string.") + +def filter_pdf_links(links: List[str]) -> List[str]: """ - Searches the web for a given query using specified search - engine options and filters out PDF links. + Filters out any links that point to PDF files. Args: - query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, - options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. - max_results (int, optional): The maximum number of search results to return. - port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. - timeout (int, optional): The number of seconds to wait - for a response from a request. Default is 10 seconds. - proxy (dict or string, optional): The proxy server to use for the request. Default is None. + links (List[str]): A list of URLs as strings. Returns: - List[str]: A list of URLs as strings that are the search results, excluding any PDF links. - - Raises: - ValueError: If the search engine specified is not supported. - requests.exceptions.Timeout: If the request times out. 
- - Example: - >>> search_on_web("example query", search_engine="Google", max_results=5) - ['http://example.com', 'http://example.org', ...] + List[str]: A list of URLs excluding any that end with '.pdf'. """ - - def format_proxy(proxy): - if isinstance(proxy, dict): - server = proxy.get('server') - username = proxy.get('username') - password = proxy.get('password') - - if all([username, password, server]): - proxy_url = f"http://{username}:{password}@{server}" - return proxy_url - else: - raise ValueError("Proxy dictionary is missing required fields.") - elif isinstance(proxy, str): - return proxy # "https://username:password@ip:port" - else: - raise TypeError("Proxy should be a dictionary or a string.") - - def filter_pdf_links(links: List[str]) -> List[str]: - """ - Filters out any links that point to PDF files. - - Args: - links (List[str]): A list of URLs as strings. - - Returns: - List[str]: A list of URLs excluding any that end with '.pdf'. - """ - return [link for link in links if not link.lower().endswith('.pdf')] - - if proxy: - proxy = format_proxy(proxy) - - if search_engine.lower() == "google": - res = [] - for url in google_search(query, num_results=max_results, proxy=proxy): - res.append(url) - return filter_pdf_links(res) - - elif search_engine.lower() == "duckduckgo": - research = DuckDuckGoSearchResults(max_results=max_results) - res = research.run(query) - links = re.findall(r'https?://[^\s,\]]+', res) - return filter_pdf_links(links) - - elif search_engine.lower() == "bing": - headers = { - "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64) - AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36""" - } - search_url = f"https://www.bing.com/search?q={query}" - response = requests.get(search_url, headers=headers, timeout=timeout) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - - search_results = [] - for result in soup.find_all('li', class_='b_algo', limit=max_results): - link = 
result.find('a')['href'] - search_results.append(link) - return filter_pdf_links(search_results) - - elif search_engine.lower() == "searxng": - url = f"http://localhost:{port}" - params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"} - response = requests.get(url, params=params, timeout=timeout) - data = response.json() - limited_results = [result['url'] for result in data["results"][:max_results]] - return filter_pdf_links(limited_results) - - else: - raise ValueError("""The only search engines available are - DuckDuckGo, Google, Bing, or SearXNG""") + return [link for link in links if not link.lower().endswith('.pdf')]