diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index 2ec3b140..9b00f61c 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -60,13 +60,18 @@ def minify_html(html):
"""
minify_html function
"""
- html = re.sub(r'', '', html, flags=re.DOTALL)
-
- html = re.sub(r'>\s+<', '><', html)
- html = re.sub(r'\s+>', '>', html)
- html = re.sub(r'<\s+', '<', html)
- html = re.sub(r'\s+', ' ', html)
- html = re.sub(r'\s*=\s*', '=', html)
+ # Apply the substitutions sequentially, one re.sub call per (pattern, replacement, flags) entry
+ patterns = [
+ (r'', '', re.DOTALL),
+ (r'>\s+<', '><', 0),
+ (r'\s+>', '>', 0),
+ (r'<\s+', '<', 0),
+ (r'\s+', ' ', 0),
+ (r'\s*=\s*', '=', 0)
+ ]
+
+ for pattern, repl, flags in patterns:
+ html = re.sub(pattern, repl, html, flags=flags)
return html.strip()
diff --git a/scrapegraphai/utils/copy.py b/scrapegraphai/utils/copy.py
index a35370ab..2ec7cee2 100644
--- a/scrapegraphai/utils/copy.py
+++ b/scrapegraphai/utils/copy.py
@@ -30,56 +30,38 @@ def is_boto3_client(obj):
def safe_deepcopy(obj: Any) -> Any:
"""
- Attempts to create a deep copy of the object using `copy.deepcopy`
- whenever possible. If that fails, it falls back to custom deep copy
- logic. If that also fails, it raises a `DeepCopyError`.
-
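+ Copy an object: lists, sets, dicts, tuples and frozensets are rebuilt recursively; None, str, int, float, bool and objects accepted by is_boto3_client are returned as-is; anything else gets a shallow copy.copy().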
+
Args:
- obj (Any): The object to be copied, which can be of any type.
-
+ obj: Object to copy
+
Returns:
- Any: A deep copy of the object if possible; otherwise, a shallow
- copy if deep copying fails; if neither is possible, the original
- object is returned.
+ The copied object; primitives and boto3 clients are returned unchanged, and unrecognised types are only shallow-copied
+
Raises:
- DeepCopyError: If the object cannot be deep-copied or shallow-copied.
+ DeepCopyError: If any exception is raised while copying the object
"""
-
try:
-
- return copy.deepcopy(obj)
- except (TypeError, AttributeError) as e:
-
- if isinstance(obj, dict):
- new_obj = {}
-
- for k, v in obj.items():
- new_obj[k] = safe_deepcopy(v)
- return new_obj
-
- elif isinstance(obj, list):
- new_obj = []
-
- for v in obj:
- new_obj.append(safe_deepcopy(v))
- return new_obj
-
- elif isinstance(obj, tuple):
- new_obj = tuple(safe_deepcopy(v) for v in obj)
-
- return new_obj
-
- elif isinstance(obj, frozenset):
- new_obj = frozenset(safe_deepcopy(v) for v in obj)
- return new_obj
-
- elif is_boto3_client(obj):
+ # Handle special cases first
+ if obj is None or isinstance(obj, (str, int, float, bool)):
return obj
-
- else:
- try:
- return copy.copy(obj)
- except (TypeError, AttributeError):
- raise DeepCopyError(
- f"Cannot deep copy the object of type {type(obj)}"
- ) from e
+
+ if isinstance(obj, (list, set)):
+ return type(obj)(safe_deepcopy(v) for v in obj)
+
+ if isinstance(obj, dict):
+ return {k: safe_deepcopy(v) for k, v in obj.items()}
+
+ if isinstance(obj, tuple):
+ return tuple(safe_deepcopy(v) for v in obj)
+
+ if isinstance(obj, frozenset):
+ return frozenset(safe_deepcopy(v) for v in obj)
+
+ if is_boto3_client(obj):
+ return obj
+
+ return copy.copy(obj)
+
+ except Exception as e:
+ raise DeepCopyError(f"Cannot deep copy object of type {type(obj)}") from e
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
index af351ad4..86f9f5f3 100644
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -9,101 +9,97 @@ import requests
from bs4 import BeautifulSoup
def search_on_web(query: str, search_engine: str = "Google",
- max_results: int = 10, port: int = 8080,
+ max_results: int = 10, port: int = 8080,
timeout: int = 10, proxy: str | dict = None) -> List[str]:
+ """Search the web for a query using Google, DuckDuckGo, Bing or SearXNG and return the result URLs, excluding PDF links."""
+
+ # Input validation
+ if not query or not isinstance(query, str):
+ raise ValueError("Query must be a non-empty string")
+
+ search_engine = search_engine.lower()
+ valid_engines = {"google", "duckduckgo", "bing", "searxng"}
+ if search_engine not in valid_engines:
+ raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
+
+ # Format proxy once
+ formatted_proxy = None
+ if proxy:
+ formatted_proxy = format_proxy(proxy)
+
+ try:
+ results = []
+ if search_engine == "google":
+ results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
+
+ elif search_engine == "duckduckgo":
+ research = DuckDuckGoSearchResults(max_results=max_results)
+ res = research.run(query)
+ results = re.findall(r'https?://[^\s,\]]+', res)
+
+ elif search_engine == "bing":
+ results = _search_bing(query, max_results, timeout, formatted_proxy)
+
+ elif search_engine == "searxng":
+ results = _search_searxng(query, max_results, port, timeout)
+
+ return filter_pdf_links(results)
+
+ except requests.Timeout:
+ raise TimeoutError(f"Search request timed out after {timeout} seconds")
+ except requests.RequestException as e:
+ raise RuntimeError(f"Search request failed: {str(e)}")
+
+def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
+ """Helper function for Bing search"""
+ headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+ }
+ search_url = f"https://www.bing.com/search?q={query}"
+
+ proxies = {"http": proxy, "https": proxy} if proxy else None
+ response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
+ response.raise_for_status()
+
+ soup = BeautifulSoup(response.text, "html.parser")
+ return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)]
+
+def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
+ """Helper function for SearXNG search"""
+ url = f"http://localhost:{port}"
+ params = {
+ "q": query,
+ "format": "json",
+ "engines": "google,duckduckgo,brave,qwant,bing"
+ }
+ response = requests.get(url, params=params, timeout=timeout)
+ response.raise_for_status()
+ return [result['url'] for result in response.json().get("results", [])[:max_results]]
+
+def format_proxy(proxy):
+ if isinstance(proxy, dict):
+ server = proxy.get('server')
+ username = proxy.get('username')
+ password = proxy.get('password')
+
+ if all([username, password, server]):
+ proxy_url = f"http://{username}:{password}@{server}"
+ return proxy_url
+ else:
+ raise ValueError("Proxy dictionary is missing required fields.")
+ elif isinstance(proxy, str):
+ return proxy # "https://username:password@ip:port"
+ else:
+ raise TypeError("Proxy should be a dictionary or a string.")
+
+def filter_pdf_links(links: List[str]) -> List[str]:
"""
- Searches the web for a given query using specified search
- engine options and filters out PDF links.
+ Filters out any links that point to PDF files.
Args:
- query (str): The search query to find on the internet.
- search_engine (str, optional): Specifies the search engine to use,
- options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
- max_results (int, optional): The maximum number of search results to return.
- port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
- timeout (int, optional): The number of seconds to wait
- for a response from a request. Default is 10 seconds.
- proxy (dict or string, optional): The proxy server to use for the request. Default is None.
+ links (List[str]): A list of URLs as strings.
Returns:
- List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
-
- Raises:
- ValueError: If the search engine specified is not supported.
- requests.exceptions.Timeout: If the request times out.
-
- Example:
- >>> search_on_web("example query", search_engine="Google", max_results=5)
- ['http://example.com', 'http://example.org', ...]
+ List[str]: A list of URLs excluding any that end with '.pdf' (case-insensitive).
"""
-
- def format_proxy(proxy):
- if isinstance(proxy, dict):
- server = proxy.get('server')
- username = proxy.get('username')
- password = proxy.get('password')
-
- if all([username, password, server]):
- proxy_url = f"http://{username}:{password}@{server}"
- return proxy_url
- else:
- raise ValueError("Proxy dictionary is missing required fields.")
- elif isinstance(proxy, str):
- return proxy # "https://username:password@ip:port"
- else:
- raise TypeError("Proxy should be a dictionary or a string.")
-
- def filter_pdf_links(links: List[str]) -> List[str]:
- """
- Filters out any links that point to PDF files.
-
- Args:
- links (List[str]): A list of URLs as strings.
-
- Returns:
- List[str]: A list of URLs excluding any that end with '.pdf'.
- """
- return [link for link in links if not link.lower().endswith('.pdf')]
-
- if proxy:
- proxy = format_proxy(proxy)
-
- if search_engine.lower() == "google":
- res = []
- for url in google_search(query, num_results=max_results, proxy=proxy):
- res.append(url)
- return filter_pdf_links(res)
-
- elif search_engine.lower() == "duckduckgo":
- research = DuckDuckGoSearchResults(max_results=max_results)
- res = research.run(query)
- links = re.findall(r'https?://[^\s,\]]+', res)
- return filter_pdf_links(links)
-
- elif search_engine.lower() == "bing":
- headers = {
- "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64)
- AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"""
- }
- search_url = f"https://www.bing.com/search?q={query}"
- response = requests.get(search_url, headers=headers, timeout=timeout)
- response.raise_for_status()
- soup = BeautifulSoup(response.text, "html.parser")
-
- search_results = []
- for result in soup.find_all('li', class_='b_algo', limit=max_results):
- link = result.find('a')['href']
- search_results.append(link)
- return filter_pdf_links(search_results)
-
- elif search_engine.lower() == "searxng":
- url = f"http://localhost:{port}"
- params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"}
- response = requests.get(url, params=params, timeout=timeout)
- data = response.json()
- limited_results = [result['url'] for result in data["results"][:max_results]]
- return filter_pdf_links(limited_results)
-
- else:
- raise ValueError("""The only search engines available are
- DuckDuckGo, Google, Bing, or SearXNG""")
+ return [link for link in links if not link.lower().endswith('.pdf')]