feat: revert search function

2026-07-01 21:00:48 +08:00 · 2024-11-24 10:02:10 +01:00 · 2024-11-24 10:02:10 +01:00 · faf0c0123b
commit faf0c0123b
parent b2720a452f
2 changed files with 60 additions and 124 deletions
--- a/scrapegraphai/nodes/search_internet_node.py
+++ b/scrapegraphai/nodes/search_internet_node.py
@@ -41,19 +41,11 @@ class SearchInternetNode(BaseNode):
        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )
        self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
        self.search_engine = (
            node_config["search_engine"]
            if node_config.get("search_engine")
            else "google"
        )
        self.serper_api_key = (
            node_config["serper_api_key"]
            if node_config.get("serper_api_key")
            else None
        )
        self.max_results = node_config.get("max_results", 3)
    def execute(self, state: dict) -> dict:
@@ -102,7 +94,7 @@ class SearchInternetNode(BaseNode):
        self.logger.info(f"Search Query: {search_query}")
        answer = search_on_web(query=search_query, max_results=self.max_results,
-                               search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)
+                               search_engine=self.search_engine)
        if len(answer) == 0:
            raise ValueError("Zero results found for the search query.")
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -1,5 +1,5 @@
 """
-research_web module
+Research_web module
 """
 import re
 from typing import List
@@ -7,123 +7,67 @@ from langchain_community.tools import DuckDuckGoSearchResults
 from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup
 import json
 def search_on_web(query: str, search_engine: str = "Google", 
-                  max_results: int = 10, port: int = 8080,
+                  max_results: int = 10, port: int = 8080) -> List[str]:
                  timeout: int = 10, proxy: str | dict = None,
                  serper_api_key: str = None) -> List[str]:
    """Search web function with improved error handling and validation"""
    # Input validation
    if not query or not isinstance(query, str):
        raise ValueError("Query must be a non-empty string")
    search_engine = search_engine.lower()
    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
    if search_engine not in valid_engines:
        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
    # Format proxy once
    formatted_proxy = None
    if proxy:
        formatted_proxy = format_proxy(proxy)
    try:
        results = []
        if search_engine == "google":
            results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
        elif search_engine == "duckduckgo":
            research = DuckDuckGoSearchResults(max_results=max_results)
            res = research.run(query)
            results = re.findall(r'https?://[^\s,\]]+', res)
        elif search_engine == "bing":
            results = _search_bing(query, max_results, timeout, formatted_proxy)
        elif search_engine == "searxng":
            results = _search_searxng(query, max_results, port, timeout)
        elif search_engine.lower() == "serper":
            results = _search_serper(query, max_results, serper_api_key, timeout)
        return filter_pdf_links(results)
    except requests.Timeout:
        raise TimeoutError(f"Search request timed out after {timeout} seconds")
    except requests.RequestException as e:
        raise RuntimeError(f"Search request failed: {str(e)}")
 def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
    """Helper function for Bing search"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    search_url = f"https://www.bing.com/search?q={query}"
    proxies = {"http": proxy, "https": proxy} if proxy else None
    response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)]
 def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
    """Helper function for SearXNG search"""
    url = f"http://localhost:{port}"
    params = {
        "q": query,
        "format": "json",
        "engines": "google,duckduckgo,brave,qwant,bing"
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return [result['url'] for result in response.json().get("results", [])[:max_results]]
 def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
    """Helper function for serper api"""
    if not serper_api_key:
        raise ValueError("API key is required for serper api.")
    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "num": max_results
    })
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    response.raise_for_status()
    return [result.get("link") for result in response.json().get("organic", [])]
 def format_proxy(proxy):
    if isinstance(proxy, dict):
        server = proxy.get('server')
        username = proxy.get('username')
        password = proxy.get('password')
        if all([username, password, server]):
            proxy_url = f"http://{username}:{password}@{server}"
            return proxy_url
        else:
            raise ValueError("Proxy dictionary is missing required fields.")
    elif isinstance(proxy, str):
        return proxy  # "https://username:password@ip:port"
    else:
        raise TypeError("Proxy should be a dictionary or a string.")
 def filter_pdf_links(links: List[str]) -> List[str]:
    """
-    Filters out any links that point to PDF files.
+    Searches the web for a given query using specified search engine options.
    Args:
-        links (List[str]): A list of URLs as strings.
+        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use, 
        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.
        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
    Returns:
-        List[str]: A list of URLs excluding any that end with '.pdf'.
+        List[str]: A list of URLs as strings that are the search results.
    Raises:
        ValueError: If the search engine specified is not supported.
    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """
-    return [link for link in links if not link.lower().endswith('.pdf')]
+
    if search_engine.lower() == "google":
        res = []
        for url in google_search(query, num_results=max_results):
            res.append(url)
        return res
    elif search_engine.lower() == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        links = re.findall(r'https?://[^\s,\]]+', res)
        return links
    elif search_engine.lower() == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        search_url = f"https://www.bing.com/search?q={query}"
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            link = result.find('a')['href']
            search_results.append(link)
        return search_results
    elif search_engine.lower() == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json"}
        # Send the GET request to the server
        response = requests.get(url, params=params)
        data = response.json()
        limited_results = data["results"][:max_results]
        return limited_results
    else:
        raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")