feat: add Bing support to search_on_web + tests for the search function

2026-06-23 21:00:30 +08:00 · 2024-06-18 21:28:29 +02:00 · 2024-06-18 21:28:29 +02:00 · aa2160c108
commit aa2160c108
parent 073d226723
2 changed files with 52 additions and 10 deletions
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@ -1,11 +1,12 @@
 """
-Module for making the request on the web
+research web module
 """
 import re
 from typing import List
 from langchain_community.tools import DuckDuckGoSearchResults
 from googlesearch import search as google_search
-
+import requests
+from bs4 import BeautifulSoup

 def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
    """
@ -13,35 +14,48 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =

    Args:
        query (str): The search query to find on the internet.
-        search_engine (str, optional): Specifies the search engine to use, options include 'Google' or 'DuckDuckGo'. Default is 'Google'.
+        search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
-        ValueError: If the search engine specified is neither 'Google' nor 'DuckDuckGo'.
+        ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]

-    This function allows switching between Google and DuckDuckGo to perform 
+    This function allows switching between Google, DuckDuckGo, and Bing to perform 
    internet searches, returning a list of result URLs.
    """

    if search_engine.lower() == "google":
        res = []
-
        for url in google_search(query, stop=max_results):
            res.append(url)
        return res
+
    elif search_engine.lower() == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
-
        links = re.findall(r'https?://[^\s,\]]+', res)
-
        return links
-    raise ValueError(
-        "The only search engines available are DuckDuckGo or Google")
+
+    elif search_engine.lower() == "bing":
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
+        search_url = f"https://www.bing.com/search?q={query}"
+        response = requests.get(search_url, headers=headers)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        search_results = []
+        for result in soup.find_all('li', class_='b_algo', limit=max_results):
+            link = result.find('a')['href']
+            search_results.append(link)
+        return search_results
+
+    raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
--- a/tests/utils/research_web_test.py
+++ b/tests/utils/research_web_test.py
@ -0,0 +1,28 @@
+import pytest
+from scrapegraphai.utils.research_web import search_on_web  # Replace with actual path to your file
+
+
+def test_google_search():
+    """Tests search_on_web with Google search engine."""
+    results = search_on_web("test query", search_engine="Google", max_results=2)
+    assert len(results) == 2
+    # You can further assert if the results actually contain 'test query' in the title/snippet using additional libraries
+
+def test_bing_search():
+    """Tests search_on_web with Bing search engine."""
+    results = search_on_web("test query", search_engine="Bing", max_results=1)
+    assert results is not None
+    # You can further assert if the results contain '.com' or '.org' in the domain
+
+
+def test_invalid_search_engine():
+    """Tests search_on_web with invalid search engine."""
+    with pytest.raises(ValueError):
+        search_on_web("test query", search_engine="Yahoo", max_results=5)
+
+
+def test_max_results():
+    """Tests search_on_web with different max_results values."""
+    results_5 = search_on_web("test query", max_results=5)
+    results_10 = search_on_web("test query", max_results=10)
+    assert len(results_5) <= len(results_10)