mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
feat: add research with bing + test function
Some checks failed
/ build (3.10) (push) Has been cancelled
Some checks failed
/ build (3.10) (push) Has been cancelled
This commit is contained in:
parent
073d226723
commit
aa2160c108
@ -1,11 +1,12 @@
|
|||||||
"""
|
"""
|
||||||
Module for making the request on the web
|
research web module
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List
|
||||||
from langchain_community.tools import DuckDuckGoSearchResults
|
from langchain_community.tools import DuckDuckGoSearchResults
|
||||||
from googlesearch import search as google_search
|
from googlesearch import search as google_search
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
    """
    Search the web for a query and return the URLs of the results.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use, options include
            'Google', 'DuckDuckGo', or 'Bing' (matched case-insensitively). Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
        ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
        requests.exceptions.RequestException: If the Bing request times out, fails,
            or comes back with an HTTP error status (Bing only).

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """
    # Normalise once instead of lower-casing in every comparison.
    engine = search_engine.lower()

    if engine == "google":
        return list(google_search(query, stop=max_results))

    if engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The tool answers with one formatted string; extract the URLs from it.
        return re.findall(r'https?://[^\s,\]]+', res)

    if engine == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Passing the query through `params` lets requests URL-encode it, so
        # spaces, '&', '#', etc. cannot corrupt the request URL. The timeout
        # keeps a stalled connection from blocking the caller forever.
        response = requests.get(
            "https://www.bing.com/search",
            params={"q": query},
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo'):
            if len(search_results) >= max_results:
                break
            # Skip result items that carry no usable link instead of crashing
            # on `None['href']` or a missing attribute.
            anchor = result.find('a', href=True)
            if anchor is not None:
                search_results.append(anchor['href'])
        return search_results

    raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
|
||||||
|
|||||||
28
tests/utils/research_web_test.py
Normal file
28
tests/utils/research_web_test.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
import pytest
|
||||||
|
from scrapegraphai.utils.research_web import search_on_web # Replace with actual path to your file
|
||||||
|
|
||||||
|
|
||||||
|
def test_google_search():
    """Tests search_on_web with Google search engine.

    Runs a real query through search_on_web and checks both the number of
    results and that each one is a string, as the List[str] return type promises.
    """
    results = search_on_web("test query", search_engine="Google", max_results=2)
    assert len(results) == 2
    # Count alone would also pass for a list of non-URL objects.
    assert all(isinstance(url, str) for url in results)
|
||||||
|
|
||||||
|
def test_bing_search():
    """Tests search_on_web with Bing search engine.

    Checks the shape of the answer: a list holding at most `max_results`
    link strings.
    """
    results = search_on_web("test query", search_engine="Bing", max_results=1)
    # search_on_web either returns a list or raises, so `is not None` could
    # never fail; assert the actual contract instead.
    assert isinstance(results, list)
    assert len(results) <= 1
    assert all(isinstance(link, str) for link in results)
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_search_engine():
    """An unsupported search engine name must make search_on_web raise ValueError."""
    unsupported_call = dict(search_engine="Yahoo", max_results=5)
    with pytest.raises(ValueError):
        search_on_web("test query", **unsupported_call)
|
||||||
|
|
||||||
|
|
||||||
|
def test_max_results():
    """A smaller max_results must never yield more results than a larger one."""
    # Query with the default engine at both limits, smaller limit first.
    sizes = [len(search_on_web("test query", max_results=cap)) for cap in (5, 10)]
    assert sizes[0] <= sizes[1]
|
||||||
Loading…
Reference in New Issue
Block a user