mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-04 21:00:36 +08:00
75 lines
2.7 KiB
Python
75 lines
2.7 KiB
Python
"""
Research_web module: helpers for searching the web through different search engines.
"""
|
|
import re
|
|
from typing import List
|
|
from langchain_community.tools import DuckDuckGoSearchResults
|
|
from googlesearch import search as google_search
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
def search_on_web(query: str, search_engine: str = "Google",
                  max_results: int = 10, port: int = 8080,
                  timeout: float = 10) -> List[str]:
    """
    Searches the web for a given query using specified search engine options.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use,
            options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'
            (matched case-insensitively). Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.
        port (int, optional): The port number to use when searching with 'SearXNG'
            (the instance is expected at http://localhost:<port>). Default is 8080.
        timeout (float, optional): Seconds to wait for the HTTP response when
            searching with 'Bing' or 'SearXNG'. Default is 10.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
        ValueError: If the search engine specified is not supported.
        requests.exceptions.RequestException: If the HTTP request made for
            'Bing' or 'SearXNG' fails, times out, or returns an error status.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """
    # Normalise once so every comparison below is case-insensitive.
    engine = search_engine.lower()

    if engine == "google":
        return list(google_search(query, stop=max_results))

    if engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The tool returns one text blob; pull every http(s) link out of it.
        return re.findall(r'https?://[^\s,\]]+', res)

    if engine == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Pass the query through `params` so requests URL-encodes it; pasting
        # it into the URL breaks on characters such as '&', '#' or '+'.
        response = requests.get("https://www.bing.com/search",
                                params={"q": query},
                                headers=headers,
                                timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            # Skip result items that carry no usable link instead of crashing.
            anchor = result.find('a', href=True)
            if anchor is not None:
                search_results.append(anchor['href'])
        return search_results

    if engine == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json"}

        response = requests.get(url, params=params, timeout=timeout)
        # Fail with a clear HTTP error rather than a JSON decoding error when
        # the instance rejects the request.
        response.raise_for_status()

        data = response.json()
        # Return only the URL of each entry so the result matches the
        # documented List[str] contract, like the other engines.
        return [result["url"] for result in data["results"][:max_results]]

    raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
|