mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
feat: revert search function
This commit is contained in:
parent
b2720a452f
commit
faf0c0123b
@ -41,19 +41,11 @@ class SearchInternetNode(BaseNode):
|
|||||||
self.verbose = (
|
self.verbose = (
|
||||||
False if node_config is None else node_config.get("verbose", False)
|
False if node_config is None else node_config.get("verbose", False)
|
||||||
)
|
)
|
||||||
self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
|
|
||||||
self.search_engine = (
|
self.search_engine = (
|
||||||
node_config["search_engine"]
|
node_config["search_engine"]
|
||||||
if node_config.get("search_engine")
|
if node_config.get("search_engine")
|
||||||
else "google"
|
else "google"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.serper_api_key = (
|
|
||||||
node_config["serper_api_key"]
|
|
||||||
if node_config.get("serper_api_key")
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
|
|
||||||
self.max_results = node_config.get("max_results", 3)
|
self.max_results = node_config.get("max_results", 3)
|
||||||
|
|
||||||
def execute(self, state: dict) -> dict:
|
def execute(self, state: dict) -> dict:
|
||||||
@ -102,7 +94,7 @@ class SearchInternetNode(BaseNode):
|
|||||||
self.logger.info(f"Search Query: {search_query}")
|
self.logger.info(f"Search Query: {search_query}")
|
||||||
|
|
||||||
answer = search_on_web(query=search_query, max_results=self.max_results,
|
answer = search_on_web(query=search_query, max_results=self.max_results,
|
||||||
search_engine=self.search_engine, proxy=self.proxy, serper_api_key=self.serper_api_key)
|
search_engine=self.search_engine)
|
||||||
|
|
||||||
if len(answer) == 0:
|
if len(answer) == 0:
|
||||||
raise ValueError("Zero results found for the search query.")
|
raise ValueError("Zero results found for the search query.")
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
research_web module
|
Research_web module
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -7,123 +7,67 @@ from langchain_community.tools import DuckDuckGoSearchResults
|
|||||||
from googlesearch import search as google_search
|
from googlesearch import search as google_search
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import json
|
|
||||||
|
|
||||||
def search_on_web(query: str, search_engine: str = "Google",
|
def search_on_web(query: str, search_engine: str = "Google",
|
||||||
max_results: int = 10, port: int = 8080,
|
max_results: int = 10, port: int = 8080) -> List[str]:
|
||||||
timeout: int = 10, proxy: str | dict = None,
|
|
||||||
serper_api_key: str = None) -> List[str]:
|
|
||||||
"""Search web function with improved error handling and validation"""
|
|
||||||
|
|
||||||
# Input validation
|
|
||||||
if not query or not isinstance(query, str):
|
|
||||||
raise ValueError("Query must be a non-empty string")
|
|
||||||
|
|
||||||
search_engine = search_engine.lower()
|
|
||||||
valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
|
|
||||||
if search_engine not in valid_engines:
|
|
||||||
raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
|
|
||||||
|
|
||||||
# Format proxy once
|
|
||||||
formatted_proxy = None
|
|
||||||
if proxy:
|
|
||||||
formatted_proxy = format_proxy(proxy)
|
|
||||||
|
|
||||||
try:
|
|
||||||
results = []
|
|
||||||
if search_engine == "google":
|
|
||||||
results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))
|
|
||||||
|
|
||||||
elif search_engine == "duckduckgo":
|
|
||||||
research = DuckDuckGoSearchResults(max_results=max_results)
|
|
||||||
res = research.run(query)
|
|
||||||
results = re.findall(r'https?://[^\s,\]]+', res)
|
|
||||||
|
|
||||||
elif search_engine == "bing":
|
|
||||||
results = _search_bing(query, max_results, timeout, formatted_proxy)
|
|
||||||
|
|
||||||
elif search_engine == "searxng":
|
|
||||||
results = _search_searxng(query, max_results, port, timeout)
|
|
||||||
|
|
||||||
elif search_engine.lower() == "serper":
|
|
||||||
results = _search_serper(query, max_results, serper_api_key, timeout)
|
|
||||||
|
|
||||||
return filter_pdf_links(results)
|
|
||||||
|
|
||||||
except requests.Timeout:
|
|
||||||
raise TimeoutError(f"Search request timed out after {timeout} seconds")
|
|
||||||
except requests.RequestException as e:
|
|
||||||
raise RuntimeError(f"Search request failed: {str(e)}")
|
|
||||||
|
|
||||||
def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
    """Scrape the Bing results page for *query* and return the result links.

    Args:
        query: Search terms. Sent as the ``q`` query parameter so that
            ``requests`` URL-encodes it (spaces, ``&``, ``#`` ...).
        max_results: Maximum number of ``li.b_algo`` entries to inspect.
        timeout: Timeout in seconds passed to ``requests.get``.
        proxy: Optional proxy URL applied to both HTTP and HTTPS traffic.

    Returns:
        The ``href`` of the first anchor of each inspected result entry.
        Entries without an anchor or without an ``href`` are skipped.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    proxies = {"http": proxy, "https": proxy} if proxy else None

    # Let requests build the query string instead of interpolating the raw
    # query into the URL, which broke on reserved characters.
    response = requests.get(
        "https://www.bing.com/search",
        params={"q": query},
        headers=headers,
        timeout=timeout,
        proxies=proxies,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    for entry in soup.find_all("li", class_="b_algo", limit=max_results):
        anchor = entry.find("a")
        # A result entry without a link must not abort the whole search.
        href = anchor.get("href") if anchor is not None else None
        if href:
            links.append(href)
    return links
|
|
||||||
|
|
||||||
def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
    """Query a SearXNG instance on localhost and return the result URLs.

    Args:
        query: Search terms sent as the ``q`` parameter.
        max_results: Maximum number of results to keep from the response.
        port: Port of the SearXNG server on ``localhost``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The ``url`` field of the first ``max_results`` entries of the
        ``results`` list in the JSON response (empty if that key is absent).

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
    """
    response = requests.get(
        f"http://localhost:{port}",
        params={
            "q": query,
            "format": "json",
            "engines": "google,duckduckgo,brave,qwant,bing",
        },
        timeout=timeout,
    )
    response.raise_for_status()

    hits = response.json().get("results", [])
    urls = []
    for hit in hits[:max_results]:
        urls.append(hit["url"])
    return urls
|
|
||||||
|
|
||||||
def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
    """Run a search through the Serper API and return the organic result links.

    Args:
        query: Search terms sent as ``q`` in the JSON payload.
        max_results: Number of results requested via the ``num`` field.
        serper_api_key: API key sent in the ``X-API-KEY`` header.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The ``link`` of every entry in the response's ``organic`` list.
        Entries without a link are skipped, so the list never contains
        ``None``.

    Raises:
        ValueError: If ``serper_api_key`` is empty or ``None``.
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
    """
    if not serper_api_key:
        raise ValueError("API key is required for serper api.")

    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "num": max_results
    })
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    response.raise_for_status()

    # Drop entries that lack a link: a None in the result list would crash
    # any downstream string handling of the URLs.
    return [
        result["link"]
        for result in response.json().get("organic", [])
        if result.get("link")
    ]
|
|
||||||
|
|
||||||
|
|
||||||
def format_proxy(proxy):
    """Normalise a proxy specification into a single proxy URL string.

    Args:
        proxy: Either a ready-to-use proxy URL string, which is returned
            unchanged, or a dict with the keys ``server``, ``username`` and
            ``password``.

    Returns:
        For a dict, ``"<scheme>://<username>:<password>@<host>"``. The scheme
        is taken from ``server`` when it contains ``://`` and defaults to
        ``http`` otherwise. Username and password are percent-encoded.

    Raises:
        ValueError: If a dict is missing (or has an empty) ``server``,
            ``username`` or ``password``.
        TypeError: If ``proxy`` is neither a dict nor a string.
    """
    # Local import: the module's top-level imports do not include urllib.
    from urllib.parse import quote

    if isinstance(proxy, str):
        return proxy  # "https://username:password@ip:port"

    if not isinstance(proxy, dict):
        raise TypeError("Proxy should be a dictionary or a string.")

    server = proxy.get('server')
    username = proxy.get('username')
    password = proxy.get('password')
    if not all([username, password, server]):
        raise ValueError("Proxy dictionary is missing required fields.")

    # Keep a scheme already present in ``server`` instead of stacking a
    # second "http://" in front of it.
    scheme, separator, host = str(server).partition("://")
    if not separator:
        scheme, host = "http", str(server)

    # Reserved characters such as "@" or ":" in the credentials would
    # otherwise be parsed as URL delimiters.
    credentials = f"{quote(str(username), safe='')}:{quote(str(password), safe='')}"
    return f"{scheme}://{credentials}@{host}"
|
|
||||||
|
|
||||||
def filter_pdf_links(links: List[str]) -> List[str]:
|
|
||||||
"""
|
"""
|
||||||
Filters out any links that point to PDF files.
|
Searches the web for a given query using specified search engine options.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
links (List[str]): A list of URLs as strings.
|
query (str): The search query to find on the internet.
|
||||||
|
search_engine (str, optional): Specifies the search engine to use,
|
||||||
|
options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
|
||||||
|
max_results (int, optional): The maximum number of search results to return.
|
||||||
|
port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[str]: A list of URLs excluding any that end with '.pdf'.
|
List[str]: A list of URLs as strings that are the search results.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the search engine specified is not supported.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> search_on_web("example query", search_engine="Google", max_results=5)
|
||||||
|
['http://example.com', 'http://example.org', ...]
|
||||||
"""
|
"""
|
||||||
return [link for link in links if not link.lower().endswith('.pdf')]
|
|
||||||
|
if search_engine.lower() == "google":
|
||||||
|
res = []
|
||||||
|
for url in google_search(query, num_results=max_results):
|
||||||
|
res.append(url)
|
||||||
|
return res
|
||||||
|
|
||||||
|
elif search_engine.lower() == "duckduckgo":
|
||||||
|
research = DuckDuckGoSearchResults(max_results=max_results)
|
||||||
|
res = research.run(query)
|
||||||
|
links = re.findall(r'https?://[^\s,\]]+', res)
|
||||||
|
return links
|
||||||
|
|
||||||
|
elif search_engine.lower() == "bing":
|
||||||
|
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||||
|
}
|
||||||
|
search_url = f"https://www.bing.com/search?q={query}"
|
||||||
|
response = requests.get(search_url, headers=headers)
|
||||||
|
response.raise_for_status()
|
||||||
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
|
search_results = []
|
||||||
|
for result in soup.find_all('li', class_='b_algo', limit=max_results):
|
||||||
|
link = result.find('a')['href']
|
||||||
|
search_results.append(link)
|
||||||
|
return search_results
|
||||||
|
|
||||||
|
elif search_engine.lower() == "searxng":
|
||||||
|
url = f"http://localhost:{port}"
|
||||||
|
params = {"q": query, "format": "json"}
|
||||||
|
|
||||||
|
# Send the GET request to the server
|
||||||
|
response = requests.get(url, params=params)
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
limited_results = data["results"][:max_results]
|
||||||
|
return limited_results
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
|
||||||
Loading…
Reference in New Issue
Block a user