mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
feat: refactoring search function
This commit is contained in:
parent
2abe05a0cf
commit
aeb1acbf05
@ -41,7 +41,8 @@ free-proxy = "1.1.1"
|
||||
langchain-groq = "0.1.3"
|
||||
playwright = "^1.43.0"
|
||||
langchain-aws = "^0.1.2"
|
||||
|
||||
langchain-anthropic = "^0.1.11"
|
||||
yahoo-search-py = "^0.3"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "8.0.0"
|
||||
|
||||
@ -15,3 +15,5 @@ free-proxy==1.1.1
|
||||
langchain-groq==0.1.3
|
||||
playwright==1.43.0
|
||||
langchain-aws==0.1.2
|
||||
langchain-anthropic==0.1.11
|
||||
yahoo-search-py==0.3
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
"""
|
||||
"""
|
||||
Module for making the request on the web
|
||||
"""
|
||||
import re
|
||||
from typing import List
|
||||
from langchain_community.tools import DuckDuckGoSearchResults
|
||||
from googlesearch import search
|
||||
from googlesearch import search as google_search
|
||||
from yahoo_search import search as yahoo_search
|
||||
|
||||
|
||||
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
|
||||
@ -29,18 +30,29 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
|
||||
This function allows switching between Google, DuckDuckGo, and Yahoo to perform internet searches, returning a list of result URLs.
|
||||
"""
|
||||
|
||||
if search_engine == "Google":
|
||||
if search_engine.lower() == "google":
|
||||
res = []
|
||||
|
||||
for url in search(query, stop=max_results):
|
||||
for url in google_search(query, stop=max_results):
|
||||
res.append(url)
|
||||
return res
|
||||
elif search_engine == "DuckDuckGo":
|
||||
elif search_engine.lower() == "duckduckgo":
|
||||
research = DuckDuckGoSearchResults(max_results=max_results)
|
||||
res = research.run(query)
|
||||
|
||||
links = re.findall(r'https?://[^\s,\]]+', res)
|
||||
|
||||
return links
|
||||
elif search_engine.lower() == "yahoo":
|
||||
list_result = yahoo_search(query)
|
||||
results = []
|
||||
for page in list_result.pages:
|
||||
if len(results) >= max_results: # Check if max_results has already been reached
|
||||
break # Exit loop if max_results has been reached
|
||||
try:
|
||||
results.append(page.link)
|
||||
except AttributeError:
|
||||
continue
|
||||
return results
|
||||
raise ValueError(
|
||||
"The only search engines avaiable are DuckDuckGo or Google")
|
||||
"The only search engines available are DuckDuckGo or Google")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user