mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
Merge branch 'pre/beta' into main
This commit is contained in:
commit
0dfa5bf07e
@ -39,6 +39,7 @@
|
||||
## [1.7.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.6.1...v1.7.0) (2024-06-17)
|
||||
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add caching ([d790361](https://github.com/VinciGit00/Scrapegraph-ai/commit/d79036149a3197a385b73553f29df66d36480c38))
|
||||
@ -136,6 +137,7 @@
|
||||
* **release:** 1.7.0-beta.8 [skip ci] ([a87702f](https://github.com/VinciGit00/Scrapegraph-ai/commit/a87702f107f3fd16ee73e1af1585cd763788bf46))
|
||||
* **release:** 1.7.0-beta.9 [skip ci] ([0c5d6e2](https://github.com/VinciGit00/Scrapegraph-ai/commit/0c5d6e2c82b9ee81c91cd2325948bb5a4eddcb31))
|
||||
|
||||
|
||||
## [1.7.0-beta.12](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.11...v1.7.0-beta.12) (2024-06-17)
|
||||
|
||||
|
||||
|
||||
@ -2,32 +2,31 @@
|
||||
Basic example of scraping pipeline using SmartScraper with schema
|
||||
"""
|
||||
|
||||
import os, json
|
||||
import json
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from pydantic import BaseModel
|
||||
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Define the output schema for the graph
|
||||
# ************************************************
|
||||
|
||||
schema= """
|
||||
{
|
||||
"Projects": [
|
||||
"Project #":
|
||||
{
|
||||
"title": "...",
|
||||
"description": "...",
|
||||
},
|
||||
"Project #":
|
||||
{
|
||||
"title": "...",
|
||||
"description": "...",
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
class Project(BaseModel):
    """One project entry: a title together with its description."""

    title: str
    description: str


class Projects(BaseModel):
    """Output schema handed to the graph: project entries keyed by a string."""

    Projects: Dict[str, Project]
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
@ -37,7 +36,7 @@ openai_key = os.getenv("OPENAI_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key":openai_key,
|
||||
"api_key": openai_key,
|
||||
"model": "gpt-3.5-turbo",
|
||||
},
|
||||
"verbose": True,
|
||||
@ -51,8 +50,8 @@ graph_config = {
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the projects with their description",
|
||||
source="https://perinim.github.io/projects/",
|
||||
schema=schema,
|
||||
config=graph_config
|
||||
schema=Projects,
|
||||
config=graph_config,
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
|
||||
@ -4,6 +4,9 @@ Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from typing import Dict
|
||||
|
||||
from pydantic import BaseModel
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
from langchain_community.llms import HuggingFaceEndpoint
|
||||
@ -13,22 +16,12 @@ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
|
||||
# Define the output schema for the graph
|
||||
# ************************************************
|
||||
|
||||
schema= """
|
||||
{
|
||||
"Projects": [
|
||||
"Project #":
|
||||
{
|
||||
"title": "...",
|
||||
"description": "...",
|
||||
},
|
||||
"Project #":
|
||||
{
|
||||
"title": "...",
|
||||
"description": "...",
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
class Project(BaseModel):
    """One project entry: a title together with its description."""

    title: str
    description: str


class Projects(BaseModel):
    """Output schema handed to the graph: project entries keyed by a string."""

    Projects: Dict[str, Project]
|
||||
|
||||
## required environment variable in .env
|
||||
#HUGGINGFACEHUB_API_TOKEN
|
||||
@ -61,7 +54,7 @@ graph_config = {
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the projects with their description",
|
||||
source="https://perinim.github.io/projects/",
|
||||
schema=schema,
|
||||
schema=Projects,
|
||||
config=graph_config
|
||||
)
|
||||
result = smart_scraper_graph.run()
|
||||
|
||||
@ -2,8 +2,13 @@
|
||||
Basic example of scraping pipeline using SmartScraper with schema
|
||||
"""
|
||||
|
||||
import os, json
|
||||
import json
|
||||
import os
|
||||
from typing import Dict, List
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from pydantic import BaseModel
|
||||
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
@ -13,22 +18,12 @@ load_dotenv()
|
||||
# Define the output schema for the graph
|
||||
# ************************************************
|
||||
|
||||
schema= """
|
||||
{
|
||||
"Projects": [
|
||||
"Project #":
|
||||
{
|
||||
"title": "...",
|
||||
"description": "...",
|
||||
},
|
||||
"Project #":
|
||||
{
|
||||
"title": "...",
|
||||
"description": "...",
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
class Project(BaseModel):
    """One project entry: a title together with its description."""

    title: str
    description: str


class Projects(BaseModel):
    """Output schema handed to the graph: project entries keyed by a string."""

    Projects: Dict[str, Project]
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
@ -60,7 +55,7 @@ smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the projects with their description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://perinim.github.io/projects/",
|
||||
schema=schema,
|
||||
schema=Projects,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
|
||||
50
examples/single_node/search_internet_node.py
Normal file
50
examples/single_node/search_internet_node.py
Normal file
@ -0,0 +1,50 @@
|
||||
"""
|
||||
Example of custom graph using existing nodes
|
||||
"""
|
||||
|
||||
from scrapegraphai.models import Ollama
|
||||
from scrapegraphai.nodes import SearchInternetNode
|
||||
|
||||
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "llama3",
        "temperature": 0,
        "streaming": True,
    },
    "search_engine": "google",
    "max_results": 3,
    "verbose": True,
}

# ************************************************
# Define the node
# ************************************************

llm_model = Ollama(graph_config["llm"])

# The node receives the model plus the search-related settings copied
# straight from graph_config.
search_node = SearchInternetNode(
    input="user_input",
    output=["search_results"],
    node_config={
        "llm_model": llm_model,
        **{
            key: graph_config[key]
            for key in ("search_engine", "max_results", "verbose")
        },
    },
)

# ************************************************
# Test the node
# ************************************************

state = {"user_input": "What is the capital of France?"}

result = search_node.execute(state)

print(result)
|
||||
@ -39,7 +39,7 @@ class AbstractGraph(ABC):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client,
|
||||
configured for generating embeddings.
|
||||
|
||||
@ -5,6 +5,8 @@ CSVScraperMultiGraph Module
|
||||
from copy import copy, deepcopy
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .csv_scraper_graph import CSVScraperGraph
|
||||
@ -32,7 +34,7 @@ class CSVScraperMultiGraph(AbstractGraph):
|
||||
prompt (str): The user prompt to search the internet.
|
||||
source (List[str]): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (Optional[str]): The schema for the graph output.
|
||||
schema (Optional[BaseModel]): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> search_graph = MultipleSearchGraph(
|
||||
@ -42,7 +44,7 @@ class CSVScraperMultiGraph(AbstractGraph):
|
||||
>>> result = search_graph.run()
|
||||
"""
|
||||
|
||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
|
||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
||||
|
||||
self.max_results = config.get("max_results", 3)
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ class DeepScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client,
|
||||
configured for generating embeddings.
|
||||
@ -45,7 +45,7 @@ class DeepScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> deep_scraper = DeepScraperGraph(
|
||||
|
||||
@ -23,7 +23,7 @@ class JSONScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client,
|
||||
configured for generating embeddings.
|
||||
@ -34,7 +34,7 @@ class JSONScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> json_scraper = JSONScraperGraph(
|
||||
|
||||
@ -33,7 +33,7 @@ class JSONScraperMultiGraph(AbstractGraph):
|
||||
prompt (str): The user prompt to search the internet.
|
||||
source (List[str]): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (Optional[str]): The schema for the graph output.
|
||||
schema (Optional[BaseModel]): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> search_graph = MultipleSearchGraph(
|
||||
|
||||
@ -29,7 +29,7 @@ class OmniScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client,
|
||||
configured for generating embeddings.
|
||||
@ -41,7 +41,7 @@ class OmniScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> omni_scraper = OmniScraperGraph(
|
||||
|
||||
@ -34,7 +34,7 @@ class OmniSearchGraph(AbstractGraph):
|
||||
Args:
|
||||
prompt (str): The user prompt to search the internet.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (Optional[str]): The schema for the graph output.
|
||||
schema (Optional[BaseModel]): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> omni_search_graph = OmniSearchGraph(
|
||||
|
||||
@ -26,7 +26,7 @@ class PDFScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client,
|
||||
configured for generating embeddings.
|
||||
@ -38,7 +38,7 @@ class PDFScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> pdf_scraper = PDFScraperGraph(
|
||||
|
||||
@ -34,7 +34,7 @@ class PdfScraperMultiGraph(AbstractGraph):
|
||||
prompt (str): The user prompt to search the internet.
|
||||
source (List[str]): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (Optional[str]): The schema for the graph output.
|
||||
schema (Optional[BaseModel]): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> search_graph = MultipleSearchGraph(
|
||||
|
||||
@ -23,7 +23,7 @@ class ScriptCreatorGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client,
|
||||
configured for generating embeddings.
|
||||
@ -36,7 +36,7 @@ class ScriptCreatorGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> script_creator = ScriptCreatorGraph(
|
||||
|
||||
@ -5,6 +5,8 @@ ScriptCreatorMultiGraph Module
|
||||
from copy import copy, deepcopy
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .script_creator_graph import ScriptCreatorGraph
|
||||
@ -30,7 +32,7 @@ class ScriptCreatorMultiGraph(AbstractGraph):
|
||||
prompt (str): The user prompt to search the internet.
|
||||
source (List[str]): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (Optional[str]): The schema for the graph output.
|
||||
schema (Optional[BaseModel]): The schema for the graph output.
|
||||
Example:
|
||||
>>> script_graph = ScriptCreatorMultiGraph(
|
||||
... "What is Chioggia famous for?",
|
||||
@ -41,7 +43,7 @@ class ScriptCreatorMultiGraph(AbstractGraph):
|
||||
>>> result = script_graph.run()
|
||||
"""
|
||||
|
||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
|
||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
||||
|
||||
self.max_results = config.get("max_results", 3)
|
||||
|
||||
|
||||
@ -35,7 +35,7 @@ class SearchGraph(AbstractGraph):
|
||||
Args:
|
||||
prompt (str): The user prompt to search the internet.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (Optional[str]): The schema for the graph output.
|
||||
schema (Optional[BaseModel]): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> search_graph = SearchGraph(
|
||||
|
||||
@ -26,7 +26,7 @@ class SmartScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client,
|
||||
configured for generating embeddings.
|
||||
@ -37,7 +37,7 @@ class SmartScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> smart_scraper = SmartScraperGraph(
|
||||
|
||||
@ -33,7 +33,7 @@ class SmartScraperMultiGraph(AbstractGraph):
|
||||
prompt (str): The user prompt to search the internet.
|
||||
source (List[str]): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (Optional[str]): The schema for the graph output.
|
||||
schema (Optional[BaseModel]): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> search_graph = MultipleSearchGraph(
|
||||
|
||||
@ -28,7 +28,7 @@ class SpeechGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client, configured for generating embeddings.
|
||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||
@ -39,7 +39,7 @@ class SpeechGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> speech_graph = SpeechGraph(
|
||||
|
||||
@ -24,7 +24,7 @@ class XMLScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
llm_model: An instance of a language model client, configured for generating answers.
|
||||
embedder_model: An instance of an embedding model client,
|
||||
configured for generating embeddings.
|
||||
@ -36,7 +36,7 @@ class XMLScraperGraph(AbstractGraph):
|
||||
prompt (str): The prompt for the graph.
|
||||
source (str): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (str): The schema for the graph output.
|
||||
schema (BaseModel): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> xml_scraper = XMLScraperGraph(
|
||||
|
||||
@ -34,7 +34,7 @@ class XMLScraperMultiGraph(AbstractGraph):
|
||||
prompt (str): The user prompt to search the internet.
|
||||
source (List[str]): The source of the graph.
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (Optional[str]): The schema for the graph output.
|
||||
schema (Optional[BaseModel]): The schema for the graph output.
|
||||
|
||||
Example:
|
||||
>>> search_graph = MultipleSearchGraph(
|
||||
|
||||
@ -43,6 +43,7 @@ class SearchInternetNode(BaseNode):
|
||||
self.verbose = (
|
||||
False if node_config is None else node_config.get("verbose", False)
|
||||
)
|
||||
self.search_engine = node_config.get("search_engine", "google")
|
||||
self.max_results = node_config.get("max_results", 3)
|
||||
|
||||
def execute(self, state: dict) -> dict:
|
||||
@ -97,7 +98,8 @@ class SearchInternetNode(BaseNode):
|
||||
|
||||
self.logger.info(f"Search Query: {search_query}")
|
||||
|
||||
answer = search_on_web(query=search_query, max_results=self.max_results)
|
||||
answer = search_on_web(query=search_query, max_results=self.max_results,
|
||||
search_engine=self.search_engine)
|
||||
|
||||
if len(answer) == 0:
|
||||
# raise an exception if no answer is found
|
||||
|
||||
@ -1,11 +1,12 @@
|
||||
"""
|
||||
Module for making the request on the web
|
||||
research web module
|
||||
"""
|
||||
import re
|
||||
from typing import List
|
||||
from langchain_community.tools import DuckDuckGoSearchResults
|
||||
from googlesearch import search as google_search
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
    """
    Searches the web for a given query using the specified search engine.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use, options include
            'Google', 'DuckDuckGo', or 'Bing' (matched case-insensitively). Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
        ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
        requests.RequestException: If the Bing request fails, times out, or returns
            an error status.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]

    This function allows switching between Google, DuckDuckGo, and Bing to perform
    internet searches, returning a list of result URLs.
    """

    # Normalise once so every branch compares against the same value.
    engine = search_engine.lower()

    if engine == "google":
        return list(google_search(query, stop=max_results))

    if engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The tool returns one text blob; pull the URLs out of it.
        return re.findall(r'https?://[^\s,\]]+', res)

    if engine == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Pass the query through `params` so requests URL-encodes it; a raw
        # f-string would break on '&', '#', '+' and similar characters.
        # The timeout keeps a stalled connection from hanging the caller.
        response = requests.get(
            "https://www.bing.com/search",
            params={"q": query},
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            # Skip result items that carry no usable link instead of crashing.
            anchor = result.find('a', href=True)
            if anchor is not None:
                search_results.append(anchor['href'])
        return search_results

    raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
|
||||
|
||||
58
tests/nodes/search_internet_node_test.py
Normal file
58
tests/nodes/search_internet_node_test.py
Normal file
@ -0,0 +1,58 @@
|
||||
import unittest
|
||||
from scrapegraphai.models import Ollama
|
||||
from scrapegraphai.nodes import SearchInternetNode
|
||||
|
||||
class TestSearchInternetNode(unittest.TestCase):
    """Exercises SearchInternetNode.execute end to end.

    The node performs a live web search, so the exact URLs returned are not
    stable. The test therefore checks the structure of the output rather than
    a fixed list of links.
    """

    def setUp(self):
        # Configuration for the graph
        self.graph_config = {
            "llm": {
                "model": "llama3",
                "temperature": 0,
                "streaming": True
            },
            "search_engine": "google",
            "max_results": 3,
            "verbose": True
        }

        # Define the model
        self.llm_model = Ollama(self.graph_config["llm"])

        # Initialize the SearchInternetNode
        self.search_node = SearchInternetNode(
            input="user_input",
            output=["search_results"],
            node_config={
                "llm_model": self.llm_model,
                "search_engine": self.graph_config["search_engine"],
                "max_results": self.graph_config["max_results"],
                "verbose": self.graph_config["verbose"]
            }
        )

    def test_execute_search_node(self):
        # Initial state
        question = "What is the capital of France?"
        state = {"user_input": question}

        # Execute the node
        result = self.search_node.execute(state)

        # The input must be carried through untouched.
        self.assertEqual(result["user_input"], question)

        # Live search results vary between runs, so validate their shape
        # instead of comparing against hard-coded URLs.
        search_results = result["search_results"]
        self.assertIsInstance(search_results, list)
        self.assertGreater(len(search_results), 0)
        self.assertLessEqual(len(search_results), self.graph_config["max_results"])
        for url in search_results:
            self.assertIsInstance(url, str)
            self.assertTrue(url.startswith(("http://", "https://")))
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
28
tests/utils/research_web_test.py
Normal file
28
tests/utils/research_web_test.py
Normal file
@ -0,0 +1,28 @@
|
||||
import pytest
|
||||
from scrapegraphai.utils.research_web import search_on_web # Replace with actual path to your file
|
||||
|
||||
|
||||
def test_google_search():
    """Tests search_on_web with Google search engine."""
    results = search_on_web("test query", search_engine="Google", max_results=2)
    # max_results is an upper bound, so a live search may return fewer links.
    assert 0 < len(results) <= 2
    # Every entry must be a URL string.
    assert all(
        isinstance(url, str) and url.startswith(("http://", "https://"))
        for url in results
    )
|
||||
|
||||
def test_bing_search():
    """Tests search_on_web with Bing search engine."""
    results = search_on_web("test query", search_engine="Bing", max_results=1)
    # The function always returns a list, so `is not None` proves nothing;
    # check the type, the size limit and the shape of each entry instead.
    assert isinstance(results, list)
    assert len(results) <= 1
    assert all(
        isinstance(url, str) and url.startswith(("http://", "https://"))
        for url in results
    )
|
||||
|
||||
|
||||
def test_invalid_search_engine():
    """An unsupported engine name must be rejected with ValueError."""
    unsupported_engine = "Yahoo"
    with pytest.raises(ValueError):
        search_on_web("test query", search_engine=unsupported_engine, max_results=5)
|
||||
|
||||
|
||||
def test_max_results():
    """Tests search_on_web with different max_results values."""
    results_5 = search_on_web("test query", max_results=5)
    results_10 = search_on_web("test query", max_results=10)
    # Each call must respect its own limit...
    assert len(results_5) <= 5
    assert len(results_10) <= 10
    # ...and asking for more must never yield fewer results.
    assert len(results_5) <= len(results_10)
|
||||
Loading…
Reference in New Issue
Block a user