Merge branch 'pre/beta' into main

This commit is contained in:
Marco Vinciguerra 2024-06-25 10:47:15 +02:00 committed by GitHub
commit 0dfa5bf07e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 239 additions and 93 deletions

View File

@ -39,6 +39,7 @@
## [1.7.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.6.1...v1.7.0) (2024-06-17)
### Features
* add caching ([d790361](https://github.com/VinciGit00/Scrapegraph-ai/commit/d79036149a3197a385b73553f29df66d36480c38))
@ -136,6 +137,7 @@
* **release:** 1.7.0-beta.8 [skip ci] ([a87702f](https://github.com/VinciGit00/Scrapegraph-ai/commit/a87702f107f3fd16ee73e1af1585cd763788bf46))
* **release:** 1.7.0-beta.9 [skip ci] ([0c5d6e2](https://github.com/VinciGit00/Scrapegraph-ai/commit/0c5d6e2c82b9ee81c91cd2325948bb5a4eddcb31))
## [1.7.0-beta.12](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.11...v1.7.0-beta.12) (2024-06-17)

View File

@ -2,32 +2,31 @@
Basic example of scraping pipeline using SmartScraper with schema
"""
import os, json
import json
import os
from typing import Dict
from dotenv import load_dotenv
from pydantic import BaseModel
from scrapegraphai.graphs import SmartScraperGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
schema= """
{
"Projects": [
"Project #":
{
"title": "...",
"description": "...",
},
"Project #":
{
"title": "...",
"description": "...",
}
]
}
"""
# One project entry: a title and a description, both plain strings.
class Project(BaseModel):
    title: str
    description: str

# Root output schema (passed to SmartScraperGraph as `schema=Projects`):
# a single "Projects" field mapping string keys to Project entries.
class Projects(BaseModel):
    Projects: Dict[str, Project]
# ************************************************
# Define the configuration for the graph
@ -37,7 +36,7 @@ openai_key = os.getenv("OPENAI_APIKEY")
graph_config = {
"llm": {
"api_key":openai_key,
"api_key": openai_key,
"model": "gpt-3.5-turbo",
},
"verbose": True,
@ -51,8 +50,8 @@ graph_config = {
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the projects with their description",
source="https://perinim.github.io/projects/",
schema=schema,
config=graph_config
schema=Projects,
config=graph_config,
)
result = smart_scraper_graph.run()

View File

@ -4,6 +4,9 @@ Basic example of scraping pipeline using SmartScraper using Azure OpenAI Key
import os
from dotenv import load_dotenv
from typing import Dict
from pydantic import BaseModel
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
@ -13,22 +16,12 @@ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
# Define the output schema for the graph
# ************************************************
schema= """
{
"Projects": [
"Project #":
{
"title": "...",
"description": "...",
},
"Project #":
{
"title": "...",
"description": "...",
}
]
}
"""
# One project entry: a title and a description, both plain strings.
class Project(BaseModel):
    title: str
    description: str

# Root output schema (passed to SmartScraperGraph as `schema=Projects`):
# a single "Projects" field mapping string keys to Project entries.
class Projects(BaseModel):
    Projects: Dict[str, Project]
## required environment variable in .env
#HUGGINGFACEHUB_API_TOKEN
@ -61,7 +54,7 @@ graph_config = {
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the projects with their description",
source="https://perinim.github.io/projects/",
schema=schema,
schema=Projects,
config=graph_config
)
result = smart_scraper_graph.run()

View File

@ -2,8 +2,13 @@
Basic example of scraping pipeline using SmartScraper with schema
"""
import os, json
import json
import os
from typing import Dict, List
from dotenv import load_dotenv
from pydantic import BaseModel
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
@ -13,22 +18,12 @@ load_dotenv()
# Define the output schema for the graph
# ************************************************
schema= """
{
"Projects": [
"Project #":
{
"title": "...",
"description": "...",
},
"Project #":
{
"title": "...",
"description": "...",
}
]
}
"""
# One project entry: a title and a description, both plain strings.
class Project(BaseModel):
    title: str
    description: str

# Root output schema (passed to SmartScraperGraph as `schema=Projects`):
# a single "Projects" field mapping string keys to Project entries.
class Projects(BaseModel):
    Projects: Dict[str, Project]
# ************************************************
# Define the configuration for the graph
@ -60,7 +55,7 @@ smart_scraper_graph = SmartScraperGraph(
prompt="List me all the projects with their description.",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects/",
schema=schema,
schema=Projects,
config=graph_config
)

View File

@ -0,0 +1,50 @@
"""
Example of running a single SearchInternetNode on its own
"""
from scrapegraphai.models import Ollama
from scrapegraphai.nodes import SearchInternetNode
# ************************************************
# Settings for the LLM and for the web search
# ************************************************

graph_config = {
    "llm": {"model": "llama3", "temperature": 0, "streaming": True},
    "search_engine": "google",
    "max_results": 3,
    "verbose": True,
}

# ************************************************
# Build the LLM client and the search node
# ************************************************

llm_model = Ollama(graph_config["llm"])

search_node = SearchInternetNode(
    input="user_input",
    output=["search_results"],
    node_config={
        "llm_model": llm_model,
        # Forward the search-related options unchanged from graph_config.
        **{
            option: graph_config[option]
            for option in ("search_engine", "max_results", "verbose")
        },
    },
)

# ************************************************
# Run the node once and show what it returns
# ************************************************

state = {"user_input": "What is the capital of France?"}

result = search_node.execute(state)

print(result)

View File

@ -39,7 +39,7 @@ class AbstractGraph(ABC):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.

View File

@ -5,6 +5,8 @@ CSVScraperMultiGraph Module
from copy import copy, deepcopy
from typing import List, Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .csv_scraper_graph import CSVScraperGraph
@ -32,7 +34,7 @@ class CSVScraperMultiGraph(AbstractGraph):
prompt (str): The user prompt to search the internet.
source (List[str]): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
schema (Optional[BaseModel]): The schema for the graph output.
Example:
>>> search_graph = CSVScraperMultiGraph(
@ -42,7 +44,7 @@ class CSVScraperMultiGraph(AbstractGraph):
>>> result = search_graph.run()
"""
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
self.max_results = config.get("max_results", 3)

View File

@ -34,7 +34,7 @@ class DeepScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -45,7 +45,7 @@ class DeepScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
Example:
>>> deep_scraper = DeepScraperGraph(

View File

@ -23,7 +23,7 @@ class JSONScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -34,7 +34,7 @@ class JSONScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
Example:
>>> json_scraper = JSONScraperGraph(

View File

@ -33,7 +33,7 @@ class JSONScraperMultiGraph(AbstractGraph):
prompt (str): The user prompt to search the internet.
source (List[str]): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
schema (Optional[BaseModel]): The schema for the graph output.
Example:
>>> search_graph = JSONScraperMultiGraph(

View File

@ -29,7 +29,7 @@ class OmniScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -41,7 +41,7 @@ class OmniScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
Example:
>>> omni_scraper = OmniScraperGraph(

View File

@ -34,7 +34,7 @@ class OmniSearchGraph(AbstractGraph):
Args:
prompt (str): The user prompt to search the internet.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
schema (Optional[BaseModel]): The schema for the graph output.
Example:
>>> omni_search_graph = OmniSearchGraph(

View File

@ -26,7 +26,7 @@ class PDFScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -38,7 +38,7 @@ class PDFScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
Example:
>>> pdf_scraper = PDFScraperGraph(

View File

@ -34,7 +34,7 @@ class PdfScraperMultiGraph(AbstractGraph):
prompt (str): The user prompt to search the internet.
source (List[str]): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
schema (Optional[BaseModel]): The schema for the graph output.
Example:
>>> search_graph = PdfScraperMultiGraph(

View File

@ -23,7 +23,7 @@ class ScriptCreatorGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -36,7 +36,7 @@ class ScriptCreatorGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
Example:
>>> script_creator = ScriptCreatorGraph(

View File

@ -5,6 +5,8 @@ ScriptCreatorMultiGraph Module
from copy import copy, deepcopy
from typing import List, Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .script_creator_graph import ScriptCreatorGraph
@ -30,7 +32,7 @@ class ScriptCreatorMultiGraph(AbstractGraph):
prompt (str): The user prompt to search the internet.
source (List[str]): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
schema (Optional[BaseModel]): The schema for the graph output.
Example:
>>> script_graph = ScriptCreatorMultiGraph(
... "What is Chioggia famous for?",
@ -41,7 +43,7 @@ class ScriptCreatorMultiGraph(AbstractGraph):
>>> result = script_graph.run()
"""
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
self.max_results = config.get("max_results", 3)

View File

@ -35,7 +35,7 @@ class SearchGraph(AbstractGraph):
Args:
prompt (str): The user prompt to search the internet.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
schema (Optional[BaseModel]): The schema for the graph output.
Example:
>>> search_graph = SearchGraph(

View File

@ -26,7 +26,7 @@ class SmartScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -37,7 +37,7 @@ class SmartScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
Example:
>>> smart_scraper = SmartScraperGraph(

View File

@ -33,7 +33,7 @@ class SmartScraperMultiGraph(AbstractGraph):
prompt (str): The user prompt to search the internet.
source (List[str]): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
schema (Optional[BaseModel]): The schema for the graph output.
Example:
>>> search_graph = SmartScraperMultiGraph(

View File

@ -28,7 +28,7 @@ class SpeechGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
@ -39,7 +39,7 @@ class SpeechGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
Example:
>>> speech_graph = SpeechGraph(

View File

@ -24,7 +24,7 @@ class XMLScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -36,7 +36,7 @@ class XMLScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
schema (BaseModel): The schema for the graph output.
Example:
>>> xml_scraper = XMLScraperGraph(

View File

@ -34,7 +34,7 @@ class XMLScraperMultiGraph(AbstractGraph):
prompt (str): The user prompt to search the internet.
source (List[str]): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
schema (Optional[BaseModel]): The schema for the graph output.
Example:
>>> search_graph = XMLScraperMultiGraph(

View File

@ -43,6 +43,7 @@ class SearchInternetNode(BaseNode):
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
self.search_engine = node_config.get("search_engine", "google")
self.max_results = node_config.get("max_results", 3)
def execute(self, state: dict) -> dict:
@ -97,7 +98,8 @@ class SearchInternetNode(BaseNode):
self.logger.info(f"Search Query: {search_query}")
answer = search_on_web(query=search_query, max_results=self.max_results)
answer = search_on_web(query=search_query, max_results=self.max_results,
search_engine=self.search_engine)
if len(answer) == 0:
# raise an exception if no answer is found

View File

@ -1,11 +1,12 @@
"""
Module for making the request on the web
research web module
"""
import re
from typing import List
from langchain_community.tools import DuckDuckGoSearchResults
from googlesearch import search as google_search
import requests
from bs4 import BeautifulSoup
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
    """
    Search the web for a query and return the URLs of the results.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use, matched
            case-insensitively. Options include 'Google', 'DuckDuckGo', or 'Bing'.
            Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
        ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
        requests.RequestException: If the Bing request times out or Bing answers
            with an HTTP error status.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """
    engine = search_engine.lower()

    if engine == "google":
        return list(google_search(query, stop=max_results))

    if engine == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        # The tool returns one text blob; pull every URL out of it.
        return re.findall(r'https?://[^\s,\]]+', res)

    if engine == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Pass the query through `params` so spaces, '&', '#', etc. are
        # URL-encoded instead of corrupting the request URL; the timeout keeps
        # a stalled connection from blocking the caller forever.
        response = requests.get(
            "https://www.bing.com/search",
            params={"q": query},
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            # Skip result items that carry no usable link rather than crashing
            # on a missing anchor.
            anchor = result.find('a', href=True)
            if anchor is not None:
                search_results.append(anchor['href'])
        return search_results

    raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")

View File

@ -0,0 +1,58 @@
import unittest
from scrapegraphai.models import Ollama
from scrapegraphai.nodes import SearchInternetNode
class TestSearchInternetNode(unittest.TestCase):
    """
    Integration test for SearchInternetNode driven by an Ollama model.

    NOTE(review): presumably requires a reachable Ollama "llama3" model and
    internet access, since the node performs a real web search - confirm
    before running in CI.
    """

    def setUp(self):
        """Build the LLM client and the SearchInternetNode under test."""
        # Configuration for the graph
        self.graph_config = {
            "llm": {
                "model": "llama3",
                "temperature": 0,
                "streaming": True
            },
            "search_engine": "google",
            "max_results": 3,
            "verbose": True
        }

        # Define the model
        self.llm_model = Ollama(self.graph_config["llm"])

        # Initialize the SearchInternetNode
        self.search_node = SearchInternetNode(
            input="user_input",
            output=["search_results"],
            node_config={
                "llm_model": self.llm_model,
                "search_engine": self.graph_config["search_engine"],
                "max_results": self.graph_config["max_results"],
                "verbose": self.graph_config["verbose"]
            }
        )

    def test_execute_search_node(self):
        """The node keeps the user input and adds a bounded list of result URLs."""
        question = "What is the capital of France?"
        state = {"user_input": question}

        # Execute the node
        result = self.search_node.execute(state)

        # Live search results change over time, so check the shape of the
        # output instead of comparing against hard-coded URLs.
        self.assertEqual(set(result), {"user_input", "search_results"})
        self.assertEqual(result["user_input"], question)

        search_results = result["search_results"]
        self.assertIsInstance(search_results, list)
        self.assertGreater(len(search_results), 0)
        self.assertLessEqual(len(search_results), self.graph_config["max_results"])
        for url in search_results:
            self.assertIsInstance(url, str)
            self.assertTrue(url.startswith(("http://", "https://")))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,28 @@
import pytest
from scrapegraphai.utils.research_web import search_on_web # Replace with actual path to your file
def test_google_search():
    """Google engine: asking for two results yields exactly two."""
    wanted = 2
    urls = search_on_web("test query", search_engine="Google", max_results=wanted)
    assert len(urls) == wanted
    # Possible extension: check that the hits relate to 'test query'
    # (title/snippet inspection would need additional libraries).
def test_bing_search():
    """Tests search_on_web with Bing search engine."""
    results = search_on_web("test query", search_engine="Bing", max_results=1)
    # `results is not None` can never fail (the function returns a list or
    # raises), so check the actual contract: a list of at most `max_results`
    # absolute URLs.
    assert isinstance(results, list)
    assert len(results) <= 1
    assert all(
        isinstance(url, str) and url.startswith(("http://", "https://"))
        for url in results
    )
def test_invalid_search_engine():
    """An unsupported engine name must be rejected with ValueError."""
    unsupported_engine = "Yahoo"
    with pytest.raises(ValueError):
        search_on_web("test query", search_engine=unsupported_engine, max_results=5)
def test_max_results():
    """A larger max_results never yields fewer results than a smaller one."""
    # The generator runs the max_results=5 query first, then max_results=10.
    smaller, larger = (
        search_on_web("test query", max_results=limit) for limit in (5, 10)
    )
    assert len(smaller) <= len(larger)