feat(smart-scraper-multi): add schema to graphs and created SmartScraperMultiGraph

This commit is contained in:
Marco Perini 2024-05-21 13:13:27 +02:00
parent 5701afe927
commit fc58e2d3a6
35 changed files with 401 additions and 172 deletions

View File

@ -0,0 +1,134 @@
"""
Example of custom graph for creating a knowledge graph
"""
import os, json
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from scrapegraphai.models import OpenAI
from scrapegraphai.graphs import BaseGraph, SmartScraperGraph
from scrapegraphai.nodes import GraphIteratorNode, MergeAnswersNode, KnowledgeGraphNode
# Load environment variables; the script reads OPENAI_APIKEY from them below.
load_dotenv()

# ************************************************
# Output schema passed to the merge-answers node
# ************************************************

schema= """{
"Job Postings": {
"Company x": [
{
"title": "...",
"description": "...",
"location": "...",
"date_posted": "..",
"requirements": ["...", "...", "..."]
},
{
"title": "...",
"description": "...",
"location": "...",
"date_posted": "..",
"requirements": ["...", "...", "..."]
}
],
"Company y": [
{
"title": "...",
"description": "...",
"location": "...",
"date_posted": "..",
"requirements": ["...", "...", "..."]
}
]
}
}"""

# ************************************************
# Graph configuration
# ************************************************

llm_settings = {
    "api_key": os.getenv("OPENAI_APIKEY"),
    "model": "gpt-4o",
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Model clients and the per-URL scraper graph
# ************************************************

llm_model = OpenAI(llm_settings)
embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)

# Prompt and source are left empty here; this instance is only handed to the
# iterator node as its "graph_instance".
smart_scraper_instance = SmartScraperGraph(prompt="", source="", config=graph_config)

# ************************************************
# Graph nodes, listed in execution order
# ************************************************

iterate_over_urls = GraphIteratorNode(
    input="user_prompt & urls",
    output=["results"],
    node_config={"graph_instance": smart_scraper_instance},
)

merge_results = MergeAnswersNode(
    input="user_prompt & results",
    output=["answer"],
    node_config={"llm_model": llm_model, "schema": schema},
)

build_knowledge_graph = KnowledgeGraphNode(
    input="user_prompt & answer",
    output=["kg"],
    node_config={"llm_model": llm_model},
)

pipeline_nodes = [iterate_over_urls, merge_results, build_knowledge_graph]

# The pipeline is linear, so each node is connected to the one that follows it.
graph = BaseGraph(
    nodes=pipeline_nodes,
    edges=list(zip(pipeline_nodes, pipeline_nodes[1:])),
    entry_point=pipeline_nodes[0],
)

# ************************************************
# Execute the graph
# ************************************************

initial_state = {
    "user_prompt": "List me all the Machine Learning Engineer job postings",
    "urls": [
        "https://www.linkedin.com/jobs/machine-learning-engineer-offerte-di-lavoro/?currentJobId=3889037104&originalSubdomain=it",
        "https://www.glassdoor.com/Job/italy-machine-learning-engineer-jobs-SRCH_IL.0,5_IN120_KO6,31.html",
        "https://it.indeed.com/jobs?q=ML+engineer&vjk=3c2e6d27601ffaaa",
    ],
}
final_state, execution_info = graph.execute(initial_state)

# Print the merged answer, or a placeholder message when the key is absent.
answer = final_state.get("answer", "No answer found.")
print(json.dumps(answer, indent=4))

View File

@ -46,7 +46,7 @@ robot_node = RobotsNode(
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
output=["doc", "link_urls", "img_urls"],
node_config={
"verbose": True,
"headless": True,

View File

@ -1,79 +0,0 @@
"""
Basic example of a multi-URL scraping pipeline using MultipleSearchGraph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import MultipleSearchGraph
from scrapegraphai.utils import prettify_exec_info
# Load environment variables; the script reads OPENAI_APIKEY from them below.
load_dotenv()

# ************************************************
# Define the output schema
# ************************************************

schema= """{
"Job Postings": {
"Company x": [
{
"title": "...",
"description": "...",
"location": "...",
"date_posted": "..",
"requirements": ["...", "...", "..."]
},
{
"title": "...",
"description": "...",
"location": "...",
"date_posted": "..",
"requirements": ["...", "...", "..."]
}
],
"Company y": [
{
"title": "...",
"description": "...",
"location": "...",
"date_posted": "..",
"requirements": ["...", "...", "..."]
}
]
}
}"""

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-4o",
    },
    "verbose": True,
    "headless": False,
    "schema": schema,
}

# ************************************************
# Create the MultipleSearchGraph instance and run it
# ************************************************

multiple_search_graph = MultipleSearchGraph(
    # The prompt must match the job-posting schema and the job-board URLs
    # below; the earlier "projects" prompt was a copy-paste leftover.
    prompt="List me all the Machine Learning Engineer job postings",
    source=[
        "https://www.linkedin.com/jobs/machine-learning-engineer-offerte-di-lavoro/?currentJobId=3889037104&originalSubdomain=it",
        "https://www.glassdoor.com/Job/italy-machine-learning-engineer-jobs-SRCH_IL.0,5_IN120_KO6,31.html",
        "https://it.indeed.com/jobs?q=ML+engineer&vjk=3c2e6d27601ffaaa",
    ],
    config=graph_config,
)

result = multiple_search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = multiple_search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -19,7 +19,7 @@ openai_key = os.getenv("OPENAI_APIKEY")
graph_config = {
"llm": {
"api_key": openai_key,
"model": "gpt-4-turbo",
"model": "gpt-4o",
},
"verbose": True,
"headless": True,

View File

@ -20,7 +20,7 @@ graph_config = {
"model": "gpt-4o",
},
"max_results": 2,
"max_images": 5,
"max_images": 1,
"verbose": True,
}

View File

@ -0,0 +1,41 @@
"""
Basic example of a multi-URL scraping pipeline using SmartScraperMultiGraph
"""
import os, json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperMultiGraph
# Load environment variables; the script reads OPENAI_APIKEY from them below.
load_dotenv()

# ************************************************
# Graph configuration
# ************************************************

llm_settings = {
    "api_key": os.getenv("OPENAI_APIKEY"),
    "model": "gpt-4o",
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
}

# *******************************************************
# Build the SmartScraperMultiGraph over the URLs and run it
# *******************************************************

target_urls = [
    "https://perinim.github.io/",
    "https://perinim.github.io/cv/",
]

multiple_search_graph = SmartScraperMultiGraph(
    prompt="Who is Marco Perini?",
    source=target_urls,
    schema=None,  # no output schema is supplied in this example
    config=graph_config,
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))

View File

@ -0,0 +1,59 @@
"""
Basic example of scraping pipeline using SmartScraper
"""
import os, json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
# Load environment variables; the script reads OPENAI_APIKEY from them below.
load_dotenv()

# ************************************************
# Define the output schema for the graph
# ************************************************

# The schema is passed to the graph as a plain string. It is kept as
# well-formed JSON (an array of objects, no trailing commas); the previous
# hint placed "key": value pairs directly inside an array, which is not
# valid JSON.
schema = """
{
    "Projects": [
        {
            "title": "...",
            "description": "..."
        },
        {
            "title": "...",
            "description": "..."
        }
    ]
}
"""

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=schema,
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

View File

@ -45,6 +45,10 @@ certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
colorama==0.4.6
# via ipython
# via pytest
# via tqdm
dataclasses-json==0.6.6
# via langchain
# via langchain-community
@ -100,6 +104,7 @@ graphviz==0.20.3
# via scrapegraphai
greenlet==3.0.3
# via playwright
# via sqlalchemy
groq==0.5.0
# via langchain-groq
grpcio==1.63.0
@ -212,8 +217,6 @@ pandas==2.2.2
# via scrapegraphai
parso==0.8.4
# via jedi
pexpect==4.9.0
# via ipython
playwright==1.43.0
# via scrapegraphai
pluggy==1.5.0
@ -230,8 +233,6 @@ protobuf==4.25.3
# via googleapis-common-protos
# via grpcio-status
# via proto-plus
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pyasn1==0.6.0

View File

@ -45,6 +45,9 @@ certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
colorama==0.4.6
# via ipython
# via tqdm
dataclasses-json==0.6.6
# via langchain
# via langchain-community
@ -99,6 +102,7 @@ graphviz==0.20.3
# via scrapegraphai
greenlet==3.0.3
# via playwright
# via sqlalchemy
groq==0.5.0
# via langchain-groq
grpcio==1.63.0
@ -208,8 +212,6 @@ pandas==2.2.2
# via scrapegraphai
parso==0.8.4
# via jedi
pexpect==4.9.0
# via ipython
playwright==1.43.0
# via scrapegraphai
prompt-toolkit==3.0.43
@ -224,8 +226,6 @@ protobuf==4.25.3
# via googleapis-common-protos
# via grpcio-status
# via proto-plus
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pyasn1==0.6.0

View File

@ -15,4 +15,4 @@ from .csv_scraper_graph import CSVScraperGraph
from .pdf_scraper_graph import PDFScraperGraph
from .omni_scraper_graph import OmniScraperGraph
from .omni_search_graph import OmniSearchGraph
from .multiple_search_graph import MultipleSearchGraph
from .smart_scraper_multi_graph import SmartScraperMultiGraph

View File

@ -7,10 +7,11 @@ from langchain_aws import BedrockEmbeddings
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from ..helpers import models_tokens
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from ..helpers import models_tokens
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek
class AbstractGraph(ABC):
"""
@ -19,6 +20,7 @@ class AbstractGraph(ABC):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -29,6 +31,7 @@ class AbstractGraph(ABC):
prompt (str): The prompt for the graph.
config (dict): Configuration parameters for the graph.
source (str, optional): The source of the graph.
schema (str, optional): The schema for the graph output.
Example:
>>> class MyGraph(AbstractGraph):
@ -40,11 +43,12 @@ class AbstractGraph(ABC):
>>> result = my_graph.run()
"""
def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[str] = None):
self.prompt = prompt
self.source = source
self.config = config
self.schema = schema
self.llm_model = self._create_llm(config["llm"], chat=True)
self.embedder_model = self._create_default_embedder(llm_config=config["llm"]
) if "embeddings" not in config else self._create_embedder(
@ -61,14 +65,14 @@ class AbstractGraph(ABC):
self.headless = True if config is None else config.get(
"headless", True)
self.loader_kwargs = config.get("loader_kwargs", {})
self.schema = config.get("schema", None)
common_params = {"headless": self.headless,
"verbose": self.verbose,
"loader_kwargs": self.loader_kwargs,
"llm_model": self.llm_model,
"embedder_model": self.embedder_model,
"schema": self.schema}
common_params = {
"headless": self.headless,
"verbose": self.verbose,
"loader_kwargs": self.loader_kwargs,
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
self.set_common_params(common_params, overwrite=False)

View File

@ -1,14 +1,18 @@
"""
Module for creating the CSV scraper graph
"""
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerCSVNode
)
from .abstract_graph import AbstractGraph
class CSVScraperGraph(AbstractGraph):
@ -17,11 +21,11 @@ class CSVScraperGraph(AbstractGraph):
information from web pages using a natural language model to interpret and answer prompts.
"""
def __init__(self, prompt: str, source: str, config: dict):
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
"""
Initializes the CSVScraperGraph with a prompt, source, and configuration.
"""
super().__init__(prompt, config, source)
super().__init__(prompt, config, source, schema)
self.input_key = "csv" if source.endswith("csv") else "csv_dir"
@ -53,6 +57,7 @@ class CSVScraperGraph(AbstractGraph):
output=["answer"],
node_config={
"llm_model": self.llm_model,
"schema": self.schema,
}
)

View File

@ -2,7 +2,11 @@
DeepScraperGraph Module
"""
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
SearchLinkNode,
@ -12,7 +16,6 @@ from ..nodes import (
GraphIteratorNode,
MergeAnswersNode
)
from .abstract_graph import AbstractGraph
class DeepScraperGraph(AbstractGraph):
@ -30,15 +33,19 @@ class DeepScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
headless (bool): A flag indicating whether to run the graph in headless mode.
Args:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
Example:
>>> deep_scraper = DeepScraperGraph(
... "List me all the job titles and detailed job description.",
@ -49,8 +56,10 @@ class DeepScraperGraph(AbstractGraph):
)
"""
def __init__(self, prompt: str, source: str, config: dict):
super().__init__(prompt, config, source)
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
super().__init__(prompt, config, source, schema)
self.input_key = "url" if source.startswith("http") else "local_dir"
def _create_repeated_graph(self) -> BaseGraph:
@ -84,7 +93,8 @@ class DeepScraperGraph(AbstractGraph):
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm_model": self.llm_model
"llm_model": self.llm_model,
"schema": self.schema
}
)
search_node = SearchLinkNode(
@ -108,6 +118,7 @@ class DeepScraperGraph(AbstractGraph):
output=["answer"],
node_config={
"llm_model": self.llm_model,
"schema": self.schema
}
)

View File

@ -2,14 +2,17 @@
JSONScraperGraph Module
"""
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerNode
)
from .abstract_graph import AbstractGraph
class JSONScraperGraph(AbstractGraph):
@ -20,6 +23,7 @@ class JSONScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -30,6 +34,7 @@ class JSONScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
Example:
>>> json_scraper = JSONScraperGraph(
@ -40,8 +45,8 @@ class JSONScraperGraph(AbstractGraph):
>>> result = json_scraper.run()
"""
def __init__(self, prompt: str, source: str, config: dict):
super().__init__(prompt, config, source)
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
super().__init__(prompt, config, source, schema)
self.input_key = "json" if source.endswith("json") else "json_dir"
@ -76,7 +81,8 @@ class JSONScraperGraph(AbstractGraph):
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm_model": self.llm_model
"llm_model": self.llm_model,
"schema": self.schema
}
)

View File

@ -2,7 +2,11 @@
OmniScraperGraph Module
"""
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
@ -10,8 +14,8 @@ from ..nodes import (
RAGNode,
GenerateAnswerOmniNode
)
from scrapegraphai.models import OpenAIImageToText
from .abstract_graph import AbstractGraph
from ..models import OpenAIImageToText
class OmniScraperGraph(AbstractGraph):
@ -24,6 +28,7 @@ class OmniScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -35,6 +40,7 @@ class OmniScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
Example:
>>> omni_scraper = OmniScraperGraph(
@ -46,11 +52,11 @@ class OmniScraperGraph(AbstractGraph):
)
"""
def __init__(self, prompt: str, source: str, config: dict):
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
self.max_images = 5 if config is None else config.get("max_images", 5)
super().__init__(prompt, config, source)
super().__init__(prompt, config, source, schema)
self.input_key = "url" if source.startswith("http") else "local_dir"
@ -96,7 +102,8 @@ class OmniScraperGraph(AbstractGraph):
input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
output=["answer"],
node_config={
"llm_model": self.llm_model
"llm_model": self.llm_model,
"schema": self.schema
}
)

View File

@ -3,15 +3,17 @@ OmniSearchGraph Module
"""
from copy import copy, deepcopy
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .omni_scraper_graph import OmniScraperGraph
from ..nodes import (
SearchInternetNode,
GraphIteratorNode,
MergeAnswersNode
)
from .abstract_graph import AbstractGraph
from .omni_scraper_graph import OmniScraperGraph
class OmniSearchGraph(AbstractGraph):
@ -31,6 +33,7 @@ class OmniSearchGraph(AbstractGraph):
Args:
prompt (str): The user prompt to search the internet.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
Example:
>>> omni_search_graph = OmniSearchGraph(
@ -40,7 +43,7 @@ class OmniSearchGraph(AbstractGraph):
>>> result = search_graph.run()
"""
def __init__(self, prompt: str, config: dict):
def __init__(self, prompt: str, config: dict, schema: Optional[str] = None):
self.max_results = config.get("max_results", 3)
@ -49,7 +52,7 @@ class OmniSearchGraph(AbstractGraph):
else:
self.copy_config = deepcopy(config)
super().__init__(prompt, config)
# Pass schema by keyword: AbstractGraph.__init__ is declared as
# (prompt, config, source=None, schema=None), so a third positional argument
# would be bound to `source` and self.schema would stay None.
super().__init__(prompt, config, schema=schema)
def _create_graph(self) -> BaseGraph:
"""
@ -94,6 +97,7 @@ class OmniSearchGraph(AbstractGraph):
output=["answer"],
node_config={
"llm_model": self.llm_model,
"schema": self.schema
}
)

View File

@ -2,14 +2,17 @@
PDFScraperGraph Module
"""
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerNode
)
from .abstract_graph import AbstractGraph
class PDFScraperGraph(AbstractGraph):
@ -21,6 +24,7 @@ class PDFScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -32,6 +36,7 @@ class PDFScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
Example:
>>> pdf_scraper = PDFScraperGraph(
@ -42,8 +47,8 @@ class PDFScraperGraph(AbstractGraph):
>>> result = pdf_scraper.run()
"""
def __init__(self, prompt: str, source: str, config: dict):
super().__init__(prompt, config, source)
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
super().__init__(prompt, config, source, schema)
self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir"
@ -79,6 +84,7 @@ class PDFScraperGraph(AbstractGraph):
output=["answer"],
node_config={
"llm_model": self.llm_model,
"schema": self.schema,
}
)

View File

@ -2,13 +2,16 @@
ScriptCreatorGraph Module
"""
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
GenerateScraperNode
)
from .abstract_graph import AbstractGraph
class ScriptCreatorGraph(AbstractGraph):
@ -19,6 +22,7 @@ class ScriptCreatorGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -31,6 +35,7 @@ class ScriptCreatorGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
Example:
>>> script_creator = ScriptCreatorGraph(
@ -41,11 +46,11 @@ class ScriptCreatorGraph(AbstractGraph):
>>> result = script_creator.run()
"""
def __init__(self, prompt: str, source: str, config: dict):
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
self.library = config['library']
super().__init__(prompt, config, source)
super().__init__(prompt, config, source, schema)
self.input_key = "url" if source.startswith("http") else "local_dir"
@ -65,14 +70,16 @@ class ScriptCreatorGraph(AbstractGraph):
input="doc",
output=["parsed_doc"],
node_config={"chunk_size": self.model_token,
"verbose": self.verbose,
"parse_html": False
}
)
generate_scraper_node = GenerateScraperNode(
input="user_prompt & (doc)",
output=["answer"],
node_config={"llm_model": self.llm_model},
node_config={
"llm_model": self.llm_model,
"schema": self.schema,
},
library=self.library,
website=self.source
)

View File

@ -3,15 +3,17 @@ SearchGraph Module
"""
from copy import copy, deepcopy
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .smart_scraper_graph import SmartScraperGraph
from ..nodes import (
SearchInternetNode,
GraphIteratorNode,
MergeAnswersNode
)
from .abstract_graph import AbstractGraph
from .smart_scraper_graph import SmartScraperGraph
class SearchGraph(AbstractGraph):
@ -30,6 +32,7 @@ class SearchGraph(AbstractGraph):
Args:
prompt (str): The user prompt to search the internet.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
Example:
>>> search_graph = SearchGraph(
@ -39,7 +42,7 @@ class SearchGraph(AbstractGraph):
>>> result = search_graph.run()
"""
def __init__(self, prompt: str, config: dict):
def __init__(self, prompt: str, config: dict, schema: Optional[str] = None):
self.max_results = config.get("max_results", 3)
@ -48,7 +51,7 @@ class SearchGraph(AbstractGraph):
else:
self.copy_config = deepcopy(config)
super().__init__(prompt, config)
# Pass schema by keyword: AbstractGraph.__init__ is declared as
# (prompt, config, source=None, schema=None), so a third positional argument
# would be bound to `source` and self.schema would stay None.
super().__init__(prompt, config, schema=schema)
def _create_graph(self) -> BaseGraph:
"""
@ -93,6 +96,7 @@ class SearchGraph(AbstractGraph):
output=["answer"],
node_config={
"llm_model": self.llm_model,
"schema": self.schema
}
)

View File

@ -2,14 +2,17 @@
SmartScraperGraph Module
"""
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerNode
)
from .abstract_graph import AbstractGraph
class SmartScraperGraph(AbstractGraph):
@ -22,6 +25,7 @@ class SmartScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -32,6 +36,7 @@ class SmartScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
Example:
>>> smart_scraper = SmartScraperGraph(
@ -43,8 +48,8 @@ class SmartScraperGraph(AbstractGraph):
)
"""
def __init__(self, prompt: str, source: str, config: dict):
super().__init__(prompt, config, source)
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
super().__init__(prompt, config, source, schema)
self.input_key = "url" if source.startswith("http") else "local_dir"
@ -82,7 +87,7 @@ class SmartScraperGraph(AbstractGraph):
output=["answer"],
node_config={
"llm_model": self.llm_model,
"schema": self.config.get("schema", None),
"schema": self.schema,
}
)

View File

@ -1,25 +1,25 @@
"""
MultipleSearchGraph Module
SmartScraperMultiGraph Module
"""
from copy import copy, deepcopy
from typing import List, Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .smart_scraper_graph import SmartScraperGraph
from ..nodes import (
GraphIteratorNode,
MergeAnswersNode,
KnowledgeGraphNode
)
from .abstract_graph import AbstractGraph
from .smart_scraper_graph import SmartScraperGraph
from typing import List, Optional
class MultipleSearchGraph(AbstractGraph):
class SmartScraperMultiGraph(AbstractGraph):
"""
MultipleSearchGraph is a scraping pipeline that searches the internet for answers to a given prompt.
It only requires a user prompt to search the internet and generate an answer.
SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt.
It only requires a user prompt and a list of URLs.
Attributes:
prompt (str): The user prompt to search the internet.
@ -31,7 +31,9 @@ class MultipleSearchGraph(AbstractGraph):
Args:
prompt (str): The user prompt to search the internet.
source (List[str]): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (Optional[str]): The schema for the graph output.
Example:
>>> search_graph = SmartScraperMultiGraph(
@ -41,7 +43,7 @@ class MultipleSearchGraph(AbstractGraph):
>>> result = search_graph.run()
"""
def __init__(self, prompt: str, source: List[str], config: dict):
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
self.max_results = config.get("max_results", 3)
@ -50,7 +52,7 @@ class MultipleSearchGraph(AbstractGraph):
else:
self.copy_config = deepcopy(config)
super().__init__(prompt, config, source)
super().__init__(prompt, config, source, schema)
def _create_graph(self) -> BaseGraph:
"""
@ -87,15 +89,7 @@ class MultipleSearchGraph(AbstractGraph):
output=["answer"],
node_config={
"llm_model": self.llm_model,
"schema": self.config.get("schema", None),
}
)
knowledge_graph_node = KnowledgeGraphNode(
input="user_prompt & answer",
output=["kg"],
node_config={
"llm_model": self.llm_model,
"schema": self.schema
}
)
@ -103,11 +97,9 @@ class MultipleSearchGraph(AbstractGraph):
nodes=[
graph_iterator_node,
merge_answers_node,
knowledge_graph_node
],
edges=[
(graph_iterator_node, merge_answers_node),
(merge_answers_node, knowledge_graph_node)
],
entry_point=graph_iterator_node
)

View File

@ -2,9 +2,11 @@
SpeechGraph Module
"""
from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes
from ..models import OpenAITextToSpeech
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
@ -12,7 +14,9 @@ from ..nodes import (
GenerateAnswerNode,
TextToSpeechNode,
)
from .abstract_graph import AbstractGraph
from ..utils.save_audio_from_bytes import save_audio_from_bytes
from ..models import OpenAITextToSpeech
class SpeechGraph(AbstractGraph):
@ -23,6 +27,7 @@ class SpeechGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client, configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
@ -33,6 +38,7 @@ class SpeechGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
Example:
>>> speech_graph = SpeechGraph(
@ -41,8 +47,8 @@ class SpeechGraph(AbstractGraph):
... {"llm": {"model": "gpt-3.5-turbo"}}
"""
def __init__(self, prompt: str, source: str, config: dict):
super().__init__(prompt, config, source)
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
super().__init__(prompt, config, source, schema)
self.input_key = "url" if source.startswith("http") else "local_dir"
@ -76,7 +82,8 @@ class SpeechGraph(AbstractGraph):
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm_model": self.llm_model
"llm_model": self.llm_model,
"schema": self.schema
}
)
text_to_speech_node = TextToSpeechNode(

View File

@ -2,14 +2,17 @@
XMLScraperGraph Module
"""
from typing import Optional
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerNode
)
from .abstract_graph import AbstractGraph
class XMLScraperGraph(AbstractGraph):
@ -21,6 +24,7 @@ class XMLScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
@ -32,6 +36,7 @@ class XMLScraperGraph(AbstractGraph):
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (str): The schema for the graph output.
Example:
>>> xml_scraper = XMLScraperGraph(
@ -42,8 +47,8 @@ class XMLScraperGraph(AbstractGraph):
>>> result = xml_scraper.run()
"""
def __init__(self, prompt: str, source: str, config: dict):
super().__init__(prompt, config, source)
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
super().__init__(prompt, config, source, schema)
self.input_key = "xml" if source.endswith("xml") else "xml_dir"
@ -78,7 +83,8 @@ class XMLScraperGraph(AbstractGraph):
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm_model": self.llm_model
"llm_model": self.llm_model,
"schema": self.schema
}
)

View File

@ -1,6 +1,7 @@
"""
Module for implementing the conditional node
"""
from .base_node import BaseNode

View File

@ -10,10 +10,9 @@ from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableParallel
from ..helpers.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv
# Imports from the library
from .base_node import BaseNode
from ..helpers.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv
class GenerateAnswerCSVNode(BaseNode):

View File

@ -15,6 +15,7 @@ from langchain_core.runnables import RunnableParallel
from .base_node import BaseNode
from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_with_schema, template_no_chunks_with_schema
class GenerateAnswerNode(BaseNode):
"""
A node that generates an answer using a large language model (LLM) based on the user's input

View File

@ -15,6 +15,7 @@ from langchain_core.runnables import RunnableParallel
from .base_node import BaseNode
from ..helpers.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni
class GenerateAnswerOmniNode(BaseNode):
"""
A node that generates an answer using a large language model (LLM) based on the user's input

View File

@ -14,6 +14,7 @@ from langchain_core.runnables import RunnableParallel
from .base_node import BaseNode
from ..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf
class GenerateAnswerPDFNode(BaseNode):
"""
A node that generates an answer using a language model (LLM) based on the user's input

View File

@ -10,7 +10,6 @@ from tqdm.asyncio import tqdm
from .base_node import BaseNode
_default_batchsize = 16

View File

@ -14,6 +14,7 @@ from langchain_core.output_parsers import JsonOutputParser
from .base_node import BaseNode
from ..utils import create_graph, create_interactive_graph
class KnowledgeGraphNode(BaseNode):
"""
A node responsible for generating a knowledge graph from a dictionary.

View File

@ -3,8 +3,10 @@ ParseNode Module
"""
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_transformers import Html2TextTransformer
from .base_node import BaseNode

View File

@ -3,6 +3,7 @@ RAGNode Module
"""
from typing import List, Optional
from langchain.docstore.document import Document
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline

View File

@ -4,9 +4,11 @@ RobotsNode Module
from typing import List, Optional
from urllib.parse import urlparse
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain.prompts import PromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser
from .base_node import BaseNode
from ..helpers import robots_dictionary

View File

@ -3,8 +3,10 @@ SearchInternetNode Module
"""
from typing import List, Optional
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from ..utils.research_web import search_on_web
from .base_node import BaseNode

View File

@ -6,7 +6,6 @@ SearchLinkNode Module
from typing import List, Optional
from tqdm import tqdm
# Imports from Langchain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser