mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
feat(node): multiple url search in SearchGraph + fixes
Implemented GraphIteratorNode and MergeAnswersNode to create multiple instances of a graph and merge the scraped content from multiple pages.
This commit is contained in:
parent
dbb614a8dd
commit
930adb38f2
@ -45,6 +45,7 @@ search_internet_node = SearchInternetNode(
|
|||||||
output=["urls"],
|
output=["urls"],
|
||||||
node_config={
|
node_config={
|
||||||
"llm_model": llm_model,
|
"llm_model": llm_model,
|
||||||
|
"max_results": 5, # num of search results to fetch
|
||||||
"verbose": True,
|
"verbose": True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@ -19,6 +19,8 @@ graph_config = {
|
|||||||
"api_key": openai_key,
|
"api_key": openai_key,
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
},
|
},
|
||||||
|
"max_results": 5,
|
||||||
|
"verbose": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
# ************************************************
|
# ************************************************
|
||||||
@ -26,7 +28,7 @@ graph_config = {
|
|||||||
# ************************************************
|
# ************************************************
|
||||||
|
|
||||||
search_graph = SearchGraph(
|
search_graph = SearchGraph(
|
||||||
prompt="List me top 5 eyeliner products for a gift.",
|
prompt="List me the best escursions near Trento",
|
||||||
config=graph_config
|
config=graph_config
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -5,12 +5,11 @@ SearchGraph Module
|
|||||||
from .base_graph import BaseGraph
|
from .base_graph import BaseGraph
|
||||||
from ..nodes import (
|
from ..nodes import (
|
||||||
SearchInternetNode,
|
SearchInternetNode,
|
||||||
FetchNode,
|
GraphIteratorNode,
|
||||||
ParseNode,
|
MergeAnswersNode
|
||||||
RAGNode,
|
|
||||||
GenerateAnswerNode
|
|
||||||
)
|
)
|
||||||
from .abstract_graph import AbstractGraph
|
from .abstract_graph import AbstractGraph
|
||||||
|
from .smart_scraper_graph import SmartScraperGraph
|
||||||
|
|
||||||
|
|
||||||
class SearchGraph(AbstractGraph):
|
class SearchGraph(AbstractGraph):
|
||||||
@ -38,6 +37,11 @@ class SearchGraph(AbstractGraph):
|
|||||||
>>> result = search_graph.run()
|
>>> result = search_graph.run()
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, prompt: str, config: dict):
|
||||||
|
|
||||||
|
self.max_results = config.get("max_results", 3)
|
||||||
|
super().__init__(prompt, config)
|
||||||
|
|
||||||
def _create_graph(self) -> BaseGraph:
|
def _create_graph(self) -> BaseGraph:
|
||||||
"""
|
"""
|
||||||
Creates the graph of nodes representing the workflow for web scraping and searching.
|
Creates the graph of nodes representing the workflow for web scraping and searching.
|
||||||
@ -46,53 +50,53 @@ class SearchGraph(AbstractGraph):
|
|||||||
BaseGraph: A graph instance representing the web scraping and searching workflow.
|
BaseGraph: A graph instance representing the web scraping and searching workflow.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# ************************************************
|
||||||
|
# Create a SmartScraperGraph instance
|
||||||
|
# ************************************************
|
||||||
|
|
||||||
|
smart_scraper_instance = SmartScraperGraph(
|
||||||
|
prompt="",
|
||||||
|
source="",
|
||||||
|
config=self.config
|
||||||
|
)
|
||||||
|
|
||||||
|
# ************************************************
|
||||||
|
# Define the graph nodes
|
||||||
|
# ************************************************
|
||||||
|
|
||||||
search_internet_node = SearchInternetNode(
|
search_internet_node = SearchInternetNode(
|
||||||
input="user_prompt",
|
input="user_prompt",
|
||||||
output=["url"],
|
output=["urls"],
|
||||||
node_config={
|
|
||||||
"llm_model": self.llm_model
|
|
||||||
}
|
|
||||||
)
|
|
||||||
fetch_node = FetchNode(
|
|
||||||
input="url | local_dir",
|
|
||||||
output=["doc"]
|
|
||||||
)
|
|
||||||
parse_node = ParseNode(
|
|
||||||
input="doc",
|
|
||||||
output=["parsed_doc"],
|
|
||||||
node_config={
|
|
||||||
"chunk_size": self.model_token
|
|
||||||
}
|
|
||||||
)
|
|
||||||
rag_node = RAGNode(
|
|
||||||
input="user_prompt & (parsed_doc | doc)",
|
|
||||||
output=["relevant_chunks"],
|
|
||||||
node_config={
|
node_config={
|
||||||
"llm_model": self.llm_model,
|
"llm_model": self.llm_model,
|
||||||
"embedder_model": self.embedder_model
|
"max_results": self.max_results
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
generate_answer_node = GenerateAnswerNode(
|
graph_iterator_node = GraphIteratorNode(
|
||||||
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
input="user_prompt & urls",
|
||||||
|
output=["results"],
|
||||||
|
node_config={
|
||||||
|
"graph_instance": smart_scraper_instance,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
merge_answers_node = MergeAnswersNode(
|
||||||
|
input="user_prompt & results",
|
||||||
output=["answer"],
|
output=["answer"],
|
||||||
node_config={
|
node_config={
|
||||||
"llm_model": self.llm_model
|
"llm_model": self.llm_model,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
return BaseGraph(
|
return BaseGraph(
|
||||||
nodes=[
|
nodes=[
|
||||||
search_internet_node,
|
search_internet_node,
|
||||||
fetch_node,
|
graph_iterator_node,
|
||||||
parse_node,
|
merge_answers_node
|
||||||
rag_node,
|
|
||||||
generate_answer_node,
|
|
||||||
],
|
],
|
||||||
edges=[
|
edges=[
|
||||||
(search_internet_node, fetch_node),
|
(search_internet_node, graph_iterator_node),
|
||||||
(fetch_node, parse_node),
|
(graph_iterator_node, merge_answers_node)
|
||||||
(parse_node, rag_node),
|
|
||||||
(rag_node, generate_answer_node)
|
|
||||||
],
|
],
|
||||||
entry_point=search_internet_node
|
entry_point=search_internet_node
|
||||||
)
|
)
|
||||||
|
|||||||
@ -10,11 +10,8 @@ from .base_node import BaseNode
|
|||||||
|
|
||||||
class GraphIteratorNode(BaseNode):
|
class GraphIteratorNode(BaseNode):
|
||||||
"""
|
"""
|
||||||
A node responsible for parsing HTML content from a document.
|
A node responsible for instantiating and running multiple graph instances in parallel.
|
||||||
The parsed content is split into chunks for further processing.
|
It creates as many graph instances as the number of elements in the input list.
|
||||||
|
|
||||||
This node enhances the scraping workflow by allowing for targeted extraction of
|
|
||||||
content, thereby optimizing the processing of large HTML documents.
|
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||||
@ -33,18 +30,18 @@ class GraphIteratorNode(BaseNode):
|
|||||||
|
|
||||||
def execute(self, state: dict) -> dict:
|
def execute(self, state: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Executes the node's logic to parse the HTML document content and split it into chunks.
|
Executes the node's logic to instantiate and run multiple graph instances in parallel.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
state (dict): The current state of the graph. The input keys will be used to fetch the
|
state (dict): The current state of the graph. The input keys will be used to fetch
|
||||||
correct data from the state.
|
the correct data from the state.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: The updated state with the output key containing the parsed content chunks.
|
dict: The updated state with the output key containing the results of the graph instances.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
KeyError: If the input keys are not found in the state, indicating that the
|
KeyError: If the input keys are not found in the state, indicating that the
|
||||||
necessary information for parsing the content is missing.
|
necessary information for running the graph instances is missing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
@ -79,5 +76,4 @@ class GraphIteratorNode(BaseNode):
|
|||||||
graphs_answers.append(result)
|
graphs_answers.append(result)
|
||||||
|
|
||||||
state.update({self.output[0]: graphs_answers})
|
state.update({self.output[0]: graphs_answers})
|
||||||
|
|
||||||
return state
|
return state
|
||||||
|
|||||||
@ -9,7 +9,6 @@ from tqdm import tqdm
|
|||||||
# Imports from Langchain
|
# Imports from Langchain
|
||||||
from langchain.prompts import PromptTemplate
|
from langchain.prompts import PromptTemplate
|
||||||
from langchain_core.output_parsers import JsonOutputParser
|
from langchain_core.output_parsers import JsonOutputParser
|
||||||
from langchain_core.runnables import RunnableParallel
|
|
||||||
|
|
||||||
# Imports from the library
|
# Imports from the library
|
||||||
from .base_node import BaseNode
|
from .base_node import BaseNode
|
||||||
@ -17,10 +16,7 @@ from .base_node import BaseNode
|
|||||||
|
|
||||||
class MergeAnswersNode(BaseNode):
|
class MergeAnswersNode(BaseNode):
|
||||||
"""
|
"""
|
||||||
A node that generates an answer using a large language model (LLM) based on the user's input
|
A node responsible for merging the answers from multiple graph instances into a single answer.
|
||||||
and the content extracted from a webpage. It constructs a prompt from the user's input
|
|
||||||
and the scraped content, feeds it to the LLM, and parses the LLM's response to produce
|
|
||||||
an answer.
|
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
llm_model: An instance of a language model client, configured for generating answers.
|
llm_model: An instance of a language model client, configured for generating answers.
|
||||||
@ -42,8 +38,7 @@ class MergeAnswersNode(BaseNode):
|
|||||||
|
|
||||||
def execute(self, state: dict) -> dict:
|
def execute(self, state: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Generates an answer by constructing a prompt from the user's input and the scraped
|
Executes the node's logic to merge the answers from multiple graph instances into a single answer.
|
||||||
content, querying the language model, and parsing its response.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
state (dict): The current state of the graph. The input keys will be used
|
state (dict): The current state of the graph. The input keys will be used
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user