feat(node): multiple url search in SearchGraph + fixes

Implemented GraphIteratorNode and MergeAnswersNode to create multiple instances of a graph and merge the scraped content from multiple pages
2026-06-25 21:11:11 +08:00 · 2024-05-06 00:30:09 +02:00 · 2024-05-06 00:30:09 +02:00 · 930adb38f2
commit 930adb38f2
parent dbb614a8dd
5 changed files with 52 additions and 54 deletions
--- a/examples/openai/search_graph_multi.py
+++ b/examples/openai/search_graph_multi.py
@ -45,6 +45,7 @@ search_internet_node = SearchInternetNode(
    output=["urls"],
    node_config={
        "llm_model": llm_model,
        "max_results": 5,  # num of search results to fetch
        "verbose": True,
    }
 )
--- a/examples/openai/search_graph_openai.py
+++ b/examples/openai/search_graph_openai.py
@ -19,6 +19,8 @@ graph_config = {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
    "max_results": 5,
    "verbose": True,
 }
 # ************************************************
@ -26,7 +28,7 @@ graph_config = {
 # ************************************************
 search_graph = SearchGraph(
-    prompt="List me top 5 eyeliner products for a gift.",
+    prompt="List me the best escursions near Trento",
    config=graph_config
 )
--- a/scrapegraphai/graphs/search_graph.py
+++ b/scrapegraphai/graphs/search_graph.py
@ -5,12 +5,11 @@ SearchGraph Module
 from .base_graph import BaseGraph
 from ..nodes import (
    SearchInternetNode,
-    FetchNode,
+    GraphIteratorNode,
-    ParseNode,
+    MergeAnswersNode
    RAGNode,
    GenerateAnswerNode
 )
 from .abstract_graph import AbstractGraph
 from .smart_scraper_graph import SmartScraperGraph
 class SearchGraph(AbstractGraph):
@ -38,6 +37,11 @@ class SearchGraph(AbstractGraph):
        >>> result = search_graph.run()
    """
    def __init__(self, prompt: str, config: dict):
        self.max_results = config.get("max_results", 3)
        super().__init__(prompt, config)
    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.
@ -46,53 +50,53 @@ class SearchGraph(AbstractGraph):
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """
        # ************************************************
        # Create a SmartScraperGraph instance
        # ************************************************
        smart_scraper_instance = SmartScraperGraph(
            prompt="",
            source="",
            config=self.config
        )
        # ************************************************
        # Define the graph nodes
        # ************************************************
        search_internet_node = SearchInternetNode(
            input="user_prompt",
-            output=["url"],
+            output=["urls"],
            node_config={
                "llm_model": self.llm_model
            }
        )
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"]
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token
            }
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model
+                "max_results": self.max_results
            }
        )
-        generate_answer_node = GenerateAnswerNode(
+        graph_iterator_node = GraphIteratorNode(
-            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            input="user_prompt & urls",
            output=["results"],
            node_config={
                "graph_instance": smart_scraper_instance,
            }
        )
        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
-                "llm_model": self.llm_model
+                "llm_model": self.llm_model,
            }
        )
        return BaseGraph(
            nodes=[
                search_internet_node,
-                fetch_node,
+                graph_iterator_node,
-                parse_node,
+                merge_answers_node
                rag_node,
                generate_answer_node,
            ],
            edges=[
-                (search_internet_node, fetch_node),
+                (search_internet_node, graph_iterator_node),
-                (fetch_node, parse_node),
+                (graph_iterator_node, merge_answers_node)
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=search_internet_node
        )
--- a/scrapegraphai/nodes/graph_iterator_node.py
+++ b/scrapegraphai/nodes/graph_iterator_node.py
@ -10,11 +10,8 @@ from .base_node import BaseNode
 class GraphIteratorNode(BaseNode):
    """
-    A node responsible for parsing HTML content from a document. 
+    A node responsible for instantiating and running multiple graph instances in parallel.
-    The parsed content is split into chunks for further processing.
+    It creates as many graph instances as the number of elements in the input list.
    This node enhances the scraping workflow by allowing for targeted extraction of 
    content, thereby optimizing the processing of large HTML documents.
    Attributes:
        verbose (bool): A flag indicating whether to show print statements during execution.
@ -33,18 +30,18 @@ class GraphIteratorNode(BaseNode):
    def execute(self,  state: dict) -> dict:
        """
-        Executes the node's logic to parse the HTML document content and split it into chunks.
+        Executes the node's logic to instantiate and run multiple graph instances in parallel.
        Args:
-            state (dict): The current state of the graph. The input keys will be used to fetch the
+            state (dict): The current state of the graph. The input keys will be used to fetch
-                            correct data from the state.
+                            the correct data from the state.
        Returns:
-            dict: The updated state with the output key containing the parsed content chunks.
+            dict: The updated state with the output key containing the results of the graph instances.
        Raises:
            KeyError: If the input keys are not found in the state, indicating that the
-                        necessary information for parsing the content is missing.
+                        necessary information for running the graph instances is missing.
        """
        if self.verbose:
@ -79,5 +76,4 @@ class GraphIteratorNode(BaseNode):
            graphs_answers.append(result)
        state.update({self.output[0]: graphs_answers})
        return state
--- a/scrapegraphai/nodes/merge_answers_node.py
+++ b/scrapegraphai/nodes/merge_answers_node.py
@ -9,7 +9,6 @@ from tqdm import tqdm
 # Imports from Langchain
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 # Imports from the library
 from .base_node import BaseNode
@ -17,10 +16,7 @@ from .base_node import BaseNode
 class MergeAnswersNode(BaseNode):
    """
-    A node that generates an answer using a large language model (LLM) based on the user's input
+    A node responsible for merging the answers from multiple graph instances into a single answer.
    and the content extracted from a webpage. It constructs a prompt from the user's input
    and the scraped content, feeds it to the LLM, and parses the LLM's response to produce
    an answer.
    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
@ -42,8 +38,7 @@ class MergeAnswersNode(BaseNode):
    def execute(self, state: dict) -> dict:
        """
-        Generates an answer by constructing a prompt from the user's input and the scraped
+        Executes the node's logic to merge the answers from multiple graph instances into a single answer.
        content, querying the language model, and parsing its response.
        Args:
            state (dict): The current state of the graph. The input keys will be used