From d4b26796d94d314af135d2d1bbd538e1d4be7593 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <mvincig11@gmail.com>
Date: Fri, 3 Jan 2025 14:00:17 +0100
Subject: [PATCH] fix: search graph source URLs and search result limits

---
 scrapegraphai/nodes/merge_answers_node.py   | 12 +++++++++++-
 scrapegraphai/nodes/search_internet_node.py |  3 +++
 scrapegraphai/utils/research_web.py         |  4 ++--
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py
index 9f9a356c..31573add 100644
--- a/scrapegraphai/nodes/merge_answers_node.py
+++ b/scrapegraphai/nodes/merge_answers_node.py
@@ -96,7 +96,17 @@ class MergeAnswersNode(BaseNode):
 
         merge_chain = prompt_template | self.llm_model | output_parser
         answer = merge_chain.invoke({"user_prompt": user_prompt})
-        answer["sources"] = state.get("urls", [])
+        
+        # Prefer the "urls" state key; fall back to "considered_urls" when "urls" is absent
+        urls = []
+        if "urls" in state:
+            urls = state["urls"]
+        elif "considered_urls" in state:
+            urls = state["considered_urls"]
+        
+        # Only add sources if we actually have URLs
+        if urls:
+            answer["sources"] = urls
 
         state.update({self.output[0]: answer})
         return state
diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py
index 18f14bce..af528963 100644
--- a/scrapegraphai/nodes/search_internet_node.py
+++ b/scrapegraphai/nodes/search_internet_node.py
@@ -99,5 +99,8 @@ class SearchInternetNode(BaseNode):
         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")
 
+        # Store the search results under the node's output key and also under "considered_urls"
         state.update({self.output[0]: answer})
+        state["considered_urls"] = answer  # Add this as a backup
+
         return state
\ No newline at end of file
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
index fa38910b..93ea9ae2 100644
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -41,7 +41,7 @@ def search_on_web(query: str, search_engine: str = "Google",
         research = DuckDuckGoSearchResults(max_results=max_results)
         res = research.run(query)
         links = re.findall(r'https?://[^\s,\]]+', res)
-        return links
+        return links[:max_results]
 
     elif search_engine.lower() == "bing":
         headers = {
@@ -66,7 +66,7 @@ def search_on_web(query: str, search_engine: str = "Google",
         response = requests.get(url, params=params)
 
         data = response.json()
-        limited_results = data["results"][:max_results]
+        limited_results = [result['url'] for result in data["results"][:max_results]]
         return limited_results
 
     else: