fix: search graph — keep source URLs in merged answers and cap web search results at max_results

2026-07-01 21:00:48 +08:00 · 2025-01-03 14:00:17 +01:00 · 2025-01-03 14:00:17 +01:00 · d4b26796d9
commit d4b26796d9
parent a9569ac08f
3 changed files with 16 additions and 3 deletions
--- a/scrapegraphai/nodes/merge_answers_node.py
+++ b/scrapegraphai/nodes/merge_answers_node.py
@@ -96,7 +96,17 @@ class MergeAnswersNode(BaseNode):

        merge_chain = prompt_template | self.llm_model | output_parser
        answer = merge_chain.invoke({"user_prompt": user_prompt})
-        answer["sources"] = state.get("urls", [])
+        
+        # Get the URLs from the state, ensuring we get the actual URLs used for scraping
+        urls = []
+        if "urls" in state:
+            urls = state["urls"]
+        elif "considered_urls" in state:
+            urls = state["considered_urls"]
+        
+        # Only add sources if we actually have URLs
+        if urls:
+            answer["sources"] = urls

        state.update({self.output[0]: answer})
        return state
--- a/scrapegraphai/nodes/search_internet_node.py
+++ b/scrapegraphai/nodes/search_internet_node.py
@@ -99,5 +99,8 @@ class SearchInternetNode(BaseNode):
        if len(answer) == 0:
            raise ValueError("Zero results found for the search query.")

+        # Store both the URLs and considered_urls in the state
        state.update({self.output[0]: answer})
+        state["considered_urls"] = answer  # Add this as a backup
+
        return state
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@@ -41,7 +41,7 @@ def search_on_web(query: str, search_engine: str = "Google",
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        links = re.findall(r'https?://[^\s,\]]+', res)
-        return links
+        return links[:max_results]

    elif search_engine.lower() == "bing":
        headers = {
@@ -66,7 +66,7 @@ def search_on_web(query: str, search_engine: str = "Google",
        response = requests.get(url, params=params)

        data = response.json()
-        limited_results = data["results"][:max_results]
+        limited_results = [result['url'] for result in data["results"][:max_results]]
        return limited_results

    else: