From d4b26796d94d314af135d2d1bbd538e1d4be7593 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 3 Jan 2025 14:00:17 +0100 Subject: [PATCH] fix: search graph --- scrapegraphai/nodes/merge_answers_node.py | 12 +++++++++++- scrapegraphai/nodes/search_internet_node.py | 3 +++ scrapegraphai/utils/research_web.py | 4 ++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index 9f9a356c..31573add 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -96,7 +96,17 @@ class MergeAnswersNode(BaseNode): merge_chain = prompt_template | self.llm_model | output_parser answer = merge_chain.invoke({"user_prompt": user_prompt}) - answer["sources"] = state.get("urls", []) + + # Get the URLs from the state, ensuring we get the actual URLs used for scraping + urls = [] + if "urls" in state: + urls = state["urls"] + elif "considered_urls" in state: + urls = state["considered_urls"] + + # Only add sources if we actually have URLs + if urls: + answer["sources"] = urls state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 18f14bce..af528963 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -99,5 +99,8 @@ class SearchInternetNode(BaseNode): if len(answer) == 0: raise ValueError("Zero results found for the search query.") + # Store both the URLs and considered_urls in the state state.update({self.output[0]: answer}) + state["considered_urls"] = answer # Add this as a backup + return state \ No newline at end of file diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index fa38910b..93ea9ae2 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -41,7 +41,7 @@ def search_on_web(query: str, search_engine: str = "Google", research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) links = re.findall(r'https?://[^\s,\]]+', res) - return links + return links[:max_results] elif search_engine.lower() == "bing": headers = { @@ -66,7 +66,7 @@ def search_on_web(query: str, search_engine: str = "Google", response = requests.get(url, params=params) data = response.json() - limited_results = data["results"][:max_results] + limited_results = [result['url'] for result in data["results"][:max_results]] return limited_results else: