From cf3ab5564ae5c415c63d1771b32ea68f5169ca82 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 14 Jul 2024 16:49:29 +0200 Subject: [PATCH] fix: search link node --- scrapegraphai/nodes/search_link_node.py | 83 ++++++++++++++----------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 2a0c5f18..8c81d07b 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -4,6 +4,7 @@ SearchLinkNode Module # Imports from standard library from typing import List, Optional +import re from tqdm import tqdm # Imports from Langchain @@ -20,7 +21,7 @@ from .base_node import BaseNode class SearchLinkNode(BaseNode): """ A node that can filter out the relevant links in the webpage content for the user prompt. - Node expects the aleready scrapped links on the webpage and hence it is expected + Node expects the already scrapped links on the webpage and hence it is expected that this node be used after the FetchNode. Attributes: @@ -74,32 +75,6 @@ class SearchLinkNode(BaseNode): parsed_content_chunks = state[input_keys[1]] output_parser = JsonOutputParser() - prompt_relevant_links = """ - You are a website scraper and you have just scraped the following content from a website. - Content: {content} - - You are now tasked with identifying all hyper links within the content that are potentially - relevant to the user task: {user_prompt} - - Assume relevance broadly, including any links that might be related or potentially useful - in relation to the task. - - Sort it in order of importance, the first one should be the most important one, the last one - the least important - - Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain - whether the content at the link is directly relevant. - - Output only a list of relevant links in the format: - [ - "link1", - "link2", - "link3", - . - . - . - ] - """ relevant_links = [] for i, chunk in enumerate( @@ -109,15 +84,49 @@ class SearchLinkNode(BaseNode): disable=not self.verbose, ) ): - merge_prompt = PromptTemplate( - template=prompt_relevant_links, - input_variables=["content", "user_prompt"], - ) - merge_chain = merge_prompt | self.llm_model | output_parser - # merge_chain = merge_prompt | self.llm_model - answer = merge_chain.invoke( - {"content": chunk.page_content, "user_prompt": user_prompt} - ) - relevant_links += answer + try: + # Primary approach: Regular expression to extract links + links = re.findall(r'(https?://\S+)', chunk.page_content) + relevant_links += links + except Exception as e: + # Fallback approach: Using the LLM to extract links + self.logger.error(f"Error extracting links: {e}. Falling back to LLM.") + prompt_relevant_links = """ + You are a website scraper and you have just scraped the following content from a website. + Content: {content} + + You are now tasked with identifying all hyper links within the content that are potentially + relevant to the user task: {user_prompt} + + Assume relevance broadly, including any links that might be related or potentially useful + in relation to the task. + + Sort it in order of importance, the first one should be the most important one, the last one + the least important + + Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain + whether the content at the link is directly relevant. + + Output only a list of relevant links in the format: + [ + "link1", + "link2", + "link3", + . + . + . + ] + """ + + merge_prompt = PromptTemplate( + template=prompt_relevant_links, + input_variables=["content", "user_prompt"], + ) + merge_chain = merge_prompt | self.llm_model | output_parser + answer = merge_chain.invoke( + {"content": chunk.page_content, "user_prompt": user_prompt} + ) + relevant_links += answer + state.update({self.output[0]: relevant_links}) return state