From cf3ab5564ae5c415c63d1771b32ea68f5169ca82 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <mvincig11@gmail.com>
Date: Sun, 14 Jul 2024 16:49:29 +0200
Subject: [PATCH] fix: search link node

---
 scrapegraphai/nodes/search_link_node.py | 83 ++++++++++++++-----------
 1 file changed, 46 insertions(+), 37 deletions(-)

diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py
index 2a0c5f18..8c81d07b 100644
--- a/scrapegraphai/nodes/search_link_node.py
+++ b/scrapegraphai/nodes/search_link_node.py
@@ -4,6 +4,7 @@ SearchLinkNode Module
 
 # Imports from standard library
 from typing import List, Optional
+import re
 from tqdm import tqdm
 
 # Imports from Langchain
@@ -20,7 +21,7 @@ from .base_node import BaseNode
 class SearchLinkNode(BaseNode):
     """
     A node that can filter out the relevant links in the webpage content for the user prompt.
-    Node expects the aleready scrapped links on the webpage and hence it is expected
+    Node expects the already scraped links on the webpage and hence it is expected
     that this node be used after the FetchNode.
 
     Attributes:
@@ -74,32 +75,6 @@ class SearchLinkNode(BaseNode):
         parsed_content_chunks = state[input_keys[1]]
         output_parser = JsonOutputParser()
 
-        prompt_relevant_links = """
-            You are a website scraper and you have just scraped the following content from a website.
-            Content: {content}
-            
-            You are now tasked with identifying all hyper links within the content that are potentially
-            relevant to the user task: {user_prompt}
-            
-            Assume relevance broadly, including any links that might be related or potentially useful 
-            in relation to the task.
-
-            Sort it in order of importance, the first one should be the most important one, the last one
-            the least important
-            
-            Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain 
-            whether the content at the link is directly relevant.
-
-            Output only a list of relevant links in the format:
-            [
-                "link1",
-                "link2",
-                "link3",
-                .
-                .
-                .
-            ]
-            """
         relevant_links = []
 
         for i, chunk in enumerate(
@@ -109,15 +84,49 @@ class SearchLinkNode(BaseNode):
                 disable=not self.verbose,
             )
         ):
-            merge_prompt = PromptTemplate(
-                template=prompt_relevant_links,
-                input_variables=["content", "user_prompt"],
-            )
-            merge_chain = merge_prompt | self.llm_model | output_parser
-            # merge_chain = merge_prompt | self.llm_model
-            answer = merge_chain.invoke(
-                {"content": chunk.page_content, "user_prompt": user_prompt}
-            )
-            relevant_links += answer
+            try:
+                # Primary approach: Regular expression to extract links
+                links = re.findall(r'(https?://\S+)', chunk.page_content)
+                relevant_links += links
+            except Exception as e:
+                # Fallback approach: Using the LLM to extract links
+                self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
+                prompt_relevant_links = """
+                    You are a website scraper and you have just scraped the following content from a website.
+                    Content: {content}
+                    
+                    You are now tasked with identifying all hyper links within the content that are potentially
+                    relevant to the user task: {user_prompt}
+                    
+                    Assume relevance broadly, including any links that might be related or potentially useful 
+                    in relation to the task.
+
+                    Sort it in order of importance, the first one should be the most important one, the last one
+                    the least important
+                    
+                    Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain 
+                    whether the content at the link is directly relevant.
+
+                    Output only a list of relevant links in the format:
+                    [
+                        "link1",
+                        "link2",
+                        "link3",
+                        .
+                        .
+                        .
+                    ]
+                    """
+                
+                merge_prompt = PromptTemplate(
+                    template=prompt_relevant_links,
+                    input_variables=["content", "user_prompt"],
+                )
+                merge_chain = merge_prompt | self.llm_model | output_parser
+                answer = merge_chain.invoke(
+                    {"content": chunk.page_content, "user_prompt": user_prompt}
+                )
+                relevant_links += answer
+
         state.update({self.output[0]: relevant_links})
         return state