fix: search link node

2026-07-01 21:00:48 +08:00 · 2024-07-14 16:49:29 +02:00 · 2024-07-14 16:49:29 +02:00 · cf3ab5564a
commit cf3ab5564a
parent d3e63d91be
1 changed files with 46 additions and 37 deletions
--- a/scrapegraphai/nodes/search_link_node.py
+++ b/scrapegraphai/nodes/search_link_node.py
@@ -4,6 +4,7 @@ SearchLinkNode Module

 # Imports from standard library
 from typing import List, Optional
+import re
 from tqdm import tqdm

 # Imports from Langchain
@@ -20,7 +21,7 @@ from .base_node import BaseNode
 class SearchLinkNode(BaseNode):
    """
    A node that can filter out the relevant links in the webpage content for the user prompt.
-    Node expects the aleready scrapped links on the webpage and hence it is expected
+    Node expects the already scraped links on the webpage and hence it is expected
    that this node be used after the FetchNode.

    Attributes:
@@ -74,32 +75,6 @@ class SearchLinkNode(BaseNode):
        parsed_content_chunks = state[input_keys[1]]
        output_parser = JsonOutputParser()

-        prompt_relevant_links = """
-            You are a website scraper and you have just scraped the following content from a website.
-            Content: {content}
-            
-            You are now tasked with identifying all hyper links within the content that are potentially
-            relevant to the user task: {user_prompt}
-            
-            Assume relevance broadly, including any links that might be related or potentially useful 
-            in relation to the task.
-
-            Sort it in order of importance, the first one should be the most important one, the last one
-            the least important
-            
-            Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain 
-            whether the content at the link is directly relevant.
-
-            Output only a list of relevant links in the format:
-            [
-                "link1",
-                "link2",
-                "link3",
-                .
-                .
-                .
-            ]
-            """
        relevant_links = []

        for i, chunk in enumerate(
@@ -109,15 +84,49 @@ class SearchLinkNode(BaseNode):
                disable=not self.verbose,
            )
        ):
-            merge_prompt = PromptTemplate(
-                template=prompt_relevant_links,
-                input_variables=["content", "user_prompt"],
-            )
-            merge_chain = merge_prompt | self.llm_model | output_parser
-            # merge_chain = merge_prompt | self.llm_model
-            answer = merge_chain.invoke(
-                {"content": chunk.page_content, "user_prompt": user_prompt}
-            )
-            relevant_links += answer
+            try:
+                # Primary approach: Regular expression to extract links
+                links = re.findall(r'(https?://\S+)', chunk.page_content)
+                relevant_links += links
+            except Exception as e:
+                # Fallback approach: Using the LLM to extract links
+                self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
+                prompt_relevant_links = """
+                    You are a website scraper and you have just scraped the following content from a website.
+                    Content: {content}
+                    
+                    You are now tasked with identifying all hyper links within the content that are potentially
+                    relevant to the user task: {user_prompt}
+                    
+                    Assume relevance broadly, including any links that might be related or potentially useful 
+                    in relation to the task.
+
+                    Sort it in order of importance, the first one should be the most important one, the last one
+                    the least important
+                    
+                    Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain 
+                    whether the content at the link is directly relevant.
+
+                    Output only a list of relevant links in the format:
+                    [
+                        "link1",
+                        "link2",
+                        "link3",
+                        .
+                        .
+                        .
+                    ]
+                    """
+                
+                merge_prompt = PromptTemplate(
+                    template=prompt_relevant_links,
+                    input_variables=["content", "user_prompt"],
+                )
+                merge_chain = merge_prompt | self.llm_model | output_parser
+                answer = merge_chain.invoke(
+                    {"content": chunk.page_content, "user_prompt": user_prompt}
+                )
+                relevant_links += answer
+
        state.update({self.output[0]: relevant_links})
        return state