fix: search link node

This commit is contained in:
Marco Vinciguerra 2024-07-14 16:49:29 +02:00
parent d3e63d91be
commit cf3ab5564a

View File

@@ -4,6 +4,7 @@ SearchLinkNode Module
# Imports from standard library
from typing import List, Optional
import re
from tqdm import tqdm
# Imports from Langchain
@@ -20,7 +21,7 @@ from .base_node import BaseNode
class SearchLinkNode(BaseNode):
"""
A node that can filter out the relevant links in the webpage content for the user prompt.
Node expects the aleready scrapped links on the webpage and hence it is expected
Node expects the already scraped links on the webpage and hence it is expected
that this node be used after the FetchNode.
Attributes:
@@ -74,32 +75,6 @@ class SearchLinkNode(BaseNode):
parsed_content_chunks = state[input_keys[1]]
output_parser = JsonOutputParser()
prompt_relevant_links = """
You are a website scraper and you have just scraped the following content from a website.
Content: {content}
You are now tasked with identifying all hyper links within the content that are potentially
relevant to the user task: {user_prompt}
Assume relevance broadly, including any links that might be related or potentially useful
in relation to the task.
Sort it in order of importance, the first one should be the most important one, the last one
the least important
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
whether the content at the link is directly relevant.
Output only a list of relevant links in the format:
[
"link1",
"link2",
"link3",
.
.
.
]
"""
relevant_links = []
for i, chunk in enumerate(
@@ -109,15 +84,49 @@ class SearchLinkNode(BaseNode):
disable=not self.verbose,
)
):
merge_prompt = PromptTemplate(
template=prompt_relevant_links,
input_variables=["content", "user_prompt"],
)
merge_chain = merge_prompt | self.llm_model | output_parser
# merge_chain = merge_prompt | self.llm_model
answer = merge_chain.invoke(
{"content": chunk.page_content, "user_prompt": user_prompt}
)
relevant_links += answer
try:
# Primary approach: Regular expression to extract links
links = re.findall(r'(https?://\S+)', chunk.page_content)
relevant_links += links
except Exception as e:
# Fallback approach: Using the LLM to extract links
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
prompt_relevant_links = """
You are a website scraper and you have just scraped the following content from a website.
Content: {content}
You are now tasked with identifying all hyper links within the content that are potentially
relevant to the user task: {user_prompt}
Assume relevance broadly, including any links that might be related or potentially useful
in relation to the task.
Sort it in order of importance, the first one should be the most important one, the last one
the least important
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
whether the content at the link is directly relevant.
Output only a list of relevant links in the format:
[
"link1",
"link2",
"link3",
.
.
.
]
"""
merge_prompt = PromptTemplate(
template=prompt_relevant_links,
input_variables=["content", "user_prompt"],
)
merge_chain = merge_prompt | self.llm_model | output_parser
answer = merge_chain.invoke(
{"content": chunk.page_content, "user_prompt": user_prompt}
)
relevant_links += answer
state.update({self.output[0]: relevant_links})
return state