mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
fix: search link node
This commit is contained in:
parent
d3e63d91be
commit
cf3ab5564a
@ -4,6 +4,7 @@ SearchLinkNode Module
|
||||
|
||||
# Imports from standard library
|
||||
from typing import List, Optional
|
||||
import re
|
||||
from tqdm import tqdm
|
||||
|
||||
# Imports from Langchain
|
||||
@ -20,7 +21,7 @@ from .base_node import BaseNode
|
||||
class SearchLinkNode(BaseNode):
|
||||
"""
|
||||
A node that can filter out the relevant links in the webpage content for the user prompt.
|
||||
Node expects the aleready scrapped links on the webpage and hence it is expected
|
||||
Node expects the already scraped links on the webpage and hence it is expected
|
||||
that this node be used after the FetchNode.
|
||||
|
||||
Attributes:
|
||||
@ -74,32 +75,6 @@ class SearchLinkNode(BaseNode):
|
||||
parsed_content_chunks = state[input_keys[1]]
|
||||
output_parser = JsonOutputParser()
|
||||
|
||||
prompt_relevant_links = """
|
||||
You are a website scraper and you have just scraped the following content from a website.
|
||||
Content: {content}
|
||||
|
||||
You are now tasked with identifying all hyper links within the content that are potentially
|
||||
relevant to the user task: {user_prompt}
|
||||
|
||||
Assume relevance broadly, including any links that might be related or potentially useful
|
||||
in relation to the task.
|
||||
|
||||
Sort it in order of importance, the first one should be the most important one, the last one
|
||||
the least important
|
||||
|
||||
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
|
||||
whether the content at the link is directly relevant.
|
||||
|
||||
Output only a list of relevant links in the format:
|
||||
[
|
||||
"link1",
|
||||
"link2",
|
||||
"link3",
|
||||
.
|
||||
.
|
||||
.
|
||||
]
|
||||
"""
|
||||
relevant_links = []
|
||||
|
||||
for i, chunk in enumerate(
|
||||
@ -109,15 +84,49 @@ class SearchLinkNode(BaseNode):
|
||||
disable=not self.verbose,
|
||||
)
|
||||
):
|
||||
merge_prompt = PromptTemplate(
|
||||
template=prompt_relevant_links,
|
||||
input_variables=["content", "user_prompt"],
|
||||
)
|
||||
merge_chain = merge_prompt | self.llm_model | output_parser
|
||||
# merge_chain = merge_prompt | self.llm_model
|
||||
answer = merge_chain.invoke(
|
||||
{"content": chunk.page_content, "user_prompt": user_prompt}
|
||||
)
|
||||
relevant_links += answer
|
||||
try:
|
||||
# Primary approach: Regular expression to extract links
|
||||
links = re.findall(r'(https?://\S+)', chunk.page_content)
|
||||
relevant_links += links
|
||||
except Exception as e:
|
||||
# Fallback approach: Using the LLM to extract links
|
||||
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
|
||||
prompt_relevant_links = """
|
||||
You are a website scraper and you have just scraped the following content from a website.
|
||||
Content: {content}
|
||||
|
||||
You are now tasked with identifying all hyper links within the content that are potentially
|
||||
relevant to the user task: {user_prompt}
|
||||
|
||||
Assume relevance broadly, including any links that might be related or potentially useful
|
||||
in relation to the task.
|
||||
|
||||
Sort it in order of importance, the first one should be the most important one, the last one
|
||||
the least important
|
||||
|
||||
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
|
||||
whether the content at the link is directly relevant.
|
||||
|
||||
Output only a list of relevant links in the format:
|
||||
[
|
||||
"link1",
|
||||
"link2",
|
||||
"link3",
|
||||
.
|
||||
.
|
||||
.
|
||||
]
|
||||
"""
|
||||
|
||||
merge_prompt = PromptTemplate(
|
||||
template=prompt_relevant_links,
|
||||
input_variables=["content", "user_prompt"],
|
||||
)
|
||||
merge_chain = merge_prompt | self.llm_model | output_parser
|
||||
answer = merge_chain.invoke(
|
||||
{"content": chunk.page_content, "user_prompt": user_prompt}
|
||||
)
|
||||
relevant_links += answer
|
||||
|
||||
state.update({self.output[0]: relevant_links})
|
||||
return state
|
||||
|
||||
Loading…
Reference in New Issue
Block a user