mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
feat: Implemented filter logic in search_link_node.py
feat: Added dict entry for Llama3.1:8b
This commit is contained in:
parent
8d6c0b7288
commit
08e9d9d6a0
@ -9,14 +9,26 @@ from scrapegraphai.utils import prettify_exec_info
|
|||||||
|
|
||||||
graph_config = {
|
graph_config = {
|
||||||
"llm": {
|
"llm": {
|
||||||
"model": "ollama/llama3",
|
"model": "ollama/llama3.1:8b",
|
||||||
"temperature": 0,
|
"temperature": 0,
|
||||||
"format": "json", # Ollama needs the format to be specified explicitly
|
"format": "json", # Ollama needs the format to be specified explicitly
|
||||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||||
},
|
},
|
||||||
|
|
||||||
"verbose": True,
|
"verbose": True,
|
||||||
"headless": False
|
"headless": False,
|
||||||
|
"filter_config": {
|
||||||
|
"diff_domain_filter": True,
|
||||||
|
# "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
|
||||||
|
# "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
|
||||||
|
# "irrelevant_keywords": [
|
||||||
|
# '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
|
||||||
|
# 'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/',
|
||||||
|
# '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about',
|
||||||
|
# '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help',
|
||||||
|
# '.pdf', '.zip', '/news', '/files', '/downloads'
|
||||||
|
# ]
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
# ************************************************
|
# ************************************************
|
||||||
|
|||||||
@ -72,7 +72,9 @@ class SearchLinkGraph(AbstractGraph):
|
|||||||
output=["parsed_doc"],
|
output=["parsed_doc"],
|
||||||
node_config={
|
node_config={
|
||||||
"llm_model": self.llm_model,
|
"llm_model": self.llm_model,
|
||||||
"chunk_size": self.model_token
|
"chunk_size": self.model_token,
|
||||||
|
"filter_links": self.config.get("filter_links", None),
|
||||||
|
"filter_config": self.config.get("filter_config", None)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
13
scrapegraphai/helpers/default_filters.py
Normal file
13
scrapegraphai/helpers/default_filters.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
"""
|
||||||
|
Module for filtering irrelevant links
|
||||||
|
"""
|
||||||
|
|
||||||
|
filter_dict = {
|
||||||
|
"diff_domain_filter": True,
|
||||||
|
"img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
|
||||||
|
"lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
|
||||||
|
"irrelevant_keywords": [
|
||||||
|
'/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
|
||||||
|
'linkedin.com', 'instagram.com', '.js', '.css',
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -51,44 +51,42 @@ models_tokens = {
|
|||||||
"gemini-1.5-pro-latest": 128000,
|
"gemini-1.5-pro-latest": 128000,
|
||||||
"models/embedding-001": 2048
|
"models/embedding-001": 2048
|
||||||
},
|
},
|
||||||
"ollama": {
|
"ollama": { "command-r": 12800,
|
||||||
"grok-1": 8192,
|
"codellama": 16000,
|
||||||
"command-r": 12800,
|
"dbrx": 32768,
|
||||||
"codellama": 16000,
|
"deepseek-coder:33b": 16000,
|
||||||
"dbrx": 32768,
|
"falcon": 2048,
|
||||||
"deepseek-coder:33b": 16000,
|
"llama2": 4096,
|
||||||
"falcon": 2048,
|
"llama3": 8192,
|
||||||
"llama2": 4096,
|
"llama3:70b": 8192,
|
||||||
"llama3": 8192,
|
"llama3.1":128000,
|
||||||
"llama3:70b": 8192,
|
"llama3.1:8b": 128000,
|
||||||
"llama3.1":128000,
|
"llama3.1:70b": 128000,
|
||||||
"llama3.1:70b": 128000,
|
"lama3.1:405b": 128000,
|
||||||
"lama3.1:405b": 128000,
|
"scrapegraph": 8192,
|
||||||
"scrapegraph": 8192,
|
"llava": 4096,
|
||||||
"llava": 4096,
|
"mixtral:8x22b-instruct": 65536,
|
||||||
"mixtral:8x22b-instruct": 65536,
|
"mistral-openorca": 32000,
|
||||||
"mistral":8192,
|
"nomic-embed-text": 8192,
|
||||||
"mistral-openorca": 32000,
|
"nous-hermes2:34b": 4096,
|
||||||
"nomic-embed-text": 8192,
|
"orca-mini": 2048,
|
||||||
"nous-hermes2:34b": 4096,
|
"phi3:3.8b": 12800,
|
||||||
"orca-mini": 2048,
|
"qwen:0.5b": 32000,
|
||||||
"phi3:3.8b": 12800,
|
"qwen:1.8b": 32000,
|
||||||
"qwen:0.5b": 32000,
|
"qwen:4b": 32000,
|
||||||
"qwen:1.8b": 32000,
|
"qwen:14b": 32000,
|
||||||
"qwen:4b": 32000,
|
"qwen:32b": 32000,
|
||||||
"qwen:14b": 32000,
|
"qwen:72b": 32000,
|
||||||
"qwen:32b": 32000,
|
"qwen:110b": 32000,
|
||||||
"qwen:72b": 32000,
|
"stablelm-zephyr": 8192,
|
||||||
"qwen:110b": 32000,
|
"wizardlm2:8x22b": 65536,
|
||||||
"stablelm-zephyr": 8192,
|
# embedding models
|
||||||
"wizardlm2:8x22b": 65536,
|
"shaw/dmeta-embedding-zh-small-q4": 8192,
|
||||||
# embedding models
|
"shaw/dmeta-embedding-zh-q4": 8192,
|
||||||
"shaw/dmeta-embedding-zh-small-q4": 8192,
|
"chevalblanc/acge_text_embedding": 8192,
|
||||||
"shaw/dmeta-embedding-zh-q4": 8192,
|
"martcreation/dmeta-embedding-zh": 8192,
|
||||||
"chevalblanc/acge_text_embedding": 8192,
|
"snowflake-arctic-embed": 8192,
|
||||||
"martcreation/dmeta-embedding-zh": 8192,
|
"mxbai-embed-large": 512
|
||||||
"snowflake-arctic-embed": 8192,
|
|
||||||
"mxbai-embed-large": 512
|
|
||||||
},
|
},
|
||||||
"oneapi": {
|
"oneapi": {
|
||||||
"qwen-turbo": 6000
|
"qwen-turbo": 6000
|
||||||
|
|||||||
@ -4,12 +4,14 @@ SearchLinkNode Module
|
|||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
import re
|
import re
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from urllib.parse import urlparse, parse_qs
|
||||||
from langchain.prompts import PromptTemplate
|
from langchain.prompts import PromptTemplate
|
||||||
from langchain_core.output_parsers import JsonOutputParser
|
from langchain_core.output_parsers import JsonOutputParser
|
||||||
from langchain_core.runnables import RunnableParallel
|
from langchain_core.runnables import RunnableParallel
|
||||||
from ..utils.logging import get_logger
|
from ..utils.logging import get_logger
|
||||||
from .base_node import BaseNode
|
from .base_node import BaseNode
|
||||||
from ..prompts import TEMPLATE_RELEVANT_LINKS
|
from ..prompts import TEMPLATE_RELEVANT_LINKS
|
||||||
|
from ..helpers import default_filters
|
||||||
|
|
||||||
|
|
||||||
class SearchLinkNode(BaseNode):
|
class SearchLinkNode(BaseNode):
|
||||||
@ -39,9 +41,53 @@ class SearchLinkNode(BaseNode):
|
|||||||
super().__init__(node_name, "node", input, output, 1, node_config)
|
super().__init__(node_name, "node", input, output, 1, node_config)
|
||||||
|
|
||||||
self.llm_model = node_config["llm_model"]
|
self.llm_model = node_config["llm_model"]
|
||||||
self.verbose = (
|
|
||||||
False if node_config is None else node_config.get("verbose", False)
|
# Apply filters if filter_links is True or if filter_config is provided
|
||||||
)
|
if node_config.get("filter_links", False) or "filter_config" in node_config:
|
||||||
|
# Merge provided filter config with default filter config for partial configuration
|
||||||
|
provided_filter_config = node_config.get("filter_config", {})
|
||||||
|
self.filter_config = {**default_filters.filter_dict, **provided_filter_config}
|
||||||
|
self.filter_links = True
|
||||||
|
else:
|
||||||
|
# Skip filtering if not enabled
|
||||||
|
self.filter_config = None
|
||||||
|
self.filter_links = False
|
||||||
|
|
||||||
|
self.verbose = node_config.get("verbose", False)
|
||||||
|
self.seen_links = set()
|
||||||
|
|
||||||
|
def _is_same_domain(self, url, domain):
    """
    Check whether a link points at the same network location as a reference URL.

    Args:
        url (str): The candidate link.
        domain (str): The reference URL whose network location the link is
            compared against.

    Returns:
        bool: True when the domain filter is not active (link filtering is
        off, or "diff_domain_filter" is falsy in the filter config) or when
        both URLs share the same network location. False otherwise,
        including when either argument is missing.
    """
    if not self.filter_links or not self.filter_config.get("diff_domain_filter", True):
        return True  # Skip the domain filter if not enabled

    # Without both URLs there is nothing to compare; reject explicitly instead
    # of relying on urlparse(None) producing a bytes result that never equals
    # a str netloc.
    if not url or not domain:
        return False

    # Host names are case-insensitive, so compare the lower-cased netloc.
    return urlparse(url).netloc.lower() == urlparse(domain).netloc.lower()
|
||||||
|
|
||||||
|
def _is_image_url(self, url):
    """
    Check whether a link looks like an image, based on its extension.

    The configured "img_exts" suffixes are compared case-insensitively
    against both the full URL and its path component, so an image link that
    carries a query string or fragment (``logo.png?size=2``) is still
    recognised.

    Args:
        url (str): The candidate link.

    Returns:
        bool: True when the URL or its path ends with one of the configured
        image extensions. False when link filtering is off or no extensions
        are configured.
    """
    if not self.filter_links:
        return False  # Skip image filtering if filtering is not enabled

    image_extensions = tuple(
        ext.lower() for ext in (self.filter_config.get("img_exts") or ())
    )
    if not image_extensions:
        return False

    lowered_url = url.lower()
    try:
        lowered_path = urlparse(url).path.lower()
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): fall back to the
        # whole-URL check only, which is what the suffix test needs anyway.
        lowered_path = ""

    # str.endswith accepts a tuple, so one call covers every extension.
    return lowered_url.endswith(image_extensions) or lowered_path.endswith(image_extensions)
|
||||||
|
|
||||||
|
def _is_language_url(self, url):
    """
    Check whether a link looks like a language-specific variant of a page.

    Each configured "lang_indicators" entry is treated as a substring and
    matched against the lower-cased URL path and the lower-cased raw query
    string; it is also matched against the parsed query parameter names.
    The raw-query check is what lets an indicator such as ``lang=`` match:
    parsed parameter names never contain ``=``.

    Args:
        url (str): The candidate link.

    Returns:
        bool: True when any indicator matches. False when link filtering is
        off or no indicators are configured.
    """
    if not self.filter_links:
        return False  # Skip language filtering if filtering is not enabled

    lang_indicators = self.filter_config.get("lang_indicators") or ()
    if not lang_indicators:
        return False

    parsed_url = urlparse(url)
    lowered_path = parsed_url.path.lower()
    lowered_query = parsed_url.query.lower()
    query_params = parse_qs(parsed_url.query)

    # NOTE: matching is by substring, so a short indicator like "/it" also
    # matches paths such as "/items".
    return any(
        indicator.lower() in lowered_path
        or indicator.lower() in lowered_query
        or indicator in query_params
        for indicator in lang_indicators
    )
|
||||||
|
|
||||||
|
def _is_potentially_irrelevant(self, url):
    """
    Check whether a link contains any of the configured irrelevant keywords.

    Matching is a case-insensitive substring test of each
    "irrelevant_keywords" entry against the whole URL.

    Args:
        url (str): The candidate link.

    Returns:
        bool: True when any keyword occurs in the URL. False when link
        filtering is off or no keywords are configured.
    """
    if not self.filter_links:
        return False  # Skip irrelevant URL filtering if filtering is not enabled

    irrelevant_keywords = self.filter_config.get("irrelevant_keywords") or ()
    # Lower-case the URL once rather than once per keyword; keywords are
    # lower-cased too so a mixed-case configured keyword can still match.
    lowered_url = url.lower()
    return any(keyword.lower() in lowered_url for keyword in irrelevant_keywords)
|
||||||
|
|
||||||
|
|
||||||
def execute(self, state: dict) -> dict:
|
def execute(self, state: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
@ -64,6 +110,7 @@ class SearchLinkNode(BaseNode):
|
|||||||
|
|
||||||
|
|
||||||
parsed_content_chunks = state.get("doc")
|
parsed_content_chunks = state.get("doc")
|
||||||
|
source_url = state.get("url") or state.get("local_dir")
|
||||||
output_parser = JsonOutputParser()
|
output_parser = JsonOutputParser()
|
||||||
|
|
||||||
relevant_links = []
|
relevant_links = []
|
||||||
@ -76,10 +123,28 @@ class SearchLinkNode(BaseNode):
|
|||||||
)
|
)
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
|
|
||||||
# Primary approach: Regular expression to extract links
|
# Primary approach: Regular expression to extract links
|
||||||
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
|
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
|
||||||
|
|
||||||
relevant_links += links
|
if not self.filter_links:
|
||||||
|
links = list(set(links))
|
||||||
|
|
||||||
|
relevant_links += links
|
||||||
|
self.seen_links.update(relevant_links)
|
||||||
|
else:
|
||||||
|
filtered_links = [
|
||||||
|
link for link in links
|
||||||
|
if self._is_same_domain(link, source_url)
|
||||||
|
and not self._is_image_url(link)
|
||||||
|
and not self._is_language_url(link)
|
||||||
|
and not self._is_potentially_irrelevant(link)
|
||||||
|
and link not in self.seen_links
|
||||||
|
]
|
||||||
|
filtered_links = list(set(filtered_links))
|
||||||
|
relevant_links += filtered_links
|
||||||
|
self.seen_links.update(relevant_links)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Fallback approach: Using the LLM to extract links
|
# Fallback approach: Using the LLM to extract links
|
||||||
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
|
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user