mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
feat: Implemented a filter logic in search_link_node.py
feat: Added dict entry for Llama3.1:8b
This commit is contained in:
parent
8d6c0b7288
commit
08e9d9d6a0
@ -9,14 +9,26 @@ from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/llama3",
|
||||
"model": "ollama/llama3.1:8b",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
|
||||
"verbose": True,
|
||||
"headless": False
|
||||
"headless": False,
|
||||
"filter_config": {
|
||||
"diff_domain_filter": True,
|
||||
# "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
|
||||
# "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
|
||||
# "irrelevant_keywords": [
|
||||
# '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
|
||||
# 'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/',
|
||||
# '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about',
|
||||
# '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help',
|
||||
# '.pdf', '.zip', '/news', '/files', '/downloads'
|
||||
# ]
|
||||
},
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -72,7 +72,9 @@ class SearchLinkGraph(AbstractGraph):
|
||||
output=["parsed_doc"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"chunk_size": self.model_token
|
||||
"chunk_size": self.model_token,
|
||||
"filter_links": self.config.get("filter_links", None),
|
||||
"filter_config": self.config.get("filter_config", None)
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
13
scrapegraphai/helpers/default_filters.py
Normal file
13
scrapegraphai/helpers/default_filters.py
Normal file
@ -0,0 +1,13 @@
|
||||
"""
|
||||
Module for filtering irrelevant links
|
||||
"""
|
||||
|
||||
# Default link-filter settings. SearchLinkNode merges a user-supplied
# "filter_config" on top of this dict, so any key omitted by the user keeps
# the value defined here.
filter_dict = {
    # Drop links whose host differs from the page being scraped.
    "diff_domain_filter": True,
    # URL suffixes treated as image resources.
    "img_exts": [
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".bmp",
        ".svg",
        ".webp",
        ".ico",
    ],
    # Fragments that mark a language-specific variant of a page.
    "lang_indicators": [
        "lang=",
        "/fr",
        "/pt",
        "/es",
        "/de",
        "/jp",
        "/it",
    ],
    # Substrings of URLs that are rarely worth following.
    "irrelevant_keywords": [
        "/login",
        "/signup",
        "/register",
        "/contact",
        "facebook.com",
        "twitter.com",
        "linkedin.com",
        "instagram.com",
        ".js",
        ".css",
    ],
}
|
||||
@ -51,44 +51,42 @@ models_tokens = {
|
||||
"gemini-1.5-pro-latest": 128000,
|
||||
"models/embedding-001": 2048
|
||||
},
|
||||
"ollama": {
|
||||
"grok-1": 8192,
|
||||
"command-r": 12800,
|
||||
"codellama": 16000,
|
||||
"dbrx": 32768,
|
||||
"deepseek-coder:33b": 16000,
|
||||
"falcon": 2048,
|
||||
"llama2": 4096,
|
||||
"llama3": 8192,
|
||||
"llama3:70b": 8192,
|
||||
"llama3.1":128000,
|
||||
"llama3.1:70b": 128000,
|
||||
"lama3.1:405b": 128000,
|
||||
"scrapegraph": 8192,
|
||||
"llava": 4096,
|
||||
"mixtral:8x22b-instruct": 65536,
|
||||
"mistral":8192,
|
||||
"mistral-openorca": 32000,
|
||||
"nomic-embed-text": 8192,
|
||||
"nous-hermes2:34b": 4096,
|
||||
"orca-mini": 2048,
|
||||
"phi3:3.8b": 12800,
|
||||
"qwen:0.5b": 32000,
|
||||
"qwen:1.8b": 32000,
|
||||
"qwen:4b": 32000,
|
||||
"qwen:14b": 32000,
|
||||
"qwen:32b": 32000,
|
||||
"qwen:72b": 32000,
|
||||
"qwen:110b": 32000,
|
||||
"stablelm-zephyr": 8192,
|
||||
"wizardlm2:8x22b": 65536,
|
||||
# embedding models
|
||||
"shaw/dmeta-embedding-zh-small-q4": 8192,
|
||||
"shaw/dmeta-embedding-zh-q4": 8192,
|
||||
"chevalblanc/acge_text_embedding": 8192,
|
||||
"martcreation/dmeta-embedding-zh": 8192,
|
||||
"snowflake-arctic-embed": 8192,
|
||||
"mxbai-embed-large": 512
|
||||
"ollama": { "command-r": 12800,
|
||||
"codellama": 16000,
|
||||
"dbrx": 32768,
|
||||
"deepseek-coder:33b": 16000,
|
||||
"falcon": 2048,
|
||||
"llama2": 4096,
|
||||
"llama3": 8192,
|
||||
"llama3:70b": 8192,
|
||||
"llama3.1":128000,
|
||||
"llama3.1:8b": 128000,
|
||||
"llama3.1:70b": 128000,
|
||||
"lama3.1:405b": 128000,
|
||||
"scrapegraph": 8192,
|
||||
"llava": 4096,
|
||||
"mixtral:8x22b-instruct": 65536,
|
||||
"mistral-openorca": 32000,
|
||||
"nomic-embed-text": 8192,
|
||||
"nous-hermes2:34b": 4096,
|
||||
"orca-mini": 2048,
|
||||
"phi3:3.8b": 12800,
|
||||
"qwen:0.5b": 32000,
|
||||
"qwen:1.8b": 32000,
|
||||
"qwen:4b": 32000,
|
||||
"qwen:14b": 32000,
|
||||
"qwen:32b": 32000,
|
||||
"qwen:72b": 32000,
|
||||
"qwen:110b": 32000,
|
||||
"stablelm-zephyr": 8192,
|
||||
"wizardlm2:8x22b": 65536,
|
||||
# embedding models
|
||||
"shaw/dmeta-embedding-zh-small-q4": 8192,
|
||||
"shaw/dmeta-embedding-zh-q4": 8192,
|
||||
"chevalblanc/acge_text_embedding": 8192,
|
||||
"martcreation/dmeta-embedding-zh": 8192,
|
||||
"snowflake-arctic-embed": 8192,
|
||||
"mxbai-embed-large": 512
|
||||
},
|
||||
"oneapi": {
|
||||
"qwen-turbo": 6000
|
||||
|
||||
@ -4,12 +4,14 @@ SearchLinkNode Module
|
||||
from typing import List, Optional
|
||||
import re
|
||||
from tqdm import tqdm
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
from langchain.prompts import PromptTemplate
|
||||
from langchain_core.output_parsers import JsonOutputParser
|
||||
from langchain_core.runnables import RunnableParallel
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
from ..prompts import TEMPLATE_RELEVANT_LINKS
|
||||
from ..helpers import default_filters
|
||||
|
||||
|
||||
class SearchLinkNode(BaseNode):
|
||||
@ -39,10 +41,54 @@ class SearchLinkNode(BaseNode):
|
||||
super().__init__(node_name, "node", input, output, 1, node_config)
|
||||
|
||||
self.llm_model = node_config["llm_model"]
|
||||
self.verbose = (
|
||||
False if node_config is None else node_config.get("verbose", False)
|
||||
)
|
||||
|
||||
# Apply filters if filter_links is True or if filter_config is provided
|
||||
if node_config.get("filter_links", False) or "filter_config" in node_config:
|
||||
# Merge provided filter config with default filter config for partial configuration
|
||||
provided_filter_config = node_config.get("filter_config", {})
|
||||
self.filter_config = {**default_filters.filter_dict, **provided_filter_config}
|
||||
self.filter_links = True
|
||||
else:
|
||||
# Skip filtering if not enabled
|
||||
self.filter_config = None
|
||||
self.filter_links = False
|
||||
|
||||
self.verbose = node_config.get("verbose", False)
|
||||
self.seen_links = set()
|
||||
|
||||
def _is_same_domain(self, url, domain):
    """
    Tell whether a link points at the same host as the page it was found on.

    Args:
        url (str): The candidate link.
        domain (str): URL of the source page. The caller passes
            ``state["url"]`` or ``state["local_dir"]``, so this may also be a
            local path or None.

    Returns:
        bool: True if the link passes the domain filter (same host, filter
        disabled, or no reference host to compare against), False otherwise.
    """
    if not self.filter_links or not self.filter_config.get("diff_domain_filter", True):
        return True  # Skip the domain filter if not enabled

    # Host names are case-insensitive, so compare them lower-cased.
    reference_host = urlparse(domain or "").netloc.lower()
    if not reference_host:
        # Local file or missing source URL: there is no host to compare
        # against, and rejecting every link would leave nothing to search.
        return True

    return urlparse(url).netloc.lower() == reference_host
|
||||
|
||||
def _is_image_url(self, url):
    """
    Tell whether a link points at an image resource.

    The configured extensions are matched case-insensitively against both the
    full URL and its path component, so a link such as
    ``/logo.png?size=2`` is still recognised as an image.

    Args:
        url (str): The candidate link.

    Returns:
        bool: True if the link ends with one of the ``img_exts`` extensions,
        False otherwise or when filtering is disabled.
    """
    if not self.filter_links:
        return False  # Skip image filtering if filtering is not enabled

    # The URL is lower-cased below, so the extensions must be as well.
    image_extensions = tuple(
        ext.lower() for ext in self.filter_config.get("img_exts", [])
    )
    if not image_extensions:
        return False

    lowered_url = url.lower()
    # The path check ignores any query string or fragment after the file name.
    lowered_path = urlparse(lowered_url).path
    return lowered_url.endswith(image_extensions) or lowered_path.endswith(image_extensions)
|
||||
|
||||
def _is_language_url(self, url):
    """
    Tell whether a link looks like a language-specific variant of a page.

    Indicators from ``lang_indicators`` are interpreted as follows:
      * indicators starting with ``/`` (e.g. ``/fr``) must match whole path
        segments, so ``/de`` matches ``/de/page`` but not ``/design``;
      * any other indicator (e.g. ``lang=``) matches if it occurs in the path
        or, with a trailing ``=`` removed, is the name of a query parameter.

    Args:
        url (str): The candidate link.

    Returns:
        bool: True if any indicator matches, False otherwise or when
        filtering is disabled.
    """
    if not self.filter_links:
        return False  # Skip language filtering if filtering is not enabled

    lang_indicators = self.filter_config.get("lang_indicators", [])
    if not lang_indicators:
        return False

    parsed_url = urlparse(url)
    path = parsed_url.path.lower()
    # Terminate the last segment with "/" so it can be matched like the others.
    segmented_path = path.rstrip("/") + "/"
    # keep_blank_values so that "?lang=" still exposes the "lang" key.
    query_keys = {
        key.lower() for key in parse_qs(parsed_url.query, keep_blank_values=True)
    }

    for indicator in lang_indicators:
        indicator = indicator.lower()
        if indicator.startswith("/"):
            if indicator.rstrip("/") + "/" in segmented_path:
                return True
        elif indicator in path or indicator.rstrip("=") in query_keys:
            # parse_qs keys never contain "=", hence the rstrip.
            return True
    return False
|
||||
|
||||
def _is_potentially_irrelevant(self, url):
    """
    Tell whether a link contains one of the configured irrelevant keywords.

    The match is a case-insensitive substring test against the whole URL.

    Args:
        url (str): The candidate link.

    Returns:
        bool: True if any entry of ``irrelevant_keywords`` occurs in the URL,
        False otherwise or when filtering is disabled.
    """
    if not self.filter_links:
        return False  # Skip irrelevant URL filtering if filtering is not enabled

    irrelevant_keywords = self.filter_config.get("irrelevant_keywords", [])
    # Lower-case once; keywords are lower-cased too so that mixed-case
    # entries in a user-supplied config can still match.
    lowered_url = url.lower()
    return any(keyword.lower() in lowered_url for keyword in irrelevant_keywords)
|
||||
|
||||
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
Filter out the links from the webpage that are relevant to the prompt. Out of the filtered links, also
|
||||
@ -64,6 +110,7 @@ class SearchLinkNode(BaseNode):
|
||||
|
||||
|
||||
parsed_content_chunks = state.get("doc")
|
||||
source_url = state.get("url") or state.get("local_dir")
|
||||
output_parser = JsonOutputParser()
|
||||
|
||||
relevant_links = []
|
||||
@ -76,10 +123,28 @@ class SearchLinkNode(BaseNode):
|
||||
)
|
||||
):
|
||||
try:
|
||||
|
||||
# Primary approach: Regular expression to extract links
|
||||
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
|
||||
|
||||
relevant_links += links
|
||||
if not self.filter_links:
|
||||
links = list(set(links))
|
||||
|
||||
relevant_links += links
|
||||
self.seen_links.update(relevant_links)
|
||||
else:
|
||||
filtered_links = [
|
||||
link for link in links
|
||||
if self._is_same_domain(link, source_url)
|
||||
and not self._is_image_url(link)
|
||||
and not self._is_language_url(link)
|
||||
and not self._is_potentially_irrelevant(link)
|
||||
and link not in self.seen_links
|
||||
]
|
||||
filtered_links = list(set(filtered_links))
|
||||
relevant_links += filtered_links
|
||||
self.seen_links.update(relevant_links)
|
||||
|
||||
except Exception as e:
|
||||
# Fallback approach: Using the LLM to extract links
|
||||
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user