feat: Implemented link-filtering logic in search_link_node.py

feat: Added dict entry for Llama3.1:8b
This commit is contained in:
ekinsenler 2024-08-19 11:05:31 +03:00
parent 8d6c0b7288
commit 08e9d9d6a0
5 changed files with 135 additions and 45 deletions

View File

@ -9,14 +9,26 @@ from scrapegraphai.utils import prettify_exec_info
graph_config = { graph_config = {
"llm": { "llm": {
"model": "ollama/llama3", "model": "ollama/llama3.1:8b",
"temperature": 0, "temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly "format": "json", # Ollama needs the format to be specified explicitly
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
}, },
"verbose": True, "verbose": True,
"headless": False "headless": False,
"filter_config": {
"diff_domain_filter": True,
# "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
# "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
# "irrelevant_keywords": [
# '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
# 'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/',
# '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about',
# '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help',
# '.pdf', '.zip', '/news', '/files', '/downloads'
# ]
},
} }
# ************************************************ # ************************************************

View File

@ -72,7 +72,9 @@ class SearchLinkGraph(AbstractGraph):
output=["parsed_doc"], output=["parsed_doc"],
node_config={ node_config={
"llm_model": self.llm_model, "llm_model": self.llm_model,
"chunk_size": self.model_token "chunk_size": self.model_token,
"filter_links": self.config.get("filter_links", None),
"filter_config": self.config.get("filter_config", None)
} }
) )

View File

@ -0,0 +1,13 @@
"""
Module for filtering irrelevant links
"""
# Default filter settings. SearchLinkNode merges a user-supplied
# ``filter_config`` over this mapping, so every key here acts as a fallback.
filter_dict = dict(
    # Require a link's network location to equal the source page's.
    diff_domain_filter=True,
    # URL suffixes that mark a link as an image.
    img_exts=[
        '.jpg',
        '.jpeg',
        '.png',
        '.gif',
        '.bmp',
        '.svg',
        '.webp',
        '.ico',
    ],
    # Substrings that mark a link as a language-specific variant of a page.
    lang_indicators=[
        'lang=',
        '/fr',
        '/pt',
        '/es',
        '/de',
        '/jp',
        '/it',
    ],
    # Substrings that mark a link as irrelevant.
    irrelevant_keywords=[
        '/login',
        '/signup',
        '/register',
        '/contact',
        'facebook.com',
        'twitter.com',
        'linkedin.com',
        'instagram.com',
        '.js',
        '.css',
    ],
)

View File

@ -51,44 +51,42 @@ models_tokens = {
"gemini-1.5-pro-latest": 128000, "gemini-1.5-pro-latest": 128000,
"models/embedding-001": 2048 "models/embedding-001": 2048
}, },
"ollama": { "ollama": { "command-r": 12800,
"grok-1": 8192, "codellama": 16000,
"command-r": 12800, "dbrx": 32768,
"codellama": 16000, "deepseek-coder:33b": 16000,
"dbrx": 32768, "falcon": 2048,
"deepseek-coder:33b": 16000, "llama2": 4096,
"falcon": 2048, "llama3": 8192,
"llama2": 4096, "llama3:70b": 8192,
"llama3": 8192, "llama3.1":128000,
"llama3:70b": 8192, "llama3.1:8b": 128000,
"llama3.1":128000, "llama3.1:70b": 128000,
"llama3.1:70b": 128000, "lama3.1:405b": 128000,
"lama3.1:405b": 128000, "scrapegraph": 8192,
"scrapegraph": 8192, "llava": 4096,
"llava": 4096, "mixtral:8x22b-instruct": 65536,
"mixtral:8x22b-instruct": 65536, "mistral-openorca": 32000,
"mistral":8192, "nomic-embed-text": 8192,
"mistral-openorca": 32000, "nous-hermes2:34b": 4096,
"nomic-embed-text": 8192, "orca-mini": 2048,
"nous-hermes2:34b": 4096, "phi3:3.8b": 12800,
"orca-mini": 2048, "qwen:0.5b": 32000,
"phi3:3.8b": 12800, "qwen:1.8b": 32000,
"qwen:0.5b": 32000, "qwen:4b": 32000,
"qwen:1.8b": 32000, "qwen:14b": 32000,
"qwen:4b": 32000, "qwen:32b": 32000,
"qwen:14b": 32000, "qwen:72b": 32000,
"qwen:32b": 32000, "qwen:110b": 32000,
"qwen:72b": 32000, "stablelm-zephyr": 8192,
"qwen:110b": 32000, "wizardlm2:8x22b": 65536,
"stablelm-zephyr": 8192, # embedding models
"wizardlm2:8x22b": 65536, "shaw/dmeta-embedding-zh-small-q4": 8192,
# embedding models "shaw/dmeta-embedding-zh-q4": 8192,
"shaw/dmeta-embedding-zh-small-q4": 8192, "chevalblanc/acge_text_embedding": 8192,
"shaw/dmeta-embedding-zh-q4": 8192, "martcreation/dmeta-embedding-zh": 8192,
"chevalblanc/acge_text_embedding": 8192, "snowflake-arctic-embed": 8192,
"martcreation/dmeta-embedding-zh": 8192, "mxbai-embed-large": 512
"snowflake-arctic-embed": 8192,
"mxbai-embed-large": 512
}, },
"oneapi": { "oneapi": {
"qwen-turbo": 6000 "qwen-turbo": 6000

View File

@ -4,12 +4,14 @@ SearchLinkNode Module
from typing import List, Optional from typing import List, Optional
import re import re
from tqdm import tqdm from tqdm import tqdm
from urllib.parse import urlparse, parse_qs
from langchain.prompts import PromptTemplate from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableParallel from langchain_core.runnables import RunnableParallel
from ..utils.logging import get_logger from ..utils.logging import get_logger
from .base_node import BaseNode from .base_node import BaseNode
from ..prompts import TEMPLATE_RELEVANT_LINKS from ..prompts import TEMPLATE_RELEVANT_LINKS
from ..helpers import default_filters
class SearchLinkNode(BaseNode): class SearchLinkNode(BaseNode):
@ -39,10 +41,54 @@ class SearchLinkNode(BaseNode):
super().__init__(node_name, "node", input, output, 1, node_config) super().__init__(node_name, "node", input, output, 1, node_config)
self.llm_model = node_config["llm_model"] self.llm_model = node_config["llm_model"]
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
# Apply filters if filter_links is True or if filter_config is provided
if node_config.get("filter_links", False) or "filter_config" in node_config:
# Merge provided filter config with default filter config for partial configuration
provided_filter_config = node_config.get("filter_config", {})
self.filter_config = {**default_filters.filter_dict, **provided_filter_config}
self.filter_links = True
else:
# Skip filtering if not enabled
self.filter_config = None
self.filter_links = False
self.verbose = node_config.get("verbose", False)
self.seen_links = set()
def _is_same_domain(self, url, domain):
if not self.filter_links or not self.filter_config.get("diff_domain_filter", True):
return True # Skip the domain filter if not enabled
parsed_url = urlparse(url)
parsed_domain = urlparse(domain)
return parsed_url.netloc == parsed_domain.netloc
def _is_image_url(self, url):
if not self.filter_links:
return False # Skip image filtering if filtering is not enabled
image_extensions = self.filter_config.get("img_exts", [])
return any(url.lower().endswith(ext) for ext in image_extensions)
def _is_language_url(self, url):
if not self.filter_links:
return False # Skip language filtering if filtering is not enabled
lang_indicators = self.filter_config.get("lang_indicators", [])
parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)
# Check if the URL path or query string indicates a language-specific version
return any(indicator in parsed_url.path.lower() or indicator in query_params for indicator in lang_indicators)
def _is_potentially_irrelevant(self, url):
if not self.filter_links:
return False # Skip irrelevant URL filtering if filtering is not enabled
irrelevant_keywords = self.filter_config.get("irrelevant_keywords", [])
return any(keyword in url.lower() for keyword in irrelevant_keywords)
def execute(self, state: dict) -> dict: def execute(self, state: dict) -> dict:
""" """
Filter out relevant links from the webpage that are relavant to prompt. Out of the filtered links, also Filter out relevant links from the webpage that are relavant to prompt. Out of the filtered links, also
@ -64,6 +110,7 @@ class SearchLinkNode(BaseNode):
parsed_content_chunks = state.get("doc") parsed_content_chunks = state.get("doc")
source_url = state.get("url") or state.get("local_dir")
output_parser = JsonOutputParser() output_parser = JsonOutputParser()
relevant_links = [] relevant_links = []
@ -76,10 +123,28 @@ class SearchLinkNode(BaseNode):
) )
): ):
try: try:
# Primary approach: Regular expression to extract links # Primary approach: Regular expression to extract links
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content)) links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
relevant_links += links if not self.filter_links:
links = list(set(links))
relevant_links += links
self.seen_links.update(relevant_links)
else:
filtered_links = [
link for link in links
if self._is_same_domain(link, source_url)
and not self._is_image_url(link)
and not self._is_language_url(link)
and not self._is_potentially_irrelevant(link)
and link not in self.seen_links
]
filtered_links = list(set(filtered_links))
relevant_links += filtered_links
self.seen_links.update(relevant_links)
except Exception as e: except Exception as e:
# Fallback approach: Using the LLM to extract links # Fallback approach: Using the LLM to extract links
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.") self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")