feat: Implemented filter logic in search_link_node.py

feat: Added dict entry for Llama3.1:8b
This commit is contained in:
ekinsenler 2024-08-19 11:05:31 +03:00
parent 8d6c0b7288
commit 08e9d9d6a0
5 changed files with 135 additions and 45 deletions

View File

@ -9,14 +9,26 @@ from scrapegraphai.utils import prettify_exec_info
# Settings for this example run: the Ollama model to use plus optional
# link-filter overrides.
graph_config = {
"llm": {
"model": "ollama/llama3",
"model": "ollama/llama3.1:8b",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"verbose": True,
"headless": False
"headless": False,
# Link-filter overrides. SearchLinkGraph reads config["filter_config"] and
# passes it on through node_config; SearchLinkNode merges a provided
# filter_config over default_filters.filter_dict, so keys left out here keep
# their default values.
# NOTE(review): assumes this config is handed to SearchLinkGraph — confirm.
"filter_config": {
# True: keep only links whose netloc equals the netloc of the source URL.
"diff_domain_filter": True,
# Uncomment a list below to replace the matching default list wholesale
# (the merge is per key, not per list item).
# "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
# "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'],
# "irrelevant_keywords": [
# '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com',
# 'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/',
# '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about',
# '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help',
# '.pdf', '.zip', '/news', '/files', '/downloads'
# ]
},
}
# ************************************************

View File

@ -72,7 +72,9 @@ class SearchLinkGraph(AbstractGraph):
output=["parsed_doc"],
node_config={
"llm_model": self.llm_model,
"chunk_size": self.model_token
"chunk_size": self.model_token,
"filter_links": self.config.get("filter_links", None),
"filter_config": self.config.get("filter_config", None)
}
)

View File

@ -0,0 +1,13 @@
"""
Module for filtering irrelevant links
"""
# File extensions (lowercase, with leading dot) that mark a URL as an image.
_IMG_EXTS = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico']

# Substrings that mark a URL as a language-specific variant of a page.
_LANG_INDICATORS = ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it']

# Substrings that mark a URL as unlikely to be worth following
# (account pages, social networks, script and stylesheet assets).
_IRRELEVANT_KEYWORDS = [
    '/login',
    '/signup',
    '/register',
    '/contact',
    'facebook.com',
    'twitter.com',
    'linkedin.com',
    'instagram.com',
    '.js',
    '.css',
]

# Default link-filter settings; a user-supplied filter_config is merged over
# this mapping key by key.
filter_dict = dict(
    diff_domain_filter=True,
    img_exts=_IMG_EXTS,
    lang_indicators=_LANG_INDICATORS,
    irrelevant_keywords=_IRRELEVANT_KEYWORDS,
)

View File

@ -51,44 +51,42 @@ models_tokens = {
"gemini-1.5-pro-latest": 128000,
"models/embedding-001": 2048
},
"ollama": {
"grok-1": 8192,
"command-r": 12800,
"codellama": 16000,
"dbrx": 32768,
"deepseek-coder:33b": 16000,
"falcon": 2048,
"llama2": 4096,
"llama3": 8192,
"llama3:70b": 8192,
"llama3.1":128000,
"llama3.1:70b": 128000,
"lama3.1:405b": 128000,
"scrapegraph": 8192,
"llava": 4096,
"mixtral:8x22b-instruct": 65536,
"mistral":8192,
"mistral-openorca": 32000,
"nomic-embed-text": 8192,
"nous-hermes2:34b": 4096,
"orca-mini": 2048,
"phi3:3.8b": 12800,
"qwen:0.5b": 32000,
"qwen:1.8b": 32000,
"qwen:4b": 32000,
"qwen:14b": 32000,
"qwen:32b": 32000,
"qwen:72b": 32000,
"qwen:110b": 32000,
"stablelm-zephyr": 8192,
"wizardlm2:8x22b": 65536,
# embedding models
"shaw/dmeta-embedding-zh-small-q4": 8192,
"shaw/dmeta-embedding-zh-q4": 8192,
"chevalblanc/acge_text_embedding": 8192,
"martcreation/dmeta-embedding-zh": 8192,
"snowflake-arctic-embed": 8192,
"mxbai-embed-large": 512
"ollama": { "command-r": 12800,
"codellama": 16000,
"dbrx": 32768,
"deepseek-coder:33b": 16000,
"falcon": 2048,
"llama2": 4096,
"llama3": 8192,
"llama3:70b": 8192,
"llama3.1":128000,
"llama3.1:8b": 128000,
"llama3.1:70b": 128000,
"lama3.1:405b": 128000,
"scrapegraph": 8192,
"llava": 4096,
"mixtral:8x22b-instruct": 65536,
"mistral-openorca": 32000,
"nomic-embed-text": 8192,
"nous-hermes2:34b": 4096,
"orca-mini": 2048,
"phi3:3.8b": 12800,
"qwen:0.5b": 32000,
"qwen:1.8b": 32000,
"qwen:4b": 32000,
"qwen:14b": 32000,
"qwen:32b": 32000,
"qwen:72b": 32000,
"qwen:110b": 32000,
"stablelm-zephyr": 8192,
"wizardlm2:8x22b": 65536,
# embedding models
"shaw/dmeta-embedding-zh-small-q4": 8192,
"shaw/dmeta-embedding-zh-q4": 8192,
"chevalblanc/acge_text_embedding": 8192,
"martcreation/dmeta-embedding-zh": 8192,
"snowflake-arctic-embed": 8192,
"mxbai-embed-large": 512
},
"oneapi": {
"qwen-turbo": 6000

View File

@ -4,12 +4,14 @@ SearchLinkNode Module
from typing import List, Optional
import re
from tqdm import tqdm
from urllib.parse import urlparse, parse_qs
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableParallel
from ..utils.logging import get_logger
from .base_node import BaseNode
from ..prompts import TEMPLATE_RELEVANT_LINKS
from ..helpers import default_filters
class SearchLinkNode(BaseNode):
@ -39,10 +41,54 @@ class SearchLinkNode(BaseNode):
super().__init__(node_name, "node", input, output, 1, node_config)
self.llm_model = node_config["llm_model"]
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
# Apply filters if filter_links is truthy or if a filter_config is provided.
# The graph always passes the "filter_config" key (as None when the user did
# not set one), so test the value rather than key presence: unpacking None
# with ** raises TypeError.
provided_filter_config = node_config.get("filter_config")
if node_config.get("filter_links", False) or provided_filter_config is not None:
    # Merge over the defaults so a partial configuration only overrides the
    # keys it names; an empty dict still enables filtering with all defaults.
    self.filter_config = {
        **default_filters.filter_dict,
        **(provided_filter_config or {}),
    }
    self.filter_links = True
else:
    # Skip filtering if not enabled
    self.filter_config = None
    self.filter_links = False
self.verbose = node_config.get("verbose", False)
self.seen_links = set()
def _is_same_domain(self, url, domain):
    """
    Tell whether ``url`` lives on the same network location as ``domain``.

    Args:
        url (str): Candidate link.
        domain (str): URL the candidate is compared against.

    Returns:
        bool: True when the domain filter is switched off (filtering disabled,
        or "diff_domain_filter" set to a falsy value), or when both URLs have
        the same netloc, compared case-insensitively.
    """
    if not self.filter_links or not self.filter_config.get("diff_domain_filter", True):
        return True  # Skip the domain filter if not enabled

    # Host names are case-insensitive, so normalise both sides before comparing.
    candidate_netloc = urlparse(url).netloc.lower()
    reference_netloc = urlparse(domain).netloc.lower()
    return candidate_netloc == reference_netloc
def _is_image_url(self, url):
    """
    Tell whether ``url`` points at an image file.

    Args:
        url (str): Candidate link.

    Returns:
        bool: False when filtering is disabled; otherwise True if the URL, or
        its path component, ends with one of the configured "img_exts"
        (matched against the lowercased URL).
    """
    if not self.filter_links:
        return False  # Skip image filtering if filtering is not enabled

    image_extensions = tuple(self.filter_config.get("img_exts", []))
    lowered_url = url.lower()
    try:
        # Look at the path on its own so a query string or fragment after the
        # file name (e.g. "logo.png?v=2") does not hide the extension.
        lowered_path = urlparse(lowered_url).path
    except ValueError:
        # Unparseable URL: fall back to the whole-string check only.
        lowered_path = lowered_url
    return lowered_url.endswith(image_extensions) or lowered_path.endswith(image_extensions)
def _is_language_url(self, url):
    """
    Tell whether ``url`` looks like a language-specific version of a page.

    An indicator from "lang_indicators" matches when it occurs in the
    lowercased URL path, or when it names a query parameter either as the bare
    key ("lang") or in "key=" form ("lang=").

    Args:
        url (str): Candidate link.

    Returns:
        bool: False when filtering is disabled or nothing matches, True otherwise.
    """
    if not self.filter_links:
        return False  # Skip language filtering if filtering is not enabled

    lang_indicators = self.filter_config.get("lang_indicators", [])
    parsed_url = urlparse(url)
    lowered_path = parsed_url.path.lower()

    # parse_qs keys never contain "=", so an indicator written as "lang=" could
    # never equal a key; register each key in "key=" form as well.
    query_tokens = set()
    for key in parse_qs(parsed_url.query):
        lowered_key = key.lower()
        query_tokens.update((key, lowered_key, f"{lowered_key}="))

    # Check if the URL path or query string indicates a language-specific version
    return any(
        indicator in lowered_path or indicator in query_tokens
        for indicator in lang_indicators
    )
def _is_potentially_irrelevant(self, url):
    """
    Tell whether ``url`` contains one of the configured "irrelevant_keywords".

    Args:
        url (str): Candidate link; it is lowercased before matching.

    Returns:
        bool: True if filtering is enabled and any keyword is a substring of
        the lowercased URL, False otherwise.
    """
    if self.filter_links:
        for unwanted in self.filter_config.get("irrelevant_keywords", []):
            if unwanted in url.lower():
                return True
    # Filtering disabled, or no keyword matched.
    return False
def execute(self, state: dict) -> dict:
"""
Filter out relevant links from the webpage that are relavant to prompt. Out of the filtered links, also
@ -64,6 +110,7 @@ class SearchLinkNode(BaseNode):
parsed_content_chunks = state.get("doc")
source_url = state.get("url") or state.get("local_dir")
output_parser = JsonOutputParser()
relevant_links = []
@ -76,10 +123,28 @@ class SearchLinkNode(BaseNode):
)
):
try:
# Primary approach: Regular expression to extract links
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
relevant_links += links
if not self.filter_links:
links = list(set(links))
relevant_links += links
self.seen_links.update(relevant_links)
else:
filtered_links = [
link for link in links
if self._is_same_domain(link, source_url)
and not self._is_image_url(link)
and not self._is_language_url(link)
and not self._is_potentially_irrelevant(link)
and link not in self.seen_links
]
filtered_links = list(set(filtered_links))
relevant_links += filtered_links
self.seen_links.update(relevant_links)
except Exception as e:
# Fallback approach: Using the LLM to extract links
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")