From 08e9d9d6a09f450a9f512ac2789287819ced9641 Mon Sep 17 00:00:00 2001 From: ekinsenler Date: Mon, 19 Aug 2024 11:05:31 +0300 Subject: [PATCH] feat: Implemented a filter logic in search_link_node.py feat: Added dict entry for Llama3.1:8b --- .../local_models/search_link_graph_ollama.py | 16 +++- scrapegraphai/graphs/search_link_graph.py | 4 +- scrapegraphai/helpers/default_filters.py | 13 ++++ scrapegraphai/helpers/models_tokens.py | 74 +++++++++---------- scrapegraphai/nodes/search_link_node.py | 73 +++++++++++++++++- 5 files changed, 135 insertions(+), 45 deletions(-) create mode 100644 scrapegraphai/helpers/default_filters.py diff --git a/examples/local_models/search_link_graph_ollama.py b/examples/local_models/search_link_graph_ollama.py index a05067dd..885b65e9 100644 --- a/examples/local_models/search_link_graph_ollama.py +++ b/examples/local_models/search_link_graph_ollama.py @@ -9,14 +9,26 @@ from scrapegraphai.utils import prettify_exec_info graph_config = { "llm": { - "model": "ollama/llama3", + "model": "ollama/llama3.1:8b", "temperature": 0, "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, - "headless": False + "headless": False, + "filter_config": { + "diff_domain_filter": True, + # "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'], + # "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'], + # "irrelevant_keywords": [ + # '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com', + # 'linkedin.com', 'instagram.com', '.js', '.css', '/wp-content/', '/wp-admin/', + # '/wp-includes/', '/wp-json/', '/wp-comments-post.php', ';amp', '/about', + # '/careers', '/jobs', '/privacy', '/terms', '/legal', '/faq', '/help', + # '.pdf', '.zip', '/news', '/files', '/downloads' + # ] + }, } # ************************************************ diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py index 3898e4a9..66b2f223 100644 --- a/scrapegraphai/graphs/search_link_graph.py +++ b/scrapegraphai/graphs/search_link_graph.py @@ -72,7 +72,9 @@ class SearchLinkGraph(AbstractGraph): output=["parsed_doc"], node_config={ "llm_model": self.llm_model, - "chunk_size": self.model_token + "chunk_size": self.model_token, + "filter_links": self.config.get("filter_links", None), + "filter_config": self.config.get("filter_config", None) } ) diff --git a/scrapegraphai/helpers/default_filters.py b/scrapegraphai/helpers/default_filters.py new file mode 100644 index 00000000..a997736d --- /dev/null +++ b/scrapegraphai/helpers/default_filters.py @@ -0,0 +1,13 @@ +""" +Module for filtering irrelevant links +""" + +filter_dict = { + "diff_domain_filter": True, + "img_exts": ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'], + "lang_indicators": ['lang=', '/fr', '/pt', '/es', '/de', '/jp', '/it'], + "irrelevant_keywords": [ + '/login', '/signup', '/register', '/contact', 'facebook.com', 'twitter.com', + 'linkedin.com', 'instagram.com', '.js', '.css', + ] +} \ No newline at end of file diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 29d4b257..43820159 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -51,44 +51,42 @@ models_tokens = { "gemini-1.5-pro-latest": 128000, "models/embedding-001": 2048 }, - "ollama": { - "grok-1": 8192, - "command-r": 12800, - "codellama": 16000, - "dbrx": 32768, - "deepseek-coder:33b": 16000, - "falcon": 2048, - "llama2": 4096, - "llama3": 8192, - "llama3:70b": 8192, - "llama3.1":128000, - "llama3.1:70b": 128000, - "lama3.1:405b": 128000, - "scrapegraph": 8192, - "llava": 4096, - "mixtral:8x22b-instruct": 65536, - "mistral":8192, - "mistral-openorca": 32000, - "nomic-embed-text": 8192, - "nous-hermes2:34b": 4096, - "orca-mini": 2048, - "phi3:3.8b": 12800, - "qwen:0.5b": 32000, - "qwen:1.8b": 32000, - "qwen:4b": 32000, - "qwen:14b": 32000, - "qwen:32b": 32000, - "qwen:72b": 32000, - "qwen:110b": 32000, - "stablelm-zephyr": 8192, - "wizardlm2:8x22b": 65536, - # embedding models - "shaw/dmeta-embedding-zh-small-q4": 8192, - "shaw/dmeta-embedding-zh-q4": 8192, - "chevalblanc/acge_text_embedding": 8192, - "martcreation/dmeta-embedding-zh": 8192, - "snowflake-arctic-embed": 8192, - "mxbai-embed-large": 512 + "ollama": { "command-r": 12800, + "codellama": 16000, + "dbrx": 32768, + "deepseek-coder:33b": 16000, + "falcon": 2048, + "llama2": 4096, + "llama3": 8192, + "llama3:70b": 8192, + "llama3.1":128000, + "llama3.1:8b": 128000, + "llama3.1:70b": 128000, + "lama3.1:405b": 128000, + "scrapegraph": 8192, + "llava": 4096, + "mixtral:8x22b-instruct": 65536, + "mistral-openorca": 32000, + "nomic-embed-text": 8192, + "nous-hermes2:34b": 4096, + "orca-mini": 2048, + "phi3:3.8b": 12800, + "qwen:0.5b": 32000, + "qwen:1.8b": 32000, + "qwen:4b": 32000, + "qwen:14b": 32000, + "qwen:32b": 32000, + "qwen:72b": 32000, + "qwen:110b": 32000, + "stablelm-zephyr": 8192, + "wizardlm2:8x22b": 65536, + # embedding models + "shaw/dmeta-embedding-zh-small-q4": 8192, + "shaw/dmeta-embedding-zh-q4": 8192, + "chevalblanc/acge_text_embedding": 8192, + "martcreation/dmeta-embedding-zh": 8192, + "snowflake-arctic-embed": 8192, + "mxbai-embed-large": 512 }, "oneapi": { "qwen-turbo": 6000 diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 8b601e5a..c39c469d 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -4,12 +4,14 @@ SearchLinkNode Module from typing import List, Optional import re from tqdm import tqdm +from urllib.parse import urlparse, parse_qs from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from ..utils.logging import get_logger from .base_node import BaseNode from ..prompts import TEMPLATE_RELEVANT_LINKS +from ..helpers import default_filters class SearchLinkNode(BaseNode): @@ -39,10 +41,54 @@ class SearchLinkNode(BaseNode): super().__init__(node_name, "node", input, output, 1, node_config) self.llm_model = node_config["llm_model"] - self.verbose = ( - False if node_config is None else node_config.get("verbose", False) - ) + # Apply filters if filter_links is True or if filter_config is provided + if node_config.get("filter_links", False) or "filter_config" in node_config: + # Merge provided filter config with default filter config for partial configuration + provided_filter_config = node_config.get("filter_config", {}) + self.filter_config = {**default_filters.filter_dict, **provided_filter_config} + self.filter_links = True + else: + # Skip filtering if not enabled + self.filter_config = None + self.filter_links = False + + self.verbose = node_config.get("verbose", False) + self.seen_links = set() + + def _is_same_domain(self, url, domain): + if not self.filter_links or not self.filter_config.get("diff_domain_filter", True): + return True # Skip the domain filter if not enabled + parsed_url = urlparse(url) + parsed_domain = urlparse(domain) + return parsed_url.netloc == parsed_domain.netloc + + def _is_image_url(self, url): + if not self.filter_links: + return False # Skip image filtering if filtering is not enabled + + image_extensions = self.filter_config.get("img_exts", []) + return any(url.lower().endswith(ext) for ext in image_extensions) + + def _is_language_url(self, url): + if not self.filter_links: + return False # Skip language filtering if filtering is not enabled + + lang_indicators = self.filter_config.get("lang_indicators", []) + parsed_url = urlparse(url) + query_params = parse_qs(parsed_url.query) + + # Check if the URL path or query string indicates a language-specific version + return any(indicator in parsed_url.path.lower() or indicator in query_params for indicator in lang_indicators) + + def _is_potentially_irrelevant(self, url): + if not self.filter_links: + return False # Skip irrelevant URL filtering if filtering is not enabled + + irrelevant_keywords = self.filter_config.get("irrelevant_keywords", []) + return any(keyword in url.lower() for keyword in irrelevant_keywords) + + def execute(self, state: dict) -> dict: """ Filter out relevant links from the webpage that are relavant to prompt. Out of the filtered links, also @@ -64,6 +110,7 @@ class SearchLinkNode(BaseNode): parsed_content_chunks = state.get("doc") + source_url = state.get("url") or state.get("local_dir") output_parser = JsonOutputParser() relevant_links = [] @@ -76,10 +123,28 @@ class SearchLinkNode(BaseNode): ) ): try: + # Primary approach: Regular expression to extract links links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content)) - relevant_links += links + if not self.filter_links: + links = list(set(links)) + + relevant_links += links + self.seen_links.update(relevant_links) + else: + filtered_links = [ + link for link in links + if self._is_same_domain(link, source_url) + and not self._is_image_url(link) + and not self._is_language_url(link) + and not self._is_potentially_irrelevant(link) + and link not in self.seen_links + ] + filtered_links = list(set(filtered_links)) + relevant_links += filtered_links + self.seen_links.update(relevant_links) + except Exception as e: # Fallback approach: Using the LLM to extract links self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")