From de10b281bab7385e250f4284ff3922dba38882f7 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 23 Sep 2024 08:26:36 +0200 Subject: [PATCH] feat: update search_link_graph --- scrapegraphai/graphs/search_link_graph.py | 59 ++++++++++++----------- scrapegraphai/nodes/__init__.py | 1 + scrapegraphai/nodes/search_link_node.py | 2 - 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py index 58dd1613..9df04871 100644 --- a/scrapegraphai/graphs/search_link_graph.py +++ b/scrapegraphai/graphs/search_link_graph.py @@ -6,9 +6,11 @@ import logging from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph -from ..nodes import ( FetchNode, ParseNode, SearchLinkNode ) +from ..nodes import (FetchNode, + SearchLinkNode, + SearchLinksWithContext) -class SearchLinkGraph(AbstractGraph): +class SearchLinkGraph(AbstractGraph): """ SearchLinkGraph is a scraping pipeline that automates the process of extracting information from web pages using a natural language model @@ -30,13 +32,7 @@ class SearchLinkGraph(AbstractGraph): config (dict): Configuration parameters for the graph. schema (BaseModel, optional): The schema for the graph output. Defaults to None. - Example: - >>> smart_scraper = SearchLinkGraph( - ... "List me all the attractions in Chioggia.", - ... "https://en.wikipedia.org/wiki/Chioggia", - ... {"llm": {"model": "openai/gpt-3.5-turbo"}} - ... ) - >>> result = smart_scraper.run() + """ def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None): @@ -51,28 +47,33 @@ class SearchLinkGraph(AbstractGraph): Returns: BaseGraph: A graph instance representing the web scraping workflow. """ - fetch_node = FetchNode( - input="url| local_dir", - output=["doc"], - node_config={ - "llm_model": self.llm_model, - "force": self.config.get("force", False), - "cut": self.config.get("cut", True), - "loader_kwargs": self.config.get("loader_kwargs", {}), - } - ) + input="url| local_dir", + output=["doc"], + node_config={ + "force": self.config.get("force", False), + "cut": self.config.get("cut", True), + "loader_kwargs": self.config.get("loader_kwargs", {}), + } + ) - search_link_node = SearchLinkNode( - input="doc", - output=["parsed_doc"], - node_config={ - "llm_model": self.llm_model, - "chunk_size": self.model_token, - "filter_links": self.config.get("filter_links", None), - "filter_config": self.config.get("filter_config", None) - } - ) + if self.config.get("llm_style") == (True, None): + search_link_node = SearchLinksWithContext( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token, + } + ) + else: + search_link_node = SearchLinkNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": self.model_token, + } + ) return BaseGraph( nodes=[ diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index 1e990400..29d70b37 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -23,3 +23,4 @@ from .merge_generated_scripts import MergeGeneratedScriptsNode from .fetch_screen_node import FetchScreenNode from .generate_answer_from_image_node import GenerateAnswerFromImageNode from .concat_answers_node import ConcatAnswersNode +from .search_node_with_context import SearchLinksWithContext diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 32dc1928..935a64ba 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -40,8 +40,6 @@ class SearchLinkNode(BaseNode): ): super().__init__(node_name, "node", input, output, 1, node_config) - self.llm_model = node_config["llm_model"] - if node_config.get("filter_links", False) or "filter_config" in node_config: provided_filter_config = node_config.get("filter_config", {}) self.filter_config = {**default_filters.filter_dict, **provided_filter_config}