mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
feat: update search_link_graph
This commit is contained in:
parent
369332b39c
commit
de10b281ba
@ -6,9 +6,11 @@ import logging
|
||||
from pydantic import BaseModel
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
|
||||
from ..nodes import (FetchNode,
|
||||
SearchLinkNode,
|
||||
SearchLinksWithContext)
|
||||
|
||||
class SearchLinkGraph(AbstractGraph):
|
||||
class SearchLinkGraph(AbstractGraph):
|
||||
"""
|
||||
SearchLinkGraph is a scraping pipeline that automates the process of
|
||||
extracting information from web pages using a natural language model
|
||||
@ -30,13 +32,7 @@ class SearchLinkGraph(AbstractGraph):
|
||||
config (dict): Configuration parameters for the graph.
|
||||
schema (BaseModel, optional): The schema for the graph output. Defaults to None.
|
||||
|
||||
Example:
|
||||
>>> smart_scraper = SearchLinkGraph(
|
||||
... "List me all the attractions in Chioggia.",
|
||||
... "https://en.wikipedia.org/wiki/Chioggia",
|
||||
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
|
||||
... )
|
||||
>>> result = smart_scraper.run()
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
|
||||
@ -51,28 +47,33 @@ class SearchLinkGraph(AbstractGraph):
|
||||
Returns:
|
||||
BaseGraph: A graph instance representing the web scraping workflow.
|
||||
"""
|
||||
|
||||
fetch_node = FetchNode(
|
||||
input="url| local_dir",
|
||||
output=["doc"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"force": self.config.get("force", False),
|
||||
"cut": self.config.get("cut", True),
|
||||
"loader_kwargs": self.config.get("loader_kwargs", {}),
|
||||
}
|
||||
)
|
||||
input="url| local_dir",
|
||||
output=["doc"],
|
||||
node_config={
|
||||
"force": self.config.get("force", False),
|
||||
"cut": self.config.get("cut", True),
|
||||
"loader_kwargs": self.config.get("loader_kwargs", {}),
|
||||
}
|
||||
)
|
||||
|
||||
search_link_node = SearchLinkNode(
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"chunk_size": self.model_token,
|
||||
"filter_links": self.config.get("filter_links", None),
|
||||
"filter_config": self.config.get("filter_config", None)
|
||||
}
|
||||
)
|
||||
if self.config.get("llm_style") == (True, None):
|
||||
search_link_node = SearchLinksWithContext(
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"chunk_size": self.model_token,
|
||||
}
|
||||
)
|
||||
else:
|
||||
search_link_node = SearchLinkNode(
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
node_config={
|
||||
"chunk_size": self.model_token,
|
||||
}
|
||||
)
|
||||
|
||||
return BaseGraph(
|
||||
nodes=[
|
||||
|
||||
@ -23,3 +23,4 @@ from .merge_generated_scripts import MergeGeneratedScriptsNode
|
||||
from .fetch_screen_node import FetchScreenNode
|
||||
from .generate_answer_from_image_node import GenerateAnswerFromImageNode
|
||||
from .concat_answers_node import ConcatAnswersNode
|
||||
from .search_node_with_context import SearchLinksWithContext
|
||||
|
||||
@ -40,8 +40,6 @@ class SearchLinkNode(BaseNode):
|
||||
):
|
||||
super().__init__(node_name, "node", input, output, 1, node_config)
|
||||
|
||||
self.llm_model = node_config["llm_model"]
|
||||
|
||||
if node_config.get("filter_links", False) or "filter_config" in node_config:
|
||||
provided_filter_config = node_config.get("filter_config", {})
|
||||
self.filter_config = {**default_filters.filter_dict, **provided_filter_config}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user