mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
This commit is contained in:
parent
cf3ab5564a
commit
57fdaf9e3a
43
examples/local_models/search_link_graph_ollama.py
Normal file
43
examples/local_models/search_link_graph_ollama.py
Normal file
@ -0,0 +1,43 @@
|
||||
"""
|
||||
Basic example of a link-search pipeline using SearchLinkGraph with Ollama models
|
||||
"""
|
||||
from scrapegraphai.graphs import SearchLinkGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
# ************************************************
# Define the configuration for the graph
# ************************************************

# Settings for the Ollama chat model handed to the graph
llm_settings = {
    "model": "ollama/llama3",
    "temperature": 0,
    "format": "json",  # Ollama needs the format to be specified explicitly
    # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
}

# Settings for the Ollama embedding model handed to the graph
embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
    # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SearchLinkGraph instance and run it
# ************************************************

search_link_graph = SearchLinkGraph(
    config=graph_config,
    source="https://sport.sky.it/nba?gr=www",
)

# run() hands back whatever the graph stored as its final output
output = search_link_graph.run()
print(output)

# ************************************************
# Get graph execution info
# ************************************************

exec_report = prettify_exec_info(search_link_graph.get_execution_info())
print(exec_report)
|
||||
@ -23,3 +23,4 @@ from .xml_scraper_multi_graph import XMLScraperMultiGraph
|
||||
from .script_creator_multi_graph import ScriptCreatorMultiGraph
|
||||
from .markdown_scraper_graph import MDScraperGraph
|
||||
from .markdown_scraper_multi_graph import MDScraperMultiGraph
|
||||
from .search_link_graph import SearchLinkGraph
|
||||
|
||||
104
scrapegraphai/graphs/search_link_graph.py
Normal file
104
scrapegraphai/graphs/search_link_graph.py
Normal file
@ -0,0 +1,104 @@
|
||||
""" SearchLinkGraph Module """
|
||||
from typing import Optional
|
||||
import logging
|
||||
from pydantic import BaseModel
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
|
||||
from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
|
||||
|
||||
class SearchLinkGraph(AbstractGraph):
    """
    SearchLinkGraph is a scraping pipeline that fetches a source, parses it and
    then runs a SearchLinkNode over the fetched document. No user prompt is
    required: the graph is built from a source and a configuration only.

    Attributes:
        prompt (str): The prompt for the graph (always passed as an empty string here).
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, passed to the fetch
            and search-link nodes.
        embedder_model: An instance of an embedding model client,
            configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
        input_key (str): "url" when the source starts with "http", otherwise "local_dir".

    Args:
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel, optional): The schema for the graph output. Defaults to None.

    Example:
        >>> search_link_graph = SearchLinkGraph(
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_link_graph.run()
    """

    def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
        # The base class takes a prompt as its first argument; this graph has none.
        super().__init__("", config, source, schema)

        is_http_source = source.startswith("http")
        self.input_key = "url" if is_http_source else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Assembles the fetch -> parse -> search-link node chain.

        Returns:
            BaseGraph: A graph whose entry point is the fetch node, followed by
            the parse node and finally the search-link node.
        """

        fetch_settings = {
            "llm_model": self.llm_model,
            "force": self.config.get("force", False),
            "cut": self.config.get("cut", True),
            "loader_kwargs": self.config.get("loader_kwargs", {}),
        }
        fetcher = FetchNode(
            input="url| local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config=fetch_settings,
        )

        parser = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={"chunk_size": self.model_token},
        )

        # NOTE: this node is wired to read "doc" and to write to "parsed_doc",
        # the same state key the parse node writes to; run() reads that key.
        link_search_settings = {
            "llm_model": self.llm_model,
            "chunk_size": self.model_token,
        }
        link_searcher = SearchLinkNode(
            input="doc",
            output=["parsed_doc"],
            node_config=link_search_settings,
        )

        ordered_nodes = [fetcher, parser, link_searcher]
        node_edges = [
            (fetcher, parser),
            (parser, link_searcher),
        ]

        return BaseGraph(
            nodes=ordered_nodes,
            edges=node_edges,
            entry_point=fetcher,
            graph_name=self.__class__.__name__,
        )

    def run(self) -> str:
        """
        Executes the graph on the configured source and returns its output.

        Returns:
            str: The value stored under "parsed_doc" in the final state, or
            "No answer found." when that key is absent.
        """

        graph_inputs = {"user_prompt": self.prompt}
        graph_inputs[self.input_key] = self.source

        self.final_state, self.execution_info = self.graph.execute(graph_inputs)

        return self.final_state.get("parsed_doc", "No answer found.")
|
||||
@ -68,11 +68,8 @@ class SearchLinkNode(BaseNode):
|
||||
|
||||
self.logger.info(f"--- Executing {self.node_name} Node ---")
|
||||
|
||||
# Interpret input keys based on the provided input expression
|
||||
input_keys = self.get_input_keys(state)
|
||||
|
||||
user_prompt = state[input_keys[0]]
|
||||
parsed_content_chunks = state[input_keys[1]]
|
||||
parsed_content_chunks = state.get("doc")
|
||||
output_parser = JsonOutputParser()
|
||||
|
||||
relevant_links = []
|
||||
@ -86,7 +83,8 @@ class SearchLinkNode(BaseNode):
|
||||
):
|
||||
try:
|
||||
# Primary approach: Regular expression to extract links
|
||||
links = re.findall(r'(https?://\S+)', chunk.page_content)
|
||||
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
|
||||
|
||||
relevant_links += links
|
||||
except Exception as e:
|
||||
# Fallback approach: Using the LLM to extract links
|
||||
@ -95,9 +93,6 @@ class SearchLinkNode(BaseNode):
|
||||
You are a website scraper and you have just scraped the following content from a website.
|
||||
Content: {content}
|
||||
|
||||
You are now tasked with identifying all hyper links within the content that are potentially
|
||||
relevant to the user task: {user_prompt}
|
||||
|
||||
Assume relevance broadly, including any links that might be related or potentially useful
|
||||
in relation to the task.
|
||||
|
||||
@ -124,9 +119,9 @@ class SearchLinkNode(BaseNode):
|
||||
)
|
||||
merge_chain = merge_prompt | self.llm_model | output_parser
|
||||
answer = merge_chain.invoke(
|
||||
{"content": chunk.page_content, "user_prompt": user_prompt}
|
||||
{"content": chunk.page_content}
|
||||
)
|
||||
relevant_links += answer
|
||||
|
||||
state.update({self.output[0]: relevant_links})
|
||||
return state
|
||||
return state
|
||||
Loading…
Reference in New Issue
Block a user