"""
Basic example of collecting the links found on a page using SearchLinkGraph
with locally served Ollama models.
"""
from scrapegraphai.graphs import SearchLinkGraph
from scrapegraphai.utils import prettify_exec_info

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "verbose": True,
    "headless": False
}

# ************************************************
# Create the SearchLinkGraph instance and run it
# ************************************************

# SearchLinkGraph takes no prompt: only the source and the configuration.
search_link_graph = SearchLinkGraph(
    source="https://sport.sky.it/nba?gr=www",
    config=graph_config
)

result = search_link_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_link_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
""" SearchLinkGraph Module """
from typing import Optional, Union
import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph


from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )

class SearchLinkGraph(AbstractGraph):
    """
    SearchLinkGraph is a pipeline that fetches a source, parses it and runs a
    SearchLinkNode over the fetched document to collect the links it contains.

    This graph takes no user prompt: an empty string is passed to
    AbstractGraph in the position the other graphs use for the prompt.

    Attributes:
        prompt (str): The prompt for the graph (always "" for this graph).
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client,
        configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
        input_key (str): "url" when the source starts with "http", otherwise "local_dir".

    Args:
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel, optional): The schema for the graph output. Defaults to None.

    Example:
        >>> search_link_graph = SearchLinkGraph(
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_link_graph.run()
    """

    def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
        # No prompt is needed to list links, so an empty one is forwarded.
        super().__init__("", config, source, schema)

        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the link-search workflow.

        The nodes are chained as fetch -> parse -> search link.

        Returns:
            BaseGraph: A graph instance with the fetch node as entry point.
        """

        fetch_node = FetchNode(
            input="url| local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config={
                "llm_model": self.llm_model,
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token
            }
        )
        # NOTE(review): this node is configured with the same "parsed_doc"
        # output key as parse_node, so its result presumably replaces the
        # parsed chunks in the state -- confirm this is intended.
        search_link_node = SearchLinkNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "llm_model": self.llm_model,
                "chunk_size": self.model_token
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                search_link_node
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, search_link_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> Union[list, str]:
        """
        Executes the graph and returns what the final state holds under "parsed_doc".

        Returns:
            list | str: The value stored under "parsed_doc" in the final state
            (SearchLinkNode stores its list of collected links under its first
            output key, which this graph names "parsed_doc"), or the string
            "No answer found." when that key is missing.
        """

        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("parsed_doc", "No answer found.")