diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_haiku.py
new file mode 100644
index 00000000..ccfbc1d2
--- /dev/null
+++ b/examples/anthropic/search_link_graph_haiku.py
@@ -0,0 +1,57 @@
+"""
+Example of Search Graph (NOTE(review): file lives under examples/anthropic but configures Azure OpenAI models)
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+llm_model_instance = AzureChatOpenAI(
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
+)
+
+embedder_model_instance = AzureOpenAIEmbeddings(
+    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+)
+
+# ************************************************
+# Build the graph configuration from the model instances
+# ************************************************
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff
--git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py
new file mode 100644
index 00000000..f940c2a4
--- /dev/null
+++ b/examples/azure/search_link_graph_azure.py
@@ -0,0 +1,52 @@
+"""
+Example of Search Graph (NOTE(review): file lives under examples/azure but configures a Groq LLM with Ollama embeddings)
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "groq/gemma-7b-it",
+        "api_key": groq_key,
+        "temperature": 0
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+    },
+    "headless": False
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/bedrock/search_link_graph_bedrock.py b/examples/bedrock/search_link_graph_bedrock.py
new file mode 100644
index 00000000..116dea01
--- /dev/null
+++ b/examples/bedrock/search_link_graph_bedrock.py
@@ -0,0 +1,45 @@
+"""
+Example of Search Graph using Bedrock-hosted LLM and embedding models
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "client": "client_name",
+        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+        "temperature": 0.0
+    },
+    "embeddings": {
+        "model": "bedrock/cohere.embed-multilingual-v3"
+    }
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/deepseek/search_link_graph_deepseek.py b/examples/deepseek/search_link_graph_deepseek.py
new file mode 100644
index 00000000..30e4a9b3
--- /dev/null
+++ b/examples/deepseek/search_link_graph_deepseek.py
@@ -0,0 +1,52 @@
+"""
+Example of Search Graph using the DeepSeek chat model with Ollama embeddings
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+deepseek_key = os.getenv("DEEPSEEK_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "deepseek-chat",
+        "openai_api_key": deepseek_key,
+        "openai_api_base": 'https://api.deepseek.com/v1',
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+    },
+    "verbose": True,
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/ernie/search_graph_ernie.py b/examples/ernie/search_graph_ernie.py
index 22802c6e..c04d9f9b 100644
--- a/examples/ernie/search_graph_ernie.py
+++ b/examples/ernie/search_graph_ernie.py
@@ -12,15 +12,18 @@ load_dotenv()
 # Define the configuration for the graph
 # ************************************************
 
-openai_key = os.getenv("OPENAI_APIKEY")
-
 graph_config = {
     "llm": {
-        "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
-    },
-    "max_results": 2,
-    "verbose": True,
+        "model": "ernie-bot-turbo",
+        "ernie_client_id": "",
+        "ernie_client_secret": "",
+        "temperature": 0.1
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434"},
+    "library": "beautifulsoup"
 }
 
 # ************************************************
diff --git a/examples/ernie/search_link_graph_ernie.py b/examples/ernie/search_link_graph_ernie.py
new file mode 100644
index 00000000..466b230c
--- /dev/null
+++ b/examples/ernie/search_link_graph_ernie.py
@@ -0,0 +1,46 @@
+"""
+Example of Search Graph using the Ernie LLM with Ollama embeddings
+"""
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ernie-bot-turbo",
+        "ernie_client_id": "",
+        "ernie_client_secret": "",
+        "temperature": 0.1
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434"},
+    "library": "beautifulsoup"
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/fireworks/search_link_graph_fireworks.py b/examples/fireworks/search_link_graph_fireworks.py
new file mode 100644
index 00000000..a1d3a979
--- /dev/null
+++ b/examples/fireworks/search_link_graph_fireworks.py
@@ -0,0 +1,52 @@
+"""
+Example of Search Graph using a Fireworks-hosted LLM with Ollama embeddings
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": fireworks_api_key,
+        "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+    },
+    "max_results": 2,
+    "verbose": True,
+    "headless": False,
+}
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/gemini/search_link_graph_gemini.py b/examples/gemini/search_link_graph_gemini.py
new file mode 100644
index 00000000..937038bd
--- /dev/null
+++ b/examples/gemini/search_link_graph_gemini.py
@@ -0,0 +1,44 @@
+"""
+Example of Search Graph using the Gemini Pro model
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "gemini-pro",
+    },
+}
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/groq/search_link_graph_groq.py
b/examples/groq/search_link_graph_groq.py
new file mode 100644
index 00000000..f940c2a4
--- /dev/null
+++ b/examples/groq/search_link_graph_groq.py
@@ -0,0 +1,52 @@
+"""
+Example of Search Graph using a Groq-hosted LLM with Ollama embeddings
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "groq/gemma-7b-it",
+        "api_key": groq_key,
+        "temperature": 0
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+    },
+    "headless": False
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/groq/smart_scraper_groq.py b/examples/groq/smart_scraper_groq.py
index c1a5d319..f828cdec 100644
--- a/examples/groq/smart_scraper_groq.py
+++ b/examples/groq/smart_scraper_groq.py
@@ -9,7 +9,6 @@ from scrapegraphai.utils import prettify_exec_info
 
 load_dotenv()
 
-
 # ************************************************
 # Define the configuration for the graph
 # ************************************************
diff --git a/examples/huggingfacehub/search_link_graph_huggingfacehub.py
b/examples/huggingfacehub/search_link_graph_huggingfacehub.py
new file mode 100644
index 00000000..a49fb3b9
--- /dev/null
+++ b/examples/huggingfacehub/search_link_graph_huggingfacehub.py
@@ -0,0 +1,54 @@
+"""
+Example of Search Graph using Hugging Face Hub LLM and embedding endpoints
+"""
+import os
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+# NOTE: the API token is read from the HUGGINGFACEHUB_API_TOKEN environment variable
+
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+
+llm_model_instance = HuggingFaceEndpoint(
+    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+)
+
+embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+)
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/examples/local_models/search_link_graph_ollama.py
b/examples/local_models/search_link_graph_ollama.py
new file mode 100644
index 00000000..5c594270
--- /dev/null
+++ b/examples/local_models/search_link_graph_ollama.py
@@ -0,0 +1,43 @@
+"""
+Basic example of a link-extraction pipeline using SearchLinkGraph
+"""
+from scrapegraphai.graphs import SearchLinkGraph
+from scrapegraphai.utils import prettify_exec_info
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/llama3",
+        "temperature": 0,
+        "format": "json", # Ollama needs the format to be specified explicitly
+        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+    },
+    "verbose": True,
+    "headless": False
+}
+
+# ************************************************
+# Create the SearchLinkGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SearchLinkGraph(
+    source="https://sport.sky.it/nba?gr=www",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
index b1bf1242..26a0b9e1 100644
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -23,3 +23,4 @@ from .xml_scraper_multi_graph import XMLScraperMultiGraph
 from .script_creator_multi_graph import ScriptCreatorMultiGraph
 from .markdown_scraper_graph import MDScraperGraph
 from .markdown_scraper_multi_graph import MDScraperMultiGraph
+from .search_link_graph import SearchLinkGraph
diff --git
a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py
new file mode 100644
index 00000000..2e23357c
--- /dev/null
+++ b/scrapegraphai/graphs/search_link_graph.py
@@ -0,0 +1,107 @@
+""" SearchLinkGraph Module """
+from typing import Optional
+import logging
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+
+
+from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
+
+class SearchLinkGraph(AbstractGraph):
+    """
+    SearchLinkGraph is a scraping pipeline that fetches a source and extracts the links found in it.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel, optional): The schema for the graph output. Defaults to None.
+
+    Example:
+        >>> search_link_graph = SearchLinkGraph(
+        ...     source="https://en.wikipedia.org/wiki/Chioggia",
+        ...     config={"llm": {"model": "gpt-3.5-turbo"}},
+        ... )
+        >>> links = search_link_graph.run()
+        >>> print(links)
+    """
+
+    def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
+        super().__init__("", config, source, schema)
+
+        # Route the source to the FetchNode input key that matches its kind.
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for link extraction.
+
+        Returns:
+            BaseGraph: A graph instance chaining FetchNode -> ParseNode -> SearchLinkNode.
+        """
+
+        fetch_node = FetchNode(
+            input="url| local_dir",
+            output=["doc", "link_urls", "img_urls"],
+            node_config={
+                "llm_model": self.llm_model,
+                "force": self.config.get("force", False),
+                "cut": self.config.get("cut", True),
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+            }
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "chunk_size": self.model_token
+            }
+        )
+        # NOTE: SearchLinkNode consumes the fetched "doc" and stores its links under
+        # "parsed_doc", replacing whatever ParseNode wrote to that key.
+        search_link_node = SearchLinkNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "llm_model": self.llm_model,
+                "chunk_size": self.model_token
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                search_link_node
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, search_link_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the link extraction process and returns the links found in the source.
+
+        Returns:
+            The list of extracted links, or "No answer found." if the final state has no "parsed_doc".
+        """
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("parsed_doc", "No answer found.")
diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py
index 2a0c5f18..b3d289d9 100644
--- a/scrapegraphai/nodes/search_link_node.py
+++ b/scrapegraphai/nodes/search_link_node.py
@@ -4,6 +4,7 @@ SearchLinkNode Module
 
 # Imports from standard library
 from typing import List, Optional
+import re
 from tqdm import tqdm
 
 # Imports from Langchain
@@ -20,7 +21,7 @@ from .base_node import BaseNode
 class SearchLinkNode(BaseNode):
     """
     A node that can filter out the relevant links in the webpage content for the user prompt.
-    Node expects the aleready scrapped links on the webpage and hence it is expected
+    Node expects the already scraped links on the webpage and hence it is expected
     that this node be used after the FetchNode.
 
     Attributes:
@@ -67,39 +68,10 @@ class SearchLinkNode(BaseNode):
 
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
-        # Interpret input keys based on the provided input expression
-        input_keys = self.get_input_keys(state)
 
-        user_prompt = state[input_keys[0]]
-        parsed_content_chunks = state[input_keys[1]]
+        parsed_content_chunks = state.get("doc")
         output_parser = JsonOutputParser()
 
-        prompt_relevant_links = """
-            You are a website scraper and you have just scraped the following content from a website.
-            Content: {content}
-
-            You are now tasked with identifying all hyper links within the content that are potentially
-            relevant to the user task: {user_prompt}
-
-            Assume relevance broadly, including any links that might be related or potentially useful
-            in relation to the task.
-
-            Sort it in order of importance, the first one should be the most important one, the last one
-            the least important
-
-            Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
-            whether the content at the link is directly relevant.
-
-            Output only a list of relevant links in the format:
-                [
-                    "link1",
-                    "link2",
-                    "link3",
-                    .
-                    .
-                    .
-                ]
-            """
         relevant_links = []
 
         for i, chunk in enumerate(
@@ -109,15 +81,47 @@ class SearchLinkNode(BaseNode):
                 disable=not self.verbose,
             )
         ):
-            merge_prompt = PromptTemplate(
-                template=prompt_relevant_links,
-                input_variables=["content", "user_prompt"],
-            )
-            merge_chain = merge_prompt | self.llm_model | output_parser
-            # merge_chain = merge_prompt | self.llm_model
-            answer = merge_chain.invoke(
-                {"content": chunk.page_content, "user_prompt": user_prompt}
-            )
-            relevant_links += answer
+            try:
+                # Primary approach: Regular expression to extract links
+                links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
+
+                relevant_links += links
+            except Exception as e:
+                # Fallback approach: Using the LLM to extract links
+                self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
+                prompt_relevant_links = """
+                    You are a website scraper and you have just scraped the following content from a website.
+                    Content: {content}
+
+                    Assume relevance broadly, including any links that might be related or potentially useful
+                    in relation to the task.
+
+                    Sort it in order of importance, the first one should be the most important one, the last one
+                    the least important
+
+                    Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
+                    whether the content at the link is directly relevant.
+
+                    Output only a list of relevant links in the format:
+                        [
+                            "link1",
+                            "link2",
+                            "link3",
+                            .
+                            .
+                            .
+                        ]
+                    """
+
+                merge_prompt = PromptTemplate(
+                    template=prompt_relevant_links,
+                    input_variables=["content"],
+                )
+                merge_chain = merge_prompt | self.llm_model | output_parser
+                answer = merge_chain.invoke(
+                    {"content": chunk.page_content}
+                )
+                relevant_links += answer
+
         state.update({self.output[0]: relevant_links})
         return state
diff --git a/tests/graphs/search_link_ollama.py b/tests/graphs/search_link_ollama.py
new file mode 100644
index 00000000..3b41f699
--- /dev/null
+++ b/tests/graphs/search_link_ollama.py
@@ -0,0 +1,26 @@
+from scrapegraphai.graphs import SearchLinkGraph
+from scrapegraphai.utils import prettify_exec_info
+
+def test_smart_scraper_pipeline():
+    graph_config = {
+        "llm": {
+            "model": "ollama/llama3",
+            "temperature": 0,
+            "format": "json",
+        },
+        "embeddings": {
+            "model": "ollama/nomic-embed-text",
+            "temperature": 0,
+        },
+        "verbose": True,
+        "headless": False
+    }
+
+    smart_scraper_graph = SearchLinkGraph(
+        source="https://sport.sky.it/nba?gr=www",
+        config=graph_config
+    )
+
+    result = smart_scraper_graph.run()
+
+    assert result is not None