Merge pull request #456 from ScrapeGraphAI/refactoring-of-search_link_node

fix: search link node
This commit is contained in:
Federico Aguzzi 2024-07-15 23:06:03 +02:00 committed by GitHub
commit dccb893fdf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 683 additions and 49 deletions

View File

@ -0,0 +1,57 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
# ************************************************
# Load the environment and build the Azure OpenAI clients
# ************************************************

load_dotenv()

# The same API version is used for both the chat and the embeddings client.
azure_api_version = os.environ["AZURE_OPENAI_API_VERSION"]

llm_model_instance = AzureChatOpenAI(
    openai_api_version=azure_api_version,
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
)

embedder_model_instance = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
    openai_api_version=azure_api_version,
)

# ************************************************
# Wrap the pre-built clients in the graph configuration
# ************************************************

llm_settings = {"model_instance": llm_model_instance}
embedding_settings = {"model_instance": embedder_model_instance}
graph_config = {"llm": llm_settings, "embeddings": embedding_settings}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -0,0 +1,52 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Load the environment and assemble the graph configuration
# ************************************************

load_dotenv()

llm_settings = {
    "model": "groq/gemma-7b-it",
    "api_key": os.getenv("GROQ_APIKEY"),
    "temperature": 0,
}

embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
    # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "headless": False,
}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -0,0 +1,45 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Assemble the graph configuration (Bedrock LLM and embeddings)
# ************************************************

llm_settings = {
    "client": "client_name",
    "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    "temperature": 0.0,
}

embedding_settings = {
    "model": "bedrock/cohere.embed-multilingual-v3",
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -0,0 +1,52 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Load the environment and assemble the graph configuration
# ************************************************

load_dotenv()

llm_settings = {
    "model": "deepseek-chat",
    "openai_api_key": os.getenv("DEEPSEEK_APIKEY"),
    "openai_api_base": "https://api.deepseek.com/v1",
}

embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
    # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "verbose": True,
}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -12,15 +12,18 @@ load_dotenv()
# Define the configuration for the graph
# ************************************************
openai_key = os.getenv("OPENAI_APIKEY")
graph_config = {
"llm": {
"api_key": openai_key,
"model": "gpt-3.5-turbo",
},
"max_results": 2,
"verbose": True,
"model": "ernie-bot-turbo",
"ernie_client_id": "<ernie_client_id>",
"ernie_client_secret": "<ernie_client_secret>",
"temperature": 0.1
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434"},
"library": "beautifulsoup"
}
# ************************************************

View File

@ -0,0 +1,46 @@
"""
Example of Search Graph
"""
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Assemble the graph configuration (Ernie LLM, Ollama embeddings)
# ************************************************

llm_settings = {
    "model": "ernie-bot-turbo",
    "ernie_client_id": "<ernie_client_id>",
    "ernie_client_secret": "<ernie_client_secret>",
    "temperature": 0.1,
}

embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
    "base_url": "http://localhost:11434",
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "library": "beautifulsoup",
}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -0,0 +1,52 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Load the environment and assemble the graph configuration
# ************************************************

load_dotenv()

llm_settings = {
    "api_key": os.getenv("FIREWORKS_APIKEY"),
    "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct",
}

embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
    # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "max_results": 2,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -0,0 +1,44 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Load the environment and assemble the graph configuration
# ************************************************

load_dotenv()

llm_settings = {
    "api_key": os.getenv("GOOGLE_APIKEY"),
    "model": "gemini-pro",
}

# Only an "llm" section is configured in this example.
graph_config = {"llm": llm_settings}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -0,0 +1,52 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Load the environment and assemble the graph configuration
# ************************************************

load_dotenv()

llm_settings = {
    "model": "groq/gemma-7b-it",
    "api_key": os.getenv("GROQ_APIKEY"),
    "temperature": 0,
}

embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
    # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "headless": False,
}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -9,7 +9,6 @@ from scrapegraphai.utils import prettify_exec_info
load_dotenv()
# ************************************************
# Define the configuration for the graph
# ************************************************

View File

@ -0,0 +1,54 @@
"""
Example of Search Graph
"""
import os
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
# ************************************************
# Build the HuggingFace LLM and embeddings clients
# ************************************************

# Read straight from the process environment (this script does not load a .env file).
hf_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

llm_model_instance = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    max_length=128,
    temperature=0.5,
    token=hf_token,
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=hf_token,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# ************************************************
# Wrap the pre-built clients in the graph configuration
# ************************************************

llm_settings = {"model_instance": llm_model_instance}
embedding_settings = {"model_instance": embedder_model_instance}
graph_config = {"llm": llm_settings, "embeddings": embedding_settings}

# ************************************************
# Build the SearchGraph, run it and show the result
# ************************************************

search_graph = SearchGraph(
    config=graph_config,
    prompt="List me the best escursions near Trento",
)

result = search_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_graph.get_execution_info()))

# Hand the result to both converters (CSV first, then JSON).
for export in (convert_to_csv, convert_to_json):
    export(result, "result")

View File

@ -0,0 +1,43 @@
"""
Basic example of running a SearchLinkGraph against a single URL
"""

from scrapegraphai.graphs import SearchLinkGraph
from scrapegraphai.utils import prettify_exec_info

# ************************************************
# Assemble the graph configuration (Ollama LLM and embeddings)
# ************************************************

llm_settings = {
    "model": "ollama/llama3",
    "temperature": 0,
    "format": "json",  # Ollama needs the format to be specified explicitly
    # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
}

embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
    # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Build the SearchLinkGraph, run it and show the result
# ************************************************

search_link_graph = SearchLinkGraph(
    config=graph_config,
    source="https://sport.sky.it/nba?gr=www",
)

result = search_link_graph.run()
print(result)

# ************************************************
# Print the execution info collected by the graph
# ************************************************

print(prettify_exec_info(search_link_graph.get_execution_info()))

View File

@ -23,3 +23,4 @@ from .xml_scraper_multi_graph import XMLScraperMultiGraph
from .script_creator_multi_graph import ScriptCreatorMultiGraph
from .markdown_scraper_graph import MDScraperGraph
from .markdown_scraper_multi_graph import MDScraperMultiGraph
from .search_link_graph import SearchLinkGraph

View File

@ -0,0 +1,104 @@
""" SearchLinkGraph Module """
from typing import Optional
import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
class SearchLinkGraph(AbstractGraph):
    """
    Pipeline that chains FetchNode -> ParseNode -> SearchLinkNode for one source.

    No user prompt is taken by this graph: the constructor forwards an empty
    string as the prompt to AbstractGraph.

    Attributes:
        source (str): The source handed to the graph (URL or local path).
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): Optional schema forwarded to AbstractGraph.
        input_key (str): "url" when ``source`` starts with "http",
            otherwise "local_dir"; used as the state key for ``source``.
        llm_model, model_token, graph: read by this class but not assigned
            here; presumably populated by AbstractGraph.__init__ (confirm there).

    Args:
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel, optional): The schema for the graph output. Defaults to None.

    Example:
        >>> link_graph = SearchLinkGraph(
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = link_graph.run()
    """

    def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
        # The base class takes the prompt first; this graph has none.
        super().__init__("", config, source, schema)

        if source.startswith("http"):
            self.input_key = "url"
        else:
            self.input_key = "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Build the three-node workflow: fetch, parse, then search for links.

        Returns:
            BaseGraph: Graph with FetchNode as entry point and the nodes
            connected in sequence.
        """
        fetch_options = {
            "llm_model": self.llm_model,
            "force": self.config.get("force", False),
            "cut": self.config.get("cut", True),
            "loader_kwargs": self.config.get("loader_kwargs", {}),
        }
        fetcher = FetchNode(
            input="url| local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config=fetch_options,
        )

        parser = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={"chunk_size": self.model_token},
        )

        # NOTE(review): this node declares the same output key as ParseNode
        # ("parsed_doc"); since it runs last, its value is presumably the one
        # left in the final state -- confirm against BaseGraph.execute.
        link_finder = SearchLinkNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "llm_model": self.llm_model,
                "chunk_size": self.model_token,
            },
        )

        pipeline = [fetcher, parser, link_finder]
        return BaseGraph(
            nodes=pipeline,
            # Consecutive pairs: (fetcher, parser), (parser, link_finder).
            edges=list(zip(pipeline, pipeline[1:])),
            entry_point=fetcher,
            graph_name=type(self).__name__,
        )

    def run(self) -> str:
        """
        Execute the graph and return the value stored under "parsed_doc".

        Returns:
            The "parsed_doc" entry of the final state, or the string
            "No answer found." when that key is absent.
        """
        graph_inputs = {"user_prompt": self.prompt}
        graph_inputs[self.input_key] = self.source

        self.final_state, self.execution_info = self.graph.execute(graph_inputs)

        return self.final_state.get("parsed_doc", "No answer found.")

View File

@ -4,6 +4,7 @@ SearchLinkNode Module
# Imports from standard library
from typing import List, Optional
import re
from tqdm import tqdm
# Imports from Langchain
@ -20,7 +21,7 @@ from .base_node import BaseNode
class SearchLinkNode(BaseNode):
"""
A node that can filter out the relevant links in the webpage content for the user prompt.
Node expects the aleready scrapped links on the webpage and hence it is expected
Node expects the already scrapped links on the webpage and hence it is expected
that this node be used after the FetchNode.
Attributes:
@ -67,39 +68,10 @@ class SearchLinkNode(BaseNode):
self.logger.info(f"--- Executing {self.node_name} Node ---")
# Interpret input keys based on the provided input expression
input_keys = self.get_input_keys(state)
user_prompt = state[input_keys[0]]
parsed_content_chunks = state[input_keys[1]]
parsed_content_chunks = state.get("doc")
output_parser = JsonOutputParser()
prompt_relevant_links = """
You are a website scraper and you have just scraped the following content from a website.
Content: {content}
You are now tasked with identifying all hyper links within the content that are potentially
relevant to the user task: {user_prompt}
Assume relevance broadly, including any links that might be related or potentially useful
in relation to the task.
Sort it in order of importance, the first one should be the most important one, the last one
the least important
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
whether the content at the link is directly relevant.
Output only a list of relevant links in the format:
[
"link1",
"link2",
"link3",
.
.
.
]
"""
relevant_links = []
for i, chunk in enumerate(
@ -109,15 +81,47 @@ class SearchLinkNode(BaseNode):
disable=not self.verbose,
)
):
merge_prompt = PromptTemplate(
template=prompt_relevant_links,
input_variables=["content", "user_prompt"],
)
merge_chain = merge_prompt | self.llm_model | output_parser
# merge_chain = merge_prompt | self.llm_model
answer = merge_chain.invoke(
{"content": chunk.page_content, "user_prompt": user_prompt}
)
relevant_links += answer
try:
# Primary approach: Regular expression to extract links
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
relevant_links += links
except Exception as e:
# Fallback approach: Using the LLM to extract links
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
prompt_relevant_links = """
You are a website scraper and you have just scraped the following content from a website.
Content: {content}
Assume relevance broadly, including any links that might be related or potentially useful
in relation to the task.
Sort it in order of importance, the first one should be the most important one, the last one
the least important
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
whether the content at the link is directly relevant.
Output only a list of relevant links in the format:
[
"link1",
"link2",
"link3",
.
.
.
]
"""
merge_prompt = PromptTemplate(
template=prompt_relevant_links,
input_variables=["content", "user_prompt"],
)
merge_chain = merge_prompt | self.llm_model | output_parser
answer = merge_chain.invoke(
{"content": chunk.page_content}
)
relevant_links += answer
state.update({self.output[0]: relevant_links})
return state

View File

@ -0,0 +1,26 @@
from scrapegraphai.graphs import SearchLinkGraph
from scrapegraphai.utils import prettify_exec_info
def test_smart_scraper_pipeline():
    """
    Run SearchLinkGraph end to end on one URL and check it produced output.

    Uses the Ollama-backed configuration below; presumably needs network
    access and a reachable Ollama server -- confirm in the test environment.
    """
    graph_config = {
        "llm": {
            "model": "ollama/llama3",
            "temperature": 0,
            "format": "json",
        },
        "embeddings": {
            "model": "ollama/nomic-embed-text",
            "temperature": 0,
        },
        "verbose": True,
        "headless": False
    }

    smart_scraper_graph = SearchLinkGraph(
        source="https://sport.sky.it/nba?gr=www",
        config=graph_config
    )

    result = smart_scraper_graph.run()

    assert result is not None
    # SearchLinkGraph.run falls back to this string when the final state has
    # no "parsed_doc" entry, so a bare None check can never detect a missing
    # result; reject the fallback explicitly.
    assert result != "No answer found."