mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
implemented summary graph
This commit is contained in:
parent
da16360b1f
commit
4e3005db0a
@ -4,7 +4,7 @@ Basic example of scraping pipeline using SmartScraper
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SmartScraper
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -15,8 +15,8 @@ llm_config = {
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
}
|
||||
|
||||
smart_scraper = SmartScraper("List me all the titles and project descriptions",
|
||||
smart_scraper_graph = SmartScraperGraph("List me all the titles and project descriptions",
|
||||
"https://perinim.github.io/projects/", llm_config)
|
||||
|
||||
answer = smart_scraper.run()
|
||||
answer = smart_scraper_graph.run()
|
||||
print(answer)
|
||||
|
||||
26
examples/speech_summary_graph_example.py
Normal file
26
examples/speech_summary_graph_example.py
Normal file
@ -0,0 +1,26 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using SpeechSummaryGraph
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SpeechSummaryGraph
|
||||
|
||||
load_dotenv()

# Language-model settings: the API key is read from the environment
# (a local .env file is loaded by load_dotenv() above).
llm_config = {"api_key": os.getenv("OPENAI_APIKEY")}

# The generated MP3 is written next to this script.
script_dir = os.path.dirname(os.path.realpath(__file__))
output_file_path = os.path.join(script_dir, "website_summary.mp3")

speech_summary_graph = SpeechSummaryGraph(
    "Make a summary of the webpage to be converted to speech for blind people.",
    "https://perinim.github.io/projects/",
    llm_config,
    output_file_path,
)

# Run the pipeline, then show the textual answer stored in the returned state.
final_state = speech_summary_graph.run()
answer = final_state.get("answer", "No answer found.")
print(answer)
|
||||
@ -2,4 +2,5 @@
|
||||
__init__.py file for graphs folder
|
||||
"""
|
||||
from .base_graph import BaseGraph
|
||||
from .smart_scraper_graph import SmartScraper
|
||||
from .smart_scraper_graph import SmartScraperGraph
|
||||
from .speech_summary_graph import SpeechSummaryGraph
|
||||
|
||||
@ -9,7 +9,7 @@ from ..nodes import (
|
||||
)
|
||||
|
||||
|
||||
class SmartScraper:
|
||||
class SmartScraperGraph:
|
||||
"""
|
||||
SmartScraper is a comprehensive web scraping tool that automates the process of extracting
|
||||
information from web pages using a natural language model to interpret and answer prompts.
|
||||
|
||||
121
scrapegraphai/graphs/speech_summary_graph.py
Normal file
121
scrapegraphai/graphs/speech_summary_graph.py
Normal file
@ -0,0 +1,121 @@
|
||||
from ..models import OpenAI, OpenAITextToSpeech
|
||||
from .base_graph import BaseGraph
|
||||
from ..nodes import (
|
||||
FetchHTMLNode,
|
||||
ConditionalNode,
|
||||
GetProbableTagsNode,
|
||||
GenerateAnswerNode,
|
||||
ParseHTMLNode,
|
||||
TextToSpeechNode,
|
||||
)
|
||||
from scrapegraphai.utils import save_audio_from_bytes
|
||||
|
||||
class SpeechSummaryGraph:
    """
    SpeechSummaryGraph is a tool that automates the process of extracting and summarizing
    information from web pages, then converting that summary into spoken word via an MP3 file.

    Attributes:
        prompt (str): The user prompt, extended with an instruction asking the model to
            store the summary under a key called 'summary'.
        url (str): The URL of the web page to scrape and summarize.
        llm_config (dict): Configuration parameters for the language model,
            with 'api_key' mandatory.
        llm: The language model built by _create_llm().
        output_path (str): The path where the generated MP3 file will be saved.
        text_to_speech_model: The text-to-speech model (OpenAI "tts-1", voice "alloy").
        graph (BaseGraph): The graph of nodes built by _create_graph().

    Methods:
        run(): Executes the web scraping, summarization, and text-to-speech process.

    Args:
        prompt (str): The prompt used to guide the summarization process.
        url (str): The URL of the web page to scrape and summarize.
        llm_config (dict): A dictionary containing configuration options for the language model.
        output_path (str): The file path where the generated MP3 should be saved.
    """

    def __init__(self, prompt: str, url: str, llm_config: dict, output_path: str):
        """
        Initializes the SpeechSummaryGraph with a prompt, URL, language model
        configuration and the output path of the audio file.

        Raises:
            ValueError: If llm_config does not contain a non-empty 'api_key'.
        """
        # The suffix asks the model to put the summary under a 'summary' key of its answer.
        self.prompt = f"{prompt} - Save the summary in a key called 'summary'."
        self.url = url
        self.llm_config = llm_config
        # Built first so that a missing API key is reported before anything else is created.
        self.llm = self._create_llm()
        self.output_path = output_path
        self.text_to_speech_model = OpenAITextToSpeech(llm_config, model="tts-1", voice="alloy")
        self.graph = self._create_graph()

    def _create_llm(self):
        """
        Creates the language model from the default parameters merged with llm_config.

        Returns:
            OpenAI: The model instance built from the merged parameters.

        Raises:
            ValueError: If 'api_key' is missing from llm_config, or is empty/None.
        """
        llm_defaults = {
            "model_name": "gpt-3.5-turbo",
            "temperature": 0,
            "streaming": True,
        }
        # Values supplied by the caller take precedence over the defaults.
        llm_params = {**llm_defaults, **self.llm_config}
        # A key that is present but None/empty (e.g. os.getenv() on an unset variable)
        # is just as unusable as a missing one, so reject both here.
        if not llm_params.get("api_key"):
            raise ValueError("LLM configuration must include an 'api_key'.")
        return OpenAI(llm_params)

    def _create_graph(self):
        """
        Creates the graph of nodes representing the workflow for web scraping,
        summarization and text-to-speech conversion.

        Returns:
            BaseGraph: An instance of the BaseGraph class.
        """
        fetch_html_node = FetchHTMLNode("fetch_html")
        get_probable_tags_node = GetProbableTagsNode(
            self.llm, "get_probable_tags")
        parse_document_node = ParseHTMLNode("parse_document")
        generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer")
        # The conditional node receives its two candidate successors directly,
        # so it has no outgoing entry in `edges` below.
        conditional_node = ConditionalNode(
            "conditional", [parse_document_node, generate_answer_node])
        text_to_speech_node = TextToSpeechNode(
            self.text_to_speech_model, "text_to_speech")

        return BaseGraph(
            nodes={
                fetch_html_node,
                get_probable_tags_node,
                conditional_node,
                parse_document_node,
                generate_answer_node,
                text_to_speech_node,
            },
            edges={
                (fetch_html_node, get_probable_tags_node),
                (get_probable_tags_node, conditional_node),
                (parse_document_node, generate_answer_node),
                (generate_answer_node, text_to_speech_node),
            },
            entry_point=fetch_html_node,
        )

    def run(self) -> dict:
        """
        Executes the graph, saves the generated audio to output_path and returns
        the final state.

        Returns:
            dict: The final state produced by the graph execution. The audio is read
                from its "audio" key.

        Raises:
            ValueError: If the final state contains no audio.
        """
        inputs = {"user_input": self.prompt, "url": self.url}
        final_state = self.graph.execute(inputs)

        audio = final_state.get("audio")
        if not audio:
            raise ValueError("No audio generated from the text.")
        save_audio_from_bytes(audio, self.output_path)
        print(f"Audio saved to {self.output_path}")

        return final_state
|
||||
@ -34,6 +34,8 @@ class ImageToTextNode(BaseNode):
|
||||
url (str): url of the image where to
|
||||
:return: The updated state after executing this node.
|
||||
"""
|
||||
|
||||
print("---GENERATING TEXT FROM IMAGE---")
|
||||
text_answer = self.llm.run(url)
|
||||
|
||||
state.update({"image_text": text_answer})
|
||||
|
||||
@ -23,7 +23,7 @@ class TextToSpeechNode(BaseNode):
|
||||
super().__init__(node_name, "node")
|
||||
self.llm = llm
|
||||
|
||||
def execute(self, state: dict, text: str) -> dict:
|
||||
def execute(self, state: dict, text: str | None = None) -> dict:
|
||||
"""
|
||||
Execute the node's logic and return the updated state.
|
||||
Args:
|
||||
@ -33,7 +33,12 @@ class TextToSpeechNode(BaseNode):
|
||||
:return: The updated state after executing this node.
|
||||
"""
|
||||
|
||||
audio = self.llm.run(text)
|
||||
text2translate = state.get("answer", None)
|
||||
if not text2translate:
|
||||
raise ValueError("No text to translate to speech.")
|
||||
|
||||
print("---TRANSLATING TEXT TO SPEECH---")
|
||||
audio = self.llm.run(text2translate["summary"])
|
||||
|
||||
state.update({"audio": audio})
|
||||
return state
|
||||
|
||||
Loading…
Reference in New Issue
Block a user