mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
fix: removed deep scraper
This commit is contained in:
parent
27ae896cb7
commit
9aa8c889fb
@ -1,55 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using DeepScraperGraph
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import DeepScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
openai_key = os.getenv("OPENAI_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ernie/ernie-bot-turbo",
|
||||
"ernie_client_id": "<ernie_client_id>",
|
||||
"ernie_client_secret": "<ernie_client_secret>",
|
||||
"temperature": 0.1
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434"},
|
||||
"verbose": True,
|
||||
"max_depth": 1
|
||||
}
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Create the DeepScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
deep_scraper_graph = DeepScraperGraph(
|
||||
prompt="List me all the job titles and detailed job description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = deep_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = deep_scraper_graph.get_execution_info()
|
||||
print(deep_scraper_graph.get_state("relevant_links"))
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -1,47 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using DeepScraperGraph
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import DeepScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": fireworks_api_key,
|
||||
"model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
|
||||
},
|
||||
"verbose": True,
|
||||
"max_depth": 1
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the DeepScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
deep_scraper_graph = DeepScraperGraph(
|
||||
prompt="List me all the job titles and detailed job description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = deep_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = deep_scraper_graph.get_execution_info()
|
||||
print(deep_scraper_graph.get_state("relevant_links"))
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -1,47 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using DeepScraperGraph
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import DeepScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
mistral_key = os.getenv("MISTRAL_API_KEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": mistral_key,
|
||||
"model": "mistralai/open-mistral-nemo",
|
||||
},
|
||||
"verbose": True,
|
||||
"max_depth": 1
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the DeepScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
deep_scraper_graph = DeepScraperGraph(
|
||||
prompt="List me all the job titles and detailed job description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = deep_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = deep_scraper_graph.get_execution_info()
|
||||
print(deep_scraper_graph.get_state("relevant_links"))
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -1,47 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using DeepScraperGraph
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import DeepScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
nemotron_key = os.getenv("NEMOTRON_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": nemotron_key,
|
||||
"model": "nvidia/meta/llama3-70b-instruct",
|
||||
},
|
||||
"verbose": True,
|
||||
"max_depth": 1
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the DeepScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
deep_scraper_graph = DeepScraperGraph(
|
||||
prompt="List me all the job titles and detailed job description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = deep_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = deep_scraper_graph.get_execution_info()
|
||||
print(deep_scraper_graph.get_state("relevant_links"))
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -1,47 +0,0 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using DeepScraperGraph
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import DeepScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
openai_key = os.getenv("OPENAI_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": openai_key,
|
||||
"model": "openai/gpt-4o",
|
||||
},
|
||||
"verbose": True,
|
||||
"max_depth": 1
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the DeepScraperGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
deep_scraper_graph = DeepScraperGraph(
|
||||
prompt="List me all the job titles and detailed job description.",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = deep_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = deep_scraper_graph.get_execution_info()
|
||||
print(deep_scraper_graph.get_state("relevant_links"))
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -5,7 +5,6 @@ __init__.py file for graphs folder
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .base_graph import BaseGraph
|
||||
from .smart_scraper_graph import SmartScraperGraph
|
||||
from .deep_scraper_graph import DeepScraperGraph
|
||||
from .speech_graph import SpeechGraph
|
||||
from .search_graph import SearchGraph
|
||||
from .script_creator_graph import ScriptCreatorGraph
|
||||
|
||||
@ -1,162 +0,0 @@
|
||||
"""
|
||||
DeepScraperGraph Module
|
||||
"""
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
SearchLinkNode,
|
||||
ParseNode,
|
||||
GenerateAnswerNode,
|
||||
GraphIteratorNode,
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
class DeepScraperGraph(AbstractGraph):
    """
    [WIP]

    DeepScraper is a scraping pipeline that automates the process of
    extracting information from web pages using a natural language model
    to interpret and answer prompts.

    Unlike SmartScraper, DeepScraper can navigate to the links within
    the input webpage to fulfil the task within the prompt.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client,
        configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.

    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.

    Example:
        >>> deep_scraper = DeepScraperGraph(
        ...     "List me all the job titles and detailed job description.",
        ...     "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = deep_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
        # The parent constructor takes (prompt, config, source, schema) in that order.
        super().__init__(prompt, config, source, schema)

        # Sources starting with "http" are passed to the graph under the "url"
        # key; anything else is passed under "local_dir".
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_repeated_graph(self) -> BaseGraph:
        """
        Creates the graph that can be repeatedly executed to conduct search on
        hyperlinks within the webpage.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"]
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token,
                "llm_model": self.llm_model
            }
        )

        generate_answer_node = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "additional_info": self.config.get("additional_info"),
                "schema": self.schema
            }
        )

        search_node = SearchLinkNode(
            input="user_prompt & relevant_chunks",
            output=["relevant_links"],
            node_config={
                "llm_model": self.llm_model,
            }
        )

        # "graph_instance" is left as None here; _create_graph() fills it in
        # once the graph has been assembled.
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & relevant_links",
            output=["results"],
            node_config={
                "graph_instance": None,
                "batchsize": 1
            }
        )

        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        # NOTE(review): no edge leaves parse_node, generate_answer_node is not
        # part of any edge, and no node declared here outputs "relevant_chunks"
        # (required by search_node). Presumably unfinished wiring, consistent
        # with the [WIP] marker on the class -- confirm before relying on it.
        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                generate_answer_node,
                search_node,
                graph_iterator_node,
                merge_answers_node
            ],
            edges=[
                (fetch_node, parse_node),
                (search_node, graph_iterator_node),
                (graph_iterator_node, merge_answers_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping
        n-levels deep.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.

        Raises:
            IndexError: If the assembled graph contains no node named
                "GraphIterator".
        """
        base_graph = self._create_repeated_graph()

        # Stop at the first node named "GraphIterator" instead of filtering
        # the whole node list into a temporary list.
        graph_iterator_node = next(
            (node for node in base_graph.nodes if node.node_name == "GraphIterator"),
            None,
        )
        if graph_iterator_node is None:
            raise IndexError("No node named 'GraphIterator' found in the graph.")

        # The iterator node is handed this very graph object as the instance
        # it should run.
        # NOTE(review): how recursion depth is bounded is not visible here
        # (the examples pass "max_depth" in the config) -- confirm.
        graph_iterator_node.node_config["graph_instance"] = self
        return base_graph

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The answer to the prompt, or "No answer found." when the final
            state has no "answer" key.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
|
||||
Loading…
Reference in New Issue
Block a user