fix: removed deep scraper

This commit is contained in:
Marco Vinciguerra 2024-09-29 15:40:04 +02:00
parent 27ae896cb7
commit 9aa8c889fb
7 changed files with 0 additions and 406 deletions

View File

@ -1,55 +0,0 @@
"""
Basic example of a scraping pipeline using DeepScraperGraph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import DeepScraperGraph
from scrapegraphai.utils import prettify_exec_info
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

# The Ernie credentials below are placeholders and must be replaced by the
# user. No API key is read from the environment: the previously assigned
# `openai_key` was never used by this configuration and has been dropped.
graph_config = {
    "llm": {
        "model": "ernie/ernie-bot-turbo",
        "ernie_client_id": "<ernie_client_id>",
        "ernie_client_secret": "<ernie_client_secret>",
        "temperature": 0.1
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434"},
    "verbose": True,
    "max_depth": 1
}

# ************************************************
# Create the DeepScraperGraph instance and run it
# ************************************************

deep_scraper_graph = DeepScraperGraph(
    prompt="List me all the job titles and detailed job description.",
    # also accepts a string with the already downloaded HTML code
    source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
    config=graph_config
)

result = deep_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

# The execution info is fetched before the state lookup, as in the original.
graph_exec_info = deep_scraper_graph.get_execution_info()
print(deep_scraper_graph.get_state("relevant_links"))
print(prettify_exec_info(graph_exec_info))

View File

@ -1,47 +0,0 @@
"""
Basic example of a scraping pipeline using DeepScraperGraph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import DeepScraperGraph
from scrapegraphai.utils import prettify_exec_info
load_dotenv()

# ************************************************
# Graph configuration (API key comes from FIREWORKS_APIKEY)
# ************************************************

llm_settings = {
    "api_key": os.getenv("FIREWORKS_APIKEY"),
    "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct",
}
graph_config = {"llm": llm_settings, "verbose": True, "max_depth": 1}

# ************************************************
# Build the DeepScraperGraph and execute it
# ************************************************

PROMPT = "List me all the job titles and detailed job description."
# A string with the already downloaded HTML code is accepted here as well.
SOURCE = "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India"

scraper = DeepScraperGraph(prompt=PROMPT, source=SOURCE, config=graph_config)
answer = scraper.run()
print(answer)

# ************************************************
# Report the links found and the execution info
# ************************************************

exec_info = scraper.get_execution_info()
print(scraper.get_state("relevant_links"))
print(prettify_exec_info(exec_info))

View File

@ -1,47 +0,0 @@
"""
Basic example of a scraping pipeline using DeepScraperGraph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import DeepScraperGraph
from scrapegraphai.utils import prettify_exec_info
load_dotenv()

# ************************************************
# Graph configuration (API key comes from MISTRAL_API_KEY)
# ************************************************

llm_settings = {
    "api_key": os.getenv("MISTRAL_API_KEY"),
    "model": "mistralai/open-mistral-nemo",
}
graph_config = {"llm": llm_settings, "verbose": True, "max_depth": 1}

# ************************************************
# Build the DeepScraperGraph and execute it
# ************************************************

PROMPT = "List me all the job titles and detailed job description."
# A string with the already downloaded HTML code is accepted here as well.
SOURCE = "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India"

scraper = DeepScraperGraph(prompt=PROMPT, source=SOURCE, config=graph_config)
answer = scraper.run()
print(answer)

# ************************************************
# Report the links found and the execution info
# ************************************************

exec_info = scraper.get_execution_info()
print(scraper.get_state("relevant_links"))
print(prettify_exec_info(exec_info))

View File

@ -1,47 +0,0 @@
"""
Basic example of a scraping pipeline using DeepScraperGraph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import DeepScraperGraph
from scrapegraphai.utils import prettify_exec_info
load_dotenv()

# ************************************************
# Graph configuration (API key comes from NEMOTRON_APIKEY)
# ************************************************

llm_settings = {
    "api_key": os.getenv("NEMOTRON_APIKEY"),
    "model": "nvidia/meta/llama3-70b-instruct",
}
graph_config = {"llm": llm_settings, "verbose": True, "max_depth": 1}

# ************************************************
# Build the DeepScraperGraph and execute it
# ************************************************

PROMPT = "List me all the job titles and detailed job description."
# A string with the already downloaded HTML code is accepted here as well.
SOURCE = "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India"

scraper = DeepScraperGraph(prompt=PROMPT, source=SOURCE, config=graph_config)
answer = scraper.run()
print(answer)

# ************************************************
# Report the links found and the execution info
# ************************************************

exec_info = scraper.get_execution_info()
print(scraper.get_state("relevant_links"))
print(prettify_exec_info(exec_info))

View File

@ -1,47 +0,0 @@
"""
Basic example of a scraping pipeline using DeepScraperGraph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import DeepScraperGraph
from scrapegraphai.utils import prettify_exec_info
load_dotenv()

# ************************************************
# Graph configuration (API key comes from OPENAI_APIKEY)
# ************************************************

llm_settings = {
    "api_key": os.getenv("OPENAI_APIKEY"),
    "model": "openai/gpt-4o",
}
graph_config = {"llm": llm_settings, "verbose": True, "max_depth": 1}

# ************************************************
# Build the DeepScraperGraph and execute it
# ************************************************

PROMPT = "List me all the job titles and detailed job description."
# A string with the already downloaded HTML code is accepted here as well.
SOURCE = "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India"

scraper = DeepScraperGraph(prompt=PROMPT, source=SOURCE, config=graph_config)
answer = scraper.run()
print(answer)

# ************************************************
# Report the links found and the execution info
# ************************************************

exec_info = scraper.get_execution_info()
print(scraper.get_state("relevant_links"))
print(prettify_exec_info(exec_info))

View File

@ -5,7 +5,6 @@ __init__.py file for graphs folder
from .abstract_graph import AbstractGraph
from .base_graph import BaseGraph
from .smart_scraper_graph import SmartScraperGraph
from .deep_scraper_graph import DeepScraperGraph
from .speech_graph import SpeechGraph
from .search_graph import SearchGraph
from .script_creator_graph import ScriptCreatorGraph

View File

@ -1,162 +0,0 @@
"""
DeepScraperGraph Module
"""
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
SearchLinkNode,
ParseNode,
GenerateAnswerNode,
GraphIteratorNode,
MergeAnswersNode
)
class DeepScraperGraph(AbstractGraph):
    """
    [WIP]

    DeepScraper is a scraping pipeline that extracts information from web
    pages, using a natural language model to interpret and answer prompts.
    Unlike SmartScraper, DeepScraper is meant to navigate to the links found
    within the input webpage in order to fulfil the task given in the prompt.

    The attributes this class reads but never assigns (``prompt``, ``source``,
    ``config``, ``schema``, ``llm_model``, ``model_token``, ``graph``) are
    presumably provided by ``AbstractGraph`` -- verify against that class.

    Attributes:
        input_key (str): ``"url"`` when ``source`` starts with ``"http"``,
            ``"local_dir"`` otherwise. Used as the state key under which
            ``source`` is passed to the graph in ``run``.
        final_state: State returned by the last graph execution (set by ``run``).
        execution_info: Execution info returned by the last graph execution
            (set by ``run``).

    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.

    Example:
        >>> deep_scraper = DeepScraperGraph(
        ...     "List me all the job titles and detailed job description.",
        ...     "https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = deep_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
        # Note the argument order expected by the parent: config before source.
        super().__init__(prompt, config, source, schema)

        if source.startswith("http"):
            self.input_key = "url"
        else:
            self.input_key = "local_dir"

    def _create_repeated_graph(self) -> BaseGraph:
        """
        Builds the graph that is intended to be executed repeatedly in order
        to search through the hyperlinks found within the webpage.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetcher = FetchNode(input="url | local_dir", output=["doc"])

        parser_config = {
            "chunk_size": self.model_token,
            "llm_model": self.llm_model,
        }
        parser = ParseNode(input="doc", output=["parsed_doc"], node_config=parser_config)

        answer_config = {
            "llm_model": self.llm_model,
            "additional_info": self.config.get("additional_info"),
            "schema": self.schema,
        }
        answer_generator = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config=answer_config,
        )

        link_finder = SearchLinkNode(
            input="user_prompt & relevant_chunks",
            output=["relevant_links"],
            node_config={"llm_model": self.llm_model},
        )

        # "graph_instance" is only a placeholder here; _create_graph replaces
        # it with this DeepScraperGraph instance.
        sub_graph_runner = GraphIteratorNode(
            input="user_prompt & relevant_links",
            output=["results"],
            node_config={"graph_instance": None, "batchsize": 1},
        )

        answer_merger = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={"llm_model": self.llm_model, "schema": self.schema},
        )

        pipeline_nodes = [
            fetcher,
            parser,
            answer_generator,
            link_finder,
            sub_graph_runner,
            answer_merger,
        ]
        # NOTE(review): no edge leaves the parse node and none enters the
        # answer-generation or link-search nodes, so those nodes look
        # unreachable from the entry point -- confirm how BaseGraph treats
        # nodes without connecting edges.
        pipeline_edges = [
            (fetcher, parser),
            (link_finder, sub_graph_runner),
            (sub_graph_runner, answer_merger),
        ]

        return BaseGraph(
            nodes=pipeline_nodes,
            edges=pipeline_edges,
            entry_point=fetcher,
            graph_name=self.__class__.__name__,
        )

    def _create_graph(self) -> BaseGraph:
        """
        Builds the graph of nodes representing the workflow for web scraping
        n-levels deep, by pointing the graph iterator node back at this
        instance.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.

        Raises:
            IndexError: If no node named "GraphIterator" is in the graph.
        """
        graph = self._create_repeated_graph()

        # The iterator node is located by its node_name; the first match wins.
        iterator_nodes = [
            node for node in graph.nodes if node.node_name == "GraphIterator"
        ]
        iterator_nodes[0].node_config["graph_instance"] = self

        return graph

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The value stored under "answer" in the final state, or
            "No answer found." when that key is absent.
        """
        initial_state = {"user_prompt": self.prompt, self.input_key: self.source}

        outcome = self.graph.execute(initial_state)
        self.final_state, self.execution_info = outcome

        return self.final_state.get("answer", "No answer found.")