mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
fix: examples and graphs
This commit is contained in:
parent
ba2b24b4cd
commit
5cf4e4f92f
@ -51,6 +51,7 @@ Please make sure to format your code accordingly before submitting a pull reques
|
|||||||
- [Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/)
|
- [Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/)
|
||||||
- [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
|
- [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
|
||||||
- [The Hitchhiker's Guide to Python](https://docs.python-guide.org/writing/style/)
|
- [The Hitchhiker's Guide to Python](https://docs.python-guide.org/writing/style/)
|
||||||
|
- [Pylint style of code for the documentation](https://pylint.pycqa.org/en/1.6.0/tutorial.html)
|
||||||
|
|
||||||
## Submitting a Pull Request
|
## Submitting a Pull Request
|
||||||
|
|
||||||
|
|||||||
@ -12,6 +12,9 @@ from scrapegraphai.nodes import FetchNode
|
|||||||
robots_node = FetchNode(
|
robots_node = FetchNode(
|
||||||
input="url | local_dir",
|
input="url | local_dir",
|
||||||
output=["doc"],
|
output=["doc"],
|
||||||
|
node_config={
|
||||||
|
"headless": False
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# ************************************************
|
# ************************************************
|
||||||
|
|||||||
@ -26,7 +26,9 @@ llm_model = Ollama(graph_config["llm"])
|
|||||||
robots_node = RobotsNode(
|
robots_node = RobotsNode(
|
||||||
input="url",
|
input="url",
|
||||||
output=["is_scrapable"],
|
output=["is_scrapable"],
|
||||||
node_config={"llm": llm_model}
|
node_config={"llm": llm_model,
|
||||||
|
"headless": False
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# ************************************************
|
# ************************************************
|
||||||
|
|||||||
@ -21,7 +21,8 @@ class JSONScraperGraph(AbstractGraph):
|
|||||||
source (str): The source of the graph.
|
source (str): The source of the graph.
|
||||||
config (dict): Configuration parameters for the graph.
|
config (dict): Configuration parameters for the graph.
|
||||||
llm_model: An instance of a language model client, configured for generating answers.
|
llm_model: An instance of a language model client, configured for generating answers.
|
||||||
embedder_model: An instance of an embedding model client, configured for generating embeddings.
|
embedder_model: An instance of an embedding model client,
|
||||||
|
configured for generating embeddings.
|
||||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||||
headless (bool): A flag indicating whether to run the graph in headless mode.
|
headless (bool): A flag indicating whether to run the graph in headless mode.
|
||||||
|
|
||||||
@ -47,7 +48,7 @@ class JSONScraperGraph(AbstractGraph):
|
|||||||
def _create_graph(self) -> BaseGraph:
|
def _create_graph(self) -> BaseGraph:
|
||||||
"""
|
"""
|
||||||
Creates the graph of nodes representing the workflow for web scraping.
|
Creates the graph of nodes representing the workflow for web scraping.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
BaseGraph: A graph instance representing the web scraping workflow.
|
BaseGraph: A graph instance representing the web scraping workflow.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -21,7 +21,8 @@ class ScriptCreatorGraph(AbstractGraph):
|
|||||||
source (str): The source of the graph.
|
source (str): The source of the graph.
|
||||||
config (dict): Configuration parameters for the graph.
|
config (dict): Configuration parameters for the graph.
|
||||||
llm_model: An instance of a language model client, configured for generating answers.
|
llm_model: An instance of a language model client, configured for generating answers.
|
||||||
embedder_model: An instance of an embedding model client, configured for generating embeddings.
|
embedder_model: An instance of an embedding model client,
|
||||||
|
configured for generating embeddings.
|
||||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||||
headless (bool): A flag indicating whether to run the graph in headless mode.
|
headless (bool): A flag indicating whether to run the graph in headless mode.
|
||||||
model_token (int): The token limit for the language model.
|
model_token (int): The token limit for the language model.
|
||||||
@ -44,7 +45,7 @@ class ScriptCreatorGraph(AbstractGraph):
|
|||||||
def __init__(self, prompt: str, source: str, config: dict):
|
def __init__(self, prompt: str, source: str, config: dict):
|
||||||
|
|
||||||
self.library = config['library']
|
self.library = config['library']
|
||||||
|
|
||||||
super().__init__(prompt, config, source)
|
super().__init__(prompt, config, source)
|
||||||
|
|
||||||
self.input_key = "url" if source.startswith("http") else "local_dir"
|
self.input_key = "url" if source.startswith("http") else "local_dir"
|
||||||
@ -61,25 +62,29 @@ class ScriptCreatorGraph(AbstractGraph):
|
|||||||
input="url | local_dir",
|
input="url | local_dir",
|
||||||
output=["doc"],
|
output=["doc"],
|
||||||
node_config={
|
node_config={
|
||||||
"headless": True if self.config is None else self.config.get("headless", True)}
|
"headless": True if self.config is None else self.config.get("headless", True),
|
||||||
|
"verbose": self.verbose}
|
||||||
)
|
)
|
||||||
parse_node = ParseNode(
|
parse_node = ParseNode(
|
||||||
input="doc",
|
input="doc",
|
||||||
output=["parsed_doc"],
|
output=["parsed_doc"],
|
||||||
node_config={"chunk_size": self.model_token}
|
node_config={"chunk_size": self.model_token,
|
||||||
|
"verbose": self.verbose}
|
||||||
)
|
)
|
||||||
rag_node = RAGNode(
|
rag_node = RAGNode(
|
||||||
input="user_prompt & (parsed_doc | doc)",
|
input="user_prompt & (parsed_doc | doc)",
|
||||||
output=["relevant_chunks"],
|
output=["relevant_chunks"],
|
||||||
node_config={
|
node_config={
|
||||||
"llm": self.llm_model,
|
"llm": self.llm_model,
|
||||||
"embedder_model": self.embedder_model
|
"embedder_model": self.embedder_model,
|
||||||
|
"verbose": self.verbose
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
generate_scraper_node = GenerateScraperNode(
|
generate_scraper_node = GenerateScraperNode(
|
||||||
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
||||||
output=["answer"],
|
output=["answer"],
|
||||||
node_config={"llm": self.llm_model},
|
node_config={"llm": self.llm_model,
|
||||||
|
"verbose": self.verbose},
|
||||||
library=self.library,
|
library=self.library,
|
||||||
website=self.source
|
website=self.source
|
||||||
)
|
)
|
||||||
@ -106,7 +111,7 @@ class ScriptCreatorGraph(AbstractGraph):
|
|||||||
Returns:
|
Returns:
|
||||||
str: The answer to the prompt.
|
str: The answer to the prompt.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
||||||
self.final_state, self.execution_info = self.graph.execute(inputs)
|
self.final_state, self.execution_info = self.graph.execute(inputs)
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,8 @@ from .abstract_graph import AbstractGraph
|
|||||||
|
|
||||||
class SmartScraperGraph(AbstractGraph):
|
class SmartScraperGraph(AbstractGraph):
|
||||||
"""
|
"""
|
||||||
SmartScraper is a scraping pipeline that automates the process of extracting information from web pages
|
SmartScraper is a scraping pipeline that automates the process of
|
||||||
|
extracting information from web pages
|
||||||
using a natural language model to interpret and answer prompts.
|
using a natural language model to interpret and answer prompts.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
@ -22,7 +23,8 @@ class SmartScraperGraph(AbstractGraph):
|
|||||||
source (str): The source of the graph.
|
source (str): The source of the graph.
|
||||||
config (dict): Configuration parameters for the graph.
|
config (dict): Configuration parameters for the graph.
|
||||||
llm_model: An instance of a language model client, configured for generating answers.
|
llm_model: An instance of a language model client, configured for generating answers.
|
||||||
embedder_model: An instance of an embedding model client, configured for generating embeddings.
|
embedder_model: An instance of an embedding model client,
|
||||||
|
configured for generating embeddings.
|
||||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||||
headless (bool): A flag indicating whether to run the graph in headless mode.
|
headless (bool): A flag indicating whether to run the graph in headless mode.
|
||||||
|
|
||||||
@ -45,7 +47,7 @@ class SmartScraperGraph(AbstractGraph):
|
|||||||
super().__init__(prompt, config, source)
|
super().__init__(prompt, config, source)
|
||||||
|
|
||||||
self.input_key = "url" if source.startswith("http") else "local_dir"
|
self.input_key = "url" if source.startswith("http") else "local_dir"
|
||||||
|
|
||||||
def _create_graph(self) -> BaseGraph:
|
def _create_graph(self) -> BaseGraph:
|
||||||
"""
|
"""
|
||||||
Creates the graph of nodes representing the workflow for web scraping.
|
Creates the graph of nodes representing the workflow for web scraping.
|
||||||
|
|||||||
@ -22,7 +22,8 @@ class XMLScraperGraph(AbstractGraph):
|
|||||||
source (str): The source of the graph.
|
source (str): The source of the graph.
|
||||||
config (dict): Configuration parameters for the graph.
|
config (dict): Configuration parameters for the graph.
|
||||||
llm_model: An instance of a language model client, configured for generating answers.
|
llm_model: An instance of a language model client, configured for generating answers.
|
||||||
embedder_model: An instance of an embedding model client, configured for generating embeddings.
|
embedder_model: An instance of an embedding model client,
|
||||||
|
configured for generating embeddings.
|
||||||
verbose (bool): A flag indicating whether to show print statements during execution.
|
verbose (bool): A flag indicating whether to show print statements during execution.
|
||||||
headless (bool): A flag indicating whether to run the graph in headless mode.
|
headless (bool): A flag indicating whether to run the graph in headless mode.
|
||||||
model_token (int): The token limit for the language model.
|
model_token (int): The token limit for the language model.
|
||||||
@ -49,7 +50,7 @@ class XMLScraperGraph(AbstractGraph):
|
|||||||
def _create_graph(self) -> BaseGraph:
|
def _create_graph(self) -> BaseGraph:
|
||||||
"""
|
"""
|
||||||
Creates the graph of nodes representing the workflow for web scraping.
|
Creates the graph of nodes representing the workflow for web scraping.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
BaseGraph: A graph instance representing the web scraping workflow.
|
BaseGraph: A graph instance representing the web scraping workflow.
|
||||||
"""
|
"""
|
||||||
@ -110,7 +111,7 @@ class XMLScraperGraph(AbstractGraph):
|
|||||||
Returns:
|
Returns:
|
||||||
str: The answer to the prompt.
|
str: The answer to the prompt.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
||||||
self.final_state, self.execution_info = self.graph.execute(inputs)
|
self.final_state, self.execution_info = self.graph.execute(inputs)
|
||||||
|
|
||||||
|
|||||||
56
tests/graphs/scrape_json_ollama.py
Normal file
56
tests/graphs/scrape_json_ollama.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
"""
|
||||||
|
Module for scraping json documents
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from scrapegraphai.graphs import JSONScraperGraph
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_json():
|
||||||
|
"""
|
||||||
|
Example of text
|
||||||
|
"""
|
||||||
|
file_name = "inputs/example.json"
|
||||||
|
curr_dir = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
file_path = os.path.join(curr_dir, file_name)
|
||||||
|
|
||||||
|
with open(file_path, 'r', encoding="utf-8") as file:
|
||||||
|
text = file.read()
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def graph_config():
|
||||||
|
"""
|
||||||
|
Configuration of the graph
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
"llm": {
|
||||||
|
"model": "ollama/mistral",
|
||||||
|
"temperature": 0,
|
||||||
|
"format": "json",
|
||||||
|
"base_url": "http://localhost:11434",
|
||||||
|
},
|
||||||
|
"embeddings": {
|
||||||
|
"model": "ollama/nomic-embed-text",
|
||||||
|
"temperature": 0,
|
||||||
|
"base_url": "http://localhost:11434",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_scraping_pipeline(sample_json: str, graph_config: dict):
|
||||||
|
"""
|
||||||
|
Start of the scraping pipeline
|
||||||
|
"""
|
||||||
|
smart_scraper_graph = JSONScraperGraph(
|
||||||
|
prompt="List me all the titles",
|
||||||
|
source=sample_json,
|
||||||
|
config=graph_config
|
||||||
|
)
|
||||||
|
|
||||||
|
result = smart_scraper_graph.run()
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
@ -3,7 +3,7 @@ Module for scraping XML documents
|
|||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import pytest
|
import pytest
|
||||||
from scrapegraphai.graphs import SmartScraperGraph
|
from scrapegraphai.graphs import XMLScraperGraph
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -45,7 +45,7 @@ def test_scraping_pipeline(sample_xml: str, graph_config: dict):
|
|||||||
"""
|
"""
|
||||||
Start of the scraping pipeline
|
Start of the scraping pipeline
|
||||||
"""
|
"""
|
||||||
smart_scraper_graph = SmartScraperGraph(
|
smart_scraper_graph = XMLScraperGraph(
|
||||||
prompt="List me all the authors, title and genres of the books",
|
prompt="List me all the authors, title and genres of the books",
|
||||||
source=sample_xml,
|
source=sample_xml,
|
||||||
config=graph_config
|
config=graph_config
|
||||||
|
|||||||
@ -46,6 +46,4 @@ def test_script_creator_graph(graph_config: dict):
|
|||||||
|
|
||||||
assert graph_exec_info is not None
|
assert graph_exec_info is not None
|
||||||
|
|
||||||
assert isinstance(graph_exec_info, dict)
|
|
||||||
|
|
||||||
print(prettify_exec_info(graph_exec_info))
|
print(prettify_exec_info(graph_exec_info))
|
||||||
|
|||||||
@ -17,6 +17,9 @@ def setup():
|
|||||||
robots_node = FetchNode(
|
robots_node = FetchNode(
|
||||||
input="url | local_dir",
|
input="url | local_dir",
|
||||||
output=["doc"],
|
output=["doc"],
|
||||||
|
node_config={
|
||||||
|
"headless": False
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
return robots_node
|
return robots_node
|
||||||
|
|||||||
@ -32,7 +32,9 @@ def setup():
|
|||||||
robots_node = RobotsNode(
|
robots_node = RobotsNode(
|
||||||
input="url",
|
input="url",
|
||||||
output=["is_scrapable"],
|
output=["is_scrapable"],
|
||||||
node_config={"llm": llm_model}
|
node_config={"llm": llm_model,
|
||||||
|
"headless": False
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
return robots_node
|
return robots_node
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user