Merge pull request #352 from tejhande/patch-1
Some checks are pending
CodeQL / Analyze (python) (push) Waiting to run
/ build (3.10) (push) Waiting to run
Release / Build (push) Waiting to run
Release / Release (push) Blocked by required conditions

test: Enhance JSON scraping pipeline test
This commit is contained in:
Marco Vinciguerra 2024-06-07 09:55:14 +02:00 committed by GitHub
commit 261c4fcdf5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,56 +1,50 @@
"""
Module for scraping json documents
"""
Module for scraping JSON documents
"""
import os
import json
import pytest
from scrapegraphai.graphs import JSONScraperGraph
# Load configuration from a JSON file
CONFIG_FILE = "config.json"
with open(CONFIG_FILE, "r") as f:
CONFIG = json.load(f)
# Fixture to read the sample JSON file
@pytest.fixture
def sample_json():
"""
Example of text
Read the sample JSON file
"""
file_name = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, file_name)
with open(file_path, 'r', encoding="utf-8") as file:
file_path = os.path.join(os.path.dirname(__file__), "inputs", "example.json")
with open(file_path, "r", encoding="utf-8") as file:
text = file.read()
return text
# Parametrized fixture to load graph configurations
@pytest.fixture(params=CONFIG["graph_configs"])
def graph_config(request):
"""
Load graph configuration
"""
return request.param
@pytest.fixture
def graph_config():
# Test function for the scraping pipeline
def test_scraping_pipeline(sample_json, graph_config):
"""
Configuration of the graph
Test the scraping pipeline
"""
return {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json",
"base_url": "http://localhost:11434",
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434",
}
}
expected_titles = ["Title 1", "Title 2", "Title 3"] # Replace with expected titles
def test_scraping_pipeline(sample_json: str, graph_config: dict):
"""
Start of the scraping pipeline
"""
smart_scraper_graph = JSONScraperGraph(
prompt="List me all the titles",
source=sample_json,
config=graph_config
)
result = smart_scraper_graph.run()
assert result is not None
assert isinstance(result, list)
assert sorted(result) == sorted(expected_titles)