Scrapegraph-ai/tests/nodes/fetch_node_test.py
Marco Vinciguerra c31706fdae
Some checks are pending
/ build (3.10) (push) Waiting to run
Release / Build (push) Waiting to run
Release / Release (push) Blocked by required conditions
fixed tests
Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com>
2024-06-15 20:41:26 +02:00

72 lines
2.1 KiB
Python

from scrapegraphai.nodes import FetchNode
from langchain_core.documents import Document
def test_fetch_html(mocker):
    """Check that FetchNode returns the HTML served by a mocked ChromiumLoader.

    ``ChromiumLoader`` is patched at ``scrapegraphai.nodes.fetch_node`` and its
    ``load()`` is stubbed to return a single ``Document`` holding a small HTML
    page. The test verifies that the loader was called exactly once and that
    the page title, link URL and image URL all appear in the first document
    of the node's ``doc`` output.

    Args:
        mocker: patching fixture (pytest-mock's ``mocker``) used to replace
            ``ChromiumLoader``.
    """
    title = "ScrapeGraph AI"
    link_url = "https://github.com/VinciGit00/Scrapegraph-ai"
    img_url = "https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/scrapegraphai_logo.png"
    content = f"""
<html>
<head>
<title>{title}</title>
</head>
<body>
<a href="{link_url}">ScrapeGraphAI: You Only Scrape Once</a>
<img src="{img_url}" alt="Scrapegraph-ai Logo">
</body>
</html>
"""
    # Patch the name as looked up inside fetch_node so the node receives the stub.
    mock_loader_cls = mocker.patch("scrapegraphai.nodes.fetch_node.ChromiumLoader")
    mock_loader = mock_loader_cls.return_value
    mock_loader.load.return_value = [Document(page_content=content)]

    node = FetchNode(
        input="url | local_dir",
        output=["doc", "links", "images"],
        node_config={"headless": False},
    )
    result = node.execute({"url": "https://scrapegraph-ai.com/example"})

    mock_loader.load.assert_called_once()
    # Guard before subscripting: a None result should fail this assertion
    # rather than raise TypeError on the lookup below.
    assert result is not None
    doc = result["doc"][0]
    # Compare against the same values that were interpolated into the page so
    # the fixture and the expectations cannot drift apart.
    assert title in doc.page_content
    assert link_url in doc.page_content
    assert img_url in doc.page_content
def test_fetch_json():
    """Smoke-test FetchNode with a ``json`` input pointing at a local file path.

    Only asserts that ``execute`` returns something other than ``None``.
    """
    fetch_node = FetchNode(input="json", output=["doc"])
    # Relative path: resolved against the current working directory.
    state = {"json": "inputs/example.json"}
    outcome = fetch_node.execute(state)
    assert outcome is not None
def test_fetch_xml():
    """Smoke-test FetchNode with an ``xml`` input pointing at a local file path.

    Only asserts that ``execute`` returns something other than ``None``.
    """
    fetch_node = FetchNode(input="xml", output=["doc"])
    # Relative path: resolved against the current working directory.
    state = {"xml": "inputs/books.xml"}
    outcome = fetch_node.execute(state)
    assert outcome is not None
def test_fetch_csv():
    """Smoke-test FetchNode with a ``csv`` input pointing at a local file path.

    Only asserts that ``execute`` returns something other than ``None``.
    """
    fetch_node = FetchNode(input="csv", output=["doc"])
    # Relative path: resolved against the current working directory.
    state = {"csv": "inputs/username.csv"}
    outcome = fetch_node.execute(state)
    assert outcome is not None
def test_fetch_txt():
    """Smoke-test FetchNode with raw text supplied under the ``txt`` key.

    The content of ``inputs/plain_html_example.txt`` is read and passed to the
    node directly (not as a path). Only asserts that ``execute`` returns
    something other than ``None``.
    """
    node = FetchNode(
        input="txt",
        output=["doc", "links", "images"],
    )
    # Decode explicitly as UTF-8 so the test does not depend on the platform's
    # default locale encoding. The path is relative to the working directory.
    with open("inputs/plain_html_example.txt", encoding="utf-8") as f:
        text = f.read()
    # The file is closed before the node runs; the node only needs the text.
    result = node.execute({"txt": text})
    assert result is not None