From cd795a71c0cf9bf118633d347e99bc4de2cf9618 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 19 Dec 2024 11:15:03 -0500 Subject: [PATCH] Add in tests --- .../workflows/{tests.yml => benchmarks.yml} | 6 --- .github/workflows/ci.yml | 26 ++++++++++ poetry.lock | 50 ++++++++++++++++++- pyproject.toml | 1 + pytest.ini | 7 +++ surya/benchmark/tesseract.py | 5 +- tests/conftest.py | 10 ++++ tests/test_ocr_errors.py | 18 +++++++ 8 files changed, 114 insertions(+), 9 deletions(-) rename .github/workflows/{tests.yml => benchmarks.yml} (85%) create mode 100644 .github/workflows/ci.yml create mode 100644 pytest.ini create mode 100644 tests/conftest.py create mode 100644 tests/test_ocr_errors.py diff --git a/.github/workflows/tests.yml b/.github/workflows/benchmarks.yml similarity index 85% rename from .github/workflows/tests.yml rename to .github/workflows/benchmarks.yml index 9edbe6e..8955d73 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/benchmarks.yml @@ -14,16 +14,10 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.11 - - name: Install apt dependencies - run: | - sudo apt-get update - sudo apt-get install -y tesseract-ocr tesseract-ocr-eng - name: Install python dependencies run: | pip install poetry poetry install - poetry remove torch - poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu - name: Run detection benchmark test run: | poetry run python benchmark/detection.py --max 2 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a6057e9 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,26 @@ +name: Integration test + +on: [push] + +env: + TORCH_DEVICE: "cpu" + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install apt dependencies + run: | + sudo apt-get update + sudo apt-get install -y tesseract-ocr tesseract-ocr-eng + - name: Install python dependencies + run: | + pip install poetry + poetry install + - name: Run tests + run: poetry run pytest \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 48dbc9f..76cb7d7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1235,6 +1235,17 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + [[package]] name = "ipykernel" version = "6.29.5" @@ -2692,6 +2703,21 @@ files = [ greenlet = "3.1.1" pyee = "12.0.0" +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "prometheus-client" version = "0.21.1" @@ -3207,6 +3233,28 @@ files = [ packaging = ">=21.3" Pillow = ">=8.0.0" +[[package]] +name = "pytest" +version = "8.3.4" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, + {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -4925,4 +4973,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "e5dfcdc29e7912fe8cedfb2af75c0edfe5a911de4600890f75922f501b440046" +content-hash = "dd035c4c1f7634ad4fc809b9a11bad6d9c936eb2ce5c992830f68835f69fea12" diff --git a/pyproject.toml b/pyproject.toml index 317235d..444eea9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ rapidfuzz = "^3.6.1" arabic-reshaper = "^3.0.0" streamlit = "^1.31.0" playwright = "^1.41.2" +pytest = "^8.3.4" [tool.poetry.scripts] surya_detect = "detect_text:main" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..05ef295 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +testpaths=tests +pythonpath=. +filterwarnings = + ignore::UserWarning + ignore::PendingDeprecationWarning + ignore::DeprecationWarning \ No newline at end of file diff --git a/surya/benchmark/tesseract.py b/surya/benchmark/tesseract.py index a2d025e..140d46c 100644 --- a/surya/benchmark/tesseract.py +++ b/surya/benchmark/tesseract.py @@ -1,8 +1,6 @@ from typing import List, Optional import numpy as np -import pytesseract -from pytesseract import Output from tqdm import tqdm from surya.input.processing import slice_bboxes_from_image @@ -24,6 +22,7 @@ def surya_lang_to_tesseract(code: str) -> Optional[str]: def tesseract_ocr(img, bboxes, lang: str): + import pytesseract line_imgs = slice_bboxes_from_image(img, bboxes) config = f'--tessdata-dir "{settings.TESSDATA_PREFIX}"' lines = [] @@ -50,6 +49,8 @@ def tesseract_ocr_parallel(imgs, bboxes, langs: List[str], cpus=None): def tesseract_bboxes(img): + import pytesseract + from pytesseract import Output arr_img = np.asarray(img, dtype=np.uint8) ocr = pytesseract.image_to_data(arr_img, output_type=Output.DICT) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8f37c5b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +import pytest +from surya.model.ocr_error.model import load_model as load_ocr_error_model, load_tokenizer as load_ocr_error_processor + +@pytest.fixture(scope="session") +def ocr_error_model(): + ocr_error_m = load_ocr_error_model() + ocr_error_p = load_ocr_error_processor() + ocr_error_m.processor = ocr_error_p + yield ocr_error_m + del ocr_error_m \ No newline at end of file diff --git a/tests/test_ocr_errors.py b/tests/test_ocr_errors.py new file mode 100644 index 0000000..6471dca --- /dev/null +++ b/tests/test_ocr_errors.py @@ -0,0 +1,18 @@ +from surya.ocr_error import batch_ocr_error_detection + + +def test_garbled_text(ocr_error_model): + text = """" + ; dh vksj ls mifLFkr vf/koDrk % Jh vfuy dqekj + 2. vfHk;qDr dh vksj ls mifLFkr vf/koDrk % Jh iznhi d + """.strip() + results = batch_ocr_error_detection([text], ocr_error_model, ocr_error_model.processor) + assert results.labels[0] == "bad" + + +def test_good_text(ocr_error_model): + text = """" + There are professions more harmful than industrial design, but only a very few of them. + """.strip() + results = batch_ocr_error_detection([text], ocr_error_model, ocr_error_model.processor) + assert results.labels[0] == "good" \ No newline at end of file