From e966a20990c480a8f530d42d6821898440e93eb7 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 27 May 2026 10:59:19 -0400 Subject: [PATCH] CI fix --- .github/workflows/ci.yml | 6 ++++++ .github/workflows/scripts.yml | 14 +++++++++++--- surya/inference/backends/spawn.py | 28 +++++++++++++++++++++++++++- surya/inference/backends/vllm.py | 2 +- surya/settings.py | 3 +++ 5 files changed, 48 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9a539f5..98e6f0b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,6 +9,12 @@ jobs: matrix: os: [t4_gpu, ubuntu-latest, windows-latest] fail-fast: false + env: + # T4 can't run bf16 in vllm; size for the 16GB card. (No-op on the + # CPU runners, which skip the VLM-backed tests.) + VLLM_DTYPE: float16 + VLLM_GPU_TYPE: t4 + SURYA_INFERENCE_STARTUP_TIMEOUT: "1200" steps: - uses: actions/checkout@v4 - name: Install uv diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml index 711155d..500f27a 100644 --- a/.github/workflows/scripts.yml +++ b/.github/workflows/scripts.yml @@ -5,6 +5,12 @@ on: [push] jobs: build: runs-on: t4_gpu + env: + # T4 (Turing, compute 7.5) can't run bf16 in vllm; size vllm for the 16GB + # card and give the cold start (image pull + model download) headroom. + VLLM_DTYPE: float16 + VLLM_GPU_TYPE: t4 + SURYA_INFERENCE_STARTUP_TIMEOUT: "1200" steps: - uses: actions/checkout@v4 - name: Install uv @@ -22,11 +28,13 @@ jobs: unzip -o benchmark_data.zip - name: Test detection run: uv run surya_detect benchmark_data/pdfs/switch_trans.pdf --page_range 0 + # Spawn the vllm server once and reuse it across the OCR/layout/table + # steps (--keep_server) instead of paying a cold start three times. - name: Test OCR - run: uv run surya_ocr benchmark_data/pdfs/switch_trans.pdf --page_range 0 + run: uv run surya_ocr benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server - name: Test layout - run: uv run surya_layout benchmark_data/pdfs/switch_trans.pdf --page_range 0 + run: uv run surya_layout benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server - name: Test table - run: uv run surya_table benchmark_data/pdfs/switch_trans.pdf --page_range 0 + run: uv run surya_table benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server - name: Test detection folder run: uv run surya_detect benchmark_data/pdfs --page_range 0 diff --git a/surya/inference/backends/spawn.py b/surya/inference/backends/spawn.py index 0ece794..e1d6ff1 100644 --- a/surya/inference/backends/spawn.py +++ b/surya/inference/backends/spawn.py @@ -138,6 +138,27 @@ def _stop_process(pid: int, name: str) -> None: logger.warning(f"Failed to stop {name} (pid {pid}): {e}") +def _capture_server_logs(handle: "SpawnHandle", tail: int = 100) -> str: + """Best-effort tail of a server's logs, for surfacing startup failures.""" + try: + if handle.cleanup_kind == "docker": + r = subprocess.run( + ["docker", "logs", "--tail", str(tail), handle.cleanup_id], + capture_output=True, + text=True, + timeout=15, + ) + return (r.stdout or "") + (r.stderr or "") or "(no docker logs)" + # llama.cpp process backend logs to this file (see llamacpp.py) + log_path = Path("~/.cache/datalab/surya/llamacpp_server.log").expanduser() + if log_path.exists(): + lines = log_path.read_text(errors="replace").splitlines() + return "\n".join(lines[-tail:]) or "(empty log)" + except Exception as e: + return f"(could not capture logs: {e})" + return "(no logs available)" + + def _stop_docker_container(name: str) -> None: try: subprocess.run( @@ -290,10 +311,15 @@ def attach_or_spawn( # 6. Wait for health health_url = health_url_for(port) if not wait_for_health(health_url, total_timeout=startup_timeout): + # Grab the server's own logs *before* cleanup tears the (--rm) + # container down, otherwise the actual failure reason is lost and + # all the caller sees is this timeout. + logs = _capture_server_logs(spawn_handle) _cleanup() raise SpawnError( f"{backend} server failed to become healthy at {health_url} " - f"within {startup_timeout}s" + f"within {startup_timeout}s.\n" + f"--- last {backend} server logs ---\n{logs}" ) # 7. Verify model name diff --git a/surya/inference/backends/vllm.py b/surya/inference/backends/vllm.py index e8d943e..ee9e251 100644 --- a/surya/inference/backends/vllm.py +++ b/surya/inference/backends/vllm.py @@ -140,7 +140,7 @@ class VllmBackend(Backend): "--max-num-seqs", str(max_num_seqs), "--dtype", - "bfloat16", + settings.VLLM_DTYPE, "--max-model-len", str(settings.VLLM_MAX_MODEL_LEN), "--max-num-batched-tokens", diff --git a/surya/settings.py b/surya/settings.py index f10fbac..4a55160 100644 --- a/surya/settings.py +++ b/surya/settings.py @@ -89,6 +89,9 @@ class Settings(BaseSettings): VLLM_API_KEY: str = "EMPTY" VLLM_GPUS: str = "0" VLLM_GPU_TYPE: str = "4090" + # bfloat16 needs an Ampere+ GPU (compute capability >= 8.0). On older cards + # (e.g. T4 / Turing) vllm refuses to start with bf16 — set float16 there. + VLLM_DTYPE: str = "bfloat16" VLLM_MAX_MODEL_LEN: int = 18000 VLLM_GPU_MEMORY_UTILIZATION: float = 0.85 VLLM_ENABLE_MTP: bool = True