From e966a20990c480a8f530d42d6821898440e93eb7 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Wed, 27 May 2026 10:59:19 -0400
Subject: [PATCH] CI fix: float16 vllm on T4; show server logs on startup timeout

---
 .github/workflows/ci.yml          |  6 ++++++
 .github/workflows/scripts.yml     | 14 +++++++++++---
 surya/inference/backends/spawn.py | 28 +++++++++++++++++++++++++++-
 surya/inference/backends/vllm.py  |  2 +-
 surya/settings.py                 |  3 +++
 5 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9a539f5..98e6f0b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,6 +9,12 @@ jobs:
       matrix:
         os: [t4_gpu, ubuntu-latest, windows-latest]
       fail-fast: false
+    env:
+      # T4 can't run bf16 in vllm; size vllm for the 16GB card and allow a slow
+      # cold start. (No-op on the CPU runners, which skip the VLM-backed tests.)
+      VLLM_DTYPE: float16
+      VLLM_GPU_TYPE: t4
+      SURYA_INFERENCE_STARTUP_TIMEOUT: "1200"
     steps:
       - uses: actions/checkout@v4
       - name: Install uv
diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml
index 711155d..500f27a 100644
--- a/.github/workflows/scripts.yml
+++ b/.github/workflows/scripts.yml
@@ -5,6 +5,12 @@ on: [push]
 jobs:
   build:
     runs-on: t4_gpu
+    env:
+      # T4 (Turing, compute 7.5) can't run bf16 in vllm; size vllm for the 16GB
+      # card and give the cold start (image pull + model download) headroom.
+      VLLM_DTYPE: float16
+      VLLM_GPU_TYPE: t4
+      SURYA_INFERENCE_STARTUP_TIMEOUT: "1200"
     steps:
       - uses: actions/checkout@v4
       - name: Install uv
@@ -22,11 +28,13 @@ jobs:
           unzip -o benchmark_data.zip
       - name: Test detection
         run: uv run surya_detect benchmark_data/pdfs/switch_trans.pdf --page_range 0
+      # Spawn the vllm server once and reuse it across the OCR/layout/table
+      # steps (--keep_server) instead of paying a cold start three times.
       - name: Test OCR
-        run: uv run surya_ocr benchmark_data/pdfs/switch_trans.pdf --page_range 0
+        run: uv run surya_ocr benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server
       - name: Test layout
-        run: uv run surya_layout benchmark_data/pdfs/switch_trans.pdf --page_range 0
+        run: uv run surya_layout benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server
       - name: Test table
-        run: uv run surya_table benchmark_data/pdfs/switch_trans.pdf --page_range 0
+        run: uv run surya_table benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server
       - name: Test detection folder
         run: uv run surya_detect benchmark_data/pdfs --page_range 0
diff --git a/surya/inference/backends/spawn.py b/surya/inference/backends/spawn.py
index 0ece794..e1d6ff1 100644
--- a/surya/inference/backends/spawn.py
+++ b/surya/inference/backends/spawn.py
@@ -138,6 +138,27 @@ def _stop_process(pid: int, name: str) -> None:
         logger.warning(f"Failed to stop {name} (pid {pid}): {e}")
 
 
+def _capture_server_logs(handle: "SpawnHandle", tail: int = 100) -> str:
+    """Best-effort tail of a server's logs, for surfacing startup failures."""
+    try:
+        if handle.cleanup_kind == "docker":
+            r = subprocess.run(
+                ["docker", "logs", "--tail", str(tail), handle.cleanup_id],
+                capture_output=True,
+                text=True,
+                timeout=15,
+            )
+            return (r.stdout or "") + (r.stderr or "") or "(no docker logs)"
+        # Non-docker: llama.cpp process backend logs to this file (see llamacpp.py)
+        log_path = Path("~/.cache/datalab/surya/llamacpp_server.log").expanduser()
+        if log_path.exists():
+            lines = log_path.read_text(errors="replace").splitlines()
+            return "\n".join(lines[-tail:]) or "(empty log)"  # tail=0 -> whole file
+    except Exception as e:  # best-effort: failure is returned as text
+        return f"(could not capture logs: {e})"
+    return "(no logs available)"
+
+
 def _stop_docker_container(name: str) -> None:
     try:
         subprocess.run(
@@ -290,10 +311,15 @@ def attach_or_spawn(
         # 6. Wait for health
         health_url = health_url_for(port)
         if not wait_for_health(health_url, total_timeout=startup_timeout):
+            # Grab the server's own logs *before* cleanup tears the (--rm)
+            # container down, otherwise the actual failure reason is lost and
+            # all the caller sees is this timeout.
+            logs = _capture_server_logs(spawn_handle)
             _cleanup()
             raise SpawnError(
                 f"{backend} server failed to become healthy at {health_url} "
-                f"within {startup_timeout}s"
+                f"within {startup_timeout}s.\n"
+                f"--- last {backend} server logs ---\n{logs}"
             )
 
         # 7. Verify model name
diff --git a/surya/inference/backends/vllm.py b/surya/inference/backends/vllm.py
index e8d943e..ee9e251 100644
--- a/surya/inference/backends/vllm.py
+++ b/surya/inference/backends/vllm.py
@@ -140,7 +140,7 @@ class VllmBackend(Backend):
                 "--max-num-seqs",
                 str(max_num_seqs),
                 "--dtype",
-                "bfloat16",
+                settings.VLLM_DTYPE,
                 "--max-model-len",
                 str(settings.VLLM_MAX_MODEL_LEN),
                 "--max-num-batched-tokens",
diff --git a/surya/settings.py b/surya/settings.py
index f10fbac..4a55160 100644
--- a/surya/settings.py
+++ b/surya/settings.py
@@ -89,6 +89,9 @@ class Settings(BaseSettings):
     VLLM_API_KEY: str = "EMPTY"
     VLLM_GPUS: str = "0"
     VLLM_GPU_TYPE: str = "4090"
+    # bfloat16 needs an Ampere+ GPU (compute capability >= 8.0). On older cards
+    # (e.g. T4 / Turing) vllm refuses to start with bf16 — set float16 there.
+    VLLM_DTYPE: str = "bfloat16"
     VLLM_MAX_MODEL_LEN: int = 18000
     VLLM_GPU_MEMORY_UTILIZATION: float = 0.85
     VLLM_ENABLE_MTP: bool = True