CI fix

2026-06-04 21:03:53 +08:00 · 2026-05-27 10:59:19 -04:00 · 2026-05-27 10:59:19 -04:00 · e966a20990
commit e966a20990
parent c421e742b4
5 changed files with 48 additions and 5 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -9,6 +9,12 @@ jobs:
      matrix:
        os: [t4_gpu, ubuntu-latest, windows-latest]
      fail-fast: false
+    env:
+      # T4 can't run bf16 in vllm; size vllm for the 16GB card. (No-op on the
+      # CPU runners, which skip the VLM-backed tests.)
+      VLLM_DTYPE: float16
+      VLLM_GPU_TYPE: t4
+      SURYA_INFERENCE_STARTUP_TIMEOUT: "1200"
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
--- a/.github/workflows/scripts.yml
+++ b/.github/workflows/scripts.yml
@ -5,6 +5,12 @@ on: [push]
 jobs:
  build:
    runs-on: t4_gpu
+    env:
+      # T4 (Turing, compute 7.5) can't run bf16 in vllm; size vllm for the 16GB
+      # card and give the cold start (image pull + model download) headroom.
+      VLLM_DTYPE: float16
+      VLLM_GPU_TYPE: t4
+      SURYA_INFERENCE_STARTUP_TIMEOUT: "1200"
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
@ -22,11 +28,13 @@ jobs:
          unzip -o benchmark_data.zip
      - name: Test detection
        run: uv run surya_detect benchmark_data/pdfs/switch_trans.pdf --page_range 0
+      # Spawn the vllm server once and reuse it across the OCR/layout/table
+      # steps (--keep_server) instead of paying a cold start three times.
      - name: Test OCR
-        run: uv run surya_ocr benchmark_data/pdfs/switch_trans.pdf --page_range 0
+        run: uv run surya_ocr benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server
      - name: Test layout
-        run: uv run surya_layout benchmark_data/pdfs/switch_trans.pdf --page_range 0
+        run: uv run surya_layout benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server
      - name: Test table
-        run: uv run surya_table benchmark_data/pdfs/switch_trans.pdf --page_range 0
+        run: uv run surya_table benchmark_data/pdfs/switch_trans.pdf --page_range 0 --keep_server
      - name: Test detection folder
        run: uv run surya_detect benchmark_data/pdfs --page_range 0
--- a/surya/inference/backends/spawn.py
+++ b/surya/inference/backends/spawn.py
@ -138,6 +138,27 @@ def _stop_process(pid: int, name: str) -> None:
        logger.warning(f"Failed to stop {name} (pid {pid}): {e}")


+def _capture_server_logs(handle: "SpawnHandle", tail: int = 100) -> str:
+    """Return a best-effort tail of a spawned server's logs.
+
+    Used to surface the actual reason for a startup failure. Errors raised
+    while collecting the logs are reported in the returned string rather
+    than propagated.
+
+    Args:
+        handle: Spawn handle; ``cleanup_kind`` selects the log source and
+            ``cleanup_id`` is handed to ``docker logs`` for docker servers.
+        tail: Maximum number of trailing log lines to return. Values <= 0
+            yield no lines.
+
+    Returns:
+        The log tail, or a parenthesised placeholder when nothing could be
+        captured.
+    """
+    # Clamp so a zero/negative tail means "no lines" for both sources; the
+    # bare slice ``lines[-0:]`` would otherwise return the whole file.
+    tail = max(tail, 0)
+    try:
+        if handle.cleanup_kind == "docker":
+            r = subprocess.run(
+                ["docker", "logs", "--tail", str(tail), handle.cleanup_id],
+                capture_output=True,
+                text=True,
+                # Match the file branch: a stray undecodable byte must not
+                # cost us the whole log.
+                errors="replace",
+                timeout=15,
+            )
+            return (r.stdout or "") + (r.stderr or "") or "(no docker logs)"
+        # llama.cpp process backend logs to this file (see llamacpp.py)
+        log_path = Path("~/.cache/datalab/surya/llamacpp_server.log").expanduser()
+        if log_path.exists():
+            lines = log_path.read_text(errors="replace").splitlines()
+            kept = lines[-tail:] if tail else []
+            return "\n".join(kept) or "(empty log)"
+    except Exception as e:
+        return f"(could not capture logs: {e})"
+    return "(no logs available)"
+
+
 def _stop_docker_container(name: str) -> None:
    try:
        subprocess.run(
@ -290,10 +311,15 @@ def attach_or_spawn(
        # 6. Wait for health
        health_url = health_url_for(port)
        if not wait_for_health(health_url, total_timeout=startup_timeout):
+            # Grab the server's own logs *before* cleanup tears the (--rm)
+            # container down, otherwise the actual failure reason is lost and
+            # all the caller sees is this timeout.
+            logs = _capture_server_logs(spawn_handle)
            _cleanup()
            raise SpawnError(
                f"{backend} server failed to become healthy at {health_url} "
-                f"within {startup_timeout}s"
+                f"within {startup_timeout}s.\n"
+                f"--- last {backend} server logs ---\n{logs}"
            )

        # 7. Verify model name
--- a/surya/inference/backends/vllm.py
+++ b/surya/inference/backends/vllm.py
@ -140,7 +140,7 @@ class VllmBackend(Backend):
                "--max-num-seqs",
                str(max_num_seqs),
                "--dtype",
-                "bfloat16",
+                settings.VLLM_DTYPE,
                "--max-model-len",
                str(settings.VLLM_MAX_MODEL_LEN),
                "--max-num-batched-tokens",
--- a/surya/settings.py
+++ b/surya/settings.py
@ -89,6 +89,9 @@ class Settings(BaseSettings):
    VLLM_API_KEY: str = "EMPTY"
    VLLM_GPUS: str = "0"
    VLLM_GPU_TYPE: str = "4090"
+    # bfloat16 needs an Ampere+ GPU (compute capability >= 8.0). On older cards
+    # (e.g. T4 / Turing) vllm refuses to start with bf16 — set float16 there.
+    VLLM_DTYPE: str = "bfloat16"
    VLLM_MAX_MODEL_LEN: int = 18000
    VLLM_GPU_MEMORY_UTILIZATION: float = 0.85
    VLLM_ENABLE_MTP: bool = True