Add screenshot app

2026-06-04 21:03:53 +08:00 · 2026-05-27 10:36:53 -04:00 · 2026-05-27 10:36:53 -04:00 · 79246df837
commit 79246df837
parent 80c2903ea2
8 changed files with 712 additions and 16 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -37,6 +37,7 @@ surya_ocr = "surya.scripts.ocr_text:ocr_text_cli"
 surya_layout = "surya.scripts.detect_layout:detect_layout_cli"
 surya_gui = "surya.scripts.run_streamlit_app:streamlit_app_cli"
 surya_table = "surya.scripts.table_recognition:table_recognition_cli"
+surya_screenshot = "surya.scripts.screenshot_app:main"

 [dependency-groups]
 dev = [
@ -48,6 +49,7 @@ dev = [
    "pytest>=8.3.4",
    "pdftext>=0.5.1",
    "tabulate>=0.9.0",
+    "flask>=3.0.0",
 ]

 [build-system]
--- a/surya/inference/backends/spawn.py
+++ b/surya/inference/backends/spawn.py
@ -265,7 +265,10 @@ def attach_or_spawn(
            },
        )

-        # 5. Register atexit cleanup (only spawner)
+        # 5. Register atexit cleanup (only spawner). Skipped when keep-alive is
+        # set so the server outlives this process and later commands attach to
+        # it via the sentinel. (_cleanup is still callable below on startup
+        # failure, where we always tear a half-started server down.)
        def _cleanup():
            try:
                if spawn_handle.cleanup_kind == "docker":
@ -276,7 +279,13 @@ def attach_or_spawn(
            finally:
                _delete_sentinel(backend)

-        atexit.register(_cleanup)
+        if settings.SURYA_INFERENCE_KEEP_ALIVE:
+            logger.info(
+                f"keep-alive: {backend} server on port {port} will stay up "
+                f"after exit (cleanup_id={spawn_handle.cleanup_id!r})"
+            )
+        else:
+            atexit.register(_cleanup)

        # 6. Wait for health
        health_url = health_url_for(port)
--- a/surya/scripts/config.py
+++ b/surya/scripts/config.py
@ -17,15 +17,45 @@ class CLILoader:
        self.debug = cli_options.get("debug", False)
        self.output_dir = cli_options.get("output_dir")

+        # Opt in to leaving the inference server up so later commands reuse it.
+        if cli_options.get("keep_server"):
+            settings.SURYA_INFERENCE_KEEP_ALIVE = True
+
        self.load(highres)

    @staticmethod
    def common_options(fn):
-        fn = click.argument("input_path", type=click.Path(exists=True), required=True)(fn)
-        fn = click.option("--output_dir", type=click.Path(exists=False), required=False, default=os.path.join(settings.RESULT_DIR, "surya"), help="Directory to save output.")(fn)
-        fn = click.option("--page_range", type=str, default=None, help="Page range to convert, specify comma separated page numbers or ranges.  Example: 0,5-10,20")(fn)
-        fn = click.option("--images", is_flag=True, help="Save images of detected bboxes.", default=False)(fn)
-        fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.', default=False)(fn)
+        fn = click.argument("input_path", type=click.Path(exists=True), required=True)(
+            fn
+        )
+        fn = click.option(
+            "--output_dir",
+            type=click.Path(exists=False),
+            required=False,
+            default=os.path.join(settings.RESULT_DIR, "surya"),
+            help="Directory to save output.",
+        )(fn)
+        fn = click.option(
+            "--page_range",
+            type=str,
+            default=None,
+            help="Page range to convert, specify comma separated page numbers or ranges.  Example: 0,5-10,20",
+        )(fn)
+        fn = click.option(
+            "--images",
+            is_flag=True,
+            help="Save images of detected bboxes.",
+            default=False,
+        )(fn)
+        fn = click.option(
+            "--debug", "-d", is_flag=True, help="Enable debug mode.", default=False
+        )(fn)
+        fn = click.option(
+            "--keep_server",
+            is_flag=True,
+            default=False,
+            help="Keep the inference server (vllm/llama.cpp) running after this command exits so later commands reuse it instead of re-spawning.",
+        )(fn)
        return fn

    def load(self, highres: bool = False):
@ -34,13 +64,16 @@ class CLILoader:
            images, names = load_from_folder(self.filepath, self.page_range)
            folder_name = os.path.basename(self.filepath)
            if highres:
-                highres_images, _ = load_from_folder(self.filepath, self.page_range, settings.IMAGE_DPI_HIGHRES)
+                highres_images, _ = load_from_folder(
+                    self.filepath, self.page_range, settings.IMAGE_DPI_HIGHRES
+                )
        else:
            images, names = load_from_file(self.filepath, self.page_range)
            folder_name = os.path.basename(self.filepath).split(".")[0]
            if highres:
-                highres_images, _ = load_from_file(self.filepath, self.page_range, settings.IMAGE_DPI_HIGHRES)
-
+                highres_images, _ = load_from_file(
+                    self.filepath, self.page_range, settings.IMAGE_DPI_HIGHRES
+                )

        self.images = images
        self.highres_images = highres_images
@ -59,5 +92,7 @@ class CLILoader:
                page_lst += list(range(int(start), int(end) + 1))
            else:
                page_lst.append(int(i))
-        page_lst = sorted(list(set(page_lst)))  # Deduplicate page numbers and sort in order
-        return page_lst
+        page_lst = sorted(
+            list(set(page_lst))
+        )  # Deduplicate page numbers and sort in order
+        return page_lst
--- a/surya/scripts/screenshot_app.py
+++ b/surya/scripts/screenshot_app.py
@ -0,0 +1,226 @@
+"""Screenshot-friendly Surya viewer.
+
+Shows a PDF/image page on the left and full-page OCR output on the right, side
+by side, for clean screenshots. You can scroll through pages and preview them
+before running OCR, then export the side-by-side view as a PNG.
+
+Run with `surya_screenshot`, then open http://localhost:8504.
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import os
+import tempfile
+import uuid
+from typing import List, Optional
+
+import pypdfium2
+from flask import Flask, jsonify, render_template, request
+from PIL import Image
+from werkzeug.utils import secure_filename
+
+from surya.inference import SuryaInferenceManager
+from surya.logging import configure_logging, get_logger
+from surya.recognition import RecognitionPredictor
+from surya.recognition.schema import PageOCRResult
+from surya.settings import settings
+
+# Configure logging at import time so the handlers below can use `logger`.
+configure_logging()
+logger = get_logger()
+
+# Flask application object; the route decorators below register on it.
+app = Flask(__name__)
+
+# Lowercase extensions (with leading dot) that /upload accepts.
+ALLOWED_EXT = {".pdf", ".png", ".jpg", ".jpeg", ".gif", ".webp"}
+# Uploaded files are written under the system temp directory; the directory
+# is created as a side effect of importing this module.
+UPLOAD_DIR = os.path.join(tempfile.gettempdir(), "surya_screenshot")
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+
+# Process-wide predictor, built on the first get_rec() call.
+_rec: Optional[RecognitionPredictor] = None
+
+
+def get_rec() -> RecognitionPredictor:
+    """Return the shared RecognitionPredictor, constructing it on first use."""
+    global _rec
+    if _rec is not None:
+        return _rec
+    _rec = RecognitionPredictor(SuryaInferenceManager())
+    return _rec
+
+
+# Datalab-flavored palette for layout block overlays, keyed by canonical label.
+# Values are "#rrggbb" hex strings. The "default" entry is the fallback that
+# process() uses for any label without its own color.
+LABEL_COLORS = {
+    "Text": "#2563eb",
+    "SectionHeader": "#0ea5e9",
+    "PageHeader": "#7c3aed",
+    "PageFooter": "#7c3aed",
+    "Caption": "#c026d3",
+    "Footnote": "#64748b",
+    "Equation": "#9333ea",
+    "Table": "#f59e0b",
+    "TableOfContents": "#f59e0b",
+    "Form": "#ea580c",
+    "ListGroup": "#10b981",
+    "Picture": "#db2777",
+    "Figure": "#db2777",
+    "Diagram": "#db2777",
+    "Code": "#0d9488",
+    "default": "#ef4444",
+}
+
+
+def _logo_data_url() -> str:
+    """Return the bundled logo as a PNG data URL, or "" if it cannot be read.
+
+    The logo is decorative (the template omits the <img> when the value is
+    empty), so a missing or unreadable file is not an error. Only I/O failures
+    are tolerated; anything else propagates instead of being hidden.
+    """
+    path = os.path.join(settings.BASE_DIR, "static", "datalab-logo.png")
+    try:
+        with open(path, "rb") as f:
+            payload = f.read()
+    except OSError as e:
+        logger.debug(f"Logo not loaded from {path}: {e}")
+        return ""
+    return "data:image/png;base64," + base64.b64encode(payload).decode()
+
+
+def _pil_to_data_url(img: Image.Image, fmt: str = "PNG") -> str:
+    """Encode *img* in image format *fmt* and return it as a base64 data URL."""
+    with io.BytesIO() as stream:
+        img.save(stream, format=fmt)
+        encoded = base64.b64encode(stream.getvalue()).decode()
+    return f"data:image/{fmt.lower()};base64,{encoded}"
+
+
+def _is_pdf(path: str) -> bool:
+    """Return True when *path* ends in ".pdf", compared case-insensitively."""
+    suffix = path[-4:]
+    return suffix.lower() == ".pdf"
+
+
+def _page_count(path: str) -> int:
+    """Return the number of pages in *path*.
+
+    PDFs are opened with pypdfium2 and counted; any other file is treated as
+    a single-page image. The document is closed even if counting fails, the
+    same way _render_page() releases it.
+    """
+    if not _is_pdf(path):
+        return 1
+    doc = pypdfium2.PdfDocument(path)
+    try:
+        return len(doc)
+    finally:
+        doc.close()
+
+
+def _render_page(path: str, page: int, dpi: int) -> Image.Image:
+    """Render a 0-indexed page of a PDF (or load an image file) as RGB.
+
+    For PDFs the page is rasterized at *dpi* (pypdfium2 scale is dpi / 72).
+    For image files *page* and *dpi* are ignored and the image is converted
+    to RGB as-is.
+    """
+    if _is_pdf(path):
+        doc = pypdfium2.PdfDocument(path)
+        try:
+            pil = doc[page].render(scale=dpi / 72).to_pil().convert("RGB")
+        finally:
+            doc.close()
+        return pil
+    # Image.open is lazy and keeps the file handle open; convert() returns an
+    # independent image, so the source can be closed as soon as it is done.
+    with Image.open(path) as src:
+        return src.convert("RGB")
+
+
+def _assemble_page_html(page: PageOCRResult) -> str:
+    """Join the non-skipped blocks of *page* into whole-page HTML.
+
+    Each block becomes one <div> carrying its integer bbox and label as data
+    attributes; block HTML is inserted verbatim (math stays in <math> tags).
+    """
+
+    def _block_div(blk) -> str:
+        left, top, right, bottom = (int(c) for c in blk.bbox)
+        return (
+            f'<div data-bbox="{left} {top} {right} {bottom}" '
+            f'data-label="{blk.label}">{blk.html or ""}</div>'
+        )
+
+    return "\n".join(_block_div(blk) for blk in page.blocks if not blk.skipped)
+
+
+@app.route("/")
+def index():
+    """Serve the single-page viewer, passing the logo in as a data URL."""
+    logo = _logo_data_url()
+    return render_template("surya_screenshot.html", logo=logo)
+
+
+@app.route("/info", methods=["POST"])
+def info():
+    """Return {"page_count": N} for the file named by JSON field "file_path".
+
+    Responds 400 with a JSON error when the body is not a JSON object, when
+    "file_path" is missing or not a string, or when it does not name an
+    existing regular file; 500 with a JSON error if counting pages fails.
+    """
+    # silent=True: a missing or invalid JSON body yields None instead of
+    # Flask's HTML error page, so the client always gets a JSON error back.
+    body = request.get_json(silent=True)
+    raw_path = body.get("file_path") if isinstance(body, dict) else None
+    path = raw_path.strip() if isinstance(raw_path, str) else ""
+    if not path:
+        return jsonify({"error": "file_path is required"}), 400
+    # isfile rather than exists: a directory would otherwise pass this check
+    # and be reported as a one-page image.
+    if not os.path.isfile(path):
+        return jsonify({"error": f"File not found: {path}"}), 400
+    try:
+        return jsonify({"page_count": _page_count(path)})
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+
+@app.route("/upload", methods=["POST"])
+def upload():
+    """Accept a drag/drop (or browsed) file, save to a temp path, return it.
+
+    Responds with the saved server path, its page count and the original
+    filename. Returns 400 for a missing file or unsupported extension, and
+    500 (after removing the partial upload) if saving or page counting fails.
+    """
+    f = request.files.get("file")
+    if f is None or not f.filename:
+        return jsonify({"error": "no file uploaded"}), 400
+    ext = os.path.splitext(f.filename)[1].lower()
+    if ext not in ALLOWED_EXT:
+        return jsonify({"error": f"unsupported file type: {ext or '(none)'}"}), 400
+    # secure_filename() can strip the dot off the extension (a non-ASCII name
+    # such as "文档.pdf" becomes "pdf"), which would make _is_pdf() treat the
+    # saved PDF as an image. Keep only the sanitized stem and re-append the
+    # validated extension so the stored name always ends in it.
+    stem = os.path.splitext(secure_filename(f.filename))[0] or "upload"
+    dest = os.path.join(UPLOAD_DIR, f"{uuid.uuid4().hex}_{stem}{ext}")
+    try:
+        f.save(dest)
+        page_count = _page_count(dest)
+    except Exception as e:
+        # Do not leave an unreadable upload behind in the temp directory.
+        try:
+            os.remove(dest)
+        except OSError:
+            pass
+        return jsonify({"error": str(e)}), 500
+    return jsonify({"file_path": dest, "page_count": page_count, "name": f.filename})
+
+
+@app.route("/page", methods=["POST"])
+def page():
+    """Render a single page for preview (no OCR).
+
+    Expects a JSON object with "file_path" and an optional 0-indexed "page"
+    (default 0). Returns the rendered image as a PNG data URL with its pixel
+    size. Responds 400 with a JSON error for an invalid path or page value,
+    500 with a JSON error if rendering fails.
+    """
+    # silent=True keeps malformed/non-JSON bodies on the JSON error path.
+    body = request.get_json(silent=True)
+    data = body if isinstance(body, dict) else {}
+    raw_path = data.get("file_path")
+    path = raw_path.strip() if isinstance(raw_path, str) else ""
+    if not path or not os.path.isfile(path):
+        return jsonify({"error": "valid file_path is required"}), 400
+    try:
+        page_num = int(data.get("page", 0))
+    except (TypeError, ValueError):
+        return jsonify({"error": "page must be an integer"}), 400
+    if page_num < 0:
+        return jsonify({"error": "page must be a non-negative integer"}), 400
+    try:
+        img = _render_page(path, page_num, settings.IMAGE_DPI_HIGHRES)
+        return jsonify(
+            {
+                "image_base64": _pil_to_data_url(img),
+                "width": img.size[0],
+                "height": img.size[1],
+            }
+        )
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+
+@app.route("/process", methods=["POST"])
+def process():
+    """Run full-page OCR on one page; return the page image + OCR HTML + blocks.
+
+    Expects a JSON object with "file_path" and an optional 0-indexed "page"
+    (default 0). Responds 400 with a JSON error for an invalid path or page
+    value, 500 with a JSON error (and a logged traceback) if rendering or OCR
+    fails.
+    """
+    # silent=True keeps malformed/non-JSON bodies on the JSON error path.
+    body = request.get_json(silent=True)
+    data = body if isinstance(body, dict) else {}
+    raw_path = data.get("file_path")
+    path = raw_path.strip() if isinstance(raw_path, str) else ""
+    if not path or not os.path.isfile(path):
+        return jsonify({"error": "valid file_path is required"}), 400
+    try:
+        page_num = int(data.get("page", 0))
+    except (TypeError, ValueError):
+        return jsonify({"error": "page must be an integer"}), 400
+    if page_num < 0:
+        return jsonify({"error": "page must be a non-negative integer"}), 400
+    try:
+        img = _render_page(path, page_num, settings.IMAGE_DPI_HIGHRES)
+        page_result = get_rec()([img], full_page=True)[0]
+        # Overlay boxes for the client; skipped blocks are left out.
+        blocks = [
+            {
+                "bbox": [int(c) for c in blk.bbox],
+                "label": blk.label,
+                "color": LABEL_COLORS.get(blk.label, LABEL_COLORS["default"]),
+            }
+            for blk in page_result.blocks
+            if not blk.skipped
+        ]
+        return jsonify(
+            {
+                "image_base64": _pil_to_data_url(img),
+                "width": img.size[0],
+                "height": img.size[1],
+                "html": _assemble_page_html(page_result),
+                "blocks": blocks,
+                # Counts every block, including skipped ones absent from "blocks".
+                "n_blocks": len(page_result.blocks),
+            }
+        )
+    except Exception as e:
+        logger.exception("Full-page OCR failed")
+        return jsonify({"error": str(e)}), 500
+
+
+def main():
+    """Start the Flask development server for the screenshot viewer.
+
+    The endpoints read any server-side path the client supplies, so the server
+    binds to loopback by default rather than to every interface. Set
+    SURYA_SCREENSHOT_HOST (e.g. "0.0.0.0") and/or SURYA_SCREENSHOT_PORT to
+    override the defaults of 127.0.0.1:8504.
+    """
+    host = os.environ.get("SURYA_SCREENSHOT_HOST", "127.0.0.1")
+    port = int(os.environ.get("SURYA_SCREENSHOT_PORT", "8504"))
+    app.run(host=host, port=port)
+
+
+if __name__ == "__main__":
+    main()
--- a/surya/scripts/streamlit_app.py
+++ b/surya/scripts/streamlit_app.py
@ -4,12 +4,14 @@ inference manager. Detection + OCR-error stay in their own torch paths."""
 from __future__ import annotations

 import io
+import re
 import tempfile
 import time
 from typing import List

 import pypdfium2
 import streamlit as st
+import streamlit.components.v1 as components
 from PIL import Image, ImageDraw

 from surya.debug.draw import draw_polys_on_image, draw_bboxes_on_image
@ -24,6 +26,61 @@ from surya.table_rec import TableRecPredictor
 from surya.table_rec.schema import TableResult


+# KaTeX-enabled HTML wrapper. The OCR HTML wraps math in <math>...</math>
+# (KaTeX-compatible LaTeX inside), which a browser would otherwise show as
+# raw text. We convert those tags to \( \) / \[ \] delimiters and let KaTeX
+# auto-render typeset them inside an iframe component.
+#
+# _KATEX_HEAD opens the document: it loads the KaTeX stylesheet, core script
+# and auto-render extension from the jsDelivr CDN, sets the page styles, and
+# ends just after the opening <body> tag so OCR HTML can be appended to it.
+_KATEX_HEAD = r"""<!doctype html><html><head>
+<meta charset="utf-8">
+<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css">
+<script src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
+<script src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js"></script>
+<style>
+/* White "paper" card so the text stays readable in both light and dark
+   Streamlit themes (the iframe is otherwise transparent and our text is dark). */
+html,body{background:#ffffff;}
+body{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif;font-size:15px;line-height:1.55;color:#111111;margin:0;padding:14px;}
+table{border-collapse:collapse;margin:6px 0;} td,th{border:1px solid #bbb;padding:3px 6px;color:#111111;}
+[data-label="SectionHeader"],[data-label="PageHeader"]{font-weight:600;}
+</style></head><body>
+"""
+
+# _KATEX_TAIL closes the document: it runs auto-render over the whole body for
+# the \[ \] (display) and \( \) (inline) delimiters, with throwOnError off so
+# invalid LaTeX does not abort rendering.
+_KATEX_TAIL = r"""
+<script>
+renderMathInElement(document.body, {
+  delimiters: [
+    {left: "\\[", right: "\\]", display: true},
+    {left: "\\(", right: "\\)", display: false}
+  ],
+  throwOnError: false
+});
+</script></body></html>
+"""
+
+# Matches one <math ...>...</math> element: group 1 is the raw attribute text,
+# group 2 the (possibly multi-line) content. Case-insensitive, non-greedy.
+_MATH_RE = re.compile(r"<math\b([^>]*)>(.*?)</math>", re.DOTALL | re.IGNORECASE)
+
+
+def _math_to_katex(html_str: str) -> str:
+    """Replace each <math> element with its content in KaTeX delimiters.
+
+    Elements carrying display="block" (single or double quotes) become
+    display math in square-bracket delimiters; all others become inline math
+    in parenthesis delimiters. A falsy *html_str* yields "".
+    """
+
+    def _as_delimited(match: "re.Match") -> str:
+        attributes, latex = match.groups()
+        is_display = (
+            re.search(r"""display\s*=\s*["']block["']""", attributes) is not None
+        )
+        opener, closer = ("\\[", "\\]") if is_display else ("\\(", "\\)")
+        return f"{opener}{latex}{closer}"
+
+    source = html_str if html_str else ""
+    return _MATH_RE.sub(_as_delimited, source)
+
+
+def render_ocr_html(html_str: str, height: int = 400) -> None:
+    """Show OCR HTML in a scrollable iframe component with KaTeX-typeset math.
+
+    *height* is the iframe height passed through to the Streamlit component.
+    """
+    document = "".join((_KATEX_HEAD, _math_to_katex(html_str), _KATEX_TAIL))
+    components.html(document, height=height, scrolling=True)
+
+
 def _assemble_page_html(page: PageOCRResult) -> str:
    """Reconstruct a div-block whole-page HTML from a PageOCRResult."""
    parts: List[str] = []
@ -334,7 +391,7 @@ if run_block_ocr:
        )
        full_html = _assemble_page_html(page)
        with st.expander("Full page HTML (rendered)", expanded=False):
-            st.markdown(full_html, unsafe_allow_html=True)
+            render_ocr_html(full_html, height=600)
        with st.expander("Full page HTML (source)", expanded=False):
            st.code(full_html, language="html")
        for blk in page.blocks:
@ -366,6 +423,7 @@ if run_block_ocr:
                elif blk.error:
                    st.error("Block OCR errored")
                else:
+                    render_ocr_html(blk.html, height=160)
                    st.code(blk.html, language="html")


@ -382,7 +440,7 @@ if run_full_page_ocr:
        )
        full_html = _assemble_page_html(page)
        with st.expander("Full page HTML (rendered)", expanded=False):
-            st.markdown(full_html, unsafe_allow_html=True)
+            render_ocr_html(full_html, height=600)
        with st.expander("Full page HTML (source)", expanded=False):
            st.code(full_html, language="html")
        for blk in page.blocks:
@ -394,7 +452,7 @@ if run_full_page_ocr:
                elif blk.error:
                    st.error("Block OCR errored")
                else:
-                    st.markdown(blk.html, unsafe_allow_html=True)
+                    render_ocr_html(blk.html, height=160)
                    st.code(blk.html, language="html")


@ -412,7 +470,7 @@ if run_table_rec:
        for pred in preds:
            if pred.mode == "full" and pred.html:
                with st.expander("Table HTML"):
-                    st.markdown(pred.html, unsafe_allow_html=True)
+                    render_ocr_html(pred.html, height=400)
                    st.code(pred.html, language="html")
            else:
                st.json(pred.model_dump(), expanded=False)
--- a/surya/scripts/templates/surya_screenshot.html
+++ b/surya/scripts/templates/surya_screenshot.html
@ -0,0 +1,331 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Surya · Full-Page OCR</title>
+  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css">
+  <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
+  <script src="https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js"></script>
+  <style>
+    :root {
+      --ink: #0f1115;
+      --bg: #f4f6f9;
+      --panel: #ffffff;
+      --border: #e4e7ec;
+      --accent: #2563eb;
+      --accent-hover: #1d4ed8;
+      --muted: #667085;
+    }
+    * { margin: 0; padding: 0; box-sizing: border-box; }
+    body {
+      font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+      background: var(--bg);
+      color: var(--ink);
+      height: 100vh;
+      display: flex;
+      flex-direction: column;
+      overflow: hidden;
+    }
+    header {
+      display: flex; align-items: center; gap: 16px;
+      padding: 12px 20px;
+      background: var(--panel);
+      border-bottom: 1px solid var(--border);
+      box-shadow: 0 1px 3px rgba(16,24,40,.04);
+      z-index: 10;
+    }
+    .brand { display: flex; align-items: center; gap: 10px; }
+    .brand img { height: 30px; width: auto; }
+    .brand .title { font-size: 18px; font-weight: 700; letter-spacing: -.01em; }
+    .brand .sub { font-size: 13px; color: var(--muted); font-weight: 500;
+      padding-left: 10px; margin-left: 4px; border-left: 1px solid var(--border); }
+    .controls { display: flex; align-items: center; gap: 10px; flex: 1; flex-wrap: wrap; }
+    input[type=text] {
+      flex: 1; min-width: 260px; max-width: 560px;
+      padding: 9px 12px; font-size: 14px;
+      border: 1px solid var(--border); border-radius: 8px; background: #fff;
+    }
+    input[type=text]:focus { outline: none; border-color: var(--accent); box-shadow: 0 0 0 3px rgba(37,99,235,.12); }
+    button {
+      padding: 9px 14px; font-size: 14px; font-weight: 600;
+      border: none; border-radius: 8px; cursor: pointer;
+      background: var(--accent); color: #fff; transition: background .15s;
+    }
+    button:hover:not(:disabled) { background: var(--accent-hover); }
+    button.secondary { background: #fff; color: var(--ink); border: 1px solid var(--border); }
+    button.secondary:hover:not(:disabled) { background: #f2f4f7; }
+    button:disabled { opacity: .45; cursor: not-allowed; }
+    .pager { display: flex; align-items: center; gap: 6px; }
+    .pager .pageind { font-size: 13px; color: var(--muted); min-width: 96px; text-align: center; }
+    .pager button { padding: 7px 11px; }
+    .toggle { display: flex; align-items: center; gap: 7px; font-size: 13px; color: var(--muted); cursor: pointer; user-select: none; }
+    .status { font-size: 13px; font-weight: 600; min-width: 80px; }
+    .status.loading { color: #b45309; }
+    .status.error { color: #d92d20; }
+    .status.ok { color: #079455; }
+
+    .stage {
+      flex: 1; display: flex; gap: 16px; padding: 16px; overflow: hidden;
+    }
+    .panel {
+      flex: 1; min-width: 0; display: flex; flex-direction: column;
+      background: var(--panel); border: 1px solid var(--border);
+      border-radius: 12px; overflow: hidden;
+      box-shadow: 0 1px 2px rgba(16,24,40,.05);
+    }
+    .panel-head {
+      padding: 11px 16px; font-size: 13px; font-weight: 700;
+      letter-spacing: .02em; text-transform: uppercase; color: var(--muted);
+      border-bottom: 1px solid var(--border); background: #fcfcfd;
+    }
+    .panel-body { flex: 1; overflow: auto; }
+    .img-wrap {
+      display: flex; align-items: flex-start; justify-content: center;
+      padding: 16px; background: #f0f2f5; min-height: 100%;
+    }
+    #pageCanvas { max-width: 100%; height: auto; border-radius: 6px;
+      box-shadow: 0 2px 10px rgba(16,24,40,.12); background: #fff; }
+    .ocr {
+      padding: 28px 32px; line-height: 1.6; font-size: 16px; color: #1d2433;
+    }
+    .ocr [data-label="SectionHeader"], .ocr [data-label="Title"] { font-weight: 700; font-size: 1.15em; margin: .5em 0 .3em; }
+    .ocr [data-label="PageHeader"], .ocr [data-label="PageFooter"] { color: var(--muted); font-size: .9em; }
+    .ocr table { border-collapse: collapse; margin: 14px 0; width: 100%; }
+    .ocr th, .ocr td { border: 1px solid #d0d5dd; padding: 6px 10px; text-align: left; }
+    .ocr th { background: #f2f4f7; font-weight: 600; }
+    .ocr img { max-width: 100%; height: auto; }
+    .placeholder { padding: 48px 32px; color: var(--muted); font-size: 15px; text-align: center; }
+    #dropOverlay { position: fixed; inset: 0; z-index: 100; display: none;
+      align-items: center; justify-content: center; background: rgba(15,17,21,.55); }
+    #dropOverlay.active { display: flex; }
+    #dropOverlay .drop-card { padding: 40px 64px; border: 3px dashed #fff;
+      border-radius: 16px; color: #fff; font-size: 22px; font-weight: 700;
+      background: rgba(37,99,235,.30); }
+    /* While screenshotting, expand panels to full content height so
+       html2canvas captures everything, not just the visible scroll area. */
+    body.capturing { height: auto !important; overflow: visible !important; }
+    body.capturing .stage { height: auto !important; overflow: visible !important; }
+    body.capturing .panel-body { height: auto !important; overflow: visible !important; }
+  </style>
+</head>
+<body>
+  <header>
+    <div class="brand">
+      {% if logo %}<img src="{{ logo }}" alt="Datalab">{% endif %}
+      <span class="title">Surya</span>
+      <span class="sub">Full-Page OCR</span>
+    </div>
+    <div class="controls">
+      <input type="text" id="filePath" placeholder="Drop a file, Browse, or type a server path…">
+      <input type="file" id="fileInput" accept=".pdf,.png,.jpg,.jpeg,.gif,.webp" style="display:none" onchange="onPick(this)">
+      <button class="secondary" onclick="document.getElementById('fileInput').click()">Browse</button>
+      <button class="secondary" id="loadBtn" onclick="loadFile()">Load</button>
+      <div class="pager">
+        <button class="secondary" id="prevBtn" onclick="changePage(-1)" disabled>◀</button>
+        <span class="pageind" id="pageInd">—</span>
+        <button class="secondary" id="nextBtn" onclick="changePage(1)" disabled>▶</button>
+      </div>
+      <button id="runBtn" onclick="runOCR()" disabled>Run Full-Page OCR</button>
+      <label class="toggle"><input type="checkbox" id="showBoxes" checked onchange="drawLeft()"> Layout boxes</label>
+      <button class="secondary" id="copyBtn" onclick="copyHtml()" disabled>Copy HTML</button>
+      <button class="secondary" id="shotBtn" onclick="saveScreenshot()" disabled>Save Screenshot</button>
+      <span class="status" id="status"></span>
+    </div>
+  </header>
+
+  <div class="stage">
+    <div class="panel">
+      <div class="panel-head">PDF Page</div>
+      <div class="panel-body"><div class="img-wrap"><canvas id="pageCanvas"></canvas></div></div>
+    </div>
+    <div class="panel">
+      <div class="panel-head">Full-Page OCR</div>
+      <div class="panel-body"><div class="ocr" id="ocr"><div class="placeholder">Load a file, scroll to a page, then run full-page OCR.</div></div></div>
+    </div>
+  </div>
+
+  <div id="dropOverlay"><div class="drop-card">Drop a PDF or image to load</div></div>
+
+  <script>
+    // Viewer state shared by every handler below:
+    //   path    - file path sent to the server endpoints
+    //   name    - text shown in the path box (typed path, or uploaded file's name)
+    //   page    - current 0-indexed page; count - total pages reported by the server
+    //   img     - loaded Image drawn on the left canvas
+    //   blocks  - layout boxes from the last OCR run (null until OCR runs)
+    //   html    - OCR HTML from the last run; ocrPage - page index that run was for
+    const S = { path: "", name: "", page: 0, count: 0, img: null, blocks: null, html: null, ocrPage: null };
+
+    // Shorthand for document.getElementById.
+    const $ = (id) => document.getElementById(id);
+    // Show msg in the status label; kind ("loading" | "error" | "ok") picks its color class.
+    function setStatus(msg, kind) { const s = $("status"); s.textContent = msg || ""; s.className = "status " + (kind || ""); }
+
+    // POST `body` as JSON to `url` and resolve with the parsed JSON reply.
+    // Rejects with the server's `error` message (or a generic one carrying the
+    // HTTP status) when the response is not OK or is not valid JSON.
+    async function post(url, body) {
+      const r = await fetch(url, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify(body) });
+      // Error pages are not guaranteed to be JSON; don't let the parse failure
+      // replace the real error with an opaque SyntaxError message.
+      let data = null;
+      try { data = await r.json(); } catch (e) { /* non-JSON body */ }
+      if (!r.ok) throw new Error((data && data.error) || `Request failed (${r.status})`);
+      if (data === null) throw new Error("Server returned an invalid response");
+      return data;
+    }
+
+    // Load the path typed in the box: ask the server for its page count, then
+    // render page 1. Errors are shown in the status label.
+    async function loadFile() {
+      const val = $("filePath").value.trim();
+      if (!val) { setStatus("Drop a file or enter a path", "error"); return; }
+      // If the box still shows the name of an uploaded file, just reload it.
+      if (S.path && val === S.name) { S.page = 0; await showPage(); return; }
+      setStatus("Loading…", "loading");
+      try {
+        const info = await post("/info", { file_path: val });
+        S.path = val; S.name = val; S.count = info.page_count; S.page = 0;
+      } catch (e) { setStatus(e.message, "error"); return; }
+      // showPage() reports its own outcome (it clears the status on success and
+      // shows the error on failure), so the status must not be reset afterwards
+      // or a render error would be wiped before the user sees it.
+      await showPage();
+    }
+
+    // Handle the hidden file input's change event by uploading the picked file.
+    function onPick(input) {
+      const file = input.files && input.files[0];
+      // Clear the input so picking the same file again still fires `change`.
+      input.value = "";
+      if (file) uploadFile(file);
+    }
+
+    // Upload a dropped/browsed file, adopt the server path it was saved to,
+    // then render its first page. Errors are shown in the status label.
+    async function uploadFile(file) {
+      setStatus("Uploading…", "loading");
+      const fd = new FormData(); fd.append("file", file);
+      try {
+        const r = await fetch("/upload", { method: "POST", body: fd });
+        // Error pages are not guaranteed to be JSON; keep the real failure visible.
+        let data = null;
+        try { data = await r.json(); } catch (e) { /* non-JSON body */ }
+        if (!r.ok || !data) throw new Error((data && data.error) || `Upload failed (${r.status})`);
+        S.path = data.file_path; S.name = data.name; S.count = data.page_count; S.page = 0;
+        $("filePath").value = data.name;
+      } catch (e) { setStatus(e.message, "error"); return; }
+      // showPage() reports its own outcome; resetting the status afterwards
+      // would wipe a render error before the user sees it.
+      await showPage();
+    }
+
+    // Fetch the preview of the current page, drop any earlier OCR output, and
+    // redraw the left panel. Failures are shown in the status label.
+    async function showPage() {
+      setStatus("Rendering…", "loading");
+      try {
+        const preview = await post("/page", { file_path: S.path, page: S.page });
+        // New page → clear any previous OCR output.
+        S.blocks = null;
+        S.html = null;
+        S.ocrPage = null;
+        $("ocr").innerHTML = '<div class="placeholder">Run full-page OCR to see the extracted content.</div>';
+        for (const id of ["shotBtn", "copyBtn"]) $(id).disabled = true;
+        loadImage(preview.image_base64, drawLeft);
+        updatePager();
+        $("runBtn").disabled = false;
+        setStatus("");
+      } catch (err) {
+        setStatus(err.message, "error");
+      }
+    }
+
+    // Decode `src` into an Image; once loaded, store it as S.img and call `cb`.
+    function loadImage(src, cb) {
+      const picture = new Image();
+      picture.onload = () => {
+        S.img = picture;
+        if (cb) cb();
+      };
+      picture.src = src;
+    }
+
+    // Refresh the "Page X of N" label and enable/disable the prev/next buttons.
+    function updatePager() {
+      const atFirst = S.page <= 0;
+      const atLast = S.page >= S.count - 1;
+      const label = S.count ? `Page ${S.page + 1} of ${S.count}` : "—";
+      $("pageInd").textContent = label;
+      $("prevBtn").disabled = atFirst;
+      $("nextBtn").disabled = atLast;
+    }
+
+    // Move `delta` pages from the current one; ignored when out of range.
+    function changePage(delta) {
+      const target = S.page + delta;
+      const outOfRange = target < 0 || target >= S.count;
+      if (outOfRange) return;
+      S.page = target;
+      showPage();
+    }
+
+    // Paint the current page image on the left canvas and, when OCR blocks
+    // exist and "Layout boxes" is checked, overlay each block's box and label.
+    function drawLeft() {
+      if (!S.img) return;
+      const c = $("pageCanvas"), ctx = c.getContext("2d");
+      // Size the canvas to the image's native pixels so bbox coordinates from
+      // the server can be drawn without scaling (CSS shrinks it to fit).
+      c.width = S.img.naturalWidth; c.height = S.img.naturalHeight;
+      ctx.drawImage(S.img, 0, 0);
+      if (!S.blocks || !$("showBoxes").checked) return;
+      ctx.lineWidth = 3;
+      ctx.font = 'bold 15px -apple-system, "Segoe UI", sans-serif';
+      ctx.textBaseline = "top";
+      S.blocks.forEach((b) => {
+        const [x1, y1, x2, y2] = b.bbox;
+        // Appending "26" to a "#rrggbb" color gives the same color with a low
+        // alpha for the translucent fill — assumes 6-digit hex colors.
+        ctx.strokeStyle = b.color; ctx.fillStyle = b.color + "26";
+        ctx.fillRect(x1, y1, x2 - x1, y2 - y1);
+        ctx.strokeRect(x1, y1, x2 - x1, y2 - y1);
+        // Label tag sits just above the box, clamped so it stays on the canvas.
+        const tw = ctx.measureText(b.label).width, ly = Math.max(y1 - 22, 0);
+        ctx.fillStyle = b.color; ctx.fillRect(x1, ly, tw + 12, 21);
+        ctx.fillStyle = "#fff"; ctx.fillText(b.label, x1 + 6, ly + 3);
+      });
+    }
+
+    // Run full-page OCR on the current page and show the result: layout boxes
+    // on the left, rendered HTML (with typeset math) on the right.
+    async function runOCR() {
+      if (!S.path) return;
+      // Remember what this request is for: paging stays enabled while OCR runs,
+      // so the user may have moved on by the time the response arrives.
+      const reqPath = S.path, reqPage = S.page;
+      setStatus("Running OCR…", "loading");
+      $("runBtn").disabled = true;
+      try {
+        const data = await post("/process", { file_path: reqPath, page: reqPage });
+        // Stale result: painting it would put another page's boxes and text
+        // over the page now on screen.
+        if (S.path !== reqPath || S.page !== reqPage) return;
+        S.blocks = data.blocks; S.html = data.html || ""; S.ocrPage = reqPage;
+        loadImage(data.image_base64, () => drawLeft());
+        const ocr = $("ocr");
+        ocr.innerHTML = data.html || '<div class="placeholder">No content detected on this page.</div>';
+        renderMath(ocr);
+        $("shotBtn").disabled = false;
+        $("copyBtn").disabled = !S.html;
+        setStatus(`${data.n_blocks} blocks`, "ok");
+      } catch (e) { setStatus(e.message, "error"); }
+      finally { $("runBtn").disabled = false; }
+    }
+
+    // Replace every <math> element under `root` with KaTeX-rendered markup:
+    // display="block" becomes a <div> in display mode, anything else a <span>.
+    function renderMath(root) {
+      for (const node of root.querySelectorAll("math")) {
+        const isBlock = node.getAttribute("display") === "block";
+        try {
+          const holder = document.createElement(isBlock ? "div" : "span");
+          const options = { displayMode: isBlock, throwOnError: false };
+          holder.innerHTML = katex.renderToString(node.textContent, options);
+          node.replaceWith(holder);
+        } catch (err) {
+          /* leave raw on failure */
+        }
+      }
+    }
+
+    // Copy the last OCR HTML to the clipboard, using the async Clipboard API
+    // when present and the textarea fallback otherwise (or if it rejects).
+    function copyHtml() {
+      if (!S.html) return;
+      const onCopied = () => setStatus("HTML copied", "ok");
+      const clip = navigator.clipboard;
+      if (!(clip && clip.writeText)) {
+        fallbackCopy(S.html, onCopied);
+        return;
+      }
+      clip.writeText(S.html).then(onCopied).catch(() => fallbackCopy(S.html, onCopied));
+    }
+
+    // Copy `text` through a temporary off-screen textarea and execCommand.
+    // Calls `done` only when the copy actually succeeded.
+    function fallbackCopy(text, done) {
+      const ta = document.createElement("textarea");
+      ta.value = text; ta.style.position = "fixed"; ta.style.opacity = "0";
+      document.body.appendChild(ta); ta.select();
+      try {
+        // execCommand reports an unsupported/blocked copy by returning false,
+        // not by throwing, so the return value has to be checked.
+        if (document.execCommand("copy")) done();
+        else setStatus("Copy failed", "error");
+      } catch (e) { setStatus("Copy failed", "error"); }
+      finally { document.body.removeChild(ta); }
+    }
+
+    // Capture the two-panel stage with html2canvas and download it as a PNG
+    // named after the 1-based page number of the last OCR run (or, if there
+    // was none, the current page).
+    async function saveScreenshot() {
+      setStatus("Capturing…", "loading");
+      const stage = document.querySelector(".stage");
+      // The "capturing" class lifts the fixed heights / overflow clipping so
+      // the panels grow to their full content height for the capture.
+      document.body.classList.add("capturing");
+      try {
+        // Let the expanded layout settle before measuring full size.
+        await new Promise((r) => requestAnimationFrame(() => requestAnimationFrame(r)));
+        const w = stage.scrollWidth, h = stage.scrollHeight;
+        const canvas = await html2canvas(stage, {
+          backgroundColor: "#f4f6f9", scale: 2, useCORS: true, logging: false,
+          width: w, height: h, windowWidth: w, windowHeight: h, scrollX: 0, scrollY: 0,
+        });
+        // Trigger the download through a temporary link with a data URL.
+        const a = document.createElement("a");
+        a.href = canvas.toDataURL("image/png");
+        a.download = `surya_ocr_page_${(S.ocrPage ?? S.page) + 1}.png`;
+        a.click();
+        setStatus("Saved", "ok");
+      } catch (e) { setStatus("Screenshot failed: " + e.message, "error"); }
+      // Always restore the normal scrolling layout, even if the capture failed.
+      finally { document.body.classList.remove("capturing"); }
+    }
+
+    // Pressing Enter in the path box loads the typed path.
+    $("filePath").addEventListener("keypress", (e) => { if (e.key === "Enter") loadFile(); });
+
+    // Drag & drop anywhere in the window.
+    // dragenter/dragleave fire for every element the pointer crosses, so a
+    // depth counter decides when the drag has really left the window and the
+    // overlay can be hidden.
+    let dragDepth = 0;
+    window.addEventListener("dragenter", (e) => { e.preventDefault(); dragDepth++; $("dropOverlay").classList.add("active"); });
+    // preventDefault on dragover is what lets the window receive the drop.
+    window.addEventListener("dragover", (e) => { e.preventDefault(); });
+    window.addEventListener("dragleave", (e) => { e.preventDefault(); if (--dragDepth <= 0) { dragDepth = 0; $("dropOverlay").classList.remove("active"); } });
+    // On drop, hide the overlay and upload only the first dropped file.
+    window.addEventListener("drop", (e) => {
+      e.preventDefault(); dragDepth = 0; $("dropOverlay").classList.remove("active");
+      const f = e.dataTransfer.files && e.dataTransfer.files[0];
+      if (f) uploadFile(f);
+    });
+  </script>
+</body>
+</html>
--- a/surya/settings.py
+++ b/surya/settings.py
@ -49,6 +49,10 @@ class Settings(BaseSettings):
    SURYA_INFERENCE_BACKEND: Optional[str] = None  # "vllm" | "llamacpp" | None (auto)
    SURYA_INFERENCE_URL: Optional[str] = None  # external server, skip spawn
    SURYA_INFERENCE_AUTOSTART: bool = True
+    # Leave an auto-spawned server running after the process exits so later
+    # commands attach to it instead of re-spawning (avoids repeated startup /
+    # model-load cost). Stop it manually when done — see `surya/inference`.
+    SURYA_INFERENCE_KEEP_ALIVE: bool = False
    SURYA_INFERENCE_HOST: str = "127.0.0.1"
    SURYA_INFERENCE_PORT: Optional[int] = None  # None = pick a free port
    SURYA_INFERENCE_PARALLEL: int = 8
--- a/uv.lock
+++ b/uv.lock
@ -823,6 +823,23 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970 },
 ]

+[[package]]
+name = "flask"
+version = "3.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "blinker" },
+    { name = "click" },
+    { name = "itsdangerous" },
+    { name = "jinja2" },
+    { name = "markupsafe" },
+    { name = "werkzeug" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424 },
+]
+
 [[package]]
 name = "fqdn"
 version = "1.5.1"
@ -3944,6 +3961,7 @@ dependencies = [
 [package.dev-dependencies]
 dev = [
    { name = "datasets" },
+    { name = "flask" },
    { name = "jupyter" },
    { name = "pdftext" },
    { name = "pre-commit" },
@ -3976,6 +3994,7 @@ requires-dist = [
 [package.metadata.requires-dev]
 dev = [
    { name = "datasets", specifier = ">=2.16.1" },
+    { name = "flask", specifier = ">=3.0.0" },
    { name = "jupyter", specifier = ">=1.0.0" },
    { name = "pdftext", specifier = ">=0.5.1" },
    { name = "pre-commit", specifier = ">=4.2.0" },
@ -4468,6 +4487,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598 },
 ]

+[[package]]
+name = "werkzeug"
+version = "3.1.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dd/b2/381be8cfdee792dd117872481b6e378f85c957dd7c5bca38897b08f765fd/werkzeug-3.1.8.tar.gz", hash = "sha256:9bad61a4268dac112f1c5cd4630a56ede601b6ed420300677a869083d70a4c44", size = 875852 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/93/8c/2e650f2afeb7ee576912636c23ddb621c91ac6a98e66dc8d29c3c69446e1/werkzeug-3.1.8-py3-none-any.whl", hash = "sha256:63a77fb8892bf28ebc3178683445222aa500e48ebad5ec77b0ad80f8726b1f50", size = 226459 },
+]
+
 [[package]]
 name = "widgetsnbextension"
 version = "4.0.15"