Add screenshot app

This commit is contained in:
Vik Paruchuri 2026-05-27 10:36:53 -04:00
parent 80c2903ea2
commit 79246df837
8 changed files with 712 additions and 16 deletions

View File

@ -37,6 +37,7 @@ surya_ocr = "surya.scripts.ocr_text:ocr_text_cli"
surya_layout = "surya.scripts.detect_layout:detect_layout_cli"
surya_gui = "surya.scripts.run_streamlit_app:streamlit_app_cli"
surya_table = "surya.scripts.table_recognition:table_recognition_cli"
surya_screenshot = "surya.scripts.screenshot_app:main"
[dependency-groups]
dev = [
@ -48,6 +49,7 @@ dev = [
"pytest>=8.3.4",
"pdftext>=0.5.1",
"tabulate>=0.9.0",
"flask>=3.0.0",
]
[build-system]

View File

@ -265,7 +265,10 @@ def attach_or_spawn(
},
)
# 5. Register atexit cleanup (only spawner)
# 5. Register atexit cleanup (only spawner). Skipped when keep-alive is
# set so the server outlives this process and later commands attach to
# it via the sentinel. (_cleanup is still callable below on startup
# failure, where we always tear a half-started server down.)
def _cleanup():
try:
if spawn_handle.cleanup_kind == "docker":
@ -276,7 +279,13 @@ def attach_or_spawn(
finally:
_delete_sentinel(backend)
atexit.register(_cleanup)
if settings.SURYA_INFERENCE_KEEP_ALIVE:
logger.info(
f"keep-alive: {backend} server on port {port} will stay up "
f"after exit (cleanup_id={spawn_handle.cleanup_id!r})"
)
else:
atexit.register(_cleanup)
# 6. Wait for health
health_url = health_url_for(port)

View File

@ -17,15 +17,45 @@ class CLILoader:
self.debug = cli_options.get("debug", False)
self.output_dir = cli_options.get("output_dir")
# Opt in to leaving the inference server up so later commands reuse it.
if cli_options.get("keep_server"):
settings.SURYA_INFERENCE_KEEP_ALIVE = True
self.load(highres)
@staticmethod
def common_options(fn):
fn = click.argument("input_path", type=click.Path(exists=True), required=True)(fn)
fn = click.option("--output_dir", type=click.Path(exists=False), required=False, default=os.path.join(settings.RESULT_DIR, "surya"), help="Directory to save output.")(fn)
fn = click.option("--page_range", type=str, default=None, help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(fn)
fn = click.option("--images", is_flag=True, help="Save images of detected bboxes.", default=False)(fn)
fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.', default=False)(fn)
fn = click.argument("input_path", type=click.Path(exists=True), required=True)(
fn
)
fn = click.option(
"--output_dir",
type=click.Path(exists=False),
required=False,
default=os.path.join(settings.RESULT_DIR, "surya"),
help="Directory to save output.",
)(fn)
fn = click.option(
"--page_range",
type=str,
default=None,
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
)(fn)
fn = click.option(
"--images",
is_flag=True,
help="Save images of detected bboxes.",
default=False,
)(fn)
fn = click.option(
"--debug", "-d", is_flag=True, help="Enable debug mode.", default=False
)(fn)
fn = click.option(
"--keep_server",
is_flag=True,
default=False,
help="Keep the inference server (vllm/llama.cpp) running after this command exits so later commands reuse it instead of re-spawning.",
)(fn)
return fn
def load(self, highres: bool = False):
@ -34,13 +64,16 @@ class CLILoader:
images, names = load_from_folder(self.filepath, self.page_range)
folder_name = os.path.basename(self.filepath)
if highres:
highres_images, _ = load_from_folder(self.filepath, self.page_range, settings.IMAGE_DPI_HIGHRES)
highres_images, _ = load_from_folder(
self.filepath, self.page_range, settings.IMAGE_DPI_HIGHRES
)
else:
images, names = load_from_file(self.filepath, self.page_range)
folder_name = os.path.basename(self.filepath).split(".")[0]
if highres:
highres_images, _ = load_from_file(self.filepath, self.page_range, settings.IMAGE_DPI_HIGHRES)
highres_images, _ = load_from_file(
self.filepath, self.page_range, settings.IMAGE_DPI_HIGHRES
)
self.images = images
self.highres_images = highres_images
@ -59,5 +92,7 @@ class CLILoader:
page_lst += list(range(int(start), int(end) + 1))
else:
page_lst.append(int(i))
page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order
return page_lst
page_lst = sorted(
list(set(page_lst))
) # Deduplicate page numbers and sort in order
return page_lst

View File

@ -0,0 +1,226 @@
"""Screenshot-friendly Surya viewer.
Shows a PDF/image page on the left and full-page OCR output on the right, side
by side, for clean screenshots. You can scroll through pages and preview them
before running OCR, then export the side-by-side view as a PNG.
Run with `surya_screenshot`, then open http://localhost:8504.
"""
from __future__ import annotations
import base64
import io
import os
import tempfile
import uuid
from typing import List, Optional
import pypdfium2
from flask import Flask, jsonify, render_template, request
from PIL import Image
from werkzeug.utils import secure_filename
from surya.inference import SuryaInferenceManager
from surya.logging import configure_logging, get_logger
from surya.recognition import RecognitionPredictor
from surya.recognition.schema import PageOCRResult
from surya.settings import settings
configure_logging()
logger = get_logger()
app = Flask(__name__)
ALLOWED_EXT = {".pdf", ".png", ".jpg", ".jpeg", ".gif", ".webp"}
UPLOAD_DIR = os.path.join(tempfile.gettempdir(), "surya_screenshot")
os.makedirs(UPLOAD_DIR, exist_ok=True)
_rec: Optional[RecognitionPredictor] = None
def get_rec() -> RecognitionPredictor:
    """Return the shared recognition predictor, creating it on first use.

    The predictor is built around a fresh ``SuryaInferenceManager`` the first
    time this is called and cached in the module-level ``_rec`` afterwards.
    """
    global _rec
    if _rec is not None:
        return _rec
    _rec = RecognitionPredictor(SuryaInferenceManager())
    return _rec
# Datalab-flavored palette for layout block overlays, keyed by canonical label.
LABEL_COLORS = {
"Text": "#2563eb",
"SectionHeader": "#0ea5e9",
"PageHeader": "#7c3aed",
"PageFooter": "#7c3aed",
"Caption": "#c026d3",
"Footnote": "#64748b",
"Equation": "#9333ea",
"Table": "#f59e0b",
"TableOfContents": "#f59e0b",
"Form": "#ea580c",
"ListGroup": "#10b981",
"Picture": "#db2777",
"Figure": "#db2777",
"Diagram": "#db2777",
"Code": "#0d9488",
"default": "#ef4444",
}
def _logo_data_url() -> str:
    """Return the bundled logo as a PNG data URL, or "" if it cannot be read."""
    logo_path = os.path.join(settings.BASE_DIR, "static", "datalab-logo.png")
    try:
        with open(logo_path, "rb") as fh:
            raw = fh.read()
        encoded = base64.b64encode(raw).decode()
    except Exception:
        # Best effort: the template simply omits the <img> when this is empty.
        return ""
    return "data:image/png;base64," + encoded
def _pil_to_data_url(img: Image.Image, fmt: str = "PNG") -> str:
buf = io.BytesIO()
img.save(buf, format=fmt)
return (
f"data:image/{fmt.lower()};base64," + base64.b64encode(buf.getvalue()).decode()
)
def _is_pdf(path: str) -> bool:
return path.lower().endswith(".pdf")
def _page_count(path: str) -> int:
    """Return the number of pages in ``path``.

    PDFs are opened with pypdfium2 and their page count is returned; any
    other file is treated as a single-page image.
    """
    if not _is_pdf(path):
        return 1
    doc = pypdfium2.PdfDocument(path)
    try:
        return len(doc)
    finally:
        # Always release the document, even if counting pages raises.
        doc.close()
def _render_page(path: str, page: int, dpi: int) -> Image.Image:
    """Render a 0-indexed page of a PDF (or load an image file) as RGB.

    Args:
        path: PDF or image file to read.
        page: 0-indexed page number; ignored for non-PDF files.
        dpi: Render resolution for PDFs (scale is ``dpi / 72``); ignored for
            image files, which are loaded at their own size.

    Raises:
        IndexError: if ``page`` is outside the PDF's page range.
    """
    if _is_pdf(path):
        doc = pypdfium2.PdfDocument(path)
        try:
            n_pages = len(doc)
            if not 0 <= page < n_pages:
                raise IndexError(
                    f"page {page} out of range (document has {n_pages} pages)"
                )
            return doc[page].render(scale=dpi / 72).to_pil().convert("RGB")
        finally:
            doc.close()
    # convert() returns an independent copy, so the source file handle can be
    # closed as soon as the conversion is done.
    with Image.open(path) as source:
        return source.convert("RGB")
def _assemble_page_html(page: PageOCRResult) -> str:
"""Whole-page HTML from a PageOCRResult (math stays in <math> tags)."""
parts: List[str] = []
for blk in page.blocks:
if blk.skipped:
continue
x0, y0, x1, y1 = (int(c) for c in blk.bbox)
parts.append(
f'<div data-bbox="{x0} {y0} {x1} {y1}" '
f'data-label="{blk.label}">{blk.html or ""}</div>'
)
return "\n".join(parts)
@app.route("/")
def index():
    """Serve the single-page viewer, passing the logo in as a data URL."""
    logo_url = _logo_data_url()
    return render_template("surya_screenshot.html", logo=logo_url)
@app.route("/info", methods=["POST"])
def info():
    """Return the page count for a file that already exists on the server.

    Expects a JSON object with a string ``file_path``. Responds with
    ``{"page_count": n}`` on success, a 400 JSON error when the path is
    missing or is not an existing file, and a 500 JSON error when the file
    cannot be read.
    """
    # silent=True yields None instead of an HTML error page for a bad body, so
    # every failure below is reported in the JSON shape the client expects.
    payload = request.get_json(silent=True)
    raw_path = payload.get("file_path") if isinstance(payload, dict) else None
    path = raw_path.strip() if isinstance(raw_path, str) else ""
    if not path:
        return jsonify({"error": "file_path is required"}), 400
    # isfile (not exists): a directory is not something we can page through.
    if not os.path.isfile(path):
        return jsonify({"error": f"File not found: {path}"}), 400
    try:
        return jsonify({"page_count": _page_count(path)})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route("/upload", methods=["POST"])
def upload():
    """Accept a drag/drop (or browsed) file, save to a temp path, return it.

    The file is stored under ``UPLOAD_DIR`` with a random prefix. Responds
    with the saved ``file_path``, its ``page_count`` and the original
    ``name``; a 400 JSON error for a missing file or unsupported extension;
    a 500 JSON error when saving or reading the file fails.
    """
    f = request.files.get("file")
    if f is None or not f.filename:
        return jsonify({"error": "no file uploaded"}), 400
    ext = os.path.splitext(f.filename)[1].lower()
    if ext not in ALLOWED_EXT:
        return jsonify({"error": f"unsupported file type: {ext or '(none)'}"}), 400
    safe = secure_filename(f.filename) or f"upload{ext}"
    dest = os.path.join(UPLOAD_DIR, f"{uuid.uuid4().hex}_{safe}")
    try:
        f.save(dest)
        count = _page_count(dest)
    except Exception as e:
        logger.exception("Upload failed")
        # Do not leave an unusable file behind in the temp directory.
        if os.path.exists(dest):
            try:
                os.remove(dest)
            except OSError as cleanup_err:
                logger.warning(f"Could not remove {dest}: {cleanup_err}")
        return jsonify({"error": str(e)}), 500
    return jsonify({"file_path": dest, "page_count": count, "name": f.filename})
@app.route("/page", methods=["POST"])
def page():
    """Render a single page for preview (no OCR).

    Expects a JSON object with a string ``file_path`` and an optional integer
    ``page`` (0-indexed, default 0). Responds with the page as a PNG data URL
    plus its pixel ``width`` and ``height``; a 400 JSON error for a bad path
    or page value; a 500 JSON error when rendering fails.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    raw_path = data.get("file_path")
    path = raw_path.strip() if isinstance(raw_path, str) else ""
    if not path or not os.path.isfile(path):
        return jsonify({"error": "valid file_path is required"}), 400
    # Reject a malformed page number as a client error instead of letting the
    # conversion raise outside the handler below.
    try:
        page_num = int(data.get("page", 0))
    except (TypeError, ValueError, OverflowError):
        return jsonify({"error": "page must be an integer"}), 400
    try:
        img = _render_page(path, page_num, settings.IMAGE_DPI_HIGHRES)
        return jsonify(
            {
                "image_base64": _pil_to_data_url(img),
                "width": img.size[0],
                "height": img.size[1],
            }
        )
    except Exception as e:
        logger.exception("Page render failed")
        return jsonify({"error": str(e)}), 500
@app.route("/process", methods=["POST"])
def process():
    """Run full-page OCR on one page; return the page image + OCR HTML + blocks.

    Expects a JSON object with a string ``file_path`` and an optional integer
    ``page`` (0-indexed, default 0). Responds with the rendered page as a PNG
    data URL, its size, the assembled page HTML, and one overlay entry (bbox,
    label, color) per non-skipped block; a 400 JSON error for a bad path or
    page value; a 500 JSON error when rendering or OCR fails.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    raw_path = data.get("file_path")
    path = raw_path.strip() if isinstance(raw_path, str) else ""
    if not path or not os.path.isfile(path):
        return jsonify({"error": "valid file_path is required"}), 400
    # Reject a malformed page number as a client error instead of letting the
    # conversion raise outside the handler below.
    try:
        page_num = int(data.get("page", 0))
    except (TypeError, ValueError, OverflowError):
        return jsonify({"error": "page must be an integer"}), 400
    try:
        img = _render_page(path, page_num, settings.IMAGE_DPI_HIGHRES)
        page_result = get_rec()([img], full_page=True)[0]
        blocks = [
            {
                "bbox": [int(c) for c in blk.bbox],
                "label": blk.label,
                "color": LABEL_COLORS.get(blk.label, LABEL_COLORS["default"]),
            }
            for blk in page_result.blocks
            if not blk.skipped
        ]
        return jsonify(
            {
                "image_base64": _pil_to_data_url(img),
                "width": img.size[0],
                "height": img.size[1],
                "html": _assemble_page_html(page_result),
                "blocks": blocks,
                # Counts every block, including skipped ones that are left
                # out of "blocks" above.
                "n_blocks": len(page_result.blocks),
            }
        )
    except Exception as e:
        logger.exception("Full-page OCR failed")
        return jsonify({"error": str(e)}), 500
def main():
    """Start the viewer on Flask's built-in server.

    Binds to loopback by default because the app reads whatever server path
    the client sends. Set ``SURYA_SCREENSHOT_HOST`` (e.g. ``0.0.0.0``) to
    expose it on other interfaces and ``SURYA_SCREENSHOT_PORT`` to change the
    port (default 8504).
    """
    host = os.environ.get("SURYA_SCREENSHOT_HOST", "127.0.0.1")
    port = int(os.environ.get("SURYA_SCREENSHOT_PORT", "8504"))
    app.run(host=host, port=port)
if __name__ == "__main__":
main()

View File

@ -4,12 +4,14 @@ inference manager. Detection + OCR-error stay in their own torch paths."""
from __future__ import annotations
import io
import re
import tempfile
import time
from typing import List
import pypdfium2
import streamlit as st
import streamlit.components.v1 as components
from PIL import Image, ImageDraw
from surya.debug.draw import draw_polys_on_image, draw_bboxes_on_image
@ -24,6 +26,61 @@ from surya.table_rec import TableRecPredictor
from surya.table_rec.schema import TableResult
# KaTeX-enabled HTML wrapper. The OCR HTML wraps math in <math>...</math>
# (KaTeX-compatible LaTeX inside), which a browser would otherwise show as
# raw text. We convert those tags to \( \) / \[ \] delimiters and let KaTeX
# auto-render typeset them inside an iframe component.
_KATEX_HEAD = r"""<!doctype html><html><head>
<meta charset="utf-8">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css">
<script src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js"></script>
<style>
/* White "paper" card so the text stays readable in both light and dark
Streamlit themes (the iframe is otherwise transparent and our text is dark). */
html,body{background:#ffffff;}
body{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif;font-size:15px;line-height:1.55;color:#111111;margin:0;padding:14px;}
table{border-collapse:collapse;margin:6px 0;} td,th{border:1px solid #bbb;padding:3px 6px;color:#111111;}
[data-label="SectionHeader"],[data-label="PageHeader"]{font-weight:600;}
</style></head><body>
"""
_KATEX_TAIL = r"""
<script>
renderMathInElement(document.body, {
delimiters: [
{left: "\\[", right: "\\]", display: true},
{left: "\\(", right: "\\)", display: false}
],
throwOnError: false
});
</script></body></html>
"""
_MATH_RE = re.compile(r"<math\b([^>]*)>(.*?)</math>", re.DOTALL | re.IGNORECASE)
def _math_to_katex(html_str: str) -> str:
"""Rewrite <math>...</math> tags into KaTeX \\( \\) / \\[ \\] delimiters."""
def repl(m: "re.Match") -> str:
attrs, inner = m.group(1), m.group(2)
if re.search(r"""display\s*=\s*["']block["']""", attrs):
return "\\[" + inner + "\\]"
return "\\(" + inner + "\\)"
return _MATH_RE.sub(repl, html_str or "")
def render_ocr_html(html_str: str, height: int = 400) -> None:
    """Show OCR HTML in a scrollable iframe component with math typeset by KaTeX.

    The HTML has its <math> tags rewritten by ``_math_to_katex`` and is then
    placed between the ``_KATEX_HEAD`` and ``_KATEX_TAIL`` page fragments.
    """
    document = "".join((_KATEX_HEAD, _math_to_katex(html_str), _KATEX_TAIL))
    components.html(document, height=height, scrolling=True)
def _assemble_page_html(page: PageOCRResult) -> str:
"""Reconstruct a div-block whole-page HTML from a PageOCRResult."""
parts: List[str] = []
@ -334,7 +391,7 @@ if run_block_ocr:
)
full_html = _assemble_page_html(page)
with st.expander("Full page HTML (rendered)", expanded=False):
st.markdown(full_html, unsafe_allow_html=True)
render_ocr_html(full_html, height=600)
with st.expander("Full page HTML (source)", expanded=False):
st.code(full_html, language="html")
for blk in page.blocks:
@ -366,6 +423,7 @@ if run_block_ocr:
elif blk.error:
st.error("Block OCR errored")
else:
render_ocr_html(blk.html, height=160)
st.code(blk.html, language="html")
@ -382,7 +440,7 @@ if run_full_page_ocr:
)
full_html = _assemble_page_html(page)
with st.expander("Full page HTML (rendered)", expanded=False):
st.markdown(full_html, unsafe_allow_html=True)
render_ocr_html(full_html, height=600)
with st.expander("Full page HTML (source)", expanded=False):
st.code(full_html, language="html")
for blk in page.blocks:
@ -394,7 +452,7 @@ if run_full_page_ocr:
elif blk.error:
st.error("Block OCR errored")
else:
st.markdown(blk.html, unsafe_allow_html=True)
render_ocr_html(blk.html, height=160)
st.code(blk.html, language="html")
@ -412,7 +470,7 @@ if run_table_rec:
for pred in preds:
if pred.mode == "full" and pred.html:
with st.expander("Table HTML"):
st.markdown(pred.html, unsafe_allow_html=True)
render_ocr_html(pred.html, height=400)
st.code(pred.html, language="html")
else:
st.json(pred.model_dump(), expanded=False)

View File

@ -0,0 +1,331 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Surya · Full-Page OCR</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css">
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js"></script>
<style>
:root {
--ink: #0f1115;
--bg: #f4f6f9;
--panel: #ffffff;
--border: #e4e7ec;
--accent: #2563eb;
--accent-hover: #1d4ed8;
--muted: #667085;
}
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: var(--bg);
color: var(--ink);
height: 100vh;
display: flex;
flex-direction: column;
overflow: hidden;
}
header {
display: flex; align-items: center; gap: 16px;
padding: 12px 20px;
background: var(--panel);
border-bottom: 1px solid var(--border);
box-shadow: 0 1px 3px rgba(16,24,40,.04);
z-index: 10;
}
.brand { display: flex; align-items: center; gap: 10px; }
.brand img { height: 30px; width: auto; }
.brand .title { font-size: 18px; font-weight: 700; letter-spacing: -.01em; }
.brand .sub { font-size: 13px; color: var(--muted); font-weight: 500;
padding-left: 10px; margin-left: 4px; border-left: 1px solid var(--border); }
.controls { display: flex; align-items: center; gap: 10px; flex: 1; flex-wrap: wrap; }
input[type=text] {
flex: 1; min-width: 260px; max-width: 560px;
padding: 9px 12px; font-size: 14px;
border: 1px solid var(--border); border-radius: 8px; background: #fff;
}
input[type=text]:focus { outline: none; border-color: var(--accent); box-shadow: 0 0 0 3px rgba(37,99,235,.12); }
button {
padding: 9px 14px; font-size: 14px; font-weight: 600;
border: none; border-radius: 8px; cursor: pointer;
background: var(--accent); color: #fff; transition: background .15s;
}
button:hover:not(:disabled) { background: var(--accent-hover); }
button.secondary { background: #fff; color: var(--ink); border: 1px solid var(--border); }
button.secondary:hover:not(:disabled) { background: #f2f4f7; }
button:disabled { opacity: .45; cursor: not-allowed; }
.pager { display: flex; align-items: center; gap: 6px; }
.pager .pageind { font-size: 13px; color: var(--muted); min-width: 96px; text-align: center; }
.pager button { padding: 7px 11px; }
.toggle { display: flex; align-items: center; gap: 7px; font-size: 13px; color: var(--muted); cursor: pointer; user-select: none; }
.status { font-size: 13px; font-weight: 600; min-width: 80px; }
.status.loading { color: #b45309; }
.status.error { color: #d92d20; }
.status.ok { color: #079455; }
.stage {
flex: 1; display: flex; gap: 16px; padding: 16px; overflow: hidden;
}
.panel {
flex: 1; min-width: 0; display: flex; flex-direction: column;
background: var(--panel); border: 1px solid var(--border);
border-radius: 12px; overflow: hidden;
box-shadow: 0 1px 2px rgba(16,24,40,.05);
}
.panel-head {
padding: 11px 16px; font-size: 13px; font-weight: 700;
letter-spacing: .02em; text-transform: uppercase; color: var(--muted);
border-bottom: 1px solid var(--border); background: #fcfcfd;
}
.panel-body { flex: 1; overflow: auto; }
.img-wrap {
display: flex; align-items: flex-start; justify-content: center;
padding: 16px; background: #f0f2f5; min-height: 100%;
}
#pageCanvas { max-width: 100%; height: auto; border-radius: 6px;
box-shadow: 0 2px 10px rgba(16,24,40,.12); background: #fff; }
.ocr {
padding: 28px 32px; line-height: 1.6; font-size: 16px; color: #1d2433;
}
.ocr [data-label="SectionHeader"], .ocr [data-label="Title"] { font-weight: 700; font-size: 1.15em; margin: .5em 0 .3em; }
.ocr [data-label="PageHeader"], .ocr [data-label="PageFooter"] { color: var(--muted); font-size: .9em; }
.ocr table { border-collapse: collapse; margin: 14px 0; width: 100%; }
.ocr th, .ocr td { border: 1px solid #d0d5dd; padding: 6px 10px; text-align: left; }
.ocr th { background: #f2f4f7; font-weight: 600; }
.ocr img { max-width: 100%; height: auto; }
.placeholder { padding: 48px 32px; color: var(--muted); font-size: 15px; text-align: center; }
#dropOverlay { position: fixed; inset: 0; z-index: 100; display: none;
align-items: center; justify-content: center; background: rgba(15,17,21,.55); }
#dropOverlay.active { display: flex; }
#dropOverlay .drop-card { padding: 40px 64px; border: 3px dashed #fff;
border-radius: 16px; color: #fff; font-size: 22px; font-weight: 700;
background: rgba(37,99,235,.30); }
/* While screenshotting, expand panels to full content height so
html2canvas captures everything, not just the visible scroll area. */
body.capturing { height: auto !important; overflow: visible !important; }
body.capturing .stage { height: auto !important; overflow: visible !important; }
body.capturing .panel-body { height: auto !important; overflow: visible !important; }
</style>
</head>
<body>
<header>
<div class="brand">
{% if logo %}<img src="{{ logo }}" alt="Datalab">{% endif %}
<span class="title">Surya</span>
<span class="sub">Full-Page OCR</span>
</div>
<div class="controls">
<input type="text" id="filePath" placeholder="Drop a file, Browse, or type a server path…">
<input type="file" id="fileInput" accept=".pdf,.png,.jpg,.jpeg,.gif,.webp" style="display:none" onchange="onPick(this)">
<button class="secondary" onclick="document.getElementById('fileInput').click()">Browse</button>
<button class="secondary" id="loadBtn" onclick="loadFile()">Load</button>
<div class="pager">
<button class="secondary" id="prevBtn" onclick="changePage(-1)" aria-label="Previous page" title="Previous page" disabled>‹</button>
<span class="pageind" id="pageInd"></span>
<button class="secondary" id="nextBtn" onclick="changePage(1)" aria-label="Next page" title="Next page" disabled>›</button>
</div>
<button id="runBtn" onclick="runOCR()" disabled>Run Full-Page OCR</button>
<label class="toggle"><input type="checkbox" id="showBoxes" checked onchange="drawLeft()"> Layout boxes</label>
<button class="secondary" id="copyBtn" onclick="copyHtml()" disabled>Copy HTML</button>
<button class="secondary" id="shotBtn" onclick="saveScreenshot()" disabled>Save Screenshot</button>
<span class="status" id="status"></span>
</div>
</header>
<div class="stage">
<div class="panel">
<div class="panel-head">PDF Page</div>
<div class="panel-body"><div class="img-wrap"><canvas id="pageCanvas"></canvas></div></div>
</div>
<div class="panel">
<div class="panel-head">Full-Page OCR</div>
<div class="panel-body"><div class="ocr" id="ocr"><div class="placeholder">Load a file, scroll to a page, then run full-page OCR.</div></div></div>
</div>
</div>
<div id="dropOverlay"><div class="drop-card">Drop a PDF or image to load</div></div>
<script>
const S = { path: "", name: "", page: 0, count: 0, img: null, blocks: null, html: null, ocrPage: null };
const $ = (id) => document.getElementById(id);
// Show `msg` in the header status area; `kind` ("loading" | "error" | "ok")
// selects the color class. Calling with no arguments clears the status.
function setStatus(msg, kind) {
  const statusEl = $("status");
  statusEl.textContent = msg || "";
  statusEl.className = "status " + (kind || "");
}
// POST `body` as JSON to `url` and return the parsed JSON response.
// Throws an Error carrying the server's `error` message (or a generic one
// with the HTTP status) when the request fails.
async function post(url, body) {
  const r = await fetch(url, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify(body) });
  // An error page from the server may not be JSON; do not let the parse
  // failure hide the real HTTP error.
  let data = null;
  try { data = await r.json(); } catch (e) { data = null; }
  if (!r.ok) throw new Error((data && data.error) || `Request failed (${r.status})`);
  if (data === null) throw new Error("Server returned an invalid response");
  return data;
}
// Load the file named in the path box: look up its page count on the server,
// then render page 1.
async function loadFile() {
  const val = $("filePath").value.trim();
  if (!val) { setStatus("Drop a file or enter a path", "error"); return; }
  // If the box still shows the name of an uploaded file, just reload it.
  if (S.path && val === S.name) { S.page = 0; await showPage(); return; }
  setStatus("Loading…", "loading");
  try {
    const info = await post("/info", { file_path: val });
    S.path = val; S.name = val; S.count = info.page_count; S.page = 0;
    // showPage() sets the final status itself (cleared on success, error on
    // failure), so it must not be cleared again here.
    await showPage();
  } catch (e) { setStatus(e.message, "error"); }
}
// Change handler for the hidden file input: upload the chosen file.
function onPick(input) {
  const file = input.files && input.files[0];
  // Reset the input so choosing the same file again still fires "change".
  input.value = "";
  if (file) uploadFile(file);
}
// Upload `file` to the server, remember the returned temp path and page
// count, then render page 1.
async function uploadFile(file) {
  setStatus("Uploading…", "loading");
  const fd = new FormData();
  fd.append("file", file);
  try {
    const r = await fetch("/upload", { method: "POST", body: fd });
    // An error page from the server may not be JSON.
    let data = null;
    try { data = await r.json(); } catch (e) { data = null; }
    if (!r.ok || !data) throw new Error((data && data.error) || "Upload failed");
    S.path = data.file_path; S.name = data.name; S.count = data.page_count; S.page = 0;
    $("filePath").value = data.name;
    // showPage() sets the final status itself (cleared on success, error on
    // failure), so it must not be cleared again here.
    await showPage();
  } catch (e) { setStatus(e.message, "error"); }
}
// Fetch and draw the preview of the current page (S.page of S.path), and
// reset everything that belonged to a previous OCR run.
async function showPage() {
  setStatus("Rendering…", "loading");
  try {
    const preview = await post("/page", { file_path: S.path, page: S.page });
    // New page → clear any previous OCR output.
    S.blocks = null;
    S.html = null;
    S.ocrPage = null;
    $("ocr").innerHTML = '<div class="placeholder">Run full-page OCR to see the extracted content.</div>';
    for (const id of ["shotBtn", "copyBtn"]) {
      $(id).disabled = true;
    }
    loadImage(preview.image_base64, () => drawLeft());
    updatePager();
    $("runBtn").disabled = false;
    setStatus("");
  } catch (err) {
    setStatus(err.message, "error");
  }
}
// Decode `src` into an Image; once loaded, store it in S.img and invoke the
// optional callback.
function loadImage(src, cb) {
  const image = new Image();
  image.onload = () => {
    S.img = image;
    if (cb) cb();
  };
  image.src = src;
}
// Refresh the "Page X of N" indicator and enable/disable the prev/next buttons.
function updatePager() {
  const { page, count } = S;
  $("pageInd").textContent = count ? `Page ${page + 1} of ${count}` : "—";
  $("prevBtn").disabled = page <= 0;
  $("nextBtn").disabled = page >= count - 1;
}
// Move `delta` pages from the current one and render it; does nothing when
// the target falls outside the document.
function changePage(delta) {
  const target = S.page + delta;
  const outOfRange = target < 0 || target >= S.count;
  if (outOfRange) return;
  S.page = target;
  showPage();
}
// Paint the current page image onto the left canvas and, when OCR blocks are
// available and "Layout boxes" is checked, overlay each block's box and label.
function drawLeft() {
  if (!S.img) return;
  const canvas = $("pageCanvas");
  const ctx = canvas.getContext("2d");
  // The canvas takes the image's pixel size, so block bboxes are drawn as-is.
  canvas.width = S.img.naturalWidth;
  canvas.height = S.img.naturalHeight;
  ctx.drawImage(S.img, 0, 0);
  if (!S.blocks || !$("showBoxes").checked) return;

  ctx.lineWidth = 3;
  ctx.font = 'bold 15px -apple-system, "Segoe UI", sans-serif';
  ctx.textBaseline = "top";
  for (const block of S.blocks) {
    const [left, top, right, bottom] = block.bbox;
    const boxWidth = right - left;
    const boxHeight = bottom - top;
    // Translucent fill ("26" is the appended alpha byte) plus a solid outline.
    ctx.strokeStyle = block.color;
    ctx.fillStyle = block.color + "26";
    ctx.fillRect(left, top, boxWidth, boxHeight);
    ctx.strokeRect(left, top, boxWidth, boxHeight);
    // Label tab sits just above the box, clamped to the top edge of the canvas.
    const labelWidth = ctx.measureText(block.label).width;
    const labelTop = Math.max(top - 22, 0);
    ctx.fillStyle = block.color;
    ctx.fillRect(left, labelTop, labelWidth + 12, 21);
    ctx.fillStyle = "#fff";
    ctx.fillText(block.label, left + 6, labelTop + 3);
  }
}
// Run full-page OCR on the current page and show the result: layout boxes on
// the left, rendered HTML (with math typeset) on the right.
async function runOCR() {
  if (!S.path) return;
  // Remember what was requested: the user can change page or file while the
  // OCR request is still in flight.
  const path = S.path, page = S.page;
  setStatus("Running OCR…", "loading");
  $("runBtn").disabled = true;
  try {
    const data = await post("/process", { file_path: path, page: page });
    // Drop a stale result instead of painting it over a different page.
    if (S.path !== path || S.page !== page) return;
    S.blocks = data.blocks; S.html = data.html || ""; S.ocrPage = page;
    loadImage(data.image_base64, () => drawLeft());
    const ocr = $("ocr");
    ocr.innerHTML = data.html || '<div class="placeholder">No content detected on this page.</div>';
    renderMath(ocr);
    $("shotBtn").disabled = false;
    $("copyBtn").disabled = !S.html;
    setStatus(`${data.n_blocks} blocks`, "ok");
  } catch (e) { setStatus(e.message, "error"); }
  finally { $("runBtn").disabled = false; }
}
// Replace every <math> element under `root` with KaTeX-rendered markup:
// a <div> in display mode for display="block", otherwise an inline <span>.
function renderMath(root) {
  for (const mathEl of root.querySelectorAll("math")) {
    const isBlock = mathEl.getAttribute("display") === "block";
    try {
      const holder = document.createElement(isBlock ? "div" : "span");
      holder.innerHTML = katex.renderToString(mathEl.textContent, { displayMode: isBlock, throwOnError: false });
      mathEl.replaceWith(holder);
    } catch (err) {
      /* leave raw on failure */
    }
  }
}
// Copy the last OCR HTML to the clipboard, using the async Clipboard API when
// present and falling back to fallbackCopy() when it is missing or rejects.
function copyHtml() {
  if (!S.html) return;
  const done = () => setStatus("HTML copied", "ok");
  const hasClipboardApi = navigator.clipboard && navigator.clipboard.writeText;
  if (!hasClipboardApi) {
    fallbackCopy(S.html, done);
    return;
  }
  navigator.clipboard
    .writeText(S.html)
    .then(done)
    .catch(() => fallbackCopy(S.html, done));
}
// Copy `text` through a temporary off-screen <textarea> and execCommand.
// Calls `done` on success and shows "Copy failed" otherwise.
function fallbackCopy(text, done) {
  const ta = document.createElement("textarea");
  ta.value = text; ta.style.position = "fixed"; ta.style.opacity = "0";
  document.body.appendChild(ta); ta.select();
  try {
    // execCommand reports failure by returning false, not only by throwing.
    if (document.execCommand("copy")) done();
    else setStatus("Copy failed", "error");
  } catch (e) {
    setStatus("Copy failed", "error");
  } finally {
    document.body.removeChild(ta);
  }
}
// Capture the whole side-by-side stage (not just the visible scroll area)
// with html2canvas and download it as a PNG named after the OCR'd page.
async function saveScreenshot() {
  setStatus("Capturing…", "loading");
  const stage = document.querySelector(".stage");
  // The "capturing" class expands the panels to their full content height.
  document.body.classList.add("capturing");
  try {
    // Let the expanded layout settle before measuring full size.
    await new Promise((resolve) => {
      requestAnimationFrame(() => requestAnimationFrame(resolve));
    });
    const fullWidth = stage.scrollWidth;
    const fullHeight = stage.scrollHeight;
    const canvas = await html2canvas(stage, {
      backgroundColor: "#f4f6f9",
      scale: 2,
      useCORS: true,
      logging: false,
      width: fullWidth,
      height: fullHeight,
      windowWidth: fullWidth,
      windowHeight: fullHeight,
      scrollX: 0,
      scrollY: 0,
    });
    const link = document.createElement("a");
    link.href = canvas.toDataURL("image/png");
    link.download = `surya_ocr_page_${(S.ocrPage ?? S.page) + 1}.png`;
    link.click();
    setStatus("Saved", "ok");
  } catch (err) {
    setStatus("Screenshot failed: " + err.message, "error");
  } finally {
    document.body.classList.remove("capturing");
  }
}
// Pressing Enter in the path box does the same as clicking Load.
$("filePath").addEventListener("keypress", (e) => { if (e.key === "Enter") loadFile(); });
// Drag & drop anywhere in the window.
// dragDepth is incremented on every dragenter and decremented on every
// dragleave; the overlay is hidden only once the count is back to zero, and
// the count is reset on drop.
let dragDepth = 0;
window.addEventListener("dragenter", (e) => { e.preventDefault(); dragDepth++; $("dropOverlay").classList.add("active"); });
// preventDefault on dragover is what allows the window to receive the drop.
window.addEventListener("dragover", (e) => { e.preventDefault(); });
window.addEventListener("dragleave", (e) => { e.preventDefault(); if (--dragDepth <= 0) { dragDepth = 0; $("dropOverlay").classList.remove("active"); } });
window.addEventListener("drop", (e) => {
e.preventDefault(); dragDepth = 0; $("dropOverlay").classList.remove("active");
// Only the first dropped file is uploaded.
const f = e.dataTransfer.files && e.dataTransfer.files[0];
if (f) uploadFile(f);
});
</script>
</body>
</html>

View File

@ -49,6 +49,10 @@ class Settings(BaseSettings):
SURYA_INFERENCE_BACKEND: Optional[str] = None # "vllm" | "llamacpp" | None (auto)
SURYA_INFERENCE_URL: Optional[str] = None # external server, skip spawn
SURYA_INFERENCE_AUTOSTART: bool = True
# Leave an auto-spawned server running after the process exits so later
# commands attach to it instead of re-spawning (avoids repeated startup /
# model-load cost). Stop it manually when done — see `surya/inference`.
SURYA_INFERENCE_KEEP_ALIVE: bool = False
SURYA_INFERENCE_HOST: str = "127.0.0.1"
SURYA_INFERENCE_PORT: Optional[int] = None # None = pick a free port
SURYA_INFERENCE_PARALLEL: int = 8

31
uv.lock
View File

@ -823,6 +823,23 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970 },
]
[[package]]
name = "flask"
version = "3.1.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "blinker" },
{ name = "click" },
{ name = "itsdangerous" },
{ name = "jinja2" },
{ name = "markupsafe" },
{ name = "werkzeug" },
]
sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424 },
]
[[package]]
name = "fqdn"
version = "1.5.1"
@ -3944,6 +3961,7 @@ dependencies = [
[package.dev-dependencies]
dev = [
{ name = "datasets" },
{ name = "flask" },
{ name = "jupyter" },
{ name = "pdftext" },
{ name = "pre-commit" },
@ -3976,6 +3994,7 @@ requires-dist = [
[package.metadata.requires-dev]
dev = [
{ name = "datasets", specifier = ">=2.16.1" },
{ name = "flask", specifier = ">=3.0.0" },
{ name = "jupyter", specifier = ">=1.0.0" },
{ name = "pdftext", specifier = ">=0.5.1" },
{ name = "pre-commit", specifier = ">=4.2.0" },
@ -4468,6 +4487,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598 },
]
[[package]]
name = "werkzeug"
version = "3.1.8"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "markupsafe" },
]
sdist = { url = "https://files.pythonhosted.org/packages/dd/b2/381be8cfdee792dd117872481b6e378f85c957dd7c5bca38897b08f765fd/werkzeug-3.1.8.tar.gz", hash = "sha256:9bad61a4268dac112f1c5cd4630a56ede601b6ed420300677a869083d70a4c44", size = 875852 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/93/8c/2e650f2afeb7ee576912636c23ddb621c91ac6a98e66dc8d29c3c69446e1/werkzeug-3.1.8-py3-none-any.whl", hash = "sha256:63a77fb8892bf28ebc3178683445222aa500e48ebad5ec77b0ad80f8726b1f50", size = 226459 },
]
[[package]]
name = "widgetsnbextension"
version = "4.0.15"