diff --git a/MODEL_LICENSE b/MODEL_LICENSE index cc1b863..13b388a 100644 --- a/MODEL_LICENSE +++ b/MODEL_LICENSE @@ -53,7 +53,7 @@ As conditions to the Licenses set forth in this Agreement, You agree not to use, (a) In any way that violates any applicable national, federal, state, local or international law or regulation; or (b) to directly or indirectly infringe or misappropriate any third party intellectual property rights (including those of Licensor or any Contributor) 2. Commercial: -(a) for any purpose if You (your employer, or the entity you are affiliated with) generated more than two million US Dollars ($2,000,000) in gross revenue in the prior year, except where Your Use is limited to personal use or research purposes; -(b) for any purpose if You (your employer, or the entity you are affiliated with) has raised more than two million US dollars ($2,000,000) in total equity or debt funding from any source, except where Your Use is limited to personal use or research purposes; or +(a) for any purpose if You (your employer, or the entity you are affiliated with) generated more than five million US Dollars ($5,000,000) in gross revenue in the prior year, except where Your Use is limited to personal use or research purposes; +(b) for any purpose if You (your employer, or the entity you are affiliated with) has raised more than five million US dollars ($5,000,000) in total equity or debt funding from any source, except where Your Use is limited to personal use or research purposes; or (c) for any purpose if You (your employer, or the entity you are affiliated with) provides or otherwise makes available any product or service that competes with any product or service offered by or made available by Licensor or any of its affiliates. 
Commercial and broader use licenses may be available from Licensor at the following URL: https://www.datalab.to/ \ No newline at end of file diff --git a/README.md b/README.md index 77c3d6e..5a90f1c 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,13 @@ # Surya -Surya is a document OCR toolkit powered by a 650M param model that does: +Surya is an OCR toolkit powered by a 650M param model that does: -- Full-page OCR with layout, ranking near the top of [olmOCR-bench](https://huggingface.co/datasets/allenai/olmOCR-bench) +- Full-page OCR, scoring 83.3% on [olmOCR-bench](https://huggingface.co/datasets/allenai/olmOCR-bench) (top under 3B params) +- Multilingual OCR - scores 87.2% on an internal benchmark set of 91 languages (more [here](#multilingual)) - Line-level text detection - Layout analysis (table, image, header, etc.) with reading order -- Table recognition (rows + columns + cell HTML) +- Table recognition (rows + columns) It works on a range of documents (see [usage](#usage) and [benchmarks](#benchmarks)). @@ -30,12 +31,13 @@ It works on a range of documents (see [usage](#usage) and [benchmarks](#benchmar Our managed platform runs both Surya, and variants of our highest accuracy model, [Chandra](https://github.com/datalab-to/chandra). -If you have high volume workloads, we offer a batch processing service that can process 1B+ pages per week. +Get started with **$5 in free credits** — [sign up](https://www.datalab.to/?utm_source=gh-surya) (takes under 30 seconds) or try our free [public playground](https://www.datalab.to/playground?utm_source=gh-surya). -Get started with **$5 in free credits** — [sign up](https://www.datalab.to/?utm_source=gh-surya) (takes under 30 seconds) or try our [public playground](https://www.datalab.to/playground?utm_source=gh-surya). +Commercial self-hosting of the model weights requires a license — see [Commercial usage](#commercial-usage). 
For on-prem licensing, [contact us](https://www.datalab.to/contact?utm_source=gh-surya-onprem). If you have high volume workloads, we offer a batch processing service that can process 1B+ pages per week. -Commercial self-hosting of the model weights requires a license — see [Commercial usage](#commercial-usage). For on-prem licensing, [contact us](https://www.datalab.to/contact?utm_source=gh-surya-onprem). +## Model Information +Surya is a 650M param model that scores 83.3% on the olmocr bench - better than models 10x larger. @@ -53,20 +55,19 @@ Surya is named for the [Hindu sun god](https://en.wikipedia.org/wiki/Surya), who ## Examples -Each row links to four annotated views of the same page: text-line detection, -layout, reading order, and (when present) table recognition. +Each row links to five annotated views of the same page: text-line detection, OCR, layout, reading order, and (when present) table recognition. -| Name | Detection | Layout | Order | Table Rec | -|------------------|:-----------------------------------:|---------------------------------------------:|------------------------------------------------:|------------------------------------------------:| -| Newspaper | [Image](static/images/newspaper.png) | [Image](static/images/newspaper_layout.png) | [Image](static/images/newspaper_reading.png) | | -| Textbook | [Image](static/images/textbook.png) | [Image](static/images/textbook_layout.png) | [Image](static/images/textbook_reading.png) | | -| Tax Form | [Image](static/images/form.png) | [Image](static/images/form_layout.png) | [Image](static/images/form_reading.png) | [Image](static/images/form_tablerec.png) | -| Handwritten Notes | [Image](static/images/handwritten.png) | [Image](static/images/handwritten_layout.png) | [Image](static/images/handwritten_reading.png) | [Image](static/images/handwritten_tablerec.png) | -| Corporate Doc | [Image](static/images/corporate.png) | [Image](static/images/corporate_layout.png) | 
[Image](static/images/corporate_reading.png) | [Image](static/images/corporate_tablerec.png) | +| Name | Detection | OCR | Layout | Order | Table Rec | +|-------------------|:-----------------------------------:|------------------------------------------:|---------------------------------------------:|------------------------------------------------:|------------------------------------------------:| +| Newspaper | [Image](static/images/newspaper.png) | [Image](static/images/newspaper_text.png) | [Image](static/images/newspaper_layout.png) | [Image](static/images/newspaper_reading.png) | | +| Textbook | [Image](static/images/textbook.png) | [Image](static/images/textbook_text.png) | [Image](static/images/textbook_layout.png) | [Image](static/images/textbook_reading.png) | | +| Tax Form | [Image](static/images/form.png) | [Image](static/images/form_text.png) | [Image](static/images/form_layout.png) | [Image](static/images/form_reading.png) | [Image](static/images/form_tablerec.png) | +| Handwritten Notes | [Image](static/images/handwritten.png) | [Image](static/images/handwritten_text.png) | [Image](static/images/handwritten_layout.png) | [Image](static/images/handwritten_reading.png) | [Image](static/images/handwritten_tablerec.png) | +| Corporate Doc | [Image](static/images/corporate.png) | [Image](static/images/corporate_text.png) | [Image](static/images/corporate_layout.png) | [Image](static/images/corporate_reading.png) | [Image](static/images/corporate_tablerec.png) | # Commercial usage -The Surya code is licensed under Apache 2.0. The model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue). For broader commercial licensing of the model weights, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-surya). +The Surya code is licensed under Apache 2.0. 
The model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $5M funding/revenue). For broader commercial licensing of the model weights, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-surya). # Installation @@ -354,18 +355,18 @@ standard quality benchmark for document parsers. ## olmOCR-bench - +Best-in-class accuracy under 1B parameters; pareto-optimal vs every model 3B and below. -| Model | Params | Score | -|-----------------------------|-------:|--------:| -| Infinity-Parser2-Pro | 35.1B | 87.6 | -| Chandra OCR 2 (Datalab) | 5.3B | 85.9 | -| dots.mocr | 3.0B | 83.9 | -| LightOnOCR 2-1B \* | 1.0B | 83.2 | -| **Surya OCR 2** (Datalab) | **0.65B** | **83.1** | -| Chandra OCR 1 (Datalab) | 9.0B | 83.1 | -| olmOCR (anchored) | 8.3B | 77.4 | -| GOT OCR | 0.6B | 48.3 | +| Model | Params | Score | +|-----------------------------|----------:|---------:| +| Infinity-Parser2-Pro | 35.1B | 87.6 | +| Chandra OCR 2 (Datalab) | 5.3B | 85.9 | +| dots.mocr | 3.0B | 83.9 | +| **Surya OCR 2** (Datalab) | **0.65B** | **83.3** | +| LightOnOCR 2-1B \* | 1.0B | 83.2 | +| Chandra OCR 1 (Datalab) | 9.0B | 83.1 | +| olmOCR (anchored) | 8.3B | 77.4 | +| GOT OCR | 0.6B | 48.3 | \* **LightOnOCR 2-1B** uses a different benchmark methodology than the other entries (see their [release notes](https://huggingface.co/lightonai/LightOnOCR-2-1B)); the score is included for context but is not directly comparable. 
@@ -375,7 +376,38 @@ Surya 2, per-source pass rate on the `default` preset (8,413 tests total): | ArXiv | Base | Hdr/Ftr | TinyTxt | MultCol | OldScan | OldMath | Tables | |------:|-----:|--------:|--------:|--------:|--------:|--------:|-------:| -| 88.7 | 99.9 | 92.1 | 86.4 | 82.6 | 42.8 | 85.8 | 86.6 | +| 88.3 | 99.7 | 92.5 | 93.7 | 82.4 | 41.8 | 81.4 | 86.6 | + +## Multilingual + +We also evaluate Surya 2 against a 91-language internal benchmark covering +text accuracy, layout, tables, math, and reading order in documents drawn +from each language. + +**Overall pass rate: 87.2% across 91 languages.** 38 of the +91 languages score ≥ 90%; 76 score ≥ 80%. + +Top 15 widely-spoken languages: + +| Code | Language | Score | +|------|-------------|------:| +| `ar` | Arabic | 72.7% | +| `bn` | Bengali | 82.7% | +| `zh` | Chinese | 82.5% | +| `en` | English | 92.3% | +| `fr` | French | 89.3% | +| `de` | German | 89.7% | +| `hi` | Hindi | 82.2% | +| `it` | Italian | 93.0% | +| `ja` | Japanese | 86.2% | +| `ko` | Korean | 86.7% | +| `fa` | Persian | 82.3% | +| `pt` | Portuguese | 86.1% | +| `ru` | Russian | 88.8% | +| `es` | Spanish | 90.7% | +| `vi` | Vietnamese | 73.2% | + +See [static/docs/multilingual.md](static/docs/multilingual.md) for the full 91-language table. ## Throughput @@ -401,7 +433,7 @@ client-side against a running inference server. ## Reproducing We score Surya 2 on olmOCR-bench by serving the model with `vllm` (or -`llama.cpp`) and running the official olmOCR-bench harness from +`llama.cpp`) and running the olmOCR-bench harness from [allenai/olmocr](https://github.com/allenai/olmocr), with some adjustments applied to account for our output HTML format. # Training diff --git a/static/docs/multilingual.md b/static/docs/multilingual.md new file mode 100644 index 0000000..ca8115e --- /dev/null +++ b/static/docs/multilingual.md @@ -0,0 +1,101 @@ +# Surya 2 — full 91-language results + +Overall pass rate: **87.2%** across 91 languages (32,055 tests). 
Each language +has between ~30 and ~750 tests. + +Sorted alphabetically by language name. See [README](../../README.md#multilingual) +for the curated subset. + +| Code | Language | Score | +|------|-------------------|------:| +| `af` | Afrikaans | 87.7% | +| `sq` | Albanian | 88.9% | +| `am` | Amharic | 74.6% | +| `ar` | Arabic | 72.7% | +| `hy` | Armenian | 90.1% | +| `as` | Assamese | 86.4% | +| `az` | Azerbaijani | 91.5% | +| `eu` | Basque | 85.5% | +| `be` | Belarusian | 98.5% | +| `bn` | Bengali | 82.7% | +| `bs` | Bosnian | 92.5% | +| `br` | Breton | 93.6% | +| `bg` | Bulgarian | 95.4% | +| `my` | Burmese | 88.2% | +| `ca` | Catalan | 86.4% | +| `zh` | Chinese | 82.5% | +| `hr` | Croatian | 92.5% | +| `cs` | Czech | 85.8% | +| `da` | Danish | 84.5% | +| `nl` | Dutch | 86.5% | +| `en` | English | 92.3% | +| `eo` | Esperanto | 83.1% | +| `et` | Estonian | 76.8% | +| `fi` | Finnish | 87.5% | +| `fr` | French | 89.3% | +| `gl` | Galician | 85.5% | +| `ka` | Georgian | 91.4% | +| `de` | German | 89.7% | +| `el` | Greek | 80.7% | +| `gu` | Gujarati | 83.4% | +| `ha` | Hausa | 89.6% | +| `he` | Hebrew | 90.9% | +| `hi` | Hindi | 82.2% | +| `hu` | Hungarian | 90.6% | +| `is` | Icelandic | 89.5% | +| `id` | Indonesian | 90.3% | +| `ga` | Irish | 92.8% | +| `it` | Italian | 93.0% | +| `ja` | Japanese | 86.2% | +| `jv` | Javanese | 91.1% | +| `kn` | Kannada | 79.2% | +| `kk` | Kazakh | 90.3% | +| `km` | Khmer | 75.0% | +| `ko` | Korean | 86.7% | +| `ku` | Kurdish | 93.9% | +| `ky` | Kyrgyz | 92.3% | +| `lo` | Lao | 72.6% | +| `la` | Latin | 86.1% | +| `lv` | Latvian | 90.3% | +| `lt` | Lithuanian | 85.4% | +| `mk` | Macedonian | 95.3% | +| `mg` | Malagasy | 95.0% | +| `ms` | Malay | 91.2% | +| `ml` | Malayalam | 84.7% | +| `mr` | Marathi | 85.9% | +| `mn` | Mongolian | 94.3% | +| `ne` | Nepali | 84.9% | +| `no` | Norwegian | 93.6% | +| `or` | Oriya | 60.0% | +| `ps` | Pashto | 72.0% | +| `fa` | Persian | 82.3% | +| `pl` | Polish | 91.4% | +| `pt` | Portuguese | 86.1% | 
+| `pa` | Punjabi | 76.5% | +| `ro` | Romanian | 86.9% | +| `ru` | Russian | 88.8% | +| `sa` | Sanskrit | 78.8% | +| `gd` | Scottish Gaelic | 92.5% | +| `sr` | Serbian | 94.1% | +| `sd` | Sindhi | 87.3% | +| `si` | Sinhala | 85.4% | +| `sk` | Slovak | 90.4% | +| `sl` | Slovenian | 91.4% | +| `so` | Somali | 97.9% | +| `es` | Spanish | 90.7% | +| `su` | Sundanese | 92.6% | +| `sw` | Swahili | 93.5% | +| `sv` | Swedish | 91.4% | +| `ta` | Tamil | 89.9% | +| `te` | Telugu | 79.2% | +| `th` | Thai | 76.4% | +| `tr` | Turkish | 85.4% | +| `uk` | Ukrainian | 92.1% | +| `ur` | Urdu | 68.7% | +| `ug` | Uyghur | 70.2% | +| `uz` | Uzbek | 88.9% | +| `vi` | Vietnamese | 73.2% | +| `cy` | Welsh | 95.1% | +| `fy` | Western Frisian | 90.9% | +| `xh` | Xhosa | 90.3% | +| `yi` | Yiddish | 82.5% | diff --git a/static/images/corporate_text.png b/static/images/corporate_text.png new file mode 100644 index 0000000..e6c4f58 Binary files /dev/null and b/static/images/corporate_text.png differ diff --git a/static/images/excerpt_text.png b/static/images/excerpt_text.png index f29223d..c9f266f 100644 Binary files a/static/images/excerpt_text.png and b/static/images/excerpt_text.png differ diff --git a/static/images/form_text.png b/static/images/form_text.png new file mode 100644 index 0000000..30231f7 Binary files /dev/null and b/static/images/form_text.png differ diff --git a/static/images/handwritten_text.png b/static/images/handwritten_text.png new file mode 100644 index 0000000..a566c6d Binary files /dev/null and b/static/images/handwritten_text.png differ diff --git a/static/images/newspaper_text.png b/static/images/newspaper_text.png new file mode 100644 index 0000000..5e2cd47 Binary files /dev/null and b/static/images/newspaper_text.png differ diff --git a/static/images/olmocr_size_chart.png b/static/images/olmocr_size_chart.png index 0552e23..2e3868d 100644 Binary files a/static/images/olmocr_size_chart.png and b/static/images/olmocr_size_chart.png differ diff --git 
a/static/images/textbook_text.png b/static/images/textbook_text.png new file mode 100644 index 0000000..82a6ea7 Binary files /dev/null and b/static/images/textbook_text.png differ diff --git a/surya/common/blank.py b/surya/common/blank.py new file mode 100644 index 0000000..d11c721 --- /dev/null +++ b/surya/common/blank.py @@ -0,0 +1,64 @@ +"""Pixel-content heuristics for detecting blank or near-uniform image regions. + +Used by both the layout predictor (drop hallucinated layout blocks over empty +space) and the recognition predictor (drop hallucinated text blocks from +full-page OCR, decide whether an empty full-page output is a correct blank-page +read or a failure). + +Two signals, combined: + * near-white fraction — most pixels have every RGB channel above a threshold + * per-channel standard deviation — the region is essentially one color + (catches uniform-color fills that the white check misses) +""" + +from __future__ import annotations + +import numpy as np +from PIL import Image + + +# Per-channel value at/above which a pixel is considered "near-white". +# Tolerates small PDF-render noise (noted at 96 DPI; IMAGE_DPI now defaults to 192). +BLANK_WHITE_THRESHOLD = 245 +# Fraction of pixels that must be near-white for a region to count as blank. +BLANK_PIXEL_FRACTION = 0.99 +# Per-channel std (max over R, G, B) below which a region is "essentially one +# color" regardless of what that color is (solid-fill rectangles, dark banners, etc.). 
+UNIFORM_COLOR_STD = 8.0 + + +def near_white_fraction( + image: Image.Image, white_threshold: int = BLANK_WHITE_THRESHOLD +) -> float: + """Fraction of pixels where every RGB channel ≥ ``white_threshold``.""" + arr = np.asarray(image.convert("RGB")) + if arr.size == 0: + return 0.0 + return float(np.all(arr >= white_threshold, axis=-1).mean()) + + +def is_blank_region( + image: Image.Image, + *, + white_threshold: int = BLANK_WHITE_THRESHOLD, + blank_pixel_fraction: float = BLANK_PIXEL_FRACTION, + uniform_color_std: float = UNIFORM_COLOR_STD, +) -> bool: + """True iff the image is essentially blank — either mostly near-white or + near-uniform color. Use this on a per-block crop or a whole page. + + Returns False for empty (0-pixel) crops so callers don't accidentally + treat a degenerate bbox as blank. + """ + arr = np.asarray(image.convert("RGB")) + if arr.size == 0: + return False + if np.all(arr >= white_threshold, axis=-1).mean() > blank_pixel_fraction: + return True + # Per-channel std — a uniform solid color (e.g., red banner with RGB=(200,50,50)) + # has each channel constant across pixels, but mixing channels inflates the + # aggregate std. Check each channel independently. 
+ per_channel_std = arr.reshape(-1, arr.shape[-1]).std(axis=0) + if float(per_channel_std.max()) < uniform_color_std: + return True + return False diff --git a/surya/layout/__init__.py b/surya/layout/__init__.py index cf11b57..d9decb7 100644 --- a/surya/layout/__init__.py +++ b/surya/layout/__init__.py @@ -4,11 +4,12 @@ from typing import List, Optional from PIL import Image +from surya.common.blank import is_blank_region from surya.inference import SuryaInferenceManager, get_default_manager from surya.inference.parsers import denorm_bbox, parse_layout from surya.inference.prompts import LAYOUT_JSON_SCHEMA, PROMPT_TYPE_LAYOUT from surya.inference.schema import BatchInputItem -from surya.layout.label import LAYOUT_PRED_RELABEL +from surya.layout.label import LAYOUT_PRED_RELABEL, TEXT_LABELS from surya.layout.schema import LayoutBox, LayoutResult from surya.logging import get_logger from surya.settings import settings @@ -94,20 +95,40 @@ class LayoutPredictor: continue confidence = out.mean_token_prob if out.mean_token_prob is not None else 1.0 + img_w, img_h = img.size boxes: List[LayoutBox] = [] - for idx, blk in enumerate(parsed): - pixel_bbox = denorm_bbox(blk.bbox, w, h, scale=settings.BBOX_SCALE) + dropped_blank = 0 + for blk in parsed: canon = LAYOUT_PRED_RELABEL.get(blk.label, blk.label) + # Drop text-labeled blocks the model hallucinated over an + # essentially-blank region (mostly white OR near-uniform + # color). Visual blocks (Picture / Figure / Table / etc.) + # are allowed to be uniform — that's normal content. 
+ if canon in TEXT_LABELS: + img_bbox = denorm_bbox( + blk.bbox, img_w, img_h, scale=settings.BBOX_SCALE + ) + x0, y0, x1, y1 = (max(0, int(v)) for v in img_bbox) + if x1 > x0 and y1 > y0: + if is_blank_region(img.crop((x0, y0, x1, y1))): + dropped_blank += 1 + continue + pixel_bbox = denorm_bbox(blk.bbox, w, h, scale=settings.BBOX_SCALE) boxes.append( LayoutBox( polygon=list(pixel_bbox), label=canon, raw_label=blk.label, - position=idx, + position=len(boxes), count=blk.count, confidence=confidence, ) ) + if dropped_blank: + logger.info( + f"dropped {dropped_blank} text-labeled layout block(s) over " + f"blank/uniform regions" + ) results.append( LayoutResult( bboxes=boxes, image_bbox=page_bbox, raw=out.raw, error=False diff --git a/surya/layout/label.py b/surya/layout/label.py index 7608e51..bb4c6e3 100644 --- a/surya/layout/label.py +++ b/surya/layout/label.py @@ -1,6 +1,24 @@ """Surya2 layout labels emitted by the model + canonicalization to surya's public label vocabulary.""" +# Canonical text-bearing labels — used by blank-region filters to decide +# which blocks may be dropped when their underlying image region is empty. +# Excludes Picture/Figure/Diagram/Table/Form/Equation/etc., which can legitimately +# contain whitespace or solid fills. +TEXT_LABELS = frozenset( + { + "Text", + "SectionHeader", + "PageHeader", + "PageFooter", + "Caption", + "Footnote", + "Code", + "Bibliography", + } +) + + # Canonicalize raw model labels to public surya label names. Marker and other # downstream consumers depend on these names. 
LAYOUT_PRED_RELABEL = { diff --git a/surya/recognition/__init__.py b/surya/recognition/__init__.py index a925308..468a7d9 100644 --- a/surya/recognition/__init__.py +++ b/surya/recognition/__init__.py @@ -10,6 +10,7 @@ from typing import List, Optional from PIL import Image +from surya.common.blank import is_blank_region from surya.inference import SuryaInferenceManager, get_default_manager from surya.inference.parsers import clean_block_html, parse_full_page_html from surya.inference.prompts import ( @@ -19,7 +20,7 @@ from surya.inference.prompts import ( ) from surya.inference.schema import BatchInputItem from surya.inference.util import image_token_budget -from surya.layout.label import LAYOUT_PRED_RELABEL +from surya.layout.label import LAYOUT_PRED_RELABEL, TEXT_LABELS from surya.layout.schema import LayoutResult from surya.logging import get_logger from surya.recognition.schema import ( @@ -48,6 +49,65 @@ def _crop_block(image: Image.Image, polygon, pad: int = 4) -> Image.Image: return image.crop((x0, y0, x1, y1)) +def _drop_blank_text_blocks( + image: Image.Image, + blocks: List[BlockOCRResult], +) -> List[BlockOCRResult]: + """Drop text-labeled blocks whose source page region is essentially blank. + + Full-page OCR can emit text divs for regions that are visually empty + (margins, gutter space) — the model hallucinates a paragraph where there + is none. We crop the region and drop the block when ``is_blank_region`` + reports it blank (mostly near-white or near-uniform color). Only text-like labels + (see ``TEXT_LABELS``) are eligible: tables, forms, equations, and visual + blocks may legitimately contain large whitespace and are left untouched. 
+ """ + kept: List[BlockOCRResult] = [] + dropped = 0 + for blk in blocks: + if blk.label not in TEXT_LABELS or blk.skipped or blk.error: + kept.append(blk) + continue + crop = _crop_block(image, blk.polygon) + if not is_blank_region(crop): + kept.append(blk) + continue + dropped += 1 + if dropped: + logger.info(f"dropped {dropped} blank text block(s) from full-page OCR") + return kept + + +def _detect_repeat_loop( + text: str, + base_max_repeats: int = 4, + window_size: int = 500, + scaling_factor: float = 3.0, +) -> bool: + """True iff the tail of ``text`` ends in a repeating sequence. + + Ported from chandra's detect_repeat_token. For each candidate length + 1..window_size/2, takes that many trailing chars and counts consecutive + identical preceding blocks. Shorter loops need many repeats to count; + longer ones only need a few. Catches the typical decoder failure mode + where a page output gets stuck emitting the same div / phrase until it + hits max_tokens. + """ + if not text: + return False + for seq_len in range(1, window_size // 2 + 1): + candidate = text[-seq_len:] + max_repeats = int(base_max_repeats * (1 + scaling_factor / seq_len)) + repeats = 0 + pos = len(text) - seq_len + while pos >= 0 and text[pos : pos + seq_len] == candidate: + repeats += 1 + pos -= seq_len + if repeats > max_repeats: + return True + return False + + class RecognitionPredictor: """Per-block OCR. Construct with a SuryaInferenceManager (or rely on default).""" @@ -94,11 +154,12 @@ class RecognitionPredictor: full_page = layout_results is None if full_page: if layout_results is not None: - logger.warning( + logger.info( "RecognitionPredictor called with full_page=True and " - "layout_results; layout_results will be ignored." + "layout_results; layout will be used as fallback if the " + "full-page output devolves into a repetition loop." 
) - return self._full_page_ocr(images) + return self._full_page_ocr(images, fallback_layout=layout_results) if layout_results is None: raise ValueError("layout_results required when full_page=False") if len(images) != len(layout_results): @@ -196,8 +257,19 @@ class RecognitionPredictor: ) return results - def _full_page_ocr(self, images: List[Image.Image]) -> List[PageOCRResult]: - """One HIGH_ACCURACY_BBOX_PROMPT request per page; parses divs into blocks.""" + def _full_page_ocr( + self, + images: List[Image.Image], + fallback_layout: Optional[List[LayoutResult]] = None, + ) -> List[PageOCRResult]: + """One HIGH_ACCURACY_BBOX_PROMPT request per page; parses divs into blocks. + + On per-page failure (parse error, empty output, or a detected + repetition loop in the decoder output), falls back to layout + + block-mode OCR for that page only. ``fallback_layout``, if given, + provides per-page LayoutResults to use on fallback; otherwise the + LayoutPredictor is invoked lazily for just the affected pages. + """ manager = self.manager or get_default_manager() batch = [ BatchInputItem( @@ -211,19 +283,44 @@ class RecognitionPredictor: outputs = manager.generate(batch) out_by_page = {o.metadata["page_idx"]: o for o in outputs} - results: List[PageOCRResult] = [] + results: List[Optional[PageOCRResult]] = [None] * len(images) + needs_fallback: List[int] = [] for page_idx, img in enumerate(images): w, h = img.size page_bbox = [0, 0, float(w), float(h)] out = out_by_page.get(page_idx) - if out is None or out.error or not out.raw: - results.append(PageOCRResult(blocks=[], image_bbox=page_bbox)) + if out is None or out.error: + # Hard failure (request lost / server error). Always fallback. + needs_fallback.append(page_idx) + continue + if not out.raw: + # Empty model output. If the page is genuinely blank, the + # model is correct — return an empty result. Only fall back + # when the page has content the model failed to emit. 
+ if is_blank_region(img): + results[page_idx] = PageOCRResult(blocks=[], image_bbox=page_bbox) + else: + logger.info( + f"empty full-page output for non-blank page {page_idx}; " + f"falling back to layout + block OCR" + ) + needs_fallback.append(page_idx) + continue + if _detect_repeat_loop(out.raw): + logger.info( + f"full-page output for page {page_idx} appears to loop; " + f"falling back to layout + block OCR" + ) + needs_fallback.append(page_idx) continue try: parsed = parse_full_page_html(out.raw) except Exception as e: - logger.warning(f"Full-page parse failed for page {page_idx}: {e}") - results.append(PageOCRResult(blocks=[], image_bbox=page_bbox)) + logger.warning( + f"Full-page parse failed for page {page_idx}: {e}; " + f"falling back to layout + block OCR" + ) + needs_fallback.append(page_idx) continue confidence = out.mean_token_prob if out.mean_token_prob is not None else 1.0 blocks: List[BlockOCRResult] = [] @@ -247,5 +344,33 @@ class RecognitionPredictor: confidence=confidence, ) ) - results.append(PageOCRResult(blocks=blocks, image_bbox=page_bbox)) - return results + blocks = _drop_blank_text_blocks(img, blocks) + results[page_idx] = PageOCRResult(blocks=blocks, image_bbox=page_bbox) + + # Block-mode fallback for any pages whose full-page output failed or looped. + if needs_fallback: + fb_images = [images[i] for i in needs_fallback] + if fallback_layout is not None: + fb_layouts = [fallback_layout[i] for i in needs_fallback] + else: + # Lazy import to avoid the surya.layout ↔ surya.recognition cycle. + from surya.layout import LayoutPredictor + + logger.info( + f"running layout for {len(fb_images)} page(s) requiring " + f"block-mode fallback" + ) + fb_layouts = LayoutPredictor(self.manager)(fb_images) + fb_results = self.__call__(fb_images, fb_layouts, full_page=False) + for fb_idx, page_idx in enumerate(needs_fallback): + results[page_idx] = fb_results[fb_idx] + + # Backfill any still-None pages with empty results (defensive — shouldn't happen). 
+ out_results: List[PageOCRResult] = [] + for page_idx, img in enumerate(images): + r = results[page_idx] + if r is None: + w, h = img.size + r = PageOCRResult(blocks=[], image_bbox=[0, 0, float(w), float(h)]) + out_results.append(r) + return out_results diff --git a/surya/scripts/streamlit_app.py b/surya/scripts/streamlit_app.py index eecc2ac..3c4bc8e 100644 --- a/surya/scripts/streamlit_app.py +++ b/surya/scripts/streamlit_app.py @@ -275,11 +275,11 @@ else: pil_image = Image.open(in_file).convert("RGB") page_number = None +run_full_page_ocr = st.sidebar.button("Run Full-Page OCR") run_text_det = st.sidebar.button("Run Text Detection") run_layout = st.sidebar.button("Run Layout Analysis") -run_block_ocr = st.sidebar.button("Run Block OCR") -run_full_page_ocr = st.sidebar.button("Run Full-Page OCR") run_table_rec = st.sidebar.button("Run Table Rec") +run_block_ocr = st.sidebar.button("Run Block OCR") run_ocr_errors = st.sidebar.button("Run bad-PDF-text detection") table_mode = st.sidebar.radio( diff --git a/surya/settings.py b/surya/settings.py index f0298b4..d74e7fa 100644 --- a/surya/settings.py +++ b/surya/settings.py @@ -12,7 +12,7 @@ from platformdirs import user_cache_dir class Settings(BaseSettings): # General TORCH_DEVICE: Optional[str] = None - IMAGE_DPI: int = 96 # used for layout, recognition, and table rec + IMAGE_DPI: int = 192 # used for layout, recognition, and table rec IMAGE_DPI_HIGHRES: int = 192 IN_STREAMLIT: bool = False DISABLE_TQDM: bool = False @@ -77,7 +77,10 @@ class Settings(BaseSettings): SURYA_MAX_TOKENS_TABLE_REC: int = 3072 SURYA_MAX_TOKENS_BLOCK_CEILING: int = 8192 SURYA_MAX_TOKENS_FULL_PAGE: int = ( - 8192 # fallback path needs room for full-page HTML + # 12288 (vs 8192) buys +1pp overall on olmOCR-bench and +7.24pp on + # long_tiny_text — dense pages were truncating at 8k. Total budget + # fits within VLLM_MAX_MODEL_LEN=18000 after image prefill. + 12288 ) # When a layout request fails to produce parseable JSON, fall back to