surya/benchmark/utils/textract.py
Vik Paruchuri d5603693c0 Move dep
2025-02-07 10:48:55 -05:00

29 lines
1.0 KiB
Python

import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import traceback
from surya.input.processing import slice_bboxes_from_image
from surya.recognition import RecognitionPredictor
def textract_ocr(extractor, img):
    """Run text detection on a single image and return its line texts.

    Args:
        extractor: Object exposing ``detect_document_text(file_source=...)``
            whose result has a ``lines`` iterable of items with a ``.text``
            attribute (presumably a ``textractor.Textractor`` client — see
            ``textract_ocr_parallel``).
        img: The image handed to the extractor as ``file_source``.

    Returns:
        A list with the text of every detected line, or ``[None]`` if the
        extraction raised an error. Failures are best-effort: the traceback
        is printed and the placeholder result is returned so a batch run can
        continue with the remaining images.
    """
    try:
        document = extractor.detect_document_text(file_source=img)
        return [line.text for line in document.lines]
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate and can stop the run.
        traceback.print_exc()
        return [None]
def textract_ocr_parallel(imgs, cpus=None):
    """Run ``textract_ocr`` over many images using a thread pool.

    Args:
        imgs: Sized sequence of images; each one is passed to
            ``textract_ocr`` together with a shared extractor.
        cpus: Upper bound on the number of worker threads. When falsy
            (``None`` or ``0``), ``os.cpu_count()`` is used instead.

    Returns:
        A list with one entry per input image, in input order. Each entry is
        whatever ``textract_ocr`` returned for that image (a list of line
        texts, or ``[None]`` on failure). An empty ``imgs`` yields ``[]``.
    """
    # Nothing to do: avoid creating a client/predictor, and avoid
    # ThreadPoolExecutor(max_workers=0), which raises ValueError.
    if len(imgs) == 0:
        return []

    from textractor import Textractor  # Optional dependency

    extractor = Textractor(profile_name='default')

    if not cpus:
        # os.cpu_count() may return None when the count is undeterminable.
        cpus = os.cpu_count() or 1

    # The worker count is bounded by the number of images, the recognition
    # batch size, and the CPU limit, and is never allowed to drop below one.
    batch_size = RecognitionPredictor().get_batch_size()
    parallel_cores = max(1, min(len(imgs), batch_size, cpus))

    with ThreadPoolExecutor(max_workers=parallel_cores) as executor:
        results = executor.map(textract_ocr, [extractor] * len(imgs), imgs)
        # Wrapping the lazy map in tqdm advances the bar as results are
        # consumed; list() drains it while the pool is still alive.
        textract_text = list(
            tqdm(results, total=len(imgs), desc="Running textract OCR")
        )
    return textract_text