Loading...
Loading...
Process scanned documents and images containing Chuukese text using OCR with specialized post-processing for accent characters and traditional formatting. Use when working with scanned books, documents, or images that contain Chuukese text that needs to be digitized.
npx skill4agent add findinfinitelabs/chuuk document-ocr-processing
import pytesseract
from PIL import Image
import cv2
import numpy as np
class ChuukeseOCRProcessor:
    """Hold Tesseract settings and image preprocessing for Chuukese scans.

    Attributes:
        tesseract_config: Named Tesseract command-line option strings.
        ocr_corrections: Literal string replacements for common OCR
            misreads (detached accents and look-alike characters).
    """

    def __init__(self):
        """Set up the Tesseract option presets and the correction table."""
        # Configure Tesseract for multi-language support.
        # --oem 3 selects the default OCR engine mode; --psm 6 assumes a
        # single uniform block of text; --psm 1 is automatic page
        # segmentation with orientation/script detection.
        # NOTE(review): the whitelist contains quote and space characters;
        # pytesseract tokenizes the config string shell-style, so these may
        # not be parsed as intended -- confirm against the call site.
        self.tesseract_config = {
            'chuukese_optimized': (
                '--oem 3 --psm 6 -c tessedit_char_whitelist='
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz'
                'áéíóúāēīōū'
                '0123456789'
                '.,!?;:()-"\' '
            ),
            'multilingual': '--oem 3 --psm 6',
            'preserve_structure': '--oem 3 --psm 1',
        }
        # Chuukese character mappings for OCR corrections.
        # NOTE(review): nothing in this excerpt applies this table; the
        # digit -> letter entries would be destructive if applied blindly
        # to text that contains real numbers.
        self.ocr_corrections = {
            # Common OCR mistakes for accented characters
            'a´': 'á', 'a`': 'à', 'a¯': 'ā',
            'e´': 'é', 'e`': 'è', 'e¯': 'ē',
            'i´': 'í', 'i`': 'ì', 'i¯': 'ī',
            'o´': 'ó', 'o`': 'ò', 'o¯': 'ō',
            'u´': 'ú', 'u`': 'ù', 'u¯': 'ū',
            # Common character confusions
            '0': 'o', '1': 'l', '5': 's',
            'rn': 'm', 'cl': 'd', 'ck': 'ch',
        }

    def preprocess_image(self, image_path):
        """Preprocess an image file for better OCR accuracy.

        The image is converted to grayscale, median-filtered, contrast
        enhanced with CLAHE, and binarized with Otsu's threshold.

        Args:
            image_path: Path of the image file to load.

        Returns:
            The binarized single-channel image produced by
            ``cv2.threshold``.

        Raises:
            ValueError: If the image cannot be loaded from ``image_path``.
        """
        # cv2.imread signals failure by returning None instead of raising,
        # so check here to give the caller an actionable error.
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not read image: {image_path!r}")

        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Noise removal
        denoised = cv2.medianBlur(gray, 3)
        # Contrast enhancement
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)
        # Binarization (threshold value 0 is ignored; Otsu picks it)
        _, binary = cv2.threshold(
            enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        return binary


class ChuukeseOCRPostProcessor:
    """Apply regex-based cleanup to OCR output of Chuukese text.

    Attributes:
        dictionary: Word dictionary; starts out empty.
        error_patterns: Mapping of regex pattern to replacement string,
            applied in insertion order by ``correct_ocr_errors``.
    """

    def __init__(self, dictionary_path=None):
        """Create the post-processor.

        Args:
            dictionary_path: Optional dictionary path; when truthy it is
                passed to ``load_chuukese_dictionary``.
        """
        self.dictionary = {}
        if dictionary_path:
            # NOTE(review): load_chuukese_dictionary is not defined in this
            # excerpt -- confirm it exists on the full class.
            self.load_chuukese_dictionary(dictionary_path)
        # Common OCR error patterns for Chuukese
        self.error_patterns = {
            # Accent corrections: a vowel followed by a detached
            # apostrophe, backtick or acute mark becomes the acute vowel.
            r'a[\'\`\´]': 'á',
            r'e[\'\`\´]': 'é',
            r'i[\'\`\´]': 'í',
            r'o[\'\`\´]': 'ó',
            r'u[\'\`\´]': 'ú',
            # Common character substitutions
            r'\b0(?=[aeiou])': 'o',  # 0 at start of word -> o
            r'(?<=[aeiou])0\b': 'o',  # 0 at end after vowel -> o
            r'\brn(?=[aeiou])': 'm',  # rn -> m
        }

    def correct_ocr_errors(self, text: str) -> str:
        """Apply OCR error corrections specific to Chuukese.

        Args:
            text: Raw OCR output.

        Returns:
            ``text`` after every pattern in ``error_patterns`` has been
            substituted, in the dictionary's insertion order.
        """
        # Imported here because the file's top-level imports do not
        # include ``re``.
        import re

        corrected = text
        # Apply pattern-based corrections
        for pattern, replacement in self.error_patterns.items():
            corrected = re.sub(pattern, replacement, corrected)
        return corrected


# Initialize processor
# Usage example.
# NOTE(review): BatchOCRProcessor is not defined in this excerpt -- it is
# presumably provided elsewhere in the skill; confirm before running.
processor = BatchOCRProcessor("output/ocr_results")

# Process single document
result = processor.process_document("scanned_chuukese_dictionary.jpg")

# Access extracted text
extracted_text = result['extracted_text']
dictionary_entries = result['document_structure']['dictionary_entries']

# Process all images in a directory
batch_results = processor.process_batch(
    "scanned_documents/",
    file_patterns=['*.jpg', '*.png']
)
print(f"Processed {batch_results['successfully_processed']} documents")

# Dependencies: pytesseract, opencv-python, Pillow, numpy