# vision-framework
Implement computer vision features including text recognition (OCR), face detection, barcode scanning, image segmentation, object tracking, and document scanning in iOS apps. Covers both the modern Swift-native Vision API (iOS 18+) and legacy VNRequest patterns, VisionKit DataScannerViewController for live camera scanning, and VNCoreMLRequest for custom model inference. Use when adding OCR, barcode scanning, face detection, or custom Core ML model inference with Vision.
```bash
npx skill4agent add dpearson2699/swift-ios-skills vision-framework
```

Detailed request catalogs live in `references/vision-requests.md` and `references/visionkit-scanner.md`.

## Modern vs. legacy API

| Aspect | Modern (iOS 18+) | Legacy |
|---|---|---|
| Pattern | `try await request.perform(on: image)` | `VNImageRequestHandler` + `perform([request])` |
| Request types | Swift types — structs and classes (`RecognizeTextRequest`) | ObjC classes (`VNRecognizeTextRequest`) |
| Concurrency | Native async/await | Completion handlers or synchronous |
| Observations | Typed return values | Cast `request.results` manually |
| Availability | iOS 18+ / macOS 15+ | iOS 11+ |
Modern requests conform to `ImageProcessingRequest` and run via `perform(on:orientation:)`, which accepts a `CGImage`, `CIImage`, `CVPixelBuffer`, `CMSampleBuffer`, `Data`, or `URL`. Video-oriented requests (`TrackObjectRequest`, `TrackRectangleRequest`, `DetectTrajectoriesRequest`) run frame by frame with `perform(on:)`.
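Because `perform` is overloaded over those input types, the same request runs on a file URL as easily as a decoded image. A minimal sketch (the file path is illustrative):

```swift
import Vision

// Same request type as below, but run straight from a file URL.
var request = RecognizeTextRequest()
request.recognitionLevel = .fast
let observations = try await request.perform(on: URL(fileURLWithPath: "/tmp/scan.jpg"))
```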
## Text recognition (OCR)

Modern API:

```swift
import Vision

func recognizeText(in image: CGImage) async throws -> [String] {
    var request = RecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.recognitionLanguages = [Locale.Language(identifier: "en-US")]
    let observations = try await request.perform(on: image)
    return observations.compactMap { observation in
        observation.topCandidates(1).first?.string
    }
}
```

Legacy API: build a `VNImageRequestHandler` and run requests through it:

```swift
import Vision
func recognizeTextLegacy(in image: CGImage) throws -> [String] {
    var recognized: [String] = []
    let request = VNRecognizeTextRequest { request, error in
        guard let observations = request.results as? [VNRecognizedTextObservation] else { return }
        recognized = observations.compactMap { $0.topCandidates(1).first?.string }
    }
    request.recognitionLevel = .accurate
    let handler = VNImageRequestHandler(cgImage: image)
    try handler.perform([request])
    return recognized
}
```

Configuration options:

```swift
var request = RecognizeTextRequest()
request.recognitionLevel = .accurate // .fast for real-time
request.recognitionLanguages = [
    Locale.Language(identifier: "en-US"),
    Locale.Language(identifier: "fr-FR"),
]
request.usesLanguageCorrection = true
request.customWords = ["SwiftUI", "Xcode"] // domain-specific terms
let observations = try await request.perform(on: cgImage)
for observation in observations {
    guard let candidate = observation.topCandidates(1).first else { continue }
    let text = candidate.string
    let confidence = candidate.confidence // 0.0 ... 1.0
    let bounds = observation.boundingBox // normalized coordinates
}
```

The legacy equivalent takes plain string identifiers:

```swift
let request = VNRecognizeTextRequest()
request.recognitionLevel = .accurate
request.recognitionLanguages = ["en-US", "fr-FR"]
request.usesLanguageCorrection = true
```

Note the difference: legacy APIs take language identifiers as strings, while modern requests take `Locale.Language` values. In both, `.accurate` favors quality and `.fast` favors real-time throughput.

## Face detection

```swift
// Modern API
let faceRequest = DetectFaceRectanglesRequest()
let faces = try await faceRequest.perform(on: cgImage)
for face in faces {
    let boundingBox = face.boundingBox // normalized CGRect
    let roll = face.roll // Measurement<UnitAngle>
    let yaw = face.yaw // Measurement<UnitAngle>
}
// Landmarks (eyes, nose, mouth contours)
let landmarkRequest = DetectFaceLandmarksRequest()
let landmarkFaces = try await landmarkRequest.perform(on: cgImage)
for face in landmarkFaces {
    let landmarks = face.landmarks
    let leftEye = landmarks?.leftEye?.normalizedPoints
    let nose = landmarks?.nose?.normalizedPoints
}
```

Vision returns normalized coordinates with a bottom-left origin; UIKit draws from the top-left. Scale to image size, then flip the y-axis:

```swift
func convertToUIKit(_ rect: CGRect, imageHeight: CGFloat) -> CGRect {
    CGRect(
        x: rect.origin.x,
        y: imageHeight - rect.origin.y - rect.height,
        width: rect.width,
        height: rect.height
    )
}
```
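A usage sketch with invented values; `VNImageRectForNormalizedRect(_:_:_:)` is Vision's helper for scaling a normalized rect to pixel coordinates, after which the flip above applies:

```swift
import Vision

// Illustrative values: a normalized bounding box as an observation would report it.
let normalized = CGRect(x: 0.1, y: 0.2, width: 0.3, height: 0.1)
let imageSize = CGSize(width: 1920, height: 1080) // assumed pixel size of the analyzed image

// Scale to pixel coordinates (origin still bottom-left)...
let pixelRect = VNImageRectForNormalizedRect(normalized, Int(imageSize.width), Int(imageSize.height))
// ...then flip into UIKit's top-left coordinate space.
let uikitRect = convertToUIKit(pixelRect, imageHeight: imageSize.height)
```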
## Barcode detection

```swift
var request = DetectBarcodesRequest()
request.symbologies = [.qr, .ean13, .code128, .pdf417]
let barcodes = try await request.perform(on: cgImage)
for barcode in barcodes {
    let payload = barcode.payloadString // decoded content
    let symbology = barcode.symbology // .qr, .ean13, etc.
    let bounds = barcode.boundingBox // normalized rect
}
```

Supported symbologies include `.qr`, `.aztec`, `.pdf417`, `.dataMatrix`, `.ean8`, `.ean13`, `.code39`, `.code128`, `.upce`, and `.itf14`.

## Document recognition

`RecognizeDocumentsRequest` returns observations whose `DocumentObservation.Container` gives structured access to the document:

```swift
let request = RecognizeDocumentsRequest()
let documents = try await request.perform(on: cgImage)
for observation in documents {
    let container = observation.document

    // Full text content
    let fullText = container.text

    // Structured access to paragraphs
    for paragraph in container.paragraphs {
        let paragraphText = paragraph.text
    }

    // Tables and lists
    for table in container.tables { /* structured table data */ }
    for list in container.lists { /* structured list data */ }

    // Embedded barcodes detected within the document
    for barcode in container.barcodes { /* barcode data */ }

    // Document title if detected
    if let title = container.title { print(title) }
}
```

For camera-based capture with a built-in scanning UI, use VisionKit's `VNDocumentCameraViewController` instead.

## Person segmentation

```swift
var request = GeneratePersonSegmentationRequest()
request.qualityLevel = .accurate // .balanced, .fast
let mask = try await request.perform(on: cgImage)
// mask is a PersonSegmentationObservation with a pixelBuffer property
let maskBuffer = mask.pixelBuffer
// Apply mask using Core Image: CIFilter.blendWithMask()
```

Legacy:

```swift
let request = VNGeneratePersonSegmentationRequest()
request.qualityLevel = .accurate // .balanced, .fast
request.outputPixelFormat = kCVPixelFormatType_OneComponent8
let handler = VNImageRequestHandler(cgImage: cgImage)
try handler.perform([request])
guard let mask = request.results?.first?.pixelBuffer else { return }
// Apply mask using Core Image: CIFilter.blendWithMask()
```

Quality levels: `.accurate`, `.balanced`, `.fast`.
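Both snippets defer the compositing step to Core Image. A minimal sketch of that step, assuming you already have the source image and the mask buffer (the helper name and scaling logic are illustrative):

```swift
import CoreImage
import CoreImage.CIFilterBuiltins

// Illustrative helper: composite the person over a new background using the mask.
func composite(image: CIImage, maskBuffer: CVPixelBuffer, background: CIImage) -> CIImage? {
    var mask = CIImage(cvPixelBuffer: maskBuffer)
    // The mask is usually smaller than the source; scale it to match.
    mask = mask.transformed(by: CGAffineTransform(
        scaleX: image.extent.width / mask.extent.width,
        y: image.extent.height / mask.extent.height))

    let filter = CIFilter.blendWithMask()
    filter.inputImage = image // foreground, kept where the mask is white
    filter.backgroundImage = background // shown where the mask is black
    filter.maskImage = mask
    return filter.outputImage
}
```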
## Person instance masks

Separate masks for individual people (up to four):

```swift
// Modern API (iOS 18+)
let request = GeneratePersonInstanceMaskRequest()
let observation = try await request.perform(on: cgImage)
let indices = observation.allInstances
for index in indices {
    let mask = try observation.generateMask(forInstances: IndexSet(integer: index))
    // mask is a CVPixelBuffer with only this person visible
}
```

```swift
// Legacy API (iOS 17+)
let request = VNGeneratePersonInstanceMaskRequest()
let handler = VNImageRequestHandler(cgImage: cgImage)
try handler.perform([request])
guard let result = request.results?.first else { return }
let indices = result.allInstances
for index in indices {
    let instanceMask = try result.generateMaskedImage(
        ofInstances: IndexSet(integer: index),
        from: handler,
        croppedToInstancesExtent: false
    )
}
```
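Following the modern call shapes above, a small sketch (hypothetical helper) that returns one mask covering every detected person by passing the full `allInstances` set:

```swift
import Vision

// Hypothetical helper: a single mask covering all detected people.
func maskAllPeople(in image: CGImage) async throws -> CVPixelBuffer {
    let request = GeneratePersonInstanceMaskRequest()
    let observation = try await request.perform(on: image)
    return try observation.generateMask(forInstances: observation.allInstances)
}
```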
## Object tracking

`TrackObjectRequest` is stateful (a `StatefulRequest` rather than a one-shot `ImageProcessingRequest`), so keep a single instance alive across frames; see `references/vision-requests.md` for details.

```swift
// Initialize with a detected object's bounding box
let initialObservation = DetectedObjectObservation(boundingBox: detectedRect)
var request = TrackObjectRequest(observation: initialObservation)
request.trackingLevel = .accurate
// For each video frame:
let results = try await request.perform(on: pixelBuffer)
if let tracked = results.first {
    let updatedBounds = tracked.boundingBox
    let confidence = tracked.confidence
}
```

Legacy tracking pairs `VNTrackObjectRequest` with a `VNSequenceRequestHandler`:

```swift
let trackRequest = VNTrackObjectRequest(detectedObjectObservation: initialObservation)
trackRequest.trackingLevel = .accurate
let sequenceHandler = VNSequenceRequestHandler()
// For each frame:
try sequenceHandler.perform([trackRequest], on: pixelBuffer)
if let result = trackRequest.results?.first {
    let updatedBounds = result.boundingBox
    trackRequest.inputObservation = result // feed the result forward for the next frame
}
```

## Other requests

All follow the same `perform` pattern; parameters and observation types are detailed in `references/vision-requests.md`.

| Request | Purpose |
|---|---|
| `ClassifyImageRequest` | Classify scene content (outdoor, food, animal, etc.) |
| `GenerateAttentionBasedSaliencyImageRequest` | Heat map of where viewers focus attention |
| `GenerateObjectnessBasedSaliencyImageRequest` | Heat map of object-like regions |
| `GenerateForegroundInstanceMaskRequest` | Foreground object segmentation (not person-specific) |
| `DetectRectanglesRequest` | Detect rectangular shapes (documents, cards, screens) |
| `DetectHorizonRequest` | Detect horizon angle for auto-leveling photos |
| `DetectHumanBodyPoseRequest` | Detect body joints (shoulders, elbows, knees) |
| `DetectHumanBodyPose3DRequest` | 3D human body pose estimation |
| `DetectHumanHandPoseRequest` | Detect hand joints and finger positions |
| `DetectAnimalBodyPoseRequest` | Detect animal body joint positions |
| `DetectFaceCaptureQualityRequest` | Face capture quality scoring (0–1) for photo selection |
| `TrackRectangleRequest` | Track rectangular objects across video frames |
| `TrackOpticalFlowRequest` | Optical flow between video frames |
| `DetectTrajectoriesRequest` | Detect object trajectories in video |
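As one worked example from the table, a minimal `ClassifyImageRequest` sketch (the 0.3 confidence cutoff is arbitrary):

```swift
import Vision

func classify(_ image: CGImage) async throws -> [(label: String, confidence: Float)] {
    let request = ClassifyImageRequest()
    let observations = try await request.perform(on: image)
    // Keep only reasonably confident labels; the threshold is illustrative.
    return observations
        .filter { $0.confidence > 0.3 }
        .map { ($0.identifier, $0.confidence) }
}
```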
## Custom Core ML models

```swift
// Modern API (iOS 18+)
let model = try MLModel(contentsOf: modelURL)
let container = try CoreMLModelContainer(model: model)
let request = CoreMLRequest(model: container)
let results = try await request.perform(on: cgImage)
// Classification model
if let classification = results.first as? ClassificationObservation {
    let label = classification.identifier
    let confidence = classification.confidence
}
```

```swift
// Legacy API
let model = try MLModel(contentsOf: modelURL)
let vnModel = try VNCoreMLModel(for: model)
let request = VNCoreMLRequest(model: vnModel) { request, error in
    guard let results = request.results as? [VNClassificationObservation] else { return }
    let topResult = results.first
}
let handler = VNImageRequestHandler(cgImage: cgImage)
try handler.perform([request])
```

Model conversion and training are covered by the companion `coreml` skill.
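One legacy setting worth knowing (an addition here, not shown in the snippet above): `imageCropAndScaleOption` controls how Vision fits the image to the model's fixed input size.

```swift
// .centerCrop is the default; .scaleFit letterboxes, .scaleFill stretches.
request.imageCropAndScaleOption = .scaleFill
```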
## Live scanning with DataScannerViewController

VisionKit's `DataScannerViewController` provides a ready-made camera UI for live text and barcode scanning; `references/visionkit-scanner.md` covers the full API.

```swift
import VisionKit

// Check availability (requires A12+ chip and camera)
guard DataScannerViewController.isSupported,
      DataScannerViewController.isAvailable else { return }
let scanner = DataScannerViewController(
    recognizedDataTypes: [
        .text(languages: ["en"]),
        .barcode(symbologies: [.qr, .ean13])
    ],
    qualityLevel: .balanced,
    recognizesMultipleItems: true,
    isHighFrameRateTrackingEnabled: true,
    isHighlightingEnabled: true
)
scanner.delegate = self
present(scanner, animated: true) {
    try? scanner.startScanning()
}
```

In SwiftUI, wrap `DataScannerViewController` in a `UIViewControllerRepresentable`; `references/visionkit-scanner.md` includes a complete wrapper.

## Gotchas

- Both `VNImageRequestHandler` results and modern `perform(on:)` results use normalized, bottom-left-origin coordinates; convert with `VNImageRectForNormalizedRect(_:_:_:)` plus a y-flip before drawing in UIKit.
- `.accurate` is markedly slower than `.fast`; prefer `.accurate` for still images and check each candidate's `confidence` before trusting results.
- Use `VNImageRequestHandler` for single images and `VNSequenceRequestHandler` for video streams.
- `DataScannerViewController` requires both `isSupported` (hardware) and `isAvailable`; `isAvailable` stays false until camera permission is granted.
- Prefer `.fast` quality for high-frame-rate scanning and `.accurate` when precision matters.
- `DataScannerViewController` needs an `NSCameraUsageDescription` entry in Info.plist.
- Reuse one `VNSequenceRequestHandler` across frames instead of recreating it per frame.

Full details: `references/vision-requests.md` and `references/visionkit-scanner.md`.
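One gap in the scanning example above: it assigns `scanner.delegate = self` without showing the conformance. A minimal `DataScannerViewControllerDelegate` sketch (the handling logic is illustrative):

```swift
import VisionKit

@MainActor
final class ScannerDelegate: DataScannerViewControllerDelegate {
    // Called when the user taps a highlighted item in the camera view.
    func dataScanner(_ dataScanner: DataScannerViewController, didTapOn item: RecognizedItem) {
        switch item {
        case .text(let text):
            print("Tapped text: \(text.transcript)")
        case .barcode(let barcode):
            print("Tapped barcode: \(barcode.payloadStringValue ?? "<binary payload>")")
        @unknown default:
            break
        }
    }

    // Called as new items are recognized in the frame.
    func dataScanner(_ dataScanner: DataScannerViewController,
                     didAdd addedItems: [RecognizedItem],
                     allItems: [RecognizedItem]) {
        // Update overlays or app state for addedItems here (illustrative).
    }
}
```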