Nikola Stankovic

Reputation: 71

ML Kit Text Recognition with Live Camera in iOS Swift

I am trying to implement text recognition on a live camera feed. I used Google's ML Kit example as a reference (https://github.com/googlesamples/mlkit/blob/master/ios/quickstarts/vision/VisionExample/CameraViewController.swift), but every time I try to start text recognition I get a blank white screen instead of the camera preview. I have already added the necessary camera permissions, so I don't believe that's the issue (I sketch the authorization check I have in mind after the code). Could anyone help me troubleshoot this problem? My code is below:

import UIKit
import SwiftUI
import AVFoundation
import MLKitTextRecognition
import MLKitVision
import FirebaseFirestore
import FirebaseCore

class ScannerViewController: UIViewController, AVCaptureVideoDataOutputSampleBufferDelegate {
    var codes: [String] = []
    var bgColor: Binding<Color>!
    var greenScans: Binding<Int>!
    var scannedCode: Binding<String?>!
    var audioPlayer: AVAudioPlayer?
    var invalidAudioPlayer: AVAudioPlayer?
    private var textRecognizer: TextRecognizer!
    private var scanBuffer: [String] = []
    private var previewLayer: AVCaptureVideoPreviewLayer!
    private lazy var captureSession = AVCaptureSession()
    private var isProcessingBuffer = false
    private var isProcessingFrame = false
    var isUsingFrontCamera = false
    private let sessionQueue = DispatchQueue(label: "sessionQueue")
    private var lastFrame: CMSampleBuffer?
    private var cameraView: UIView!
    
    private lazy var annotationOverlayView: UIView = {
        precondition(isViewLoaded)
        let annotationOverlayView = UIView(frame: .zero)
        annotationOverlayView.translatesAutoresizingMaskIntoConstraints = false
        return annotationOverlayView
      }()
    
    private lazy var previewOverlayView: UIImageView = {
        precondition(isViewLoaded)
        let previewOverlayView = UIImageView(frame: .zero)
        previewOverlayView.contentMode = UIView.ContentMode.scaleAspectFill
        previewOverlayView.translatesAutoresizingMaskIntoConstraints = false
        return previewOverlayView
      }()
    
    override func viewDidLoad() {
        super.viewDidLoad()
        prepareAudioPlayer()
        setupCameraView()
        previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
        previewLayer.videoGravity = .resizeAspectFill
        let latinOptions = TextRecognizerOptions()
        textRecognizer = TextRecognizer.textRecognizer(options: latinOptions)
        setUpCaptureSessionOutput()
        setUpCaptureSessionInput()

    }

    
    override func viewDidAppear(_ animated: Bool) {
        super.viewDidAppear(animated)
        startSession()
        previewLayer.frame = cameraView.bounds
      }
    
    private func setupCameraView() {
        cameraView = UIView()
        cameraView.translatesAutoresizingMaskIntoConstraints = false
        view.addSubview(cameraView)

        // Set up constraints to make cameraView fill the screen or position as needed
        NSLayoutConstraint.activate([
            cameraView.leadingAnchor.constraint(equalTo: view.leadingAnchor),
            cameraView.trailingAnchor.constraint(equalTo: view.trailingAnchor),
            cameraView.topAnchor.constraint(equalTo: view.topAnchor),
            cameraView.bottomAnchor.constraint(equalTo: view.bottomAnchor)
        ])

        // Initialize the previewLayer after cameraView has been created
        previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
        previewLayer.videoGravity = .resizeAspectFill
        previewLayer.frame = cameraView.bounds
        cameraView.layer.addSublayer(previewLayer)
    }
    
    override func viewDidLayoutSubviews() {
        super.viewDidLayoutSubviews()
        
        // Ensure that previewLayer's frame matches cameraView's bounds
        previewLayer.frame = cameraView.bounds
    }
    
    override func viewDidDisappear(_ animated: Bool) {
        super.viewDidDisappear(animated)

        stopSession()
      }

    
    
    private func setUpCaptureSessionOutput() {
          weak var weakSelf = self
          sessionQueue.async {
              guard let strongSelf = weakSelf else {
                  print("Self is nil!")
                  return
              }
              strongSelf.captureSession.beginConfiguration()
              strongSelf.captureSession.sessionPreset = .medium

              let output = AVCaptureVideoDataOutput()
              output.videoSettings = [
                  (kCVPixelBufferPixelFormatTypeKey as String): kCVPixelFormatType_32BGRA
              ]
              output.alwaysDiscardsLateVideoFrames = true
              let outputQueue = DispatchQueue(label: "videoDataOutputQueue")
              output.setSampleBufferDelegate(strongSelf, queue: outputQueue)
              guard strongSelf.captureSession.canAddOutput(output) else {
                  print("Failed to add capture session output.")
                  return
              }
              strongSelf.captureSession.addOutput(output)
              strongSelf.captureSession.commitConfiguration()
          }
      }

      private func setUpCaptureSessionInput() {
          weak var weakSelf = self
          sessionQueue.async {
              guard let strongSelf = weakSelf else {
                  print("Self is nil!")
                  return
              }
              let cameraPosition: AVCaptureDevice.Position = strongSelf.isUsingFrontCamera ? .front : .back
              guard let device = strongSelf.captureDevice(forPosition: cameraPosition) else {
                  print("Failed to get capture device for camera position: \(cameraPosition)")
                  return
              }
              do {
                  strongSelf.captureSession.beginConfiguration()
                  let currentInputs = strongSelf.captureSession.inputs
                  for input in currentInputs {
                      strongSelf.captureSession.removeInput(input)
                  }

                  let input = try AVCaptureDeviceInput(device: device)
                  guard strongSelf.captureSession.canAddInput(input) else {
                      print("Failed to add capture session input.")
                      return
                  }
                  strongSelf.captureSession.addInput(input)
                  strongSelf.captureSession.commitConfiguration()
              } catch {
                  print("Failed to create capture device input: \(error.localizedDescription)")
              }
          }
      }

      private func captureDevice(forPosition position: AVCaptureDevice.Position) -> AVCaptureDevice? {
          let devices = AVCaptureDevice.devices(for: .video)
          return devices.first(where: { $0.position == position })
      }

      private func startSession() {
          weak var weakSelf = self
          sessionQueue.async {
              guard let strongSelf = weakSelf else {
                  print("Self is nil!")
                  return
              }
              strongSelf.captureSession.startRunning()
          }
      }

      private func stopSession() {
          weak var weakSelf = self
          sessionQueue.async {
              guard let strongSelf = weakSelf else {
                  print("Self is nil!")
                  return
              }
              strongSelf.captureSession.stopRunning()
          }
      }
    
    func captureOutput(
        _ output: AVCaptureOutput,
        didOutput sampleBuffer: CMSampleBuffer,
        from connection: AVCaptureConnection
    ) {
        guard let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
            print("Failed to get image buffer from sample buffer.")
            return
        }

        lastFrame = sampleBuffer
        let visionImage = VisionImage(buffer: sampleBuffer)
        let orientation = UIUtilities.imageOrientation(
            fromDevicePosition: isUsingFrontCamera ? .front : .back
        )
        visionImage.orientation = orientation

        guard let inputImage = MLImage(sampleBuffer: sampleBuffer) else {
            print("Failed to create MLImage from sample buffer.")
            return
        }
        inputImage.orientation = orientation
        
        recognizeTextOnDevice(in: visionImage, width: CGFloat(CVPixelBufferGetWidth(imageBuffer)), height: CGFloat(CVPixelBufferGetHeight(imageBuffer)))

    }
    
    private func recognizeTextOnDevice(
       in image: VisionImage, width: CGFloat, height: CGFloat
     ) {
      
       var recognizedText: MLKitTextRecognition.Text?
       let latinOptions = TextRecognizerOptions()
       var detectionError: Error?
       do {
         recognizedText = try TextRecognizer.textRecognizer(options: latinOptions)
           .results(in: image)
       } catch let error {
         detectionError = error
       }
       weak var weakSelf = self
       DispatchQueue.main.sync {
         guard let strongSelf = weakSelf else {
           print("Self is nil!")
           return
         }
         strongSelf.updatePreviewOverlayViewWithLastFrame()
         if let detectionError = detectionError {
           print("Failed to recognize text with error: \(detectionError.localizedDescription).")
           return
         }
         guard let recognizedText = recognizedText else {
           print("Text recognition returned no results.")
           return
         }

         // Blocks.
         for block in recognizedText.blocks {
           let points = strongSelf.convertedPoints(
             from: block.cornerPoints, width: width, height: height)
           UIUtilities.addShape(
             withPoints: points,
             to: strongSelf.annotationOverlayView,
             color: UIColor.purple
           )

           // Lines.
           for line in block.lines {
             let points = strongSelf.convertedPoints(
               from: line.cornerPoints, width: width, height: height)
             UIUtilities.addShape(
               withPoints: points,
               to: strongSelf.annotationOverlayView,
               color: UIColor.orange
             )

             // Elements.
             for element in line.elements {
               let normalizedRect = CGRect(
                 x: element.frame.origin.x / width,
                 y: element.frame.origin.y / height,
                 width: element.frame.size.width / width,
                 height: element.frame.size.height / height
               )
               let convertedRect = strongSelf.previewLayer.layerRectConverted(
                 fromMetadataOutputRect: normalizedRect
               )
               UIUtilities.addRectangle(
                 convertedRect,
                 to: strongSelf.annotationOverlayView,
                 color: UIColor.green
               )
               let label = UILabel(frame: convertedRect)
               label.text = element.text
               label.adjustsFontSizeToFitWidth = true
               strongSelf.rotate(label, orientation: image.orientation)
               strongSelf.annotationOverlayView.addSubview(label)
             }
           }
         }
       }
     }
    
    private func removeDetectionAnnotations() {
        for annotationView in annotationOverlayView.subviews {
          annotationView.removeFromSuperview()
        }
      }
    
    private func rotate(_ view: UIView, orientation: UIImage.Orientation) {
        var degree: CGFloat = 0.0
        switch orientation {
        case .up, .upMirrored:
          degree = 90.0
        case .rightMirrored, .left:
          degree = 180.0
        case .down, .downMirrored:
          degree = 270.0
        case .leftMirrored, .right:
          degree = 0.0
        }
        view.transform = CGAffineTransform.init(rotationAngle: degree * 3.141592654 / 180)
      }
    private func updatePreviewOverlayViewWithLastFrame() {
       guard let lastFrame = lastFrame,
         let imageBuffer = CMSampleBufferGetImageBuffer(lastFrame)
       else {
         return
       }
       self.updatePreviewOverlayViewWithImageBuffer(imageBuffer)
       self.removeDetectionAnnotations()
     }
    
    private func updatePreviewOverlayViewWithImageBuffer(_ imageBuffer: CVImageBuffer?) {
        guard let imageBuffer = imageBuffer else {
          return
        }
        let orientation: UIImage.Orientation = isUsingFrontCamera ? .leftMirrored : .right
        let image = UIUtilities.createUIImage(from: imageBuffer, orientation: orientation)
        previewOverlayView.image = image
      }

    
    func processImage(_ image: UIImage) {
        let visionImage = VisionImage(image: image)

        textRecognizer.process(visionImage) { result, error in
            guard error == nil, let result = result else {
                print("Text recognition failed with error: \(error?.localizedDescription ?? "Unknown error")")
                return
            }
            self.processRecognizedText(result)
        }
    }
}
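
For reference, this is the authorization check I have in mind to rule permissions out. It is only a minimal sketch: startSessionIfAuthorized is an illustrative name and is not called anywhere in the code above.

import AVFoundation

/// Illustrative sketch: checks camera authorization before the capture session starts.
/// If access is denied or restricted, the session produces no frames even though
/// NSCameraUsageDescription is present in Info.plist.
func startSessionIfAuthorized(_ start: @escaping () -> Void) {
    switch AVCaptureDevice.authorizationStatus(for: .video) {
    case .authorized:
        start()
    case .notDetermined:
        AVCaptureDevice.requestAccess(for: .video) { granted in
            DispatchQueue.main.async {
                if granted {
                    start()
                } else {
                    print("Camera access was denied by the user.")
                }
            }
        }
    case .denied, .restricted:
        print("Camera access is not available.")
    @unknown default:
        break
    }
}

If this reports .authorized and the view is still white, I assume the problem is somewhere in my session or preview-layer setup rather than in permissions.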

Upvotes: 0

Views: 146

Answers (0)
