DoctorWho
DoctorWho

Reputation: 1116

Jetpack Compose recomposition vs iOS Vision for OCR is not working

I'm trying to use Compose Multiplatform and create an app that uses ML Kit for Android and Vision for iOS to read words shot by the camera. On Android everything works fine. On iOS, however, there is a huge problem: The performance is terrible and the app does not respond. In simple words, I open the rear camera and try to read the words but for some reason I can't find the right thread in which to do the heavy lifting because the app fails (BAD REQUEST) or because the UI doesn't update even if On the Log side I see the words recognised

import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.runtime.Composable
import androidx.compose.runtime.DisposableEffect
import androidx.compose.runtime.LaunchedEffect
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.remember
import androidx.compose.runtime.setValue
import androidx.compose.ui.Modifier
import androidx.compose.ui.interop.UIKitView
import kotlinx.cinterop.ExperimentalForeignApi
import kotlinx.cinterop.useContents
import platform.AVFoundation.AVAuthorizationStatusAuthorized
import platform.AVFoundation.AVAuthorizationStatusDenied
import platform.AVFoundation.AVAuthorizationStatusNotDetermined
import platform.AVFoundation.AVAuthorizationStatusRestricted
import platform.AVFoundation.AVCaptureConnection
import platform.AVFoundation.AVCaptureDevice
import platform.AVFoundation.AVCaptureDeviceInput
import platform.AVFoundation.AVCaptureInput
import platform.AVFoundation.AVCaptureOutput
import platform.AVFoundation.AVCaptureSession
import platform.AVFoundation.AVCaptureSessionPresetPhoto
import platform.AVFoundation.AVCaptureVideoDataOutput
import platform.AVFoundation.AVCaptureVideoDataOutputSampleBufferDelegateProtocol
import platform.AVFoundation.AVCaptureVideoPreviewLayer
import platform.AVFoundation.AVLayerVideoGravityResizeAspectFill
import platform.AVFoundation.AVMediaTypeVideo
import platform.AVFoundation.authorizationStatusForMediaType
import platform.AVFoundation.requestAccessForMediaType
import platform.CoreGraphics.CGRectMake
import platform.CoreMedia.CMSampleBufferGetImageBuffer
import platform.CoreMedia.CMSampleBufferRef
import platform.CoreVideo.kCVPixelBufferPixelFormatTypeKey
import platform.CoreVideo.kCVPixelFormatType_32BGRA
import platform.UIKit.UIScreen
import platform.UIKit.UIView
import platform.Vision.VNImageRequestHandler
import platform.Vision.VNRecognizeTextRequest
import platform.Vision.VNRecognizedText
import platform.Vision.VNRecognizedTextObservation
import platform.darwin.NSObject
import platform.darwin.dispatch_async
import platform.darwin.dispatch_get_main_queue

@OptIn(ExperimentalForeignApi::class)
actual class CameraPermissionManager {

    private lateinit var captureSession: AVCaptureSession
    private lateinit var textRecognitionRequest: VNRecognizeTextRequest
    private lateinit var videoLayer: AVCaptureVideoPreviewLayer


    @Composable
    actual fun RequestCameraPermission(
        onPermissionGranted: @Composable () -> Unit,
        onPermissionDenied: @Composable () -> Unit
    ) {
        var hasPermission by remember { mutableStateOf(false) }
        var permissionRequested by remember { mutableStateOf(false) }

        LaunchedEffect(Unit) {
            val status = AVCaptureDevice.authorizationStatusForMediaType(AVMediaTypeVideo)
            when (status) {
                AVAuthorizationStatusAuthorized -> {
                    hasPermission = true
                }

                AVAuthorizationStatusNotDetermined -> {
                    AVCaptureDevice.requestAccessForMediaType(AVMediaTypeVideo) { granted ->
                        dispatch_async(dispatch_get_main_queue()) {
                            hasPermission = granted
                            permissionRequested = true
                        }
                    }
                }

                AVAuthorizationStatusDenied, AVAuthorizationStatusRestricted -> {
                    hasPermission = false
                }

                else -> {}
            }
        }

        if (hasPermission) {
            onPermissionGranted()
        } else if (permissionRequested) {
            onPermissionDenied()
        }
    }

    @Composable
    actual fun StartCameraPreview(onTextDetected: (String) -> Unit) {
        val screenWidth = UIScreen.mainScreen.bounds.useContents { size.width }
        val screenHeight = UIScreen.mainScreen.bounds.useContents { size.height }
        val previewView = remember {
            UIView(frame = CGRectMake(0.0, 0.0, screenWidth, screenHeight))
        }

        LaunchedEffect(Unit) {
            // Setup AVCaptureSession
            captureSession = AVCaptureSession().apply {
                sessionPreset = AVCaptureSessionPresetPhoto
            }

            val device = AVCaptureDevice.defaultDeviceWithMediaType(AVMediaTypeVideo)
            val input = device?.let { AVCaptureDeviceInput.deviceInputWithDevice(it, null) }
            captureSession.addInput(input as AVCaptureInput)

            videoLayer = AVCaptureVideoPreviewLayer(session = captureSession).apply {
                this.videoGravity = AVLayerVideoGravityResizeAspectFill
                this.frame = previewView.bounds
            }
            previewView.layer.addSublayer(videoLayer)

            // Setup Vision Text Recognition
            textRecognitionRequest = VNRecognizeTextRequest { request, _ ->
                val observations = request?.results?.filterIsInstance<VNRecognizedTextObservation>()
                observations?.forEach { observation ->
                    val topCandidate = observation.topCandidates(1u).firstOrNull()?.toString() ?: ""
                    onTextDetected(topCandidate)
                }
            }

            val videoOutput = AVCaptureVideoDataOutput().apply {
                videoSettings = mapOf(kCVPixelBufferPixelFormatTypeKey to kCVPixelFormatType_32BGRA)
                setSampleBufferDelegate(
                    createSampleBufferDelegate(onTextDetected),
                    dispatch_get_main_queue()
                )
            }

            captureSession.addOutput(videoOutput)
            captureSession.startRunning()
        }

        DisposableEffect(Unit) {
            onDispose {
                captureSession.stopRunning()
            }
        }

        UIKitView(
            factory = {
                previewView
            },
            modifier = Modifier.fillMaxSize()
        )
    }

    private fun createSampleBufferDelegate(onTextDetected: (String) -> Unit): AVCaptureVideoDataOutputSampleBufferDelegateProtocol {
        return object : NSObject(), AVCaptureVideoDataOutputSampleBufferDelegateProtocol {
            override fun captureOutput(
                output: AVCaptureOutput,
                didOutputSampleBuffer: CMSampleBufferRef?,
                fromConnection: AVCaptureConnection
            ) {
                if (didOutputSampleBuffer != null) {
                    processSampleBuffer(didOutputSampleBuffer, onTextDetected)
                }
            }
        }
    }

    private fun processSampleBuffer(
        sampleBuffer: CMSampleBufferRef,
        onTextDetected: (String) -> Unit
    ) {
        val pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)

        if (pixelBuffer == null) {
            println("CameraPermissionManager || Pixel buffer is null")
            return
        }

        val handler = VNImageRequestHandler(pixelBuffer, options = mapOf<Any?, Any?>())

        val request = VNRecognizeTextRequest { request, error ->
            if (error != null) {
                println("CameraPermissionManager || Error recognizing text: $error")
                return@VNRecognizeTextRequest
            }

            val observations = request?.results?.filterIsInstance<VNRecognizedTextObservation>()

            observations?.forEach { observation ->
                val topCandidates = observation.topCandidates(1u)
                topCandidates.firstOrNull()?.let { candidate ->
                    val recognizedText = candidate as? VNRecognizedText
                    recognizedText?.string?.let { text ->
                        onTextDetected(text)
                    } ?: run {
                        println("CameraPermissionManager || Recognized text is null")
                    }
                }
            }
        }

        try {
            handler.performRequests(listOf(request), null)
        } catch (e: Exception) {
            println("CameraPermissionManager || Error performing Vision request: $e")
        }
    }
}

What am I doing wrong? What should I change? Vision should be the best for iOS and yet as soon as I try to change the thread for the word recognition part everything freezes and nothing works. Do you have any working suggestions?

Upvotes: 1

Views: 56

Answers (0)

Related Questions