BH Lee

Reputation: 11

Swift AVAssetWriter Video Audio out of sync

I am creating a WebRTC streaming service using Swift.

I need the ability to record video while streaming, so I am saving the video using AVAssetWriter.

The problem is that if you leave the app on for a long time and continuously record, the audio and video are out of sync.

For example, a video recorded right after launch has the audio and video in sync, but if I leave the app running for a day and then record, the audio leads the video by 1-2 seconds: at the start the audio plays before the corresponding video appears, and at the end the video stops first while the audio continues for another 1-2 seconds before ending. The longer the app stays running, the larger the offset becomes. What am I doing wrong?

The video sample buffers come from the WebRTC library (v122.0.0): an RTCVideoFrame is received in the renderFrame method of the RTCVideoRenderer protocol and converted to a CMSampleBuffer. Since the library provides no way to obtain audio sample buffers, the audio is captured separately through the AVCaptureAudioDataOutputSampleBufferDelegate protocol, using the sample buffer delivered to func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection).

extension FrameRenderer: RTCVideoRenderer {
    
    func setSize(_ size: CGSize) {
        //        print("=================== \(#function) size: \(size) ===================")
    }
    
    func renderFrame(_ frame: RTCVideoFrame?) {
        guard let videoFrame = frame,
              let videoPixelBuffer = convertRTCVideoFrameToPixelBuffer(videoFrame),
              let videoBuffer = createSampleBuffer(pixelBuffer: videoPixelBuffer, frame: videoFrame)
        else {
            Log.debug("=================== \(#function) Failed to convert videoPixelBuffer or videoBuffer ===================")
            return
        }
        let snr = calculateSNR(videoBuffer)
        frameSubject.onNext(snr)
        
        switch captureStatus {
        case .idle:
            return
        case .start:
            Log.debug("recording start")
            setAudioCaptureSession()
            setAssetWriter(sampleBuffer: videoBuffer)
        case .capturing:
            appendSampleBuffer(sampleBuffer: videoBuffer, isVideo: true)
        case .end:
            Observable<Int>
                .just(1)
                .subscribe(onNext: { [weak self] _ in
                    guard let self = self else { return }
                    self.finishRecording()
                })
                .disposed(by: disposeBag)
        }
    }
    
    private func calculateSNR(_ sampleBuffer: CMSampleBuffer) -> Double {
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
            return 0
        }
        
        let attachments = pixelBuffer.attachments
        let propagated = attachments.propagated
        guard let metadata = propagated["MetadataDictionary"] as? [String: Any] else {
            print("There is no matadata from sampleBuffer")
            return 0
        }
        
        guard let snr = metadata["SNR"] as? Double else {
            print("can't cast snr's type")
            return 0
        }
        
        return snr
    }
    
    /// get pixel data from RTCVideoFrame
    func convertRTCVideoFrameToPixelBuffer(_ rtcVideoFrame: RTCVideoFrame) -> CVPixelBuffer? {
        guard let rtcCVPixelBuffer = rtcVideoFrame.buffer as? RTCCVPixelBuffer else {
            return nil
        }
        
        return rtcCVPixelBuffer.pixelBuffer
    }
    
    func createSampleBuffer(pixelBuffer: CVPixelBuffer, frame: RTCVideoFrame) -> CMSampleBuffer? {
        var sampleBuffer: CMSampleBuffer?
        
        let rtcTimeStamp = frame.timeStampNs
        let timescale: CMTimeScale = 1_000_000_000
        let presentationTimeStamp = CMTime(value: rtcTimeStamp, timescale: timescale)
        
        let duration = CMTime.zero
        
        var timingInfo = CMSampleTimingInfo(duration: duration,
                                            presentationTimeStamp: presentationTimeStamp,
                                            decodeTimeStamp: presentationTimeStamp)
        
        var videoInfo: CMVideoFormatDescription?
        CMVideoFormatDescriptionCreateForImageBuffer(allocator: kCFAllocatorDefault,
                                                     imageBuffer: pixelBuffer,
                                                     formatDescriptionOut: &videoInfo)
        CMSampleBufferCreateReadyWithImageBuffer(allocator: kCFAllocatorDefault,
                                                 imageBuffer: pixelBuffer,
                                                 formatDescription: videoInfo!,
                                                 sampleTiming: &timingInfo,
                                                 sampleBufferOut: &sampleBuffer)
        return sampleBuffer
    }
}
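
For scale: frame.timeStampNs is a nanosecond value, so a timestamp of about 607_504_618_464_000 ns becomes CMTime(value: 607504618464000, timescale: 1_000_000_000), i.e. roughly 607504.618464 seconds, which is exactly the form of the Video Buffer values in the logs at the end of this question.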

// start
private func setAudioCaptureSession() {
    if self.captureSession == nil {
        guard let audioDevice = AVCaptureDevice.default(for: .audio) else { return }
        
        self.captureSession = AVCaptureSession()
        self.captureSession?.beginConfiguration()
        
        let audioOutput = AVCaptureAudioDataOutput()
        do {
            let audioInput = try AVCaptureDeviceInput(device: audioDevice)
            if self.captureSession!.canAddInput(audioInput) {
                self.captureSession?.addInput(audioInput)
            }
        } catch {
            ErrorReportManager.shared.log(
                message: "\(self)_\(#function): \(DGError.DGInternalError.audioInputCreateFail.messageDescription) \n reason: \(error.localizedDescription)"
            )
        }
        guard self.captureSession!.canAddOutput(audioOutput) else { return }
        self.captureSession?.addOutput(audioOutput)
        audioOutput.setSampleBufferDelegate(self, queue: audioQueue)
        
        self.captureSession?.commitConfiguration()
    }
    
    if self.captureSession?.isRunning == false {
        self.captureSession?.startRunning()
    }
}
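
setAssetWriter(sampleBuffer:), which is called in the .start case of renderFrame, is not shown above; it is essentially a standard writer setup along the lines of the sketch below. The file type, output settings, dimensions, and the point where captureStatus switches to .capturing are stand-ins here, not the exact code:

// Sketch of setAssetWriter(sampleBuffer:) — settings below are assumptions, not the exact code.
private func setAssetWriter(sampleBuffer: CMSampleBuffer) {
    do {
        // Assumed: fileURL is prepared elsewhere (it is force-unwrapped in finishRecording()).
        let writer = try AVAssetWriter(outputURL: fileURL!, fileType: .mp4)
        
        // Video input sized from the incoming pixel buffer (assumed H.264 settings).
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
        let videoSettings: [String: Any] = [
            AVVideoCodecKey: AVVideoCodecType.h264,
            AVVideoWidthKey: CVPixelBufferGetWidth(pixelBuffer),
            AVVideoHeightKey: CVPixelBufferGetHeight(pixelBuffer)
        ]
        let videoInput = AVAssetWriterInput(mediaType: .video, outputSettings: videoSettings)
        videoInput.expectsMediaDataInRealTime = true
        
        // Audio input (assumed AAC settings).
        let audioSettings: [String: Any] = [
            AVFormatIDKey: kAudioFormatMPEG4AAC,
            AVNumberOfChannelsKey: 1,
            AVSampleRateKey: 44_100
        ]
        let audioInput = AVAssetWriterInput(mediaType: .audio, outputSettings: audioSettings)
        audioInput.expectsMediaDataInRealTime = true
        
        if writer.canAdd(videoInput) { writer.add(videoInput) }
        if writer.canAdd(audioInput) { writer.add(audioInput) }
        
        // startSession(atSourceTime:) is deliberately not called here;
        // it happens on the first audio buffer in appendSampleBuffer below.
        writer.startWriting()
        
        assetWriter = writer
        assetWriterVideoInput = videoInput
        assetWriterAudioInput = audioInput
        captureStatus = .capturing
    } catch {
        Log.debug("Failed to create AVAssetWriter: \(error)")
    }
}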

// capturing
private func appendSampleBuffer(sampleBuffer: CMSampleBuffer, isVideo: Bool) {
    videoQueue.async {
        if isVideo {
            if self.isStartedSession == true {
                if self.assetWriterVideoInput?.isReadyForMoreMediaData == true {
                    Log.info("Video Buffer: \(sampleBuffer.presentationTimeStamp.seconds)")
                    self.assetWriterVideoInput?.append(sampleBuffer)
                }
            }
        } else {
            if self.assetWriterAudioInput?.isReadyForMoreMediaData == true {
                if self.isStartedSession == false {
                    self.assetWriter?.startSession(atSourceTime: sampleBuffer.presentationTimeStamp)
                    self.isStartedSession = true
                }
                
                self.assetWriterAudioInput?.append(sampleBuffer)
                Log.network("Audio Buffer: \(sampleBuffer.presentationTimeStamp.seconds)")
            }
        }
    }
}

// Get audio sampleBuffer from AVCaptureAudioDataOutputSampleBufferDelegate
extension FrameRenderer: AVCaptureAudioDataOutputSampleBufferDelegate {
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        switch captureStatus {
        case .capturing:
            appendSampleBuffer(sampleBuffer: sampleBuffer, isVideo: false)
        default:
            return
        }
    }
}


// end
func finishRecording() {
    guard assetWriter?.status != .unknown else { return }
    assetWriterVideoInput?.markAsFinished()
    assetWriterAudioInput?.markAsFinished()
    assetWriter?.finishWriting {
        Log.debug("finish recording")
    }
    self.isStartedSession = false
    self.assetWriter = nil
    self.assetWriterVideoInput = nil
    self.assetWriterAudioInput = nil
    self.captureStatus = .idle
    if self.captureSession?.isRunning == true {
        self.captureSession?.stopRunning()
    }
    remainTime = FrameRenderer.maximumRecordingTime
    isMoved = false
    self.videoDuration = videoDurationCount
    
    isManualRecording = false
    uploadVideo(self.fileURL!, self.fileName, isManualRecording: isManualRecording)
}

Below are the sampleBuffer.presentationTimeStamp.seconds logs for audio and video, first from an in-sync recording and then from an out-of-sync recording a day later.

- firstTime recording log
    
    start
    
    Audio Buffer: 607505.8207029478
    Video Buffer: 607504.618464
    Audio Buffer: 607505.8420181406
    Audio Buffer: 607505.8633560091
    Audio Buffer: 607505.8846938775
    Video Buffer: 607504.667967
    
    end
    
    Video Buffer: 607556.333763
    Audio Buffer: 607557.5753514739
    Audio Buffer: 607557.5966893424
    Video Buffer: 607556.375038
    Audio Buffer: 607557.6180272109
    Audio Buffer: 607557.6393650793
    
- a day later
    
    start
    
    Audio Buffer: 627125.3272789116
    Audio Buffer: 627125.3485941043
    Video Buffer: 627123.609696
    Audio Buffer: 627125.3699319728
    Audio Buffer: 627125.3912698412
    Video Buffer: 627123.642925
    
    end
    
    Audio Buffer: 627215.0126077097
    Audio Buffer: 627215.0339455783
    Video Buffer: 627213.269289
    Audio Buffer: 627215.055260771
    Video Buffer: 627213.302609
    Audio Buffer: 627215.0765986395
    Audio Buffer: 627215.0979365079
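
Comparing the first audio and video timestamps of each log: in the first recording the gap is 607505.82 − 607504.62 ≈ 1.2 s, while a day later it is 627125.33 − 627123.61 ≈ 1.7 s, so the offset between the audio and video presentation timestamps has grown by roughly half a second over a day of uptime.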

Upvotes: 0

Views: 72

Answers (0)
