Martin Castin

Reputation: 163

Getting shifted timestamps when encoding a fragmented h264 mp4 with ffmpeg

I am trying to encode a fragmented h264 mp4 with ffmpeg. I tried the following command:

ffmpeg -i input.mp4 -movflags +frag_keyframe+separate_moof+omit_tfhd_offset+empty_moov output.mp4

It does give me a fragmented mp4, but the timestamps of the frames seem to be shifted by 0.04 s (two frame durations at 50 fps) when I read the video with mpv. The first frame has a timestamp of 0.04 s instead of the 0 s it has in the input video (1920x1080, 50 fps). I ran into the problem with both ffmpeg 5.1 and ffmpeg 3.4.11.
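To rule out an mpv display quirk, the shift can also be checked with ffprobe (a verification sketch; output.mp4 is the file produced by the command above):

ffprobe -v error -select_streams v:0 -show_entries packet=pts_time,dts_time -of csv output.mp4 | head -5

The pts_time of the first video packet shows whether the shift is baked into the file or only introduced by the player.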

I tried adding several flags, such as -avoid_negative_ts make_zero or -copyts -output_ts_offset -0.04, but it did not help.
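For example, variants roughly along these lines (the exact option placement is only illustrative):

ffmpeg -i input.mp4 -movflags +frag_keyframe+separate_moof+omit_tfhd_offset+empty_moov -avoid_negative_ts make_zero output.mp4

ffmpeg -i input.mp4 -movflags +frag_keyframe+separate_moof+omit_tfhd_offset+empty_moov -copyts -output_ts_offset -0.04 output.mp4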

I am also trying to achieve this with the ffmpeg libav libraries in C++, but did not get a better result. Here are the code fragments I used.

 avformat_alloc_output_context2(&oc_, NULL, NULL, filename);

 if (oc_->oformat->flags & AVFMT_GLOBALHEADER) {
    codecCtx_->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
 }
...
 AVDictionary* opts = NULL;

 av_dict_set(&opts, "movflags", "frag_keyframe+separate_moof+omit_tfhd_offset+empty_moov", 0);

 ret = avformat_write_header(oc_, &opts);
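After avformat_write_header() returns, any dictionary entries the muxer did not recognize are left in opts, so checking the dictionary is a quick way to confirm the movflags were actually consumed (a small sketch using the opts variable from the fragment above):

 if (av_dict_count(opts) > 0) {
    fprintf(stderr, "some muxer options were not consumed\n");
 }
 av_dict_free(&opts);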

Do you know how to avoid this behaviour of shifted timestamps for fragmented mp4, either with ffmpeg or libav?

Edit: example videos and complete code example

I also tried with the following ffmpeg build

ffmpeg version 5.0.1-static https://johnvansickle.com/ffmpeg/  Copyright (c) 2000-2022 the FFmpeg developers
built with gcc 8 (Debian 8.3.0-6)
configuration: --enable-gpl --enable-version3 --enable-static --disable-debug --disable-ffplay --disable-indev=sndio --disable-outdev=sndio --cc=gcc --enable-fontconfig --enable-frei0r --enable-gnutls --enable-gmp --enable-libgme --enable-gray --enable-libaom --enable-libfribidi --enable-libass --enable-libvmaf --enable-libfreetype --enable-libmp3lame --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-librubberband --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libvorbis --enable-libopus --enable-libtheora --enable-libvidstab --enable-libvo-amrwbenc --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libdav1d --enable-libxvid --enable-libzvbi --enable-libzimg
libavutil      57. 17.100 / 57. 17.100
libavcodec     59. 18.100 / 59. 18.100
libavformat    59. 16.100 / 59. 16.100
libavdevice    59.  4.100 / 59.  4.100
libavfilter     8. 24.100 /  8. 24.100
libswscale      6.  4.100 /  6.  4.100
libswresample   4.  3.100 /  4.  3.100
libpostproc    56.  3.100 / 56.  3.100

and with the Sintel trailer as the input video, which is 24 fps; I thus get a time shift of 83 ms (again two frame durations). Here is the output I get.

Here is a complete code example, slightly adapted from the muxing.c ffmpeg example (audio removed and adapted for C++). This code shows exactly the same problem.

You can simply comment out the call to av_dict_set in main() (flagged with a comment in the listing below) to switch back to a non-fragmented mp4 that does not show the timestamp shift.
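To build it, something like this should work (a sketch assuming pkg-config can locate the FFmpeg libraries; muxing.cpp is simply what I named the source file):

g++ -o muxing muxing.cpp $(pkg-config --cflags --libs libavformat libavcodec libswscale libswresample libavutil)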

/*
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/**
 * @file
 * libavformat API example.
 *
 * Output a media file in any supported libavformat format. The default
 * codecs are used.
 * @example muxing.c
 */

#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cmath>

extern "C"
{
#define __STDC_CONSTANT_MACROS
#include <libavutil/avassert.h>
#include <libavutil/channel_layout.h>
#include <libavutil/opt.h>
#include <libavutil/mathematics.h>
#include <libavutil/timestamp.h>
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libswresample/swresample.h>
}

#define STREAM_DURATION   10.0
#define STREAM_FRAME_RATE 25 /* 25 images/s */
#define STREAM_PIX_FMT    AV_PIX_FMT_YUV420P /* default pix_fmt */

#define SCALE_FLAGS SWS_BICUBIC

// a wrapper around a single output AVStream
typedef struct OutputStream {
  AVStream *st;
  AVCodecContext *enc;

  /* pts of the next frame that will be generated */
  int64_t next_pts;
  int samples_count;

  AVFrame *frame;
  AVFrame *tmp_frame;

  AVPacket *tmp_pkt;

  float t, tincr, tincr2;

  struct SwsContext *sws_ctx;
  struct SwrContext *swr_ctx;
} OutputStream;

static void log_packet(const AVFormatContext *fmt_ctx, const AVPacket *pkt)
{
  AVRational *time_base = &fmt_ctx->streams[pkt->stream_index]->time_base;
  (void)time_base; /* silence the unused-variable warning while the printf below
                    * stays commented out (the av_ts2str/av_ts2timestr macros rely
                    * on C compound literals and do not compile as standard C++) */

//  printf("pts:%s pts_time:%s dts:%s dts_time:%s duration:%s duration_time:%s stream_index:%d\n",
//         av_ts2str(pkt->pts), av_ts2timestr(pkt->pts, time_base),
//         av_ts2str(pkt->dts), av_ts2timestr(pkt->dts, time_base),
//         av_ts2str(pkt->duration), av_ts2timestr(pkt->duration, time_base),
//         pkt->stream_index);
}

static int write_frame(AVFormatContext *fmt_ctx, AVCodecContext *c,
                       AVStream *st, AVFrame *frame, AVPacket *pkt)
{
  int ret;

  // send the frame to the encoder
  ret = avcodec_send_frame(c, frame);
  if (ret < 0) {
    fprintf(stderr, "Error sending a frame to the encoder\n");
    exit(1);
  }

  while (ret >= 0) {
    ret = avcodec_receive_packet(c, pkt);
    if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
      break;
    else if (ret < 0) {
      fprintf(stderr, "Error encoding a frame\n");
      exit(1);
    }

    /* rescale output packet timestamp values from codec to stream timebase */
    av_packet_rescale_ts(pkt, c->time_base, st->time_base);
    pkt->stream_index = st->index;

    /* Write the compressed frame to the media file. */
    log_packet(fmt_ctx, pkt);
    ret = av_interleaved_write_frame(fmt_ctx, pkt);
    /* pkt is now blank (av_interleaved_write_frame() takes ownership of
     * its contents and resets pkt), so that no unreferencing is necessary.
     * This would be different if one used av_write_frame(). */
    if (ret < 0) {
      fprintf(stderr, "Error while writing output packet\n");
      exit(1);
    }
  }

  return ret == AVERROR_EOF ? 1 : 0;
}

/* Add an output stream. */
static void add_stream(OutputStream *ost, AVFormatContext *oc,
                       const AVCodec **codec,
                       enum AVCodecID codec_id)
{
  AVCodecContext *c;
  int i;

  /* find the encoder */
  *codec = avcodec_find_encoder(codec_id);
  if (!(*codec)) {
    fprintf(stderr, "Could not find encoder for '%s'\n",
            avcodec_get_name(codec_id));
    exit(1);
  }

  ost->tmp_pkt = av_packet_alloc();
  if (!ost->tmp_pkt) {
    fprintf(stderr, "Could not allocate AVPacket\n");
    exit(1);
  }

  ost->st = avformat_new_stream(oc, NULL);
  if (!ost->st) {
    fprintf(stderr, "Could not allocate stream\n");
    exit(1);
  }
  ost->st->id = oc->nb_streams-1;
  c = avcodec_alloc_context3(*codec);
  if (!c) {
    fprintf(stderr, "Could not alloc an encoding context\n");
    exit(1);
  }
  ost->enc = c;

  switch ((*codec)->type) {
    case AVMEDIA_TYPE_VIDEO:
      c->codec_id = codec_id;

      c->bit_rate = 400000;
      /* Resolution must be a multiple of two. */
      c->width    = 352;
      c->height   = 288;
      /* timebase: This is the fundamental unit of time (in seconds) in terms
       * of which frame timestamps are represented. For fixed-fps content,
       * timebase should be 1/framerate and timestamp increments should be
       * identical to 1. */
      ost->st->time_base = (AVRational){ 1, STREAM_FRAME_RATE };
      c->time_base       = ost->st->time_base;

      c->gop_size      = 12; /* emit one intra frame every twelve frames at most */
      c->pix_fmt       = STREAM_PIX_FMT;
      if (c->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
        /* just for testing, we also add B-frames */
        c->max_b_frames = 2;
      }
      if (c->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
        /* Needed to avoid using macroblocks in which some coeffs overflow.
         * This does not happen with normal video, it just happens here as
         * the motion of the chroma plane does not match the luma plane. */
        c->mb_decision = 2;
      }
      break;

    default:
      break;
  }

  /* Some formats want stream headers to be separate. */
  if (oc->oformat->flags & AVFMT_GLOBALHEADER)
    c->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
}

/**************************************************************/
/* video output */

static AVFrame *alloc_picture(enum AVPixelFormat pix_fmt, int width, int height)
{
  AVFrame *picture;
  int ret;

  picture = av_frame_alloc();
  if (!picture)
    return NULL;

  picture->format = pix_fmt;
  picture->width  = width;
  picture->height = height;

  /* allocate the buffers for the frame data */
  ret = av_frame_get_buffer(picture, 0);
  if (ret < 0) {
    fprintf(stderr, "Could not allocate frame data.\n");
    exit(1);
  }

  return picture;
}

static void open_video(AVFormatContext *oc, const AVCodec *codec,
                       OutputStream *ost, AVDictionary *opt_arg)
{
  int ret;
  AVCodecContext *c = ost->enc;
  AVDictionary *opt = NULL;

  av_dict_copy(&opt, opt_arg, 0);

  /* open the codec */
  ret = avcodec_open2(c, codec, &opt);
  av_dict_free(&opt);
  if (ret < 0) {
    fprintf(stderr, "Could not open video codec\n");
    exit(1);
  }

  /* allocate and init a re-usable frame */
  ost->frame = alloc_picture(c->pix_fmt, c->width, c->height);
  if (!ost->frame) {
    fprintf(stderr, "Could not allocate video frame\n");
    exit(1);
  }

  /* If the output format is not YUV420P, then a temporary YUV420P
   * picture is needed too. It is then converted to the required
   * output format. */
  ost->tmp_frame = NULL;
  if (c->pix_fmt != AV_PIX_FMT_YUV420P) {
    ost->tmp_frame = alloc_picture(AV_PIX_FMT_YUV420P, c->width, c->height);
    if (!ost->tmp_frame) {
      fprintf(stderr, "Could not allocate temporary picture\n");
      exit(1);
    }
  }

  /* copy the stream parameters to the muxer */
  ret = avcodec_parameters_from_context(ost->st->codecpar, c);
  if (ret < 0) {
    fprintf(stderr, "Could not copy the stream parameters\n");
    exit(1);
  }
}

/* Prepare a dummy image. */
static void fill_yuv_image(AVFrame *pict, int frame_index,
                           int width, int height)
{
  int x, y, i;

  i = frame_index;

  /* Y */
  for (y = 0; y < height; y++)
    for (x = 0; x < width; x++)
      pict->data[0][y * pict->linesize[0] + x] = x + y + i * 3;

  /* Cb and Cr */
  for (y = 0; y < height / 2; y++) {
    for (x = 0; x < width / 2; x++) {
      pict->data[1][y * pict->linesize[1] + x] = 128 + y + i * 2;
      pict->data[2][y * pict->linesize[2] + x] = 64 + x + i * 5;
    }
  }
}

static AVFrame *get_video_frame(OutputStream *ost)
{
  AVCodecContext *c = ost->enc;

  /* check if we want to generate more frames */
  if (av_compare_ts(ost->next_pts, c->time_base,
                    STREAM_DURATION, (AVRational){ 1, 1 }) > 0)
    return NULL;

  /* when we pass a frame to the encoder, it may keep a reference to it
   * internally; make sure we do not overwrite it here */
  if (av_frame_make_writable(ost->frame) < 0)
    exit(1);

  if (c->pix_fmt != AV_PIX_FMT_YUV420P) {
    /* as we only generate a YUV420P picture, we must convert it
     * to the codec pixel format if needed */
    if (!ost->sws_ctx) {
      ost->sws_ctx = sws_getContext(c->width, c->height,
                                    AV_PIX_FMT_YUV420P,
                                    c->width, c->height,
                                    c->pix_fmt,
                                    SCALE_FLAGS, NULL, NULL, NULL);
      if (!ost->sws_ctx) {
        fprintf(stderr,
                "Could not initialize the conversion context\n");
        exit(1);
      }
    }
    fill_yuv_image(ost->tmp_frame, ost->next_pts, c->width, c->height);
    sws_scale(ost->sws_ctx, (const uint8_t * const *) ost->tmp_frame->data,
              ost->tmp_frame->linesize, 0, c->height, ost->frame->data,
              ost->frame->linesize);
  } else {
    fill_yuv_image(ost->frame, ost->next_pts, c->width, c->height);
  }

  ost->frame->pts = ost->next_pts++;

  return ost->frame;
}

/*
 * encode one video frame and send it to the muxer
 * return 1 when encoding is finished, 0 otherwise
 */
static int write_video_frame(AVFormatContext *oc, OutputStream *ost)
{
  return write_frame(oc, ost->enc, ost->st, get_video_frame(ost), ost->tmp_pkt);
}

static void close_stream(AVFormatContext *oc, OutputStream *ost)
{
  avcodec_free_context(&ost->enc);
  av_frame_free(&ost->frame);
  av_frame_free(&ost->tmp_frame);
  av_packet_free(&ost->tmp_pkt);
  sws_freeContext(ost->sws_ctx);
  swr_free(&ost->swr_ctx);
}

/**************************************************************/
/* media file output */

int main(int argc, char **argv)
{
  OutputStream video_st = { 0 }, audio_st = { 0 };
  const AVOutputFormat *fmt;
  const char *filename;
  AVFormatContext *oc;
  const AVCodec *audio_codec, *video_codec;
  int ret;
  int have_video = 0, have_audio = 0;
  int encode_video = 0, encode_audio = 0;
  AVDictionary *opt = NULL;
  int i;

  if (argc < 2) {
    printf("usage: %s output_file\n"
           "API example program to output a media file with libavformat.\n"
           "This program generates a synthetic audio and video stream, encodes and\n"
           "muxes them into a file named output_file.\n"
           "The output format is automatically guessed according to the file extension.\n"
           "Raw images can also be output by using '%%d' in the filename.\n"
           "\n", argv[0]);
    return 1;
  }

  filename = argv[1];

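  /* request fragmented mp4 output from the mov muxer; commenting out this
   * call switches back to a regular (non-fragmented) mp4 without the shift */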
  av_dict_set(&opt, "movflags", "frag_keyframe+separate_moof+omit_tfhd_offset+empty_moov", 0);

  /* allocate the output media context */
  avformat_alloc_output_context2(&oc, NULL, NULL, filename);
  if (!oc) {
    printf("Could not deduce output format from file extension: using MPEG.\n");
    avformat_alloc_output_context2(&oc, NULL, "mpeg", filename);
  }
  if (!oc)
    return 1;

  fmt = oc->oformat;

  /* Add the audio and video streams using the default format codecs
   * and initialize the codecs. */
  if (fmt->video_codec != AV_CODEC_ID_NONE) {
    add_stream(&video_st, oc, &video_codec, fmt->video_codec);
    have_video = 1;
    encode_video = 1;
  }

  /* Now that all the parameters are set, we can open the audio and
   * video codecs and allocate the necessary encode buffers. */
  if (have_video)
    open_video(oc, video_codec, &video_st, opt);


  av_dump_format(oc, 0, filename, 1);

  /* open the output file, if needed */
  if (!(fmt->flags & AVFMT_NOFILE)) {
    ret = avio_open(&oc->pb, filename, AVIO_FLAG_WRITE);
    if (ret < 0) {
      fprintf(stderr, "Could not open '%s'\n", filename);
      return 1;
    }
  }

  /* Write the stream header, if any. */
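  /* opt still holds the movflags set above; avformat_write_header() applies
   * them to the mp4 muxer and leaves any unrecognized entries in the dict */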
  ret = avformat_write_header(oc, &opt);
  if (ret < 0) {
    fprintf(stderr, "Error occurred when opening output file\n");
    return 1;
  }

  while (encode_video || encode_audio) {
    /* select the stream to encode */
    if (encode_video &&
        (!encode_audio || av_compare_ts(video_st.next_pts, video_st.enc->time_base,
                                        audio_st.next_pts, audio_st.enc->time_base) <= 0)) {
      encode_video = !write_video_frame(oc, &video_st);
    }
  }

  av_write_trailer(oc);

  /* Close each codec. */
  if (have_video)
    close_stream(oc, &video_st);
  if (have_audio)
    close_stream(oc, &audio_st);

  if (!(fmt->flags & AVFMT_NOFILE))
    /* Close the output file. */
    avio_closep(&oc->pb);

  /* free the stream */
  avformat_free_context(oc);

  return 0;
}
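To reproduce, I compile the file as shown above and run it with an output file name (frag.mp4 is just an arbitrary name):

./muxing frag.mp4

The resulting file can then be inspected with mpv or with the ffprobe command given earlier; commenting out the av_dict_set call and rebuilding gives a non-fragmented file for comparison.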

Upvotes: 2

Views: 632

Answers (0)
