c - 使用ffmpeg c连接视频和音频时如何计算pts和dts

/* Bundles one media container with per-stream state; used for both the input and the output. */
typedef struct file
{
   AVFormatContext *container; /* the opened container; its streams[] are indexed by stream index */
   AVCodecContext **codec;     /* one codec context per stream, indexed by stream index */
   int *frames;                /* per-stream counter of frames handed to the encoder; pts is derived from it */
} file;


/*
 * Decodes every frame of `input`, stamps it with a pts derived from the
 * per-stream frame counter in `output->frames`, and passes it to
 * encode_frame() for the stream with the same index.
 *
 * Returns 0 once decode_frame() reports the end of the file, or 1 when
 * decoding or encoding fails.
 *
 * This is the version the question is about; the review notes below mark the
 * lines the answer refers to.
 */
int stream_clip(file *input, file *output)
{
    AVPacket *packet = av_packet_alloc();
    AVFrame *frame = av_frame_alloc();
    int res;

    while (1)
    {
        res = decode_frame(input, frame, packet);

        if (res == 1)
        {
            printf("Error decoding a frame\n");
            av_frame_free(&frame);
            av_packet_free(&packet);

            return 1;
        }
        else if (res == 0)
        {

            AVCodecContext *codec = output->codec[packet->stream_index];
            AVRational fps = output->codec[packet->stream_index]->framerate;
            AVRational time_base = output->container->streams[packet->stream_index]->time_base;

            /*
            if (input->container->streams[packet->stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
            {
                fps.num = 1,
                fps.den = input->container->streams[packet->stream_index]->codecpar->sample_rate;
            }
            */
           
            /* pts = (time_base.den / fps) * frame_number, truncated to an integer. */
            frame->pts = (int64_t)(av_q2d(av_div_q((AVRational){time_base.den, 1}, fps)) * output->frames[packet->stream_index]);

            frame->pkt_dts = frame->pts;
            /* NOTE(review): the duration is set to the running pts, so it grows with every frame;
               the answer identifies this assignment as the main problem. */
            frame->pkt_duration = frame->pts;

            /* NOTE(review): frame->pts is int64_t but is printed with %i. */
            printf("%i FRAME %i PTS %i\n", packet->stream_index, output->frames[packet->stream_index], frame->pts);

            output->frames[packet->stream_index]++;

            res = encode_frame(output, frame, packet->stream_index);
            if (res == 1)
            {
                /* NOTE(review): `packet` is not freed on this path. */
                av_frame_free(&frame);
                printf("Failde encoding frame\n");
                return 1;
            }
            av_frame_unref(frame);
        }

        else if (res == -1)
        {
            printf("\nfile \"%s\" ended\n", input->container->url);
            break;
        }
    }

    av_frame_free(&frame);
    //flush decoder
    /* NOTE(review): decode_frame() is called with a NULL frame here; whether this
       actually drains the decoder depends on decode_frame() - confirm. */
    decode_frame(input, NULL, packet);

    av_packet_free(&packet);

    return 0;
}

https://github.com/leandromoreira/ffmpeg-libav-tutorial#chapter-1---syncing-audio-and-video

我尝试通过对视频进行时间刻度/fps * frame_number 来计算 pts，对于音频，我只是让 ffmpeg 为我做，视频和音频播放正常，但音频和视频不同步，音频结束速度比视频快

我也遇到这个错误 [mp4 @ 0xe9c7780] 流 1 的数据包中未设置时间戳。这已被弃用，将来将停止工作。修复您的代码以正确设置时间戳

[mp4 @ 0xe9c7780] 编码器没有产生正确的 pts，正在自行补齐。

如果我自己计算音频的 pts，vlc 和 mpv 都不能正确播放视频：mpv 可以正确播放音频，但视频不正确；而 vlc 可以正确播放视频，但没有音频

mpv 输出这个错误: “检测到音频/视频不同步!可能的原因包括速度太慢硬件、临时 CPU 峰值、损坏的驱动程序和损坏的文件。声音的位置将与视频不匹配(请参阅 A-V 状态字段)。”

下面是每一帧计算的pts，0是音频，1是视频

frame_type FRAME frame_number PTS
0 FRAME 0 PTS 0
0 FRAME 1 PTS 512
0 FRAME 2 PTS 1024
0 FRAME 3 PTS 1536
1 FRAME 0 PTS 0
0 FRAME 4 PTS 2048
1 FRAME 1 PTS 0
0 FRAME 5 PTS 2560
1 FRAME 2 PTS 0
1 FRAME 3 PTS 0
0 FRAME 6 PTS 3072
1 FRAME 4 PTS 0
1 FRAME 5 PTS 0
0 FRAME 7 PTS 3584
1 FRAME 6 PTS 0
1 FRAME 7 PTS 0
0 FRAME 8 PTS 4096
1 FRAME 8 PTS 0
1 FRAME 9 PTS 0
0 FRAME 9 PTS 4608
1 FRAME 10 PTS 0
0 FRAME 10 PTS 5120
1 FRAME 11 PTS 0
1 FRAME 12 PTS 0
0 FRAME 11 PTS 5632
1 FRAME 13 PTS 0
1 FRAME 14 PTS 0
0 FRAME 12 PTS 6144
1 FRAME 15 PTS 0
1 FRAME 16 PTS 0
0 FRAME 13 PTS 6656
1 FRAME 17 PTS 0
1 FRAME 18 PTS 0
0 FRAME 14 PTS 7168
1 FRAME 19 PTS 0
0 FRAME 15 PTS 7680
1 FRAME 20 PTS 0
1 FRAME 21 PTS 0
0 FRAME 16 PTS 8192
1 FRAME 22 PTS 0
1 FRAME 23 PTS 0
0 FRAME 17 PTS 8704
1 FRAME 24 PTS 0
1 FRAME 25 PTS 0
0 FRAME 18 PTS 9216
1 FRAME 26 PTS 0
1 FRAME 27 PTS 0
0 FRAME 19 PTS 9728
1 FRAME 28 PTS 0
0 FRAME 20 PTS 10240
1 FRAME 29 PTS 0
1 FRAME 30 PTS 0
0 FRAME 21 PTS 10752
1 FRAME 31 PTS 0
1 FRAME 32 PTS 0
0 FRAME 22 PTS 11264
1 FRAME 33 PTS 0
1 FRAME 34 PTS 0
0 FRAME 23 PTS 11776
1 FRAME 35 PTS 0
1 FRAME 36 PTS 0
0 FRAME 24 PTS 12288
1 FRAME 37 PTS 0
0 FRAME 25 PTS 12800
1 FRAME 38 PTS 0
1 FRAME 39 PTS 0
0 FRAME 26 PTS 13312
1 FRAME 40 PTS 0
1 FRAME 41 PTS 0
0 FRAME 27 PTS 13824
1 FRAME 42 PTS 0
1 FRAME 43 PTS 0

file "in_short.mp4" ended
0 FRAME 28 PTS 14336
0 FRAME 29 PTS 14848
0 FRAME 30 PTS 15360
0 FRAME 31 PTS 15872
1 FRAME 44 PTS 0
0 FRAME 32 PTS 16384
1 FRAME 45 PTS 0
0 FRAME 33 PTS 16896
1 FRAME 46 PTS 0
1 FRAME 47 PTS 0
0 FRAME 34 PTS 17408
1 FRAME 48 PTS 0
1 FRAME 49 PTS 0
0 FRAME 35 PTS 17920
1 FRAME 50 PTS 0
1 FRAME 51 PTS 0
0 FRAME 36 PTS 18432
1 FRAME 52 PTS 0
1 FRAME 53 PTS 0
0 FRAME 37 PTS 18944
1 FRAME 54 PTS 0
0 FRAME 38 PTS 19456
1 FRAME 55 PTS 0
1 FRAME 56 PTS 0
0 FRAME 39 PTS 19968
1 FRAME 57 PTS 0
1 FRAME 58 PTS 0
0 FRAME 40 PTS 20480
1 FRAME 59 PTS 0
1 FRAME 60 PTS 0
0 FRAME 41 PTS 20992
1 FRAME 61 PTS 0
1 FRAME 62 PTS 0
0 FRAME 42 PTS 21504
1 FRAME 63 PTS 0
0 FRAME 43 PTS 22016
1 FRAME 64 PTS 0
1 FRAME 65 PTS 0
0 FRAME 44 PTS 22528
1 FRAME 66 PTS 0
1 FRAME 67 PTS 0
0 FRAME 45 PTS 23040
1 FRAME 68 PTS 0
1 FRAME 69 PTS 0
0 FRAME 46 PTS 23552
1 FRAME 70 PTS 0
1 FRAME 71 PTS 0
0 FRAME 47 PTS 24064
1 FRAME 72 PTS 0
0 FRAME 48 PTS 24576
1 FRAME 73 PTS 0
1 FRAME 74 PTS 0
0 FRAME 49 PTS 25088
1 FRAME 75 PTS 0
1 FRAME 76 PTS 0
0 FRAME 50 PTS 25600
1 FRAME 77 PTS 0
1 FRAME 78 PTS 0
0 FRAME 51 PTS 26112
1 FRAME 79 PTS 0
1 FRAME 80 PTS 0
0 FRAME 52 PTS 26624
1 FRAME 81 PTS 0
0 FRAME 53 PTS 27136
1 FRAME 82 PTS 0
1 FRAME 83 PTS 0
0 FRAME 54 PTS 27648
1 FRAME 84 PTS 0
1 FRAME 85 PTS 0
0 FRAME 55 PTS 28160
1 FRAME 86 PTS 0
1 FRAME 87 PTS 0

file "in_short.mp4" ended
0 FRAME 56 PTS 28672
0 FRAME 57 PTS 29184
0 FRAME 58 PTS 29696
0 FRAME 59 PTS 30208
1 FRAME 88 PTS 0
0 FRAME 60 PTS 30720
1 FRAME 89 PTS 0
0 FRAME 61 PTS 31232
1 FRAME 90 PTS 0
1 FRAME 91 PTS 0
0 FRAME 62 PTS 31744
1 FRAME 92 PTS 0
1 FRAME 93 PTS 0
0 FRAME 63 PTS 32256
1 FRAME 94 PTS 0
1 FRAME 95 PTS 0
0 FRAME 64 PTS 32768
1 FRAME 96 PTS 0
1 FRAME 97 PTS 0
0 FRAME 65 PTS 33280
1 FRAME 98 PTS 0
0 FRAME 66 PTS 33792
1 FRAME 99 PTS 0
1 FRAME 100 PTS 0
0 FRAME 67 PTS 34304
1 FRAME 101 PTS 0
1 FRAME 102 PTS 0
0 FRAME 68 PTS 34816
1 FRAME 103 PTS 0
1 FRAME 104 PTS 0
0 FRAME 69 PTS 35328
1 FRAME 105 PTS 0
1 FRAME 106 PTS 0
0 FRAME 70 PTS 35840
1 FRAME 107 PTS 0
0 FRAME 71 PTS 36352
1 FRAME 108 PTS 0
1 FRAME 109 PTS 0
0 FRAME 72 PTS 36864
1 FRAME 110 PTS 0
1 FRAME 111 PTS 0
0 FRAME 73 PTS 37376
1 FRAME 112 PTS 0
1 FRAME 113 PTS 0
0 FRAME 74 PTS 37888
1 FRAME 114 PTS 0
1 FRAME 115 PTS 0
0 FRAME 75 PTS 38400
1 FRAME 116 PTS 0
0 FRAME 76 PTS 38912
1 FRAME 117 PTS 0
1 FRAME 118 PTS 0
0 FRAME 77 PTS 39424
1 FRAME 119 PTS 0
1 FRAME 120 PTS 0
0 FRAME 78 PTS 39936
1 FRAME 121 PTS 0
1 FRAME 122 PTS 0
0 FRAME 79 PTS 40448
1 FRAME 123 PTS 0
1 FRAME 124 PTS 0
0 FRAME 80 PTS 40960
1 FRAME 125 PTS 0
0 FRAME 81 PTS 41472
1 FRAME 126 PTS 0
1 FRAME 127 PTS 0
0 FRAME 82 PTS 41984
1 FRAME 128 PTS 0
1 FRAME 129 PTS 0
0 FRAME 83 PTS 42496
1 FRAME 130 PTS 0
1 FRAME 131 PTS 0

file "in_short.mp4" ended

下面是计算音频pts的时候

0 FRAME 0 PTS 0
0 FRAME 1 PTS 512
0 FRAME 2 PTS 1024
0 FRAME 3 PTS 1536
1 FRAME 0 PTS 0
0 FRAME 4 PTS 2048
1 FRAME 1 PTS 1
0 FRAME 5 PTS 2560
1 FRAME 2 PTS 2
1 FRAME 3 PTS 3
0 FRAME 6 PTS 3072
1 FRAME 4 PTS 4
1 FRAME 5 PTS 5
0 FRAME 7 PTS 3584
1 FRAME 6 PTS 6
1 FRAME 7 PTS 7
0 FRAME 8 PTS 4096
1 FRAME 8 PTS 8
1 FRAME 9 PTS 9
0 FRAME 9 PTS 4608
1 FRAME 10 PTS 10
0 FRAME 10 PTS 5120
1 FRAME 11 PTS 11
1 FRAME 12 PTS 12
0 FRAME 11 PTS 5632
1 FRAME 13 PTS 13
1 FRAME 14 PTS 14
0 FRAME 12 PTS 6144
1 FRAME 15 PTS 15
1 FRAME 16 PTS 16
0 FRAME 13 PTS 6656
1 FRAME 17 PTS 17
1 FRAME 18 PTS 18
0 FRAME 14 PTS 7168
1 FRAME 19 PTS 19
0 FRAME 15 PTS 7680
1 FRAME 20 PTS 20
1 FRAME 21 PTS 21
0 FRAME 16 PTS 8192
1 FRAME 22 PTS 22
1 FRAME 23 PTS 23
0 FRAME 17 PTS 8704
1 FRAME 24 PTS 24
1 FRAME 25 PTS 25
0 FRAME 18 PTS 9216
1 FRAME 26 PTS 26
1 FRAME 27 PTS 27
0 FRAME 19 PTS 9728
1 FRAME 28 PTS 28
0 FRAME 20 PTS 10240
1 FRAME 29 PTS 29
1 FRAME 30 PTS 30
0 FRAME 21 PTS 10752
1 FRAME 31 PTS 31
1 FRAME 32 PTS 32
0 FRAME 22 PTS 11264
1 FRAME 33 PTS 33
1 FRAME 34 PTS 34
0 FRAME 23 PTS 11776
1 FRAME 35 PTS 35
1 FRAME 36 PTS 36
0 FRAME 24 PTS 12288
1 FRAME 37 PTS 37
0 FRAME 25 PTS 12800
1 FRAME 38 PTS 38
1 FRAME 39 PTS 39
0 FRAME 26 PTS 13312
1 FRAME 40 PTS 40
1 FRAME 41 PTS 41
0 FRAME 27 PTS 13824
1 FRAME 42 PTS 42
1 FRAME 43 PTS 43

file "in_short.mp4" ended
0 FRAME 28 PTS 14336
0 FRAME 29 PTS 14848
0 FRAME 30 PTS 15360
0 FRAME 31 PTS 15872
1 FRAME 44 PTS 44
0 FRAME 32 PTS 16384
1 FRAME 45 PTS 45
0 FRAME 33 PTS 16896
1 FRAME 46 PTS 46
1 FRAME 47 PTS 47
0 FRAME 34 PTS 17408
1 FRAME 48 PTS 48
1 FRAME 49 PTS 49
0 FRAME 35 PTS 17920
1 FRAME 50 PTS 50
1 FRAME 51 PTS 51
0 FRAME 36 PTS 18432
1 FRAME 52 PTS 52
1 FRAME 53 PTS 53
0 FRAME 37 PTS 18944
1 FRAME 54 PTS 54
0 FRAME 38 PTS 19456
1 FRAME 55 PTS 55
1 FRAME 56 PTS 56
0 FRAME 39 PTS 19968
1 FRAME 57 PTS 57
1 FRAME 58 PTS 58
0 FRAME 40 PTS 20480
1 FRAME 59 PTS 59
1 FRAME 60 PTS 60
0 FRAME 41 PTS 20992
1 FRAME 61 PTS 61
1 FRAME 62 PTS 62
0 FRAME 42 PTS 21504
1 FRAME 63 PTS 63
0 FRAME 43 PTS 22016
1 FRAME 64 PTS 64
1 FRAME 65 PTS 65
0 FRAME 44 PTS 22528
1 FRAME 66 PTS 66
1 FRAME 67 PTS 67
0 FRAME 45 PTS 23040
1 FRAME 68 PTS 68
1 FRAME 69 PTS 69
0 FRAME 46 PTS 23552
1 FRAME 70 PTS 70
1 FRAME 71 PTS 71
0 FRAME 47 PTS 24064
1 FRAME 72 PTS 72
0 FRAME 48 PTS 24576
1 FRAME 73 PTS 73
1 FRAME 74 PTS 74
0 FRAME 49 PTS 25088
1 FRAME 75 PTS 75
1 FRAME 76 PTS 76
0 FRAME 50 PTS 25600
1 FRAME 77 PTS 77
1 FRAME 78 PTS 78
0 FRAME 51 PTS 26112
1 FRAME 79 PTS 79
1 FRAME 80 PTS 80
0 FRAME 52 PTS 26624
1 FRAME 81 PTS 81
0 FRAME 53 PTS 27136
1 FRAME 82 PTS 82
1 FRAME 83 PTS 83
0 FRAME 54 PTS 27648
1 FRAME 84 PTS 84
1 FRAME 85 PTS 85
0 FRAME 55 PTS 28160
1 FRAME 86 PTS 86
1 FRAME 87 PTS 87

file "in_short.mp4" ended
0 FRAME 56 PTS 28672
0 FRAME 57 PTS 29184
0 FRAME 58 PTS 29696
0 FRAME 59 PTS 30208
1 FRAME 88 PTS 88
0 FRAME 60 PTS 30720
1 FRAME 89 PTS 89
0 FRAME 61 PTS 31232
1 FRAME 90 PTS 90
1 FRAME 91 PTS 91
0 FRAME 62 PTS 31744
1 FRAME 92 PTS 92
1 FRAME 93 PTS 93
0 FRAME 63 PTS 32256
1 FRAME 94 PTS 94
1 FRAME 95 PTS 95
0 FRAME 64 PTS 32768
1 FRAME 96 PTS 96
1 FRAME 97 PTS 97
0 FRAME 65 PTS 33280
1 FRAME 98 PTS 98
0 FRAME 66 PTS 33792
1 FRAME 99 PTS 99
1 FRAME 100 PTS 100
0 FRAME 67 PTS 34304
1 FRAME 101 PTS 101
1 FRAME 102 PTS 102
0 FRAME 68 PTS 34816
1 FRAME 103 PTS 103
1 FRAME 104 PTS 104
0 FRAME 69 PTS 35328
1 FRAME 105 PTS 105
1 FRAME 106 PTS 106
0 FRAME 70 PTS 35840
1 FRAME 107 PTS 107
0 FRAME 71 PTS 36352
1 FRAME 108 PTS 108
1 FRAME 109 PTS 109
0 FRAME 72 PTS 36864
1 FRAME 110 PTS 110
1 FRAME 111 PTS 111
0 FRAME 73 PTS 37376
1 FRAME 112 PTS 112
1 FRAME 113 PTS 113
0 FRAME 74 PTS 37888
1 FRAME 114 PTS 114
1 FRAME 115 PTS 115
0 FRAME 75 PTS 38400
1 FRAME 116 PTS 116
0 FRAME 76 PTS 38912
1 FRAME 117 PTS 117
1 FRAME 118 PTS 118
0 FRAME 77 PTS 39424
1 FRAME 119 PTS 119
1 FRAME 120 PTS 120
0 FRAME 78 PTS 39936
1 FRAME 121 PTS 121
1 FRAME 122 PTS 122
0 FRAME 79 PTS 40448
1 FRAME 123 PTS 123
1 FRAME 124 PTS 124
0 FRAME 80 PTS 40960
1 FRAME 125 PTS 125
0 FRAME 81 PTS 41472
1 FRAME 126 PTS 126
1 FRAME 127 PTS 127
0 FRAME 82 PTS 41984
1 FRAME 128 PTS 128
1 FRAME 129 PTS 129
0 FRAME 83 PTS 42496
1 FRAME 130 PTS 130
1 FRAME 131 PTS 131

file "in_short.mp4" ended

我已经尝试像这个例子 https://github.com/FFmpeg/FFmpeg/blob/master/doc/examples/transcoding.c 第 448 行那样，用 av_packet_rescale_ts() 重新缩放数据包的 pts，但由于我要拼接多个视频，每个新视频的 pts 都会从 0 重新开始。

视频输出这个就是mpv输出这个错误的那个检测到音频/视频不同步!可能的原因包括太慢硬件、临时 CPU 峰值、损坏的驱动程序和损坏的文件。声音的位置将与视频不匹配(请参阅 A-V 状态字段)。 https://drive.google.com/file/d/1DlIOxJGiqUHumvuOQBISPNtHDnfuaKDE/view?usp=sharing

https://drive.google.com/file/d/15fnrZT6XZw_CkOy51PsTbKG_F2ykM09M/view?usp=sharing 这个播放很好，但是音频和视频不同步，不太明显，因为我只附加了 3 个视频，我附加了同一个视频 100 次，音频在视频前几分钟结束

我制作的“视频编辑器”的完整代码: https://github.com/LentilStew/video_transcoder

我很不擅长编程，所以我的代码不是很好。这篇文章中给出的片段来自我制作的这个小型转码器 https://pastebin.com/VHLREVGf (它在 pastebin 中，因为 stackoverflow 中有 30000 个字符的限制)。我用这个命令编译:
gcc main.c -o compiled.out `pkg-config --libs libavformat libavfilter libavutil libavcodec libswscale libavdevice libavutil`

编辑 1:

现在(感谢 @Rotem)我可以正确计算音频的 pts，但由于音频流可能比视频流短，把所有视频拼接在一起后，视频和音频仍然不同步。为了解决这个问题，我尝试用空帧填充较短的流，直到所有流都具有相同的 pts，但出现此错误: [aac @ 0xfc00880] 输入包含(接近)NaN/+-Inf，并且输出与不调用此函数时完全相同

说明我认为的问题是什么

graph of problem

填充流的函数

/*
 * Pads every output stream with dummy frames until its computed pts reaches
 * the largest `output->pre_pts[i]` among all streams. All comparisons are
 * made after rescaling to a common 1/60000 time base.
 *
 * NOTE(review): `pre_pts` is not a member of the `file` struct shown in this
 * article; presumably it holds the last pts written per stream - confirm.
 * NOTE(review): the function is declared to return int but has no return
 * statement.
 */
int fill_with_empty_frames_until_all_streams_match_pts(file *input, file *output)
{
    int res;

    int biggest_pts = 0;
    int biggest_pts_index = -1;

    /* Pass 1: find the stream whose pre_pts is furthest ahead (in 1/60000 units). */
    for (int i = 0; i < output->container->nb_streams; i++)
    {
        int curr_pts_in_new_time_base = av_rescale_q(output->pre_pts[i], output->container->streams[i]->time_base, (AVRational){.den = 60000, .num = 1});
    
        if (curr_pts_in_new_time_base > biggest_pts)
        {
            biggest_pts = curr_pts_in_new_time_base;
            biggest_pts_index = i;
        }
    }

    /* Pass 2: for each stream, encode dummy frames until it catches up. */
    for (int i = 0; i < output->container->nb_streams; i++)
    {

        AVCodecContext *codec = output->codec[i];
        AVRational fps = output->codec[i]->framerate;
        AVRational time_base = output->container->streams[i]->time_base;

        int frames_per_packet = 1;

        if (output->container->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
        {
            /* For audio the "frame rate" is the sample rate. */
            fps.den = 1;
            fps.num = input->container->streams[i]->codecpar->sample_rate;

            ////////////////////////////////////////////////////////////////////
            frames_per_packet = input->container->streams[i]->codecpar->frame_size; //For the audio there are 1024 (or 960) frames per packet https://stackoverflow.com/questions/23216103/about-definition-for-terms-of-audio-codec
            ////////////////////////////////////////////////////////////////////
        }

        AVFrame *dummy_frame = av_frame_alloc();

        /* Describe the frame so that av_frame_get_buffer() can size its buffers. */
        switch (output->container->streams[i]->codecpar->codec_type)
        {
        case AVMEDIA_TYPE_AUDIO:
            dummy_frame->nb_samples = frames_per_packet;
            dummy_frame->format = output->container->streams[i]->codecpar->format;
            dummy_frame->channel_layout = output->container->streams[i]->codecpar->channel_layout;
            break;
        case AVMEDIA_TYPE_VIDEO:
            dummy_frame->width = output->container->streams[i]->codecpar->width;
            dummy_frame->height = output->container->streams[i]->codecpar->height;
            dummy_frame->format = output->container->streams[i]->codecpar->format;
            break;
        default:
            /* NOTE(review): dummy_frame is not freed on this path. */
            continue;
        }

        /* NOTE(review): the return value is not checked, and the buffer contents are never
           written before the frame is encoded - presumably the cause of the aac
           "NaN/+-Inf" error reported above; confirm. */
        av_frame_get_buffer(dummy_frame, 0);

        while (1)
        {
            /* Duration of one frame in stream time-base units; pts = duration * frame counter. */
            int64_t pkt_duration = (int64_t)(av_q2d(av_div_q((AVRational){time_base.den, 1}, fps))) * (int64_t)frames_per_packet;
            int curr_pts_in_new_time_base = av_rescale_q(pkt_duration * output->frames[i], output->container->streams[i]->time_base, (AVRational){.den = 60000, .num = 1});
            printf("biggest_pts %i\n", biggest_pts);
            printf("curr_pts_in_new_time_base %i\n", curr_pts_in_new_time_base);

            printf("Adding frame\n");
            /* Stop once this stream has reached the furthest stream. */
            if (biggest_pts <= curr_pts_in_new_time_base)
                break;
            dummy_frame->pkt_duration = pkt_duration;
            dummy_frame->pts = (int64_t)(dummy_frame->pkt_duration * output->frames[i]);

            dummy_frame->pkt_dts = dummy_frame->pts;

            printf("%i FRAME %i PTS %i\n", (int)i, (int)output->frames[i], (int)dummy_frame->pts);

            output->frames[i]++;

            /* NOTE(review): the result of encode_frame() is stored but never inspected. */
            res = encode_frame(output, dummy_frame, i);
        }
        av_frame_free(&dummy_frame);
    }
}

最佳答案

主要问题是将数据包持续时间设置为 pts:frame->pkt_duration = frame->pts。

所有帧的持续时间通常是相同的，并且 pts 是递增的。

其他问题:

每个音频包有多个音频帧。
根据以下post ，“每个数据包可以有 1024(或 960)帧”。
我们必须通过“每个数据包的帧数”来缩放数据包持续时间和 pts。
av_packet_unref(output_packet) 的位置似乎不对(不知道是否有影响)。
视频时基的分辨率好像太低了，我把它改成了 1/60000(不知道是否有关系)。
没有为数据包设置 pts、dts 和持续时间(我不知道我们是否必须按帧和每个数据包设置它们)。
为了以防万一，我为每个数据包添加了 pts、dts 和持续时间。

我还在学习Libav的C接口(interface)
我建议的解决方案可能并不完美......

这是设置 pts、dts 和持续时间的更新代码:

/* Excerpt from the body of stream_clip(): stamps a decoded frame before it is encoded. */
AVRational fps = output->codec[packet->stream_index]->framerate;
int frames_per_packet = 1;
AVRational time_base = output->container->streams[packet->stream_index]->time_base;

if (input->container->streams[packet->stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
{
    /* For audio the rate is the sample rate, and one packet carries frame_size samples. */
    fps.den = 1;
    fps.num = input->container->streams[packet->stream_index]->codecpar->sample_rate;
    frames_per_packet = input->container->streams[packet->stream_index]->codecpar->frame_size;  //For the audio there are 1024 (or 960) frames per packet https://stackoverflow.com/questions/23216103/about-definition-for-terms-of-audio-codec
}

/* Duration is constant per stream: (time_base.den / rate), truncated to an integer, times frames per packet. */
frame->pkt_duration = (int64_t)(av_q2d(av_div_q((AVRational) { time_base.den, 1 }, fps))) * (int64_t)frames_per_packet;
/* pts advances by one duration for every frame already sent on this stream. */
frame->pts = (int64_t)(frame->pkt_duration * output->frames[packet->stream_index]);
frame->pkt_dts = frame->pts;

完整的更新代码:

#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>
#include <libavutil/avutil.h>

/* Bundles one media container with per-stream state; used for both the inputs and the output. */
typedef struct file
{
    AVFormatContext* container; // the opened container; its streams[] are indexed by stream index
    AVCodecContext** codec;     // one codec context per stream, indexed by stream index
    int* frames;                // per-stream count of frames sent to the encoder (drives frame pts)
      
    // Per-stream count of packets written by encode_frame(); same principle
    // as `frames`, but used to derive the packet pts/dts.
    int* packets;
} file;

/* Thin wrapper around an output `file`; it is declared here but not used anywhere in this listing. */
typedef struct EncoderContext
{
    file* encoder;

} EncoderContext;

/* Forward declarations; every function below returns 0 on success and 1 on failure unless noted. */

/* Allocates an output `file` with room for `streams` codec contexts; NULL on failure. */
file* create_output(int streams, const char* filename);
/* Builds the output, creates one encoder per audio/video stream of `input`, and writes the header; NULL on failure. */
file* start_output_from_file(const char* path, file* input, const char* video_encoder, const char* audio_encoder);
int create_video_encoder(AVCodecContext** cod_ctx, AVFormatContext* container, const char* encoder, int width, int height,
    int pix_fmt, AVRational sample_aspect_ratio, AVRational frame_rate, int bit_rate, int buffer_size);
int create_audio_encoder(AVCodecContext** cod_ctx, AVFormatContext* container, const char* encoder,
    int channels, int sample_rate, int bit_rate);
/* Returns 0 when a frame was decoded, 1 on error, -1 when the file ended. */
int decode_frame(file* decoder, AVFrame* frame, AVPacket* packet);
int open_media(file* video, const char input_path[], const char* video_codec, const char* audio_codec);
/* Declared only; no definition appears in this listing. */
void save_gray_frame(unsigned char* buf, int width, int height);
int free_file(file* f);
/* Passing a NULL frame flushes the encoder of stream `index`. */
int encode_frame(file* encoder, AVFrame* input_frame, int index);
int stream_clip(file* input, file* output);

/*
 * Appends "in_short.mp4" to itself (1 + inputs_len copies in total) and
 * writes the re-encoded result to "output.mp4".
 *
 * Returns 0 on success and 1 if an input or the output could not be set up,
 * or if any clip failed to transcode.
 */
int main()
{
    int inputs_len = 2;
    int status = 0;

    // The first input is validated BEFORE it is used to configure the output.
    file* input1 = calloc(1, sizeof(file));
    if (!input1 || open_media(input1, "in_short.mp4", "h264_cuvid", NULL) != 0)
    {
        printf("Failed opening input 1");
        return 1;
    }

    file* output = start_output_from_file("output.mp4", input1, "h264_nvenc", NULL);
    if (!output)
    {
        printf("Failed opening output");
        free_file(input1);
        return 1;
    }

    if (stream_clip(input1, output) != 0)
        status = 1;
    free_file(input1);

    // Append the remaining copies; stop at the first failure but still
    // finalize whatever has been written so far.
    for (int i = 0; i < inputs_len && status == 0; i++)
    {
        file* input = calloc(1, sizeof(file));
        if (!input || open_media(input, "in_short.mp4", "h264_cuvid", NULL) != 0)
        {
            // free_file() is not called here: the file may be only partially initialised.
            printf("Failed opening input %i\n", i + 2);
            status = 1;
            break;
        }

        if (stream_clip(input, output) != 0)
            status = 1;
        free_file(input);
    }

    // Flush every encoder (a NULL frame drains it) instead of assuming
    // that exactly streams 0 and 1 exist.
    for (unsigned int i = 0; i < output->container->nb_streams; i++)
    {
        if (encode_frame(output, NULL, (int)i) != 0)
            status = 1;
    }

    if (av_write_trailer(output->container) != 0)
        status = 1;

    free_file(output);

    return status;
}

/*
 * Decodes every frame of `input`, stamps it with pts/dts/duration computed
 * from the per-stream frame counter in `output->frames`, and encodes it into
 * the output stream with the same index.
 *
 * Timestamps are generated rather than copied from the input, so clips
 * appended one after another continue where the previous clip stopped.
 *
 * Returns 0 when the input ended, 1 on allocation, decode or encode failure.
 */
int stream_clip(file* input, file* output)
{
    AVPacket* packet = av_packet_alloc();
    AVFrame* frame = av_frame_alloc();
    int res;

    if (!packet || !frame)
    {
        printf("Failed allocating packet or frame\n");
        av_frame_free(&frame);
        av_packet_free(&packet);
        return 1;
    }

    while (1)
    {
        res = decode_frame(input, frame, packet);

        if (res == 1)
        {
            printf("Error decoding a frame\n");
            av_frame_free(&frame);
            av_packet_free(&packet);

            return 1;
        }
        else if (res == 0)
        {
            int stream_index = packet->stream_index;
            AVRational fps = output->codec[stream_index]->framerate;
            int frames_per_packet = 1;
            AVRational time_base = output->container->streams[stream_index]->time_base;

            if (input->container->streams[stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
            {
                // For audio the rate is the sample rate and one packet carries
                // frame_size samples (1024 or 960, see
                // https://stackoverflow.com/questions/23216103/about-definition-for-terms-of-audio-codec).
                fps.den = 1;
                fps.num = input->container->streams[stream_index]->codecpar->sample_rate;
                frames_per_packet = input->container->streams[stream_index]->codecpar->frame_size;
            }

            // The duration is the same for every frame of a stream; it must
            // not be set to the running pts.
            frame->pkt_duration = (int64_t)(av_q2d(av_div_q((AVRational) { time_base.den, 1 }, fps))) * (int64_t)frames_per_packet;

            // pts advances by one duration per frame already sent on this stream.
            frame->pts = (int64_t)(frame->pkt_duration * output->frames[stream_index]);

            frame->pkt_dts = frame->pts;

            printf("%i FRAME %i PTS %i\n", (int)stream_index, (int)output->frames[stream_index], (int)frame->pts);

            output->frames[stream_index]++;

            res = encode_frame(output, frame, stream_index);
            if (res == 1)
            {
                av_frame_free(&frame);
                // The packet is released on this path as well.
                av_packet_free(&packet);
                printf("Failde encoding frame\n");
                return 1;
            }
            av_frame_unref(frame);
        }

        else if (res == -1)
        {
            printf("\nfile \"%s\" ended\n", input->container->url);
            break;
        }
    }

    av_frame_free(&frame);

    // NOTE(review): kept from the original. With a NULL frame this call only
    // reads from the container again; it does not drain frames still buffered
    // in the decoders - confirm whether a real flush is wanted here.
    decode_frame(input, NULL, packet);

    av_packet_free(&packet);

    return 0;
}

/*
 * Sends `input_frame` to the encoder of stream `index` and writes every
 * packet the encoder produces to the output container.
 *
 * encoder:     output file holding the codec contexts and packet counters.
 * input_frame: frame to encode, or NULL to flush the encoder.
 * index:       stream index; streams without an encoder are skipped.
 *
 * Each packet gets pts/dts/duration generated from the per-stream packet
 * counter `encoder->packets[index]`.
 *
 * Returns 0 on success (including "nothing to do"), 1 on failure.
 */
int encode_frame(file* encoder, AVFrame* input_frame, int index)
{
    AVCodecContext* codec = encoder->codec[index];

    // Checked before the packet is allocated so this early exit leaks nothing.
    if (!codec)
        return 0;

    AVPacket* output_packet = av_packet_alloc();
    if (!output_packet)
    {
        printf("ENCODER: Failed mallocing output_package");
        return 1;
    }

    int result = 0;
    int response = avcodec_send_frame(codec, input_frame);

    while (response >= 0)
    {
        // Drop whatever the previous iteration left in the packet.
        av_packet_unref(output_packet);

        response = avcodec_receive_packet(codec, output_packet);

        if (response == AVERROR(EAGAIN) || response == AVERROR_EOF)
        {
            // The encoder needs more input, or is fully drained.
            break;
        }
        else if (response < 0)
        {
            printf("ENCODER: Error receiving packet");
            result = 1;
            break;
        }

        output_packet->stream_index = index;

        AVRational fps = codec->framerate;
        int frames_per_packet = 1;

        if (encoder->container->streams[index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
        {
            // For audio the rate is the sample rate and one packet carries
            // frame_size samples (1024 or 960, see
            // https://stackoverflow.com/questions/23216103/about-definition-for-terms-of-audio-codec).
            fps.den = 1;
            fps.num = encoder->container->streams[index]->codecpar->sample_rate;
            frames_per_packet = encoder->container->streams[index]->codecpar->frame_size;
        }

        AVRational time_base = encoder->container->streams[index]->time_base;

        // Constant duration per packet; pts = duration * packets written so far.
        output_packet->duration = (int64_t)(av_q2d(av_div_q((AVRational) { time_base.den, 1 }, fps))) * (int64_t)frames_per_packet;
        output_packet->pts = (int64_t)(output_packet->duration * encoder->packets[index]);
        output_packet->dts = output_packet->pts;

        encoder->packets[index]++;  //Count packets

        response = av_interleaved_write_frame(encoder->container, output_packet);

        if (response != 0)
        {
            printf("ENCODER:failed writing frame");
            result = 1;
            break;
        }
    }

    // Single exit: the packet is released on success and on every error path.
    av_packet_free(&output_packet);

    return result;
}

/*
 * Releases a `file`: every codec context, the container, the per-stream
 * counters and the struct itself. Safe to call with NULL or with a file
 * whose container or codec array was never set up.
 *
 * Always returns 0.
 */
int free_file(file* f)
{
    if (!f)
        return 0;

    if (f->container)
    {
        if (f->codec)
        {
            for (unsigned int i = 0; i < f->container->nb_streams; i++)
            {
                if (f->codec[i])
                    avcodec_free_context(&f->codec[i]);
            }
        }

        avformat_close_input(&f->container);
    }

    // The f->codec array itself is deliberately left alone: open_media()
    // allocates it with calloc() while create_output() uses av_calloc(), so
    // no single deallocator is correct for both.

    // free(NULL) is a no-op, so no guards are needed.
    free(f->frames);
    free(f->packets);

    free(f);

    return 0;
}

/*
 * Opens `input_path` for reading and creates an opened decoder for every
 * stream, stored in `video->codec[stream index]`.
 *
 * video_codec / audio_codec: decoder name to force for video / audio
 * streams, or NULL to pick the default decoder for the stream's codec id.
 * Streams of any other type always use the default decoder.
 *
 * Returns 0 on success, 1 on failure.
 */
int open_media(file* video, const char input_path[], const char* video_codec, const char* audio_codec)
{
    video->container = avformat_alloc_context();

    if (!video->container)
    {
        printf("Failed to alloc memory to the container of the input file");
        return 1;
    }
    if (avformat_open_input(&video->container, input_path, NULL, NULL) != 0)
    {
        printf("Failed to open input file");
        return 1;
    }
    if (avformat_find_stream_info(video->container, NULL) < 0)
    {
        printf("Failed to open read stream info");
        return 1;
    }

    video->codec = calloc(video->container->nb_streams, sizeof(AVCodecContext*));
    if (!video->codec)
    {
        printf("failed to alloc memory for codec context");
        return 1;
    }

    for (unsigned int i = 0; i < video->container->nb_streams; i++)
    {
        const char* curr_codec = NULL;

        AVStream* stream = video->container->streams[i];
        const AVCodec* dec;
        AVCodecContext* codec_ctx;

        if (AVMEDIA_TYPE_VIDEO == stream->codecpar->codec_type)
        {
            curr_codec = video_codec;
        }
        else if (AVMEDIA_TYPE_AUDIO == stream->codecpar->codec_type)
        {
            curr_codec = audio_codec;
        }

        // Look up the decoder requested for THIS stream type; the original
        // always passed video_codec, so audio_codec was never honoured.
        if (curr_codec == NULL)
            dec = avcodec_find_decoder(stream->codecpar->codec_id);
        else
            dec = avcodec_find_decoder_by_name(curr_codec);

        if (!dec)
        {
            printf("failed to find the codec");
            return 1;
        }

        codec_ctx = avcodec_alloc_context3(dec);
        if (!codec_ctx)
        {
            printf("failed to alloc memory for codec context");
            return 1;
        }

        if (avcodec_parameters_to_context(codec_ctx, stream->codecpar) < 0)
        {
            printf("failed to fill codec context");
            // Not yet stored in video->codec, so it must be released here.
            avcodec_free_context(&codec_ctx);
            return 1;
        }

        if (avcodec_open2(codec_ctx, dec, NULL) < 0)
        {
            printf("failed to open codec");
            avcodec_free_context(&codec_ctx);
            return 1;
        }

        video->codec[i] = codec_ctx;
    }
    return 0;
}

/*
    Reads packets from the container until one of them yields a decoded
    frame. After a successful call `packet` still holds the packet that was
    read last, so packet->stream_index tells which stream the frame is from.

    returns:
    1 if error
    0 if success (a frame is available in `frame`)
    -1 if file ended (no more packets could be read)
*/
int decode_frame(file* decoder, AVFrame* frame, AVPacket* packet)
{
    for (;;)
    {
        // Release the previous packet before reading the next one.
        av_packet_unref(packet);
        if (av_read_frame(decoder->container, packet) < 0)
            return -1;

        AVCodecContext* ctx = decoder->codec[packet->stream_index];

        if (avcodec_send_packet(ctx, packet) < 0)
        {
            printf("Error while sending packet to decoder");
            return 1;
        }

        int status = avcodec_receive_frame(ctx, frame);
        if (status >= 0)
            return 0;

        // EAGAIN / EOF: no frame from this packet, go read another one.
        if (status != AVERROR(EAGAIN) && status != AVERROR_EOF)
        {
            printf("Error while receiving frame from decoder");
            return 1;
        }
    }
}
/*
 * Adds a new stream to `container` and opens the audio encoder named
 * `encoder` for it.
 *
 * cod_ctx:   receives the opened codec context in cod_ctx[0]; it is reset to
 *            NULL when the function fails after allocating it.
 * channels, sample_rate, bit_rate: copied into the codec context; the channel
 *            layout is the default layout for `channels`, and the sample
 *            format is the first one the encoder lists.
 *
 * Returns 0 on success, 1 on failure.
 */
int create_audio_encoder(AVCodecContext** cod_ctx, AVFormatContext* container, const char* encoder,
    int channels, int sample_rate, int bit_rate)
{
    AVStream* stream = avformat_new_stream(container, NULL);
    if (!stream)
    {
        printf("CREATE AUDIO ENCODER: Failed allocating memory for stream");
        return 1;
    }
    const AVCodec* enc = avcodec_find_encoder_by_name(encoder);
    if (!enc)
    {
        printf("CREATE AUDIO ENCODER: Failed searching encoder");

        return 1;
    }

    cod_ctx[0] = avcodec_alloc_context3(enc);

    if (!cod_ctx[0])
    {
        printf("CREATE AUDIO ENCODER: Failed allocation codec context");
        return 1;
    }

    cod_ctx[0]->channels = channels;
    cod_ctx[0]->channel_layout = av_get_default_channel_layout(channels);
    cod_ctx[0]->sample_rate = sample_rate;
    cod_ctx[0]->sample_fmt = *enc->sample_fmts;
    cod_ctx[0]->bit_rate = bit_rate;
    cod_ctx[0]->time_base = (AVRational){ 1, sample_rate }; // 1/48000

    // Containers that want global headers need the flag on the CODEC
    // context, and it has to be set before the encoder is opened.
    if (container->oformat && (container->oformat->flags & AVFMT_GLOBALHEADER))
        cod_ctx[0]->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;

    int res = 0;

    res = avcodec_open2(cod_ctx[0], enc, NULL);
    if (res < 0)
    {
        printf("CREATE AUDIO ENCODER: couldn't open codec");
        // Do not hand a half-configured context back to the caller.
        avcodec_free_context(&cod_ctx[0]);
        return 1;
    }

    res = avcodec_parameters_from_context(stream->codecpar, cod_ctx[0]);

    if (res < 0)
    {
        printf("CREATE AUDIO ENCODER: failed setting codec parameters from context");
        avcodec_free_context(&cod_ctx[0]);
        return 1;
    }

    return 0;
}

/*
 * Adds a new stream to `container` and opens the video encoder named
 * `encoder` for it, using the "fast" preset.
 *
 * cod_ctx: receives the opened codec context in cod_ctx[0]; it is reset to
 *          NULL when the function fails after allocating it.
 * The remaining parameters are copied into the codec context. The codec and
 * stream time base are fixed at 1/60000 regardless of `frame_rate`.
 *
 * Returns 0 on success, 1 on failure.
 */
int create_video_encoder(AVCodecContext** cod_ctx, AVFormatContext* container, const char* encoder, int width, int height,
    int pix_fmt, AVRational sample_aspect_ratio, AVRational frame_rate, int bit_rate, int buffer_size)
{
    AVStream* stream = avformat_new_stream(container, NULL);
    if (!stream)
    {
        printf("CREATE VIDEO ENCODER: Failed allocating memory for stream");
        return 1;
    }
    const AVCodec* enc = avcodec_find_encoder_by_name(encoder);
    if (!enc)
    {
        printf("CREATE VIDEO ENCODER: Failed searching encoder");

        return 1;
    }

    cod_ctx[0] = avcodec_alloc_context3(enc);

    if (!cod_ctx[0])
    {
        printf("CREATE VIDEO ENCODER: Failed allocation codec context");
        return 1;
    }

    cod_ctx[0]->height = height;
    cod_ctx[0]->width = width;
    cod_ctx[0]->pix_fmt = pix_fmt;

    cod_ctx[0]->sample_aspect_ratio = sample_aspect_ratio;

    // A time base of 1/frame_rate is too coarse for audio synchronisation,
    // so a finer fixed resolution is used instead.
    cod_ctx[0]->time_base = av_make_q(1, 60000);

    cod_ctx[0]->framerate = frame_rate;
    cod_ctx[0]->bit_rate = bit_rate;
    cod_ctx[0]->rc_buffer_size = buffer_size;
    // NOTE(review): min/max RATE are taken from the buffer SIZE; kept as in
    // the original, but presumably bit_rate was intended - confirm.
    cod_ctx[0]->rc_max_rate = buffer_size;
    cod_ctx[0]->rc_min_rate = buffer_size;

    stream->time_base = cod_ctx[0]->time_base;

    // Containers that want global headers need the flag on the CODEC
    // context, and it has to be set before the encoder is opened.
    if (container->oformat && (container->oformat->flags & AVFMT_GLOBALHEADER))
        cod_ctx[0]->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;

    int res = 0;

    res = av_opt_set(cod_ctx[0]->priv_data, "preset", "fast", 0);

    if (res != 0)
    {
        printf("CREATE VIDEO ENCODER: Failed opt set");
        // Do not hand a half-configured context back to the caller.
        avcodec_free_context(&cod_ctx[0]);
        return 1;
    }

    res = avcodec_open2(cod_ctx[0], enc, NULL);
    if (res < 0)
    {
        printf("CREATE VIDEO ENCODER: couldn't open codec");
        avcodec_free_context(&cod_ctx[0]);
        return 1;
    }

    res = avcodec_parameters_from_context(stream->codecpar, cod_ctx[0]);

    if (res < 0)
    {
        printf("CREATE VIDEO ENCODER: failed setting codec parameters from context");
        avcodec_free_context(&cod_ctx[0]);
        return 1;
    }

    return 0;
}

/*
 * Creates the output file at `path`, mirrors every audio and video stream of
 * `input` with a matching encoder, opens the file and writes the header.
 *
 * video_encoder / audio_encoder: encoder name to use, or NULL to reuse the
 * name of the codec the corresponding input stream was decoded with.
 * Streams of any other type get no encoder (their codec slot stays NULL).
 *
 * Returns the ready-to-use output, or NULL on failure.
 */
file* start_output_from_file(const char* path, file* input, const char* video_encoder, const char* audio_encoder)
{
    file* output = create_output(input->container->nb_streams, path);
    if (!output)
    {
        return NULL;
    }

    // Per-stream frame / packet counters, zero-initialised.
    output->frames = calloc(input->container->nb_streams, sizeof(int));
    output->packets = calloc(input->container->nb_streams, sizeof(int));
    if (!output->frames || !output->packets)
    {
        printf("Failed allocating ram for counters\n");
        return NULL;
    }

    for (int stream = 0; stream < (int)input->container->nb_streams; stream++)
    {
        AVCodecContext* codec_ctx = input->codec[stream];
        // Reset for every stream so that an unhandled stream type is never
        // judged by an uninitialised or stale result.
        int res = 0;

        switch (codec_ctx->codec_type)
        {
        case AVMEDIA_TYPE_AUDIO:
            if (audio_encoder == NULL)
            {
                audio_encoder = codec_ctx->codec_descriptor->name;
            }
            res = create_audio_encoder(&output->codec[stream], output->container, audio_encoder, codec_ctx->channels, codec_ctx->sample_rate, (int)codec_ctx->bit_rate);

            break;

        case AVMEDIA_TYPE_VIDEO:
        {
            if (video_encoder == NULL)
            {
                video_encoder = codec_ctx->codec_descriptor->name;
            }
            AVRational framerate = av_guess_frame_rate(input->container, input->container->streams[stream], NULL);
            res = create_video_encoder(&output->codec[stream], output->container, video_encoder, codec_ctx->width, codec_ctx->height,
                codec_ctx->sw_pix_fmt, (AVRational) { 1, 1 }, framerate, (int)codec_ctx->bit_rate, codec_ctx->rc_buffer_size);
            break;
        }

        default:
            // No encoder is created for other stream types.
            break;
        }
        if (res != 0)
        {
            printf("Failed opening encoder stream number %i \n", stream);
            return NULL;
        }
    }

    // The original OR-ed AV_CODEC_FLAG_GLOBAL_HEADER into the FORMAT
    // context's flags here. That is a codec flag: it belongs on each codec
    // context and must be set before avcodec_open2(), so doing it at this
    // point had no useful effect and the statement has been dropped.

    if (!(output->container->oformat->flags & AVFMT_NOFILE))
    {
        if (avio_open(&output->container->pb, path, AVIO_FLAG_WRITE) < 0)
        {
            printf("could not open the output file");
            return NULL;
        }
    }

    AVDictionary* muxer_opts = NULL;

    if (avformat_write_header(output->container, &muxer_opts) < 0)
    {
        printf("an error occurred when opening output file");
        return NULL;
    }

    return output;
}

/*
 * Allocates an output `file`: a muxer context whose format is guessed from
 * `filename`, plus an array of `streams` codec-context slots, all NULL.
 * The frame and packet counters are left unset.
 *
 * Returns the new file, or NULL on failure; nothing is leaked on failure.
 */
file* create_output(int streams, const char* filename)
{
    file* output = calloc(1, sizeof(file));
    if (!output)
    {
        return NULL;
    }

    if (avformat_alloc_output_context2(&output->container, NULL, NULL, filename) < 0)
    {
        printf("Failed opening output\n");
        free(output);
        return NULL;
    }

    // av_calloc() zero-fills, so every slot already starts out as NULL.
    output->codec = av_calloc(streams, sizeof(AVCodecContext*));

    if (!output->codec)
    {
        printf("Failed allocating ram for codec\n");
        avformat_free_context(output->container);
        free(output);
        return NULL;
    }

    return output;
}

关于c - 使用ffmpeg c连接视频和音频时如何计算pts和dts，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/70458344/

c - 使用ffmpeg c连接视频和音频时如何计算pts和dts

上一篇：reactjs - 原生应用中的 ffmpeg、ffmpeg wasm 和 ffmpeg 有什么区别？

下一篇：ffmpeg - 如何将 14.939948fps 等 MP4 帧速率转换为 15fps