From: Andreas Rheinhardt Date: Wed, 26 Nov 2025 19:15:55 +0000 (+0100) Subject: avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures X-Git-Url: http://git.ffmpeg.org/gitweb/ffmpeg.git/commitdiff_plain/HEAD avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures ff_h264_luma_dc_dequant_idct_sse2() does not pass checkasm for certain seeds, because the input to packssdw no longer fits into an int16_t, leading to saturation, whereas the C code just truncates. I don't know whether the spec contains provisions that ensure that valid input must not exceed 16 bits or whether such inputs (even if invalid) can be triggered by the actual code and not only the test. This commit adapts the behavior of the function to the C reference code to fix the test. packssdw is avoided, instead the lower words are directly transferred to GPRs to be written out. This has unfortunately led to a slight performance regression here (14.5 vs 15.1 cycles). Fixes issue #20835. 
Signed-off-by: Andreas Rheinhardt --- diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index d35d583ce7..47e4116f42 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -592,36 +592,58 @@ RET psrad m1, %1 psrad m2, %1 psrad m3, %1 - packssdw m0, m1 - packssdw m2, m3 %endmacro -%macro STORE_WORDS 9 - movd t0d, %1 - psrldq %1, 4 - movd t1d, %1 - psrldq %1, 4 - mov [t2+%2*32], t0w - mov [t2+%4*32], t1w - shr t0d, 16 - shr t1d, 16 +%macro STORE_WORDS 10 +%if ARCH_X86_64 + movq t0, %1 + movq t1, %2 + psrldq %1, 8 + psrldq %2, 8 mov [t2+%3*32], t0w - mov [t2+%5*32], t1w - movd t0d, %1 - psrldq %1, 4 - movd t1d, %1 + mov [t2+%7*32], t1w + shr t0, 32 + shr t1, 32 + mov [t2+%4*32], t0w + mov [t2+%8*32], t1w + movq t0, %1 + movq t1, %2 + mov [t2+%5*32], t0w + mov [t2+%9*32], t1w + shr t0, 32 + shr t1, 32 mov [t2+%6*32], t0w + mov [t2+%10*32], t1w +%else + movd t0d, %1 + movd t1d, %2 + psrldq %1, 4 + psrldq %2, 4 + mov [t2+%3*32], t0w + mov [t2+%7*32], t1w + movd t0d, %1 + movd t1d, %2 + psrldq %1, 4 + psrldq %2, 4 + mov [t2+%4*32], t0w mov [t2+%8*32], t1w - shr t0d, 16 - shr t1d, 16 - mov [t2+%7*32], t0w + movd t0d, %1 + movd t1d, %2 + psrldq %1, 4 + psrldq %2, 4 + mov [t2+%5*32], t0w mov [t2+%9*32], t1w + movd t0d, %1 + movd t1d, %2 + mov [t2+%6*32], t0w + mov [t2+%10*32], t1w +%endif %endmacro %macro DEQUANT_STORE 1 DEQUANT %1 - STORE_WORDS m0, 0, 1, 4, 5, 2, 3, 6, 7 - STORE_WORDS m2, 8, 9, 12, 13, 10, 11, 14, 15 + STORE_WORDS m0, m1, 0, 1, 4, 5, 2, 3, 6, 7 + STORE_WORDS m2, m3, 8, 9, 12, 13, 10, 11, 14, 15 %endmacro INIT_XMM sse2