From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 26 Nov 2025 19:15:55 +0000 (+0100)
Subject: avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures
X-Git-Url: http://git.ffmpeg.org/gitweb/ffmpeg.git/commitdiff_plain/HEAD

avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures

ff_h264_luma_dc_dequant_idct_sse2() does not pass checkasm for certain
seeds, because the input to packssdw no longer fits into an int16_t,
leading to saturation, where the C code just truncates. I don't know
whether the spec contains provisions that ensure that valid input
must not exceed 16 bits or whether such inputs (even if invalid)
can be triggered by the actual code and not only the test.

This commit adapts the behavior of the function to the C reference code
to fix the test. packssdw is avoided, instead the lower words are
directly transferred to GPRs to be written out. This has unfortunately
led to a slight performance regression here (14.5 vs 15.1 cycles).

Fixes issue #20835.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index d35d583ce7..47e4116f42 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -592,36 +592,58 @@ RET
     psrad       m1, %1
     psrad       m2, %1
     psrad       m3, %1
-    packssdw    m0, m1
-    packssdw    m2, m3
 %endmacro
 
-%macro STORE_WORDS 9
-    movd  t0d, %1
-    psrldq  %1, 4
-    movd  t1d, %1
-    psrldq  %1, 4
-    mov [t2+%2*32], t0w
-    mov [t2+%4*32], t1w
-    shr   t0d, 16
-    shr   t1d, 16
+%macro STORE_WORDS 10
+%if ARCH_X86_64
+    movq        t0, %1
+    movq        t1, %2
+    psrldq      %1, 8
+    psrldq      %2, 8
     mov [t2+%3*32], t0w
-    mov [t2+%5*32], t1w
-    movd  t0d, %1
-    psrldq  %1, 4
-    movd  t1d, %1
+    mov [t2+%7*32], t1w
+    shr         t0, 32
+    shr         t1, 32
+    mov [t2+%4*32], t0w
+    mov [t2+%8*32], t1w
+    movq        t0, %1
+    movq        t1, %2
+    mov [t2+%5*32], t0w
+    mov [t2+%9*32], t1w
+    shr         t0, 32
+    shr         t1, 32
     mov [t2+%6*32], t0w
+    mov [t2+%10*32], t1w
+%else
+    movd       t0d, %1
+    movd       t1d, %2
+    psrldq      %1, 4
+    psrldq      %2, 4
+    mov [t2+%3*32], t0w
+    mov [t2+%7*32], t1w
+    movd       t0d, %1
+    movd       t1d, %2
+    psrldq      %1, 4
+    psrldq      %2, 4
+    mov [t2+%4*32], t0w
     mov [t2+%8*32], t1w
-    shr   t0d, 16
-    shr   t1d, 16
-    mov [t2+%7*32], t0w
+    movd       t0d, %1
+    movd       t1d, %2
+    psrldq      %1, 4
+    psrldq      %2, 4
+    mov [t2+%5*32], t0w
     mov [t2+%9*32], t1w
+    movd       t0d, %1
+    movd       t1d, %2
+    mov [t2+%6*32], t0w
+    mov [t2+%10*32], t1w
+%endif
 %endmacro
 
 %macro DEQUANT_STORE 1
     DEQUANT     %1
-    STORE_WORDS m0,  0,  1,  4,  5,  2,  3,  6,  7
-    STORE_WORDS m2,  8,  9, 12, 13, 10, 11, 14, 15
+    STORE_WORDS m0, m1,  0,  1,  4,  5,  2,  3,  6,  7
+    STORE_WORDS m2, m3,  8,  9, 12, 13, 10, 11, 14, 15
 %endmacro
 
 INIT_XMM sse2