ff_h264_luma_dc_dequant_idct_sse2() does not pass checkasm for certain
seeds, because the input to packssdw no longer fits into an int16_t,
leading to saturation, whereas the C code just truncates. I don't know
whether the spec contains provisions that ensure that valid input
must not exceed 16 bits or whether such inputs (even if invalid)
can be triggered by the actual code and not only the test.
This commit adapts the behavior of the function to the C reference code
to fix the test. packssdw is avoided, instead the lower words are
directly transferred to GPRs to be written out. This has unfortunately
led to a slight performance regression here (14.5 vs 15.1 cycles).
Fixes issue #20835.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
psrad m1, %1
psrad m2, %1
psrad m3, %1
psrad m1, %1
psrad m2, %1
psrad m3, %1
- packssdw m0, m1
- packssdw m2, m3
-%macro STORE_WORDS 9
- movd t0d, %1
- psrldq %1, 4
- movd t1d, %1
- psrldq %1, 4
- mov [t2+%2*32], t0w
- mov [t2+%4*32], t1w
- shr t0d, 16
- shr t1d, 16
+%macro STORE_WORDS 10
+%if ARCH_X86_64
+ movq t0, %1
+ movq t1, %2
+ psrldq %1, 8
+ psrldq %2, 8
- mov [t2+%5*32], t1w
- movd t0d, %1
- psrldq %1, 4
- movd t1d, %1
+ mov [t2+%7*32], t1w
+ shr t0, 32
+ shr t1, 32
+ mov [t2+%4*32], t0w
+ mov [t2+%8*32], t1w
+ movq t0, %1
+ movq t1, %2
+ mov [t2+%5*32], t0w
+ mov [t2+%9*32], t1w
+ shr t0, 32
+ shr t1, 32
+ mov [t2+%10*32], t1w
+%else
+ movd t0d, %1
+ movd t1d, %2
+ psrldq %1, 4
+ psrldq %2, 4
+ mov [t2+%3*32], t0w
+ mov [t2+%7*32], t1w
+ movd t0d, %1
+ movd t1d, %2
+ psrldq %1, 4
+ psrldq %2, 4
+ mov [t2+%4*32], t0w
- shr t0d, 16
- shr t1d, 16
- mov [t2+%7*32], t0w
+ movd t0d, %1
+ movd t1d, %2
+ psrldq %1, 4
+ psrldq %2, 4
+ mov [t2+%5*32], t0w
+ movd t0d, %1
+ movd t1d, %2
+ mov [t2+%6*32], t0w
+ mov [t2+%10*32], t1w
+%endif
%endmacro
%macro DEQUANT_STORE 1
DEQUANT %1
%endmacro
%macro DEQUANT_STORE 1
DEQUANT %1
- STORE_WORDS m0, 0, 1, 4, 5, 2, 3, 6, 7
- STORE_WORDS m2, 8, 9, 12, 13, 10, 11, 14, 15
+ STORE_WORDS m0, m1, 0, 1, 4, 5, 2, 3, 6, 7
+ STORE_WORDS m2, m3, 8, 9, 12, 13, 10, 11, 14, 15