;******************************************************************************
;* Pixel utilities SIMD
;*
;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (C) 2014 Clément Bœsch <u pkh me>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text
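
;-------------------------------------------------------------------------------
; These SAD kernels are reached through the public pixelutils API
; (libavutil/pixelutils.h), which returns a function pointer matching the
; requested block size and alignment and the host CPU flags.  A minimal C-side
; usage sketch (block pointers and strides are placeholders; which of the
; kernels below actually gets picked depends on the CPU):
;
;     #include "libavutil/pixelutils.h"
;
;     // 16x16 blocks: w_bits = h_bits = 4, no alignment guarantees
;     av_pixelutils_sad_fn sad = av_pixelutils_get_sad_fn(4, 4, 0, NULL);
;     int cost = sad(block_a, stride_a, block_b, stride_b);
;-------------------------------------------------------------------------------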

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
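; Each iteration loads two 8-byte rows of src1; psadbw against the matching
; rows of src2 sums the eight absolute byte differences into the low word of
; an MMX register.  Four unrolled iterations cover all eight rows, the total is
; accumulated in m2 and read back with movd.  emms is needed before returning
; because the MMX registers alias the x87 register stack.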
INIT_MMX mmxext
cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
    pxor            m2, m2
%rep 4
    mova            m0, [src1q]
    mova            m1, [src1q + stride1q]
    psadbw          m0, [src2q]
    psadbw          m1, [src2q + stride2q]
    paddw           m2, m0
    paddw           m2, m1
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
%endrep
    movd            eax, m2
    emms
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
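; With 16-byte rows, psadbw produces two partial sums, one in the low word of
; each 64-bit half of the XMM register.  The loop accumulates them in m4;
; movhlps then folds the high quadword onto the low one before the result is
; extracted.  paddw is sufficient here because a 16x16 SAD cannot exceed
; 16*16*255 = 65280, which still fits in 16 bits.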
INIT_XMM sse2
cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
    movu            m4, [src1q]
    movu            m2, [src2q]
    movu            m1, [src1q + stride1q]
    movu            m3, [src2q + stride2q]
    psadbw          m4, m2
    psadbw          m1, m3
    paddw           m4, m1
%rep 7
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
    movu            m0, [src1q]
    movu            m2, [src2q]
    movu            m1, [src1q + stride1q]
    movu            m3, [src2q + stride2q]
    psadbw          m0, m2
    psadbw          m1, m3
    paddw           m4, m0
    paddw           m4, m1
%endrep
    movhlps         m0, m4
    paddw           m4, m0
    movd            eax, m4
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
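; The %1 parameter is the suffix of the SSE2 move used for the src2 loads:
; "a" expands to mova (16-byte aligned) and "u" to movu, giving
; ff_pixelutils_sad_a_16x16_sse2 and ff_pixelutils_sad_u_16x16_sse2.  Note that
; src1 is always consumed as a memory operand of the non-VEX psadbw, so it must
; be 16-byte aligned in both flavours; only the src2 alignment requirement
; differs.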
%macro SAD_XMM_16x16 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
    mov%1           m2, [src2q]
    psadbw          m2, [src1q]
    mov%1           m1, [src2q + stride2q]
    psadbw          m1, [src1q + stride1q]
    paddw           m2, m1
%rep 7
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
    mov%1           m0, [src2q]
    psadbw          m0, [src1q]
    mov%1           m1, [src2q + stride2q]
    psadbw          m1, [src1q + stride1q]
    paddw           m2, m0
    paddw           m2, m1
%endrep
    movhlps         m0, m2
    paddw           m2, m0
    movd            eax, m2
    RET
%endmacro

SAD_XMM_16x16 a
SAD_XMM_16x16 u


%macro PROCESS_SAD_32x4_U 0
    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]
%endmacro

%macro PROCESS_SAD_32x4 1
    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]
%endmacro
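
; Both PROCESS_SAD_32x4 variants handle four rows of a 32-byte-wide block: each
; row takes two 16-byte loads and two psadbw ops.  r0/r1 and r2/r3 are the raw
; registers that cglobal maps to src1/stride1 and src2/stride2 in the callers
; below.  Accumulation uses paddd rather than paddw because a full 32x32 SAD
; can reach 32*32*255 = 261120, which no longer fits in 16 bits.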

;-----------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2
    pxor            m0, m0
    mov             r4d, 4
.loop:
    PROCESS_SAD_32x4_U
    PROCESS_SAD_32x4_U
    dec             r4d
    jnz             .loop

    movhlps         m1, m0
    paddd           m0, m1
    movd            eax, m0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
%macro SAD_XMM_32x32 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2
    pxor            m0, m0
    mov             r4d, 4
.loop:
    PROCESS_SAD_32x4 %1
    PROCESS_SAD_32x4 %1
    dec             r4d
    jnz             .loop

    movhlps         m1, m0
    paddd           m0, m1
    movd            eax, m0
    RET
%endmacro

SAD_XMM_32x32 a
SAD_XMM_32x32 u

%if HAVE_AVX2_EXTERNAL
;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
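; A YMM register holds a full 32-byte row, so no per-row split is needed.  Each
; loop iteration walks four rows: rows 0 and 1 are addressed with the strides
; directly, row 2 with 2*stride, and row 3 through r5/r6, which hold 3*stride1
; and 3*stride2 precomputed before the loop.  The final reduction folds the
; upper 128-bit lane with vextracti128, then pshufd with immediate 2 brings the
; remaining high-quadword partial sum down so a last paddd leaves the total in
; the low dword.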
INIT_YMM avx2
cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
    pxor            m0, m0
    mov             r4d, 32/4
    lea             r5, [stride1q * 3]
    lea             r6, [stride2q * 3]

.loop:
    movu            m1, [src1q]                 ; row 0 of pix0
    movu            m2, [src2q]                 ; row 0 of pix1
    movu            m3, [src1q + stride1q]      ; row 1 of pix0
    movu            m4, [src2q + stride2q]      ; row 1 of pix1

    psadbw          m1, m2
    psadbw          m3, m4
    paddd           m0, m1
    paddd           m0, m3

    movu            m1, [src1q + 2 * stride1q]  ; row 2 of pix0
    movu            m2, [src2q + 2 * stride2q]  ; row 2 of pix1
    movu            m3, [src1q + r5]            ; row 3 of pix0
    movu            m4, [src2q + r6]            ; row 3 of pix1

    psadbw          m1, m2
    psadbw          m3, m4
    paddd           m0, m1
    paddd           m0, m3

    lea             src2q, [src2q + 4 * stride2q]
    lea             src1q, [src1q + 4 * stride1q]

    dec             r4d
    jnz             .loop

    vextracti128    xm1, m0, 1
    paddd           xm0, xm1
    pshufd          xm1, xm0, 2
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
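; Same loop structure as above, but %1 selects mova (vmovdqa, 32-byte aligned)
; or movu (vmovdqu) for the src2 loads, while src1 is consumed directly as a
; psadbw memory operand.  With VEX encoding that memory operand carries no
; alignment requirement of its own, so only the chosen src2 move distinguishes
; the "a" and "u" flavours at the ISA level.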
%macro SAD_AVX2_32x32 1
INIT_YMM avx2
cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
    pxor            m0, m0
    mov             r4d, 32/4
    lea             r5, [stride1q * 3]
    lea             r6, [stride2q * 3]

.loop:
    mov%1           m1, [src2q]                 ; row 0 of pix1
    psadbw          m1, [src1q]
    mov%1           m2, [src2q + stride2q]      ; row 1 of pix1
    psadbw          m2, [src1q + stride1q]

    paddd           m0, m1
    paddd           m0, m2

    mov%1           m1, [src2q + 2 * stride2q]  ; row 2 of pix1
    psadbw          m1, [src1q + 2 * stride1q]
    mov%1           m2, [src2q + r6]            ; row 3 of pix1
    psadbw          m2, [src1q + r5]

    paddd           m0, m1
    paddd           m0, m2

    lea             src2q, [src2q + 4 * stride2q]
    lea             src1q, [src1q + 4 * stride1q]

    dec             r4d
    jnz             .loop

    vextracti128    xm1, m0, 1
    paddd           xm0, xm1
    pshufd          xm1, xm0, 2
    paddd           xm0, xm1
    movd            eax, xm0
    RET
%endmacro

SAD_AVX2_32x32 a
SAD_AVX2_32x32 u
%endif