avfilter/avfiltergraph: fix constant string comparison
[ffmpeg.git] / libavutil / x86 / float_dsp.asm
1 ;*****************************************************************************
2 ;* x86-optimized Float DSP functions
3 ;*
4 ;* Copyright 2006 Loren Merritt
5 ;*
6 ;* This file is part of FFmpeg.
7 ;*
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
22
23 %include "libavutil/x86/x86util.asm"
24
25 SECTION_RODATA 32
; Lane-index vector {7,6,5,4,3,2,1,0}: used as the vpermps control in the
; AVX2 vector_fmul_reverse path to reverse eight packed floats in one op.
26 pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
27
28 SECTION .text
29
30 ;-----------------------------------------------------------------------------
31 ; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
32 ;-----------------------------------------------------------------------------
; Elementwise product: dst[i] = src0[i] * src1[i].
; lenq is set to the byte offset of the LAST 64-byte group and counted down
; past zero, so every iteration handles 64 bytes (16 floats) regardless of
; vector width: %rep 32/mmsize issues two xmm pairs for SSE, one ymm pair
; for AVX. mova requires all three buffers mmsize-aligned; there is no
; scalar tail, so len must be a multiple of 16 floats
; (NOTE(review): inferred from the fixed 64-byte stride -- confirm callers).
33 %macro VECTOR_FMUL 0
34 cglobal vector_fmul, 4,4,2, dst, src0, src1, len
35     lea lenq, [lend*4 - 64]
36 ALIGN 16
37 .loop:
38 %assign a 0
39 %rep 32/mmsize
40     mova m0, [src0q + lenq + (a+0)*mmsize]
41     mova m1, [src0q + lenq + (a+1)*mmsize]
42     mulps m0, m0, [src1q + lenq + (a+0)*mmsize]
43     mulps m1, m1, [src1q + lenq + (a+1)*mmsize]
44     mova [dstq + lenq + (a+0)*mmsize], m0
45     mova [dstq + lenq + (a+1)*mmsize], m1
46 %assign a a+2
47 %endrep
48
; Walk backwards in 64-byte steps; jge keeps looping while a full group
; remains at a non-negative offset.
49     sub lenq, 64
50     jge .loop
51     RET
52 %endmacro
53
54 INIT_XMM sse
55 VECTOR_FMUL
56 %if HAVE_AVX_EXTERNAL
57 INIT_YMM avx
58 VECTOR_FMUL
59 %endif
60
61 ;-----------------------------------------------------------------------------
62 ; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
63 ;-----------------------------------------------------------------------------
; Double-precision elementwise product: dst[i] = src0[i] * src1[i].
; Four registers per iteration => mmsize*4 bytes per pass (8 doubles for
; SSE2, 16 for AVX), walking backwards from the end of the buffers.
; movaps (identical behavior to movapd) demands mmsize alignment; no tail
; loop, so len must be a multiple of mmsize*4/8 doubles
; (NOTE(review): inferred from the fixed stride -- confirm callers).
64 %macro VECTOR_DMUL 0
65 cglobal vector_dmul, 4,4,4, dst, src0, src1, len
; 32-bit destination: the write to lend zero-extends into lenq.
66     lea lend, [lenq*8 - mmsize*4]
67     ALIGN 16
68 .loop:
69     movaps m0, [src0q + lenq + 0*mmsize]
70     movaps m1, [src0q + lenq + 1*mmsize]
71     movaps m2, [src0q + lenq + 2*mmsize]
72     movaps m3, [src0q + lenq + 3*mmsize]
73     mulpd m0, m0, [src1q + lenq + 0*mmsize]
74     mulpd m1, m1, [src1q + lenq + 1*mmsize]
75     mulpd m2, m2, [src1q + lenq + 2*mmsize]
76     mulpd m3, m3, [src1q + lenq + 3*mmsize]
77     movaps [dstq + lenq + 0*mmsize], m0
78     movaps [dstq + lenq + 1*mmsize], m1
79     movaps [dstq + lenq + 2*mmsize], m2
80     movaps [dstq + lenq + 3*mmsize], m3
81
82     sub lenq, mmsize*4
83     jge .loop
84     RET
85 %endmacro
86
87 INIT_XMM sse2
88 VECTOR_DMUL
89 %if HAVE_AVX_EXTERNAL
90 INIT_YMM avx
91 VECTOR_DMUL
92 %endif
93
94 ;------------------------------------------------------------------------------
95 ; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
96 ;------------------------------------------------------------------------------
; Multiply-accumulate by a scalar: dst[i] += src[i] * mul.
; The scalar is broadcast into m0 first, then the loop consumes 64 bytes
; (16 floats) per pass: 4 xmm registers for SSE, 2 ymm registers for
; AVX/FMA3. Aligned (mova) loads/stores; no scalar tail.
97
98 %macro VECTOR_FMAC_SCALAR 0
; UNIX64 passes 'mul' in xmm0 already, so only 3 GPR args are declared;
; elsewhere it is a 4th named argument.
99 %if UNIX64
100 cglobal vector_fmac_scalar, 3,3,5, dst, src, len
101 %else
102 cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
103 %endif
104 %if ARCH_X86_32
; x86-32: load+broadcast the scalar straight from its stack slot.
105     VBROADCASTSS m0, mulm
106 %else
107 %if WIN64
; WIN64 delivers the 3rd arg in xmm2; rename it to m0.
108     SWAP 0, 2
109 %endif
; Splat the scalar across the low 128 bits, then mirror into the high
; ymm lane when building the AVX variant.
110     shufps xm0, xm0, 0
111 %if cpuflag(avx)
112     vinsertf128 m0, m0, xm0, 1
113 %endif
114 %endif
115     lea lenq, [lend*4-64]
116 .loop:
117 %if cpuflag(fma3)
; FMA3: fused dst = m0*src + dst, two ymm registers per iteration.
118     mova m1, [dstq+lenq]
119     mova m2, [dstq+lenq+1*mmsize]
120     fmaddps m1, m0, [srcq+lenq], m1
121     fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
122 %else ; cpuflag
; Separate mul+add; with xmm (mmsize < 32) use 4 registers to still
; cover 64 bytes per iteration.
123     mulps m1, m0, [srcq+lenq]
124     mulps m2, m0, [srcq+lenq+1*mmsize]
125 %if mmsize < 32
126     mulps m3, m0, [srcq+lenq+2*mmsize]
127     mulps m4, m0, [srcq+lenq+3*mmsize]
128 %endif ; mmsize
129     addps m1, m1, [dstq+lenq]
130     addps m2, m2, [dstq+lenq+1*mmsize]
131 %if mmsize < 32
132     addps m3, m3, [dstq+lenq+2*mmsize]
133     addps m4, m4, [dstq+lenq+3*mmsize]
134 %endif ; mmsize
135 %endif ; cpuflag
136     mova [dstq+lenq], m1
137     mova [dstq+lenq+1*mmsize], m2
138 %if mmsize < 32
139     mova [dstq+lenq+2*mmsize], m3
140     mova [dstq+lenq+3*mmsize], m4
141 %endif ; mmsize
142     sub lenq, 64
143     jge .loop
144     RET
145 %endmacro
146
147 INIT_XMM sse
148 VECTOR_FMAC_SCALAR
149 %if HAVE_AVX_EXTERNAL
150 INIT_YMM avx
151 VECTOR_FMAC_SCALAR
152 %endif
153 %if HAVE_FMA3_EXTERNAL
154 INIT_YMM fma3
155 VECTOR_FMAC_SCALAR
156 %endif
157
158 ;------------------------------------------------------------------------------
159 ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
160 ;------------------------------------------------------------------------------
; Scale by a scalar: dst[i] = src[i] * mul. SSE only (one xmm, 4 floats
; per iteration), backwards over the buffers with aligned accesses.
161
162 %macro VECTOR_FMUL_SCALAR 0
163 %if UNIX64
; UNIX64: 'mul' already arrives in xmm0.
164 cglobal vector_fmul_scalar, 3,3,2, dst, src, len
165 %else
166 cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
167 %endif
168 %if ARCH_X86_32
169     movss m0, mulm
170 %elif WIN64
; WIN64: 3rd arg is in xmm2; rename to m0.
171     SWAP 0, 2
172 %endif
; Splat the scalar to all 4 lanes.
173     shufps m0, m0, 0
174     lea lenq, [lend*4-mmsize]
175 .loop:
176     mova m1, [srcq+lenq]
177     mulps m1, m0
178     mova [dstq+lenq], m1
179     sub lenq, mmsize
180     jge .loop
181     RET
182 %endmacro
183
184 INIT_XMM sse
185 VECTOR_FMUL_SCALAR
186
187 ;------------------------------------------------------------------------------
188 ; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
189 ;                            int len)
190 ;------------------------------------------------------------------------------
; Double-precision multiply-accumulate: dst[i] += src[i] * mul.
; Scalar broadcast into m0, then mmsize*4 bytes per iteration (8 doubles
; for SSE2, 16 for AVX/FMA3), walking backwards with aligned accesses.
191
192 %macro VECTOR_DMAC_SCALAR 0
193 %if ARCH_X86_32
; x86-32: 'len' lives on the stack (lenaddr); the scalar is broadcast
; straight from memory.
194 cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
195     mov lenq, lenaddrm
196     VBROADCASTSD m0, mulm
197 %else
198 %if UNIX64
199 cglobal vector_dmac_scalar, 3,3,5, dst, src, len
200 %else
; WIN64: 3rd arg arrives in xmm2; rename to m0.
201 cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
202     SWAP 0, 2
203 %endif
; Duplicate the low double into both xmm lanes, then into the high ymm
; lane for the AVX build.
204     movlhps xm0, xm0
205 %if cpuflag(avx)
206     vinsertf128 m0, m0, xm0, 1
207 %endif
208 %endif
209     lea lenq, [lend*8-mmsize*4]
210 .loop:
211 %if cpuflag(fma3)
; FMA3: fused dst = m0*src + dst across 4 registers.
212     movaps m1, [dstq+lenq]
213     movaps m2, [dstq+lenq+1*mmsize]
214     movaps m3, [dstq+lenq+2*mmsize]
215     movaps m4, [dstq+lenq+3*mmsize]
216     fmaddpd m1, m0, [srcq+lenq], m1
217     fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2
218     fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3
219     fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4
220 %else ; cpuflag
221     mulpd m1, m0, [srcq+lenq]
222     mulpd m2, m0, [srcq+lenq+1*mmsize]
223     mulpd m3, m0, [srcq+lenq+2*mmsize]
224     mulpd m4, m0, [srcq+lenq+3*mmsize]
225     addpd m1, m1, [dstq+lenq]
226     addpd m2, m2, [dstq+lenq+1*mmsize]
227     addpd m3, m3, [dstq+lenq+2*mmsize]
228     addpd m4, m4, [dstq+lenq+3*mmsize]
229 %endif ; cpuflag
230     movaps [dstq+lenq], m1
231     movaps [dstq+lenq+1*mmsize], m2
232     movaps [dstq+lenq+2*mmsize], m3
233     movaps [dstq+lenq+3*mmsize], m4
234     sub lenq, mmsize*4
235     jge .loop
236     RET
237 %endmacro
238
239 INIT_XMM sse2
240 VECTOR_DMAC_SCALAR
241 %if HAVE_AVX_EXTERNAL
242 INIT_YMM avx
243 VECTOR_DMAC_SCALAR
244 %endif
245 %if HAVE_FMA3_EXTERNAL
246 INIT_YMM fma3
247 VECTOR_DMAC_SCALAR
248 %endif
249
250 ;------------------------------------------------------------------------------
251 ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
252 ;                            int len)
253 ;------------------------------------------------------------------------------
; Double-precision scaling: dst[i] = src[i] * mul.
; Same broadcast scheme as vector_dmac_scalar; 2*mmsize bytes per
; iteration, backwards, aligned accesses.
254
255 %macro VECTOR_DMUL_SCALAR 0
256 %if ARCH_X86_32
; x86-32: 'len' comes from its stack slot.
257 cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
258     mov lenq, lenaddrm
259 %elif UNIX64
260 cglobal vector_dmul_scalar, 3,3,3, dst, src, len
261 %else
262 cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
263 %endif
264 %if ARCH_X86_32
265     VBROADCASTSD m0, mulm
266 %else
267 %if WIN64
; WIN64: scalar arrives in xmm2; rename to m0.
268     SWAP 0, 2
269 %endif
; Duplicate low double across xmm, then into the high lane for AVX.
270     movlhps xm0, xm0
271 %if cpuflag(avx)
272     vinsertf128 ym0, ym0, xm0, 1
273 %endif
274 %endif
275     lea lenq, [lend*8-2*mmsize]
276 .loop:
277     mulpd m1, m0, [srcq+lenq ]
278     mulpd m2, m0, [srcq+lenq+mmsize]
279     movaps [dstq+lenq ], m1
280     movaps [dstq+lenq+mmsize], m2
281     sub lenq, 2*mmsize
282     jge .loop
283     RET
284 %endmacro
285
286 INIT_XMM sse2
287 VECTOR_DMUL_SCALAR
288 %if HAVE_AVX_EXTERNAL
289 INIT_YMM avx
290 VECTOR_DMUL_SCALAR
291 %endif
292
293 ;-----------------------------------------------------------------------------
294 ; vector_fmul_window(float *dst, const float *src0,
295 ;                    const float *src1, const float *win, int len);
296 ;-----------------------------------------------------------------------------
; Overlap-add windowing. dst, src0 and win are advanced by len bytes and
; indexed with a negative, rising lenq (front half), while len1q walks
; down from the end (back half); each iteration therefore produces one
; vector at each end of dst, and shufps 0x1b reverses the 4 floats of the
; back-half loads so front and back elements pair up.
; NOTE(review): the resulting math appears to match the C reference
; vector_fmul_window (dst[i] = s0[i]*w[len-i-1] - s1[len-i-1]*w[i] and the
; mirrored sum at dst[len-i-1]) -- confirm against libavutil/float_dsp.c.
297 INIT_XMM sse
298 cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
299     shl lend, 2
300     lea len1q, [lenq - mmsize]
301     add src0q, lenq
302     add dstq, lenq
303     add winq, lenq
304     neg lenq
305 .loop:
306     mova m0, [winq + lenq]
307     mova m4, [src0q + lenq]
308     mova m1, [winq + len1q]
309     mova m5, [src1q + len1q]
; Reverse the float order of the back-half window and src1 vectors.
310     shufps m1, m1, 0x1b
311     shufps m5, m5, 0x1b
312     mova m2, m0
313     mova m3, m1
314     mulps m2, m4
315     mulps m3, m5
316     mulps m1, m4
317     mulps m0, m5
318     addps m2, m3
319     subps m1, m0
; Back-half result must be stored in reversed order again.
320     shufps m2, m2, 0x1b
321     mova [dstq + lenq], m1
322     mova [dstq + len1q], m2
; The two cursors close in on each other; stop when they cross.
323     sub len1q, mmsize
324     add lenq, mmsize
325     jl .loop
326     RET
327
328 ;-----------------------------------------------------------------------------
329 ; vector_fmul_add(float *dst, const float *src0, const float *src1,
330 ;                 const float *src2, int len)
331 ;-----------------------------------------------------------------------------
; Fused multiply-add over arrays: dst[i] = src0[i]*src1[i] + src2[i].
; Two registers per iteration (2*mmsize bytes), backwards, aligned; the
; FMA3 build fuses the multiply and add into fmaddps.
332 %macro VECTOR_FMUL_ADD 0
333 cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
334     lea lenq, [lend*4 - 2*mmsize]
335 ALIGN 16
336 .loop:
337     mova m0, [src0q + lenq]
338     mova m1, [src0q + lenq + mmsize]
339 %if cpuflag(fma3)
340     mova m2, [src2q + lenq]
341     mova m3, [src2q + lenq + mmsize]
342     fmaddps m0, m0, [src1q + lenq], m2
343     fmaddps m1, m1, [src1q + lenq + mmsize], m3
344 %else
345     mulps m0, m0, [src1q + lenq]
346     mulps m1, m1, [src1q + lenq + mmsize]
347     addps m0, m0, [src2q + lenq]
348     addps m1, m1, [src2q + lenq + mmsize]
349 %endif
350     mova [dstq + lenq], m0
351     mova [dstq + lenq + mmsize], m1
352
353     sub lenq, 2*mmsize
354     jge .loop
355     RET
356 %endmacro
357
358 INIT_XMM sse
359 VECTOR_FMUL_ADD
360 %if HAVE_AVX_EXTERNAL
361 INIT_YMM avx
362 VECTOR_FMUL_ADD
363 %endif
364 %if HAVE_FMA3_EXTERNAL
365 INIT_YMM fma3
366 VECTOR_FMUL_ADD
367 %endif
368
369 ;-----------------------------------------------------------------------------
370 ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
371 ;                          int len)
372 ;-----------------------------------------------------------------------------
; dst[i] = src0[i] * src1[len - 1 - i]: src1 is consumed forwards
; (src1q advances by 2*mmsize each pass) while dst/src0 are indexed
; backwards through lenq, and each src1 vector is element-reversed before
; the multiply. Three reversal strategies:
;   avx2 - single vpermps with the pd_reverse index vector;
;   avx  - load the two 16-byte halves swapped via vinsertf128, then
;          shufps q0123 reverses within each half;
;   sse  - shufps q0123 reverses the 4 floats of an xmm register.
373 %macro VECTOR_FMUL_REVERSE 0
374 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
375 %if cpuflag(avx2)
376     movaps m2, [pd_reverse]
377 %endif
378     lea lenq, [lend*4 - 2*mmsize]
379 ALIGN 16
380 .loop:
381 %if cpuflag(avx2)
382     vpermps m0, m2, [src1q]
383     vpermps m1, m2, [src1q+mmsize]
384 %elif cpuflag(avx)
385     vmovaps xmm0, [src1q + 16]
386     vinsertf128 m0, m0, [src1q], 1
387     vshufps m0, m0, m0, q0123
388     vmovaps xmm1, [src1q + mmsize + 16]
389     vinsertf128 m1, m1, [src1q + mmsize], 1
390     vshufps m1, m1, m1, q0123
391 %else
392     mova m0, [src1q]
393     mova m1, [src1q + mmsize]
394     shufps m0, m0, q0123
395     shufps m1, m1, q0123
396 %endif
; The first src1 vector pairs with the LAST src0/dst vector (+mmsize).
397     mulps m0, m0, [src0q + lenq + mmsize]
398     mulps m1, m1, [src0q + lenq]
399     movaps [dstq + lenq + mmsize], m0
400     movaps [dstq + lenq], m1
401     add src1q, 2*mmsize
402     sub lenq, 2*mmsize
403     jge .loop
404     RET
405 %endmacro
406
407 INIT_XMM sse
408 VECTOR_FMUL_REVERSE
409 %if HAVE_AVX_EXTERNAL
410 INIT_YMM avx
411 VECTOR_FMUL_REVERSE
412 %endif
413 %if HAVE_AVX2_EXTERNAL
414 INIT_YMM avx2
415 VECTOR_FMUL_REVERSE
416 %endif
417
418 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
; Dot product of two float arrays. Both pointers are advanced by len
; bytes and indexed with a negative rising offset; the accumulator is
; horizontally reduced at the end (movhlps folds high pair onto low,
; shufps 1 brings lane 1 down for the final addss). movaps requires
; 16-byte-aligned inputs. On x86-32 the result is returned on the x87
; stack (st0) via a bounce through the first stack argument slot.
419 INIT_XMM sse
420 cglobal scalarproduct_float, 3,3,2, v1, v2, offset
421     shl offsetd, 2
422     add v1q, offsetq
423     add v2q, offsetq
424     neg offsetq
425     xorps xmm0, xmm0
426 .loop:
427     movaps xmm1, [v1q+offsetq]
428     mulps xmm1, [v2q+offsetq]
429     addps xmm0, xmm1
430     add offsetq, 16
431     js .loop
; Horizontal sum of the 4 partial sums in xmm0.
432     movhlps xmm1, xmm0
433     addps xmm0, xmm1
434     movss xmm1, xmm0
435     shufps xmm0, xmm0, 1
436     addss xmm0, xmm1
437 %if ARCH_X86_64 == 0
438     movss r0m, xmm0
439     fld dword r0m
440 %endif
441     RET
442
; FMA3 dot product with tiered strides: the remaining byte count picks a
; 128-, 64-, 32- or 16-byte inner loop, each tier falling through to the
; next smaller one for its remainder. The 128/64-byte tiers use 4/2 ymm
; accumulators with unaligned loads (movups) and fused fmaddps; the
; 16-byte tail reuses the SSE pattern.
; NOTE(review): the .loop16 tail uses aligned movaps on v1q+offsetq, which
; implies 16-byte-aligned inputs -- confirm alignment guarantees of callers.
; NOTE(review): RET under INIT_YMM presumably expands to vzeroupper+ret
; via x86inc -- verify against x86inc.asm.
443 INIT_YMM fma3
444 cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
445     xor offsetq, offsetq
446     xorps m0, m0, m0
; size is converted to bytes; lenq holds the byte count still to route
; to a tier, offsetq the bytes consumed so far.
447     shl sized, 2
448     mov lenq, sizeq
449     cmp lenq, 32
450     jl .l16
451     cmp lenq, 64
452     jl .l32
453     xorps m1, m1, m1
454     cmp lenq, 128
455     jl .l64
456     and lenq, ~127
457     xorps m2, m2, m2
458     xorps m3, m3, m3
; ---- 128 bytes (32 floats) per iteration, 4 independent accumulators --
459 .loop128:
460     movups m4, [v1q+offsetq]
461     movups m5, [v1q+offsetq + 32]
462     movups m6, [v1q+offsetq + 64]
463     movups m7, [v1q+offsetq + 96]
464     fmaddps m0, m4, [v2q+offsetq ], m0
465     fmaddps m1, m5, [v2q+offsetq + 32], m1
466     fmaddps m2, m6, [v2q+offsetq + 64], m2
467     fmaddps m3, m7, [v2q+offsetq + 96], m3
468     add offsetq, 128
469     cmp offsetq, lenq
470     jl .loop128
; Fold the 4 accumulators pairwise, then route the <128-byte remainder.
471     addps m0, m0, m2
472     addps m1, m1, m3
473     mov lenq, sizeq
474     and lenq, 127
475     cmp lenq, 64
476     jge .l64
477     addps m0, m0, m1
478     cmp lenq, 32
479     jge .l32
; Reduce ymm -> xmm -> scalar (same reduction repeated at each exit).
480     vextractf128 xmm2, m0, 1
481     addps xmm0, xmm2
482     cmp lenq, 16
483     jge .l16
484     movhlps xmm1, xmm0
485     addps xmm0, xmm1
486     movss xmm1, xmm0
487     shufps xmm0, xmm0, 1
488     addss xmm0, xmm1
489 %if ARCH_X86_64 == 0
490     movss r0m, xm0
491     fld dword r0m
492 %endif
493     RET
; ---- 64 bytes (16 floats) per iteration, 2 accumulators ---------------
494 .l64:
495     and lenq, ~63
496     add lenq, offsetq
497 .loop64:
498     movups m4, [v1q+offsetq]
499     movups m5, [v1q+offsetq + 32]
500     fmaddps m0, m4, [v2q+offsetq], m0
501     fmaddps m1, m5, [v2q+offsetq + 32], m1
502     add offsetq, 64
503     cmp offsetq, lenq
504     jl .loop64
505     addps m0, m0, m1
506     mov lenq, sizeq
507     and lenq, 63
508     cmp lenq, 32
509     jge .l32
510     vextractf128 xmm2, m0, 1
511     addps xmm0, xmm2
512     cmp lenq, 16
513     jge .l16
514     movhlps xmm1, xmm0
515     addps xmm0, xmm1
516     movss xmm1, xmm0
517     shufps xmm0, xmm0, 1
518     addss xmm0, xmm1
519 %if ARCH_X86_64 == 0
520     movss r0m, xm0
521     fld dword r0m
522 %endif
523     RET
; ---- 32 bytes (8 floats) per iteration, 1 ymm accumulator -------------
524 .l32:
525     and lenq, ~31
526     add lenq, offsetq
527 .loop32:
528     movups m4, [v1q+offsetq]
529     fmaddps m0, m4, [v2q+offsetq], m0
530     add offsetq, 32
531     cmp offsetq, lenq
532     jl .loop32
533     vextractf128 xmm2, m0, 1
534     addps xmm0, xmm2
535     mov lenq, sizeq
536     and lenq, 31
537     cmp lenq, 16
538     jge .l16
539     movhlps xmm1, xmm0
540     addps xmm0, xmm1
541     movss xmm1, xmm0
542     shufps xmm0, xmm0, 1
543     addss xmm0, xmm1
544 %if ARCH_X86_64 == 0
545     movss r0m, xm0
546     fld dword r0m
547 %endif
548     RET
; ---- 16-byte (4 float) tail, SSE-style ---------------------------------
549 .l16:
550     and lenq, ~15
551     add lenq, offsetq
552 .loop16:
553     movaps xmm1, [v1q+offsetq]
554     mulps xmm1, [v2q+offsetq]
555     addps xmm0, xmm1
556     add offsetq, 16
557     cmp offsetq, lenq
558     jl .loop16
559     movhlps xmm1, xmm0
560     addps xmm0, xmm1
561     movss xmm1, xmm0
562     shufps xmm0, xmm0, 1
563     addss xmm0, xmm1
564 %if ARCH_X86_64 == 0
565     movss r0m, xm0
566     fld dword r0m
567 %endif
568     RET
569
570 ;---------------------------------------------------------------------------------
571 ; double scalarproduct_double(const double *v1, const double *v2, size_t len)
572 ;---------------------------------------------------------------------------------
; Double-precision dot product. len is in elements (shl 3 -> bytes); both
; pointers are advanced to the end and walked with a negative rising
; offset. Four independent accumulators (mmsize*4 bytes per iteration)
; are folded pairwise at the end, the ymm build additionally folding its
; high 128-bit lane, before the final movhlps/addsd scalar reduction.
; movapd requires mmsize-aligned inputs; x86-32 returns via st0.
573 %macro SCALARPRODUCT_DOUBLE 0
574 cglobal scalarproduct_double, 3,3,8, v1, v2, offset
575     shl offsetq, 3
576     add v1q, offsetq
577     add v2q, offsetq
578     neg offsetq
; Zero the 4 accumulators m0..m3.
579     xorpd m0, m0
580     xorpd m1, m1
581     movapd m2, m0
582     movapd m3, m1
583     align 16
584 .loop:
585     movapd m4, [v1q+offsetq+mmsize*0]
586     movapd m5, [v1q+offsetq+mmsize*1]
587     movapd m6, [v1q+offsetq+mmsize*2]
588     movapd m7, [v1q+offsetq+mmsize*3]
589     mulpd m4, [v2q+offsetq+mmsize*0]
590     mulpd m5, [v2q+offsetq+mmsize*1]
591     mulpd m6, [v2q+offsetq+mmsize*2]
592     mulpd m7, [v2q+offsetq+mmsize*3]
593     addpd m0, m4
594     addpd m1, m5
595     addpd m2, m6
596     addpd m3, m7
597     add offsetq, mmsize*4
598     jl .loop
; Fold the partial sums down to a single scalar.
599     addpd m0, m1
600     addpd m2, m3
601     addpd m0, m2
602 %if mmsize == 32
603     vextractf128 xm1, m0, 1
604     addpd xm0, xm1
605 %endif
606     movhlps xm1, xm0
607     addsd xm0, xm1
608 %if ARCH_X86_64 == 0
609     movsd r0m, xm0
610     fld qword r0m
611 %endif
612     RET
613 %endmacro
614
615 INIT_XMM sse2
616 SCALARPRODUCT_DOUBLE
617 %if HAVE_AVX_EXTERNAL
618 INIT_YMM avx
619 SCALARPRODUCT_DOUBLE
620 %endif
621
622 ;-----------------------------------------------------------------------------
623 ; void ff_butterflies_float(float *src0, float *src1, int len);
624 ;-----------------------------------------------------------------------------
; In-place butterfly: (src0[i], src1[i]) <- (src0[i]+src1[i],
; src0[i]-src1[i]). Both arrays are advanced by len bytes and indexed
; with a negative rising offset; 4 floats per iteration, aligned accesses.
625 INIT_XMM sse
626 cglobal butterflies_float, 3,3,3, src0, src1, len
627     shl lend, 2
628     add src0q, lenq
629     add src1q, lenq
630     neg lenq
631 .loop:
632     mova m0, [src0q + lenq]
633     mova m1, [src1q + lenq]
; Compute the difference before overwriting m0 with the sum.
634     subps m2, m0, m1
635     addps m0, m0, m1
636     mova [src1q + lenq], m2
637     mova [src0q + lenq], m0
638     add lenq, mmsize
639     jl .loop
640     RET