lavc/aarch64: Fix addp overflow in ff_pred16x16_plane_neon_10

author Bin Peng <pengbin@visionular.com>
Fri, 24 Oct 2025 07:58:08 +0000 (15:58 +0800)
committer Martin Storsjö <martin@martin.st>
Tue, 4 Nov 2025 12:11:13 +0000 (14:11 +0200)
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index d0999938efb3773ca840bb6c7352db5bbf1c11ad..795d2ce540b0cfdabaf55d9c49a70d4d629b20b2 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -489,10 +489,10 @@ function ff_pred16x16_plane_neon_10, export=1
          mul             v2.8h,  v2.8h,  v0.8h
          mul             v3.8h,  v3.8h,  v0.8h
          addp            v2.8h,  v2.8h,  v3.8h
-        addp            v2.8h,  v2.8h,  v2.8h
-        addp            v2.4h,  v2.4h,  v2.4h
-        sshll           v3.4s,  v2.4h,  #2
-        saddw           v2.4s,  v3.4s,  v2.4h
+        saddlp          v2.4s,  v2.8h
+        addp            v2.4s,  v2.4s,  v2.4s
+        shl             v3.4s,  v2.4s,  #2
+        add             v2.4s,  v3.4s,  v2.4s
          rshrn           v4.4h,  v2.4s,  #6
          trn2            v5.4h,  v4.4h,  v4.4h
          add             v2.4h,  v4.4h,  v5.4h
@@ -506,14 +506,13 @@ function ff_pred16x16_plane_neon_10, export=1
          sxtl            v6.4s,  v5.4h          // c
  
          mov             v0.h[0],  wzr
-        mul             v0.8h,  v0.8h,  v4.h[0]
          dup             v16.4s, v2.s[0]
          dup             v17.4s, v2.s[0]
          dup             v2.8h,  v4.h[0]        // b
          dup             v3.4s,  v6.s[0]        // c
          sshll           v2.4s,  v2.4h,  #3     // b * 8
-        saddw           v16.4s, v16.4s, v0.4h
-        saddw2          v17.4s, v17.4s, v0.8h
+        smlal           v16.4s, v0.4h, v4.h[0]
+        smlal2          v17.4s, v0.8h, v4.h[0]
          sub             v3.4s,  v3.4s,  v2.4s
  
          mov             w3,      #16
author	Bin Peng <pengbin@visionular.com>
	Fri, 24 Oct 2025 07:58:08 +0000 (15:58 +0800)
committer	Martin Storsjö <martin@martin.st>
	Tue, 4 Nov 2025 12:11:13 +0000 (14:11 +0200)