unplugged-system/external/libxaac/decoder/armv8/ixheaacd_post_twiddle_overlap.s

1879 lines
37 KiB
ArmAsm

// push_v_regs
//   Spills q8-q15 and x8-x30 into a 320-byte frame below the incoming sp.
//   The frame is reserved with one SUB and filled with offset stores; the
//   resulting memory layout and final sp are what pop_v_regs expects:
//     sp+0   : x30, x29        sp+16  : x28, x29 (x29 is stored twice)
//     sp+32  : x26, x27   ...  sp+176 : x8,  x9
//     sp+192 : q14, q15   ...  sp+288 : q8,  q9
//   320 is a multiple of 16, so sp alignment is unchanged.
.macro push_v_regs
    sub     sp, sp, #320
    stp     q8,  q9,  [sp, #288]
    stp     q10, q11, [sp, #256]
    stp     q12, q13, [sp, #224]
    stp     q14, q15, [sp, #192]
    stp     x8,  x9,  [sp, #176]
    stp     x10, x11, [sp, #160]
    stp     x12, x13, [sp, #144]
    stp     x14, x15, [sp, #128]
    stp     x16, x17, [sp, #112]
    stp     x18, x19, [sp, #96]
    stp     x20, x21, [sp, #80]
    stp     x22, x23, [sp, #64]
    stp     x24, x25, [sp, #48]
    stp     x26, x27, [sp, #32]
    stp     x28, x29, [sp, #16]
    stp     x30, x29, [sp]
.endm
// pop_v_regs
//   Reloads x8-x30 and q8-q15 from the 320-byte frame built by push_v_regs
//   and then releases the frame with a single ADD. Slots are read in
//   ascending address order, so x29 ends up with the copy stored next to x28.
.macro pop_v_regs
    ldp     x30, x29, [sp]
    ldp     x28, x29, [sp, #16]
    ldp     x26, x27, [sp, #32]
    ldp     x24, x25, [sp, #48]
    ldp     x22, x23, [sp, #64]
    ldp     x20, x21, [sp, #80]
    ldp     x18, x19, [sp, #96]
    ldp     x16, x17, [sp, #112]
    ldp     x14, x15, [sp, #128]
    ldp     x12, x13, [sp, #144]
    ldp     x10, x11, [sp, #160]
    ldp     x8,  x9,  [sp, #176]
    ldp     q14, q15, [sp, #192]
    ldp     q12, q13, [sp, #224]
    ldp     q10, q11, [sp, #256]
    ldp     q8,  q9,  [sp, #288]
    add     sp, sp, #320
.endm
.text
.p2align 2
.global ixheaacd_post_twid_overlap_add_armv8
ixheaacd_post_twid_overlap_add_armv8:
// STMFD sp!, {x4-x12}
push_v_regs
//stp x19, x20,[sp,#-16]!
//VPUSH {d8 - d15}
//LDR w4, [sp, #100]
//sxtw x4,w4
//LDR w5, [sp, #104]
//sxtw x5,w5
//LDR w6, [sp, #108]
//sxtw x6,w6
MOV x16, x5
MOV x17, x7
LSL x9, x3, #2
ASR x9, x9, #1
ADD x6, x6, x9
SUB x6, x6, #4
MOV w8, #7500
sxtw x8, w8
ADD x2, x2, x8
movi v18.4h, #50
sub x20, x5, #15
neg x9, x20
movi v20.4s, #0x00, LSL #8
dup v16.4s,w5
SUB x5, x5, #16
//STR w5, [sp, #116]
MOV w25, w5
sxtw x25,w25
MOV x8, #1
LSL x8, x8, x9
//STR w8, [sp, #120]
MOV w26, w8
//sxtw x8,w8
// ARM_PROLOGUE: scalar pass that produces the first outputs by hand before
// the NEON pipeline starts. The commented-out mnemonics (SMULWT, QADD, ...)
// appear to be the 32-bit ARM instructions each sequence replaces.
ARM_PROLOGUE:
// Two consecutive 32-bit inputs from [x1] (post-incremented by 4 each).
LDR w8, [x1], #4
sxtw x8,w8
LDR w9, [x1], #4
sxtw x9,w9
// One packed word from [x2]: x19 = sign-extended low halfword,
// w10 = high halfword (arithmetic shift right by 16).
LDR w10, [x2], #4
sxtw x10,w10
AND w19,w10,0xFFFF
sxth x19,w19
ASR w10,w10,#16
// SMULWT x11, x8, x10
//
// SMULWB x12, x9, x10
// SMULWB x5, x8, x10
// SMLAWT x7, x9, x10, x5
// Four 32x16 products, each shifted right by 16:
//   x11 = (w8 * hi) >> 16      x12 = (w9 * lo) >> 16
//   x5  = (w8 * lo) >> 16      x7  = ((w9 * hi) >> 16) + x5
SMULL x11, w8, w10
ASR x11,x11,#16
SMULL x12, w9, w19
ASR x12,x12,#16
SMULL x5, w8, w19
ASR x5,x5,#16
SMULL x7, w9, w10
ASR x7, x7, #16
ADD x7, x7, x5
// x8 = x12 - x11 ; x5 = -x7 (MVN followed by +1 is a two's-complement negate)
SUB x8, x12, x11
MVN x5, x7
ADD x5, x5, #1
// Cross-correction with the constant 50:
//   x8 += (w5 * 50) >> 16 ; x5 += (w8 * -50) >> 16  (x8 here is the pre-update value)
MOV x9, #50
MOV x12, #-50
AND w19,w9,0xFFFF
sxth x19,w19
SMULL x10, w5, w19
ASR x10,x10,#16
AND w19,w12,0xFFFF
sxth x19,w19
SMULL x11, w8, w19
ASR x11,x11,#16
ADD x8, x8, x10
ADD x5, x5, x11
//LDR w11, [sp, #104]
// x11 = shift amount saved in x16 at entry, sign-extended from 16 bits.
MOV w11, w16
sxth x11,w11
// One packed word from [x6] (then x6 -= 32): x19 = low halfword, w20 = high halfword.
LDR w10, [x6], #-32
sxtw x10,w10
AND w19,w10,0xFFFF
sxth x19,w19
ASR w20,w10,#16
//SMULWB x7, x8, x10
// x7 = (w8 * lo) >> 16 ; x8 = -x8 ; x12 = (w8 * hi) >> 16
SMULL x7, w8, w19
ASR x7, x7, #16
MVN x8, x8
ADD x8, x8, #1
//SMULWT x12, x8, x10
SMULL x12, w8, w20
ASR x12, x12, #16
// A negative shift amount takes the right-shift path at NEXT.
CMP x11, #0
BLT NEXT
// Non-negative shift: w5 = sat_add(w5, w26) >> (16 - x11).
// Only lane 0 of v1/v2 is written and read; lane 1 is don't-care.
SUB x9, x11, #16
negs x9,x9
// LDR w8, [sp, #120]
//sxtw x8,w8
MOV v1.s[0], w26
MOV v2.s[0], w5
//sQADD w5, w5, w8
//ASR w5, w5, w9
SQADD v2.2s, v2.2s, v1.2s
MOV w5, v2.s[0]
ASR w5, w5, w9
// Saturating left shift of x7 by x11: x7 >> (31 - x11) is 0 or -1 exactly
// when the shift cannot overflow (EQ -> shift); otherwise clamp to
// 0x7fffffff (GT) or 0x80000000 (LT). Flags come from ADDS, or from CMN
// when the probe value is negative.
SUB x9, x11, #31
negs x9,x9
ASR x20, x7, x9
//MOV x8, x20
ADDS x8, x20, #0
BGE NEXT2
CMN x8, #1
NEXT2:
MOV x20, #0x80000000
csel x7, x20, x7,LT
MOV x20, #0x7fffffff
csel x7, x20, x7,GT
LSL x20, x7, x11
csel x7,x20,x7,EQ
// Same saturating left shift applied to x12.
SUB x9, x11, #31
negs x9,x9
ASR x20, x12, x9
//MOV x8, x20
ADDS x8, x20, #0
BGE NEXT3
CMN x8, #1
NEXT3:
MOV x20, #0x80000000
csel x12, x20, x12,LT
MOV x20, #0x7fffffff
csel x12, x20, x12,GT
LSL x20, x12, x11
csel x12,x20,x12,EQ
B NEXT1
// Negative shift: negate it and shift right instead.
//   w5 = sat_add(w5 >> n, 0x8000) >> 16 ; w7 >>= n ; w12 >>= n
NEXT:
MVN w11, w11
ADD w11, w11, #1
ASR w5, w5, w11
MOV w8, #0x8000
MOV v1.s[0], w8
MOV v2.s[0], w5
//QADD x5, x5, x8
SQADD v2.2s, v2.2s, v1.2s
MOV w5, v2.s[0]
ASR w5, w5, #16
ASR w7, w7, w11
ASR w12, w12, w11
// Both paths join: swap the old word at [x4] (-> w9) with the new w5, then
// form {w12, w7} - sat32(w9 * {lo16(w10), hi16(w10)}) in v8.2s.
// The halves of w10 are zero-extended (UXTH) before the multiply.
NEXT1:
LDR w9, [x4]
sxtw x9,w9
MOV w8, #0x8000
//sxtw x8,w8
STR w5, [x4], #4
sxtw x5,w5
ROR w20, w10, #16
//UXTH x5, x10, ROR #16
UXTH w5, w20
UXTH w10, w10
dup v0.2s,w9
dup v2.2s,w10
dup v3.2s,w5
//VZIP.32 D2, D3
ZIP1 v28.2s, v2.2s, v3.2s
ZIP2 v3.2s, v2.2s, v3.2s
MOV v2.8b, v28.8b
sMULL v0.2d, v2.2s, v0.2s
Sqxtn v8.2s, v0.2d
dup v0.2s,w12
dup v1.2s,w7
//VZIP.32 D0, D1
ZIP1 v28.2s, v0.2s, v1.2s
ZIP2 v1.2s, v0.2s, v1.2s
MOV v0.8b, v28.8b
SQSUB v8.2s, v0.2s , v8.2s
//sQshL v8.2s, v8.2s,#2
// v0 = 0x8000 is loaded here, but the rounding add that used it is commented out.
dup v0.2s,w8
//SQADD v8.2s, v8.2s , v0.2s
//sshR v8.2s, v8.2s,#16
// Output addressing: x10 = stride in bytes (x17 << 2), x9 = -x10.
//   x5 = x0 + lo16(x3 >> 1) * lo16(x10)   (walks upwards)
//   x0 = x5 - x10                         (walks downwards)
MOV x7, x17
//sxtw x7,w7
LSL x10, x7, #2
ASR x5, x3, #1
//SMULBB x5, x10, x5
AND w5,w5,0xFFFF
sxth x5,w5
AND w19,w10,0xFFFF
sxth x19,w19
SMULL x5, w19, w5
ADD x5, x5, x0
SUB x0, x5, x10
MVN x9, x10
ADD x9, x9, #1
ST1 {V8.S}[1],[x0], x9
ST1 {V8.S}[0],[x5], x10
// x8 = forward input pointer; x1 = input base + 4*x3 - 40 (read downwards);
// x12 = -32, the byte step used for the descending vector loads.
MOV x8, x1
LSL x12, x3, #2
ADD x1, x1, x12
SUB x1, x1, #40
MOV x12, #-32
// PROLOGUE_NEON: first stage of the software pipeline. Computes the first
// vector batch of results (left in v14/v22/v31/v30 for CORE_LOOP to store)
// and pre-loads the inputs for the next batch.
// Recurring idioms in this and the following stages:
//   * UZP1/UZP2 + two MOVs split 32-bit words into low / high 16-bit halves
//     (stands in for the commented-out VUZP.16).
//   * uMULL low-half, ushR/sshR #16, then sMLAL high-half forms a 32x16
//     product scaled down by 16 bits.
//   * rev64 reverses the four halfwords of vectors fetched through the
//     descending pointers x1 and x6.
PROLOGUE_NEON:
// CORE_LOOP trip count: x3 = (((x3 >> 2) - 4) >> 2) - 2.
// NOTE(review): nothing guards against a result <= 0 - confirm callers'
// minimum size.
ASR x3, x3, #2
SUB x3, x3, #4
ASR x3, x3, #2
SUB x3, x3, #2
// Descending inputs from [x1]; x12 is -32 here (set at the end of ARM_PROLOGUE).
LD2 { v0.4s, v1.4s}, [x1]
MOV v2.16b, v1.16b
ADD x1, x1, x12
//VUZP.16 D0, D1
UZP1 v28.8h, v0.8h, v0.8h
UZP2 v29.8h, v0.8h, v0.8h
MOV v0.d[0], v28.d[0]
MOV v0.d[1], v29.d[0]
//VUZP.16 D2, D3
UZP1 v28.8h, v2.8h, v2.8h
UZP2 v29.8h, v2.8h, v2.8h
MOV v2.d[0], v28.d[0]
MOV v2.d[1], v29.d[0]
//rev64 v0.8h, v0.8h
rev64 v0.8h, v0.8h
MOV v1.d[0], v0.d[1]
rev64 v2.8h, v2.8h
MOV v3.d[0], v2.d[1]
// 16-bit coefficient pairs from [x2] (v8 / v9), ascending.
LD2 {v8.4h, v9.4h}, [x2]
ADD x2, x2, #16
// Ascending inputs from [x8].
LD2 { v4.4s, v5.4s}, [x8]
MOV v6.16b, v5.16b
ADD x8, x8,#32
uMULL v30.4s, v0.4h, v9.4h
// VUZP.16 D4, D5
UZP1 v28.8h, v4.8h, v4.8h
UZP2 v29.8h, v4.8h, v4.8h
MOV v4.d[0], v28.d[0]
MOV v5.d[0], v29.d[0]
uMULL v28.4s, v2.4h, v8.4h
// VUZP.16 D6, D7
UZP1 v26.8h, v6.8h, v6.8h
UZP2 v27.8h, v6.8h, v6.8h
MOV v6.d[0], v26.d[0]
MOV v7.d[0], v27.d[0]
uMULL v26.4s, v0.4h, v8.4h
uMULL v24.4s, v2.4h, v9.4h
// Descending 16-bit pair table from [x6] (v10 / v12 after the split).
LD2 { v10.4s, v11.4s}, [x6]
MOV v12.16b, v11.16b
ADD x6, x6, x12
ushR v30.4s, v30.4s,#16
//VUZP.16 D10, D11
UZP1 v22.8h, v10.8h, v10.8h
UZP2 v23.8h, v10.8h, v10.8h
MOV v10.d[0], v22.d[0]
MOV v10.d[1], v23.d[0]
ushR v28.4s, v28.4s,#16
//VUZP.16 D12, D13
UZP1 v22.8h, v12.8h, v12.8h
UZP2 v23.8h, v12.8h, v12.8h
MOV v12.d[0], v22.d[0]
MOV v12.d[1], v23.d[0]
sMLAL v30.4s, v1.4h, v9.4h
rev64 v10.8h, v10.8h
MOV v11.d[0], v10.d[1]
sMLAL v28.4s, v3.4h, v8.4h
rev64 v12.8h, v12.8h
MOV v13.d[0], v12.d[1]
ushR v26.4s, v26.4s,#16
ushR v24.4s, v24.4s,#16
sMLAL v26.4s, v1.4h, v8.4h
sMLAL v24.4s, v3.4h, v9.4h
// v30 = -(v30 + v28) ; v28 = v24 - v26   (combination of the four products)
ADD v30.4s, v30.4s , v28.4s
NEG v30.4s, v30.4s
uMULL v22.4s, v4.4h, v8.4h
SUB v28.4s, v24.4s , v26.4s
mov v26.16b, v30.16b
mov v24.16b, v28.16b
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// Cross-correction by the constant in v18 (50): v28 += (v30*50)>>16,
// v30 -= (v28*50)>>16, using the values saved in v26 / v24 above.
uMULL v2.4s, v24.4h, v18.4h
uMULL v0.4s, v26.4h, v18.4h
ushR v22.4s, v22.4s,#16
sMLAL v22.4s, v5.4h, v8.4h
ushR v2.4s, v2.4s,#16
ushR v0.4s, v0.4s,#16
sMLAL v2.4s, v25.4h, v18.4h
sMLAL v0.4s, v27.4h, v18.4h
uMULL v24.4s, v4.4h, v9.4h
uMULL v26.4s, v6.4h, v8.4h
NEG v2.4s, v2.4s
ADD v28.4s, v28.4s , v0.4s
ADD v30.4s, v30.4s , v2.4s
uMULL v0.4s, v6.4h, v9.4h
sshR v24.4s, v24.4s,#16
sMLAL v24.4s, v5.4h, v9.4h
sshR v26.4s, v26.4s,#16
sshR v0.4s, v0.4s,#16
sMLAL v26.4s, v7.4h, v8.4h
sMLAL v0.4s, v7.4h, v9.4h
// Same combination for the ascending inputs: v22 = -(v22 + v0) ; v24 = v26 - v24
ADD v22.4s, v22.4s , v0.4s
NEG v22.4s, v22.4s
SUB v24.4s, v26.4s , v24.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v28 = sqshl(sat_add(v28, w26), w25): add the w26 constant, then a
// saturating shift by the per-lane amount w25 (both set up at entry).
MOV w11, w26
dup v14.4s,w11
SQADD v28.4s, v28.4s , v14.4s
//LDR w11, [sp, #116]
MOV w11, w25
//sxtw x11,w11
dup v0.4s,w11
sQshL v28.4s, v28.4s, v0.4s
mov v0.16b, v22.16b
mov v14.16b, v24.16b
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
uMULL v8.4s, v24.4h, v18.4h
uMULL v26.4s, v22.4h, v18.4h
NEG v2.4s, v30.4s
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v3.d[0], v21.d[0]
// Products with the [x6] table halves: v4 = v30 * v12, v6 = (-v30) * v13.
uMULL v4.4s, v30.4h, v12.4h
uMULL v6.4s, v2.4h, v13.4h
ushR v8.4s, v8.4s,#16
ushR v26.4s, v26.4s,#16
sMLAL v8.4s, v25.4h, v18.4h
sMLAL v26.4s, v23.4h, v18.4h
ushR v4.4s, v4.4s,#16
ushR v6.4s, v6.4s,#16
MOV v19.d[0], v30.d[1]
sMLAL v4.4s, v19.4h, v12.4h
sMLAL v6.4s, v3.4h, v13.4h
NEG v8.4s, v8.4s
ADD v14.4s, v14.4s , v26.4s
ADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
MOV w11, w26
dup v8.4s,w11
SQADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v26.4s,w11
sQshL v0.4s, v0.4s, v26.4s
mov v26.16b, v28.16b
// Read 8 old words from [x4] (de-interleaved into v28 / v30), then overwrite
// the same 32 bytes with the new values interleaved from v26 and v0.
LD2 { v28.4s, v29.4s}, [x4]
MOV v30.16b, v29.16b
MOV v29.d[0], v28.d[1]
// VZIP.32 Q13, Q0
ZIP1 v19.4s, v26.4s, v0.4s
ZIP2 v0.4s, v26.4s, v0.4s
MOV v26.16b, v19.16b
ST1 { v26.4s}, [x4],#16
ST1 { v0.4s}, [x4],#16
// Widen the 16-bit table halves to 32 bits (SADDL with zero), multiply by
// the old [x4] words, and narrow the 64-bit products with saturation.
movi v1.2s, #0
//VADDL.S16 Q0, D13, D1
SADDL v0.4s, v13.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v26.2d, v28.2s, v0.2s
Sqxtn v8.2s, v26.2d
sMULL v26.2d, v29.2s, v1.2s
Sqxtn v9.2s, v26.2d
MOV v8.d[1], v9.d[0]
movi v1.2s, #0
// VADDL.S16 Q0, D12, D1
SADDL v0.4s, v12.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v24.2d, v28.2s, v0.2s
Sqxtn v26.2s, v24.2d
sMULL v24.2d, v29.2s, v1.2s
Sqxtn v27.2s, v24.2d
MOV v26.d[1], v27.d[0]
// Saturating shift by v16 (the entry shift amount), then subtract the
// products just formed from the old [x4] words.
sQshL v4.4s, v4.4s, v16.4s
sQshL v6.4s, v6.4s, v16.4s
SQSUB v4.4s, v4.4s , v8.4s
SQSUB v6.4s, v6.4s , v26.4s
NEG v26.4s, v14.4s
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
movi v1.2s, #0
// VADDL.S16 Q0, D10, D1
SADDL v0.4s, v10.4h, v1.4h
// NOTE(review): sibling sequences copy v0.d[1] here; v1 is cleared again
// before its next use, so the difference has no effect as written.
MOV v1.d[0], v0.d[0]
sMULL v22.2d, v30.2s, v0.2s
Sqxtn v24.2s, v22.2d
sMULL2 v22.2d, v30.4s, v0.4s
Sqxtn v25.2s, v22.2d
MOV v24.d[1], v25.d[0]
movi v1.2s, #0
// VADDL.S16 Q0, D11, D1
SADDL v0.4s, v11.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v8.2d, v30.2s, v0.2s
Sqxtn v22.2s, v8.2d
sMULL2 v8.2d, v30.4s, v0.4s
Sqxtn v23.2s, v8.2d
MOV v22.d[1], v23.d[0]
uMULL v8.4s, v26.4h, v11.4h
uMULL v30.4s, v14.4h, v10.4h
// Pre-load of the next batch begins here, interleaved with the tail of the
// current computation.
LD2 { v0.4s, v1.4s}, [x1]
MOV v2.16b, v1.16b
ADD x1, x1, x12
// VUZP.16 D0, D1
UZP1 v19.8h, v0.8h, v0.8h
UZP2 v21.8h, v0.8h, v0.8h
MOV v0.d[0], v19.d[0]
MOV v0.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v2.d[1], v21.d[0]
ushR v8.4s, v8.4s,#16
rev64 v0.8h, v0.8h
MOV v1.d[0], v0.d[1]
ushR v30.4s, v30.4s,#16
rev64 v2.8h, v2.8h
MOV v3.d[0], v2.d[1]
sMLAL v8.4s, v27.4h, v11.4h
sMLAL v30.4s, v15.4h, v10.4h
LD2 { v10.4s, v11.4s}, [x6]
ADD x6, x6, x12
MOV v12.16b, v11.16b
UZP1 v19.8h, v10.8h, v10.8h
UZP2 v21.8h, v10.8h, v10.8h
MOV v10.d[0], v19.d[0]
MOV v10.d[1], v21.d[0]
UZP1 v19.8h, v12.8h, v12.8h
UZP2 v21.8h, v12.8h, v12.8h
MOV v12.d[0], v19.d[0]
MOV v12.d[1], v21.d[0]
// v14 / v31 / v22 / v30 carry this batch's results into CORE_LOOP's stores.
MOV V14.16B , V4.16B
rev64 v10.8h, v10.8h
MOV v11.d[0], v10.d[1]
rev64 v12.8h, v12.8h
MOV v13.d[0], v12.d[1]
sQshL v8.4s, v8.4s, v16.4s
MOV V31.16B, V6.16B
LD2 { v4.4s, v5.4s}, [x8]
ADD x8, x8,#32
MOV v6.16b, v5.16b
sQshL v30.4s, v30.4s, v16.4s
// VUZP.16 D4, D5
UZP1 v19.8h, v4.8h, v4.8h
UZP2 v21.8h, v4.8h, v4.8h
MOV v4.d[0], v19.d[0]
MOV v5.d[0], v21.d[0]
SQSUB v8.4s, v8.4s , v24.4s
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
SQSUB v22.4s, v30.4s , v22.4s
MOV V30.16B, V8.16B
LD2 {v8.4h, v9.4h}, [x2]
ADD x2, x2, #16
// CORE_LOOP: steady state of the software pipeline. Each iteration
//   1. stores the previous batch: 8 words through x0 (stride x9, lanes of
//      v14 and v22 alternating) and 8 words through x5 (stride x10, lanes
//      of v31 and v30 alternating);
//   2. repeats the PROLOGUE_NEON arithmetic on the inputs already held in
//      v0-v13;
//   3. pre-loads the next inputs from x1, x6 (descending), x8 and x2
//      (ascending).
// The loop runs until x3 counts down to zero.
CORE_LOOP:
ST1 {V14.S}[0], [x0]
ADD x0, x0, x9
ST1 {V22.S}[0], [x0]
ADD x0, x0, x9
ST1 {V14.S}[1], [x0]
ADD x0, x0, x9
ST1 {V22.S}[1], [x0]
ADD x0, x0, x9
ST1 {V14.S}[2], [x0]
ADD x0, x0, x9
ST1 {V22.S}[2], [x0]
ADD x0, x0, x9
ST1 {V14.S}[3], [x0]
ADD x0, x0, x9
ST1 {V22.S}[3], [x0]
ADD x0, x0, x9
ST1 {V31.S}[0], [x5]
ADD x5, x5, x10
ST1 {V30.S}[0], [x5]
ADD x5, x5, x10
ST1 {V31.S}[1], [x5]
ADD x5, x5, x10
ST1 {V30.S}[1], [x5]
ADD x5, x5, x10
ST1 {V31.S}[2], [x5]
ADD x5, x5, x10
ST1 {V30.S}[2], [x5]
ADD x5, x5, x10
ST1 {V31.S}[3], [x5]
ADD x5, x5, x10
ST1 {V30.S}[3], [x5]
ADD x5, x5, x10
// 32x16 products of the descending inputs (v0-v3) with the coefficients v8 / v9.
uMULL v30.4s, v0.4h, v9.4h
uMULL v28.4s, v2.4h, v8.4h
uMULL v26.4s, v0.4h, v8.4h
uMULL v24.4s, v2.4h, v9.4h
ushR v30.4s, v30.4s,#16
ushR v28.4s, v28.4s,#16
sMLAL v30.4s, v1.4h, v9.4h
sMLAL v28.4s, v3.4h, v8.4h
ushR v26.4s, v26.4s,#16
ushR v24.4s, v24.4s,#16
sMLAL v26.4s, v1.4h, v8.4h
sMLAL v24.4s, v3.4h, v9.4h
// v30 = -(v30 + v28) ; v28 = v24 - v26
ADD v30.4s, v30.4s , v28.4s
NEG v30.4s, v30.4s
SUB v28.4s, v24.4s , v26.4s
mov v26.16b, v30.16b
uMULL v22.4s, v4.4h, v8.4h
mov v24.16b, v28.16b
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// Cross-correction by the constant 50 held in v18.
uMULL v2.4s, v24.4h, v18.4h
uMULL v0.4s, v26.4h, v18.4h
ushR v22.4s, v22.4s,#16
sMLAL v22.4s, v5.4h, v8.4h
ushR v2.4s, v2.4s,#16
ushR v0.4s, v0.4s,#16
sMLAL v2.4s, v25.4h, v18.4h
sMLAL v0.4s, v27.4h, v18.4h
uMULL v24.4s, v4.4h, v9.4h
uMULL v26.4s, v6.4h, v8.4h
NEG v2.4s, v2.4s
ADD v28.4s, v28.4s , v0.4s
ADD v30.4s, v30.4s , v2.4s
uMULL v0.4s, v6.4h, v9.4h
sshR v24.4s, v24.4s,#16
sMLAL v24.4s, v5.4h, v9.4h
sshR v26.4s, v26.4s,#16
sshR v0.4s, v0.4s,#16
sMLAL v26.4s, v7.4h, v8.4h
sMLAL v0.4s, v7.4h, v9.4h
// v22 = -(v22 + v0) ; v24 = v26 - v24   (ascending inputs)
ADD v22.4s, v22.4s , v0.4s
NEG v22.4s, v22.4s
SUB v24.4s, v26.4s , v24.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v28 = sqshl(sat_add(v28, w26), w25)
MOV w11, w26
dup v14.4s,w11
SQADD v28.4s, v28.4s , v14.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v0.4s,w11
sQshL v28.4s, v28.4s, v0.4s
mov v0.16b, v22.16b
mov v14.16b, v24.16b
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
uMULL v8.4s, v24.4h, v18.4h
uMULL v26.4s, v22.4h, v18.4h
NEG v2.4s, v30.4s
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v3.d[0], v21.d[0]
// Products with the [x6] table halves v12 / v13.
uMULL v4.4s, v30.4h, v12.4h
uMULL v6.4s, v2.4h, v13.4h
ushR v8.4s, v8.4s,#16
ushR v26.4s, v26.4s,#16
sMLAL v8.4s, v25.4h, v18.4h
sMLAL v26.4s, v23.4h, v18.4h
ushR v4.4s, v4.4s,#16
ushR v6.4s, v6.4s,#16
MOV v19.d[0], v30.d[1]
sMLAL v4.4s, v19.4h, v12.4h
sMLAL v6.4s, v3.4h, v13.4h
NEG v8.4s, v8.4s
ADD v14.4s, v14.4s , v26.4s
ADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
MOV w11, w26
dup v8.4s,w11
SQADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v26.4s,w11
sQshL v0.4s, v0.4s, v26.4s
mov v26.16b, v28.16b
// Read 8 old words from [x4], then overwrite them with the new values
// interleaved from v26 and v0 (x4 advances by 32 bytes).
LD2 { v28.4s, v29.4s}, [x4]
MOV v30.16b, v29.16b
MOV v29.d[0], v28.d[1]
// VZIP.32 Q13, Q0
ZIP1 v19.4s, v26.4s, v0.4s
ZIP2 v0.4s, v26.4s, v0.4s
MOV v26.16b, v19.16b
ST1 { v26.4s}, [x4]
ADD x4, x4,#16
ST1 { v0.4s}, [x4]
ADD x4, x4,#16
// Old [x4] words times the widened table halves, narrowed with saturation.
movi v1.2s, #0
// VADDL.S16 Q0, D13, D1
SADDL v0.4s, v13.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v26.2d, v28.2s, v0.2s
Sqxtn v8.2s, v26.2d
sMULL v26.2d, v29.2s, v1.2s
Sqxtn v9.2s, v26.2d
MOV v8.d[1], v9.d[0]
movi v1.2s, #0
//VADDL.S16 Q0, D12, D1
SADDL v0.4s, v12.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v24.2d, v28.2s, v0.2s
Sqxtn v26.2s, v24.2d
sMULL v24.2d, v29.2s, v1.2s
Sqxtn v27.2s, v24.2d
MOV v26.d[1], v27.d[0]
// Saturating shift by v16, then saturating subtract of the products above.
sQshL v4.4s, v4.4s, v16.4s
sQshL v6.4s, v6.4s, v16.4s
SQSUB v4.4s, v4.4s , v8.4s
SQSUB v6.4s, v6.4s , v26.4s
NEG v26.4s, v14.4s
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
movi v1.2s, #0
//VADDL.S16 Q0, D10, D1
SADDL v0.4s, v10.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v22.2d, v30.2s, v0.2s
Sqxtn v24.2s, v22.2d
sMULL2 v22.2d, v30.4s, v0.4s
Sqxtn v25.2s, v22.2d
MOV v24.d[1], v25.d[0]
movi v1.2s, #0
//VADDL.S16 Q0, D11, D1
SADDL v0.4s, v11.4h, v1.4h
sMULL v8.2d, v30.2s, v0.2s
Sqxtn v22.2s, v8.2d
sMULL2 v8.2d, v30.4s, v0.4s
Sqxtn v23.2s, v8.2d
MOV v22.d[1], v23.d[0]
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
uMULL v8.4s, v26.4h, v11.4h
uMULL v30.4s, v14.4h, v10.4h
// Pre-load for the next iteration, interleaved with the tail of this one.
LD2 { v0.4s, v1.4s}, [x1]
MOV v2.16b, v1.16b
ADD X1, X1, x12
// VUZP.16 D0, D1
UZP1 v19.8h, v0.8h, v0.8h
UZP2 v21.8h, v0.8h, v0.8h
MOV v0.d[0], v19.d[0]
MOV v0.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v2.d[1], v21.d[0]
ushR v8.4s, v8.4s,#16
rev64 v0.8h, v0.8h
MOV v1.d[0], v0.d[1]
ushR v30.4s, v30.4s,#16
rev64 v2.8h, v2.8h
MOV v3.d[0], v2.d[1]
sMLAL v8.4s, v27.4h, v11.4h
sMLAL v30.4s, v15.4h, v10.4h
LD2 { v10.4s, v11.4s}, [x6]
add X6, x6, x12
MOV v12.16b, v11.16b
//VUZP.16 D10, D11
UZP1 v19.8h, v10.8h, v10.8h
UZP2 v21.8h, v10.8h, v10.8h
MOV v10.d[0], v19.d[0]
MOV v10.d[1], v21.d[0]
// VUZP.16 D12, D13
UZP1 v19.8h, v12.8h, v12.8h
UZP2 v21.8h, v12.8h, v12.8h
MOV v12.d[0], v19.d[0]
MOV v12.d[1], v21.d[0]
// v14 / v31 / v22 / v30 hold the results stored at the top of the next pass.
MOV V14.16B, V4.16B
rev64 v10.8h, v10.8h
MOV v11.d[0], v10.d[1]
rev64 v12.8h, v12.8h
MOV v13.d[0], v12.d[1]
sQshL v8.4s, v8.4s, v16.4s
LD2 { v4.4s, v5.4s}, [x8]
ADD x8, x8, #32
MOV V31.16B, V6.16B
MOV v6.16b, v5.16b
sQshL v30.4s, v30.4s, v16.4s
UZP1 v19.8h, v4.8h, v4.8h
UZP2 v21.8h, v4.8h, v4.8h
MOV v4.d[0], v19.d[0]
MOV v5.d[0], v21.d[0]
SQSUB v8.4s, v8.4s , v24.4s
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
SQSUB v22.4s, v30.4s , v22.4s
MOV V30.16B , V8.16B
LD2 {v8.4h, v9.4h}, [x2]
ADD x2, x2,#16
// Loop control: one batch per iteration.
SUBS x3, x3, #1
BNE CORE_LOOP
// EPILOGUE: drains the software pipeline. Stores the batch left pending by
// the last CORE_LOOP pass, runs the same arithmetic once more on the inputs
// that were already pre-loaded (no further loads from x1 / x2 / x6 / x8),
// and stores that final full batch.
EPILOGUE:
ST1 {V14.S}[0],[x0]
ADD x0, x0, x9
ST1 {V22.S}[0],[x0]
ADD x0, x0, x9
ST1 {V14.S}[1],[x0]
ADD x0, x0, x9
ST1 {V22.S}[1],[x0]
ADD x0, x0, x9
ST1 {V14.S}[2],[x0]
ADD x0, x0, x9
ST1 {V22.S}[2],[x0]
ADD x0, x0, x9
ST1 {V14.S}[3],[x0]
ADD x0, x0, x9
ST1 {V22.S}[3],[x0]
ADD x0, x0, x9
ST1 {V31.S}[0],[x5]
ADD x5, x5, x10
ST1 {V30.S}[0],[x5]
ADD x5, x5, x10
ST1 {V31.S}[1],[x5]
ADD x5, x5, x10
ST1 {V30.S}[1],[x5]
ADD x5, x5, x10
ST1 {V31.S}[2],[x5]
ADD x5, x5, x10
ST1 {V30.S}[2],[x5]
ADD x5, x5, x10
ST1 {V31.S}[3],[x5]
ADD x5, x5, x10
ST1 {V30.S}[3],[x5]
ADD x5, x5, x10
// 32x16 products of the descending inputs with the coefficients v8 / v9.
uMULL v30.4s, v0.4h, v9.4h
uMULL v28.4s, v2.4h, v8.4h
uMULL v26.4s, v0.4h, v8.4h
uMULL v24.4s, v2.4h, v9.4h
ushR v30.4s, v30.4s,#16
ushR v28.4s, v28.4s,#16
sMLAL v30.4s, v1.4h, v9.4h
sMLAL v28.4s, v3.4h, v8.4h
ushR v26.4s, v26.4s,#16
ushR v24.4s, v24.4s,#16
sMLAL v26.4s, v1.4h, v8.4h
sMLAL v24.4s, v3.4h, v9.4h
// v30 = -(v30 + v28) ; v28 = v24 - v26
ADD v30.4s, v30.4s , v28.4s
NEG v30.4s, v30.4s
SUB v28.4s, v24.4s , v26.4s
uMULL v22.4s, v4.4h, v8.4h
// The two register copies below appear twice in a row; the second pair
// rewrites the same values.
mov v26.16b, v30.16b
mov v24.16b, v28.16b
mov v26.16b, v30.16b
mov v24.16b, v28.16b
//VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// Cross-correction by the constant 50 held in v18.
uMULL v2.4s, v24.4h, v18.4h
uMULL v0.4s, v26.4h, v18.4h
ushR v22.4s, v22.4s,#16
sMLAL v22.4s, v5.4h, v8.4h
ushR v2.4s, v2.4s,#16
ushR v0.4s, v0.4s,#16
sMLAL v2.4s, v25.4h, v18.4h
sMLAL v0.4s, v27.4h, v18.4h
uMULL v24.4s, v4.4h, v9.4h
uMULL v26.4s, v6.4h, v8.4h
NEG v2.4s, v2.4s
ADD v28.4s, v28.4s , v0.4s
ADD v30.4s, v30.4s , v2.4s
uMULL v0.4s, v6.4h, v9.4h
sshR v24.4s, v24.4s,#16
sMLAL v24.4s, v5.4h, v9.4h
sshR v26.4s, v26.4s,#16
sshR v0.4s, v0.4s,#16
sMLAL v26.4s, v7.4h, v8.4h
sMLAL v0.4s, v7.4h, v9.4h
// v22 = -(v22 + v0) ; v24 = v26 - v24   (ascending inputs)
ADD v22.4s, v22.4s , v0.4s
NEG v22.4s, v22.4s
SUB v24.4s, v26.4s , v24.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v28 = sqshl(sat_add(v28, w26), w25)
MOV w11, w26
dup v14.4s,w11
SQADD v28.4s, v28.4s , v14.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v0.4s,w11
sQshL v28.4s, v28.4s, v0.4s
mov v0.16b, v22.16b
mov v14.16b, v24.16b
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
uMULL v8.4s, v24.4h, v18.4h
uMULL v26.4s, v22.4h, v18.4h
NEG v2.4s, v30.4s
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v3.d[0], v21.d[0]
// Products with the [x6] table halves v12 / v13.
uMULL v4.4s, v30.4h, v12.4h
uMULL v6.4s, v2.4h, v13.4h
ushR v8.4s, v8.4s,#16
ushR v26.4s, v26.4s,#16
sMLAL v8.4s, v25.4h, v18.4h
sMLAL v26.4s, v23.4h, v18.4h
ushR v4.4s, v4.4s,#16
ushR v6.4s, v6.4s,#16
MOV v19.d[0], v30.d[1]
sMLAL v4.4s, v19.4h, v12.4h
sMLAL v6.4s, v3.4h, v13.4h
NEG v8.4s, v8.4s
ADD v14.4s, v14.4s , v26.4s
ADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
MOV w11, w26
dup v8.4s,w11
SQADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v26.4s,w11
sQshL v0.4s, v0.4s, v26.4s
mov v26.16b, v28.16b
// Read 8 old words from [x4], then overwrite them with the new values
// interleaved from v26 and v0 (x4 advances by 32 bytes).
LD2 { v28.4s, v29.4s}, [x4]
MOV v30.16b, v29.16b
MOV v29.d[0], v28.d[1]
// VZIP.32 Q13, Q0
ZIP1 v19.4s, v26.4s, v0.4s
ZIP2 v0.4s, v26.4s, v0.4s
MOV v26.16b, v19.16b
ST1 { v26.4s}, [x4],#16
ST1 { v0.4s}, [x4],#16
// Old [x4] words times the widened table halves, narrowed with saturation.
movi v1.2s, #0
// VADDL.S16 Q0, D13, D1
SADDL v0.4s, v13.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v26.2d, v28.2s, v0.2s
Sqxtn v8.2s, v26.2d
sMULL v26.2d, v29.2s, v1.2s
Sqxtn v9.2s, v26.2d
MOV v8.d[1], v9.d[0]
movi v1.2s, #0
// VADDL.S16 Q0, D12, D1
SADDL v0.4s, v12.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v24.2d, v28.2s, v0.2s
Sqxtn v26.2s, v24.2d
sMULL v24.2d, v29.2s, v1.2s
Sqxtn v27.2s, v24.2d
MOV v26.d[1], v27.d[0]
// Saturating shift by v16, then saturating subtract of the products above.
sQshL v4.4s, v4.4s, v16.4s
sQshL v6.4s, v6.4s, v16.4s
SQSUB v4.4s, v4.4s , v8.4s
SQSUB v6.4s, v6.4s , v26.4s
NEG v26.4s, v14.4s
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
movi v1.2s, #0
//VADDL.S16 Q0, D10, D1
SADDL v0.4s, v10.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v22.2d, v30.2s, v0.2s
Sqxtn v24.2s, v22.2d
sMULL2 v22.2d, v30.4s, v0.4s
Sqxtn v25.2s, v22.2d
MOV v24.d[1], v25.d[0]
movi v1.2s, #0
//VADDL.S16 Q0, D11, D1
SADDL v0.4s, v11.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v8.2d, v30.2s, v0.2s
Sqxtn v22.2s, v8.2d
sMULL2 v8.2d, v30.4s, v0.4s
Sqxtn v23.2s, v8.2d
MOV v22.d[1], v23.d[0]
uMULL v8.4s, v26.4h, v11.4h
uMULL v30.4s, v14.4h, v10.4h
ushR v8.4s, v8.4s,#16
ushR v30.4s, v30.4s,#16
sMLAL v8.4s, v27.4h, v11.4h
sMLAL v30.4s, v15.4h, v10.4h
MOV V14.16B, V4.16B
sQshL v8.4s, v8.4s, v16.4s
sQshL v30.4s, v30.4s, v16.4s
SQSUB v8.4s, v8.4s , v24.4s
SQSUB v22.4s, v30.4s , v22.4s
MOV V30.16B , V8.16B
// Final full batch. v6 is stored directly here; CORE_LOOP copies it to v31
// first because it reloads v6, which does not happen in this stage.
ST1 {V14.S}[0],[x0]
ADD x0, x0, x9
ST1 {V22.S}[0],[x0]
ADD x0, x0, x9
ST1 {V14.S}[1],[x0]
ADD x0, x0, x9
ST1 {V22.S}[1],[x0]
ADD x0, x0, x9
ST1 {V14.S}[2],[x0]
ADD x0, x0, x9
ST1 {V22.S}[2],[x0]
ADD x0, x0, x9
ST1 {V14.S}[3],[x0]
ADD x0, x0, x9
ST1 {V22.S}[3],[x0]
ADD x0, x0, x9
ST1 {V6.S}[0],[x5]
ADD x5, x5, x10
ST1 {V30.S}[0],[x5]
ADD x5, x5, x10
ST1 {V6.S}[1],[x5]
ADD x5, x5, x10
ST1 {V30.S}[1],[x5]
ADD x5, x5, x10
ST1 {V6.S}[2],[x5]
ADD x5, x5, x10
ST1 {V30.S}[2],[x5]
ADD x5, x5, x10
ST1 {V6.S}[3],[x5]
ADD x5, x5, x10
ST1 {V30.S}[3],[x5]
ADD x5, x5, x10
// ARM_EPILOGUE / ARM_LOOP: final partial batch. Despite the label name,
// nothing branches back here; the code runs once and returns. It repeats
// the vector arithmetic of the earlier stages but assembles some inputs from
// narrower loads, and it writes only 7 words to each of the three outputs
// ([x4] region via x6, x0 and x5) instead of 8.
ARM_EPILOGUE:
ARM_LOOP:
// Descending inputs from [x1]; x1 is not advanced any further.
LD2 { v0.4s, v1.4s}, [x1]
MOV v2.16b, v1.16b
//VUZP.16 D0, D1
UZP1 v19.8h, v0.8h, v0.8h
UZP2 v21.8h, v0.8h, v0.8h
MOV v0.d[0], v19.d[0]
MOV v0.d[1], v21.d[0]
//VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v2.d[1], v21.d[0]
rev64 v0.8h, v0.8h
MOV v1.d[0], v0.d[1]
rev64 v2.8h, v2.8h
MOV v3.d[0], v2.d[1]
LD2 {v8.4h, v9.4h}, [x2]
ADD x2, x2,#16
// Ascending inputs: one 16-byte LD2 plus two single-word loads (6 words in
// total); the remaining lanes of v5 / v7 are zeroed first.
LD2 {v4.2s, v5.2s}, [x8]
ADD x8, x8,#16
MOV v6.16b, v5.16b
movi v5.2s, #0x00000000
movi v7.2s, #0x00000000
LD1 {v5.s}[0],[x8],#4
LD1 {v7.s}[0],[x8]
// This value of x12 is never used; x12 is set to -4 a few lines below.
MOV x12, #16
MOV v4.d[1], v5.d[0]
MOV v6.d[1], v7.d[0]
// VUZP.16 D4, D5
UZP1 v19.8h, v4.8h, v4.8h
UZP2 v21.8h, v4.8h, v4.8h
MOV v4.d[0], v19.d[0]
MOV v5.d[0], v21.d[0]
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
// Table words from x6: step forward 16 bytes, take one 16-byte LD2, then
// three single-word loads walking downwards in 4-byte steps (x12 = -4).
ADD x6, x6, #16
MOV x12, #-4
LD2 {v11.2s, v12.2s}, [x6]
ADD x6, x6, x12
MOV v13.16b, v12.16b
movi v10.2s, #0x00000000
LD1 {v12.s}[1],[x6]
ADD x6, x6, x12
LD1 {v10.s}[1],[x6]
ADD x6, x6, x12
LD1 {v12.s}[0],[x6]
ADD x6, x6, x12
MOV v10.d[1], v11.d[0]
MOV v12.d[1], v13.d[0]
//VUZP.16 D10, D11
UZP1 v19.8h, v10.8h, v10.8h
UZP2 v21.8h, v10.8h, v10.8h
MOV v10.d[0], v19.d[0]
MOV v10.d[1], v21.d[0]
//VUZP.16 D12, D13
UZP1 v19.8h, v12.8h, v12.8h
UZP2 v21.8h, v12.8h, v12.8h
MOV v12.d[0], v19.d[0]
MOV v12.d[1], v21.d[0]
rev64 v10.8h, v10.8h
MOV v11.d[0], v10.d[1]
rev64 v12.8h, v12.8h
MOV v13.d[0], v12.d[1]
// 32x16 products of the descending inputs with the coefficients v8 / v9.
uMULL v30.4s, v0.4h, v9.4h
uMULL v28.4s, v2.4h, v8.4h
uMULL v26.4s, v0.4h, v8.4h
uMULL v24.4s, v2.4h, v9.4h
ushR v30.4s, v30.4s,#16
ushR v28.4s, v28.4s,#16
sMLAL v30.4s, v1.4h, v9.4h
sMLAL v28.4s, v3.4h, v8.4h
ushR v26.4s, v26.4s,#16
ushR v24.4s, v24.4s,#16
sMLAL v26.4s, v1.4h, v8.4h
sMLAL v24.4s, v3.4h, v9.4h
// v30 = -(v30 + v28) ; v28 = v24 - v26
ADD v30.4s, v30.4s , v28.4s
NEG v30.4s, v30.4s
uMULL v22.4s, v4.4h, v8.4h
SUB v28.4s, v24.4s , v26.4s
mov v26.16b, v30.16b
mov v24.16b, v28.16b
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
//VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// Cross-correction by the constant 50 held in v18.
uMULL v2.4s, v24.4h, v18.4h
uMULL v0.4s, v26.4h, v18.4h
ushR v22.4s, v22.4s,#16
sMLAL v22.4s, v5.4h, v8.4h
ushR v2.4s, v2.4s,#16
ushR v0.4s, v0.4s,#16
sMLAL v2.4s, v25.4h, v18.4h
sMLAL v0.4s, v27.4h, v18.4h
uMULL v24.4s, v4.4h, v9.4h
uMULL v26.4s, v6.4h, v8.4h
NEG v2.4s, v2.4s
ADD v28.4s, v28.4s , v0.4s
ADD v30.4s, v30.4s , v2.4s
uMULL v0.4s, v6.4h, v9.4h
sshR v24.4s, v24.4s,#16
sMLAL v24.4s, v5.4h, v9.4h
sshR v26.4s, v26.4s,#16
sshR v0.4s, v0.4s,#16
sMLAL v26.4s, v7.4h, v8.4h
sMLAL v0.4s, v7.4h, v9.4h
// v22 = -(v22 + v0) ; v24 = v26 - v24   (ascending inputs)
ADD v22.4s, v22.4s , v0.4s
NEG v22.4s, v22.4s
SUB v24.4s, v26.4s , v24.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v28 = sqshl(sat_add(v28, w26), w25)
MOV w11, w26
dup v14.4s,w11
SQADD v28.4s, v28.4s , v14.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v0.4s,w11
sQshL v28.4s, v28.4s, v0.4s
mov v0.16b, v22.16b
mov v14.16b, v24.16b
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
uMULL v8.4s, v24.4h, v18.4h
uMULL v26.4s, v22.4h, v18.4h
NEG v2.4s, v30.4s
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v3.d[0], v21.d[0]
// Products with the table halves v12 / v13.
uMULL v4.4s, v30.4h, v12.4h
uMULL v6.4s, v2.4h, v13.4h
ushR v8.4s, v8.4s,#16
ushR v26.4s, v26.4s,#16
sMLAL v8.4s, v25.4h, v18.4h
sMLAL v26.4s, v23.4h, v18.4h
ushR v4.4s, v4.4s,#16
ushR v6.4s, v6.4s,#16
MOV v19.d[0], v30.d[1]
sMLAL v4.4s, v19.4h, v12.4h
sMLAL v6.4s, v3.4h, v13.4h
NEG v8.4s, v8.4s
ADD v14.4s, v14.4s , v26.4s
ADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
MOV w11, w26
dup v8.4s,w11
SQADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v26.4s,w11
sQshL v0.4s, v0.4s, v26.4s
mov v26.16b, v28.16b
// x6 is repurposed as the write pointer into the [x4] region. Seven old
// words are read through x4 (one 16-byte load plus three single words) and
// de-interleaved into v28 / v30; seven new words are then written via x6.
MOV x6, x4
LD1 {v28.2s, v29.2s}, [x4],#16
movi v19.2s, #0x00000000
LD1 {v30.s}[0],[x4],#4
LD1 {v30.s}[1],[x4],#4
LD1 {v19.s}[0],[x4],#4
MOV v28.d[1], v29.d[0]
MOV v30.d[1], v19.d[0]
//VUZP.32 Q14, Q15
UZP1 v19.4s, v28.4s, v30.4s
UZP2 v30.4s, v28.4s, v30.4s
MOV v28.16b, v19.16b
MOV v29.d[0], v28.d[1]
ST1 {v26.s}[0],[x6],#4
ST1 {v0.s}[0],[x6],#4
ST1 {v26.s}[1],[x6],#4
ST1 {v0.s}[1],[x6],#4
ST1 {v26.s}[2],[x6],#4
ST1 {v0.s}[2],[x6],#4
ST1 {v26.s}[3],[x6],#4
// Old words times the widened table halves, narrowed with saturation.
movi v1.2s, #0
//VADDL.S16 Q0, D13, D1
SADDL v0.4s, v13.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v26.2d, v28.2s, v0.2s
Sqxtn v8.2s, v26.2d
sMULL v26.2d, v29.2s, v1.2s
Sqxtn v9.2s, v26.2d
MOV v8.d[1], v9.d[0]
movi v1.2s, #0
//VADDL.S16 Q0, D12, D1
SADDL v0.4s, v12.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v24.2d, v28.2s, v0.2s
Sqxtn v26.2s, v24.2d
sMULL v24.2d, v29.2s, v1.2s
Sqxtn v27.2s, v24.2d
MOV v26.d[1], v27.d[0]
// Saturating shift by v16, then saturating subtract of the products above.
sQshL v4.4s, v4.4s, v16.4s
sQshL v6.4s, v6.4s, v16.4s
SQSUB v4.4s, v4.4s , v8.4s
SQSUB v6.4s, v6.4s , v26.4s
NEG v26.4s, v14.4s
//VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
movi v1.2s, #0
//VADDL.S16 Q0, D10, D1
SADDL v0.4s, v10.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v22.2d, v30.2s, v0.2s
Sqxtn v24.2s, v22.2d
sMULL2 v22.2d, v30.4s, v0.4s
Sqxtn v25.2s, v22.2d
MOV v24.d[1], v25.d[0]
movi v1.2s, #0
// VADDL.S16 Q0, D11, D1
SADDL v0.4s, v11.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v8.2d, v30.2s, v0.2s
Sqxtn v22.2s, v8.2d
sMULL2 v8.2d, v30.4s, v0.4s
Sqxtn v23.2s, v8.2d
MOV v22.d[1], v23.d[0]
uMULL v8.4s, v26.4h, v11.4h
uMULL v30.4s, v14.4h, v10.4h
ushR v8.4s, v8.4s,#16
ushR v30.4s, v30.4s,#16
sMLAL v8.4s, v27.4h, v11.4h
sMLAL v30.4s, v15.4h, v10.4h
MOV V14.16B , V4.16B
//mov v15.8b, v6.8b
sQshL v8.4s, v8.4s, v16.4s
sQshL v30.4s, v30.4s, v16.4s
SQSUB v8.4s, v8.4s , v24.4s
SQSUB v22.4s, v30.4s , v22.4s
MOV V30.16B, V8.16B
// Final stores: 7 words through x0 (no V22 lane 3) and 7 words through x5
// (no V30 lane 3).
ST1 {V14.S}[0],[x0]
ADD x0, x0, x9
ST1 {V22.S}[0],[x0]
ADD x0, x0, x9
ST1 {V14.S}[1],[x0]
ADD x0, x0, x9
ST1 {V22.S}[1],[x0]
ADD x0, x0, x9
ST1 {V14.S}[2],[x0]
ADD x0, x0, x9
ST1 {V22.S}[2],[x0]
ADD x0, x0, x9
ST1 {V14.S}[3],[x0]
ADD x0, x0, x9
ST1 {V6.S}[0],[x5]
ADD x5, x5, x10
ST1 {V30.S}[0],[x5]
ADD x5, x5, x10
ST1 {V6.S}[1],[x5]
ADD x5, x5, x10
ST1 {V30.S}[1],[x5]
ADD x5, x5, x10
ST1 {V6.S}[2],[x5]
ADD x5, x5, x10
ST1 {V30.S}[2],[x5]
ADD x5, x5, x10
ST1 {V6.S}[3],[x5]
ADD x5, x5, x10
// Restore the registers saved by push_v_regs at entry and return.
pop_v_regs
ret