unplugged-system/external/libaom/av1/encoder/x86/error_sse2.asm

;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; Advance pointer register %1 by %2 elements of tran_low_t.
;   %1 - pointer register to update in place
;   %2 - number of elements (not bytes) to step forward
; Each element is treated as 4 bytes wide, so the byte step is (%2) * 4.
; %2 is parenthesized so that an expression argument (for example 8 + 8) is
; scaled as a whole, the same way LOAD_TRAN_LOW scales its element offset.
; lea performs the addition without modifying the flags register.
%macro INCREMENT_ELEMENTS_TRAN_LOW 2
  lea %1, [%1 + (%2) * 4]
%endmacro

; Load tran_low_t elements starting at %2 + %3 into m%1 as 16-bit lanes.
;   %1 - index of the destination vector register (m%1)
;   %2 - base pointer register
;   %3 - offset in elements, not bytes (scaled here by 4 bytes per element)
; The elements are read as 32-bit values and packed down to 16 bits with
; signed saturation. Only the 32-bit tran_low_t layout is handled; there is
; no separate path for a 16-bit tran_low_t.
; With 128-bit registers (INIT_XMM) the result is eight consecutive elements.
; NOTE(review): mova is presumably x86inc's aligned move, and packssdw takes a
; memory operand, so the address is expected to be 16-byte aligned - confirm
; against callers.
%macro LOAD_TRAN_LOW 3
  ; First register-width of 32-bit elements.
  mova     m%1, [%2 + (%3) * 4]
  ; Narrow those dwords and the 16 bytes that follow them into words.
  packssdw m%1, [%2 + (%3) * 4 + 16]
%endmacro

%define private_prefix av1

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; int64_t av1_block_error(tran_low_t *coeff, tran_low_t *dqcoeff,
;                         intptr_t block_size, int64_t *ssz)
;
; Returns the sum over the block of (dqcoeff[i] - coeff[i])^2 and stores the
; sum of coeff[i]^2 to *ssz. Both inputs are read as 4-byte elements and
; narrowed to 16 bits (with signed saturation) by LOAD_TRAN_LOW.
;
; Register roles:
;   uqcq  - coeff pointer, advanced 16 elements per iteration
;   dqcq  - dqcoeff pointer, advanced 16 elements per iteration
;   sizeq - elements remaining
;   m4    - 2 x 64-bit partial sums of the squared error (return value)
;   m6    - 2 x 64-bit partial sums of coeff^2 (*ssz)
;   m5    - all zero inside the loop, used to zero-extend dwords to qwords
;   m0-m3, m7 - scratch
;
; The loop body executes before the counter is tested, so 16 elements are
; always processed at least once.
; NOTE(review): block_size is presumably a positive multiple of 16 - confirm
; against callers.

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m5, m5
  pxor      m4, m4
  pxor      m6, m6
.loop:
  ; m2/m3 = 16 coeff values, m0/m1 = 16 dqcoeff values, as 16-bit lanes.
  LOAD_TRAN_LOW 2, uqcq, 0
  LOAD_TRAN_LOW 3, uqcq, 8
  LOAD_TRAN_LOW 0, dqcq, 0
  LOAD_TRAN_LOW 1, dqcq, 8
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16

  ; --- squared error: m4 += (dqcoeff - coeff)^2 ---
  psubw     m0, m2
  psubw     m1, m3
  ; pmaddwd squares each word and adds neighbouring products into a dword.
  ; The differences are expected to be at most 15 bits plus sign, so every
  ; square is at most 30 bits and each pair sum fits in 31 bits.
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; Two 31-bit values still fit in an unsigned 32-bit lane.
  paddd     m0, m1
  ; Zero-extend the four dwords to qwords and add them to the accumulator.
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  paddq     m4, m0

  ; --- source energy: m6 += coeff^2 ---
  pmaddwd   m2, m2
  pmaddwd   m3, m3
  paddd     m2, m3
  punpckldq m7, m2, m5
  punpckhdq m2, m5
  paddq     m6, m7
  paddq     m6, m2

  ; 16 elements consumed; keep going while any remain (signed compare).
  sub    sizeq, 16
  jg .loop

  ; Fold the upper 64-bit lane of each accumulator into its lower lane.
  movhlps   m7, m6
  paddq     m6, m7
  movhlps   m5, m4
  paddq     m4, m5
%if ARCH_X86_64
  ; *ssz = energy; the error sum is returned in rax.
  movq [sszq], m6
  movq    rax, m4
%else
  ; ssz is read from its stack slot; eax is reused for the return value only
  ; after the store through it has been issued.
  mov     eax, sszm
  movq  [eax], m6
  ; The 64-bit error sum is returned in edx:eax.
  pshufd   m5, m4, 0x1
  movd    edx, m5
  movd    eax, m4
%endif
  RET
Initial commit: AOSP 14 with modifications for Unplugged OS 2025-10-06 13:59:42 +00:00			`;`
			`; Copyright (c) 2016, Alliance for Open Media. All rights reserved`
			`;`
			`; This source code is subject to the terms of the BSD 2 Clause License and`
			`; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License`
			`; was not distributed with this source code in the LICENSE file, you can`
			`; obtain it at www.aomedia.org/license/software. If the Alliance for Open`
			`; Media Patent License 1.0 was not distributed with this source code in the`
			`; PATENTS file, you can obtain it at www.aomedia.org/license/patent.`
			`;`

			`;`

			`; Increment %1 by sizeof() tran_low_t * %2.`
			`%macro INCREMENT_ELEMENTS_TRAN_LOW 2`
			`lea %1, [%1 + %2 * 4]`
			`%endmacro`

			`; Load %2 + %3 into m%1.`
			`; %3 is the offset in elements, not bytes.`
			`; If tran_low_t is 16 bits (low bit depth configuration) then load the value`
			`; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack`
			`; the values down to 16 bits.`
			`%macro LOAD_TRAN_LOW 3`
			`mova m%1, [%2 + (%3) * 4]`
			`packssdw m%1, [%2 + (%3) * 4 + 16]`
			`%endmacro`

			`%define private_prefix av1`

			`%include "third_party/x86inc/x86inc.asm"`

			`SECTION .text`

			`; int64_t av1_block_error(int16_t coeff, int16_t dqcoeff, intptr_t block_size,`
			`; int64_t *ssz)`

			`INIT_XMM sse2`
			`cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz`
			`pxor m4, m4 ; sse accumulator`
			`pxor m6, m6 ; ssz accumulator`
			`pxor m5, m5 ; dedicated zero register`
			`.loop:`
			`LOAD_TRAN_LOW 2, uqcq, 0`
			`LOAD_TRAN_LOW 0, dqcq, 0`
			`LOAD_TRAN_LOW 3, uqcq, 8`
			`LOAD_TRAN_LOW 1, dqcq, 8`
			`INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16`
			`INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16`
			`sub sizeq, 16`
			`psubw m0, m2`
			`psubw m1, m3`
			`; individual errors are max. 15bit+sign, so squares are 30bit, and`
			`; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)`
			`pmaddwd m0, m0`
			`pmaddwd m1, m1`
			`pmaddwd m2, m2`
			`pmaddwd m3, m3`
			`; the sum of 2 31bit integers will fit in a 32bit unsigned integer`
			`paddd m0, m1`
			`paddd m2, m3`
			`; accumulate in 64bit`
			`punpckldq m7, m0, m5`
			`punpckhdq m0, m5`
			`paddq m4, m7`
			`punpckldq m7, m2, m5`
			`paddq m4, m0`
			`punpckhdq m2, m5`
			`paddq m6, m7`
			`paddq m6, m2`
			`jg .loop`

			`; accumulate horizontally and store in return value`
			`movhlps m5, m4`
			`movhlps m7, m6`
			`paddq m4, m5`
			`paddq m6, m7`
			`%if ARCH_X86_64`
			`movq rax, m4`
			`movq [sszq], m6`
			`%else`
			`mov eax, sszm`
			`pshufd m5, m4, 0x1`
			`movq [eax], m6`
			`movd eax, m4`
			`movd edx, m5`
			`%endif`
			`RET`