unplugged-system/external/libavc/common/svc/isvc_resi_trans_quant.c

841 lines
25 KiB
C

/******************************************************************************
*
* Copyright (C) 2022 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* isvc_resi_trans_quant.c
*
* @brief
* Contains function definitions for the single stage forward transform for H.264.
* It calculates the residue, performs the cf and then performs quantization
*
* @author
* Ittiam
*
* @par List of Functions:
* - isvc_resi_trans_quant_4x4()
* - isvc_resi_trans_quant_chroma_4x4()
* - isvc_hadamard_quant_4x4()
* - isvc_hadamard_quant_2x2_uv()
* - isvc_resi_trans_quant_8x8()
*
* @remarks
*******************************************************************************
*/
/* System include files */
#include <stdbool.h>
#include <stddef.h>
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_size_defs.h"
#include "ih264_macros.h"
#include "ih264_trans_macros.h"
#include "ih264_trans_data.h"
#include "ih264_structs.h"
#include "isvc_trans_quant_itrans_iquant.h"
/**
 * Returns (i2_residue - i2_upsampled_res) passed through CLIP3 with the bounds
 * -UINT8_MAX and +UINT8_MAX.
 * NOTE(review): CLIP3 is defined outside this file; it is presumed to clamp its
 * third argument between the first two - confirm against ih264_macros.h.
 */
static FORCEINLINE WORD16 isvc_subtract_upsampled_res(WORD16 i2_residue, WORD16 i2_upsampled_res)
{
    const WORD16 i2_upper_bound = (WORD16) UINT8_MAX;
    const WORD16 i2_lower_bound = -((WORD16) UINT8_MAX);

    return CLIP3(i2_lower_bound, i2_upper_bound, i2_residue - i2_upsampled_res);
}
/**
*******************************************************************************
*
* @brief
*  Computes the residue of a 4x4 block, applies the forward 4x4 transform and
*  quantizes the coefficients
*
* @par Description:
*  For each of the 4 rows the residue (source - prediction) is computed. When
*  u1_use_upsampled_res is non-zero, the upsampled residual sample is further
*  subtracted through isvc_subtract_upsampled_res(). The rows are transformed
*  horizontally into the output buffer, then every column is transformed
*  vertically and quantized in place with FWD_QUANT.
*
* @param[in] ps_src
*  Source block; pv_data is read as UWORD8 samples, rows are i4_data_stride
*  apart
*
* @param[in] ps_pred
*  Prediction block; pv_data is read as UWORD8 samples, rows are
*  i4_data_stride apart
*
* @param[out] ps_out
*  Output block; pv_data receives 16 WORD16 coefficients stored contiguously,
*  4 per row (the stride field of ps_out is not read by this function)
*
* @param[in] ps_upsampled_res
*  Upsampled residual block (WORD16 samples, stride in WORD16 units). May be
*  NULL, but must be valid whenever u1_use_upsampled_res is non-zero
*
* @param[in] ps_quant_constants
*  Supplies pu2_scale_matrix, pu2_threshold_matrix (both indexed as
*  [4 * row + col]), u4_qbits and u4_round_factor
*
* @param[out] pu1_nnz
*  Receives the counter accumulated by FWD_QUANT over all 16 coefficients
*  (presumably the number of non-zero quantized coefficients - the macro is
*  defined outside this file)
*
* @param[out] pi2_dc_out
*  Receives the DC coefficient as it is before quantization; must not be NULL
*
* @param[in] u1_use_upsampled_res
*  Non-zero to subtract the upsampled residual from the residue
*
* @returns
*  None
*
*******************************************************************************
*/
void isvc_resi_trans_quant_4x4(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                               buffer_container_t *ps_out, buffer_container_t *ps_upsampled_res,
                               resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz,
                               WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD32 i;
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value;
    UWORD8 *pu1_src = ps_src->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    WORD16 *pi2_out = ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* computing prediction error (residue) */
        x4 = pu1_src[0] - pu1_pred[0];
        x5 = pu1_src[1] - pu1_pred[1];
        x6 = pu1_src[2] - pu1_pred[2];
        x7 = pu1_src[3] - pu1_pred[3];

        if(u1_use_upsampled_res)
        {
            x4 = isvc_subtract_upsampled_res(x4, pi2_upsampled_res[0]);
            x5 = isvc_subtract_upsampled_res(x5, pi2_upsampled_res[1]);
            x6 = isvc_subtract_upsampled_res(x6, pi2_upsampled_res[2]);
            x7 = isvc_subtract_upsampled_res(x7, pi2_upsampled_res[3]);

            /* Advance only while the residual is in use: the pointer is NULL */
            /* when ps_upsampled_res is NULL and arithmetic on NULL is undefined */
            pi2_upsampled_res += i4_upsampled_res_stride;
        }

        /* Horizontal transform */
        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* Multiplication is used instead of '<< 1' because the operands may */
        /* be negative and left-shifting a negative value is undefined */
        pi2_out_tmp[0] = x0 + x1;
        pi2_out_tmp[1] = (x3 * 2) + x2;
        pi2_out_tmp[2] = x0 - x1;
        pi2_out_tmp[3] = x3 - (x2 * 2);

        /* pointing to next row; */
        pu1_src += i4_src_stride;
        pu1_pred += i4_pred_stride;
        pi2_out_tmp += 4;
    }

    pi2_out_tmp = pi2_out;

    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* Vertical transform and quantization */
        x4 = pi2_out_tmp[0];
        x5 = pi2_out_tmp[4];
        x6 = pi2_out_tmp[8];
        x7 = pi2_out_tmp[12];

        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* quantization is done in place */
        i4_value = x0 + x1;

        /* First column, first row: export the DC term before it is quantized */
        if(i == 0)
        {
            (*pi2_dc_out) = i4_value;
        }

        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = i4_value;

        i4_value = (x3 * 2) + x2;
        FWD_QUANT(i4_value, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[4] = i4_value;

        i4_value = x0 - x1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = i4_value;

        i4_value = x3 - (x2 * 2);
        FWD_QUANT(i4_value, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor,
                  u4_qbits, u4_nonzero_coeff);
        pi2_out_tmp[12] = i4_value;

        /* Move to the next column of the block and of the quant matrices */
        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}
/**
*******************************************************************************
*
* @brief
*  Computes the residue of a 4x4 chroma block whose source and prediction
*  samples are interleaved, applies the forward 4x4 transform and quantizes
*  the coefficients
*
* @par Description:
*  For each of the 4 rows the residue (source - prediction) is computed from
*  every second sample (offsets 0, 2, 4 and 6) of the source and prediction
*  rows. When u1_use_upsampled_res is non-zero, the upsampled residual sample
*  is further subtracted through isvc_subtract_upsampled_res(). The rows are
*  transformed horizontally into the output buffer, then every column is
*  transformed vertically and quantized in place with FWD_QUANT.
*
* @param[in] ps_src
*  Source block; pv_data is read as UWORD8 samples at offsets 0, 2, 4, 6 of
*  each row, rows are i4_data_stride apart
*
* @param[in] ps_pred
*  Prediction block; read with the same layout as ps_src
*
* @param[out] ps_out
*  Output block; pv_data receives 16 WORD16 coefficients stored contiguously,
*  4 per row (the stride field of ps_out is not read by this function)
*
* @param[in] ps_upsampled_res
*  Upsampled residual block (WORD16 samples, stride in WORD16 units). May be
*  NULL, but must be valid whenever u1_use_upsampled_res is non-zero
*
* @param[in] ps_quant_constants
*  Supplies pu2_scale_matrix, pu2_threshold_matrix (both indexed as
*  [4 * row + col]), u4_qbits and u4_round_factor
*
* @param[out] pu1_nnz
*  Receives the counter accumulated by FWD_QUANT over all 16 coefficients
*  (presumably the number of non-zero quantized coefficients - the macro is
*  defined outside this file)
*
* @param[out] pi2_dc_out
*  Receives the DC coefficient as it is before quantization; must not be NULL
*
* @param[in] u1_use_upsampled_res
*  Non-zero to subtract the upsampled residual from the residue
*
* @returns
*  None
*
*******************************************************************************
*/
void isvc_resi_trans_quant_chroma_4x4(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                      buffer_container_t *ps_out,
                                      buffer_container_t *ps_upsampled_res,
                                      resi_trans_quant_constants_t *ps_quant_constants,
                                      UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                      UWORD8 u1_use_upsampled_res)
{
    UWORD32 i;
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value;
    UWORD8 *pu1_src = ps_src->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    WORD16 *pi2_out = ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* computing prediction error (residue); samples are interleaved */
        x4 = pu1_src[0] - pu1_pred[0];
        x5 = pu1_src[2] - pu1_pred[2];
        x6 = pu1_src[4] - pu1_pred[4];
        x7 = pu1_src[6] - pu1_pred[6];

        if(u1_use_upsampled_res)
        {
            /* NOTE(review): the residual is indexed contiguously (0..3) while */
            /* src/pred are read at stride 2 - confirm the residual layout */
            x4 = isvc_subtract_upsampled_res(x4, pi2_upsampled_res[0]);
            x5 = isvc_subtract_upsampled_res(x5, pi2_upsampled_res[1]);
            x6 = isvc_subtract_upsampled_res(x6, pi2_upsampled_res[2]);
            x7 = isvc_subtract_upsampled_res(x7, pi2_upsampled_res[3]);

            /* Advance only while the residual is in use: the pointer is NULL */
            /* when ps_upsampled_res is NULL and arithmetic on NULL is undefined */
            pi2_upsampled_res += i4_upsampled_res_stride;
        }

        /* Horizontal transform */
        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* Multiplication is used instead of '<< 1' because the operands may */
        /* be negative and left-shifting a negative value is undefined */
        pi2_out_tmp[0] = x0 + x1;
        pi2_out_tmp[1] = (x3 * 2) + x2;
        pi2_out_tmp[2] = x0 - x1;
        pi2_out_tmp[3] = x3 - (x2 * 2);

        /* pointing to next row; */
        pu1_src += i4_src_stride;
        pu1_pred += i4_pred_stride;
        pi2_out_tmp += 4;
    }

    pi2_out_tmp = pi2_out;

    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* Vertical transform and quantization */
        x4 = pi2_out_tmp[0];
        x5 = pi2_out_tmp[4];
        x6 = pi2_out_tmp[8];
        x7 = pi2_out_tmp[12];

        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* quantization is done in place */
        i4_value = x0 + x1;

        /* First column, first row: export the DC term before it is quantized */
        if(i == 0)
        {
            *pi2_dc_out = i4_value;
        }

        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = i4_value;

        i4_value = (x3 * 2) + x2;
        FWD_QUANT(i4_value, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[4] = i4_value;

        i4_value = x0 - x1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = i4_value;

        i4_value = x3 - (x2 * 2);
        FWD_QUANT(i4_value, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor,
                  u4_qbits, u4_nonzero_coeff);
        pi2_out_tmp[12] = i4_value;

        /* Move to the next column of the block and of the quant matrices */
        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}
/**
*******************************************************************************
*
* @brief
*  Applies a forward 4x4 Hadamard transform to a block of WORD16 values and
*  quantizes the result
*
* @par Description:
*  Each of the 4 input rows is transformed with a sum/difference butterfly and
*  written to the destination. Every destination column is then transformed
*  the same way, halved with a right shift by one and quantized in place with
*  FWD_QUANT. All 16 coefficients use entry 0 of the threshold and scale
*  matrices.
*  NOTE(review): the input is presumably the 16 DC coefficients of a
*  macroblock - confirm against the caller.
*
* @param[in] pi2_src
*  16 input values, stored contiguously, 4 per row
*
* @param[out] pi2_dst
*  Receives the 16 quantized coefficients, stored contiguously, 4 per row
*
* @param[in] ps_quant_constants
*  Supplies pu2_scale_matrix, pu2_threshold_matrix (only entry 0 of each is
*  read), u4_qbits and u4_round_factor
*
* @param[out] pu1_nnz
*  pu1_nnz[0] is reset to 0 and then used as the counter updated by FWD_QUANT
*  (presumably the number of non-zero quantized coefficients - the macro is
*  defined outside this file)
*
* @returns
*  None
*
*******************************************************************************
*/
void isvc_hadamard_quant_4x4(WORD16 *pi2_src, WORD16 *pi2_dst,
                             resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz)
{
    WORD32 i4_row, i4_col;
    WORD32 i4_in0, i4_in1, i4_in2, i4_in3;
    WORD32 i4_outer_sum, i4_inner_sum, i4_inner_diff, i4_outer_diff;
    WORD32 i4_value;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    *pu1_nnz = 0;

    /* Horizontal pass: one butterfly per row, source -> destination */
    for(i4_row = 0; i4_row < SUB_BLK_WIDTH_4x4; i4_row++)
    {
        const WORD16 *pi2_in_row = &pi2_src[4 * i4_row];
        WORD16 *pi2_out_row = &pi2_dst[4 * i4_row];

        i4_in0 = pi2_in_row[0];
        i4_in1 = pi2_in_row[1];
        i4_in2 = pi2_in_row[2];
        i4_in3 = pi2_in_row[3];

        i4_outer_sum = i4_in0 + i4_in3;
        i4_inner_sum = i4_in1 + i4_in2;
        i4_inner_diff = i4_in1 - i4_in2;
        i4_outer_diff = i4_in0 - i4_in3;

        pi2_out_row[0] = i4_outer_sum + i4_inner_sum;
        pi2_out_row[1] = i4_outer_diff + i4_inner_diff;
        pi2_out_row[2] = i4_outer_sum - i4_inner_sum;
        pi2_out_row[3] = i4_outer_diff - i4_inner_diff;
    }

    /* Vertical pass: one butterfly per column, halved and quantized in place */
    for(i4_col = 0; i4_col < SUB_BLK_WIDTH_4x4; i4_col++)
    {
        WORD16 *pi2_col = &pi2_dst[i4_col];

        i4_in0 = pi2_col[0];
        i4_in1 = pi2_col[4];
        i4_in2 = pi2_col[8];
        i4_in3 = pi2_col[12];

        i4_outer_sum = i4_in0 + i4_in3;
        i4_inner_sum = i4_in1 + i4_in2;
        i4_inner_diff = i4_in1 - i4_in2;
        i4_outer_diff = i4_in0 - i4_in3;

        i4_value = (i4_outer_sum + i4_inner_sum) >> 1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[0]);
        pi2_col[0] = i4_value;

        i4_value = (i4_outer_diff + i4_inner_diff) >> 1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[0]);
        pi2_col[4] = i4_value;

        i4_value = (i4_outer_sum - i4_inner_sum) >> 1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[0]);
        pi2_col[8] = i4_value;

        i4_value = (i4_outer_diff - i4_inner_diff) >> 1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[0]);
        pi2_col[12] = i4_value;
    }
}
/**
*******************************************************************************
*
* @brief
*  Applies a forward 2x2 Hadamard transform and quantization to two
*  consecutive groups of four WORD16 values (one group per chroma plane)
*
* @par Description:
*  For each of the two planes, four consecutive input values are combined with
*  a 2x2 sum/difference butterfly and each result is quantized with FWD_QUANT
*  using entry 0 of the threshold and scale matrices.
*
* @param[in] pi2_src
*  8 input values: 4 for the first plane followed by 4 for the second
*
* @param[out] pi2_dst
*  Receives 8 quantized coefficients with the same 4 + 4 layout
*
* @param[in] ps_quant_constants
*  Supplies pu2_scale_matrix, pu2_threshold_matrix (only entry 0 of each is
*  read), u4_qbits and u4_round_factor
*
* @param[out] pu1_nnz
*  pu1_nnz[0] and pu1_nnz[1] are reset to 0 and then used as the counters
*  updated by FWD_QUANT for the first and second plane respectively
*
* @returns
*  None
*
* @remarks
*  The per-plane counters are written to positions 0 and 1 of pu1_nnz
*
*******************************************************************************
*/
void isvc_hadamard_quant_2x2_uv(WORD16 *pi2_src, WORD16 *pi2_dst,
                                resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz)
{
    WORD32 i4_top_sum, i4_top_diff, i4_bot_sum, i4_bot_diff;
    WORD32 i4_value, plane;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    for(plane = 0; plane < 2; plane++)
    {
        /* Each plane owns four consecutive values of both buffers */
        const WORD16 *pi2_in = &pi2_src[4 * plane];
        WORD16 *pi2_out = &pi2_dst[4 * plane];
        WORD32 i4_in0, i4_in1, i4_in2, i4_in3;

        pu1_nnz[plane] = 0;

        /* Horizontal transform */
        i4_in0 = pi2_in[0];
        i4_in1 = pi2_in[1];
        i4_in2 = pi2_in[2];
        i4_in3 = pi2_in[3];

        i4_top_sum = i4_in0 + i4_in1;
        i4_top_diff = i4_in0 - i4_in1;
        i4_bot_sum = i4_in2 + i4_in3;
        i4_bot_diff = i4_in2 - i4_in3;

        /* Vertical transform and quantization */
        i4_value = (i4_top_sum + i4_bot_sum);
        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[plane]);
        pi2_out[0] = i4_value;

        i4_value = (i4_top_sum - i4_bot_sum);
        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[plane]);
        pi2_out[2] = i4_value;

        i4_value = (i4_top_diff - i4_bot_diff);
        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[plane]);
        pi2_out[3] = i4_value;

        i4_value = (i4_top_diff + i4_bot_diff);
        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[plane]);
        pi2_out[1] = i4_value;
    }
}
/**
*******************************************************************************
*
* @brief
*  Computes the residue of an 8x8 block, applies the single stage 8x8 forward
*  transform (CF8) and quantizes the coefficients
*
* @par Description:
*  For each of the 8 rows the residue (source - prediction) is computed. When
*  u1_use_upsampled_res is non-zero, the upsampled residual sample is further
*  subtracted through isvc_subtract_upsampled_res(). The rows are transformed
*  horizontally into the output buffer, then every column is transformed
*  vertically and quantized in place with FWD_QUANT.
*
* @param[in] ps_src
*  Source block; pv_data is read as UWORD8 samples, rows are i4_data_stride
*  apart
*
* @param[in] ps_pred
*  Prediction block; pv_data is read as UWORD8 samples, rows are
*  i4_data_stride apart
*
* @param[out] ps_out
*  Output block; pv_data receives 64 WORD16 coefficients stored contiguously,
*  8 per row (the stride field of ps_out is not read by this function)
*
* @param[in] ps_upsampled_res
*  Upsampled residual block (WORD16 samples, stride in WORD16 units). May be
*  NULL, but must be valid whenever u1_use_upsampled_res is non-zero
*
* @param[in] ps_quant_constants
*  Supplies pu2_scale_matrix, pu2_threshold_matrix (both indexed as
*  [8 * row + col]), u4_qbits and u4_round_factor
*
* @param[out] pu1_nnz
*  Receives the counter accumulated by FWD_QUANT over all 64 coefficients
*  (presumably the number of non-zero quantized coefficients - the macro is
*  defined outside this file)
*
* @param[in] pi2_dc_out
*  Not used by this function
*
* @param[in] u1_use_upsampled_res
*  Non-zero to subtract the upsampled residual from the residue
*
* @returns
*  None
*
*******************************************************************************
*/
void isvc_resi_trans_quant_8x8(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                               buffer_container_t *ps_out, buffer_container_t *ps_upsampled_res,
                               resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz,
                               WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD32 i;
    WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
    WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
    UWORD8 *pu1_src = ps_src->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    WORD16 *pi2_out = ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    UNUSED(pi2_dc_out);

    /* Horizontal transform */
    /* The a's and r's are reused for several intermediate values to avoid */
    /* declaring more variables */
    for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
    {
        /* computing prediction error (residue) */
        r0 = pu1_src[0];
        r0 -= pu1_pred[0];
        r1 = pu1_src[1];
        r1 -= pu1_pred[1];
        r2 = pu1_src[2];
        r2 -= pu1_pred[2];
        r3 = pu1_src[3];
        r3 -= pu1_pred[3];
        r4 = pu1_src[4];
        r4 -= pu1_pred[4];
        r5 = pu1_src[5];
        r5 -= pu1_pred[5];
        r6 = pu1_src[6];
        r6 -= pu1_pred[6];
        r7 = pu1_src[7];
        r7 -= pu1_pred[7];

        if(u1_use_upsampled_res)
        {
            r0 = isvc_subtract_upsampled_res(r0, pi2_upsampled_res[0]);
            r1 = isvc_subtract_upsampled_res(r1, pi2_upsampled_res[1]);
            r2 = isvc_subtract_upsampled_res(r2, pi2_upsampled_res[2]);
            r3 = isvc_subtract_upsampled_res(r3, pi2_upsampled_res[3]);
            r4 = isvc_subtract_upsampled_res(r4, pi2_upsampled_res[4]);
            r5 = isvc_subtract_upsampled_res(r5, pi2_upsampled_res[5]);
            r6 = isvc_subtract_upsampled_res(r6, pi2_upsampled_res[6]);
            r7 = isvc_subtract_upsampled_res(r7, pi2_upsampled_res[7]);

            /* Advance only while the residual is in use: the pointer is NULL */
            /* when ps_upsampled_res is NULL and arithmetic on NULL is undefined */
            pi2_upsampled_res += i4_upsampled_res_stride;
        }

        /* Even-indexed outputs: built from the sums of mirrored samples */
        a0 = r0 + r7;
        a1 = r1 + r6;
        a2 = r2 + r5;
        a3 = r3 + r4;

        a4 = a0 + a3;
        a5 = a1 + a2;
        a6 = a0 - a3;
        a7 = a1 - a2;

        pi2_out_tmp[0] = a4 + a5;
        pi2_out_tmp[2] = a6 + (a7 >> 1);
        pi2_out_tmp[4] = a4 - a5;
        pi2_out_tmp[6] = (a6 >> 1) - a7;

        /* Odd-indexed outputs: built from the differences of mirrored samples */
        a0 = r0 - r7;
        a1 = r1 - r6;
        a2 = r2 - r5;
        a3 = r3 - r4;

        a4 = a1 + a2 + ((a0 >> 1) + a0);
        a5 = a0 - a3 - ((a2 >> 1) + a2);
        a6 = a0 + a3 - ((a1 >> 1) + a1);
        a7 = a1 - a2 + ((a3 >> 1) + a3);

        pi2_out_tmp[1] = a4 + (a7 >> 2);
        pi2_out_tmp[3] = a5 + (a6 >> 2);
        pi2_out_tmp[5] = a6 - (a5 >> 2);
        pi2_out_tmp[7] = (a4 >> 2) - a7;

        /* pointing to next row */
        pu1_src += i4_src_stride;
        pu1_pred += i4_pred_stride;
        pi2_out_tmp += 8;
    }

    /* vertical transform and quant */
    pi2_out_tmp = pi2_out;

    for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
    {
        r0 = pi2_out_tmp[0];
        r1 = pi2_out_tmp[8];
        r2 = pi2_out_tmp[16];
        r3 = pi2_out_tmp[24];
        r4 = pi2_out_tmp[32];
        r5 = pi2_out_tmp[40];
        r6 = pi2_out_tmp[48];
        r7 = pi2_out_tmp[56];

        a0 = r0 + r7;
        a1 = r1 + r6;
        a2 = r2 + r5;
        a3 = r3 + r4;

        a4 = a0 + a3;
        a5 = a1 + a2;
        a6 = a0 - a3;
        a7 = a1 - a2;

        /* The differences must be taken before r0..r7 are overwritten below */
        a0 = r0 - r7;
        a1 = r1 - r6;
        a2 = r2 - r5;
        a3 = r3 - r4;

        r0 = a4 + a5;
        r2 = a6 + (a7 >> 1);
        r4 = a4 - a5;
        r6 = (a6 >> 1) - a7;

        a4 = a1 + a2 + ((a0 >> 1) + a0);
        a5 = a0 - a3 - ((a2 >> 1) + a2);
        a6 = a0 + a3 - ((a1 >> 1) + a1);
        a7 = a1 - a2 + ((a3 >> 1) + a3);

        r1 = a4 + (a7 >> 2);
        r3 = a5 + (a6 >> 2);
        r5 = a6 - (a5 >> 2);
        r7 = (a4 >> 2) - a7;

        /* quantization is done in place, one column entry per row */
        FWD_QUANT(r0, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = r0;

        FWD_QUANT(r1, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = r1;

        FWD_QUANT(r2, pu2_threshold_matrix[16], pu2_scale_matrix[16], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[16] = r2;

        FWD_QUANT(r3, pu2_threshold_matrix[24], pu2_scale_matrix[24], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[24] = r3;

        FWD_QUANT(r4, pu2_threshold_matrix[32], pu2_scale_matrix[32], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[32] = r4;

        FWD_QUANT(r5, pu2_threshold_matrix[40], pu2_scale_matrix[40], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[40] = r5;

        FWD_QUANT(r6, pu2_threshold_matrix[48], pu2_scale_matrix[48], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[48] = r6;

        FWD_QUANT(r7, pu2_threshold_matrix[56], pu2_scale_matrix[56], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[56] = r7;

        /* Move to the next column of the block and of the quant matrices */
        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}