184 lines
5.0 KiB
C
184 lines
5.0 KiB
C
/*
|
|
* Single-precision vector erfc(x) function.
|
|
*
|
|
* Copyright (c) 2021-2023, Arm Limited.
|
|
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
|
|
*/
|
|
|
|
#include "v_math.h"
|
|
#include "erfcf.h"
|
|
#include "estrin.h"
|
|
#include "pl_sig.h"
|
|
#include "pl_test.h"
|
|
|
|
#if V_SUPPORTED
|
|
|
|
#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]
|
|
|
|
VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
|
|
|
|
static VPCS_ATTR NOINLINE v_f32_t
|
|
specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
|
|
{
|
|
return v_call_f32 (erfcf, x, y, special);
|
|
}
|
|
|
|
static inline uint32_t
|
|
interval_index (uint32_t ia12)
|
|
{
|
|
// clang-format off
|
|
return (ia12 < 0x400 ? 0 :
|
|
(ia12 < 0x408 ? 1 :
|
|
(ia12 < 0x410 ? 2 :
|
|
3)));
|
|
// clang-format on
|
|
}
|
|
|
|
/* The C macro wraps the coeffs argument in order to make the
|
|
poynomial evaluation more readable. In the scalarised variant the
|
|
second pointer is ignored. */
|
|
#ifdef SCALAR
|
|
#define C(i) coeff1[i]
|
|
#else
|
|
#define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
|
|
#endif
|
|
|
|
static inline v_f64_t
|
|
v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
|
|
const double *coeff2)
|
|
{
|
|
v_f64_t x2 = x * x;
|
|
v_f64_t x4 = x2 * x2;
|
|
v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
|
|
v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
|
|
return poly * gauss;
|
|
}
|
|
|
|
static inline float
|
|
approx_poly_gauss (float abs_x, const double *coeff)
|
|
{
|
|
return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
|
|
}
|
|
|
|
static v_f32_t
|
|
v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
|
|
{
|
|
#ifdef SCALAR
|
|
float y = approx_poly_gauss (abs_x, P (ia12));
|
|
return sign ? 2 - y : y;
|
|
#else
|
|
float32x2_t lo32 = {0, 0};
|
|
float32x2_t hi32 = {0, 0};
|
|
/* The polynomial and Gaussian components must be calculated in
|
|
double precision in order to meet the required ULP error. This
|
|
means we have to promote low and high halves of the
|
|
single-precision input vector to two separate double-precision
|
|
input vectors. This incurs some overhead, and there is also
|
|
overhead to loading the polynomial coefficients as this cannot be
|
|
done in a vector fashion. This would be wasted effort for
|
|
elements which lie in the 'boring' zone, as they will be
|
|
overwritten later. Hence we use the lanes parameter to only do
|
|
the promotion on a pair of lanes if both of those lanes are
|
|
interesting and not special cases. If one lane is inactive, we
|
|
use a scalar routine which is shared with the scalar variant. */
|
|
if (lanes[0] & lanes[1])
|
|
{
|
|
lo32 = vcvt_f32_f64 (
|
|
v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
|
|
P (ia12[0]), P (ia12[1])));
|
|
}
|
|
else if (lanes[0])
|
|
{
|
|
lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
|
|
}
|
|
else if (lanes[1])
|
|
{
|
|
lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
|
|
}
|
|
|
|
if (lanes[2] & lanes[3])
|
|
{
|
|
hi32
|
|
= vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x),
|
|
P (ia12[2]), P (ia12[3])));
|
|
}
|
|
else if (lanes[2])
|
|
{
|
|
hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
|
|
}
|
|
else if (lanes[3])
|
|
{
|
|
hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
|
|
}
|
|
|
|
v_f32_t y = vcombine_f32 (lo32, hi32);
|
|
|
|
if (v_any_u32 (sign))
|
|
{
|
|
y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y);
|
|
}
|
|
|
|
return y;
|
|
#endif
|
|
}
|
|
|
|
/* Optimized single-precision vector complementary error function
|
|
erfcf. Max measured error: 0.750092 at various values between
|
|
-0x1.06521p-20 and -0x1.add1dap-17. For example:
|
|
__v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
|
|
+0.249908 ulp err 0.250092. */
|
|
VPCS_ATTR
|
|
v_f32_t V_NAME (erfcf) (v_f32_t x)
|
|
{
|
|
v_u32_t ix = v_as_u32_f32 (x);
|
|
v_u32_t ia = ix & 0x7fffffff;
|
|
v_u32_t ia12 = ia >> 20;
|
|
v_u32_t sign = ix >> 31;
|
|
v_u32_t inf_ia12 = v_u32 (0x7f8);
|
|
|
|
v_u32_t special_cases
|
|
= v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
|
|
v_u32_t in_bounds
|
|
= v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3)));
|
|
v_f32_t boring_zone = v_as_f32_u32 (sign << 30);
|
|
|
|
#ifdef SCALAR
|
|
if (unlikely (special_cases))
|
|
{
|
|
if (ia12 >= 0x7f8)
|
|
return (float) (sign << 1) + 1.0f / x; /* Special cases. */
|
|
else
|
|
return 1.0f - x; /* Small case. */
|
|
}
|
|
else if (likely (!in_bounds))
|
|
{
|
|
return sign ? boring_zone : __math_uflowf (boring_zone);
|
|
}
|
|
#endif
|
|
|
|
v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
|
|
in_bounds & ~special_cases);
|
|
|
|
#ifndef SCALAR
|
|
y = vbslq_f32 (~in_bounds, boring_zone, y);
|
|
|
|
if (unlikely (v_any_u32 (special_cases)))
|
|
{
|
|
return specialcase (x, y, special_cases);
|
|
}
|
|
#endif
|
|
|
|
return y;
|
|
}
|
|
VPCS_ALIAS
|
|
|
|
PL_SIG (V, F, 1, erfc, -6.0, 28.0)
|
|
PL_TEST_ULP (V_NAME (erfcf), 0.26)
|
|
PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
|
|
PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
|
|
PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
|
|
PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
|
|
PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
|
|
PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
|
|
#endif
|