214 lines
8.7 KiB
Diff
214 lines
8.7 KiB
Diff
|
Taken from upstream PR: https://github.com/xbmc/xbmc/pull/3760
|
||
|
|
||
|
Signed-off-by: Bernd Kuhls <bernd.kuhls@t-online.de>
|
||
|
|
||
|
|
||
|
From 7388e8be7cd5e78100532ebf0dba15dccb7b03f8 Mon Sep 17 00:00:00 2001
|
||
|
From: Ben Avison <bavison@riscosopen.org>
|
||
|
Date: Tue, 3 Dec 2013 15:51:39 +0000
|
||
|
Subject: [PATCH] Faster and simpler portable implementation of
|
||
|
MathUtils::round_int().
|
||
|
|
||
|
Much as I like a bit of inline assembler, I have also removed the ARM versions
|
||
|
of MathUtils::truncate_int() and MathUtils::round_int(). The former was just
|
||
|
how any sane compiler should have assembled a cast from double to signed int
|
||
|
anyway. The latter was a much too complicated way to achieve the desired
|
||
|
effect, and was switched out in most ARM builds anyway in favour of the old
|
||
|
portable implementation that used floor().
|
||
|
|
||
|
Verified that MathUtils::test() still passes, and that GCC is now able to
|
||
|
inline MathUtils::round_int(), where it didn't previously.
|
||
|
|
||
|
I tested on a Raspberry Pi with the default theme, displaying the front page
|
||
|
with the RSS ticker enabled. This saturates the CPU, so I'm measuring the
|
||
|
improvement using the debug window's FPS figure. This patch improves this from
|
||
|
~50.8 FPS to ~52.6 FPS.
|
||
|
---
|
||
|
xbmc/utils/MathUtils.h | 129 +++++++++++++++++++++++--------------------------
|
||
|
1 file changed, 61 insertions(+), 68 deletions(-)
|
||
|
|
||
|
diff --git a/xbmc/utils/MathUtils.h b/xbmc/utils/MathUtils.h
|
||
|
index 96af9f4..0dae77d 100644
|
||
|
--- a/xbmc/utils/MathUtils.h
|
||
|
+++ b/xbmc/utils/MathUtils.h
|
||
|
@@ -34,17 +34,13 @@
|
||
|
|
||
|
#if defined(__ppc__) || \
|
||
|
defined(__powerpc__) || \
|
||
|
- (defined(TARGET_DARWIN_IOS) && defined(__llvm__)) || \
|
||
|
- (defined(TARGET_ANDROID) && defined(__arm__)) || \
|
||
|
- defined(TARGET_RASPBERRY_PI)
|
||
|
+ defined(__arm__)
|
||
|
#define DISABLE_MATHUTILS_ASM_ROUND_INT
|
||
|
#endif
|
||
|
|
||
|
#if defined(__ppc__) || \
|
||
|
defined(__powerpc__) || \
|
||
|
- (defined(TARGET_DARWIN) && defined(__llvm__)) || \
|
||
|
- (defined(TARGET_ANDROID) && defined(__arm__)) || \
|
||
|
- defined(TARGET_RASPBERRY_PI)
|
||
|
+ defined(__arm__)
|
||
|
#define DISABLE_MATHUTILS_ASM_TRUNCATE_INT
|
||
|
#endif
|
||
|
|
||
|
@@ -73,60 +69,63 @@
|
||
|
{
|
||
|
assert(x > static_cast<double>(INT_MIN / 2) - 1.0);
|
||
|
assert(x < static_cast<double>(INT_MAX / 2) + 1.0);
|
||
|
- const float round_to_nearest = 0.5f;
|
||
|
- int i;
|
||
|
|
||
|
#if defined(DISABLE_MATHUTILS_ASM_ROUND_INT)
|
||
|
- i = floor(x + round_to_nearest);
|
||
|
-
|
||
|
-#elif defined(__arm__)
|
||
|
- // From 'ARM-v7-M Architecture Reference Manual' page A7-569:
|
||
|
- // "The floating-point to integer operation (vcvt) [normally] uses the Round towards Zero rounding mode"
|
||
|
- // Because of this...we must use some less-than-straightforward logic to perform this operation without
|
||
|
- // changing the rounding mode flags
|
||
|
-
|
||
|
- /* The assembly below implements the following logic:
|
||
|
- if (x < 0)
|
||
|
- inc = -0.5f
|
||
|
- else
|
||
|
- inc = 0.5f
|
||
|
- int_val = trunc(x+inc);
|
||
|
- err = x - int_val;
|
||
|
- if (err == 0.5f)
|
||
|
- int_val++;
|
||
|
- return int_val;
|
||
|
- */
|
||
|
+ /* This implementation warrants some further explanation.
|
||
|
+ *
|
||
|
+ * First, a couple of notes on rounding:
|
||
|
+ * 1) C casts from float/double to integer round towards zero.
|
||
|
+ * 2) Float/double additions are rounded according to the normal rules,
|
||
|
+ * in other words: on some architectures, it's fixed at compile-time,
|
||
|
+ * and on others it can be set using fesetround()). The following
|
||
|
+ * analysis assumes round-to-nearest with ties rounding to even. This
|
||
|
+ * is a fairly sensible choice, and is the default with ARM VFP.
|
||
|
+ *
|
||
|
+ * What this function wants is round-to-nearest with ties rounding to
|
||
|
+ * +infinity. This isn't an IEEE rounding mode, even if we could guarantee
|
||
|
+ * that all architectures supported fesetround(), which they don't. Instead,
|
||
|
+ * this adds an offset of 2147483648.5 (= 0x80000000.8p0), then casts to
|
||
|
+ * an unsigned int (crucially, all possible inputs are now in a range where
|
||
|
+ * round to zero acts the same as round to -infinity) and then subtracts
|
||
|
+ * 0x80000000 in the integer domain. The 0.5 component of the offset
|
||
|
+ * converts what is effectively a round down into a round to nearest, with
|
||
|
+ * ties rounding up, as desired.
|
||
|
+ *
|
||
|
+ * There is a catch, that because there is a double rounding, there is a
|
||
|
+ * small region where the input falls just *below* a tie, where the addition
|
||
|
+ * of the offset causes a round *up* to an exact integer, due to the finite
|
||
|
+ * level of precision available in floating point. You need to be aware of
|
||
|
+ * this when calling this function, although at present it is not believed
|
||
|
+ * that XBMC ever attempts to round numbers in this window.
|
||
|
+ *
|
||
|
+ * It is worth proving the size of the affected window. Recall that double
|
||
|
+ * precision employs a mantissa of 52 bits.
|
||
|
+ * 1) For all inputs -0.5 <= x <= INT_MAX
|
||
|
+ * Once the offset is applied, the most significant binary digit in the
|
||
|
+ * floating-point representation is +2^31.
|
||
|
+ * At this magnitude, the smallest step representable in double precision
|
||
|
+ * is 2^31 / 2^52 = 0.000000476837158203125
|
||
|
+ * So the size of the range which is rounded up due to the addition is
|
||
|
+ * half the size of this step, or 0.0000002384185791015625
|
||
|
+ *
|
||
|
+ * 2) For all inputs INT_MIN/2 < x < -0.5
|
||
|
+ * Once the offset is applied, the most significant binary digit in the
|
||
|
+ * floating-point representation is +2^30.
|
||
|
+ * At this magnitude, the smallest step representable in double precision
|
||
|
+ * is 2^30 / 2^52 = 0.0000002384185791015625
|
||
|
+ * So the size of the range which is rounded up due to the addition is
|
||
|
+ * half the size of this step, or 0.00000011920928955078125
|
||
|
+ *
|
||
|
+ * 3) For all inputs INT_MIN <= x <= INT_MIN/2
|
||
|
+ * The representation once the offset is applied has equal or greater
|
||
|
+ * precision than the input, so the addition does not cause rounding.
|
||
|
+ */
|
||
|
+ return ((unsigned int) (x + 0x80000000.8p0)) - 0x80000000;
|
||
|
|
||
|
- __asm__ __volatile__ (
|
||
|
-#if defined(__ARM_PCS_VFP)
|
||
|
- "fconstd d1,#%G[rnd_val] \n\t" // Copy round_to_nearest into a working register (d1 = 0.5)
|
||
|
#else
|
||
|
- "vmov.F64 d1,%[rnd_val] \n\t"
|
||
|
-#endif
|
||
|
- "fcmpezd %P[value] \n\t" // Check value against zero (value == 0?)
|
||
|
- "fmstat \n\t" // Copy the floating-point status flags into the general-purpose status flags
|
||
|
- "it mi \n\t"
|
||
|
- "vnegmi.F64 d1, d1 \n\t" // if N-flag is set, negate round_to_nearest (if (value < 0) d1 = -1 * d1)
|
||
|
- "vadd.F64 d1,%P[value],d1 \n\t" // Add round_to_nearest to value, store result in working register (d1 += value)
|
||
|
- "vcvt.S32.F64 s3,d1 \n\t" // Truncate(round towards zero) (s3 = (int)d1)
|
||
|
- "vmov %[result],s3 \n\t" // Store the integer result in a general-purpose register (result = s3)
|
||
|
- "vcvt.F64.S32 d1,s3 \n\t" // Convert back to floating-point (d1 = (double)s3)
|
||
|
- "vsub.F64 d1,%P[value],d1 \n\t" // Calculate the error (d1 = value - d1)
|
||
|
-#if defined(__ARM_PCS_VFP)
|
||
|
- "fconstd d2,#%G[rnd_val] \n\t" // d2 = 0.5;
|
||
|
-#else
|
||
|
- "vmov.F64 d2,%[rnd_val] \n\t"
|
||
|
-#endif
|
||
|
- "fcmped d1, d2 \n\t" // (d1 == 0.5?)
|
||
|
- "fmstat \n\t" // Copy the floating-point status flags into the general-purpose status flags
|
||
|
- "it eq \n\t"
|
||
|
- "addeq %[result],#1 \n\t" // (if (d1 == d2) result++;)
|
||
|
- : [result] "=r"(i) // Outputs
|
||
|
- : [rnd_val] "Dv" (round_to_nearest), [value] "w"(x) // Inputs
|
||
|
- : "d1", "d2", "s3" // Clobbers
|
||
|
- );
|
||
|
-
|
||
|
-#elif defined(__SSE2__)
|
||
|
+ const float round_to_nearest = 0.5f;
|
||
|
+ int i;
|
||
|
+#if defined(__SSE2__)
|
||
|
const float round_dn_to_nearest = 0.4999999f;
|
||
|
i = (x > 0) ? _mm_cvttsd_si32(_mm_set_sd(x + round_to_nearest)) : _mm_cvttsd_si32(_mm_set_sd(x - round_dn_to_nearest));
|
||
|
|
||
|
@@ -150,8 +149,8 @@
|
||
|
);
|
||
|
|
||
|
#endif
|
||
|
-
|
||
|
return i;
|
||
|
+#endif
|
||
|
}
|
||
|
|
||
|
/*! \brief Truncate to nearest integer.
|
||
|
@@ -165,20 +164,13 @@
|
||
|
{
|
||
|
assert(x > static_cast<double>(INT_MIN / 2) - 1.0);
|
||
|
assert(x < static_cast<double>(INT_MAX / 2) + 1.0);
|
||
|
- int i;
|
||
|
|
||
|
#if defined(DISABLE_MATHUTILS_ASM_TRUNCATE_INT)
|
||
|
- return i = (int)x;
|
||
|
-
|
||
|
-#elif defined(__arm__)
|
||
|
- __asm__ __volatile__ (
|
||
|
- "vcvt.S32.F64 %[result],%P[value] \n\t" // Truncate(round towards zero) and store the result
|
||
|
- : [result] "=w"(i) // Outputs
|
||
|
- : [value] "w"(x) // Inputs
|
||
|
- );
|
||
|
- return i;
|
||
|
+ return x;
|
||
|
|
||
|
-#elif defined(TARGET_WINDOWS)
|
||
|
+#else
|
||
|
+ int i;
|
||
|
+#if defined(TARGET_WINDOWS)
|
||
|
const float round_towards_m_i = -0.5f;
|
||
|
__asm
|
||
|
{
|
||
|
@@ -204,6 +196,7 @@
|
||
|
if (x < 0)
|
||
|
i = -i;
|
||
|
return (i);
|
||
|
+#endif
|
||
|
}
|
||
|
|
||
|
inline int64_t abs(int64_t a)
|
||
|
--
|
||
|
1.9.1
|
||
|
|