/*
 * SSE single-precision inner product (fragment).
 * `sum` is a 4-lane float accumulator that starts at zero.
 * NOTE(review): the function header, the local declarations of `i`/`ret`,
 * the loop header and the return statement are not visible in this
 * excerpt -- confirm against the full file.
 */
37 #include <xmmintrin.h> 39 #define OVERRIDE_INNER_PRODUCT_SINGLE 44 __m128 sum = _mm_setzero_ps();
/* Accumulate the lane-wise products a[i..i+3] * b[i..i+3] (unaligned loads). */
47 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
/* Same for the next four elements, a[i+4..i+7] * b[i+4..i+7]. */
48 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
/* Horizontal reduction, step 1: add the upper two lanes onto the lower two. */
50 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
/* Step 2: 0x55 broadcasts lane 1; add_ss adds it to lane 0 only, so lane 0 holds the total. */
51 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
/* Write lane 0 (the scalar result) to ret. */
52 _mm_store_ss(&ret, sum);
/*
 * SSE single-precision interpolating product (fragment).
 * Four partial sums are accumulated in the lanes of `sum`, then each lane
 * is weighted by the matching entry of `frac` and the lanes are summed.
 * NOTE(review): the function header, the local declarations of `i`/`ret`,
 * the loop header and the return statement are not visible in this
 * excerpt -- confirm against the full file.
 */
56 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE 60 __m128 sum = _mm_setzero_ps();
/* f holds four consecutive floats read from frac (unaligned load). */
61 __m128 f = _mm_loadu_ps(frac);
/* Broadcast a[i] to all four lanes, multiply by the four floats at b + i*oversample, accumulate. */
64 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
/* Same for a[i+1] and the four floats at b + (i+1)*oversample. */
65 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
/* Weight each accumulated lane by the matching lane of f. */
67 sum = _mm_mul_ps(f, sum);
/* Horizontal reduction, step 1: add the upper two lanes onto the lower two. */
68 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
/* Step 2: 0x55 broadcasts lane 1; add_ss adds it to lane 0 only, so lane 0 holds the total. */
69 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
/* Write lane 0 (the scalar result) to ret. */
70 _mm_store_ss(&ret, sum);
/*
 * SSE2 inner product of two float arrays, accumulated in double precision.
 * Products are formed in single precision four at a time, then widened to
 * double before being added to the 2-lane accumulator `sum`.
 *
 * a, b: float input arrays, read with unaligned loads.
 * len:  presumably the number of elements to process; the loop that
 *       consumes it is not visible in this excerpt -- confirm.
 * NOTE(review): the opening brace, the declarations of `i`/`ret`/`t`,
 * the loop header and the return statement are not visible here.
 */
75 #include <emmintrin.h> 76 #define OVERRIDE_INNER_PRODUCT_DOUBLE 78 static inline double inner_product_double(
const float *
a,
const float *
b,
unsigned int len)
82 __m128d sum = _mm_setzero_pd();
/* t = lane-wise float products a[i..i+3] * b[i..i+3]. */
86 t = _mm_mul_ps(_mm_loadu_ps(
a+i), _mm_loadu_ps(
b+i));
/* Widen the lower two products to double and accumulate. */
87 sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
/* movehl brings the upper two products into the lower lanes so they can be widened too. */
88 sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
/* Same three steps for elements i+4..i+7. */
90 t = _mm_mul_ps(_mm_loadu_ps(
a+i+4), _mm_loadu_ps(
b+i+4));
91 sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
92 sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
/*
 * Horizontal reduction: reinterpret the double vector as floats so that
 * movehl_ps can move the upper 64 bits (the high double) into the low
 * position, then add it to the low double.
 * NOTE(review): direct casts between __m128 and __m128d are a compiler
 * extension; _mm_castpd_ps / _mm_castps_pd are the portable intrinsics.
 */
94 sum = _mm_add_sd(sum, (__m128d) _mm_movehl_ps((__m128) sum, (__m128) sum));
/* Write the low double (the scalar result) to ret. */
95 _mm_store_sd(&ret, sum);
/*
 * SSE2 interpolating product, accumulated in double precision.
 * Four partial sums are kept as doubles: sum1 holds the ones for lanes 0-1
 * and sum2 the ones for lanes 2-3. Each is weighted by the matching pair
 * of `frac` entries, and the four weighted values are then summed.
 *
 * a:          float input, one element broadcast per step.
 * b:          float input, read four floats at a time at a stride of
 *             `oversample` elements per step.
 * len:        presumably the number of steps; the loop that consumes it
 *             is not visible in this excerpt -- confirm.
 * oversample: element stride applied to b.
 * frac:       four float weights, read with an unaligned load.
 * NOTE(review): the declarations of `i`/`ret`/`sum`/`t`, the loop header,
 * the return statement and the closing brace are not visible here.
 */
99 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE 100 static inline double interpolate_product_double(
const float *
a,
const float *
b,
unsigned int len,
const spx_uint32_t
oversample,
float *frac) {
104 __m128d sum1 = _mm_setzero_pd();
105 __m128d sum2 = _mm_setzero_pd();
/* Load the four weights, then widen them: f1 = frac[0..1], f2 = frac[2..3] as doubles. */
106 __m128 f = _mm_loadu_ps(frac);
107 __m128d f1 = _mm_cvtps_pd(f);
108 __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
/* t = a[i] broadcast to four lanes, times the four floats at b + i*oversample. */
112 t = _mm_mul_ps(_mm_load1_ps(
a+i), _mm_loadu_ps(
b+i*
oversample));
/* Lower two products go to sum1, upper two (moved down by movehl) go to sum2. */
113 sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
114 sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
/* Same for a[i+1] and the four floats at b + (i+1)*oversample. */
116 t = _mm_mul_ps(_mm_load1_ps(
a+i+1), _mm_loadu_ps(
b+(i+1)*oversample));
117 sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
118 sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
/* Apply the weights to each pair of partial sums and combine the pairs. */
120 sum1 = _mm_mul_pd(f1, sum1);
121 sum2 = _mm_mul_pd(f2, sum2);
122 sum = _mm_add_pd(sum1, sum2);
/*
 * Horizontal reduction: reinterpret the double vector as floats so that
 * movehl_ps can move the high double into the low position, then add it
 * to the low double.
 * NOTE(review): direct casts between __m128 and __m128d are a compiler
 * extension; _mm_castpd_ps / _mm_castps_pd are the portable intrinsics.
 */
123 sum = _mm_add_sd(sum, (__m128d) _mm_movehl_ps((__m128) sum, (__m128) sum));
/* Write the low double (the scalar result) to ret. */
124 _mm_store_sd(&ret, sum);
static float inner_product_single(const float *a, const float *b, unsigned int len)
static int len(struct ast_channel *chan, const char *cmd, char *data, char *buf, size_t buflen)
static float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac)