1 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
2 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
/*!
  \brief Multiplies two float vectors element by element (SSE, aligned version)
  \param cVector Output vector; element i receives aVector[i] * bVector[i]
  \param aVector First input vector
  \param bVector Second input vector
  \param num_points Number of elements to process in each vector
  \note Blocks of four floats go through _mm_load_ps / _mm_store_ps, which
        require 16-byte aligned addresses, so all three buffers must be
        16-byte aligned.
*/
static inline void volk_32f_x2_multiply_32f_a_sse(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m128 aVal, bVal, cVal;
  for(; number < quarterPoints; number++){
    aVal = _mm_load_ps(aPtr);
    bVal = _mm_load_ps(bPtr);

    cVal = _mm_mul_ps(aVal, bVal);

    _mm_store_ps(cPtr, cVal);

    /* Step to the next group of four floats; without this the loop would
       keep reprocessing the first group and the tail loop below would
       start at the wrong element. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: the num_points % 4 elements the vector loop did not cover */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
47 #include <immintrin.h>
/*!
  \brief Multiplies two float vectors element by element (AVX, aligned version)
  \param cVector Output vector; element i receives aVector[i] * bVector[i]
  \param aVector First input vector
  \param bVector Second input vector
  \param num_points Number of elements to process in each vector
  \note Blocks of eight floats go through _mm256_load_ps / _mm256_store_ps,
        which require 32-byte aligned addresses, so all three buffers must be
        32-byte aligned.
*/
static inline void volk_32f_x2_multiply_32f_a_avx(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points){
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m256 aVal, bVal, cVal;
  for(; number < eighthPoints; number++){
    aVal = _mm256_load_ps(aPtr);
    bVal = _mm256_load_ps(bPtr);

    cVal = _mm256_mul_ps(aVal, bVal);

    _mm256_store_ps(cPtr, cVal);

    /* Step to the next group of eight floats; without this the loop would
       keep reprocessing the first group and the tail loop below would
       start at the wrong element. */
    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: the num_points % 8 elements the vector loop did not cover */
  number = eighthPoints * 8;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
85 #ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies two float vectors element by element (portable scalar version)
  \param cVector Output vector; element i receives aVector[i] * bVector[i]
  \param aVector First input vector
  \param bVector Second input vector
  \param num_points Number of elements to process in each vector
*/
static inline void volk_32f_x2_multiply_32f_a_generic(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points){
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number;

  /* One product per iteration; all three cursors advance together */
  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
/*!
  \brief Prototype of the routine that volk_32f_x2_multiply_32f_a_orc forwards
         its arguments to, unchanged and in the same order.

  No definition is visible in this header, so one has to be supplied at link time.
  NOTE(review): the parameters presumably mean the same as in the other
  multiply kernels here (output vector, two input vectors, element count) --
  confirm against the implementation.
*/
extern void volk_32f_x2_multiply_32f_a_orc_impl(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points);
114 static inline void volk_32f_x2_multiply_32f_a_orc(
float* cVector,
const float* aVector,
const float* bVector,
unsigned int num_points){
115 volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);