GNU Radio C++ API
volk_32f_x2_dot_prod_32f_u.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
2 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
3 
4 #include<stdio.h>
5 
6 
7 #ifdef LV_HAVE_GENERIC
8 
9 
/*!
  \brief Dot product of two float vectors, plain scalar implementation.
  \param result Written once, on exit, with the accumulated sum of input[i] * taps[i]
  \param input First vector; num_points elements are read
  \param taps Second vector; num_points elements are read
  \param num_points Number of element pairs to multiply and accumulate (0 yields a result of 0)
*/
static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float acc = 0;
  unsigned int i;

  /* Multiply pairwise and accumulate in ascending index order. */
  for (i = 0; i < num_points; i++) {
    acc += input[i] * taps[i];
  }

  *result = acc;
}
23 
24 #endif /*LV_HAVE_GENERIC*/
25 
26 
27 #ifdef LV_HAVE_SSE
28 
29 
30 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
31 
32  unsigned int number = 0;
33  const unsigned int quarterPoints = num_points / 4;
34 
35  float dotProduct = 0;
36  const float* aPtr = input;
37  const float* bPtr = taps;
38 
39  __m128 aVal, bVal, cVal;
40 
41  __m128 dotProdVal = _mm_setzero_ps();
42 
43  for(;number < quarterPoints; number++){
44 
45  aVal = _mm_loadu_ps(aPtr);
46  bVal = _mm_loadu_ps(bPtr);
47 
48  cVal = _mm_mul_ps(aVal, bVal);
49 
50  dotProdVal = _mm_add_ps(cVal, dotProdVal);
51 
52  aPtr += 4;
53  bPtr += 4;
54  }
55 
56  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
57 
58  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
59 
60  dotProduct = dotProductVector[0];
61  dotProduct += dotProductVector[1];
62  dotProduct += dotProductVector[2];
63  dotProduct += dotProductVector[3];
64 
65  number = quarterPoints * 4;
66  for(;number < num_points; number++){
67  dotProduct += ((*aPtr++) * (*bPtr++));
68  }
69 
70  *result = dotProduct;
71 
72 }
73 
74 #endif /*LV_HAVE_SSE*/
75 
76 #ifdef LV_HAVE_SSE3
77 
78 #include <pmmintrin.h>
79 
80 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
81  unsigned int number = 0;
82  const unsigned int quarterPoints = num_points / 4;
83 
84  float dotProduct = 0;
85  const float* aPtr = input;
86  const float* bPtr = taps;
87 
88  __m128 aVal, bVal, cVal;
89 
90  __m128 dotProdVal = _mm_setzero_ps();
91 
92  for(;number < quarterPoints; number++){
93 
94  aVal = _mm_loadu_ps(aPtr);
95  bVal = _mm_loadu_ps(bPtr);
96 
97  cVal = _mm_mul_ps(aVal, bVal);
98 
99  dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
100 
101  aPtr += 4;
102  bPtr += 4;
103  }
104 
105  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
106  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
107 
108  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
109 
110  dotProduct = dotProductVector[0];
111  dotProduct += dotProductVector[1];
112 
113  number = quarterPoints * 4;
114  for(;number < num_points; number++){
115  dotProduct += ((*aPtr++) * (*bPtr++));
116  }
117 
118  *result = dotProduct;
119 }
120 
121 #endif /*LV_HAVE_SSE3*/
122 
123 #ifdef LV_HAVE_SSE4_1
124 
125 #include <smmintrin.h>
126 
127 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
128  unsigned int number = 0;
129  const unsigned int sixteenthPoints = num_points / 16;
130 
131  float dotProduct = 0;
132  const float* aPtr = input;
133  const float* bPtr = taps;
134 
135  __m128 aVal1, bVal1, cVal1;
136  __m128 aVal2, bVal2, cVal2;
137  __m128 aVal3, bVal3, cVal3;
138  __m128 aVal4, bVal4, cVal4;
139 
140  __m128 dotProdVal = _mm_setzero_ps();
141 
142  for(;number < sixteenthPoints; number++){
143 
144  aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
145  aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
146  aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
147  aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
148 
149  bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
150  bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
151  bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
152  bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
153 
154  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
155  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
156  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
157  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
158 
159  cVal1 = _mm_or_ps(cVal1, cVal2);
160  cVal3 = _mm_or_ps(cVal3, cVal4);
161  cVal1 = _mm_or_ps(cVal1, cVal3);
162 
163  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
164  }
165 
166  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
167  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
168 
169  dotProduct = dotProductVector[0];
170  dotProduct += dotProductVector[1];
171  dotProduct += dotProductVector[2];
172  dotProduct += dotProductVector[3];
173 
174  number = sixteenthPoints * 16;
175  for(;number < num_points; number++){
176  dotProduct += ((*aPtr++) * (*bPtr++));
177  }
178 
179  *result = dotProduct;
180 }
181 
182 #endif /*LV_HAVE_SSE4_1*/
183 
184 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/