GNU Radio C++ API
volk_32fc_x2_square_dist_32f_a.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
2 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
3 
4 #include<inttypes.h>
5 #include<stdio.h>
6 #include<volk/volk_complex.h>
7 
8 #ifdef LV_HAVE_SSE3
9 #include<xmmintrin.h>
10 #include<pmmintrin.h>
11 
12 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
13 
14 
15  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
16 
17  lv_32fc_t diff;
18  float sq_dist;
19  int bound = num_bytes >> 5;
20  int leftovers0 = (num_bytes >> 4) & 1;
21  int leftovers1 = (num_bytes >> 3) & 1;
22  int i = 0;
23 
24  xmm1 = _mm_setzero_ps();
25  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
26  xmm2 = _mm_load_ps((float*)&points[0]);
27  xmm1 = _mm_movelh_ps(xmm1, xmm1);
28  xmm3 = _mm_load_ps((float*)&points[2]);
29 
30 
31  for(; i < bound - 1; ++i) {
32  xmm4 = _mm_sub_ps(xmm1, xmm2);
33  xmm5 = _mm_sub_ps(xmm1, xmm3);
34  points += 4;
35  xmm6 = _mm_mul_ps(xmm4, xmm4);
36  xmm7 = _mm_mul_ps(xmm5, xmm5);
37 
38  xmm2 = _mm_load_ps((float*)&points[0]);
39 
40  xmm4 = _mm_hadd_ps(xmm6, xmm7);
41 
42  xmm3 = _mm_load_ps((float*)&points[2]);
43 
44  _mm_store_ps(target, xmm4);
45 
46  target += 4;
47 
48  }
49 
50  xmm4 = _mm_sub_ps(xmm1, xmm2);
51  xmm5 = _mm_sub_ps(xmm1, xmm3);
52 
53 
54 
55  points += 4;
56  xmm6 = _mm_mul_ps(xmm4, xmm4);
57  xmm7 = _mm_mul_ps(xmm5, xmm5);
58 
59  xmm4 = _mm_hadd_ps(xmm6, xmm7);
60 
61  _mm_store_ps(target, xmm4);
62 
63  target += 4;
64 
65  for(i = 0; i < leftovers0; ++i) {
66 
67  xmm2 = _mm_load_ps((float*)&points[0]);
68 
69  xmm4 = _mm_sub_ps(xmm1, xmm2);
70 
71  points += 2;
72 
73  xmm6 = _mm_mul_ps(xmm4, xmm4);
74 
75  xmm4 = _mm_hadd_ps(xmm6, xmm6);
76 
77  _mm_storeh_pi((__m64*)target, xmm4);
78 
79  target += 2;
80  }
81 
82  for(i = 0; i < leftovers1; ++i) {
83 
84  diff = src0[0] - points[0];
85 
86  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
87 
88  target[0] = sq_dist;
89  }
90 }
91 
92 #endif /*LV_HAVE_SSE3*/
93 
94 #ifdef LV_HAVE_GENERIC
95 static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
96  lv_32fc_t diff;
97  float sq_dist;
98  unsigned int i = 0;
99 
100  for(; i < num_bytes >> 3; ++i) {
101  diff = src0[0] - points[i];
102 
103  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
104 
105  target[i] = sq_dist;
106  }
107 }
108 
109 #endif /*LV_HAVE_GENERIC*/
110 
111 
112 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/