volk_16ic_x2_dot_prod_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

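/*!
 * \page volk_16ic_x2_dot_prod_16ic
 *
 * \b Overview
 *
 * Multiplies two vectors of complex 16-bit integer samples (lv_16sc_t)
 * element by element and accumulates the products into a single complex
 * result, using 16-bit saturating additions so the running sums never wrap
 * past the limits of the data type (see the generic protokernel below).
 *
 * <b>Dispatcher Prototype</b> (a sketch following VOLK's usual kernel naming)
 * \code
 * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result,
 *                                 const lv_16sc_t* in_a,
 *                                 const lv_16sc_t* in_b,
 *                                 unsigned int num_points);
 * \endcode
 *
 * \b Inputs
 * \li in_a: one vector of complex 16-bit samples
 * \li in_b: the other vector of complex 16-bit samples
 * \li num_points: number of complex samples in each input vector
 *
 * \b Outputs
 * \li result: the saturated dot product of in_a and in_b
 *
 * \b Example (a minimal usage sketch, assuming the usual volk_malloc helpers)
 * \code
 * unsigned int N = 1000;
 * size_t alignment = volk_get_alignment();
 * lv_16sc_t* a = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
 * lv_16sc_t* b = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
 * lv_16sc_t result;
 *
 * // ... fill a and b with samples ...
 *
 * volk_16ic_x2_dot_prod_16ic(&result, a, b, N);
 *
 * volk_free(a);
 * volk_free(b);
 * \endcode
 */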
#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H

#include <volk/saturation_arithmetic.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    result[0] = lv_cmake((int16_t)0, (int16_t)0);
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        lv_16sc_t tmp = in_a[n] * in_b[n];
        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
    }
}

#endif /*LV_HAVE_GENERIC*/


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_load_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_load_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                         // zeros, and store the results in dst.
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

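        // At this point realcacc holds valid real sums in its even 16-bit lanes
        // and imagcacc holds valid imaginary sums in its odd lanes; mask off the
        // junk lanes and OR the two registers together to recover the interleaved
        // (real, imag) layout before the horizontal reduction.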
        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        a = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector,
                        a); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
            a = _mm_loadu_si128(
                (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_loadu_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
                                         // zeros, and store the results in dst.
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

            imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        result = _mm_or_si128(realcacc, imagcacc);

        _mm_storeu_si128((__m128i*)dotProductVector,
                         result); // Store the results back into the dot product vector

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_loadu_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_loadu_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
                                            // in zeros, and store the results in dst.
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

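        // Same lane layout as the SSE2 kernels, just twice as wide: valid real
        // sums sit in the even 16-bit lanes of realcacc and valid imaginary sums
        // in the odd lanes of imagcacc.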
        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_storeu_si256((__m256i*)dotProductVector,
                            result); // Store the results back into the dot product vector

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_load_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_load_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting
                                            // in zeros, and store the results in dst.
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_store_si256((__m256i*)dotProductVector,
                           result); // Store the results back into the dot product vector

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    *out = lv_cmake((int16_t)0, (int16_t)0);

    if (quarter_points > 0) {
        // for 2-lane vectors, 1st lane holds the real part,
        // 2nd lane holds the imaginary part
        int16x4x2_t a_val, b_val, c_val, accumulator;
        int16x4x2_t tmp_real, tmp_imag;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
        accumulator.val[0] = vdup_n_s16(0);
        accumulator.val[1] = vdup_n_s16(0);
        lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

        for (number = 0; number < quarter_points; ++number) {
            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
            __VOLK_PREFETCH(a_ptr + 8);
            __VOLK_PREFETCH(b_ptr + 8);

            // multiply the real*real and imag*imag to get real result
            // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
            // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

            // Multiply cross terms to get the imaginary result
            // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
            // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

            c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
            c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);

            accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
            accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);

            a_ptr += 4;
            b_ptr += 4;
        }

        vst2_s16((int16_t*)accum_result, accumulator);
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
        }

        *out = dotProduct;
    }

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
                                                       const lv_16sc_t* in_a,
                                                       const lv_16sc_t* in_b,
                                                       unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int16x4x2_t a_val, b_val, accumulator;
    int16x4x2_t tmp;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator.val[0] = vdup_n_s16(0);
    accumulator.val[1] = vdup_n_s16(0);

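    // Note: the per-point combines below use wrapping vmla/vmls rather than the
    // saturating vqsub/vqadd used by volk_16ic_x2_dot_prod_16ic_neon above; only
    // the vqadd into the running accumulator saturates.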
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
        tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);

        accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
        accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    vst2_s16((int16_t*)accum_result, accumulator);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
                                                          const lv_16sc_t* in_a,
                                                          const lv_16sc_t* in_b,
                                                          unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int16x4x2_t a_val, b_val, accumulator1, accumulator2;

    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
    accumulator1.val[0] = vdup_n_s16(0);
    accumulator1.val[1] = vdup_n_s16(0);
    accumulator2.val[0] = vdup_n_s16(0);
    accumulator2.val[1] = vdup_n_s16(0);

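    // Note: vmla/vmls accumulate with wrapping (non-saturating) arithmetic inside
    // the loop; saturation is only applied when the two partial accumulators are
    // combined with vqadd after the loop.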
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }

    accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);

    vst2_s16((int16_t*)accum_result, accumulator1);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/