#if defined (_WIN32) || defined (__i386__)
#define BT_USE_SSE_IN_API
#endif

#if defined (BT_USE_SSE) || defined (BT_USE_NEON)

#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE     6
#define STACK_ARRAY_COUNT   (1UL << LOG2_ARRAY_SIZE)

#include <emmintrin.h>
#include <string.h>  // for memset
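
/* Note: this excerpt relies on definitions from the surrounding file and
   headers that are not repeated here -- in particular the GCC vector typedef
   used for float4 (in the __APPLE__ branch of this file it is
   typedef float float4 __attribute__((vector_size(16)));) and the
   btAssign128 / btCastfTo128i / btCastiTo128f helpers and BT_INFINITY from
   btVector3.h. */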
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    const float4 *vertices = (const float4*) vv;
    static const unsigned char indexTable[16] = { (unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
    float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
    float4 vvec = _mm_loadu_ps( vec );
    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));    // zzzz
    float4 vLo = _mm_movelh_ps( vvec, vvec );                                       // xyxy

    long maxIndex = -1L;

    size_t segment = 0;
    float4 stack_array[ STACK_ARRAY_COUNT ];

    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );  // fill with known garbage (debug aid)

    size_t index;
    float4 max;
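
    /* Strategy: dot products are computed four at a time (the w lane is
       ignored) and each group of four results is cached in stack_array. The
       running max is kept as a 4-wide vector; only when a whole segment
       actually improves on dotMax do we go back and rescan the cached
       results to recover the index of the winning vertex, keeping the common
       path branch-free. */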
    // Faster loop without cleanup code for full tiles
    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
    {
        max = dotMax;

        for( index = 0; index < STACK_ARRAY_COUNT; index += 4 )
        {   // do four dot products at a time. Carefully avoid touching the w element.
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3]; vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0w0z1w1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2w2z3w3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is NaN

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+1] = x;
            max = _mm_max_ps( x, max );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+2] = x;
            max = _mm_max_ps( x, max );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+3] = x;
            max = _mm_max_ps( x, max );
        }

        // If we found a new max
        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
        {
            // copy the new max across all lanes of our max accumulator
            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

            dotMax = max;

            // find first occurrence of that max
            size_t test;
            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )
            {}
            // record where it is
            maxIndex = 4*index + segment + indexTable[test];
        }
    }
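
    /* indexTable maps the 4-bit _mm_movemask_ps value -- one bit per lane
       that compared equal to max -- to the position of its lowest set bit,
       i.e. the first lane holding the max. Entry 0 is never consulted,
       because the scan loop above only terminates on a nonzero mask. */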
    // account for the work we have already done
    count -= segment;

    // deal with the last < STACK_ARRAY_COUNT*4 vectors
    max = dotMax;
    index = 0;

    if( btUnlikely( count > 16 ) )
    {
        for( ; index + 4 <= count / 4; index += 4 )
        {   // do four dot products at a time. Carefully avoid touching the w element.
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3]; vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0w0z1w1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2w2z3w3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index] = x;
            max = _mm_max_ps( x, max );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+1] = x;
            max = _mm_max_ps( x, max );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+2] = x;
            max = _mm_max_ps( x, max );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+3] = x;
            max = _mm_max_ps( x, max );
        }
    }
    size_t localCount = (count & -4L) - 4*index;
    if( localCount )
    {
#ifdef __APPLE__
        float4 t0, t1, t2, t3, t4;
        float4 * sap = &stack_array[index + localCount / 4];
        vertices += localCount;    // counter the negative byteIndex we start from below
        size_t byteIndex = -(localCount) * sizeof(float);
        // AT&T-syntax assembly:
        asm volatile
        (   ".align 4                                                  \n\
             0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
             movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0] \n\
             movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1] \n\
             movaps  %[t0], %[max]                               // vertices[0] \n\
             movlhps %[t1], %[max]                               // x0y0x1y1 \n\
             movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2] \n\
             movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3] \n\
             mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo \n\
             movhlps %[t0], %[t1]                                // z0w0z1w1 \n\
             movaps  %[t3], %[t0]                                // vertices[2] \n\
             movlhps %[t4], %[t0]                                // x2y2x3y3 \n\
             mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo \n\
             movhlps %[t3], %[t4]                                // z2w2z3w3 \n\
             shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3 \n\
             mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi \n\
             movaps  %[max], %[t3]                               // x0y0x1y1 * vLo \n\
             shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
             shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
             addps   %[t3], %[max]                               // x + y \n\
             addps   %[t1], %[max]                               // x + y + z \n\
             movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
             maxps   %[t2], %[max]                               // record max, restore max \n\
             add     $16, %[byteIndex]                           // advance loop counter \n\
             jnz     0b                                          \n\
        "
        : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2),
          [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
        : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
        : "memory", "cc"
        );
        index += localCount/4;
#else
        {
            for( unsigned int i=0; i < localCount/4; i++, index++ )
            {   // do four dot products at a time. Carefully avoid touching the w element.
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3]; vertices += 4;

                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0w0z1w1
                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2w2z3w3

                lo0 = lo0*vLo;
                lo1 = lo1*vLo;
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z*vHi;
                x = x+y;
                x = x+z;
                stack_array[index] = x;
                max = _mm_max_ps( x, max );
            }
        }
#endif // __APPLE__
    }
    // process the last few points
    if( count & 3 )
    {
        float4 v0, v1, v2, x, y, z;
        switch( count & 3 )
        {
            case 3:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];

                // Calculate 3 dot products, transpose, duplicate v2
                float4 lo0 = _mm_movelh_ps( v0, v1);    // xyxy.lo
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z?z?.lo
                lo0 = lo0*vLo;
                z = _mm_shuffle_ps(hi0, v2, 0xa8 );     // z0z1z2z2
                z = z*vHi;
                float4 lo1 = _mm_movelh_ps(v2, v2);     // xyxy
                lo1 = lo1*vLo;
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            }
            break;
            case 2:
            {
                v0 = vertices[0];
                v1 = vertices[1];

                float4 xy = _mm_movelh_ps(v0, v1);
                z = _mm_movehl_ps(v1, v0);
                xy = xy*vLo;
                z = _mm_shuffle_ps( z, z, 0xa8);
                x = _mm_shuffle_ps( xy, xy, 0xa8);
                y = _mm_shuffle_ps( xy, xy, 0xfd);
                z = z*vHi;
            }
            break;
            case 1:
            {
                float4 xy = vertices[0];
                z = _mm_shuffle_ps( xy, xy, 0xaa);
                xy = xy*vLo;
                z = z*vHi;
                x = _mm_shuffle_ps(xy, xy, 0);
                y = _mm_shuffle_ps(xy, xy, 0x55);
            }
            break;
        }
        x = x+y;
        x = x+z;
        stack_array[index] = x;
        max = _mm_max_ps( x, max );
        index++;
    }

    // if we found a new max, or never ran a full segment (0 == segment), scan for it
    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
    {   // copy the new max across all lanes -- big latency hit here
        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

        dotMax = max;

        // scan for the first occurrence of max in the cached results
        size_t test;
        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )
        {}
        maxIndex = 4*index + segment + indexTable[test];
    }

    _mm_store_ss( dotResult, dotMax);
    return maxIndex;
}
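
/* Usage sketch (hypothetical caller; point data must be 16-byte aligned,
   xyz plus a padding w per point, and dir must have four readable floats):

       float dir[4] = { 0.f, 1.f, 0.f, 0.f };
       float bestDot;
       long best = _maxdot_large( (const float *)points, dir, pointCount, &bestDot );
       // best is the index of the point with the largest dot(points[best], dir)
*/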
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    const float4 *vertices = (const float4*) vv;
    static const unsigned char indexTable[16] = { (unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
    float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
    float4 vvec = _mm_loadu_ps( vec );
    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));    // zzzz
    float4 vLo = _mm_movelh_ps( vvec, vvec );                                       // xyxy

    long minIndex = -1L;

    size_t segment = 0;
    float4 stack_array[ STACK_ARRAY_COUNT ];

    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );  // fill with known garbage (debug aid)

    size_t index;
    float4 min;
    // Faster loop without cleanup code for full tiles
    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
    {
        min = dotmin;

        for( index = 0; index < STACK_ARRAY_COUNT; index += 4 )
        {   // do four dot products at a time. Carefully avoid touching the w element.
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3]; vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0w0z1w1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2w2z3w3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is NaN

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+1] = x;
            min = _mm_min_ps( x, min );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+2] = x;
            min = _mm_min_ps( x, min );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+3] = x;
            min = _mm_min_ps( x, min );
        }

        // If we found a new min
        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
        {
            // copy the new min across all lanes of our min accumulator
            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

            dotmin = min;

            // find first occurrence of that min
            size_t test;
            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )
            {}
            // record where it is
            minIndex = 4*index + segment + indexTable[test];
        }
    }
    // account for the work we have already done
    count -= segment;

    // deal with the last < STACK_ARRAY_COUNT*4 vectors
    min = dotmin;
    index = 0;

    if( btUnlikely( count > 16 ) )
    {
        for( ; index + 4 <= count / 4; index += 4 )
        {   // do four dot products at a time. Carefully avoid touching the w element.
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3]; vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0w0z1w1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2w2z3w3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index] = x;
            min = _mm_min_ps( x, min );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+1] = x;
            min = _mm_min_ps( x, min );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+2] = x;
            min = _mm_min_ps( x, min );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+3] = x;
            min = _mm_min_ps( x, min );
        }
    }
    size_t localCount = (count & -4L) - 4*index;
    if( localCount )
    {
#ifdef __APPLE__
        vertices += localCount;    // counter the negative byteIndex we start from below
        float4 t0, t1, t2, t3, t4;
        size_t byteIndex = -(localCount) * sizeof(float);
        float4 * sap = &stack_array[index + localCount / 4];
        // AT&T-syntax assembly:
        asm volatile
        (   ".align 4                                                  \n\
             0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
             movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0] \n\
             movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1] \n\
             movaps  %[t0], %[min]                               // vertices[0] \n\
             movlhps %[t1], %[min]                               // x0y0x1y1 \n\
             movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2] \n\
             movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3] \n\
             mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo \n\
             movhlps %[t0], %[t1]                                // z0w0z1w1 \n\
             movaps  %[t3], %[t0]                                // vertices[2] \n\
             movlhps %[t4], %[t0]                                // x2y2x3y3 \n\
             movhlps %[t3], %[t4]                                // z2w2z3w3 \n\
             mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo \n\
             shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3 \n\
             mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi \n\
             movaps  %[min], %[t3]                               // x0y0x1y1 * vLo \n\
             shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
             shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
             addps   %[t3], %[min]                               // x + y \n\
             addps   %[t1], %[min]                               // x + y + z \n\
             movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
             minps   %[t2], %[min]                               // record min, restore min \n\
             add     $16, %[byteIndex]                           // advance loop counter \n\
             jnz     0b                                          \n\
        "
        : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2),
          [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
        : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
        : "memory", "cc"
        );
        index += localCount/4;
#else
        {
            for( unsigned int i=0; i < localCount/4; i++, index++ )
            {   // do four dot products at a time. Carefully avoid touching the w element.
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3]; vertices += 4;

                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0w0z1w1
                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2w2z3w3

                lo0 = lo0*vLo;
                lo1 = lo1*vLo;
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z*vHi;
                x = x+y;
                x = x+z;
                stack_array[index] = x;
                min = _mm_min_ps( x, min );
            }
        }
#endif // __APPLE__
    }
    // process the last few points
    if( count & 3 )
    {
        float4 v0, v1, v2, x, y, z;
        switch( count & 3 )
        {
            case 3:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];

                // Calculate 3 dot products, transpose, duplicate v2
                float4 lo0 = _mm_movelh_ps( v0, v1);    // xyxy.lo
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z?z?.lo
                lo0 = lo0*vLo;
                z = _mm_shuffle_ps(hi0, v2, 0xa8 );     // z0z1z2z2
                z = z*vHi;
                float4 lo1 = _mm_movelh_ps(v2, v2);     // xyxy
                lo1 = lo1*vLo;
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            }
            break;
            case 2:
            {
                v0 = vertices[0];
                v1 = vertices[1];

                float4 xy = _mm_movelh_ps(v0, v1);
                z = _mm_movehl_ps(v1, v0);
                xy = xy*vLo;
                z = _mm_shuffle_ps( z, z, 0xa8);
                x = _mm_shuffle_ps( xy, xy, 0xa8);
                y = _mm_shuffle_ps( xy, xy, 0xfd);
                z = z*vHi;
            }
            break;
            case 1:
            {
                float4 xy = vertices[0];
                z = _mm_shuffle_ps( xy, xy, 0xaa);
                xy = xy*vLo;
                z = z*vHi;
                x = _mm_shuffle_ps(xy, xy, 0);
                y = _mm_shuffle_ps(xy, xy, 0x55);
            }
            break;
        }
        x = x+y;
        x = x+z;
        stack_array[index] = x;
        min = _mm_min_ps( x, min );
        index++;
    }

    // if we found a new min, or never ran a full segment (0 == segment), scan for it
    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
    {   // copy the new min across all lanes -- big latency hit here
        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

        dotmin = min;

        // scan for the first occurrence of min in the cached results
        size_t test;
        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )
        {}
        minIndex = 4*index + segment + indexTable[test];
    }

    _mm_store_ss( dotResult, dotmin);
    return minIndex;
}
#elif defined BT_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );

long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
830 extern "C" {
int _get_cpu_capabilities(
void );}
832 static long _maxdot_large_sel(
const float *vv,
const float *vec,
unsigned long count,
float *dotResult )
834 if( _get_cpu_capabilities() & 0x2000 )
835 _maxdot_large = _maxdot_large_v1;
837 _maxdot_large = _maxdot_large_v0;
839 return _maxdot_large(vv, vec, count, dotResult);
842 static long _mindot_large_sel(
const float *vv,
const float *vec,
unsigned long count,
float *dotResult )
844 if( _get_cpu_capabilities() & 0x2000 )
845 _mindot_large = _mindot_large_v1;
847 _mindot_large = _mindot_large_v0;
849 return _mindot_large(vv, vec, count, dotResult);
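
/* Dispatch pattern: _maxdot_large and _mindot_large start out pointing at
   the _sel shims above. The first call probes the CPU once (bit 0x2000 of
   Apple's _get_cpu_capabilities() selects the v1 code path), rewrites the
   global function pointer, then tail-calls the chosen implementation, so
   every subsequent call dispatches directly with no further branching. */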
#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); _r; })
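
/* The macro above is a GCC statement expression wrapping the NEON vld1.f32
   instruction: it loads four floats with a :128 alignment hint and
   post-increments the pointer operand, a combination the plain vld1q_f32
   intrinsic does not express. */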
long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    unsigned long i = 0;
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x2_t vLo = vget_low_f32(vvec);
    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
    float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
    uint32x2_t indexLo = (uint32x2_t) {0, 1};
    uint32x2_t indexHi = (uint32x2_t) {2, 3};
    uint32x2_t iLo = (uint32x2_t) { (uint32_t)-1, (uint32_t)-1 };
    uint32x2_t iHi = (uint32x2_t) { (uint32_t)-1, (uint32_t)-1 };
    const uint32x2_t four = (uint32x2_t) {4, 4};

    for( ; i+8 <= count; i+= 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        // per-vertex x*vx, y*vy pairs
        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        // gather the z lanes and scale by vHi
        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        // pairwise add gives x*vx + y*vy, then add z*vz
        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vmul_f32( vget_low_f32(v0), vLo);
        xy1 = vmul_f32( vget_low_f32(v1), vLo);
        xy2 = vmul_f32( vget_low_f32(v2), vLo);
        xy3 = vmul_f32( vget_low_f32(v3), vLo);

        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        zLo = vmul_f32( z0.val[0], vHi);
        zHi = vmul_f32( z1.val[0], vHi);

        rLo = vpadd_f32( xy0, xy1);
        rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        maskLo = vcgt_f32( rLo, dotMaxLo );
        maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }

    for( ; i+4 <= count; i+= 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }

    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            float32x2_t rHi = vpadd_f32( xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32( z0, vHi);
            float32x2_t rLo = vpadd_f32( xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        default:
            break;
    }

    // select best answer between hi and lo results
    uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select best answer between even and odd results
    dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vcgt_f32( dotMaxHi, dotMaxLo );
    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32( dotMaxLo, 0);
    return vget_lane_u32(iLo, 0);
}
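
/* _maxdot_large_v0 above works on 64-bit (float32x2_t) halves, which suits
   NEON implementations with a 64-bit datapath. _maxdot_large_v1 below
   restructures the same algorithm around full 128-bit quadword operations
   (vcombine/vuzpq/vmulq/vcgtq/vbslq), which is the better fit on cores that
   execute quad NEON operations in a single issue. */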
long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t) {4, 4, 4, 4};
    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t) { (uint32_t)-1, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1 };
    float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };

    unsigned long i = 0;
    for( ; i + 8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        zb = vuzpq_f32( z0, z1);
        z = vmulq_f32( zb.val[0], vHi);
        xy = vuzpq_f32( xy0, xy1);
        x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }

    for( ; i + 4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }

    switch (count & 3) {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            // duplicate the last vertex to fill the fourth lane
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));

            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z1);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));

            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z0);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));

            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);

            z = vmulq_f32( z, vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        default:
            break;
    }

    // select best answer between hi and lo results
    uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
    float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select best answer between even and odd results
    float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vcgt_f32( maxDotO, maxDot2 );
    maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32( maxDot2, 0);
    return vget_lane_u32(index2, 0);
}
long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    unsigned long i = 0;
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x2_t vLo = vget_low_f32(vvec);
    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
    float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
    uint32x2_t indexLo = (uint32x2_t) {0, 1};
    uint32x2_t indexHi = (uint32x2_t) {2, 3};
    uint32x2_t iLo = (uint32x2_t) { (uint32_t)-1, (uint32_t)-1 };
    uint32x2_t iHi = (uint32x2_t) { (uint32_t)-1, (uint32_t)-1 };
    const uint32x2_t four = (uint32x2_t) {4, 4};

    for( ; i+8 <= count; i+= 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vmul_f32( vget_low_f32(v0), vLo);
        xy1 = vmul_f32( vget_low_f32(v1), vLo);
        xy2 = vmul_f32( vget_low_f32(v2), vLo);
        xy3 = vmul_f32( vget_low_f32(v3), vLo);

        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        zLo = vmul_f32( z0.val[0], vHi);
        zHi = vmul_f32( z1.val[0], vHi);

        rLo = vpadd_f32( xy0, xy1);
        rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        maskLo = vclt_f32( rLo, dotMinLo );
        maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }

    for( ; i+4 <= count; i+= 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }

    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            float32x2_t rHi = vpadd_f32( xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32( z0, vHi);
            float32x2_t rLo = vpadd_f32( xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        default:
            break;
    }

    // select best answer between hi and lo results
    uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select best answer between even and odd results
    dotMinHi = vdup_lane_f32(dotMinLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vclt_f32( dotMinHi, dotMinLo );
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32( dotMinLo, 0);
    return vget_lane_u32(iLo, 0);
}
long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t) {4, 4, 4, 4};
    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t) { (uint32_t)-1, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1 };
    float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };

    unsigned long i = 0;
    for( ; i + 8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        zb = vuzpq_f32( z0, z1);
        z = vmulq_f32( zb.val[0], vHi);
        xy = vuzpq_f32( xy0, xy1);
        x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }

    for( ; i + 4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }

    switch (count & 3) {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            // duplicate the last vertex to fill the fourth lane
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));

            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z1);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));

            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z0);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));

            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);

            z = vmulq_f32( z, vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        default:
            break;
    }

    // select best answer between hi and lo results
    uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select best answer between even and odd results
    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vclt_f32( minDotO, minDot2 );
    minDot2 = vbsl_f32(mask, minDotO, minDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32( minDot2, 0);
    return vget_lane_u32(index2, 0);
}
#else
#error Unhandled __APPLE__ arch
#endif

#endif  // defined (BT_USE_SSE) || defined (BT_USE_NEON)