Bullet Collision Detection & Physics Library
btVector3.cpp
/*
 Copyright (c) 2011 Apple Inc.
 http://continuousphysics.com/Bullet/

 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:

 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.

 This source version has been altered.
 */

#if defined (_WIN32) || defined (__i386__)
#define BT_USE_SSE_IN_API
#endif
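// Defining BT_USE_SSE_IN_API before including btVector3.h asks the Bullet
// headers to expose their SSE-specific API surface (the flag is consumed by
// btScalar.h); this translation unit provides those SIMD kernels.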

#include "btVector3.h"

#if defined (BT_USE_SSE) || defined (BT_USE_NEON)

#ifdef __APPLE__
#include <stdint.h>
typedef float float4 __attribute__ ((vector_size(16)));
#else
#define float4 __m128
#endif
//typedef uint32_t uint4 __attribute__ ((vector_size(16)));


#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)

#include <emmintrin.h>

long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    const float4 *vertices = (const float4*) vv;
    static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
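    // indexTable above maps a 4-bit _mm_movemask_ps result to the index of
    // its lowest set bit, i.e. the first SIMD lane whose comparison was true;
    // it is only consulted with a non-zero mask, so the -1 in entry 0 never
    // escapes.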
    float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
    float4 vvec = _mm_loadu_ps( vec );
    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
    float4 vLo = _mm_movelh_ps( vvec, vvec );
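    // vvec holds the query vector (x, y, z, w): vLo is (x, y, x, y) and vHi
    // splats z across all four lanes, matching the xy/z split used by the
    // transposed dot products below.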

    long maxIndex = -1L;

    size_t segment = 0;
    float4 stack_array[ STACK_ARRAY_COUNT ];

#if DEBUG
    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
#endif

    size_t index;
    float4 max;
    // Faster loop without cleanup code for full tiles
    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
    {
        max = dotMax;

        for( index = 0; index < STACK_ARRAY_COUNT; index += 4 )
        { // do four dot products at a time. Carefully avoid touching the w element.
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3]; vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+1] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+2] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+3] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

            // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
        }

        // If we found a new max
        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
        {
            // copy the new max across all lanes of our max accumulator
            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

            dotMax = max;

            // find first occurrence of that max
            size_t test;
            for( index = 0; 0 == (test = _mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )    // local_count must be a multiple of 4
            {}
            // record where it is.
            maxIndex = 4*index + segment + indexTable[test];
        }
    }

    // account for work we've already done
    count -= segment;

    // Deal with the last < STACK_ARRAY_COUNT vectors
    max = dotMax;
    index = 0;

    if( btUnlikely( count > 16) )
    {
        for( ; index + 4 <= count / 4; index += 4 )
        { // do four dot products at a time. Carefully avoid touching the w element.
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3]; vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+1] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+2] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+3] = x;
            max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan

            // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
        }
    }

    size_t localCount = (count & -4L) - 4*index;
    if( localCount )
    {
#ifdef __APPLE__
        float4 t0, t1, t2, t3, t4;
        float4 *sap = &stack_array[index + localCount / 4];
        vertices += localCount;    // counter the offset
        size_t byteIndex = -(localCount) * sizeof(float);
        // AT&T code style assembly
        asm volatile
        ( ".align 4                                            \n\
        0: movaps  %[max], %[t2]                               // move max out of the way to avoid propagating NaNs in max \n\
           movaps  (%[vertices], %[byteIndex], 4), %[t0]       // vertices[0] \n\
           movaps  16(%[vertices], %[byteIndex], 4), %[t1]     // vertices[1] \n\
           movaps  %[t0], %[max]                               // vertices[0] \n\
           movlhps %[t1], %[max]                               // x0y0x1y1 \n\
           movaps  32(%[vertices], %[byteIndex], 4), %[t3]     // vertices[2] \n\
           movaps  48(%[vertices], %[byteIndex], 4), %[t4]     // vertices[3] \n\
           mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo \n\
           movhlps %[t0], %[t1]                                // z0w0z1w1 \n\
           movaps  %[t3], %[t0]                                // vertices[2] \n\
           movlhps %[t4], %[t0]                                // x2y2x3y3 \n\
           mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo \n\
           movhlps %[t3], %[t4]                                // z2w2z3w3 \n\
           shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3 \n\
           mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi \n\
           movaps  %[max], %[t3]                               // x0y0x1y1 * vLo \n\
           shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
           shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
           addps   %[t3], %[max]                               // x + y \n\
           addps   %[t1], %[max]                               // x + y + z \n\
           movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
           maxps   %[t2], %[max]                               // record max, restore max \n\
           add     $16, %[byteIndex]                           // advance loop counter \n\
           jnz     0b                                          \n\
        "
        : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
        : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
        : "memory", "cc"
        );
        index += localCount/4;
#else
        {
            for( unsigned int i = 0; i < localCount/4; i++, index++ )
            { // do four dot products at a time. Carefully avoid touching the w element.
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

                lo0 = lo0*vLo;
                lo1 = lo1*vLo;
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z*vHi;
                x = x+y;
                x = x+z;
                stack_array[index] = x;
                max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan
            }
        }
#endif //__APPLE__
    }

    // process the last few points
    if( count & 3 )
    {
        float4 v0, v1, v2, x, y, z;
        switch( count & 3 )
        {
            case 3:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];

                // Calculate 3 dot products, transpose, duplicate v2
                float4 lo0 = _mm_movelh_ps( v0, v1);    // xyxy.lo
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z?z?.lo
                lo0 = lo0*vLo;
                z = _mm_shuffle_ps(hi0, v2, 0xa8 );     // z0z1z2z2
                z = z*vHi;
                float4 lo1 = _mm_movelh_ps(v2, v2);     // xyxy
                lo1 = lo1*vLo;
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            }
            break;
            case 2:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                float4 xy = _mm_movelh_ps(v0, v1);
                z = _mm_movehl_ps(v1, v0);
                xy = xy*vLo;
                z = _mm_shuffle_ps( z, z, 0xa8);
                x = _mm_shuffle_ps( xy, xy, 0xa8);
                y = _mm_shuffle_ps( xy, xy, 0xfd);
                z = z*vHi;
            }
            break;
            case 1:
            {
                float4 xy = vertices[0];
                z = _mm_shuffle_ps( xy, xy, 0xaa);
                xy = xy*vLo;
                z = z*vHi;
                x = _mm_shuffle_ps(xy, xy, 0);
                y = _mm_shuffle_ps(xy, xy, 0x55);
            }
            break;
        }
        x = x+y;
        x = x+z;
        stack_array[index] = x;
        max = _mm_max_ps( x, max );    // control the order here so that max is never NaN even if x is nan
        index++;
    }

    // if we found a new max.
    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
    { // we found a new max. Search for it
        // find max across the max vector, place in all elements of max -- big latency hit here
        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
        // this where it actually makes a difference is handled in the early out at the top of the function,
        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
        // complexity, and removed it.

        dotMax = max;

        // scan for the first occurrence of max in the array
        size_t test;
        for( index = 0; 0 == (test = _mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )    // local_count must be a multiple of 4
        {}
        maxIndex = 4*index + segment + indexTable[test];
    }

    _mm_store_ss( dotResult, dotMax);
    return maxIndex;
}
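
/*
 A minimal scalar sketch of the contract implemented by _maxdot_large above
 (added for illustration and inferred from the SIMD code; not part of the
 original file). Vertices are stored as x,y,z,w quads and w is ignored: the
 function returns the index of the first vertex whose dot product with
 vec[0..2] is largest, and writes that dot product to *dotResult.
*/
#if 0   /* illustration only */
static long maxdot_scalar_reference( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    long bestIndex = -1L;
    float bestDot = -BT_INFINITY;
    for( unsigned long i = 0; i < count; i++ )
    {
        const float *v = vv + 4*i;    // stride of one float4
        float dot = v[0]*vec[0] + v[1]*vec[1] + v[2]*vec[2];
        if( dot > bestDot )
        {
            bestDot = dot;
            bestIndex = (long) i;
        }
    }
    *dotResult = bestDot;
    return bestIndex;
}
#endif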

long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );

long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    const float4 *vertices = (const float4*) vv;
    static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
    float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
    float4 vvec = _mm_loadu_ps( vec );
    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
    float4 vLo = _mm_movelh_ps( vvec, vvec );

    long minIndex = -1L;

    size_t segment = 0;
    float4 stack_array[ STACK_ARRAY_COUNT ];

#if DEBUG
    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
#endif

    size_t index;
    float4 min;
    // Faster loop without cleanup code for full tiles
    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
    {
        min = dotmin;

        for( index = 0; index < STACK_ARRAY_COUNT; index += 4 )
        { // do four dot products at a time. Carefully avoid touching the w element.
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3]; vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+1] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+2] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+3] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

            // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
        }

        // If we found a new min
        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
        {
            // copy the new min across all lanes of our min accumulator
            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

            dotmin = min;

            // find first occurrence of that min
            size_t test;
            for( index = 0; 0 == (test = _mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )    // local_count must be a multiple of 4
            {}
            // record where it is.
            minIndex = 4*index + segment + indexTable[test];
        }
    }

    // account for work we've already done
    count -= segment;

    // Deal with the last < STACK_ARRAY_COUNT vectors
    min = dotmin;
    index = 0;

    if( btUnlikely( count > 16) )
    {
        for( ; index + 4 <= count / 4; index += 4 )
        { // do four dot products at a time. Carefully avoid touching the w element.
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3]; vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+1] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+2] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3]; vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0*vLo;
            lo1 = lo1*vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z*vHi;
            x = x+y;
            x = x+z;
            stack_array[index+3] = x;
            min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan

            // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
        }
    }

    size_t localCount = (count & -4L) - 4*index;
    if( localCount )
    {
#ifdef __APPLE__
        vertices += localCount;    // counter the offset
        float4 t0, t1, t2, t3, t4;
        size_t byteIndex = -(localCount) * sizeof(float);
        float4 *sap = &stack_array[index + localCount / 4];

        asm volatile
        ( ".align 4                                            \n\
        0: movaps  %[min], %[t2]                               // move min out of the way to avoid propagating NaNs in min \n\
           movaps  (%[vertices], %[byteIndex], 4), %[t0]       // vertices[0] \n\
           movaps  16(%[vertices], %[byteIndex], 4), %[t1]     // vertices[1] \n\
           movaps  %[t0], %[min]                               // vertices[0] \n\
           movlhps %[t1], %[min]                               // x0y0x1y1 \n\
           movaps  32(%[vertices], %[byteIndex], 4), %[t3]     // vertices[2] \n\
           movaps  48(%[vertices], %[byteIndex], 4), %[t4]     // vertices[3] \n\
           mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo \n\
           movhlps %[t0], %[t1]                                // z0w0z1w1 \n\
           movaps  %[t3], %[t0]                                // vertices[2] \n\
           movlhps %[t4], %[t0]                                // x2y2x3y3 \n\
           movhlps %[t3], %[t4]                                // z2w2z3w3 \n\
           mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo \n\
           shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3 \n\
           mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi \n\
           movaps  %[min], %[t3]                               // x0y0x1y1 * vLo \n\
           shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
           shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
           addps   %[t3], %[min]                               // x + y \n\
           addps   %[t1], %[min]                               // x + y + z \n\
           movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
           minps   %[t2], %[min]                               // record min, restore min \n\
           add     $16, %[byteIndex]                           // advance loop counter \n\
           jnz     0b                                          \n\
        "
        : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
        : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
        : "memory", "cc"
        );
        index += localCount/4;
#else
        {
            for( unsigned int i = 0; i < localCount/4; i++, index++ )
            { // do four dot products at a time. Carefully avoid touching the w element.
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

                lo0 = lo0*vLo;
                lo1 = lo1*vLo;
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z*vHi;
                x = x+y;
                x = x+z;
                stack_array[index] = x;
                min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan
            }
        }
#endif //__APPLE__
    }

    // process the last few points
    if( count & 3 )
    {
        float4 v0, v1, v2, x, y, z;
        switch( count & 3 )
        {
            case 3:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];

                // Calculate 3 dot products, transpose, duplicate v2
                float4 lo0 = _mm_movelh_ps( v0, v1);    // xyxy.lo
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z?z?.lo
                lo0 = lo0*vLo;
                z = _mm_shuffle_ps(hi0, v2, 0xa8 );     // z0z1z2z2
                z = z*vHi;
                float4 lo1 = _mm_movelh_ps(v2, v2);     // xyxy
                lo1 = lo1*vLo;
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            }
            break;
            case 2:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                float4 xy = _mm_movelh_ps(v0, v1);
                z = _mm_movehl_ps(v1, v0);
                xy = xy*vLo;
                z = _mm_shuffle_ps( z, z, 0xa8);
                x = _mm_shuffle_ps( xy, xy, 0xa8);
                y = _mm_shuffle_ps( xy, xy, 0xfd);
                z = z*vHi;
            }
            break;
            case 1:
            {
                float4 xy = vertices[0];
                z = _mm_shuffle_ps( xy, xy, 0xaa);
                xy = xy*vLo;
                z = z*vHi;
                x = _mm_shuffle_ps(xy, xy, 0);
                y = _mm_shuffle_ps(xy, xy, 0x55);
            }
            break;
        }
        x = x+y;
        x = x+z;
        stack_array[index] = x;
        min = _mm_min_ps( x, min );    // control the order here so that min is never NaN even if x is nan
        index++;
    }

    // if we found a new min.
    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
    { // we found a new min. Search for it
        // find min across the min vector, place in all elements of min -- big latency hit here
        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

        // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
        // this where it actually makes a difference is handled in the early out at the top of the function,
        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
        // complexity, and removed it.

        dotmin = min;

        // scan for the first occurrence of min in the array
        size_t test;
        for( index = 0; 0 == (test = _mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )    // local_count must be a multiple of 4
        {}
        minIndex = 4*index + segment + indexTable[test];
    }

    _mm_store_ss( dotResult, dotmin);
    return minIndex;
}
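
/*
 A hypothetical usage sketch (not from the original file): both kernels treat
 vv as an array of 16-byte float4 elements, so the vertex data should be
 16-byte aligned; vec itself is loaded unaligned by the SSE versions.
*/
#if 0   /* illustration only */
static void maxdot_usage_example( void )
{
    __attribute__((aligned(16))) static const float verts[3*4] =
    {
        1.f, 0.f, 0.f, 0.f,
        0.f, 2.f, 0.f, 0.f,
        0.f, 0.f, 3.f, 0.f,
    };
    const float dir[4] = { 0.f, 0.f, 1.f, 0.f };
    float dot;
    long idx = _maxdot_large( verts, dir, 3, &dot );    // expect idx == 2, dot == 3.0f
    (void) idx;
    (void) dot;
}
#endif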


#elif defined BT_USE_NEON
#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>


static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );

long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
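// _maxdot_large and _mindot_large start out pointing at the _sel variants:
// the first call probes the CPU and repoints them at _v0 or _v1, so every
// later call dispatches directly. (Which CPU feature capability bit 0x2000
// denotes is not documented here.)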

extern "C" {int _get_cpu_capabilities( void );}

static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    if( _get_cpu_capabilities() & 0x2000 )
        _maxdot_large = _maxdot_large_v1;
    else
        _maxdot_large = _maxdot_large_v0;

    return _maxdot_large(vv, vec, count, dotResult);
}

static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    if( _get_cpu_capabilities() & 0x2000 )
        _mindot_large = _mindot_large_v1;
    else
        _mindot_large = _mindot_large_v0;

    return _mindot_large(vv, vec, count, dotResult);
}
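
/*
 The two _sel functions above implement lazy CPU dispatch: the first call
 reaches the selector through the function pointer, the selector repoints
 the pointer at the implementation chosen for this CPU, and later calls go
 straight there. A minimal sketch of the same pattern with hypothetical
 names (note that, as in the original, the pointer update is unsynchronized):
*/
#if 0   /* illustration only */
static int work_generic( int x ) { return x; }
static int work_fast( int x )    { return x; }
static int work_select( int x );
static int (*work)( int ) = work_select;

static int work_select( int x )
{
    work = cpu_has_fast_path() ? work_fast : work_generic;    // hypothetical probe
    return work( x );
}
#endif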


#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
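
/*
 vld1q_f32_aligned_postincrement is a GCC statement-expression macro: the
 vld1.f32 carries a :128 alignment hint and post-increments the pointer as a
 side effect. A plain-intrinsics equivalent, minus the alignment hint, might
 look like this (an illustrative assumption, not part of the original file):
*/
#if 0   /* illustration only */
#define vld1q_f32_postincrement( _ptr ) ({ float32x4_t _r = vld1q_f32( _ptr ); (_ptr) += 4; _r; })
#endif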


long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    unsigned long i = 0;
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x2_t vLo = vget_low_f32(vvec);
    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
    float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
    uint32x2_t indexLo = (uint32x2_t) {0, 1};
    uint32x2_t indexHi = (uint32x2_t) {2, 3};
    uint32x2_t iLo = (uint32x2_t) {-1, -1};
    uint32x2_t iHi = (uint32x2_t) {-1, -1};
    const uint32x2_t four = (uint32x2_t) {4, 4};

    for( ; i+8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vmul_f32( vget_low_f32(v0), vLo);
        xy1 = vmul_f32( vget_low_f32(v1), vLo);
        xy2 = vmul_f32( vget_low_f32(v2), vLo);
        xy3 = vmul_f32( vget_low_f32(v3), vLo);

        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        zLo = vmul_f32( z0.val[0], vHi);
        zHi = vmul_f32( z1.val[0], vHi);

        rLo = vpadd_f32( xy0, xy1);
        rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        maskLo = vcgt_f32( rLo, dotMaxLo );
        maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }

    for( ; i+4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }

    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            float32x2_t rHi = vpadd_f32( xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32( z0, vHi);
            float32x2_t rLo = vpadd_f32( xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;

        default:
        break;
    }

    // select best answer between hi and lo results
    uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select best answer between even and odd results
    dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vcgt_f32( dotMaxHi, dotMaxLo );
    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32( dotMaxLo, 0);
    return vget_lane_u32(iLo, 0);
}


long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
    float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };

    unsigned long i = 0;
    for( ; i + 8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        zb = vuzpq_f32( z0, z1);
        z = vmulq_f32( zb.val[0], vHi);
        xy = vuzpq_f32( xy0, xy1);
        x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }

    for( ; i + 4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }

    switch (count & 3) {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            // the next two lines should resolve to a single vswp d, d
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
            // the next two lines should resolve to a single vswp d, d
            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z1);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            // the next two lines should resolve to a single vswp d, d
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            // the next two lines should resolve to a single vswp d, d
            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z0);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

            // the next two lines should resolve to a single vswp d, d
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
            // the next two lines should resolve to a single vswp d, d
            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);

            z = vmulq_f32( z, vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        default:
        break;
    }

    // select best answer between hi and lo results
    uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
    float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select best answer between even and odd results
    float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vcgt_f32( maxDotO, maxDot2 );
    maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32( maxDot2, 0);
    return vget_lane_u32(index2, 0);
}

long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    unsigned long i = 0;
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x2_t vLo = vget_low_f32(vvec);
    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
    float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
    uint32x2_t indexLo = (uint32x2_t) {0, 1};
    uint32x2_t indexHi = (uint32x2_t) {2, 3};
    uint32x2_t iLo = (uint32x2_t) {-1, -1};
    uint32x2_t iHi = (uint32x2_t) {-1, -1};
    const uint32x2_t four = (uint32x2_t) {4, 4};

    for( ; i+8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vmul_f32( vget_low_f32(v0), vLo);
        xy1 = vmul_f32( vget_low_f32(v1), vLo);
        xy2 = vmul_f32( vget_low_f32(v2), vLo);
        xy3 = vmul_f32( vget_low_f32(v3), vLo);

        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        zLo = vmul_f32( z0.val[0], vHi);
        zHi = vmul_f32( z1.val[0], vHi);

        rLo = vpadd_f32( xy0, xy1);
        rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        maskLo = vclt_f32( rLo, dotMinLo );
        maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }

    for( ; i+4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }

    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            float32x2_t rHi = vpadd_f32( xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32( z0, vHi);
            float32x2_t rLo = vpadd_f32( xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;

        default:
        break;
    }

    // select best answer between hi and lo results
    uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select best answer between even and odd results
    dotMinHi = vdup_lane_f32(dotMinLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vclt_f32( dotMinHi, dotMinLo );
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32( dotMinLo, 0);
    return vget_lane_u32(iLo, 0);
}

long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
    float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };

    unsigned long i = 0;
    for( ; i + 8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        zb = vuzpq_f32( z0, z1);
        z = vmulq_f32( zb.val[0], vHi);
        xy = vuzpq_f32( xy0, xy1);
        x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }

    for( ; i + 4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }

    switch (count & 3) {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            // the next two lines should resolve to a single vswp d, d
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
            // the next two lines should resolve to a single vswp d, d
            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z1);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            // the next two lines should resolve to a single vswp d, d
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            // the next two lines should resolve to a single vswp d, d
            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z0);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

            // the next two lines should resolve to a single vswp d, d
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
            // the next two lines should resolve to a single vswp d, d
            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);

            z = vmulq_f32( z, vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        default:
        break;
    }

    // select best answer between hi and lo results
    uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select best answer between even and odd results
    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vclt_f32( minDotO, minDot2 );
    minDot2 = vbsl_f32(mask, minDotO, minDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32( minDot2, 0);
    return vget_lane_u32(index2, 0);
}

#else
    #error Unhandled __APPLE__ arch
#endif

#endif    /* defined (BT_USE_SSE) || defined (BT_USE_NEON) */