// Select the serialization struct (and its string name, used by the
// serializer's type registry) that matches the build's floating-point
// precision. As written, both pairs of #defines sat inside the
// BT_USE_DOUBLE_PRECISION branch, redefining btVector3Data/btVector3DataName;
// the #else restores the intended either/or selection.
25 #ifdef BT_USE_DOUBLE_PRECISION
26 #define btVector3Data btVector3DoubleData
27 #define btVector3DataName "btVector3DoubleData"
28 #else
29 #define btVector3Data btVector3FloatData
30 #define btVector3DataName "btVector3FloatData"
31 #endif //BT_USE_DOUBLE_PRECISION
33 #if defined BT_USE_SSE
// MSVC warns (C4556) when a shuffle immediate's upper bits exceed 0-255;
// only the low byte of the immediate is meaningful to the intrinsic, and
// this file deliberately passes wider values (see the 0x80 splat masks
// used below), so the warning is silenced.
38 #pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
// Pack four 2-bit lane selectors into an _mm_shuffle_ps immediate:
// x picks the source lane for result lane 0, y for lane 1, z for lane 2,
// w for lane 3. Each argument is evaluated exactly once.
42 #define BT_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x))
// pshufd-style shuffle of a single register against itself.
44 #define bt_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) )
// Replicate lane _i into lanes 0..2; lane 3 keeps the source's lane 3
// (preserving the unused w component).
45 #define bt_splat3_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i, 3) )
// Broadcast lane _i into all four lanes.
46 #define bt_splat_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i,_i) )
// Integer masks (note: _mm_set_epi32 takes elements highest-first, so the
// first argument is the w lane).
// Clear the sign bit of x/y/z and zero out w entirely.
48 #define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// Clear the sign bit of all four lanes (full absolute value).
49 #define btvAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// All-ones in x/y/z, zero in w: ANDing with this zeroes the unused
// fourth component after arithmetic that may have dirtied it.
50 #define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
// Float-typed reinterpretations of the integer masks above, for use with
// _mm_and_ps / _mm_xor_ps without explicit casts at each call site.
51 #define btv3AbsfMask btCastiTo128f(btv3AbsiMask)
52 #define btvFFF0fMask btCastiTo128f(btvFFF0Mask)
53 #define btvxyzMaskf btvFFF0fMask
54 #define btvAbsfMask btCastiTo128f(btvAbsMask)
// NEON counterparts of the SSE mask constants: aligned global vector
// constants instead of per-use _mm_set_epi32 expressions.
// -0.0f in every lane (sign bit only); XOR with this negates a vector.
67 const float32x4_t
ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
// All-ones in x/y/z, zero in w: AND with this clears the unused fourth
// component.
68 const int32x4_t
ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
// Clear the sign bit of all four lanes (full absolute value).
69 const int32x4_t
ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
// Clear the sign bit of x/y/z and zero w (3-component absolute value).
70 const int32x4_t
ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};
84 #if defined (__SPU__) && defined (__CELLOS_LV2__)
92 #else //__CELLOS_LV2__ __SPU__
93 #if defined (BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM
109 #endif //__CELLOS_LV2__ __SPU__
134 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) )|| defined (BT_USE_NEON)
144 mVec128 = rhs.mVec128;
155 #endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
161 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
162 mVec128 = _mm_add_ps(mVec128, v.mVec128);
163 #elif defined(BT_USE_NEON)
164 mVec128 = vaddq_f32(mVec128, v.mVec128);
178 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
179 mVec128 = _mm_sub_ps(mVec128, v.mVec128);
180 #elif defined(BT_USE_NEON)
181 mVec128 = vsubq_f32(mVec128, v.mVec128);
194 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
195 __m128 vs = _mm_load_ss(&s);
196 vs = bt_pshufd_ps(vs, 0x80);
197 mVec128 = _mm_mul_ps(mVec128, vs);
198 #elif defined(BT_USE_NEON)
199 mVec128 = vmulq_n_f32(mVec128, s);
214 #if 0 //defined(BT_USE_SSE_IN_API)
216 __m128 vs = _mm_load_ss(&s);
217 vs = _mm_div_ss(v1110, vs);
218 vs = bt_pshufd_ps(vs, 0x00);
220 mVec128 = _mm_mul_ps(mVec128, vs);
232 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
233 __m128 vd = _mm_mul_ps(mVec128, v.mVec128);
234 __m128 z = _mm_movehl_ps(vd, vd);
235 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
236 vd = _mm_add_ss(vd, y);
237 vd = _mm_add_ss(vd, z);
238 return _mm_cvtss_f32(vd);
239 #elif defined(BT_USE_NEON)
240 float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
241 float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));
242 x = vadd_f32(x, vget_high_f32(vd));
243 return vget_lane_f32(x, 0);
245 return m_floats[0] * v.
m_floats[0] +
274 int maxIndex = absVec.
maxAxis();
275 if (absVec[maxIndex]>0)
277 *
this /= absVec[maxIndex];
288 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
290 __m128 vd = _mm_mul_ps(mVec128, mVec128);
291 __m128 z = _mm_movehl_ps(vd, vd);
292 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
293 vd = _mm_add_ss(vd, y);
294 vd = _mm_add_ss(vd, z);
297 vd = _mm_sqrt_ss(vd);
298 vd = _mm_div_ss(v1110, vd);
299 vd = bt_splat_ps(vd, 0x80);
300 mVec128 = _mm_mul_ps(mVec128, vd);
304 y = _mm_rsqrt_ss(vd);
308 vd = _mm_mul_ss(vd, vHalf);
310 vd = _mm_mul_ss(vd, y);
311 vd = _mm_mul_ss(vd, y);
312 z = _mm_sub_ss(z, vd);
314 y = _mm_mul_ss(y, z);
316 y = bt_splat_ps(y, 0x80);
317 mVec128 = _mm_mul_ps(mVec128, y);
348 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
349 return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
350 #elif defined(BT_USE_NEON)
364 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
367 T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3));
368 V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3));
370 V = _mm_mul_ps(V, mVec128);
371 T = _mm_mul_ps(T, v.mVec128);
372 V = _mm_sub_ps(V, T);
374 V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3));
376 #elif defined(BT_USE_NEON)
379 float32x2_t Tlow = vget_low_f32(mVec128);
380 float32x2_t Vlow = vget_low_f32(v.mVec128);
381 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
382 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
384 V = vmulq_f32(V, mVec128);
385 T = vmulq_f32(T, v.mVec128);
387 Vlow = vget_low_f32(V);
389 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
390 V = (float32x4_t)vandq_s32((int32x4_t)
V, btvFFF0Mask);
403 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
405 __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3));
406 __m128
V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3));
408 V = _mm_mul_ps(V, v1.mVec128);
409 T = _mm_mul_ps(T, v2.mVec128);
410 V = _mm_sub_ps(V, T);
412 V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3));
415 V = _mm_mul_ps(V, mVec128);
416 __m128 z = _mm_movehl_ps(V, V);
417 __m128 y = _mm_shuffle_ps(V, V, 0x55);
418 V = _mm_add_ss(V, y);
419 V = _mm_add_ss(V, z);
420 return _mm_cvtss_f32(V);
422 #elif defined(BT_USE_NEON)
426 float32x2_t Tlow = vget_low_f32(v1.mVec128);
427 float32x2_t Vlow = vget_low_f32(v2.mVec128);
428 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
429 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
431 V = vmulq_f32(V, v1.mVec128);
432 T = vmulq_f32(T, v2.mVec128);
434 Vlow = vget_low_f32(V);
436 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
439 V = vmulq_f32(mVec128, V);
440 float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
441 x = vadd_f32(x, vget_high_f32(V));
442 return vget_lane_f32(x, 0);
455 return m_floats[0] < m_floats[1] ? (m_floats[0] <m_floats[2] ? 0 : 2) : (m_floats[1] <m_floats[2] ? 1 : 2);
462 return m_floats[0] < m_floats[1] ? (m_floats[1] <m_floats[2] ? 2 : 1) : (m_floats[0] <m_floats[2] ? 2 : 0);
467 return absolute().minAxis();
472 return absolute().maxAxis();
478 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
479 __m128 vrt = _mm_load_ss(&rt);
481 __m128 vs = _mm_load_ss(&s);
482 vs = bt_pshufd_ps(vs, 0x80);
483 __m128 r0 = _mm_mul_ps(v0.mVec128, vs);
484 vrt = bt_pshufd_ps(vrt, 0x80);
485 __m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
486 __m128 tmp3 = _mm_add_ps(r0,r1);
488 #elif defined(BT_USE_NEON)
489 mVec128 = vsubq_f32(v1.mVec128, v0.mVec128);
490 mVec128 = vmulq_n_f32(mVec128, rt);
491 mVec128 = vaddq_f32(mVec128, v0.mVec128);
507 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
508 __m128 vt = _mm_load_ss(&t);
509 vt = bt_pshufd_ps(vt, 0x80);
510 __m128 vl = _mm_sub_ps(v.mVec128, mVec128);
511 vl = _mm_mul_ps(vl, vt);
512 vl = _mm_add_ps(vl, mVec128);
515 #elif defined(BT_USE_NEON)
516 float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
517 vl = vmulq_n_f32(vl, t);
518 vl = vaddq_f32(vl, mVec128);
524 m_floats[1] + (v.
m_floats[1] - m_floats[1]) * t,
525 m_floats[2] + (v.
m_floats[2] - m_floats[2]) * t);
533 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
534 mVec128 = _mm_mul_ps(mVec128, v.mVec128);
535 #elif defined(BT_USE_NEON)
536 mVec128 = vmulq_f32(mVec128, v.mVec128);
576 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
577 return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
579 return ((m_floats[3]==other.
m_floats[3]) &&
588 return !(*
this == other);
596 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
597 mVec128 = _mm_max_ps(mVec128, other.mVec128);
598 #elif defined(BT_USE_NEON)
599 mVec128 = vmaxq_f32(mVec128, other.mVec128);
613 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
614 mVec128 = _mm_min_ps(mVec128, other.mVec128);
615 #elif defined(BT_USE_NEON)
616 mVec128 = vminq_f32(mVec128, other.mVec128);
635 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
637 __m128
V = _mm_and_ps(mVec128, btvFFF0fMask);
638 __m128 V0 = _mm_xor_ps(btvMzeroMask, V);
639 __m128 V2 = _mm_movelh_ps(V0, V);
641 __m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
643 V0 = _mm_shuffle_ps(V0, V, 0xDB);
644 V2 = _mm_shuffle_ps(V2, V, 0xF9);
658 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
659 mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
660 #elif defined(BT_USE_NEON)
661 int32x4_t vi = vdupq_n_s32(0);
662 mVec128 = vreinterpretq_f32_s32(vi);
705 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
707 __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
708 __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
709 __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 );
710 __m128 b0 = _mm_unpacklo_ps( a0, a1 );
711 __m128 b1 = _mm_unpackhi_ps( a0, a1 );
712 __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() );
713 __m128 r = _mm_movelh_ps( b0, b2 );
714 r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 ));
715 a2 = _mm_and_ps( a2, btvxyzMaskf);
716 r = _mm_add_ps( r, btCastdTo128f (_mm_move_sd( btCastfTo128d(a2), btCastfTo128d(b1) )));
719 #elif defined(BT_USE_NEON)
720 static const uint32x4_t xyzMask = (
const uint32x4_t){ -1, -1, -1, 0 };
721 float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
722 float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
723 float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
724 float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1));
725 a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask );
726 float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] );
727 float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
728 return btVector3( vcombine_f32(b0, b1) );
739 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
740 return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
741 #elif defined(BT_USE_NEON)
742 return btVector3(vaddq_f32(v1.mVec128, v2.mVec128));
755 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
756 return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
757 #elif defined(BT_USE_NEON)
758 return btVector3(vmulq_f32(v1.mVec128, v2.mVec128));
771 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
774 __m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
775 return btVector3(_mm_and_ps(r, btvFFF0fMask));
776 #elif defined(BT_USE_NEON)
777 float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
778 return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
791 #if (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
792 __m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
793 return btVector3(_mm_and_ps(r, btvFFF0fMask));
794 #elif defined(BT_USE_NEON)
795 return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask));
805 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
806 __m128 vs = _mm_load_ss(&s);
807 vs = bt_pshufd_ps(vs, 0x80);
808 return btVector3(_mm_mul_ps(v.mVec128, vs));
809 #elif defined(BT_USE_NEON)
810 float32x4_t r = vmulq_n_f32(v.mVec128, s);
811 return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
829 #if 0 //defined(BT_USE_SSE_IN_API)
831 __m128 vs = _mm_load_ss(&s);
832 vs = _mm_div_ss(v1110, vs);
833 vs = bt_pshufd_ps(vs, 0x00);
835 return btVector3(_mm_mul_ps(v.mVec128, vs));
845 #if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE))
846 __m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
847 vec = _mm_and_ps(vec, btvFFF0fMask);
849 #elif defined(BT_USE_NEON)
850 float32x4_t x, y, v, m;
856 m = vrecpsq_f32(y, v);
858 m = vrecpsq_f32(y, v);
921 return v1.
lerp(v2, t);
928 return (v - *
this).length2();
933 return (v - *
this).length();
938 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
951 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
953 __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
955 __m128 C = wAxis.
cross( mVec128 ).mVec128;
956 O = _mm_and_ps(O, btvFFF0fMask);
959 __m128 vsin = _mm_load_ss(&ssin);
960 __m128 vcos = _mm_load_ss(&scos);
962 __m128 Y = bt_pshufd_ps(O, 0xC9);
963 __m128 Z = bt_pshufd_ps(O, 0xD2);
964 O = _mm_add_ps(O, Y);
965 vsin = bt_pshufd_ps(vsin, 0x80);
966 O = _mm_add_ps(O, Z);
967 vcos = bt_pshufd_ps(vcos, 0x80);
970 O = O * wAxis.mVec128;
971 __m128 X = mVec128 - O;
983 _y = wAxis.
cross( *
this );
985 return ( o + _x *
btCos( _angle ) + _y *
btSin( _angle ) );
991 #if defined (BT_USE_SSE) || defined (BT_USE_NEON)
992 #if defined _WIN32 || defined (BT_USE_SSE)
993 const long scalar_cutoff = 10;
994 long _maxdot_large(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
995 #elif defined BT_USE_NEON
996 const long scalar_cutoff = 4;
997 extern long (*_maxdot_large)(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
999 if( array_count < scalar_cutoff )
1002 #endif//BT_USE_SSE || BT_USE_NEON
1007 for( i = 0; i < array_count; i++ )
1021 #if defined (BT_USE_SSE) || defined (BT_USE_NEON)
1022 return _maxdot_large( (
float*) array, (
float*) &
m_floats[0], array_count, &dotOut );
1028 #if defined (BT_USE_SSE) || defined (BT_USE_NEON)
1029 #if defined BT_USE_SSE
1030 const long scalar_cutoff = 10;
1031 long _mindot_large(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1032 #elif defined BT_USE_NEON
1033 const long scalar_cutoff = 4;
1034 extern long (*_mindot_large)(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1036 #error unhandled arch!
1039 if( array_count < scalar_cutoff )
1040 #endif//BT_USE_SSE || BT_USE_NEON
1046 for( i = 0; i < array_count; i++ )
1061 #if defined (BT_USE_SSE) || defined (BT_USE_NEON)
1062 return _mindot_large( (
float*) array, (
float*) &
m_floats[0], array_count, &dotOut );
1080 #if (defined (BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined (BT_USE_NEON)
1088 mVec128 = rhs.mVec128;
1094 mVec128 = v.mVec128;
1097 #endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1101 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
1102 return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
1103 #elif defined(BT_USE_NEON)
1219 #ifdef BT_USE_DOUBLE_PRECISION
1220 unsigned char* dest = (
unsigned char*) &destVal;
1221 unsigned char* src = (
unsigned char*) &sourceVal;
1231 unsigned char* dest = (
unsigned char*) &destVal;
1232 unsigned char* src = (
unsigned char*) &sourceVal;
1237 #endif //BT_USE_DOUBLE_PRECISION
1242 for (
int i=0;i<4;i++)
1254 for (
int i=0;i<4;i++)
1258 vector = swappedVec;
1266 btScalar a = n[1]*n[1] + n[2]*n[2];
1278 btScalar a = n[0]*n[0] + n[1]*n[1];
1305 for (
int i=0;i<4;i++)
1311 for (
int i=0;i<4;i++)
1319 for (
int i=0;i<4;i++)
1325 for (
int i=0;i<4;i++)
1333 for (
int i=0;i<4;i++)
1339 for (
int i=0;i<4;i++)
1343 #endif //BT_VECTOR3_H