#ifndef BT_SIMD__QUATERNION_H_
#define BT_SIMD__QUATERNION_H_
#if defined(BT_USE_SSE) || defined(BT_USE_NEON)
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
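// BT_USE_SSE_IN_API opts the public API into SSE register types on x86
// builds; NEON is selected on ARM. Every SIMD branch below is paired with
// a scalar fallback, so these guards only choose an implementation.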
mVec128 = rhs.mVec128;
#ifndef BT_EULER_DEFAULT_ZYX
setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,  // x
	cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,       // y
	sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,       // z
	cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);      // w
setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,  // x
	cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,       // y
	cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,       // z
	cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);      // w
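// The two conventions above differ only in which axis each angle is
// applied about: setEuler is yaw/pitch/roll about Y, X, Z, while
// setEulerZYX is yaw/pitch/roll about Z, Y, X. Both expand the product
// of three axis-angle quaternions into the half-angle terms written out
// above. A minimal usage sketch (hypothetical angle variables, radians):
//
//   btQuaternion q;
//   q.setEulerZYX(yawZ, pitchY, rollX);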
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
mVec128 = _mm_add_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
mVec128 = vaddq_f32(mVec128, q.mVec128);
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
mVec128 = _mm_sub_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
mVec128 = vsubq_f32(mVec128, q.mVec128);
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 vs = _mm_load_ss(&s);        // (S 0 0 0)
vs = bt_pshufd_ps(vs, 0);           // (S S S S)
mVec128 = _mm_mul_ps(mVec128, vs);
#elif defined(BT_USE_NEON)
mVec128 = vmulq_n_f32(mVec128, s);  // NEON broadcast-multiply in one instruction
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 vQ2 = q.get128();
__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0,1,2,0));
__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));
A1 = A1 * B1;
__m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1,2,0,1));
__m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
A2 = A2 * B2;
B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2,0,1,2));
B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
B1 = B1 * B2;                       // A3 * B3
A1 = A1 + A2;                       // AB12
mVec128 = bt_splat_ps(mVec128, 3);  // A0 = broadcast this->w
mVec128 = mVec128 * vQ2;            // A0 * B0
mVec128 = mVec128 - B1;             // AB0 - AB3
A1 = _mm_xor_ps(A1, vPPPM);         // flip the sign of the w lane of AB12
mVec128 = mVec128 + A1;             // (AB0 - AB3) + AB12
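// The shuffle scheme above evaluates the Hamilton product one term group
// at a time. Per lane the target (matching the scalar fallback further
// down in this file) is:
//   x = w1*x2 + x1*w2 + y1*z2 - z1*y2
//   y = w1*y2 + y1*w2 + z1*x2 - x1*z2
//   z = w1*z2 + z1*w2 + x1*y2 - y1*x2
//   w = w1*w2 - x1*x2 - y1*y2 - z1*z2
// Each Ai*Bi product supplies one column of cross terms; vPPPM is a
// (+,+,+,-) sign mask, so a single xor turns the shared add into the
// subtraction the w row needs.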
#elif defined(BT_USE_NEON)
float32x4_t vQ1 = mVec128;
float32x4_t vQ2 = q.get128();
float32x4_t A0, A1, B1, A2, B2, A3, B3;
float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
float32x2x2_t tmp;
tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // -> {z1 x1}, {w1 y1}
vQ1zx = tmp.val[0];
tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // -> {z2 x2}, {w2 y2}
vQ2zx = tmp.val[0];
vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);      // {w2 x2}
vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);      // {y1 z1}
vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);      // {y2 z2}
vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);                               // {x2 z2}
A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // {x1 y1 z1 x1}
B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // {w2 w2 w2 x2}
A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));                     // {y1 z1 x1 y1}
B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));   // {z2 x2 y2 y2}
A3 = vcombine_f32(vQ1zx, vQ1yz);                                 // {z1 x1 y1 z1}
B3 = vcombine_f32(vQ2yz, vQ2xz);                                 // {y2 z2 x2 z2}
A1 = vmulq_f32(A1, B1);
A2 = vmulq_f32(A2, B2);
A3 = vmulq_f32(A3, B3);
A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);  // w1 * {x2 y2 z2 w2}
A1 = vaddq_f32(A1, A2);
A0 = vsubq_f32(A0, A3);
A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);  // flip sign of the w lane
A0 = vaddq_f32(A0, A1);
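// The NEON path computes the same four term groups: vtrn/vext/vcombine
// rebuild the lane permutations that bt_pshufd_ps produces on SSE, and
// veorq_s32 with vPPPM applies the same (+,+,+,-) sign flip.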
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
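// Horizontal reduction sketch: vd holds {xx' yy' zz' ww'}; movehl folds
// the high pair onto the low pair, then adding the broadcast of lane 1
// (shuffle 0x55) leaves the full sum xx'+yy'+zz'+ww' in lane 0.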
__m128 vd = _mm_mul_ps(mVec128, q.mVec128);
__m128 t = _mm_movehl_ps(vd, vd);
vd = _mm_add_ps(vd, t);
t = _mm_shuffle_ps(vd, vd, 0x55);
vd = _mm_add_ss(vd, t);
return _mm_cvtss_f32(vd);
#elif defined(BT_USE_NEON)
float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));  // {xx'+yy', zz'+ww'}
x = vpadd_f32(x, x);                                             // both lanes = full dot
return vget_lane_f32(x, 0);
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
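// Same reduction as dot() above, but on mVec128*mVec128; the sqrt/div
// pair then computes 1/|q| at full precision (rather than using the
// approximate _mm_rsqrt_ss) before broadcasting it to all four lanes.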
__m128 vd = _mm_mul_ps(mVec128, mVec128);
__m128 t = _mm_movehl_ps(vd, vd);
vd = _mm_add_ps(vd, t);
t = _mm_shuffle_ps(vd, vd, 0x55);
vd = _mm_add_ss(vd, t);
vd = _mm_sqrt_ss(vd);
vd = _mm_div_ss(vOnes, vd);
vd = bt_pshufd_ps(vd, 0);  // broadcast 1/length to all lanes
mVec128 = _mm_mul_ps(mVec128, vd);
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 vs = _mm_load_ss(&s);   // (S 0 0 0)
vs = bt_pshufd_ps(vs, 0x00);   // (S S S S)
return btQuaternion(mVec128 * vs);
#elif defined(BT_USE_NEON)
return btVector3(m_floats[0] * s, m_floats[1] * s, m_floats[2] * s);
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
#elif defined(BT_USE_NEON)
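// vQInv is presumably a (-0,-0,-0,+0) mask: the xor flips the sign bits
// of x, y and z, yielding the conjugate, which equals the inverse for
// unit quaternions.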
return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
#elif defined(BT_USE_NEON)
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
#elif defined(BT_USE_NEON)
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
#elif defined(BT_USE_NEON)
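// btvMzeroMask carries -0.0 in all four lanes, so the xor below negates
// every component at once.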
return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask));
if (diff.dot(diff) > sum.dot(sum))
if (diff.dot(diff) < sum.dot(sum))
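// q and -q encode the same rotation; farthest()/nearest() compare the
// squared lengths of (*this - qd) and (*this + qd) to pick whichever
// sign of qd is angularly farther from, or closer to, *this.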
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 vQ1 = q1.get128();
__m128 vQ2 = q2.get128();
__m128 A0, A1, B1, A2, B2;
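// Free-function q1*q2: the same shuffle/sign-mask scheme as operator*=
// above, with both operands read from registers.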
A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));
B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));
A1 = A1 * B1;
A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
A2 = A2 * B2;
B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
B1 = B1 * B2;              // A3 * B3
A0 = bt_splat_ps(vQ1, 3);  // broadcast w1
A0 = A0 * vQ2;
A1 = A1 + A2;
A0 = A0 - B1;
A1 = _mm_xor_ps(A1, vPPPM);  // flip sign of the w lane
A0 = A0 + A1;
#elif defined(BT_USE_NEON)
float32x4_t vQ1 = q1.get128();
float32x4_t vQ2 = q2.get128();
float32x4_t A0, A1, B1, A2, B2, A3, B3;
float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
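// NEON lane setup, identical to the operator*= path above.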
float32x2x2_t tmp;
tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // -> {z1 x1}, {w1 y1}
vQ1zx = tmp.val[0];
tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // -> {z2 x2}, {w2 y2}
vQ2zx = tmp.val[0];
vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);  // {w2 x2}
vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);  // {y1 z1}
vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);  // {y2 z2}
vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);                           // {x2 z2}
A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);
A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
A3 = vcombine_f32(vQ1zx, vQ1yz);
B3 = vcombine_f32(vQ2yz, vQ2xz);
A1 = vmulq_f32(A1, B1);
A2 = vmulq_f32(A2, B2);
A3 = vmulq_f32(A3, B3);
A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);  // w1 * q2
A1 = vaddq_f32(A1, A2);
A0 = vsubq_f32(A0, A3);
A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);  // flip sign of the w lane
A0 = vaddq_f32(A0, A1);
q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z());
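// The scalar fallback spells out the Hamilton product directly. Under
// the v' = q*v*q^-1 rotation convention (see quatRotate below),
// composition reads right to left; a sketch with hypothetical variables:
//
//   btQuaternion qc = q1 * q2;  // rotates by q2 first, then by q1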
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 vQ1 = q.get128();
__m128 vQ2 = w.get128();
__m128 A1, B1, A2, B2, A3, B3;
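// q * btVector3: the vector is promoted to a pure quaternion (w = 0),
// so only three term groups are needed; A1 broadcasts q.w across the
// vector lanes instead of splatting a separate A0 product.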
A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3,3,3,0));  // {w w w x} of q
B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0,1,2,0));  // {x y z x} of w
A1 = A1 * B1;
A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
A2 = A2 * B2;
A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
A3 = A3 * B3;
A1 = A1 + A2;
A1 = _mm_xor_ps(A1, vPPPM);  // flip sign of the w lane
A1 = A1 - A3;
#elif defined(BT_USE_NEON)
float32x4_t vQ1 = q.get128();
float32x4_t vQ2 = w.get128();
float32x4_t A1, B1, A2, B2, A3, B3;
float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1);  // {w1 x1}
float32x2x2_t tmp;
tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // -> {z2 x2}, {w2 y2}
vQ2zx = tmp.val[0];
tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // -> {z1 x1}, {w1 y1}
vQ1zx = tmp.val[0];
vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);  // {y1 z1}
vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);  // {y2 z2}
vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);                           // {x2 z2}
A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx);  // {w1 w1 w1 x1}
B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);                     // {x2 y2 z2 x2}
A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
A3 = vcombine_f32(vQ1zx, vQ1yz);
B3 = vcombine_f32(vQ2yz, vQ2xz);
A1 = vmulq_f32(A1, B1);
A2 = vmulq_f32(A2, B2);
A3 = vmulq_f32(A3, B3);
A1 = vaddq_f32(A1, A2);
A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);  // flip sign of the w lane
A1 = vsubq_f32(A1, A3);
q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
-q.x() * w.x() - q.y() * w.y() - q.z() * w.z());
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 vQ1 = w.get128();
__m128 vQ2 = q.get128();
__m128 A1, B1, A2, B2, A3, B3;
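// btVector3 * q: mirror of the previous operator with the operands swapped.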
A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));  // {x y z x} of w
B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));  // {w w w x} of q
A1 = A1 * B1;
A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
A2 = A2 * B2;
A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
A3 = A3 * B3;
A1 = A1 + A2;
A1 = _mm_xor_ps(A1, vPPPM);  // flip sign of the w lane
A1 = A1 - A3;
#elif defined(BT_USE_NEON)
float32x4_t vQ1 = w.get128();
float32x4_t vQ2 = q.get128();
float32x4_t A1, B1, A2, B2, A3, B3;
float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
float32x2x2_t tmp;
tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // -> {z1 x1}, {w1 y1}
vQ1zx = tmp.val[0];
tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // -> {z2 x2}, {w2 y2}
vQ2zx = tmp.val[0];
vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);  // {w2 x2}
vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);  // {y1 z1}
vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);  // {y2 z2}
vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);                           // {x2 z2}
A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);
A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
A3 = vcombine_f32(vQ1zx, vQ1yz);
B3 = vcombine_f32(vQ2yz, vQ2xz);
A1 = vmulq_f32(A1, B1);
A2 = vmulq_f32(A2, B2);
A3 = vmulq_f32(A3, B3);
A1 = vaddq_f32(A1, A2);
A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);  // flip sign of the w lane
A1 = vsubq_f32(A1, A3);
+w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
-w.x() * q.x() - w.y() * q.y() - w.z() * q.z());
return q1.slerp(q2, t);
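// The free function slerp interpolates along the great arc between q1
// and q2: t = 0 returns q1, t = 1 returns q2. A usage sketch
// (hypothetical endpoints):
//
//   btQuaternion qm = slerp(qStart, qEnd, btScalar(0.5));  // halfway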
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
#elif defined(BT_USE_NEON)
return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));
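// The FFF0 masks keep the x, y, z lanes and clear w, so the
// pure-quaternion result of q*v*q^-1 comes back as a plain btVector3.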
#endif  // BT_SIMD__QUATERNION_H_