#ifndef BT_MATRIX3x3_H
#define BT_MATRIX3x3_H
#if defined(BT_USE_SSE) || defined(BT_USE_NEON)
#ifdef BT_USE_DOUBLE_PRECISION
#define btMatrix3x3Data btMatrix3x3DoubleData
#else
#define btMatrix3x3Data btMatrix3x3FloatData
#endif  //BT_USE_DOUBLE_PRECISION
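// btMatrix3x3Data is the serialization struct for this class; it follows the
// build-wide precision switch, so serialized matrices match btScalar's width.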
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
m_el[0].mVec128 = rhs.m_el[0].mVec128;
m_el[1].mVec128 = rhs.m_el[1].mVec128;
m_el[2].mVec128 = rhs.m_el[2].mVec128;
m_el[0].mVec128 = m.m_el[0].mVec128;
m_el[1].mVec128 = m.m_el[1].mVec128;
m_el[2].mVec128 = m.m_el[2].mVec128;
m_el[0] = other.m_el[0];
m_el[1] = other.m_el[1];
m_el[2] = other.m_el[2];
m_el[0] = other.m_el[0];
m_el[1] = other.m_el[1];
m_el[2] = other.m_el[2];
return btVector3(m_el[0][i], m_el[1][i], m_el[2][i]);
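// The matrix is stored as three row vectors in m_el, so fetching a column
// gathers the i-th component of each row, while getRow(i) can return m_el[i]
// directly.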
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 vs, Q = q.get128();
__m128i Qi = btCastfTo128i(Q);
__m128 Y, Z;
__m128 V1, V2, V3;
__m128 V11, V21, V31;
__m128 NQ = _mm_xor_ps(Q, btvMzeroMask);
__m128i NQi = btCastfTo128i(NQ);

V1 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 2, 3)));
V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0, 0, 1, 3));
V3 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(2, 1, 0, 3)));
V1 = _mm_xor_ps(V1, vMPPP);

V11 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 1, 0, 3)));
V21 = _mm_unpackhi_ps(Q, Q);
V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0, 2, 0, 3));

V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2, 3, 1, 3));
V21 = _mm_xor_ps(V21, vMPPP);
V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3, 3, 1, 3));
V31 = _mm_xor_ps(V31, vMPPP);
Y = btCastiTo128f(_mm_shuffle_epi32(NQi, BT_SHUFFLE(3, 2, 0, 3)));
Z = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 1, 3)));

vs = _mm_load_ss(&s);
vs = bt_splat3_ps(vs, 0);
btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s;
btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs;
btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs;
btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs;
setValue(
	btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
	xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
	xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
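// Sanity check for the scalar path above (the standard quaternion-to-matrix
// conversion with s = 2 / |q|^2): for q = (0, 0, sin(t/2), cos(t/2)), a unit
// rotation about Z, the terms reduce to
//   [ cos t  -sin t  0 ]
//   [ sin t   cos t  0 ]
//   [   0       0    1 ]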
setEulerZYX(roll, pitch, yaw);
setValue(cj * ch, sj * sc - cs, sj * cc + ss,
	cj * sh, sj * ss + cc, sj * cs - sc,
	-sj, cj * si, cj * ci);
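// Assuming the conventional setEulerZYX precomputation (ci/si, cj/sj, ch/sh
// are the cosines/sines of the X, Y and Z angles, with cc = ci*ch, cs = ci*sh,
// sc = si*ch, ss = si*sh), this setValue spells out the product Rz * Ry * Rx.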
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
static const btMatrix3x3 identityMatrix(v1000, v0100, v0010);

return identityMatrix;
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 v0 = m_el[0].mVec128;
__m128 v1 = m_el[1].mVec128;
__m128 v2 = m_el[2].mVec128;  // x2 y2 z2 w2
__m128 *vm = (__m128 *)m;
__m128 vT;

v2 = _mm_and_ps(v2, btvFFF0fMask);  // x2 y2 z2 0

vT = _mm_unpackhi_ps(v0, v1);  // z0 z1 * *
v0 = _mm_unpacklo_ps(v0, v1);  // x0 x1 y0 y1

v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));  // y0 y1 y2 0
v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));  // x0 x1 x2 0
v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0
#elif defined(BT_USE_NEON)
static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
float32x4_t *vm = (float32x4_t *)m;
float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2 0}, {y2 0}
float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);
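// Both SIMD paths above are a 3x3 transpose written out through vm with the
// fourth lane cleared: the SSE version interleaves rows with unpack/shuffle,
// the NEON version uses the vtrn transpose primitives.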
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
temp.f[0] = m_el[2].y() - m_el[1].z();
temp.f[1] = m_el[0].z() - m_el[2].x();
temp.f[2] = m_el[1].x() - m_el[0].y();
if (m_el[0].x() < m_el[1].y())
{
	if (m_el[1].y() < m_el[2].z())
	{ i = 2; j = 0; k = 1; }
	else
	{ i = 1; j = 2; k = 0; }
}
else
{
	if (m_el[0].x() < m_el[2].z())
	{ i = 2; j = 0; k = 1; }
	else
	{ i = 0; j = 1; k = 2; }
}

x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);

temp.f[3] = (m_el[k][j] - m_el[j][k]);
temp.f[j] = (m_el[j][i] + m_el[i][j]);
temp.f[k] = (m_el[k][i] + m_el[i][k]);
temp.f[i] = x;
btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
temp[0] = ((m_el[2].y() - m_el[1].z()) * s);
temp[1] = ((m_el[0].z() - m_el[2].x()) * s);
temp[2] = ((m_el[1].x() - m_el[0].y()) * s);
int i = m_el[0].x() < m_el[1].y() ?
	(m_el[1].y() < m_el[2].z() ? 2 : 1) :
	(m_el[0].x() < m_el[2].z() ? 2 : 0);
temp[3] = (m_el[k][j] - m_el[j][k]) * s;
temp[j] = (m_el[j][i] + m_el[i][j]) * s;
temp[k] = (m_el[k][i] + m_el[i][k]) * s;
q.setValue(temp[0], temp[1], temp[2], temp[3]);
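// getRotation uses the classic trace-based quaternion extraction: when the
// trace is positive, w dominates and temp[0..2] come from the antisymmetric
// part; otherwise the largest diagonal element picks which of x/y/z to
// recover first, keeping the divisor s well away from zero.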
if (btFabs(m_el[2].x()) >= 1)
// gimbal locked up
euler_out.roll = euler_out.pitch + delta;
euler_out2.roll = euler_out.pitch + delta;
// gimbal locked down
euler_out.roll = -euler_out.pitch + delta;
euler_out2.roll = -euler_out.pitch + delta;
euler_out.pitch = -btAsin(m_el[2].x());
euler_out2.pitch = SIMD_PI - euler_out.pitch;
euler_out.roll = btAtan2(m_el[2].y() / btCos(euler_out.pitch),
	m_el[2].z() / btCos(euler_out.pitch));
euler_out2.roll = btAtan2(m_el[2].y() / btCos(euler_out2.pitch),
	m_el[2].z() / btCos(euler_out2.pitch));

euler_out.yaw = btAtan2(m_el[1].x() / btCos(euler_out.pitch),
	m_el[0].x() / btCos(euler_out.pitch));
euler_out2.yaw = btAtan2(m_el[1].x() / btCos(euler_out2.pitch),
	m_el[0].x() / btCos(euler_out2.pitch));
if (solution_number == 1)
{
	yaw = euler_out.yaw;
	pitch = euler_out.pitch;
	roll = euler_out.roll;
}
else
{
	yaw = euler_out2.yaw;
	pitch = euler_out2.pitch;
	roll = euler_out2.roll;
}
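// A ZYX rotation matrix always admits two Euler decompositions (pitch and
// SIMD_PI - pitch); solution_number simply selects which of the two
// consistent triples the caller receives. Both reproduce the same matrix.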
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
return btMatrix3x3(
	m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
	m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
	m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
// tdotx(v): v dotted with column 0
return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();

// tdoty(v): v dotted with column 1
return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();

// tdotz(v): v dotted with column 2
return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
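// These helpers dot against columns, i.e. rows of the transpose, which lets
// transposeTimes() and the row-vector operator*(v, m) form rows of a product
// without materializing a transposed copy first.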
for (int step = maxSteps; step > 0; step--)
btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);

t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2))
				 : 1 / (theta - btSqrt(1 + theta2));
cos = 1 / btSqrt(1 + t * t);
sin = cos * t;

// approximation for very large theta, where t is close to 1 / (2 * theta)
t = 1 / (theta * (2 + btScalar(0.5) / theta2));

// apply the rotation to the matrix (this = J^T * this * J)
m_el[p][q] = m_el[q][p] = 0;
m_el[p][p] -= t * mpq;
m_el[q][q] += t * mpq;

m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;

// accumulate the rotation into rot (rot = rot * J)
for (int i = 0; i < 3; i++)
{
	btVector3& row = rot[i];
	mrp = row[p];
	mrq = row[q];
	row[p] = cos * mrp - sin * mrq;
	row[q] = cos * mrq + sin * mrp;
}
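// This is the classic Jacobi eigenvalue iteration: each step picks a plane
// (p, q), computes the tan/cos/sin of the plane rotation that zeroes the
// off-diagonal element m_el[p][q], and accumulates the rotations in rot so
// that at convergence this is (approximately) diagonal and rot's columns are
// the eigenvectors.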
return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
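// cofac(r1, c1, r2, c2) is the 2x2 minor determinant built from rows r1/r2
// and columns c1/c2; adjoint() and inverse() below call it with the index
// pairs that make each entry the properly signed cofactor.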
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
__m128 rv00, rv01, rv02;
__m128 rv10, rv11, rv12;
__m128 rv20, rv21, rv22;
__m128 mv0, mv1, mv2;

rv02 = m_el[0].mVec128;
rv12 = m_el[1].mVec128;
rv22 = m_el[2].mVec128;
mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);
rv00 = bt_splat_ps(rv02, 0);
rv01 = bt_splat_ps(rv02, 1);
rv02 = bt_splat_ps(rv02, 2);

rv00 = _mm_mul_ps(rv00, mv0);
rv01 = _mm_mul_ps(rv01, mv1);
rv02 = _mm_mul_ps(rv02, mv2);

rv10 = bt_splat_ps(rv12, 0);
rv11 = bt_splat_ps(rv12, 1);
rv12 = bt_splat_ps(rv12, 2);

rv10 = _mm_mul_ps(rv10, mv0);
rv11 = _mm_mul_ps(rv11, mv1);
rv12 = _mm_mul_ps(rv12, mv2);

rv20 = bt_splat_ps(rv22, 0);
rv21 = bt_splat_ps(rv22, 1);
rv22 = bt_splat_ps(rv22, 2);

rv20 = _mm_mul_ps(rv20, mv0);
rv21 = _mm_mul_ps(rv21, mv1);
rv22 = _mm_mul_ps(rv22, mv2);

rv00 = _mm_add_ps(rv00, rv01);
rv10 = _mm_add_ps(rv10, rv11);
rv20 = _mm_add_ps(rv20, rv21);

m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
#elif defined(BT_USE_NEON)
float32x4_t rv0, rv1, rv2;
float32x4_t v0, v1, v2;
float32x4_t mv0, mv1, mv2;

v0 = m_el[0].mVec128;
v1 = m_el[1].mVec128;
v2 = m_el[2].mVec128;

mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

m_el[0].mVec128 = rv0;
m_el[1].mVec128 = rv1;
m_el[2].mVec128 = rv2;
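// Both operator*= paths compute row_i(this) = sum_k this[i][k] * row_k(m) by
// broadcasting one scalar of a row at a time and multiply-accumulating whole
// rows of m; masking m's rows with btvFFF0Mask keeps the unused w lane zero.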
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80);
return btMatrix3x3(
	_mm_mul_ps(m[0].mVec128, vk),
	_mm_mul_ps(m[1].mVec128, vk),
	_mm_mul_ps(m[2].mVec128, vk));
#elif defined(BT_USE_NEON)
return btMatrix3x3(
	vmulq_n_f32(m[0].mVec128, k),
	vmulq_n_f32(m[1].mVec128, k),
	vmulq_n_f32(m[2].mVec128, k));
#else
return btMatrix3x3(
	m[0].x() * k, m[0].y() * k, m[0].z() * k,
	m[1].x() * k, m[1].y() * k, m[1].z() * k,
	m[2].x() * k, m[2].y() * k, m[2].z() * k);
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
return btMatrix3x3(
	m1[0].mVec128 + m2[0].mVec128,
	m1[1].mVec128 + m2[1].mVec128,
	m1[2].mVec128 + m2[2].mVec128);
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
return btMatrix3x3(
	m1[0].mVec128 - m2[0].mVec128,
	m1[1].mVec128 - m2[1].mVec128,
	m1[2].mVec128 - m2[2].mVec128);
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
return btTriple((*this)[0], (*this)[1], (*this)[2]);
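// determinant() as the scalar triple product row0 . (row1 x row2), which
// equals the usual cofactor expansion of a 3x3 determinant.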
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
return btMatrix3x3(
	_mm_and_ps(m_el[0].mVec128, btvAbsfMask),
	_mm_and_ps(m_el[1].mVec128, btvAbsfMask),
	_mm_and_ps(m_el[2].mVec128, btvAbsfMask));
#elif defined(BT_USE_NEON)
return btMatrix3x3(
	(float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
	(float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
	(float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 v0 = m_el[0].mVec128;
__m128 v1 = m_el[1].mVec128;
__m128 v2 = m_el[2].mVec128;  // x2 y2 z2 w2
__m128 vT;

v2 = _mm_and_ps(v2, btvFFF0fMask);  // x2 y2 z2 0

vT = _mm_unpackhi_ps(v0, v1);  // z0 z1 * *
v0 = _mm_unpacklo_ps(v0, v1);  // x0 x1 y0 y1

v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));  // y0 y1 y2 0
v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));  // x0 x1 x2 0
v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0
#elif defined(BT_USE_NEON)
static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2 0}, {y2 0}
float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);
return btMatrix3x3(
	cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
	cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
	cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
btScalar det = (*this)[0].dot(co);
btAssert(det != btScalar(0.0));
btScalar s = btScalar(1.0) / det;
return btMatrix3x3(co.x() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
	co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
	co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
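// inverse() is the adjugate divided by the determinant; co holds the first
// column of cofactors, so det = row0 . co is exactly the cofactor expansion
// already used by determinant(). For the identity matrix this yields
// co = (1, 0, 0), det = 1, and the identity back again.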
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 row = m_el[0].mVec128;
__m128 m0 = _mm_and_ps(m.getRow(0).mVec128, btvFFF0fMask);
__m128 m1 = _mm_and_ps(m.getRow(1).mVec128, btvFFF0fMask);
__m128 m2 = _mm_and_ps(m.getRow(2).mVec128, btvFFF0fMask);
__m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
__m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
__m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
row = m_el[1].mVec128;
r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
row = m_el[2].mVec128;
r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
#elif defined BT_USE_NEON
static const uint32x4_t xyzMask = (const uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0};
float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask);
float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask);
float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask);
float32x4_t row = m_el[0].mVec128;
float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0);
float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1);
float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0);
row = m_el[1].mVec128;
r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0);
r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1);
r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0);
row = m_el[2].mVec128;
r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0);
r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1);
r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0);
return btMatrix3x3(
	m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
	m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
	m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
	m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
	m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
	m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
	m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
	m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
	m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
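// transposeTimes(m) returns this^T * m: entry (i, j) is the dot product of
// column i of this with column j of m, which is why every term above indexes
// a fixed component across all three rows.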
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 a0 = m_el[0].mVec128;
__m128 a1 = m_el[1].mVec128;
__m128 a2 = m_el[2].mVec128;

btMatrix3x3 mT = m.transpose();  // transpose() already zeroes the w channel

__m128 mx = mT[0].mVec128;
__m128 my = mT[1].mVec128;
__m128 mz = mT[2].mVec128;

__m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
__m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
__m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
#elif defined BT_USE_NEON
float32x4_t a0 = m_el[0].mVec128;
float32x4_t a1 = m_el[1].mVec128;
float32x4_t a2 = m_el[2].mVec128;

btMatrix3x3 mT = m.transpose();  // transpose() already zeroes the w channel

float32x4_t mx = mT[0].mVec128;
float32x4_t my = mT[1].mVec128;
float32x4_t mz = mT[2].mVec128;

float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0);
float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0);
float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0);
r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1);
r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1);
r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1);
r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0);
r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0);
r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0);
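// timesTranspose(m) is this * m^T; both SIMD paths transpose m once and then
// reuse the row-broadcast multiply-accumulate pattern from operator*=. As a
// quick check, for a rotation matrix R, R.timesTranspose(R) should come back
// as the identity.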
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
return v.dot3(m[0], m[1], m[2]);
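// m * v: dot3 dots v against each row at once, giving
// (m[0].dot(v), m[1].dot(v), m[2].dot(v)). The operator below is the
// row-vector form v * m, equivalent to m.transpose() * v.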
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
const __m128 vv = v.mVec128;

__m128 c0 = bt_splat_ps(vv, 0);
__m128 c1 = bt_splat_ps(vv, 1);
__m128 c2 = bt_splat_ps(vv, 2);

c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask));
c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask));
c0 = _mm_add_ps(c0, c1);
c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask));
#elif defined(BT_USE_NEON)
const float32x4_t vv = v.mVec128;
const float32x2_t vlo = vget_low_f32(vv);
const float32x2_t vhi = vget_high_f32(vv);

float32x4_t c0, c1, c2;

c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

c0 = vmulq_lane_f32(c0, vlo, 0);
c1 = vmulq_lane_f32(c1, vlo, 1);
c2 = vmulq_lane_f32(c2, vhi, 0);
c0 = vaddq_f32(c0, c1);
c0 = vaddq_f32(c0, c2);
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 m10 = m1[0].mVec128;
__m128 m11 = m1[1].mVec128;
__m128 m12 = m1[2].mVec128;

__m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);

__m128 c0 = bt_splat_ps(m10, 0);
__m128 c1 = bt_splat_ps(m11, 0);
__m128 c2 = bt_splat_ps(m12, 0);

c0 = _mm_mul_ps(c0, m2v);
c1 = _mm_mul_ps(c1, m2v);
c2 = _mm_mul_ps(c2, m2v);

m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);

__m128 c0_1 = bt_splat_ps(m10, 1);
__m128 c1_1 = bt_splat_ps(m11, 1);
__m128 c2_1 = bt_splat_ps(m12, 1);

c0_1 = _mm_mul_ps(c0_1, m2v);
c1_1 = _mm_mul_ps(c1_1, m2v);
c2_1 = _mm_mul_ps(c2_1, m2v);

m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);

c0 = _mm_add_ps(c0, c0_1);
c1 = _mm_add_ps(c1, c1_1);
c2 = _mm_add_ps(c2, c2_1);

m10 = bt_splat_ps(m10, 2);
m11 = bt_splat_ps(m11, 2);
m12 = bt_splat_ps(m12, 2);

m10 = _mm_mul_ps(m10, m2v);
m11 = _mm_mul_ps(m11, m2v);
m12 = _mm_mul_ps(m12, m2v);

c0 = _mm_add_ps(c0, m10);
c1 = _mm_add_ps(c1, m11);
c2 = _mm_add_ps(c2, m12);
#elif defined(BT_USE_NEON)
float32x4_t rv0, rv1, rv2;
float32x4_t v0, v1, v2;
float32x4_t mv0, mv1, mv2;

v0 = m1[0].mVec128;
v1 = m1[1].mVec128;
v2 = m1[2].mVec128;

mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);

rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
__m128 c0, c1, c2;

c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);

c0 = _mm_and_ps(c0, c1);
c0 = _mm_and_ps(c0, c2);

return (0x7 == (0x7 & _mm_movemask_ps(c0)));  // compare only the x, y and z lanes; the w lane may hold garbage
return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
	m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
	m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]);
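// Note this is exact floating-point equality on all nine elements, suited to
// identity/caching checks rather than tolerance-based comparison.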
// per-row (de)serialization loops (serialize, serializeFloat, deSerialize,
// deSerializeFloat, deSerializeDouble): each delegates to the matching
// btVector3 routine on m_el[i].
for (int i = 0; i < 3; i++)
	m_el[i].serialize(dataOut.m_el[i]);

for (int i = 0; i < 3; i++)
	m_el[i].serializeFloat(dataOut.m_el[i]);

for (int i = 0; i < 3; i++)
	m_el[i].deSerialize(dataIn.m_el[i]);

for (int i = 0; i < 3; i++)
	m_el[i].deSerializeFloat(dataIn.m_el[i]);

for (int i = 0; i < 3; i++)
	m_el[i].deSerializeDouble(dataIn.m_el[i]);
#endif  //BT_MATRIX3x3_H