// Pick the vector data struct and its type-name string to match the scalar
// precision: the double variant when BT_USE_DOUBLE_PRECISION is defined,
// the float variant otherwise. Exactly one pair of definitions is active;
// without the #else the double build would redefine both macros and the
// float build would define neither.
// NOTE(review): presumably consumed by serialization code — not visible here.
#ifdef BT_USE_DOUBLE_PRECISION
#define btVector3Data btVector3DoubleData
#define btVector3DataName "btVector3DoubleData"
#else
#define btVector3Data btVector3FloatData
#define btVector3DataName "btVector3FloatData"
#endif //BT_USE_DOUBLE_PRECISION
33 #if defined BT_USE_SSE
38 #pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
// Build the immediate for _mm_shuffle_ps from four 2-bit selectors:
// x occupies bits 0-1, y bits 2-3, z bits 4-5 and w bits 6-7.
// The arguments are shifted but never masked, so a selector larger than 3
// spills into the neighbouring fields (and past bit 7 for w).
42 #define BT_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x))
// Shuffle one register against itself: _a is supplied as both source
// operands of _mm_shuffle_ps and _mask is the selector immediate.
44 #define bt_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) )
// Selector _i is used for the first three fields, selector 3 for the fourth.
45 #define bt_splat3_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i, 3) )
// Selector _i is used for all four fields.
// NOTE(review): _i goes through BT_SHUFFLE, so it is a 2-bit selector, not a
// ready-made mask. Code in this file calls bt_splat_ps(..., 0x80), which
// expands to an immediate wider than 8 bits — confirm whether
// bt_pshufd_ps(..., 0x80) was intended at those call sites.
46 #define bt_splat_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i,_i) )
// Integer bit masks for the SSE path. Each is a macro, so every use expands
// to its own _mm_set_epi32(...) expression. _mm_set_epi32 lists its arguments
// from the highest 32-bit element down to the lowest, so the first argument
// below is the fourth lane.
// 0x7FFFFFFF in the three low lanes, 0 in the fourth: ANDing clears bit 31
// (the float sign bit) of the first three lanes and zeroes the fourth.
48 #define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// 0x7FFFFFFF in all four lanes: ANDing clears bit 31 of every lane.
49 #define btvAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// All ones in the three low lanes, 0 in the fourth: ANDing keeps the first
// three lanes unchanged and zeroes the fourth.
50 #define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
// Float-typed views of the masks above, for use with the _ps intrinsics.
// NOTE(review): btCastiTo128f is defined outside this excerpt; presumably a
// bitwise reinterpretation from the integer to the float vector type — confirm.
51 #define btv3AbsfMask btCastiTo128f(btv3AbsiMask)
52 #define btvFFF0fMask btCastiTo128f(btvFFF0Mask)
// Alias of btvFFF0fMask.
53 #define btvxyzMaskf btvFFF0fMask
54 #define btvAbsfMask btCastiTo128f(btvAbsMask)
// Constant lane masks for the NEON path.
// NOTE(review): ATTRIBUTE_ALIGNED16 is defined outside this excerpt; the name
// suggests 16-byte alignment of the declared object — confirm. btvFFF0Mask and
// btvAbsMask share their names with the SSE macros, so the two sets are
// presumably in mutually exclusive preprocessor branches (guard not visible).
// -0.0f in every lane; elsewhere in this file it is XORed with a vector.
67 const float32x4_t
ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
// All ones in the first three lanes, 0 in the fourth; ANDed with results
// elsewhere in this file, which zeroes the fourth lane.
68 const int32x4_t
ATTRIBUTE_ALIGNED16(btvFFF0Mask) = (int32x4_t){0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
// 0x7FFFFFFF in every lane (every bit except bit 31).
69 const int32x4_t
ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
// 0x7FFFFFFF in the first three lanes, 0 in the fourth.
70 const int32x4_t
ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};
84 #if defined (__SPU__) && defined (__CELLOS_LV2__)
92 #else //__CELLOS_LV2__ __SPU__
93 #if defined (BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM
109 #endif //__CELLOS_LV2__ __SPU__
134 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) )|| defined (BT_USE_NEON)
144 mVec128 = rhs.mVec128;
155 #endif // (BT_USE_SSE_IN_API && BT_USE_SSE) || BT_USE_NEON
161 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
162 mVec128 = _mm_add_ps(mVec128, v.mVec128);
163 #elif defined(BT_USE_NEON)
164 mVec128 = vaddq_f32(mVec128, v.mVec128);
178 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
179 mVec128 = _mm_sub_ps(mVec128, v.mVec128);
180 #elif defined(BT_USE_NEON)
181 mVec128 = vsubq_f32(mVec128, v.mVec128);
194 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
195 __m128 vs = _mm_load_ss(&s);
196 vs = bt_pshufd_ps(vs, 0x80);
197 mVec128 = _mm_mul_ps(mVec128, vs);
198 #elif defined(BT_USE_NEON)
199 mVec128 = vmulq_n_f32(mVec128, s);
214 #if 0 //defined(BT_USE_SSE_IN_API)
216 __m128 vs = _mm_load_ss(&s);
217 vs = _mm_div_ss(v1110, vs);
218 vs = bt_pshufd_ps(vs, 0x00);
220 mVec128 = _mm_mul_ps(mVec128, vs);
232 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
233 __m128 vd = _mm_mul_ps(mVec128, v.mVec128);
234 __m128 z = _mm_movehl_ps(vd, vd);
235 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
236 vd = _mm_add_ss(vd, y);
237 vd = _mm_add_ss(vd, z);
238 return _mm_cvtss_f32(vd);
239 #elif defined(BT_USE_NEON)
240 float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
241 float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));
242 x = vadd_f32(x, vget_high_f32(vd));
243 return vget_lane_f32(x, 0);
245 return m_floats[0] * v.
m_floats[0] +
274 int maxIndex = absVec.
maxAxis();
275 if (absVec[maxIndex]>0)
277 *
this /= absVec[maxIndex];
288 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
290 __m128 vd = _mm_mul_ps(mVec128, mVec128);
291 __m128 z = _mm_movehl_ps(vd, vd);
292 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
293 vd = _mm_add_ss(vd, y);
294 vd = _mm_add_ss(vd, z);
297 vd = _mm_sqrt_ss(vd);
298 vd = _mm_div_ss(v1110, vd);
299 vd = bt_splat_ps(vd, 0x80);
300 mVec128 = _mm_mul_ps(mVec128, vd);
304 y = _mm_rsqrt_ss(vd);
308 vd = _mm_mul_ss(vd, vHalf);
310 vd = _mm_mul_ss(vd, y);
311 vd = _mm_mul_ss(vd, y);
312 z = _mm_sub_ss(z, vd);
314 y = _mm_mul_ss(y, z);
316 y = bt_splat_ps(y, 0x80);
317 mVec128 = _mm_mul_ps(mVec128, y);
349 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
350 return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
351 #elif defined(BT_USE_NEON)
365 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
368 T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3));
369 V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3));
371 V = _mm_mul_ps(V, mVec128);
372 T = _mm_mul_ps(T, v.mVec128);
373 V = _mm_sub_ps(V, T);
375 V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3));
377 #elif defined(BT_USE_NEON)
380 float32x2_t Tlow = vget_low_f32(mVec128);
381 float32x2_t Vlow = vget_low_f32(v.mVec128);
382 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
383 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
385 V = vmulq_f32(V, mVec128);
386 T = vmulq_f32(T, v.mVec128);
388 Vlow = vget_low_f32(V);
390 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
391 V = (float32x4_t)vandq_s32((int32x4_t)
V, btvFFF0Mask);
404 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
406 __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3));
407 __m128
V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3));
409 V = _mm_mul_ps(V, v1.mVec128);
410 T = _mm_mul_ps(T, v2.mVec128);
411 V = _mm_sub_ps(V, T);
413 V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3));
416 V = _mm_mul_ps(V, mVec128);
417 __m128 z = _mm_movehl_ps(V, V);
418 __m128 y = _mm_shuffle_ps(V, V, 0x55);
419 V = _mm_add_ss(V, y);
420 V = _mm_add_ss(V, z);
421 return _mm_cvtss_f32(V);
423 #elif defined(BT_USE_NEON)
427 float32x2_t Tlow = vget_low_f32(v1.mVec128);
428 float32x2_t Vlow = vget_low_f32(v2.mVec128);
429 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
430 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
432 V = vmulq_f32(V, v1.mVec128);
433 T = vmulq_f32(T, v2.mVec128);
435 Vlow = vget_low_f32(V);
437 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
440 V = vmulq_f32(mVec128, V);
441 float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
442 x = vadd_f32(x, vget_high_f32(V));
443 return vget_lane_f32(x, 0);
456 return m_floats[0] < m_floats[1] ? (m_floats[0] <m_floats[2] ? 0 : 2) : (m_floats[1] <m_floats[2] ? 1 : 2);
463 return m_floats[0] < m_floats[1] ? (m_floats[1] <m_floats[2] ? 2 : 1) : (m_floats[0] <m_floats[2] ? 2 : 0);
468 return absolute().minAxis();
473 return absolute().maxAxis();
479 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
480 __m128 vrt = _mm_load_ss(&rt);
482 __m128 vs = _mm_load_ss(&s);
483 vs = bt_pshufd_ps(vs, 0x80);
484 __m128 r0 = _mm_mul_ps(v0.mVec128, vs);
485 vrt = bt_pshufd_ps(vrt, 0x80);
486 __m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
487 __m128 tmp3 = _mm_add_ps(r0,r1);
489 #elif defined(BT_USE_NEON)
490 mVec128 = vsubq_f32(v1.mVec128, v0.mVec128);
491 mVec128 = vmulq_n_f32(mVec128, rt);
492 mVec128 = vaddq_f32(mVec128, v0.mVec128);
508 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
509 __m128 vt = _mm_load_ss(&t);
510 vt = bt_pshufd_ps(vt, 0x80);
511 __m128 vl = _mm_sub_ps(v.mVec128, mVec128);
512 vl = _mm_mul_ps(vl, vt);
513 vl = _mm_add_ps(vl, mVec128);
516 #elif defined(BT_USE_NEON)
517 float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
518 vl = vmulq_n_f32(vl, t);
519 vl = vaddq_f32(vl, mVec128);
525 m_floats[1] + (v.
m_floats[1] - m_floats[1]) * t,
526 m_floats[2] + (v.
m_floats[2] - m_floats[2]) * t);
534 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
535 mVec128 = _mm_mul_ps(mVec128, v.mVec128);
536 #elif defined(BT_USE_NEON)
537 mVec128 = vmulq_f32(mVec128, v.mVec128);
577 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
578 return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
580 return ((m_floats[3]==other.
m_floats[3]) &&
589 return !(*
this == other);
597 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
598 mVec128 = _mm_max_ps(mVec128, other.mVec128);
599 #elif defined(BT_USE_NEON)
600 mVec128 = vmaxq_f32(mVec128, other.mVec128);
614 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
615 mVec128 = _mm_min_ps(mVec128, other.mVec128);
616 #elif defined(BT_USE_NEON)
617 mVec128 = vminq_f32(mVec128, other.mVec128);
636 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
638 __m128
V = _mm_and_ps(mVec128, btvFFF0fMask);
639 __m128 V0 = _mm_xor_ps(btvMzeroMask, V);
640 __m128 V2 = _mm_movelh_ps(V0, V);
642 __m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
644 V0 = _mm_shuffle_ps(V0, V, 0xDB);
645 V2 = _mm_shuffle_ps(V2, V, 0xF9);
659 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
660 mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
661 #elif defined(BT_USE_NEON)
662 int32x4_t vi = vdupq_n_s32(0);
663 mVec128 = vreinterpretq_f32_s32(vi);
706 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
708 __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
709 __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
710 __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 );
711 __m128 b0 = _mm_unpacklo_ps( a0, a1 );
712 __m128 b1 = _mm_unpackhi_ps( a0, a1 );
713 __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() );
714 __m128 r = _mm_movelh_ps( b0, b2 );
715 r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 ));
716 a2 = _mm_and_ps( a2, btvxyzMaskf);
717 r = _mm_add_ps( r, btCastdTo128f (_mm_move_sd( btCastfTo128d(a2), btCastfTo128d(b1) )));
720 #elif defined(BT_USE_NEON)
721 static const uint32x4_t xyzMask = (
const uint32x4_t){ -1, -1, -1, 0 };
722 float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
723 float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
724 float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
725 float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1));
726 a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask );
727 float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] );
728 float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
729 return btVector3( vcombine_f32(b0, b1) );
740 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
741 return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
742 #elif defined(BT_USE_NEON)
743 return btVector3(vaddq_f32(v1.mVec128, v2.mVec128));
756 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
757 return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
758 #elif defined(BT_USE_NEON)
759 return btVector3(vmulq_f32(v1.mVec128, v2.mVec128));
772 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
775 __m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
776 return btVector3(_mm_and_ps(r, btvFFF0fMask));
777 #elif defined(BT_USE_NEON)
778 float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
779 return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
792 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
793 __m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
794 return btVector3(_mm_and_ps(r, btvFFF0fMask));
795 #elif defined(BT_USE_NEON)
796 return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask));
806 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
807 __m128 vs = _mm_load_ss(&s);
808 vs = bt_pshufd_ps(vs, 0x80);
809 return btVector3(_mm_mul_ps(v.mVec128, vs));
810 #elif defined(BT_USE_NEON)
811 float32x4_t r = vmulq_n_f32(v.mVec128, s);
812 return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
830 #if 0 //defined(BT_USE_SSE_IN_API)
832 __m128 vs = _mm_load_ss(&s);
833 vs = _mm_div_ss(v1110, vs);
834 vs = bt_pshufd_ps(vs, 0x00);
836 return btVector3(_mm_mul_ps(v.mVec128, vs));
846 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE))
847 __m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
848 vec = _mm_and_ps(vec, btvFFF0fMask);
850 #elif defined(BT_USE_NEON)
851 float32x4_t x, y, v, m;
857 m = vrecpsq_f32(y, v);
859 m = vrecpsq_f32(y, v);
922 return v1.
lerp(v2, t);
929 return (v - *
this).length2();
934 return (v - *
this).length();
939 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
952 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
954 __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
956 __m128 C = wAxis.
cross( mVec128 ).mVec128;
957 O = _mm_and_ps(O, btvFFF0fMask);
960 __m128 vsin = _mm_load_ss(&ssin);
961 __m128 vcos = _mm_load_ss(&scos);
963 __m128 Y = bt_pshufd_ps(O, 0xC9);
964 __m128 Z = bt_pshufd_ps(O, 0xD2);
965 O = _mm_add_ps(O, Y);
966 vsin = bt_pshufd_ps(vsin, 0x80);
967 O = _mm_add_ps(O, Z);
968 vcos = bt_pshufd_ps(vcos, 0x80);
971 O = O * wAxis.mVec128;
972 __m128 X = mVec128 - O;
984 _y = wAxis.
cross( *
this );
986 return ( o + _x *
btCos( _angle ) + _y *
btSin( _angle ) );
992 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
993 #if defined _WIN32 || defined (BT_USE_SSE)
994 const long scalar_cutoff = 10;
995 long _maxdot_large(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
996 #elif defined BT_USE_NEON
997 const long scalar_cutoff = 4;
998 extern long (*_maxdot_large)(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1000 if( array_count < scalar_cutoff )
1006 for( i = 0; i < array_count; i++ )
1020 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1021 return _maxdot_large( (
float*) array, (
float*) &
m_floats[0], array_count, &dotOut );
1027 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1028 #if defined BT_USE_SSE
1029 const long scalar_cutoff = 10;
1030 long _mindot_large(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1031 #elif defined BT_USE_NEON
1032 const long scalar_cutoff = 4;
1033 extern long (*_mindot_large)(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1035 #error unhandled arch!
1038 if( array_count < scalar_cutoff )
1045 for( i = 0; i < array_count; i++ )
1060 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1061 return _mindot_large( (
float*) array, (
float*) &
m_floats[0], array_count, &dotOut );
1062 #endif//BT_USE_SIMD_VECTOR3
1079 #if (defined (BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined (BT_USE_NEON)
1087 mVec128 = rhs.mVec128;
1093 mVec128 = v.mVec128;
1096 #endif // (BT_USE_SSE_IN_API && BT_USE_SSE) || BT_USE_NEON
1100 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
1101 return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
1102 #elif defined(BT_USE_NEON)
1218 #ifdef BT_USE_DOUBLE_PRECISION
1219 unsigned char* dest = (
unsigned char*) &destVal;
1220 unsigned char* src = (
unsigned char*) &sourceVal;
1230 unsigned char* dest = (
unsigned char*) &destVal;
1231 unsigned char* src = (
unsigned char*) &sourceVal;
1236 #endif //BT_USE_DOUBLE_PRECISION
1241 for (
int i=0;i<4;i++)
1253 for (
int i=0;i<4;i++)
1257 vector = swappedVec;
1265 btScalar a = n[1]*n[1] + n[2]*n[2];
1277 btScalar a = n[0]*n[0] + n[1]*n[1];
1304 for (
int i=0;i<4;i++)
1310 for (
int i=0;i<4;i++)
1318 for (
int i=0;i<4;i++)
1324 for (
int i=0;i<4;i++)
1332 for (
int i=0;i<4;i++)
1338 for (
int i=0;i<4;i++)
1342 #endif //BT_VECTOR3_H