//--------------------------------------------------------------------------------------
// File: vmath.h
//--------------------------------------------------------------------------------------
#ifndef __VMATH__
#define __VMATH__

#include <emmintrin.h> // SSE/SSE2 intrinsics: __m128, __m128i, __m128d, _mm_*_ps
///////////////////////////////////////////////////////////////////////////////
// This example shows the fastest SIMD design for vector calculations:
// 1 - No structured data; atomic datatypes are used instead
// 2 - Everything is inline
// 3 - No overloaded operators
// 4 - Data is passed and returned by value
// 5 - For single 32-bit results (such as a dot product), the result is
//     replicated into all 4 slots, allowing math expression optimizations
///////////////////////////////////////////////////////////////////////////////
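///////////////////////////////////////////////////////////////////////////////
// A minimal usage sketch (illustrative only; pPos, pVel and dt are
// hypothetical caller-side data, not part of this header):
//
//     Vec4 pos = VMATH::VLoad(pPos);   // pPos, pVel: 16-byte aligned float[4]
//     Vec4 vel = VMATH::VLoad(pVel);
//     pos = VMATH::VAdd(pos, VMATH::VMul(vel, VMATH::VReplicate(dt)));
//     VMATH::VStore(pPos, pos);
//
// Because every function is inline and takes/returns Vec4 by value, an
// expression like this can stay entirely in XMM registers.
///////////////////////////////////////////////////////////////////////////////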
namespace VMATH
{
typedef __m128 Vec4;
///////////////////////////////////////////
//BEGIN:hacked from XNAMath to show extra load
///////////////////////////////////////////
typedef __declspec(align(16)) struct Vec4F32 {
    union {
        float f[4];
        Vec4 v;
    };
#if defined(__cplusplus)
    inline operator Vec4() const { return v; }
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_)
    inline operator __m128i() const { return reinterpret_cast<const __m128i *>(&v)[0]; }
    inline operator __m128d() const { return reinterpret_cast<const __m128d *>(&v)[0]; }
#endif
#endif // __cplusplus
} Vec4F32;
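// g_XMOne is a constant that lives in memory, so VReciprocal below (and the
// VDivSlow built on it) pays for a load of this constant on every call; that
// is the "extra load" referred to in the section comments.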
const Vec4F32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f};
inline Vec4 VReciprocal(Vec4 va)
{
    return _mm_div_ps(g_XMOne, va);
}
///////////////////////////////////////////
//END:hacked from XNAMath to show extra load
///////////////////////////////////////////
inline Vec4 VLoad(float *pVec)
{
    return(_mm_load_ps(pVec));
}
inline Vec4 VLoad(float f)
{
    return(_mm_set_ps(f, f, f, f));
}
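// VReplicate is equivalent to the VLoad(float) overload above:
// _mm_set_ps1 broadcasts a single float into all four lanes.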
inline Vec4 VReplicate(float f)
{
    return _mm_set_ps1(f);
}
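// Note: _mm_set_ps(x, y, z, w) places its first argument in the highest lane
// and its last argument in the lowest, i.e. the reverse of what _mm_load_ps
// produces when reading {x, y, z, w} from memory.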
inline Vec4 VLoad(float x, float y, float z, float w)
{
    return(_mm_set_ps(x, y, z, w));
}
inline Vec4 VAdd(Vec4 va, Vec4 vb)
{
    return(_mm_add_ps(va, vb));
}
inline void VAddSlow(Vec4& vr, Vec4 va, Vec4 vb)
{
    vr = _mm_add_ps(va, vb);
}
inline Vec4 VSub(Vec4 va, Vec4 vb)
{
    return(_mm_sub_ps(va, vb));
}
inline Vec4 VMul(Vec4 va, Vec4 vb)
{
    return(_mm_mul_ps(va, vb));
}
inline Vec4 VDiv(Vec4 va, Vec4 vb)
{
    return(_mm_div_ps(va, vb));
}
//slower div from XNAMath using extra load
inline Vec4 VDivSlow(Vec4 va, Vec4 vb)
{
    Vec4 InvV = VReciprocal(vb);
    return VMul(va, InvV);
}
inline void VStore(float *pVec, Vec4 v)
{
    _mm_store_ps(pVec, v);
}
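// VBc broadcasts lane 3 (pVec[3] after a VLoad from memory) into all four lanes.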
inline Vec4 VBc(Vec4 v)
{
    return(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3,3,3,3)));
}
//////////////////////////////////////////////////////////////////////////////
// Fast SSE2 4-dimensional dot product by Gustavo M. Oliveira.
// Feel free to use for commercial or non-commercial applications,
// as long as this header is maintained.
//
// For more information on how to write and use high-performance
// SIMD libraries, visit:
// http://www.guitarv.com/ComputerScience.aspx?page=articles
//
// The author assumes NO RESPONSIBILITY and/or LIABILITY for any problems
// caused by the use of this software. Use it at your own risk.
//////////////////////////////////////////////////////////////////////////////
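// Dot multiplies the two vectors lane-wise, then does a horizontal add with
// two shuffle+add passes: the first shuffle swaps the upper and lower 64-bit
// halves, the second swaps neighbouring lanes, so the final sum ends up
// replicated in all four lanes (design point 5 above). The normalization
// sketch at the end of this header shows how that replication is used.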
inline Vec4 Dot(Vec4 va, Vec4 vb)
{
    Vec4 t0 = _mm_mul_ps(va, vb);
    Vec4 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2));
    Vec4 t2 = _mm_add_ps(t0, t1);
    Vec4 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1));
    Vec4 dot = _mm_add_ps(t3, t2);
    return (dot);
}
inline Vec4 Sqrt(Vec4 v)
{
    return(_mm_sqrt_ps(v));
}
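// GetX stores lane 0 (the lowest element) of v to the float pointed to by p.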
inline void GetX(float *p, Vec4 v)
{
    _mm_store_ss(p, v);
}
inline Vec4 Reflect(Vec4 Incident, Vec4 Normal)
{
    // Result = Incident - (2 * dot(Incident, Normal)) * Normal
    Vec4 Result = Dot(Incident, Normal);   // dot(I, N) replicated in all lanes
    Result = _mm_add_ps(Result, Result);   // 2 * dot(I, N)
    Result = _mm_mul_ps(Result, Normal);   // (2 * dot(I, N)) * N
    Result = _mm_sub_ps(Incident, Result); // I - (2 * dot(I, N)) * N
    return Result;
}
////////////////////////////////////////////////////////////////////////////////
// Overloaded operators, left here just as a reference.
// WARNING: This bloats the code as expressions grow
////////////////////////////////////////////////////////////////////////////////
inline Vec4 operator+(Vec4 va, Vec4 vb)
{
    return(VAdd(va, vb));
}
inline Vec4& operator+= (Vec4& va, Vec4 vb)
{
    va = VAdd(va, vb);
    return (va);
}
inline Vec4 operator-(Vec4 va, Vec4 vb)
{
    return(VSub(va, vb));
}
inline Vec4& operator-= (Vec4& va, Vec4 vb)
{
    va = VSub(va, vb);
    return (va);
}
inline Vec4 operator*(Vec4 va, Vec4 vb)
{
    return(VMul(va, vb));
}
inline Vec4& operator*= (Vec4& va, Vec4 vb)
{
    va = VMul(va, vb);
    return (va);
}
inline Vec4 operator/(Vec4 va, Vec4 vb)
{
    return(VDiv(va, vb));
}
inline Vec4& operator/= (Vec4& va, Vec4 vb)
{
    va = VDiv(va, vb);
    return (va);
}
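////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch (not part of the original API): normalizing a vector
// with the primitives above. Because Dot() already replicates the squared
// length into all four lanes, no extra broadcast is needed before the divide.
////////////////////////////////////////////////////////////////////////////////
inline Vec4 VNormalizeExample(Vec4 v)
{
    Vec4 lenSq = Dot(v, v);   // |v|^2 replicated in all four lanes
    Vec4 len = Sqrt(lenSq);   // |v| replicated in all four lanes
    return VDiv(v, len);      // v / |v|
}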
}
#endif // #ifndef __VMATH__