-
Notifications
You must be signed in to change notification settings - Fork 0
/
sse3.cpp
130 lines (112 loc) · 4.33 KB
/
sse3.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#include "fractals-simd.h"
#include <pmmintrin.h>
#include <cmath>
#define ABI_SSE3 __attribute__((target ("sse3")))
using namespace fractals;
using vf = __m128;
using vi = __m128i;
struct vcomplex
{
vf real;
vf imag;
};
template <typename Init, typename Kernel>
static ABI_SSE3 Buffer loop(const Resolution& resolution,
const Viewport& viewport,
uint32_t iterations,
Init init,
Kernel kernel)
{
const vf epsilon = _mm_set1_ps(4.0f);
const auto dx = float(viewport.width()) / resolution.width();
const auto dy = float(viewport.height()) / resolution.height();
auto output = Buffer(resolution.width(), resolution.height(), sizeof(vi) / sizeof(uint32_t));
vcomplex input;
vcomplex prev;
for (uint32_t y = 0; y < resolution.height(); ++y)
{
auto* result_ptr = output.line<vi>(y);
input.imag = _mm_set1_ps(viewport.top() - y * dy);
for (uint32_t x = 0; x < resolution.width(); x += sizeof(vi) / sizeof(uint32_t))
{
input.real = _mm_set_ps(
viewport.left() + (x + 3) * dx,
viewport.left() + (x + 2) * dx,
viewport.left() + (x + 1) * dx,
viewport.left() + (x + 0) * dx
);
auto result = _mm_set1_epi32(iterations);
auto result_mask = _mm_set1_epi32(0);
prev = init(input);
for (uint32_t iteration = 0; iteration < iterations; ++iteration)
{
prev = kernel(prev, input);
auto rrii = _mm_add_ps(_mm_mul_ps(prev.real, prev.real), _mm_mul_ps(prev.imag, prev.imag));
auto mask = _mm_andnot_si128(result_mask, _mm_castps_si128(_mm_cmpgt_ps(rrii, epsilon)));
auto counter = _mm_set1_epi32(iteration);
result = _mm_or_si128(_mm_andnot_si128(mask, result), _mm_and_si128(counter, mask));
result_mask = _mm_or_si128(result_mask, mask);
}
*result_ptr++ = result;
}
}
return output;
};
Buffer fractals::mandelbrot(const Resolution& resolution, const Viewport& viewport, uint32_t iterations)
{
return loop(resolution, viewport, iterations,
[](const vcomplex& input) ABI_SSE3
{
return vcomplex{
_mm_set1_ps(0.0f),
_mm_set1_ps(0.0f)
};
},
[](const vcomplex& prev, const vcomplex& input) ABI_SSE3
{
return vcomplex{
_mm_add_ps(_mm_sub_ps(_mm_mul_ps(prev.real, prev.real), _mm_mul_ps(prev.imag, prev.imag)), input.real),
_mm_add_ps(_mm_mul_ps(_mm_set1_ps(2), _mm_mul_ps(prev.real, prev.imag)), input.imag)
};
}
);
}
Buffer fractals::burning_ship(const Resolution& resolution, const Viewport& viewport, uint32_t iterations)
{
return loop(resolution, viewport, iterations,
[](const vcomplex& input) ABI_SSE3
{
return vcomplex{
_mm_set1_ps(0.0f),
_mm_set1_ps(0.0f)
};
},
[](const vcomplex& prev, const vcomplex& input) ABI_SSE3
{
vcomplex prev_abs{
_mm_and_ps(prev.real, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))),
_mm_and_ps(prev.imag, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
};
return vcomplex{
_mm_add_ps(_mm_sub_ps(_mm_mul_ps(prev_abs.real, prev_abs.real), _mm_mul_ps(prev_abs.imag, prev_abs.imag)), input.real),
_mm_add_ps(_mm_mul_ps(_mm_set1_ps(2), _mm_mul_ps(prev_abs.real, prev_abs.imag)), input.imag)
};
}
);
}
Buffer fractals::julia_set(const Resolution& resolution, const Viewport& viewport, uint32_t iterations, const std::complex<float>& c)
{
return loop(resolution, viewport, iterations,
[](const vcomplex& input) ABI_SSE3
{
return input;
},
[c](const vcomplex& prev, const vcomplex&) ABI_SSE3
{
return vcomplex{
_mm_add_ps(_mm_sub_ps(_mm_mul_ps(prev.real, prev.real), _mm_mul_ps(prev.imag, prev.imag)), _mm_set1_ps(c.real())),
_mm_add_ps(_mm_mul_ps(_mm_set1_ps(2), _mm_mul_ps(prev.real, prev.imag)), _mm_set1_ps(c.imag()))
};
}
);
}