/* This file is part of vcube.
*
* Copyright (C) 2018 Andrew Skalski
*
* vcube is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* vcube is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with vcube. If not, see <https://www.gnu.org/licenses/>.
*/

#ifndef VCUBE_AVX2_CUBE_H
#define VCUBE_AVX2_CUBE_H

#include <stdint.h>
#include <x86intrin.h>

#include "types.h"
#include "util.h"

namespace vcube {

/* constexpr redeclarations of intrinsics */
namespace cx {
	extern inline constexpr __m256i mm256_set_epi64x( // avx
			int64_t a, int64_t b, int64_t c, int64_t d)
	{
		return (__m256i) { d, c, b, a };
	}
}
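
/* Layout note (inferred from identity, literal() and the accessors below;
 * not stated explicitly in this header): the 32-byte cube vector keeps the
 * 12 edge cubies in bytes 0-11 and the 8 corner cubies in bytes 16-23, with
 * the remaining bytes fixed filler equal to their own index.  Each byte
 * holds the piece index in its low nibble and the orientation in the high
 * nibble (0x10 = flipped edge; 0x00/0x10/0x20 = corner twist).
 */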

struct avx2 {
	static constexpr __m256i identity = cx::mm256_set_epi64x(
			0x0f0e0d0c0b0a0908, 0x0706050403020100,
			0x0f0e0d0c0b0a0908, 0x0706050403020100);

	static constexpr __m256i literal(
			uint64_t corners, uint64_t edges_high, uint64_t edges_low)
	{
		return cx::mm256_set_epi64x(
				0x0f0e0d0c0b0a0908, corners,
				0x0f0e0d0c00000000 | edges_high, edges_low);
	}
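
	/* bitmask(v, b) collects bit b (0-7) of every byte of v into a 32-bit
	 * mask: the 32-bit left shift moves that bit into each byte's sign
	 * position, where movemask can pick it up.
	 */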
	static uint32_t bitmask(__m256i v, int b) {
		return _mm256_movemask_epi8(_mm256_slli_epi32(v, 7 - b));
	}

	static bool equals(__m256i a, __m256i b) {
		return _mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) == -1;
	}
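
	/* Both branches below implement a lexicographic less-than with byte 31
	 * most significant, i.e. the corner qword dominates, then the high and
	 * low edge qwords.  The byte compares are signed, which is safe here
	 * because every byte value fits in 0..0x2f.
	 */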
	static bool less_than(__m256i a, __m256i b) {
#if 1
		// One fewer instruction, except when used together with equals()
		uint32_t gt = _mm256_movemask_epi8(_mm256_cmpgt_epi8(a, b));
		uint32_t lt = _mm256_movemask_epi8(_mm256_cmpgt_epi8(b, a));
		return gt < lt;
#else
		/* Check that the most-significant 1-bit in "lt" is
		 * preceded by all 1-bits in "eq" by testing whether
		 * addition causes a rollover.
		 */
		uint32_t eq = _mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b));
		uint32_t lt = _mm256_movemask_epi8(_mm256_cmpgt_epi8(b, a));
		// Invariant: (eq & lt) == 0
		uint32_t sum = (lt << 1) + eq;
		return sum < lt;
#endif
	}
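
	/* compose(a, b): byte i of the result is the byte of a selected by the
	 * low nibble of b's byte i, with b's orientation nibble added on top
	 * (subtracted when mirror is set).  The min_epu8 against a copy offset
	 * by the carry constant reduces the orientation modulo 2 (edges) or
	 * modulo 3 (corners) without a per-byte modulo: if the sum reached the
	 * carry, the offset copy is smaller and wins; otherwise it underflows
	 * and the original survives.
	 */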
	static __m256i compose(__m256i a, __m256i b, bool mirror = false) {
		const __m256i vcarry = _mm256_set_epi64x(
				0x3030303030303030, 0x3030303030303030,
				0x2020202020202020, 0x2020202020202020);

		// Permute edges and corners
		__m256i vperm = _mm256_shuffle_epi8(a, b);

		// Compose edge and corner orientations
		__m256i vorient = _mm256_and_si256(b, _mm256_set1_epi8(0xf0));
		if (mirror) {
			// Corner orientations are subtracted
			vperm = _mm256_sub_epi8(vperm, vorient);
			vperm = _mm256_min_epu8(vperm, _mm256_add_epi8(vperm, vcarry));
		} else {
			// Corner orientations are added
			vperm = _mm256_add_epi8(vperm, vorient);
			vperm = _mm256_min_epu8(vperm, _mm256_sub_epi8(vperm, vcarry));
		}

		return vperm;
	}
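
	/* xor_edge_orient(v, eorient) flips the edges selected by a 12-bit
	 * mask.  The shuffle broadcasts the two mask bytes across the edge
	 * bytes, ORing with the complement of the bit-per-byte constant
	 * 0x8040201008040201 leaves one mask bit to test per byte, cmpeq
	 * against all-ones widens each surviving bit to a full byte, and the
	 * result is narrowed to the 0x10 flip flag before the final XOR.
	 */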
	static __m256i xor_edge_orient(__m256i v, eorient_t eorient) {
		__m256i vorient = _mm256_shuffle_epi8(
				_mm256_set1_epi32(eorient),
				_mm256_set_epi64x(-1, -1, 0xffffffff01010101, 0));
		vorient = _mm256_or_si256(vorient, _mm256_set1_epi64x(~0x8040201008040201));
		vorient = _mm256_cmpeq_epi8(vorient, _mm256_set1_epi64x(-1));
		vorient = _mm256_and_si256(vorient, _mm256_set1_epi8(0x10));
		return _mm256_xor_si256(v, vorient);
	}
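
	/* corner_orient_raw(v) packs the eight corner twists into 2-bit fields
	 * of a 16-bit value ("raw" base-4 packing, not a base-3 rank): the two
	 * shifts move orientation bits 4 and 5 into the sign position,
	 * unpacklo interleaves them, and movemask gathers the pairs.
	 */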
	static corient_t corner_orient_raw(__m256i v) {
		__m256i vorient = _mm256_unpacklo_epi8(
				_mm256_slli_epi32(v, 3),
				_mm256_slli_epi32(v, 2));
		return corient_t(_mm256_movemask_epi8(vorient)) >> 16;
	}

	static __m256i invert(__m256i v) {
		// Split the cube into separate perm and orient vectors
		__m256i vperm = _mm256_and_si256(v, _mm256_set1_epi8(0x0f));
		__m256i vorient = _mm256_xor_si256(v, vperm);

#if 0
		// "Brute force" the inverse of the permutation
		__m256i vi = _mm256_set_epi64x(
				0x0f0e0d0c00000000, 0x0000000000000000,
				0x0f0e0d0c00000000, 0x0000000000000000);
		for (int i = 0; i < 12; i++) {
			__m256i vtrial = _mm256_set1_epi8(i);
			__m256i vcorrect = _mm256_cmpeq_epi8(identity, _mm256_shuffle_epi8(vperm, vtrial));
			vi = _mm256_or_si256(vi, _mm256_and_si256(vtrial, vcorrect));
		}
#else
		/* 27720 (11*9*8*7*5) is the LCM of all possible cycle decompositions,
		 * so we can invert the permutation by raising it to the 27719th power.
		 * This method tested faster (9-10%) than the above "brute force".
		 * The addition chain for 27719 was generated using the calculator
		 * provided by Achim Flammenkamp on his website:
		 * http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
		 */
		__m256i vp3 = _mm256_shuffle_epi8(vperm, vperm); // 2
		vp3 = _mm256_shuffle_epi8(vp3, vperm);           // 3 (+1)
		__m256i vi = _mm256_shuffle_epi8(vp3, vp3);      // 6
		vi = _mm256_shuffle_epi8(vi, vi);                // 12
		vi = _mm256_shuffle_epi8(vi, vi);                // 24
		vi = _mm256_shuffle_epi8(vi, vp3);               // 27 (+3)
		vi = _mm256_shuffle_epi8(vi, vi);                // 54
		vi = _mm256_shuffle_epi8(vi, vi);                // 108
		vi = _mm256_shuffle_epi8(vi, vi);                // 216
		vi = _mm256_shuffle_epi8(vi, vi);                // 432
		vi = _mm256_shuffle_epi8(vi, vperm);             // 433 (+1)
		vi = _mm256_shuffle_epi8(vi, vi);                // 866
		vi = _mm256_shuffle_epi8(vi, vi);                // 1732
		vi = _mm256_shuffle_epi8(vi, vi);                // 3464
		vi = _mm256_shuffle_epi8(vi, vi);                // 6928
		vi = _mm256_shuffle_epi8(vi, vi);                // 13856
		vi = _mm256_shuffle_epi8(vi, vp3);               // 13859 (+3)
		vi = _mm256_shuffle_epi8(vi, vi);                // 27718
		vi = _mm256_shuffle_epi8(vi, vperm);             // 27719 (+1)
#endif
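
		/* Inverting a corner twist is negation mod 3.  Doubling the
		 * orientation nibble and reducing it with the same add/min carry
		 * trick as compose() achieves this, since 2 == -1 (mod 3); the
		 * 0x10 carry used for the edge bytes leaves the self-inverse
		 * edge flips unchanged.
		 */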
		// Invert the corner orientations
		const __m256i vcarry_corners = _mm256_set_epi64x(
				0x3030303030303030, 0x3030303030303030,
				0x1010101010101010, 0x1010101010101010);
		vorient = _mm256_add_epi8(vorient, vorient);
		vorient = _mm256_min_epu8(vorient, _mm256_sub_epi8(vorient, vcarry_corners));

		// Permute the edge and corner orientations
		vorient = _mm256_shuffle_epi8(vorient, vi);

		// Combine the new perm and orient
		return _mm256_or_si256(vi, vorient);
	}
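
	// Raw 64-bit views of the edge and corner fields (see the layout note above)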
	static uint64_t edges_low(__m256i v) {
		return _mm256_extract_epi64(v, 0);
	}

	static uint64_t edges_high(__m256i v) {
		return _mm256_extract_epi64(v, 1);
	}

	static uint64_t corners(__m256i v) {
		return _mm256_extract_epi64(v, 2);
	}
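
	/* unrank_corner_orient(corient) appears to take a base-3 rank of seven
	 * corner twists, expand it into one twist per byte (shifted into the
	 * high nibble by the final "<< 4"), and derive the remaining corner so
	 * that the total twist is divisible by 3.  The divisions by powers of
	 * 3 are done with fixed-point reciprocal multiplies.
	 */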
	static uint64_t unrank_corner_orient(corient_t corient) {
		/* 16-bit mulhi is lower latency than 32-bit, but has two disadvantages:
		 * - Requires two different shift widths
		 * - The multiplier for the 3^0 place is 65536
		 */
		static const __m256i vpow3_reciprocal = _mm256_set_epi32(
				1439, 4316, 12946, 38837, 7282, 21846, 0, 0);
		static const __m256i vshift = _mm256_set_epi32(
				4, 4, 4, 4, 0, 0, 0, 0);

		// Divide by powers of 3 (1, 3, 9, ..., 729)
		__m256i vcorient = _mm256_set1_epi32(corient);
		__m256i vco = _mm256_mulhi_epu16(vcorient, vpow3_reciprocal);
		vco = _mm256_srlv_epi32(vco, vshift);
		// fixup 3^0 place; reuse vcorient instead of inserting
		vco = _mm256_blend_epi32(vco, vcorient, 1 << 1);

		// Compute the remainder mod 3
		__m256i div3 = _mm256_mulhi_epu16(vco, _mm256_set1_epi32(21846)); // 21846/65536 ~ 1/3
		vco = _mm256_add_epi32(vco, div3);
		vco = _mm256_sub_epi32(vco, _mm256_slli_epi32(div3, 2));

		// Convert the results to a scalar
		vco = _mm256_shuffle_epi8(vco, _mm256_set_epi32(
				-1, -1, 0x0c080400, -1,
				-1, -1, -1, 0x0c080400));
		uint64_t co =
				_mm256_extract_epi64(vco, 2) |
				_mm256_extract_epi64(vco, 0);

		// Determine the last corner's orientation
		uint64_t sum = co + (co >> 32);
		sum += sum >> 16;
		sum += sum >> 8;
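
		/* Every third bit of 0x4924924924924924 is set, so shifting it by
		 * the digit sum and masking with 3 yields (-sum) mod 3, the twist
		 * that brings the total to a multiple of 3.
		 */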
		// Insert the last corner (the shift count is masked because the
		// byte folds above leave garbage in the upper bytes of sum)
		co |= (0x4924924924924924 >> (sum & 63)) & 3;
		return co << 4;
	}

	/* Return the parity of the edge+corner permutations */
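	/* Inversions are counted pairwise: each shift/rotate below lines every
	 * element up against the one k positions before it within its 16-byte
	 * lane, the signed compares flag out-of-order pairs, and XORing the
	 * results accumulates the inversion count mod 2.  Pairs at distance
	 * 12-15 always involve the larger filler bytes and can never be
	 * inverted, so they need no comparison.
	 */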
	static bool parity(__m256i v) {
		v = _mm256_and_si256(v, _mm256_set1_epi8(0xf));

		__m256i a, b, c, d, e, f, g, h;
		a = _mm256_bslli_epi128(v, 1);    // shift left 1 byte
		b = _mm256_bslli_epi128(v, 2);    // shift left 2 bytes
		c = _mm256_bslli_epi128(v, 3);    // shift left 3 bytes
		d = _mm256_bslli_epi128(v, 4);    // shift left 4 bytes
		e = _mm256_bslli_epi128(v, 8);    // shift left 8 bytes
		f = _mm256_alignr_epi8(v, v, 11); // rotate left 5 bytes
		g = _mm256_alignr_epi8(v, v, 10); // rotate left 6 bytes
		h = _mm256_alignr_epi8(v, v, 9);  // rotate left 7 bytes

		// Test for inversions in the permutation
		a = _mm256_xor_si256(_mm256_cmpgt_epi8(a, v), _mm256_cmpgt_epi8(b, v));
		c = _mm256_xor_si256(_mm256_cmpgt_epi8(c, v), _mm256_cmpgt_epi8(d, v));
		e = _mm256_xor_si256(_mm256_cmpgt_epi8(e, v), _mm256_cmpgt_epi8(f, v));

		// Xor all the tests together
		__m256i parity = _mm256_xor_si256(_mm256_xor_si256(a, c), e);
		parity = _mm256_xor_si256(parity, _mm256_cmpgt_epi8(g, v));
		parity = _mm256_xor_si256(parity, _mm256_cmpgt_epi8(h, v));

		// The 0x5f corrects for the circular shifts, which cause
		// certain pairs of values to be compared out-of-order
		return _popcnt32(_mm256_movemask_epi8(parity) ^ 0x5f005f) & 1;
	}
};
}
#endif