2 * Author......: Jens Steube <jens.steube@gmail.com>
8 #include "include/constants.h"
9 #include "include/kernel_vendor.h"
24 #include "include/kernel_functions.c"
26 #include "common_nv.c"
29 #define VECT_COMPARE_M "check_multi_vect1_comp4.c"
33 #define VECT_COMPARE_M "check_multi_vect2_comp4.c"
56 __device__ __constant__ u32 k_sha256[64] =
58 SHA256C00, SHA256C01, SHA256C02, SHA256C03,
59 SHA256C04, SHA256C05, SHA256C06, SHA256C07,
60 SHA256C08, SHA256C09, SHA256C0a, SHA256C0b,
61 SHA256C0c, SHA256C0d, SHA256C0e, SHA256C0f,
62 SHA256C10, SHA256C11, SHA256C12, SHA256C13,
63 SHA256C14, SHA256C15, SHA256C16, SHA256C17,
64 SHA256C18, SHA256C19, SHA256C1a, SHA256C1b,
65 SHA256C1c, SHA256C1d, SHA256C1e, SHA256C1f,
66 SHA256C20, SHA256C21, SHA256C22, SHA256C23,
67 SHA256C24, SHA256C25, SHA256C26, SHA256C27,
68 SHA256C28, SHA256C29, SHA256C2a, SHA256C2b,
69 SHA256C2c, SHA256C2d, SHA256C2e, SHA256C2f,
70 SHA256C30, SHA256C31, SHA256C32, SHA256C33,
71 SHA256C34, SHA256C35, SHA256C36, SHA256C37,
72 SHA256C38, SHA256C39, SHA256C3a, SHA256C3b,
73 SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f,
76 __device__ static void sha256_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[8])
87 u32x w0_t = swap_workaround (w0[0]);
88 u32x w1_t = swap_workaround (w0[1]);
89 u32x w2_t = swap_workaround (w0[2]);
90 u32x w3_t = swap_workaround (w0[3]);
91 u32x w4_t = swap_workaround (w1[0]);
92 u32x w5_t = swap_workaround (w1[1]);
93 u32x w6_t = swap_workaround (w1[2]);
94 u32x w7_t = swap_workaround (w1[3]);
95 u32x w8_t = swap_workaround (w2[0]);
96 u32x w9_t = swap_workaround (w2[1]);
97 u32x wa_t = swap_workaround (w2[2]);
98 u32x wb_t = swap_workaround (w2[3]);
99 u32x wc_t = swap_workaround (w3[0]);
100 u32x wd_t = swap_workaround (w3[1]);
101 u32x we_t = swap_workaround (w3[2]);
102 u32x wf_t = swap_workaround (w3[3]);
104 #define ROUND256_EXPAND() \
106 w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \
107 w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \
108 w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \
109 w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \
110 w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \
111 w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \
112 w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \
113 w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \
114 w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \
115 w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \
116 wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \
117 wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \
118 wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \
119 wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \
120 we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \
121 wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \
124 #define ROUND256_STEP(i) \
126 SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \
127 SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \
128 SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \
129 SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \
130 SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \
131 SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \
132 SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \
133 SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \
134 SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \
135 SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \
136 SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \
137 SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \
138 SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \
139 SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \
140 SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \
141 SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \
146 for (int i = 16; i < 64; i += 16)
148 ROUND256_EXPAND (); ROUND256_STEP (i);
161 __device__ __constant__ u64 k_sha384[80] =
163 SHA384C00, SHA384C01, SHA384C02, SHA384C03,
164 SHA384C04, SHA384C05, SHA384C06, SHA384C07,
165 SHA384C08, SHA384C09, SHA384C0a, SHA384C0b,
166 SHA384C0c, SHA384C0d, SHA384C0e, SHA384C0f,
167 SHA384C10, SHA384C11, SHA384C12, SHA384C13,
168 SHA384C14, SHA384C15, SHA384C16, SHA384C17,
169 SHA384C18, SHA384C19, SHA384C1a, SHA384C1b,
170 SHA384C1c, SHA384C1d, SHA384C1e, SHA384C1f,
171 SHA384C20, SHA384C21, SHA384C22, SHA384C23,
172 SHA384C24, SHA384C25, SHA384C26, SHA384C27,
173 SHA384C28, SHA384C29, SHA384C2a, SHA384C2b,
174 SHA384C2c, SHA384C2d, SHA384C2e, SHA384C2f,
175 SHA384C30, SHA384C31, SHA384C32, SHA384C33,
176 SHA384C34, SHA384C35, SHA384C36, SHA384C37,
177 SHA384C38, SHA384C39, SHA384C3a, SHA384C3b,
178 SHA384C3c, SHA384C3d, SHA384C3e, SHA384C3f,
179 SHA384C40, SHA384C41, SHA384C42, SHA384C43,
180 SHA384C44, SHA384C45, SHA384C46, SHA384C47,
181 SHA384C48, SHA384C49, SHA384C4a, SHA384C4b,
182 SHA384C4c, SHA384C4d, SHA384C4e, SHA384C4f,
185 __device__ static void sha384_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], const u64 w3[4], u64x digest[8])
196 u64x w0_t = swap_workaround (w0[0]);
197 u64x w1_t = swap_workaround (w0[1]);
198 u64x w2_t = swap_workaround (w0[2]);
199 u64x w3_t = swap_workaround (w0[3]);
200 u64x w4_t = swap_workaround (w1[0]);
201 u64x w5_t = swap_workaround (w1[1]);
202 u64x w6_t = swap_workaround (w1[2]);
203 u64x w7_t = swap_workaround (w1[3]);
204 u64x w8_t = swap_workaround (w2[0]);
205 u64x w9_t = swap_workaround (w2[1]);
206 u64x wa_t = swap_workaround (w2[2]);
207 u64x wb_t = swap_workaround (w2[3]);
208 u64x wc_t = swap_workaround (w3[0]);
209 u64x wd_t = swap_workaround (w3[1]);
210 u64x we_t = swap_workaround (w3[2]);
211 u64x wf_t = swap_workaround (w3[3]);
213 #define ROUND384_EXPAND() \
215 w0_t = SHA384_EXPAND (we_t, w9_t, w1_t, w0_t); \
216 w1_t = SHA384_EXPAND (wf_t, wa_t, w2_t, w1_t); \
217 w2_t = SHA384_EXPAND (w0_t, wb_t, w3_t, w2_t); \
218 w3_t = SHA384_EXPAND (w1_t, wc_t, w4_t, w3_t); \
219 w4_t = SHA384_EXPAND (w2_t, wd_t, w5_t, w4_t); \
220 w5_t = SHA384_EXPAND (w3_t, we_t, w6_t, w5_t); \
221 w6_t = SHA384_EXPAND (w4_t, wf_t, w7_t, w6_t); \
222 w7_t = SHA384_EXPAND (w5_t, w0_t, w8_t, w7_t); \
223 w8_t = SHA384_EXPAND (w6_t, w1_t, w9_t, w8_t); \
224 w9_t = SHA384_EXPAND (w7_t, w2_t, wa_t, w9_t); \
225 wa_t = SHA384_EXPAND (w8_t, w3_t, wb_t, wa_t); \
226 wb_t = SHA384_EXPAND (w9_t, w4_t, wc_t, wb_t); \
227 wc_t = SHA384_EXPAND (wa_t, w5_t, wd_t, wc_t); \
228 wd_t = SHA384_EXPAND (wb_t, w6_t, we_t, wd_t); \
229 we_t = SHA384_EXPAND (wc_t, w7_t, wf_t, we_t); \
230 wf_t = SHA384_EXPAND (wd_t, w8_t, w0_t, wf_t); \
233 #define ROUND384_STEP(i) \
235 SHA384_STEP (SHA384_F0o, SHA384_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha384[i + 0]); \
236 SHA384_STEP (SHA384_F0o, SHA384_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha384[i + 1]); \
237 SHA384_STEP (SHA384_F0o, SHA384_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha384[i + 2]); \
238 SHA384_STEP (SHA384_F0o, SHA384_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha384[i + 3]); \
239 SHA384_STEP (SHA384_F0o, SHA384_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha384[i + 4]); \
240 SHA384_STEP (SHA384_F0o, SHA384_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha384[i + 5]); \
241 SHA384_STEP (SHA384_F0o, SHA384_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha384[i + 6]); \
242 SHA384_STEP (SHA384_F0o, SHA384_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha384[i + 7]); \
243 SHA384_STEP (SHA384_F0o, SHA384_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha384[i + 8]); \
244 SHA384_STEP (SHA384_F0o, SHA384_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha384[i + 9]); \
245 SHA384_STEP (SHA384_F0o, SHA384_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha384[i + 10]); \
246 SHA384_STEP (SHA384_F0o, SHA384_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha384[i + 11]); \
247 SHA384_STEP (SHA384_F0o, SHA384_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha384[i + 12]); \
248 SHA384_STEP (SHA384_F0o, SHA384_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha384[i + 13]); \
249 SHA384_STEP (SHA384_F0o, SHA384_F1o, c, d, e, f, g, h, a, b, we_t, k_sha384[i + 14]); \
250 SHA384_STEP (SHA384_F0o, SHA384_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha384[i + 15]); \
255 for (int i = 16; i < 80; i += 16)
257 ROUND384_EXPAND (); ROUND384_STEP (i);
270 __device__ __constant__ u64 k_sha512[80] =
272 SHA384C00, SHA384C01, SHA384C02, SHA384C03,
273 SHA384C04, SHA384C05, SHA384C06, SHA384C07,
274 SHA384C08, SHA384C09, SHA384C0a, SHA384C0b,
275 SHA384C0c, SHA384C0d, SHA384C0e, SHA384C0f,
276 SHA384C10, SHA384C11, SHA384C12, SHA384C13,
277 SHA384C14, SHA384C15, SHA384C16, SHA384C17,
278 SHA384C18, SHA384C19, SHA384C1a, SHA384C1b,
279 SHA384C1c, SHA384C1d, SHA384C1e, SHA384C1f,
280 SHA384C20, SHA384C21, SHA384C22, SHA384C23,
281 SHA384C24, SHA384C25, SHA384C26, SHA384C27,
282 SHA384C28, SHA384C29, SHA384C2a, SHA384C2b,
283 SHA384C2c, SHA384C2d, SHA384C2e, SHA384C2f,
284 SHA384C30, SHA384C31, SHA384C32, SHA384C33,
285 SHA384C34, SHA384C35, SHA384C36, SHA384C37,
286 SHA384C38, SHA384C39, SHA384C3a, SHA384C3b,
287 SHA384C3c, SHA384C3d, SHA384C3e, SHA384C3f,
288 SHA384C40, SHA384C41, SHA384C42, SHA384C43,
289 SHA384C44, SHA384C45, SHA384C46, SHA384C47,
290 SHA384C48, SHA384C49, SHA384C4a, SHA384C4b,
291 SHA384C4c, SHA384C4d, SHA384C4e, SHA384C4f,
294 __device__ static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], const u64 w3[4], u64 digest[8])
305 u64x w0_t = swap_workaround (w0[0]);
306 u64x w1_t = swap_workaround (w0[1]);
307 u64x w2_t = swap_workaround (w0[2]);
308 u64x w3_t = swap_workaround (w0[3]);
309 u64x w4_t = swap_workaround (w1[0]);
310 u64x w5_t = swap_workaround (w1[1]);
311 u64x w6_t = swap_workaround (w1[2]);
312 u64x w7_t = swap_workaround (w1[3]);
313 u64x w8_t = swap_workaround (w2[0]);
314 u64x w9_t = swap_workaround (w2[1]);
315 u64x wa_t = swap_workaround (w2[2]);
316 u64x wb_t = swap_workaround (w2[3]);
317 u64x wc_t = swap_workaround (w3[0]);
318 u64x wd_t = swap_workaround (w3[1]);
319 u64x we_t = swap_workaround (w3[2]);
320 u64x wf_t = swap_workaround (w3[3]);
322 #define ROUND512_EXPAND() \
324 w0_t = SHA512_EXPAND (we_t, w9_t, w1_t, w0_t); \
325 w1_t = SHA512_EXPAND (wf_t, wa_t, w2_t, w1_t); \
326 w2_t = SHA512_EXPAND (w0_t, wb_t, w3_t, w2_t); \
327 w3_t = SHA512_EXPAND (w1_t, wc_t, w4_t, w3_t); \
328 w4_t = SHA512_EXPAND (w2_t, wd_t, w5_t, w4_t); \
329 w5_t = SHA512_EXPAND (w3_t, we_t, w6_t, w5_t); \
330 w6_t = SHA512_EXPAND (w4_t, wf_t, w7_t, w6_t); \
331 w7_t = SHA512_EXPAND (w5_t, w0_t, w8_t, w7_t); \
332 w8_t = SHA512_EXPAND (w6_t, w1_t, w9_t, w8_t); \
333 w9_t = SHA512_EXPAND (w7_t, w2_t, wa_t, w9_t); \
334 wa_t = SHA512_EXPAND (w8_t, w3_t, wb_t, wa_t); \
335 wb_t = SHA512_EXPAND (w9_t, w4_t, wc_t, wb_t); \
336 wc_t = SHA512_EXPAND (wa_t, w5_t, wd_t, wc_t); \
337 wd_t = SHA512_EXPAND (wb_t, w6_t, we_t, wd_t); \
338 we_t = SHA512_EXPAND (wc_t, w7_t, wf_t, we_t); \
339 wf_t = SHA512_EXPAND (wd_t, w8_t, w0_t, wf_t); \
342 #define ROUND512_STEP(i) \
344 SHA512_STEP (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha512[i + 0]); \
345 SHA512_STEP (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha512[i + 1]); \
346 SHA512_STEP (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha512[i + 2]); \
347 SHA512_STEP (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha512[i + 3]); \
348 SHA512_STEP (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha512[i + 4]); \
349 SHA512_STEP (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha512[i + 5]); \
350 SHA512_STEP (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha512[i + 6]); \
351 SHA512_STEP (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha512[i + 7]); \
352 SHA512_STEP (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha512[i + 8]); \
353 SHA512_STEP (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha512[i + 9]); \
354 SHA512_STEP (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha512[i + 10]); \
355 SHA512_STEP (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha512[i + 11]); \
356 SHA512_STEP (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha512[i + 12]); \
357 SHA512_STEP (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha512[i + 13]); \
358 SHA512_STEP (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, we_t, k_sha512[i + 14]); \
359 SHA512_STEP (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha512[i + 15]); \
364 for (int i = 16; i < 80; i += 16)
366 ROUND512_EXPAND (); ROUND512_STEP (i);
379 __device__ __constant__ u32 te0[256] =
381 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
382 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
383 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
384 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
385 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
386 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
387 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
388 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
389 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
390 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
391 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
392 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
393 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
394 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
395 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
396 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
397 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
398 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
399 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
400 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
401 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
402 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
403 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
404 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
405 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
406 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
407 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
408 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
409 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
410 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
411 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
412 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
413 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
414 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
415 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
416 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
417 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
418 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
419 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
420 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
421 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
422 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
423 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
424 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
425 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
426 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
427 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
428 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
429 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
430 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
431 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
432 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
433 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
434 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
435 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
436 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
437 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
438 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
439 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
440 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
441 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
442 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
443 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
444 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a,
447 __device__ __constant__ u32 te1[256] =
449 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b,
450 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5,
451 0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b,
452 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676,
453 0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d,
454 0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0,
455 0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf,
456 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0,
457 0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626,
458 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc,
459 0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1,
460 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515,
461 0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3,
462 0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a,
463 0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2,
464 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575,
465 0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a,
466 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0,
467 0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3,
468 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484,
469 0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded,
470 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b,
471 0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939,
472 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf,
473 0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb,
474 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585,
475 0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f,
476 0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8,
477 0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f,
478 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5,
479 0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121,
480 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2,
481 0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec,
482 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717,
483 0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d,
484 0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373,
485 0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc,
486 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888,
487 0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414,
488 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb,
489 0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a,
490 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c,
491 0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262,
492 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979,
493 0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d,
494 0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9,
495 0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea,
496 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808,
497 0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e,
498 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6,
499 0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f,
500 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a,
501 0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666,
502 0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e,
503 0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9,
504 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e,
505 0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111,
506 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494,
507 0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9,
508 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf,
509 0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d,
510 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868,
511 0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f,
512 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616,
515 __device__ __constant__ u32 te2[256] =
517 0x63a5c663, 0x7c84f87c, 0x7799ee77, 0x7b8df67b,
518 0xf20dfff2, 0x6bbdd66b, 0x6fb1de6f, 0xc55491c5,
519 0x30506030, 0x01030201, 0x67a9ce67, 0x2b7d562b,
520 0xfe19e7fe, 0xd762b5d7, 0xabe64dab, 0x769aec76,
521 0xca458fca, 0x829d1f82, 0xc94089c9, 0x7d87fa7d,
522 0xfa15effa, 0x59ebb259, 0x47c98e47, 0xf00bfbf0,
523 0xadec41ad, 0xd467b3d4, 0xa2fd5fa2, 0xafea45af,
524 0x9cbf239c, 0xa4f753a4, 0x7296e472, 0xc05b9bc0,
525 0xb7c275b7, 0xfd1ce1fd, 0x93ae3d93, 0x266a4c26,
526 0x365a6c36, 0x3f417e3f, 0xf702f5f7, 0xcc4f83cc,
527 0x345c6834, 0xa5f451a5, 0xe534d1e5, 0xf108f9f1,
528 0x7193e271, 0xd873abd8, 0x31536231, 0x153f2a15,
529 0x040c0804, 0xc75295c7, 0x23654623, 0xc35e9dc3,
530 0x18283018, 0x96a13796, 0x050f0a05, 0x9ab52f9a,
531 0x07090e07, 0x12362412, 0x809b1b80, 0xe23ddfe2,
532 0xeb26cdeb, 0x27694e27, 0xb2cd7fb2, 0x759fea75,
533 0x091b1209, 0x839e1d83, 0x2c74582c, 0x1a2e341a,
534 0x1b2d361b, 0x6eb2dc6e, 0x5aeeb45a, 0xa0fb5ba0,
535 0x52f6a452, 0x3b4d763b, 0xd661b7d6, 0xb3ce7db3,
536 0x297b5229, 0xe33edde3, 0x2f715e2f, 0x84971384,
537 0x53f5a653, 0xd168b9d1, 0x00000000, 0xed2cc1ed,
538 0x20604020, 0xfc1fe3fc, 0xb1c879b1, 0x5bedb65b,
539 0x6abed46a, 0xcb468dcb, 0xbed967be, 0x394b7239,
540 0x4ade944a, 0x4cd4984c, 0x58e8b058, 0xcf4a85cf,
541 0xd06bbbd0, 0xef2ac5ef, 0xaae54faa, 0xfb16edfb,
542 0x43c58643, 0x4dd79a4d, 0x33556633, 0x85941185,
543 0x45cf8a45, 0xf910e9f9, 0x02060402, 0x7f81fe7f,
544 0x50f0a050, 0x3c44783c, 0x9fba259f, 0xa8e34ba8,
545 0x51f3a251, 0xa3fe5da3, 0x40c08040, 0x8f8a058f,
546 0x92ad3f92, 0x9dbc219d, 0x38487038, 0xf504f1f5,
547 0xbcdf63bc, 0xb6c177b6, 0xda75afda, 0x21634221,
548 0x10302010, 0xff1ae5ff, 0xf30efdf3, 0xd26dbfd2,
549 0xcd4c81cd, 0x0c14180c, 0x13352613, 0xec2fc3ec,
550 0x5fe1be5f, 0x97a23597, 0x44cc8844, 0x17392e17,
551 0xc45793c4, 0xa7f255a7, 0x7e82fc7e, 0x3d477a3d,
552 0x64acc864, 0x5de7ba5d, 0x192b3219, 0x7395e673,
553 0x60a0c060, 0x81981981, 0x4fd19e4f, 0xdc7fa3dc,
554 0x22664422, 0x2a7e542a, 0x90ab3b90, 0x88830b88,
555 0x46ca8c46, 0xee29c7ee, 0xb8d36bb8, 0x143c2814,
556 0xde79a7de, 0x5ee2bc5e, 0x0b1d160b, 0xdb76addb,
557 0xe03bdbe0, 0x32566432, 0x3a4e743a, 0x0a1e140a,
558 0x49db9249, 0x060a0c06, 0x246c4824, 0x5ce4b85c,
559 0xc25d9fc2, 0xd36ebdd3, 0xacef43ac, 0x62a6c462,
560 0x91a83991, 0x95a43195, 0xe437d3e4, 0x798bf279,
561 0xe732d5e7, 0xc8438bc8, 0x37596e37, 0x6db7da6d,
562 0x8d8c018d, 0xd564b1d5, 0x4ed29c4e, 0xa9e049a9,
563 0x6cb4d86c, 0x56faac56, 0xf407f3f4, 0xea25cfea,
564 0x65afca65, 0x7a8ef47a, 0xaee947ae, 0x08181008,
565 0xbad56fba, 0x7888f078, 0x256f4a25, 0x2e725c2e,
566 0x1c24381c, 0xa6f157a6, 0xb4c773b4, 0xc65197c6,
567 0xe823cbe8, 0xdd7ca1dd, 0x749ce874, 0x1f213e1f,
568 0x4bdd964b, 0xbddc61bd, 0x8b860d8b, 0x8a850f8a,
569 0x7090e070, 0x3e427c3e, 0xb5c471b5, 0x66aacc66,
570 0x48d89048, 0x03050603, 0xf601f7f6, 0x0e121c0e,
571 0x61a3c261, 0x355f6a35, 0x57f9ae57, 0xb9d069b9,
572 0x86911786, 0xc15899c1, 0x1d273a1d, 0x9eb9279e,
573 0xe138d9e1, 0xf813ebf8, 0x98b32b98, 0x11332211,
574 0x69bbd269, 0xd970a9d9, 0x8e89078e, 0x94a73394,
575 0x9bb62d9b, 0x1e223c1e, 0x87921587, 0xe920c9e9,
576 0xce4987ce, 0x55ffaa55, 0x28785028, 0xdf7aa5df,
577 0x8c8f038c, 0xa1f859a1, 0x89800989, 0x0d171a0d,
578 0xbfda65bf, 0xe631d7e6, 0x42c68442, 0x68b8d068,
579 0x41c38241, 0x99b02999, 0x2d775a2d, 0x0f111e0f,
580 0xb0cb7bb0, 0x54fca854, 0xbbd66dbb, 0x163a2c16,
583 __device__ __constant__ u32 te3[256] =
585 0x6363a5c6, 0x7c7c84f8, 0x777799ee, 0x7b7b8df6,
586 0xf2f20dff, 0x6b6bbdd6, 0x6f6fb1de, 0xc5c55491,
587 0x30305060, 0x01010302, 0x6767a9ce, 0x2b2b7d56,
588 0xfefe19e7, 0xd7d762b5, 0xababe64d, 0x76769aec,
589 0xcaca458f, 0x82829d1f, 0xc9c94089, 0x7d7d87fa,
590 0xfafa15ef, 0x5959ebb2, 0x4747c98e, 0xf0f00bfb,
591 0xadadec41, 0xd4d467b3, 0xa2a2fd5f, 0xafafea45,
592 0x9c9cbf23, 0xa4a4f753, 0x727296e4, 0xc0c05b9b,
593 0xb7b7c275, 0xfdfd1ce1, 0x9393ae3d, 0x26266a4c,
594 0x36365a6c, 0x3f3f417e, 0xf7f702f5, 0xcccc4f83,
595 0x34345c68, 0xa5a5f451, 0xe5e534d1, 0xf1f108f9,
596 0x717193e2, 0xd8d873ab, 0x31315362, 0x15153f2a,
597 0x04040c08, 0xc7c75295, 0x23236546, 0xc3c35e9d,
598 0x18182830, 0x9696a137, 0x05050f0a, 0x9a9ab52f,
599 0x0707090e, 0x12123624, 0x80809b1b, 0xe2e23ddf,
600 0xebeb26cd, 0x2727694e, 0xb2b2cd7f, 0x75759fea,
601 0x09091b12, 0x83839e1d, 0x2c2c7458, 0x1a1a2e34,
602 0x1b1b2d36, 0x6e6eb2dc, 0x5a5aeeb4, 0xa0a0fb5b,
603 0x5252f6a4, 0x3b3b4d76, 0xd6d661b7, 0xb3b3ce7d,
604 0x29297b52, 0xe3e33edd, 0x2f2f715e, 0x84849713,
605 0x5353f5a6, 0xd1d168b9, 0x00000000, 0xeded2cc1,
606 0x20206040, 0xfcfc1fe3, 0xb1b1c879, 0x5b5bedb6,
607 0x6a6abed4, 0xcbcb468d, 0xbebed967, 0x39394b72,
608 0x4a4ade94, 0x4c4cd498, 0x5858e8b0, 0xcfcf4a85,
609 0xd0d06bbb, 0xefef2ac5, 0xaaaae54f, 0xfbfb16ed,
610 0x4343c586, 0x4d4dd79a, 0x33335566, 0x85859411,
611 0x4545cf8a, 0xf9f910e9, 0x02020604, 0x7f7f81fe,
612 0x5050f0a0, 0x3c3c4478, 0x9f9fba25, 0xa8a8e34b,
613 0x5151f3a2, 0xa3a3fe5d, 0x4040c080, 0x8f8f8a05,
614 0x9292ad3f, 0x9d9dbc21, 0x38384870, 0xf5f504f1,
615 0xbcbcdf63, 0xb6b6c177, 0xdada75af, 0x21216342,
616 0x10103020, 0xffff1ae5, 0xf3f30efd, 0xd2d26dbf,
617 0xcdcd4c81, 0x0c0c1418, 0x13133526, 0xecec2fc3,
618 0x5f5fe1be, 0x9797a235, 0x4444cc88, 0x1717392e,
619 0xc4c45793, 0xa7a7f255, 0x7e7e82fc, 0x3d3d477a,
620 0x6464acc8, 0x5d5de7ba, 0x19192b32, 0x737395e6,
621 0x6060a0c0, 0x81819819, 0x4f4fd19e, 0xdcdc7fa3,
622 0x22226644, 0x2a2a7e54, 0x9090ab3b, 0x8888830b,
623 0x4646ca8c, 0xeeee29c7, 0xb8b8d36b, 0x14143c28,
624 0xdede79a7, 0x5e5ee2bc, 0x0b0b1d16, 0xdbdb76ad,
625 0xe0e03bdb, 0x32325664, 0x3a3a4e74, 0x0a0a1e14,
626 0x4949db92, 0x06060a0c, 0x24246c48, 0x5c5ce4b8,
627 0xc2c25d9f, 0xd3d36ebd, 0xacacef43, 0x6262a6c4,
628 0x9191a839, 0x9595a431, 0xe4e437d3, 0x79798bf2,
629 0xe7e732d5, 0xc8c8438b, 0x3737596e, 0x6d6db7da,
630 0x8d8d8c01, 0xd5d564b1, 0x4e4ed29c, 0xa9a9e049,
631 0x6c6cb4d8, 0x5656faac, 0xf4f407f3, 0xeaea25cf,
632 0x6565afca, 0x7a7a8ef4, 0xaeaee947, 0x08081810,
633 0xbabad56f, 0x787888f0, 0x25256f4a, 0x2e2e725c,
634 0x1c1c2438, 0xa6a6f157, 0xb4b4c773, 0xc6c65197,
635 0xe8e823cb, 0xdddd7ca1, 0x74749ce8, 0x1f1f213e,
636 0x4b4bdd96, 0xbdbddc61, 0x8b8b860d, 0x8a8a850f,
637 0x707090e0, 0x3e3e427c, 0xb5b5c471, 0x6666aacc,
638 0x4848d890, 0x03030506, 0xf6f601f7, 0x0e0e121c,
639 0x6161a3c2, 0x35355f6a, 0x5757f9ae, 0xb9b9d069,
640 0x86869117, 0xc1c15899, 0x1d1d273a, 0x9e9eb927,
641 0xe1e138d9, 0xf8f813eb, 0x9898b32b, 0x11113322,
642 0x6969bbd2, 0xd9d970a9, 0x8e8e8907, 0x9494a733,
643 0x9b9bb62d, 0x1e1e223c, 0x87879215, 0xe9e920c9,
644 0xcece4987, 0x5555ffaa, 0x28287850, 0xdfdf7aa5,
645 0x8c8c8f03, 0xa1a1f859, 0x89898009, 0x0d0d171a,
646 0xbfbfda65, 0xe6e631d7, 0x4242c684, 0x6868b8d0,
647 0x4141c382, 0x9999b029, 0x2d2d775a, 0x0f0f111e,
648 0xb0b0cb7b, 0x5454fca8, 0xbbbbd66d, 0x16163a2c,
651 __device__ __constant__ u32 te4[256] =
653 0x63636363, 0x7c7c7c7c, 0x77777777, 0x7b7b7b7b,
654 0xf2f2f2f2, 0x6b6b6b6b, 0x6f6f6f6f, 0xc5c5c5c5,
655 0x30303030, 0x01010101, 0x67676767, 0x2b2b2b2b,
656 0xfefefefe, 0xd7d7d7d7, 0xabababab, 0x76767676,
657 0xcacacaca, 0x82828282, 0xc9c9c9c9, 0x7d7d7d7d,
658 0xfafafafa, 0x59595959, 0x47474747, 0xf0f0f0f0,
659 0xadadadad, 0xd4d4d4d4, 0xa2a2a2a2, 0xafafafaf,
660 0x9c9c9c9c, 0xa4a4a4a4, 0x72727272, 0xc0c0c0c0,
661 0xb7b7b7b7, 0xfdfdfdfd, 0x93939393, 0x26262626,
662 0x36363636, 0x3f3f3f3f, 0xf7f7f7f7, 0xcccccccc,
663 0x34343434, 0xa5a5a5a5, 0xe5e5e5e5, 0xf1f1f1f1,
664 0x71717171, 0xd8d8d8d8, 0x31313131, 0x15151515,
665 0x04040404, 0xc7c7c7c7, 0x23232323, 0xc3c3c3c3,
666 0x18181818, 0x96969696, 0x05050505, 0x9a9a9a9a,
667 0x07070707, 0x12121212, 0x80808080, 0xe2e2e2e2,
668 0xebebebeb, 0x27272727, 0xb2b2b2b2, 0x75757575,
669 0x09090909, 0x83838383, 0x2c2c2c2c, 0x1a1a1a1a,
670 0x1b1b1b1b, 0x6e6e6e6e, 0x5a5a5a5a, 0xa0a0a0a0,
671 0x52525252, 0x3b3b3b3b, 0xd6d6d6d6, 0xb3b3b3b3,
672 0x29292929, 0xe3e3e3e3, 0x2f2f2f2f, 0x84848484,
673 0x53535353, 0xd1d1d1d1, 0x00000000, 0xedededed,
674 0x20202020, 0xfcfcfcfc, 0xb1b1b1b1, 0x5b5b5b5b,
675 0x6a6a6a6a, 0xcbcbcbcb, 0xbebebebe, 0x39393939,
676 0x4a4a4a4a, 0x4c4c4c4c, 0x58585858, 0xcfcfcfcf,
677 0xd0d0d0d0, 0xefefefef, 0xaaaaaaaa, 0xfbfbfbfb,
678 0x43434343, 0x4d4d4d4d, 0x33333333, 0x85858585,
679 0x45454545, 0xf9f9f9f9, 0x02020202, 0x7f7f7f7f,
680 0x50505050, 0x3c3c3c3c, 0x9f9f9f9f, 0xa8a8a8a8,
681 0x51515151, 0xa3a3a3a3, 0x40404040, 0x8f8f8f8f,
682 0x92929292, 0x9d9d9d9d, 0x38383838, 0xf5f5f5f5,
683 0xbcbcbcbc, 0xb6b6b6b6, 0xdadadada, 0x21212121,
684 0x10101010, 0xffffffff, 0xf3f3f3f3, 0xd2d2d2d2,
685 0xcdcdcdcd, 0x0c0c0c0c, 0x13131313, 0xecececec,
686 0x5f5f5f5f, 0x97979797, 0x44444444, 0x17171717,
687 0xc4c4c4c4, 0xa7a7a7a7, 0x7e7e7e7e, 0x3d3d3d3d,
688 0x64646464, 0x5d5d5d5d, 0x19191919, 0x73737373,
689 0x60606060, 0x81818181, 0x4f4f4f4f, 0xdcdcdcdc,
690 0x22222222, 0x2a2a2a2a, 0x90909090, 0x88888888,
691 0x46464646, 0xeeeeeeee, 0xb8b8b8b8, 0x14141414,
692 0xdededede, 0x5e5e5e5e, 0x0b0b0b0b, 0xdbdbdbdb,
693 0xe0e0e0e0, 0x32323232, 0x3a3a3a3a, 0x0a0a0a0a,
694 0x49494949, 0x06060606, 0x24242424, 0x5c5c5c5c,
695 0xc2c2c2c2, 0xd3d3d3d3, 0xacacacac, 0x62626262,
696 0x91919191, 0x95959595, 0xe4e4e4e4, 0x79797979,
697 0xe7e7e7e7, 0xc8c8c8c8, 0x37373737, 0x6d6d6d6d,
698 0x8d8d8d8d, 0xd5d5d5d5, 0x4e4e4e4e, 0xa9a9a9a9,
699 0x6c6c6c6c, 0x56565656, 0xf4f4f4f4, 0xeaeaeaea,
700 0x65656565, 0x7a7a7a7a, 0xaeaeaeae, 0x08080808,
701 0xbabababa, 0x78787878, 0x25252525, 0x2e2e2e2e,
702 0x1c1c1c1c, 0xa6a6a6a6, 0xb4b4b4b4, 0xc6c6c6c6,
703 0xe8e8e8e8, 0xdddddddd, 0x74747474, 0x1f1f1f1f,
704 0x4b4b4b4b, 0xbdbdbdbd, 0x8b8b8b8b, 0x8a8a8a8a,
705 0x70707070, 0x3e3e3e3e, 0xb5b5b5b5, 0x66666666,
706 0x48484848, 0x03030303, 0xf6f6f6f6, 0x0e0e0e0e,
707 0x61616161, 0x35353535, 0x57575757, 0xb9b9b9b9,
708 0x86868686, 0xc1c1c1c1, 0x1d1d1d1d, 0x9e9e9e9e,
709 0xe1e1e1e1, 0xf8f8f8f8, 0x98989898, 0x11111111,
710 0x69696969, 0xd9d9d9d9, 0x8e8e8e8e, 0x94949494,
711 0x9b9b9b9b, 0x1e1e1e1e, 0x87878787, 0xe9e9e9e9,
712 0xcececece, 0x55555555, 0x28282828, 0xdfdfdfdf,
713 0x8c8c8c8c, 0xa1a1a1a1, 0x89898989, 0x0d0d0d0d,
714 0xbfbfbfbf, 0xe6e6e6e6, 0x42424242, 0x68686868,
715 0x41414141, 0x99999999, 0x2d2d2d2d, 0x0f0f0f0f,
716 0xb0b0b0b0, 0x54545454, 0xbbbbbbbb, 0x16161616,
719 __device__ __constant__ u32 rcon[] =
721 0x01000000, 0x02000000, 0x04000000, 0x08000000,
722 0x10000000, 0x20000000, 0x40000000, 0x80000000,
723 0x1b000000, 0x36000000,
726 __device__ static void AES128_ExpandKey (u32 *userkey, u32 *rek, u32 s_te0[256], u32 s_te1[256], u32 s_te2[256], u32 s_te3[256], u32 s_te4[256])
728 rek[0] = swap_workaround (userkey[0]);
729 rek[1] = swap_workaround (userkey[1]);
730 rek[2] = swap_workaround (userkey[2]);
731 rek[3] = swap_workaround (userkey[3]);
733 for (u32 i = 0, j = 0; i < 10; i += 1, j += 4)
735 u32 temp = rek[j + 3];
737 temp = (s_te2[(temp >> 16) & 0xff] & 0xff000000)
738 ^ (s_te3[(temp >> 8) & 0xff] & 0x00ff0000)
739 ^ (s_te0[(temp >> 0) & 0xff] & 0x0000ff00)
740 ^ (s_te1[(temp >> 24) & 0xff] & 0x000000ff);
742 rek[j + 4] = rek[j + 0]
746 rek[j + 5] = rek[j + 1] ^ rek[j + 4];
747 rek[j + 6] = rek[j + 2] ^ rek[j + 5];
748 rek[j + 7] = rek[j + 3] ^ rek[j + 6];
752 __device__ static void AES128_encrypt (const u32 *in, u32 *out, const u32 *rek, u32 s_te0[256], u32 s_te1[256], u32 s_te2[256], u32 s_te3[256], u32 s_te4[256])
756 in_swap[0] = swap_workaround (in[0]);
757 in_swap[1] = swap_workaround (in[1]);
758 in_swap[2] = swap_workaround (in[2]);
759 in_swap[3] = swap_workaround (in[3]);
761 u32 s0 = in_swap[0] ^ rek[0];
762 u32 s1 = in_swap[1] ^ rek[1];
763 u32 s2 = in_swap[2] ^ rek[2];
764 u32 s3 = in_swap[3] ^ rek[3];
771 t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ rek[ 4];
772 t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ rek[ 5];
773 t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ rek[ 6];
774 t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ rek[ 7];
775 s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ rek[ 8];
776 s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ rek[ 9];
777 s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ rek[10];
778 s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ rek[11];
779 t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ rek[12];
780 t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ rek[13];
781 t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ rek[14];
782 t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ rek[15];
783 s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ rek[16];
784 s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ rek[17];
785 s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ rek[18];
786 s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ rek[19];
787 t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ rek[20];
788 t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ rek[21];
789 t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ rek[22];
790 t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ rek[23];
791 s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ rek[24];
792 s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ rek[25];
793 s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ rek[26];
794 s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ rek[27];
795 t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ rek[28];
796 t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ rek[29];
797 t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ rek[30];
798 t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ rek[31];
799 s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ rek[32];
800 s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ rek[33];
801 s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ rek[34];
802 s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ rek[35];
803 t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ rek[36];
804 t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ rek[37];
805 t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ rek[38];
806 t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ rek[39];
808 out[0] = (s_te4[(t0 >> 24) & 0xff] & 0xff000000)
809 ^ (s_te4[(t1 >> 16) & 0xff] & 0x00ff0000)
810 ^ (s_te4[(t2 >> 8) & 0xff] & 0x0000ff00)
811 ^ (s_te4[(t3 >> 0) & 0xff] & 0x000000ff)
814 out[1] = (s_te4[(t1 >> 24) & 0xff] & 0xff000000)
815 ^ (s_te4[(t2 >> 16) & 0xff] & 0x00ff0000)
816 ^ (s_te4[(t3 >> 8) & 0xff] & 0x0000ff00)
817 ^ (s_te4[(t0 >> 0) & 0xff] & 0x000000ff)
820 out[2] = (s_te4[(t2 >> 24) & 0xff] & 0xff000000)
821 ^ (s_te4[(t3 >> 16) & 0xff] & 0x00ff0000)
822 ^ (s_te4[(t0 >> 8) & 0xff] & 0x0000ff00)
823 ^ (s_te4[(t1 >> 0) & 0xff] & 0x000000ff)
826 out[3] = (s_te4[(t3 >> 24) & 0xff] & 0xff000000)
827 ^ (s_te4[(t0 >> 16) & 0xff] & 0x00ff0000)
828 ^ (s_te4[(t1 >> 8) & 0xff] & 0x0000ff00)
829 ^ (s_te4[(t2 >> 0) & 0xff] & 0x000000ff)
832 out[0] = swap_workaround (out[0]);
833 out[1] = swap_workaround (out[1]);
834 out[2] = swap_workaround (out[2]);
835 out[3] = swap_workaround (out[3]);
838 __device__ static void memcat8 (u32x block0[4], u32x block1[4], u32x block2[4], u32x block3[4], const u32 block_len, const u32 append[2])
843 block0[0] = append[0];
844 block0[1] = append[1];
848 block0[0] = block0[0] | append[0] << 8;
849 block0[1] = append[0] >> 24 | append[1] << 8;
850 block0[2] = append[1] >> 24;
854 block0[0] = block0[0] | append[0] << 16;
855 block0[1] = append[0] >> 16 | append[1] << 16;
856 block0[2] = append[1] >> 16;
860 block0[0] = block0[0] | append[0] << 24;
861 block0[1] = append[0] >> 8 | append[1] << 24;
862 block0[2] = append[1] >> 8;
866 block0[1] = append[0];
867 block0[2] = append[1];
871 block0[1] = block0[1] | append[0] << 8;
872 block0[2] = append[0] >> 24 | append[1] << 8;
873 block0[3] = append[1] >> 24;
877 block0[1] = block0[1] | append[0] << 16;
878 block0[2] = append[0] >> 16 | append[1] << 16;
879 block0[3] = append[1] >> 16;
883 block0[1] = block0[1] | append[0] << 24;
884 block0[2] = append[0] >> 8 | append[1] << 24;
885 block0[3] = append[1] >> 8;
889 block0[2] = append[0];
890 block0[3] = append[1];
894 block0[2] = block0[2] | append[0] << 8;
895 block0[3] = append[0] >> 24 | append[1] << 8;
896 block1[0] = append[1] >> 24;
900 block0[2] = block0[2] | append[0] << 16;
901 block0[3] = append[0] >> 16 | append[1] << 16;
902 block1[0] = append[1] >> 16;
906 block0[2] = block0[2] | append[0] << 24;
907 block0[3] = append[0] >> 8 | append[1] << 24;
908 block1[0] = append[1] >> 8;
912 block0[3] = append[0];
913 block1[0] = append[1];
917 block0[3] = block0[3] | append[0] << 8;
918 block1[0] = append[0] >> 24 | append[1] << 8;
919 block1[1] = append[1] >> 24;
923 block0[3] = block0[3] | append[0] << 16;
924 block1[0] = append[0] >> 16 | append[1] << 16;
925 block1[1] = append[1] >> 16;
929 block0[3] = block0[3] | append[0] << 24;
930 block1[0] = append[0] >> 8 | append[1] << 24;
931 block1[1] = append[1] >> 8;
935 block1[0] = append[0];
936 block1[1] = append[1];
940 block1[0] = block1[0] | append[0] << 8;
941 block1[1] = append[0] >> 24 | append[1] << 8;
942 block1[2] = append[1] >> 24;
946 block1[0] = block1[0] | append[0] << 16;
947 block1[1] = append[0] >> 16 | append[1] << 16;
948 block1[2] = append[1] >> 16;
952 block1[0] = block1[0] | append[0] << 24;
953 block1[1] = append[0] >> 8 | append[1] << 24;
954 block1[2] = append[1] >> 8;
958 block1[1] = append[0];
959 block1[2] = append[1];
963 block1[1] = block1[1] | append[0] << 8;
964 block1[2] = append[0] >> 24 | append[1] << 8;
965 block1[3] = append[1] >> 24;
969 block1[1] = block1[1] | append[0] << 16;
970 block1[2] = append[0] >> 16 | append[1] << 16;
971 block1[3] = append[1] >> 16;
975 block1[1] = block1[1] | append[0] << 24;
976 block1[2] = append[0] >> 8 | append[1] << 24;
977 block1[3] = append[1] >> 8;
981 block1[2] = append[0];
982 block1[3] = append[1];
986 block1[2] = block1[2] | append[0] << 8;
987 block1[3] = append[0] >> 24 | append[1] << 8;
988 block2[0] = append[1] >> 24;
992 block1[2] = block1[2] | append[0] << 16;
993 block1[3] = append[0] >> 16 | append[1] << 16;
994 block2[0] = append[1] >> 16;
998 block1[2] = block1[2] | append[0] << 24;
999 block1[3] = append[0] >> 8 | append[1] << 24;
1000 block2[0] = append[1] >> 8;
1004 block1[3] = append[0];
1005 block2[0] = append[1];
1009 block1[3] = block1[3] | append[0] << 8;
1010 block2[0] = append[0] >> 24 | append[1] << 8;
1011 block2[1] = append[1] >> 24;
1015 block1[3] = block1[3] | append[0] << 16;
1016 block2[0] = append[0] >> 16 | append[1] << 16;
1017 block2[1] = append[1] >> 16;
1021 block1[3] = block1[3] | append[0] << 24;
1022 block2[0] = append[0] >> 8 | append[1] << 24;
1023 block2[1] = append[1] >> 8;
1027 block2[0] = append[0];
1028 block2[1] = append[1];
1032 block2[0] = block2[0] | append[0] << 8;
1033 block2[1] = append[0] >> 24 | append[1] << 8;
1034 block2[2] = append[1] >> 24;
1038 block2[0] = block2[0] | append[0] << 16;
1039 block2[1] = append[0] >> 16 | append[1] << 16;
1040 block2[2] = append[1] >> 16;
1044 block2[0] = block2[0] | append[0] << 24;
1045 block2[1] = append[0] >> 8 | append[1] << 24;
1046 block2[2] = append[1] >> 8;
1050 block2[1] = append[0];
1051 block2[2] = append[1];
1055 block2[1] = block2[1] | append[0] << 8;
1056 block2[2] = append[0] >> 24 | append[1] << 8;
1057 block2[3] = append[1] >> 24;
1061 block2[1] = block2[1] | append[0] << 16;
1062 block2[2] = append[0] >> 16 | append[1] << 16;
1063 block2[3] = append[1] >> 16;
1067 block2[1] = block2[1] | append[0] << 24;
1068 block2[2] = append[0] >> 8 | append[1] << 24;
1069 block2[3] = append[1] >> 8;
1073 block2[2] = append[0];
1074 block2[3] = append[1];
1078 block2[2] = block2[2] | append[0] << 8;
1079 block2[3] = append[0] >> 24 | append[1] << 8;
1080 block3[0] = append[1] >> 24;
1084 block2[2] = block2[2] | append[0] << 16;
1085 block2[3] = append[0] >> 16 | append[1] << 16;
1086 block3[0] = append[1] >> 16;
1090 block2[2] = block2[2] | append[0] << 24;
1091 block2[3] = append[0] >> 8 | append[1] << 24;
1092 block3[0] = append[1] >> 8;
1096 block2[3] = append[0];
1097 block3[0] = append[1];
1101 block2[3] = block2[3] | append[0] << 8;
1102 block3[0] = append[0] >> 24 | append[1] << 8;
1103 block3[1] = append[1] >> 24;
1107 block2[3] = block2[3] | append[0] << 16;
1108 block3[0] = append[0] >> 16 | append[1] << 16;
1109 block3[1] = append[1] >> 16;
1113 block2[3] = block2[3] | append[0] << 24;
1114 block3[0] = append[0] >> 8 | append[1] << 24;
1115 block3[1] = append[1] >> 8;
1119 block3[0] = append[0];
1120 block3[1] = append[1];
1124 block3[0] = block3[0] | append[0] << 8;
1125 block3[1] = append[0] >> 24 | append[1] << 8;
1126 block3[2] = append[1] >> 24;
1130 block3[0] = block3[0] | append[0] << 16;
1131 block3[1] = append[0] >> 16 | append[1] << 16;
1132 block3[2] = append[1] >> 16;
1136 block3[0] = block3[0] | append[0] << 24;
1137 block3[1] = append[0] >> 8 | append[1] << 24;
1138 block3[2] = append[1] >> 8;
1142 block3[1] = append[0];
1143 block3[2] = append[1];
1147 block3[1] = block3[1] | append[0] << 8;
1148 block3[2] = append[0] >> 24 | append[1] << 8;
1149 block3[3] = append[1] >> 24;
1153 block3[1] = block3[1] | append[0] << 16;
1154 block3[2] = append[0] >> 16 | append[1] << 16;
1155 block3[3] = append[1] >> 16;
1159 block3[1] = block3[1] | append[0] << 24;
1160 block3[2] = append[0] >> 8 | append[1] << 24;
1161 block3[3] = append[1] >> 8;
1165 block3[2] = append[0];
1166 block3[3] = append[1];
1171 #define AESSZ 16 // AES_BLOCK_SIZE
1177 #define WORDSZ256 64
1178 #define WORDSZ384 128
1179 #define WORDSZ512 128
1181 #define PWMAXSZ 32 // oclHashcat password length limit
1182 #define BLMAXSZ BLSZ512
1183 #define WORDMAXSZ WORDSZ512
1185 #define PWMAXSZ4 (PWMAXSZ / 4)
1186 #define BLMAXSZ4 (BLMAXSZ / 4)
1187 #define WORDMAXSZ4 (WORDMAXSZ / 4)
1188 #define AESSZ4 (AESSZ / 4)
1190 __device__ static void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, const u32 bl_len)
1192 const u32 bd = bl_len / 4;
1194 const u32 pm = pw_len % 4;
1195 const u32 pd = pw_len / 4;
1201 for (u32 i = 0; i < pd; i++) sc[idx++] = pw[i];
1202 for (u32 i = 0; i < bd; i++) sc[idx++] = bl[i];
1203 for (u32 i = 0; i < 4; i++) sc[idx++] = sc[i];
1209 int selector = (0x76543210 >> (pm4 * 4)) & 0xffff;
1213 for (i = 0; i < pd; i++) sc[idx++] = pw[i];
1215 | __byte_perm ( 0, bl[0], selector);
1216 for (i = 1; i < bd; i++) sc[idx++] = __byte_perm (bl[i - 1], bl[i], selector);
1217 sc[idx++] = __byte_perm (bl[i - 1], sc[0], selector);
1218 for (i = 1; i < 4; i++) sc[idx++] = __byte_perm (sc[i - 1], sc[i], selector);
1219 sc[idx++] = __byte_perm (sc[i - 1], 0, selector);
1223 __device__ static void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, const u32 pwbl_len)
1225 const u32 m = offset % pwbl_len;
1227 const u32 om = m % 4;
1228 const u32 od = m / 4;
1230 int selector = (0x76543210 >> (om * 4)) & 0xffff;
1232 pt[0] = __byte_perm (sc[od + 0], sc[od + 1], selector);
1233 pt[1] = __byte_perm (sc[od + 1], sc[od + 2], selector);
1234 pt[2] = __byte_perm (sc[od + 2], sc[od + 3], selector);
1235 pt[3] = __byte_perm (sc[od + 3], sc[od + 4], selector);
1238 __device__ static void make_w_with_offset (ctx_t *ctx, const u32 W_len, const u32 offset, const u32 *sc, const u32 pwbl_len, u32 *iv, const u32 *rek, u32 s_te0[256], u32 s_te1[256], u32 s_te2[256], u32 s_te3[256], u32 s_te4[256])
1240 for (u32 k = 0, wk = 0; k < W_len; k += AESSZ, wk += AESSZ4)
1244 make_pt_with_offset (pt, offset + k, sc, pwbl_len);
1251 AES128_encrypt (pt, iv, rek, s_te0, s_te1, s_te2, s_te3, s_te4);
1253 ctx->W32[wk + 0] = iv[0];
1254 ctx->W32[wk + 1] = iv[1];
1255 ctx->W32[wk + 2] = iv[2];
1256 ctx->W32[wk + 3] = iv[3];
1260 __device__ static u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, u32 s_te0[256], u32 s_te1[256], u32 s_te2[256], u32 s_te3[256], u32 s_te4[256])
1262 // make scratch buffer
1264 u32 sc[PWMAXSZ4 + BLMAXSZ4 + AESSZ4];
1266 make_sc (sc, pw, pw_len, ctx->dgst32, ctx->dgst_len);
1268 // make sure pwbl_len is calculcated before it gets changed
1270 const u32 pwbl_len = pw_len + ctx->dgst_len;
1276 iv[0] = ctx->dgst32[4];
1277 iv[1] = ctx->dgst32[5];
1278 iv[2] = ctx->dgst32[6];
1279 iv[3] = ctx->dgst32[7];
1285 AES128_ExpandKey (ctx->dgst32, rek, s_te0, s_te1, s_te2, s_te3, s_te4);
1287 // first call is special as the hash depends on the result of it
1288 // but since we do not know about the outcome at this time
1289 // we must use the max
1291 make_w_with_offset (ctx, WORDMAXSZ, 0, sc, pwbl_len, iv, rek, s_te0, s_te1, s_te2, s_te3, s_te4);
1293 // now we can find out hash to use
1297 for (u32 i = 0; i < 4; i++)
1299 sum += (ctx->W32[i] >> 24) & 0xff;
1300 sum += (ctx->W32[i] >> 16) & 0xff;
1301 sum += (ctx->W32[i] >> 8) & 0xff;
1302 sum += (ctx->W32[i] >> 0) & 0xff;
1309 case 0: ctx->dgst32[0] = SHA256M_A;
1310 ctx->dgst32[1] = SHA256M_B;
1311 ctx->dgst32[2] = SHA256M_C;
1312 ctx->dgst32[3] = SHA256M_D;
1313 ctx->dgst32[4] = SHA256M_E;
1314 ctx->dgst32[5] = SHA256M_F;
1315 ctx->dgst32[6] = SHA256M_G;
1316 ctx->dgst32[7] = SHA256M_H;
1317 ctx->dgst_len = BLSZ256;
1318 ctx->W_len = WORDSZ256;
1319 sha256_transform (&ctx->W32[ 0], &ctx->W32[ 4], &ctx->W32[ 8], &ctx->W32[12], ctx->dgst32);
1320 sha256_transform (&ctx->W32[16], &ctx->W32[20], &ctx->W32[24], &ctx->W32[28], ctx->dgst32);
1322 case 1: ctx->dgst64[0] = SHA384M_A;
1323 ctx->dgst64[1] = SHA384M_B;
1324 ctx->dgst64[2] = SHA384M_C;
1325 ctx->dgst64[3] = SHA384M_D;
1326 ctx->dgst64[4] = SHA384M_E;
1327 ctx->dgst64[5] = SHA384M_F;
1328 ctx->dgst64[6] = SHA384M_G;
1329 ctx->dgst64[7] = SHA384M_H;
1330 ctx->dgst_len = BLSZ384;
1331 ctx->W_len = WORDSZ384;
1332 sha384_transform (&ctx->W64[ 0], &ctx->W64[ 4], &ctx->W64[ 8], &ctx->W64[12], ctx->dgst64);
1334 case 2: ctx->dgst64[0] = SHA512M_A;
1335 ctx->dgst64[1] = SHA512M_B;
1336 ctx->dgst64[2] = SHA512M_C;
1337 ctx->dgst64[3] = SHA512M_D;
1338 ctx->dgst64[4] = SHA512M_E;
1339 ctx->dgst64[5] = SHA512M_F;
1340 ctx->dgst64[6] = SHA512M_G;
1341 ctx->dgst64[7] = SHA512M_H;
1342 ctx->dgst_len = BLSZ512;
1343 ctx->W_len = WORDSZ512;
1344 sha512_transform (&ctx->W64[ 0], &ctx->W64[ 4], &ctx->W64[ 8], &ctx->W64[12], ctx->dgst64);
1350 const u32 final_len = pwbl_len * 64;
1352 const u32 iter_max = ctx->W_len - (ctx->W_len / 8);
1357 for (offset = WORDMAXSZ, left = final_len - offset; left >= iter_max; offset += ctx->W_len, left -= ctx->W_len)
1359 make_w_with_offset (ctx, ctx->W_len, offset, sc, pwbl_len, iv, rek, s_te0, s_te1, s_te2, s_te3, s_te4);
1361 switch (ctx->dgst_len)
1363 case BLSZ256: sha256_transform (&ctx->W32[ 0], &ctx->W32[ 4], &ctx->W32[ 8], &ctx->W32[12], ctx->dgst32);
1365 case BLSZ384: sha384_transform (&ctx->W64[ 0], &ctx->W64[ 4], &ctx->W64[ 8], &ctx->W64[12], ctx->dgst64);
1367 case BLSZ512: sha512_transform (&ctx->W64[ 0], &ctx->W64[ 4], &ctx->W64[ 8], &ctx->W64[12], ctx->dgst64);
1376 switch (ctx->dgst_len)
1378 case BLSZ384: make_w_with_offset (ctx, 64, offset, sc, pwbl_len, iv, rek, s_te0, s_te1, s_te2, s_te3, s_te4);
1379 ctx->W64[ 8] = 0x80;
1386 ctx->W64[15] = swap_workaround ((u64) (final_len * 8));
1387 ex = ctx->W64[7] >> 56;
1389 case BLSZ512: make_w_with_offset (ctx, 64, offset, sc, pwbl_len, iv, rek, s_te0, s_te1, s_te2, s_te3, s_te4);
1390 ctx->W64[ 8] = 0x80;
1397 ctx->W64[15] = swap_workaround ((u64) (final_len * 8));
1398 ex = ctx->W64[7] >> 56;
1404 switch (ctx->dgst_len)
1406 case BLSZ256: ex = ctx->W32[15] >> 24;
1407 ctx->W32[ 0] = 0x80;
1422 ctx->W32[15] = swap_workaround (final_len * 8);
1424 case BLSZ384: ex = ctx->W64[15] >> 56;
1425 ctx->W64[ 0] = 0x80;
1440 ctx->W64[15] = swap_workaround ((u64) (final_len * 8));
1442 case BLSZ512: ex = ctx->W64[15] >> 56;
1443 ctx->W64[ 0] = 0x80;
1458 ctx->W64[15] = swap_workaround ((u64) (final_len * 8));
1463 switch (ctx->dgst_len)
1465 case BLSZ256: sha256_transform (&ctx->W32[ 0], &ctx->W32[ 4], &ctx->W32[ 8], &ctx->W32[12], ctx->dgst32);
1466 ctx->dgst32[ 0] = swap_workaround (ctx->dgst32[0]);
1467 ctx->dgst32[ 1] = swap_workaround (ctx->dgst32[1]);
1468 ctx->dgst32[ 2] = swap_workaround (ctx->dgst32[2]);
1469 ctx->dgst32[ 3] = swap_workaround (ctx->dgst32[3]);
1470 ctx->dgst32[ 4] = swap_workaround (ctx->dgst32[4]);
1471 ctx->dgst32[ 5] = swap_workaround (ctx->dgst32[5]);
1472 ctx->dgst32[ 6] = swap_workaround (ctx->dgst32[6]);
1473 ctx->dgst32[ 7] = swap_workaround (ctx->dgst32[7]);
1474 ctx->dgst32[ 8] = 0;
1475 ctx->dgst32[ 9] = 0;
1476 ctx->dgst32[10] = 0;
1477 ctx->dgst32[11] = 0;
1478 ctx->dgst32[12] = 0;
1479 ctx->dgst32[13] = 0;
1480 ctx->dgst32[14] = 0;
1481 ctx->dgst32[15] = 0;
1483 case BLSZ384: sha384_transform (&ctx->W64[ 0], &ctx->W64[ 4], &ctx->W64[ 8], &ctx->W64[12], ctx->dgst64);
1484 ctx->dgst64[0] = swap_workaround (ctx->dgst64[0]);
1485 ctx->dgst64[1] = swap_workaround (ctx->dgst64[1]);
1486 ctx->dgst64[2] = swap_workaround (ctx->dgst64[2]);
1487 ctx->dgst64[3] = swap_workaround (ctx->dgst64[3]);
1488 ctx->dgst64[4] = swap_workaround (ctx->dgst64[4]);
1489 ctx->dgst64[5] = swap_workaround (ctx->dgst64[5]);
1493 case BLSZ512: sha512_transform (&ctx->W64[ 0], &ctx->W64[ 4], &ctx->W64[ 8], &ctx->W64[12], ctx->dgst64);
1494 ctx->dgst64[0] = swap_workaround (ctx->dgst64[0]);
1495 ctx->dgst64[1] = swap_workaround (ctx->dgst64[1]);
1496 ctx->dgst64[2] = swap_workaround (ctx->dgst64[2]);
1497 ctx->dgst64[3] = swap_workaround (ctx->dgst64[3]);
1498 ctx->dgst64[4] = swap_workaround (ctx->dgst64[4]);
1499 ctx->dgst64[5] = swap_workaround (ctx->dgst64[5]);
1500 ctx->dgst64[6] = swap_workaround (ctx->dgst64[6]);
1501 ctx->dgst64[7] = swap_workaround (ctx->dgst64[7]);
1508 extern "C" __global__ void __launch_bounds__ (256, 1) m10700_init (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, pdf17l8_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const pdf_t *pdf_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
1514 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
1516 if (gid >= gid_max) return;
1520 w0[0] = pws[gid].i[0];
1521 w0[1] = pws[gid].i[1];
1522 w0[2] = pws[gid].i[2];
1523 w0[3] = pws[gid].i[3];
1525 const u32 pw_len = pws[gid].pw_len;
1533 salt_buf[0] = salt_bufs[salt_pos].salt_buf[0];
1534 salt_buf[1] = salt_bufs[salt_pos].salt_buf[1];
1536 const u32 salt_len = salt_bufs[salt_pos].salt_len;
1542 u32 block_len = pw_len;
1572 memcat8 (block0, block1, block2, block3, block_len, salt_buf);
1574 block_len += salt_len;
1576 append_0x80_2 (block0, block1, block_len);
1578 block3[3] = swap_workaround (block_len * 8);
1582 digest[0] = SHA256M_A;
1583 digest[1] = SHA256M_B;
1584 digest[2] = SHA256M_C;
1585 digest[3] = SHA256M_D;
1586 digest[4] = SHA256M_E;
1587 digest[5] = SHA256M_F;
1588 digest[6] = SHA256M_G;
1589 digest[7] = SHA256M_H;
1591 sha256_transform (block0, block1, block2, block3, digest);
1593 digest[0] = swap_workaround (digest[0]);
1594 digest[1] = swap_workaround (digest[1]);
1595 digest[2] = swap_workaround (digest[2]);
1596 digest[3] = swap_workaround (digest[3]);
1597 digest[4] = swap_workaround (digest[4]);
1598 digest[5] = swap_workaround (digest[5]);
1599 digest[6] = swap_workaround (digest[6]);
1600 digest[7] = swap_workaround (digest[7]);
1602 tmps[gid].dgst32[0] = digest[0];
1603 tmps[gid].dgst32[1] = digest[1];
1604 tmps[gid].dgst32[2] = digest[2];
1605 tmps[gid].dgst32[3] = digest[3];
1606 tmps[gid].dgst32[4] = digest[4];
1607 tmps[gid].dgst32[5] = digest[5];
1608 tmps[gid].dgst32[6] = digest[6];
1609 tmps[gid].dgst32[7] = digest[7];
1610 tmps[gid].dgst_len = BLSZ256;
1611 tmps[gid].W_len = WORDSZ256;
1614 extern "C" __global__ void __launch_bounds__ (256, 1) m10700_loop (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, pdf17l8_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const pdf_t *pdf_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
1616 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
1617 const u32 lid = threadIdx.x;
1623 __shared__ u32 s_te0[256];
1624 __shared__ u32 s_te1[256];
1625 __shared__ u32 s_te2[256];
1626 __shared__ u32 s_te3[256];
1627 __shared__ u32 s_te4[256];
1629 s_te0[lid] = te0[lid];
1630 s_te1[lid] = te1[lid];
1631 s_te2[lid] = te2[lid];
1632 s_te3[lid] = te3[lid];
1633 s_te4[lid] = te4[lid];
1637 if (gid >= gid_max) return;
1645 w0[0] = pws[gid].i[0];
1646 w0[1] = pws[gid].i[1];
1647 w0[2] = pws[gid].i[2];
1648 w0[3] = pws[gid].i[3];
1650 const u32 pw_len = pws[gid].pw_len;
1658 ctx.dgst64[0] = tmps[gid].dgst64[0];
1659 ctx.dgst64[1] = tmps[gid].dgst64[1];
1660 ctx.dgst64[2] = tmps[gid].dgst64[2];
1661 ctx.dgst64[3] = tmps[gid].dgst64[3];
1662 ctx.dgst64[4] = tmps[gid].dgst64[4];
1663 ctx.dgst64[5] = tmps[gid].dgst64[5];
1664 ctx.dgst64[6] = tmps[gid].dgst64[6];
1665 ctx.dgst64[7] = tmps[gid].dgst64[7];
1666 ctx.dgst_len = tmps[gid].dgst_len;
1667 ctx.W_len = tmps[gid].W_len;
1671 for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++)
1673 ex = do_round (w0, pw_len, &ctx, s_te0, s_te1, s_te2, s_te3, s_te4);
1676 if ((loop_pos + loop_cnt) == 64)
1678 for (u32 i = 64; i < ex + 32; i++)
1680 ex = do_round (w0, pw_len, &ctx, s_te0, s_te1, s_te2, s_te3, s_te4);
1684 tmps[gid].dgst64[0] = ctx.dgst64[0];
1685 tmps[gid].dgst64[1] = ctx.dgst64[1];
1686 tmps[gid].dgst64[2] = ctx.dgst64[2];
1687 tmps[gid].dgst64[3] = ctx.dgst64[3];
1688 tmps[gid].dgst64[4] = ctx.dgst64[4];
1689 tmps[gid].dgst64[5] = ctx.dgst64[5];
1690 tmps[gid].dgst64[6] = ctx.dgst64[6];
1691 tmps[gid].dgst64[7] = ctx.dgst64[7];
1692 tmps[gid].dgst_len = ctx.dgst_len;
1693 tmps[gid].W_len = ctx.W_len;
1696 extern "C" __global__ void __launch_bounds__ (256, 1) m10700_comp (const pw_t *pws, const gpu_rule_t *rules_buf, const comb_t *combs_buf, const bf_t *bfs_buf, pdf17l8_tmp_t *tmps, void *hooks, const u32 *bitmaps_buf_s1_a, const u32 *bitmaps_buf_s1_b, const u32 *bitmaps_buf_s1_c, const u32 *bitmaps_buf_s1_d, const u32 *bitmaps_buf_s2_a, const u32 *bitmaps_buf_s2_b, const u32 *bitmaps_buf_s2_c, const u32 *bitmaps_buf_s2_d, plain_t *plains_buf, const digest_t *digests_buf, u32 *hashes_shown, const salt_t *salt_bufs, const pdf_t *pdf_bufs, u32 *d_return_buf, u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
1702 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
1704 if (gid >= gid_max) return;
1706 const u32 lid = threadIdx.x;
1712 const u32x r0 = swap_workaround (tmps[gid].dgst32[DGST_R0]);
1713 const u32x r1 = swap_workaround (tmps[gid].dgst32[DGST_R1]);
1714 const u32x r2 = swap_workaround (tmps[gid].dgst32[DGST_R2]);
1715 const u32x r3 = swap_workaround (tmps[gid].dgst32[DGST_R3]);
1719 #include VECT_COMPARE_M