Merge pull request #225 from Fist0urs/format_krb5tgs
[hashcat.git] / OpenCL / types_ocl.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
// Numeric device-type ids; the host compiler defines DEVICE_TYPE to one of these.
#define DEVICE_TYPE_CPU 2
#define DEVICE_TYPE_GPU 4

// Short fixed-width aliases for the OpenCL built-in unsigned types.
typedef uchar u8;
typedef ushort u16;
typedef uint u32;
typedef ulong u64;

// Without the SIMD kernels, force scalar processing (one candidate per work-item).
#ifndef NEW_SIMD_CODE
#undef VECT_SIZE
#define VECT_SIZE 1
#endif

// uNx aliases map to scalar or native OpenCL vector types depending on VECT_SIZE.
#if VECT_SIZE == 1
typedef uchar u8x;
typedef ushort u16x;
typedef uint u32x;
typedef ulong u64x;
#endif

#if VECT_SIZE == 2
typedef uchar2 u8x;
typedef ushort2 u16x;
typedef uint2 u32x;
typedef ulong2 u64x;
#endif

#if VECT_SIZE == 4
typedef uchar4 u8x;
typedef ushort4 u16x;
typedef uint4 u32x;
typedef ulong4 u64x;
#endif

#if VECT_SIZE == 8
typedef uchar8 u8x;
typedef ushort8 u16x;
typedef uint8 u32x;
typedef ulong8 u64x;
#endif

// this one needs to die
#define allx(r) r
49
50 static inline u32 l32_from_64_S (u64 a)
51 {
52 const u32 r = (u32) (a);
53
54 return r;
55 }
56
57 static inline u32 h32_from_64_S (u64 a)
58 {
59 a >>= 32;
60
61 const u32 r = (u32) (a);
62
63 return r;
64 }
65
66 static inline u64 hl32_to_64_S (const u32 a, const u32 b)
67 {
68 return as_ulong ((uint2) (b, a));
69 }
70
// Vector variant of l32_from_64_S: take the low 32 bits of every 64-bit lane.
// The >= guards accumulate lane assignments for each wider vector width.
static inline u32x l32_from_64 (u64x a)
{
  u32x r;

  #if VECT_SIZE == 1
  r = (u32) a;
  #endif

  #if VECT_SIZE >= 2
  r.s0 = (u32) a.s0;
  r.s1 = (u32) a.s1;
  #endif

  #if VECT_SIZE >= 4
  r.s2 = (u32) a.s2;
  r.s3 = (u32) a.s3;
  #endif

  #if VECT_SIZE >= 8
  r.s4 = (u32) a.s4;
  r.s5 = (u32) a.s5;
  r.s6 = (u32) a.s6;
  r.s7 = (u32) a.s7;
  #endif

  return r;
}
98
// Vector variant of h32_from_64_S: take the high 32 bits of every 64-bit lane
// (the whole vector is shifted down first, then each lane is truncated).
static inline u32x h32_from_64 (u64x a)
{
  a >>= 32;

  u32x r;

  #if VECT_SIZE == 1
  r = (u32) a;
  #endif

  #if VECT_SIZE >= 2
  r.s0 = (u32) a.s0;
  r.s1 = (u32) a.s1;
  #endif

  #if VECT_SIZE >= 4
  r.s2 = (u32) a.s2;
  r.s3 = (u32) a.s3;
  #endif

  #if VECT_SIZE >= 8
  r.s4 = (u32) a.s4;
  r.s5 = (u32) a.s5;
  r.s6 = (u32) a.s6;
  r.s7 = (u32) a.s7;
  #endif

  return r;
}
128
// Vector variant of hl32_to_64_S: per lane, pair b (low) with a (high) and
// reinterpret the uint2 as a 64-bit value.
static inline u64x hl32_to_64 (const u32x a, const u32x b)
{
  u64x r;

  #if VECT_SIZE == 1
  r = as_ulong ((uint2) (b, a));
  #endif

  #if VECT_SIZE >= 2
  r.s0 = as_ulong ((uint2) (b.s0, a.s0));
  r.s1 = as_ulong ((uint2) (b.s1, a.s1));
  #endif

  #if VECT_SIZE >= 4
  r.s2 = as_ulong ((uint2) (b.s2, a.s2));
  r.s3 = as_ulong ((uint2) (b.s3, a.s3));
  #endif

  #if VECT_SIZE >= 8
  r.s4 = as_ulong ((uint2) (b.s4, a.s4));
  r.s5 = as_ulong ((uint2) (b.s5, a.s5));
  r.s6 = as_ulong ((uint2) (b.s6, a.s6));
  r.s7 = as_ulong ((uint2) (b.s7, a.s7));
  #endif

  return r;
}
156
#ifdef IS_AMD
// Byte swap via vector shuffle; AMD compiles the uchar4 permute natively.
static inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}

static inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}

// OpenCL rotate() counts are taken modulo the bit width, so 32 - n is safe
// even for n == 0.
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}

static inline u64 rotr64_S (const u64 a, const u32 n)
{
  #if DEVICE_TYPE == DEVICE_TYPE_CPU

  const u64 r = rotate (a, (u64) 64 - n);

  #else

  // On AMD GPUs a 64-bit rotate is cheaper as two amd_bitalign funnel
  // shifts on the 32-bit halves; the operand order flips once n >= 32.
  const u32 a0 = h32_from_64_S (a);
  const u32 a1 = l32_from_64_S (a);

  const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
  const u32 t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);

  const u64 r = hl32_to_64_S (t0, t1);

  #endif

  return r;
}

// Left rotate expressed through the right rotate.
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotr64_S (a, 64 - n);
}

// Vector byte swap done with shifts and masks (works for any VECT_SIZE).
static inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}

static inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}

static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}

// Vector counterpart of rotr64_S: same amd_bitalign half-split, applied
// lane-wise through the vector overloads.
static inline u64x rotr64 (const u64x a, const u32 n)
{
  #if DEVICE_TYPE == DEVICE_TYPE_CPU

  const u64x r = rotate (a, (u64) 64 - n);

  #else

  const u32x a0 = h32_from_64 (a);
  const u32x a1 = l32_from_64 (a);

  const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
  const u32x t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);

  const u64x r = hl32_to_64 (t0, t1);

  #endif

  return r;
}

static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotr64 (a, 64 - n);
}

// Bit-field extract: c bits of a starting at bit b (AMD media op).
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  return amd_bfe (a, b, c);
}

// Thin wrapper over the AMD byte-align media op.
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  return amd_bytealign (a, b, c);
}
#endif
270
#ifdef IS_NV
// Byte swap using the PTX prmt (byte permute) instruction.
static inline u32 swap32_S (const u32 v)
{
  u32 r;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));

  return r;
}

// 64-bit byte swap: split into 32-bit halves, byte-permute each half, then
// re-pair the halves in swapped order (tr, tl) for the full reversal.
static inline u64 swap64_S (const u64 v)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));

  u32 tl;
  u32 tr;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));

  return r;
}

// rotate() counts are modulo 32, so 32 - n is safe even for n == 0.
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
310
#if CUDA_ARCH >= 350
// 64-bit right rotate via the sm_35+ funnel-shift instruction (shf.r.wrap)
// applied to the two 32-bit halves; shf wraps the shift count modulo 32.
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  u32 tl;
  u32 tr;

  if (n >= 32)
  {
    // Rotations of 32..63: halves swap roles, residual count is n - 32.
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  return r;
}
#else
// Pre-sm_35 fallback: plain OpenCL rotate.
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
#endif
345
// Left rotate expressed through the right rotate.
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotr64_S (a, 64 - n);
}
350
#if CUDA_ARCH >= 500
// lut3_XX_S: three-input logic op via the sm_50+ LOP3.LUT instruction.
// The hex suffix in the name is the 8-bit truth table applied to (a, b, c).
static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

// 0x96 is the three-way XOR table (a ^ b ^ c), heavily used by SHA kernels.
static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
#endif
415
// Scalar byte permute: selector c picks bytes from the a:b pair (PTX prmt).
static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
424
425 static inline u32x swap32 (const u32x v)
426 {
427 return ((v >> 24) & 0x000000ff)
428 | ((v >> 8) & 0x0000ff00)
429 | ((v << 8) & 0x00ff0000)
430 | ((v << 24) & 0xff000000);
431 }
432
433 static inline u64x swap64 (const u64x v)
434 {
435 return ((v >> 56) & 0x00000000000000ff)
436 | ((v >> 40) & 0x000000000000ff00)
437 | ((v >> 24) & 0x0000000000ff0000)
438 | ((v >> 8) & 0x00000000ff000000)
439 | ((v << 8) & 0x000000ff00000000)
440 | ((v << 24) & 0x0000ff0000000000)
441 | ((v << 40) & 0x00ff000000000000)
442 | ((v << 56) & 0xff00000000000000);
443 }
444
// rotate() counts are modulo 32, so 32 - n is safe even for n == 0.
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
454
#if CUDA_ARCH >= 350
// Per-lane 64-bit right rotate using the sm_35+ funnel shift (shf.r.wrap).
// Each 64-bit lane is split into 32-bit halves; for n >= 32 the halves swap
// roles and the residual count n - 32 is used (shf wraps counts modulo 32).
// Temporaries il/ir/tl/tr are reused across lanes via the brace scopes.
static inline u64x rotr64 (const u64x a, const u32 n)
{
  u64x r;

  u32 il;
  u32 ir;
  u32 tl;
  u32 tr;

  #if VECT_SIZE == 1

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  #endif

  #if VECT_SIZE >= 2

  // lane s0
  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s0));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s0) : "r"(tl), "r"(tr));
  }

  // lane s1
  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s1));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s1) : "r"(tl), "r"(tr));
  }

  #endif

  #if VECT_SIZE >= 4

  // lane s2
  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s2));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s2) : "r"(tl), "r"(tr));
  }

  // lane s3
  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s3));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s3) : "r"(tl), "r"(tr));
  }

  #endif

  #if VECT_SIZE >= 8

  // lane s4
  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s4));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s4) : "r"(tl), "r"(tr));
  }

  // lane s5
  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s5));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s5) : "r"(tl), "r"(tr));
  }

  // lane s6
  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s6));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s6) : "r"(tl), "r"(tr));
  }

  // lane s7
  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s7));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s7) : "r"(tl), "r"(tr));
  }

  #endif

  return r;
}
#else
// Pre-sm_35 fallback: plain OpenCL rotate (lane-wise).
static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
#endif
640
641 static inline u64x rotl64 (const u64x a, const u32 n)
642 {
643 return rotr64 (a, (u64) 64 - n);
644 }
645
// Vector byte permute: per lane, selector c picks bytes from the a:b pair
// (PTX prmt instruction).
static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) );
  #endif

  #if VECT_SIZE >= 2
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
  #endif

  #if VECT_SIZE >= 4
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
  #endif

  #if VECT_SIZE >= 8
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
  #endif

  return r;
}
673
// Bit-field extract: c bits of a starting at bit b (PTX bfe.u32).
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
682
#if CUDA_ARCH >= 350
// Emulate AMD's byte-align with the funnel shift: concatenate a (high) and
// b (low), shift right by (c & 3) bytes, return the low 32 bits.
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));

  return r;
}
#else
// Pre-sm_35: emulate the byte alignment with a prmt byte-permute selector.
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
}
#endif
698
#if CUDA_ARCH >= 500
// Per-lane three-input logic op via LOP3.LUT, truth table 0x2d.
static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
  #endif

  #if VECT_SIZE >= 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  #endif

  #if VECT_SIZE >= 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  #endif

  #if VECT_SIZE >= 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
  #endif

  return r;
}
727
728 static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
729 {
730 u32x r;
731
732 #if VECT_SIZE == 1
733 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
734 #endif
735
736 #if VECT_SIZE == 2
737 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
738 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
739 #endif
740
741 #if VECT_SIZE == 4
742 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
743 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
744 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
745 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
746 #endif
747
748 #if VECT_SIZE == 8
749 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
750 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
751 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
752 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
753 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
754 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
755 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
756 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
757 #endif
758
759 return r;
760 }
761
762 static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
763 {
764 u32x r;
765
766 #if VECT_SIZE == 1
767 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
768 #endif
769
770 #if VECT_SIZE == 2
771 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
772 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
773 #endif
774
775 #if VECT_SIZE == 4
776 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
777 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
778 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
779 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
780 #endif
781
782 #if VECT_SIZE == 8
783 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
784 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
785 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
786 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
787 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
788 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
789 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
790 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
791 #endif
792
793 return r;
794 }
795
796 static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
797 {
798 u32x r;
799
800 #if VECT_SIZE == 1
801 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
802 #endif
803
804 #if VECT_SIZE == 2
805 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
806 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
807 #endif
808
809 #if VECT_SIZE == 4
810 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
811 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
812 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
813 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
814 #endif
815
816 #if VECT_SIZE == 8
817 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
818 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
819 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
820 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
821 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
822 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
823 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
824 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
825 #endif
826
827 return r;
828 }
829
830 static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
831 {
832 u32x r;
833
834 #if VECT_SIZE == 1
835 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
836 #endif
837
838 #if VECT_SIZE == 2
839 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
840 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
841 #endif
842
843 #if VECT_SIZE == 4
844 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
845 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
846 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
847 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
848 #endif
849
850 #if VECT_SIZE == 8
851 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
852 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
853 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
854 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
855 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
856 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
857 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
858 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
859 #endif
860
861 return r;
862 }
863
864 static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
865 {
866 u32x r;
867
868 #if VECT_SIZE == 1
869 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
870 #endif
871
872 #if VECT_SIZE == 2
873 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
874 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
875 #endif
876
877 #if VECT_SIZE == 4
878 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
879 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
880 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
881 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
882 #endif
883
884 #if VECT_SIZE == 8
885 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
886 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
887 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
888 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
889 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
890 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
891 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
892 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
893 #endif
894
895 return r;
896 }
897
898 static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
899 {
900 u32x r;
901
902 #if VECT_SIZE == 1
903 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
904 #endif
905
906 #if VECT_SIZE == 2
907 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
908 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
909 #endif
910
911 #if VECT_SIZE == 4
912 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
913 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
914 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
915 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
916 #endif
917
918 #if VECT_SIZE == 8
919 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
920 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
921 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
922 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
923 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
924 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
925 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
926 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
927 #endif
928
929 return r;
930 }
931
932 #endif
933 #endif
934
#ifdef IS_GENERIC
// Portable fallbacks for devices with no vendor intrinsics.
static inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}

static inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}

// OpenCL rotate() counts are modulo the bit width, so 32 - n is safe for n == 0.
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}

static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) n);
}

// Portable byte-align: concatenate a (high) and b (low), shift right by
// (c & 3) bytes, return the low 32 bits.
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

  return (u32) (tmp);
}
972
// Vector byte swap done with shifts and masks (works for any VECT_SIZE).
static inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}

static inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}

static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}

static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) n);
}
1012
// Software bit-field extract: return c bits of a starting at bit b.
// NOTE(review): BIT/BIT_MASK/BFE stay #defined after this function and leak
// into every file that includes this one — confirm no downstream user relies
// on them before adding #undefs here.
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  #define BIT(x) (1 << (x))
  #define BIT_MASK(x) (BIT (x) - 1)
  #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))

  return BFE (a, b, c);
}
1021
// Vector byte-align: per width, build 64-bit lanes from the (a, b) 32-bit
// lane pairs via the (u64x)(...) constructor, shift right by (c & 3) bytes,
// and return the truncated low words.
static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
{
  #if VECT_SIZE == 1
  const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);

  return (u32x) (tmp);
  #endif

  #if VECT_SIZE == 2
  const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1);
  #endif

  #if VECT_SIZE == 4
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);
  #endif

  #if VECT_SIZE == 8
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
  #endif
}
#endif // IS_GENERIC
1049
/**
 * Per-algorithm digest container; the active branch is selected by the
 * hash-type macro the host defines when compiling each kernel. Sizes are
 * in 32-bit words.
 *
 * Cleanup: removed two unreachable duplicate branches. _SCRYPT_ and
 * _LOTUS8_ each appeared twice in the #elif chain; only the first
 * occurrence of each could ever be selected, so the later duplicates
 * (with a conflicting [4] size for _SCRYPT_) were dead code.
 */
typedef struct
{
#if defined _DES_
  u32 digest_buf[4];
#elif defined _MD4_
  u32 digest_buf[4];
#elif defined _MD5_
  u32 digest_buf[4];
#elif defined _MD5H_
  u32 digest_buf[4];
#elif defined _SHA1_
  u32 digest_buf[5];
#elif defined _BCRYPT_
  u32 digest_buf[6];
#elif defined _SHA256_
  u32 digest_buf[8];
#elif defined _SHA384_
  u32 digest_buf[16];
#elif defined _SHA512_
  u32 digest_buf[16];
#elif defined _KECCAK_
  u32 digest_buf[50];
#elif defined _RIPEMD160_
  u32 digest_buf[5];
#elif defined _WHIRLPOOL_
  u32 digest_buf[16];
#elif defined _GOST_
  u32 digest_buf[8];
#elif defined _GOST2012_256_
  u32 digest_buf[8];
#elif defined _GOST2012_512_
  u32 digest_buf[16];
#elif defined _SAPB_
  u32 digest_buf[4];
#elif defined _SAPG_
  u32 digest_buf[5];
#elif defined _MYSQL323_
  u32 digest_buf[4];
#elif defined _LOTUS5_
  u32 digest_buf[4];
#elif defined _LOTUS6_
  u32 digest_buf[4];
#elif defined _SCRYPT_
  u32 digest_buf[8];
#elif defined _LOTUS8_
  u32 digest_buf[4];
#elif defined _OFFICE2007_
  u32 digest_buf[4];
#elif defined _OFFICE2010_
  u32 digest_buf[4];
#elif defined _OFFICE2013_
  u32 digest_buf[4];
#elif defined _OLDOFFICE01_
  u32 digest_buf[4];
#elif defined _OLDOFFICE34_
  u32 digest_buf[4];
#elif defined _SIPHASH_
  u32 digest_buf[4];
#elif defined _PBKDF2_MD5_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA1_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA256_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA512_
  u32 digest_buf[32];
#elif defined _PDF17L8_
  u32 digest_buf[8];
#elif defined _CRC32_
  u32 digest_buf[4];
#elif defined _SEVEN_ZIP_
  u32 digest_buf[4];
#elif defined _ANDROIDFDE_
  u32 digest_buf[4];
#elif defined _DCC2_
  u32 digest_buf[4];
#elif defined _WPA_
  u32 digest_buf[4];
#elif defined _MD5_SHA1_
  u32 digest_buf[4];
#elif defined _SHA1_MD5_
  u32 digest_buf[5];
#elif defined _NETNTLMV2_
  u32 digest_buf[4];
#elif defined _KRB5PA_
  u32 digest_buf[4];
#elif defined _CLOUDKEY_
  u32 digest_buf[8];
#elif defined _PSAFE2_
  u32 digest_buf[5];
#elif defined _RAR3_
  u32 digest_buf[4];
#elif defined _SHA256_SHA1_
  u32 digest_buf[8];
#elif defined _MS_DRSR_
  u32 digest_buf[8];
#elif defined _ANDROIDFDE_SAMSUNG_
  u32 digest_buf[8];
#elif defined _RAR5_
  u32 digest_buf[4];
#elif defined _KRB5TGS_
  u32 digest_buf[4];
#endif

} digest_t;
1159
// Generic per-hash salt record shared by all kernels.
// NOTE(review): layout is presumably mirrored by a host-side salt_t; do not
// reorder or resize fields without checking the host code.
typedef struct
{
// Raw salt words, plus an optional precomputed variant (salt_buf_pc).
u32 salt_buf[16];
u32 salt_buf_pc[8];

// salt_len is in bytes; salt_iter is the iteration count for slow hashes;
// salt_sign carries mode-specific signature/flag words.
u32 salt_len;
u32 salt_iter;
u32 salt_sign[2];

// Per-algorithm digest lengths for Keccak and TrueCrypt modes.
u32 keccak_mdlen;
u32 truecrypt_mdlen;

// Bookkeeping over the digests attached to this salt.
u32 digests_cnt;
u32 digests_done;

u32 digests_offset;

// scrypt cost parameters (N, r, p) plus tmto/phy tuning values.
u32 scrypt_N;
u32 scrypt_r;
u32 scrypt_p;
u32 scrypt_tmto;
u32 scrypt_phy;

} salt_t;
1184
// ---------------------------------------------------------------------------
// Per-hash-mode "esalt" structures: extra, algorithm-specific input material
// that does not fit the generic salt_t. Buffer sizes are in 32-bit words;
// *_len fields are in bytes. Field meanings below are inferred from the
// names -- NOTE(review): presumably mirrored by host-side structs; do not
// reorder or resize fields without checking the host code.
// ---------------------------------------------------------------------------

// PDF encryption parameters: version/revision/permissions, /ID, /U and /O
// strings, and cached RC4 key/data words.
typedef struct
{
int V;
int R;
int P;

int enc_md;

u32 id_buf[8];
u32 u_buf[32];
u32 o_buf[32];

int id_len;
int o_len;
int u_len;

u32 rc4key[2];
u32 rc4data[2];

} pdf_t;

// WPA/WPA2 handshake material: PKE data, EAPOL frame and key version.
typedef struct
{
u32 pke[25];
u32 eapol[64];
int eapol_size;
int keyver;

} wpa_t;

// Bitcoin wallet: encrypted master key, ckey and public-key blobs.
typedef struct
{
u32 cry_master_buf[64];
u32 ckey_buf[64];
u32 public_key_buf[64];

u32 cry_master_len;
u32 ckey_len;
u32 public_key_len;

} bitcoin_wallet_t;

// SIP digest authentication: static salt part plus dynamic esalt part.
typedef struct
{
u32 salt_buf[30];
u32 salt_len;

u32 esalt_buf[38];
u32 esalt_len;

} sip_t;

// Android FDE: raw encrypted disk data block.
typedef struct
{
u32 data[384];

} androidfde_t;

// IKE PSK: nonce/response ("nr") and message buffers.
typedef struct
{
u32 nr_buf[16];
u32 nr_len;

u32 msg_buf[128];
u32 msg_len;

} ikepsk_t;

// NetNTLM: user+domain string and server/client challenge buffers.
typedef struct
{
u32 user_len;
u32 domain_len;
u32 srvchall_len;
u32 clichall_len;

u32 userdomain_buf[64];
u32 chall_buf[256];

} netntlm_t;

// Kerberos 5 pre-auth (AS-REQ): principal, realm, salt, encrypted
// timestamp and its checksum.
typedef struct
{
u32 user[16];
u32 realm[16];
u32 salt[32];
u32 timestamp[16];
u32 checksum[4];

} krb5pa_t;

// Kerberos 5 TGS-REP: account info, checksum and the encrypted ticket
// payload (edata2); edata2_len is in bytes.
typedef struct
{
u32 account_info[512];
u32 checksum[4];
u32 edata2[2560];
u32 edata2_len;

} krb5tgs_t;

// TrueCrypt: salt, encrypted volume header and optional keyfile material.
typedef struct
{
u32 salt_buf[16];
u32 data_buf[112];
u32 keyfile_buf[16];

} tc_t;
1291
// Salt layouts for the PBKDF2 family -- identical shape, buffer size
// differs only for the SHA-512 variant.
typedef struct
{
u32 salt_buf[16];

} pbkdf2_md5_t;

typedef struct
{
u32 salt_buf[16];

} pbkdf2_sha1_t;

typedef struct
{
u32 salt_buf[16];

} pbkdf2_sha256_t;

typedef struct
{
u32 salt_buf[32];

} pbkdf2_sha512_t;

// RAKP (IPMI 2.0): session data that gets HMAC'ed; salt_len in bytes.
typedef struct
{
u32 salt_buf[128];
u32 salt_len;

} rakp_t;

// CloudKey: encrypted data blob; data_len in bytes.
typedef struct
{
u32 data_len;
u32 data_buf[512];

} cloudkey_t;
1329
// MS Office esalts: encrypted verifier + verifier hash, sized per format
// generation. office2007_t additionally carries the key size.
typedef struct
{
u32 encryptedVerifier[4];
u32 encryptedVerifierHash[5];

u32 keySize;

} office2007_t;

typedef struct
{
u32 encryptedVerifier[4];
u32 encryptedVerifierHash[8];

} office2010_t;

typedef struct
{
u32 encryptedVerifier[4];
u32 encryptedVerifierHash[8];

} office2013_t;

// Legacy Office formats; rc4key[2] is presumably cached RC4 key material --
// TODO(review): confirm against the kernels that use it.
typedef struct
{
u32 version;
u32 encryptedVerifier[4];
u32 encryptedVerifierHash[4];
u32 rc4key[2];

} oldoffice01_t;

typedef struct
{
u32 version;
u32 encryptedVerifier[4];
u32 encryptedVerifierHash[5];
u32 rc4key[2];

} oldoffice34_t;
1370
// ---------------------------------------------------------------------------
// "tmp" structures: per-candidate intermediate state carried between kernel
// invocations of iterated (slow) hashes.
// ---------------------------------------------------------------------------

typedef struct
{
u32 digest[4];
u32 out[4];

} pdf14_tmp_t;

typedef struct
{
// Running digest, accessible both as 16 u32 words and 8 u64 words.
union
{
u32 dgst32[16];
u64 dgst64[8];
};

u32 dgst_len;
u32 W_len;

} pdf17l8_tmp_t;

typedef struct
{
u32 digest_buf[4];

} phpass_tmp_t;

typedef struct
{
u32 digest_buf[4];

} md5crypt_tmp_t;
1402
// sha256crypt state: alternate digest plus P/S byte sequences.
typedef struct
{
u32 alt_result[8];

u32 p_bytes[4];
u32 s_bytes[4];

} sha256crypt_tmp_t;

// sha512crypt state -- same shape, 64-bit words.
typedef struct
{
u64 l_alt_result[8];

u64 l_p_bytes[2];
u64 l_s_bytes[2];

} sha512crypt_tmp_t;

// HMAC-style ipad/opad state plus running digest/output words.
typedef struct
{
u32 ipad[5];
u32 opad[5];

u32 dgst[10];
u32 out[10];

} wpa_tmp_t;

typedef struct
{
u64 dgst[8];

} bitcoin_wallet_tmp_t;

typedef struct
{
u32 ipad[5];
u32 opad[5];

u32 dgst[5];
u32 out[4];

} dcc2_tmp_t;
1446
// bcrypt/Blowfish state: expanded key material (E, P) and the four S-boxes.
typedef struct
{
u32 E[18];

u32 P[18];

u32 S0[256];
u32 S1[256];
u32 S2[256];
u32 S3[256];

} bcrypt_tmp_t;

// Password Safe v2: digest plus a Blowfish-like P/S-box state.
typedef struct
{
u32 digest[2];

u32 P[18];

u32 S0[256];
u32 S1[256];
u32 S2[256];
u32 S3[256];

} pwsafe2_tmp_t;

typedef struct
{
u32 digest_buf[8];

} pwsafe3_tmp_t;

typedef struct
{
u32 digest_buf[5];

} androidpin_tmp_t;

// HMAC ipad/opad state plus running digest/output words.
typedef struct
{
u32 ipad[5];
u32 opad[5];

u32 dgst[10];
u32 out[10];

} androidfde_tmp_t;
1494
// TrueCrypt PBKDF2 state, 32-bit word variant.
typedef struct
{
u32 ipad[16];
u32 opad[16];

u32 dgst[64];
u32 out[64];

} tc_tmp_t;

// TrueCrypt PBKDF2 state, 64-bit word variant (SHA-512/Whirlpool class).
typedef struct
{
u64 ipad[8];
u64 opad[8];

u64 dgst[32];
u64 out[32];

} tc64_tmp_t;

// PBKDF2 iteration state per PRF: HMAC ipad/opad plus running
// digest/output; ipad/opad width matches the PRF's state size.
typedef struct
{
u32 ipad[4];
u32 opad[4];

u32 dgst[32];
u32 out[32];

} pbkdf2_md5_tmp_t;

typedef struct
{
u32 ipad[5];
u32 opad[5];

u32 dgst[32];
u32 out[32];

} pbkdf2_sha1_tmp_t;

typedef struct
{
u32 ipad[8];
u32 opad[8];

u32 dgst[32];
u32 out[32];

} pbkdf2_sha256_tmp_t;

typedef struct
{
u64 ipad[8];
u64 opad[8];

u64 dgst[16];
u64 out[16];

} pbkdf2_sha512_tmp_t;
1554
typedef struct
{
u64 out[8];

} ecryptfs_tmp_t;

// Oracle T-type: 64-bit HMAC state plus running digest/output.
typedef struct
{
u64 ipad[8];
u64 opad[8];

u64 dgst[16];
u64 out[16];

} oraclet_tmp_t;

typedef struct
{
u32 ipad[5];
u32 opad[5];

u32 dgst[5];
u32 out[5];

} agilekey_tmp_t;

// blockchain "My Wallet": two interleaved digest/output streams over one
// shared HMAC ipad/opad state.
typedef struct
{
u32 ipad[5];
u32 opad[5];

u32 dgst1[5];
u32 out1[5];

u32 dgst2[5];
u32 out2[5];

} mywallet_tmp_t;

// AIX {ssha*} variants: HMAC state sized to the underlying hash.
typedef struct
{
u32 ipad[5];
u32 opad[5];

u32 dgst[5];
u32 out[5];

} sha1aix_tmp_t;

typedef struct
{
u32 ipad[8];
u32 opad[8];

u32 dgst[8];
u32 out[8];

} sha256aix_tmp_t;

typedef struct
{
u64 ipad[8];
u64 opad[8];

u64 dgst[8];
u64 out[8];

} sha512aix_tmp_t;
1623
typedef struct
{
u32 ipad[8];
u32 opad[8];

u32 dgst[8];
u32 out[8];

} lastpass_tmp_t;

typedef struct
{
u64 digest_buf[8];

} drupal7_tmp_t;

typedef struct
{
u32 ipad[5];
u32 opad[5];

u32 dgst[5];
u32 out[5];

} lotus8_tmp_t;

// Office iterated-hash outputs; 2013 uses 64-bit words (SHA-512 based).
typedef struct
{
u32 out[5];

} office2007_tmp_t;

typedef struct
{
u32 out[5];

} office2010_tmp_t;

typedef struct
{
u64 out[8];

} office2013_tmp_t;

typedef struct
{
u32 digest_buf[5];

} saph_sha1_tmp_t;

// 7-Zip iterated SHA-256: partial input block, running digest and lengths
// (block_len/final_len presumably in bytes -- TODO confirm in kernel).
typedef struct
{
u32 block[16];

u32 dgst[8];

u32 block_len;
u32 final_len;

} seven_zip_tmp_t;

// BSDi crypt: DES key schedule halves (Kc/Kd) and IV.
typedef struct
{
u32 Kc[16];
u32 Kd[16];

u32 iv[2];

} bsdicrypt_tmp_t;

// RAR3: 17 intermediate SHA-1 digests of 5 words each.
typedef struct
{
u32 dgst[17][5];

} rar3_tmp_t;
1699
typedef struct
{
u32 user[16];

} cram_md5_t;

// 7-Zip container metadata: IV, salt, CRC and the encrypted data block.
typedef struct
{
u32 iv_buf[4];
u32 iv_len;

u32 salt_buf[4];
u32 salt_len;

u32 crc;

u32 data_buf[96];
u32 data_len;

u32 unpack_size;

} seven_zip_t;

// One entry of the markov-statistics table used by the generator.
typedef struct
{
u32 key;
u64 val;

} hcstat_table_t;

// Charset: up to 0x100 candidate characters plus the used length.
typedef struct
{
u32 cs_buf[0x100];
u32 cs_len;

} cs_t;

// A compiled rule: sequence of packed rule opcodes.
typedef struct
{
u32 cmds[0x100];

} kernel_rule_t;

// Location of a cracked plain: work-item id/value and inner-loop position.
typedef struct
{
u32 gidvid;
u32 il_pos;

} plain_t;

// Candidate password: raw words plus length in bytes; trailing placeholder
// words pad the struct (presumably for host/device size agreement --
// TODO confirm).
typedef struct
{
u32 i[64];

u32 pw_len;

u32 alignment_placeholder_1;
u32 alignment_placeholder_2;
u32 alignment_placeholder_3;

} pw_t;
1761
// Brute-force position: a single packed candidate word.
typedef struct
{
u32 i;

} bf_t;

// Combinator attack: right-hand word (up to 32 bytes) plus its length.
typedef struct
{
u32 i[8];

u32 pw_len;

} comb_t;

// Presumably a bit-sliced word (one u32 per bit plane) -- TODO confirm
// against the bitslice DES kernels.
typedef struct
{
u32 b[32];

} bs_word_t;

// scrypt scratch area: 64 uint4 vectors of mixing state.
typedef struct
{
uint4 P[64];

} scrypt_tmp_t;