Merge pull request #184 from gm4tr1x/fixHelp
[hashcat.git] / OpenCL / types_ocl.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
// Device-type discriminators tested by DEVICE_TYPE compile-time switches below.
#define DEVICE_TYPE_CPU 2
#define DEVICE_TYPE_GPU 4

// Fixed-width scalar shorthands built on OpenCL's uchar/ushort/uint/ulong.
typedef uchar u8;
typedef ushort u16;
typedef uint u32;
typedef ulong u64;

// Unless the kernel opts in to the SIMD code path, force scalar operation
// (VECT_SIZE 1) so the uNNx aliases below collapse to plain scalars.
#ifndef NEW_SIMD_CODE
#undef VECT_SIZE
#define VECT_SIZE 1
#endif

// uNNx: scalar-or-vector element types whose lane count tracks VECT_SIZE
// (1, 2, 4 or 8). Exactly one of the following groups is compiled.
#if VECT_SIZE == 1
typedef uchar u8x;
typedef ushort u16x;
typedef uint u32x;
typedef ulong u64x;
#endif

#if VECT_SIZE == 2
typedef uchar2 u8x;
typedef ushort2 u16x;
typedef uint2 u32x;
typedef ulong2 u64x;
#endif

#if VECT_SIZE == 4
typedef uchar4 u8x;
typedef ushort4 u16x;
typedef uint4 u32x;
typedef ulong4 u64x;
#endif

#if VECT_SIZE == 8
typedef uchar8 u8x;
typedef ushort8 u16x;
typedef uint8 u32x;
typedef ulong8 u64x;
#endif

// this one needs to die
#define allx(r) r
49
50 static inline u32 l32_from_64_S (u64 a)
51 {
52 const u32 r = (u32) (a);
53
54 return r;
55 }
56
57 static inline u32 h32_from_64_S (u64 a)
58 {
59 a >>= 32;
60
61 const u32 r = (u32) (a);
62
63 return r;
64 }
65
66 static inline u64 hl32_to_64_S (const u32 a, const u32 b)
67 {
68 return as_ulong ((uint2) (b, a));
69 }
70
// Low 32 bits of each 64-bit lane (vector counterpart of l32_from_64_S).
static inline u32x l32_from_64 (u64x a)
{
  u32x r;

  #if VECT_SIZE == 1
  r = (u32) a;
  #endif

  // Per-lane truncation; each cascading #if level covers wider vectors.
  #if VECT_SIZE >= 2
  r.s0 = (u32) a.s0;
  r.s1 = (u32) a.s1;
  #endif

  #if VECT_SIZE >= 4
  r.s2 = (u32) a.s2;
  r.s3 = (u32) a.s3;
  #endif

  #if VECT_SIZE >= 8
  r.s4 = (u32) a.s4;
  r.s5 = (u32) a.s5;
  r.s6 = (u32) a.s6;
  r.s7 = (u32) a.s7;
  #endif

  return r;
}
98
// High 32 bits of each 64-bit lane (vector counterpart of h32_from_64_S).
static inline u32x h32_from_64 (u64x a)
{
  // Shift the high word down first, then truncate each lane.
  a >>= 32;

  u32x r;

  #if VECT_SIZE == 1
  r = (u32) a;
  #endif

  #if VECT_SIZE >= 2
  r.s0 = (u32) a.s0;
  r.s1 = (u32) a.s1;
  #endif

  #if VECT_SIZE >= 4
  r.s2 = (u32) a.s2;
  r.s3 = (u32) a.s3;
  #endif

  #if VECT_SIZE >= 8
  r.s4 = (u32) a.s4;
  r.s5 = (u32) a.s5;
  r.s6 = (u32) a.s6;
  r.s7 = (u32) a.s7;
  #endif

  return r;
}
128
// Combine high (a) / low (b) 32-bit vectors into 64-bit lanes, per lane.
static inline u64x hl32_to_64 (const u32x a, const u32x b)
{
  u64x r;

  // (uint2)(low, high) reinterpreted as ulong: little-endian word order.
  #if VECT_SIZE == 1
  r = as_ulong ((uint2) (b, a));
  #endif

  #if VECT_SIZE >= 2
  r.s0 = as_ulong ((uint2) (b.s0, a.s0));
  r.s1 = as_ulong ((uint2) (b.s1, a.s1));
  #endif

  #if VECT_SIZE >= 4
  r.s2 = as_ulong ((uint2) (b.s2, a.s2));
  r.s3 = as_ulong ((uint2) (b.s3, a.s3));
  #endif

  #if VECT_SIZE >= 8
  r.s4 = as_ulong ((uint2) (b.s4, a.s4));
  r.s5 = as_ulong ((uint2) (b.s5, a.s5));
  r.s6 = as_ulong ((uint2) (b.s6, a.s6));
  r.s7 = as_ulong ((uint2) (b.s7, a.s7));
  #endif

  return r;
}
156
#ifdef IS_AMD
// Byte-swap (endianness flip) of a 32-bit scalar via uchar4 lane reversal.
static inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}
162
// Byte-swap (endianness flip) of a 64-bit scalar via uchar8 lane reversal.
static inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}
167
168 static inline u32 rotr32_S (const u32 a, const u32 n)
169 {
170 return rotate (a, 32 - n);
171 }
172
173 static inline u32 rotl32_S (const u32 a, const u32 n)
174 {
175 return rotate (a, n);
176 }
177
// Rotate a 64-bit scalar right by n.
static inline u64 rotr64_S (const u64 a, const u32 n)
{
#if DEVICE_TYPE == DEVICE_TYPE_CPU

  // CPU path: native 64-bit rotate is cheap.
  const u64 r = rotate (a, (u64) 64 - n);

#else

  // GPU path: compose the 64-bit rotate from two 32-bit funnel shifts.
  // amd_bitalign (hi, lo, s) extracts 32 bits of the 64-bit value hi:lo
  // shifted right by s (shift honored modulo 32 — hence the n split below).
  const u32 a0 = h32_from_64_S (a);
  const u32 a1 = l32_from_64_S (a);

  // For n >= 32 the two halves swap roles and the shift becomes n - 32.
  const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
  const u32 t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);

  const u64 r = hl32_to_64_S (t0, t1);

#endif

  return r;
}
198
199 static inline u64 rotl64_S (const u64 a, const u32 n)
200 {
201 return rotr64_S (a, 64 - n);
202 }
203
204 static inline u32x swap32 (const u32x v)
205 {
206 return ((v >> 24) & 0x000000ff)
207 | ((v >> 8) & 0x0000ff00)
208 | ((v << 8) & 0x00ff0000)
209 | ((v << 24) & 0xff000000);
210 }
211
212 static inline u64x swap64 (const u64x v)
213 {
214 return ((v >> 56) & 0x00000000000000ff)
215 | ((v >> 40) & 0x000000000000ff00)
216 | ((v >> 24) & 0x0000000000ff0000)
217 | ((v >> 8) & 0x00000000ff000000)
218 | ((v << 8) & 0x000000ff00000000)
219 | ((v << 24) & 0x0000ff0000000000)
220 | ((v << 40) & 0x00ff000000000000)
221 | ((v << 56) & 0xff00000000000000);
222 }
223
224 static inline u32x rotr32 (const u32x a, const u32 n)
225 {
226 return rotate (a, 32 - n);
227 }
228
229 static inline u32x rotl32 (const u32x a, const u32 n)
230 {
231 return rotate (a, n);
232 }
233
// Rotate each 64-bit lane right by n; same split-halves strategy as
// rotr64_S but using the vector half-word helpers.
static inline u64x rotr64 (const u64x a, const u32 n)
{
#if DEVICE_TYPE == DEVICE_TYPE_CPU

  // CPU path: native 64-bit rotate.
  const u64x r = rotate (a, (u64) 64 - n);

#else

  const u32x a0 = h32_from_64 (a);
  const u32x a1 = l32_from_64 (a);

  // Halves swap roles for n >= 32 (bitalign shift honored modulo 32).
  const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
  const u32x t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);

  const u64x r = hl32_to_64 (t0, t1);

#endif

  return r;
}
254
255 static inline u64x rotl64 (const u64x a, const u32 n)
256 {
257 return rotr64 (a, 64 - n);
258 }
259
260 static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
261 {
262 return amd_bfe (a, b, c);
263 }
264
265 static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
266 {
267 return amd_bytealign (a, b, c);
268 }
269 #endif
270
#ifdef IS_NV
// Byte-swap a 32-bit scalar with the PTX byte-permute instruction
// (selector 0x0123 reverses the four bytes of the first source).
static inline u32 swap32_S (const u32 v)
{
  u32 r;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));

  return r;
}
280
// Byte-swap a 64-bit scalar: split into 32-bit halves, byte-reverse each
// half with prmt.b32, then recombine with the halves exchanged.
static inline u64 swap64_S (const u64 v)
{
  u32 il;
  u32 ir;

  // Unpack the 64-bit value into low (il) and high (ir) words.
  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));

  u32 tl;
  u32 tr;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));

  u64 r;

  // Repack with tr/tl swapped so the full 8-byte sequence is reversed.
  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));

  return r;
}
300
301 static inline u32 rotr32_S (const u32 a, const u32 n)
302 {
303 return rotate (a, 32 - n);
304 }
305
306 static inline u32 rotl32_S (const u32 a, const u32 n)
307 {
308 return rotate (a, n);
309 }
310
#if CUDA_ARCH >= 350
// Rotate a 64-bit scalar right by n using two 32-bit funnel shifts
// (shf.r.wrap, available from sm_35).
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  u32 il;
  u32 ir;

  // Split the input into low (il) and high (ir) words.
  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  u32 tl;
  u32 tr;

  // shf.r.wrap funnel-shifts the concatenation of its two sources; for
  // n >= 32 the word roles swap and the effective shift is n - 32.
  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  return r;
}
#else
// Pre-sm_35 fallback: express the right rotate through OpenCL's rotate.
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
#endif
345
346 static inline u64 rotl64_S (const u64 a, const u32 n)
347 {
348 return rotr64_S (a, 64 - n);
349 }
350
#if CUDA_ARCH >= 500

/**
 * lut3_XX_S: scalar three-input bit logic via the Maxwell LOP3 instruction.
 * The hex suffix is the 8-bit truth table applied per bit position
 * (immLut convention: a contributes 0xf0, b 0xcc, c 0xaa).
 */

static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  // 0x96 == a ^ b ^ c (three-input XOR, heavily used by hash rounds).
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  // 0xca == bitwise select: (a & b) | (~a & c).
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
#endif
415
// Scalar byte-permute: selects four bytes out of the {a, b} 8-byte pool
// according to selector c (PTX prmt.b32; same contract as CUDA __byte_perm).
static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
424
425 static inline u32x swap32 (const u32x v)
426 {
427 return ((v >> 24) & 0x000000ff)
428 | ((v >> 8) & 0x0000ff00)
429 | ((v << 8) & 0x00ff0000)
430 | ((v << 24) & 0xff000000);
431 }
432
433 static inline u64x swap64 (const u64x v)
434 {
435 return ((v >> 56) & 0x00000000000000ff)
436 | ((v >> 40) & 0x000000000000ff00)
437 | ((v >> 24) & 0x0000000000ff0000)
438 | ((v >> 8) & 0x00000000ff000000)
439 | ((v << 8) & 0x000000ff00000000)
440 | ((v << 24) & 0x0000ff0000000000)
441 | ((v << 40) & 0x00ff000000000000)
442 | ((v << 56) & 0xff00000000000000);
443 }
444
445 static inline u32x rotr32 (const u32x a, const u32 n)
446 {
447 return rotate (a, 32 - n);
448 }
449
450 static inline u32x rotl32 (const u32x a, const u32 n)
451 {
452 return rotate (a, n);
453 }
454
455 #if CUDA_ARCH >= 350
456 static inline u64x rotr64 (const u64x a, const u32 n)
457 {
458 u64x r;
459
460 u32 il;
461 u32 ir;
462 u32 tl;
463 u32 tr;
464
465 #if VECT_SIZE == 1
466
467 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));
468
469 if (n >= 32)
470 {
471 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
472 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
473 }
474 else
475 {
476 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
477 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
478 }
479
480 asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));
481
482 #endif
483
484 #if VECT_SIZE >= 2
485
486 {
487 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s0));
488
489 if (n >= 32)
490 {
491 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
492 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
493 }
494 else
495 {
496 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
497 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
498 }
499
500 asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s0) : "r"(tl), "r"(tr));
501 }
502
503 {
504 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s1));
505
506 if (n >= 32)
507 {
508 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
509 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
510 }
511 else
512 {
513 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
514 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
515 }
516
517 asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s1) : "r"(tl), "r"(tr));
518 }
519
520 #endif
521
522 #if VECT_SIZE >= 4
523
524 {
525 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s2));
526
527 if (n >= 32)
528 {
529 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
530 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
531 }
532 else
533 {
534 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
535 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
536 }
537
538 asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s2) : "r"(tl), "r"(tr));
539 }
540
541 {
542 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s3));
543
544 if (n >= 32)
545 {
546 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
547 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
548 }
549 else
550 {
551 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
552 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
553 }
554
555 asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s3) : "r"(tl), "r"(tr));
556 }
557
558 #endif
559
560 #if VECT_SIZE >= 8
561
562 {
563 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s4));
564
565 if (n >= 32)
566 {
567 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
568 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
569 }
570 else
571 {
572 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
573 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
574 }
575
576 asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s4) : "r"(tl), "r"(tr));
577 }
578
579 {
580 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s5));
581
582 if (n >= 32)
583 {
584 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
585 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
586 }
587 else
588 {
589 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
590 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
591 }
592
593 asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s5) : "r"(tl), "r"(tr));
594 }
595
596 {
597 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s6));
598
599 if (n >= 32)
600 {
601 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
602 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
603 }
604 else
605 {
606 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
607 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
608 }
609
610 asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s6) : "r"(tl), "r"(tr));
611 }
612
613 {
614 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s7));
615
616 if (n >= 32)
617 {
618 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
619 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
620 }
621 else
622 {
623 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
624 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
625 }
626
627 asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s7) : "r"(tl), "r"(tr));
628 }
629
630 #endif
631
632 return r;
633 }
634 #else
635 static inline u64x rotr64 (const u64x a, const u32 n)
636 {
637 return rotate (a, (u64) 64 - n);
638 }
639 #endif
640
641 static inline u64x rotl64 (const u64x a, const u32 n)
642 {
643 return rotr64 (a, (u64) 64 - n);
644 }
645
// Vector byte-permute: applies prmt.b32 lane-wise (see __byte_perm_S for
// the scalar contract).
static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) );
  #endif

  #if VECT_SIZE >= 2
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
  #endif

  #if VECT_SIZE >= 4
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
  #endif

  #if VECT_SIZE >= 8
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
  #endif

  return r;
}
673
// Bitfield extract: c bits of a starting at bit offset b (PTX bfe.u32).
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
682
#if CUDA_ARCH >= 350
// Emulate AMD's amd_bytealign — ((u64) a << 32 | b) >> ((c & 3) * 8) —
// with a single funnel shift on sm_35+.
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));

  return r;
}
#else
// Pre-sm_35 fallback: same byte alignment via byte-permute; the selector
// picks bytes (c & 3) .. (c & 3) + 3 out of the {b, a} byte pool.
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
}
#endif
698
699 #if CUDA_ARCH >= 500
700 static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
701 {
702 u32x r;
703
704 #if VECT_SIZE == 1
705 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
706 #endif
707
708 #if VECT_SIZE >= 2
709 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
710 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
711 #endif
712
713 #if VECT_SIZE >= 4
714 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
715 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
716 #endif
717
718 #if VECT_SIZE >= 8
719 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
720 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
721 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
722 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
723 #endif
724
725 return r;
726 }
727
728 static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
729 {
730 u32x r;
731
732 #if VECT_SIZE == 1
733 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
734 #endif
735
736 #if VECT_SIZE == 2
737 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
738 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
739 #endif
740
741 #if VECT_SIZE == 4
742 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
743 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
744 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
745 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
746 #endif
747
748 #if VECT_SIZE == 8
749 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
750 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
751 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
752 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
753 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
754 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
755 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
756 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
757 #endif
758
759 return r;
760 }
761
762 static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
763 {
764 u32x r;
765
766 #if VECT_SIZE == 1
767 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
768 #endif
769
770 #if VECT_SIZE == 2
771 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
772 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
773 #endif
774
775 #if VECT_SIZE == 4
776 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
777 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
778 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
779 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
780 #endif
781
782 #if VECT_SIZE == 8
783 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
784 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
785 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
786 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
787 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
788 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
789 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
790 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
791 #endif
792
793 return r;
794 }
795
796 static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
797 {
798 u32x r;
799
800 #if VECT_SIZE == 1
801 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
802 #endif
803
804 #if VECT_SIZE == 2
805 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
806 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
807 #endif
808
809 #if VECT_SIZE == 4
810 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
811 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
812 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
813 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
814 #endif
815
816 #if VECT_SIZE == 8
817 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
818 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
819 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
820 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
821 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
822 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
823 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
824 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
825 #endif
826
827 return r;
828 }
829
830 static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
831 {
832 u32x r;
833
834 #if VECT_SIZE == 1
835 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
836 #endif
837
838 #if VECT_SIZE == 2
839 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
840 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
841 #endif
842
843 #if VECT_SIZE == 4
844 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
845 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
846 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
847 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
848 #endif
849
850 #if VECT_SIZE == 8
851 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
852 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
853 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
854 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
855 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
856 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
857 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
858 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
859 #endif
860
861 return r;
862 }
863
864 static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
865 {
866 u32x r;
867
868 #if VECT_SIZE == 1
869 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
870 #endif
871
872 #if VECT_SIZE == 2
873 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
874 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
875 #endif
876
877 #if VECT_SIZE == 4
878 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
879 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
880 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
881 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
882 #endif
883
884 #if VECT_SIZE == 8
885 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
886 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
887 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
888 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
889 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
890 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
891 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
892 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
893 #endif
894
895 return r;
896 }
897
898 static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
899 {
900 u32x r;
901
902 #if VECT_SIZE == 1
903 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
904 #endif
905
906 #if VECT_SIZE == 2
907 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
908 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
909 #endif
910
911 #if VECT_SIZE == 4
912 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
913 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
914 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
915 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
916 #endif
917
918 #if VECT_SIZE == 8
919 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
920 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
921 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
922 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
923 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
924 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
925 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
926 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
927 #endif
928
929 return r;
930 }
931
932 #endif
933 #endif
934
#ifdef IS_GENERIC
// Byte-swap (endianness flip) of a 32-bit scalar via uchar4 lane reversal.
static inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}
940
// Byte-swap (endianness flip) of a 64-bit scalar via uchar8 lane reversal.
static inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}
945
946 static inline u32 rotr32_S (const u32 a, const u32 n)
947 {
948 return rotate (a, 32 - n);
949 }
950
951 static inline u32 rotl32_S (const u32 a, const u32 n)
952 {
953 return rotate (a, n);
954 }
955
956 static inline u64 rotr64_S (const u64 a, const u32 n)
957 {
958 return rotate (a, (u64) 64 - n);
959 }
960
961 static inline u64 rotl64_S (const u64 a, const u32 n)
962 {
963 return rotate (a, (u64) n);
964 }
965
966 static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
967 {
968 const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);
969
970 return (u32) (tmp);
971 }
972
973 static inline u32x swap32 (const u32x v)
974 {
975 return ((v >> 24) & 0x000000ff)
976 | ((v >> 8) & 0x0000ff00)
977 | ((v << 8) & 0x00ff0000)
978 | ((v << 24) & 0xff000000);
979 }
980
981 static inline u64x swap64 (const u64x v)
982 {
983 return ((v >> 56) & 0x00000000000000ff)
984 | ((v >> 40) & 0x000000000000ff00)
985 | ((v >> 24) & 0x0000000000ff0000)
986 | ((v >> 8) & 0x00000000ff000000)
987 | ((v << 8) & 0x000000ff00000000)
988 | ((v << 24) & 0x0000ff0000000000)
989 | ((v << 40) & 0x00ff000000000000)
990 | ((v << 56) & 0xff00000000000000);
991 }
992
993 static inline u32x rotr32 (const u32x a, const u32 n)
994 {
995 return rotate (a, 32 - n);
996 }
997
998 static inline u32x rotl32 (const u32x a, const u32 n)
999 {
1000 return rotate (a, n);
1001 }
1002
1003 static inline u64x rotr64 (const u64x a, const u32 n)
1004 {
1005 return rotate (a, (u64) 64 - n);
1006 }
1007
1008 static inline u64x rotl64 (const u64x a, const u32 n)
1009 {
1010 return rotate (a, (u64) n);
1011 }
1012
1013 static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
1014 {
1015 #define BIT(x) (1 << (x))
1016 #define BIT_MASK(x) (BIT (x) - 1)
1017 #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
1018
1019 return BFE (a, b, c);
1020 }
1021
// Generic vector amd_bytealign: widen each pair of 32-bit lanes into 64-bit
// lanes (a high, b low), shift right by (c & 3) bytes, truncate to 32 bits.
static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
{
  #if VECT_SIZE == 1
  const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);

  return (u32x) (tmp);
  #endif

  // Wider widths spell out the vector-literal conversions lane by lane.
  #if VECT_SIZE == 2
  const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1);
  #endif

  #if VECT_SIZE == 4
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);
  #endif

  #if VECT_SIZE == 8
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
  #endif
}
#endif
1049
/**
 * Per-algorithm digest container. Exactly one _X_ hash-mode symbol is
 * defined per kernel build; the matching branch sets the digest size in
 * u32 words for that mode.
 *
 * NOTE(review): _SCRYPT_ and _LOTUS8_ each appear twice in this #elif
 * chain; only the first occurrence can ever match, so the later _SCRYPT_
 * ([4]) and _LOTUS8_ branches are dead code — and the two _SCRYPT_ sizes
 * disagree (8 vs 4). Worth confirming and cleaning up upstream.
 */
typedef struct
{
#if defined _DES_
  u32 digest_buf[4];
#elif defined _MD4_
  u32 digest_buf[4];
#elif defined _MD5_
  u32 digest_buf[4];
#elif defined _MD5H_
  u32 digest_buf[4];
#elif defined _SHA1_
  u32 digest_buf[5];
#elif defined _BCRYPT_
  u32 digest_buf[6];
#elif defined _SHA256_
  u32 digest_buf[8];
#elif defined _SHA384_
  u32 digest_buf[16];
#elif defined _SHA512_
  u32 digest_buf[16];
#elif defined _KECCAK_
  u32 digest_buf[50];
#elif defined _RIPEMD160_
  u32 digest_buf[5];
#elif defined _WHIRLPOOL_
  u32 digest_buf[16];
#elif defined _GOST_
  u32 digest_buf[8];
#elif defined _GOST2012_256_
  u32 digest_buf[8];
#elif defined _GOST2012_512_
  u32 digest_buf[16];
#elif defined _SAPB_
  u32 digest_buf[4];
#elif defined _SAPG_
  u32 digest_buf[5];
#elif defined _MYSQL323_
  u32 digest_buf[4];
#elif defined _LOTUS5_
  u32 digest_buf[4];
#elif defined _LOTUS6_
  u32 digest_buf[4];
#elif defined _SCRYPT_
  u32 digest_buf[8];
#elif defined _LOTUS8_
  u32 digest_buf[4];
#elif defined _OFFICE2007_
  u32 digest_buf[4];
#elif defined _OFFICE2010_
  u32 digest_buf[4];
#elif defined _OFFICE2013_
  u32 digest_buf[4];
#elif defined _OLDOFFICE01_
  u32 digest_buf[4];
#elif defined _OLDOFFICE34_
  u32 digest_buf[4];
#elif defined _SIPHASH_
  u32 digest_buf[4];
#elif defined _PBKDF2_MD5_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA1_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA256_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA512_
  u32 digest_buf[32];
#elif defined _PDF17L8_
  u32 digest_buf[8];
#elif defined _CRC32_
  u32 digest_buf[4];
#elif defined _SEVEN_ZIP_
  u32 digest_buf[4];
#elif defined _ANDROIDFDE_
  u32 digest_buf[4];
#elif defined _DCC2_
  u32 digest_buf[4];
#elif defined _WPA_
  u32 digest_buf[4];
#elif defined _MD5_SHA1_
  u32 digest_buf[4];
#elif defined _SHA1_MD5_
  u32 digest_buf[5];
#elif defined _NETNTLMV2_
  u32 digest_buf[4];
#elif defined _KRB5PA_
  u32 digest_buf[4];
#elif defined _CLOUDKEY_
  u32 digest_buf[8];
#elif defined _SCRYPT_
  u32 digest_buf[4];
#elif defined _PSAFE2_
  u32 digest_buf[5];
#elif defined _LOTUS8_
  u32 digest_buf[4];
#elif defined _RAR3_
  u32 digest_buf[4];
#elif defined _SHA256_SHA1_
  u32 digest_buf[8];
#elif defined _MS_DRSR_
  u32 digest_buf[8];
#elif defined _ANDROIDFDE_SAMSUNG_
  u32 digest_buf[8];
#elif defined _RAR5_
  u32 digest_buf[4];
#endif

} digest_t;
1157
/**
 * Per-salt data shared between host and kernels. Field order and sizes
 * are ABI with the host side — do not reorder.
 */
typedef struct
{
  u32 salt_buf[16];    // raw salt words
  u32 salt_buf_pc[8];  // precomputed salt variant (per field name — confirm against host)

  u32 salt_len;        // length of salt_buf in bytes
  u32 salt_iter;       // iteration count for iterated schemes
  u32 salt_sign[2];

  u32 keccak_mdlen;    // mode-specific digest lengths
  u32 truecrypt_mdlen;

  u32 digests_cnt;     // digests bound to this salt / how many are cracked
  u32 digests_done;

  u32 digests_offset;  // offset of this salt's digests in the global digest list

  u32 scrypt_N;        // scrypt cost parameters (used only by scrypt modes)
  u32 scrypt_r;
  u32 scrypt_p;
  u32 scrypt_tmto;     // time-memory trade-off factor
  u32 scrypt_phy;

} salt_t;
1182
/**
 * PDF esalt: encryption dictionary values needed to attack PDF passwords.
 */
typedef struct
{
  int V;            // PDF security handler parameters (per field names:
  int R;            // version, revision, permissions — confirm against host)
  int P;

  int enc_md;

  u32 id_buf[8];    // document /ID
  u32 u_buf[32];    // /U (user) and /O (owner) password hash entries
  u32 o_buf[32];

  int id_len;       // lengths of the above buffers in bytes
  int o_len;
  int u_len;

  u32 rc4key[2];    // recovered RC4 key / data (older PDF revisions)
  u32 rc4data[2];

} pdf_t;
1203
// --- esalt structures: per-hash auxiliary data copied from the host -----

/**
 * WPA/WPA2 esalt: PTK expansion input, the raw EAPOL frame and its size,
 * and the key descriptor version (per field names — confirm against host).
 */
typedef struct
{
  u32 pke[25];
  u32 eapol[64];
  int eapol_size;
  int keyver;

} wpa_t;

/**
 * Bitcoin wallet esalt: encrypted master key, encrypted key and public
 * key blobs, each with its byte length.
 */
typedef struct
{
  u32 cry_master_buf[64];
  u32 ckey_buf[64];
  u32 public_key_buf[64];

  u32 cry_master_len;
  u32 ckey_len;
  u32 public_key_len;

} bitcoin_wallet_t;

/**
 * SIP digest-authentication esalt: salt and extended-salt buffers with
 * their byte lengths.
 */
typedef struct
{
  u32 salt_buf[30];
  u32 salt_len;

  u32 esalt_buf[38];
  u32 esalt_len;

} sip_t;

/**
 * Android FDE esalt: opaque data blob (1536 bytes).
 */
typedef struct
{
  u32 data[384];

} androidfde_t;

/**
 * IKE-PSK esalt: nonce/key material and message buffers with byte lengths.
 */
typedef struct
{
  u32 nr_buf[16];
  u32 nr_len;

  u32 msg_buf[128];
  u32 msg_len;

} ikepsk_t;

/**
 * NetNTLM esalt: component lengths plus the concatenated user+domain
 * buffer and the server/client challenge buffer.
 */
typedef struct
{
  u32 user_len;
  u32 domain_len;
  u32 srvchall_len;
  u32 clichall_len;

  u32 userdomain_buf[64];
  u32 chall_buf[256];

} netntlm_t;

/**
 * Kerberos 5 pre-auth esalt: principal, realm, salt, encrypted timestamp
 * and its checksum.
 */
typedef struct
{
  u32 user[16];
  u32 realm[16];
  u32 salt[32];
  u32 timestamp[16];
  u32 checksum[4];

} krb5pa_t;

/**
 * TrueCrypt esalt: salt, encrypted volume header data and keyfile buffer.
 */
typedef struct
{
  u32 salt_buf[16];
  u32 data_buf[112];
  u32 keyfile_buf[16];

} tc_t;
1280
/**
 * PBKDF2 esalts: one salt buffer per underlying PRF. Sizes differ per
 * variant (16 words for MD5/SHA1/SHA256, 32 for SHA512).
 */
typedef struct
{
  u32 salt_buf[16];

} pbkdf2_md5_t;

typedef struct
{
  u32 salt_buf[16];

} pbkdf2_sha1_t;

typedef struct
{
  u32 salt_buf[16];

} pbkdf2_sha256_t;

typedef struct
{
  u32 salt_buf[32];

} pbkdf2_sha512_t;

/**
 * RAKP (IPMI) esalt: salt buffer with byte length.
 */
typedef struct
{
  u32 salt_buf[128];
  u32 salt_len;

} rakp_t;
1311
/**
 * CloudKey esalt: data blob with byte length.
 */
typedef struct
{
  u32 data_len;
  u32 data_buf[512];

} cloudkey_t;

/**
 * MS Office esalts: encrypted verifier / verifier-hash pairs per Office
 * generation; 2007 additionally carries the key size.
 */
typedef struct
{
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[5];

  u32 keySize;

} office2007_t;

typedef struct
{
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[8];

} office2010_t;

typedef struct
{
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[8];

} office2013_t;

/**
 * Legacy Office (oldoffice) esalts: format version, verifier pair and an
 * RC4 key slot.
 */
typedef struct
{
  u32 version;
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[4];
  u32 rc4key[2];

} oldoffice01_t;

typedef struct
{
  u32 version;
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[5];
  u32 rc4key[2];

} oldoffice34_t;
1359
// --- tmp structures: per-candidate intermediate state carried between ---
// --- kernel invocations of slow (multi-pass) hash modes ----------------

/**
 * PDF 1.4 tmp state: running digest and output words.
 */
typedef struct
{
  u32 digest[4];
  u32 out[4];

} pdf14_tmp_t;

/**
 * PDF 1.7 level 8 tmp state: digest accessible as 16 u32 or 8 u64 words
 * (anonymous union), plus current digest and W lengths.
 */
typedef struct
{
  union
  {
    u32 dgst32[16];
    u64 dgst64[8];
  };

  u32 dgst_len;
  u32 W_len;

} pdf17l8_tmp_t;

/**
 * phpass tmp state: 4-word running digest.
 */
typedef struct
{
  u32 digest_buf[4];

} phpass_tmp_t;

/**
 * md5crypt tmp state: 4-word running digest.
 */
typedef struct
{
  u32 digest_buf[4];

} md5crypt_tmp_t;

/**
 * sha256crypt tmp state: alternate result plus P/S byte sequences.
 */
typedef struct
{
  u32 alt_result[8];

  u32 p_bytes[4];
  u32 s_bytes[4];

} sha256crypt_tmp_t;

/**
 * sha512crypt tmp state: 64-bit variant of the above.
 */
typedef struct
{
  u64 l_alt_result[8];

  u64 l_p_bytes[2];
  u64 l_s_bytes[2];

} sha512crypt_tmp_t;

/**
 * WPA tmp state: HMAC-SHA1 ipad/opad precompute plus digest/output words
 * (per field names — confirm against the WPA kernels).
 */
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[10];
  u32 out[10];

} wpa_tmp_t;

/**
 * Bitcoin wallet tmp state: 8 x u64 running digest.
 */
typedef struct
{
  u64 dgst[8];

} bitcoin_wallet_tmp_t;

/**
 * DCC2 (mscash2) tmp state: ipad/opad precompute, digest and output.
 */
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[5];
  u32 out[4];

} dcc2_tmp_t;
1435
/**
 * bcrypt tmp state: expanded key E, Blowfish P-array and the four S-boxes.
 */
typedef struct
{
  u32 E[18];

  u32 P[18];

  u32 S0[256];
  u32 S1[256];
  u32 S2[256];
  u32 S3[256];

} bcrypt_tmp_t;

/**
 * Password Safe v2 tmp state: digest plus Blowfish P-array and S-boxes.
 */
typedef struct
{
  u32 digest[2];

  u32 P[18];

  u32 S0[256];
  u32 S1[256];
  u32 S2[256];
  u32 S3[256];

} pwsafe2_tmp_t;

/**
 * Password Safe v3 tmp state: 8-word running digest.
 */
typedef struct
{
  u32 digest_buf[8];

} pwsafe3_tmp_t;

/**
 * Android PIN tmp state: 5-word running digest.
 */
typedef struct
{
  u32 digest_buf[5];

} androidpin_tmp_t;

/**
 * Android FDE tmp state: ipad/opad precompute, digest and output.
 */
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[10];
  u32 out[10];

} androidfde_tmp_t;

/**
 * TrueCrypt tmp state (32-bit PRF variants).
 */
typedef struct
{
  u32 ipad[16];
  u32 opad[16];

  u32 dgst[64];
  u32 out[64];

} tc_tmp_t;

/**
 * TrueCrypt tmp state (64-bit PRF variants).
 */
typedef struct
{
  u64 ipad[8];
  u64 opad[8];

  u64 dgst[32];
  u64 out[32];

} tc64_tmp_t;
1503
/**
 * PBKDF2 tmp states: ipad/opad precompute sized to the PRF's state width,
 * plus digest and output accumulators.
 */
typedef struct
{
  u32 ipad[4];
  u32 opad[4];

  u32 dgst[32];
  u32 out[32];

} pbkdf2_md5_tmp_t;

typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[32];
  u32 out[32];

} pbkdf2_sha1_tmp_t;

typedef struct
{
  u32 ipad[8];
  u32 opad[8];

  u32 dgst[32];
  u32 out[32];

} pbkdf2_sha256_tmp_t;

typedef struct
{
  u64 ipad[8];
  u64 opad[8];

  u64 dgst[16];
  u64 out[16];

} pbkdf2_sha512_tmp_t;

/**
 * eCryptfs tmp state: 8 x u64 output words.
 */
typedef struct
{
  u64 out[8];

} ecryptfs_tmp_t;

/**
 * Oracle T-type tmp state: 64-bit ipad/opad, digest and output.
 */
typedef struct
{
  u64 ipad[8];
  u64 opad[8];

  u64 dgst[16];
  u64 out[16];

} oraclet_tmp_t;
1559
/**
 * 1Password Agile Keychain tmp state.
 */
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[5];
  u32 out[5];

} agilekey_tmp_t;

/**
 * blockchain.info "My Wallet" tmp state: two digest/output pairs.
 */
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst1[5];
  u32 out1[5];

  u32 dgst2[5];
  u32 out2[5];

} mywallet_tmp_t;

/**
 * AIX {ssha1} tmp state.
 */
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[5];
  u32 out[5];

} sha1aix_tmp_t;

/**
 * AIX {ssha256} tmp state.
 */
typedef struct
{
  u32 ipad[8];
  u32 opad[8];

  u32 dgst[8];
  u32 out[8];

} sha256aix_tmp_t;

/**
 * AIX {ssha512} tmp state.
 */
typedef struct
{
  u64 ipad[8];
  u64 opad[8];

  u64 dgst[8];
  u64 out[8];

} sha512aix_tmp_t;

/**
 * LastPass tmp state.
 */
typedef struct
{
  u32 ipad[8];
  u32 opad[8];

  u32 dgst[8];
  u32 out[8];

} lastpass_tmp_t;

/**
 * Drupal 7 tmp state: 8 x u64 running digest.
 */
typedef struct
{
  u64 digest_buf[8];

} drupal7_tmp_t;
1628
/**
 * Lotus Notes/Domino 8 tmp state.
 */
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[5];
  u32 out[5];

} lotus8_tmp_t;

/**
 * MS Office tmp states: per-generation iterated-hash output buffers.
 */
typedef struct
{
  u32 out[5];

} office2007_tmp_t;

typedef struct
{
  u32 out[5];

} office2010_tmp_t;

typedef struct
{
  u64 out[8];

} office2013_tmp_t;

/**
 * SAP CODVN H (PWDSALTEDHASH) tmp state.
 */
typedef struct
{
  u32 digest_buf[5];

} saph_sha1_tmp_t;

/**
 * 7-Zip tmp state: current message block, running digest, and fill /
 * finalization lengths.
 */
typedef struct
{
  u32 block[16];

  u32 dgst[8];

  u32 block_len;
  u32 final_len;

} seven_zip_tmp_t;

/**
 * BSDi crypt tmp state: DES key schedule halves plus an IV.
 */
typedef struct
{
  u32 Kc[16];
  u32 Kd[16];

  u32 iv[2];

} bsdicrypt_tmp_t;

/**
 * RAR3 tmp state: 17 intermediate 5-word digests.
 */
typedef struct
{
  u32 dgst[17][5];

} rar3_tmp_t;
1688
/**
 * CRAM-MD5 esalt: user buffer.
 */
typedef struct
{
  u32 user[16];

} cram_md5_t;

/**
 * 7-Zip esalt: IV and salt with their byte lengths, CRC of the plaintext,
 * encrypted data buffer with its byte length, and the unpacked size.
 */
typedef struct
{
  u32 iv_buf[4];
  u32 iv_len;

  u32 salt_buf[4];
  u32 salt_len;

  u32 crc;

  u32 data_buf[96];
  u32 data_len;

  u32 unpack_size;

} seven_zip_t;
1711
/**
 * Markov-statistics table entry (hcstat): key plus 64-bit value.
 */
typedef struct
{
  u32 key;
  u64 val;

} hcstat_table_t;

/**
 * Charset descriptor: up to 0x100 code points and the number in use.
 */
typedef struct
{
  u32 cs_buf[0x100];
  u32 cs_len;

} cs_t;

/**
 * One compiled rule: a fixed-size array of encoded rule commands.
 */
typedef struct
{
  u32 cmds[0x100];

} kernel_rule_t;

/**
 * Cracked-plain locator: global id / vector id plus position in the
 * inner loop, enough for the host to reconstruct the candidate.
 */
typedef struct
{
  u32 gidvid;
  u32 il_pos;

} plain_t;
1738
/**
 * Password candidate: 64 words of data, its byte length, and explicit
 * padding words to keep the struct size/alignment stable across host
 * and device.
 */
typedef struct
{
  u32 i[64];

  u32 pw_len;

  u32 alignment_placeholder_1;
  u32 alignment_placeholder_2;
  u32 alignment_placeholder_3;

} pw_t;

/**
 * Brute-force position element: a single index word.
 */
typedef struct
{
  u32 i;

} bf_t;

/**
 * Combinator element: right-side word data plus byte length.
 */
typedef struct
{
  u32 i[8];

  u32 pw_len;

} comb_t;

/**
 * Bitsliced word: 32 bit-planes of 32 candidates each.
 */
typedef struct
{
  u32 b[32];

} bs_word_t;

/**
 * scrypt tmp state: 64 x uint4 scratch block per candidate.
 */
typedef struct
{
  uint4 P[64];

} scrypt_tmp_t;