// Fix speed for 64-bit datatype based algorithms for NV
// [hashcat.git] / OpenCL / types_ocl.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
/**
 * Device-type discriminators; DEVICE_TYPE is expected to be defined by the
 * host at kernel build time -- TODO confirm against the host build options.
 */
#define DEVICE_TYPE_CPU 2
#define DEVICE_TYPE_GPU 4

// Fixed-width scalar aliases (sizes guaranteed by the OpenCL C spec).
typedef uchar u8;
typedef ushort u16;
typedef uint u32;
typedef ulong u64;

// Kernels not compiled for the new SIMD code path always use scalar
// "vectors" of width 1.
#ifndef NEW_SIMD_CODE
#undef VECT_SIZE
#define VECT_SIZE 1
#endif

// Vector-width dependent aliases: uNx is a VECT_SIZE-wide vector of uN.
#if VECT_SIZE == 1
typedef uchar u8x;
typedef ushort u16x;
typedef uint u32x;
typedef ulong u64x;
#endif

#if VECT_SIZE == 2
typedef uchar2 u8x;
typedef ushort2 u16x;
typedef uint2 u32x;
typedef ulong2 u64x;
#endif

#if VECT_SIZE == 4
typedef uchar4 u8x;
typedef ushort4 u16x;
typedef uint4 u32x;
typedef ulong4 u64x;
#endif

#if VECT_SIZE == 8
typedef uchar8 u8x;
typedef ushort8 u16x;
typedef uint8 u32x;
typedef ulong8 u64x;
#endif

// this one needs to die
#define allx(r) r
49
static inline u32 l32_from_64_S (u64 a)
{
  // Low 32 bits of a scalar 64-bit value.
  return (u32) (a);
}
56
static inline u32 h32_from_64_S (u64 a)
{
  // High 32 bits of a scalar 64-bit value.
  return (u32) (a >> 32);
}
65
static inline u64 hl32_to_64_S (const u32 a, const u32 b)
{
  // Combine high word a and low word b into one u64 via a vector
  // reinterpret. NOTE(review): relies on .s0 being the low word, i.e. a
  // little-endian device -- true for the GPU targets; confirm for others.
  return as_ulong ((uint2) (b, a));
}
70
// Per-component low 32 bits of a 64-bit vector; the component-wise form
// covers every compiled VECT_SIZE.
static inline u32x l32_from_64 (u64x a)
{
  u32x r;

#if VECT_SIZE == 1
  r = (u32) a;
#endif

#if VECT_SIZE >= 2
  r.s0 = (u32) a.s0;
  r.s1 = (u32) a.s1;
#endif

#if VECT_SIZE >= 4
  r.s2 = (u32) a.s2;
  r.s3 = (u32) a.s3;
#endif

#if VECT_SIZE >= 8
  r.s4 = (u32) a.s4;
  r.s5 = (u32) a.s5;
  r.s6 = (u32) a.s6;
  r.s7 = (u32) a.s7;
#endif

  return r;
}
98
// Per-component high 32 bits of a 64-bit vector.
static inline u32x h32_from_64 (u64x a)
{
  // Shift the high word down once for all components.
  a >>= 32;

  u32x r;

#if VECT_SIZE == 1
  r = (u32) a;
#endif

#if VECT_SIZE >= 2
  r.s0 = (u32) a.s0;
  r.s1 = (u32) a.s1;
#endif

#if VECT_SIZE >= 4
  r.s2 = (u32) a.s2;
  r.s3 = (u32) a.s3;
#endif

#if VECT_SIZE >= 8
  r.s4 = (u32) a.s4;
  r.s5 = (u32) a.s5;
  r.s6 = (u32) a.s6;
  r.s7 = (u32) a.s7;
#endif

  return r;
}
128
// Per-component combine: a supplies the high 32 bits, b the low 32 bits.
// NOTE(review): like hl32_to_64_S this assumes little-endian component
// order for as_ulong((uint2)(lo, hi)).
static inline u64x hl32_to_64 (const u32x a, const u32x b)
{
  u64x r;

#if VECT_SIZE == 1
  r = as_ulong ((uint2) (b, a));
#endif

#if VECT_SIZE >= 2
  r.s0 = as_ulong ((uint2) (b.s0, a.s0));
  r.s1 = as_ulong ((uint2) (b.s1, a.s1));
#endif

#if VECT_SIZE >= 4
  r.s2 = as_ulong ((uint2) (b.s2, a.s2));
  r.s3 = as_ulong ((uint2) (b.s3, a.s3));
#endif

#if VECT_SIZE >= 8
  r.s4 = as_ulong ((uint2) (b.s4, a.s4));
  r.s5 = as_ulong ((uint2) (b.s5, a.s5));
  r.s6 = as_ulong ((uint2) (b.s6, a.s6));
  r.s7 = as_ulong ((uint2) (b.s7, a.s7));
#endif

  return r;
}
156
157 #ifdef IS_AMD
static inline u32 swap32_S (const u32 v)
{
  // Byte-swap a 32-bit word; the uchar4 shuffle compiles to a single
  // permute on AMD hardware.
  return (as_uint (as_uchar4 (v).s3210));
}
162
static inline u64 swap64_S (const u64 v)
{
  // Byte-swap a 64-bit word via an 8-byte reverse shuffle.
  return (as_ulong (as_uchar8 (v).s76543210));
}
167
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  // rotate() is a left rotate; rotating right by n == rotating left by 32 - n.
  const u32 sh = 32 - n;

  return rotate (a, sh);
}
172
static inline u32 rotl32_S (const u32 a, const u32 n)
{
  // Direct mapping onto the OpenCL left-rotate builtin.
  const u32 sh = n;

  return rotate (a, sh);
}
177
// Scalar 64-bit right rotate. On CPU devices the plain builtin is fastest;
// on GPUs the rotate is built from two amd_bitalign funnel shifts on the
// 32-bit halves.
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  u64 r;

#if DEVICE_TYPE == DEVICE_TYPE_CPU

  r = rotate (a, (u64) 64 - n);

#else

  uint2 a2 = as_uint2 (a);

  uint2 t;

  // Rotations of 32 or more swap the halves and rotate by n - 32.
  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32)
                   : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32)
                   : amd_bitalign (a2.s0, a2.s1, n);

  r = as_ulong (t);

#endif

  return r;
}
203
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  // Left rotation delegated to the right-rotate kernel.
  const u32 m = 64 - n;

  return rotr64_S (a, m);
}
208
static inline u32x swap32 (const u32x v)
{
  // Endian-reverse each 32-bit component (mask first, then shift into place).
  return ((v & 0xff000000) >> 24)
       | ((v & 0x00ff0000) >>  8)
       | ((v & 0x0000ff00) <<  8)
       | ((v & 0x000000ff) << 24);
}
216
static inline u64x swap64 (const u64x v)
{
  // Endian-reverse each 64-bit component (mask first, then shift into place).
  return ((v & 0xff00000000000000) >> 56)
       | ((v & 0x00ff000000000000) >> 40)
       | ((v & 0x0000ff0000000000) >> 24)
       | ((v & 0x000000ff00000000) >>  8)
       | ((v & 0x00000000ff000000) <<  8)
       | ((v & 0x0000000000ff0000) << 24)
       | ((v & 0x000000000000ff00) << 40)
       | ((v & 0x00000000000000ff) << 56);
}
228
static inline u32x rotr32 (const u32x a, const u32 n)
{
  // Right rotate expressed via the left-rotate builtin.
  const u32 sh = 32 - n;

  return rotate (a, sh);
}
233
static inline u32x rotl32 (const u32x a, const u32 n)
{
  // Direct mapping onto the OpenCL left-rotate builtin.
  const u32 sh = n;

  return rotate (a, sh);
}
238
// Vectorized 64-bit right rotate. CPU devices use the builtin; GPU devices
// build each component's rotate from two amd_bitalign funnel shifts on its
// 32-bit halves (see rotr64_S for the scalar pattern).
static inline u64x rotr64 (const u64x a, const u32 n)
{
  u64x r;

#if DEVICE_TYPE == DEVICE_TYPE_CPU

  r = rotate (a, (u64) 64 - n);

#else

  uint2 a2;
  uint2 t;

#if VECT_SIZE == 1

  a2 = as_uint2 (a);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r = as_ulong (t);

#elif VECT_SIZE == 2

  // One block per component; a2/t are reused as scratch.
  {
    a2 = as_uint2 (a.s0);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s0 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s1);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s1 = as_ulong (t);
  }

#elif VECT_SIZE == 4

  {
    a2 = as_uint2 (a.s0);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s0 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s1);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s1 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s2);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s2 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s3);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s3 = as_ulong (t);
  }

#elif VECT_SIZE == 8

  {
    a2 = as_uint2 (a.s0);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s0 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s1);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s1 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s2);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s2 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s3);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s3 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s4);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s4 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s5);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s5 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s6);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s6 = as_ulong (t);
  }

  {
    a2 = as_uint2 (a.s7);

    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

    r.s7 = as_ulong (t);
  }

#endif
#endif

  return r;
}
398
static inline u64x rotl64 (const u64x a, const u32 n)
{
  // Left rotation delegated to the right-rotate kernel.
  const u32 m = 64 - n;

  return rotr64 (a, m);
}
403
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  // Extract c bits of a starting at bit b (AMD bitfield-extract intrinsic).
  const u32 bits = amd_bfe (a, b, c);

  return bits;
}
408
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  // Thin scalar wrapper around the amd_bytealign intrinsic.
  const u32 r = amd_bytealign (a, b, c);

  return r;
}
413 #endif
414
415 #ifdef IS_NV
static inline u32 swap32_S (const u32 v)
{
  // Byte-swap a 32-bit word with PTX prmt (byte permute); selector 0x0123
  // reverses the four bytes of v.
  u32 r;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));

  return r;
}
424
// Byte-swap a 64-bit word: split into 32-bit halves, byte-swap each with
// prmt, then recombine with the halves exchanged.
static inline u64 swap64_S (const u64 v)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));

  u32 tl;
  u32 tr;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));

  u64 r;

  // Halves are swapped on recombination: old low half becomes new high half.
  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));

  return r;
}
444
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  // rotate() is a left rotate; rotating right by n == rotating left by 32 - n.
  const u32 sh = 32 - n;

  return rotate (a, sh);
}
449
static inline u32 rotl32_S (const u32 a, const u32 n)
{
  // Direct mapping onto the OpenCL left-rotate builtin.
  const u32 sh = n;

  return rotate (a, sh);
}
454
#if CUDA_ARCH >= 350
// 64-bit right rotate using the SM 3.5+ funnel-shift instruction
// (shf.r.wrap) on the two 32-bit halves.
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  u32 il;
  u32 ir;

  // Split a into its low (il) and high (ir) 32-bit halves.
  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  u32 tl;
  u32 tr;

  if (n >= 32)
  {
    // Rotations of 32 or more swap the halves and rotate by n - 32.
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  return r;
}
#else
// Pre-SM 3.5 fallback: plain OpenCL rotate.
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
#endif
489
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  // Left rotation delegated to the right-rotate kernel.
  const u32 m = 64 - n;

  return rotr64_S (a, m);
}
494
#if CUDA_ARCH >= 500
// Scalar three-input logical ops via the Maxwell+ LOP3 instruction; the
// immediate in each asm string is the 8-bit truth table of f(a, b, c).
static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

// 0x96 is the parity table: a ^ b ^ c.
static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

// 0xe8 is the majority table: (a & b) | (b & c) | (a & c).
static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

// 0xca is the select table: a ? b : c per bit.
static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
#endif
559
static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
  // CUDA-style byte permute: select four bytes from the pair {b:a}
  // according to nibble selector c (PTX prmt).
  u32 r;

  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
568
static inline u32x swap32 (const u32x v)
{
  // Endian-reverse each 32-bit component (mask first, then shift into place).
  return ((v & 0xff000000) >> 24)
       | ((v & 0x00ff0000) >>  8)
       | ((v & 0x0000ff00) <<  8)
       | ((v & 0x000000ff) << 24);
}
576
static inline u64x swap64 (const u64x v)
{
  // Endian-reverse each 64-bit component (mask first, then shift into place).
  return ((v & 0xff00000000000000) >> 56)
       | ((v & 0x00ff000000000000) >> 40)
       | ((v & 0x0000ff0000000000) >> 24)
       | ((v & 0x000000ff00000000) >>  8)
       | ((v & 0x00000000ff000000) <<  8)
       | ((v & 0x0000000000ff0000) << 24)
       | ((v & 0x000000000000ff00) << 40)
       | ((v & 0x00000000000000ff) << 56);
}
588
static inline u32x rotr32 (const u32x a, const u32 n)
{
  // Right rotate expressed via the left-rotate builtin.
  const u32 sh = 32 - n;

  return rotate (a, sh);
}
593
static inline u32x rotl32 (const u32x a, const u32 n)
{
  // Direct mapping onto the OpenCL left-rotate builtin.
  const u32 sh = n;

  return rotate (a, sh);
}
598
#if CUDA_ARCH >= 350
// Vectorized 64-bit right rotate via the SM 3.5+ funnel shift; each
// component follows the rotr64_S pattern: split into halves, shf.r.wrap
// both halves, recombine. il/ir/tl/tr are reused as scratch.
static inline u64x rotr64 (const u64x a, const u32 n)
{
  u64x r;

  u32 il;
  u32 ir;
  u32 tl;
  u32 tr;

#if VECT_SIZE == 1

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

#endif

#if VECT_SIZE >= 2

  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s0));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s0) : "r"(tl), "r"(tr));
  }

  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s1));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s1) : "r"(tl), "r"(tr));
  }

#endif

#if VECT_SIZE >= 4

  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s2));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s2) : "r"(tl), "r"(tr));
  }

  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s3));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s3) : "r"(tl), "r"(tr));
  }

#endif

#if VECT_SIZE >= 8

  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s4));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s4) : "r"(tl), "r"(tr));
  }

  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s5));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s5) : "r"(tl), "r"(tr));
  }

  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s6));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s6) : "r"(tl), "r"(tr));
  }

  {
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s7));

    if (n >= 32)
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
    }
    else
    {
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
    }

    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s7) : "r"(tl), "r"(tr));
  }

#endif

  return r;
}
#else
// Pre-SM 3.5 fallback: plain OpenCL rotate.
static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
#endif
784
static inline u64x rotl64 (const u64x a, const u32 n)
{
  // Left rotate by n == right rotate by 64 - n.
  // Fix: drop the misleading (u64) cast on the shift count -- rotr64 takes
  // a u32 n, so the wide intermediate was silently narrowed anyway. This
  // now matches rotl64_S above and the AMD/GENERIC code paths. Behavior is
  // unchanged for all valid n in [0, 64].
  return rotr64 (a, 64 - n);
}
789
// Vectorized CUDA-style byte permute (PTX prmt), applied per component.
static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{
  u32x r;

#if VECT_SIZE == 1
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) );
#endif

#if VECT_SIZE >= 2
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
#endif

#if VECT_SIZE >= 4
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
#endif

#if VECT_SIZE >= 8
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
#endif

  return r;
}
817
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  // Bitfield extract: c bits of a starting at bit position b (PTX bfe.u32).
  u32 r;

  asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
826
#if CUDA_ARCH >= 350
// Emulate AMD's amd_bytealign on NV: shift the 64-bit pair {a:b} right by
// (c & 3) bytes with the SM 3.5+ funnel shift.
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));

  return r;
}
#else
// Pre-SM 3.5 fallback: the same byte selection expressed as a byte permute.
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
}
#endif
842
#if CUDA_ARCH >= 500
// Vectorized three-input logical ops via the Maxwell+ LOP3 instruction,
// applied per component; the asm immediate is the 8-bit truth table of
// f(a, b, c). NOTE(review): lut3_2d uses cumulative "#if VECT_SIZE >= N"
// branches while the others repeat full "== N" blocks -- functionally
// equivalent, just stylistically inconsistent.
static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
{
  u32x r;

#if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif

#if VECT_SIZE >= 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif

#if VECT_SIZE >= 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif

#if VECT_SIZE >= 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif

  return r;
}

static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

#if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif

#if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif

#if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif

#if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif

  return r;
}

static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

#if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif

#if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif

#if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif

#if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif

  return r;
}

// 0x96 is the parity table: a ^ b ^ c.
static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

#if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif

#if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif

#if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif

#if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif

  return r;
}

static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

#if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif

#if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif

#if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif

#if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif

  return r;
}

// 0xe8 is the majority table: (a & b) | (b & c) | (a & c).
static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

#if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif

#if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif

#if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif

#if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif

  return r;
}

// 0xca is the select table: a ? b : c per bit.
static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
{
  u32x r;

#if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif

#if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif

#if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif

#if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif

  return r;
}

#endif
1077 #endif
1078
1079 #ifdef IS_GENERIC
static inline u32 swap32_S (const u32 v)
{
  // Byte-swap a 32-bit word via a 4-byte reverse shuffle.
  return (as_uint (as_uchar4 (v).s3210));
}
1084
1085 static inline u64 swap64_S (const u64 v)
1086 {
1087 return (as_ulong (as_uchar8 (v).s76543210));
1088 }
1089
/**
 * rotr32_S (scalar): rotate a right by n bits, implemented as a left
 * rotate by (32 - n) via the OpenCL rotate() builtin.  OpenCL rotate()
 * takes the shift count modulo the type width, so n == 0 (rotate by 32)
 * still returns a unchanged.
 */
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}
1094
/**
 * rotl32_S (scalar): rotate a left by n bits using the OpenCL rotate()
 * builtin (shift count is taken modulo 32 by the builtin).
 */
static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
1099
/**
 * rotr64_S (scalar): rotate a right by n bits as a left rotate by
 * (64 - n).  The cast binds to the literal, so the amount is computed
 * as (u64) 64 minus n promoted to u64 -- matching rotate()'s u64
 * argument type.
 */
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
1104
/**
 * rotl64_S (scalar): rotate a left by n bits; n is widened to u64 to
 * match rotate()'s operand type.
 */
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) n);
}
1109
1110 static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
1111 {
1112 const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);
1113
1114 return (u32) (tmp);
1115 }
1116
1117 static inline u32x swap32 (const u32x v)
1118 {
1119 return ((v >> 24) & 0x000000ff)
1120 | ((v >> 8) & 0x0000ff00)
1121 | ((v << 8) & 0x00ff0000)
1122 | ((v << 24) & 0xff000000);
1123 }
1124
1125 static inline u64x swap64 (const u64x v)
1126 {
1127 return ((v >> 56) & 0x00000000000000ff)
1128 | ((v >> 40) & 0x000000000000ff00)
1129 | ((v >> 24) & 0x0000000000ff0000)
1130 | ((v >> 8) & 0x00000000ff000000)
1131 | ((v << 8) & 0x000000ff00000000)
1132 | ((v << 24) & 0x0000ff0000000000)
1133 | ((v << 40) & 0x00ff000000000000)
1134 | ((v << 56) & 0xff00000000000000);
1135 }
1136
/**
 * rotr32 (vector): rotate each 32-bit lane of a right by n bits,
 * expressed as a left rotate by (32 - n); the scalar rotate amount is
 * broadcast across lanes by the OpenCL rotate() builtin.
 */
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}
1141
/**
 * rotl32 (vector): rotate each 32-bit lane of a left by n bits via the
 * OpenCL rotate() builtin.
 */
static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
1146
/**
 * rotr64 (vector): rotate each 64-bit lane of a right by n bits,
 * expressed as a left rotate by (u64) 64 - n (cast binds to the
 * literal; n is promoted to u64 for the subtraction).
 */
static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
1151
/**
 * rotl64 (vector): rotate each 64-bit lane of a left by n bits; n is
 * widened to u64 to match rotate()'s operand type.
 */
static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) n);
}
1156
/**
 * __bfe: software bit-field extract -- returns the c bits of a starting
 * at bit position b (generic-target stand-in for a hardware BFE
 * instruction).
 *
 * NOTE(review): BIT_MASK (32) expands to (1 << 32) - 1, so c == 32 does
 * not yield an all-ones mask; callers presumably pass c < 32 -- confirm.
 * The helper macros are not #undef'd and stay defined for the rest of
 * the translation unit; verify nothing else relies on them before
 * cleaning that up.
 */
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
#define BIT(x) (1 << (x))
#define BIT_MASK(x) (BIT (x) - 1)
#define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))

  return BFE (a, b, c);
}
1165
/**
 * amd_bytealign (vector): software fallback for AMD's amd_bytealign,
 * applied lane-wise.  For each lane i the result is the low 32 bits of
 * ((a.si:b.si) >> ((c & 3) * 8)) -- a byte-granular funnel shift with a
 * in the high half and b in the low half.
 *
 * Each VECT_SIZE branch widens the u32 lanes to a u64 vector via a
 * vector literal (component-wise conversion), does the combined
 * shift/or in 64 bits, then truncates each lane back to u32.  The
 * shift amount c is scalar and shared by all lanes.
 */
static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
{
#if VECT_SIZE == 1
  const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);

  return (u32x) (tmp);
#endif

#if VECT_SIZE == 2
  const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1);
#endif

#if VECT_SIZE == 4
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);
#endif

#if VECT_SIZE == 8
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
#endif
}
1192 #endif
1193
/**
 * Per-hash-mode digest container shared between host code and kernels.
 * Exactly one _<ALGO>_ macro is defined per compilation, selecting the
 * digest_buf size (in 32-bit words) for that algorithm.
 *
 * Fix: the chain previously contained second, unreachable branches for
 * _SCRYPT_ and _LOTUS8_ near the end -- in an #elif chain only the
 * first matching branch is ever taken, and the duplicate _SCRYPT_
 * branch even declared a conflicting size ([4] vs the effective [8]).
 * The dead duplicates have been removed; the effective layout for every
 * mode is unchanged.
 */
typedef struct
{
#if defined _DES_
  u32 digest_buf[4];
#elif defined _MD4_
  u32 digest_buf[4];
#elif defined _MD5_
  u32 digest_buf[4];
#elif defined _MD5H_
  u32 digest_buf[4];
#elif defined _SHA1_
  u32 digest_buf[5];
#elif defined _BCRYPT_
  u32 digest_buf[6];
#elif defined _SHA256_
  u32 digest_buf[8];
#elif defined _SHA384_
  u32 digest_buf[16];
#elif defined _SHA512_
  u32 digest_buf[16];
#elif defined _KECCAK_
  u32 digest_buf[50];
#elif defined _RIPEMD160_
  u32 digest_buf[5];
#elif defined _WHIRLPOOL_
  u32 digest_buf[16];
#elif defined _GOST_
  u32 digest_buf[8];
#elif defined _GOST2012_256_
  u32 digest_buf[8];
#elif defined _GOST2012_512_
  u32 digest_buf[16];
#elif defined _SAPB_
  u32 digest_buf[4];
#elif defined _SAPG_
  u32 digest_buf[5];
#elif defined _MYSQL323_
  u32 digest_buf[4];
#elif defined _LOTUS5_
  u32 digest_buf[4];
#elif defined _LOTUS6_
  u32 digest_buf[4];
#elif defined _SCRYPT_
  u32 digest_buf[8];
#elif defined _LOTUS8_
  u32 digest_buf[4];
#elif defined _OFFICE2007_
  u32 digest_buf[4];
#elif defined _OFFICE2010_
  u32 digest_buf[4];
#elif defined _OFFICE2013_
  u32 digest_buf[4];
#elif defined _OLDOFFICE01_
  u32 digest_buf[4];
#elif defined _OLDOFFICE34_
  u32 digest_buf[4];
#elif defined _SIPHASH_
  u32 digest_buf[4];
#elif defined _PBKDF2_MD5_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA1_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA256_
  u32 digest_buf[32];
#elif defined _PBKDF2_SHA512_
  u32 digest_buf[32];
#elif defined _PDF17L8_
  u32 digest_buf[8];
#elif defined _CRC32_
  u32 digest_buf[4];
#elif defined _SEVEN_ZIP_
  u32 digest_buf[4];
#elif defined _ANDROIDFDE_
  u32 digest_buf[4];
#elif defined _DCC2_
  u32 digest_buf[4];
#elif defined _WPA_
  u32 digest_buf[4];
#elif defined _MD5_SHA1_
  u32 digest_buf[4];
#elif defined _SHA1_MD5_
  u32 digest_buf[5];
#elif defined _NETNTLMV2_
  u32 digest_buf[4];
#elif defined _KRB5PA_
  u32 digest_buf[4];
#elif defined _CLOUDKEY_
  u32 digest_buf[8];
#elif defined _PSAFE2_
  u32 digest_buf[5];
#elif defined _RAR3_
  u32 digest_buf[4];
#elif defined _SHA256_SHA1_
  u32 digest_buf[8];
#elif defined _MS_DRSR_
  u32 digest_buf[8];
#elif defined _ANDROIDFDE_SAMSUNG_
  u32 digest_buf[8];
#elif defined _RAR5_
  u32 digest_buf[4];
#endif

} digest_t;
1301
/**
 * Generic salt record shared by all hash modes (layout shared with the
 * host -- do not reorder fields).  Mode-specific fields (keccak_mdlen,
 * truecrypt_mdlen, scrypt_*) are only meaningful for those algorithms.
 */
typedef struct
{
  u32 salt_buf[16];     // raw salt data
  u32 salt_buf_pc[8];   // precomputed salt variant -- NOTE(review): producer is host-side, confirm semantics

  u32 salt_len;         // length of the salt in salt_buf
  u32 salt_iter;        // iteration count for iterated modes
  u32 salt_sign[2];

  u32 keccak_mdlen;     // digest length, Keccak modes only
  u32 truecrypt_mdlen;  // digest length, TrueCrypt modes only

  u32 digests_cnt;      // number of digests bound to this salt
  u32 digests_done;     // how many of them are already cracked

  u32 digests_offset;   // offset of this salt's digests in the global digest list

  u32 scrypt_N;         // scrypt cost parameters (scrypt modes only)
  u32 scrypt_r;
  u32 scrypt_p;
  u32 scrypt_tmto;      // scrypt time-memory tradeoff factor
  u32 scrypt_phy;

} salt_t;
1326
// Per-hash "esalt" records: mode-specific extra salt data copied from the
// host.  Layouts are shared with host code -- do not reorder fields.

// PDF hashes: security handler parameters (V/R/P per the PDF spec --
// NOTE(review): confirm against the hash parser) plus ID/U/O strings.
typedef struct
{
  int V;
  int R;
  int P;

  int enc_md;

  u32 id_buf[8];
  u32 u_buf[32];
  u32 o_buf[32];

  int id_len;
  int o_len;
  int u_len;

  u32 rc4key[2];
  u32 rc4data[2];

} pdf_t;

// WPA/WPA2 handshake: PKE data, EAPOL frame (eapol_size bytes) and the
// key version in use.
typedef struct
{
  u32 pke[25];
  u32 eapol[64];
  int eapol_size;
  int keyver;

} wpa_t;

// Bitcoin wallet: master key, encrypted key and public key blobs with
// their lengths.
typedef struct
{
  u32 cry_master_buf[64];
  u32 ckey_buf[64];
  u32 public_key_buf[64];

  u32 cry_master_len;
  u32 ckey_len;
  u32 public_key_len;

} bitcoin_wallet_t;

// SIP digest authentication: salt plus extended salt material.
typedef struct
{
  u32 salt_buf[30];
  u32 salt_len;

  u32 esalt_buf[38];
  u32 esalt_len;

} sip_t;

// Android FDE: raw encrypted data block.
typedef struct
{
  u32 data[384];

} androidfde_t;

// IKE-PSK: nonce/result material and message buffer with lengths.
typedef struct
{
  u32 nr_buf[16];
  u32 nr_len;

  u32 msg_buf[128];
  u32 msg_len;

} ikepsk_t;

// NetNTLM: user/domain and server/client challenge data.
typedef struct
{
  u32 user_len;
  u32 domain_len;
  u32 srvchall_len;
  u32 clichall_len;

  u32 userdomain_buf[64];
  u32 chall_buf[256];

} netntlm_t;

// Kerberos 5 pre-auth: principal, realm, salt, encrypted timestamp and
// its checksum.
typedef struct
{
  u32 user[16];
  u32 realm[16];
  u32 salt[32];
  u32 timestamp[16];
  u32 checksum[4];

} krb5pa_t;

// TrueCrypt: salt, encrypted header data and optional keyfile material.
typedef struct
{
  u32 salt_buf[16];
  u32 data_buf[112];
  u32 keyfile_buf[16];

} tc_t;
1424
// PBKDF2 esalts: raw salt buffers for each PRF variant (sizes differ
// only for SHA-512).

typedef struct
{
  u32 salt_buf[16];

} pbkdf2_md5_t;

typedef struct
{
  u32 salt_buf[16];

} pbkdf2_sha1_t;

typedef struct
{
  u32 salt_buf[16];

} pbkdf2_sha256_t;

typedef struct
{
  u32 salt_buf[32];

} pbkdf2_sha512_t;

// RAKP (IPMI): session salt data and its length.
typedef struct
{
  u32 salt_buf[128];
  u32 salt_len;

} rakp_t;

// CloudKey: encrypted data blob and its length.
typedef struct
{
  u32 data_len;
  u32 data_buf[512];

} cloudkey_t;
1462
// MS Office esalts: encrypted password-verifier blobs used to test a
// candidate key (layouts shared with the host parser).

typedef struct
{
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[5];

  u32 keySize;  // cipher key size for this document

} office2007_t;

typedef struct
{
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[8];

} office2010_t;

typedef struct
{
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[8];

} office2013_t;

// Legacy Office (oldoffice 0/1 and 3/4): format version, verifier blob
// and an optional recovered RC4 key.
typedef struct
{
  u32 version;
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[4];
  u32 rc4key[2];

} oldoffice01_t;

typedef struct
{
  u32 version;
  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[5];
  u32 rc4key[2];

} oldoffice34_t;
1503
// Per-candidate scratch ("tmp") state carried between kernel invocations
// of slow-hash loops.

// PDF rev. 1.4: running digest and output block.
typedef struct
{
  u32 digest[4];
  u32 out[4];

} pdf14_tmp_t;

// PDF 1.7 level 8: digest viewed as 32- or 64-bit words (union shares
// storage), plus current digest/W lengths.
typedef struct
{
  union
  {
    u32 dgst32[16];
    u64 dgst64[8];
  };

  u32 dgst_len;
  u32 W_len;

} pdf17l8_tmp_t;

// phpass: MD5 state between iterations.
typedef struct
{
  u32 digest_buf[4];

} phpass_tmp_t;

// md5crypt: MD5 state between iterations.
typedef struct
{
  u32 digest_buf[4];

} md5crypt_tmp_t;

// sha256crypt: alternate result plus P/S byte sequences (per the
// sha-crypt specification).
typedef struct
{
  u32 alt_result[8];

  u32 p_bytes[4];
  u32 s_bytes[4];

} sha256crypt_tmp_t;

// sha512crypt: same roles as sha256crypt, in 64-bit words.
typedef struct
{
  u64 l_alt_result[8];

  u64 l_p_bytes[2];
  u64 l_s_bytes[2];

} sha512crypt_tmp_t;
1553
// WPA PBKDF2-HMAC-SHA1 scratch: HMAC inner/outer pads plus running
// digest and output.
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[10];
  u32 out[10];

} wpa_tmp_t;

// Bitcoin wallet: SHA-512 chaining state between iterations.
typedef struct
{
  u64 dgst[8];

} bitcoin_wallet_tmp_t;

// DCC2 (mscash2) PBKDF2-HMAC-SHA1 scratch.
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[5];
  u32 out[4];

} dcc2_tmp_t;
1579
// bcrypt scratch: expanded key E plus the Blowfish P-array and four
// S-boxes.
typedef struct
{
  u32 E[18];

  u32 P[18];

  u32 S0[256];
  u32 S1[256];
  u32 S2[256];
  u32 S3[256];

} bcrypt_tmp_t;

// Password Safe v2 scratch: running digest plus Blowfish P-array and
// S-boxes.
typedef struct
{
  u32 digest[2];

  u32 P[18];

  u32 S0[256];
  u32 S1[256];
  u32 S2[256];
  u32 S3[256];

} pwsafe2_tmp_t;

// Password Safe v3: SHA-256 state between iterations.
typedef struct
{
  u32 digest_buf[8];

} pwsafe3_tmp_t;

// Android PIN: SHA-1 state between iterations.
typedef struct
{
  u32 digest_buf[5];

} androidpin_tmp_t;
1617
// Android FDE PBKDF2-HMAC-SHA1 scratch.
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[10];
  u32 out[10];

} androidfde_tmp_t;

// TrueCrypt scratch, 32-bit-word PRFs: HMAC pads plus derived key
// material for the (multi-key) header.
typedef struct
{
  u32 ipad[16];
  u32 opad[16];

  u32 dgst[64];
  u32 out[64];

} tc_tmp_t;

// TrueCrypt scratch, 64-bit-word PRFs (e.g. SHA-512 based).
typedef struct
{
  u64 ipad[8];
  u64 opad[8];

  u64 dgst[32];
  u64 out[32];

} tc64_tmp_t;

// Generic PBKDF2 scratch per PRF: HMAC inner/outer pads, running digest
// and accumulated output.
typedef struct
{
  u32 ipad[4];
  u32 opad[4];

  u32 dgst[32];
  u32 out[32];

} pbkdf2_md5_tmp_t;

typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[32];
  u32 out[32];

} pbkdf2_sha1_tmp_t;

typedef struct
{
  u32 ipad[8];
  u32 opad[8];

  u32 dgst[32];
  u32 out[32];

} pbkdf2_sha256_tmp_t;

typedef struct
{
  u64 ipad[8];
  u64 opad[8];

  u64 dgst[16];
  u64 out[16];

} pbkdf2_sha512_tmp_t;
1687
// eCryptfs: SHA-512 state between iterations.
typedef struct
{
  u64 out[8];

} ecryptfs_tmp_t;

// Oracle T: PBKDF2-HMAC-SHA512-style scratch.
typedef struct
{
  u64 ipad[8];
  u64 opad[8];

  u64 dgst[16];
  u64 out[16];

} oraclet_tmp_t;

// 1Password Agile Keychain PBKDF2-HMAC-SHA1 scratch.
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[5];
  u32 out[5];

} agilekey_tmp_t;

// blockchain "My Wallet": two interleaved HMAC-SHA1 digest/output
// pairs sharing one set of pads.
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst1[5];
  u32 out1[5];

  u32 dgst2[5];
  u32 out2[5];

} mywallet_tmp_t;

// AIX {ssha1} PBKDF2-HMAC-SHA1 scratch.
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[5];
  u32 out[5];

} sha1aix_tmp_t;

// AIX {ssha256} scratch.
typedef struct
{
  u32 ipad[8];
  u32 opad[8];

  u32 dgst[8];
  u32 out[8];

} sha256aix_tmp_t;

// AIX {ssha512} scratch.
typedef struct
{
  u64 ipad[8];
  u64 opad[8];

  u64 dgst[8];
  u64 out[8];

} sha512aix_tmp_t;
1756
// LastPass PBKDF2-HMAC-SHA256 scratch.
typedef struct
{
  u32 ipad[8];
  u32 opad[8];

  u32 dgst[8];
  u32 out[8];

} lastpass_tmp_t;

// Drupal 7: SHA-512 state between iterations.
typedef struct
{
  u64 digest_buf[8];

} drupal7_tmp_t;

// Lotus Notes/Domino 8 PBKDF2-HMAC-SHA1 scratch.
typedef struct
{
  u32 ipad[5];
  u32 opad[5];

  u32 dgst[5];
  u32 out[5];

} lotus8_tmp_t;

// MS Office iterated-SHA1 states: 5-word (SHA-1) for 2007/2010,
// 8x64-bit (SHA-512) for 2013.
typedef struct
{
  u32 out[5];

} office2007_tmp_t;

typedef struct
{
  u32 out[5];

} office2010_tmp_t;

typedef struct
{
  u64 out[8];

} office2013_tmp_t;

// SAP CODVN H (iterated SHA-1) state.
typedef struct
{
  u32 digest_buf[5];

} saph_sha1_tmp_t;

// 7-Zip iterated SHA-256: pending message block and its fill level,
// running digest, and total length fed so far.
typedef struct
{
  u32 block[16];

  u32 dgst[8];

  u32 block_len;
  u32 final_len;

} seven_zip_tmp_t;

// BSDi crypt: DES key schedule halves plus IV.
typedef struct
{
  u32 Kc[16];
  u32 Kd[16];

  u32 iv[2];

} bsdicrypt_tmp_t;

// RAR3: 17 intermediate SHA-1 digests kept across the iteration loop.
typedef struct
{
  u32 dgst[17][5];

} rar3_tmp_t;
1832
// CRAM-MD5: challenge/user data.
typedef struct
{
  u32 user[16];

} cram_md5_t;

// 7-Zip esalt: IV, salt, CRC of the plaintext, encrypted data and the
// unpacked size used for verification.
typedef struct
{
  u32 iv_buf[4];
  u32 iv_len;

  u32 salt_buf[4];
  u32 salt_len;

  u32 crc;

  u32 data_buf[96];
  u32 data_len;

  u32 unpack_size;

} seven_zip_t;

// Markov-statistics table entry: key with 64-bit count value.
typedef struct
{
  u32 key;
  u64 val;

} hcstat_table_t;

// Charset: up to 0x100 code points plus the number actually used.
typedef struct
{
  u32 cs_buf[0x100];
  u32 cs_len;

} cs_t;

// One rule program: packed rule operations for the rule engine.
typedef struct
{
  u32 cmds[0x100];

} kernel_rule_t;

// Location of a cracked plain: packed gid/vector-id and the position in
// the innerloop.
typedef struct
{
  u32 gidvid;
  u32 il_pos;

} plain_t;
1882
// Candidate password: word buffer, its length, and explicit padding to
// keep the struct size/alignment stable across host and device.
typedef struct
{
  u32 i[64];

  u32 pw_len;

  u32 alignment_placeholder_1;
  u32 alignment_placeholder_2;
  u32 alignment_placeholder_3;

} bf_t_pad_note_removed_do_not_use; /* placeholder name never used */

} pw_t;

typedef struct
{
  u32 i;

} bf_t;