2 * Author......: Jens Steube <jens.steube@gmail.com>
6 #define DEVICE_TYPE_CPU 2
7 #define DEVICE_TYPE_GPU 4
47 // this one needs to die
50 static inline u32
l32_from_64_S (u64 a
)
52 const u32 r
= (u32
) (a
);
57 static inline u32
h32_from_64_S (u64 a
)
61 const u32 r
= (u32
) (a
);
66 static inline u64
hl32_to_64_S (const u32 a
, const u32 b
)
68 return as_ulong ((uint2
) (b
, a
));
71 static inline u32x
l32_from_64 (u64x a
)
99 static inline u32x
h32_from_64 (u64x a
)
129 static inline u64x
hl32_to_64 (const u32x a
, const u32x b
)
134 r
= as_ulong ((uint2
) (b
, a
));
138 r
.s0
= as_ulong ((uint2
) (b
.s0
, a
.s0
));
139 r
.s1
= as_ulong ((uint2
) (b
.s1
, a
.s1
));
143 r
.s2
= as_ulong ((uint2
) (b
.s2
, a
.s2
));
144 r
.s3
= as_ulong ((uint2
) (b
.s3
, a
.s3
));
148 r
.s4
= as_ulong ((uint2
) (b
.s4
, a
.s4
));
149 r
.s5
= as_ulong ((uint2
) (b
.s5
, a
.s5
));
150 r
.s6
= as_ulong ((uint2
) (b
.s6
, a
.s6
));
151 r
.s7
= as_ulong ((uint2
) (b
.s7
, a
.s7
));
158 static inline u32
swap32_S (const u32 v
)
160 return (as_uint (as_uchar4 (v
).s3210
));
163 static inline u64
swap64_S (const u64 v
)
165 return (as_ulong (as_uchar8 (v
).s76543210
));
168 static inline u32
rotr32_S (const u32 a
, const u32 n
)
170 return rotate (a
, 32 - n
);
173 static inline u32
rotl32_S (const u32 a
, const u32 n
)
175 return rotate (a
, n
);
178 static inline u64
rotr64_S (const u64 a
, const u32 n
)
180 #if DEVICE_TYPE == DEVICE_TYPE_CPU
182 const u64 r
= rotate (a
, (u64
) 64 - n
);
186 const u32 a0
= h32_from_64_S (a
);
187 const u32 a1
= l32_from_64_S (a
);
189 const u32 t0
= (n
>= 32) ? amd_bitalign (a0
, a1
, n
- 32) : amd_bitalign (a1
, a0
, n
);
190 const u32 t1
= (n
>= 32) ? amd_bitalign (a1
, a0
, n
- 32) : amd_bitalign (a0
, a1
, n
);
192 const u64 r
= hl32_to_64_S (t0
, t1
);
199 static inline u64
rotl64_S (const u64 a
, const u32 n
)
201 return rotr64_S (a
, 64 - n
);
204 static inline u32x
swap32 (const u32x v
)
206 return ((v
>> 24) & 0x000000ff)
207 | ((v
>> 8) & 0x0000ff00)
208 | ((v
<< 8) & 0x00ff0000)
209 | ((v
<< 24) & 0xff000000);
212 static inline u64x
swap64 (const u64x v
)
214 return ((v
>> 56) & 0x00000000000000ff)
215 | ((v
>> 40) & 0x000000000000ff00)
216 | ((v
>> 24) & 0x0000000000ff0000)
217 | ((v
>> 8) & 0x00000000ff000000)
218 | ((v
<< 8) & 0x000000ff00000000)
219 | ((v
<< 24) & 0x0000ff0000000000)
220 | ((v
<< 40) & 0x00ff000000000000)
221 | ((v
<< 56) & 0xff00000000000000);
224 static inline u32x
rotr32 (const u32x a
, const u32 n
)
226 return rotate (a
, 32 - n
);
229 static inline u32x
rotl32 (const u32x a
, const u32 n
)
231 return rotate (a
, n
);
234 static inline u64x
rotr64 (const u64x a
, const u32 n
)
236 #if DEVICE_TYPE == DEVICE_TYPE_CPU
238 const u64x r
= rotate (a
, (u64
) 64 - n
);
242 const u32x a0
= h32_from_64 (a
);
243 const u32x a1
= l32_from_64 (a
);
245 const u32x t0
= (n
>= 32) ? amd_bitalign (a0
, a1
, n
- 32) : amd_bitalign (a1
, a0
, n
);
246 const u32x t1
= (n
>= 32) ? amd_bitalign (a1
, a0
, n
- 32) : amd_bitalign (a0
, a1
, n
);
248 const u64x r
= hl32_to_64 (t0
, t1
);
255 static inline u64x
rotl64 (const u64x a
, const u32 n
)
257 return rotr64 (a
, 64 - n
);
260 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
262 return amd_bfe (a
, b
, c
);
265 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
267 return amd_bytealign (a
, b
, c
);
272 static inline u32
swap32_S (const u32 v
)
276 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r
) : "r"(v
));
281 static inline u64
swap64_S (const u64 v
)
286 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(v
));
291 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl
) : "r"(il
));
292 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr
) : "r"(ir
));
296 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tr
), "r"(tl
));
301 static inline u32
rotr32_S (const u32 a
, const u32 n
)
303 return rotate (a
, 32 - n
);
306 static inline u32
rotl32_S (const u32 a
, const u32 n
)
308 return rotate (a
, n
);
312 static inline u64
rotr64_S (const u64 a
, const u32 n
)
317 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
324 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
325 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
329 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
330 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
335 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
340 static inline u64
rotr64_S (const u64 a
, const u32 n
)
342 return rotate (a
, (u64
) 64 - n
);
346 static inline u64
rotl64_S (const u64 a
, const u32 n
)
348 return rotr64_S (a
, 64 - n
);
352 static inline u32
lut3_2d_S (const u32 a
, const u32 b
, const u32 c
)
356 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
361 static inline u32
lut3_39_S (const u32 a
, const u32 b
, const u32 c
)
365 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
370 static inline u32
lut3_59_S (const u32 a
, const u32 b
, const u32 c
)
374 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
379 static inline u32
lut3_96_S (const u32 a
, const u32 b
, const u32 c
)
383 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
388 static inline u32
lut3_e4_S (const u32 a
, const u32 b
, const u32 c
)
392 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
397 static inline u32
lut3_e8_S (const u32 a
, const u32 b
, const u32 c
)
401 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
406 static inline u32
lut3_ca_S (const u32 a
, const u32 b
, const u32 c
)
410 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
416 static inline u32
__byte_perm_S (const u32 a
, const u32 b
, const u32 c
)
420 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
425 static inline u32x
swap32 (const u32x v
)
427 return ((v
>> 24) & 0x000000ff)
428 | ((v
>> 8) & 0x0000ff00)
429 | ((v
<< 8) & 0x00ff0000)
430 | ((v
<< 24) & 0xff000000);
433 static inline u64x
swap64 (const u64x v
)
435 return ((v
>> 56) & 0x00000000000000ff)
436 | ((v
>> 40) & 0x000000000000ff00)
437 | ((v
>> 24) & 0x0000000000ff0000)
438 | ((v
>> 8) & 0x00000000ff000000)
439 | ((v
<< 8) & 0x000000ff00000000)
440 | ((v
<< 24) & 0x0000ff0000000000)
441 | ((v
<< 40) & 0x00ff000000000000)
442 | ((v
<< 56) & 0xff00000000000000);
445 static inline u32x
rotr32 (const u32x a
, const u32 n
)
447 return rotate (a
, 32 - n
);
450 static inline u32x
rotl32 (const u32x a
, const u32 n
)
452 return rotate (a
, n
);
456 static inline u64x
rotr64 (const u64x a
, const u32 n
)
467 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
471 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
472 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
476 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
477 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
480 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
487 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s0
));
491 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
492 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
496 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
497 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
500 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s0
) : "r"(tl
), "r"(tr
));
504 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s1
));
508 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
509 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
513 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
514 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
517 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s1
) : "r"(tl
), "r"(tr
));
525 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s2
));
529 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
530 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
534 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
535 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
538 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s2
) : "r"(tl
), "r"(tr
));
542 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s3
));
546 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
547 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
551 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
552 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
555 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s3
) : "r"(tl
), "r"(tr
));
563 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s4
));
567 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
568 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
572 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
573 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
576 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s4
) : "r"(tl
), "r"(tr
));
580 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s5
));
584 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
585 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
589 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
590 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
593 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s5
) : "r"(tl
), "r"(tr
));
597 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s6
));
601 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
602 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
606 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
607 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
610 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s6
) : "r"(tl
), "r"(tr
));
614 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s7
));
618 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
619 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
623 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
624 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
627 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s7
) : "r"(tl
), "r"(tr
));
635 static inline u64x
rotr64 (const u64x a
, const u32 n
)
637 return rotate (a
, (u64
) 64 - n
);
641 static inline u64x
rotl64 (const u64x a
, const u32 n
)
643 return rotr64 (a
, (u64
) 64 - n
);
646 static inline u32x
__byte_perm (const u32x a
, const u32x b
, const u32x c
)
651 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
) );
655 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s0
) : "r"(a
.s0
), "r"(b
.s0
), "r"(c
.s0
));
656 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s1
) : "r"(a
.s1
), "r"(b
.s1
), "r"(c
.s1
));
660 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s2
) : "r"(a
.s2
), "r"(b
.s2
), "r"(c
.s2
));
661 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s3
) : "r"(a
.s3
), "r"(b
.s3
), "r"(c
.s3
));
665 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s4
) : "r"(a
.s4
), "r"(b
.s4
), "r"(c
.s4
));
666 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s5
) : "r"(a
.s5
), "r"(b
.s5
), "r"(c
.s5
));
667 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s6
) : "r"(a
.s6
), "r"(b
.s6
), "r"(c
.s6
));
668 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s7
) : "r"(a
.s7
), "r"(b
.s7
), "r"(c
.s7
));
674 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
678 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
684 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
688 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(b
), "r"(a
), "r"((c
& 3) * 8));
693 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
695 return __byte_perm_S (b
, a
, (0x76543210 >> ((c
& 3) * 4)) & 0xffff);
700 static inline u32x
lut3_2d (const u32x a
, const u32x b
, const u32x c
)
705 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
709 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
710 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
714 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
715 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
719 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
720 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
721 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
722 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
728 static inline u32x
lut3_39 (const u32x a
, const u32x b
, const u32x c
)
733 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
737 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
738 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
742 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
743 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
744 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
745 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
749 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
750 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
751 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
752 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
753 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
754 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
755 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
756 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
762 static inline u32x
lut3_59 (const u32x a
, const u32x b
, const u32x c
)
767 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
771 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
772 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
776 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
777 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
778 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
779 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
783 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
784 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
785 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
786 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
787 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
788 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
789 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
790 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
796 static inline u32x
lut3_96 (const u32x a
, const u32x b
, const u32x c
)
801 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
805 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
806 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
810 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
811 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
812 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
813 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
817 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
818 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
819 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
820 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
821 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
822 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
823 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
824 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
830 static inline u32x
lut3_e4 (const u32x a
, const u32x b
, const u32x c
)
835 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
839 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
840 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
844 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
845 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
846 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
847 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
851 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
852 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
853 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
854 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
855 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
856 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
857 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
858 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
864 static inline u32x
lut3_e8 (const u32x a
, const u32x b
, const u32x c
)
869 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
873 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
874 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
878 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
879 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
880 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
881 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
885 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
886 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
887 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
888 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
889 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
890 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
891 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
892 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
898 static inline u32x
lut3_ca (const u32x a
, const u32x b
, const u32x c
)
903 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
907 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
908 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
912 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
913 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
914 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
915 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
919 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
920 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
921 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
922 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
923 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
924 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
925 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
926 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
936 static inline u32
swap32_S (const u32 v
)
938 return (as_uint (as_uchar4 (v
).s3210
));
941 static inline u64
swap64_S (const u64 v
)
943 return (as_ulong (as_uchar8 (v
).s76543210
));
946 static inline u32
rotr32_S (const u32 a
, const u32 n
)
948 return rotate (a
, 32 - n
);
951 static inline u32
rotl32_S (const u32 a
, const u32 n
)
953 return rotate (a
, n
);
956 static inline u64
rotr64_S (const u64 a
, const u32 n
)
958 return rotate (a
, (u64
) 64 - n
);
961 static inline u64
rotl64_S (const u64 a
, const u32 n
)
963 return rotate (a
, (u64
) n
);
966 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
968 const u64 tmp
= ((((u64
) a
) << 32) | ((u64
) b
)) >> ((c
& 3) * 8);
973 static inline u32x
swap32 (const u32x v
)
975 return ((v
>> 24) & 0x000000ff)
976 | ((v
>> 8) & 0x0000ff00)
977 | ((v
<< 8) & 0x00ff0000)
978 | ((v
<< 24) & 0xff000000);
981 static inline u64x
swap64 (const u64x v
)
983 return ((v
>> 56) & 0x00000000000000ff)
984 | ((v
>> 40) & 0x000000000000ff00)
985 | ((v
>> 24) & 0x0000000000ff0000)
986 | ((v
>> 8) & 0x00000000ff000000)
987 | ((v
<< 8) & 0x000000ff00000000)
988 | ((v
<< 24) & 0x0000ff0000000000)
989 | ((v
<< 40) & 0x00ff000000000000)
990 | ((v
<< 56) & 0xff00000000000000);
993 static inline u32x
rotr32 (const u32x a
, const u32 n
)
995 return rotate (a
, 32 - n
);
998 static inline u32x
rotl32 (const u32x a
, const u32 n
)
1000 return rotate (a
, n
);
1003 static inline u64x
rotr64 (const u64x a
, const u32 n
)
1005 return rotate (a
, (u64
) 64 - n
);
1008 static inline u64x
rotl64 (const u64x a
, const u32 n
)
1010 return rotate (a
, (u64
) n
);
1013 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
1015 #define BIT(x) (1 << (x))
1016 #define BIT_MASK(x) (BIT (x) - 1)
1017 #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
1019 return BFE (a
, b
, c
);
1022 static inline u32x
amd_bytealign (const u32x a
, const u32x b
, const u32 c
)
1025 const u64x tmp
= ((((u64x
) (a
)) << 32) | ((u64x
) (b
))) >> ((c
& 3) * 8);
1027 return (u32x
) (tmp
);
1031 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
)) << 32) | ((u64x
) (b
.s0
, b
.s1
))) >> ((c
& 3) * 8);
1033 return (u32x
) (tmp
.s0
, tmp
.s1
);
1037 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
))) >> ((c
& 3) * 8);
1039 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
);
1043 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
, a
.s4
, a
.s5
, a
.s6
, a
.s7
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
, b
.s4
, b
.s5
, b
.s6
, b
.s7
))) >> ((c
& 3) * 8);
1045 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
, tmp
.s4
, tmp
.s5
, tmp
.s6
, tmp
.s7
);
1058 #elif defined _MD5H_
1060 #elif defined _SHA1_
1062 #elif defined _BCRYPT_
1064 #elif defined _SHA256_
1066 #elif defined _SHA384_
1068 #elif defined _SHA512_
1070 #elif defined _KECCAK_
1072 #elif defined _RIPEMD160_
1074 #elif defined _WHIRLPOOL_
1076 #elif defined _GOST_
1078 #elif defined _GOST2012_256_
1080 #elif defined _GOST2012_512_
1082 #elif defined _SAPB_
1084 #elif defined _SAPG_
1086 #elif defined _MYSQL323_
1088 #elif defined _LOTUS5_
1090 #elif defined _LOTUS6_
1092 #elif defined _SCRYPT_
1094 #elif defined _LOTUS8_
1096 #elif defined _OFFICE2007_
1098 #elif defined _OFFICE2010_
1100 #elif defined _OFFICE2013_
1102 #elif defined _OLDOFFICE01_
1104 #elif defined _OLDOFFICE34_
1106 #elif defined _SIPHASH_
1108 #elif defined _PBKDF2_MD5_
1110 #elif defined _PBKDF2_SHA1_
1112 #elif defined _PBKDF2_SHA256_
1114 #elif defined _PBKDF2_SHA512_
1116 #elif defined _PDF17L8_
1118 #elif defined _CRC32_
1120 #elif defined _SEVEN_ZIP_
1122 #elif defined _ANDROIDFDE_
1124 #elif defined _DCC2_
1128 #elif defined _MD5_SHA1_
1130 #elif defined _SHA1_MD5_
1132 #elif defined _NETNTLMV2_
1134 #elif defined _KRB5PA_
1136 #elif defined _CLOUDKEY_
1138 #elif defined _SCRYPT_
1140 #elif defined _PSAFE2_
1142 #elif defined _LOTUS8_
1144 #elif defined _RAR3_
1146 #elif defined _SHA256_SHA1_
1148 #elif defined _MS_DRSR_
1150 #elif defined _ANDROIDFDE_SAMSUNG_
1152 #elif defined _RAR5_
1168 u32 truecrypt_mdlen
;
1215 u32 cry_master_buf
[64];
1217 u32 public_key_buf
[64];
1258 u32 userdomain_buf
[64];
1277 u32 keyfile_buf
[16];
1321 u32 encryptedVerifier
[4];
1322 u32 encryptedVerifierHash
[5];
1330 u32 encryptedVerifier
[4];
1331 u32 encryptedVerifierHash
[8];
1337 u32 encryptedVerifier
[4];
1338 u32 encryptedVerifierHash
[8];
1345 u32 encryptedVerifier
[4];
1346 u32 encryptedVerifierHash
[4];
1354 u32 encryptedVerifier
[4];
1355 u32 encryptedVerifierHash
[5];
1399 } sha256crypt_tmp_t
;
1403 u64 l_alt_result
[8];
1408 } sha512crypt_tmp_t
;
1424 } bitcoin_wallet_tmp_t
;
1522 } pbkdf2_sha1_tmp_t
;
1532 } pbkdf2_sha256_tmp_t
;
1542 } pbkdf2_sha512_tmp_t
;
1745 u32 alignment_placeholder_1
;
1746 u32 alignment_placeholder_2
;
1747 u32 alignment_placeholder_3
;