2 * Author......: Jens Steube <jens.steube@gmail.com>
6 #define DEVICE_TYPE_CPU 2
7 #define DEVICE_TYPE_GPU 4
47 // this one needs to die
50 static inline u32
l32_from_64_S (u64 a
)
52 const u32 r
= (u32
) (a
);
57 static inline u32
h32_from_64_S (u64 a
)
61 const u32 r
= (u32
) (a
);
66 static inline u64
hl32_to_64_S (const u32 a
, const u32 b
)
68 return as_ulong ((uint2
) (b
, a
));
71 static inline u32x
l32_from_64 (u64x a
)
99 static inline u32x
h32_from_64 (u64x a
)
129 static inline u64x
hl32_to_64 (const u32x a
, const u32x b
)
134 r
= as_ulong ((uint2
) (b
, a
));
138 r
.s0
= as_ulong ((uint2
) (b
.s0
, a
.s0
));
139 r
.s1
= as_ulong ((uint2
) (b
.s1
, a
.s1
));
143 r
.s2
= as_ulong ((uint2
) (b
.s2
, a
.s2
));
144 r
.s3
= as_ulong ((uint2
) (b
.s3
, a
.s3
));
148 r
.s4
= as_ulong ((uint2
) (b
.s4
, a
.s4
));
149 r
.s5
= as_ulong ((uint2
) (b
.s5
, a
.s5
));
150 r
.s6
= as_ulong ((uint2
) (b
.s6
, a
.s6
));
151 r
.s7
= as_ulong ((uint2
) (b
.s7
, a
.s7
));
158 static inline u32
swap32_S (const u32 v
)
160 return (as_uint (as_uchar4 (v
).s3210
));
163 static inline u64
swap64_S (const u64 v
)
165 return (as_ulong (as_uchar8 (v
).s76543210
));
168 static inline u32
rotr32_S (const u32 a
, const u32 n
)
170 return rotate (a
, 32 - n
);
173 static inline u32
rotl32_S (const u32 a
, const u32 n
)
175 return rotate (a
, n
);
178 static inline u64
rotr64_S (const u64 a
, const u32 n
)
182 #if DEVICE_TYPE == DEVICE_TYPE_CPU
184 r
= rotate (a
, (u64
) 64 - n
);
188 uint2 a2
= as_uint2 (a
);
192 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32)
193 : amd_bitalign (a2
.s1
, a2
.s0
, n
);
194 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32)
195 : amd_bitalign (a2
.s0
, a2
.s1
, n
);
204 static inline u64
rotl64_S (const u64 a
, const u32 n
)
206 return rotr64_S (a
, 64 - n
);
209 static inline u32x
swap32 (const u32x v
)
211 return ((v
>> 24) & 0x000000ff)
212 | ((v
>> 8) & 0x0000ff00)
213 | ((v
<< 8) & 0x00ff0000)
214 | ((v
<< 24) & 0xff000000);
217 static inline u64x
swap64 (const u64x v
)
219 return ((v
>> 56) & 0x00000000000000ff)
220 | ((v
>> 40) & 0x000000000000ff00)
221 | ((v
>> 24) & 0x0000000000ff0000)
222 | ((v
>> 8) & 0x00000000ff000000)
223 | ((v
<< 8) & 0x000000ff00000000)
224 | ((v
<< 24) & 0x0000ff0000000000)
225 | ((v
<< 40) & 0x00ff000000000000)
226 | ((v
<< 56) & 0xff00000000000000);
229 static inline u32x
rotr32 (const u32x a
, const u32 n
)
231 return rotate (a
, 32 - n
);
234 static inline u32x
rotl32 (const u32x a
, const u32 n
)
236 return rotate (a
, n
);
239 static inline u64x
rotr64 (const u64x a
, const u32 n
)
243 #if DEVICE_TYPE == DEVICE_TYPE_CPU
245 r
= rotate (a
, (u64
) 64 - n
);
256 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
257 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
264 a2
= as_uint2 (a
.s0
);
266 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
267 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
273 a2
= as_uint2 (a
.s1
);
275 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
276 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
284 a2
= as_uint2 (a
.s0
);
286 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
287 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
293 a2
= as_uint2 (a
.s1
);
295 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
296 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
302 a2
= as_uint2 (a
.s2
);
304 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
305 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
311 a2
= as_uint2 (a
.s3
);
313 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
314 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
322 a2
= as_uint2 (a
.s0
);
324 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
325 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
331 a2
= as_uint2 (a
.s1
);
333 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
334 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
340 a2
= as_uint2 (a
.s2
);
342 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
343 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
349 a2
= as_uint2 (a
.s3
);
351 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
352 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
358 a2
= as_uint2 (a
.s4
);
360 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
361 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
367 a2
= as_uint2 (a
.s5
);
369 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
370 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
376 a2
= as_uint2 (a
.s6
);
378 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
379 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
385 a2
= as_uint2 (a
.s7
);
387 t
.s0
= (n
>= 32) ? amd_bitalign (a2
.s0
, a2
.s1
, n
- 32) : amd_bitalign (a2
.s1
, a2
.s0
, n
);
388 t
.s1
= (n
>= 32) ? amd_bitalign (a2
.s1
, a2
.s0
, n
- 32) : amd_bitalign (a2
.s0
, a2
.s1
, n
);
399 static inline u64x
rotl64 (const u64x a
, const u32 n
)
401 return rotr64 (a
, 64 - n
);
404 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
406 return amd_bfe (a
, b
, c
);
409 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
411 return amd_bytealign (a
, b
, c
);
416 static inline u32
swap32_S (const u32 v
)
420 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r
) : "r"(v
));
425 static inline u64
swap64_S (const u64 v
)
430 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(v
));
435 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl
) : "r"(il
));
436 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr
) : "r"(ir
));
440 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tr
), "r"(tl
));
445 static inline u32
rotr32_S (const u32 a
, const u32 n
)
447 return rotate (a
, 32 - n
);
450 static inline u32
rotl32_S (const u32 a
, const u32 n
)
452 return rotate (a
, n
);
456 static inline u64
rotr64_S (const u64 a
, const u32 n
)
461 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
468 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
469 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
473 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
474 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
479 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
484 static inline u64
rotr64_S (const u64 a
, const u32 n
)
486 return rotate (a
, (u64
) 64 - n
);
490 static inline u64
rotl64_S (const u64 a
, const u32 n
)
492 return rotr64_S (a
, 64 - n
);
496 static inline u32
lut3_2d_S (const u32 a
, const u32 b
, const u32 c
)
500 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
505 static inline u32
lut3_39_S (const u32 a
, const u32 b
, const u32 c
)
509 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
514 static inline u32
lut3_59_S (const u32 a
, const u32 b
, const u32 c
)
518 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
523 static inline u32
lut3_96_S (const u32 a
, const u32 b
, const u32 c
)
527 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
532 static inline u32
lut3_e4_S (const u32 a
, const u32 b
, const u32 c
)
536 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
541 static inline u32
lut3_e8_S (const u32 a
, const u32 b
, const u32 c
)
545 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
550 static inline u32
lut3_ca_S (const u32 a
, const u32 b
, const u32 c
)
554 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
560 static inline u32
__byte_perm_S (const u32 a
, const u32 b
, const u32 c
)
564 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
569 static inline u32x
swap32 (const u32x v
)
571 return ((v
>> 24) & 0x000000ff)
572 | ((v
>> 8) & 0x0000ff00)
573 | ((v
<< 8) & 0x00ff0000)
574 | ((v
<< 24) & 0xff000000);
577 static inline u64x
swap64 (const u64x v
)
579 return ((v
>> 56) & 0x00000000000000ff)
580 | ((v
>> 40) & 0x000000000000ff00)
581 | ((v
>> 24) & 0x0000000000ff0000)
582 | ((v
>> 8) & 0x00000000ff000000)
583 | ((v
<< 8) & 0x000000ff00000000)
584 | ((v
<< 24) & 0x0000ff0000000000)
585 | ((v
<< 40) & 0x00ff000000000000)
586 | ((v
<< 56) & 0xff00000000000000);
589 static inline u32x
rotr32 (const u32x a
, const u32 n
)
591 return rotate (a
, 32 - n
);
594 static inline u32x
rotl32 (const u32x a
, const u32 n
)
596 return rotate (a
, n
);
599 static inline u64x
rotr64 (const u64x a
, const u32 n
)
601 return rotate (a
, (u64
) 64 - n
);
604 static inline u64x
rotl64 (const u64x a
, const u32 n
)
606 return rotate (a
, (u64
) n
);
609 static inline u32x
__byte_perm (const u32x a
, const u32x b
, const u32x c
)
614 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
) );
618 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s0
) : "r"(a
.s0
), "r"(b
.s0
), "r"(c
.s0
));
619 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s1
) : "r"(a
.s1
), "r"(b
.s1
), "r"(c
.s1
));
623 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s2
) : "r"(a
.s2
), "r"(b
.s2
), "r"(c
.s2
));
624 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s3
) : "r"(a
.s3
), "r"(b
.s3
), "r"(c
.s3
));
628 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s4
) : "r"(a
.s4
), "r"(b
.s4
), "r"(c
.s4
));
629 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s5
) : "r"(a
.s5
), "r"(b
.s5
), "r"(c
.s5
));
630 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s6
) : "r"(a
.s6
), "r"(b
.s6
), "r"(c
.s6
));
631 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s7
) : "r"(a
.s7
), "r"(b
.s7
), "r"(c
.s7
));
637 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
641 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
647 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
651 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(b
), "r"(a
), "r"((c
& 3) * 8));
656 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
658 return __byte_perm_S (b
, a
, (0x76543210 >> ((c
& 3) * 4)) & 0xffff);
663 static inline u32x
lut3_2d (const u32x a
, const u32x b
, const u32x c
)
668 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
672 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
673 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
677 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
678 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
682 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
683 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
684 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
685 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
691 static inline u32x
lut3_39 (const u32x a
, const u32x b
, const u32x c
)
696 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
700 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
701 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
705 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
706 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
707 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
708 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
712 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
713 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
714 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
715 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
716 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
717 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
718 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
719 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
725 static inline u32x
lut3_59 (const u32x a
, const u32x b
, const u32x c
)
730 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
734 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
735 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
739 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
740 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
741 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
742 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
746 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
747 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
748 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
749 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
750 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
751 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
752 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
753 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
759 static inline u32x
lut3_96 (const u32x a
, const u32x b
, const u32x c
)
764 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
768 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
769 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
773 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
774 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
775 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
776 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
780 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
781 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
782 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
783 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
784 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
785 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
786 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
787 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
793 static inline u32x
lut3_e4 (const u32x a
, const u32x b
, const u32x c
)
798 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
802 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
803 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
807 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
808 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
809 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
810 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
814 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
815 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
816 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
817 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
818 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
819 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
820 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
821 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
827 static inline u32x
lut3_e8 (const u32x a
, const u32x b
, const u32x c
)
832 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
836 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
837 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
841 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
842 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
843 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
844 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
848 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
849 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
850 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
851 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
852 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
853 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
854 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
855 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
861 static inline u32x
lut3_ca (const u32x a
, const u32x b
, const u32x c
)
866 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
870 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
871 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
875 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
876 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
877 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
878 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
882 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
883 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
884 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
885 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
886 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
887 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
888 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
889 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
899 static inline u32
swap32_S (const u32 v
)
901 return (as_uint (as_uchar4 (v
).s3210
));
904 static inline u64
swap64_S (const u64 v
)
906 return (as_ulong (as_uchar8 (v
).s76543210
));
909 static inline u32
rotr32_S (const u32 a
, const u32 n
)
911 return rotate (a
, 32 - n
);
914 static inline u32
rotl32_S (const u32 a
, const u32 n
)
916 return rotate (a
, n
);
919 static inline u64
rotr64_S (const u64 a
, const u32 n
)
921 return rotate (a
, (u64
) 64 - n
);
924 static inline u64
rotl64_S (const u64 a
, const u32 n
)
926 return rotate (a
, (u64
) n
);
929 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
931 const u64 tmp
= ((((u64
) a
) << 32) | ((u64
) b
)) >> ((c
& 3) * 8);
936 static inline u32x
swap32 (const u32x v
)
938 return ((v
>> 24) & 0x000000ff)
939 | ((v
>> 8) & 0x0000ff00)
940 | ((v
<< 8) & 0x00ff0000)
941 | ((v
<< 24) & 0xff000000);
944 static inline u64x
swap64 (const u64x v
)
946 return ((v
>> 56) & 0x00000000000000ff)
947 | ((v
>> 40) & 0x000000000000ff00)
948 | ((v
>> 24) & 0x0000000000ff0000)
949 | ((v
>> 8) & 0x00000000ff000000)
950 | ((v
<< 8) & 0x000000ff00000000)
951 | ((v
<< 24) & 0x0000ff0000000000)
952 | ((v
<< 40) & 0x00ff000000000000)
953 | ((v
<< 56) & 0xff00000000000000);
956 static inline u32x
rotr32 (const u32x a
, const u32 n
)
958 return rotate (a
, 32 - n
);
961 static inline u32x
rotl32 (const u32x a
, const u32 n
)
963 return rotate (a
, n
);
966 static inline u64x
rotr64 (const u64x a
, const u32 n
)
968 return rotate (a
, (u64
) 64 - n
);
971 static inline u64x
rotl64 (const u64x a
, const u32 n
)
973 return rotate (a
, (u64
) n
);
976 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
978 #define BIT(x) (1 << (x))
979 #define BIT_MASK(x) (BIT (x) - 1)
980 #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
982 return BFE (a
, b
, c
);
985 static inline u32x
amd_bytealign (const u32x a
, const u32x b
, const u32 c
)
988 const u64x tmp
= ((((u64x
) (a
)) << 32) | ((u64x
) (b
))) >> ((c
& 3) * 8);
994 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
)) << 32) | ((u64x
) (b
.s0
, b
.s1
))) >> ((c
& 3) * 8);
996 return (u32x
) (tmp
.s0
, tmp
.s1
);
1000 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
))) >> ((c
& 3) * 8);
1002 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
);
1006 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
, a
.s4
, a
.s5
, a
.s6
, a
.s7
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
, b
.s4
, b
.s5
, b
.s6
, b
.s7
))) >> ((c
& 3) * 8);
1008 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
, tmp
.s4
, tmp
.s5
, tmp
.s6
, tmp
.s7
);
1021 #elif defined _MD5H_
1023 #elif defined _SHA1_
1025 #elif defined _BCRYPT_
1027 #elif defined _SHA256_
1029 #elif defined _SHA384_
1031 #elif defined _SHA512_
1033 #elif defined _KECCAK_
1035 #elif defined _RIPEMD160_
1037 #elif defined _WHIRLPOOL_
1039 #elif defined _GOST_
1041 #elif defined _GOST2012_256_
1043 #elif defined _GOST2012_512_
1045 #elif defined _SAPB_
1047 #elif defined _SAPG_
1049 #elif defined _MYSQL323_
1051 #elif defined _LOTUS5_
1053 #elif defined _LOTUS6_
1055 #elif defined _SCRYPT_
1057 #elif defined _LOTUS8_
1059 #elif defined _OFFICE2007_
1061 #elif defined _OFFICE2010_
1063 #elif defined _OFFICE2013_
1065 #elif defined _OLDOFFICE01_
1067 #elif defined _OLDOFFICE34_
1069 #elif defined _SIPHASH_
1071 #elif defined _PBKDF2_MD5_
1073 #elif defined _PBKDF2_SHA1_
1075 #elif defined _PBKDF2_SHA256_
1077 #elif defined _PBKDF2_SHA512_
1079 #elif defined _PDF17L8_
1081 #elif defined _CRC32_
1083 #elif defined _SEVEN_ZIP_
1085 #elif defined _ANDROIDFDE_
1087 #elif defined _DCC2_
1091 #elif defined _MD5_SHA1_
1093 #elif defined _SHA1_MD5_
1095 #elif defined _NETNTLMV2_
1097 #elif defined _KRB5PA_
1099 #elif defined _CLOUDKEY_
1101 #elif defined _SCRYPT_
1103 #elif defined _PSAFE2_
1105 #elif defined _LOTUS8_
1107 #elif defined _RAR3_
1109 #elif defined _SHA256_SHA1_
1111 #elif defined _MS_DRSR_
1113 #elif defined _ANDROIDFDE_SAMSUNG_
1115 #elif defined _RAR5_
1131 u32 truecrypt_mdlen
;
1178 u32 cry_master_buf
[64];
1180 u32 public_key_buf
[64];
1221 u32 userdomain_buf
[64];
1240 u32 keyfile_buf
[16];
1284 u32 encryptedVerifier
[4];
1285 u32 encryptedVerifierHash
[5];
1293 u32 encryptedVerifier
[4];
1294 u32 encryptedVerifierHash
[8];
1300 u32 encryptedVerifier
[4];
1301 u32 encryptedVerifierHash
[8];
1308 u32 encryptedVerifier
[4];
1309 u32 encryptedVerifierHash
[4];
1317 u32 encryptedVerifier
[4];
1318 u32 encryptedVerifierHash
[5];
1362 } sha256crypt_tmp_t
;
1366 u64 l_alt_result
[8];
1371 } sha512crypt_tmp_t
;
1387 } bitcoin_wallet_tmp_t
;
1485 } pbkdf2_sha1_tmp_t
;
1495 } pbkdf2_sha256_tmp_t
;
1505 } pbkdf2_sha512_tmp_t
;
1708 u32 alignment_placeholder_1
;
1709 u32 alignment_placeholder_2
;
1710 u32 alignment_placeholder_3
;