2 * Authors.....: Jens Steube <jens.steube@gmail.com>
3 * magnum <john.magnum@hushmail.com>
8 #define DEVICE_TYPE_CPU 2
9 #define DEVICE_TYPE_GPU 4
21 #define CONCAT(a, b) a##b
22 #define VTYPE(type, width) CONCAT(type, width)
30 typedef VTYPE(uchar
, VECT_SIZE
) u8x
;
31 typedef VTYPE(ushort
, VECT_SIZE
) u16x
;
32 typedef VTYPE(uint
, VECT_SIZE
) u32x
;
33 typedef VTYPE(ulong
, VECT_SIZE
) u64x
;
36 static inline u32
l32_from_64_S (u64 a
)
38 const u32 r
= (u32
) (a
);
43 static inline u32
h32_from_64_S (u64 a
)
47 const u32 r
= (u32
) (a
);
52 static inline u64
hl32_to_64_S (const u32 a
, const u32 b
)
54 return as_ulong ((uint2
) (b
, a
));
57 static inline u32x
l32_from_64 (u64x a
)
96 static inline u32x
h32_from_64 (u64x a
)
137 static inline u64x
hl32_to_64 (const u32x a
, const u32x b
)
142 r
= as_ulong ((uint2
) (b
, a
));
146 r
.s0
= as_ulong ((uint2
) (b
.s0
, a
.s0
));
147 r
.s1
= as_ulong ((uint2
) (b
.s1
, a
.s1
));
151 r
.s2
= as_ulong ((uint2
) (b
.s2
, a
.s2
));
152 r
.s3
= as_ulong ((uint2
) (b
.s3
, a
.s3
));
156 r
.s4
= as_ulong ((uint2
) (b
.s4
, a
.s4
));
157 r
.s5
= as_ulong ((uint2
) (b
.s5
, a
.s5
));
158 r
.s6
= as_ulong ((uint2
) (b
.s6
, a
.s6
));
159 r
.s7
= as_ulong ((uint2
) (b
.s7
, a
.s7
));
163 r
.s8
= as_ulong ((uint2
) (b
.s8
, a
.s8
));
164 r
.s9
= as_ulong ((uint2
) (b
.s9
, a
.s9
));
165 r
.sa
= as_ulong ((uint2
) (b
.sa
, a
.sa
));
166 r
.sb
= as_ulong ((uint2
) (b
.sb
, a
.sb
));
167 r
.sc
= as_ulong ((uint2
) (b
.sc
, a
.sc
));
168 r
.sd
= as_ulong ((uint2
) (b
.sd
, a
.sd
));
169 r
.se
= as_ulong ((uint2
) (b
.se
, a
.se
));
170 r
.sf
= as_ulong ((uint2
) (b
.sf
, a
.sf
));
177 static inline u32
swap32_S (const u32 v
)
179 return (as_uint (as_uchar4 (v
).s3210
));
182 static inline u64
swap64_S (const u64 v
)
184 return (as_ulong (as_uchar8 (v
).s76543210
));
187 static inline u32
rotr32_S (const u32 a
, const u32 n
)
189 return rotate (a
, 32 - n
);
192 static inline u32
rotl32_S (const u32 a
, const u32 n
)
194 return rotate (a
, n
);
197 static inline u64
rotr64_S (const u64 a
, const u32 n
)
199 #if (DEVICE_TYPE == DEVICE_TYPE_GPU)
203 const u32 a0
= h32_from_64_S (a
);
204 const u32 a1
= l32_from_64_S (a
);
206 const u32 t0
= (n
>= 32) ? amd_bitalign (a0
, a1
, n
- 32) : amd_bitalign (a1
, a0
, n
);
207 const u32 t1
= (n
>= 32) ? amd_bitalign (a1
, a0
, n
- 32) : amd_bitalign (a0
, a1
, n
);
209 const u64 r
= hl32_to_64_S (t0
, t1
);
213 const u64 r
= rotate (a
, (u64
) 64 - n
);
219 const u64 r
= rotate (a
, (u64
) 64 - n
);
226 static inline u64
rotl64_S (const u64 a
, const u32 n
)
228 return rotr64_S (a
, 64 - n
);
231 static inline u32x
swap32 (const u32x v
)
233 return ((v
>> 24) & 0x000000ff)
234 | ((v
>> 8) & 0x0000ff00)
235 | ((v
<< 8) & 0x00ff0000)
236 | ((v
<< 24) & 0xff000000);
239 static inline u64x
swap64 (const u64x v
)
241 return ((v
>> 56) & 0x00000000000000ff)
242 | ((v
>> 40) & 0x000000000000ff00)
243 | ((v
>> 24) & 0x0000000000ff0000)
244 | ((v
>> 8) & 0x00000000ff000000)
245 | ((v
<< 8) & 0x000000ff00000000)
246 | ((v
<< 24) & 0x0000ff0000000000)
247 | ((v
<< 40) & 0x00ff000000000000)
248 | ((v
<< 56) & 0xff00000000000000);
251 static inline u32x
rotr32 (const u32x a
, const u32 n
)
253 return rotate (a
, 32 - n
);
256 static inline u32x
rotl32 (const u32x a
, const u32 n
)
258 return rotate (a
, n
);
261 static inline u64x
rotr64 (const u64x a
, const u32 n
)
263 #if (DEVICE_TYPE == DEVICE_TYPE_GPU)
266 const u32x a0
= h32_from_64 (a
);
267 const u32x a1
= l32_from_64 (a
);
269 const u32x t0
= (n
>= 32) ? amd_bitalign (a0
, a1
, n
- 32) : amd_bitalign (a1
, a0
, n
);
270 const u32x t1
= (n
>= 32) ? amd_bitalign (a1
, a0
, n
- 32) : amd_bitalign (a0
, a1
, n
);
272 const u64x r
= hl32_to_64 (t0
, t1
);
276 const u64x r
= rotate (a
, (u64
) 64 - n
);
282 const u64x r
= rotate (a
, (u64
) 64 - n
);
289 static inline u64x
rotl64 (const u64x a
, const u32 n
)
291 return rotr64 (a
, 64 - n
);
294 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
297 return amd_bfe (a
, b
, c
);
299 #define BIT(x) (1 << (x))
300 #define BIT_MASK(x) (BIT (x) - 1)
301 #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
303 return BFE (a
, b
, c
);
307 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
310 return amd_bytealign (a
, b
, c
);
312 const u64 tmp
= ((((u64
) a
) << 32) | ((u64
) b
)) >> ((c
& 3) * 8);
320 static inline u32
swap32_S (const u32 v
)
324 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r
) : "r"(v
));
329 static inline u64
swap64_S (const u64 v
)
334 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(v
));
339 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl
) : "r"(il
));
340 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr
) : "r"(ir
));
344 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tr
), "r"(tl
));
349 static inline u32
rotr32_S (const u32 a
, const u32 n
)
351 return rotate (a
, 32 - n
);
354 static inline u32
rotl32_S (const u32 a
, const u32 n
)
356 return rotate (a
, n
);
360 static inline u64
rotr64_S (const u64 a
, const u32 n
)
365 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
372 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
373 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
377 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
378 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
383 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
388 static inline u64
rotr64_S (const u64 a
, const u32 n
)
390 return rotate (a
, (u64
) 64 - n
);
394 static inline u64
rotl64_S (const u64 a
, const u32 n
)
396 return rotr64_S (a
, 64 - n
);
400 static inline u32
lut3_2d_S (const u32 a
, const u32 b
, const u32 c
)
404 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
409 static inline u32
lut3_39_S (const u32 a
, const u32 b
, const u32 c
)
413 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
418 static inline u32
lut3_59_S (const u32 a
, const u32 b
, const u32 c
)
422 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
427 static inline u32
lut3_96_S (const u32 a
, const u32 b
, const u32 c
)
431 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
436 static inline u32
lut3_e4_S (const u32 a
, const u32 b
, const u32 c
)
440 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
445 static inline u32
lut3_e8_S (const u32 a
, const u32 b
, const u32 c
)
449 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
454 static inline u32
lut3_ca_S (const u32 a
, const u32 b
, const u32 c
)
458 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
464 static inline u32
__byte_perm_S (const u32 a
, const u32 b
, const u32 c
)
468 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
473 static inline u32x
swap32 (const u32x v
)
475 return ((v
>> 24) & 0x000000ff)
476 | ((v
>> 8) & 0x0000ff00)
477 | ((v
<< 8) & 0x00ff0000)
478 | ((v
<< 24) & 0xff000000);
481 static inline u64x
swap64 (const u64x v
)
483 return ((v
>> 56) & 0x00000000000000ff)
484 | ((v
>> 40) & 0x000000000000ff00)
485 | ((v
>> 24) & 0x0000000000ff0000)
486 | ((v
>> 8) & 0x00000000ff000000)
487 | ((v
<< 8) & 0x000000ff00000000)
488 | ((v
<< 24) & 0x0000ff0000000000)
489 | ((v
<< 40) & 0x00ff000000000000)
490 | ((v
<< 56) & 0xff00000000000000);
493 static inline u32x
rotr32 (const u32x a
, const u32 n
)
495 return rotate (a
, 32 - n
);
498 static inline u32x
rotl32 (const u32x a
, const u32 n
)
500 return rotate (a
, n
);
504 static inline u64x
rotr64 (const u64x a
, const u32 n
)
515 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
519 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
520 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
524 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
525 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
528 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
535 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s0
));
539 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
540 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
544 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
545 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
548 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s0
) : "r"(tl
), "r"(tr
));
552 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s1
));
556 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
557 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
561 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
562 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
565 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s1
) : "r"(tl
), "r"(tr
));
573 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s2
));
577 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
578 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
582 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
583 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
586 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s2
) : "r"(tl
), "r"(tr
));
590 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s3
));
594 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
595 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
599 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
600 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
603 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s3
) : "r"(tl
), "r"(tr
));
611 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s4
));
615 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
616 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
620 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
621 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
624 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s4
) : "r"(tl
), "r"(tr
));
628 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s5
));
632 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
633 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
637 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
638 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
641 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s5
) : "r"(tl
), "r"(tr
));
645 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s6
));
649 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
650 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
654 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
655 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
658 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s6
) : "r"(tl
), "r"(tr
));
662 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s7
));
666 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
667 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
671 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
672 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
675 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s7
) : "r"(tl
), "r"(tr
));
683 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s8
));
687 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
688 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
692 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
693 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
696 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s8
) : "r"(tl
), "r"(tr
));
700 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s9
));
704 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
705 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
709 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
710 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
713 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s9
) : "r"(tl
), "r"(tr
));
717 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sa
));
721 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
722 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
726 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
727 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
730 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sa
) : "r"(tl
), "r"(tr
));
734 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sb
));
738 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
739 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
743 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
744 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
747 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sb
) : "r"(tl
), "r"(tr
));
751 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sc
));
755 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
756 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
760 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
761 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
764 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sc
) : "r"(tl
), "r"(tr
));
768 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sd
));
772 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
773 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
777 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
778 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
781 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sd
) : "r"(tl
), "r"(tr
));
785 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.se
));
789 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
790 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
794 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
795 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
798 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.se
) : "r"(tl
), "r"(tr
));
802 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sf
));
806 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
807 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
811 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
812 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
815 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sf
) : "r"(tl
), "r"(tr
));
823 static inline u64x
rotr64 (const u64x a
, const u32 n
)
825 return rotate (a
, (u64
) 64 - n
);
829 static inline u64x
rotl64 (const u64x a
, const u32 n
)
831 return rotr64 (a
, (u64
) 64 - n
);
834 static inline u32x
__byte_perm (const u32x a
, const u32x b
, const u32x c
)
839 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
) );
843 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s0
) : "r"(a
.s0
), "r"(b
.s0
), "r"(c
.s0
));
844 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s1
) : "r"(a
.s1
), "r"(b
.s1
), "r"(c
.s1
));
848 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s2
) : "r"(a
.s2
), "r"(b
.s2
), "r"(c
.s2
));
849 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s3
) : "r"(a
.s3
), "r"(b
.s3
), "r"(c
.s3
));
853 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s4
) : "r"(a
.s4
), "r"(b
.s4
), "r"(c
.s4
));
854 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s5
) : "r"(a
.s5
), "r"(b
.s5
), "r"(c
.s5
));
855 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s6
) : "r"(a
.s6
), "r"(b
.s6
), "r"(c
.s6
));
856 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s7
) : "r"(a
.s7
), "r"(b
.s7
), "r"(c
.s7
));
860 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s8
) : "r"(a
.s8
), "r"(b
.s8
), "r"(c
.s8
));
861 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s9
) : "r"(a
.s9
), "r"(b
.s9
), "r"(c
.s9
));
862 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sa
) : "r"(a
.sa
), "r"(b
.sa
), "r"(c
.sa
));
863 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sb
) : "r"(a
.sb
), "r"(b
.sb
), "r"(c
.sb
));
864 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sc
) : "r"(a
.sc
), "r"(b
.sc
), "r"(c
.sc
));
865 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sd
) : "r"(a
.sd
), "r"(b
.sd
), "r"(c
.sd
));
866 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.se
) : "r"(a
.se
), "r"(b
.se
), "r"(c
.se
));
867 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sf
) : "r"(a
.sf
), "r"(b
.sf
), "r"(c
.sf
));
873 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
877 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
883 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
887 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(b
), "r"(a
), "r"((c
& 3) * 8));
892 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
894 return __byte_perm_S (b
, a
, (0x76543210 >> ((c
& 3) * 4)) & 0xffff);
899 static inline u32x
lut3_2d (const u32x a
, const u32x b
, const u32x c
)
904 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
908 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
909 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
913 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
914 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
918 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
919 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
920 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
921 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
925 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
926 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
927 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
928 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
929 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
930 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
931 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
932 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
938 static inline u32x
lut3_39 (const u32x a
, const u32x b
, const u32x c
)
943 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
947 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
948 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
952 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
953 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
957 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
958 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
959 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
960 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
964 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
965 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
966 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
967 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
968 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
969 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
970 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
971 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
977 static inline u32x
lut3_59 (const u32x a
, const u32x b
, const u32x c
)
982 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
986 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
987 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
991 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
992 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
996 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
997 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
998 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
999 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1003 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1004 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1005 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1006 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1007 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1008 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1009 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1010 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1016 static inline u32x
lut3_96 (const u32x a
, const u32x b
, const u32x c
)
1021 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1025 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1026 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1030 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1031 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1035 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1036 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1037 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1038 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1042 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1043 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1044 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1045 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1046 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1047 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1048 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1049 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1055 static inline u32x
lut3_e4 (const u32x a
, const u32x b
, const u32x c
)
1060 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1064 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1065 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1069 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1070 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1074 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1075 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1076 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1077 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1081 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1082 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1083 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1084 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1085 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1086 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1087 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1088 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1094 static inline u32x
lut3_e8 (const u32x a
, const u32x b
, const u32x c
)
1099 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1103 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1104 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1108 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1109 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1113 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1114 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1115 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1116 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1120 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1121 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1122 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1123 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1124 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1125 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1126 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1127 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1133 static inline u32x
lut3_ca (const u32x a
, const u32x b
, const u32x c
)
1138 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1142 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1143 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1147 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1148 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1152 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1153 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1154 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1155 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1159 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1160 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1161 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1162 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1163 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1164 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1165 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1166 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1176 static inline u32
swap32_S (const u32 v
)
1178 return (as_uint (as_uchar4 (v
).s3210
));
1181 static inline u64
swap64_S (const u64 v
)
1183 return (as_ulong (as_uchar8 (v
).s76543210
));
1186 static inline u32
rotr32_S (const u32 a
, const u32 n
)
1188 return rotate (a
, 32 - n
);
1191 static inline u32
rotl32_S (const u32 a
, const u32 n
)
1193 return rotate (a
, n
);
1196 static inline u64
rotr64_S (const u64 a
, const u32 n
)
1198 return rotate (a
, (u64
) 64 - n
);
1201 static inline u64
rotl64_S (const u64 a
, const u32 n
)
1203 return rotate (a
, (u64
) n
);
1206 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
1208 const u64 tmp
= ((((u64
) a
) << 32) | ((u64
) b
)) >> ((c
& 3) * 8);
1213 static inline u32x
swap32 (const u32x v
)
1215 return ((v
>> 24) & 0x000000ff)
1216 | ((v
>> 8) & 0x0000ff00)
1217 | ((v
<< 8) & 0x00ff0000)
1218 | ((v
<< 24) & 0xff000000);
1221 static inline u64x
swap64 (const u64x v
)
1223 return ((v
>> 56) & 0x00000000000000ff)
1224 | ((v
>> 40) & 0x000000000000ff00)
1225 | ((v
>> 24) & 0x0000000000ff0000)
1226 | ((v
>> 8) & 0x00000000ff000000)
1227 | ((v
<< 8) & 0x000000ff00000000)
1228 | ((v
<< 24) & 0x0000ff0000000000)
1229 | ((v
<< 40) & 0x00ff000000000000)
1230 | ((v
<< 56) & 0xff00000000000000);
// Vector rotate-right of each 32-bit lane by the same scalar count n.
// rotate() rotates LEFT, so right rotation by n is left rotation by 32 - n
// (the scalar count is broadcast across the vector lanes).
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}
// Vector rotate-left of each 32-bit lane by the same scalar count n;
// direct wrapper over the OpenCL rotate() builtin.
static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
// Vector rotate-right of each 64-bit lane by the same scalar count n,
// expressed as a left rotation by 64 - n.  The (u64) cast makes the count
// arithmetic 64-bit to match the element type.
static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
// Vector rotate-left of each 64-bit lane by the same scalar count n.
// The (u64) cast widens the count to the 64-bit element type.
static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) n);
}
1253 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
1255 #define BIT(x) (1 << (x))
1256 #define BIT_MASK(x) (BIT (x) - 1)
1257 #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
1259 return BFE (a
, b
, c
);
1262 static inline u32x
amd_bytealign (const u32x a
, const u32x b
, const u32 c
)
1265 const u64x tmp
= ((((u64x
) (a
)) << 32) | ((u64x
) (b
))) >> ((c
& 3) * 8);
1267 return (u32x
) (tmp
);
1271 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
)) << 32) | ((u64x
) (b
.s0
, b
.s1
))) >> ((c
& 3) * 8);
1273 return (u32x
) (tmp
.s0
, tmp
.s1
);
1277 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
))) >> ((c
& 3) * 8);
1279 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
);
1283 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
, a
.s4
, a
.s5
, a
.s6
, a
.s7
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
, b
.s4
, b
.s5
, b
.s6
, b
.s7
))) >> ((c
& 3) * 8);
1285 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
, tmp
.s4
, tmp
.s5
, tmp
.s6
, tmp
.s7
);
1289 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
, a
.s4
, a
.s5
, a
.s6
, a
.s7
, a
.s8
, a
.s9
, a
.sa
, a
.sb
, a
.sc
, a
.sd
, a
.se
, a
.sf
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
, b
.s4
, b
.s5
, b
.s6
, b
.s7
, b
.s8
, b
.s9
, b
.sa
, b
.sb
, b
.sc
, b
.sd
, b
.se
, b
.sf
))) >> ((c
& 3) * 8);
1291 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
, tmp
.s4
, tmp
.s5
, tmp
.s6
, tmp
.s7
, tmp
.s8
, tmp
.s9
, tmp
.sa
, tmp
.sb
, tmp
.sc
, tmp
.sd
, tmp
.se
, tmp
.sf
);
1304 #elif defined _MD5H_
1306 #elif defined _SHA1_
1308 #elif defined _BCRYPT_
1310 #elif defined _SHA256_
1312 #elif defined _SHA384_
1314 #elif defined _SHA512_
1316 #elif defined _KECCAK_
1318 #elif defined _RIPEMD160_
1320 #elif defined _WHIRLPOOL_
1322 #elif defined _GOST_
1324 #elif defined _GOST2012_256_
1326 #elif defined _GOST2012_512_
1328 #elif defined _SAPB_
1330 #elif defined _SAPG_
1332 #elif defined _MYSQL323_
1334 #elif defined _LOTUS5_
1336 #elif defined _LOTUS6_
1338 #elif defined _SCRYPT_
1340 #elif defined _LOTUS8_
1342 #elif defined _OFFICE2007_
1344 #elif defined _OFFICE2010_
1346 #elif defined _OFFICE2013_
1348 #elif defined _OLDOFFICE01_
1350 #elif defined _OLDOFFICE34_
1352 #elif defined _SIPHASH_
1354 #elif defined _PBKDF2_MD5_
1356 #elif defined _PBKDF2_SHA1_
1358 #elif defined _PBKDF2_SHA256_
1360 #elif defined _PBKDF2_SHA512_
1362 #elif defined _PDF17L8_
1364 #elif defined _CRC32_
1366 #elif defined _SEVEN_ZIP_
1368 #elif defined _ANDROIDFDE_
1370 #elif defined _DCC2_
1374 #elif defined _MD5_SHA1_
1376 #elif defined _SHA1_MD5_
1378 #elif defined _NETNTLMV2_
1380 #elif defined _KRB5PA_
1382 #elif defined _CLOUDKEY_
1384 #elif defined _SCRYPT_
1386 #elif defined _PSAFE2_
1388 #elif defined _LOTUS8_
1390 #elif defined _RAR3_
1392 #elif defined _SHA256_SHA1_
1394 #elif defined _MS_DRSR_
1396 #elif defined _ANDROIDFDE_SAMSUNG_
1398 #elif defined _RAR5_
1400 #elif defined _KRB5TGS_
1402 #elif defined _AXCRYPT_
1404 #elif defined _KEEPASS_
1420 u32 truecrypt_mdlen
;
1471 u32 cry_master_buf
[64];
1473 u32 public_key_buf
[64];
1514 u32 userdomain_buf
[64];
1531 u32 account_info
[512];
1542 u32 keyfile_buf
[16];
1586 u32 encryptedVerifier
[4];
1587 u32 encryptedVerifierHash
[5];
1595 u32 encryptedVerifier
[4];
1596 u32 encryptedVerifierHash
[8];
1602 u32 encryptedVerifier
[4];
1603 u32 encryptedVerifierHash
[8];
1610 u32 encryptedVerifier
[4];
1611 u32 encryptedVerifierHash
[4];
1619 u32 encryptedVerifier
[4];
1620 u32 encryptedVerifierHash
[5];
1630 /* key-file handling */
1634 u32 final_random_seed
[8];
1635 u32 transf_random_seed
[8];
1637 u32 contents_hash
[8];
1639 /* specific to version 1 */
1641 u32 contents
[75000];
1643 /* specific to version 2 */
1644 u32 expected_bytes
[8];
1687 } sha256crypt_tmp_t
;
1691 u64 l_alt_result
[8];
1696 } sha512crypt_tmp_t
;
1712 } bitcoin_wallet_tmp_t
;
1810 } pbkdf2_sha1_tmp_t
;
1820 } pbkdf2_sha256_tmp_t
;
1830 } pbkdf2_sha512_tmp_t
;
2048 u32 alignment_placeholder_1
;
2049 u32 alignment_placeholder_2
;
2050 u32 alignment_placeholder_3
;