 * Authors.....: Jens Steube <jens.steube@gmail.com>
 *               magnum <john.magnum@hushmail.com>
// Device-class identifiers used to select per-architecture code paths below.
#define DEVICE_TYPE_CPU 2
#define DEVICE_TYPE_GPU 4

// Two-level token paste: VTYPE macro-expands its arguments first (so
// VECT_SIZE becomes its numeric value), then CONCAT glues them into an
// OpenCL vector type name such as "uint4".
#define CONCAT(a, b)       a##b
#define VTYPE(type, width) CONCAT(type, width)
30 typedef VTYPE(uchar
, VECT_SIZE
) u8x
;
31 typedef VTYPE(ushort
, VECT_SIZE
) u16x
;
32 typedef VTYPE(uint
, VECT_SIZE
) u32x
;
33 typedef VTYPE(ulong
, VECT_SIZE
) u64x
;
36 // this one needs to die
39 static inline u32
l32_from_64_S (u64 a
)
41 const u32 r
= (u32
) (a
);
46 static inline u32
h32_from_64_S (u64 a
)
50 const u32 r
= (u32
) (a
);
55 static inline u64
hl32_to_64_S (const u32 a
, const u32 b
)
57 return as_ulong ((uint2
) (b
, a
));
60 static inline u32x
l32_from_64 (u64x a
)
99 static inline u32x
h32_from_64 (u64x a
)
140 static inline u64x
hl32_to_64 (const u32x a
, const u32x b
)
145 r
= as_ulong ((uint2
) (b
, a
));
149 r
.s0
= as_ulong ((uint2
) (b
.s0
, a
.s0
));
150 r
.s1
= as_ulong ((uint2
) (b
.s1
, a
.s1
));
154 r
.s2
= as_ulong ((uint2
) (b
.s2
, a
.s2
));
155 r
.s3
= as_ulong ((uint2
) (b
.s3
, a
.s3
));
159 r
.s4
= as_ulong ((uint2
) (b
.s4
, a
.s4
));
160 r
.s5
= as_ulong ((uint2
) (b
.s5
, a
.s5
));
161 r
.s6
= as_ulong ((uint2
) (b
.s6
, a
.s6
));
162 r
.s7
= as_ulong ((uint2
) (b
.s7
, a
.s7
));
166 r
.s8
= as_ulong ((uint2
) (b
.s8
, a
.s8
));
167 r
.s9
= as_ulong ((uint2
) (b
.s9
, a
.s9
));
168 r
.sa
= as_ulong ((uint2
) (b
.sa
, a
.sa
));
169 r
.sb
= as_ulong ((uint2
) (b
.sb
, a
.sb
));
170 r
.sc
= as_ulong ((uint2
) (b
.sc
, a
.sc
));
171 r
.sd
= as_ulong ((uint2
) (b
.sd
, a
.sd
));
172 r
.se
= as_ulong ((uint2
) (b
.se
, a
.se
));
173 r
.sf
= as_ulong ((uint2
) (b
.sf
, a
.sf
));
180 static inline u32
swap32_S (const u32 v
)
182 return (as_uint (as_uchar4 (v
).s3210
));
185 static inline u64
swap64_S (const u64 v
)
187 return (as_ulong (as_uchar8 (v
).s76543210
));
190 static inline u32
rotr32_S (const u32 a
, const u32 n
)
192 return rotate (a
, 32 - n
);
195 static inline u32
rotl32_S (const u32 a
, const u32 n
)
197 return rotate (a
, n
);
200 static inline u64
rotr64_S (const u64 a
, const u32 n
)
202 #if (DEVICE_TYPE == DEVICE_TYPE_GPU)
206 const u32 a0
= h32_from_64_S (a
);
207 const u32 a1
= l32_from_64_S (a
);
209 const u32 t0
= (n
>= 32) ? amd_bitalign (a0
, a1
, n
- 32) : amd_bitalign (a1
, a0
, n
);
210 const u32 t1
= (n
>= 32) ? amd_bitalign (a1
, a0
, n
- 32) : amd_bitalign (a0
, a1
, n
);
212 const u64 r
= hl32_to_64_S (t0
, t1
);
216 const u64 r
= rotate (a
, (u64
) 64 - n
);
222 const u64 r
= rotate (a
, (u64
) 64 - n
);
229 static inline u64
rotl64_S (const u64 a
, const u32 n
)
231 return rotr64_S (a
, 64 - n
);
234 static inline u32x
swap32 (const u32x v
)
236 return ((v
>> 24) & 0x000000ff)
237 | ((v
>> 8) & 0x0000ff00)
238 | ((v
<< 8) & 0x00ff0000)
239 | ((v
<< 24) & 0xff000000);
242 static inline u64x
swap64 (const u64x v
)
244 return ((v
>> 56) & 0x00000000000000ff)
245 | ((v
>> 40) & 0x000000000000ff00)
246 | ((v
>> 24) & 0x0000000000ff0000)
247 | ((v
>> 8) & 0x00000000ff000000)
248 | ((v
<< 8) & 0x000000ff00000000)
249 | ((v
<< 24) & 0x0000ff0000000000)
250 | ((v
<< 40) & 0x00ff000000000000)
251 | ((v
<< 56) & 0xff00000000000000);
254 static inline u32x
rotr32 (const u32x a
, const u32 n
)
256 return rotate (a
, 32 - n
);
259 static inline u32x
rotl32 (const u32x a
, const u32 n
)
261 return rotate (a
, n
);
264 static inline u64x
rotr64 (const u64x a
, const u32 n
)
266 #if (DEVICE_TYPE == DEVICE_TYPE_GPU)
269 const u32x a0
= h32_from_64 (a
);
270 const u32x a1
= l32_from_64 (a
);
272 const u32x t0
= (n
>= 32) ? amd_bitalign (a0
, a1
, n
- 32) : amd_bitalign (a1
, a0
, n
);
273 const u32x t1
= (n
>= 32) ? amd_bitalign (a1
, a0
, n
- 32) : amd_bitalign (a0
, a1
, n
);
275 const u64x r
= hl32_to_64 (t0
, t1
);
279 const u64x r
= rotate (a
, (u64
) 64 - n
);
285 const u64x r
= rotate (a
, (u64
) 64 - n
);
292 static inline u64x
rotl64 (const u64x a
, const u32 n
)
294 return rotr64 (a
, 64 - n
);
297 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
300 return amd_bfe (a
, b
, c
);
302 #define BIT(x) (1 << (x))
303 #define BIT_MASK(x) (BIT (x) - 1)
304 #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
306 return BFE (a
, b
, c
);
310 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
313 return amd_bytealign (a
, b
, c
);
315 const u64 tmp
= ((((u64
) a
) << 32) | ((u64
) b
)) >> ((c
& 3) * 8);
323 static inline u32
swap32_S (const u32 v
)
327 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r
) : "r"(v
));
332 static inline u64
swap64_S (const u64 v
)
337 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(v
));
342 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl
) : "r"(il
));
343 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr
) : "r"(ir
));
347 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tr
), "r"(tl
));
352 static inline u32
rotr32_S (const u32 a
, const u32 n
)
354 return rotate (a
, 32 - n
);
357 static inline u32
rotl32_S (const u32 a
, const u32 n
)
359 return rotate (a
, n
);
363 static inline u64
rotr64_S (const u64 a
, const u32 n
)
368 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
375 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
376 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
380 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
381 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
386 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
391 static inline u64
rotr64_S (const u64 a
, const u32 n
)
393 return rotate (a
, (u64
) 64 - n
);
397 static inline u64
rotl64_S (const u64 a
, const u32 n
)
399 return rotr64_S (a
, 64 - n
);
403 static inline u32
lut3_2d_S (const u32 a
, const u32 b
, const u32 c
)
407 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
412 static inline u32
lut3_39_S (const u32 a
, const u32 b
, const u32 c
)
416 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
421 static inline u32
lut3_59_S (const u32 a
, const u32 b
, const u32 c
)
425 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
430 static inline u32
lut3_96_S (const u32 a
, const u32 b
, const u32 c
)
434 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
439 static inline u32
lut3_e4_S (const u32 a
, const u32 b
, const u32 c
)
443 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
448 static inline u32
lut3_e8_S (const u32 a
, const u32 b
, const u32 c
)
452 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
457 static inline u32
lut3_ca_S (const u32 a
, const u32 b
, const u32 c
)
461 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
467 static inline u32
__byte_perm_S (const u32 a
, const u32 b
, const u32 c
)
471 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
476 static inline u32x
swap32 (const u32x v
)
478 return ((v
>> 24) & 0x000000ff)
479 | ((v
>> 8) & 0x0000ff00)
480 | ((v
<< 8) & 0x00ff0000)
481 | ((v
<< 24) & 0xff000000);
484 static inline u64x
swap64 (const u64x v
)
486 return ((v
>> 56) & 0x00000000000000ff)
487 | ((v
>> 40) & 0x000000000000ff00)
488 | ((v
>> 24) & 0x0000000000ff0000)
489 | ((v
>> 8) & 0x00000000ff000000)
490 | ((v
<< 8) & 0x000000ff00000000)
491 | ((v
<< 24) & 0x0000ff0000000000)
492 | ((v
<< 40) & 0x00ff000000000000)
493 | ((v
<< 56) & 0xff00000000000000);
496 static inline u32x
rotr32 (const u32x a
, const u32 n
)
498 return rotate (a
, 32 - n
);
501 static inline u32x
rotl32 (const u32x a
, const u32 n
)
503 return rotate (a
, n
);
507 static inline u64x
rotr64 (const u64x a
, const u32 n
)
518 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
522 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
523 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
527 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
528 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
531 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
538 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s0
));
542 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
543 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
547 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
548 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
551 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s0
) : "r"(tl
), "r"(tr
));
555 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s1
));
559 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
560 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
564 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
565 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
568 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s1
) : "r"(tl
), "r"(tr
));
576 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s2
));
580 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
581 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
585 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
586 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
589 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s2
) : "r"(tl
), "r"(tr
));
593 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s3
));
597 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
598 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
602 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
603 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
606 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s3
) : "r"(tl
), "r"(tr
));
614 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s4
));
618 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
619 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
623 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
624 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
627 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s4
) : "r"(tl
), "r"(tr
));
631 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s5
));
635 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
636 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
640 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
641 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
644 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s5
) : "r"(tl
), "r"(tr
));
648 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s6
));
652 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
653 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
657 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
658 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
661 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s6
) : "r"(tl
), "r"(tr
));
665 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s7
));
669 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
670 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
674 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
675 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
678 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s7
) : "r"(tl
), "r"(tr
));
686 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s8
));
690 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
691 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
695 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
696 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
699 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s8
) : "r"(tl
), "r"(tr
));
703 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s9
));
707 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
708 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
712 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
713 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
716 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s9
) : "r"(tl
), "r"(tr
));
720 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sa
));
724 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
725 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
729 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
730 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
733 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sa
) : "r"(tl
), "r"(tr
));
737 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sb
));
741 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
742 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
746 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
747 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
750 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sb
) : "r"(tl
), "r"(tr
));
754 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sc
));
758 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
759 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
763 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
764 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
767 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sc
) : "r"(tl
), "r"(tr
));
771 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sd
));
775 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
776 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
780 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
781 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
784 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sd
) : "r"(tl
), "r"(tr
));
788 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.se
));
792 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
793 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
797 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
798 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
801 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.se
) : "r"(tl
), "r"(tr
));
805 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sf
));
809 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
810 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
814 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
815 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
818 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sf
) : "r"(tl
), "r"(tr
));
826 static inline u64x
rotr64 (const u64x a
, const u32 n
)
828 return rotate (a
, (u64
) 64 - n
);
832 static inline u64x
rotl64 (const u64x a
, const u32 n
)
834 return rotr64 (a
, (u64
) 64 - n
);
837 static inline u32x
__byte_perm (const u32x a
, const u32x b
, const u32x c
)
842 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
) );
846 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s0
) : "r"(a
.s0
), "r"(b
.s0
), "r"(c
.s0
));
847 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s1
) : "r"(a
.s1
), "r"(b
.s1
), "r"(c
.s1
));
851 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s2
) : "r"(a
.s2
), "r"(b
.s2
), "r"(c
.s2
));
852 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s3
) : "r"(a
.s3
), "r"(b
.s3
), "r"(c
.s3
));
856 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s4
) : "r"(a
.s4
), "r"(b
.s4
), "r"(c
.s4
));
857 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s5
) : "r"(a
.s5
), "r"(b
.s5
), "r"(c
.s5
));
858 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s6
) : "r"(a
.s6
), "r"(b
.s6
), "r"(c
.s6
));
859 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s7
) : "r"(a
.s7
), "r"(b
.s7
), "r"(c
.s7
));
863 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s8
) : "r"(a
.s8
), "r"(b
.s8
), "r"(c
.s8
));
864 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s9
) : "r"(a
.s9
), "r"(b
.s9
), "r"(c
.s9
));
865 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sa
) : "r"(a
.sa
), "r"(b
.sa
), "r"(c
.sa
));
866 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sb
) : "r"(a
.sb
), "r"(b
.sb
), "r"(c
.sb
));
867 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sc
) : "r"(a
.sc
), "r"(b
.sc
), "r"(c
.sc
));
868 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sd
) : "r"(a
.sd
), "r"(b
.sd
), "r"(c
.sd
));
869 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.se
) : "r"(a
.se
), "r"(b
.se
), "r"(c
.se
));
870 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sf
) : "r"(a
.sf
), "r"(b
.sf
), "r"(c
.sf
));
876 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
880 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
886 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
890 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(b
), "r"(a
), "r"((c
& 3) * 8));
895 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
897 return __byte_perm_S (b
, a
, (0x76543210 >> ((c
& 3) * 4)) & 0xffff);
902 static inline u32x
lut3_2d (const u32x a
, const u32x b
, const u32x c
)
907 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
911 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
912 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
916 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
917 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
921 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
922 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
923 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
924 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
928 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
929 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
930 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
931 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
932 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
933 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
934 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
935 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
941 static inline u32x
lut3_39 (const u32x a
, const u32x b
, const u32x c
)
946 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
950 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
951 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
955 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
956 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
960 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
961 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
962 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
963 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
967 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
968 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
969 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
970 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
971 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
972 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
973 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
974 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
980 static inline u32x
lut3_59 (const u32x a
, const u32x b
, const u32x c
)
985 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
989 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
990 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
994 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
995 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
999 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1000 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1001 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1002 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1006 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1007 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1008 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1009 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1010 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1011 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1012 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1013 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1019 static inline u32x
lut3_96 (const u32x a
, const u32x b
, const u32x c
)
1024 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1028 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1029 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1033 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1034 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1038 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1039 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1040 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1041 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1045 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1046 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1047 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1048 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1049 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1050 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1051 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1052 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1058 static inline u32x
lut3_e4 (const u32x a
, const u32x b
, const u32x c
)
1063 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1067 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1068 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1072 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1073 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1077 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1078 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1079 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1080 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1084 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1085 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1086 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1087 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1088 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1089 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1090 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1091 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1097 static inline u32x
lut3_e8 (const u32x a
, const u32x b
, const u32x c
)
1102 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1106 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1107 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1111 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1112 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1116 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1117 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1118 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1119 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1123 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1124 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1125 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1126 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1127 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1128 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1129 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1130 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1136 static inline u32x
lut3_ca (const u32x a
, const u32x b
, const u32x c
)
1141 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1145 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1146 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1150 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1151 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1155 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1156 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1157 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1158 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1162 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1163 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1164 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1165 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1166 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1167 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1168 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1169 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1179 static inline u32
swap32_S (const u32 v
)
1181 return (as_uint (as_uchar4 (v
).s3210
));
1184 static inline u64
swap64_S (const u64 v
)
1186 return (as_ulong (as_uchar8 (v
).s76543210
));
1189 static inline u32
rotr32_S (const u32 a
, const u32 n
)
1191 return rotate (a
, 32 - n
);
1194 static inline u32
rotl32_S (const u32 a
, const u32 n
)
1196 return rotate (a
, n
);
1199 static inline u64
rotr64_S (const u64 a
, const u32 n
)
1201 return rotate (a
, (u64
) 64 - n
);
1204 static inline u64
rotl64_S (const u64 a
, const u32 n
)
1206 return rotate (a
, (u64
) n
);
// Scalar emulation of AMD's amd_bytealign media op: form the 64-bit value
// (a:b) with `a` in the high 32 bits, shift it right by (c & 3) bytes and
// return the low 32 bits of the result.
//
// Fix: the visible body computed `tmp` but fell off the end of a non-void
// function (missing return) — restore the return of the truncated result.
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

  return (u32) (tmp);
}
// Byte-reverse each 32-bit lane of the vector `v` (endianness swap).
// Written mask-first instead of shift-first; the result is identical
// since all operands are unsigned.
static inline u32x swap32 (const u32x v)
{
  return ((v & 0xff000000) >> 24)
       | ((v & 0x00ff0000) >>  8)
       | ((v & 0x0000ff00) <<  8)
       | ((v & 0x000000ff) << 24);
}
// Byte-reverse each 64-bit lane of the vector `v` (endianness swap).
// Written mask-first instead of shift-first; the result is identical
// since all operands are unsigned.
static inline u64x swap64 (const u64x v)
{
  return ((v & 0xff00000000000000) >> 56)
       | ((v & 0x00ff000000000000) >> 40)
       | ((v & 0x0000ff0000000000) >> 24)
       | ((v & 0x000000ff00000000) >>  8)
       | ((v & 0x00000000ff000000) <<  8)
       | ((v & 0x0000000000ff0000) << 24)
       | ((v & 0x000000000000ff00) << 40)
       | ((v & 0x00000000000000ff) << 56);
}
// Rotate each 32-bit lane of the vector `a` right by `n` bits, expressed
// as a left rotate by (32 - n). rotate() reduces the amount modulo the
// bit width, so n == 0 is handled correctly.
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}
// Rotate each 32-bit lane of the vector `a` left by `n` bits via the
// OpenCL rotate() builtin.
static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
// Rotate each 64-bit lane of the vector `a` right by `n` bits, expressed
// as a left rotate by (64 - n). The (u64) cast binds to the literal 64,
// keeping the rotate amount 64 bits wide.
static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
// Rotate each 64-bit lane of the vector `a` left by `n` bits. The rotate
// amount is widened to u64 to match the operand width.
static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) n);
}
// Software bit-field extract: returns the c-bit wide field of `a`
// starting at bit offset `b` (fallback for targets without a hardware
// bfe instruction).
//
// Fix: BIT(x) left-shifted a *signed* 1, which is undefined behavior for
// x == 31; shifting an unsigned 1u is well defined for all x < 32.
// NOTE(review): BIT_MASK(z) still assumes z < 32 — a full 32-bit extract
// (z == 32) would shift by the type width; confirm callers never pass it.
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  #define BIT(x)      (1u << (x))
  #define BIT_MASK(x) (BIT (x) - 1)
  #define BFE(x,y,z)  (((x) >> (y)) & BIT_MASK (z))

  return BFE (a, b, c);
}
// Vector emulation of AMD's amd_bytealign media op: per lane, form the
// 64-bit value (a:b) with `a` in the high 32 bits, shift right by
// (c & 3) bytes and return the low 32 bits.
//
// Fix: the per-width bodies were present back to back with no
// conditional-compilation guards, yielding five conflicting definitions
// of `tmp` and five unconditional returns — restore the
// `#if VECT_SIZE == N` guards so exactly one body is compiled.
static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
{
  #if VECT_SIZE == 1

  const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);

  return (u32x) (tmp);

  #endif

  #if VECT_SIZE == 2

  const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32)
                  | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1);

  #endif

  #if VECT_SIZE == 4

  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32)
                  | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);

  #endif

  #if VECT_SIZE == 8

  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32)
                  | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);

  #endif

  #if VECT_SIZE == 16

  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7,
                              a.s8, a.s9, a.sa, a.sb, a.sc, a.sd, a.se, a.sf)) << 32)
                  | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7,
                              b.s8, b.s9, b.sa, b.sb, b.sc, b.sd, b.se, b.sf))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7,
                 tmp.s8, tmp.s9, tmp.sa, tmp.sb, tmp.sc, tmp.sd, tmp.se, tmp.sf);

  #endif
}
1307 #elif defined _MD5H_
1309 #elif defined _SHA1_
1311 #elif defined _BCRYPT_
1313 #elif defined _SHA256_
1315 #elif defined _SHA384_
1317 #elif defined _SHA512_
1319 #elif defined _KECCAK_
1321 #elif defined _RIPEMD160_
1323 #elif defined _WHIRLPOOL_
1325 #elif defined _GOST_
1327 #elif defined _GOST2012_256_
1329 #elif defined _GOST2012_512_
1331 #elif defined _SAPB_
1333 #elif defined _SAPG_
1335 #elif defined _MYSQL323_
1337 #elif defined _LOTUS5_
1339 #elif defined _LOTUS6_
1341 #elif defined _SCRYPT_
1343 #elif defined _LOTUS8_
1345 #elif defined _OFFICE2007_
1347 #elif defined _OFFICE2010_
1349 #elif defined _OFFICE2013_
1351 #elif defined _OLDOFFICE01_
1353 #elif defined _OLDOFFICE34_
1355 #elif defined _SIPHASH_
1357 #elif defined _PBKDF2_MD5_
1359 #elif defined _PBKDF2_SHA1_
1361 #elif defined _PBKDF2_SHA256_
1363 #elif defined _PBKDF2_SHA512_
1365 #elif defined _PDF17L8_
1367 #elif defined _CRC32_
1369 #elif defined _SEVEN_ZIP_
1371 #elif defined _ANDROIDFDE_
1373 #elif defined _DCC2_
1377 #elif defined _MD5_SHA1_
1379 #elif defined _SHA1_MD5_
1381 #elif defined _NETNTLMV2_
1383 #elif defined _KRB5PA_
1385 #elif defined _CLOUDKEY_
1387 #elif defined _SCRYPT_
1389 #elif defined _PSAFE2_
1391 #elif defined _LOTUS8_
1393 #elif defined _RAR3_
1395 #elif defined _SHA256_SHA1_
1397 #elif defined _MS_DRSR_
1399 #elif defined _ANDROIDFDE_SAMSUNG_
1401 #elif defined _RAR5_
1403 #elif defined _KRB5TGS_
1405 #elif defined _AXCRYPT_
1407 #elif defined _KEEPASS_
1423 u32 truecrypt_mdlen
;
1474 u32 cry_master_buf
[64];
1476 u32 public_key_buf
[64];
1517 u32 userdomain_buf
[64];
1534 u32 account_info
[512];
1545 u32 keyfile_buf
[16];
1589 u32 encryptedVerifier
[4];
1590 u32 encryptedVerifierHash
[5];
1598 u32 encryptedVerifier
[4];
1599 u32 encryptedVerifierHash
[8];
1605 u32 encryptedVerifier
[4];
1606 u32 encryptedVerifierHash
[8];
1613 u32 encryptedVerifier
[4];
1614 u32 encryptedVerifierHash
[4];
1622 u32 encryptedVerifier
[4];
1623 u32 encryptedVerifierHash
[5];
1633 /* key-file handling */
1637 u32 final_random_seed
[8];
1638 u32 transf_random_seed
[8];
1640 u32 contents_hash
[8];
1642 /* specific to version 1 */
1644 u32 contents
[75000];
1646 /* specific to version 2 */
1647 u32 expected_bytes
[8];
1690 } sha256crypt_tmp_t
;
1694 u64 l_alt_result
[8];
1699 } sha512crypt_tmp_t
;
1715 } bitcoin_wallet_tmp_t
;
1813 } pbkdf2_sha1_tmp_t
;
1823 } pbkdf2_sha256_tmp_t
;
1833 } pbkdf2_sha512_tmp_t
;
2051 u32 alignment_placeholder_1
;
2052 u32 alignment_placeholder_2
;
2053 u32 alignment_placeholder_3
;