 * Authors.....: Jens Steube <jens.steube@gmail.com>
 *               magnum <john.magnum@hushmail.com>
// Device-class selector values compared against DEVICE_TYPE below:
// CPU devices take the native-rotate paths, GPU devices the bit-trick paths.
8 #define DEVICE_TYPE_CPU 2
9 #define DEVICE_TYPE_GPU 4
// Token-pasting helpers: VTYPE(uint, 4) expands to the OpenCL vector type
// "uint4".  The extra CONCAT level forces VECT_SIZE to be macro-expanded
// before pasting.
21 #define CONCAT(a, b) a##b
22 #define VTYPE(type, width) CONCAT(type, width)
// Vectorized unsigned integer types, VECT_SIZE lanes wide.
// VECT_SIZE is presumably injected at kernel build time — TODO confirm.
30 typedef VTYPE(uchar
, VECT_SIZE
) u8x
;
31 typedef VTYPE(ushort
, VECT_SIZE
) u16x
;
32 typedef VTYPE(uint
, VECT_SIZE
) u32x
;
33 typedef VTYPE(ulong
, VECT_SIZE
) u64x
;
36 // this one needs to die
39 static inline u32
l32_from_64_S (u64 a
)
41 const u32 r
= (u32
) (a
);
46 static inline u32
h32_from_64_S (u64 a
)
50 const u32 r
= (u32
) (a
);
55 static inline u64
hl32_to_64_S (const u32 a
, const u32 b
)
57 return as_ulong ((uint2
) (b
, a
));
60 static inline u32x
l32_from_64 (u64x a
)
99 static inline u32x
h32_from_64 (u64x a
)
140 static inline u64x
hl32_to_64 (const u32x a
, const u32x b
)
145 r
= as_ulong ((uint2
) (b
, a
));
149 r
.s0
= as_ulong ((uint2
) (b
.s0
, a
.s0
));
150 r
.s1
= as_ulong ((uint2
) (b
.s1
, a
.s1
));
154 r
.s2
= as_ulong ((uint2
) (b
.s2
, a
.s2
));
155 r
.s3
= as_ulong ((uint2
) (b
.s3
, a
.s3
));
159 r
.s4
= as_ulong ((uint2
) (b
.s4
, a
.s4
));
160 r
.s5
= as_ulong ((uint2
) (b
.s5
, a
.s5
));
161 r
.s6
= as_ulong ((uint2
) (b
.s6
, a
.s6
));
162 r
.s7
= as_ulong ((uint2
) (b
.s7
, a
.s7
));
166 r
.s8
= as_ulong ((uint2
) (b
.s8
, a
.s8
));
167 r
.s9
= as_ulong ((uint2
) (b
.s9
, a
.s9
));
168 r
.sa
= as_ulong ((uint2
) (b
.sa
, a
.sa
));
169 r
.sb
= as_ulong ((uint2
) (b
.sb
, a
.sb
));
170 r
.sc
= as_ulong ((uint2
) (b
.sc
, a
.sc
));
171 r
.sd
= as_ulong ((uint2
) (b
.sd
, a
.sd
));
172 r
.se
= as_ulong ((uint2
) (b
.se
, a
.se
));
173 r
.sf
= as_ulong ((uint2
) (b
.sf
, a
.sf
));
180 static inline u32
swap32_S (const u32 v
)
182 return (as_uint (as_uchar4 (v
).s3210
));
185 static inline u64
swap64_S (const u64 v
)
187 return (as_ulong (as_uchar8 (v
).s76543210
));
190 static inline u32
rotr32_S (const u32 a
, const u32 n
)
192 return rotate (a
, 32 - n
);
195 static inline u32
rotl32_S (const u32 a
, const u32 n
)
197 return rotate (a
, n
);
200 static inline u64
rotr64_S (const u64 a
, const u32 n
)
202 #if DEVICE_TYPE == DEVICE_TYPE_CPU
204 const u64 r
= rotate (a
, (u64
) 64 - n
);
208 const u32 a0
= h32_from_64_S (a
);
209 const u32 a1
= l32_from_64_S (a
);
211 const u32 t0
= (n
>= 32) ? amd_bitalign (a0
, a1
, n
- 32) : amd_bitalign (a1
, a0
, n
);
212 const u32 t1
= (n
>= 32) ? amd_bitalign (a1
, a0
, n
- 32) : amd_bitalign (a0
, a1
, n
);
214 const u64 r
= hl32_to_64_S (t0
, t1
);
221 static inline u64
rotl64_S (const u64 a
, const u32 n
)
223 return rotr64_S (a
, 64 - n
);
226 static inline u32x
swap32 (const u32x v
)
228 return ((v
>> 24) & 0x000000ff)
229 | ((v
>> 8) & 0x0000ff00)
230 | ((v
<< 8) & 0x00ff0000)
231 | ((v
<< 24) & 0xff000000);
234 static inline u64x
swap64 (const u64x v
)
236 return ((v
>> 56) & 0x00000000000000ff)
237 | ((v
>> 40) & 0x000000000000ff00)
238 | ((v
>> 24) & 0x0000000000ff0000)
239 | ((v
>> 8) & 0x00000000ff000000)
240 | ((v
<< 8) & 0x000000ff00000000)
241 | ((v
<< 24) & 0x0000ff0000000000)
242 | ((v
<< 40) & 0x00ff000000000000)
243 | ((v
<< 56) & 0xff00000000000000);
246 static inline u32x
rotr32 (const u32x a
, const u32 n
)
248 return rotate (a
, 32 - n
);
251 static inline u32x
rotl32 (const u32x a
, const u32 n
)
253 return rotate (a
, n
);
256 static inline u64x
rotr64 (const u64x a
, const u32 n
)
258 #if DEVICE_TYPE == DEVICE_TYPE_CPU
260 const u64x r
= rotate (a
, (u64
) 64 - n
);
264 const u32x a0
= h32_from_64 (a
);
265 const u32x a1
= l32_from_64 (a
);
267 const u32x t0
= (n
>= 32) ? amd_bitalign (a0
, a1
, n
- 32) : amd_bitalign (a1
, a0
, n
);
268 const u32x t1
= (n
>= 32) ? amd_bitalign (a1
, a0
, n
- 32) : amd_bitalign (a0
, a1
, n
);
270 const u64x r
= hl32_to_64 (t0
, t1
);
277 static inline u64x
rotl64 (const u64x a
, const u32 n
)
279 return rotr64 (a
, 64 - n
);
282 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
284 return amd_bfe (a
, b
, c
);
287 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
289 return amd_bytealign (a
, b
, c
);
294 static inline u32
swap32_S (const u32 v
)
298 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r
) : "r"(v
));
303 static inline u64
swap64_S (const u64 v
)
308 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(v
));
313 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl
) : "r"(il
));
314 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr
) : "r"(ir
));
318 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tr
), "r"(tl
));
323 static inline u32
rotr32_S (const u32 a
, const u32 n
)
325 return rotate (a
, 32 - n
);
328 static inline u32
rotl32_S (const u32 a
, const u32 n
)
330 return rotate (a
, n
);
334 static inline u64
rotr64_S (const u64 a
, const u32 n
)
339 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
346 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
347 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
351 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
352 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
357 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
362 static inline u64
rotr64_S (const u64 a
, const u32 n
)
364 return rotate (a
, (u64
) 64 - n
);
368 static inline u64
rotl64_S (const u64 a
, const u32 n
)
370 return rotr64_S (a
, 64 - n
);
374 static inline u32
lut3_2d_S (const u32 a
, const u32 b
, const u32 c
)
378 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
383 static inline u32
lut3_39_S (const u32 a
, const u32 b
, const u32 c
)
387 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
392 static inline u32
lut3_59_S (const u32 a
, const u32 b
, const u32 c
)
396 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
401 static inline u32
lut3_96_S (const u32 a
, const u32 b
, const u32 c
)
405 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
410 static inline u32
lut3_e4_S (const u32 a
, const u32 b
, const u32 c
)
414 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
419 static inline u32
lut3_e8_S (const u32 a
, const u32 b
, const u32 c
)
423 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
428 static inline u32
lut3_ca_S (const u32 a
, const u32 b
, const u32 c
)
432 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
438 static inline u32
__byte_perm_S (const u32 a
, const u32 b
, const u32 c
)
442 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
447 static inline u32x
swap32 (const u32x v
)
449 return ((v
>> 24) & 0x000000ff)
450 | ((v
>> 8) & 0x0000ff00)
451 | ((v
<< 8) & 0x00ff0000)
452 | ((v
<< 24) & 0xff000000);
455 static inline u64x
swap64 (const u64x v
)
457 return ((v
>> 56) & 0x00000000000000ff)
458 | ((v
>> 40) & 0x000000000000ff00)
459 | ((v
>> 24) & 0x0000000000ff0000)
460 | ((v
>> 8) & 0x00000000ff000000)
461 | ((v
<< 8) & 0x000000ff00000000)
462 | ((v
<< 24) & 0x0000ff0000000000)
463 | ((v
<< 40) & 0x00ff000000000000)
464 | ((v
<< 56) & 0xff00000000000000);
467 static inline u32x
rotr32 (const u32x a
, const u32 n
)
469 return rotate (a
, 32 - n
);
472 static inline u32x
rotl32 (const u32x a
, const u32 n
)
474 return rotate (a
, n
);
478 static inline u64x
rotr64 (const u64x a
, const u32 n
)
489 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
493 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
494 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
498 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
499 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
502 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
509 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s0
));
513 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
514 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
518 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
519 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
522 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s0
) : "r"(tl
), "r"(tr
));
526 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s1
));
530 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
531 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
535 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
536 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
539 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s1
) : "r"(tl
), "r"(tr
));
547 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s2
));
551 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
552 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
556 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
557 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
560 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s2
) : "r"(tl
), "r"(tr
));
564 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s3
));
568 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
569 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
573 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
574 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
577 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s3
) : "r"(tl
), "r"(tr
));
585 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s4
));
589 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
590 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
594 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
595 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
598 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s4
) : "r"(tl
), "r"(tr
));
602 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s5
));
606 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
607 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
611 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
612 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
615 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s5
) : "r"(tl
), "r"(tr
));
619 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s6
));
623 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
624 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
628 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
629 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
632 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s6
) : "r"(tl
), "r"(tr
));
636 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s7
));
640 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
641 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
645 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
646 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
649 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s7
) : "r"(tl
), "r"(tr
));
657 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s8
));
661 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
662 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
666 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
667 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
670 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s8
) : "r"(tl
), "r"(tr
));
674 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s9
));
678 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
679 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
683 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
684 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
687 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s9
) : "r"(tl
), "r"(tr
));
691 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sa
));
695 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
696 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
700 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
701 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
704 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sa
) : "r"(tl
), "r"(tr
));
708 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sb
));
712 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
713 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
717 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
718 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
721 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sb
) : "r"(tl
), "r"(tr
));
725 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sc
));
729 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
730 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
734 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
735 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
738 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sc
) : "r"(tl
), "r"(tr
));
742 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sd
));
746 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
747 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
751 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
752 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
755 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sd
) : "r"(tl
), "r"(tr
));
759 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.se
));
763 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
764 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
768 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
769 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
772 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.se
) : "r"(tl
), "r"(tr
));
776 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sf
));
780 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
781 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
785 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
786 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
789 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sf
) : "r"(tl
), "r"(tr
));
797 static inline u64x
rotr64 (const u64x a
, const u32 n
)
799 return rotate (a
, (u64
) 64 - n
);
803 static inline u64x
rotl64 (const u64x a
, const u32 n
)
805 return rotr64 (a
, (u64
) 64 - n
);
808 static inline u32x
__byte_perm (const u32x a
, const u32x b
, const u32x c
)
813 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
) );
817 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s0
) : "r"(a
.s0
), "r"(b
.s0
), "r"(c
.s0
));
818 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s1
) : "r"(a
.s1
), "r"(b
.s1
), "r"(c
.s1
));
822 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s2
) : "r"(a
.s2
), "r"(b
.s2
), "r"(c
.s2
));
823 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s3
) : "r"(a
.s3
), "r"(b
.s3
), "r"(c
.s3
));
827 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s4
) : "r"(a
.s4
), "r"(b
.s4
), "r"(c
.s4
));
828 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s5
) : "r"(a
.s5
), "r"(b
.s5
), "r"(c
.s5
));
829 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s6
) : "r"(a
.s6
), "r"(b
.s6
), "r"(c
.s6
));
830 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s7
) : "r"(a
.s7
), "r"(b
.s7
), "r"(c
.s7
));
834 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s8
) : "r"(a
.s8
), "r"(b
.s8
), "r"(c
.s8
));
835 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s9
) : "r"(a
.s9
), "r"(b
.s9
), "r"(c
.s9
));
836 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sa
) : "r"(a
.sa
), "r"(b
.sa
), "r"(c
.sa
));
837 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sb
) : "r"(a
.sb
), "r"(b
.sb
), "r"(c
.sb
));
838 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sc
) : "r"(a
.sc
), "r"(b
.sc
), "r"(c
.sc
));
839 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sd
) : "r"(a
.sd
), "r"(b
.sd
), "r"(c
.sd
));
840 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.se
) : "r"(a
.se
), "r"(b
.se
), "r"(c
.se
));
841 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sf
) : "r"(a
.sf
), "r"(b
.sf
), "r"(c
.sf
));
847 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
851 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
));
857 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
861 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(b
), "r"(a
), "r"((c
& 3) * 8));
866 static inline u32
amd_bytealign (const u32 a
, const u32 b
, const u32 c
)
868 return __byte_perm_S (b
, a
, (0x76543210 >> ((c
& 3) * 4)) & 0xffff);
873 static inline u32x
lut3_2d (const u32x a
, const u32x b
, const u32x c
)
878 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
882 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
883 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
887 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
888 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
892 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
893 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
894 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
895 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
899 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
900 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
901 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
902 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
903 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
904 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
905 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
906 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
912 static inline u32x
lut3_39 (const u32x a
, const u32x b
, const u32x c
)
917 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
921 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
922 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
926 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
927 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
931 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
932 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
933 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
934 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
938 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
939 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
940 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
941 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
942 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
943 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
944 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
945 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
951 static inline u32x
lut3_59 (const u32x a
, const u32x b
, const u32x c
)
956 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
960 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
961 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
965 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
966 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
970 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
971 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
972 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
973 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
977 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
978 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
979 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
980 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
981 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
982 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
983 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
984 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
990 static inline u32x
lut3_96 (const u32x a
, const u32x b
, const u32x c
)
995 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
999 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1000 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1004 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1005 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1009 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1010 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1011 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1012 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1016 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1017 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1018 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1019 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1020 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1021 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1022 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1023 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1029 static inline u32x
lut3_e4 (const u32x a
, const u32x b
, const u32x c
)
1034 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1038 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1039 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1043 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1044 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1048 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1049 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1050 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1051 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1055 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1056 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1057 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1058 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1059 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1060 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1061 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1062 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1068 static inline u32x
lut3_e8 (const u32x a
, const u32x b
, const u32x c
)
1073 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1077 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1078 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1082 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1083 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1087 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1088 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1089 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1090 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1094 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1095 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1096 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1097 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1098 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1099 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1100 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1101 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1107 static inline u32x
lut3_ca (const u32x a
, const u32x b
, const u32x c
)
1112 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1116 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1117 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1121 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1122 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1126 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1127 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1128 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1129 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1133 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1134 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1135 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1136 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1137 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1138 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1139 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1140 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1150 static inline u32
swap32_S (const u32 v
)
1152 return (as_uint (as_uchar4 (v
).s3210
));
1155 static inline u64
swap64_S (const u64 v
)
1157 return (as_ulong (as_uchar8 (v
).s76543210
));
1160 static inline u32
rotr32_S (const u32 a
, const u32 n
)
1162 return rotate (a
, 32 - n
);
1165 static inline u32
rotl32_S (const u32 a
, const u32 n
)
1167 return rotate (a
, n
);
1170 static inline u64
rotr64_S (const u64 a
, const u32 n
)
1172 return rotate (a
, (u64
) 64 - n
);
1175 static inline u64
rotl64_S (const u64 a
, const u32 n
)
1177 return rotate (a
, (u64
) n
);
1180 static inline u32
amd_bytealign_S (const u32 a
, const u32 b
, const u32 c
)
1182 const u64 tmp
= ((((u64
) a
) << 32) | ((u64
) b
)) >> ((c
& 3) * 8);
1187 static inline u32x
swap32 (const u32x v
)
1189 return ((v
>> 24) & 0x000000ff)
1190 | ((v
>> 8) & 0x0000ff00)
1191 | ((v
<< 8) & 0x00ff0000)
1192 | ((v
<< 24) & 0xff000000);
1195 static inline u64x
swap64 (const u64x v
)
1197 return ((v
>> 56) & 0x00000000000000ff)
1198 | ((v
>> 40) & 0x000000000000ff00)
1199 | ((v
>> 24) & 0x0000000000ff0000)
1200 | ((v
>> 8) & 0x00000000ff000000)
1201 | ((v
<< 8) & 0x000000ff00000000)
1202 | ((v
<< 24) & 0x0000ff0000000000)
1203 | ((v
<< 40) & 0x00ff000000000000)
1204 | ((v
<< 56) & 0xff00000000000000);
1207 static inline u32x
rotr32 (const u32x a
, const u32 n
)
1209 return rotate (a
, 32 - n
);
1212 static inline u32x
rotl32 (const u32x a
, const u32 n
)
1214 return rotate (a
, n
);
1217 static inline u64x
rotr64 (const u64x a
, const u32 n
)
1219 return rotate (a
, (u64
) 64 - n
);
1222 static inline u64x
rotl64 (const u64x a
, const u32 n
)
1224 return rotate (a
, (u64
) n
);
1227 static inline u32
__bfe (const u32 a
, const u32 b
, const u32 c
)
1229 #define BIT(x) (1 << (x))
1230 #define BIT_MASK(x) (BIT (x) - 1)
1231 #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
1233 return BFE (a
, b
, c
);
1236 static inline u32x
amd_bytealign (const u32x a
, const u32x b
, const u32 c
)
1239 const u64x tmp
= ((((u64x
) (a
)) << 32) | ((u64x
) (b
))) >> ((c
& 3) * 8);
1241 return (u32x
) (tmp
);
1245 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
)) << 32) | ((u64x
) (b
.s0
, b
.s1
))) >> ((c
& 3) * 8);
1247 return (u32x
) (tmp
.s0
, tmp
.s1
);
1251 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
))) >> ((c
& 3) * 8);
1253 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
);
1257 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
, a
.s4
, a
.s5
, a
.s6
, a
.s7
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
, b
.s4
, b
.s5
, b
.s6
, b
.s7
))) >> ((c
& 3) * 8);
1259 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
, tmp
.s4
, tmp
.s5
, tmp
.s6
, tmp
.s7
);
1263 const u64x tmp
= ((((u64x
) (a
.s0
, a
.s1
, a
.s2
, a
.s3
, a
.s4
, a
.s5
, a
.s6
, a
.s7
, a
.s8
, a
.s9
, a
.sa
, a
.sb
, a
.sc
, a
.sd
, a
.se
, a
.sf
)) << 32) | ((u64x
) (b
.s0
, b
.s1
, b
.s2
, b
.s3
, b
.s4
, b
.s5
, b
.s6
, b
.s7
, b
.s8
, b
.s9
, b
.sa
, b
.sb
, b
.sc
, b
.sd
, b
.se
, b
.sf
))) >> ((c
& 3) * 8);
1265 return (u32x
) (tmp
.s0
, tmp
.s1
, tmp
.s2
, tmp
.s3
, tmp
.s4
, tmp
.s5
, tmp
.s6
, tmp
.s7
, tmp
.s8
, tmp
.s9
, tmp
.sa
, tmp
.sb
, tmp
.sc
, tmp
.sd
, tmp
.se
, tmp
.sf
);
1278 #elif defined _MD5H_
1280 #elif defined _SHA1_
1282 #elif defined _BCRYPT_
1284 #elif defined _SHA256_
1286 #elif defined _SHA384_
1288 #elif defined _SHA512_
1290 #elif defined _KECCAK_
1292 #elif defined _RIPEMD160_
1294 #elif defined _WHIRLPOOL_
1296 #elif defined _GOST_
1298 #elif defined _GOST2012_256_
1300 #elif defined _GOST2012_512_
1302 #elif defined _SAPB_
1304 #elif defined _SAPG_
1306 #elif defined _MYSQL323_
1308 #elif defined _LOTUS5_
1310 #elif defined _LOTUS6_
1312 #elif defined _SCRYPT_
1314 #elif defined _LOTUS8_
1316 #elif defined _OFFICE2007_
1318 #elif defined _OFFICE2010_
1320 #elif defined _OFFICE2013_
1322 #elif defined _OLDOFFICE01_
1324 #elif defined _OLDOFFICE34_
1326 #elif defined _SIPHASH_
1328 #elif defined _PBKDF2_MD5_
1330 #elif defined _PBKDF2_SHA1_
1332 #elif defined _PBKDF2_SHA256_
1334 #elif defined _PBKDF2_SHA512_
1336 #elif defined _PDF17L8_
1338 #elif defined _CRC32_
1340 #elif defined _SEVEN_ZIP_
1342 #elif defined _ANDROIDFDE_
1344 #elif defined _DCC2_
1348 #elif defined _MD5_SHA1_
1350 #elif defined _SHA1_MD5_
1352 #elif defined _NETNTLMV2_
1354 #elif defined _KRB5PA_
1356 #elif defined _CLOUDKEY_
1358 #elif defined _SCRYPT_
1360 #elif defined _PSAFE2_
1362 #elif defined _LOTUS8_
1364 #elif defined _RAR3_
1366 #elif defined _SHA256_SHA1_
1368 #elif defined _MS_DRSR_
1370 #elif defined _ANDROIDFDE_SAMSUNG_
1372 #elif defined _RAR5_
1374 #elif defined _KRB5TGS_
1376 #elif defined _AXCRYPT_
1378 #elif defined _KEEPASS_
1394 u32 truecrypt_mdlen
;
1445 u32 cry_master_buf
[64];
1447 u32 public_key_buf
[64];
1488 u32 userdomain_buf
[64];
1505 u32 account_info
[512];
1516 u32 keyfile_buf
[16];
1560 u32 encryptedVerifier
[4];
1561 u32 encryptedVerifierHash
[5];
1569 u32 encryptedVerifier
[4];
1570 u32 encryptedVerifierHash
[8];
1576 u32 encryptedVerifier
[4];
1577 u32 encryptedVerifierHash
[8];
1584 u32 encryptedVerifier
[4];
1585 u32 encryptedVerifierHash
[4];
1593 u32 encryptedVerifier
[4];
1594 u32 encryptedVerifierHash
[5];
1604 /* key-file handling */
1608 u32 final_random_seed
[8];
1609 u32 transf_random_seed
[8];
1611 u32 contents_hash
[8];
1613 /* specific to version 1 */
1615 u32 contents
[12500];
1617 /* specific to version 2 */
1618 u32 expected_bytes
[8];
1661 } sha256crypt_tmp_t
;
1665 u64 l_alt_result
[8];
1670 } sha512crypt_tmp_t
;
1686 } bitcoin_wallet_tmp_t
;
1784 } pbkdf2_sha1_tmp_t
;
1794 } pbkdf2_sha256_tmp_t
;
1804 } pbkdf2_sha512_tmp_t
;
2022 u32 alignment_placeholder_1
;
2023 u32 alignment_placeholder_2
;
2024 u32 alignment_placeholder_3
;