 * Authors.....: Jens Steube <jens.steube@gmail.com>
 *               magnum <john.magnum@hushmail.com>
// OpenCL device type markers (subset of the CL_DEVICE_TYPE_* bit values).
#define DEVICE_TYPE_CPU 2
#define DEVICE_TYPE_GPU 4
// Token-pasting helpers: VTYPE(uint, 4) expands to the OpenCL vector type "uint4".
#define CONCAT(a, b)       a##b
#define VTYPE(type, width) CONCAT(type, width)

// Vector-width-dependent unsigned integer types; VECT_SIZE is defined at build time.
typedef VTYPE(uchar,  VECT_SIZE) u8x;
typedef VTYPE(ushort, VECT_SIZE) u16x;
typedef VTYPE(uint,   VECT_SIZE) u32x;
typedef VTYPE(ulong,  VECT_SIZE) u64x;
// Low 32 bits of a 64-bit value (scalar variant).
static inline u32 l32_from_64_S (u64 a)
{
  const u32 r = (u32) (a);

  return r;
}
// High 32 bits of a 64-bit value (scalar variant).
static inline u32 h32_from_64_S (u64 a)
{
  a >>= 32; // NOTE(review): shift reconstructed — without it this would duplicate l32_from_64_S

  const u32 r = (u32) (a);

  return r;
}
// Combine two 32-bit words into a 64-bit value: a is the high word, b the low word.
static inline u32 hl32_to_64_S_decl_guard; // placeholder removed below
static inline u64 hl32_to_64_S (const u32 a, const u32 b)
{
  return as_ulong ((uint2) (b, a));
}
57 static inline u32x
l32_from_64 (u64x a
)
96 static inline u32x
h32_from_64 (u64x a
)
137 static inline u64x
hl32_to_64 (const u32x a
, const u32x b
)
142 r
= as_ulong ((uint2
) (b
, a
));
146 r
.s0
= as_ulong ((uint2
) (b
.s0
, a
.s0
));
147 r
.s1
= as_ulong ((uint2
) (b
.s1
, a
.s1
));
151 r
.s2
= as_ulong ((uint2
) (b
.s2
, a
.s2
));
152 r
.s3
= as_ulong ((uint2
) (b
.s3
, a
.s3
));
156 r
.s4
= as_ulong ((uint2
) (b
.s4
, a
.s4
));
157 r
.s5
= as_ulong ((uint2
) (b
.s5
, a
.s5
));
158 r
.s6
= as_ulong ((uint2
) (b
.s6
, a
.s6
));
159 r
.s7
= as_ulong ((uint2
) (b
.s7
, a
.s7
));
163 r
.s8
= as_ulong ((uint2
) (b
.s8
, a
.s8
));
164 r
.s9
= as_ulong ((uint2
) (b
.s9
, a
.s9
));
165 r
.sa
= as_ulong ((uint2
) (b
.sa
, a
.sa
));
166 r
.sb
= as_ulong ((uint2
) (b
.sb
, a
.sb
));
167 r
.sc
= as_ulong ((uint2
) (b
.sc
, a
.sc
));
168 r
.sd
= as_ulong ((uint2
) (b
.sd
, a
.sd
));
169 r
.se
= as_ulong ((uint2
) (b
.se
, a
.se
));
170 r
.sf
= as_ulong ((uint2
) (b
.sf
, a
.sf
));
// Byte-swap a 32-bit word (endianness swap) via byte reinterpretation.
static inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}
// Byte-swap a 64-bit word (endianness swap) via byte reinterpretation.
static inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}
// Rotate a 32-bit word right by n bits (OpenCL rotate() rotates left, hence 32 - n).
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}
// Rotate a 32-bit word left by n bits.
static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
// Rotate a 64-bit value right by n using two 32-bit funnel shifts (AMD bitalign path).
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  const u32 a0 = h32_from_64_S (a);
  const u32 a1 = l32_from_64_S (a);

  // For n >= 32 the halves swap roles and the effective shift is n - 32.
  const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
  const u32 t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);

  const u64 r = hl32_to_64_S (t0, t1);

  return r;
}
// Rotate a 64-bit value left by n, expressed via the right-rotate helper.
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotr64_S (a, 64 - n);
}
// Per-lane 32-bit byte swap via shift-and-mask (works for any vector width).
static inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}
// Per-lane 64-bit byte swap via shift-and-mask (works for any vector width).
static inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
// Per-lane rotate right (rotate() rotates left, hence 32 - n).
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}
// Per-lane rotate left by n bits.
static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
// Per-lane 64-bit rotate right via paired 32-bit bitaligns (AMD path).
static inline u64x rotr64 (const u64x a, const u32 n)
{
  const u32x a0 = h32_from_64 (a);
  const u32x a1 = l32_from_64 (a);

  // For n >= 32 the halves swap roles and the effective shift is n - 32.
  const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
  const u32x t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);

  const u64x r = hl32_to_64 (t0, t1);

  return r;
}
// Per-lane 64-bit rotate left, expressed via the right-rotate helper.
static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotr64 (a, 64 - n);
}
// Bit-field extract: thin wrapper over the AMD hardware intrinsic.
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  return amd_bfe (a, b, c);
}
// Scalar byte-align: thin wrapper over the AMD hardware intrinsic.
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  return amd_bytealign (a, b, c);
}
// Byte-swap a 32-bit word using the PTX byte-permute instruction (NV path).
static inline u32 swap32_S (const u32 v)
{
  u32 r;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));

  return r;
}
// Byte-swap a 64-bit word (NV path): split into halves, byte-swap each, swap halves.
static inline u64 swap64_S (const u64 v)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));

  u32 tl;
  u32 tr;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));

  u64 r;

  // Reassemble with the halves exchanged (tr becomes the low word).
  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));

  return r;
}
// Rotate a 32-bit word right by n bits (rotate() rotates left, hence 32 - n).
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}
// Rotate a 32-bit word left by n bits.
static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
// Rotate a 64-bit value right by n using PTX 32-bit funnel shifts (shf.r.wrap).
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  u32 tl;
  u32 tr;

  // NOTE(review): branch structure reconstructed — two asm pairs (n - 32 vs n)
  // correspond to the n >= 32 and n < 32 cases respectively.
  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  return r;
}
// Rotate a 64-bit value right by n (fallback using the native 64-bit rotate).
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
// Rotate a 64-bit value left by n, expressed via the right-rotate helper.
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotr64_S (a, 64 - n);
}
// Three-input bitwise LUT (PTX lop3.b32) with truth table 0x2d.
static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
// Three-input bitwise LUT (PTX lop3.b32) with truth table 0x39.
static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
// Three-input bitwise LUT (PTX lop3.b32) with truth table 0x59.
static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
// Three-input bitwise LUT (PTX lop3.b32) with truth table 0x96 (a ^ b ^ c).
static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
// Three-input bitwise LUT (PTX lop3.b32) with truth table 0xe4.
static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
// Three-input bitwise LUT (PTX lop3.b32) with truth table 0xe8.
static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
// Three-input bitwise LUT (PTX lop3.b32) with truth table 0xca.
static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}
// Byte permute (PTX prmt.b32): selects bytes from the pair (a, b) per selector c.
static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
// Per-lane 32-bit byte swap via shift-and-mask (works for any vector width).
static inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}
// Per-lane 64-bit byte swap via shift-and-mask (works for any vector width).
static inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
// Per-lane rotate right (rotate() rotates left, hence 32 - n).
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}
// Per-lane rotate left by n bits.
static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
459 static inline u64x
rotr64 (const u64x a
, const u32 n
)
470 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
));
474 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
475 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
479 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
480 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
483 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
) : "r"(tl
), "r"(tr
));
490 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s0
));
494 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
495 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
499 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
500 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
503 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s0
) : "r"(tl
), "r"(tr
));
507 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s1
));
511 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
512 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
516 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
517 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
520 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s1
) : "r"(tl
), "r"(tr
));
528 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s2
));
532 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
533 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
537 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
538 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
541 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s2
) : "r"(tl
), "r"(tr
));
545 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s3
));
549 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
550 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
554 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
555 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
558 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s3
) : "r"(tl
), "r"(tr
));
566 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s4
));
570 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
571 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
575 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
576 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
579 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s4
) : "r"(tl
), "r"(tr
));
583 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s5
));
587 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
588 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
592 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
593 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
596 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s5
) : "r"(tl
), "r"(tr
));
600 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s6
));
604 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
605 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
609 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
610 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
613 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s6
) : "r"(tl
), "r"(tr
));
617 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s7
));
621 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
622 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
626 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
627 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
630 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s7
) : "r"(tl
), "r"(tr
));
638 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s8
));
642 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
643 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
647 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
648 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
651 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s8
) : "r"(tl
), "r"(tr
));
655 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.s9
));
659 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
660 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
664 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
665 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
668 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.s9
) : "r"(tl
), "r"(tr
));
672 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sa
));
676 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
677 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
681 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
682 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
685 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sa
) : "r"(tl
), "r"(tr
));
689 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sb
));
693 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
694 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
698 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
699 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
702 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sb
) : "r"(tl
), "r"(tr
));
706 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sc
));
710 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
711 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
715 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
716 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
719 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sc
) : "r"(tl
), "r"(tr
));
723 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sd
));
727 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
728 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
732 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
733 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
736 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sd
) : "r"(tl
), "r"(tr
));
740 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.se
));
744 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
745 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
749 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
750 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
753 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.se
) : "r"(tl
), "r"(tr
));
757 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il
), "=r"(ir
) : "l"(a
.sf
));
761 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(ir
), "r"(il
), "r"(n
- 32));
762 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(il
), "r"(ir
), "r"(n
- 32));
766 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl
) : "r"(il
), "r"(ir
), "r"(n
));
767 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr
) : "r"(ir
), "r"(il
), "r"(n
));
770 asm ("mov.b64 %0, {%1, %2};" : "=l"(r
.sf
) : "r"(tl
), "r"(tr
));
// Per-lane 64-bit rotate right (fallback using the native rotate builtin).
static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
// Per-lane 64-bit rotate left, expressed via the right-rotate helper.
static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotr64 (a, (u64) 64 - n);
}
789 static inline u32x
__byte_perm (const u32x a
, const u32x b
, const u32x c
)
794 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
) : "r"(a
), "r"(b
), "r"(c
) );
798 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s0
) : "r"(a
.s0
), "r"(b
.s0
), "r"(c
.s0
));
799 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s1
) : "r"(a
.s1
), "r"(b
.s1
), "r"(c
.s1
));
803 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s2
) : "r"(a
.s2
), "r"(b
.s2
), "r"(c
.s2
));
804 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s3
) : "r"(a
.s3
), "r"(b
.s3
), "r"(c
.s3
));
808 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s4
) : "r"(a
.s4
), "r"(b
.s4
), "r"(c
.s4
));
809 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s5
) : "r"(a
.s5
), "r"(b
.s5
), "r"(c
.s5
));
810 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s6
) : "r"(a
.s6
), "r"(b
.s6
), "r"(c
.s6
));
811 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s7
) : "r"(a
.s7
), "r"(b
.s7
), "r"(c
.s7
));
815 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s8
) : "r"(a
.s8
), "r"(b
.s8
), "r"(c
.s8
));
816 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.s9
) : "r"(a
.s9
), "r"(b
.s9
), "r"(c
.s9
));
817 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sa
) : "r"(a
.sa
), "r"(b
.sa
), "r"(c
.sa
));
818 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sb
) : "r"(a
.sb
), "r"(b
.sb
), "r"(c
.sb
));
819 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sc
) : "r"(a
.sc
), "r"(b
.sc
), "r"(c
.sc
));
820 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sd
) : "r"(a
.sd
), "r"(b
.sd
), "r"(c
.sd
));
821 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.se
) : "r"(a
.se
), "r"(b
.se
), "r"(c
.se
));
822 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r
.sf
) : "r"(a
.sf
), "r"(b
.sf
), "r"(c
.sf
));
// Bit-field extract via the PTX bfe.u32 instruction: bits [b, b+c) of a.
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
// amd_bytealign emulation via PTX funnel shift: extracts 4 bytes from (a:b)
// starting (c & 3) bytes into b.
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));

  return r;
}
// amd_bytealign emulation via byte permute: the selector picks 4 consecutive
// bytes out of the (b, a) pair starting at offset c & 3.
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
}
854 static inline u32x
lut3_2d (const u32x a
, const u32x b
, const u32x c
)
859 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
863 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
864 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
868 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
869 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
873 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
874 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
875 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
876 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
880 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
881 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
882 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
883 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
884 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
885 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
886 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
887 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
893 static inline u32x
lut3_39 (const u32x a
, const u32x b
, const u32x c
)
898 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
902 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
903 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
907 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
908 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
912 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
913 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
914 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
915 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
919 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
920 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
921 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
922 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
923 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
924 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
925 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
926 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
932 static inline u32x
lut3_59 (const u32x a
, const u32x b
, const u32x c
)
937 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
941 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
942 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
946 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
947 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
951 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
952 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
953 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
954 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
958 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
959 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
960 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
961 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
962 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
963 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
964 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
965 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
971 static inline u32x
lut3_96 (const u32x a
, const u32x b
, const u32x c
)
976 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
980 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
981 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
985 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
986 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
990 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
991 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
992 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
993 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
997 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
998 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
999 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1000 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1001 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1002 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1003 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1004 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1010 static inline u32x
lut3_e4 (const u32x a
, const u32x b
, const u32x c
)
1015 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1019 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1020 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1024 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1025 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1029 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1030 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1031 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1032 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1036 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1037 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1038 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1039 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1040 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1041 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1042 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1043 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1049 static inline u32x
lut3_e8 (const u32x a
, const u32x b
, const u32x c
)
1054 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1058 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1059 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1063 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1064 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1068 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1069 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1070 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1071 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1075 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1076 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1077 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1078 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1079 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1080 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1081 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1082 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
1088 static inline u32x
lut3_ca (const u32x a
, const u32x b
, const u32x c
)
1093 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
) : "r" (a
), "r" (b
), "r" (c
));
1097 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s0
) : "r" (a
.s0
), "r" (b
.s0
), "r" (c
.s0
));
1098 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s1
) : "r" (a
.s1
), "r" (b
.s1
), "r" (c
.s1
));
1102 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s2
) : "r" (a
.s2
), "r" (b
.s2
), "r" (c
.s2
));
1103 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s3
) : "r" (a
.s3
), "r" (b
.s3
), "r" (c
.s3
));
1107 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s4
) : "r" (a
.s4
), "r" (b
.s4
), "r" (c
.s4
));
1108 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s5
) : "r" (a
.s5
), "r" (b
.s5
), "r" (c
.s5
));
1109 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s6
) : "r" (a
.s6
), "r" (b
.s6
), "r" (c
.s6
));
1110 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s7
) : "r" (a
.s7
), "r" (b
.s7
), "r" (c
.s7
));
1114 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s8
) : "r" (a
.s8
), "r" (b
.s8
), "r" (c
.s8
));
1115 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.s9
) : "r" (a
.s9
), "r" (b
.s9
), "r" (c
.s9
));
1116 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sa
) : "r" (a
.sa
), "r" (b
.sa
), "r" (c
.sa
));
1117 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sb
) : "r" (a
.sb
), "r" (b
.sb
), "r" (c
.sb
));
1118 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sc
) : "r" (a
.sc
), "r" (b
.sc
), "r" (c
.sc
));
1119 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sd
) : "r" (a
.sd
), "r" (b
.sd
), "r" (c
.sd
));
1120 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.se
) : "r" (a
.se
), "r" (b
.se
), "r" (c
.se
));
1121 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r
.sf
) : "r" (a
.sf
), "r" (b
.sf
), "r" (c
.sf
));
// Byte-swap a 32-bit word (generic fallback).
static inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}
// Byte-swap a 64-bit word (generic fallback).
static inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}
// Rotate a 32-bit word right by n bits (rotate() rotates left, hence 32 - n).
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}
// Rotate a 32-bit word left by n bits.
static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
// Rotate a 64-bit value right by n (generic fallback via native rotate).
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}
// Rotate a 64-bit value left by n (generic fallback via native rotate).
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) n);
}
// Software amd_bytealign: selects 4 bytes from the 64-bit concatenation (a:b),
// starting (c & 3) bytes into b.
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

  return (u32) (tmp); // NOTE(review): return reconstructed — truncates to the low 32 bits
}
// Per-lane 32-bit byte swap via shift-and-mask (generic fallback).
static inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}
// Per-lane 64-bit byte swap via shift-and-mask (generic fallback).
static inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
// Rotate each 32-bit lane right by n bits.
// NOTE(review): for n == 0 the count becomes 32; OpenCL's rotate ()
// takes the count modulo the bit width, so that is safe.
static inline u32x rotr32 (const u32x a, const u32 n)
{
  const u32x r = rotate (a, 32 - n);

  return r;
}
// Rotate each 32-bit lane left by n bits.
static inline u32x rotl32 (const u32x a, const u32 n)
{
  const u32x r = rotate (a, n);

  return r;
}
// Rotate each 64-bit lane right by n bits.
// NOTE(review): for n == 0 the count becomes 64; OpenCL's rotate ()
// takes the count modulo the bit width, so that is safe.
static inline u64x rotr64 (const u64x a, const u32 n)
{
  const u64x r = rotate (a, (u64) 64 - n);

  return r;
}
// Rotate each 64-bit lane left by n bits.
static inline u64x rotl64 (const u64x a, const u32 n)
{
  const u64x r = rotate (a, (u64) n);

  return r;
}
// Software bit-field extract: return the c bits of a starting at bit b
// (software fallback for the hardware bfe instruction).
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  // Use an unsigned literal: "1 << 31" left-shifts into the sign bit of a
  // signed int, which is undefined behavior in C99/OpenCL C. "1u" yields
  // the same mask values for all in-range widths without the UB.
  #define BIT(x)      (1u << (x))
  #define BIT_MASK(x) (BIT (x) - 1)
  #define BFE(x,y,z)  (((x) >> (y)) & BIT_MASK (z))

  return BFE (a, b, c);
}
// Vector emulation of AMD's amd_bytealign intrinsic: per lane, shift the
// 64-bit concatenation a:b right by (c mod 4) bytes and keep the low
// 32 bits. Widening to u64x is done per VECT_SIZE because OpenCL vector
// literals need the components spelled out lane by lane.
static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
{
  const u32 sh = (c & 3) * 8;

  #if VECT_SIZE == 1

  const u64x ab = ((((u64x) (a)) << 32) | ((u64x) (b))) >> sh;

  return (u32x) (ab);

  #elif VECT_SIZE == 2

  const u64x ab = ((((u64x) (a.s0, a.s1)) << 32)
                 | (((u64x) (b.s0, b.s1)))) >> sh;

  return (u32x) (ab.s0, ab.s1);

  #elif VECT_SIZE == 4

  const u64x ab = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32)
                 | (((u64x) (b.s0, b.s1, b.s2, b.s3)))) >> sh;

  return (u32x) (ab.s0, ab.s1, ab.s2, ab.s3);

  #elif VECT_SIZE == 8

  const u64x ab = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32)
                 | (((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7)))) >> sh;

  return (u32x) (ab.s0, ab.s1, ab.s2, ab.s3, ab.s4, ab.s5, ab.s6, ab.s7);

  #elif VECT_SIZE == 16

  const u64x ab = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7,
                             a.s8, a.s9, a.sa, a.sb, a.sc, a.sd, a.se, a.sf)) << 32)
                 | (((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7,
                             b.s8, b.s9, b.sa, b.sb, b.sc, b.sd, b.se, b.sf)))) >> sh;

  return (u32x) (ab.s0, ab.s1, ab.s2, ab.s3, ab.s4, ab.s5, ab.s6, ab.s7,
                 ab.s8, ab.s9, ab.sa, ab.sb, ab.sc, ab.sd, ab.se, ab.sf);

  #endif
}
1259 #elif defined _MD5H_
1261 #elif defined _SHA1_
1263 #elif defined _BCRYPT_
1265 #elif defined _SHA256_
1267 #elif defined _SHA384_
1269 #elif defined _SHA512_
1271 #elif defined _KECCAK_
1273 #elif defined _RIPEMD160_
1275 #elif defined _WHIRLPOOL_
1277 #elif defined _GOST_
1279 #elif defined _GOST2012_256_
1281 #elif defined _GOST2012_512_
1283 #elif defined _SAPB_
1285 #elif defined _SAPG_
1287 #elif defined _MYSQL323_
1289 #elif defined _LOTUS5_
1291 #elif defined _LOTUS6_
1293 #elif defined _SCRYPT_
1295 #elif defined _LOTUS8_
1297 #elif defined _OFFICE2007_
1299 #elif defined _OFFICE2010_
1301 #elif defined _OFFICE2013_
1303 #elif defined _OLDOFFICE01_
1305 #elif defined _OLDOFFICE34_
1307 #elif defined _SIPHASH_
1309 #elif defined _PBKDF2_MD5_
1311 #elif defined _PBKDF2_SHA1_
1313 #elif defined _PBKDF2_SHA256_
1315 #elif defined _PBKDF2_SHA512_
1317 #elif defined _PDF17L8_
1319 #elif defined _CRC32_
1321 #elif defined _SEVEN_ZIP_
1323 #elif defined _ANDROIDFDE_
1325 #elif defined _DCC2_
1329 #elif defined _MD5_SHA1_
1331 #elif defined _SHA1_MD5_
1333 #elif defined _NETNTLMV2_
1335 #elif defined _KRB5PA_
1337 #elif defined _CLOUDKEY_
1339 #elif defined _SCRYPT_
1341 #elif defined _PSAFE2_
1343 #elif defined _LOTUS8_
1345 #elif defined _RAR3_
1347 #elif defined _SHA256_SHA1_
1349 #elif defined _MS_DRSR_
1351 #elif defined _ANDROIDFDE_SAMSUNG_
1353 #elif defined _RAR5_
1355 #elif defined _KRB5TGS_
1357 #elif defined _AXCRYPT_
1359 #elif defined _KEEPASS_
1375 u32 truecrypt_mdlen
;
1426 u32 cry_master_buf
[64];
1428 u32 public_key_buf
[64];
1469 u32 userdomain_buf
[64];
1486 u32 account_info
[512];
1497 u32 keyfile_buf
[16];
1541 u32 encryptedVerifier
[4];
1542 u32 encryptedVerifierHash
[5];
1550 u32 encryptedVerifier
[4];
1551 u32 encryptedVerifierHash
[8];
1557 u32 encryptedVerifier
[4];
1558 u32 encryptedVerifierHash
[8];
1565 u32 encryptedVerifier
[4];
1566 u32 encryptedVerifierHash
[4];
1574 u32 encryptedVerifier
[4];
1575 u32 encryptedVerifierHash
[5];
1595 /* key-file handling */
1599 u32 final_random_seed
[8];
1600 u32 transf_random_seed
[8];
1602 u32 contents_hash
[8];
1604 /* specific to version 1 */
1606 u32 contents
[75000];
1608 /* specific to version 2 */
1609 u32 expected_bytes
[8];
1652 } sha256crypt_tmp_t
;
1656 u64 l_alt_result
[8];
1661 } sha512crypt_tmp_t
;
1677 } bitcoin_wallet_tmp_t
;
1775 } pbkdf2_sha1_tmp_t
;
1785 } pbkdf2_sha256_tmp_t
;
1795 } pbkdf2_sha512_tmp_t
;
2013 u32 alignment_placeholder_1
;
2014 u32 alignment_placeholder_2
;
2015 u32 alignment_placeholder_3
;