/**
 * Authors.....: Jens Steube <jens.steube@gmail.com>
 *               magnum <john.magnum@hushmail.com>
 */
#define DEVICE_TYPE_CPU 2
#define DEVICE_TYPE_GPU 4
#define CONCAT(a, b)       a##b
#define VTYPE(type, width) CONCAT(type, width)
typedef VTYPE(uchar,  VECT_SIZE) u8x;
typedef VTYPE(ushort, VECT_SIZE) u16x;
typedef VTYPE(uint,   VECT_SIZE) u32x;
typedef VTYPE(ulong,  VECT_SIZE) u64x;
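/*
 * CONCAT/VTYPE build the OpenCL vector type names by token pasting: with
 * VECT_SIZE == 4, VTYPE(uint, VECT_SIZE) expands to uint4, so u32x is a
 * four-lane vector. Note (assumption, based on the macros alone): with
 * VECT_SIZE == 1 the paste would yield "uint1", which is not a built-in
 * OpenCL type, so scalar builds are expected to be handled separately.
 */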
inline u32 l32_from_64_S (u64 a)
{
  const u32 r = (u32) (a);

  return r;
}
inline u32 h32_from_64_S (u64 a)
{
  const u32 r = (u32) (a >> 32);

  return r;
}
inline u64 hl32_to_64_S (const u32 a, const u32 b)
{
  return as_ulong ((uint2) (b, a));
}
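/*
 * hl32_to_64_S packs two 32-bit halves into one 64-bit value: component
 * .s0 of the uint2 literal is b and .s1 is a, and on a little-endian
 * device (the usual case for GPUs) as_ulong reinterprets that as
 * ((u64) a << 32) | b, i.e. a is the high half and b the low half.
 * Example: hl32_to_64_S (0x11111111, 0x22222222) == 0x1111111122222222.
 */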
inline u32x l32_from_64 (u64x a)
{
  u32x r;

  #if VECT_SIZE == 1
  r    = (u32) (a);
  #endif

  #if VECT_SIZE >= 2
  r.s0 = (u32) (a.s0);
  r.s1 = (u32) (a.s1);
  #endif

  #if VECT_SIZE >= 4
  r.s2 = (u32) (a.s2);
  r.s3 = (u32) (a.s3);
  #endif

  #if VECT_SIZE >= 8
  r.s4 = (u32) (a.s4);
  r.s5 = (u32) (a.s5);
  r.s6 = (u32) (a.s6);
  r.s7 = (u32) (a.s7);
  #endif

  #if VECT_SIZE >= 16
  r.s8 = (u32) (a.s8);
  r.s9 = (u32) (a.s9);
  r.sa = (u32) (a.sa);
  r.sb = (u32) (a.sb);
  r.sc = (u32) (a.sc);
  r.sd = (u32) (a.sd);
  r.se = (u32) (a.se);
  r.sf = (u32) (a.sf);
  #endif

  return r;
}
inline u32x h32_from_64 (u64x a)
{
  u32x r;

  #if VECT_SIZE == 1
  r    = (u32) (a >> 32);
  #endif

  #if VECT_SIZE >= 2
  r.s0 = (u32) (a.s0 >> 32);
  r.s1 = (u32) (a.s1 >> 32);
  #endif

  #if VECT_SIZE >= 4
  r.s2 = (u32) (a.s2 >> 32);
  r.s3 = (u32) (a.s3 >> 32);
  #endif

  #if VECT_SIZE >= 8
  r.s4 = (u32) (a.s4 >> 32);
  r.s5 = (u32) (a.s5 >> 32);
  r.s6 = (u32) (a.s6 >> 32);
  r.s7 = (u32) (a.s7 >> 32);
  #endif

  #if VECT_SIZE >= 16
  r.s8 = (u32) (a.s8 >> 32);
  r.s9 = (u32) (a.s9 >> 32);
  r.sa = (u32) (a.sa >> 32);
  r.sb = (u32) (a.sb >> 32);
  r.sc = (u32) (a.sc >> 32);
  r.sd = (u32) (a.sd >> 32);
  r.se = (u32) (a.se >> 32);
  r.sf = (u32) (a.sf >> 32);
  #endif

  return r;
}
inline u64x hl32_to_64 (const u32x a, const u32x b)
{
  u64x r;

  #if VECT_SIZE == 1
  r    = as_ulong ((uint2) (b,    a));
  #endif

  #if VECT_SIZE >= 2
  r.s0 = as_ulong ((uint2) (b.s0, a.s0));
  r.s1 = as_ulong ((uint2) (b.s1, a.s1));
  #endif

  #if VECT_SIZE >= 4
  r.s2 = as_ulong ((uint2) (b.s2, a.s2));
  r.s3 = as_ulong ((uint2) (b.s3, a.s3));
  #endif

  #if VECT_SIZE >= 8
  r.s4 = as_ulong ((uint2) (b.s4, a.s4));
  r.s5 = as_ulong ((uint2) (b.s5, a.s5));
  r.s6 = as_ulong ((uint2) (b.s6, a.s6));
  r.s7 = as_ulong ((uint2) (b.s7, a.s7));
  #endif

  #if VECT_SIZE >= 16
  r.s8 = as_ulong ((uint2) (b.s8, a.s8));
  r.s9 = as_ulong ((uint2) (b.s9, a.s9));
  r.sa = as_ulong ((uint2) (b.sa, a.sa));
  r.sb = as_ulong ((uint2) (b.sb, a.sb));
  r.sc = as_ulong ((uint2) (b.sc, a.sc));
  r.sd = as_ulong ((uint2) (b.sd, a.sd));
  r.se = as_ulong ((uint2) (b.se, a.se));
  r.sf = as_ulong ((uint2) (b.sf, a.sf));
  #endif

  return r;
}
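/*
 * The per-lane expansion above is needed because as_ulong can only
 * reinterpret an 8-byte source such as uint2; two u32x vectors cannot be
 * combined lane-wise into a u64x with a single as_type cast, so each
 * 64-bit lane is assembled from its own (uint2) (b.sN, a.sN) pair.
 */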
#ifdef IS_AMD

inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}

inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}
inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
inline u64 rotr64_S (const u64 a, const u32 n)
{
  const u32 a0 = h32_from_64_S (a);
  const u32 a1 = l32_from_64_S (a);

  const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
  const u32 t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);

  const u64 r = hl32_to_64_S (t0, t1);

  return r;
}
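/*
 * amd_bitalign (x, y, s) returns the low 32 bits of the 64-bit value
 * (x:y) shifted right by s (0..31), i.e. a funnel shift. For n < 32 the
 * high half of the rotated value is (a0 >> n) | (a1 << (32 - n)), which
 * is exactly amd_bitalign (a1, a0, n), and the low half uses the swapped
 * operand order; for n >= 32 the two halves change places and the shift
 * count drops by 32.
 */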
inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotr64_S (a, 64 - n);
}
inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}
inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
inline u64x rotr64 (const u64x a, const u32 n)
{
  const u32x a0 = h32_from_64 (a);
  const u32x a1 = l32_from_64 (a);

  const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
  const u32x t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);

  const u64x r = hl32_to_64 (t0, t1);

  return r;
}
inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotr64 (a, 64 - n);
}
inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  return amd_bfe (a, b, c);
}
inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  return amd_bytealign (a, b, c);
}

#endif
#ifdef IS_NV

inline u32 swap32_S (const u32 v)
{
  u32 r;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));

  return r;
}
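/*
 * prmt.b32 with selector 0x0123 writes source byte 3 to result byte 0,
 * byte 2 to byte 1, byte 1 to byte 2 and byte 0 to byte 3, i.e. a full
 * byte reverse. Example: swap32_S (0x11223344) == 0x44332211.
 */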
inline u64 swap64_S (const u64 v)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));

  u32 tl;
  u32 tr;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));

  return r;
}
inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
#if CUDA_ARCH >= 350

inline u64 rotr64_S (const u64 a, const u32 n)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  u32 tl;
  u32 tr;

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  return r;
}
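/*
 * shf.r.wrap.b32 d, lo, hi, s is a funnel shift right: it returns the
 * low 32 bits of ((hi:lo) >> (s & 31)). A 64-bit rotate right therefore
 * takes two funnel shifts, one per result half, with the operand order
 * (and a shift count reduced by 32) swapped when n >= 32.
 */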
#else

inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

#endif
inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotr64_S (a, 64 - n);
}
inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}
inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
#if CUDA_ARCH >= 350

inline u64x rotr64 (const u64x a, const u32 n)
{
  u64x r;

  u32 il;
  u32 ir;

  u32 tl;
  u32 tr;

  /* One 64-bit lane: unpack into halves, funnel-shift each half, repack. */
  #define ROTR64_LANE(dst, src)                                                           \
  do {                                                                                    \
    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(src));                        \
    if (n >= 32)                                                                          \
    {                                                                                     \
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));  \
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));  \
    }                                                                                     \
    else                                                                                  \
    {                                                                                     \
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));       \
      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));       \
    }                                                                                     \
    asm ("mov.b64 %0, {%1, %2};" : "=l"(dst) : "r"(tl), "r"(tr));                         \
  } while (0)

  #if VECT_SIZE == 1
  ROTR64_LANE (r, a);
  #endif

  #if VECT_SIZE >= 2
  ROTR64_LANE (r.s0, a.s0);
  ROTR64_LANE (r.s1, a.s1);
  #endif

  #if VECT_SIZE >= 4
  ROTR64_LANE (r.s2, a.s2);
  ROTR64_LANE (r.s3, a.s3);
  #endif

  #if VECT_SIZE >= 8
  ROTR64_LANE (r.s4, a.s4);
  ROTR64_LANE (r.s5, a.s5);
  ROTR64_LANE (r.s6, a.s6);
  ROTR64_LANE (r.s7, a.s7);
  #endif

  #if VECT_SIZE >= 16
  ROTR64_LANE (r.s8, a.s8);
  ROTR64_LANE (r.s9, a.s9);
  ROTR64_LANE (r.sa, a.sa);
  ROTR64_LANE (r.sb, a.sb);
  ROTR64_LANE (r.sc, a.sc);
  ROTR64_LANE (r.sd, a.sd);
  ROTR64_LANE (r.se, a.se);
  ROTR64_LANE (r.sf, a.sf);
  #endif

  #undef ROTR64_LANE

  return r;
}
#else

inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

#endif
inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotr64 (a, 64 - n);
}
inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r)    : "r"(a),    "r"(b),    "r"(c));
  #endif

  #if VECT_SIZE >= 2
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
  #endif

  #if VECT_SIZE >= 4
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
  #endif

  #if VECT_SIZE >= 8
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
  #endif

  #if VECT_SIZE >= 16
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(a.s8), "r"(b.s8), "r"(c.s8));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(a.s9), "r"(b.s9), "r"(c.s9));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(a.sa), "r"(b.sa), "r"(c.sa));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(a.sb), "r"(b.sb), "r"(c.sb));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(a.sc), "r"(b.sc), "r"(c.sc));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(a.sd), "r"(b.sd), "r"(c.sd));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(a.se), "r"(b.se), "r"(c.se));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(a.sf), "r"(b.sf), "r"(c.sf));
  #endif

  return r;
}
inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
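/*
 * bfe.u32 extracts a bit field: __bfe (a, b, c) returns c bits of a
 * starting at bit position b, zero-extended.
 * Example: __bfe (0x00abcd00, 8, 16) == 0xabcd.
 */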
#if CUDA_ARCH >= 350

inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));

  return r;
}
#else

inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
}
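/*
 * The prmt-based emulation above works because amd_bytealign (a, b, c)
 * is the low 32 bits of ((a:b) >> (8 * (c & 3))), i.e. a byte-level
 * selection from the {b, a} source pair. The selector
 * (0x76543210 >> ((c & 3) * 4)) & 0xffff names those four bytes: for
 * c & 3 == 1 it is 0x4321, picking bytes 1..3 of b followed by byte 0
 * of a.
 */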
#endif

#endif

#ifdef IS_GENERIC

inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}

inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}
inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) n);
}
inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

  return (u32) (tmp);
}
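/*
 * Portable bytealign: concatenate a (high) and b (low) into 64 bits and
 * shift right by whole bytes.
 * Example: amd_bytealign_S (0xaabbccdd, 0x11223344, 1) == 0xdd112233.
 */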
inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}
inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) n);
}
inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  #define BIT(x)      (1 << (x))
  #define BIT_MASK(x) (BIT (x) - 1)
  #define BFE(x,y,z)  (((x) >> (y)) & BIT_MASK (z))

  return BFE (a, b, c);
}
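/*
 * BFE (x, y, z) masks z bits of x starting at bit y.
 * Example: __bfe (0x0000abcd, 4, 8) == 0xbc. Note that BIT (32) would
 * overflow a 32-bit shift, so callers are assumed to pass widths below 32.
 */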
inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
{
  #if VECT_SIZE == 1
  const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);

  return (u32x) (tmp);
  #endif

  #if VECT_SIZE == 2
  const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1);
  #endif

  #if VECT_SIZE == 4
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);
  #endif

  #if VECT_SIZE == 8
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
  #endif

  #if VECT_SIZE == 16
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7, a.s8, a.s9, a.sa, a.sb, a.sc, a.sd, a.se, a.sf)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7, b.s8, b.s9, b.sa, b.sb, b.sc, b.sd, b.se, b.sf))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7, tmp.s8, tmp.s9, tmp.sa, tmp.sb, tmp.sc, tmp.sd, tmp.se, tmp.sf);
  #endif
}

#endif
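/*
 * Build-time hash-mode dispatch: each kernel is compiled with exactly one
 * algorithm macro (for example _SHA256_) defined, and the matching branch
 * below supplies that algorithm's specific definitions.
 */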
#elif defined _BCRYPT_
#elif defined _SHA256_
#elif defined _SHA384_
#elif defined _SHA512_
#elif defined _KECCAK_
#elif defined _RIPEMD160_
#elif defined _WHIRLPOOL_
#elif defined _GOST2012_256_
#elif defined _GOST2012_512_
#elif defined _MYSQL323_
#elif defined _LOTUS5_
#elif defined _LOTUS6_
#elif defined _SCRYPT_
#elif defined _LOTUS8_
#elif defined _OFFICE2007_
#elif defined _OFFICE2010_
#elif defined _OFFICE2013_
#elif defined _OLDOFFICE01_
#elif defined _OLDOFFICE34_
#elif defined _SIPHASH_
#elif defined _PBKDF2_MD5_
#elif defined _PBKDF2_SHA1_
#elif defined _PBKDF2_SHA256_
#elif defined _PBKDF2_SHA512_
#elif defined _PDF17L8_
#elif defined _CRC32_
#elif defined _SEVEN_ZIP_
#elif defined _ANDROIDFDE_

#elif defined _MD5_SHA1_
#elif defined _SHA1_MD5_
#elif defined _NETNTLMV2_
#elif defined _KRB5PA_
#elif defined _CLOUDKEY_
#elif defined _SCRYPT_
#elif defined _PSAFE2_
#elif defined _LOTUS8_
#elif defined _RAR3_
#elif defined _SHA256_SHA1_
#elif defined _MS_DRSR_
#elif defined _ANDROIDFDE_SAMSUNG_
#elif defined _RAR5_
#elif defined _KRB5TGS_
#elif defined _AXCRYPT_
#elif defined _KEEPASS_
u32 truecrypt_mdlen;

u32 cry_master_buf[64];

u32 public_key_buf[64];

u32 userdomain_buf[64];

u32 account_info[512];

u32 keyfile_buf[16];

u32 encryptedVerifier[4];
u32 encryptedVerifierHash[5];

u32 encryptedVerifier[4];
u32 encryptedVerifierHash[8];

u32 encryptedVerifier[4];
u32 encryptedVerifierHash[8];

u32 encryptedVerifier[4];
u32 encryptedVerifierHash[4];

u32 encryptedVerifier[4];
u32 encryptedVerifierHash[5];

/* key-file handling */

u32 final_random_seed[8];
u32 transf_random_seed[8];

u32 contents_hash[8];

/* specific to version 1 */

u32 contents[75000];

/* specific to version 2 */

u32 expected_bytes[8];

} sha256crypt_tmp_t;

u64 l_alt_result[8];

} sha512crypt_tmp_t;

} bitcoin_wallet_tmp_t;

} pbkdf2_sha1_tmp_t;

} pbkdf2_sha256_tmp_t;

} pbkdf2_sha512_tmp_t;

u32 alignment_placeholder_1;
u32 alignment_placeholder_2;
u32 alignment_placeholder_3;