/**
 * Author......: Jens Steube <jens.steube@gmail.com>
 */
#define DEVICE_TYPE_CPU 2
#define DEVICE_TYPE_GPU 4
// this one needs to die

static inline u32 l32_from_64_S (u64 a)
{
  const u32 r = (u32) (a);

  return r;
}
static inline u32 h32_from_64_S (u64 a)
{
  const u32 r = (u32) (a >> 32);

  return r;
}
static inline u64 hl32_to_64_S (const u32 a, const u32 b)
{
  return as_ulong ((uint2) (b, a));
}
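
// On the little-endian devices this code targets, as_ulong ((uint2) (b, a))
// puts b in the low word and a in the high word, i.e. hl32_to_64_S computes
// ((u64) a << 32) | b. For example (values are illustrative):
//
//   hl32_to_64_S (0x11223344, 0x55667788) == 0x1122334455667788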
static inline u32x l32_from_64 (u64x a)
{
  u32x r;

  #if VECT_SIZE == 1
  r = (u32) a;
  #endif

  #if VECT_SIZE >= 2
  r.s0 = (u32) a.s0;
  r.s1 = (u32) a.s1;
  #endif

  #if VECT_SIZE >= 4
  r.s2 = (u32) a.s2;
  r.s3 = (u32) a.s3;
  #endif

  #if VECT_SIZE >= 8
  r.s4 = (u32) a.s4;
  r.s5 = (u32) a.s5;
  r.s6 = (u32) a.s6;
  r.s7 = (u32) a.s7;
  #endif

  return r;
}

static inline u32x h32_from_64 (u64x a)
{
  u32x r;

  #if VECT_SIZE == 1
  r = (u32) (a >> 32);
  #endif

  #if VECT_SIZE >= 2
  r.s0 = (u32) (a.s0 >> 32);
  r.s1 = (u32) (a.s1 >> 32);
  #endif

  #if VECT_SIZE >= 4
  r.s2 = (u32) (a.s2 >> 32);
  r.s3 = (u32) (a.s3 >> 32);
  #endif

  #if VECT_SIZE >= 8
  r.s4 = (u32) (a.s4 >> 32);
  r.s5 = (u32) (a.s5 >> 32);
  r.s6 = (u32) (a.s6 >> 32);
  r.s7 = (u32) (a.s7 >> 32);
  #endif

  return r;
}

static inline u64x hl32_to_64 (const u32x a, const u32x b)
{
  u64x r;

  #if VECT_SIZE == 1
  r = as_ulong ((uint2) (b, a));
  #endif

  #if VECT_SIZE >= 2
  r.s0 = as_ulong ((uint2) (b.s0, a.s0));
  r.s1 = as_ulong ((uint2) (b.s1, a.s1));
  #endif

  #if VECT_SIZE >= 4
  r.s2 = as_ulong ((uint2) (b.s2, a.s2));
  r.s3 = as_ulong ((uint2) (b.s3, a.s3));
  #endif

  #if VECT_SIZE >= 8
  r.s4 = as_ulong ((uint2) (b.s4, a.s4));
  r.s5 = as_ulong ((uint2) (b.s5, a.s5));
  r.s6 = as_ulong ((uint2) (b.s6, a.s6));
  r.s7 = as_ulong ((uint2) (b.s7, a.s7));
  #endif

  return r;
}
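
// The vector helpers above (and most u32x/u64x functions below) follow one
// compile-time pattern: a scalar expression for VECT_SIZE == 1 and explicit
// .s0 .. .s7 lane assignments for the wider widths, since OpenCL C offers no
// generic way to loop over the components of a vector type.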
#ifdef IS_AMD

static inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}

static inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}
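
// Both functions reverse byte order via a component shuffle, e.g.:
//
//   swap32_S (0x11223344)         == 0x44332211
//   swap64_S (0x1122334455667788) == 0x8877665544332211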
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
static inline u64 rotr64_S (const u64 a, const u32 n)
{
  u64 r;

  #if DEVICE_TYPE == DEVICE_TYPE_CPU

  r = rotate (a, (u64) 64 - n);

  #else

  uint2 a2 = as_uint2 (a);

  uint2 t;

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32)
                   : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32)
                   : amd_bitalign (a2.s0, a2.s1, n);

  r = as_ulong (t);

  #endif

  return r;
}
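
// amd_bitalign (hi, lo, s) returns the low 32 bits of ((u64) hi << 32 | lo) >> s
// for s in 0..31, so a 64-bit rotate decomposes into two 32-bit funnel shifts.
// For 0 < n < 32 the identities used above are:
//
//   low word  of rotr64 (a, n) == amd_bitalign (high (a), low (a), n)
//   high word of rotr64 (a, n) == amd_bitalign (low (a), high (a), n)
//
// and for n >= 32 the same holds with the input words swapped and n - 32.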
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotr64_S (a, 64 - n);
}
static inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}
static inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
static inline u64x rotr64 (const u64x a, const u32 n)
{
  u64x r;

  #if DEVICE_TYPE == DEVICE_TYPE_CPU

  r = rotate (a, (u64) 64 - n);

  #else

  uint2 a2;
  uint2 t;

  #if VECT_SIZE == 1

  a2 = as_uint2 (a);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r = as_ulong (t);

  #endif

  #if VECT_SIZE == 2

  a2 = as_uint2 (a.s0);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s0 = as_ulong (t);

  a2 = as_uint2 (a.s1);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s1 = as_ulong (t);

  #endif

  #if VECT_SIZE == 4

  a2 = as_uint2 (a.s0);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s0 = as_ulong (t);

  a2 = as_uint2 (a.s1);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s1 = as_ulong (t);

  a2 = as_uint2 (a.s2);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s2 = as_ulong (t);

  a2 = as_uint2 (a.s3);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s3 = as_ulong (t);

  #endif

  #if VECT_SIZE == 8

  a2 = as_uint2 (a.s0);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s0 = as_ulong (t);

  a2 = as_uint2 (a.s1);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s1 = as_ulong (t);

  a2 = as_uint2 (a.s2);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s2 = as_ulong (t);

  a2 = as_uint2 (a.s3);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s3 = as_ulong (t);

  a2 = as_uint2 (a.s4);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s4 = as_ulong (t);

  a2 = as_uint2 (a.s5);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s5 = as_ulong (t);

  a2 = as_uint2 (a.s6);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s6 = as_ulong (t);

  a2 = as_uint2 (a.s7);

  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);

  r.s7 = as_ulong (t);

  #endif

  #endif

  return r;
}
static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotr64 (a, 64 - n);
}
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  return amd_bfe (a, b, c);
}
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  return amd_bytealign (a, b, c);
}

#endif
#ifdef IS_NV

static inline u32 swap32_S (const u32 v)
{
  u32 r;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));

  return r;
}
static inline u64 swap64_S (const u64 v)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));

  u32 tl;
  u32 tr;

  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
  asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));

  return r;
}
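
// prmt.b32 with selector 0x0123 picks the bytes of the first source in
// reverse order (selector nibbles are read from the least significant end),
// which makes it a single-instruction byte swap. swap64_S byte-swaps each
// 32-bit half and then exchanges the halves when repacking (tr before tl).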
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}
#if CUDA_ARCH >= 350

static inline u64 rotr64_S (const u64 a, const u32 n)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  u32 tl;
  u32 tr;

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  return r;
}
#else

static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

#endif
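
// shf.r.wrap.b32 d, lo, hi, s is a funnel shift: it yields the low 32 bits of
// (((u64) hi << 32) | lo) >> (s & 31). For n < 32 the pair (tl, tr) computed
// above is exactly (low, high) of the rotated value; for n >= 32 the input
// words are swapped first and the shift count is reduced by 32.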
static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotr64_S (a, 64 - n);
}
#if CUDA_ARCH >= 500

static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));

  return r;
}

#endif
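
// The lop3.b32 immediate is the truth table of the desired three-input
// function f (a, b, c) evaluated at a = 0xf0, b = 0xcc, c = 0xaa. For the
// constants used here, for example:
//
//   0x96 == a ^ b ^ c                       (three-way XOR)
//   0xe8 == (a & b) | (a & c) | (b & c)     (majority)
//   0xca == (a & b) | (~a & c)              (bitwise select, a ? b : c)
//
// The remaining immediates (0x2d, 0x39, 0x59, 0xe4) encode other three-input
// functions by the same rule.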
static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
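
// Each selector nibble of c picks one byte out of the 8-byte pool {b:a}
// (nibbles 0-3 address a, 4-7 address b), lowest nibble first. For example:
//
//   __byte_perm_S (a, b, 0x5410) == ((b & 0xffff) << 16) | (a & 0xffff)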
static inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}

static inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
#if CUDA_ARCH >= 350

static inline u64x rotr64 (const u64x a, const u32 n)
{
  u64x r;

  u32 il;
  u32 ir;
  u32 tl;
  u32 tr;

  #if VECT_SIZE == 1

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  #endif

  #if VECT_SIZE >= 2

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s0));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s0) : "r"(tl), "r"(tr));

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s1));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s1) : "r"(tl), "r"(tr));

  #endif

  #if VECT_SIZE >= 4

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s2));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s2) : "r"(tl), "r"(tr));

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s3));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s3) : "r"(tl), "r"(tr));

  #endif

  #if VECT_SIZE >= 8

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s4));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s4) : "r"(tl), "r"(tr));

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s5));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s5) : "r"(tl), "r"(tr));

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s6));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s6) : "r"(tl), "r"(tr));

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s7));

  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s7) : "r"(tl), "r"(tr));

  #endif

  return r;
}
#else

static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

#endif
static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotr64 (a, (u64) 64 - n);
}
static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
  #endif

  #if VECT_SIZE >= 2
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
  #endif

  #if VECT_SIZE >= 4
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
  #endif

  #if VECT_SIZE >= 8
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
  asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
  #endif

  return r;
}
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));

  return r;
}
#if CUDA_ARCH >= 350

static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));

  return r;
}
#else

static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
  return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
}

#endif
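
// (0x76543210 >> ((c & 3) * 4)) & 0xffff builds the selector for a byte-wise
// shift: c == 0 gives 0x3210 (b unchanged), c == 1 gives 0x4321, c == 2 gives
// 0x5432 and c == 3 gives 0x6543, matching amd_bytealign's
// ((a:b) >> ((c & 3) * 8)) with b in the low half and a in the high half.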
#if CUDA_ARCH >= 500

static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
  #endif

  #if VECT_SIZE >= 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  #endif

  #if VECT_SIZE >= 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  #endif

  #if VECT_SIZE >= 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
  #endif

  return r;
}
static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
  #endif

  #if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  #endif

  #if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  #endif

  #if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
  #endif

  return r;
}
static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
  #endif

  #if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  #endif

  #if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  #endif

  #if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
  #endif

  return r;
}
static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
  #endif

  #if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  #endif

  #if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  #endif

  #if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
  #endif

  return r;
}
static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
  #endif

  #if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  #endif

  #if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  #endif

  #if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
  #endif

  return r;
}
static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
  #endif

  #if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  #endif

  #if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  #endif

  #if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
  #endif

  return r;
}
static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
{
  u32x r;

  #if VECT_SIZE == 1
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
  #endif

  #if VECT_SIZE == 2
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  #endif

  #if VECT_SIZE == 4
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  #endif

  #if VECT_SIZE == 8
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
  #endif

  return r;
}

#endif

#endif
#ifdef IS_GENERIC

static inline u32 swap32_S (const u32 v)
{
  return (as_uint (as_uchar4 (v).s3210));
}

static inline u64 swap64_S (const u64 v)
{
  return (as_ulong (as_uchar8 (v).s76543210));
}
static inline u32 rotr32_S (const u32 a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32 rotl32_S (const u32 a, const u32 n)
{
  return rotate (a, n);
}

static inline u64 rotr64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

static inline u64 rotl64_S (const u64 a, const u32 n)
{
  return rotate (a, (u64) n);
}
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

  return (u32) (tmp);
}
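
// Worked example of the pure-C fallback: with a = 0x11223344, b = 0x55667788
// and c = 1, tmp == 0x1122334455667788 >> 8 == 0x0011223344556677, so the
// result is 0x44556677: b shifted right one byte, refilled from a's low byte.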
static inline u32x swap32 (const u32x v)
{
  return ((v >> 24) & 0x000000ff)
       | ((v >>  8) & 0x0000ff00)
       | ((v <<  8) & 0x00ff0000)
       | ((v << 24) & 0xff000000);
}
static inline u64x swap64 (const u64x v)
{
  return ((v >> 56) & 0x00000000000000ff)
       | ((v >> 40) & 0x000000000000ff00)
       | ((v >> 24) & 0x0000000000ff0000)
       | ((v >>  8) & 0x00000000ff000000)
       | ((v <<  8) & 0x000000ff00000000)
       | ((v << 24) & 0x0000ff0000000000)
       | ((v << 40) & 0x00ff000000000000)
       | ((v << 56) & 0xff00000000000000);
}
static inline u32x rotr32 (const u32x a, const u32 n)
{
  return rotate (a, 32 - n);
}

static inline u32x rotl32 (const u32x a, const u32 n)
{
  return rotate (a, n);
}
static inline u64x rotr64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) 64 - n);
}

static inline u64x rotl64 (const u64x a, const u32 n)
{
  return rotate (a, (u64) n);
}
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
  #define BIT(x)      (1 << (x))
  #define BIT_MASK(x) (BIT (x) - 1)
  #define BFE(x,y,z)  (((x) >> (y)) & BIT_MASK (z))

  return BFE (a, b, c);
}
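
// BFE (x, y, z) extracts the z-bit field of x starting at bit y, e.g.:
//
//   __bfe (0x00abcdef, 4, 8) == 0xde
//
// which agrees with the hardware bfe/amd_bfe paths earlier in this file for
// in-range (y + z <= 32) arguments.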
static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
{
  #if VECT_SIZE == 1
  const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);

  return (u32x) (tmp);

  #elif VECT_SIZE == 2
  const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1);

  #elif VECT_SIZE == 4
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);

  #elif VECT_SIZE == 8
  const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);

  return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
  #endif
}

#endif
#elif defined _MD5H_
#elif defined _SHA1_
#elif defined _BCRYPT_
#elif defined _SHA256_
#elif defined _SHA384_
#elif defined _SHA512_
#elif defined _KECCAK_
#elif defined _RIPEMD160_
#elif defined _WHIRLPOOL_
#elif defined _GOST_
#elif defined _GOST2012_256_
#elif defined _GOST2012_512_
#elif defined _SAPB_
#elif defined _SAPG_
#elif defined _MYSQL323_
#elif defined _LOTUS5_
#elif defined _LOTUS6_
#elif defined _SCRYPT_
#elif defined _LOTUS8_
#elif defined _OFFICE2007_
#elif defined _OFFICE2010_
#elif defined _OFFICE2013_
#elif defined _OLDOFFICE01_
#elif defined _OLDOFFICE34_
#elif defined _SIPHASH_
#elif defined _PBKDF2_MD5_
#elif defined _PBKDF2_SHA1_
#elif defined _PBKDF2_SHA256_
#elif defined _PBKDF2_SHA512_
#elif defined _PDF17L8_
#elif defined _CRC32_
#elif defined _SEVEN_ZIP_
#elif defined _ANDROIDFDE_
#elif defined _DCC2_

#elif defined _MD5_SHA1_
#elif defined _SHA1_MD5_
#elif defined _NETNTLMV2_
#elif defined _KRB5PA_
#elif defined _CLOUDKEY_
#elif defined _SCRYPT_
#elif defined _PSAFE2_
#elif defined _LOTUS8_
#elif defined _RAR3_
#elif defined _SHA256_SHA1_
#elif defined _MS_DRSR_
#elif defined _ANDROIDFDE_SAMSUNG_
#elif defined _RAR5_
  u32 truecrypt_mdlen;

  u32 cry_master_buf[64];

  u32 public_key_buf[64];

  u32 userdomain_buf[64];

  u32 keyfile_buf[16];

  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[5];

  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[8];

  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[8];

  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[4];

  u32 encryptedVerifier[4];
  u32 encryptedVerifierHash[5];

} sha256crypt_tmp_t;

  u64 l_alt_result[8];

} sha512crypt_tmp_t;

} bitcoin_wallet_tmp_t;

} pbkdf2_sha1_tmp_t;

} pbkdf2_sha256_tmp_t;

} pbkdf2_sha512_tmp_t;

  u32 alignment_placeholder_1;
  u32 alignment_placeholder_2;
  u32 alignment_placeholder_3;