#endif
#ifdef IS_NV
-#define BF_ROUND(L,R,N) \
-{ \
- u32 tmp; \
- \
- tmp = S0[__bfe ((L), 24, 8)]; \
- tmp += S1[__bfe ((L), 16, 8)]; \
- tmp ^= S2[__bfe ((L), 8, 8)]; \
- tmp += S3[__bfe ((L), 0, 8)]; \
- \
- (R) ^= tmp ^ P[(N)]; \
+#define BF_ROUND(L,R,N) \
+{ \
+ u32 tmp; \
+ \
+ tmp = S0[__bfe_S ((L), 24, 8)]; \
+ tmp += S1[__bfe_S ((L), 16, 8)]; \
+ tmp ^= S2[__bfe_S ((L), 8, 8)]; \
+ tmp += S3[__bfe_S ((L), 0, 8)]; \
+ \
+ (R) ^= tmp ^ P[(N)]; \
}
#endif
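
The hunk above switches the NVIDIA build of the Blowfish round macro from the vectorized __bfe to the new scalar __bfe_S (defined further down), since L, tmp, and the S-box indices are all plain u32 here. For reference, a minimal host-side C sketch of the same round; bf_round_sketch is a hypothetical name, and the shift-and-mask stands in for the device-side bit-field extract:

#include <stdint.h>

typedef uint32_t u32;

/* one Blowfish round as in BF_ROUND above: ((S0 + S1) ^ S2) + S3 over
   the four bytes of L, folded into R with round key P[n];
   (L >> 24) & 0xff is exactly what __bfe_S (L, 24, 8) extracts */
static void bf_round_sketch (const u32 S0[256], const u32 S1[256],
                             const u32 S2[256], const u32 S3[256],
                             const u32 P[18], u32 *R, const u32 L,
                             const int n)
{
  u32 tmp;

  tmp  = S0[(L >> 24) & 0xff];
  tmp += S1[(L >> 16) & 0xff];
  tmp ^= S2[(L >>  8) & 0xff];
  tmp += S3[(L >>  0) & 0xff];

  *R ^= tmp ^ P[n];
}
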
u32 P[18];
+ #pragma unroll
for (u32 i = 0; i < 18; i++)
{
P[i] = tmps[gid].P[i];
__local u32 *S2 = S2_all[lid];
__local u32 *S3 = S3_all[lid];
+ #pragma unroll
for (u32 i = 0; i < 256; i++)
{
S0[i] = tmps[gid].S0[i];
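
These hunks add #pragma unroll to the fixed-trip copy loops that stage each work-item's bcrypt state (the 18-entry P array and the 256-entry S-boxes) from the global tmps buffer into private and __local storage; with the trip counts known at compile time, the compiler can flatten the loops into straight-line loads. The S2_all[lid] / S3_all[lid] lines show the per-lane partitioning of local memory, sketched below in plain C (the work-group size and names are assumptions for illustration):

#include <stdint.h>

typedef uint32_t u32;

#define LOCAL_SIZE 8 /* assumed work-group size, for illustration only */

/* stands in for the __local backing arrays S2_all / S3_all: one
   256-entry S-box slice per work-item in the work-group */
static u32 S2_all[LOCAL_SIZE][256];

/* each work-item indexes the shared array by its local id, so every
   lane gets a private slice without touching its neighbours' rows */
static u32 *sbox_for_lane (const u32 lid)
{
  return S2_all[lid];
}
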
return rotr64 (a, 64 - n);
}
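
The fragment ending above (presumably the tail of a left-rotate helper) expresses a 64-bit left rotate through the right rotate, using the identity rotl64 (a, n) == rotr64 (a, 64 - n). A self-contained C sketch of both helpers (names assumed; the & 63 masking keeps the shifts defined even at n == 0):

#include <stdint.h>

typedef uint64_t u64;

/* right-rotate by n; masking with 63 avoids the undefined shift by 64
   that would otherwise occur when n == 0 */
static u64 rotr64_sketch (const u64 a, const unsigned n)
{
  return (a >> (n & 63)) | (a << ((64 - n) & 63));
}

/* left rotate via the right rotate, as in the excerpt above */
static u64 rotl64_sketch (const u64 a, const unsigned n)
{
  return rotr64_sketch (a, 64 - n);
}
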
-inline u32 __bfe (const u32 a, const u32 b, const u32 c)
+inline u32x __bfe (const u32x a, const u32x b, const u32x c)
+{
+ return amd_bfe (a, b, c);
+}
+
+inline u32 __bfe_S (const u32 a, const u32 b, const u32 c)
{
return amd_bfe (a, b, c);
}
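
This hunk makes __bfe the vectorized (u32x) variant and adds a scalar __bfe_S, letting scalar call sites such as the NV BF_ROUND above opt out of the vector path; both are thin wrappers over AMD's amd_bfe, which extracts c bits of a starting at bit b. A hedged portable equivalent of the scalar case (bfe_portable is a hypothetical name), valid for the c < 32 extracts used here:

#include <assert.h>
#include <stdint.h>

typedef uint32_t u32;

/* portable stand-in for the scalar amd_bfe (a, b, c): take c bits of a
   starting at bit b; assumes b + c <= 32 and c < 32, which holds for
   the 8-bit extracts in BF_ROUND */
static u32 bfe_portable (const u32 a, const u32 b, const u32 c)
{
  return (a >> b) & ((1u << c) - 1u);
}

int main (void)
{
  const u32 L = 0xaabbccdd;

  assert (bfe_portable (L, 24, 8) == 0xaa);
  assert (bfe_portable (L, 16, 8) == 0xbb);
  assert (bfe_portable (L,  8, 8) == 0xcc);
  assert (bfe_portable (L,  0, 8) == 0xdd);

  return 0;
}
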
// balancing the workload turns out to be very efficient
- const u32 kernel_power_balance = kernel_accel * kernel_loops;
+ if (kernel_loops_min != kernel_loops_max)
+ {
+ const u32 kernel_power_balance = kernel_accel * kernel_loops;
- u32 sqrtv;
+ u32 sqrtv;
- for (sqrtv = 1; sqrtv < 0x100000; sqrtv++)
- {
- if ((sqrtv * sqrtv) >= kernel_power_balance) break;
- }
+ for (sqrtv = 1; sqrtv < 0x100000; sqrtv++)
+ {
+ if ((sqrtv * sqrtv) >= kernel_power_balance) break;
+ }
- const u32 kernel_accel_try = sqrtv;
- const u32 kernel_loops_try = sqrtv;
+ const u32 kernel_accel_try = sqrtv;
+ const u32 kernel_loops_try = sqrtv;
- if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min))
- {
- kernel_accel = kernel_accel_try;
- kernel_loops = kernel_loops_try;
+ if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min))
+ {
+ kernel_accel = kernel_accel_try;
+ kernel_loops = kernel_loops_try;
+ }
}
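
The new guard only rebalances when the loop count is actually tunable (kernel_loops_min != kernel_loops_max). The rebalancing keeps the product kernel_accel * kernel_loops roughly constant while pushing both factors toward its integer square root, evening out the two launch dimensions. A standalone C sketch of the same logic (the bound parameters and the example values are illustrative, not real tuning limits):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* move accel and loops toward sqrt (accel * loops), as in the hunk
   above: a linear search finds the smallest sqrtv with
   sqrtv * sqrtv >= product, taken only if it stays inside the bounds */
static void balance_workload (u32 *kernel_accel, u32 *kernel_loops,
                              const u32 kernel_accel_max,
                              const u32 kernel_loops_min,
                              const u32 kernel_loops_max)
{
  if (kernel_loops_min == kernel_loops_max) return; // nothing to tune

  const u32 kernel_power_balance = *kernel_accel * *kernel_loops;

  u32 sqrtv;

  for (sqrtv = 1; sqrtv < 0x100000; sqrtv++)
  {
    if ((sqrtv * sqrtv) >= kernel_power_balance) break;
  }

  const u32 kernel_accel_try = sqrtv;
  const u32 kernel_loops_try = sqrtv;

  if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min))
  {
    *kernel_accel = kernel_accel_try;
    *kernel_loops = kernel_loops_try;
  }
}

int main (void)
{
  u32 accel = 1024, loops = 4; // heavily skewed starting point

  balance_workload (&accel, &loops, 1024, 1, 1024);

  printf ("accel=%u loops=%u\n", accel, loops); // prints accel=64 loops=64

  return 0;
}
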
// reset fake words