#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
+// Buggy drivers: some AMD OpenCL compilers appear to miscompile these transforms
+// unless the working state is kept in volatile variables, hence STATE_DECL below.
+
+#ifdef IS_AMD
+#define STATE_DECL volatile
+#else
+#define STATE_DECL
+#endif
+
#define PUTCHAR64_BE(a,p,c) ((u8 *)(a))[(p) ^ 7] = (u8) (c)
#define GETCHAR64_BE(a,p) ((u8 *)(a))[(p) ^ 7]
u64 we_t = w[14];
u64 wf_t = w[15];
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
+ STATE_DECL u64 a = digest[0];
+ STATE_DECL u64 b = digest[1];
+ STATE_DECL u64 c = digest[2];
+ STATE_DECL u64 d = digest[3];
+ STATE_DECL u64 e = digest[4];
+ STATE_DECL u64 f = digest[5];
+ STATE_DECL u64 g = digest[6];
+ STATE_DECL u64 h = digest[7];
#define ROUND_EXPAND() \
{ \
digest[7] += h;
}
-#ifdef IS_AMD
-static void sha512_transform_workaround (const u64 w[16], u64 digest[8])
-{
- u64 w0_t = w[ 0];
- u64 w1_t = w[ 1];
- u64 w2_t = w[ 2];
- u64 w3_t = w[ 3];
- u64 w4_t = w[ 4];
- u64 w5_t = w[ 5];
- u64 w6_t = w[ 6];
- u64 w7_t = w[ 7];
- u64 w8_t = w[ 8];
- u64 w9_t = w[ 9];
- u64 wa_t = w[10];
- u64 wb_t = w[11];
- u64 wc_t = w[12];
- u64 wd_t = w[13];
- u64 we_t = w[14];
- u64 wf_t = w[15];
-
- u64 a = digest[0];
- u64 b = digest[1];
- u64 c = digest[2];
- u64 d = digest[3];
- u64 e = digest[4];
- u64 f = digest[5];
- u64 g = digest[6];
- u64 h = digest[7];
-
- #define ROUND_EXPAND_WO() \
- { \
- w0_t = SHA512_EXPAND_WO (we_t, w9_t, w1_t, w0_t); \
- w1_t = SHA512_EXPAND_WO (wf_t, wa_t, w2_t, w1_t); \
- w2_t = SHA512_EXPAND_WO (w0_t, wb_t, w3_t, w2_t); \
- w3_t = SHA512_EXPAND_WO (w1_t, wc_t, w4_t, w3_t); \
- w4_t = SHA512_EXPAND_WO (w2_t, wd_t, w5_t, w4_t); \
- w5_t = SHA512_EXPAND_WO (w3_t, we_t, w6_t, w5_t); \
- w6_t = SHA512_EXPAND_WO (w4_t, wf_t, w7_t, w6_t); \
- w7_t = SHA512_EXPAND_WO (w5_t, w0_t, w8_t, w7_t); \
- w8_t = SHA512_EXPAND_WO (w6_t, w1_t, w9_t, w8_t); \
- w9_t = SHA512_EXPAND_WO (w7_t, w2_t, wa_t, w9_t); \
- wa_t = SHA512_EXPAND_WO (w8_t, w3_t, wb_t, wa_t); \
- wb_t = SHA512_EXPAND_WO (w9_t, w4_t, wc_t, wb_t); \
- wc_t = SHA512_EXPAND_WO (wa_t, w5_t, wd_t, wc_t); \
- wd_t = SHA512_EXPAND_WO (wb_t, w6_t, we_t, wd_t); \
- we_t = SHA512_EXPAND_WO (wc_t, w7_t, wf_t, we_t); \
- wf_t = SHA512_EXPAND_WO (wd_t, w8_t, w0_t, wf_t); \
- }
-
- ROUND_STEP (0);
-
- for (int i = 16; i < 80; i += 16)
- {
- ROUND_EXPAND_WO (); ROUND_STEP (i);
- }
-
- digest[0] += a;
- digest[1] += b;
- digest[2] += c;
- digest[3] += d;
- digest[4] += e;
- digest[5] += f;
- digest[6] += g;
- digest[7] += h;
-}
-#endif
-
static void sha512_init (sha512_ctx_t *sha512_ctx)
{
sha512_ctx->state[0] = SHA512M_A;
PUTCHAR64_BE (sha512_ctx->buf, pos++, GETCHAR64_BE (buf, i));
}
- #ifdef IS_AMD
- sha512_transform_workaround (sha512_ctx->buf, sha512_ctx->state);
- #endif
-
- #ifdef IS_NV
- sha512_transform (sha512_ctx->buf, sha512_ctx->state);
- #endif
-
- #ifdef IS_GENERIC
sha512_transform (sha512_ctx->buf, sha512_ctx->state);
- #endif
len -= cnt;
static inline u64 rotr64_S (const u64 a, const u32 n)
{
- u64 r;
-
#if DEVICE_TYPE == DEVICE_TYPE_CPU
- r = rotate (a, (u64) 64 - n);
+ const u64 r = rotate (a, (u64) 64 - n);
#else
- uint2 a2 = as_uint2 (a);
-
- uint2 t;
+ const u32 a0 = h32_from_64_S (a);
+ const u32 a1 = l32_from_64_S (a);
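+ // amd_bitalign (hi, lo, s) returns the low 32 bits of the 64-bit value (hi:lo)
+ // shifted right by s (taken mod 32), so the 64-bit rotate is assembled from two
+ // such funnel shifts, swapping the halves and adjusting by 32 when n >= 32.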
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32)
- : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32)
- : amd_bitalign (a2.s0, a2.s1, n);
+ const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
+ const u32 t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);
- r = as_ulong (t);
+ const u64 r = hl32_to_64_S (t0, t1);
#endif
static inline u64x rotr64 (const u64x a, const u32 n)
{
- u64x r;
-
#if DEVICE_TYPE == DEVICE_TYPE_CPU
- r = rotate (a, (u64) 64 - n);
+ const u64x r = rotate (a, (u64) 64 - n);
#else
- uint2 a2;
- uint2 t;
-
- #if VECT_SIZE == 1
+ const u32x a0 = h32_from_64 (a);
+ const u32x a1 = l32_from_64 (a);
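+ // h32_from_64/l32_from_64 and hl32_to_64 work per component on the vector
+ // types, so this single path replaces the per-VECT_SIZE unrolled blocks
+ // removed below.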
- a2 = as_uint2 (a);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r = as_ulong (t);
-
- #elif VECT_SIZE == 2
-
- {
- a2 = as_uint2 (a.s0);
+ const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
+ const u32x t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s0 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s1);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s1 = as_ulong (t);
- }
-
- #elif VECT_SIZE == 4
-
- {
- a2 = as_uint2 (a.s0);
+ const u64x r = hl32_to_64 (t0, t1);
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s0 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s1);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s1 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s2);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s2 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s3);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s3 = as_ulong (t);
- }
-
- #elif VECT_SIZE == 8
-
- {
- a2 = as_uint2 (a.s0);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s0 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s1);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s1 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s2);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s2 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s3);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s3 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s4);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s4 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s5);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s5 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s6);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s6 = as_ulong (t);
- }
-
- {
- a2 = as_uint2 (a.s7);
-
- t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
- t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
- r.s7 = as_ulong (t);
- }
-
- #endif
#endif
return r;
#define SHA384_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
{ \
- u64x temp0; \
- temp0 = K; \
- temp0 += x; \
- temp0 += h; \
- temp0 += SHA384_S1 (e); \
- temp0 += F0 (e, f, g); \
- d += temp0; \
- h = SHA384_S0 (a); \
- h += F1 (a, b, c); \
- h += temp0; \
+ h += K; \
+ h += x; \
+ h += SHA384_S1 (e); \
+ h += F0 (e, f, g); \
+ d += h; \
+ h += SHA384_S0 (a); \
+ h += F1 (a, b, c); \
}
#define SHA384_EXPAND(x,y,z,w) (SHA384_S3 (x) + y + SHA384_S2 (z) + w)
#define SHA512_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
{ \
- u64x temp0; \
- temp0 = K; \
- temp0 += x; \
- temp0 += h; \
- temp0 += SHA512_S1 (e); \
- temp0 += F0 (e, f, g); \
- d += temp0; \
- h = SHA512_S0 (a); \
- h += F1 (a, b, c); \
- h += temp0; \
+ h += K; \
+ h += x; \
+ h += SHA512_S1 (e); \
+ h += F0 (e, f, g); \
+ d += h; \
+ h += SHA512_S0 (a); \
+ h += F1 (a, b, c); \
}
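+// The rewritten SHA384_STEP/SHA512_STEP bodies accumulate directly into h
+// instead of a separate temp0: when "d += h" executes, h holds exactly the old
+// temp0 value, so the result is unchanged and one temporary is dropped.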
#define SHA512_EXPAND(x,y,z,w) (SHA512_S3 (x) + y + SHA512_S2 (z) + w)
-
-#define SHA512_S2_WO(x) (rotate ((x), 64- 1ull) ^ rotate ((x), 64- 8ull) ^ SHIFT_RIGHT_64 ((x), 7))
-#define SHA512_S3_WO(x) (rotate ((x), 64-19ull) ^ rotate ((x), 64-61ull) ^ SHIFT_RIGHT_64 ((x), 6))
-
-#define SHA512_EXPAND_WO(x,y,z,w) (SHA512_S3_WO (x) + y + SHA512_S2_WO (z) + w)
#endif
#ifdef _RIPEMD160_