From: jsteube Date: Tue, 15 Dec 2015 15:50:21 +0000 (+0100) Subject: Fix more append_* functions in kernels X-Git-Tag: v3.00-beta~584^2~89 X-Git-Url: https://www.flypig.org.uk/git/?a=commitdiff_plain;h=2283d5c843c425aebfdad4a521b1e0ad85fae387;p=hashcat.git Fix more append_* functions in kernels --- diff --git a/OpenCL/common.c b/OpenCL/common.c index df7a2bb..fcbc56c 100644 --- a/OpenCL/common.c +++ b/OpenCL/common.c @@ -177,7 +177,6 @@ static void undo_unicode (const u32 in1[4], const u32 in2[4], u32 out[4]) #endif } -// before: append_0x01_1 static void append_0x01_1x4 (u32 w0[4], const u32 offset) { switch (offset) @@ -248,7 +247,6 @@ static void append_0x01_1x4 (u32 w0[4], const u32 offset) } } -// before: append_0x01_2 static void append_0x01_2x4 (u32 w0[4], u32 w1[4], const u32 offset) { switch (offset) @@ -383,7 +381,6 @@ static void append_0x01_2x4 (u32 w0[4], u32 w1[4], const u32 offset) } } -// before: append_0x01_3 static void append_0x01_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { switch (offset) @@ -582,7 +579,6 @@ static void append_0x01_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) } } -// before: append_0x01_4 static void append_0x01_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { switch (offset) @@ -845,7 +841,6 @@ static void append_0x01_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u } } -// before: append_0x01_8 static void append_0x01_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { switch (offset) @@ -1364,7 +1359,6 @@ static void append_0x01_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[ } } -// before: append_0x02_1 static void append_0x02_1x4 (u32 w0[4], const u32 offset) { switch (offset) @@ -1435,7 +1429,6 @@ static void append_0x02_1x4 (u32 w0[4], const u32 offset) } } -// before: append_0x02_2 static void append_0x02_2x4 (u32 w0[4], u32 w1[4], const u32 offset) { switch (offset) @@ -1570,7 +1563,6 @@ static 
void append_0x02_2x4 (u32 w0[4], u32 w1[4], const u32 offset) } } -// before: append_0x02_3 static void append_0x02_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { switch (offset) @@ -1769,7 +1761,6 @@ static void append_0x02_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) } } -// before: append_0x02_4 static void append_0x02_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { switch (offset) @@ -2032,7 +2023,6 @@ static void append_0x02_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u } } -// before: append_0x02_8 static void append_0x02_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { switch (offset) @@ -2551,7 +2541,6 @@ static void append_0x02_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[ } } -// before: append_0x80_1 static void append_0x80_1x4 (u32 w0[4], const u32 offset) { switch (offset) @@ -2622,7 +2611,6 @@ static void append_0x80_1x4 (u32 w0[4], const u32 offset) } } -// before: append_0x80_2 static void append_0x80_2x4 (u32 w0[4], u32 w1[4], const u32 offset) { switch (offset) @@ -2757,7 +2745,6 @@ static void append_0x80_2x4 (u32 w0[4], u32 w1[4], const u32 offset) } } -// before: append_0x80_3 static void append_0x80_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { switch (offset) @@ -2956,7 +2943,6 @@ static void append_0x80_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) } } -// before: append_0x80_4 static void append_0x80_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { switch (offset) @@ -3219,7 +3205,6 @@ static void append_0x80_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u } } -// before: append_0x80_8 static void append_0x80_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { switch (offset) @@ -3738,4245 +3723,4246 @@ static void append_0x80_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[ } } -// before: 
device_memcat2L -static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2]) +static void append_0x80_1x16 (u32 w[16], const u32 offset) { switch (offset) { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - break; - - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - break; - - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - break; - - case 4: - dst0[1] = src_r0[0]; - break; - - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - break; - - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - break; - - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; + case 0: + w[ 0] = 0x80; break; - } -} -// before: device_memcat4L -static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4]) -{ - switch (offset) - { case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; + w[ 0] = w[ 0] | 0x8000; break; case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; + w[ 0] = w[ 0] | 0x800000; break; case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; + w[ 0] = w[ 0] | 0x80000000; break; case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; + w[ 1] = 0x80; break; case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; + w[ 1] = w[ 1] | 0x8000; break; case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 
16 | src_r0[2] << 16; + w[ 1] = w[ 1] | 0x800000; break; case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; + w[ 1] = w[ 1] | 0x80000000; break; case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; + w[ 2] = 0x80; break; case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; + w[ 2] = w[ 2] | 0x8000; break; case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; + w[ 2] = w[ 2] | 0x800000; break; case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; + w[ 2] = w[ 2] | 0x80000000; break; case 12: - dst0[3] = src_r0[0]; + w[ 3] = 0x80; break; case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; + w[ 3] = w[ 3] | 0x8000; break; case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; + w[ 3] = w[ 3] | 0x800000; break; case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; + w[ 3] = w[ 3] | 0x80000000; break; - } -} -// before: device_memcat8L -static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4]) -{ - switch (offset) - { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24; + case 16: + w[ 4] = 0x80; break; - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16; + case 17: + w[ 4] = w[ 4] | 0x8000; break; - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8; + case 18: + w[ 4] = w[ 4] | 0x800000; break; - case 4: - 
dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; + case 19: + w[ 4] = w[ 4] | 0x80000000; break; - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24; + case 20: + w[ 5] = 0x80; break; - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16; + case 21: + w[ 5] = w[ 5] | 0x8000; break; - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8; + case 22: + w[ 5] = w[ 5] | 0x800000; break; - case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - dst1[1] = src_r0[3]; + case 23: + w[ 5] = w[ 5] | 0x80000000; break; - case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24; + case 24: + w[ 6] = 0x80; break; - case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16; + case 25: + w[ 6] = w[ 6] | 0x8000; break; - case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8; + case 26: + w[ 6] = w[ 6] | 0x800000; break; - case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; + case 27: + w[ 6] = w[ 6] | 0x80000000; break; 
- case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24; + case 28: + w[ 7] = 0x80; break; - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16; + case 29: + w[ 7] = w[ 7] | 0x8000; break; - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8; + case 30: + w[ 7] = w[ 7] | 0x800000; break; - case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; + case 31: + w[ 7] = w[ 7] | 0x80000000; break; - case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; + case 32: + w[ 8] = 0x80; break; - case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; + case 33: + w[ 8] = w[ 8] | 0x8000; break; - case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; + case 34: + w[ 8] = w[ 8] | 0x800000; break; - case 20: - dst1[1] = src_r0[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; + case 35: + w[ 8] = w[ 8] | 0x80000000; break; - case 21: - dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; - break; - - case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | 
src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; - break; - - case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; + case 36: + w[ 9] = 0x80; break; - case 24: - dst1[2] = src_r0[0]; - dst1[3] = src_r0[1]; + case 37: + w[ 9] = w[ 9] | 0x8000; break; - case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; + case 38: + w[ 9] = w[ 9] | 0x800000; break; - case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; + case 39: + w[ 9] = w[ 9] | 0x80000000; break; - case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; + case 40: + w[10] = 0x80; break; - case 28: - dst1[3] = src_r0[0]; + case 41: + w[10] = w[10] | 0x8000; break; - case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; + case 42: + w[10] = w[10] | 0x800000; break; - case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; + case 43: + w[10] = w[10] | 0x80000000; break; - case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; + case 44: + w[11] = 0x80; break; - } -} -// before: device_memcat12L -static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4]) -{ - switch (offset) - { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24; + case 45: + w[11] = w[11] | 0x8000; break; - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16; + case 46: + w[11] = w[11] | 0x800000; break; - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = 
src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8; + case 47: + w[11] = w[11] | 0x80000000; break; - case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; + case 48: + w[12] = 0x80; break; - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24; + case 49: + w[12] = w[12] | 0x8000; break; - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16; + case 50: + w[12] = w[12] | 0x800000; break; - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8; + case 51: + w[12] = w[12] | 0x80000000; break; - case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - dst1[1] = src_r0[3]; + case 52: + w[13] = 0x80; break; - case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24; + case 53: + w[13] = w[13] | 0x8000; break; - case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16; + case 54: + w[13] = w[13] | 0x800000; break; - case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8; + case 55: + w[13] = w[13] | 
0x80000000; break; - case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; + case 56: + w[14] = 0x80; break; - case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24; + case 57: + w[14] = w[14] | 0x8000; break; - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16; + case 58: + w[14] = w[14] | 0x800000; break; - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8; + case 59: + w[14] = w[14] | 0x80000000; break; - case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; + case 60: + w[15] = 0x80; break; - case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[0] = src_r0[3] >> 24; + case 61: + w[15] = w[15] | 0x8000; break; - case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[0] = src_r0[3] >> 16; + case 62: + w[15] = w[15] | 0x800000; break; - case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[0] = src_r0[3] >> 8; + case 63: + w[15] = w[15] | 0x80000000; break; + } +} - case 20: - dst1[1] = src_r0[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; - dst2[0] = src_r0[3]; - 
break; +static void switch_buffer_by_offset (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +{ + #ifdef IS_AMD + const int offset_mod_4 = offset & 3; - case 21: - dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[1] = src_r0[3] >> 24; - break; + const int offset_minus_4 = 4 - offset; - case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[1] = src_r0[3] >> 16; - break; + switch (offset / 4) + { + case 0: + w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4); + w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); - case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[1] = src_r0[3] >> 8; - break; + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + 
w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 24: - dst1[2] = src_r0[0]; - dst1[3] = src_r0[1]; - dst2[0] = src_r0[2]; - dst2[1] = src_r0[3]; break; - case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[2] = src_r0[3] >> 24; - break; + case 1: + w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4); + w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w0[0] = 0; - case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[2] = src_r0[3] >> 16; - break; + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[2] = src_r0[3] >> 8; break; - case 28: - dst1[3] = src_r0[0]; - dst2[0] = src_r0[1]; - dst2[1] = src_r0[2]; - dst2[2] 
= src_r0[3]; - break; + case 2: + w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4); + w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; - case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; - dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[3] = src_r0[3] >> 24; - break; + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; - dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[3] = src_r0[3] >> 16; break; - case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; - dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[3] = src_r0[3] >> 8; - break; + case 3: + w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4); + w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); 
+ w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 32: - dst2[0] = src_r0[0]; - dst2[1] = src_r0[1]; - dst2[2] = src_r0[2]; - dst2[3] = src_r0[3]; - break; + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 33: - dst2[0] = src_l2[0] | src_r0[0] << 8; - dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; break; - case 34: - dst2[0] = src_l2[0] | src_r0[0] << 16; - dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; - break; + case 4: + w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4); + w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 35: - dst2[0] = src_l2[0] | src_r0[0] << 24; - dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[3] = 
src_r0[2] >> 8 | src_r0[3] << 24; - break; + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 36: - dst2[1] = src_r0[0]; - dst2[2] = src_r0[1]; - dst2[3] = src_r0[2]; break; - case 37: - dst2[1] = src_l2[1] | src_r0[0] << 8; - dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; - break; + case 5: + w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4); + w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 38: - dst2[1] = src_l2[1] | src_r0[0] << 16; - dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; - break; + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 39: - dst2[1] = src_l2[1] | src_r0[0] << 24; - dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; - case 40: - dst2[2] = src_r0[0]; - dst2[3] = src_r0[1]; - break; + case 6: + w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4); + w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w2[1] = 
amd_bytealign (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 41: - dst2[2] = src_l2[2] | src_r0[0] << 8; - dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; - break; + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 42: - dst2[2] = src_l2[2] | src_r0[0] << 16; - dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; - case 43: - dst2[2] = src_l2[2] | src_r0[0] << 24; - dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; + case 7: + w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4); + w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; - case 44: - dst2[3] = src_r0[0]; + case 8: + w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4); + w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] 
= 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; - case 45: - dst2[3] = src_l2[3] | src_r0[0] << 8; + case 9: + w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4); + w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; - case 46: - dst2[3] = src_l2[3] | src_r0[0] << 16; + case 10: + w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4); + w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; - case 47: - dst2[3] = src_l2[3] | src_r0[0] << 24; - break; - } -} + case 11: + w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4); + w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } -// before: device_memcat12L 
-static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4]) -{ - switch (offset) - { - case 0: - dst0[0] = src_r0[0]; - dst0[1] = src_r0[1]; - dst0[2] = src_r0[2]; - dst0[3] = src_r0[3]; - dst1[0] = src_r1[0]; - dst1[1] = src_r1[1]; - dst1[2] = src_r1[2]; - dst1[3] = src_r1[3]; break; - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8; - dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8; - dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[0] = src_r1[3] >> 24; - break; + case 12: + w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4); + w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16; - dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16; - dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[0] = src_r1[3] >> 16; - break; + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24; - dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24; - dst1[3] = src_r1[2] >> 8 | 
src_r1[3] << 24; - dst2[0] = src_r1[3] >> 8; break; - case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; - dst1[1] = src_r1[0]; - dst1[2] = src_r1[1]; - dst1[3] = src_r1[2]; - dst2[0] = src_r1[3]; - break; + case 13: + w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4); + w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8; - dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[1] = src_r1[3] >> 24; - break; + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = 0; + } - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16; - dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[1] = src_r1[3] >> 16; break; + } + #endif - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24; - dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[1] = src_r1[3] >> 8; - break; + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); - case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - 
dst1[1] = src_r0[3]; - dst1[2] = src_r1[0]; - dst1[3] = src_r1[1]; - dst2[0] = src_r1[2]; - dst2[1] = src_r1[3]; - break; + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[2] = src_r1[3] >> 24; - break; + switch (offset / 4) + { + case 0: + w3[1] = __byte_perm (w3[0], w3[1], selector); + w3[0] = __byte_perm (w2[3], w3[0], selector); + w2[3] = __byte_perm (w2[2], w2[3], selector); + w2[2] = __byte_perm (w2[1], w2[2], selector); + w2[1] = __byte_perm (w2[0], w2[1], selector); + w2[0] = __byte_perm (w1[3], w2[0], selector); + w1[3] = __byte_perm (w1[2], w1[3], selector); + w1[2] = __byte_perm (w1[1], w1[2], selector); + w1[1] = __byte_perm (w1[0], w1[1], selector); + w1[0] = __byte_perm (w0[3], w1[0], selector); + w0[3] = __byte_perm (w0[2], w0[3], selector); + w0[2] = __byte_perm (w0[1], w0[2], selector); + w0[1] = __byte_perm (w0[0], w0[1], selector); + w0[0] = __byte_perm ( 0, w0[0], selector); - case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[2] = src_r1[3] >> 16; break; - case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[3] = src_r1[0] >> 8 | src_r1[1] 
<< 24; - dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[2] = src_r1[3] >> 8; - break; + case 1: + w3[1] = __byte_perm (w2[3], w3[0], selector); + w3[0] = __byte_perm (w2[2], w2[3], selector); + w2[3] = __byte_perm (w2[1], w2[2], selector); + w2[2] = __byte_perm (w2[0], w2[1], selector); + w2[1] = __byte_perm (w1[3], w2[0], selector); + w2[0] = __byte_perm (w1[2], w1[3], selector); + w1[3] = __byte_perm (w1[1], w1[2], selector); + w1[2] = __byte_perm (w1[0], w1[1], selector); + w1[1] = __byte_perm (w0[3], w1[0], selector); + w1[0] = __byte_perm (w0[2], w0[3], selector); + w0[3] = __byte_perm (w0[1], w0[2], selector); + w0[2] = __byte_perm (w0[0], w0[1], selector); + w0[1] = __byte_perm ( 0, w0[0], selector); + w0[0] = 0; - case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; - dst1[3] = src_r1[0]; - dst2[0] = src_r1[1]; - dst2[1] = src_r1[2]; - dst2[2] = src_r1[3]; break; - case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[2] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[3] = src_r1[3] >> 24; - break; + case 2: + w3[1] = __byte_perm (w2[2], w2[3], selector); + w3[0] = __byte_perm (w2[1], w2[2], selector); + w2[3] = __byte_perm (w2[0], w2[1], selector); + w2[2] = __byte_perm (w1[3], w2[0], selector); + w2[1] = __byte_perm (w1[2], w1[3], selector); + w2[0] = __byte_perm (w1[1], w1[2], selector); + w1[3] = __byte_perm (w1[0], w1[1], selector); + w1[2] = __byte_perm (w0[3], w1[0], selector); + w1[1] = __byte_perm (w0[2], w0[3], selector); + w1[0] = __byte_perm (w0[1], w0[2], selector); + w0[3] = __byte_perm (w0[0], w0[1], selector); + w0[2] = __byte_perm ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 
0; - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[3] = src_r1[3] >> 16; break; - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[3] = src_r1[3] >> 8; - break; + case 3: + w3[1] = __byte_perm (w2[1], w2[2], selector); + w3[0] = __byte_perm (w2[0], w2[1], selector); + w2[3] = __byte_perm (w1[3], w2[0], selector); + w2[2] = __byte_perm (w1[2], w1[3], selector); + w2[1] = __byte_perm (w1[1], w1[2], selector); + w2[0] = __byte_perm (w1[0], w1[1], selector); + w1[3] = __byte_perm (w0[3], w1[0], selector); + w1[2] = __byte_perm (w0[2], w0[3], selector); + w1[1] = __byte_perm (w0[1], w0[2], selector); + w1[0] = __byte_perm (w0[0], w0[1], selector); + w0[3] = __byte_perm ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; - dst2[0] = src_r1[0]; - dst2[1] = src_r1[1]; - dst2[2] = src_r1[2]; - dst2[3] = src_r1[3]; break; - case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8; - break; + case 4: 
+ w3[1] = __byte_perm (w2[0], w2[1], selector); + w3[0] = __byte_perm (w1[3], w2[0], selector); + w2[3] = __byte_perm (w1[2], w1[3], selector); + w2[2] = __byte_perm (w1[1], w1[2], selector); + w2[1] = __byte_perm (w1[0], w1[1], selector); + w2[0] = __byte_perm (w0[3], w1[0], selector); + w1[3] = __byte_perm (w0[2], w0[3], selector); + w1[2] = __byte_perm (w0[1], w0[2], selector); + w1[1] = __byte_perm (w0[0], w0[1], selector); + w1[0] = __byte_perm ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16; break; - case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24; - break; + case 5: + w3[1] = __byte_perm (w1[3], w2[0], selector); + w3[0] = __byte_perm (w1[2], w1[3], selector); + w2[3] = __byte_perm (w1[1], w1[2], selector); + w2[2] = __byte_perm (w1[0], w1[1], selector); + w2[1] = __byte_perm (w0[3], w1[0], selector); + w2[0] = __byte_perm (w0[2], w0[3], selector); + w1[3] = __byte_perm (w0[1], w0[2], selector); + w1[2] = __byte_perm (w0[0], w0[1], selector); + w1[1] = __byte_perm ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 20: - dst1[1] = src_r1[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; - dst2[0] = src_r0[3]; - dst2[1] = src_r1[0]; - dst2[2] = src_r1[1]; - dst2[3] = src_r1[2]; break; - case 21: - 
dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8; - break; + case 6: + w3[1] = __byte_perm (w1[2], w1[3], selector); + w3[0] = __byte_perm (w1[1], w1[2], selector); + w2[3] = __byte_perm (w1[0], w1[1], selector); + w2[2] = __byte_perm (w0[3], w1[0], selector); + w2[1] = __byte_perm (w0[2], w0[3], selector); + w2[0] = __byte_perm (w0[1], w0[2], selector); + w1[3] = __byte_perm (w0[0], w0[1], selector); + w1[2] = __byte_perm ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16; break; - case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24; - break; + case 7: + w3[1] = __byte_perm (w1[1], w1[2], selector); + w3[0] = __byte_perm (w1[0], w1[1], selector); + w2[3] = __byte_perm (w0[3], w1[0], selector); + w2[2] = __byte_perm (w0[2], w0[3], selector); + w2[1] = __byte_perm (w0[1], w0[2], selector); + w2[0] = __byte_perm (w0[0], w0[1], selector); + w1[3] = __byte_perm ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 24: - dst1[2] = src_r1[0]; - dst1[3] = src_r0[1]; - dst2[0] = src_r0[2]; - dst2[1] = 
src_r0[3]; - dst2[2] = src_r1[0]; - dst2[3] = src_r1[1]; break; - case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8; - break; + case 8: + w3[1] = __byte_perm (w1[0], w1[1], selector); + w3[0] = __byte_perm (w0[3], w1[0], selector); + w2[3] = __byte_perm (w0[2], w0[3], selector); + w2[2] = __byte_perm (w0[1], w0[2], selector); + w2[1] = __byte_perm (w0[0], w0[1], selector); + w2[0] = __byte_perm ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16; break; - case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24; - break; + case 9: + w3[1] = __byte_perm (w0[3], w1[0], selector); + w3[0] = __byte_perm (w0[2], w0[3], selector); + w2[3] = __byte_perm (w0[1], w0[2], selector); + w2[2] = __byte_perm (w0[0], w0[1], selector); + w2[1] = __byte_perm ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 28: - dst1[3] = src_r1[0]; - dst2[0] = src_r0[1]; - dst2[1] = src_r0[2]; - dst2[2] = src_r0[3]; - dst2[3] = src_r1[0]; break; - case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; - dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[2] = src_r0[2] 
>> 24 | src_r0[3] << 8; - dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8; - break; + case 10: + w3[1] = __byte_perm (w0[2], w0[3], selector); + w3[0] = __byte_perm (w0[1], w0[2], selector); + w2[3] = __byte_perm (w0[0], w0[1], selector); + w2[2] = __byte_perm ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; - dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16; break; - case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; - dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24; + case 11: + w3[1] = __byte_perm (w0[1], w0[2], selector); + w3[0] = __byte_perm (w0[0], w0[1], selector); + w2[3] = __byte_perm ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 32: - dst2[0] = src_r0[0]; - dst2[1] = src_r0[1]; - dst2[2] = src_r0[2]; - dst2[3] = src_r0[3]; + case 12: + w3[1] = __byte_perm (w0[0], w0[1], selector); + w3[0] = __byte_perm ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 33: - dst2[0] = src_l2[0] | src_r0[0] << 8; - dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; + case 13: + w3[1] = __byte_perm ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + 
#endif +} - case 34: - dst2[0] = src_l2[0] | src_r0[0] << 16; - dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; +static void switch_buffer_by_offset_be (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +{ + #ifdef IS_AMD + switch (offset / 4) + { + case 0: + w3[2] = amd_bytealign (w3[1], 0, offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); break; - case 35: - dst2[0] = src_l2[0] | src_r0[0] << 24; - dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; + case 1: + w3[2] = amd_bytealign (w3[0], 0, offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); + w0[0] = 0; break; - case 36: - dst2[1] = src_r0[0]; 
- dst2[2] = src_r0[1]; - dst2[3] = src_r0[2]; + case 2: + w3[2] = amd_bytealign (w2[3], 0, offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; break; - case 37: - dst2[1] = src_l2[1] | src_r0[0] << 8; - dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; + case 3: + w3[2] = amd_bytealign (w2[2], 0, offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 38: - dst2[1] = src_l2[1] | src_r0[0] << 16; - dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; + case 4: + w3[2] = amd_bytealign (w2[1], 0, offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], 
w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 39: - dst2[1] = src_l2[1] | src_r0[0] << 24; - dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; + case 5: + w3[2] = amd_bytealign (w2[0], 0, offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 40: - dst2[2] = src_r0[0]; - dst2[3] = src_r0[1]; + case 6: + w3[2] = amd_bytealign (w1[3], 0, offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 41: - dst2[2] = src_l2[2] | src_r0[0] << 8; - dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; + case 7: + w3[2] = amd_bytealign (w1[2], 0, offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 
0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 42: - dst2[2] = src_l2[2] | src_r0[0] << 16; - dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; + case 8: + w3[2] = amd_bytealign (w1[1], 0, offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 43: - dst2[2] = src_l2[2] | src_r0[0] << 24; - dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; + case 9: + w3[2] = amd_bytealign (w1[0], 0, offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 44: - dst2[3] = src_r0[0]; + case 10: + w3[2] = amd_bytealign (w0[3], 0, offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 45: - dst2[3] = src_l2[3] | src_r0[0] << 8; + case 11: + w3[2] = amd_bytealign (w0[2], 0, offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 46: - dst2[3] = src_l2[3] | src_r0[0] << 
16; + case 12: + w3[2] = amd_bytealign (w0[1], 0, offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 47: - dst2[3] = src_l2[3] | src_r0[0] << 24; + case 13: + w3[2] = amd_bytealign (w0[0], 0, offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } -} + #endif -// before: memcat16_9 -static void memcat_c15_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) -{ - switch (offset) + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + switch (offset / 4) { case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; - w2[0] = append2[0]; + w3[1] = __byte_perm (w3[1], w3[0], selector); + w3[0] = __byte_perm (w3[0], w2[3], selector); + w2[3] = __byte_perm (w2[3], w2[2], selector); + w2[2] = __byte_perm (w2[2], w2[1], selector); + w2[1] = __byte_perm (w2[1], w2[0], selector); + w2[0] = __byte_perm (w2[0], w1[3], selector); + w1[3] = __byte_perm (w1[3], w1[2], selector); + w1[2] = __byte_perm (w1[2], w1[1], selector); + w1[1] = __byte_perm (w1[1], w1[0], selector); + w1[0] = __byte_perm (w1[0], w0[3], selector); + w0[3] = __byte_perm (w0[3], w0[2], selector); + w0[2] = __byte_perm (w0[2], w0[1], selector); + w0[1] = __byte_perm (w0[1], w0[0], selector); + w0[0] = __byte_perm (w0[0], 0, selector); break; case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = 
append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24 | append2[0] << 8; - w2[1] = append2[0] >> 24; + w3[1] = __byte_perm (w3[0], w2[3], selector); + w3[0] = __byte_perm (w2[3], w2[2], selector); + w2[3] = __byte_perm (w2[2], w2[1], selector); + w2[2] = __byte_perm (w2[1], w2[0], selector); + w2[1] = __byte_perm (w2[0], w1[3], selector); + w2[0] = __byte_perm (w1[3], w1[2], selector); + w1[3] = __byte_perm (w1[2], w1[1], selector); + w1[2] = __byte_perm (w1[1], w1[0], selector); + w1[1] = __byte_perm (w1[0], w0[3], selector); + w1[0] = __byte_perm (w0[3], w0[2], selector); + w0[3] = __byte_perm (w0[2], w0[1], selector); + w0[2] = __byte_perm (w0[1], w0[0], selector); + w0[1] = __byte_perm (w0[0], 0, selector); + w0[0] = 0; break; case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16 | append2[0] << 16; - w2[1] = append2[0] >> 16; + w3[1] = __byte_perm (w2[3], w2[2], selector); + w3[0] = __byte_perm (w2[2], w2[1], selector); + w2[3] = __byte_perm (w2[1], w2[0], selector); + w2[2] = __byte_perm (w2[0], w1[3], selector); + w2[1] = __byte_perm (w1[3], w1[2], selector); + w2[0] = __byte_perm (w1[2], w1[1], selector); + w1[3] = __byte_perm (w1[1], w1[0], selector); + w1[2] = __byte_perm (w1[0], w0[3], selector); + w1[1] = __byte_perm (w0[3], w0[2], selector); + w1[0] = __byte_perm (w0[2], w0[1], selector); + w0[3] = __byte_perm (w0[1], w0[0], selector); + w0[2] = __byte_perm (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; break; case 3: - 
w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8 | append2[0] << 24; - w2[1] = append2[0] >> 8; + w3[1] = __byte_perm (w2[2], w2[1], selector); + w3[0] = __byte_perm (w2[1], w2[0], selector); + w2[3] = __byte_perm (w2[0], w1[3], selector); + w2[2] = __byte_perm (w1[3], w1[2], selector); + w2[1] = __byte_perm (w1[2], w1[1], selector); + w2[0] = __byte_perm (w1[1], w1[0], selector); + w1[3] = __byte_perm (w1[0], w0[3], selector); + w1[2] = __byte_perm (w0[3], w0[2], selector); + w1[1] = __byte_perm (w0[2], w0[1], selector); + w1[0] = __byte_perm (w0[1], w0[0], selector); + w0[3] = __byte_perm (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; - w2[1] = append2[0]; + w3[1] = __byte_perm (w2[1], w2[0], selector); + w3[0] = __byte_perm (w2[0], w1[3], selector); + w2[3] = __byte_perm (w1[3], w1[2], selector); + w2[2] = __byte_perm (w1[2], w1[1], selector); + w2[1] = __byte_perm (w1[1], w1[0], selector); + w2[0] = __byte_perm (w1[0], w0[3], selector); + w1[3] = __byte_perm (w0[3], w0[2], selector); + w1[2] = __byte_perm (w0[2], w0[1], selector); + w1[1] = __byte_perm (w0[1], w0[0], selector); + w1[0] = __byte_perm (w0[0], 0, selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | 
append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24 | append2[0] << 8; - w2[2] = append2[0] >> 24; - break; - - case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16 | append2[0] << 16; - w2[2] = append2[0] >> 16; + w3[1] = __byte_perm (w2[0], w1[3], selector); + w3[0] = __byte_perm (w1[3], w1[2], selector); + w2[3] = __byte_perm (w1[2], w1[1], selector); + w2[2] = __byte_perm (w1[1], w1[0], selector); + w2[1] = __byte_perm (w1[0], w0[3], selector); + w2[0] = __byte_perm (w0[3], w0[2], selector); + w1[3] = __byte_perm (w0[2], w0[1], selector); + w1[2] = __byte_perm (w0[1], w0[0], selector); + w1[1] = __byte_perm (w0[0], 0, selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 6: + w3[1] = __byte_perm (w1[3], w1[2], selector); + w3[0] = __byte_perm (w1[2], w1[1], selector); + w2[3] = __byte_perm (w1[1], w1[0], selector); + w2[2] = __byte_perm (w1[0], w0[3], selector); + w2[1] = __byte_perm (w0[3], w0[2], selector); + w2[0] = __byte_perm (w0[2], w0[1], selector); + w1[3] = __byte_perm (w0[1], w0[0], selector); + w1[2] = __byte_perm (w0[0], 0, selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = 
append1[3] >> 8 | append2[0] << 24; - w2[2] = append2[0] >> 8; + w3[1] = __byte_perm (w1[2], w1[1], selector); + w3[0] = __byte_perm (w1[1], w1[0], selector); + w2[3] = __byte_perm (w1[0], w0[3], selector); + w2[2] = __byte_perm (w0[3], w0[2], selector); + w2[1] = __byte_perm (w0[2], w0[1], selector); + w2[0] = __byte_perm (w0[1], w0[0], selector); + w1[3] = __byte_perm (w0[0], 0, selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; - w2[2] = append2[0]; + w3[1] = __byte_perm (w1[1], w1[0], selector); + w3[0] = __byte_perm (w1[0], w0[3], selector); + w2[3] = __byte_perm (w0[3], w0[2], selector); + w2[2] = __byte_perm (w0[2], w0[1], selector); + w2[1] = __byte_perm (w0[1], w0[0], selector); + w2[0] = __byte_perm (w0[0], 0, selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24 | append2[0] << 8; - w2[3] = append2[0] >> 24; + w3[1] = __byte_perm (w1[0], w0[3], selector); + w3[0] = __byte_perm (w0[3], w0[2], selector); + w2[3] = __byte_perm (w0[2], w0[1], selector); + w2[2] = __byte_perm (w0[1], w0[0], selector); + w2[1] = __byte_perm (w0[0], 0, selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = 
append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] >> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16 | append2[0] << 16; - w2[3] = append2[0] >> 16; + w3[1] = __byte_perm (w0[3], w0[2], selector); + w3[0] = __byte_perm (w0[2], w0[1], selector); + w2[3] = __byte_perm (w0[1], w0[0], selector); + w2[2] = __byte_perm (w0[0], 0, selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8 | append2[0] << 24; - w2[3] = append2[0] >> 8; + w3[1] = __byte_perm (w0[2], w0[1], selector); + w3[0] = __byte_perm (w0[1], w0[0], selector); + w2[3] = __byte_perm (w0[0], 0, selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; - w2[3] = append2[0]; + w3[1] = __byte_perm (w0[1], w0[0], selector); + w3[0] = __byte_perm (w0[0], 0, selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; 
- w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24 | append2[0] << 8; - w3[0] = append2[0] >> 24; - break; - - case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16 | append2[0] << 16; - w3[0] = append2[0] >> 16; - break; - - case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8 | append2[0] << 24; - w3[0] = append2[0] >> 8; + w3[1] = __byte_perm (w0[0], 0, selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } + #endif } -// before: memcat32_8 -static void memcat_c32_w4x4_a2x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset) +/* not needed anymore? 
+ +// before: append_0x80_2_be +static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset) { switch (offset) { - case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; + case 0: + w0[0] |= 0x80000000; break; - case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24; + case 1: + w0[0] |= 0x800000; break; - case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16; + case 2: + w0[0] |= 0x8000; break; - case 3: - w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8; + case 3: + w0[0] |= 0x80; break; - case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; + case 4: + w0[1] |= 0x80000000; break; - case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | 
append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24; + case 5: + w0[1] |= 0x800000; break; - case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16; + case 6: + w0[1] |= 0x8000; break; - case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = append1[3] >> 8; + case 7: + w0[1] |= 0x80; break; - case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; + case 8: + w0[2] |= 0x80000000; break; - case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24; + case 9: + w0[2] |= 0x800000; break; case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] 
>> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16; + w0[2] |= 0x8000; break; case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8; + w0[2] |= 0x80; break; case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; + w0[3] |= 0x80000000; break; case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; - w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24; + w0[3] |= 0x800000; break; case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16; + w0[3] |= 0x8000; break; case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = 
append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8; + w0[3] |= 0x80; break; case 16: - w1[0] = append0[0]; - w1[1] = append0[1]; - w1[2] = append0[2]; - w1[3] = append0[3]; - w2[0] = append1[0]; - w2[1] = append1[1]; - w2[2] = append1[2]; - w2[3] = append1[3]; + w1[0] |= 0x80000000; break; case 17: - w1[0] = w1[0] | append0[0] << 8; - w1[1] = append0[0] >> 24 | append0[1] << 8; - w1[2] = append0[1] >> 24 | append0[2] << 8; - w1[3] = append0[2] >> 24 | append0[3] << 8; - w2[0] = append0[3] >> 24 | append1[0] << 8; - w2[1] = append1[0] >> 24 | append1[1] << 8; - w2[2] = append1[1] >> 24 | append1[2] << 8; - w2[3] = append1[2] >> 24 | append1[3] << 8; - w3[0] = append1[3] >> 24; + w1[0] |= 0x800000; break; case 18: - w1[0] = w1[0] | append0[0] << 16; - w1[1] = append0[0] >> 16 | append0[1] << 16; - w1[2] = append0[1] >> 16 | append0[2] << 16; - w1[3] = append0[2] >> 16 | append0[3] << 16; - w2[0] = append0[3] >> 16 | append1[0] << 16; - w2[1] = append1[0] >> 16 | append1[1] << 16; - w2[2] = append1[1] >> 16 | append1[2] << 16; - w2[3] = append1[2] >> 16 | append1[3] << 16; - w3[0] = append1[3] >> 16; + w1[0] |= 0x8000; break; case 19: - w1[0] = w1[0] | append0[0] << 24; - w1[1] = append0[0] >> 8 | append0[1] << 24; - w1[2] = append0[1] >> 8 | append0[2] << 24; - w1[3] = append0[2] >> 8 | append0[3] << 24; - w2[0] = append0[3] >> 8 | append1[0] << 24; - w2[1] = append1[0] >> 8 | append1[1] << 24; - w2[2] = append1[1] >> 8 | append1[2] << 24; - w2[3] = append1[2] >> 8 | append1[3] << 24; - w3[0] = append1[3] >> 8; + w1[0] |= 0x80; break; case 20: - w1[1] = append0[0]; - w1[2] = append0[1]; - w1[3] = append0[2]; - w2[0] = append0[3]; - w2[1] = append1[0]; - w2[2] = append1[1]; - w2[3] = append1[2]; - w3[0] = append1[3]; + w1[1] |= 0x80000000; break; case 21: - w1[1] = w1[1] | append0[0] << 8; - w1[2] = append0[0] >> 24 | append0[1] << 8; - w1[3] = append0[1] >> 24 | 
append0[2] << 8; - w2[0] = append0[2] >> 24 | append0[3] << 8; - w2[1] = append0[3] >> 24 | append1[0] << 8; - w2[2] = append1[0] >> 24 | append1[1] << 8; - w2[3] = append1[1] >> 24 | append1[2] << 8; - w3[0] = append1[2] >> 24 | append1[3] << 8; - w3[1] = append1[3] >> 24; + w1[1] |= 0x800000; break; case 22: - w1[1] = w1[1] | append0[0] << 16; - w1[2] = append0[0] >> 16 | append0[1] << 16; - w1[3] = append0[1] >> 16 | append0[2] << 16; - w2[0] = append0[2] >> 16 | append0[3] << 16; - w2[1] = append0[3] >> 16 | append1[0] << 16; - w2[2] = append1[0] >> 16 | append1[1] << 16; - w2[3] = append1[1] >> 16 | append1[2] << 16; - w3[0] = append1[2] >> 16 | append1[3] << 16; - w3[1] = append1[3] >> 16; + w1[1] |= 0x8000; break; case 23: - w1[1] = w1[1] | append0[0] << 24; - w1[2] = append0[0] >> 8 | append0[1] << 24; - w1[3] = append0[1] >> 8 | append0[2] << 24; - w2[0] = append0[2] >> 8 | append0[3] << 24; - w2[1] = append0[3] >> 8 | append1[0] << 24; - w2[2] = append1[0] >> 8 | append1[1] << 24; - w2[3] = append1[1] >> 8 | append1[2] << 24; - w3[0] = append1[2] >> 8 | append1[3] << 24; - w3[1] = append1[3] >> 8; + w1[1] |= 0x80; break; case 24: - w1[2] = append0[0]; - w1[3] = append0[1]; - w2[0] = append0[2]; - w2[1] = append0[3]; - w2[2] = append1[0]; - w2[3] = append1[1]; - w3[0] = append1[2]; - w3[1] = append1[3]; + w1[2] |= 0x80000000; break; case 25: - w1[2] = w1[2] | append0[0] << 8; - w1[3] = append0[0] >> 24 | append0[1] << 8; - w2[0] = append0[1] >> 24 | append0[2] << 8; - w2[1] = append0[2] >> 24 | append0[3] << 8; - w2[2] = append0[3] >> 24 | append1[0] << 8; - w2[3] = append1[0] >> 24 | append1[1] << 8; - w3[0] = append1[1] >> 24 | append1[2] << 8; - w3[1] = append1[2] >> 24 | append1[3] << 8; + w1[2] |= 0x800000; break; case 26: - w1[2] = w1[2] | append0[0] << 16; - w1[3] = append0[0] >> 16 | append0[1] << 16; - w2[0] = append0[1] >> 16 | append0[2] << 16; - w2[1] = append0[2] >> 16 | append0[3] << 16; - w2[2] = append0[3] >> 16 | append1[0] << 16; - w2[3] 
= append1[0] >> 16 | append1[1] << 16; - w3[0] = append1[1] >> 16 | append1[2] << 16; - w3[1] = append1[2] >> 16 | append1[3] << 16; + w1[2] |= 0x8000; break; - - case 27: - w1[2] = w1[2] | append0[0] << 24; - w1[3] = append0[0] >> 8 | append0[1] << 24; - w2[0] = append0[1] >> 8 | append0[2] << 24; - w2[1] = append0[2] >> 8 | append0[3] << 24; - w2[2] = append0[3] >> 8 | append1[0] << 24; - w2[3] = append1[0] >> 8 | append1[1] << 24; - w3[0] = append1[1] >> 8 | append1[2] << 24; - w3[1] = append1[2] >> 8 | append1[3] << 24; + + case 27: + w1[2] |= 0x80; break; case 28: - w1[3] = append0[0]; - w2[0] = append0[1]; - w2[1] = append0[2]; - w2[2] = append0[3]; - w2[3] = append1[0]; - w3[0] = append1[1]; - w3[1] = append1[2]; + w1[3] |= 0x80000000; break; case 29: - w1[3] = w1[3] | append0[0] << 8; - w2[0] = append0[0] >> 24 | append0[1] << 8; - w2[1] = append0[1] >> 24 | append0[2] << 8; - w2[2] = append0[2] >> 24 | append0[3] << 8; - w2[3] = append0[3] >> 24 | append1[0] << 8; - w3[0] = append1[0] >> 24 | append1[1] << 8; - w3[1] = append1[1] >> 24 | append1[2] << 8; + w1[3] |= 0x800000; break; case 30: - w1[3] = w1[3] | append0[0] << 16; - w2[0] = append0[0] >> 16 | append0[1] << 16; - w2[1] = append0[1] >> 16 | append0[2] << 16; - w2[2] = append0[2] >> 16 | append0[3] << 16; - w2[3] = append0[3] >> 16 | append1[0] << 16; - w3[0] = append1[0] >> 16 | append1[1] << 16; - w3[1] = append1[1] >> 16 | append1[2] << 16; + w1[3] |= 0x8000; break; case 31: - w1[3] = w1[3] | append0[0] << 24; - w2[0] = append0[0] >> 8 | append0[1] << 24; - w2[1] = append0[1] >> 8 | append0[2] << 24; - w2[2] = append0[2] >> 8 | append0[3] << 24; - w2[3] = append0[3] >> 8 | append1[0] << 24; - w3[0] = append1[0] >> 8 | append1[1] << 24; - w3[1] = append1[1] >> 8 | append1[2] << 24; - break; - - case 32: - w2[0] = append0[0]; - w2[1] = append0[1]; - w2[2] = append0[2]; - w2[3] = append0[3]; - w3[0] = append1[0]; - w3[1] = append1[1]; + w1[3] |= 0x80; break; } } -// before: memcat32_9 -static void 
memcat_c32_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) +// before: append_0x80_8 +static void append_0x80_1x32 (u32 w[32], const u32 offset) { switch (offset) { case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; - w2[0] = append2[0]; + w[ 0] = 0x80; break; case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24 | append2[0] << 8; - w2[1] = append2[0] >> 24; + w[ 0] = w[ 0] | 0x8000; break; case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16 | append2[0] << 16; - w2[1] = append2[0] >> 16; + w[ 0] = w[ 0] | 0x800000; break; case 3: - w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8 | append2[0] << 24; - w2[1] = append2[0] >> 8; + w[ 0] = w[ 0] | 0x80000000; break; case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = 
append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; - w2[1] = append2[0]; + w[ 1] = 0x80; break; case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24 | append2[0] << 8; - w2[2] = append2[0] >> 24; + w[ 1] = w[ 1] | 0x8000; break; case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16 | append2[0] << 16; - w2[2] = append2[0] >> 16; + w[ 1] = w[ 1] | 0x800000; break; case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = append1[3] >> 8 | append2[0] << 24; - w2[2] = append2[0] >> 8; + w[ 1] = w[ 1] | 0x80000000; break; case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; - w2[2] = append2[0]; + w[ 2] = 0x80; break; case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = 
append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24 | append2[0] << 8; - w2[3] = append2[0] >> 24; + w[ 2] = w[ 2] | 0x8000; break; case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] >> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16 | append2[0] << 16; - w2[3] = append2[0] >> 16; + w[ 2] = w[ 2] | 0x800000; break; case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8 | append2[0] << 24; - w2[3] = append2[0] >> 8; + w[ 2] = w[ 2] | 0x80000000; break; case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; - w2[3] = append2[0]; + w[ 3] = 0x80; break; case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; - w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24 | append2[0] << 8; - w3[0] = append2[0] >> 24; + w[ 3] = w[ 3] | 0x8000; break; case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = 
append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16 | append2[0] << 16; - w3[0] = append2[0] >> 16; + w[ 3] = w[ 3] | 0x800000; break; case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8 | append2[0] << 24; - w3[0] = append2[0] >> 8; - break; - - case 16: - w1[0] = append0[0]; - w1[1] = append0[1]; - w1[2] = append0[2]; - w1[3] = append0[3]; - w2[0] = append1[0]; - w2[1] = append1[1]; - w2[2] = append1[2]; - w2[3] = append1[3]; - w3[0] = append2[0]; - break; - - case 17: - w1[0] = w1[0] | append0[0] << 8; - w1[1] = append0[0] >> 24 | append0[1] << 8; - w1[2] = append0[1] >> 24 | append0[2] << 8; - w1[3] = append0[2] >> 24 | append0[3] << 8; - w2[0] = append0[3] >> 24 | append1[0] << 8; - w2[1] = append1[0] >> 24 | append1[1] << 8; - w2[2] = append1[1] >> 24 | append1[2] << 8; - w2[3] = append1[2] >> 24 | append1[3] << 8; - w3[0] = append1[3] >> 24 | append2[0] << 8; - w3[1] = append2[0] >> 24; + w[ 3] = w[ 3] | 0x80000000; break; - case 18: - w1[0] = w1[0] | append0[0] << 16; - w1[1] = append0[0] >> 16 | append0[1] << 16; - w1[2] = append0[1] >> 16 | append0[2] << 16; - w1[3] = append0[2] >> 16 | append0[3] << 16; - w2[0] = append0[3] >> 16 | append1[0] << 16; - w2[1] = append1[0] >> 16 | append1[1] << 16; - w2[2] = append1[1] >> 16 | append1[2] << 16; - w2[3] = append1[2] >> 16 | append1[3] << 16; - w3[0] = append1[3] >> 16 | append2[0] << 
16; - w3[1] = append2[0] >> 16; + case 16: + w[ 4] = 0x80; + break; + + case 17: + w[ 4] = w[ 4] | 0x8000; + break; + + case 18: + w[ 4] = w[ 4] | 0x800000; break; case 19: - w1[0] = w1[0] | append0[0] << 24; - w1[1] = append0[0] >> 8 | append0[1] << 24; - w1[2] = append0[1] >> 8 | append0[2] << 24; - w1[3] = append0[2] >> 8 | append0[3] << 24; - w2[0] = append0[3] >> 8 | append1[0] << 24; - w2[1] = append1[0] >> 8 | append1[1] << 24; - w2[2] = append1[1] >> 8 | append1[2] << 24; - w2[3] = append1[2] >> 8 | append1[3] << 24; - w3[0] = append1[3] >> 8 | append2[0] << 24; - w3[1] = append2[0] >> 8; + w[ 4] = w[ 4] | 0x80000000; break; case 20: - w1[1] = append0[0]; - w1[2] = append0[1]; - w1[3] = append0[2]; - w2[0] = append0[3]; - w2[1] = append1[0]; - w2[2] = append1[1]; - w2[3] = append1[2]; - w3[0] = append1[3]; - w3[1] = append2[0]; + w[ 5] = 0x80; break; case 21: - w1[1] = w1[1] | append0[0] << 8; - w1[2] = append0[0] >> 24 | append0[1] << 8; - w1[3] = append0[1] >> 24 | append0[2] << 8; - w2[0] = append0[2] >> 24 | append0[3] << 8; - w2[1] = append0[3] >> 24 | append1[0] << 8; - w2[2] = append1[0] >> 24 | append1[1] << 8; - w2[3] = append1[1] >> 24 | append1[2] << 8; - w3[0] = append1[2] >> 24 | append1[3] << 8; - w3[1] = append1[3] >> 24 | append2[0] << 8; + w[ 5] = w[ 5] | 0x8000; break; case 22: - w1[1] = w1[1] | append0[0] << 16; - w1[2] = append0[0] >> 16 | append0[1] << 16; - w1[3] = append0[1] >> 16 | append0[2] << 16; - w2[0] = append0[2] >> 16 | append0[3] << 16; - w2[1] = append0[3] >> 16 | append1[0] << 16; - w2[2] = append1[0] >> 16 | append1[1] << 16; - w2[3] = append1[1] >> 16 | append1[2] << 16; - w3[0] = append1[2] >> 16 | append1[3] << 16; - w3[1] = append1[3] >> 16 | append2[0] << 16; + w[ 5] = w[ 5] | 0x800000; break; case 23: - w1[1] = w1[1] | append0[0] << 24; - w1[2] = append0[0] >> 8 | append0[1] << 24; - w1[3] = append0[1] >> 8 | append0[2] << 24; - w2[0] = append0[2] >> 8 | append0[3] << 24; - w2[1] = append0[3] >> 8 | append1[0] << 
24; - w2[2] = append1[0] >> 8 | append1[1] << 24; - w2[3] = append1[1] >> 8 | append1[2] << 24; - w3[0] = append1[2] >> 8 | append1[3] << 24; - w3[1] = append1[3] >> 8 | append2[0] << 24; + w[ 5] = w[ 5] | 0x80000000; break; case 24: - w1[2] = append0[0]; - w1[3] = append0[1]; - w2[0] = append0[2]; - w2[1] = append0[3]; - w2[2] = append1[0]; - w2[3] = append1[1]; - w3[0] = append1[2]; - w3[1] = append1[3]; + w[ 6] = 0x80; break; case 25: - w1[2] = w1[2] | append0[0] << 8; - w1[3] = append0[0] >> 24 | append0[1] << 8; - w2[0] = append0[1] >> 24 | append0[2] << 8; - w2[1] = append0[2] >> 24 | append0[3] << 8; - w2[2] = append0[3] >> 24 | append1[0] << 8; - w2[3] = append1[0] >> 24 | append1[1] << 8; - w3[0] = append1[1] >> 24 | append1[2] << 8; - w3[1] = append1[2] >> 24 | append1[3] << 8; + w[ 6] = w[ 6] | 0x8000; break; case 26: - w1[2] = w1[2] | append0[0] << 16; - w1[3] = append0[0] >> 16 | append0[1] << 16; - w2[0] = append0[1] >> 16 | append0[2] << 16; - w2[1] = append0[2] >> 16 | append0[3] << 16; - w2[2] = append0[3] >> 16 | append1[0] << 16; - w2[3] = append1[0] >> 16 | append1[1] << 16; - w3[0] = append1[1] >> 16 | append1[2] << 16; - w3[1] = append1[2] >> 16 | append1[3] << 16; + w[ 6] = w[ 6] | 0x800000; break; case 27: - w1[2] = w1[2] | append0[0] << 24; - w1[3] = append0[0] >> 8 | append0[1] << 24; - w2[0] = append0[1] >> 8 | append0[2] << 24; - w2[1] = append0[2] >> 8 | append0[3] << 24; - w2[2] = append0[3] >> 8 | append1[0] << 24; - w2[3] = append1[0] >> 8 | append1[1] << 24; - w3[0] = append1[1] >> 8 | append1[2] << 24; - w3[1] = append1[2] >> 8 | append1[3] << 24; + w[ 6] = w[ 6] | 0x80000000; break; case 28: - w1[3] = append0[0]; - w2[0] = append0[1]; - w2[1] = append0[2]; - w2[2] = append0[3]; - w2[3] = append1[0]; - w3[0] = append1[1]; - w3[1] = append1[2]; + w[ 7] = 0x80; break; case 29: - w1[3] = w1[3] | append0[0] << 8; - w2[0] = append0[0] >> 24 | append0[1] << 8; - w2[1] = append0[1] >> 24 | append0[2] << 8; - w2[2] = append0[2] >> 24 | 
append0[3] << 8; - w2[3] = append0[3] >> 24 | append1[0] << 8; - w3[0] = append1[0] >> 24 | append1[1] << 8; - w3[1] = append1[1] >> 24 | append1[2] << 8; + w[ 7] = w[ 7] | 0x8000; break; case 30: - w1[3] = w1[3] | append0[0] << 16; - w2[0] = append0[0] >> 16 | append0[1] << 16; - w2[1] = append0[1] >> 16 | append0[2] << 16; - w2[2] = append0[2] >> 16 | append0[3] << 16; - w2[3] = append0[3] >> 16 | append1[0] << 16; - w3[0] = append1[0] >> 16 | append1[1] << 16; - w3[1] = append1[1] >> 16 | append1[2] << 16; + w[ 7] = w[ 7] | 0x800000; break; case 31: - w1[3] = w1[3] | append0[0] << 24; - w2[0] = append0[0] >> 8 | append0[1] << 24; - w2[1] = append0[1] >> 8 | append0[2] << 24; - w2[2] = append0[2] >> 8 | append0[3] << 24; - w2[3] = append0[3] >> 8 | append1[0] << 24; - w3[0] = append1[0] >> 8 | append1[1] << 24; - w3[1] = append1[1] >> 8 | append1[2] << 24; + w[ 7] = w[ 7] | 0x80000000; break; case 32: - w2[0] = append0[0]; - w2[1] = append0[1]; - w2[2] = append0[2]; - w2[3] = append0[3]; - w3[0] = append1[0]; - w3[1] = append1[1]; + w[ 8] = 0x80; break; - } -} -static void switch_buffer_by_offset (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - #ifdef IS_AMD - const int offset_mod_4 = offset & 3; + case 33: + w[ 8] = w[ 8] | 0x8000; + break; - const int offset_minus_4 = 4 - offset; + case 34: + w[ 8] = w[ 8] | 0x800000; + break; - switch (offset / 4) - { - case 0: - w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], 
w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); + case 35: + w[ 8] = w[ 8] | 0x80000000; + break; - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 36: + w[ 9] = 0x80; + break; + + case 37: + w[ 9] = w[ 9] | 0x8000; + break; + case 38: + w[ 9] = w[ 9] | 0x800000; break; - case 1: - w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); - w0[0] = 0; + case 39: + w[ 9] = w[ 9] | 0x80000000; + break; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 40: + w[10] = 0x80; + break; + + case 41: + w[10] = w[10] | 0x8000; + break; + + case 42: + w[10] = w[10] | 0x800000; + break; + + case 43: + w[10] = w[10] | 
0x80000000; + break; + + case 44: + w[11] = 0x80; + break; + + case 45: + w[11] = w[11] | 0x8000; + break; + case 46: + w[11] = w[11] | 0x800000; break; - case 2: - w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); - w0[1] = 0; - w0[0] = 0; + case 47: + w[11] = w[11] | 0x80000000; + break; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 48: + w[12] = 0x80; + break; + case 49: + w[12] = w[12] | 0x8000; break; - case 3: - w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4); - w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); - w0[2] = 0; - w0[1] = 0; - w0[0] 
= 0; + case 50: + w[12] = w[12] | 0x800000; + break; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 51: + w[12] = w[12] | 0x80000000; + break; + case 52: + w[13] = 0x80; break; - case 4: - w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 53: + w[13] = w[13] | 0x8000; + break; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 54: + w[13] = w[13] | 0x800000; + break; + case 55: + w[13] = w[13] | 0x80000000; break; - case 5: - w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); - w1[0] = 0; - w0[3] = 0; - w0[2] = 
0; - w0[1] = 0; - w0[0] = 0; + case 56: + w[14] = 0x80; + break; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 57: + w[14] = w[14] | 0x8000; + break; + case 58: + w[14] = w[14] | 0x800000; break; - case 6: - w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 59: + w[14] = w[14] | 0x80000000; + break; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 60: + w[15] = 0x80; + break; + case 61: + w[15] = w[15] | 0x8000; break; - case 7: - w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 62: + w[15] = w[15] | 0x800000; + break; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] 
= 0; - } + case 63: + w[15] = w[15] | 0x80000000; + break; + case 64: + w[16] = 0x80; break; - case 8: - w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 65: + w[16] = w[16] | 0x8000; + break; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 66: + w[16] = w[16] | 0x800000; + break; + case 67: + w[16] = w[16] | 0x80000000; break; - case 9: - w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4); - w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 68: + w[17] = 0x80; + break; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 69: + w[17] = w[17] | 0x8000; + break; + case 70: + w[17] = w[17] | 0x800000; break; - case 10: - w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - 
w0[0] = 0; + case 71: + w[17] = w[17] | 0x80000000; + break; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 72: + w[18] = 0x80; + break; + case 73: + w[18] = w[18] | 0x8000; break; - case 11: - w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 74: + w[18] = w[18] | 0x800000; + break; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 75: + w[18] = w[18] | 0x80000000; + break; + case 76: + w[19] = 0x80; break; - case 12: - w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 77: + w[19] = w[19] | 0x8000; + break; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 78: + w[19] = w[19] | 0x800000; + break; + case 79: + w[19] = w[19] | 0x80000000; break; - case 13: - w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 80: + w[20] = 0x80; + break; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = 0; - } + case 81: + w[20] = w[20] | 0x8000; + break; + + case 82: + w[20] = w[20] | 0x800000; + break; + case 83: + w[20] = w[20] | 0x80000000; break; - } - #endif - #ifdef IS_NV - const int offset_minus_4 = 4 - 
(offset % 4); + case 84: + w[21] = 0x80; + break; - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + case 85: + w[21] = w[21] | 0x8000; + break; - switch (offset / 4) - { - case 0: - w3[1] = __byte_perm (w3[0], w3[1], selector); - w3[0] = __byte_perm (w2[3], w3[0], selector); - w2[3] = __byte_perm (w2[2], w2[3], selector); - w2[2] = __byte_perm (w2[1], w2[2], selector); - w2[1] = __byte_perm (w2[0], w2[1], selector); - w2[0] = __byte_perm (w1[3], w2[0], selector); - w1[3] = __byte_perm (w1[2], w1[3], selector); - w1[2] = __byte_perm (w1[1], w1[2], selector); - w1[1] = __byte_perm (w1[0], w1[1], selector); - w1[0] = __byte_perm (w0[3], w1[0], selector); - w0[3] = __byte_perm (w0[2], w0[3], selector); - w0[2] = __byte_perm (w0[1], w0[2], selector); - w0[1] = __byte_perm (w0[0], w0[1], selector); - w0[0] = __byte_perm ( 0, w0[0], selector); + case 86: + w[21] = w[21] | 0x800000; + break; + case 87: + w[21] = w[21] | 0x80000000; break; - case 1: - w3[1] = __byte_perm (w2[3], w3[0], selector); - w3[0] = __byte_perm (w2[2], w2[3], selector); - w2[3] = __byte_perm (w2[1], w2[2], selector); - w2[2] = __byte_perm (w2[0], w2[1], selector); - w2[1] = __byte_perm (w1[3], w2[0], selector); - w2[0] = __byte_perm (w1[2], w1[3], selector); - w1[3] = __byte_perm (w1[1], w1[2], selector); - w1[2] = __byte_perm (w1[0], w1[1], selector); - w1[1] = __byte_perm (w0[3], w1[0], selector); - w1[0] = __byte_perm (w0[2], w0[3], selector); - w0[3] = __byte_perm (w0[1], w0[2], selector); - w0[2] = __byte_perm (w0[0], w0[1], selector); - w0[1] = __byte_perm ( 0, w0[0], selector); - w0[0] = 0; + case 88: + w[22] = 0x80; + break; + case 89: + w[22] = w[22] | 0x8000; break; - case 2: - w3[1] = __byte_perm (w2[2], w2[3], selector); - w3[0] = __byte_perm (w2[1], w2[2], selector); - w2[3] = __byte_perm (w2[0], w2[1], selector); - w2[2] = __byte_perm (w1[3], w2[0], selector); - w2[1] = __byte_perm (w1[2], w1[3], selector); - w2[0] = __byte_perm (w1[1], w1[2], selector); - w1[3] = 
__byte_perm (w1[0], w1[1], selector); - w1[2] = __byte_perm (w0[3], w1[0], selector); - w1[1] = __byte_perm (w0[2], w0[3], selector); - w1[0] = __byte_perm (w0[1], w0[2], selector); - w0[3] = __byte_perm (w0[0], w0[1], selector); - w0[2] = __byte_perm ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; + case 90: + w[22] = w[22] | 0x800000; + break; + case 91: + w[22] = w[22] | 0x80000000; break; - case 3: - w3[1] = __byte_perm (w2[1], w2[2], selector); - w3[0] = __byte_perm (w2[0], w2[1], selector); - w2[3] = __byte_perm (w1[3], w2[0], selector); - w2[2] = __byte_perm (w1[2], w1[3], selector); - w2[1] = __byte_perm (w1[1], w1[2], selector); - w2[0] = __byte_perm (w1[0], w1[1], selector); - w1[3] = __byte_perm (w0[3], w1[0], selector); - w1[2] = __byte_perm (w0[2], w0[3], selector); - w1[1] = __byte_perm (w0[1], w0[2], selector); - w1[0] = __byte_perm (w0[0], w0[1], selector); - w0[3] = __byte_perm ( 0, w0[0], selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 92: + w[23] = 0x80; + break; + + case 93: + w[23] = w[23] | 0x8000; + break; + case 94: + w[23] = w[23] | 0x800000; break; - case 4: - w3[1] = __byte_perm (w2[0], w2[1], selector); - w3[0] = __byte_perm (w1[3], w2[0], selector); - w2[3] = __byte_perm (w1[2], w1[3], selector); - w2[2] = __byte_perm (w1[1], w1[2], selector); - w2[1] = __byte_perm (w1[0], w1[1], selector); - w2[0] = __byte_perm (w0[3], w1[0], selector); - w1[3] = __byte_perm (w0[2], w0[3], selector); - w1[2] = __byte_perm (w0[1], w0[2], selector); - w1[1] = __byte_perm (w0[0], w0[1], selector); - w1[0] = __byte_perm ( 0, w0[0], selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 95: + w[23] = w[23] | 0x80000000; + break; + case 96: + w[24] = 0x80; break; - case 5: - w3[1] = __byte_perm (w1[3], w2[0], selector); - w3[0] = __byte_perm (w1[2], w1[3], selector); - w2[3] = __byte_perm (w1[1], w1[2], selector); - w2[2] = __byte_perm (w1[0], w1[1], selector); - w2[1] = __byte_perm (w0[3], w1[0], selector); - w2[0] = __byte_perm 
(w0[2], w0[3], selector); - w1[3] = __byte_perm (w0[1], w0[2], selector); - w1[2] = __byte_perm (w0[0], w0[1], selector); - w1[1] = __byte_perm ( 0, w0[0], selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 97: + w[24] = w[24] | 0x8000; + break; + case 98: + w[24] = w[24] | 0x800000; break; - case 6: - w3[1] = __byte_perm (w1[2], w1[3], selector); - w3[0] = __byte_perm (w1[1], w1[2], selector); - w2[3] = __byte_perm (w1[0], w1[1], selector); - w2[2] = __byte_perm (w0[3], w1[0], selector); - w2[1] = __byte_perm (w0[2], w0[3], selector); - w2[0] = __byte_perm (w0[1], w0[2], selector); - w1[3] = __byte_perm (w0[0], w0[1], selector); - w1[2] = __byte_perm ( 0, w0[0], selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 99: + w[24] = w[24] | 0x80000000; + break; + case 100: + w[25] = 0x80; break; - case 7: - w3[1] = __byte_perm (w1[1], w1[2], selector); - w3[0] = __byte_perm (w1[0], w1[1], selector); - w2[3] = __byte_perm (w0[3], w1[0], selector); - w2[2] = __byte_perm (w0[2], w0[3], selector); - w2[1] = __byte_perm (w0[1], w0[2], selector); - w2[0] = __byte_perm (w0[0], w0[1], selector); - w1[3] = __byte_perm ( 0, w0[0], selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 101: + w[25] = w[25] | 0x8000; + break; + case 102: + w[25] = w[25] | 0x800000; break; - case 8: - w3[1] = __byte_perm (w1[0], w1[1], selector); - w3[0] = __byte_perm (w0[3], w1[0], selector); - w2[3] = __byte_perm (w0[2], w0[3], selector); - w2[2] = __byte_perm (w0[1], w0[2], selector); - w2[1] = __byte_perm (w0[0], w0[1], selector); - w2[0] = __byte_perm ( 0, w0[0], selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 103: + w[25] = w[25] | 0x80000000; + break; + case 104: + w[26] = 0x80; break; - case 9: - w3[1] = __byte_perm (w0[3], w1[0], selector); - w3[0] = __byte_perm (w0[2], w0[3], selector); - 
w2[3] = __byte_perm (w0[1], w0[2], selector); - w2[2] = __byte_perm (w0[0], w0[1], selector); - w2[1] = __byte_perm ( 0, w0[0], selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 105: + w[26] = w[26] | 0x8000; + break; + case 106: + w[26] = w[26] | 0x800000; break; - case 10: - w3[1] = __byte_perm (w0[2], w0[3], selector); - w3[0] = __byte_perm (w0[1], w0[2], selector); - w2[3] = __byte_perm (w0[0], w0[1], selector); - w2[2] = __byte_perm ( 0, w0[0], selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 107: + w[26] = w[26] | 0x80000000; + break; + case 108: + w[27] = 0x80; break; - case 11: - w3[1] = __byte_perm (w0[1], w0[2], selector); - w3[0] = __byte_perm (w0[0], w0[1], selector); - w2[3] = __byte_perm ( 0, w0[0], selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 109: + w[27] = w[27] | 0x8000; + break; + case 110: + w[27] = w[27] | 0x800000; break; - case 12: - w3[1] = __byte_perm (w0[0], w0[1], selector); - w3[0] = __byte_perm ( 0, w0[0], selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 111: + w[27] = w[27] | 0x80000000; + break; + case 112: + w[28] = 0x80; break; - case 13: - w3[1] = __byte_perm ( 0, w0[0], selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 113: + w[28] = w[28] | 0x8000; + break; + case 114: + w[28] = w[28] | 0x800000; break; - } - #endif -} -static void switch_buffer_by_offset_be (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - #ifdef IS_AMD - switch (offset / 4) - { - case 0: - 
w3[2] = amd_bytealign (w3[1], 0, offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + case 115: + w[28] = w[28] | 0x80000000; break; - case 1: - w3[2] = amd_bytealign (w3[0], 0, offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); - w0[0] = 0; + case 116: + w[29] = 0x80; + break; + + case 117: + w[29] = w[29] | 0x8000; + break; + + case 118: + w[29] = w[29] | 0x800000; break; - case 2: - w3[2] = amd_bytealign (w2[3], 0, offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign 
(w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); - w0[1] = 0; - w0[0] = 0; + case 119: + w[29] = w[29] | 0x80000000; break; - case 3: - w3[2] = amd_bytealign (w2[2], 0, offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 120: + w[30] = 0x80; break; - case 4: - w3[2] = amd_bytealign (w2[1], 0, offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 121: + w[30] = w[30] | 0x8000; break; - case 5: - w3[2] = amd_bytealign (w2[0], 0, offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], 
offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 122: + w[30] = w[30] | 0x800000; break; - case 6: - w3[2] = amd_bytealign (w1[3], 0, offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 123: + w[30] = w[30] | 0x80000000; break; - case 7: - w3[2] = amd_bytealign (w1[2], 0, offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 124: + w[31] = 0x80; break; - case 8: - w3[2] = amd_bytealign (w1[1], 0, offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 125: + w[31] = w[31] | 0x8000; break; - case 9: - w3[2] = amd_bytealign (w1[0], 0, offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign 
( 0, w0[0], offset); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 126: + w[31] = w[31] | 0x800000; break; - case 10: - w3[2] = amd_bytealign (w0[3], 0, offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 127: + w[31] = w[31] | 0x80000000; break; + } +} - case 11: - w3[2] = amd_bytealign (w0[2], 0, offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; +// before: device_memcat2L +static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2]) +{ + switch (offset) + { + case 1: + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; break; - case 12: - w3[2] = amd_bytealign (w0[1], 0, offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 2: + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; break; - case 13: - w3[2] = amd_bytealign (w0[0], 0, offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 3: + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; break; - } - 
#endif - #ifdef IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + case 4: + dst0[1] = src_r0[0]; + break; - switch (offset / 4) - { - case 0: - w3[1] = __byte_perm (w3[1], w3[0], selector); - w3[0] = __byte_perm (w3[0], w2[3], selector); - w2[3] = __byte_perm (w2[3], w2[2], selector); - w2[2] = __byte_perm (w2[2], w2[1], selector); - w2[1] = __byte_perm (w2[1], w2[0], selector); - w2[0] = __byte_perm (w2[0], w1[3], selector); - w1[3] = __byte_perm (w1[3], w1[2], selector); - w1[2] = __byte_perm (w1[2], w1[1], selector); - w1[1] = __byte_perm (w1[1], w1[0], selector); - w1[0] = __byte_perm (w1[0], w0[3], selector); - w0[3] = __byte_perm (w0[3], w0[2], selector); - w0[2] = __byte_perm (w0[2], w0[1], selector); - w0[1] = __byte_perm (w0[1], w0[0], selector); - w0[0] = __byte_perm (w0[0], 0, selector); + case 5: + dst0[1] = src_l0[1] | src_r0[0] << 8; + break; + + case 6: + dst0[1] = src_l0[1] | src_r0[0] << 16; + break; + + case 7: + dst0[1] = src_l0[1] | src_r0[0] << 24; break; + } +} +// before: device_memcat4L +static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4]) +{ + switch (offset) + { case 1: - w3[1] = __byte_perm (w3[0], w2[3], selector); - w3[0] = __byte_perm (w2[3], w2[2], selector); - w2[3] = __byte_perm (w2[2], w2[1], selector); - w2[2] = __byte_perm (w2[1], w2[0], selector); - w2[1] = __byte_perm (w2[0], w1[3], selector); - w2[0] = __byte_perm (w1[3], w1[2], selector); - w1[3] = __byte_perm (w1[2], w1[1], selector); - w1[2] = __byte_perm (w1[1], w1[0], selector); - w1[1] = __byte_perm (w1[0], w0[3], selector); - w1[0] = __byte_perm (w0[3], w0[2], selector); - w0[3] = __byte_perm (w0[2], w0[1], selector); - w0[2] = __byte_perm (w0[1], w0[0], selector); - w0[1] = __byte_perm (w0[0], 0, selector); - w0[0] = 0; + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; 
break; case 2: - w3[1] = __byte_perm (w2[3], w2[2], selector); - w3[0] = __byte_perm (w2[2], w2[1], selector); - w2[3] = __byte_perm (w2[1], w2[0], selector); - w2[2] = __byte_perm (w2[0], w1[3], selector); - w2[1] = __byte_perm (w1[3], w1[2], selector); - w2[0] = __byte_perm (w1[2], w1[1], selector); - w1[3] = __byte_perm (w1[1], w1[0], selector); - w1[2] = __byte_perm (w1[0], w0[3], selector); - w1[1] = __byte_perm (w0[3], w0[2], selector); - w1[0] = __byte_perm (w0[2], w0[1], selector); - w0[3] = __byte_perm (w0[1], w0[0], selector); - w0[2] = __byte_perm (w0[0], 0, selector); - w0[1] = 0; - w0[0] = 0; + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; break; case 3: - w3[1] = __byte_perm (w2[2], w2[1], selector); - w3[0] = __byte_perm (w2[1], w2[0], selector); - w2[3] = __byte_perm (w2[0], w1[3], selector); - w2[2] = __byte_perm (w1[3], w1[2], selector); - w2[1] = __byte_perm (w1[2], w1[1], selector); - w2[0] = __byte_perm (w1[1], w1[0], selector); - w1[3] = __byte_perm (w1[0], w0[3], selector); - w1[2] = __byte_perm (w0[3], w0[2], selector); - w1[1] = __byte_perm (w0[2], w0[1], selector); - w1[0] = __byte_perm (w0[1], w0[0], selector); - w0[3] = __byte_perm (w0[0], 0, selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; break; case 4: - w3[1] = __byte_perm (w2[1], w2[0], selector); - w3[0] = __byte_perm (w2[0], w1[3], selector); - w2[3] = __byte_perm (w1[3], w1[2], selector); - w2[2] = __byte_perm (w1[2], w1[1], selector); - w2[1] = __byte_perm (w1[1], w1[0], selector); - w2[0] = __byte_perm (w1[0], w0[3], selector); - w1[3] = __byte_perm (w0[3], w0[2], selector); - w1[2] = __byte_perm (w0[2], w0[1], selector); - w1[1] = __byte_perm (w0[1], w0[0], selector); 
- w1[0] = __byte_perm (w0[0], 0, selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[1] = src_r0[0]; + dst0[2] = src_r0[1]; + dst0[3] = src_r0[2]; break; case 5: - w3[1] = __byte_perm (w2[0], w1[3], selector); - w3[0] = __byte_perm (w1[3], w1[2], selector); - w2[3] = __byte_perm (w1[2], w1[1], selector); - w2[2] = __byte_perm (w1[1], w1[0], selector); - w2[1] = __byte_perm (w1[0], w0[3], selector); - w2[0] = __byte_perm (w0[3], w0[2], selector); - w1[3] = __byte_perm (w0[2], w0[1], selector); - w1[2] = __byte_perm (w0[1], w0[0], selector); - w1[1] = __byte_perm (w0[0], 0, selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[1] = src_l0[1] | src_r0[0] << 8; + dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; break; case 6: - w3[1] = __byte_perm (w1[3], w1[2], selector); - w3[0] = __byte_perm (w1[2], w1[1], selector); - w2[3] = __byte_perm (w1[1], w1[0], selector); - w2[2] = __byte_perm (w1[0], w0[3], selector); - w2[1] = __byte_perm (w0[3], w0[2], selector); - w2[0] = __byte_perm (w0[2], w0[1], selector); - w1[3] = __byte_perm (w0[1], w0[0], selector); - w1[2] = __byte_perm (w0[0], 0, selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[1] = src_l0[1] | src_r0[0] << 16; + dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; break; case 7: - w3[1] = __byte_perm (w1[2], w1[1], selector); - w3[0] = __byte_perm (w1[1], w1[0], selector); - w2[3] = __byte_perm (w1[0], w0[3], selector); - w2[2] = __byte_perm (w0[3], w0[2], selector); - w2[1] = __byte_perm (w0[2], w0[1], selector); - w2[0] = __byte_perm (w0[1], w0[0], selector); - w1[3] = __byte_perm (w0[0], 0, selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[1] = src_l0[1] | src_r0[0] << 24; + dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; case 8: 
- w3[1] = __byte_perm (w1[1], w1[0], selector); - w3[0] = __byte_perm (w1[0], w0[3], selector); - w2[3] = __byte_perm (w0[3], w0[2], selector); - w2[2] = __byte_perm (w0[2], w0[1], selector); - w2[1] = __byte_perm (w0[1], w0[0], selector); - w2[0] = __byte_perm (w0[0], 0, selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[2] = src_r0[0]; + dst0[3] = src_r0[1]; break; case 9: - w3[1] = __byte_perm (w1[0], w0[3], selector); - w3[0] = __byte_perm (w0[3], w0[2], selector); - w2[3] = __byte_perm (w0[2], w0[1], selector); - w2[2] = __byte_perm (w0[1], w0[0], selector); - w2[1] = __byte_perm (w0[0], 0, selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[2] = src_l0[2] | src_r0[0] << 8; + dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; break; case 10: - w3[1] = __byte_perm (w0[3], w0[2], selector); - w3[0] = __byte_perm (w0[2], w0[1], selector); - w2[3] = __byte_perm (w0[1], w0[0], selector); - w2[2] = __byte_perm (w0[0], 0, selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[2] = src_l0[2] | src_r0[0] << 16; + dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; case 11: - w3[1] = __byte_perm (w0[2], w0[1], selector); - w3[0] = __byte_perm (w0[1], w0[0], selector); - w2[3] = __byte_perm (w0[0], 0, selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[2] = src_l0[2] | src_r0[0] << 24; + dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; break; case 12: - w3[1] = __byte_perm (w0[1], w0[0], selector); - w3[0] = __byte_perm (w0[0], 0, selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[3] = src_r0[0]; break; case 13: 
- w3[1] = __byte_perm (w0[0], 0, selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + dst0[3] = src_l0[3] | src_r0[0] << 8; + break; + + case 14: + dst0[3] = src_l0[3] | src_r0[0] << 16; + break; + + case 15: + dst0[3] = src_l0[3] | src_r0[0] << 24; break; } - #endif } -/* not needed anymore? -// before: append_0x80_2_be -static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset) +// before: device_memcat8L +static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4]) { switch (offset) { - case 0: - w0[0] |= 0x80000000; - break; - - case 1: - w0[0] |= 0x800000; + case 1: + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[0] = src_r0[3] >> 24; break; - case 2: - w0[0] |= 0x8000; + case 2: + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[0] = src_r0[3] >> 16; break; - case 3: - w0[0] |= 0x80; + case 3: + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[0] = src_r0[3] >> 8; break; - case 4: - w0[1] |= 0x80000000; + case 4: + dst0[1] = src_r0[0]; + dst0[2] = src_r0[1]; + dst0[3] = src_r0[2]; + dst1[0] = src_r0[3]; break; - case 5: - w0[1] |= 0x800000; + case 5: + dst0[1] = src_l0[1] | src_r0[0] << 8; + dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[1] = src_r0[3] >> 24; break; - case 6: - w0[1] |= 0x8000; + case 6: + dst0[1] = src_l0[1] | src_r0[0] << 16; + dst0[2] = 
src_r0[0] >> 16 | src_r0[1] << 16; + dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[1] = src_r0[3] >> 16; break; - case 7: - w0[1] |= 0x80; + case 7: + dst0[1] = src_l0[1] | src_r0[0] << 24; + dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[1] = src_r0[3] >> 8; break; - case 8: - w0[2] |= 0x80000000; + case 8: + dst0[2] = src_r0[0]; + dst0[3] = src_r0[1]; + dst1[0] = src_r0[2]; + dst1[1] = src_r0[3]; break; - case 9: - w0[2] |= 0x800000; + case 9: + dst0[2] = src_l0[2] | src_r0[0] << 8; + dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[2] = src_r0[3] >> 24; break; case 10: - w0[2] |= 0x8000; + dst0[2] = src_l0[2] | src_r0[0] << 16; + dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[2] = src_r0[3] >> 16; break; case 11: - w0[2] |= 0x80; + dst0[2] = src_l0[2] | src_r0[0] << 24; + dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[2] = src_r0[3] >> 8; break; case 12: - w0[3] |= 0x80000000; + dst0[3] = src_r0[0]; + dst1[0] = src_r0[1]; + dst1[1] = src_r0[2]; + dst1[2] = src_r0[3]; break; case 13: - w0[3] |= 0x800000; + dst0[3] = src_l0[3] | src_r0[0] << 8; + dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[3] = src_r0[3] >> 24; break; case 14: - w0[3] |= 0x8000; + dst0[3] = src_l0[3] | src_r0[0] << 16; + dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[3] = src_r0[3] >> 16; break; case 15: - w0[3] |= 0x80; + dst0[3] = src_l0[3] | src_r0[0] << 24; + dst1[0] = 
src_r0[0] >> 8 | src_r0[1] << 24; + dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[3] = src_r0[3] >> 8; break; case 16: - w1[0] |= 0x80000000; + dst1[0] = src_r0[0]; + dst1[1] = src_r0[1]; + dst1[2] = src_r0[2]; + dst1[3] = src_r0[3]; break; case 17: - w1[0] |= 0x800000; + dst1[0] = src_l1[0] | src_r0[0] << 8; + dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; break; case 18: - w1[0] |= 0x8000; + dst1[0] = src_l1[0] | src_r0[0] << 16; + dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; break; case 19: - w1[0] |= 0x80; + dst1[0] = src_l1[0] | src_r0[0] << 24; + dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; break; case 20: - w1[1] |= 0x80000000; + dst1[1] = src_r0[0]; + dst1[2] = src_r0[1]; + dst1[3] = src_r0[2]; break; case 21: - w1[1] |= 0x800000; + dst1[1] = src_l1[1] | src_r0[0] << 8; + dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; break; case 22: - w1[1] |= 0x8000; + dst1[1] = src_l1[1] | src_r0[0] << 16; + dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; break; case 23: - w1[1] |= 0x80; + dst1[1] = src_l1[1] | src_r0[0] << 24; + dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; case 24: - w1[2] |= 0x80000000; + dst1[2] = src_r0[0]; + dst1[3] = src_r0[1]; break; case 25: - w1[2] |= 0x800000; + dst1[2] = src_l1[2] | src_r0[0] << 8; + dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; break; case 26: - w1[2] |= 0x8000; + dst1[2] = src_l1[2] | src_r0[0] << 16; + dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; case 27: - w1[2] |= 0x80; + dst1[2] = src_l1[2] | src_r0[0] << 24; + dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; break; 
case 28: - w1[3] |= 0x80000000; + dst1[3] = src_r0[0]; break; case 29: - w1[3] |= 0x800000; + dst1[3] = src_l1[3] | src_r0[0] << 8; break; case 30: - w1[3] |= 0x8000; + dst1[3] = src_l1[3] | src_r0[0] << 16; break; case 31: - w1[3] |= 0x80; + dst1[3] = src_l1[3] | src_r0[0] << 24; break; } } -// before: append_0x80_4 -static void append_0x80_1x16 (u32 w[16], const u32 offset) +// before: device_memcat12L +static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4]) { switch (offset) { - case 0: - w[ 0] = 0x80; - break; - case 1: - w[ 0] = w[ 0] | 0x8000; + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[0] = src_r0[3] >> 24; break; case 2: - w[ 0] = w[ 0] | 0x800000; + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[0] = src_r0[3] >> 16; break; case 3: - w[ 0] = w[ 0] | 0x80000000; + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[0] = src_r0[3] >> 8; break; case 4: - w[ 1] = 0x80; + dst0[1] = src_r0[0]; + dst0[2] = src_r0[1]; + dst0[3] = src_r0[2]; + dst1[0] = src_r0[3]; break; case 5: - w[ 1] = w[ 1] | 0x8000; + dst0[1] = src_l0[1] | src_r0[0] << 8; + dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[1] = src_r0[3] >> 24; break; case 6: - w[ 1] = w[ 1] | 0x800000; + dst0[1] = src_l0[1] | src_r0[0] << 16; + dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[1] = src_r0[3] >> 16; 
break; case 7: - w[ 1] = w[ 1] | 0x80000000; + dst0[1] = src_l0[1] | src_r0[0] << 24; + dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[1] = src_r0[3] >> 8; break; case 8: - w[ 2] = 0x80; + dst0[2] = src_r0[0]; + dst0[3] = src_r0[1]; + dst1[0] = src_r0[2]; + dst1[1] = src_r0[3]; break; case 9: - w[ 2] = w[ 2] | 0x8000; + dst0[2] = src_l0[2] | src_r0[0] << 8; + dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[2] = src_r0[3] >> 24; break; case 10: - w[ 2] = w[ 2] | 0x800000; + dst0[2] = src_l0[2] | src_r0[0] << 16; + dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[2] = src_r0[3] >> 16; break; case 11: - w[ 2] = w[ 2] | 0x80000000; + dst0[2] = src_l0[2] | src_r0[0] << 24; + dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[2] = src_r0[3] >> 8; break; case 12: - w[ 3] = 0x80; + dst0[3] = src_r0[0]; + dst1[0] = src_r0[1]; + dst1[1] = src_r0[2]; + dst1[2] = src_r0[3]; break; case 13: - w[ 3] = w[ 3] | 0x8000; + dst0[3] = src_l0[3] | src_r0[0] << 8; + dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[3] = src_r0[3] >> 24; break; case 14: - w[ 3] = w[ 3] | 0x800000; + dst0[3] = src_l0[3] | src_r0[0] << 16; + dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[3] = src_r0[3] >> 16; break; case 15: - w[ 3] = w[ 3] | 0x80000000; + dst0[3] = src_l0[3] | src_r0[0] << 24; + dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[3] = 
src_r0[3] >> 8; break; case 16: - w[ 4] = 0x80; + dst1[0] = src_r0[0]; + dst1[1] = src_r0[1]; + dst1[2] = src_r0[2]; + dst1[3] = src_r0[3]; break; case 17: - w[ 4] = w[ 4] | 0x8000; + dst1[0] = src_l1[0] | src_r0[0] << 8; + dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[0] = src_r0[3] >> 24; break; case 18: - w[ 4] = w[ 4] | 0x800000; + dst1[0] = src_l1[0] | src_r0[0] << 16; + dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[0] = src_r0[3] >> 16; break; case 19: - w[ 4] = w[ 4] | 0x80000000; + dst1[0] = src_l1[0] | src_r0[0] << 24; + dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[0] = src_r0[3] >> 8; break; case 20: - w[ 5] = 0x80; + dst1[1] = src_r0[0]; + dst1[2] = src_r0[1]; + dst1[3] = src_r0[2]; + dst2[0] = src_r0[3]; break; case 21: - w[ 5] = w[ 5] | 0x8000; + dst1[1] = src_l1[1] | src_r0[0] << 8; + dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[1] = src_r0[3] >> 24; break; case 22: - w[ 5] = w[ 5] | 0x800000; + dst1[1] = src_l1[1] | src_r0[0] << 16; + dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[1] = src_r0[3] >> 16; break; case 23: - w[ 5] = w[ 5] | 0x80000000; + dst1[1] = src_l1[1] | src_r0[0] << 24; + dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[1] = src_r0[3] >> 8; break; case 24: - w[ 6] = 0x80; + dst1[2] = src_r0[0]; + dst1[3] = src_r0[1]; + dst2[0] = src_r0[2]; + dst2[1] = src_r0[3]; break; case 25: - w[ 6] = w[ 6] | 0x8000; + dst1[2] = src_l1[2] | src_r0[0] << 8; + dst1[3] = 
src_r0[0] >> 24 | src_r0[1] << 8; + dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[2] = src_r0[3] >> 24; break; case 26: - w[ 6] = w[ 6] | 0x800000; + dst1[2] = src_l1[2] | src_r0[0] << 16; + dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[2] = src_r0[3] >> 16; break; case 27: - w[ 6] = w[ 6] | 0x80000000; + dst1[2] = src_l1[2] | src_r0[0] << 24; + dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[2] = src_r0[3] >> 8; break; case 28: - w[ 7] = 0x80; + dst1[3] = src_r0[0]; + dst2[0] = src_r0[1]; + dst2[1] = src_r0[2]; + dst2[2] = src_r0[3]; break; case 29: - w[ 7] = w[ 7] | 0x8000; + dst1[3] = src_l1[3] | src_r0[0] << 8; + dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[3] = src_r0[3] >> 24; break; case 30: - w[ 7] = w[ 7] | 0x800000; + dst1[3] = src_l1[3] | src_r0[0] << 16; + dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[3] = src_r0[3] >> 16; break; case 31: - w[ 7] = w[ 7] | 0x80000000; + dst1[3] = src_l1[3] | src_r0[0] << 24; + dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[3] = src_r0[3] >> 8; break; case 32: - w[ 8] = 0x80; + dst2[0] = src_r0[0]; + dst2[1] = src_r0[1]; + dst2[2] = src_r0[2]; + dst2[3] = src_r0[3]; break; case 33: - w[ 8] = w[ 8] | 0x8000; + dst2[0] = src_l2[0] | src_r0[0] << 8; + dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; break; case 34: - w[ 8] = w[ 8] | 0x800000; + dst2[0] = src_l2[0] | src_r0[0] << 16; + dst2[1] = src_r0[0] >> 
16 | src_r0[1] << 16; + dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; break; case 35: - w[ 8] = w[ 8] | 0x80000000; + dst2[0] = src_l2[0] | src_r0[0] << 24; + dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; break; case 36: - w[ 9] = 0x80; + dst2[1] = src_r0[0]; + dst2[2] = src_r0[1]; + dst2[3] = src_r0[2]; break; case 37: - w[ 9] = w[ 9] | 0x8000; + dst2[1] = src_l2[1] | src_r0[0] << 8; + dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; break; case 38: - w[ 9] = w[ 9] | 0x800000; + dst2[1] = src_l2[1] | src_r0[0] << 16; + dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; break; case 39: - w[ 9] = w[ 9] | 0x80000000; + dst2[1] = src_l2[1] | src_r0[0] << 24; + dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; case 40: - w[10] = 0x80; + dst2[2] = src_r0[0]; + dst2[3] = src_r0[1]; break; case 41: - w[10] = w[10] | 0x8000; + dst2[2] = src_l2[2] | src_r0[0] << 8; + dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; break; case 42: - w[10] = w[10] | 0x800000; + dst2[2] = src_l2[2] | src_r0[0] << 16; + dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; case 43: - w[10] = w[10] | 0x80000000; + dst2[2] = src_l2[2] | src_r0[0] << 24; + dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; break; case 44: - w[11] = 0x80; + dst2[3] = src_r0[0]; break; case 45: - w[11] = w[11] | 0x8000; + dst2[3] = src_l2[3] | src_r0[0] << 8; break; case 46: - w[11] = w[11] | 0x800000; + dst2[3] = src_l2[3] | src_r0[0] << 16; break; case 47: - w[11] = w[11] | 0x80000000; - break; - - case 48: - w[12] = 0x80; - break; - - case 49: - w[12] = w[12] | 0x8000; - break; - - case 50: - w[12] = w[12] | 0x800000; - break; - - case 51: - w[12] = w[12] | 0x80000000; - break; - - case 52: - w[13] = 0x80; - break; - - case 53: - w[13] = w[13] | 0x8000; - break; - - 
case 54: - w[13] = w[13] | 0x800000; - break; - - case 55: - w[13] = w[13] | 0x80000000; - break; - - case 56: - w[14] = 0x80; - break; - - case 57: - w[14] = w[14] | 0x8000; - break; - - case 58: - w[14] = w[14] | 0x800000; - break; - - case 59: - w[14] = w[14] | 0x80000000; - break; - - case 60: - w[15] = 0x80; - break; - - case 61: - w[15] = w[15] | 0x8000; - break; - - case 62: - w[15] = w[15] | 0x800000; - break; - - case 63: - w[15] = w[15] | 0x80000000; + dst2[3] = src_l2[3] | src_r0[0] << 24; break; } } -// before: append_0x80_8 -static void append_0x80_1x32 (u32 w[32], const u32 offset) +// before: device_memcat12L +static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4]) { switch (offset) { case 0: - w[ 0] = 0x80; + dst0[0] = src_r0[0]; + dst0[1] = src_r0[1]; + dst0[2] = src_r0[2]; + dst0[3] = src_r0[3]; + dst1[0] = src_r1[0]; + dst1[1] = src_r1[1]; + dst1[2] = src_r1[2]; + dst1[3] = src_r1[3]; break; case 1: - w[ 0] = w[ 0] | 0x8000; + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8; + dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8; + dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8; + dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8; + dst2[0] = src_r1[3] >> 24; break; case 2: - w[ 0] = w[ 0] | 0x800000; + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16; + dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16; + dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16; + dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16; + dst2[0] = src_r1[3] >> 16; break; case 3: - w[ 0] = w[ 0] | 0x80000000; + dst0[0] = src_l0[0] | src_r0[0] << 24; + 
dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24; + dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24; + dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24; + dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24; + dst2[0] = src_r1[3] >> 8; break; case 4: - w[ 1] = 0x80; + dst0[1] = src_r0[0]; + dst0[2] = src_r0[1]; + dst0[3] = src_r0[2]; + dst1[0] = src_r0[3]; + dst1[1] = src_r1[0]; + dst1[2] = src_r1[1]; + dst1[3] = src_r1[2]; + dst2[0] = src_r1[3]; break; case 5: - w[ 1] = w[ 1] | 0x8000; + dst0[1] = src_l0[1] | src_r0[0] << 8; + dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8; + dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8; + dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8; + dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8; + dst2[1] = src_r1[3] >> 24; break; case 6: - w[ 1] = w[ 1] | 0x800000; + dst0[1] = src_l0[1] | src_r0[0] << 16; + dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16; + dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16; + dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16; + dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16; + dst2[1] = src_r1[3] >> 16; break; case 7: - w[ 1] = w[ 1] | 0x80000000; + dst0[1] = src_l0[1] | src_r0[0] << 24; + dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24; + dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24; + dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24; + dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24; + dst2[1] = src_r1[3] >> 8; break; case 8: - w[ 2] = 0x80; + dst0[2] = src_r0[0]; + dst0[3] = src_r0[1]; + dst1[0] = src_r0[2]; + dst1[1] = src_r0[3]; + 
dst1[2] = src_r1[0]; + dst1[3] = src_r1[1]; + dst2[0] = src_r1[2]; + dst2[1] = src_r1[3]; break; case 9: - w[ 2] = w[ 2] | 0x8000; + dst0[2] = src_l0[2] | src_r0[0] << 8; + dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8; + dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8; + dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8; + dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8; + dst2[2] = src_r1[3] >> 24; break; case 10: - w[ 2] = w[ 2] | 0x800000; + dst0[2] = src_l0[2] | src_r0[0] << 16; + dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16; + dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16; + dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16; + dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16; + dst2[2] = src_r1[3] >> 16; break; case 11: - w[ 2] = w[ 2] | 0x80000000; + dst0[2] = src_l0[2] | src_r0[0] << 24; + dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24; + dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24; + dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24; + dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24; + dst2[2] = src_r1[3] >> 8; break; case 12: - w[ 3] = 0x80; + dst0[3] = src_r0[0]; + dst1[0] = src_r0[1]; + dst1[1] = src_r0[2]; + dst1[2] = src_r0[3]; + dst1[3] = src_r1[0]; + dst2[0] = src_r1[1]; + dst2[1] = src_r1[2]; + dst2[2] = src_r1[3]; break; case 13: - w[ 3] = w[ 3] | 0x8000; + dst0[3] = src_l0[3] | src_r0[0] << 8; + dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8; + dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8; + dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8; + dst2[2] = src_r1[2] 
>> 24 | src_r1[3] << 8; + dst2[3] = src_r1[3] >> 24; break; case 14: - w[ 3] = w[ 3] | 0x800000; + dst0[3] = src_l0[3] | src_r0[0] << 16; + dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16; + dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16; + dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16; + dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16; + dst2[3] = src_r1[3] >> 16; break; case 15: - w[ 3] = w[ 3] | 0x80000000; + dst0[3] = src_l0[3] | src_r0[0] << 24; + dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24; + dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24; + dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24; + dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24; + dst2[3] = src_r1[3] >> 8; break; case 16: - w[ 4] = 0x80; + dst1[0] = src_r0[0]; + dst1[1] = src_r0[1]; + dst1[2] = src_r0[2]; + dst1[3] = src_r0[3]; + dst2[0] = src_r1[0]; + dst2[1] = src_r1[1]; + dst2[2] = src_r1[2]; + dst2[3] = src_r1[3]; break; case 17: - w[ 4] = w[ 4] | 0x8000; + dst1[0] = src_l1[0] | src_r0[0] << 8; + dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8; + dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8; + dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8; + dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8; break; case 18: - w[ 4] = w[ 4] | 0x800000; + dst1[0] = src_l1[0] | src_r0[0] << 16; + dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16; + dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16; + dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16; + dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16; break; case 19: - w[ 4] = w[ 4] 
| 0x80000000; + dst1[0] = src_l1[0] | src_r0[0] << 24; + dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24; + dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24; + dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24; + dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24; break; case 20: - w[ 5] = 0x80; + dst1[1] = src_r0[0]; + dst1[2] = src_r0[1]; + dst1[3] = src_r0[2]; + dst2[0] = src_r0[3]; + dst2[1] = src_r1[0]; + dst2[2] = src_r1[1]; + dst2[3] = src_r1[2]; break; case 21: - w[ 5] = w[ 5] | 0x8000; + dst1[1] = src_l1[1] | src_r0[0] << 8; + dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8; + dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8; + dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8; break; case 22: - w[ 5] = w[ 5] | 0x800000; + dst1[1] = src_l1[1] | src_r0[0] << 16; + dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16; + dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16; + dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16; break; case 23: - w[ 5] = w[ 5] | 0x80000000; + dst1[1] = src_l1[1] | src_r0[0] << 24; + dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24; + dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24; + dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24; break; case 24: - w[ 6] = 0x80; + dst1[2] = src_r0[0]; + dst1[3] = src_r0[1]; + dst2[0] = src_r0[2]; + dst2[1] = src_r0[3]; + dst2[2] = src_r1[0]; + dst2[3] = src_r1[1]; break; case 25: - w[ 6] = w[ 6] | 0x8000; + dst1[2] = src_l1[2] | src_r0[0] << 8; + dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[0] = src_r0[1] >> 24 | src_r0[2] <<
8; + dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8; + dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8; break; case 26: - w[ 6] = w[ 6] | 0x800000; + dst1[2] = src_l1[2] | src_r0[0] << 16; + dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16; + dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16; break; case 27: - w[ 6] = w[ 6] | 0x80000000; + dst1[2] = src_l1[2] | src_r0[0] << 24; + dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24; + dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24; break; case 28: - w[ 7] = 0x80; + dst1[3] = src_r0[0]; + dst2[0] = src_r0[1]; + dst2[1] = src_r0[2]; + dst2[2] = src_r0[3]; + dst2[3] = src_r1[0]; break; case 29: - w[ 7] = w[ 7] | 0x8000; + dst1[3] = src_l1[3] | src_r0[0] << 8; + dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8; break; case 30: - w[ 7] = w[ 7] | 0x800000; + dst1[3] = src_l1[3] | src_r0[0] << 16; + dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16; break; case 31: - w[ 7] = w[ 7] | 0x80000000; + dst1[3] = src_l1[3] | src_r0[0] << 24; + dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24; break; case 32: - w[ 8] = 0x80; + dst2[0] = src_r0[0]; + dst2[1] = src_r0[1]; + dst2[2] = src_r0[2]; + dst2[3] = src_r0[3]; break; case 33: - w[ 8] = w[ 8] | 0x8000; + dst2[0] = src_l2[0] | src_r0[0] << 8; + dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[2] =
src_r0[1] >> 24 | src_r0[2] << 8; + dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; break; case 34: - w[ 8] = w[ 8] | 0x800000; + dst2[0] = src_l2[0] | src_r0[0] << 16; + dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; break; case 35: - w[ 8] = w[ 8] | 0x80000000; + dst2[0] = src_l2[0] | src_r0[0] << 24; + dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; break; case 36: - w[ 9] = 0x80; + dst2[1] = src_r0[0]; + dst2[2] = src_r0[1]; + dst2[3] = src_r0[2]; break; case 37: - w[ 9] = w[ 9] | 0x8000; + dst2[1] = src_l2[1] | src_r0[0] << 8; + dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; break; case 38: - w[ 9] = w[ 9] | 0x800000; + dst2[1] = src_l2[1] | src_r0[0] << 16; + dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; break; case 39: - w[ 9] = w[ 9] | 0x80000000; + dst2[1] = src_l2[1] | src_r0[0] << 24; + dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; case 40: - w[10] = 0x80; + dst2[2] = src_r0[0]; + dst2[3] = src_r0[1]; break; case 41: - w[10] = w[10] | 0x8000; + dst2[2] = src_l2[2] | src_r0[0] << 8; + dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; break; case 42: - w[10] = w[10] | 0x800000; + dst2[2] = src_l2[2] | src_r0[0] << 16; + dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; case 43: - w[10] = w[10] | 0x80000000; + dst2[2] = src_l2[2] | src_r0[0] << 24; + dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; break; case 44: - w[11] = 0x80; + dst2[3] = src_r0[0]; break; case 45: - w[11] = w[11] | 0x8000; + dst2[3] = src_l2[3] | src_r0[0] << 8; break; case 46: - w[11] = w[11] | 0x800000; + dst2[3] = src_l2[3] | src_r0[0] << 16; break; case 47: - w[11] = w[11] | 0x80000000; + dst2[3] = src_l2[3] | src_r0[0] << 24; break; + } +} - case 48: - w[12] = 0x80; +// before: 
memcat16_9 +static void memcat_c15_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = append0[0]; + w0[1] = append0[1]; + w0[2] = append0[2]; + w0[3] = append0[3]; + w1[0] = append1[0]; + w1[1] = append1[1]; + w1[2] = append1[2]; + w1[3] = append1[3]; + w2[0] = append2[0]; break; - case 49: - w[12] = w[12] | 0x8000; + case 1: + w0[0] = w0[0] | append0[0] << 8; + w0[1] = append0[0] >> 24 | append0[1] << 8; + w0[2] = append0[1] >> 24 | append0[2] << 8; + w0[3] = append0[2] >> 24 | append0[3] << 8; + w1[0] = append0[3] >> 24 | append1[0] << 8; + w1[1] = append1[0] >> 24 | append1[1] << 8; + w1[2] = append1[1] >> 24 | append1[2] << 8; + w1[3] = append1[2] >> 24 | append1[3] << 8; + w2[0] = append1[3] >> 24 | append2[0] << 8; + w2[1] = append2[0] >> 24; break; - case 50: - w[12] = w[12] | 0x800000; + case 2: + w0[0] = w0[0] | append0[0] << 16; + w0[1] = append0[0] >> 16 | append0[1] << 16; + w0[2] = append0[1] >> 16 | append0[2] << 16; + w0[3] = append0[2] >> 16 | append0[3] << 16; + w1[0] = append0[3] >> 16 | append1[0] << 16; + w1[1] = append1[0] >> 16 | append1[1] << 16; + w1[2] = append1[1] >> 16 | append1[2] << 16; + w1[3] = append1[2] >> 16 | append1[3] << 16; + w2[0] = append1[3] >> 16 | append2[0] << 16; + w2[1] = append2[0] >> 16; break; - case 51: - w[12] = w[12] | 0x80000000; + case 3: + w0[0] = w0[0] | append0[0] << 24; + w0[1] = append0[0] >> 8 | append0[1] << 24; + w0[2] = append0[1] >> 8 | append0[2] << 24; + w0[3] = append0[2] >> 8 | append0[3] << 24; + w1[0] = append0[3] >> 8 | append1[0] << 24; + w1[1] = append1[0] >> 8 | append1[1] << 24; + w1[2] = append1[1] >> 8 | append1[2] << 24; + w1[3] = append1[2] >> 8 | append1[3] << 24; + w2[0] = append1[3] >> 8 | append2[0] << 24; + w2[1] = append2[0] >> 8; break; - case 52: - w[13] = 0x80; + case 4: + w0[1] = append0[0]; + w0[2] = append0[1]; + w0[3] = append0[2]; + w1[0] = 
append0[3]; + w1[1] = append1[0]; + w1[2] = append1[1]; + w1[3] = append1[2]; + w2[0] = append1[3]; + w2[1] = append2[0]; break; - case 53: - w[13] = w[13] | 0x8000; + case 5: + w0[1] = w0[1] | append0[0] << 8; + w0[2] = append0[0] >> 24 | append0[1] << 8; + w0[3] = append0[1] >> 24 | append0[2] << 8; + w1[0] = append0[2] >> 24 | append0[3] << 8; + w1[1] = append0[3] >> 24 | append1[0] << 8; + w1[2] = append1[0] >> 24 | append1[1] << 8; + w1[3] = append1[1] >> 24 | append1[2] << 8; + w2[0] = append1[2] >> 24 | append1[3] << 8; + w2[1] = append1[3] >> 24 | append2[0] << 8; + w2[2] = append2[0] >> 24; break; - case 54: - w[13] = w[13] | 0x800000; + case 6: + w0[1] = w0[1] | append0[0] << 16; + w0[2] = append0[0] >> 16 | append0[1] << 16; + w0[3] = append0[1] >> 16 | append0[2] << 16; + w1[0] = append0[2] >> 16 | append0[3] << 16; + w1[1] = append0[3] >> 16 | append1[0] << 16; + w1[2] = append1[0] >> 16 | append1[1] << 16; + w1[3] = append1[1] >> 16 | append1[2] << 16; + w2[0] = append1[2] >> 16 | append1[3] << 16; + w2[1] = append1[3] >> 16 | append2[0] << 16; + w2[2] = append2[0] >> 16; break; - case 55: - w[13] = w[13] | 0x80000000; + case 7: + w0[1] = w0[1] | append0[0] << 24; + w0[2] = append0[0] >> 8 | append0[1] << 24; + w0[3] = append0[1] >> 8 | append0[2] << 24; + w1[0] = append0[2] >> 8 | append0[3] << 24; + w1[1] = append0[3] >> 8 | append1[0] << 24; + w1[2] = append1[0] >> 8 | append1[1] << 24; + w1[3] = append1[1] >> 8 | append1[2] << 24; + w2[0] = append1[2] >> 8 | append1[3] << 24; + w2[1] = append1[3] >> 8 | append2[0] << 24; + w2[2] = append2[0] >> 8; break; - case 56: - w[14] = 0x80; + case 8: + w0[2] = append0[0]; + w0[3] = append0[1]; + w1[0] = append0[2]; + w1[1] = append0[3]; + w1[2] = append1[0]; + w1[3] = append1[1]; + w2[0] = append1[2]; + w2[1] = append1[3]; + w2[2] = append2[0]; break; - case 57: - w[14] = w[14] | 0x8000; + case 9: + w0[2] = w0[2] | append0[0] << 8; + w0[3] = append0[0] >> 24 | append0[1] << 8; + w1[0] = append0[1] >> 24 | 
append0[2] << 8; + w1[1] = append0[2] >> 24 | append0[3] << 8; + w1[2] = append0[3] >> 24 | append1[0] << 8; + w1[3] = append1[0] >> 24 | append1[1] << 8; + w2[0] = append1[1] >> 24 | append1[2] << 8; + w2[1] = append1[2] >> 24 | append1[3] << 8; + w2[2] = append1[3] >> 24 | append2[0] << 8; + w2[3] = append2[0] >> 24; break; - case 58: - w[14] = w[14] | 0x800000; + case 10: + w0[2] = w0[2] | append0[0] << 16; + w0[3] = append0[0] >> 16 | append0[1] << 16; + w1[0] = append0[1] >> 16 | append0[2] << 16; + w1[1] = append0[2] >> 16 | append0[3] << 16; + w1[2] = append0[3] >> 16 | append1[0] << 16; + w1[3] = append1[0] >> 16 | append1[1] << 16; + w2[0] = append1[1] >> 16 | append1[2] << 16; + w2[1] = append1[2] >> 16 | append1[3] << 16; + w2[2] = append1[3] >> 16 | append2[0] << 16; + w2[3] = append2[0] >> 16; break; - case 59: - w[14] = w[14] | 0x80000000; + case 11: + w0[2] = w0[2] | append0[0] << 24; + w0[3] = append0[0] >> 8 | append0[1] << 24; + w1[0] = append0[1] >> 8 | append0[2] << 24; + w1[1] = append0[2] >> 8 | append0[3] << 24; + w1[2] = append0[3] >> 8 | append1[0] << 24; + w1[3] = append1[0] >> 8 | append1[1] << 24; + w2[0] = append1[1] >> 8 | append1[2] << 24; + w2[1] = append1[2] >> 8 | append1[3] << 24; + w2[2] = append1[3] >> 8 | append2[0] << 24; + w2[3] = append2[0] >> 8; break; - case 60: - w[15] = 0x80; + case 12: + w0[3] = append0[0]; + w1[0] = append0[1]; + w1[1] = append0[2]; + w1[2] = append0[3]; + w1[3] = append1[0]; + w2[0] = append1[1]; + w2[1] = append1[2]; + w2[2] = append1[3]; + w2[3] = append2[0]; break; - case 61: - w[15] = w[15] | 0x8000; + case 13: + w0[3] = w0[3] | append0[0] << 8; + w1[0] = append0[0] >> 24 | append0[1] << 8; + w1[1] = append0[1] >> 24 | append0[2] << 8; + w1[2] = append0[2] >> 24 | append0[3] << 8; + w1[3] = append0[3] >> 24 | append1[0] << 8; + w2[0] = append1[0] >> 24 | append1[1] << 8; + w2[1] = append1[1] >> 24 | append1[2] << 8; + w2[2] = append1[2] >> 24 | append1[3] << 8; + w2[3] = append1[3] >> 24 | 
append2[0] << 8; + w3[0] = append2[0] >> 24; break; - case 62: - w[15] = w[15] | 0x800000; + case 14: + w0[3] = w0[3] | append0[0] << 16; + w1[0] = append0[0] >> 16 | append0[1] << 16; + w1[1] = append0[1] >> 16 | append0[2] << 16; + w1[2] = append0[2] >> 16 | append0[3] << 16; + w1[3] = append0[3] >> 16 | append1[0] << 16; + w2[0] = append1[0] >> 16 | append1[1] << 16; + w2[1] = append1[1] >> 16 | append1[2] << 16; + w2[2] = append1[2] >> 16 | append1[3] << 16; + w2[3] = append1[3] >> 16 | append2[0] << 16; + w3[0] = append2[0] >> 16; break; - case 63: - w[15] = w[15] | 0x80000000; + case 15: + w0[3] = w0[3] | append0[0] << 24; + w1[0] = append0[0] >> 8 | append0[1] << 24; + w1[1] = append0[1] >> 8 | append0[2] << 24; + w1[2] = append0[2] >> 8 | append0[3] << 24; + w1[3] = append0[3] >> 8 | append1[0] << 24; + w2[0] = append1[0] >> 8 | append1[1] << 24; + w2[1] = append1[1] >> 8 | append1[2] << 24; + w2[2] = append1[2] >> 8 | append1[3] << 24; + w2[3] = append1[3] >> 8 | append2[0] << 24; + w3[0] = append2[0] >> 8; + break; + } +} + +// before: memcat32_8 +static void memcat_c32_w4x4_a2x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = append0[0]; + w0[1] = append0[1]; + w0[2] = append0[2]; + w0[3] = append0[3]; + w1[0] = append1[0]; + w1[1] = append1[1]; + w1[2] = append1[2]; + w1[3] = append1[3]; + break; + + case 1: + w0[0] = w0[0] | append0[0] << 8; + w0[1] = append0[0] >> 24 | append0[1] << 8; + w0[2] = append0[1] >> 24 | append0[2] << 8; + w0[3] = append0[2] >> 24 | append0[3] << 8; + w1[0] = append0[3] >> 24 | append1[0] << 8; + w1[1] = append1[0] >> 24 | append1[1] << 8; + w1[2] = append1[1] >> 24 | append1[2] << 8; + w1[3] = append1[2] >> 24 | append1[3] << 8; + w2[0] = append1[3] >> 24; break; - case 64: - w[16] = 0x80; + case 2: + w0[0] = w0[0] | append0[0] << 16; + w0[1] = append0[0] >> 16 | append0[1] << 16; + w0[2] = append0[1] >> 16 | append0[2] 
<< 16; + w0[3] = append0[2] >> 16 | append0[3] << 16; + w1[0] = append0[3] >> 16 | append1[0] << 16; + w1[1] = append1[0] >> 16 | append1[1] << 16; + w1[2] = append1[1] >> 16 | append1[2] << 16; + w1[3] = append1[2] >> 16 | append1[3] << 16; + w2[0] = append1[3] >> 16; break; - case 65: - w[16] = w[16] | 0x8000; + case 3: + w0[0] = w0[0] | append0[0] << 24; + w0[1] = append0[0] >> 8 | append0[1] << 24; + w0[2] = append0[1] >> 8 | append0[2] << 24; + w0[3] = append0[2] >> 8 | append0[3] << 24; + w1[0] = append0[3] >> 8 | append1[0] << 24; + w1[1] = append1[0] >> 8 | append1[1] << 24; + w1[2] = append1[1] >> 8 | append1[2] << 24; + w1[3] = append1[2] >> 8 | append1[3] << 24; + w2[0] = append1[3] >> 8; break; - case 66: - w[16] = w[16] | 0x800000; + case 4: + w0[1] = append0[0]; + w0[2] = append0[1]; + w0[3] = append0[2]; + w1[0] = append0[3]; + w1[1] = append1[0]; + w1[2] = append1[1]; + w1[3] = append1[2]; + w2[0] = append1[3]; break; - case 67: - w[16] = w[16] | 0x80000000; + case 5: + w0[1] = w0[1] | append0[0] << 8; + w0[2] = append0[0] >> 24 | append0[1] << 8; + w0[3] = append0[1] >> 24 | append0[2] << 8; + w1[0] = append0[2] >> 24 | append0[3] << 8; + w1[1] = append0[3] >> 24 | append1[0] << 8; + w1[2] = append1[0] >> 24 | append1[1] << 8; + w1[3] = append1[1] >> 24 | append1[2] << 8; + w2[0] = append1[2] >> 24 | append1[3] << 8; + w2[1] = append1[3] >> 24; break; - case 68: - w[17] = 0x80; + case 6: + w0[1] = w0[1] | append0[0] << 16; + w0[2] = append0[0] >> 16 | append0[1] << 16; + w0[3] = append0[1] >> 16 | append0[2] << 16; + w1[0] = append0[2] >> 16 | append0[3] << 16; + w1[1] = append0[3] >> 16 | append1[0] << 16; + w1[2] = append1[0] >> 16 | append1[1] << 16; + w1[3] = append1[1] >> 16 | append1[2] << 16; + w2[0] = append1[2] >> 16 | append1[3] << 16; + w2[1] = append1[3] >> 16; break; - case 69: - w[17] = w[17] | 0x8000; + case 7: + w0[1] = w0[1] | append0[0] << 24; + w0[2] = append0[0] >> 8 | append0[1] << 24; + w0[3] = append0[1] >> 8 | append0[2] << 
24; + w1[0] = append0[2] >> 8 | append0[3] << 24; + w1[1] = append0[3] >> 8 | append1[0] << 24; + w1[2] = append1[0] >> 8 | append1[1] << 24; + w1[3] = append1[1] >> 8 | append1[2] << 24; + w2[0] = append1[2] >> 8 | append1[3] << 24; + w2[1] = append1[3] >> 8; break; - case 70: - w[17] = w[17] | 0x800000; + case 8: + w0[2] = append0[0]; + w0[3] = append0[1]; + w1[0] = append0[2]; + w1[1] = append0[3]; + w1[2] = append1[0]; + w1[3] = append1[1]; + w2[0] = append1[2]; + w2[1] = append1[3]; break; - case 71: - w[17] = w[17] | 0x80000000; + case 9: + w0[2] = w0[2] | append0[0] << 8; + w0[3] = append0[0] >> 24 | append0[1] << 8; + w1[0] = append0[1] >> 24 | append0[2] << 8; + w1[1] = append0[2] >> 24 | append0[3] << 8; + w1[2] = append0[3] >> 24 | append1[0] << 8; + w1[3] = append1[0] >> 24 | append1[1] << 8; + w2[0] = append1[1] >> 24 | append1[2] << 8; + w2[1] = append1[2] >> 24 | append1[3] << 8; + w2[2] = append1[3] >> 24; break; - case 72: - w[18] = 0x80; + case 10: + w0[2] = w0[2] | append0[0] << 16; + w0[3] = append0[0] >> 16 | append0[1] << 16; + w1[0] = append0[1] >> 16 | append0[2] << 16; + w1[1] = append0[2] >> 16 | append0[3] << 16; + w1[2] = append0[3] >> 16 | append1[0] << 16; + w1[3] = append1[0] >> 16 | append1[1] << 16; + w2[0] = append1[1] >> 16 | append1[2] << 16; + w2[1] = append1[2] >> 16 | append1[3] << 16; + w2[2] = append1[3] >> 16; break; - case 73: - w[18] = w[18] | 0x8000; + case 11: + w0[2] = w0[2] | append0[0] << 24; + w0[3] = append0[0] >> 8 | append0[1] << 24; + w1[0] = append0[1] >> 8 | append0[2] << 24; + w1[1] = append0[2] >> 8 | append0[3] << 24; + w1[2] = append0[3] >> 8 | append1[0] << 24; + w1[3] = append1[0] >> 8 | append1[1] << 24; + w2[0] = append1[1] >> 8 | append1[2] << 24; + w2[1] = append1[2] >> 8 | append1[3] << 24; + w2[2] = append1[3] >> 8; break; - case 74: - w[18] = w[18] | 0x800000; + case 12: + w0[3] = append0[0]; + w1[0] = append0[1]; + w1[1] = append0[2]; + w1[2] = append0[3]; + w1[3] = append1[0]; + w2[0] = 
append1[1]; + w2[1] = append1[2]; + w2[2] = append1[3]; break; - case 75: - w[18] = w[18] | 0x80000000; + case 13: + w0[3] = w0[3] | append0[0] << 8; + w1[0] = append0[0] >> 24 | append0[1] << 8; + w1[1] = append0[1] >> 24 | append0[2] << 8; + w1[2] = append0[2] >> 24 | append0[3] << 8; + w1[3] = append0[3] >> 24 | append1[0] << 8; + w2[0] = append1[0] >> 24 | append1[1] << 8; + w2[1] = append1[1] >> 24 | append1[2] << 8; + w2[2] = append1[2] >> 24 | append1[3] << 8; + w2[3] = append1[3] >> 24; break; - case 76: - w[19] = 0x80; + case 14: + w0[3] = w0[3] | append0[0] << 16; + w1[0] = append0[0] >> 16 | append0[1] << 16; + w1[1] = append0[1] >> 16 | append0[2] << 16; + w1[2] = append0[2] >> 16 | append0[3] << 16; + w1[3] = append0[3] >> 16 | append1[0] << 16; + w2[0] = append1[0] >> 16 | append1[1] << 16; + w2[1] = append1[1] >> 16 | append1[2] << 16; + w2[2] = append1[2] >> 16 | append1[3] << 16; + w2[3] = append1[3] >> 16; break; - case 77: - w[19] = w[19] | 0x8000; + case 15: + w0[3] = w0[3] | append0[0] << 24; + w1[0] = append0[0] >> 8 | append0[1] << 24; + w1[1] = append0[1] >> 8 | append0[2] << 24; + w1[2] = append0[2] >> 8 | append0[3] << 24; + w1[3] = append0[3] >> 8 | append1[0] << 24; + w2[0] = append1[0] >> 8 | append1[1] << 24; + w2[1] = append1[1] >> 8 | append1[2] << 24; + w2[2] = append1[2] >> 8 | append1[3] << 24; + w2[3] = append1[3] >> 8; break; - case 78: - w[19] = w[19] | 0x800000; + case 16: + w1[0] = append0[0]; + w1[1] = append0[1]; + w1[2] = append0[2]; + w1[3] = append0[3]; + w2[0] = append1[0]; + w2[1] = append1[1]; + w2[2] = append1[2]; + w2[3] = append1[3]; break; - case 79: - w[19] = w[19] | 0x80000000; + case 17: + w1[0] = w1[0] | append0[0] << 8; + w1[1] = append0[0] >> 24 | append0[1] << 8; + w1[2] = append0[1] >> 24 | append0[2] << 8; + w1[3] = append0[2] >> 24 | append0[3] << 8; + w2[0] = append0[3] >> 24 | append1[0] << 8; + w2[1] = append1[0] >> 24 | append1[1] << 8; + w2[2] = append1[1] >> 24 | append1[2] << 8; + w2[3] = 
append1[2] >> 24 | append1[3] << 8; + w3[0] = append1[3] >> 24; break; - case 80: - w[20] = 0x80; + case 18: + w1[0] = w1[0] | append0[0] << 16; + w1[1] = append0[0] >> 16 | append0[1] << 16; + w1[2] = append0[1] >> 16 | append0[2] << 16; + w1[3] = append0[2] >> 16 | append0[3] << 16; + w2[0] = append0[3] >> 16 | append1[0] << 16; + w2[1] = append1[0] >> 16 | append1[1] << 16; + w2[2] = append1[1] >> 16 | append1[2] << 16; + w2[3] = append1[2] >> 16 | append1[3] << 16; + w3[0] = append1[3] >> 16; break; - case 81: - w[20] = w[20] | 0x8000; + case 19: + w1[0] = w1[0] | append0[0] << 24; + w1[1] = append0[0] >> 8 | append0[1] << 24; + w1[2] = append0[1] >> 8 | append0[2] << 24; + w1[3] = append0[2] >> 8 | append0[3] << 24; + w2[0] = append0[3] >> 8 | append1[0] << 24; + w2[1] = append1[0] >> 8 | append1[1] << 24; + w2[2] = append1[1] >> 8 | append1[2] << 24; + w2[3] = append1[2] >> 8 | append1[3] << 24; + w3[0] = append1[3] >> 8; break; - case 82: - w[20] = w[20] | 0x800000; + case 20: + w1[1] = append0[0]; + w1[2] = append0[1]; + w1[3] = append0[2]; + w2[0] = append0[3]; + w2[1] = append1[0]; + w2[2] = append1[1]; + w2[3] = append1[2]; + w3[0] = append1[3]; break; - case 83: - w[20] = w[20] | 0x80000000; + case 21: + w1[1] = w1[1] | append0[0] << 8; + w1[2] = append0[0] >> 24 | append0[1] << 8; + w1[3] = append0[1] >> 24 | append0[2] << 8; + w2[0] = append0[2] >> 24 | append0[3] << 8; + w2[1] = append0[3] >> 24 | append1[0] << 8; + w2[2] = append1[0] >> 24 | append1[1] << 8; + w2[3] = append1[1] >> 24 | append1[2] << 8; + w3[0] = append1[2] >> 24 | append1[3] << 8; + w3[1] = append1[3] >> 24; break; - case 84: - w[21] = 0x80; + case 22: + w1[1] = w1[1] | append0[0] << 16; + w1[2] = append0[0] >> 16 | append0[1] << 16; + w1[3] = append0[1] >> 16 | append0[2] << 16; + w2[0] = append0[2] >> 16 | append0[3] << 16; + w2[1] = append0[3] >> 16 | append1[0] << 16; + w2[2] = append1[0] >> 16 | append1[1] << 16; + w2[3] = append1[1] >> 16 | append1[2] << 16; + w3[0] = 
append1[2] >> 16 | append1[3] << 16; + w3[1] = append1[3] >> 16; break; - case 85: - w[21] = w[21] | 0x8000; + case 23: + w1[1] = w1[1] | append0[0] << 24; + w1[2] = append0[0] >> 8 | append0[1] << 24; + w1[3] = append0[1] >> 8 | append0[2] << 24; + w2[0] = append0[2] >> 8 | append0[3] << 24; + w2[1] = append0[3] >> 8 | append1[0] << 24; + w2[2] = append1[0] >> 8 | append1[1] << 24; + w2[3] = append1[1] >> 8 | append1[2] << 24; + w3[0] = append1[2] >> 8 | append1[3] << 24; + w3[1] = append1[3] >> 8; break; - case 86: - w[21] = w[21] | 0x800000; + case 24: + w1[2] = append0[0]; + w1[3] = append0[1]; + w2[0] = append0[2]; + w2[1] = append0[3]; + w2[2] = append1[0]; + w2[3] = append1[1]; + w3[0] = append1[2]; + w3[1] = append1[3]; break; - case 87: - w[21] = w[21] | 0x80000000; + case 25: + w1[2] = w1[2] | append0[0] << 8; + w1[3] = append0[0] >> 24 | append0[1] << 8; + w2[0] = append0[1] >> 24 | append0[2] << 8; + w2[1] = append0[2] >> 24 | append0[3] << 8; + w2[2] = append0[3] >> 24 | append1[0] << 8; + w2[3] = append1[0] >> 24 | append1[1] << 8; + w3[0] = append1[1] >> 24 | append1[2] << 8; + w3[1] = append1[2] >> 24 | append1[3] << 8; break; - case 88: - w[22] = 0x80; + case 26: + w1[2] = w1[2] | append0[0] << 16; + w1[3] = append0[0] >> 16 | append0[1] << 16; + w2[0] = append0[1] >> 16 | append0[2] << 16; + w2[1] = append0[2] >> 16 | append0[3] << 16; + w2[2] = append0[3] >> 16 | append1[0] << 16; + w2[3] = append1[0] >> 16 | append1[1] << 16; + w3[0] = append1[1] >> 16 | append1[2] << 16; + w3[1] = append1[2] >> 16 | append1[3] << 16; break; - case 89: - w[22] = w[22] | 0x8000; + case 27: + w1[2] = w1[2] | append0[0] << 24; + w1[3] = append0[0] >> 8 | append0[1] << 24; + w2[0] = append0[1] >> 8 | append0[2] << 24; + w2[1] = append0[2] >> 8 | append0[3] << 24; + w2[2] = append0[3] >> 8 | append1[0] << 24; + w2[3] = append1[0] >> 8 | append1[1] << 24; + w3[0] = append1[1] >> 8 | append1[2] << 24; + w3[1] = append1[2] >> 8 | append1[3] << 24; break; - case 90: - 
w[22] = w[22] | 0x800000; + case 28: + w1[3] = append0[0]; + w2[0] = append0[1]; + w2[1] = append0[2]; + w2[2] = append0[3]; + w2[3] = append1[0]; + w3[0] = append1[1]; + w3[1] = append1[2]; break; - case 91: - w[22] = w[22] | 0x80000000; + case 29: + w1[3] = w1[3] | append0[0] << 8; + w2[0] = append0[0] >> 24 | append0[1] << 8; + w2[1] = append0[1] >> 24 | append0[2] << 8; + w2[2] = append0[2] >> 24 | append0[3] << 8; + w2[3] = append0[3] >> 24 | append1[0] << 8; + w3[0] = append1[0] >> 24 | append1[1] << 8; + w3[1] = append1[1] >> 24 | append1[2] << 8; break; - case 92: - w[23] = 0x80; + case 30: + w1[3] = w1[3] | append0[0] << 16; + w2[0] = append0[0] >> 16 | append0[1] << 16; + w2[1] = append0[1] >> 16 | append0[2] << 16; + w2[2] = append0[2] >> 16 | append0[3] << 16; + w2[3] = append0[3] >> 16 | append1[0] << 16; + w3[0] = append1[0] >> 16 | append1[1] << 16; + w3[1] = append1[1] >> 16 | append1[2] << 16; break; - case 93: - w[23] = w[23] | 0x8000; + case 31: + w1[3] = w1[3] | append0[0] << 24; + w2[0] = append0[0] >> 8 | append0[1] << 24; + w2[1] = append0[1] >> 8 | append0[2] << 24; + w2[2] = append0[2] >> 8 | append0[3] << 24; + w2[3] = append0[3] >> 8 | append1[0] << 24; + w3[0] = append1[0] >> 8 | append1[1] << 24; + w3[1] = append1[1] >> 8 | append1[2] << 24; break; - case 94: - w[23] = w[23] | 0x800000; + case 32: + w2[0] = append0[0]; + w2[1] = append0[1]; + w2[2] = append0[2]; + w2[3] = append0[3]; + w3[0] = append1[0]; + w3[1] = append1[1]; break; + } +} - case 95: - w[23] = w[23] | 0x80000000; +// before: memcat32_9 +static void memcat_c32_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = append0[0]; + w0[1] = append0[1]; + w0[2] = append0[2]; + w0[3] = append0[3]; + w1[0] = append1[0]; + w1[1] = append1[1]; + w1[2] = append1[2]; + w1[3] = append1[3]; + w2[0] = append2[0]; break; - case 96: - w[24] = 0x80; + case 1: + 
w0[0] = w0[0] | append0[0] << 8; + w0[1] = append0[0] >> 24 | append0[1] << 8; + w0[2] = append0[1] >> 24 | append0[2] << 8; + w0[3] = append0[2] >> 24 | append0[3] << 8; + w1[0] = append0[3] >> 24 | append1[0] << 8; + w1[1] = append1[0] >> 24 | append1[1] << 8; + w1[2] = append1[1] >> 24 | append1[2] << 8; + w1[3] = append1[2] >> 24 | append1[3] << 8; + w2[0] = append1[3] >> 24 | append2[0] << 8; + w2[1] = append2[0] >> 24; break; - case 97: - w[24] = w[24] | 0x8000; + case 2: + w0[0] = w0[0] | append0[0] << 16; + w0[1] = append0[0] >> 16 | append0[1] << 16; + w0[2] = append0[1] >> 16 | append0[2] << 16; + w0[3] = append0[2] >> 16 | append0[3] << 16; + w1[0] = append0[3] >> 16 | append1[0] << 16; + w1[1] = append1[0] >> 16 | append1[1] << 16; + w1[2] = append1[1] >> 16 | append1[2] << 16; + w1[3] = append1[2] >> 16 | append1[3] << 16; + w2[0] = append1[3] >> 16 | append2[0] << 16; + w2[1] = append2[0] >> 16; break; - case 98: - w[24] = w[24] | 0x800000; + case 3: + w0[0] = w0[0] | append0[0] << 24; + w0[1] = append0[0] >> 8 | append0[1] << 24; + w0[2] = append0[1] >> 8 | append0[2] << 24; + w0[3] = append0[2] >> 8 | append0[3] << 24; + w1[0] = append0[3] >> 8 | append1[0] << 24; + w1[1] = append1[0] >> 8 | append1[1] << 24; + w1[2] = append1[1] >> 8 | append1[2] << 24; + w1[3] = append1[2] >> 8 | append1[3] << 24; + w2[0] = append1[3] >> 8 | append2[0] << 24; + w2[1] = append2[0] >> 8; break; - case 99: - w[24] = w[24] | 0x80000000; + case 4: + w0[1] = append0[0]; + w0[2] = append0[1]; + w0[3] = append0[2]; + w1[0] = append0[3]; + w1[1] = append1[0]; + w1[2] = append1[1]; + w1[3] = append1[2]; + w2[0] = append1[3]; + w2[1] = append2[0]; break; - case 100: - w[25] = 0x80; + case 5: + w0[1] = w0[1] | append0[0] << 8; + w0[2] = append0[0] >> 24 | append0[1] << 8; + w0[3] = append0[1] >> 24 | append0[2] << 8; + w1[0] = append0[2] >> 24 | append0[3] << 8; + w1[1] = append0[3] >> 24 | append1[0] << 8; + w1[2] = append1[0] >> 24 | append1[1] << 8; + w1[3] = append1[1] >> 
24 | append1[2] << 8; + w2[0] = append1[2] >> 24 | append1[3] << 8; + w2[1] = append1[3] >> 24 | append2[0] << 8; + w2[2] = append2[0] >> 24; break; - case 101: - w[25] = w[25] | 0x8000; + case 6: + w0[1] = w0[1] | append0[0] << 16; + w0[2] = append0[0] >> 16 | append0[1] << 16; + w0[3] = append0[1] >> 16 | append0[2] << 16; + w1[0] = append0[2] >> 16 | append0[3] << 16; + w1[1] = append0[3] >> 16 | append1[0] << 16; + w1[2] = append1[0] >> 16 | append1[1] << 16; + w1[3] = append1[1] >> 16 | append1[2] << 16; + w2[0] = append1[2] >> 16 | append1[3] << 16; + w2[1] = append1[3] >> 16 | append2[0] << 16; + w2[2] = append2[0] >> 16; break; - case 102: - w[25] = w[25] | 0x800000; + case 7: + w0[1] = w0[1] | append0[0] << 24; + w0[2] = append0[0] >> 8 | append0[1] << 24; + w0[3] = append0[1] >> 8 | append0[2] << 24; + w1[0] = append0[2] >> 8 | append0[3] << 24; + w1[1] = append0[3] >> 8 | append1[0] << 24; + w1[2] = append1[0] >> 8 | append1[1] << 24; + w1[3] = append1[1] >> 8 | append1[2] << 24; + w2[0] = append1[2] >> 8 | append1[3] << 24; + w2[1] = append1[3] >> 8 | append2[0] << 24; + w2[2] = append2[0] >> 8; break; - case 103: - w[25] = w[25] | 0x80000000; + case 8: + w0[2] = append0[0]; + w0[3] = append0[1]; + w1[0] = append0[2]; + w1[1] = append0[3]; + w1[2] = append1[0]; + w1[3] = append1[1]; + w2[0] = append1[2]; + w2[1] = append1[3]; + w2[2] = append2[0]; break; - case 104: - w[26] = 0x80; + case 9: + w0[2] = w0[2] | append0[0] << 8; + w0[3] = append0[0] >> 24 | append0[1] << 8; + w1[0] = append0[1] >> 24 | append0[2] << 8; + w1[1] = append0[2] >> 24 | append0[3] << 8; + w1[2] = append0[3] >> 24 | append1[0] << 8; + w1[3] = append1[0] >> 24 | append1[1] << 8; + w2[0] = append1[1] >> 24 | append1[2] << 8; + w2[1] = append1[2] >> 24 | append1[3] << 8; + w2[2] = append1[3] >> 24 | append2[0] << 8; + w2[3] = append2[0] >> 24; break; - case 105: - w[26] = w[26] | 0x8000; + case 10: + w0[2] = w0[2] | append0[0] << 16; + w0[3] = append0[0] >> 16 | append0[1] << 16; + 
w1[0] = append0[1] >> 16 | append0[2] << 16; + w1[1] = append0[2] >> 16 | append0[3] << 16; + w1[2] = append0[3] >> 16 | append1[0] << 16; + w1[3] = append1[0] >> 16 | append1[1] << 16; + w2[0] = append1[1] >> 16 | append1[2] << 16; + w2[1] = append1[2] >> 16 | append1[3] << 16; + w2[2] = append1[3] >> 16 | append2[0] << 16; + w2[3] = append2[0] >> 16; break; - - case 106: - w[26] = w[26] | 0x800000; + + case 11: + w0[2] = w0[2] | append0[0] << 24; + w0[3] = append0[0] >> 8 | append0[1] << 24; + w1[0] = append0[1] >> 8 | append0[2] << 24; + w1[1] = append0[2] >> 8 | append0[3] << 24; + w1[2] = append0[3] >> 8 | append1[0] << 24; + w1[3] = append1[0] >> 8 | append1[1] << 24; + w2[0] = append1[1] >> 8 | append1[2] << 24; + w2[1] = append1[2] >> 8 | append1[3] << 24; + w2[2] = append1[3] >> 8 | append2[0] << 24; + w2[3] = append2[0] >> 8; break; - case 107: - w[26] = w[26] | 0x80000000; + case 12: + w0[3] = append0[0]; + w1[0] = append0[1]; + w1[1] = append0[2]; + w1[2] = append0[3]; + w1[3] = append1[0]; + w2[0] = append1[1]; + w2[1] = append1[2]; + w2[2] = append1[3]; + w2[3] = append2[0]; break; - case 108: - w[27] = 0x80; + case 13: + w0[3] = w0[3] | append0[0] << 8; + w1[0] = append0[0] >> 24 | append0[1] << 8; + w1[1] = append0[1] >> 24 | append0[2] << 8; + w1[2] = append0[2] >> 24 | append0[3] << 8; + w1[3] = append0[3] >> 24 | append1[0] << 8; + w2[0] = append1[0] >> 24 | append1[1] << 8; + w2[1] = append1[1] >> 24 | append1[2] << 8; + w2[2] = append1[2] >> 24 | append1[3] << 8; + w2[3] = append1[3] >> 24 | append2[0] << 8; + w3[0] = append2[0] >> 24; break; - case 109: - w[27] = w[27] | 0x8000; + case 14: + w0[3] = w0[3] | append0[0] << 16; + w1[0] = append0[0] >> 16 | append0[1] << 16; + w1[1] = append0[1] >> 16 | append0[2] << 16; + w1[2] = append0[2] >> 16 | append0[3] << 16; + w1[3] = append0[3] >> 16 | append1[0] << 16; + w2[0] = append1[0] >> 16 | append1[1] << 16; + w2[1] = append1[1] >> 16 | append1[2] << 16; + w2[2] = append1[2] >> 16 | append1[3] << 
16; + w2[3] = append1[3] >> 16 | append2[0] << 16; + w3[0] = append2[0] >> 16; break; - case 110: - w[27] = w[27] | 0x800000; + case 15: + w0[3] = w0[3] | append0[0] << 24; + w1[0] = append0[0] >> 8 | append0[1] << 24; + w1[1] = append0[1] >> 8 | append0[2] << 24; + w1[2] = append0[2] >> 8 | append0[3] << 24; + w1[3] = append0[3] >> 8 | append1[0] << 24; + w2[0] = append1[0] >> 8 | append1[1] << 24; + w2[1] = append1[1] >> 8 | append1[2] << 24; + w2[2] = append1[2] >> 8 | append1[3] << 24; + w2[3] = append1[3] >> 8 | append2[0] << 24; + w3[0] = append2[0] >> 8; break; - case 111: - w[27] = w[27] | 0x80000000; + case 16: + w1[0] = append0[0]; + w1[1] = append0[1]; + w1[2] = append0[2]; + w1[3] = append0[3]; + w2[0] = append1[0]; + w2[1] = append1[1]; + w2[2] = append1[2]; + w2[3] = append1[3]; + w3[0] = append2[0]; break; - case 112: - w[28] = 0x80; + case 17: + w1[0] = w1[0] | append0[0] << 8; + w1[1] = append0[0] >> 24 | append0[1] << 8; + w1[2] = append0[1] >> 24 | append0[2] << 8; + w1[3] = append0[2] >> 24 | append0[3] << 8; + w2[0] = append0[3] >> 24 | append1[0] << 8; + w2[1] = append1[0] >> 24 | append1[1] << 8; + w2[2] = append1[1] >> 24 | append1[2] << 8; + w2[3] = append1[2] >> 24 | append1[3] << 8; + w3[0] = append1[3] >> 24 | append2[0] << 8; + w3[1] = append2[0] >> 24; break; - case 113: - w[28] = w[28] | 0x8000; + case 18: + w1[0] = w1[0] | append0[0] << 16; + w1[1] = append0[0] >> 16 | append0[1] << 16; + w1[2] = append0[1] >> 16 | append0[2] << 16; + w1[3] = append0[2] >> 16 | append0[3] << 16; + w2[0] = append0[3] >> 16 | append1[0] << 16; + w2[1] = append1[0] >> 16 | append1[1] << 16; + w2[2] = append1[1] >> 16 | append1[2] << 16; + w2[3] = append1[2] >> 16 | append1[3] << 16; + w3[0] = append1[3] >> 16 | append2[0] << 16; + w3[1] = append2[0] >> 16; break; - case 114: - w[28] = w[28] | 0x800000; + case 19: + w1[0] = w1[0] | append0[0] << 24; + w1[1] = append0[0] >> 8 | append0[1] << 24; + w1[2] = append0[1] >> 8 | append0[2] << 24; + w1[3] = 
append0[2] >> 8 | append0[3] << 24; + w2[0] = append0[3] >> 8 | append1[0] << 24; + w2[1] = append1[0] >> 8 | append1[1] << 24; + w2[2] = append1[1] >> 8 | append1[2] << 24; + w2[3] = append1[2] >> 8 | append1[3] << 24; + w3[0] = append1[3] >> 8 | append2[0] << 24; + w3[1] = append2[0] >> 8; break; - case 115: - w[28] = w[28] | 0x80000000; + case 20: + w1[1] = append0[0]; + w1[2] = append0[1]; + w1[3] = append0[2]; + w2[0] = append0[3]; + w2[1] = append1[0]; + w2[2] = append1[1]; + w2[3] = append1[2]; + w3[0] = append1[3]; + w3[1] = append2[0]; break; - case 116: - w[29] = 0x80; + case 21: + w1[1] = w1[1] | append0[0] << 8; + w1[2] = append0[0] >> 24 | append0[1] << 8; + w1[3] = append0[1] >> 24 | append0[2] << 8; + w2[0] = append0[2] >> 24 | append0[3] << 8; + w2[1] = append0[3] >> 24 | append1[0] << 8; + w2[2] = append1[0] >> 24 | append1[1] << 8; + w2[3] = append1[1] >> 24 | append1[2] << 8; + w3[0] = append1[2] >> 24 | append1[3] << 8; + w3[1] = append1[3] >> 24 | append2[0] << 8; break; - case 117: - w[29] = w[29] | 0x8000; + case 22: + w1[1] = w1[1] | append0[0] << 16; + w1[2] = append0[0] >> 16 | append0[1] << 16; + w1[3] = append0[1] >> 16 | append0[2] << 16; + w2[0] = append0[2] >> 16 | append0[3] << 16; + w2[1] = append0[3] >> 16 | append1[0] << 16; + w2[2] = append1[0] >> 16 | append1[1] << 16; + w2[3] = append1[1] >> 16 | append1[2] << 16; + w3[0] = append1[2] >> 16 | append1[3] << 16; + w3[1] = append1[3] >> 16 | append2[0] << 16; break; - case 118: - w[29] = w[29] | 0x800000; + case 23: + w1[1] = w1[1] | append0[0] << 24; + w1[2] = append0[0] >> 8 | append0[1] << 24; + w1[3] = append0[1] >> 8 | append0[2] << 24; + w2[0] = append0[2] >> 8 | append0[3] << 24; + w2[1] = append0[3] >> 8 | append1[0] << 24; + w2[2] = append1[0] >> 8 | append1[1] << 24; + w2[3] = append1[1] >> 8 | append1[2] << 24; + w3[0] = append1[2] >> 8 | append1[3] << 24; + w3[1] = append1[3] >> 8 | append2[0] << 24; break; - case 119: - w[29] = w[29] | 0x80000000; + case 24: + w1[2] = 
append0[0]; + w1[3] = append0[1]; + w2[0] = append0[2]; + w2[1] = append0[3]; + w2[2] = append1[0]; + w2[3] = append1[1]; + w3[0] = append1[2]; + w3[1] = append1[3]; break; - case 120: - w[30] = 0x80; + case 25: + w1[2] = w1[2] | append0[0] << 8; + w1[3] = append0[0] >> 24 | append0[1] << 8; + w2[0] = append0[1] >> 24 | append0[2] << 8; + w2[1] = append0[2] >> 24 | append0[3] << 8; + w2[2] = append0[3] >> 24 | append1[0] << 8; + w2[3] = append1[0] >> 24 | append1[1] << 8; + w3[0] = append1[1] >> 24 | append1[2] << 8; + w3[1] = append1[2] >> 24 | append1[3] << 8; break; - case 121: - w[30] = w[30] | 0x8000; + case 26: + w1[2] = w1[2] | append0[0] << 16; + w1[3] = append0[0] >> 16 | append0[1] << 16; + w2[0] = append0[1] >> 16 | append0[2] << 16; + w2[1] = append0[2] >> 16 | append0[3] << 16; + w2[2] = append0[3] >> 16 | append1[0] << 16; + w2[3] = append1[0] >> 16 | append1[1] << 16; + w3[0] = append1[1] >> 16 | append1[2] << 16; + w3[1] = append1[2] >> 16 | append1[3] << 16; break; - case 122: - w[30] = w[30] | 0x800000; + case 27: + w1[2] = w1[2] | append0[0] << 24; + w1[3] = append0[0] >> 8 | append0[1] << 24; + w2[0] = append0[1] >> 8 | append0[2] << 24; + w2[1] = append0[2] >> 8 | append0[3] << 24; + w2[2] = append0[3] >> 8 | append1[0] << 24; + w2[3] = append1[0] >> 8 | append1[1] << 24; + w3[0] = append1[1] >> 8 | append1[2] << 24; + w3[1] = append1[2] >> 8 | append1[3] << 24; break; - case 123: - w[30] = w[30] | 0x80000000; + case 28: + w1[3] = append0[0]; + w2[0] = append0[1]; + w2[1] = append0[2]; + w2[2] = append0[3]; + w2[3] = append1[0]; + w3[0] = append1[1]; + w3[1] = append1[2]; break; - case 124: - w[31] = 0x80; + case 29: + w1[3] = w1[3] | append0[0] << 8; + w2[0] = append0[0] >> 24 | append0[1] << 8; + w2[1] = append0[1] >> 24 | append0[2] << 8; + w2[2] = append0[2] >> 24 | append0[3] << 8; + w2[3] = append0[3] >> 24 | append1[0] << 8; + w3[0] = append1[0] >> 24 | append1[1] << 8; + w3[1] = append1[1] >> 24 | append1[2] << 8; break; - case 125: - 
w[31] = w[31] | 0x8000; + case 30: + w1[3] = w1[3] | append0[0] << 16; + w2[0] = append0[0] >> 16 | append0[1] << 16; + w2[1] = append0[1] >> 16 | append0[2] << 16; + w2[2] = append0[2] >> 16 | append0[3] << 16; + w2[3] = append0[3] >> 16 | append1[0] << 16; + w3[0] = append1[0] >> 16 | append1[1] << 16; + w3[1] = append1[1] >> 16 | append1[2] << 16; break; - case 126: - w[31] = w[31] | 0x800000; + case 31: + w1[3] = w1[3] | append0[0] << 24; + w2[0] = append0[0] >> 8 | append0[1] << 24; + w2[1] = append0[1] >> 8 | append0[2] << 24; + w2[2] = append0[2] >> 8 | append0[3] << 24; + w2[3] = append0[3] >> 8 | append1[0] << 24; + w3[0] = append1[0] >> 8 | append1[1] << 24; + w3[1] = append1[1] >> 8 | append1[2] << 24; break; - case 127: - w[31] = w[31] | 0x80000000; + case 32: + w2[0] = append0[0]; + w2[1] = append0[1]; + w2[2] = append0[2]; + w2[3] = append0[3]; + w3[0] = append1[0]; + w3[1] = append1[1]; break; } } + */ diff --git a/OpenCL/gpu_aes256_amd.c b/OpenCL/gpu_aes256_amd.c index ecb6d13..1127165 100644 --- a/OpenCL/gpu_aes256_amd.c +++ b/OpenCL/gpu_aes256_amd.c @@ -784,14 +784,14 @@ static void aes256_set_encrypt_key (u32 *ks, const u32 *ukey) { u32 ukey_s[8]; - ukey_s[0] = swap_workaround (ukey[0]); - ukey_s[1] = swap_workaround (ukey[1]); - ukey_s[2] = swap_workaround (ukey[2]); - ukey_s[3] = swap_workaround (ukey[3]); - ukey_s[4] = swap_workaround (ukey[4]); - ukey_s[5] = swap_workaround (ukey[5]); - ukey_s[6] = swap_workaround (ukey[6]); - ukey_s[7] = swap_workaround (ukey[7]); + ukey_s[0] = swap32 (ukey[0]); + ukey_s[1] = swap32 (ukey[1]); + ukey_s[2] = swap32 (ukey[2]); + ukey_s[3] = swap32 (ukey[3]); + ukey_s[4] = swap32 (ukey[4]); + ukey_s[5] = swap32 (ukey[5]); + ukey_s[6] = swap32 (ukey[6]); + ukey_s[7] = swap32 (ukey[7]); aes256_ExpandKey (ks, ukey_s); } @@ -800,14 +800,14 @@ static void aes256_set_decrypt_key (u32 *ks, const u32 *ukey) { u32 ukey_s[8]; - ukey_s[0] = swap_workaround (ukey[0]); - ukey_s[1] = swap_workaround (ukey[1]); - ukey_s[2] = 
swap_workaround (ukey[2]); - ukey_s[3] = swap_workaround (ukey[3]); - ukey_s[4] = swap_workaround (ukey[4]); - ukey_s[5] = swap_workaround (ukey[5]); - ukey_s[6] = swap_workaround (ukey[6]); - ukey_s[7] = swap_workaround (ukey[7]); + ukey_s[0] = swap32 (ukey[0]); + ukey_s[1] = swap32 (ukey[1]); + ukey_s[2] = swap32 (ukey[2]); + ukey_s[3] = swap32 (ukey[3]); + ukey_s[4] = swap32 (ukey[4]); + ukey_s[5] = swap32 (ukey[5]); + ukey_s[6] = swap32 (ukey[6]); + ukey_s[7] = swap32 (ukey[7]); aes256_ExpandKey (ks, ukey_s); @@ -818,10 +818,10 @@ static void aes256_decrypt (const u32 *ks, const u32 *in, u32 *out) { u32 in_s[4]; - in_s[0] = swap_workaround (in[0]); - in_s[1] = swap_workaround (in[1]); - in_s[2] = swap_workaround (in[2]); - in_s[3] = swap_workaround (in[3]); + in_s[0] = swap32 (in[0]); + in_s[1] = swap32 (in[1]); + in_s[2] = swap32 (in[2]); + in_s[3] = swap32 (in[3]); u32 s0 = in_s[0] ^ ks[0]; u32 s1 = in_s[1] ^ ks[1]; @@ -910,20 +910,20 @@ static void aes256_decrypt (const u32 *ks, const u32 *in, u32 *out) ^ (td4[(t0 >> 0) & 0xff] & 0x000000ff) ^ ks[59]; - out[0] = swap_workaround (out[0]); - out[1] = swap_workaround (out[1]); - out[2] = swap_workaround (out[2]); - out[3] = swap_workaround (out[3]); + out[0] = swap32 (out[0]); + out[1] = swap32 (out[1]); + out[2] = swap32 (out[2]); + out[3] = swap32 (out[3]); } static void aes256_encrypt (const u32 *ks, const u32 *in, u32 *out) { u32 in_s[4]; - in_s[0] = swap_workaround (in[0]); - in_s[1] = swap_workaround (in[1]); - in_s[2] = swap_workaround (in[2]); - in_s[3] = swap_workaround (in[3]); + in_s[0] = swap32 (in[0]); + in_s[1] = swap32 (in[1]); + in_s[2] = swap32 (in[2]); + in_s[3] = swap32 (in[3]); u32 s0 = in_s[0] ^ ks[0]; u32 s1 = in_s[1] ^ ks[1]; @@ -1012,10 +1012,10 @@ static void aes256_encrypt (const u32 *ks, const u32 *in, u32 *out) ^ (te4[(t2 >> 0) & 0xff] & 0x000000ff) ^ ks[59]; - out[0] = swap_workaround (out[0]); - out[1] = swap_workaround (out[1]); - out[2] = swap_workaround (out[2]); - out[3] = 
swap_workaround (out[3]); + out[0] = swap32 (out[0]); + out[1] = swap32 (out[1]); + out[2] = swap32 (out[2]); + out[3] = swap32 (out[3]); } static void aes256_decrypt_xts (const u32 *ukey1, const u32 *ukey2, const u32 *in, u32 *out) diff --git a/OpenCL/m00100_a0.cl b/OpenCL/m00100_a0.cl index d280fc7..69b3b6f 100644 --- a/OpenCL/m00100_a0.cl +++ b/OpenCL/m00100_a0.cl @@ -96,20 +96,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; @@ -325,20 +325,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t 
= swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; diff --git a/OpenCL/m00100_a1.cl b/OpenCL/m00100_a1.cl index 383d7e9..0ad1c17 100644 --- a/OpenCL/m00100_a1.cl +++ b/OpenCL/m00100_a1.cl @@ -150,20 +150,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -435,20 +435,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); 
- u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; diff --git a/OpenCL/m00110_a0.cl b/OpenCL/m00110_a0.cl index 5e88b61..7621c5f 100644 --- a/OpenCL/m00110_a0.cl +++ b/OpenCL/m00110_a0.cl @@ -172,20 +172,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00110_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 
wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; @@ -477,20 +477,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00110_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; diff --git a/OpenCL/m00110_a1.cl b/OpenCL/m00110_a1.cl index b28b13b..ce73e43 100644 --- a/OpenCL/m00110_a1.cl +++ b/OpenCL/m00110_a1.cl @@ -204,20 +204,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00110_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + 
u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; @@ -543,20 +543,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00110_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; diff --git a/OpenCL/m00110_a3.cl b/OpenCL/m00110_a3.cl index e62def9..e65f382 100644 --- a/OpenCL/m00110_a3.cl +++ b/OpenCL/m00110_a3.cl @@ -63,22 +63,22 @@ static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global g switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap_workaround (salt_buf0[0]); - w[ 1] |= swap_workaround (salt_buf0[1]); - w[ 2] |= swap_workaround (salt_buf0[2]); - w[ 3] |= swap_workaround (salt_buf0[3]); - 
w[ 4] |= swap_workaround (salt_buf1[0]); - w[ 5] |= swap_workaround (salt_buf1[1]); - w[ 6] |= swap_workaround (salt_buf1[2]); - w[ 7] |= swap_workaround (salt_buf1[3]); - w[ 8] |= swap_workaround (salt_buf2[0]); - w[ 9] |= swap_workaround (salt_buf2[1]); - w[10] |= swap_workaround (salt_buf2[2]); - w[11] |= swap_workaround (salt_buf2[3]); - w[12] |= swap_workaround (salt_buf3[0]); - w[13] |= swap_workaround (salt_buf3[1]); - w[14] |= swap_workaround (salt_buf3[2]); - w[15] |= swap_workaround (salt_buf3[3]); + w[ 0] |= swap32 (salt_buf0[0]); + w[ 1] |= swap32 (salt_buf0[1]); + w[ 2] |= swap32 (salt_buf0[2]); + w[ 3] |= swap32 (salt_buf0[3]); + w[ 4] |= swap32 (salt_buf1[0]); + w[ 5] |= swap32 (salt_buf1[1]); + w[ 6] |= swap32 (salt_buf1[2]); + w[ 7] |= swap32 (salt_buf1[3]); + w[ 8] |= swap32 (salt_buf2[0]); + w[ 9] |= swap32 (salt_buf2[1]); + w[10] |= swap32 (salt_buf2[2]); + w[11] |= swap32 (salt_buf2[3]); + w[12] |= swap32 (salt_buf3[0]); + w[13] |= swap32 (salt_buf3[1]); + w[14] |= swap32 (salt_buf3[2]); + w[15] |= swap32 (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m00120_a0.cl b/OpenCL/m00120_a0.cl index 2372f6f..1cfb69a 100644 --- a/OpenCL/m00120_a0.cl +++ b/OpenCL/m00120_a0.cl @@ -151,22 +151,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00120_m04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = 
swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; @@ -435,22 +435,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00120_s04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; diff --git a/OpenCL/m00120_a1.cl b/OpenCL/m00120_a1.cl index b7172e0..fbaad38 100644 --- a/OpenCL/m00120_a1.cl +++ b/OpenCL/m00120_a1.cl @@ -197,22 +197,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00120_m04 
(__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; @@ -529,22 +529,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00120_s04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + 
w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; diff --git a/OpenCL/m00120_a3.cl b/OpenCL/m00120_a3.cl index 593a98b..ce2c64b 100644 --- a/OpenCL/m00120_a3.cl +++ b/OpenCL/m00120_a3.cl @@ -169,22 +169,22 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); - w3_t[2] = swap_workaround (w3[2]); - w3_t[3] = swap_workaround (w3[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); + w3_t[2] = swap32 (w3[2]); + w3_t[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -205,22 +205,22 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= salt_buf3[2]; w3_t[3] |= salt_buf3[3]; - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = 
swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - w3_t[2] = swap_workaround (w3_t[2]); - w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + w3_t[2] = swap32 (w3_t[2]); + w3_t[3] = swap32 (w3_t[3]); /** * loop @@ -464,22 +464,22 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); - w3_t[2] = swap_workaround (w3[2]); - w3_t[3] = swap_workaround (w3[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 
(w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); + w3_t[2] = swap32 (w3[2]); + w3_t[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -500,22 +500,22 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= salt_buf3[2]; w3_t[3] |= salt_buf3[3]; - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - w3_t[2] = swap_workaround (w3_t[2]); - w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + w3_t[2] = swap32 (w3_t[2]); + w3_t[3] = swap32 (w3_t[3]); /** * loop diff --git a/OpenCL/m00130_a0.cl b/OpenCL/m00130_a0.cl index ade4da7..49fad8a 100644 --- a/OpenCL/m00130_a0.cl +++ b/OpenCL/m00130_a0.cl @@ -181,22 +181,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00130_m04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround 
(w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; @@ -493,22 +493,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00130_s04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 
(w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; diff --git a/OpenCL/m00130_a1.cl b/OpenCL/m00130_a1.cl index 12a642d..d7d7668 100644 --- a/OpenCL/m00130_a1.cl +++ b/OpenCL/m00130_a1.cl @@ -227,22 +227,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00130_m04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; @@ -587,22 +587,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00130_s04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround 
(w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; diff --git a/OpenCL/m00130_a3.cl b/OpenCL/m00130_a3.cl index 9847b7e..b51bbac 100644 --- a/OpenCL/m00130_a3.cl +++ b/OpenCL/m00130_a3.cl @@ -63,22 +63,22 @@ static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global g switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap_workaround (salt_buf0[0]); - w[ 1] |= swap_workaround (salt_buf0[1]); - w[ 2] |= swap_workaround (salt_buf0[2]); - w[ 3] |= swap_workaround (salt_buf0[3]); - w[ 4] |= swap_workaround (salt_buf1[0]); - w[ 5] |= swap_workaround (salt_buf1[1]); - w[ 6] |= swap_workaround (salt_buf1[2]); - w[ 7] |= swap_workaround (salt_buf1[3]); - w[ 8] |= swap_workaround (salt_buf2[0]); - w[ 9] |= swap_workaround (salt_buf2[1]); - w[10] |= swap_workaround (salt_buf2[2]); - w[11] |= swap_workaround (salt_buf2[3]); - w[12] |= swap_workaround (salt_buf3[0]); - w[13] |= swap_workaround (salt_buf3[1]); - w[14] |= swap_workaround (salt_buf3[2]); - w[15] |= swap_workaround (salt_buf3[3]); + w[ 0] |= swap32 (salt_buf0[0]); + w[ 1] |= swap32 (salt_buf0[1]); + w[ 2] |= swap32 (salt_buf0[2]); + w[ 3] 
|= swap32 (salt_buf0[3]); + w[ 4] |= swap32 (salt_buf1[0]); + w[ 5] |= swap32 (salt_buf1[1]); + w[ 6] |= swap32 (salt_buf1[2]); + w[ 7] |= swap32 (salt_buf1[3]); + w[ 8] |= swap32 (salt_buf2[0]); + w[ 9] |= swap32 (salt_buf2[1]); + w[10] |= swap32 (salt_buf2[2]); + w[11] |= swap32 (salt_buf2[3]); + w[12] |= swap32 (salt_buf3[0]); + w[13] |= swap32 (salt_buf3[1]); + w[14] |= swap32 (salt_buf3[2]); + w[15] |= swap32 (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m00140_a0.cl b/OpenCL/m00140_a0.cl index 90b368d..e745ddf 100644 --- a/OpenCL/m00140_a0.cl +++ b/OpenCL/m00140_a0.cl @@ -145,22 +145,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00140_m04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; @@ -421,22 +421,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00140_s04 (__glo * sha1 */ - 
w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; diff --git a/OpenCL/m00140_a1.cl b/OpenCL/m00140_a1.cl index 3b297d5..f056a16 100644 --- a/OpenCL/m00140_a1.cl +++ b/OpenCL/m00140_a1.cl @@ -191,22 +191,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00140_m04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = 
swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; @@ -515,22 +515,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00140_s04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; diff --git a/OpenCL/m00140_a3.cl b/OpenCL/m00140_a3.cl index 48489ef..6d0b68d 100644 --- a/OpenCL/m00140_a3.cl +++ b/OpenCL/m00140_a3.cl @@ -169,22 +169,22 @@ static void m00140m (u32 w0[4], u32 
w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); - w3_t[2] = swap_workaround (w3[2]); - w3_t[3] = swap_workaround (w3[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); + w3_t[2] = swap32 (w3[2]); + w3_t[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -205,22 +205,22 @@ static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= salt_buf3[2]; w3_t[3] |= salt_buf3[3]; - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - w3_t[2] = swap_workaround (w3_t[2]); - w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 
(w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + w3_t[2] = swap32 (w3_t[2]); + w3_t[3] = swap32 (w3_t[3]); /** * loop @@ -464,22 +464,22 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); - w3_t[2] = swap_workaround (w3[2]); - w3_t[3] = swap_workaround (w3[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); + w3_t[2] = swap32 (w3[2]); + w3_t[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -500,22 +500,22 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= salt_buf3[2]; w3_t[3] |= salt_buf3[3]; - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = 
swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - w3_t[2] = swap_workaround (w3_t[2]); - w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + w3_t[2] = swap32 (w3_t[2]); + w3_t[3] = swap32 (w3_t[3]); /** * loop diff --git a/OpenCL/m00150_a0.cl b/OpenCL/m00150_a0.cl index 2981602..7524b94 100644 --- a/OpenCL/m00150_a0.cl +++ b/OpenCL/m00150_a0.cl @@ -333,17 +333,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00150_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -364,14 +364,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00150_m04 (__glo hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] 
= swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -508,17 +508,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00150_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -539,14 +539,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00150_s04 (__glo hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = 
swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m00150_a1.cl b/OpenCL/m00150_a1.cl index 3cf3afe..bf785ef 100644 --- a/OpenCL/m00150_a1.cl +++ b/OpenCL/m00150_a1.cl @@ -385,17 +385,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00150_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -416,14 +416,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00150_m04 (__glo hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -614,17 +614,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00150_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 
(w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -645,14 +645,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00150_s04 (__glo hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m00150_a3.cl b/OpenCL/m00150_a3.cl index 18da15a..22c6bde 100644 --- a/OpenCL/m00150_a3.cl +++ b/OpenCL/m00150_a3.cl @@ -315,14 +315,14 @@ static void m00150m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + 
w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -435,14 +435,14 @@ static void m00150s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m00160_a0.cl b/OpenCL/m00160_a0.cl index 53db683..64f686d 100644 --- a/OpenCL/m00160_a0.cl +++ b/OpenCL/m00160_a0.cl @@ -295,17 +295,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -364,14 +364,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_m04 (__glo append_0x80_2x4 
(w0, w1, out_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -458,17 +458,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -539,14 +539,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_s04 (__glo append_0x80_2x4 (w0, w1, out_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = 
swap32 (w1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m00160_a1.cl b/OpenCL/m00160_a1.cl index 8b3cdda..bd58f48 100644 --- a/OpenCL/m00160_a1.cl +++ b/OpenCL/m00160_a1.cl @@ -312,17 +312,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -416,20 +416,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_m04 (__glo append_0x80_4x4 (w0, w1, w2, w3, pw_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + 
w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = (64 + pw_len) * 8; @@ -529,17 +529,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -645,20 +645,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_s04 (__glo append_0x80_4x4 (w0, w1, w2, w3, pw_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = (64 + pw_len) * 8; diff --git a/OpenCL/m00160_a3.cl b/OpenCL/m00160_a3.cl index 
3fcb174..0c0b414 100644 --- a/OpenCL/m00160_a3.cl +++ b/OpenCL/m00160_a3.cl @@ -270,17 +270,17 @@ static void m00160m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -376,17 +376,17 @@ static void m00160s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; diff --git a/OpenCL/m00190_a0.cl b/OpenCL/m00190_a0.cl index bdfe91d..e0111fc 100644 --- a/OpenCL/m00190_a0.cl +++ b/OpenCL/m00190_a0.cl @@ -96,20 +96,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = 
swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; @@ -336,20 +336,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; diff --git a/OpenCL/m00190_a1.cl b/OpenCL/m00190_a1.cl index 
0918054..4eb4181 100644 --- a/OpenCL/m00190_a1.cl +++ b/OpenCL/m00190_a1.cl @@ -150,20 +150,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -446,20 +446,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = 
swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; diff --git a/OpenCL/m00300_a0.cl b/OpenCL/m00300_a0.cl index cdc7017..ce69731 100644 --- a/OpenCL/m00300_a0.cl +++ b/OpenCL/m00300_a0.cl @@ -96,20 +96,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; @@ -450,20 +450,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = 
swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; diff --git a/OpenCL/m00300_a1.cl b/OpenCL/m00300_a1.cl index 21635be..93f2501 100644 --- a/OpenCL/m00300_a1.cl +++ b/OpenCL/m00300_a1.cl @@ -150,20 +150,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -560,20 +560,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - 
u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; diff --git a/OpenCL/m01400_a0.cl b/OpenCL/m01400_a0.cl index fd768e5..865795d 100644 --- a/OpenCL/m01400_a0.cl +++ b/OpenCL/m01400_a0.cl @@ -96,14 +96,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_m04 (__glo * SHA256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); u32 w8_t = 0; u32 w9_t = 0; u32 wa_t = 0; @@ -294,14 +294,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_s04 (__glo * SHA256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 
w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); u32 w8_t = 0; u32 w9_t = 0; u32 wa_t = 0; diff --git a/OpenCL/m01400_a1.cl b/OpenCL/m01400_a1.cl index 2facff5..d11d389 100644 --- a/OpenCL/m01400_a1.cl +++ b/OpenCL/m01400_a1.cl @@ -144,20 +144,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_m04 (__glo * SHA256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -392,20 +392,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_s04 (__glo * SHA256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 
w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; diff --git a/OpenCL/m01410_a0.cl b/OpenCL/m01410_a0.cl index 9bd5688..b559da6 100644 --- a/OpenCL/m01410_a0.cl +++ b/OpenCL/m01410_a0.cl @@ -172,20 +172,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01410_m04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_salt_len * 8; @@ -446,20 +446,20 @@ 
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01410_s04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_salt_len * 8; diff --git a/OpenCL/m01410_a1.cl b/OpenCL/m01410_a1.cl index 3a4861c..fc54f7c4 100644 --- a/OpenCL/m01410_a1.cl +++ b/OpenCL/m01410_a1.cl @@ -198,20 +198,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01410_m04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 
(w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; @@ -500,20 +500,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01410_s04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; diff --git a/OpenCL/m01410_a3.cl b/OpenCL/m01410_a3.cl index c18442e..7e7caec 100644 --- a/OpenCL/m01410_a3.cl +++ b/OpenCL/m01410_a3.cl @@ -63,22 +63,22 @@ static void m01410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global g switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap_workaround (salt_buf0[0]); - w[ 1] |= swap_workaround (salt_buf0[1]); - w[ 2] |= swap_workaround (salt_buf0[2]); - w[ 3] |= swap_workaround (salt_buf0[3]); - w[ 4] |= swap_workaround (salt_buf1[0]); - w[ 5] |= swap_workaround (salt_buf1[1]); - w[ 6] |= 
swap_workaround (salt_buf1[2]); - w[ 7] |= swap_workaround (salt_buf1[3]); - w[ 8] |= swap_workaround (salt_buf2[0]); - w[ 9] |= swap_workaround (salt_buf2[1]); - w[10] |= swap_workaround (salt_buf2[2]); - w[11] |= swap_workaround (salt_buf2[3]); - w[12] |= swap_workaround (salt_buf3[0]); - w[13] |= swap_workaround (salt_buf3[1]); - w[14] |= swap_workaround (salt_buf3[2]); - w[15] |= swap_workaround (salt_buf3[3]); + w[ 0] |= swap32 (salt_buf0[0]); + w[ 1] |= swap32 (salt_buf0[1]); + w[ 2] |= swap32 (salt_buf0[2]); + w[ 3] |= swap32 (salt_buf0[3]); + w[ 4] |= swap32 (salt_buf1[0]); + w[ 5] |= swap32 (salt_buf1[1]); + w[ 6] |= swap32 (salt_buf1[2]); + w[ 7] |= swap32 (salt_buf1[3]); + w[ 8] |= swap32 (salt_buf2[0]); + w[ 9] |= swap32 (salt_buf2[1]); + w[10] |= swap32 (salt_buf2[2]); + w[11] |= swap32 (salt_buf2[3]); + w[12] |= swap32 (salt_buf3[0]); + w[13] |= swap32 (salt_buf3[1]); + w[14] |= swap32 (salt_buf3[2]); + w[15] |= swap32 (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m01420_a0.cl b/OpenCL/m01420_a0.cl index b513b1b..6d1c1e5 100644 --- a/OpenCL/m01420_a0.cl +++ b/OpenCL/m01420_a0.cl @@ -133,20 +133,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01420_m04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 
(w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_salt_len * 8; @@ -368,20 +368,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01420_s04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_salt_len * 8; diff --git a/OpenCL/m01420_a1.cl b/OpenCL/m01420_a1.cl index a05f22c..7c37b77 100644 --- a/OpenCL/m01420_a1.cl +++ b/OpenCL/m01420_a1.cl @@ -173,20 +173,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01420_m04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = 
swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; @@ -450,20 +450,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01420_s04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; diff --git a/OpenCL/m01420_a3.cl b/OpenCL/m01420_a3.cl index 33ac15f..5cbce06 100644 --- a/OpenCL/m01420_a3.cl +++ b/OpenCL/m01420_a3.cl @@ -86,22 +86,22 @@ static void m01420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t2[4]; u32 w3_t2[4]; - w0_t2[0] = swap_workaround (w0[0]); - w0_t2[1] = 
swap_workaround (w0[1]); - w0_t2[2] = swap_workaround (w0[2]); - w0_t2[3] = swap_workaround (w0[3]); - w1_t2[0] = swap_workaround (w1[0]); - w1_t2[1] = swap_workaround (w1[1]); - w1_t2[2] = swap_workaround (w1[2]); - w1_t2[3] = swap_workaround (w1[3]); - w2_t2[0] = swap_workaround (w2[0]); - w2_t2[1] = swap_workaround (w2[1]); - w2_t2[2] = swap_workaround (w2[2]); - w2_t2[3] = swap_workaround (w2[3]); - w3_t2[0] = swap_workaround (w3[0]); - w3_t2[1] = swap_workaround (w3[1]); - w3_t2[2] = swap_workaround (w3[2]); - w3_t2[3] = swap_workaround (w3[3]); + w0_t2[0] = swap32 (w0[0]); + w0_t2[1] = swap32 (w0[1]); + w0_t2[2] = swap32 (w0[2]); + w0_t2[3] = swap32 (w0[3]); + w1_t2[0] = swap32 (w1[0]); + w1_t2[1] = swap32 (w1[1]); + w1_t2[2] = swap32 (w1[2]); + w1_t2[3] = swap32 (w1[3]); + w2_t2[0] = swap32 (w2[0]); + w2_t2[1] = swap32 (w2[1]); + w2_t2[2] = swap32 (w2[2]); + w2_t2[3] = swap32 (w2[3]); + w3_t2[0] = swap32 (w3[0]); + w3_t2[1] = swap32 (w3[1]); + w3_t2[2] = swap32 (w3[2]); + w3_t2[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); @@ -126,20 +126,20 @@ static void m01420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + 
u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; @@ -308,22 +308,22 @@ static void m01420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t2[4]; u32 w3_t2[4]; - w0_t2[0] = swap_workaround (w0[0]); - w0_t2[1] = swap_workaround (w0[1]); - w0_t2[2] = swap_workaround (w0[2]); - w0_t2[3] = swap_workaround (w0[3]); - w1_t2[0] = swap_workaround (w1[0]); - w1_t2[1] = swap_workaround (w1[1]); - w1_t2[2] = swap_workaround (w1[2]); - w1_t2[3] = swap_workaround (w1[3]); - w2_t2[0] = swap_workaround (w2[0]); - w2_t2[1] = swap_workaround (w2[1]); - w2_t2[2] = swap_workaround (w2[2]); - w2_t2[3] = swap_workaround (w2[3]); - w3_t2[0] = swap_workaround (w3[0]); - w3_t2[1] = swap_workaround (w3[1]); - w3_t2[2] = swap_workaround (w3[2]); - w3_t2[3] = swap_workaround (w3[3]); + w0_t2[0] = swap32 (w0[0]); + w0_t2[1] = swap32 (w0[1]); + w0_t2[2] = swap32 (w0[2]); + w0_t2[3] = swap32 (w0[3]); + w1_t2[0] = swap32 (w1[0]); + w1_t2[1] = swap32 (w1[1]); + w1_t2[2] = swap32 (w1[2]); + w1_t2[3] = swap32 (w1[3]); + w2_t2[0] = swap32 (w2[0]); + w2_t2[1] = swap32 (w2[1]); + w2_t2[2] = swap32 (w2[2]); + w2_t2[3] = swap32 (w2[3]); + w3_t2[0] = swap32 (w3[0]); + w3_t2[1] = swap32 (w3[1]); + w3_t2[2] = swap32 (w3[2]); + w3_t2[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); @@ -348,20 +348,20 @@ static void m01420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = 
swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; diff --git a/OpenCL/m01430_a0.cl b/OpenCL/m01430_a0.cl index 32912d7..0f445a6 100644 --- a/OpenCL/m01430_a0.cl +++ b/OpenCL/m01430_a0.cl @@ -177,20 +177,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01430_m04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); 
u32 we_t = 0; u32 wf_t = out_salt_len * 8; @@ -456,20 +456,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01430_s04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = out_salt_len * 8; diff --git a/OpenCL/m01430_a1.cl b/OpenCL/m01430_a1.cl index dee0372..71b8d67 100644 --- a/OpenCL/m01430_a1.cl +++ b/OpenCL/m01430_a1.cl @@ -217,20 +217,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01430_m04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround 
(w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; @@ -538,20 +538,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01430_s04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; diff --git a/OpenCL/m01430_a3.cl b/OpenCL/m01430_a3.cl index 807993f..40a3873 100644 --- a/OpenCL/m01430_a3.cl +++ b/OpenCL/m01430_a3.cl @@ -63,22 +63,22 @@ static void m01430m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global g switch_buffer_by_offset (salt_buf0, 
salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap_workaround (salt_buf0[0]); - w[ 1] |= swap_workaround (salt_buf0[1]); - w[ 2] |= swap_workaround (salt_buf0[2]); - w[ 3] |= swap_workaround (salt_buf0[3]); - w[ 4] |= swap_workaround (salt_buf1[0]); - w[ 5] |= swap_workaround (salt_buf1[1]); - w[ 6] |= swap_workaround (salt_buf1[2]); - w[ 7] |= swap_workaround (salt_buf1[3]); - w[ 8] |= swap_workaround (salt_buf2[0]); - w[ 9] |= swap_workaround (salt_buf2[1]); - w[10] |= swap_workaround (salt_buf2[2]); - w[11] |= swap_workaround (salt_buf2[3]); - w[12] |= swap_workaround (salt_buf3[0]); - w[13] |= swap_workaround (salt_buf3[1]); - w[14] |= swap_workaround (salt_buf3[2]); - w[15] |= swap_workaround (salt_buf3[3]); + w[ 0] |= swap32 (salt_buf0[0]); + w[ 1] |= swap32 (salt_buf0[1]); + w[ 2] |= swap32 (salt_buf0[2]); + w[ 3] |= swap32 (salt_buf0[3]); + w[ 4] |= swap32 (salt_buf1[0]); + w[ 5] |= swap32 (salt_buf1[1]); + w[ 6] |= swap32 (salt_buf1[2]); + w[ 7] |= swap32 (salt_buf1[3]); + w[ 8] |= swap32 (salt_buf2[0]); + w[ 9] |= swap32 (salt_buf2[1]); + w[10] |= swap32 (salt_buf2[2]); + w[11] |= swap32 (salt_buf2[3]); + w[12] |= swap32 (salt_buf3[0]); + w[13] |= swap32 (salt_buf3[1]); + w[14] |= swap32 (salt_buf3[2]); + w[15] |= swap32 (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m01440_a0.cl b/OpenCL/m01440_a0.cl index dd9dfa7..9e211f7 100644 --- a/OpenCL/m01440_a0.cl +++ b/OpenCL/m01440_a0.cl @@ -135,20 +135,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01440_m04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = 
swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = out_salt_len * 8; @@ -372,20 +372,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01440_s04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = out_salt_len * 8; diff --git a/OpenCL/m01440_a1.cl b/OpenCL/m01440_a1.cl index f4c93c8..77c4df4 100644 --- a/OpenCL/m01440_a1.cl +++ b/OpenCL/m01440_a1.cl @@ -181,20 +181,20 @@ __kernel void 
__attribute__((reqd_work_group_size (64, 1, 1))) m01440_m04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; @@ -466,20 +466,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01440_s04 (__glo * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 
w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; diff --git a/OpenCL/m01440_a3.cl b/OpenCL/m01440_a3.cl index 3756b4e..bbec4bb 100644 --- a/OpenCL/m01440_a3.cl +++ b/OpenCL/m01440_a3.cl @@ -86,22 +86,22 @@ static void m01440m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t2[4]; u32 w3_t2[4]; - w0_t2[0] = swap_workaround (w0[0]); - w0_t2[1] = swap_workaround (w0[1]); - w0_t2[2] = swap_workaround (w0[2]); - w0_t2[3] = swap_workaround (w0[3]); - w1_t2[0] = swap_workaround (w1[0]); - w1_t2[1] = swap_workaround (w1[1]); - w1_t2[2] = swap_workaround (w1[2]); - w1_t2[3] = swap_workaround (w1[3]); - w2_t2[0] = swap_workaround (w2[0]); - w2_t2[1] = swap_workaround (w2[1]); - w2_t2[2] = swap_workaround (w2[2]); - w2_t2[3] = swap_workaround (w2[3]); - w3_t2[0] = swap_workaround (w3[0]); - w3_t2[1] = swap_workaround (w3[1]); - w3_t2[2] = swap_workaround (w3[2]); - w3_t2[3] = swap_workaround (w3[3]); + w0_t2[0] = swap32 (w0[0]); + w0_t2[1] = swap32 (w0[1]); + w0_t2[2] = swap32 (w0[2]); + w0_t2[3] = swap32 (w0[3]); + w1_t2[0] = swap32 (w1[0]); + w1_t2[1] = swap32 (w1[1]); + w1_t2[2] = swap32 (w1[2]); + w1_t2[3] = swap32 (w1[3]); + w2_t2[0] = swap32 (w2[0]); + w2_t2[1] = swap32 (w2[1]); + w2_t2[2] = swap32 (w2[2]); + w2_t2[3] = swap32 (w2[3]); + w3_t2[0] = swap32 (w3[0]); + w3_t2[1] = swap32 (w3[1]); + w3_t2[2] = swap32 (w3[2]); + w3_t2[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); @@ -126,20 +126,20 @@ static void m01440m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround 
(w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; @@ -308,22 +308,22 @@ static void m01440s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t2[4]; u32 w3_t2[4]; - w0_t2[0] = swap_workaround (w0[0]); - w0_t2[1] = swap_workaround (w0[1]); - w0_t2[2] = swap_workaround (w0[2]); - w0_t2[3] = swap_workaround (w0[3]); - w1_t2[0] = swap_workaround (w1[0]); - w1_t2[1] = swap_workaround (w1[1]); - w1_t2[2] = swap_workaround (w1[2]); - w1_t2[3] = swap_workaround (w1[3]); - w2_t2[0] = swap_workaround (w2[0]); - w2_t2[1] = swap_workaround (w2[1]); - w2_t2[2] = swap_workaround (w2[2]); - w2_t2[3] = swap_workaround (w2[3]); - w3_t2[0] = swap_workaround (w3[0]); - w3_t2[1] = swap_workaround (w3[1]); - w3_t2[2] = swap_workaround (w3[2]); - w3_t2[3] = swap_workaround (w3[3]); + w0_t2[0] = swap32 (w0[0]); + w0_t2[1] = swap32 (w0[1]); + w0_t2[2] = swap32 (w0[2]); + w0_t2[3] = swap32 (w0[3]); + w1_t2[0] = swap32 (w1[0]); + w1_t2[1] = swap32 (w1[1]); + w1_t2[2] = swap32 (w1[2]); + w1_t2[3] = swap32 (w1[3]); + w2_t2[0] = swap32 (w2[0]); + w2_t2[1] = swap32 (w2[1]); + w2_t2[2] = swap32 (w2[2]); + w2_t2[3] = swap32 (w2[3]); 
+ w3_t2[0] = swap32 (w3[0]); + w3_t2[1] = swap32 (w3[1]); + w3_t2[2] = swap32 (w3[2]); + w3_t2[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); @@ -348,20 +348,20 @@ static void m01440s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha256 */ - u32 w0_t = swap_workaround (w0_t2[0]); - u32 w1_t = swap_workaround (w0_t2[1]); - u32 w2_t = swap_workaround (w0_t2[2]); - u32 w3_t = swap_workaround (w0_t2[3]); - u32 w4_t = swap_workaround (w1_t2[0]); - u32 w5_t = swap_workaround (w1_t2[1]); - u32 w6_t = swap_workaround (w1_t2[2]); - u32 w7_t = swap_workaround (w1_t2[3]); - u32 w8_t = swap_workaround (w2_t2[0]); - u32 w9_t = swap_workaround (w2_t2[1]); - u32 wa_t = swap_workaround (w2_t2[2]); - u32 wb_t = swap_workaround (w2_t2[3]); - u32 wc_t = swap_workaround (w3_t2[0]); - u32 wd_t = swap_workaround (w3_t2[1]); + u32 w0_t = swap32 (w0_t2[0]); + u32 w1_t = swap32 (w0_t2[1]); + u32 w2_t = swap32 (w0_t2[2]); + u32 w3_t = swap32 (w0_t2[3]); + u32 w4_t = swap32 (w1_t2[0]); + u32 w5_t = swap32 (w1_t2[1]); + u32 w6_t = swap32 (w1_t2[2]); + u32 w7_t = swap32 (w1_t2[3]); + u32 w8_t = swap32 (w2_t2[0]); + u32 w9_t = swap32 (w2_t2[1]); + u32 wa_t = swap32 (w2_t2[2]); + u32 wb_t = swap32 (w2_t2[3]); + u32 wc_t = swap32 (w3_t2[0]); + u32 wd_t = swap32 (w3_t2[1]); u32 we_t = 0; u32 wf_t = pw_salt_len * 8; diff --git a/OpenCL/m01450_a0.cl b/OpenCL/m01450_a0.cl index a18fbe6..fcc420e 100644 --- a/OpenCL/m01450_a0.cl +++ b/OpenCL/m01450_a0.cl @@ -323,17 +323,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01450_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - 
w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -354,14 +354,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01450_m04 (__glo hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -498,17 +498,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01450_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -529,14 +529,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01450_s04 (__glo hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = 
swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m01450_a1.cl b/OpenCL/m01450_a1.cl index 6bbdbe8..7d693da 100644 --- a/OpenCL/m01450_a1.cl +++ b/OpenCL/m01450_a1.cl @@ -375,17 +375,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01450_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -406,14 +406,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01450_m04 (__glo hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 
(salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -604,17 +604,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01450_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -635,14 +635,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01450_s04 (__glo hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m01450_a3.cl b/OpenCL/m01450_a3.cl index a9f623f..3a1af43 100644 --- a/OpenCL/m01450_a3.cl +++ b/OpenCL/m01450_a3.cl @@ -305,14 +305,14 @@ static void m01450m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - 
w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -425,14 +425,14 @@ static void m01450s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m01460_a0.cl b/OpenCL/m01460_a0.cl index 9e0d4f5..e7a2ae1 100644 --- a/OpenCL/m01460_a0.cl +++ b/OpenCL/m01460_a0.cl @@ -285,17 +285,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - 
w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -354,14 +354,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_m04 (__glo append_0x80_2x4 (w0, w1, out_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -448,17 +448,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -529,14 +529,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_s04 (__glo append_0x80_2x4 (w0, w1, out_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = 
swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m01460_a1.cl b/OpenCL/m01460_a1.cl index 8941941..862aec3 100644 --- a/OpenCL/m01460_a1.cl +++ b/OpenCL/m01460_a1.cl @@ -302,17 +302,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -406,20 +406,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_m04 (__glo append_0x80_4x4 (w0, w1, w2, w3, pw_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = 
swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = (64 + pw_len) * 8; @@ -519,17 +519,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -635,20 +635,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_s04 (__glo append_0x80_4x4 (w0, w1, w2, w3, pw_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + 
w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = (64 + pw_len) * 8; diff --git a/OpenCL/m01460_a3.cl b/OpenCL/m01460_a3.cl index 81149df..02c276e 100644 --- a/OpenCL/m01460_a3.cl +++ b/OpenCL/m01460_a3.cl @@ -260,17 +260,17 @@ static void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -366,17 +366,17 @@ static void m01460s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 
(salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; diff --git a/OpenCL/m01600.cl b/OpenCL/m01600.cl index 4b459fb..dbd6393 100644 --- a/OpenCL/m01600.cl +++ b/OpenCL/m01600.cl @@ -17,13 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" #define md5apr1_magic0 0x72706124 #define md5apr1_magic1 0x00002431 diff --git a/OpenCL/m01700_a0.cl b/OpenCL/m01700_a0.cl index 3b26ea9..46818f2 100644 --- a/OpenCL/m01700_a0.cl +++ b/OpenCL/m01700_a0.cl @@ -222,20 +222,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = out_len * 8; @@ -361,20 +361,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround 
(w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = out_len * 8; diff --git a/OpenCL/m01700_a1.cl b/OpenCL/m01700_a1.cl index 46e7470..3428c74 100644 --- a/OpenCL/m01700_a1.cl +++ b/OpenCL/m01700_a1.cl @@ -270,20 +270,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 
(w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_len * 8; @@ -459,20 +459,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_len * 8; diff --git a/OpenCL/m01710_a0.cl b/OpenCL/m01710_a0.cl index 6e56b68..eef489b 100644 --- a/OpenCL/m01710_a0.cl +++ b/OpenCL/m01710_a0.cl @@ -298,20 +298,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01710_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = 
swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = out_salt_len * 8; @@ -513,20 +513,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01710_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = out_salt_len * 8; diff --git a/OpenCL/m01710_a1.cl b/OpenCL/m01710_a1.cl index d3c96fb..bc28ac2 100644 --- a/OpenCL/m01710_a1.cl +++ b/OpenCL/m01710_a1.cl @@ -324,20 +324,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01710_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - 
w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -567,20 +567,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01710_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; diff --git a/OpenCL/m01710_a3.cl b/OpenCL/m01710_a3.cl index 5b79726..0fd026d 100644 --- a/OpenCL/m01710_a3.cl +++ b/OpenCL/m01710_a3.cl @@ -184,22 +184,22 @@ static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, 
__global g switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap_workaround (salt_buf0[0]); - w[ 1] |= swap_workaround (salt_buf0[1]); - w[ 2] |= swap_workaround (salt_buf0[2]); - w[ 3] |= swap_workaround (salt_buf0[3]); - w[ 4] |= swap_workaround (salt_buf1[0]); - w[ 5] |= swap_workaround (salt_buf1[1]); - w[ 6] |= swap_workaround (salt_buf1[2]); - w[ 7] |= swap_workaround (salt_buf1[3]); - w[ 8] |= swap_workaround (salt_buf2[0]); - w[ 9] |= swap_workaround (salt_buf2[1]); - w[10] |= swap_workaround (salt_buf2[2]); - w[11] |= swap_workaround (salt_buf2[3]); - w[12] |= swap_workaround (salt_buf3[0]); - w[13] |= swap_workaround (salt_buf3[1]); - w[14] |= swap_workaround (salt_buf3[2]); - w[15] |= swap_workaround (salt_buf3[3]); + w[ 0] |= swap32 (salt_buf0[0]); + w[ 1] |= swap32 (salt_buf0[1]); + w[ 2] |= swap32 (salt_buf0[2]); + w[ 3] |= swap32 (salt_buf0[3]); + w[ 4] |= swap32 (salt_buf1[0]); + w[ 5] |= swap32 (salt_buf1[1]); + w[ 6] |= swap32 (salt_buf1[2]); + w[ 7] |= swap32 (salt_buf1[3]); + w[ 8] |= swap32 (salt_buf2[0]); + w[ 9] |= swap32 (salt_buf2[1]); + w[10] |= swap32 (salt_buf2[2]); + w[11] |= swap32 (salt_buf2[3]); + w[12] |= swap32 (salt_buf3[0]); + w[13] |= swap32 (salt_buf3[1]); + w[14] |= swap32 (salt_buf3[2]); + w[15] |= swap32 (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m01720_a0.cl b/OpenCL/m01720_a0.cl index b97faea..e714450 100644 --- a/OpenCL/m01720_a0.cl +++ b/OpenCL/m01720_a0.cl @@ -259,20 +259,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01720_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround 
(w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = out_salt_len * 8; @@ -435,20 +435,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01720_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = out_salt_len * 8; diff --git a/OpenCL/m01720_a1.cl b/OpenCL/m01720_a1.cl index c197b5f..13230df 100644 --- a/OpenCL/m01720_a1.cl +++ b/OpenCL/m01720_a1.cl @@ -299,20 +299,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01720_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = 
swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -517,20 +517,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01720_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; diff --git 
a/OpenCL/m01720_a3.cl b/OpenCL/m01720_a3.cl index 2a363d4..ff1b251 100644 --- a/OpenCL/m01720_a3.cl +++ b/OpenCL/m01720_a3.cl @@ -207,22 +207,22 @@ static void m01720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); - w3_t[2] = swap_workaround (w3[2]); - w3_t[3] = swap_workaround (w3[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); + w3_t[2] = swap32 (w3[2]); + w3_t[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -247,22 +247,22 @@ static void m01720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = 
swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u64 digest[8]; @@ -365,22 +365,22 @@ static void m01720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); - w3_t[2] = swap_workaround (w3[2]); - w3_t[3] = swap_workaround (w3[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); + w3_t[2] = swap32 (w3[2]); + w3_t[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -405,22 +405,22 @@ static void m01720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - 
w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u64 digest[8]; diff --git a/OpenCL/m01730_a0.cl b/OpenCL/m01730_a0.cl index 7e95572..cda4754 100644 --- a/OpenCL/m01730_a0.cl +++ b/OpenCL/m01730_a0.cl @@ -298,20 +298,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01730_m04 (__glo * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 
(w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = out_salt_len * 8; @@ -513,20 +513,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01730_s04 (__glo * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = out_salt_len * 8; diff --git a/OpenCL/m01730_a1.cl b/OpenCL/m01730_a1.cl index deb875c..6b419c3 100644 --- a/OpenCL/m01730_a1.cl +++ b/OpenCL/m01730_a1.cl @@ -338,20 +338,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01730_m04 (__glo * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); 
- w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -595,20 +595,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01730_s04 (__glo * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; diff --git a/OpenCL/m01730_a3.cl b/OpenCL/m01730_a3.cl index 6e69c30..98b5b33 100644 --- a/OpenCL/m01730_a3.cl +++ b/OpenCL/m01730_a3.cl @@ 
-184,22 +184,22 @@ static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global g switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap_workaround (salt_buf0[0]); - w[ 1] |= swap_workaround (salt_buf0[1]); - w[ 2] |= swap_workaround (salt_buf0[2]); - w[ 3] |= swap_workaround (salt_buf0[3]); - w[ 4] |= swap_workaround (salt_buf1[0]); - w[ 5] |= swap_workaround (salt_buf1[1]); - w[ 6] |= swap_workaround (salt_buf1[2]); - w[ 7] |= swap_workaround (salt_buf1[3]); - w[ 8] |= swap_workaround (salt_buf2[0]); - w[ 9] |= swap_workaround (salt_buf2[1]); - w[10] |= swap_workaround (salt_buf2[2]); - w[11] |= swap_workaround (salt_buf2[3]); - w[12] |= swap_workaround (salt_buf3[0]); - w[13] |= swap_workaround (salt_buf3[1]); - w[14] |= swap_workaround (salt_buf3[2]); - w[15] |= swap_workaround (salt_buf3[3]); + w[ 0] |= swap32 (salt_buf0[0]); + w[ 1] |= swap32 (salt_buf0[1]); + w[ 2] |= swap32 (salt_buf0[2]); + w[ 3] |= swap32 (salt_buf0[3]); + w[ 4] |= swap32 (salt_buf1[0]); + w[ 5] |= swap32 (salt_buf1[1]); + w[ 6] |= swap32 (salt_buf1[2]); + w[ 7] |= swap32 (salt_buf1[3]); + w[ 8] |= swap32 (salt_buf2[0]); + w[ 9] |= swap32 (salt_buf2[1]); + w[10] |= swap32 (salt_buf2[2]); + w[11] |= swap32 (salt_buf2[3]); + w[12] |= swap32 (salt_buf3[0]); + w[13] |= swap32 (salt_buf3[1]); + w[14] |= swap32 (salt_buf3[2]); + w[15] |= swap32 (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; diff --git a/OpenCL/m01740_a0.cl b/OpenCL/m01740_a0.cl index d272ab9..cc2cab3 100644 --- a/OpenCL/m01740_a0.cl +++ b/OpenCL/m01740_a0.cl @@ -256,20 +256,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01740_m04 (__glo * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = 
swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = out_salt_len * 8; @@ -429,20 +429,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01740_s04 (__glo * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = out_salt_len * 8; diff --git a/OpenCL/m01740_a1.cl b/OpenCL/m01740_a1.cl index e2d25fd..00c2d61 100644 --- a/OpenCL/m01740_a1.cl +++ b/OpenCL/m01740_a1.cl @@ -302,20 +302,20 
@@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01740_m04 (__glo * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -523,20 +523,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01740_s04 (__glo * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 
(w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; diff --git a/OpenCL/m01740_a3.cl b/OpenCL/m01740_a3.cl index 8a53310..6d3a611 100644 --- a/OpenCL/m01740_a3.cl +++ b/OpenCL/m01740_a3.cl @@ -207,22 +207,22 @@ static void m01740m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); - w3_t[2] = swap_workaround (w3[2]); - w3_t[3] = swap_workaround (w3[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); + w3_t[2] = swap32 (w3[2]); + w3_t[3] = swap32 (w3[3]); switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -247,22 +247,22 @@ static void m01740m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = 
swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u64 digest[8]; @@ -365,22 +365,22 @@ static void m01740s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); - w3_t[2] = swap_workaround (w3[2]); - w3_t[3] = swap_workaround (w3[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); + w3_t[2] = swap32 (w3[2]); + w3_t[3] = 
swap32 (w3[3]); switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); @@ -405,22 +405,22 @@ static void m01740s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha512 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u64 digest[8]; diff --git a/OpenCL/m01750_a0.cl b/OpenCL/m01750_a0.cl index d352903..fef7b7a 100644 --- a/OpenCL/m01750_a0.cl +++ b/OpenCL/m01750_a0.cl @@ -354,17 +354,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01750_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + 
w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -385,14 +385,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01750_m04 (__glo hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -530,17 +530,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01750_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -561,14 +561,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01750_s04 (__glo hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] 
= swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m01750_a1.cl b/OpenCL/m01750_a1.cl index c714c3a..211535d 100644 --- a/OpenCL/m01750_a1.cl +++ b/OpenCL/m01750_a1.cl @@ -406,17 +406,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01750_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -437,14 +437,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01750_m04 (__glo hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + 
w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -636,17 +636,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01750_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -667,14 +667,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01750_s04 (__glo hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m01750_a3.cl b/OpenCL/m01750_a3.cl index 1cbe77a..6cf93d4 100644 --- a/OpenCL/m01750_a3.cl +++ b/OpenCL/m01750_a3.cl @@ -336,14 +336,14 @@ static void m01750m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = 
swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -457,14 +457,14 @@ static void m01750s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m01760_a0.cl b/OpenCL/m01760_a0.cl index cd8c777..31bb8d0 100644 --- a/OpenCL/m01760_a0.cl +++ b/OpenCL/m01760_a0.cl @@ -316,17 +316,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = 
swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -385,14 +385,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_m04 (__glo append_0x80_2x4 (w0, w1, out_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; @@ -480,17 +480,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -561,14 +561,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_s04 (__glo append_0x80_2x4 (w0, w1, out_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - 
w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; diff --git a/OpenCL/m01760_a1.cl b/OpenCL/m01760_a1.cl index bd9ee24..6c416f7 100644 --- a/OpenCL/m01760_a1.cl +++ b/OpenCL/m01760_a1.cl @@ -333,17 +333,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -437,20 +437,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_m04 (__glo append_0x80_4x4 (w0, w1, w2, w3, pw_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = 
swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = (128 + pw_len) * 8; @@ -551,17 +551,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -667,20 +667,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_s04 (__glo append_0x80_4x4 (w0, w1, w2, w3, pw_len); - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 
(w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = (128 + pw_len) * 8; diff --git a/OpenCL/m01760_a3.cl b/OpenCL/m01760_a3.cl index 36e6ac6..0cbd3b2 100644 --- a/OpenCL/m01760_a3.cl +++ b/OpenCL/m01760_a3.cl @@ -291,17 +291,17 @@ static void m01760m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); u32 w2_t[4]; @@ -398,17 +398,17 @@ static void m01760s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0_t[4]; - w0_t[0] = swap_workaround (salt_buf0[0]); - w0_t[1] = swap_workaround (salt_buf0[1]); - w0_t[2] = swap_workaround (salt_buf0[2]); - w0_t[3] = swap_workaround (salt_buf0[3]); + w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (salt_buf1[0]); - w1_t[1] = swap_workaround (salt_buf1[1]); - w1_t[2] = swap_workaround (salt_buf1[2]); - w1_t[3] = swap_workaround (salt_buf1[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 
(salt_buf1[3]); u32 w2_t[4]; diff --git a/OpenCL/m01800.cl b/OpenCL/m01800.cl index ae3678f..3089975 100644 --- a/OpenCL/m01800.cl +++ b/OpenCL/m01800.cl @@ -330,13 +330,13 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01800_init (__gl u64 pw[2]; - pw[0] = swap_workaround (hl32_to_64 (w0[1], w0[0])); - pw[1] = swap_workaround (hl32_to_64 (w0[3], w0[2])); + pw[0] = swap32 (hl32_to_64 (w0[1], w0[0])); + pw[1] = swap32 (hl32_to_64 (w0[3], w0[2])); u64 salt[2]; - salt[0] = swap_workaround (hl32_to_64 (salt_buf[1], salt_buf[0])); - salt[1] = swap_workaround (hl32_to_64 (salt_buf[3], salt_buf[2])); + salt[0] = swap32 (hl32_to_64 (salt_buf[1], salt_buf[0])); + salt[1] = swap32 (hl32_to_64 (salt_buf[3], salt_buf[2])); /** * begin @@ -609,8 +609,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01800_comp (__gl const u32 lid = get_local_id (0); - const u64 a = swap_workaround (tmps[gid].l_alt_result[0]); - const u64 b = swap_workaround (tmps[gid].l_alt_result[1]); + const u64 a = swap32 (tmps[gid].l_alt_result[0]); + const u64 b = swap32 (tmps[gid].l_alt_result[1]); const u32 r0 = l32_from_64 (a); const u32 r1 = h32_from_64 (a); diff --git a/OpenCL/m02100.cl b/OpenCL/m02100.cl index 061c249..182d40c 100644 --- a/OpenCL/m02100.cl +++ b/OpenCL/m02100.cl @@ -418,10 +418,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02100_init (__gl * pads */ - w0[0] = swap_workaround (digest_md4[0]); - w0[1] = swap_workaround (digest_md4[1]); - w0[2] = swap_workaround (digest_md4[2]); - w0[3] = swap_workaround (digest_md4[3]); + w0[0] = swap32 (digest_md4[0]); + w0[1] = swap32 (digest_md4[1]); + w0[2] = swap32 (digest_md4[2]); + w0[3] = swap32 (digest_md4[3]); w1[0] = 0; w1[1] = 0; w1[2] = 0; @@ -476,20 +476,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02100_init (__gl append_0x01_4x4 (w0, w1, w2, w3, salt_len + 3); append_0x80_4x4 (w0, w1, w2, w3, salt_len + 4); - w0[0] = swap_workaround (w0[0]); - w0[1] 
= swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); u32 digest[5]; diff --git a/OpenCL/m02500.cl b/OpenCL/m02500.cl index ed996ab..4738882 100644 --- a/OpenCL/m02500.cl +++ b/OpenCL/m02500.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) { @@ -491,22 +482,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02500_init (__gl * pads */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround 
(w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[5]; u32 opad[5]; @@ -551,16 +542,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02500_init (__gl append_0x80_3x4 (w0, w1, w2, salt_len + 4); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); w2[2] = 0; w2[3] = 0; w3[0] = 0; @@ -747,10 +738,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02500_comp (__gl hmac_sha1_run (w0, w1, w2, w3, ipad, opad, digest); { - w0[0] = swap_workaround (digest[0]); - w0[1] = swap_workaround (digest[1]); - w0[2] = swap_workaround (digest[2]); - w0[3] = swap_workaround (digest[3]); + w0[0] = swap32 (digest[0]); + w0[1] = swap32 (digest[1]); + w0[2] = swap32 (digest[2]); + w0[3] = swap32 (digest[3]); w1[0] = 0; w1[1] = 0; w1[2] = 0; diff --git a/OpenCL/m03200.cl b/OpenCL/m03200.cl index 8d5681a..d6b8e0b 100644 --- a/OpenCL/m03200.cl +++ b/OpenCL/m03200.cl @@ -404,24 +404,24 @@ __kernel void 
__attribute__((reqd_work_group_size (8, 1, 1))) m03200_init (__glo expand_key (E, w, pw_len); - E[ 0] = swap_workaround (E[ 0]); - E[ 1] = swap_workaround (E[ 1]); - E[ 2] = swap_workaround (E[ 2]); - E[ 3] = swap_workaround (E[ 3]); - E[ 4] = swap_workaround (E[ 4]); - E[ 5] = swap_workaround (E[ 5]); - E[ 6] = swap_workaround (E[ 6]); - E[ 7] = swap_workaround (E[ 7]); - E[ 8] = swap_workaround (E[ 8]); - E[ 9] = swap_workaround (E[ 9]); - E[10] = swap_workaround (E[10]); - E[11] = swap_workaround (E[11]); - E[12] = swap_workaround (E[12]); - E[13] = swap_workaround (E[13]); - E[14] = swap_workaround (E[14]); - E[15] = swap_workaround (E[15]); - E[16] = swap_workaround (E[16]); - E[17] = swap_workaround (E[17]); + E[ 0] = swap32 (E[ 0]); + E[ 1] = swap32 (E[ 1]); + E[ 2] = swap32 (E[ 2]); + E[ 3] = swap32 (E[ 3]); + E[ 4] = swap32 (E[ 4]); + E[ 5] = swap32 (E[ 5]); + E[ 6] = swap32 (E[ 6]); + E[ 7] = swap32 (E[ 7]); + E[ 8] = swap32 (E[ 8]); + E[ 9] = swap32 (E[ 9]); + E[10] = swap32 (E[10]); + E[11] = swap32 (E[11]); + E[12] = swap32 (E[12]); + E[13] = swap32 (E[13]); + E[14] = swap32 (E[14]); + E[15] = swap32 (E[15]); + E[16] = swap32 (E[16]); + E[17] = swap32 (E[17]); /** * salt @@ -615,24 +615,24 @@ __kernel void __attribute__((reqd_work_group_size (8, 1, 1))) m03200_loop (__glo expand_key (E, w, pw_len); - E[ 0] = swap_workaround (E[ 0]); - E[ 1] = swap_workaround (E[ 1]); - E[ 2] = swap_workaround (E[ 2]); - E[ 3] = swap_workaround (E[ 3]); - E[ 4] = swap_workaround (E[ 4]); - E[ 5] = swap_workaround (E[ 5]); - E[ 6] = swap_workaround (E[ 6]); - E[ 7] = swap_workaround (E[ 7]); - E[ 8] = swap_workaround (E[ 8]); - E[ 9] = swap_workaround (E[ 9]); - E[10] = swap_workaround (E[10]); - E[11] = swap_workaround (E[11]); - E[12] = swap_workaround (E[12]); - E[13] = swap_workaround (E[13]); - E[14] = swap_workaround (E[14]); - E[15] = swap_workaround (E[15]); - E[16] = swap_workaround (E[16]); - E[17] = swap_workaround (E[17]); + E[ 0] = swap32 (E[ 0]); + E[ 1] = 
swap32 (E[ 1]); + E[ 2] = swap32 (E[ 2]); + E[ 3] = swap32 (E[ 3]); + E[ 4] = swap32 (E[ 4]); + E[ 5] = swap32 (E[ 5]); + E[ 6] = swap32 (E[ 6]); + E[ 7] = swap32 (E[ 7]); + E[ 8] = swap32 (E[ 8]); + E[ 9] = swap32 (E[ 9]); + E[10] = swap32 (E[10]); + E[11] = swap32 (E[11]); + E[12] = swap32 (E[12]); + E[13] = swap32 (E[13]); + E[14] = swap32 (E[14]); + E[15] = swap32 (E[15]); + E[16] = swap32 (E[16]); + E[17] = swap32 (E[17]); // load diff --git a/OpenCL/m04400_a0.cl b/OpenCL/m04400_a0.cl index 889389e..b6045af 100644 --- a/OpenCL/m04400_a0.cl +++ b/OpenCL/m04400_a0.cl @@ -141,20 +141,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; @@ -507,20 +507,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround 
(w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; diff --git a/OpenCL/m04400_a1.cl b/OpenCL/m04400_a1.cl index 4d41d67..c5289c0 100644 --- a/OpenCL/m04400_a1.cl +++ b/OpenCL/m04400_a1.cl @@ -195,20 +195,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -617,20 
+617,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; diff --git a/OpenCL/m04500_a0.cl b/OpenCL/m04500_a0.cl index 5d00c94..51c7a4e 100644 --- a/OpenCL/m04500_a0.cl +++ b/OpenCL/m04500_a0.cl @@ -141,20 +141,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 
(w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; @@ -543,20 +543,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; diff --git a/OpenCL/m04500_a1.cl b/OpenCL/m04500_a1.cl index fd4660e..fb275ce 100644 --- a/OpenCL/m04500_a1.cl +++ b/OpenCL/m04500_a1.cl @@ -195,20 +195,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t 
= swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -652,20 +652,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; diff --git a/OpenCL/m04900_a0.cl b/OpenCL/m04900_a0.cl index 39562cc..5cd3d27 100644 --- a/OpenCL/m04900_a0.cl +++ b/OpenCL/m04900_a0.cl @@ -202,20 +202,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04900_m04 (__glo append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, 
pw_salt_len); - u32 w0 = swap_workaround (w0_t[0]); - u32 w1 = swap_workaround (w0_t[1]); - u32 w2 = swap_workaround (w0_t[2]); - u32 w3 = swap_workaround (w0_t[3]); - u32 w4 = swap_workaround (w1_t[0]); - u32 w5 = swap_workaround (w1_t[1]); - u32 w6 = swap_workaround (w1_t[2]); - u32 w7 = swap_workaround (w1_t[3]); - u32 w8 = swap_workaround (w2_t[0]); - u32 w9 = swap_workaround (w2_t[1]); - u32 wa = swap_workaround (w2_t[2]); - u32 wb = swap_workaround (w2_t[3]); - u32 wc = swap_workaround (w3_t[0]); - u32 wd = swap_workaround (w3_t[1]); + u32 w0 = swap32 (w0_t[0]); + u32 w1 = swap32 (w0_t[1]); + u32 w2 = swap32 (w0_t[2]); + u32 w3 = swap32 (w0_t[3]); + u32 w4 = swap32 (w1_t[0]); + u32 w5 = swap32 (w1_t[1]); + u32 w6 = swap32 (w1_t[2]); + u32 w7 = swap32 (w1_t[3]); + u32 w8 = swap32 (w2_t[0]); + u32 w9 = swap32 (w2_t[1]); + u32 wa = swap32 (w2_t[2]); + u32 wb = swap32 (w2_t[3]); + u32 wc = swap32 (w3_t[0]); + u32 wd = swap32 (w3_t[1]); u32 we = 0; u32 wf = pw_salt_len * 8; @@ -540,20 +540,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04900_s04 (__glo append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); - u32 w0 = swap_workaround (w0_t[0]); - u32 w1 = swap_workaround (w0_t[1]); - u32 w2 = swap_workaround (w0_t[2]); - u32 w3 = swap_workaround (w0_t[3]); - u32 w4 = swap_workaround (w1_t[0]); - u32 w5 = swap_workaround (w1_t[1]); - u32 w6 = swap_workaround (w1_t[2]); - u32 w7 = swap_workaround (w1_t[3]); - u32 w8 = swap_workaround (w2_t[0]); - u32 w9 = swap_workaround (w2_t[1]); - u32 wa = swap_workaround (w2_t[2]); - u32 wb = swap_workaround (w2_t[3]); - u32 wc = swap_workaround (w3_t[0]); - u32 wd = swap_workaround (w3_t[1]); + u32 w0 = swap32 (w0_t[0]); + u32 w1 = swap32 (w0_t[1]); + u32 w2 = swap32 (w0_t[2]); + u32 w3 = swap32 (w0_t[3]); + u32 w4 = swap32 (w1_t[0]); + u32 w5 = swap32 (w1_t[1]); + u32 w6 = swap32 (w1_t[2]); + u32 w7 = swap32 (w1_t[3]); + u32 w8 = swap32 (w2_t[0]); + u32 w9 = swap32 (w2_t[1]); + u32 wa = swap32 (w2_t[2]); 
+ u32 wb = swap32 (w2_t[3]); + u32 wc = swap32 (w3_t[0]); + u32 wd = swap32 (w3_t[1]); u32 we = 0; u32 wf = pw_salt_len * 8; diff --git a/OpenCL/m04900_a1.cl b/OpenCL/m04900_a1.cl index 5dd74b7..5a9e9ca 100644 --- a/OpenCL/m04900_a1.cl +++ b/OpenCL/m04900_a1.cl @@ -254,20 +254,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04900_m04 (__glo append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); - u32 w0 = swap_workaround (w0_t[0]); - u32 w1 = swap_workaround (w0_t[1]); - u32 w2 = swap_workaround (w0_t[2]); - u32 w3 = swap_workaround (w0_t[3]); - u32 w4 = swap_workaround (w1_t[0]); - u32 w5 = swap_workaround (w1_t[1]); - u32 w6 = swap_workaround (w1_t[2]); - u32 w7 = swap_workaround (w1_t[3]); - u32 w8 = swap_workaround (w2_t[0]); - u32 w9 = swap_workaround (w2_t[1]); - u32 wa = swap_workaround (w2_t[2]); - u32 wb = swap_workaround (w2_t[3]); - u32 wc = swap_workaround (w3_t[0]); - u32 wd = swap_workaround (w3_t[1]); + u32 w0 = swap32 (w0_t[0]); + u32 w1 = swap32 (w0_t[1]); + u32 w2 = swap32 (w0_t[2]); + u32 w3 = swap32 (w0_t[3]); + u32 w4 = swap32 (w1_t[0]); + u32 w5 = swap32 (w1_t[1]); + u32 w6 = swap32 (w1_t[2]); + u32 w7 = swap32 (w1_t[3]); + u32 w8 = swap32 (w2_t[0]); + u32 w9 = swap32 (w2_t[1]); + u32 wa = swap32 (w2_t[2]); + u32 wb = swap32 (w2_t[3]); + u32 wc = swap32 (w3_t[0]); + u32 wd = swap32 (w3_t[1]); u32 we = 0; u32 wf = pw_salt_len * 8; @@ -646,20 +646,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04900_s04 (__glo append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); - u32 w0 = swap_workaround (w0_t[0]); - u32 w1 = swap_workaround (w0_t[1]); - u32 w2 = swap_workaround (w0_t[2]); - u32 w3 = swap_workaround (w0_t[3]); - u32 w4 = swap_workaround (w1_t[0]); - u32 w5 = swap_workaround (w1_t[1]); - u32 w6 = swap_workaround (w1_t[2]); - u32 w7 = swap_workaround (w1_t[3]); - u32 w8 = swap_workaround (w2_t[0]); - u32 w9 = swap_workaround (w2_t[1]); - u32 wa = swap_workaround (w2_t[2]); - u32 wb = swap_workaround 
(w2_t[3]); - u32 wc = swap_workaround (w3_t[0]); - u32 wd = swap_workaround (w3_t[1]); + u32 w0 = swap32 (w0_t[0]); + u32 w1 = swap32 (w0_t[1]); + u32 w2 = swap32 (w0_t[2]); + u32 w3 = swap32 (w0_t[3]); + u32 w4 = swap32 (w1_t[0]); + u32 w5 = swap32 (w1_t[1]); + u32 w6 = swap32 (w1_t[2]); + u32 w7 = swap32 (w1_t[3]); + u32 w8 = swap32 (w2_t[0]); + u32 w9 = swap32 (w2_t[1]); + u32 wa = swap32 (w2_t[2]); + u32 wb = swap32 (w2_t[3]); + u32 wc = swap32 (w3_t[0]); + u32 wd = swap32 (w3_t[1]); u32 we = 0; u32 wf = pw_salt_len * 8; diff --git a/OpenCL/m04900_a3.cl b/OpenCL/m04900_a3.cl index 712cd2e..e2e8e30 100644 --- a/OpenCL/m04900_a3.cl +++ b/OpenCL/m04900_a3.cl @@ -183,21 +183,21 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[1] |= salt_buf3[1]; w3_t[2] |= salt_buf3[2]; - u32 w0 = swap_workaround (w0_t[0]); - u32 w1 = swap_workaround (w0_t[1]); - u32 w2 = swap_workaround (w0_t[2]); - u32 w3 = swap_workaround (w0_t[3]); - u32 w4 = swap_workaround (w1_t[0]); - u32 w5 = swap_workaround (w1_t[1]); - u32 w6 = swap_workaround (w1_t[2]); - u32 w7 = swap_workaround (w1_t[3]); - u32 w8 = swap_workaround (w2_t[0]); - u32 w9 = swap_workaround (w2_t[1]); - u32 wa = swap_workaround (w2_t[2]); - u32 wb = swap_workaround (w2_t[3]); - u32 wc = swap_workaround (w3_t[0]); - u32 wd = swap_workaround (w3_t[1]); - u32 we = swap_workaround (w3_t[2]); + u32 w0 = swap32 (w0_t[0]); + u32 w1 = swap32 (w0_t[1]); + u32 w2 = swap32 (w0_t[2]); + u32 w3 = swap32 (w0_t[3]); + u32 w4 = swap32 (w1_t[0]); + u32 w5 = swap32 (w1_t[1]); + u32 w6 = swap32 (w1_t[2]); + u32 w7 = swap32 (w1_t[3]); + u32 w8 = swap32 (w2_t[0]); + u32 w9 = swap32 (w2_t[1]); + u32 wa = swap32 (w2_t[2]); + u32 wb = swap32 (w2_t[3]); + u32 wc = swap32 (w3_t[0]); + u32 wd = swap32 (w3_t[1]); + u32 we = swap32 (w3_t[2]); u32 wf = pw_salt_len * 8; /** @@ -496,21 +496,21 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[1] |= salt_buf3[1]; w3_t[2] |= 
salt_buf3[2]; - u32 w0 = swap_workaround (w0_t[0]); - u32 w1 = swap_workaround (w0_t[1]); - u32 w2 = swap_workaround (w0_t[2]); - u32 w3 = swap_workaround (w0_t[3]); - u32 w4 = swap_workaround (w1_t[0]); - u32 w5 = swap_workaround (w1_t[1]); - u32 w6 = swap_workaround (w1_t[2]); - u32 w7 = swap_workaround (w1_t[3]); - u32 w8 = swap_workaround (w2_t[0]); - u32 w9 = swap_workaround (w2_t[1]); - u32 wa = swap_workaround (w2_t[2]); - u32 wb = swap_workaround (w2_t[3]); - u32 wc = swap_workaround (w3_t[0]); - u32 wd = swap_workaround (w3_t[1]); - u32 we = swap_workaround (w3_t[2]); + u32 w0 = swap32 (w0_t[0]); + u32 w1 = swap32 (w0_t[1]); + u32 w2 = swap32 (w0_t[2]); + u32 w3 = swap32 (w0_t[3]); + u32 w4 = swap32 (w1_t[0]); + u32 w5 = swap32 (w1_t[1]); + u32 w6 = swap32 (w1_t[2]); + u32 w7 = swap32 (w1_t[3]); + u32 w8 = swap32 (w2_t[0]); + u32 w9 = swap32 (w2_t[1]); + u32 wa = swap32 (w2_t[2]); + u32 wb = swap32 (w2_t[3]); + u32 wc = swap32 (w3_t[0]); + u32 wd = swap32 (w3_t[1]); + u32 we = swap32 (w3_t[2]); u32 wf = pw_salt_len * 8; /** diff --git a/OpenCL/m05200.cl b/OpenCL/m05200.cl index ddc59f3..d962176 100644 --- a/OpenCL/m05200.cl +++ b/OpenCL/m05200.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 k_sha256[64] = { @@ -239,23 +230,23 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05200_init (__gl * init */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround 
(w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); w3[2] = 0; w3[3] = block_len * 8; diff --git a/OpenCL/m05400_a0.cl b/OpenCL/m05400_a0.cl index fa3c4ea..d94ee1e 100644 --- a/OpenCL/m05400_a0.cl +++ b/OpenCL/m05400_a0.cl @@ -278,29 +278,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m04 (__glo u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 0]); - salt_buf0[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 1]); - salt_buf0[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 2]); - salt_buf0[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 3]); + salt_buf0[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 0]); + salt_buf0[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 1]); + salt_buf0[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 2]); + salt_buf0[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 4]); - salt_buf1[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 5]); - salt_buf1[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 6]); - salt_buf1[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 7]); + salt_buf1[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 4]); + salt_buf1[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 5]); + salt_buf1[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 6]); + salt_buf1[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 8]); - salt_buf2[1] = 
swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 9]); - salt_buf2[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[10]); - salt_buf2[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[11]); + salt_buf2[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 8]); + salt_buf2[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 9]); + salt_buf2[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[10]); + salt_buf2[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[11]); u32 salt_buf3[4]; - salt_buf3[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[12]); - salt_buf3[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[13]); + salt_buf3[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[12]); + salt_buf3[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[13]); salt_buf3[2] = 0; salt_buf3[3] = 0; @@ -308,8 +308,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m04 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -357,17 +357,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -528,29 +528,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s04 (__glo u32 salt_buf0[4]; - 
salt_buf0[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 0]); - salt_buf0[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 1]); - salt_buf0[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 2]); - salt_buf0[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 3]); + salt_buf0[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 0]); + salt_buf0[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 1]); + salt_buf0[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 2]); + salt_buf0[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 4]); - salt_buf1[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 5]); - salt_buf1[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 6]); - salt_buf1[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 7]); + salt_buf1[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 4]); + salt_buf1[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 5]); + salt_buf1[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 6]); + salt_buf1[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 8]); - salt_buf2[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 9]); - salt_buf2[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[10]); - salt_buf2[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[11]); + salt_buf2[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 8]); + salt_buf2[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 9]); + salt_buf2[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[10]); + salt_buf2[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[11]); u32 salt_buf3[4]; - salt_buf3[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[12]); - salt_buf3[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[13]); + salt_buf3[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[12]); + salt_buf3[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[13]); salt_buf3[2] = 0; salt_buf3[3] = 0; @@ -558,8 +558,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s04 (__glo 
const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -619,17 +619,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; diff --git a/OpenCL/m05400_a1.cl b/OpenCL/m05400_a1.cl index a9c9aff..4a6e65e 100644 --- a/OpenCL/m05400_a1.cl +++ b/OpenCL/m05400_a1.cl @@ -295,29 +295,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m04 (__glo u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 0]); - salt_buf0[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 1]); - salt_buf0[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 2]); - salt_buf0[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 3]); + salt_buf0[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 0]); + salt_buf0[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 1]); + salt_buf0[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 2]); + salt_buf0[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 4]); - salt_buf1[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 5]); - salt_buf1[2] = swap_workaround 
(ikepsk_bufs[salt_pos].nr_buf[ 6]); - salt_buf1[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 7]); + salt_buf1[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 4]); + salt_buf1[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 5]); + salt_buf1[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 6]); + salt_buf1[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 8]); - salt_buf2[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 9]); - salt_buf2[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[10]); - salt_buf2[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[11]); + salt_buf2[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 8]); + salt_buf2[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 9]); + salt_buf2[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[10]); + salt_buf2[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[11]); u32 salt_buf3[4]; - salt_buf3[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[12]); - salt_buf3[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[13]); + salt_buf3[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[12]); + salt_buf3[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[13]); salt_buf3[2] = 0; salt_buf3[3] = 0; @@ -325,8 +325,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m04 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -409,17 +409,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = 
swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -599,29 +599,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s04 (__glo u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 0]); - salt_buf0[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 1]); - salt_buf0[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 2]); - salt_buf0[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 3]); + salt_buf0[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 0]); + salt_buf0[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 1]); + salt_buf0[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 2]); + salt_buf0[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 4]); - salt_buf1[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 5]); - salt_buf1[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 6]); - salt_buf1[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 7]); + salt_buf1[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 4]); + salt_buf1[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 5]); + salt_buf1[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 6]); + salt_buf1[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 8]); - salt_buf2[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 9]); - salt_buf2[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[10]); - salt_buf2[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[11]); + salt_buf2[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 8]); + salt_buf2[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 9]); + salt_buf2[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[10]); + salt_buf2[3] = swap32 
(ikepsk_bufs[salt_pos].nr_buf[11]); u32 salt_buf3[4]; - salt_buf3[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[12]); - salt_buf3[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[13]); + salt_buf3[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[12]); + salt_buf3[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[13]); salt_buf3[2] = 0; salt_buf3[3] = 0; @@ -629,8 +629,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s04 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -725,17 +725,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; diff --git a/OpenCL/m05400_a3.cl b/OpenCL/m05400_a3.cl index ca557ca..8f5a62a 100644 --- a/OpenCL/m05400_a3.cl +++ b/OpenCL/m05400_a3.cl @@ -255,29 +255,29 @@ static void m05400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 0]); - salt_buf0[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 1]); - salt_buf0[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 2]); - salt_buf0[3] = swap_workaround 
(ikepsk_bufs[salt_pos].nr_buf[ 3]); + salt_buf0[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 0]); + salt_buf0[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 1]); + salt_buf0[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 2]); + salt_buf0[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 4]); - salt_buf1[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 5]); - salt_buf1[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 6]); - salt_buf1[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 7]); + salt_buf1[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 4]); + salt_buf1[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 5]); + salt_buf1[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 6]); + salt_buf1[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 8]); - salt_buf2[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 9]); - salt_buf2[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[10]); - salt_buf2[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[11]); + salt_buf2[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 8]); + salt_buf2[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 9]); + salt_buf2[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[10]); + salt_buf2[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[11]); u32 salt_buf3[4]; - salt_buf3[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[12]); - salt_buf3[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[13]); + salt_buf3[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[12]); + salt_buf3[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[13]); salt_buf3[2] = 0; salt_buf3[3] = 0; @@ -441,29 +441,29 @@ static void m05400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 0]); - salt_buf0[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 1]); - salt_buf0[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 2]); - salt_buf0[3] 
= swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 3]); + salt_buf0[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 0]); + salt_buf0[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 1]); + salt_buf0[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 2]); + salt_buf0[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 4]); - salt_buf1[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 5]); - salt_buf1[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 6]); - salt_buf1[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 7]); + salt_buf1[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 4]); + salt_buf1[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 5]); + salt_buf1[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 6]); + salt_buf1[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 8]); - salt_buf2[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[ 9]); - salt_buf2[2] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[10]); - salt_buf2[3] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[11]); + salt_buf2[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 8]); + salt_buf2[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[ 9]); + salt_buf2[2] = swap32 (ikepsk_bufs[salt_pos].nr_buf[10]); + salt_buf2[3] = swap32 (ikepsk_bufs[salt_pos].nr_buf[11]); u32 salt_buf3[4]; - salt_buf3[0] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[12]); - salt_buf3[1] = swap_workaround (ikepsk_bufs[salt_pos].nr_buf[13]); + salt_buf3[0] = swap32 (ikepsk_bufs[salt_pos].nr_buf[12]); + salt_buf3[1] = swap32 (ikepsk_bufs[salt_pos].nr_buf[13]); salt_buf3[2] = 0; salt_buf3[3] = 0; @@ -673,8 +673,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m04 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 
(ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -739,8 +739,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m08 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -805,8 +805,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_m16 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -871,8 +871,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s04 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -937,8 +937,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s08 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 
1]); barrier (CLK_LOCAL_MEM_FENCE); @@ -1003,8 +1003,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05400_s16 (__glo const u32 lid2 = lid * 2; - s_msg_buf[lid2 + 0] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); - s_msg_buf[lid2 + 1] = swap_workaround (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); + s_msg_buf[lid2 + 0] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 0]); + s_msg_buf[lid2 + 1] = swap32 (ikepsk_bufs[salt_pos].msg_buf[lid2 + 1]); barrier (CLK_LOCAL_MEM_FENCE); diff --git a/OpenCL/m05800.cl b/OpenCL/m05800.cl index c18e38c..512dc89 100644 --- a/OpenCL/m05800.cl +++ b/OpenCL/m05800.cl @@ -17,13 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" typedef struct { @@ -545,16 +540,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05800_init (__gl u32 w2[4]; u32 w3[4]; - w0[0] = swap_workaround (data0[0]); - w0[1] = swap_workaround (data0[1]); - w0[2] = swap_workaround (data0[2]); - w0[3] = swap_workaround (data0[3]); - w1[0] = swap_workaround (data1[0]); - w1[1] = swap_workaround (data1[1]); - w1[2] = swap_workaround (data1[2]); - w1[3] = swap_workaround (data1[3]); - w2[0] = swap_workaround (data2[0]); - w2[1] = swap_workaround (data2[1]); + w0[0] = swap32 (data0[0]); + w0[1] = swap32 (data0[1]); + w0[2] = swap32 (data0[2]); + w0[3] = swap32 (data0[3]); + w1[0] = swap32 (data1[0]); + w1[1] = swap32 (data1[1]); + w1[2] = swap32 (data1[2]); + w1[3] = swap32 (data1[3]); + w2[0] = swap32 (data2[0]); + w2[1] = swap32 (data2[1]); w2[2] = 0; w2[3] = 0; w3[0] = 0; @@ -678,15 +673,15 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05800_loop (__gl w0[2] = digest[2]; w0[3] = digest[3]; w1[0] = digest[4]; - w1[1] = swap_workaround (data0[0]); - w1[2] = swap_workaround 
(data0[1]); - w1[3] = swap_workaround (data0[2]); - w2[0] = swap_workaround (data0[3]); - w2[1] = swap_workaround (data1[0]); - w2[2] = swap_workaround (data1[1]); - w2[3] = swap_workaround (data1[2]); - w3[0] = swap_workaround (data1[3]); - w3[1] = swap_workaround (data2[0]); + w1[1] = swap32 (data0[0]); + w1[2] = swap32 (data0[1]); + w1[3] = swap32 (data0[2]); + w2[0] = swap32 (data0[3]); + w2[1] = swap32 (data1[0]); + w2[2] = swap32 (data1[1]); + w2[3] = swap32 (data1[2]); + w3[0] = swap32 (data1[3]); + w3[1] = swap32 (data2[0]); w3[2] = 0; w3[3] = (20 + pc_len + pw_len + salt_len) * 8; diff --git a/OpenCL/m06100_a0.cl b/OpenCL/m06100_a0.cl index b264871..0be3a10 100644 --- a/OpenCL/m06100_a0.cl +++ b/OpenCL/m06100_a0.cl @@ -1436,14 +1436,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06100_m04 (__glo u32 wl[16]; - wl[ 0] = swap_workaround (w0[0]); - wl[ 1] = swap_workaround (w0[1]); - wl[ 2] = swap_workaround (w0[2]); - wl[ 3] = swap_workaround (w0[3]); - wl[ 4] = swap_workaround (w1[0]); - wl[ 5] = swap_workaround (w1[1]); - wl[ 6] = swap_workaround (w1[2]); - wl[ 7] = swap_workaround (w1[3]); + wl[ 0] = swap32 (w0[0]); + wl[ 1] = swap32 (w0[1]); + wl[ 2] = swap32 (w0[2]); + wl[ 3] = swap32 (w0[3]); + wl[ 4] = swap32 (w1[0]); + wl[ 5] = swap32 (w1[1]); + wl[ 6] = swap32 (w1[2]); + wl[ 7] = swap32 (w1[3]); wl[ 8] = 0; wl[ 9] = 0; wl[10] = 0; @@ -1598,14 +1598,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06100_s04 (__glo u32 wl[16]; - wl[ 0] = swap_workaround (w0[0]); - wl[ 1] = swap_workaround (w0[1]); - wl[ 2] = swap_workaround (w0[2]); - wl[ 3] = swap_workaround (w0[3]); - wl[ 4] = swap_workaround (w1[0]); - wl[ 5] = swap_workaround (w1[1]); - wl[ 6] = swap_workaround (w1[2]); - wl[ 7] = swap_workaround (w1[3]); + wl[ 0] = swap32 (w0[0]); + wl[ 1] = swap32 (w0[1]); + wl[ 2] = swap32 (w0[2]); + wl[ 3] = swap32 (w0[3]); + wl[ 4] = swap32 (w1[0]); + wl[ 5] = swap32 (w1[1]); + wl[ 6] = swap32 (w1[2]); + wl[ 7] = 
swap32 (w1[3]); wl[ 8] = 0; wl[ 9] = 0; wl[10] = 0; diff --git a/OpenCL/m06100_a1.cl b/OpenCL/m06100_a1.cl index 1aeab93..c62c4c2 100644 --- a/OpenCL/m06100_a1.cl +++ b/OpenCL/m06100_a1.cl @@ -1488,14 +1488,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06100_m04 (__glo u32 wl[16]; - wl[ 0] = swap_workaround (w0[0]); - wl[ 1] = swap_workaround (w0[1]); - wl[ 2] = swap_workaround (w0[2]); - wl[ 3] = swap_workaround (w0[3]); - wl[ 4] = swap_workaround (w1[0]); - wl[ 5] = swap_workaround (w1[1]); - wl[ 6] = swap_workaround (w1[2]); - wl[ 7] = swap_workaround (w1[3]); + wl[ 0] = swap32 (w0[0]); + wl[ 1] = swap32 (w0[1]); + wl[ 2] = swap32 (w0[2]); + wl[ 3] = swap32 (w0[3]); + wl[ 4] = swap32 (w1[0]); + wl[ 5] = swap32 (w1[1]); + wl[ 6] = swap32 (w1[2]); + wl[ 7] = swap32 (w1[3]); wl[ 8] = 0; wl[ 9] = 0; wl[10] = 0; @@ -1704,14 +1704,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06100_s04 (__glo u32 wl[16]; - wl[ 0] = swap_workaround (w0[0]); - wl[ 1] = swap_workaround (w0[1]); - wl[ 2] = swap_workaround (w0[2]); - wl[ 3] = swap_workaround (w0[3]); - wl[ 4] = swap_workaround (w1[0]); - wl[ 5] = swap_workaround (w1[1]); - wl[ 6] = swap_workaround (w1[2]); - wl[ 7] = swap_workaround (w1[3]); + wl[ 0] = swap32 (w0[0]); + wl[ 1] = swap32 (w0[1]); + wl[ 2] = swap32 (w0[2]); + wl[ 3] = swap32 (w0[3]); + wl[ 4] = swap32 (w1[0]); + wl[ 5] = swap32 (w1[1]); + wl[ 6] = swap32 (w1[2]); + wl[ 7] = swap32 (w1[3]); wl[ 8] = 0; wl[ 9] = 0; wl[10] = 0; diff --git a/OpenCL/m06211.cl b/OpenCL/m06211.cl index 4238b1a..4356bff 100644 --- a/OpenCL/m06211.cl +++ b/OpenCL/m06211.cl @@ -512,7 +512,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06211_init (__gl for (u32 i = 0, j = 1; i < (truecrypt_mdlen / 8 / 4); i += 5, j += 1) { - salt_buf2[0] = swap_workaround (j); + salt_buf2[0] = swap32 (j); u32 dgst[5]; diff --git a/OpenCL/m06212.cl b/OpenCL/m06212.cl index 120801e..757c9ee 100644 --- a/OpenCL/m06212.cl +++ 
b/OpenCL/m06212.cl @@ -512,7 +512,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06212_init (__gl for (u32 i = 0, j = 1; i < (truecrypt_mdlen / 8 / 4); i += 5, j += 1) { - salt_buf2[0] = swap_workaround (j); + salt_buf2[0] = swap32 (j); u32 dgst[5]; diff --git a/OpenCL/m06213.cl b/OpenCL/m06213.cl index 41ae9f8..bdcc7bc 100644 --- a/OpenCL/m06213.cl +++ b/OpenCL/m06213.cl @@ -512,7 +512,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06213_init (__gl for (u32 i = 0, j = 1; i < (truecrypt_mdlen / 8 / 4); i += 5, j += 1) { - salt_buf2[0] = swap_workaround (j); + salt_buf2[0] = swap32 (j); u32 dgst[5]; diff --git a/OpenCL/m06221.cl b/OpenCL/m06221.cl index e6295e1..337d31a 100644 --- a/OpenCL/m06221.cl +++ b/OpenCL/m06221.cl @@ -326,14 +326,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06221_init (__gl // swap fehlt - salt_buf[ 0] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]); - salt_buf[ 1] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]); - salt_buf[ 2] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]); - salt_buf[ 3] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]); - salt_buf[ 4] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]); - salt_buf[ 5] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[10])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[11]); - salt_buf[ 6] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[12])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[13]); - salt_buf[ 7] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[14])) << 32 | (u64) 
swap_workaround (esalt_bufs[salt_pos].salt_buf[15]); + salt_buf[ 0] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 0])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); + salt_buf[ 1] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 2])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); + salt_buf[ 2] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 4])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); + salt_buf[ 3] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 6])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); + salt_buf[ 4] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 8])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); + salt_buf[ 5] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[10])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[11]); + salt_buf[ 6] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[12])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[13]); + salt_buf[ 7] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[14])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[15]); salt_buf[ 8] = 0; salt_buf[ 9] = 0; salt_buf[10] = 0; @@ -347,14 +347,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06221_init (__gl u64 w[16]; - w[ 0] = ((u64) swap_workaround (w0[0])) << 32 | (u64) swap_workaround (w0[1]); - w[ 1] = ((u64) swap_workaround (w0[2])) << 32 | (u64) swap_workaround (w0[3]); - w[ 2] = ((u64) swap_workaround (w1[0])) << 32 | (u64) swap_workaround (w1[1]); - w[ 3] = ((u64) swap_workaround (w1[2])) << 32 | (u64) swap_workaround (w1[3]); - w[ 4] = ((u64) swap_workaround (w2[0])) << 32 | (u64) swap_workaround (w2[1]); - w[ 5] = ((u64) swap_workaround (w2[2])) << 32 | (u64) swap_workaround (w2[3]); - w[ 6] = ((u64) swap_workaround (w3[0])) << 32 | (u64) swap_workaround (w3[1]); - w[ 7] = ((u64) swap_workaround (w3[2])) << 32 | (u64) swap_workaround (w3[3]); + w[ 0] = ((u64) swap32 (w0[0])) << 32 | (u64) swap32 (w0[1]); + w[ 1] = ((u64) swap32 (w0[2])) << 32 | (u64) swap32 
(w0[3]); + w[ 2] = ((u64) swap32 (w1[0])) << 32 | (u64) swap32 (w1[1]); + w[ 3] = ((u64) swap32 (w1[2])) << 32 | (u64) swap32 (w1[3]); + w[ 4] = ((u64) swap32 (w2[0])) << 32 | (u64) swap32 (w2[1]); + w[ 5] = ((u64) swap32 (w2[2])) << 32 | (u64) swap32 (w2[3]); + w[ 6] = ((u64) swap32 (w3[0])) << 32 | (u64) swap32 (w3[1]); + w[ 7] = ((u64) swap32 (w3[2])) << 32 | (u64) swap32 (w3[3]); w[ 8] = 0; w[ 9] = 0; w[10] = 0; @@ -535,25 +535,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06221_comp (__gl u32 ukey1[8]; - ukey1[0] = swap_workaround (h32_from_64 (tmps[gid].out[ 0])); - ukey1[1] = swap_workaround (l32_from_64 (tmps[gid].out[ 0])); - ukey1[2] = swap_workaround (h32_from_64 (tmps[gid].out[ 1])); - ukey1[3] = swap_workaround (l32_from_64 (tmps[gid].out[ 1])); - ukey1[4] = swap_workaround (h32_from_64 (tmps[gid].out[ 2])); - ukey1[5] = swap_workaround (l32_from_64 (tmps[gid].out[ 2])); - ukey1[6] = swap_workaround (h32_from_64 (tmps[gid].out[ 3])); - ukey1[7] = swap_workaround (l32_from_64 (tmps[gid].out[ 3])); + ukey1[0] = swap32 (h32_from_64 (tmps[gid].out[ 0])); + ukey1[1] = swap32 (l32_from_64 (tmps[gid].out[ 0])); + ukey1[2] = swap32 (h32_from_64 (tmps[gid].out[ 1])); + ukey1[3] = swap32 (l32_from_64 (tmps[gid].out[ 1])); + ukey1[4] = swap32 (h32_from_64 (tmps[gid].out[ 2])); + ukey1[5] = swap32 (l32_from_64 (tmps[gid].out[ 2])); + ukey1[6] = swap32 (h32_from_64 (tmps[gid].out[ 3])); + ukey1[7] = swap32 (l32_from_64 (tmps[gid].out[ 3])); u32 ukey2[8]; - ukey2[0] = swap_workaround (h32_from_64 (tmps[gid].out[ 4])); - ukey2[1] = swap_workaround (l32_from_64 (tmps[gid].out[ 4])); - ukey2[2] = swap_workaround (h32_from_64 (tmps[gid].out[ 5])); - ukey2[3] = swap_workaround (l32_from_64 (tmps[gid].out[ 5])); - ukey2[4] = swap_workaround (h32_from_64 (tmps[gid].out[ 6])); - ukey2[5] = swap_workaround (l32_from_64 (tmps[gid].out[ 6])); - ukey2[6] = swap_workaround (h32_from_64 (tmps[gid].out[ 7])); - ukey2[7] = swap_workaround (l32_from_64 
(tmps[gid].out[ 7])); + ukey2[0] = swap32 (h32_from_64 (tmps[gid].out[ 4])); + ukey2[1] = swap32 (l32_from_64 (tmps[gid].out[ 4])); + ukey2[2] = swap32 (h32_from_64 (tmps[gid].out[ 5])); + ukey2[3] = swap32 (l32_from_64 (tmps[gid].out[ 5])); + ukey2[4] = swap32 (h32_from_64 (tmps[gid].out[ 6])); + ukey2[5] = swap32 (l32_from_64 (tmps[gid].out[ 6])); + ukey2[6] = swap32 (h32_from_64 (tmps[gid].out[ 7])); + ukey2[7] = swap32 (l32_from_64 (tmps[gid].out[ 7])); u32 data[4]; diff --git a/OpenCL/m06222.cl b/OpenCL/m06222.cl index e3d5ad8..91336fb 100644 --- a/OpenCL/m06222.cl +++ b/OpenCL/m06222.cl @@ -326,14 +326,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06222_init (__gl // swap fehlt - salt_buf[ 0] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]); - salt_buf[ 1] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]); - salt_buf[ 2] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]); - salt_buf[ 3] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]); - salt_buf[ 4] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]); - salt_buf[ 5] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[10])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[11]); - salt_buf[ 6] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[12])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[13]); - salt_buf[ 7] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[14])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[15]); + salt_buf[ 0] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 0])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); + 
salt_buf[ 1] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 2])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); + salt_buf[ 2] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 4])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); + salt_buf[ 3] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 6])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); + salt_buf[ 4] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 8])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); + salt_buf[ 5] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[10])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[11]); + salt_buf[ 6] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[12])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[13]); + salt_buf[ 7] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[14])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[15]); salt_buf[ 8] = 0; salt_buf[ 9] = 0; salt_buf[10] = 0; @@ -347,14 +347,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06222_init (__gl u64 w[16]; - w[ 0] = ((u64) swap_workaround (w0[0])) << 32 | (u64) swap_workaround (w0[1]); - w[ 1] = ((u64) swap_workaround (w0[2])) << 32 | (u64) swap_workaround (w0[3]); - w[ 2] = ((u64) swap_workaround (w1[0])) << 32 | (u64) swap_workaround (w1[1]); - w[ 3] = ((u64) swap_workaround (w1[2])) << 32 | (u64) swap_workaround (w1[3]); - w[ 4] = ((u64) swap_workaround (w2[0])) << 32 | (u64) swap_workaround (w2[1]); - w[ 5] = ((u64) swap_workaround (w2[2])) << 32 | (u64) swap_workaround (w2[3]); - w[ 6] = ((u64) swap_workaround (w3[0])) << 32 | (u64) swap_workaround (w3[1]); - w[ 7] = ((u64) swap_workaround (w3[2])) << 32 | (u64) swap_workaround (w3[3]); + w[ 0] = ((u64) swap32 (w0[0])) << 32 | (u64) swap32 (w0[1]); + w[ 1] = ((u64) swap32 (w0[2])) << 32 | (u64) swap32 (w0[3]); + w[ 2] = ((u64) swap32 (w1[0])) << 32 | (u64) swap32 (w1[1]); + w[ 3] = ((u64) swap32 (w1[2])) << 32 | (u64) swap32 (w1[3]); + w[ 4] = ((u64) swap32 (w2[0])) << 32 | (u64) 
swap32 (w2[1]); + w[ 5] = ((u64) swap32 (w2[2])) << 32 | (u64) swap32 (w2[3]); + w[ 6] = ((u64) swap32 (w3[0])) << 32 | (u64) swap32 (w3[1]); + w[ 7] = ((u64) swap32 (w3[2])) << 32 | (u64) swap32 (w3[3]); w[ 8] = 0; w[ 9] = 0; w[10] = 0; @@ -535,25 +535,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06222_comp (__gl u32 ukey1[8]; - ukey1[0] = swap_workaround (h32_from_64 (tmps[gid].out[ 0])); - ukey1[1] = swap_workaround (l32_from_64 (tmps[gid].out[ 0])); - ukey1[2] = swap_workaround (h32_from_64 (tmps[gid].out[ 1])); - ukey1[3] = swap_workaround (l32_from_64 (tmps[gid].out[ 1])); - ukey1[4] = swap_workaround (h32_from_64 (tmps[gid].out[ 2])); - ukey1[5] = swap_workaround (l32_from_64 (tmps[gid].out[ 2])); - ukey1[6] = swap_workaround (h32_from_64 (tmps[gid].out[ 3])); - ukey1[7] = swap_workaround (l32_from_64 (tmps[gid].out[ 3])); + ukey1[0] = swap32 (h32_from_64 (tmps[gid].out[ 0])); + ukey1[1] = swap32 (l32_from_64 (tmps[gid].out[ 0])); + ukey1[2] = swap32 (h32_from_64 (tmps[gid].out[ 1])); + ukey1[3] = swap32 (l32_from_64 (tmps[gid].out[ 1])); + ukey1[4] = swap32 (h32_from_64 (tmps[gid].out[ 2])); + ukey1[5] = swap32 (l32_from_64 (tmps[gid].out[ 2])); + ukey1[6] = swap32 (h32_from_64 (tmps[gid].out[ 3])); + ukey1[7] = swap32 (l32_from_64 (tmps[gid].out[ 3])); u32 ukey2[8]; - ukey2[0] = swap_workaround (h32_from_64 (tmps[gid].out[ 4])); - ukey2[1] = swap_workaround (l32_from_64 (tmps[gid].out[ 4])); - ukey2[2] = swap_workaround (h32_from_64 (tmps[gid].out[ 5])); - ukey2[3] = swap_workaround (l32_from_64 (tmps[gid].out[ 5])); - ukey2[4] = swap_workaround (h32_from_64 (tmps[gid].out[ 6])); - ukey2[5] = swap_workaround (l32_from_64 (tmps[gid].out[ 6])); - ukey2[6] = swap_workaround (h32_from_64 (tmps[gid].out[ 7])); - ukey2[7] = swap_workaround (l32_from_64 (tmps[gid].out[ 7])); + ukey2[0] = swap32 (h32_from_64 (tmps[gid].out[ 4])); + ukey2[1] = swap32 (l32_from_64 (tmps[gid].out[ 4])); + ukey2[2] = swap32 (h32_from_64 (tmps[gid].out[ 5])); + 
ukey2[3] = swap32 (l32_from_64 (tmps[gid].out[ 5])); + ukey2[4] = swap32 (h32_from_64 (tmps[gid].out[ 6])); + ukey2[5] = swap32 (l32_from_64 (tmps[gid].out[ 6])); + ukey2[6] = swap32 (h32_from_64 (tmps[gid].out[ 7])); + ukey2[7] = swap32 (l32_from_64 (tmps[gid].out[ 7])); u32 data[4]; @@ -614,25 +614,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06222_comp (__gl u32 ukey3[8]; - ukey3[0] = swap_workaround (h32_from_64 (tmps[gid].out[ 8])); - ukey3[1] = swap_workaround (l32_from_64 (tmps[gid].out[ 8])); - ukey3[2] = swap_workaround (h32_from_64 (tmps[gid].out[ 9])); - ukey3[3] = swap_workaround (l32_from_64 (tmps[gid].out[ 9])); - ukey3[4] = swap_workaround (h32_from_64 (tmps[gid].out[10])); - ukey3[5] = swap_workaround (l32_from_64 (tmps[gid].out[10])); - ukey3[6] = swap_workaround (h32_from_64 (tmps[gid].out[11])); - ukey3[7] = swap_workaround (l32_from_64 (tmps[gid].out[11])); + ukey3[0] = swap32 (h32_from_64 (tmps[gid].out[ 8])); + ukey3[1] = swap32 (l32_from_64 (tmps[gid].out[ 8])); + ukey3[2] = swap32 (h32_from_64 (tmps[gid].out[ 9])); + ukey3[3] = swap32 (l32_from_64 (tmps[gid].out[ 9])); + ukey3[4] = swap32 (h32_from_64 (tmps[gid].out[10])); + ukey3[5] = swap32 (l32_from_64 (tmps[gid].out[10])); + ukey3[6] = swap32 (h32_from_64 (tmps[gid].out[11])); + ukey3[7] = swap32 (l32_from_64 (tmps[gid].out[11])); u32 ukey4[8]; - ukey4[0] = swap_workaround (h32_from_64 (tmps[gid].out[12])); - ukey4[1] = swap_workaround (l32_from_64 (tmps[gid].out[12])); - ukey4[2] = swap_workaround (h32_from_64 (tmps[gid].out[13])); - ukey4[3] = swap_workaround (l32_from_64 (tmps[gid].out[13])); - ukey4[4] = swap_workaround (h32_from_64 (tmps[gid].out[14])); - ukey4[5] = swap_workaround (l32_from_64 (tmps[gid].out[14])); - ukey4[6] = swap_workaround (h32_from_64 (tmps[gid].out[15])); - ukey4[7] = swap_workaround (l32_from_64 (tmps[gid].out[15])); + ukey4[0] = swap32 (h32_from_64 (tmps[gid].out[12])); + ukey4[1] = swap32 (l32_from_64 (tmps[gid].out[12])); + 
ukey4[2] = swap32 (h32_from_64 (tmps[gid].out[13])); + ukey4[3] = swap32 (l32_from_64 (tmps[gid].out[13])); + ukey4[4] = swap32 (h32_from_64 (tmps[gid].out[14])); + ukey4[5] = swap32 (l32_from_64 (tmps[gid].out[14])); + ukey4[6] = swap32 (h32_from_64 (tmps[gid].out[15])); + ukey4[7] = swap32 (l32_from_64 (tmps[gid].out[15])); { tmp[0] = data[0]; diff --git a/OpenCL/m06223.cl b/OpenCL/m06223.cl index 670cf94..f14dd36 100644 --- a/OpenCL/m06223.cl +++ b/OpenCL/m06223.cl @@ -326,14 +326,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06223_init (__gl // swap fehlt - salt_buf[ 0] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]); - salt_buf[ 1] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]); - salt_buf[ 2] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]); - salt_buf[ 3] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]); - salt_buf[ 4] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]); - salt_buf[ 5] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[10])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[11]); - salt_buf[ 6] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[12])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[13]); - salt_buf[ 7] = ((u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[14])) << 32 | (u64) swap_workaround (esalt_bufs[salt_pos].salt_buf[15]); + salt_buf[ 0] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 0])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); + salt_buf[ 1] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 2])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); + 
salt_buf[ 2] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 4])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); + salt_buf[ 3] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 6])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); + salt_buf[ 4] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 8])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); + salt_buf[ 5] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[10])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[11]); + salt_buf[ 6] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[12])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[13]); + salt_buf[ 7] = ((u64) swap32 (esalt_bufs[salt_pos].salt_buf[14])) << 32 | (u64) swap32 (esalt_bufs[salt_pos].salt_buf[15]); salt_buf[ 8] = 0; salt_buf[ 9] = 0; salt_buf[10] = 0; @@ -347,14 +347,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06223_init (__gl u64 w[16]; - w[ 0] = ((u64) swap_workaround (w0[0])) << 32 | (u64) swap_workaround (w0[1]); - w[ 1] = ((u64) swap_workaround (w0[2])) << 32 | (u64) swap_workaround (w0[3]); - w[ 2] = ((u64) swap_workaround (w1[0])) << 32 | (u64) swap_workaround (w1[1]); - w[ 3] = ((u64) swap_workaround (w1[2])) << 32 | (u64) swap_workaround (w1[3]); - w[ 4] = ((u64) swap_workaround (w2[0])) << 32 | (u64) swap_workaround (w2[1]); - w[ 5] = ((u64) swap_workaround (w2[2])) << 32 | (u64) swap_workaround (w2[3]); - w[ 6] = ((u64) swap_workaround (w3[0])) << 32 | (u64) swap_workaround (w3[1]); - w[ 7] = ((u64) swap_workaround (w3[2])) << 32 | (u64) swap_workaround (w3[3]); + w[ 0] = ((u64) swap32 (w0[0])) << 32 | (u64) swap32 (w0[1]); + w[ 1] = ((u64) swap32 (w0[2])) << 32 | (u64) swap32 (w0[3]); + w[ 2] = ((u64) swap32 (w1[0])) << 32 | (u64) swap32 (w1[1]); + w[ 3] = ((u64) swap32 (w1[2])) << 32 | (u64) swap32 (w1[3]); + w[ 4] = ((u64) swap32 (w2[0])) << 32 | (u64) swap32 (w2[1]); + w[ 5] = ((u64) swap32 (w2[2])) << 32 | (u64) swap32 (w2[3]); + w[ 6] = ((u64) swap32 (w3[0])) << 32 | (u64) 
swap32 (w3[1]); + w[ 7] = ((u64) swap32 (w3[2])) << 32 | (u64) swap32 (w3[3]); w[ 8] = 0; w[ 9] = 0; w[10] = 0; @@ -535,25 +535,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06223_comp (__gl u32 ukey1[8]; - ukey1[0] = swap_workaround (h32_from_64 (tmps[gid].out[ 0])); - ukey1[1] = swap_workaround (l32_from_64 (tmps[gid].out[ 0])); - ukey1[2] = swap_workaround (h32_from_64 (tmps[gid].out[ 1])); - ukey1[3] = swap_workaround (l32_from_64 (tmps[gid].out[ 1])); - ukey1[4] = swap_workaround (h32_from_64 (tmps[gid].out[ 2])); - ukey1[5] = swap_workaround (l32_from_64 (tmps[gid].out[ 2])); - ukey1[6] = swap_workaround (h32_from_64 (tmps[gid].out[ 3])); - ukey1[7] = swap_workaround (l32_from_64 (tmps[gid].out[ 3])); + ukey1[0] = swap32 (h32_from_64 (tmps[gid].out[ 0])); + ukey1[1] = swap32 (l32_from_64 (tmps[gid].out[ 0])); + ukey1[2] = swap32 (h32_from_64 (tmps[gid].out[ 1])); + ukey1[3] = swap32 (l32_from_64 (tmps[gid].out[ 1])); + ukey1[4] = swap32 (h32_from_64 (tmps[gid].out[ 2])); + ukey1[5] = swap32 (l32_from_64 (tmps[gid].out[ 2])); + ukey1[6] = swap32 (h32_from_64 (tmps[gid].out[ 3])); + ukey1[7] = swap32 (l32_from_64 (tmps[gid].out[ 3])); u32 ukey2[8]; - ukey2[0] = swap_workaround (h32_from_64 (tmps[gid].out[ 4])); - ukey2[1] = swap_workaround (l32_from_64 (tmps[gid].out[ 4])); - ukey2[2] = swap_workaround (h32_from_64 (tmps[gid].out[ 5])); - ukey2[3] = swap_workaround (l32_from_64 (tmps[gid].out[ 5])); - ukey2[4] = swap_workaround (h32_from_64 (tmps[gid].out[ 6])); - ukey2[5] = swap_workaround (l32_from_64 (tmps[gid].out[ 6])); - ukey2[6] = swap_workaround (h32_from_64 (tmps[gid].out[ 7])); - ukey2[7] = swap_workaround (l32_from_64 (tmps[gid].out[ 7])); + ukey2[0] = swap32 (h32_from_64 (tmps[gid].out[ 4])); + ukey2[1] = swap32 (l32_from_64 (tmps[gid].out[ 4])); + ukey2[2] = swap32 (h32_from_64 (tmps[gid].out[ 5])); + ukey2[3] = swap32 (l32_from_64 (tmps[gid].out[ 5])); + ukey2[4] = swap32 (h32_from_64 (tmps[gid].out[ 6])); + ukey2[5] = 
swap32 (l32_from_64 (tmps[gid].out[ 6])); + ukey2[6] = swap32 (h32_from_64 (tmps[gid].out[ 7])); + ukey2[7] = swap32 (l32_from_64 (tmps[gid].out[ 7])); u32 data[4]; @@ -614,25 +614,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06223_comp (__gl u32 ukey3[8]; - ukey3[0] = swap_workaround (h32_from_64 (tmps[gid].out[ 8])); - ukey3[1] = swap_workaround (l32_from_64 (tmps[gid].out[ 8])); - ukey3[2] = swap_workaround (h32_from_64 (tmps[gid].out[ 9])); - ukey3[3] = swap_workaround (l32_from_64 (tmps[gid].out[ 9])); - ukey3[4] = swap_workaround (h32_from_64 (tmps[gid].out[10])); - ukey3[5] = swap_workaround (l32_from_64 (tmps[gid].out[10])); - ukey3[6] = swap_workaround (h32_from_64 (tmps[gid].out[11])); - ukey3[7] = swap_workaround (l32_from_64 (tmps[gid].out[11])); + ukey3[0] = swap32 (h32_from_64 (tmps[gid].out[ 8])); + ukey3[1] = swap32 (l32_from_64 (tmps[gid].out[ 8])); + ukey3[2] = swap32 (h32_from_64 (tmps[gid].out[ 9])); + ukey3[3] = swap32 (l32_from_64 (tmps[gid].out[ 9])); + ukey3[4] = swap32 (h32_from_64 (tmps[gid].out[10])); + ukey3[5] = swap32 (l32_from_64 (tmps[gid].out[10])); + ukey3[6] = swap32 (h32_from_64 (tmps[gid].out[11])); + ukey3[7] = swap32 (l32_from_64 (tmps[gid].out[11])); u32 ukey4[8]; - ukey4[0] = swap_workaround (h32_from_64 (tmps[gid].out[12])); - ukey4[1] = swap_workaround (l32_from_64 (tmps[gid].out[12])); - ukey4[2] = swap_workaround (h32_from_64 (tmps[gid].out[13])); - ukey4[3] = swap_workaround (l32_from_64 (tmps[gid].out[13])); - ukey4[4] = swap_workaround (h32_from_64 (tmps[gid].out[14])); - ukey4[5] = swap_workaround (l32_from_64 (tmps[gid].out[14])); - ukey4[6] = swap_workaround (h32_from_64 (tmps[gid].out[15])); - ukey4[7] = swap_workaround (l32_from_64 (tmps[gid].out[15])); + ukey4[0] = swap32 (h32_from_64 (tmps[gid].out[12])); + ukey4[1] = swap32 (l32_from_64 (tmps[gid].out[12])); + ukey4[2] = swap32 (h32_from_64 (tmps[gid].out[13])); + ukey4[3] = swap32 (l32_from_64 (tmps[gid].out[13])); + ukey4[4] = swap32 
(h32_from_64 (tmps[gid].out[14])); + ukey4[5] = swap32 (l32_from_64 (tmps[gid].out[14])); + ukey4[6] = swap32 (h32_from_64 (tmps[gid].out[15])); + ukey4[7] = swap32 (l32_from_64 (tmps[gid].out[15])); { tmp[0] = data[0]; @@ -687,25 +687,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06223_comp (__gl u32 ukey5[8]; - ukey5[0] = swap_workaround (h32_from_64 (tmps[gid].out[16])); - ukey5[1] = swap_workaround (l32_from_64 (tmps[gid].out[16])); - ukey5[2] = swap_workaround (h32_from_64 (tmps[gid].out[17])); - ukey5[3] = swap_workaround (l32_from_64 (tmps[gid].out[17])); - ukey5[4] = swap_workaround (h32_from_64 (tmps[gid].out[18])); - ukey5[5] = swap_workaround (l32_from_64 (tmps[gid].out[18])); - ukey5[6] = swap_workaround (h32_from_64 (tmps[gid].out[19])); - ukey5[7] = swap_workaround (l32_from_64 (tmps[gid].out[19])); + ukey5[0] = swap32 (h32_from_64 (tmps[gid].out[16])); + ukey5[1] = swap32 (l32_from_64 (tmps[gid].out[16])); + ukey5[2] = swap32 (h32_from_64 (tmps[gid].out[17])); + ukey5[3] = swap32 (l32_from_64 (tmps[gid].out[17])); + ukey5[4] = swap32 (h32_from_64 (tmps[gid].out[18])); + ukey5[5] = swap32 (l32_from_64 (tmps[gid].out[18])); + ukey5[6] = swap32 (h32_from_64 (tmps[gid].out[19])); + ukey5[7] = swap32 (l32_from_64 (tmps[gid].out[19])); u32 ukey6[8]; - ukey6[0] = swap_workaround (h32_from_64 (tmps[gid].out[20])); - ukey6[1] = swap_workaround (l32_from_64 (tmps[gid].out[20])); - ukey6[2] = swap_workaround (h32_from_64 (tmps[gid].out[21])); - ukey6[3] = swap_workaround (l32_from_64 (tmps[gid].out[21])); - ukey6[4] = swap_workaround (h32_from_64 (tmps[gid].out[22])); - ukey6[5] = swap_workaround (l32_from_64 (tmps[gid].out[22])); - ukey6[6] = swap_workaround (h32_from_64 (tmps[gid].out[23])); - ukey6[7] = swap_workaround (l32_from_64 (tmps[gid].out[23])); + ukey6[0] = swap32 (h32_from_64 (tmps[gid].out[20])); + ukey6[1] = swap32 (l32_from_64 (tmps[gid].out[20])); + ukey6[2] = swap32 (h32_from_64 (tmps[gid].out[21])); + ukey6[3] = swap32 
(l32_from_64 (tmps[gid].out[21])); + ukey6[4] = swap32 (h32_from_64 (tmps[gid].out[22])); + ukey6[5] = swap32 (l32_from_64 (tmps[gid].out[22])); + ukey6[6] = swap32 (h32_from_64 (tmps[gid].out[23])); + ukey6[7] = swap32 (l32_from_64 (tmps[gid].out[23])); { tmp[0] = data[0]; diff --git a/OpenCL/m06231.cl b/OpenCL/m06231.cl index dc74685..5403f6a 100644 --- a/OpenCL/m06231.cl +++ b/OpenCL/m06231.cl @@ -1559,22 +1559,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06231_init (__gl u32 salt_buf1[16]; - salt_buf1[ 0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0]); - salt_buf1[ 1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]); - salt_buf1[ 2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2]); - salt_buf1[ 3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]); - salt_buf1[ 4] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[ 5] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[ 6] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[ 7] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]); - salt_buf1[ 8] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8]); - salt_buf1[ 9] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]); - salt_buf1[10] = swap_workaround (esalt_bufs[salt_pos].salt_buf[10]); - salt_buf1[11] = swap_workaround (esalt_bufs[salt_pos].salt_buf[11]); - salt_buf1[12] = swap_workaround (esalt_bufs[salt_pos].salt_buf[12]); - salt_buf1[13] = swap_workaround (esalt_bufs[salt_pos].salt_buf[13]); - salt_buf1[14] = swap_workaround (esalt_bufs[salt_pos].salt_buf[14]); - salt_buf1[15] = swap_workaround (esalt_bufs[salt_pos].salt_buf[15]); + salt_buf1[ 0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 0]); + salt_buf1[ 1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); + salt_buf1[ 2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 2]); + salt_buf1[ 3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); + salt_buf1[ 4] = swap32 (esalt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[ 5] = swap32 
(esalt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[ 6] = swap32 (esalt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[ 7] = swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[ 8] = swap32 (esalt_bufs[salt_pos].salt_buf[ 8]); + salt_buf1[ 9] = swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); + salt_buf1[10] = swap32 (esalt_bufs[salt_pos].salt_buf[10]); + salt_buf1[11] = swap32 (esalt_bufs[salt_pos].salt_buf[11]); + salt_buf1[12] = swap32 (esalt_bufs[salt_pos].salt_buf[12]); + salt_buf1[13] = swap32 (esalt_bufs[salt_pos].salt_buf[13]); + salt_buf1[14] = swap32 (esalt_bufs[salt_pos].salt_buf[14]); + salt_buf1[15] = swap32 (esalt_bufs[salt_pos].salt_buf[15]); u32 salt_buf2[16]; @@ -1599,22 +1599,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06231_init (__gl u32 w[16]; - w[ 0] = swap_workaround (w0[0]); - w[ 1] = swap_workaround (w0[1]); - w[ 2] = swap_workaround (w0[2]); - w[ 3] = swap_workaround (w0[3]); - w[ 4] = swap_workaround (w1[0]); - w[ 5] = swap_workaround (w1[1]); - w[ 6] = swap_workaround (w1[2]); - w[ 7] = swap_workaround (w1[3]); - w[ 8] = swap_workaround (w2[0]); - w[ 9] = swap_workaround (w2[1]); - w[10] = swap_workaround (w2[2]); - w[11] = swap_workaround (w2[3]); - w[12] = swap_workaround (w3[0]); - w[13] = swap_workaround (w3[1]); - w[14] = swap_workaround (w3[2]); - w[15] = swap_workaround (w3[3]); + w[ 0] = swap32 (w0[0]); + w[ 1] = swap32 (w0[1]); + w[ 2] = swap32 (w0[2]); + w[ 3] = swap32 (w0[3]); + w[ 4] = swap32 (w1[0]); + w[ 5] = swap32 (w1[1]); + w[ 6] = swap32 (w1[2]); + w[ 7] = swap32 (w1[3]); + w[ 8] = swap32 (w2[0]); + w[ 9] = swap32 (w2[1]); + w[10] = swap32 (w2[2]); + w[11] = swap32 (w2[3]); + w[12] = swap32 (w3[0]); + w[13] = swap32 (w3[1]); + w[14] = swap32 (w3[2]); + w[15] = swap32 (w3[3]); u32 ipad[16]; u32 opad[16]; @@ -1914,25 +1914,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06231_comp (__gl u32 ukey1[8]; - ukey1[0] = swap_workaround (tmps[gid].out[ 0]); - ukey1[1] = swap_workaround 
(tmps[gid].out[ 1]); - ukey1[2] = swap_workaround (tmps[gid].out[ 2]); - ukey1[3] = swap_workaround (tmps[gid].out[ 3]); - ukey1[4] = swap_workaround (tmps[gid].out[ 4]); - ukey1[5] = swap_workaround (tmps[gid].out[ 5]); - ukey1[6] = swap_workaround (tmps[gid].out[ 6]); - ukey1[7] = swap_workaround (tmps[gid].out[ 7]); + ukey1[0] = swap32 (tmps[gid].out[ 0]); + ukey1[1] = swap32 (tmps[gid].out[ 1]); + ukey1[2] = swap32 (tmps[gid].out[ 2]); + ukey1[3] = swap32 (tmps[gid].out[ 3]); + ukey1[4] = swap32 (tmps[gid].out[ 4]); + ukey1[5] = swap32 (tmps[gid].out[ 5]); + ukey1[6] = swap32 (tmps[gid].out[ 6]); + ukey1[7] = swap32 (tmps[gid].out[ 7]); u32 ukey2[8]; - ukey2[0] = swap_workaround (tmps[gid].out[ 8]); - ukey2[1] = swap_workaround (tmps[gid].out[ 9]); - ukey2[2] = swap_workaround (tmps[gid].out[10]); - ukey2[3] = swap_workaround (tmps[gid].out[11]); - ukey2[4] = swap_workaround (tmps[gid].out[12]); - ukey2[5] = swap_workaround (tmps[gid].out[13]); - ukey2[6] = swap_workaround (tmps[gid].out[14]); - ukey2[7] = swap_workaround (tmps[gid].out[15]); + ukey2[0] = swap32 (tmps[gid].out[ 8]); + ukey2[1] = swap32 (tmps[gid].out[ 9]); + ukey2[2] = swap32 (tmps[gid].out[10]); + ukey2[3] = swap32 (tmps[gid].out[11]); + ukey2[4] = swap32 (tmps[gid].out[12]); + ukey2[5] = swap32 (tmps[gid].out[13]); + ukey2[6] = swap32 (tmps[gid].out[14]); + ukey2[7] = swap32 (tmps[gid].out[15]); u32 data[4]; diff --git a/OpenCL/m06232.cl b/OpenCL/m06232.cl index 9cc55c7..0fbe937 100644 --- a/OpenCL/m06232.cl +++ b/OpenCL/m06232.cl @@ -1559,22 +1559,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06232_init (__gl u32 salt_buf1[16]; - salt_buf1[ 0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0]); - salt_buf1[ 1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]); - salt_buf1[ 2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2]); - salt_buf1[ 3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]); - salt_buf1[ 4] = swap_workaround 
(esalt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[ 5] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[ 6] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[ 7] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]); - salt_buf1[ 8] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8]); - salt_buf1[ 9] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]); - salt_buf1[10] = swap_workaround (esalt_bufs[salt_pos].salt_buf[10]); - salt_buf1[11] = swap_workaround (esalt_bufs[salt_pos].salt_buf[11]); - salt_buf1[12] = swap_workaround (esalt_bufs[salt_pos].salt_buf[12]); - salt_buf1[13] = swap_workaround (esalt_bufs[salt_pos].salt_buf[13]); - salt_buf1[14] = swap_workaround (esalt_bufs[salt_pos].salt_buf[14]); - salt_buf1[15] = swap_workaround (esalt_bufs[salt_pos].salt_buf[15]); + salt_buf1[ 0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 0]); + salt_buf1[ 1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); + salt_buf1[ 2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 2]); + salt_buf1[ 3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); + salt_buf1[ 4] = swap32 (esalt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[ 5] = swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[ 6] = swap32 (esalt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[ 7] = swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[ 8] = swap32 (esalt_bufs[salt_pos].salt_buf[ 8]); + salt_buf1[ 9] = swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); + salt_buf1[10] = swap32 (esalt_bufs[salt_pos].salt_buf[10]); + salt_buf1[11] = swap32 (esalt_bufs[salt_pos].salt_buf[11]); + salt_buf1[12] = swap32 (esalt_bufs[salt_pos].salt_buf[12]); + salt_buf1[13] = swap32 (esalt_bufs[salt_pos].salt_buf[13]); + salt_buf1[14] = swap32 (esalt_bufs[salt_pos].salt_buf[14]); + salt_buf1[15] = swap32 (esalt_bufs[salt_pos].salt_buf[15]); u32 salt_buf2[16]; @@ -1599,22 +1599,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06232_init (__gl u32 w[16]; - w[ 0] = swap_workaround (w0[0]); - w[ 1] = 
swap_workaround (w0[1]); - w[ 2] = swap_workaround (w0[2]); - w[ 3] = swap_workaround (w0[3]); - w[ 4] = swap_workaround (w1[0]); - w[ 5] = swap_workaround (w1[1]); - w[ 6] = swap_workaround (w1[2]); - w[ 7] = swap_workaround (w1[3]); - w[ 8] = swap_workaround (w2[0]); - w[ 9] = swap_workaround (w2[1]); - w[10] = swap_workaround (w2[2]); - w[11] = swap_workaround (w2[3]); - w[12] = swap_workaround (w3[0]); - w[13] = swap_workaround (w3[1]); - w[14] = swap_workaround (w3[2]); - w[15] = swap_workaround (w3[3]); + w[ 0] = swap32 (w0[0]); + w[ 1] = swap32 (w0[1]); + w[ 2] = swap32 (w0[2]); + w[ 3] = swap32 (w0[3]); + w[ 4] = swap32 (w1[0]); + w[ 5] = swap32 (w1[1]); + w[ 6] = swap32 (w1[2]); + w[ 7] = swap32 (w1[3]); + w[ 8] = swap32 (w2[0]); + w[ 9] = swap32 (w2[1]); + w[10] = swap32 (w2[2]); + w[11] = swap32 (w2[3]); + w[12] = swap32 (w3[0]); + w[13] = swap32 (w3[1]); + w[14] = swap32 (w3[2]); + w[15] = swap32 (w3[3]); u32 ipad[16]; u32 opad[16]; @@ -1914,25 +1914,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06232_comp (__gl u32 ukey1[8]; - ukey1[0] = swap_workaround (tmps[gid].out[ 0]); - ukey1[1] = swap_workaround (tmps[gid].out[ 1]); - ukey1[2] = swap_workaround (tmps[gid].out[ 2]); - ukey1[3] = swap_workaround (tmps[gid].out[ 3]); - ukey1[4] = swap_workaround (tmps[gid].out[ 4]); - ukey1[5] = swap_workaround (tmps[gid].out[ 5]); - ukey1[6] = swap_workaround (tmps[gid].out[ 6]); - ukey1[7] = swap_workaround (tmps[gid].out[ 7]); + ukey1[0] = swap32 (tmps[gid].out[ 0]); + ukey1[1] = swap32 (tmps[gid].out[ 1]); + ukey1[2] = swap32 (tmps[gid].out[ 2]); + ukey1[3] = swap32 (tmps[gid].out[ 3]); + ukey1[4] = swap32 (tmps[gid].out[ 4]); + ukey1[5] = swap32 (tmps[gid].out[ 5]); + ukey1[6] = swap32 (tmps[gid].out[ 6]); + ukey1[7] = swap32 (tmps[gid].out[ 7]); u32 ukey2[8]; - ukey2[0] = swap_workaround (tmps[gid].out[ 8]); - ukey2[1] = swap_workaround (tmps[gid].out[ 9]); - ukey2[2] = swap_workaround (tmps[gid].out[10]); - ukey2[3] = swap_workaround 
(tmps[gid].out[11]); - ukey2[4] = swap_workaround (tmps[gid].out[12]); - ukey2[5] = swap_workaround (tmps[gid].out[13]); - ukey2[6] = swap_workaround (tmps[gid].out[14]); - ukey2[7] = swap_workaround (tmps[gid].out[15]); + ukey2[0] = swap32 (tmps[gid].out[ 8]); + ukey2[1] = swap32 (tmps[gid].out[ 9]); + ukey2[2] = swap32 (tmps[gid].out[10]); + ukey2[3] = swap32 (tmps[gid].out[11]); + ukey2[4] = swap32 (tmps[gid].out[12]); + ukey2[5] = swap32 (tmps[gid].out[13]); + ukey2[6] = swap32 (tmps[gid].out[14]); + ukey2[7] = swap32 (tmps[gid].out[15]); u32 data[4]; @@ -1993,25 +1993,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06232_comp (__gl u32 ukey3[8]; - ukey3[0] = swap_workaround (tmps[gid].out[16]); - ukey3[1] = swap_workaround (tmps[gid].out[17]); - ukey3[2] = swap_workaround (tmps[gid].out[18]); - ukey3[3] = swap_workaround (tmps[gid].out[19]); - ukey3[4] = swap_workaround (tmps[gid].out[20]); - ukey3[5] = swap_workaround (tmps[gid].out[21]); - ukey3[6] = swap_workaround (tmps[gid].out[22]); - ukey3[7] = swap_workaround (tmps[gid].out[23]); + ukey3[0] = swap32 (tmps[gid].out[16]); + ukey3[1] = swap32 (tmps[gid].out[17]); + ukey3[2] = swap32 (tmps[gid].out[18]); + ukey3[3] = swap32 (tmps[gid].out[19]); + ukey3[4] = swap32 (tmps[gid].out[20]); + ukey3[5] = swap32 (tmps[gid].out[21]); + ukey3[6] = swap32 (tmps[gid].out[22]); + ukey3[7] = swap32 (tmps[gid].out[23]); u32 ukey4[8]; - ukey4[0] = swap_workaround (tmps[gid].out[24]); - ukey4[1] = swap_workaround (tmps[gid].out[25]); - ukey4[2] = swap_workaround (tmps[gid].out[26]); - ukey4[3] = swap_workaround (tmps[gid].out[27]); - ukey4[4] = swap_workaround (tmps[gid].out[28]); - ukey4[5] = swap_workaround (tmps[gid].out[29]); - ukey4[6] = swap_workaround (tmps[gid].out[30]); - ukey4[7] = swap_workaround (tmps[gid].out[31]); + ukey4[0] = swap32 (tmps[gid].out[24]); + ukey4[1] = swap32 (tmps[gid].out[25]); + ukey4[2] = swap32 (tmps[gid].out[26]); + ukey4[3] = swap32 (tmps[gid].out[27]); + ukey4[4] = 
swap32 (tmps[gid].out[28]); + ukey4[5] = swap32 (tmps[gid].out[29]); + ukey4[6] = swap32 (tmps[gid].out[30]); + ukey4[7] = swap32 (tmps[gid].out[31]); { tmp[0] = data[0]; diff --git a/OpenCL/m06233.cl b/OpenCL/m06233.cl index fb67506..864dcda 100644 --- a/OpenCL/m06233.cl +++ b/OpenCL/m06233.cl @@ -1559,22 +1559,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06233_init (__gl u32 salt_buf1[16]; - salt_buf1[ 0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0]); - salt_buf1[ 1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]); - salt_buf1[ 2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2]); - salt_buf1[ 3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]); - salt_buf1[ 4] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[ 5] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[ 6] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[ 7] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]); - salt_buf1[ 8] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8]); - salt_buf1[ 9] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]); - salt_buf1[10] = swap_workaround (esalt_bufs[salt_pos].salt_buf[10]); - salt_buf1[11] = swap_workaround (esalt_bufs[salt_pos].salt_buf[11]); - salt_buf1[12] = swap_workaround (esalt_bufs[salt_pos].salt_buf[12]); - salt_buf1[13] = swap_workaround (esalt_bufs[salt_pos].salt_buf[13]); - salt_buf1[14] = swap_workaround (esalt_bufs[salt_pos].salt_buf[14]); - salt_buf1[15] = swap_workaround (esalt_bufs[salt_pos].salt_buf[15]); + salt_buf1[ 0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 0]); + salt_buf1[ 1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); + salt_buf1[ 2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 2]); + salt_buf1[ 3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); + salt_buf1[ 4] = swap32 (esalt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[ 5] = swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[ 6] = swap32 (esalt_bufs[salt_pos].salt_buf[ 6]); + 
salt_buf1[ 7] = swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[ 8] = swap32 (esalt_bufs[salt_pos].salt_buf[ 8]); + salt_buf1[ 9] = swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); + salt_buf1[10] = swap32 (esalt_bufs[salt_pos].salt_buf[10]); + salt_buf1[11] = swap32 (esalt_bufs[salt_pos].salt_buf[11]); + salt_buf1[12] = swap32 (esalt_bufs[salt_pos].salt_buf[12]); + salt_buf1[13] = swap32 (esalt_bufs[salt_pos].salt_buf[13]); + salt_buf1[14] = swap32 (esalt_bufs[salt_pos].salt_buf[14]); + salt_buf1[15] = swap32 (esalt_bufs[salt_pos].salt_buf[15]); u32 salt_buf2[16]; @@ -1599,22 +1599,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06233_init (__gl u32 w[16]; - w[ 0] = swap_workaround (w0[0]); - w[ 1] = swap_workaround (w0[1]); - w[ 2] = swap_workaround (w0[2]); - w[ 3] = swap_workaround (w0[3]); - w[ 4] = swap_workaround (w1[0]); - w[ 5] = swap_workaround (w1[1]); - w[ 6] = swap_workaround (w1[2]); - w[ 7] = swap_workaround (w1[3]); - w[ 8] = swap_workaround (w2[0]); - w[ 9] = swap_workaround (w2[1]); - w[10] = swap_workaround (w2[2]); - w[11] = swap_workaround (w2[3]); - w[12] = swap_workaround (w3[0]); - w[13] = swap_workaround (w3[1]); - w[14] = swap_workaround (w3[2]); - w[15] = swap_workaround (w3[3]); + w[ 0] = swap32 (w0[0]); + w[ 1] = swap32 (w0[1]); + w[ 2] = swap32 (w0[2]); + w[ 3] = swap32 (w0[3]); + w[ 4] = swap32 (w1[0]); + w[ 5] = swap32 (w1[1]); + w[ 6] = swap32 (w1[2]); + w[ 7] = swap32 (w1[3]); + w[ 8] = swap32 (w2[0]); + w[ 9] = swap32 (w2[1]); + w[10] = swap32 (w2[2]); + w[11] = swap32 (w2[3]); + w[12] = swap32 (w3[0]); + w[13] = swap32 (w3[1]); + w[14] = swap32 (w3[2]); + w[15] = swap32 (w3[3]); u32 ipad[16]; u32 opad[16]; @@ -1914,25 +1914,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06233_comp (__gl u32 ukey1[8]; - ukey1[0] = swap_workaround (tmps[gid].out[ 0]); - ukey1[1] = swap_workaround (tmps[gid].out[ 1]); - ukey1[2] = swap_workaround (tmps[gid].out[ 2]); - ukey1[3] = swap_workaround 
(tmps[gid].out[ 3]); - ukey1[4] = swap_workaround (tmps[gid].out[ 4]); - ukey1[5] = swap_workaround (tmps[gid].out[ 5]); - ukey1[6] = swap_workaround (tmps[gid].out[ 6]); - ukey1[7] = swap_workaround (tmps[gid].out[ 7]); + ukey1[0] = swap32 (tmps[gid].out[ 0]); + ukey1[1] = swap32 (tmps[gid].out[ 1]); + ukey1[2] = swap32 (tmps[gid].out[ 2]); + ukey1[3] = swap32 (tmps[gid].out[ 3]); + ukey1[4] = swap32 (tmps[gid].out[ 4]); + ukey1[5] = swap32 (tmps[gid].out[ 5]); + ukey1[6] = swap32 (tmps[gid].out[ 6]); + ukey1[7] = swap32 (tmps[gid].out[ 7]); u32 ukey2[8]; - ukey2[0] = swap_workaround (tmps[gid].out[ 8]); - ukey2[1] = swap_workaround (tmps[gid].out[ 9]); - ukey2[2] = swap_workaround (tmps[gid].out[10]); - ukey2[3] = swap_workaround (tmps[gid].out[11]); - ukey2[4] = swap_workaround (tmps[gid].out[12]); - ukey2[5] = swap_workaround (tmps[gid].out[13]); - ukey2[6] = swap_workaround (tmps[gid].out[14]); - ukey2[7] = swap_workaround (tmps[gid].out[15]); + ukey2[0] = swap32 (tmps[gid].out[ 8]); + ukey2[1] = swap32 (tmps[gid].out[ 9]); + ukey2[2] = swap32 (tmps[gid].out[10]); + ukey2[3] = swap32 (tmps[gid].out[11]); + ukey2[4] = swap32 (tmps[gid].out[12]); + ukey2[5] = swap32 (tmps[gid].out[13]); + ukey2[6] = swap32 (tmps[gid].out[14]); + ukey2[7] = swap32 (tmps[gid].out[15]); u32 data[4]; @@ -1993,25 +1993,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06233_comp (__gl u32 ukey3[8]; - ukey3[0] = swap_workaround (tmps[gid].out[16]); - ukey3[1] = swap_workaround (tmps[gid].out[17]); - ukey3[2] = swap_workaround (tmps[gid].out[18]); - ukey3[3] = swap_workaround (tmps[gid].out[19]); - ukey3[4] = swap_workaround (tmps[gid].out[20]); - ukey3[5] = swap_workaround (tmps[gid].out[21]); - ukey3[6] = swap_workaround (tmps[gid].out[22]); - ukey3[7] = swap_workaround (tmps[gid].out[23]); + ukey3[0] = swap32 (tmps[gid].out[16]); + ukey3[1] = swap32 (tmps[gid].out[17]); + ukey3[2] = swap32 (tmps[gid].out[18]); + ukey3[3] = swap32 (tmps[gid].out[19]); + ukey3[4] = 
swap32 (tmps[gid].out[20]); + ukey3[5] = swap32 (tmps[gid].out[21]); + ukey3[6] = swap32 (tmps[gid].out[22]); + ukey3[7] = swap32 (tmps[gid].out[23]); u32 ukey4[8]; - ukey4[0] = swap_workaround (tmps[gid].out[24]); - ukey4[1] = swap_workaround (tmps[gid].out[25]); - ukey4[2] = swap_workaround (tmps[gid].out[26]); - ukey4[3] = swap_workaround (tmps[gid].out[27]); - ukey4[4] = swap_workaround (tmps[gid].out[28]); - ukey4[5] = swap_workaround (tmps[gid].out[29]); - ukey4[6] = swap_workaround (tmps[gid].out[30]); - ukey4[7] = swap_workaround (tmps[gid].out[31]); + ukey4[0] = swap32 (tmps[gid].out[24]); + ukey4[1] = swap32 (tmps[gid].out[25]); + ukey4[2] = swap32 (tmps[gid].out[26]); + ukey4[3] = swap32 (tmps[gid].out[27]); + ukey4[4] = swap32 (tmps[gid].out[28]); + ukey4[5] = swap32 (tmps[gid].out[29]); + ukey4[6] = swap32 (tmps[gid].out[30]); + ukey4[7] = swap32 (tmps[gid].out[31]); { tmp[0] = data[0]; @@ -2066,25 +2066,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06233_comp (__gl u32 ukey5[8]; - ukey5[0] = swap_workaround (tmps[gid].out[32]); - ukey5[1] = swap_workaround (tmps[gid].out[33]); - ukey5[2] = swap_workaround (tmps[gid].out[34]); - ukey5[3] = swap_workaround (tmps[gid].out[35]); - ukey5[4] = swap_workaround (tmps[gid].out[36]); - ukey5[5] = swap_workaround (tmps[gid].out[37]); - ukey5[6] = swap_workaround (tmps[gid].out[38]); - ukey5[7] = swap_workaround (tmps[gid].out[39]); + ukey5[0] = swap32 (tmps[gid].out[32]); + ukey5[1] = swap32 (tmps[gid].out[33]); + ukey5[2] = swap32 (tmps[gid].out[34]); + ukey5[3] = swap32 (tmps[gid].out[35]); + ukey5[4] = swap32 (tmps[gid].out[36]); + ukey5[5] = swap32 (tmps[gid].out[37]); + ukey5[6] = swap32 (tmps[gid].out[38]); + ukey5[7] = swap32 (tmps[gid].out[39]); u32 ukey6[8]; - ukey6[0] = swap_workaround (tmps[gid].out[40]); - ukey6[1] = swap_workaround (tmps[gid].out[41]); - ukey6[2] = swap_workaround (tmps[gid].out[42]); - ukey6[3] = swap_workaround (tmps[gid].out[43]); - ukey6[4] = 
swap_workaround (tmps[gid].out[44]); - ukey6[5] = swap_workaround (tmps[gid].out[45]); - ukey6[6] = swap_workaround (tmps[gid].out[46]); - ukey6[7] = swap_workaround (tmps[gid].out[47]); + ukey6[0] = swap32 (tmps[gid].out[40]); + ukey6[1] = swap32 (tmps[gid].out[41]); + ukey6[2] = swap32 (tmps[gid].out[42]); + ukey6[3] = swap32 (tmps[gid].out[43]); + ukey6[4] = swap32 (tmps[gid].out[44]); + ukey6[5] = swap32 (tmps[gid].out[45]); + ukey6[6] = swap32 (tmps[gid].out[46]); + ukey6[7] = swap32 (tmps[gid].out[47]); { tmp[0] = data[0]; diff --git a/OpenCL/m06300.cl b/OpenCL/m06300.cl index 593d311..ace8fdf 100644 --- a/OpenCL/m06300.cl +++ b/OpenCL/m06300.cl @@ -17,13 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) { diff --git a/OpenCL/m06400.cl b/OpenCL/m06400.cl index 2d04903..5d88ec6 100644 --- a/OpenCL/m06400.cl +++ b/OpenCL/m06400.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 k_sha256[64] = { @@ -316,22 +307,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06400_init (__gl * pads */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = 
swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[8]; u32 opad[8]; @@ -373,21 +364,21 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06400_init (__gl w3[2] = salt_buf3[2]; // w3[3] = salt_buf3[3]; - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); w3[3] = (64 + salt_len + 4) * 8; u32 dgst[8]; diff --git a/OpenCL/m06500.cl b/OpenCL/m06500.cl index 7704580..9793c1d 100644 --- a/OpenCL/m06500.cl +++ b/OpenCL/m06500.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include 
"common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -320,22 +311,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06500_init (__gl * pads */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u64 w0l[4]; u64 w1l[4]; @@ -399,14 +390,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06500_init (__gl w3l[2] = 0; w3l[3] = 0; - w0l[0] = swap_workaround (w0l[0]); - w0l[1] = swap_workaround (w0l[1]); - w0l[2] = swap_workaround (w0l[2]); - w0l[3] = swap_workaround (w0l[3]); - w1l[0] = swap_workaround (w1l[0]); - w1l[1] = swap_workaround (w1l[1]); - w1l[2] = swap_workaround (w1l[2]); - w1l[3] = swap_workaround (w1l[3]); + w0l[0] = swap32 (w0l[0]); + w0l[1] = swap32 (w0l[1]); + w0l[2] = swap32 (w0l[2]); + w0l[3] = swap32 (w0l[3]); 
+ w1l[0] = swap32 (w1l[0]); + w1l[1] = swap32 (w1l[1]); + w1l[2] = swap32 (w1l[2]); + w1l[3] = swap32 (w1l[3]); w2l[0] = 0; w2l[1] = 0; w2l[2] = 0; diff --git a/OpenCL/m06600.cl b/OpenCL/m06600.cl index f45b8a8..6cd6562 100644 --- a/OpenCL/m06600.cl +++ b/OpenCL/m06600.cl @@ -1123,22 +1123,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06600_init (__gl * pads */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[5]; u32 opad[5]; @@ -1177,10 +1177,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06600_init (__gl append_0x01_1x4 (w0, salt_len + 3); append_0x80_1x4 (w0, salt_len + 4); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); w1[0] = 0; w1[1] = 0; w1[2] = 0; diff --git a/OpenCL/m06700.cl b/OpenCL/m06700.cl index d824593..6252a99 100644 --- a/OpenCL/m06700.cl +++ b/OpenCL/m06700.cl @@ -17,17 +17,8 @@ #include 
"types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { @@ -326,22 +317,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06700_init (__gl * pads */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[5]; u32 opad[5]; @@ -377,21 +368,21 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06700_init (__gl w3[2] = salt_buf3[2]; //w3[3] = salt_buf3[3]; - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); 
- w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); w3[3] = (64 + salt_len + 4) * 8; u32 dgst[5]; diff --git a/OpenCL/m06800.cl b/OpenCL/m06800.cl index 6eacfa1..e94c5c5 100644 --- a/OpenCL/m06800.cl +++ b/OpenCL/m06800.cl @@ -1255,22 +1255,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06800_init (__gl * pads */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[8]; u32 opad[8]; @@ -1315,16 +1315,16 @@ __kernel void __attribute__((reqd_work_group_size 
(64, 1, 1))) m06800_init (__gl append_0x01_3x4 (w0, w1, w2, salt_len + 3); append_0x80_3x4 (w0, w1, w2, salt_len + 4); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); w2[2] = 0; w2[3] = 0; w3[0] = 0; @@ -1596,10 +1596,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06800_comp (__gl salt_buf[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf[3] = salt_bufs[salt_pos].salt_buf[3]; - out[0] = swap_workaround (out[0]); - out[1] = swap_workaround (out[1]); - out[2] = swap_workaround (out[2]); - out[3] = swap_workaround (out[3]); + out[0] = swap32 (out[0]); + out[1] = swap32 (out[1]); + out[2] = swap32 (out[2]); + out[3] = swap32 (out[3]); truncate_block (out, salt_len); diff --git a/OpenCL/m07100.cl b/OpenCL/m07100.cl index f00223e..23383f9 100644 --- a/OpenCL/m07100.cl +++ b/OpenCL/m07100.cl @@ -17,9 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u64 k_sha512[80] = { @@ -246,31 +245,31 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07100_init (__gl u32 w0[4]; - w0[0] = swap_workaround (pws[gid].i[ 0]); - w0[1] = swap_workaround (pws[gid].i[ 1]); - w0[2] = swap_workaround (pws[gid].i[ 2]); - w0[3] = swap_workaround (pws[gid].i[ 3]); + w0[0] = swap32 (pws[gid].i[ 0]); + w0[1] = swap32 (pws[gid].i[ 1]); + w0[2] = 
swap32 (pws[gid].i[ 2]); + w0[3] = swap32 (pws[gid].i[ 3]); u32 w1[4]; - w1[0] = swap_workaround (pws[gid].i[ 4]); - w1[1] = swap_workaround (pws[gid].i[ 5]); - w1[2] = swap_workaround (pws[gid].i[ 6]); - w1[3] = swap_workaround (pws[gid].i[ 7]); + w1[0] = swap32 (pws[gid].i[ 4]); + w1[1] = swap32 (pws[gid].i[ 5]); + w1[2] = swap32 (pws[gid].i[ 6]); + w1[3] = swap32 (pws[gid].i[ 7]); u32 w2[4]; - w2[0] = swap_workaround (pws[gid].i[ 8]); - w2[1] = swap_workaround (pws[gid].i[ 9]); - w2[2] = swap_workaround (pws[gid].i[10]); - w2[3] = swap_workaround (pws[gid].i[11]); + w2[0] = swap32 (pws[gid].i[ 8]); + w2[1] = swap32 (pws[gid].i[ 9]); + w2[2] = swap32 (pws[gid].i[10]); + w2[3] = swap32 (pws[gid].i[11]); u32 w3[4]; - w3[0] = swap_workaround (pws[gid].i[12]); - w3[1] = swap_workaround (pws[gid].i[13]); - w3[2] = swap_workaround (pws[gid].i[14]); - w3[3] = swap_workaround (pws[gid].i[15]); + w3[0] = swap32 (pws[gid].i[12]); + w3[1] = swap32 (pws[gid].i[13]); + w3[2] = swap32 (pws[gid].i[14]); + w3[3] = swap32 (pws[gid].i[15]); /** * salt @@ -280,20 +279,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07100_init (__gl u32 salt_len = salt_bufs[salt_pos].salt_len; - esalt_buf[ 0] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0]), swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1])); - esalt_buf[ 1] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2]), swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3])); - esalt_buf[ 2] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4]), swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5])); - esalt_buf[ 3] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6]), swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7])); - esalt_buf[ 4] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8]), swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9])); - esalt_buf[ 5] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[10]), swap_workaround 
(esalt_bufs[salt_pos].salt_buf[11])); - esalt_buf[ 6] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[12]), swap_workaround (esalt_bufs[salt_pos].salt_buf[13])); - esalt_buf[ 7] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[14]), swap_workaround (esalt_bufs[salt_pos].salt_buf[15])); - esalt_buf[ 8] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[16]), swap_workaround (esalt_bufs[salt_pos].salt_buf[17])); - esalt_buf[ 9] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[18]), swap_workaround (esalt_bufs[salt_pos].salt_buf[19])); - esalt_buf[10] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[20]), swap_workaround (esalt_bufs[salt_pos].salt_buf[21])); - esalt_buf[11] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[22]), swap_workaround (esalt_bufs[salt_pos].salt_buf[23])); - esalt_buf[12] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[24]), swap_workaround (esalt_bufs[salt_pos].salt_buf[25])); - esalt_buf[13] = hl32_to_64 (swap_workaround (esalt_bufs[salt_pos].salt_buf[26]), swap_workaround (esalt_bufs[salt_pos].salt_buf[27])); + esalt_buf[ 0] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 0]), swap32 (esalt_bufs[salt_pos].salt_buf[ 1])); + esalt_buf[ 1] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 2]), swap32 (esalt_bufs[salt_pos].salt_buf[ 3])); + esalt_buf[ 2] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 4]), swap32 (esalt_bufs[salt_pos].salt_buf[ 5])); + esalt_buf[ 3] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 6]), swap32 (esalt_bufs[salt_pos].salt_buf[ 7])); + esalt_buf[ 4] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 8]), swap32 (esalt_bufs[salt_pos].salt_buf[ 9])); + esalt_buf[ 5] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[10]), swap32 (esalt_bufs[salt_pos].salt_buf[11])); + esalt_buf[ 6] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[12]), swap32 (esalt_bufs[salt_pos].salt_buf[13])); + esalt_buf[ 7] = hl32_to_64 
(swap32 (esalt_bufs[salt_pos].salt_buf[14]), swap32 (esalt_bufs[salt_pos].salt_buf[15])); + esalt_buf[ 8] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[16]), swap32 (esalt_bufs[salt_pos].salt_buf[17])); + esalt_buf[ 9] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[18]), swap32 (esalt_bufs[salt_pos].salt_buf[19])); + esalt_buf[10] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[20]), swap32 (esalt_bufs[salt_pos].salt_buf[21])); + esalt_buf[11] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[22]), swap32 (esalt_bufs[salt_pos].salt_buf[23])); + esalt_buf[12] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[24]), swap32 (esalt_bufs[salt_pos].salt_buf[25])); + esalt_buf[13] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[26]), swap32 (esalt_bufs[salt_pos].salt_buf[27])); esalt_buf[14] = 0; esalt_buf[15] = (128 + salt_len + 4) * 8; diff --git a/OpenCL/m07300_a0.cl b/OpenCL/m07300_a0.cl index 466c10c..3717f26 100644 --- a/OpenCL/m07300_a0.cl +++ b/OpenCL/m07300_a0.cl @@ -319,17 +319,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07300_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -507,17 +507,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07300_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = 
swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; diff --git a/OpenCL/m07300_a1.cl b/OpenCL/m07300_a1.cl index 6242f83..098317e 100644 --- a/OpenCL/m07300_a1.cl +++ b/OpenCL/m07300_a1.cl @@ -371,17 +371,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07300_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; @@ -613,17 +613,17 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07300_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; diff --git a/OpenCL/m07400.cl b/OpenCL/m07400.cl index 53fa1fa..79a6664 100644 --- a/OpenCL/m07400.cl +++ b/OpenCL/m07400.cl @@ -17,9 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 
-#define COMPARE_M "check_multi_vect1_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 k_sha256[64] = { @@ -52,22 +51,22 @@ static void sha256_transform (const u32 w[16], u32 digest[8]) u32 g = digest[6]; u32 h = digest[7]; - u32 w0_t = swap_workaround (w[ 0]); - u32 w1_t = swap_workaround (w[ 1]); - u32 w2_t = swap_workaround (w[ 2]); - u32 w3_t = swap_workaround (w[ 3]); - u32 w4_t = swap_workaround (w[ 4]); - u32 w5_t = swap_workaround (w[ 5]); - u32 w6_t = swap_workaround (w[ 6]); - u32 w7_t = swap_workaround (w[ 7]); - u32 w8_t = swap_workaround (w[ 8]); - u32 w9_t = swap_workaround (w[ 9]); - u32 wa_t = swap_workaround (w[10]); - u32 wb_t = swap_workaround (w[11]); - u32 wc_t = swap_workaround (w[12]); - u32 wd_t = swap_workaround (w[13]); - u32 we_t = swap_workaround (w[14]); - u32 wf_t = swap_workaround (w[15]); + u32 w0_t = swap32 (w[ 0]); + u32 w1_t = swap32 (w[ 1]); + u32 w2_t = swap32 (w[ 2]); + u32 w3_t = swap32 (w[ 3]); + u32 w4_t = swap32 (w[ 4]); + u32 w5_t = swap32 (w[ 5]); + u32 w6_t = swap32 (w[ 6]); + u32 w7_t = swap32 (w[ 7]); + u32 w8_t = swap32 (w[ 8]); + u32 w9_t = swap32 (w[ 9]); + u32 wa_t = swap32 (w[10]); + u32 wb_t = swap32 (w[11]); + u32 wc_t = swap32 (w[12]); + u32 wd_t = swap32 (w[13]); + u32 we_t = swap32 (w[14]); + u32 wf_t = swap32 (w[15]); #define ROUND_EXPAND() \ { \ @@ -185,14 +184,14 @@ static void bzero16 (u32 block[16]) static void bswap8 (u32 block[16]) { - block[ 0] = swap_workaround (block[ 0]); - block[ 1] = swap_workaround (block[ 1]); - block[ 2] = swap_workaround (block[ 2]); - block[ 3] = swap_workaround (block[ 3]); - block[ 4] = swap_workaround (block[ 4]); - block[ 5] = swap_workaround (block[ 5]); - block[ 6] = swap_workaround (block[ 6]); - block[ 7] = swap_workaround (block[ 7]); + block[ 0] = swap32 (block[ 0]); + block[ 1] = swap32 (block[ 1]); + block[ 2] = swap32 (block[ 2]); + block[ 3] = swap32 (block[ 3]); + block[ 4] = swap32 (block[ 4]); 
+ block[ 5] = swap32 (block[ 5]); + block[ 6] = swap32 (block[ 6]); + block[ 7] = swap32 (block[ 7]); } static u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 append_len) @@ -206,6 +205,7 @@ static u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], co u32 tmp3; u32 tmp4; + #ifdef IS_AMD const int offset_minus_4 = 4 - block_len; tmp0 = amd_bytealign (append[0], 0, offset_minus_4); @@ -222,6 +222,19 @@ static u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], co tmp3 = tmp4; tmp4 = 0; } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (block_len & 3); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + tmp0 = __byte_perm ( 0, append[0], selector); + tmp1 = __byte_perm (append[0], append[1], selector); + tmp2 = __byte_perm (append[1], append[2], selector); + tmp3 = __byte_perm (append[2], append[3], selector); + tmp4 = __byte_perm (append[3], 0, selector); + #endif switch (div) { @@ -329,6 +342,7 @@ static u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], c u32 tmp3; u32 tmp4; + #ifdef IS_AMD const int offset_minus_4 = 4 - block_len; tmp0 = amd_bytealign (append[0], 0, offset_minus_4); @@ -345,6 +359,19 @@ static u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], c tmp3 = tmp4; tmp4 = 0; } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (block_len & 3); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + tmp0 = __byte_perm ( 0, append[0], selector); + tmp1 = __byte_perm (append[0], append[1], selector); + tmp2 = __byte_perm (append[1], append[2], selector); + tmp3 = __byte_perm (append[2], append[3], selector); + tmp4 = __byte_perm (append[3], 0, selector); + #endif u32 carry[4] = { 0, 0, 0, 0 }; @@ -478,6 +505,7 @@ static u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], co u32 tmp3; u32 tmp4; + #ifdef IS_AMD const int offset_minus_4 = 4 - block_len; tmp0 = 
amd_bytealign (append[0], 0, offset_minus_4); @@ -494,6 +522,19 @@ static u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], co tmp3 = tmp4; tmp4 = 0; } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (block_len & 3); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + tmp0 = __byte_perm ( 0, append[0], selector); + tmp1 = __byte_perm (append[0], append[1], selector); + tmp2 = __byte_perm (append[1], append[2], selector); + tmp3 = __byte_perm (append[2], append[3], selector); + tmp4 = __byte_perm (append[3], 0, selector); + #endif switch (div) { @@ -609,6 +650,7 @@ static u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4] u32 tmp3; u32 tmp4; + #ifdef IS_AMD const int offset_minus_4 = 4 - block_len; tmp0 = amd_bytealign (append[0], 0, offset_minus_4); @@ -625,6 +667,19 @@ static u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4] tmp3 = tmp4; tmp4 = 0x80; } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (block_len & 3); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + tmp0 = __byte_perm ( 0, append[0], selector); + tmp1 = __byte_perm (append[0], append[1], selector); + tmp2 = __byte_perm (append[1], append[2], selector); + tmp3 = __byte_perm (append[2], append[3], selector); + tmp4 = __byte_perm (append[3], 0x80, selector); + #endif switch (div) { @@ -792,9 +847,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl block_len = memcat16 (block, block_len, w0, pw_len); - append_0x80_4x4 (block, block_len); + append_0x80_1x16 (block, block_len); - block[15] = swap_workaround (block_len * 8); + block[15] = swap32 (block_len * 8); init_ctx (alt_result); @@ -867,7 +922,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl } } - append_0x80_4x4 (block, block_len); + append_0x80_1x16 (block, block_len); if (block_len >= 56) { @@ -876,7 +931,7 @@ __kernel void 
__attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl bzero16 (block); } - block[15] = swap_workaround (transform_len * 8); + block[15] = swap32 (transform_len * 8); sha256_transform (block, alt_result); @@ -912,7 +967,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl /* Finish the digest. */ - append_0x80_4x4 (block, block_len); + append_0x80_1x16 (block, block_len); if (block_len >= 56) { @@ -921,7 +976,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl bzero16 (block); } - block[15] = swap_workaround (transform_len * 8); + block[15] = swap32 (transform_len * 8); sha256_transform (block, p_bytes); @@ -955,7 +1010,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl /* Finish the digest. */ - append_0x80_4x4 (block, block_len); + append_0x80_1x16 (block, block_len); if (block_len >= 56) { @@ -964,7 +1019,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl bzero16 (block); } - block[15] = swap_workaround (transform_len * 8); + block[15] = swap32 (transform_len * 8); sha256_transform (block, s_bytes); @@ -1115,7 +1170,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_loop (__gl block[15] = 0; } - block[15] = swap_workaround (block_len * 8); + block[15] = swap32 (block_len * 8); sha256_transform_no14 (block, tmp); diff --git a/OpenCL/m07600_a0.cl b/OpenCL/m07600_a0.cl index 663ad15..e0592d3 100644 --- a/OpenCL/m07600_a0.cl +++ b/OpenCL/m07600_a0.cl @@ -177,20 +177,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); 
- u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -358,22 +358,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo // 1st transform - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); - wa_t = swap_workaround (wa_t); - wb_t = swap_workaround (wb_t); - wc_t = swap_workaround (wc_t); - wd_t = swap_workaround (wd_t); - we_t = swap_workaround (we_t); - wf_t = swap_workaround (wf_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); + wa_t = swap32 (wa_t); + wb_t = swap32 (wb_t); + wc_t = swap32 (wc_t); + wd_t = swap32 (wd_t); + we_t = swap32 (we_t); + wf_t = swap32 (wf_t); a = SHA1M_A; b = SHA1M_B; @@ -491,8 +491,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo // 2nd transform - w0_t = swap_workaround (w2t[0]); - w1_t = swap_workaround (w2t[1]); + w0_t = swap32 (w2t[0]); + w1_t = swap32 (w2t[1]); w2_t = 0x80000000; w3_t = 0; w4_t = 0; @@ -788,20 +788,20 @@ 
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -969,22 +969,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo // 1st transform - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); - wa_t = swap_workaround (wa_t); - wb_t = swap_workaround (wb_t); - wc_t = swap_workaround (wc_t); - wd_t = swap_workaround (wd_t); - we_t = swap_workaround (we_t); - wf_t = swap_workaround (wf_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); + wa_t = swap32 (wa_t); + wb_t = swap32 
(wb_t); + wc_t = swap32 (wc_t); + wd_t = swap32 (wd_t); + we_t = swap32 (we_t); + wf_t = swap32 (wf_t); a = SHA1M_A; b = SHA1M_B; @@ -1102,8 +1102,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo // 2nd transform - w0_t = swap_workaround (w2t[0]); - w1_t = swap_workaround (w2t[1]); + w0_t = swap32 (w2t[0]); + w1_t = swap32 (w2t[1]); w2_t = 0x80000000; w3_t = 0; w4_t = 0; diff --git a/OpenCL/m07600_a1.cl b/OpenCL/m07600_a1.cl index 2039e59..0de7114 100644 --- a/OpenCL/m07600_a1.cl +++ b/OpenCL/m07600_a1.cl @@ -231,20 +231,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -412,22 +412,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo // 1st transform - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = 
swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); - wa_t = swap_workaround (wa_t); - wb_t = swap_workaround (wb_t); - wc_t = swap_workaround (wc_t); - wd_t = swap_workaround (wd_t); - we_t = swap_workaround (we_t); - wf_t = swap_workaround (wf_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); + wa_t = swap32 (wa_t); + wb_t = swap32 (wb_t); + wc_t = swap32 (wc_t); + wd_t = swap32 (wd_t); + we_t = swap32 (we_t); + wf_t = swap32 (wf_t); a = SHA1M_A; b = SHA1M_B; @@ -545,8 +545,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo // 2nd transform - w0_t = swap_workaround (w2t[0]); - w1_t = swap_workaround (w2t[1]); + w0_t = swap32 (w2t[0]); + w1_t = swap32 (w2t[1]); w2_t = 0x80000000; w3_t = 0; w4_t = 0; @@ -898,20 +898,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); 
+ u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -1079,22 +1079,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo // 1st transform - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); - wa_t = swap_workaround (wa_t); - wb_t = swap_workaround (wb_t); - wc_t = swap_workaround (wc_t); - wd_t = swap_workaround (wd_t); - we_t = swap_workaround (we_t); - wf_t = swap_workaround (wf_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); + wa_t = swap32 (wa_t); + wb_t = swap32 (wb_t); + wc_t = swap32 (wc_t); + wd_t = swap32 (wd_t); + we_t = swap32 (we_t); + wf_t = swap32 (wf_t); a = SHA1M_A; b = SHA1M_B; @@ -1212,8 +1212,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo // 2nd transform - w0_t = swap_workaround (w2t[0]); - w1_t = swap_workaround (w2t[1]); + w0_t = swap32 (w2t[0]); + w1_t = swap32 (w2t[1]); w2_t = 0x80000000; w3_t = 0; w4_t = 0; diff --git a/OpenCL/m07600_a3.cl b/OpenCL/m07600_a3.cl index a8d6ecf..9652cd4 100644 --- a/OpenCL/m07600_a3.cl +++ b/OpenCL/m07600_a3.cl @@ -272,22 +272,22 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // 1st transform - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); - 
wa_t = swap_workaround (wa_t); - wb_t = swap_workaround (wb_t); - wc_t = swap_workaround (wc_t); - wd_t = swap_workaround (wd_t); - we_t = swap_workaround (we_t); - wf_t = swap_workaround (wf_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); + wa_t = swap32 (wa_t); + wb_t = swap32 (wb_t); + wc_t = swap32 (wc_t); + wd_t = swap32 (wd_t); + we_t = swap32 (we_t); + wf_t = swap32 (wf_t); a = SHA1M_A; b = SHA1M_B; @@ -405,8 +405,8 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // 2nd transform - w0_t = swap_workaround (w2t[0]); - w1_t = swap_workaround (w2t[1]); + w0_t = swap32 (w2t[0]); + w1_t = swap32 (w2t[1]); w2_t = 0x80000000; w3_t = 0; w4_t = 0; @@ -785,22 +785,22 @@ static void m07600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // 1st transform - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); - wa_t = swap_workaround (wa_t); - wb_t = swap_workaround (wb_t); - wc_t = swap_workaround (wc_t); - wd_t = swap_workaround (wd_t); - we_t = swap_workaround (we_t); - wf_t = swap_workaround (wf_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); + wa_t = swap32 (wa_t); + wb_t = swap32 (wb_t); + wc_t = swap32 (wc_t); + wd_t = swap32 (wd_t); + we_t = swap32 (we_t); + wf_t = swap32 (wf_t); a = SHA1M_A; b = SHA1M_B; @@ -918,8 +918,8 @@ static void m07600s (u32 w0[4], u32 
w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // 2nd transform - w0_t = swap_workaround (w2t[0]); - w1_t = swap_workaround (w2t[1]); + w0_t = swap32 (w2t[0]); + w1_t = swap32 (w2t[1]); w2_t = 0x80000000; w3_t = 0; w4_t = 0; diff --git a/OpenCL/m07800_a0.cl b/OpenCL/m07800_a0.cl index 7d7decd..78e4df0 100644 --- a/OpenCL/m07800_a0.cl +++ b/OpenCL/m07800_a0.cl @@ -46,22 +46,22 @@ __constant u32 theMagicArray[64] = static void swap_buffer (u32 final[16]) { - final[ 0] = swap_workaround (final[ 0]); - final[ 1] = swap_workaround (final[ 1]); - final[ 2] = swap_workaround (final[ 2]); - final[ 3] = swap_workaround (final[ 3]); - final[ 4] = swap_workaround (final[ 4]); - final[ 5] = swap_workaround (final[ 5]); - final[ 6] = swap_workaround (final[ 6]); - final[ 7] = swap_workaround (final[ 7]); - final[ 8] = swap_workaround (final[ 8]); - final[ 9] = swap_workaround (final[ 9]); - final[10] = swap_workaround (final[10]); - final[11] = swap_workaround (final[11]); - final[12] = swap_workaround (final[12]); - final[13] = swap_workaround (final[13]); - final[14] = swap_workaround (final[14]); - final[15] = swap_workaround (final[15]); + final[ 0] = swap32 (final[ 0]); + final[ 1] = swap32 (final[ 1]); + final[ 2] = swap32 (final[ 2]); + final[ 3] = swap32 (final[ 3]); + final[ 4] = swap32 (final[ 4]); + final[ 5] = swap32 (final[ 5]); + final[ 6] = swap32 (final[ 6]); + final[ 7] = swap32 (final[ 7]); + final[ 8] = swap32 (final[ 8]); + final[ 9] = swap32 (final[ 9]); + final[10] = swap32 (final[10]); + final[11] = swap32 (final[11]); + final[12] = swap32 (final[12]); + final[13] = swap32 (final[13]); + final[14] = swap32 (final[14]); + final[15] = swap32 (final[15]); } static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) @@ -319,20 +319,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07800_m04 (__glo u32 final[256]; - final[ 0] = swap_workaround (w0[0] | s0[0]); - final[ 1] = swap_workaround 
(w0[1] | s0[1]); - final[ 2] = swap_workaround (w0[2] | s0[2]); - final[ 3] = swap_workaround (w0[3] | s0[3]); - final[ 4] = swap_workaround (w1[0] | s1[0]); - final[ 5] = swap_workaround (w1[1] | s1[1]); - final[ 6] = swap_workaround (w1[2] | s1[2]); - final[ 7] = swap_workaround (w1[3] | s1[3]); - final[ 8] = swap_workaround (w2[0] | s2[0]); - final[ 9] = swap_workaround (w2[1] | s2[1]); - final[10] = swap_workaround (w2[2] | s2[2]); - final[11] = swap_workaround (w2[3] | s2[3]); - final[12] = swap_workaround (w3[0] | s3[0]); - final[13] = swap_workaround (w3[1] | s3[1]); + final[ 0] = swap32 (w0[0] | s0[0]); + final[ 1] = swap32 (w0[1] | s0[1]); + final[ 2] = swap32 (w0[2] | s0[2]); + final[ 3] = swap32 (w0[3] | s0[3]); + final[ 4] = swap32 (w1[0] | s1[0]); + final[ 5] = swap32 (w1[1] | s1[1]); + final[ 6] = swap32 (w1[2] | s1[2]); + final[ 7] = swap32 (w1[3] | s1[3]); + final[ 8] = swap32 (w2[0] | s2[0]); + final[ 9] = swap32 (w2[1] | s2[1]); + final[10] = swap32 (w2[2] | s2[2]); + final[11] = swap32 (w2[3] | s2[3]); + final[12] = swap32 (w3[0] | s3[0]); + final[13] = swap32 (w3[1] | s3[1]); final[14] = 0; final[15] = pw_salt_len * 8; @@ -599,20 +599,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07800_s04 (__glo u32 final[256]; - final[ 0] = swap_workaround (w0[0] | s0[0]); - final[ 1] = swap_workaround (w0[1] | s0[1]); - final[ 2] = swap_workaround (w0[2] | s0[2]); - final[ 3] = swap_workaround (w0[3] | s0[3]); - final[ 4] = swap_workaround (w1[0] | s1[0]); - final[ 5] = swap_workaround (w1[1] | s1[1]); - final[ 6] = swap_workaround (w1[2] | s1[2]); - final[ 7] = swap_workaround (w1[3] | s1[3]); - final[ 8] = swap_workaround (w2[0] | s2[0]); - final[ 9] = swap_workaround (w2[1] | s2[1]); - final[10] = swap_workaround (w2[2] | s2[2]); - final[11] = swap_workaround (w2[3] | s2[3]); - final[12] = swap_workaround (w3[0] | s3[0]); - final[13] = swap_workaround (w3[1] | s3[1]); + final[ 0] = swap32 (w0[0] | s0[0]); + final[ 1] = swap32 (w0[1] 
| s0[1]); + final[ 2] = swap32 (w0[2] | s0[2]); + final[ 3] = swap32 (w0[3] | s0[3]); + final[ 4] = swap32 (w1[0] | s1[0]); + final[ 5] = swap32 (w1[1] | s1[1]); + final[ 6] = swap32 (w1[2] | s1[2]); + final[ 7] = swap32 (w1[3] | s1[3]); + final[ 8] = swap32 (w2[0] | s2[0]); + final[ 9] = swap32 (w2[1] | s2[1]); + final[10] = swap32 (w2[2] | s2[2]); + final[11] = swap32 (w2[3] | s2[3]); + final[12] = swap32 (w3[0] | s3[0]); + final[13] = swap32 (w3[1] | s3[1]); final[14] = 0; final[15] = pw_salt_len * 8; diff --git a/OpenCL/m07800_a1.cl b/OpenCL/m07800_a1.cl index 1a7786f..c20d59b 100644 --- a/OpenCL/m07800_a1.cl +++ b/OpenCL/m07800_a1.cl @@ -44,22 +44,22 @@ __constant u32 theMagicArray[64] = static void swap_buffer (u32 final[16]) { - final[ 0] = swap_workaround (final[ 0]); - final[ 1] = swap_workaround (final[ 1]); - final[ 2] = swap_workaround (final[ 2]); - final[ 3] = swap_workaround (final[ 3]); - final[ 4] = swap_workaround (final[ 4]); - final[ 5] = swap_workaround (final[ 5]); - final[ 6] = swap_workaround (final[ 6]); - final[ 7] = swap_workaround (final[ 7]); - final[ 8] = swap_workaround (final[ 8]); - final[ 9] = swap_workaround (final[ 9]); - final[10] = swap_workaround (final[10]); - final[11] = swap_workaround (final[11]); - final[12] = swap_workaround (final[12]); - final[13] = swap_workaround (final[13]); - final[14] = swap_workaround (final[14]); - final[15] = swap_workaround (final[15]); + final[ 0] = swap32 (final[ 0]); + final[ 1] = swap32 (final[ 1]); + final[ 2] = swap32 (final[ 2]); + final[ 3] = swap32 (final[ 3]); + final[ 4] = swap32 (final[ 4]); + final[ 5] = swap32 (final[ 5]); + final[ 6] = swap32 (final[ 6]); + final[ 7] = swap32 (final[ 7]); + final[ 8] = swap32 (final[ 8]); + final[ 9] = swap32 (final[ 9]); + final[10] = swap32 (final[10]); + final[11] = swap32 (final[11]); + final[12] = swap32 (final[12]); + final[13] = swap32 (final[13]); + final[14] = swap32 (final[14]); + final[15] = swap32 (final[15]); } static void 
sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) @@ -371,20 +371,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07800_m04 (__glo u32 final[256]; - final[ 0] = swap_workaround (w0[0] | s0[0]); - final[ 1] = swap_workaround (w0[1] | s0[1]); - final[ 2] = swap_workaround (w0[2] | s0[2]); - final[ 3] = swap_workaround (w0[3] | s0[3]); - final[ 4] = swap_workaround (w1[0] | s1[0]); - final[ 5] = swap_workaround (w1[1] | s1[1]); - final[ 6] = swap_workaround (w1[2] | s1[2]); - final[ 7] = swap_workaround (w1[3] | s1[3]); - final[ 8] = swap_workaround (w2[0] | s2[0]); - final[ 9] = swap_workaround (w2[1] | s2[1]); - final[10] = swap_workaround (w2[2] | s2[2]); - final[11] = swap_workaround (w2[3] | s2[3]); - final[12] = swap_workaround (w3[0] | s3[0]); - final[13] = swap_workaround (w3[1] | s3[1]); + final[ 0] = swap32 (w0[0] | s0[0]); + final[ 1] = swap32 (w0[1] | s0[1]); + final[ 2] = swap32 (w0[2] | s0[2]); + final[ 3] = swap32 (w0[3] | s0[3]); + final[ 4] = swap32 (w1[0] | s1[0]); + final[ 5] = swap32 (w1[1] | s1[1]); + final[ 6] = swap32 (w1[2] | s1[2]); + final[ 7] = swap32 (w1[3] | s1[3]); + final[ 8] = swap32 (w2[0] | s2[0]); + final[ 9] = swap32 (w2[1] | s2[1]); + final[10] = swap32 (w2[2] | s2[2]); + final[11] = swap32 (w2[3] | s2[3]); + final[12] = swap32 (w3[0] | s3[0]); + final[13] = swap32 (w3[1] | s3[1]); final[14] = 0; final[15] = pw_salt_len * 8; @@ -701,20 +701,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07800_s04 (__glo u32 final[256]; - final[ 0] = swap_workaround (w0[0] | s0[0]); - final[ 1] = swap_workaround (w0[1] | s0[1]); - final[ 2] = swap_workaround (w0[2] | s0[2]); - final[ 3] = swap_workaround (w0[3] | s0[3]); - final[ 4] = swap_workaround (w1[0] | s1[0]); - final[ 5] = swap_workaround (w1[1] | s1[1]); - final[ 6] = swap_workaround (w1[2] | s1[2]); - final[ 7] = swap_workaround (w1[3] | s1[3]); - final[ 8] = swap_workaround (w2[0] | s2[0]); - 
final[ 9] = swap_workaround (w2[1] | s2[1]); - final[10] = swap_workaround (w2[2] | s2[2]); - final[11] = swap_workaround (w2[3] | s2[3]); - final[12] = swap_workaround (w3[0] | s3[0]); - final[13] = swap_workaround (w3[1] | s3[1]); + final[ 0] = swap32 (w0[0] | s0[0]); + final[ 1] = swap32 (w0[1] | s0[1]); + final[ 2] = swap32 (w0[2] | s0[2]); + final[ 3] = swap32 (w0[3] | s0[3]); + final[ 4] = swap32 (w1[0] | s1[0]); + final[ 5] = swap32 (w1[1] | s1[1]); + final[ 6] = swap32 (w1[2] | s1[2]); + final[ 7] = swap32 (w1[3] | s1[3]); + final[ 8] = swap32 (w2[0] | s2[0]); + final[ 9] = swap32 (w2[1] | s2[1]); + final[10] = swap32 (w2[2] | s2[2]); + final[11] = swap32 (w2[3] | s2[3]); + final[12] = swap32 (w3[0] | s3[0]); + final[13] = swap32 (w3[1] | s3[1]); final[14] = 0; final[15] = pw_salt_len * 8; diff --git a/OpenCL/m07800_a3.cl b/OpenCL/m07800_a3.cl index 6c6ed28..06b9829 100644 --- a/OpenCL/m07800_a3.cl +++ b/OpenCL/m07800_a3.cl @@ -44,22 +44,22 @@ __constant u32 theMagicArray[64] = static void swap_buffer (u32 final[16]) { - final[ 0] = swap_workaround (final[ 0]); - final[ 1] = swap_workaround (final[ 1]); - final[ 2] = swap_workaround (final[ 2]); - final[ 3] = swap_workaround (final[ 3]); - final[ 4] = swap_workaround (final[ 4]); - final[ 5] = swap_workaround (final[ 5]); - final[ 6] = swap_workaround (final[ 6]); - final[ 7] = swap_workaround (final[ 7]); - final[ 8] = swap_workaround (final[ 8]); - final[ 9] = swap_workaround (final[ 9]); - final[10] = swap_workaround (final[10]); - final[11] = swap_workaround (final[11]); - final[12] = swap_workaround (final[12]); - final[13] = swap_workaround (final[13]); - final[14] = swap_workaround (final[14]); - final[15] = swap_workaround (final[15]); + final[ 0] = swap32 (final[ 0]); + final[ 1] = swap32 (final[ 1]); + final[ 2] = swap32 (final[ 2]); + final[ 3] = swap32 (final[ 3]); + final[ 4] = swap32 (final[ 4]); + final[ 5] = swap32 (final[ 5]); + final[ 6] = swap32 (final[ 6]); + final[ 7] = swap32 (final[ 
7]); + final[ 8] = swap32 (final[ 8]); + final[ 9] = swap32 (final[ 9]); + final[10] = swap32 (final[10]); + final[11] = swap32 (final[11]); + final[12] = swap32 (final[12]); + final[13] = swap32 (final[13]); + final[14] = swap32 (final[14]); + final[15] = swap32 (final[15]); } static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) @@ -199,15 +199,15 @@ static void m07800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 gid = get_global_id (0); const u32 lid = get_local_id (0); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); /** * salt @@ -266,7 +266,7 @@ static void m07800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) { - const u32 w0r = swap_workaround (bfs_buf[il_pos].i); + const u32 w0r = swap32 (bfs_buf[il_pos].i); w0[0] = w0l | w0r; @@ -276,20 +276,20 @@ static void m07800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 final[256]; - final[ 0] = swap_workaround (w0[0] | s0[0]); - final[ 1] = swap_workaround (w0[1] | s0[1]); - final[ 2] = swap_workaround (w0[2] | s0[2]); - final[ 3] = swap_workaround (w0[3] | s0[3]); - final[ 4] = swap_workaround (w1[0] | s1[0]); - final[ 5] = swap_workaround (w1[1] | s1[1]); - final[ 6] = swap_workaround (w1[2] | s1[2]); - final[ 7] = swap_workaround (w1[3] | s1[3]); - final[ 8] = swap_workaround (w2[0] | s2[0]); - final[ 9] = swap_workaround (w2[1] | s2[1]); - final[10] = swap_workaround (w2[2] | s2[2]); - final[11] = swap_workaround 
(w2[3] | s2[3]); - final[12] = swap_workaround (w3[0] | s3[0]); - final[13] = swap_workaround (w3[1] | s3[1]); + final[ 0] = swap32 (w0[0] | s0[0]); + final[ 1] = swap32 (w0[1] | s0[1]); + final[ 2] = swap32 (w0[2] | s0[2]); + final[ 3] = swap32 (w0[3] | s0[3]); + final[ 4] = swap32 (w1[0] | s1[0]); + final[ 5] = swap32 (w1[1] | s1[1]); + final[ 6] = swap32 (w1[2] | s1[2]); + final[ 7] = swap32 (w1[3] | s1[3]); + final[ 8] = swap32 (w2[0] | s2[0]); + final[ 9] = swap32 (w2[1] | s2[1]); + final[10] = swap32 (w2[2] | s2[2]); + final[11] = swap32 (w2[3] | s2[3]); + final[12] = swap32 (w3[0] | s3[0]); + final[13] = swap32 (w3[1] | s3[1]); final[14] = 0; final[15] = pw_salt_len * 8; @@ -418,15 +418,15 @@ static void m07800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 gid = get_global_id (0); const u32 lid = get_local_id (0); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); /** * salt @@ -497,7 +497,7 @@ static void m07800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) { - const u32 w0r = swap_workaround (bfs_buf[il_pos].i); + const u32 w0r = swap32 (bfs_buf[il_pos].i); w0[0] = w0l | w0r; @@ -507,20 +507,20 @@ static void m07800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 final[256]; - final[ 0] = swap_workaround (w0[0] | s0[0]); - final[ 1] = swap_workaround (w0[1] | s0[1]); - final[ 2] = swap_workaround (w0[2] | s0[2]); - final[ 3] = swap_workaround (w0[3] | s0[3]); - final[ 4] = swap_workaround (w1[0] | s1[0]); - final[ 5] = 
swap_workaround (w1[1] | s1[1]); - final[ 6] = swap_workaround (w1[2] | s1[2]); - final[ 7] = swap_workaround (w1[3] | s1[3]); - final[ 8] = swap_workaround (w2[0] | s2[0]); - final[ 9] = swap_workaround (w2[1] | s2[1]); - final[10] = swap_workaround (w2[2] | s2[2]); - final[11] = swap_workaround (w2[3] | s2[3]); - final[12] = swap_workaround (w3[0] | s3[0]); - final[13] = swap_workaround (w3[1] | s3[1]); + final[ 0] = swap32 (w0[0] | s0[0]); + final[ 1] = swap32 (w0[1] | s0[1]); + final[ 2] = swap32 (w0[2] | s0[2]); + final[ 3] = swap32 (w0[3] | s0[3]); + final[ 4] = swap32 (w1[0] | s1[0]); + final[ 5] = swap32 (w1[1] | s1[1]); + final[ 6] = swap32 (w1[2] | s1[2]); + final[ 7] = swap32 (w1[3] | s1[3]); + final[ 8] = swap32 (w2[0] | s2[0]); + final[ 9] = swap32 (w2[1] | s2[1]); + final[10] = swap32 (w2[2] | s2[2]); + final[11] = swap32 (w2[3] | s2[3]); + final[12] = swap32 (w3[0] | s3[0]); + final[13] = swap32 (w3[1] | s3[1]); final[14] = 0; final[15] = pw_salt_len * 8; diff --git a/OpenCL/m07900.cl b/OpenCL/m07900.cl index 66061ea..c6812a7 100644 --- a/OpenCL/m07900.cl +++ b/OpenCL/m07900.cl @@ -190,15 +190,15 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07900_init (__gl u64 w[16]; - w[ 0] = ((u64) swap_workaround (salt_buf[0])) << 32 | (u64) swap_workaround (salt_buf[1]); - w[ 1] = ((u64) swap_workaround (w0[0])) << 32 | (u64) swap_workaround (w0[1]); - w[ 2] = ((u64) swap_workaround (w0[2])) << 32 | (u64) swap_workaround (w0[3]); - w[ 3] = ((u64) swap_workaround (w1[0])) << 32 | (u64) swap_workaround (w1[1]); - w[ 4] = ((u64) swap_workaround (w1[2])) << 32 | (u64) swap_workaround (w1[3]); - w[ 5] = ((u64) swap_workaround (w2[0])) << 32 | (u64) swap_workaround (w2[1]); - w[ 6] = ((u64) swap_workaround (w2[2])) << 32 | (u64) swap_workaround (w2[3]); - w[ 7] = ((u64) swap_workaround (w3[0])) << 32 | (u64) swap_workaround (w3[1]); - w[ 8] = ((u64) swap_workaround (w3[2])) << 32 | (u64) swap_workaround (w3[3]); + w[ 0] = ((u64) swap32 
(salt_buf[0])) << 32 | (u64) swap32 (salt_buf[1]); + w[ 1] = ((u64) swap32 (w0[0])) << 32 | (u64) swap32 (w0[1]); + w[ 2] = ((u64) swap32 (w0[2])) << 32 | (u64) swap32 (w0[3]); + w[ 3] = ((u64) swap32 (w1[0])) << 32 | (u64) swap32 (w1[1]); + w[ 4] = ((u64) swap32 (w1[2])) << 32 | (u64) swap32 (w1[3]); + w[ 5] = ((u64) swap32 (w2[0])) << 32 | (u64) swap32 (w2[1]); + w[ 6] = ((u64) swap32 (w2[2])) << 32 | (u64) swap32 (w2[3]); + w[ 7] = ((u64) swap32 (w3[0])) << 32 | (u64) swap32 (w3[1]); + w[ 8] = ((u64) swap32 (w3[2])) << 32 | (u64) swap32 (w3[3]); w[ 9] = 0; w[10] = 0; w[11] = 0; @@ -307,12 +307,12 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07900_loop (__gl w[ 5] = 0; w[ 6] = 0; w[ 7] = 0; - w[ 8] = ((u64) swap_workaround (w0[0])) << 32 | (u64) swap_workaround (w0[1]); - w[ 9] = ((u64) swap_workaround (w0[2])) << 32 | (u64) swap_workaround (w0[3]); - w[10] = ((u64) swap_workaround (w1[0])) << 32 | (u64) swap_workaround (w1[1]); - w[11] = ((u64) swap_workaround (w1[2])) << 32 | (u64) swap_workaround (w1[3]); - w[12] = ((u64) swap_workaround (w2[0])) << 32 | (u64) swap_workaround (w2[1]); - w[13] = ((u64) swap_workaround (w2[2])) << 32 | (u64) swap_workaround (w2[3]); + w[ 8] = ((u64) swap32 (w0[0])) << 32 | (u64) swap32 (w0[1]); + w[ 9] = ((u64) swap32 (w0[2])) << 32 | (u64) swap32 (w0[3]); + w[10] = ((u64) swap32 (w1[0])) << 32 | (u64) swap32 (w1[1]); + w[11] = ((u64) swap32 (w1[2])) << 32 | (u64) swap32 (w1[3]); + w[12] = ((u64) swap32 (w2[0])) << 32 | (u64) swap32 (w2[1]); + w[13] = ((u64) swap32 (w2[2])) << 32 | (u64) swap32 (w2[3]); w[14] = 0; w[15] = block_len * 8; diff --git a/OpenCL/m08000_a0.cl b/OpenCL/m08000_a0.cl index 2f10770..87a2748 100644 --- a/OpenCL/m08000_a0.cl +++ b/OpenCL/m08000_a0.cl @@ -164,9 +164,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08000_m04 (__glo * salt */ - const u32 salt_buf0 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - const u32 salt_buf1 = swap_workaround 
(salt_bufs[salt_pos].salt_buf[ 1]); - const u32 salt_buf2 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 + const u32 salt_buf0 = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 /** * loop @@ -209,22 +209,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08000_m04 (__glo u32 w_t[16]; - w_t[ 0] = swap_workaround (w0_t[0]); - w_t[ 1] = swap_workaround (w0_t[1]); - w_t[ 2] = swap_workaround (w0_t[2]); - w_t[ 3] = swap_workaround (w0_t[3]); - w_t[ 4] = swap_workaround (w1_t[0]); - w_t[ 5] = swap_workaround (w1_t[1]); - w_t[ 6] = swap_workaround (w1_t[2]); - w_t[ 7] = swap_workaround (w1_t[3]); - w_t[ 8] = swap_workaround (w2_t[0]); - w_t[ 9] = swap_workaround (w2_t[1]); - w_t[10] = swap_workaround (w2_t[2]); - w_t[11] = swap_workaround (w2_t[3]); - w_t[12] = swap_workaround (w3_t[0]); - w_t[13] = swap_workaround (w3_t[1]); - w_t[14] = swap_workaround (w3_t[2]); - w_t[15] = swap_workaround (w3_t[3]); + w_t[ 0] = swap32 (w0_t[0]); + w_t[ 1] = swap32 (w0_t[1]); + w_t[ 2] = swap32 (w0_t[2]); + w_t[ 3] = swap32 (w0_t[3]); + w_t[ 4] = swap32 (w1_t[0]); + w_t[ 5] = swap32 (w1_t[1]); + w_t[ 6] = swap32 (w1_t[2]); + w_t[ 7] = swap32 (w1_t[3]); + w_t[ 8] = swap32 (w2_t[0]); + w_t[ 9] = swap32 (w2_t[1]); + w_t[10] = swap32 (w2_t[2]); + w_t[11] = swap32 (w2_t[3]); + w_t[12] = swap32 (w3_t[0]); + w_t[13] = swap32 (w3_t[1]); + w_t[14] = swap32 (w3_t[2]); + w_t[15] = swap32 (w3_t[3]); w_t[ 0] = w_t[ 0] >> 8; w_t[ 1] = w_t[ 1] >> 8; @@ -344,9 +344,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08000_s04 (__glo * salt */ - const u32 salt_buf0 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - const u32 salt_buf1 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - const u32 salt_buf2 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 + const u32 salt_buf0 = swap32 
(salt_bufs[salt_pos].salt_buf[ 0]); + const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 /** * digest @@ -401,22 +401,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08000_s04 (__glo u32 w_t[16]; - w_t[ 0] = swap_workaround (w0_t[0]); - w_t[ 1] = swap_workaround (w0_t[1]); - w_t[ 2] = swap_workaround (w0_t[2]); - w_t[ 3] = swap_workaround (w0_t[3]); - w_t[ 4] = swap_workaround (w1_t[0]); - w_t[ 5] = swap_workaround (w1_t[1]); - w_t[ 6] = swap_workaround (w1_t[2]); - w_t[ 7] = swap_workaround (w1_t[3]); - w_t[ 8] = swap_workaround (w2_t[0]); - w_t[ 9] = swap_workaround (w2_t[1]); - w_t[10] = swap_workaround (w2_t[2]); - w_t[11] = swap_workaround (w2_t[3]); - w_t[12] = swap_workaround (w3_t[0]); - w_t[13] = swap_workaround (w3_t[1]); - w_t[14] = swap_workaround (w3_t[2]); - w_t[15] = swap_workaround (w3_t[3]); + w_t[ 0] = swap32 (w0_t[0]); + w_t[ 1] = swap32 (w0_t[1]); + w_t[ 2] = swap32 (w0_t[2]); + w_t[ 3] = swap32 (w0_t[3]); + w_t[ 4] = swap32 (w1_t[0]); + w_t[ 5] = swap32 (w1_t[1]); + w_t[ 6] = swap32 (w1_t[2]); + w_t[ 7] = swap32 (w1_t[3]); + w_t[ 8] = swap32 (w2_t[0]); + w_t[ 9] = swap32 (w2_t[1]); + w_t[10] = swap32 (w2_t[2]); + w_t[11] = swap32 (w2_t[3]); + w_t[12] = swap32 (w3_t[0]); + w_t[13] = swap32 (w3_t[1]); + w_t[14] = swap32 (w3_t[2]); + w_t[15] = swap32 (w3_t[3]); w_t[ 0] = w_t[ 0] >> 8; w_t[ 1] = w_t[ 1] >> 8; diff --git a/OpenCL/m08000_a1.cl b/OpenCL/m08000_a1.cl index e9154aa..af8f935 100644 --- a/OpenCL/m08000_a1.cl +++ b/OpenCL/m08000_a1.cl @@ -181,9 +181,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08000_m04 (__glo * salt */ - const u32 salt_buf0 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - const u32 salt_buf1 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - const u32 salt_buf2 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 + const u32 salt_buf0 = swap32 
(salt_bufs[salt_pos].salt_buf[ 0]); + const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 /** * loop @@ -251,22 +251,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08000_m04 (__glo u32 w_t[16]; - w_t[ 0] = swap_workaround (w0_t[0]); - w_t[ 1] = swap_workaround (w0_t[1]); - w_t[ 2] = swap_workaround (w0_t[2]); - w_t[ 3] = swap_workaround (w0_t[3]); - w_t[ 4] = swap_workaround (w1_t[0]); - w_t[ 5] = swap_workaround (w1_t[1]); - w_t[ 6] = swap_workaround (w1_t[2]); - w_t[ 7] = swap_workaround (w1_t[3]); - w_t[ 8] = swap_workaround (w2_t[0]); - w_t[ 9] = swap_workaround (w2_t[1]); - w_t[10] = swap_workaround (w2_t[2]); - w_t[11] = swap_workaround (w2_t[3]); - w_t[12] = swap_workaround (w3_t[0]); - w_t[13] = swap_workaround (w3_t[1]); - w_t[14] = swap_workaround (w3_t[2]); - w_t[15] = swap_workaround (w3_t[3]); + w_t[ 0] = swap32 (w0_t[0]); + w_t[ 1] = swap32 (w0_t[1]); + w_t[ 2] = swap32 (w0_t[2]); + w_t[ 3] = swap32 (w0_t[3]); + w_t[ 4] = swap32 (w1_t[0]); + w_t[ 5] = swap32 (w1_t[1]); + w_t[ 6] = swap32 (w1_t[2]); + w_t[ 7] = swap32 (w1_t[3]); + w_t[ 8] = swap32 (w2_t[0]); + w_t[ 9] = swap32 (w2_t[1]); + w_t[10] = swap32 (w2_t[2]); + w_t[11] = swap32 (w2_t[3]); + w_t[12] = swap32 (w3_t[0]); + w_t[13] = swap32 (w3_t[1]); + w_t[14] = swap32 (w3_t[2]); + w_t[15] = swap32 (w3_t[3]); w_t[ 0] = w_t[ 0] >> 8; w_t[ 1] = w_t[ 1] >> 8; @@ -405,9 +405,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08000_s04 (__glo * salt */ - const u32 salt_buf0 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - const u32 salt_buf1 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - const u32 salt_buf2 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 + const u32 salt_buf0 = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 
2]); // 0x80 /** * digest @@ -487,22 +487,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08000_s04 (__glo u32 w_t[16]; - w_t[ 0] = swap_workaround (w0_t[0]); - w_t[ 1] = swap_workaround (w0_t[1]); - w_t[ 2] = swap_workaround (w0_t[2]); - w_t[ 3] = swap_workaround (w0_t[3]); - w_t[ 4] = swap_workaround (w1_t[0]); - w_t[ 5] = swap_workaround (w1_t[1]); - w_t[ 6] = swap_workaround (w1_t[2]); - w_t[ 7] = swap_workaround (w1_t[3]); - w_t[ 8] = swap_workaround (w2_t[0]); - w_t[ 9] = swap_workaround (w2_t[1]); - w_t[10] = swap_workaround (w2_t[2]); - w_t[11] = swap_workaround (w2_t[3]); - w_t[12] = swap_workaround (w3_t[0]); - w_t[13] = swap_workaround (w3_t[1]); - w_t[14] = swap_workaround (w3_t[2]); - w_t[15] = swap_workaround (w3_t[3]); + w_t[ 0] = swap32 (w0_t[0]); + w_t[ 1] = swap32 (w0_t[1]); + w_t[ 2] = swap32 (w0_t[2]); + w_t[ 3] = swap32 (w0_t[3]); + w_t[ 4] = swap32 (w1_t[0]); + w_t[ 5] = swap32 (w1_t[1]); + w_t[ 6] = swap32 (w1_t[2]); + w_t[ 7] = swap32 (w1_t[3]); + w_t[ 8] = swap32 (w2_t[0]); + w_t[ 9] = swap32 (w2_t[1]); + w_t[10] = swap32 (w2_t[2]); + w_t[11] = swap32 (w2_t[3]); + w_t[12] = swap32 (w3_t[0]); + w_t[13] = swap32 (w3_t[1]); + w_t[14] = swap32 (w3_t[2]); + w_t[15] = swap32 (w3_t[3]); w_t[ 0] = w_t[ 0] >> 8; w_t[ 1] = w_t[ 1] >> 8; diff --git a/OpenCL/m08000_a3.cl b/OpenCL/m08000_a3.cl index 49d2816..ce3cdf0 100644 --- a/OpenCL/m08000_a3.cl +++ b/OpenCL/m08000_a3.cl @@ -139,9 +139,9 @@ static void m08000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global g * salt */ - const u32 salt_buf0 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - const u32 salt_buf1 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - const u32 salt_buf2 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 + const u32 salt_buf0 = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 /** * loop 
@@ -256,9 +256,9 @@ static void m08000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global g * salt */ - const u32 salt_buf0 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - const u32 salt_buf1 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - const u32 salt_buf2 = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 + const u32 salt_buf0 = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 /** * loop diff --git a/OpenCL/m08100_a0.cl b/OpenCL/m08100_a0.cl index c5dc31e..f0bba13 100644 --- a/OpenCL/m08100_a0.cl +++ b/OpenCL/m08100_a0.cl @@ -60,8 +60,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m04 (__glo u32 salt_buf0[2]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -129,22 +129,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = 
swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; @@ -303,8 +303,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s04 (__glo u32 salt_buf0[2]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -390,22 +390,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 
(w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; diff --git a/OpenCL/m08100_a1.cl b/OpenCL/m08100_a1.cl index 5cc8e44..264ffc8 100644 --- a/OpenCL/m08100_a1.cl +++ b/OpenCL/m08100_a1.cl @@ -77,8 +77,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m04 (__glo u32 salt_buf0[2]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -175,22 +175,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; @@ -368,8 +368,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s04 (__glo u32 
salt_buf0[2]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -484,22 +484,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s04 (__glo * sha1 */ - w0_t[0] = swap_workaround (w0_t[0]); - w0_t[1] = swap_workaround (w0_t[1]); - w0_t[2] = swap_workaround (w0_t[2]); - w0_t[3] = swap_workaround (w0_t[3]); - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); - //w3_t[2] = swap_workaround (w3_t[2]); - //w3_t[3] = swap_workaround (w3_t[3]); + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + //w3_t[2] = swap32 (w3_t[2]); + //w3_t[3] = swap32 (w3_t[3]); u32 a = SHA1M_A; u32 b = SHA1M_B; diff --git a/OpenCL/m08100_a3.cl b/OpenCL/m08100_a3.cl index ca8d2cd..dcf642c 100644 --- a/OpenCL/m08100_a3.cl +++ b/OpenCL/m08100_a3.cl @@ -435,18 +435,18 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m04 (__glo * base */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); + w0[0] = swap32 (w0[0]); + 
w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); append_0x80_2x4 (w0, w1, pw_len + 1); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); /** * main @@ -499,26 +499,26 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m08 (__glo * base */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); append_0x80_3x4 (w0, w1, w2, pw_len + 1); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); /** * main @@ -571,39 +571,39 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m16 (__glo * base */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] 
= swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); w3[2] = 0; w3[3] = 0; append_0x80_4x4 (w0, w1, w2, w3, pw_len + 1); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); w3[2] = 0; w3[3] = 0; @@ -658,18 +658,18 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s04 (__glo * base */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); append_0x80_2x4 (w0, w1, pw_len + 1); - w0[0] = swap_workaround (w0[0]); - w0[1] = 
swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); /** * main @@ -722,26 +722,26 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s08 (__glo * base */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); append_0x80_3x4 (w0, w1, w2, pw_len + 1); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); /** * main @@ -794,39 +794,39 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s16 (__glo * base */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround 
(w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); w3[2] = 0; w3[3] = 0; append_0x80_4x4 (w0, w1, w2, w3, pw_len + 1); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); w3[2] = 0; w3[3] = 0; diff --git a/OpenCL/m08200.cl b/OpenCL/m08200.cl index 15ba027..9d3b843 100644 --- a/OpenCL/m08200.cl +++ b/OpenCL/m08200.cl @@ -501,9 +501,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08200_init (__gl u64 data_buf[16]; - data_buf[ 0] = ((u64) swap_workaround (salt_buf0[ 0])) << 32 | (u64) swap_workaround (salt_buf0[ 1]); - data_buf[ 1] = ((u64) swap_workaround (salt_buf0[ 2])) << 32 | (u64) swap_workaround (salt_buf0[ 3]); - data_buf[ 2] = ((u64) swap_workaround (salt_buf1[ 0])) << 32 | (u64) swap_workaround (salt_buf1[ 1]); + data_buf[ 0] = ((u64) swap32 (salt_buf0[ 0])) << 32 | 
(u64) swap32 (salt_buf0[ 1]); + data_buf[ 1] = ((u64) swap32 (salt_buf0[ 2])) << 32 | (u64) swap32 (salt_buf0[ 3]); + data_buf[ 2] = ((u64) swap32 (salt_buf1[ 0])) << 32 | (u64) swap32 (salt_buf1[ 1]); data_buf[ 3] = 0; data_buf[ 4] = 0; data_buf[ 5] = 0; @@ -520,14 +520,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08200_init (__gl u64 w[16]; - w[ 0] = ((u64) swap_workaround (w0[0])) << 32 | (u64) swap_workaround (w0[1]); - w[ 1] = ((u64) swap_workaround (w0[2])) << 32 | (u64) swap_workaround (w0[3]); - w[ 2] = ((u64) swap_workaround (w1[0])) << 32 | (u64) swap_workaround (w1[1]); - w[ 3] = ((u64) swap_workaround (w1[2])) << 32 | (u64) swap_workaround (w1[3]); - w[ 4] = ((u64) swap_workaround (w2[0])) << 32 | (u64) swap_workaround (w2[1]); - w[ 5] = ((u64) swap_workaround (w2[2])) << 32 | (u64) swap_workaround (w2[3]); - w[ 6] = ((u64) swap_workaround (w3[0])) << 32 | (u64) swap_workaround (w3[1]); - w[ 7] = ((u64) swap_workaround (w3[2])) << 32 | (u64) swap_workaround (w3[3]); + w[ 0] = ((u64) swap32 (w0[0])) << 32 | (u64) swap32 (w0[1]); + w[ 1] = ((u64) swap32 (w0[2])) << 32 | (u64) swap32 (w0[3]); + w[ 2] = ((u64) swap32 (w1[0])) << 32 | (u64) swap32 (w1[1]); + w[ 3] = ((u64) swap32 (w1[2])) << 32 | (u64) swap32 (w1[3]); + w[ 4] = ((u64) swap32 (w2[0])) << 32 | (u64) swap32 (w2[1]); + w[ 5] = ((u64) swap32 (w2[2])) << 32 | (u64) swap32 (w2[3]); + w[ 6] = ((u64) swap32 (w3[0])) << 32 | (u64) swap32 (w3[1]); + w[ 7] = ((u64) swap32 (w3[2])) << 32 | (u64) swap32 (w3[3]); w[ 8] = 0; w[ 9] = 0; w[10] = 0; diff --git a/OpenCL/m08300_a0.cl b/OpenCL/m08300_a0.cl index d3ecfa8..9641a07 100644 --- a/OpenCL/m08300_a0.cl +++ b/OpenCL/m08300_a0.cl @@ -352,29 +352,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08300_m04 (__glo u32 w0_t2[4]; - w0_t2[0] = swap_workaround (w0_t[0] | d0[0] | s0[0]); - w0_t2[1] = swap_workaround (w0_t[1] | d0[1] | s0[1]); - w0_t2[2] = swap_workaround (w0_t[2] | d0[2] | s0[2]); - w0_t2[3] = 
swap_workaround (w0_t[3] | d0[3] | s0[3]); + w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); + w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); + w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); + w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); u32 w1_t2[4]; - w1_t2[0] = swap_workaround (w1_t[0] | d1[0] | s1[0]); - w1_t2[1] = swap_workaround (w1_t[1] | d1[1] | s1[1]); - w1_t2[2] = swap_workaround (w1_t[2] | d1[2] | s1[2]); - w1_t2[3] = swap_workaround (w1_t[3] | d1[3] | s1[3]); + w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); + w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); + w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); + w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); u32 w2_t2[4]; - w2_t2[0] = swap_workaround (w2_t[0] | d2[0] | s2[0]); - w2_t2[1] = swap_workaround (w2_t[1] | d2[1] | s2[1]); - w2_t2[2] = swap_workaround (w2_t[2] | d2[2] | s2[2]); - w2_t2[3] = swap_workaround (w2_t[3] | d2[3] | s2[3]); + w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); + w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); + w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); + w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); u32 w3_t2[4]; - w3_t2[0] = swap_workaround (w3_t[0] | d3[0] | s3[0]); - w3_t2[1] = swap_workaround (w3_t[1] | d3[1] | s3[1]); + w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); + w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[2] = 0; w3_t2[3] = (1 + out_len + domain_len + 1 + salt_len) * 8; @@ -402,20 +402,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08300_m04 (__glo u32 w1_t3[4]; w1_t3[0] = digest[4]; - w1_t3[1] = swap_workaround (salt_buf0[0]); - w1_t3[2] = swap_workaround (salt_buf0[1]); - w1_t3[3] = swap_workaround (salt_buf0[2]); + w1_t3[1] = swap32 (salt_buf0[0]); + w1_t3[2] = swap32 (salt_buf0[1]); + w1_t3[3] = swap32 (salt_buf0[2]); u32 w2_t3[4]; - w2_t3[0] = swap_workaround (salt_buf0[3]); - w2_t3[1] = swap_workaround (salt_buf1[0]); - w2_t3[2] = swap_workaround (salt_buf1[1]); - w2_t3[3] = swap_workaround (salt_buf1[2]); + w2_t3[0] = swap32 (salt_buf0[3]); + w2_t3[1] = 
swap32 (salt_buf1[0]); + w2_t3[2] = swap32 (salt_buf1[1]); + w2_t3[3] = swap32 (salt_buf1[2]); u32 w3_t3[4]; - w3_t3[0] = swap_workaround (salt_buf1[3]); + w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; w3_t3[2] = 0; w3_t3[3] = (20 + salt_len) * 8; @@ -660,29 +660,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08300_s04 (__glo u32 w0_t2[4]; - w0_t2[0] = swap_workaround (w0_t[0] | d0[0] | s0[0]); - w0_t2[1] = swap_workaround (w0_t[1] | d0[1] | s0[1]); - w0_t2[2] = swap_workaround (w0_t[2] | d0[2] | s0[2]); - w0_t2[3] = swap_workaround (w0_t[3] | d0[3] | s0[3]); + w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); + w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); + w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); + w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); u32 w1_t2[4]; - w1_t2[0] = swap_workaround (w1_t[0] | d1[0] | s1[0]); - w1_t2[1] = swap_workaround (w1_t[1] | d1[1] | s1[1]); - w1_t2[2] = swap_workaround (w1_t[2] | d1[2] | s1[2]); - w1_t2[3] = swap_workaround (w1_t[3] | d1[3] | s1[3]); + w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); + w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); + w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); + w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); u32 w2_t2[4]; - w2_t2[0] = swap_workaround (w2_t[0] | d2[0] | s2[0]); - w2_t2[1] = swap_workaround (w2_t[1] | d2[1] | s2[1]); - w2_t2[2] = swap_workaround (w2_t[2] | d2[2] | s2[2]); - w2_t2[3] = swap_workaround (w2_t[3] | d2[3] | s2[3]); + w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); + w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); + w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); + w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); u32 w3_t2[4]; - w3_t2[0] = swap_workaround (w3_t[0] | d3[0] | s3[0]); - w3_t2[1] = swap_workaround (w3_t[1] | d3[1] | s3[1]); + w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); + w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[2] = 0; w3_t2[3] = (1 + out_len + domain_len + 1 + salt_len) * 8; @@ -710,20 +710,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 
1, 1))) m08300_s04 (__glo u32 w1_t3[4]; w1_t3[0] = digest[4]; - w1_t3[1] = swap_workaround (salt_buf0[0]); - w1_t3[2] = swap_workaround (salt_buf0[1]); - w1_t3[3] = swap_workaround (salt_buf0[2]); + w1_t3[1] = swap32 (salt_buf0[0]); + w1_t3[2] = swap32 (salt_buf0[1]); + w1_t3[3] = swap32 (salt_buf0[2]); u32 w2_t3[4]; - w2_t3[0] = swap_workaround (salt_buf0[3]); - w2_t3[1] = swap_workaround (salt_buf1[0]); - w2_t3[2] = swap_workaround (salt_buf1[1]); - w2_t3[3] = swap_workaround (salt_buf1[2]); + w2_t3[0] = swap32 (salt_buf0[3]); + w2_t3[1] = swap32 (salt_buf1[0]); + w2_t3[2] = swap32 (salt_buf1[1]); + w2_t3[3] = swap32 (salt_buf1[2]); u32 w3_t3[4]; - w3_t3[0] = swap_workaround (salt_buf1[3]); + w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; w3_t3[2] = 0; w3_t3[3] = (20 + salt_len) * 8; diff --git a/OpenCL/m08300_a1.cl b/OpenCL/m08300_a1.cl index 1c026c9..52ad037 100644 --- a/OpenCL/m08300_a1.cl +++ b/OpenCL/m08300_a1.cl @@ -398,29 +398,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08300_m04 (__glo u32 w0_t2[4]; - w0_t2[0] = swap_workaround (w0_t[0] | d0[0] | s0[0]); - w0_t2[1] = swap_workaround (w0_t[1] | d0[1] | s0[1]); - w0_t2[2] = swap_workaround (w0_t[2] | d0[2] | s0[2]); - w0_t2[3] = swap_workaround (w0_t[3] | d0[3] | s0[3]); + w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); + w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); + w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); + w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); u32 w1_t2[4]; - w1_t2[0] = swap_workaround (w1_t[0] | d1[0] | s1[0]); - w1_t2[1] = swap_workaround (w1_t[1] | d1[1] | s1[1]); - w1_t2[2] = swap_workaround (w1_t[2] | d1[2] | s1[2]); - w1_t2[3] = swap_workaround (w1_t[3] | d1[3] | s1[3]); + w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); + w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); + w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); + w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); u32 w2_t2[4]; - w2_t2[0] = swap_workaround (w2_t[0] | d2[0] | s2[0]); - w2_t2[1] = swap_workaround (w2_t[1] | 
d2[1] | s2[1]); - w2_t2[2] = swap_workaround (w2_t[2] | d2[2] | s2[2]); - w2_t2[3] = swap_workaround (w2_t[3] | d2[3] | s2[3]); + w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); + w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); + w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); + w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); u32 w3_t2[4]; - w3_t2[0] = swap_workaround (w3_t[0] | d3[0] | s3[0]); - w3_t2[1] = swap_workaround (w3_t[1] | d3[1] | s3[1]); + w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); + w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[2] = 0; w3_t2[3] = (1 + pw_len + domain_len + 1 + salt_len) * 8; @@ -448,20 +448,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08300_m04 (__glo u32 w1_t3[4]; w1_t3[0] = digest[4]; - w1_t3[1] = swap_workaround (salt_buf0[0]); - w1_t3[2] = swap_workaround (salt_buf0[1]); - w1_t3[3] = swap_workaround (salt_buf0[2]); + w1_t3[1] = swap32 (salt_buf0[0]); + w1_t3[2] = swap32 (salt_buf0[1]); + w1_t3[3] = swap32 (salt_buf0[2]); u32 w2_t3[4]; - w2_t3[0] = swap_workaround (salt_buf0[3]); - w2_t3[1] = swap_workaround (salt_buf1[0]); - w2_t3[2] = swap_workaround (salt_buf1[1]); - w2_t3[3] = swap_workaround (salt_buf1[2]); + w2_t3[0] = swap32 (salt_buf0[3]); + w2_t3[1] = swap32 (salt_buf1[0]); + w2_t3[2] = swap32 (salt_buf1[1]); + w2_t3[3] = swap32 (salt_buf1[2]); u32 w3_t3[4]; - w3_t3[0] = swap_workaround (salt_buf1[3]); + w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; w3_t3[2] = 0; w3_t3[3] = (20 + salt_len) * 8; @@ -754,29 +754,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08300_s04 (__glo u32 w0_t2[4]; - w0_t2[0] = swap_workaround (w0_t[0] | d0[0] | s0[0]); - w0_t2[1] = swap_workaround (w0_t[1] | d0[1] | s0[1]); - w0_t2[2] = swap_workaround (w0_t[2] | d0[2] | s0[2]); - w0_t2[3] = swap_workaround (w0_t[3] | d0[3] | s0[3]); + w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); + w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); + w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); + w0_t2[3] = swap32 (w0_t[3] | 
d0[3] | s0[3]); u32 w1_t2[4]; - w1_t2[0] = swap_workaround (w1_t[0] | d1[0] | s1[0]); - w1_t2[1] = swap_workaround (w1_t[1] | d1[1] | s1[1]); - w1_t2[2] = swap_workaround (w1_t[2] | d1[2] | s1[2]); - w1_t2[3] = swap_workaround (w1_t[3] | d1[3] | s1[3]); + w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); + w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); + w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); + w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); u32 w2_t2[4]; - w2_t2[0] = swap_workaround (w2_t[0] | d2[0] | s2[0]); - w2_t2[1] = swap_workaround (w2_t[1] | d2[1] | s2[1]); - w2_t2[2] = swap_workaround (w2_t[2] | d2[2] | s2[2]); - w2_t2[3] = swap_workaround (w2_t[3] | d2[3] | s2[3]); + w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); + w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); + w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); + w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); u32 w3_t2[4]; - w3_t2[0] = swap_workaround (w3_t[0] | d3[0] | s3[0]); - w3_t2[1] = swap_workaround (w3_t[1] | d3[1] | s3[1]); + w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); + w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[2] = 0; w3_t2[3] = (1 + pw_len + domain_len + 1 + salt_len) * 8; @@ -804,20 +804,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08300_s04 (__glo u32 w1_t3[4]; w1_t3[0] = digest[4]; - w1_t3[1] = swap_workaround (salt_buf0[0]); - w1_t3[2] = swap_workaround (salt_buf0[1]); - w1_t3[3] = swap_workaround (salt_buf0[2]); + w1_t3[1] = swap32 (salt_buf0[0]); + w1_t3[2] = swap32 (salt_buf0[1]); + w1_t3[3] = swap32 (salt_buf0[2]); u32 w2_t3[4]; - w2_t3[0] = swap_workaround (salt_buf0[3]); - w2_t3[1] = swap_workaround (salt_buf1[0]); - w2_t3[2] = swap_workaround (salt_buf1[1]); - w2_t3[3] = swap_workaround (salt_buf1[2]); + w2_t3[0] = swap32 (salt_buf0[3]); + w2_t3[1] = swap32 (salt_buf1[0]); + w2_t3[2] = swap32 (salt_buf1[1]); + w2_t3[3] = swap32 (salt_buf1[2]); u32 w3_t3[4]; - w3_t3[0] = swap_workaround (salt_buf1[3]); + w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; w3_t3[2] = 
0; w3_t3[3] = (20 + salt_len) * 8; diff --git a/OpenCL/m08300_a3.cl b/OpenCL/m08300_a3.cl index ed51d67..fad8e5d 100644 --- a/OpenCL/m08300_a3.cl +++ b/OpenCL/m08300_a3.cl @@ -309,29 +309,29 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0_t2[4]; - w0_t2[0] = swap_workaround (w0_t[0] | d0[0] | s0[0]); - w0_t2[1] = swap_workaround (w0_t[1] | d0[1] | s0[1]); - w0_t2[2] = swap_workaround (w0_t[2] | d0[2] | s0[2]); - w0_t2[3] = swap_workaround (w0_t[3] | d0[3] | s0[3]); + w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); + w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); + w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); + w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); u32 w1_t2[4]; - w1_t2[0] = swap_workaround (w1_t[0] | d1[0] | s1[0]); - w1_t2[1] = swap_workaround (w1_t[1] | d1[1] | s1[1]); - w1_t2[2] = swap_workaround (w1_t[2] | d1[2] | s1[2]); - w1_t2[3] = swap_workaround (w1_t[3] | d1[3] | s1[3]); + w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); + w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); + w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); + w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); u32 w2_t2[4]; - w2_t2[0] = swap_workaround (w2_t[0] | d2[0] | s2[0]); - w2_t2[1] = swap_workaround (w2_t[1] | d2[1] | s2[1]); - w2_t2[2] = swap_workaround (w2_t[2] | d2[2] | s2[2]); - w2_t2[3] = swap_workaround (w2_t[3] | d2[3] | s2[3]); + w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); + w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); + w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); + w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); u32 w3_t2[4]; - w3_t2[0] = swap_workaround (w3_t[0] | d3[0] | s3[0]); - w3_t2[1] = swap_workaround (w3_t[1] | d3[1] | s3[1]); + w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); + w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[2] = 0; w3_t2[3] = (1 + pw_len + domain_len + 1 + salt_len) * 8; @@ -359,20 +359,20 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w1_t3[4]; w1_t3[0] = digest[4]; - w1_t3[1] = 
swap_workaround (salt_buf0[0]); - w1_t3[2] = swap_workaround (salt_buf0[1]); - w1_t3[3] = swap_workaround (salt_buf0[2]); + w1_t3[1] = swap32 (salt_buf0[0]); + w1_t3[2] = swap32 (salt_buf0[1]); + w1_t3[3] = swap32 (salt_buf0[2]); u32 w2_t3[4]; - w2_t3[0] = swap_workaround (salt_buf0[3]); - w2_t3[1] = swap_workaround (salt_buf1[0]); - w2_t3[2] = swap_workaround (salt_buf1[1]); - w2_t3[3] = swap_workaround (salt_buf1[2]); + w2_t3[0] = swap32 (salt_buf0[3]); + w2_t3[1] = swap32 (salt_buf1[0]); + w2_t3[2] = swap32 (salt_buf1[1]); + w2_t3[3] = swap32 (salt_buf1[2]); u32 w3_t3[4]; - w3_t3[0] = swap_workaround (salt_buf1[3]); + w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; w3_t3[2] = 0; w3_t3[3] = (20 + salt_len) * 8; @@ -568,29 +568,29 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0_t2[4]; - w0_t2[0] = swap_workaround (w0_t[0] | d0[0] | s0[0]); - w0_t2[1] = swap_workaround (w0_t[1] | d0[1] | s0[1]); - w0_t2[2] = swap_workaround (w0_t[2] | d0[2] | s0[2]); - w0_t2[3] = swap_workaround (w0_t[3] | d0[3] | s0[3]); + w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); + w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); + w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); + w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); u32 w1_t2[4]; - w1_t2[0] = swap_workaround (w1_t[0] | d1[0] | s1[0]); - w1_t2[1] = swap_workaround (w1_t[1] | d1[1] | s1[1]); - w1_t2[2] = swap_workaround (w1_t[2] | d1[2] | s1[2]); - w1_t2[3] = swap_workaround (w1_t[3] | d1[3] | s1[3]); + w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); + w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); + w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); + w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); u32 w2_t2[4]; - w2_t2[0] = swap_workaround (w2_t[0] | d2[0] | s2[0]); - w2_t2[1] = swap_workaround (w2_t[1] | d2[1] | s2[1]); - w2_t2[2] = swap_workaround (w2_t[2] | d2[2] | s2[2]); - w2_t2[3] = swap_workaround (w2_t[3] | d2[3] | s2[3]); + w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); + w2_t2[1] = swap32 (w2_t[1] | d2[1] 
| s2[1]); + w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); + w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); u32 w3_t2[4]; - w3_t2[0] = swap_workaround (w3_t[0] | d3[0] | s3[0]); - w3_t2[1] = swap_workaround (w3_t[1] | d3[1] | s3[1]); + w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); + w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[2] = 0; w3_t2[3] = (1 + pw_len + domain_len + 1 + salt_len) * 8; @@ -618,20 +618,20 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w1_t3[4]; w1_t3[0] = digest[4]; - w1_t3[1] = swap_workaround (salt_buf0[0]); - w1_t3[2] = swap_workaround (salt_buf0[1]); - w1_t3[3] = swap_workaround (salt_buf0[2]); + w1_t3[1] = swap32 (salt_buf0[0]); + w1_t3[2] = swap32 (salt_buf0[1]); + w1_t3[3] = swap32 (salt_buf0[2]); u32 w2_t3[4]; - w2_t3[0] = swap_workaround (salt_buf0[3]); - w2_t3[1] = swap_workaround (salt_buf1[0]); - w2_t3[2] = swap_workaround (salt_buf1[1]); - w2_t3[3] = swap_workaround (salt_buf1[2]); + w2_t3[0] = swap32 (salt_buf0[3]); + w2_t3[1] = swap32 (salt_buf1[0]); + w2_t3[2] = swap32 (salt_buf1[1]); + w2_t3[3] = swap32 (salt_buf1[2]); u32 w3_t3[4]; - w3_t3[0] = swap_workaround (salt_buf1[3]); + w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; w3_t3[2] = 0; w3_t3[3] = (20 + salt_len) * 8; diff --git a/OpenCL/m08400_a0.cl b/OpenCL/m08400_a0.cl index 26beaff..00d0971 100644 --- a/OpenCL/m08400_a0.cl +++ b/OpenCL/m08400_a0.cl @@ -233,22 +233,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_m04 (__glo u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - salt_buf0[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); - salt_buf0[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 3]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); + salt_buf0[3] 
= swap32 (salt_bufs[salt_pos].salt_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 8]); - salt_buf2[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 9]); + salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]); + salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]); salt_buf2[2] = 0; salt_buf2[3] = 0; @@ -294,29 +294,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); u32 w3_t[4]; - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_len * 8; @@ 
-544,22 +544,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_s04 (__glo u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - salt_buf0[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); - salt_buf0[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 3]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); + salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 8]); - salt_buf2[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 9]); + salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]); + salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]); salt_buf2[2] = 0; salt_buf2[3] = 0; @@ -617,29 +617,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = 
swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); u32 w3_t[4]; - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_len * 8; diff --git a/OpenCL/m08400_a1.cl b/OpenCL/m08400_a1.cl index 135acce..36b86af 100644 --- a/OpenCL/m08400_a1.cl +++ b/OpenCL/m08400_a1.cl @@ -252,22 +252,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_m04 (__glo u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - salt_buf0[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); - salt_buf0[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 3]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); + salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 8]); - salt_buf2[1] = 
swap_workaround (salt_bufs[salt_pos].salt_buf[ 9]); + salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]); + salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]); salt_buf2[2] = 0; salt_buf2[3] = 0; @@ -348,29 +348,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_m04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); u32 w3_t[4]; - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_len * 8; @@ -619,22 +619,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_s04 (__glo u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - salt_buf0[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); - salt_buf0[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 3]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); + salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround 
(salt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 8]); - salt_buf2[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 9]); + salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]); + salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]); salt_buf2[2] = 0; salt_buf2[3] = 0; @@ -727,29 +727,29 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_s04 (__glo u32 w0_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); u32 w1_t[4]; - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); u32 w2_t[4]; - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); u32 w3_t[4]; - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_len * 8; diff --git a/OpenCL/m08400_a3.cl b/OpenCL/m08400_a3.cl index 61527ea..59a2c72 100644 --- 
a/OpenCL/m08400_a3.cl +++ b/OpenCL/m08400_a3.cl @@ -175,22 +175,22 @@ static void m08400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - salt_buf0[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); - salt_buf0[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 3]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); + salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 8]); - salt_buf2[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 9]); + salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]); + salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]); salt_buf2[2] = 0; salt_buf2[3] = 0; @@ -408,22 +408,22 @@ static void m08400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 1]); - salt_buf0[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 2]); - salt_buf0[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 3]); + salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); + 
salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); + salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); + salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 8]); - salt_buf2[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[ 9]); + salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]); + salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]); salt_buf2[2] = 0; salt_buf2[3] = 0; diff --git a/OpenCL/m08800.cl b/OpenCL/m08800.cl index 6ff2525..e3568ef 100644 --- a/OpenCL/m08800.cl +++ b/OpenCL/m08800.cl @@ -1498,22 +1498,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08800_init (__gl * pads */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 
(w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[5]; u32 opad[5]; @@ -1558,16 +1558,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08800_init (__gl append_0x80_3x4 (w0, w1, w2, salt_len + 4); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); w2[2] = 0; w2[3] = 0; w3[0] = 0; @@ -1979,9 +1979,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08800_comp (__gl } // we need just a few swapped, because we do not access the others - r[ 5] = swap_workaround (r[ 5]); - r[ 6] = swap_workaround (r[ 6]); - r[14] = swap_workaround (r[14]); + r[ 5] = swap32 (r[ 5]); + r[ 6] = swap32 (r[ 6]); + r[14] = swap32 (r[14]); // superblock not on id 0 or 1 // assumes max block size is 32MiB diff --git a/OpenCL/m08900.cl b/OpenCL/m08900.cl index 25bfeac..e2e6cda 100644 --- a/OpenCL/m08900.cl +++ b/OpenCL/m08900.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 
k_sha256[64] = { @@ -569,7 +560,7 @@ static void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], } } -static uint4 swap_workaround (uint4 v) +static uint4 swap32_4 (uint4 v) { return (rotate ((v & 0x00FF00FF), 24u) | rotate ((v & 0xFF00FF00), 8u)); } @@ -578,6 +569,11 @@ static uint4 swap_workaround (uint4 v) #define GET_SMIX_CNT(r,N) (2 * (r) * 16 * (N)) #define GET_STATE_CNT(r) (2 * (r) * 16) +#define SCRYPT_CNT GET_SCRYPT_CNT (SCRYPT_R, SCRYPT_P) +#define SCRYPT_CNT4 (SCRYPT_CNT / 4) +#define STATE_CNT GET_STATE_CNT (SCRYPT_R) +#define STATE_CNT4 (STATE_CNT / 4) + #define ADD_ROTATE_XOR(r,i1,i2,s) (r) ^= rotate ((i1) + (i2), (s)); #define SALSA20_2R() \ @@ -624,18 +620,14 @@ static uint4 swap_workaround (uint4 v) R3 = R3 + X3; \ } -static void salsa_r (uint4 *T, const u32 r) +static void salsa_r (uint4 *T) { - const u32 state_cnt = GET_STATE_CNT (r); - - const u32 state_cnt4 = state_cnt / 4; - - uint4 R0 = T[state_cnt4 - 4]; - uint4 R1 = T[state_cnt4 - 3]; - uint4 R2 = T[state_cnt4 - 2]; - uint4 R3 = T[state_cnt4 - 1]; + uint4 R0 = T[STATE_CNT4 - 4]; + uint4 R1 = T[STATE_CNT4 - 3]; + uint4 R2 = T[STATE_CNT4 - 2]; + uint4 R3 = T[STATE_CNT4 - 1]; - for (u32 i = 0; i < state_cnt4; i += 8) + for (u32 i = 0; i < STATE_CNT4; i += 8) { uint4 Y0; uint4 Y1; @@ -680,7 +672,7 @@ static void salsa_r (uint4 *T, const u32 r) exchg (x4 + 3, y4 + 3); \ } - for (u32 i = 1; i < r / 1; i++) + for (u32 i = 1; i < SCRYPT_R / 1; i++) { const u32 x = i * 1; const u32 y = i * 2; @@ -688,37 +680,33 @@ static void salsa_r (uint4 *T, const u32 r) exchg4 (x, y); } - for (u32 i = 1; i < r / 2; i++) + for (u32 i = 1; i < SCRYPT_R / 2; i++) { const u32 x = i * 1; const u32 y = i * 2; - const u32 xr1 = (r * 2) - 1 - x; - const u32 yr1 = (r * 2) - 1 - y; + const u32 xr1 = (SCRYPT_R * 2) - 1 - x; + const u32 yr1 = (SCRYPT_R * 2) - 1 - y; exchg4 (xr1, yr1); } } -static void scrypt_smix (uint4 *X, uint4 *T, const u32 N, const u32 r, const u32 tmto, const u32 phy, __global 
uint4 *V) +static void scrypt_smix (uint4 *X, uint4 *T, const u32 phy, __global uint4 *V) { - const u32 state_cnt = GET_STATE_CNT (r); - - const u32 state_cnt4 = state_cnt / 4; - #define Coord(x,y,z) (((x) * zSIZE) + ((y) * zSIZE * xSIZE) + (z)) #define CO Coord(x,y,z) const u32 xSIZE = phy; - const u32 ySIZE = N / tmto; - const u32 zSIZE = state_cnt4; + const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO; + const u32 zSIZE = STATE_CNT4; const u32 gid = get_global_id (0); const u32 x = gid % xSIZE; #pragma unroll - for (u32 i = 0; i < state_cnt4; i += 4) + for (u32 i = 0; i < STATE_CNT4; i += 4) { T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); @@ -735,28 +723,28 @@ static void scrypt_smix (uint4 *X, uint4 *T, const u32 N, const u32 r, const u32 { for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; - for (u32 i = 0; i < tmto; i++) salsa_r (X, r); + for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X); } - for (u32 i = 0; i < N; i++) + for (u32 i = 0; i < SCRYPT_N; i++) { - const u32 k = X[zSIZE - 4].x & (N - 1); + const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1); - const u32 y = k / tmto; + const u32 y = k / SCRYPT_TMTO; - const u32 km = k - (y * tmto); + const u32 km = k - (y * SCRYPT_TMTO); for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO]; - for (u32 i = 0; i < km; i++) salsa_r (T, r); + for (u32 i = 0; i < km; i++) salsa_r (T); for (u32 z = 0; z < zSIZE; z++) X[z] ^= T[z]; - salsa_r (X, r); + salsa_r (X); } #pragma unroll - for (u32 i = 0; i < state_cnt4; i += 4) + for (u32 i = 0; i < STATE_CNT4; i += 4) { T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); @@ -828,45 +816,33 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_init (__gl const u32 salt_len = salt_bufs[salt_pos].salt_len; - /** - * memory buffers - */ - - const u32 scrypt_r = SCRYPT_R; - const u32 scrypt_p = SCRYPT_P; - //const u32 scrypt_N = 
SCRYPT_N; - - //const u32 state_cnt = GET_STATE_CNT (scrypt_r); - const u32 scrypt_cnt = GET_SCRYPT_CNT (scrypt_r, scrypt_p); - //const u32 smix_cnt = GET_SMIX_CNT (scrypt_r, scrypt_N); - /** * 1st pbkdf2, creates B */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[8]; u32 opad[8]; hmac_sha256_pad (w0, w1, w2, w3, ipad, opad); - for (u32 i = 0, j = 0, k = 0; i < scrypt_cnt; i += 8, j += 1, k += 2) + for (u32 i = 0, j = 0, k = 0; i < SCRYPT_CNT; i += 8, j += 1, k += 2) { w0[0] = salt_buf0[0]; w0[1] = salt_buf0[1]; @@ -887,25 +863,25 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_init (__gl u32 append[2]; - append[0] = swap_workaround (j + 1); + append[0] = swap32 (j + 1); append[1] = 0x80; memcat8 (w0, w1, w2, w3, salt_len, append); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = 
swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); w3[2] = 0; w3[3] = (64 + salt_len + 4) * 8; @@ -929,33 +905,27 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_loop (__gl if (gid >= gid_max) return; - const u32 scrypt_phy = salt_bufs[salt_pos].scrypt_phy; - - const u32 state_cnt = GET_STATE_CNT (SCRYPT_R); - const u32 scrypt_cnt = GET_SCRYPT_CNT (SCRYPT_R, SCRYPT_P); + const u32 scrypt_phy = salt_bufs[salt_pos].scrypt_phy; - const u32 state_cnt4 = state_cnt / 4; - const u32 scrypt_cnt4 = scrypt_cnt / 4; - - uint4 X[state_cnt4]; - uint4 T[state_cnt4]; + uint4 X[STATE_CNT4]; + uint4 T[STATE_CNT4]; #pragma unroll - for (int z = 0; z < state_cnt4; z++) X[z] = swap_workaround (tmps[gid].P[z]); + for (int z = 0; z < STATE_CNT4; z++) X[z] = swap32_4 (tmps[gid].P[z]); - scrypt_smix (X, T, SCRYPT_N, SCRYPT_R, SCRYPT_TMTO, scrypt_phy, d_scryptV_buf); + scrypt_smix (X, T, scrypt_phy, d_scryptV_buf); #pragma unroll - for (int z = 0; z < state_cnt4; z++) tmps[gid].P[z] = swap_workaround (X[z]); + for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = swap32_4 (X[z]); #if SCRYPT_P >= 1 - for (int i = state_cnt4; i < scrypt_cnt4; i += state_cnt4) + for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4) { - for (int z = 0; z < state_cnt4; z++) X[z] = swap_workaround (tmps[gid].P[i + z]); + for (int z = 0; z < STATE_CNT4; z++) X[z] = swap32_4 (tmps[gid].P[i + z]); - scrypt_smix (X, T, SCRYPT_N, SCRYPT_R, SCRYPT_TMTO, scrypt_phy, d_scryptV_buf); + 
scrypt_smix (X, T, scrypt_phy, d_scryptV_buf); - for (int z = 0; z < state_cnt4; z++) tmps[gid].P[i + z] = swap_workaround (X[z]); + for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = swap32_4 (X[z]); } #endif } @@ -1007,37 +977,34 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_comp (__gl const u32 scrypt_p = SCRYPT_P; //const u32 scrypt_N = SCRYPT_N; - const u32 scrypt_cnt = GET_SCRYPT_CNT (scrypt_r, scrypt_p); - - const u32 scrypt_cnt4 = scrypt_cnt / 4; /** * 2nd pbkdf2, creates B */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[8]; u32 opad[8]; hmac_sha256_pad (w0, w1, w2, w3, ipad, opad); - for (u32 l = 0; l < scrypt_cnt4; l += 4) + for (u32 l = 0; l < SCRYPT_CNT4; l += 4) { barrier (CLK_GLOBAL_MEM_FENCE); @@ -1089,16 +1056,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08900_comp (__gl w3[0] = 0; w3[1] = 0; w3[2] = 0; - w3[3] = (64 + (scrypt_cnt * 4) + 4) * 8; + w3[3] = (64 + (SCRYPT_CNT * 4) + 4) * 8; u32 digest[8]; hmac_sha256_run (w0, w1, w2, w3, ipad, opad, digest); - const u32 r0 
= swap_workaround (digest[DGST_R0]); - const u32 r1 = swap_workaround (digest[DGST_R1]); - const u32 r2 = swap_workaround (digest[DGST_R2]); - const u32 r3 = swap_workaround (digest[DGST_R3]); + const u32 r0 = swap32 (digest[DGST_R0]); + const u32 r1 = swap32 (digest[DGST_R1]); + const u32 r2 = swap32 (digest[DGST_R2]); + const u32 r3 = swap32 (digest[DGST_R3]); #define il_pos 0 diff --git a/OpenCL/m09000.cl b/OpenCL/m09000.cl index 3543f8d..b6a5e92 100644 --- a/OpenCL/m09000.cl +++ b/OpenCL/m09000.cl @@ -549,20 +549,20 @@ __kernel void __attribute__((reqd_work_group_size (8, 1, 1))) m09000_init (__glo w0[1] = salt_buf[1]; w0[0] = salt_buf[0]; - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); const u32 block_len = salt_len + 2 + pw_len; @@ -781,8 +781,8 @@ __kernel void __attribute__((reqd_work_group_size (8, 1, 1))) m09000_comp (__glo u32 w2[4]; u32 w3[4]; - w0[0] = swap_workaround (digest[0]); - w0[1] = swap_workaround (digest[1]); + w0[0] = swap32 (digest[0]); + w0[1] = swap32 (digest[1]); w0[2] = 0x00008000; w0[3] = 0; w1[0] = 0; diff --git a/OpenCL/m09100.cl b/OpenCL/m09100.cl index 038b438..f50dcfb 100644 --- a/OpenCL/m09100.cl +++ b/OpenCL/m09100.cl @@ -20,17 +20,8 @@ 
#include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant char lotus64_table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/"; @@ -1027,21 +1018,21 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09100_init (__gl w3[2] = salt_buf3[2]; //w3[3] = salt_buf3[3]; - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); w3[3] = (64 + salt_len + 4) * 8; u32 dgst[5]; diff --git a/OpenCL/m09400.cl b/OpenCL/m09400.cl index df0c9b5..7c0e10a 100644 --- a/OpenCL/m09400.cl +++ b/OpenCL/m09400.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S 
"check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 te0[256] = { @@ -1399,22 +1390,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09400_init (__gl u32 t1[4]; - t1[0] = swap_workaround (w0[0]); - t1[1] = swap_workaround (w0[1]); - t1[2] = swap_workaround (w0[2]); - t1[3] = swap_workaround (w0[3]); + t1[0] = swap32 (w0[0]); + t1[1] = swap32 (w0[1]); + t1[2] = swap32 (w0[2]); + t1[3] = swap32 (w0[3]); u32 t2[4]; - t2[0] = swap_workaround (w1[0]); - t2[1] = swap_workaround (w1[1]); - t2[2] = swap_workaround (w1[2]); - t2[3] = swap_workaround (w1[3]); + t2[0] = swap32 (w1[0]); + t2[1] = swap32 (w1[1]); + t2[2] = swap32 (w1[2]); + t2[3] = swap32 (w1[3]); u32 t3[4]; - t3[0] = swap_workaround (w2[0]); - t3[1] = swap_workaround (w2[1]); + t3[0] = swap32 (w2[0]); + t3[1] = swap32 (w2[1]); t3[2] = 0; t3[3] = (salt_len + (pw_len * 2)) * 8; @@ -1471,7 +1462,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09400_loop (__gl for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++) { - w0[0] = swap_workaround (j); + w0[0] = swap32 (j); u32 digest[5]; diff --git a/OpenCL/m09500.cl b/OpenCL/m09500.cl index 4fc5b74..5d92a6f 100644 --- a/OpenCL/m09500.cl +++ b/OpenCL/m09500.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 te0[256] = { @@ -1129,22 +1120,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09500_init (__gl u32 t1[4]; - t1[0] = swap_workaround (w0[0]); - t1[1] = swap_workaround (w0[1]); - t1[2] = swap_workaround (w0[2]); - t1[3] = swap_workaround (w0[3]); + t1[0] = swap32 (w0[0]); + t1[1] = swap32 (w0[1]); + t1[2] = swap32 (w0[2]); + t1[3] = swap32 
(w0[3]); u32 t2[4]; - t2[0] = swap_workaround (w1[0]); - t2[1] = swap_workaround (w1[1]); - t2[2] = swap_workaround (w1[2]); - t2[3] = swap_workaround (w1[3]); + t2[0] = swap32 (w1[0]); + t2[1] = swap32 (w1[1]); + t2[2] = swap32 (w1[2]); + t2[3] = swap32 (w1[3]); u32 t3[4]; - t3[0] = swap_workaround (w2[0]); - t3[1] = swap_workaround (w2[1]); + t3[0] = swap32 (w2[0]); + t3[1] = swap32 (w2[1]); t3[2] = 0; t3[3] = (salt_len + (pw_len * 2)) * 8; @@ -1201,7 +1192,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09500_loop (__gl for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++) { - w0[0] = swap_workaround (j); + w0[0] = swap32 (j); u32 digest[5]; diff --git a/OpenCL/m09600.cl b/OpenCL/m09600.cl index 1541b30..9bf4b37 100644 --- a/OpenCL/m09600.cl +++ b/OpenCL/m09600.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 te0[256] = { @@ -1162,20 +1153,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09600_init (__gl t0[0] = (u64) salt_buf[0] << 32 | salt_buf[1]; t0[1] = (u64) salt_buf[2] << 32 | salt_buf[3]; - t0[2] = (u64) swap_workaround (w0[0]) << 32 | swap_workaround (w0[1]); - t0[3] = (u64) swap_workaround (w0[2]) << 32 | swap_workaround (w0[3]); + t0[2] = (u64) swap32 (w0[0]) << 32 | swap32 (w0[1]); + t0[3] = (u64) swap32 (w0[2]) << 32 | swap32 (w0[3]); u64 t1[4]; - t1[0] = (u64) swap_workaround (w1[0]) << 32 | swap_workaround (w1[1]); - t1[1] = (u64) swap_workaround (w1[2]) << 32 | swap_workaround (w1[3]); - t1[2] = (u64) swap_workaround (w2[0]) << 32 | swap_workaround (w2[1]); - t1[3] = (u64) swap_workaround (w2[2]) << 32 | swap_workaround (w2[3]); + t1[0] = (u64) swap32 (w1[0]) << 32 | 
swap32 (w1[1]); + t1[1] = (u64) swap32 (w1[2]) << 32 | swap32 (w1[3]); + t1[2] = (u64) swap32 (w2[0]) << 32 | swap32 (w2[1]); + t1[3] = (u64) swap32 (w2[2]) << 32 | swap32 (w2[3]); u64 t2[4]; - t2[0] = (u64) swap_workaround (w3[0]) << 32 | swap_workaround (w3[1]); - t2[1] = (u64) swap_workaround (w3[2]) << 32 | swap_workaround (w3[3]); + t2[0] = (u64) swap32 (w3[0]) << 32 | swap32 (w3[1]); + t2[1] = (u64) swap32 (w3[2]) << 32 | swap32 (w3[3]); t2[2] = 0; t2[3] = 0; @@ -1245,7 +1236,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09600_loop (__gl for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++) { - w0[0] = (u64) swap_workaround (j) << 32 | w0[0] & 0xffffffff; + w0[0] = (u64) swap32 (j) << 32 | w0[0] & 0xffffffff; u64 digest[8]; diff --git a/OpenCL/m09800_a0.cl b/OpenCL/m09800_a0.cl index d20242a..33331bc 100644 --- a/OpenCL/m09800_a0.cl +++ b/OpenCL/m09800_a0.cl @@ -386,16 +386,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -436,10 +436,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo u32 key[4]; - key[0] = swap_workaround (digest[0]); - key[1] = swap_workaround (digest[1]); - 
key[2] = swap_workaround (digest[2]); - key[3] = swap_workaround (digest[3]); + key[0] = swap32 (digest[0]); + key[1] = swap32 (digest[1]); + key[2] = swap32 (digest[2]); + key[3] = swap32 (digest[3]); if (version == 3) { @@ -454,10 +454,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -479,10 +479,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); @@ -631,16 +631,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 
(w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -681,10 +681,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo u32 key[4]; - key[0] = swap_workaround (digest[0]); - key[1] = swap_workaround (digest[1]); - key[2] = swap_workaround (digest[2]); - key[3] = swap_workaround (digest[3]); + key[0] = swap32 (digest[0]); + key[1] = swap32 (digest[1]); + key[2] = swap32 (digest[2]); + key[3] = swap32 (digest[3]); if (version == 3) { @@ -699,10 +699,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -724,10 +724,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); diff --git a/OpenCL/m09800_a1.cl b/OpenCL/m09800_a1.cl index fd77036..91e9e62 100644 --- a/OpenCL/m09800_a1.cl +++ b/OpenCL/m09800_a1.cl @@ -438,16 +438,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = 
swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -488,10 +488,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo u32 key[4]; - key[0] = swap_workaround (digest[0]); - key[1] = swap_workaround (digest[1]); - key[2] = swap_workaround (digest[2]); - key[3] = swap_workaround (digest[3]); + key[0] = swap32 (digest[0]); + key[1] = swap32 (digest[1]); + key[2] = swap32 (digest[2]); + key[3] = swap32 (digest[3]); if (version == 3) { @@ -506,10 +506,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -531,10 +531,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); @@ -737,16 +737,16 @@ __kernel void 
__attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -787,10 +787,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo u32 key[4]; - key[0] = swap_workaround (digest[0]); - key[1] = swap_workaround (digest[1]); - key[2] = swap_workaround (digest[2]); - key[3] = swap_workaround (digest[3]); + key[0] = swap32 (digest[0]); + key[1] = swap32 (digest[1]); + key[2] = swap32 (digest[2]); + key[3] = swap32 (digest[3]); if (version == 3) { @@ -805,10 +805,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -830,10 +830,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround 
(digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); diff --git a/OpenCL/m09800_a3.cl b/OpenCL/m09800_a3.cl index 1541e50..ee088eb 100644 --- a/OpenCL/m09800_a3.cl +++ b/OpenCL/m09800_a3.cl @@ -378,10 +378,10 @@ static void m09800m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 key[4]; - key[0] = swap_workaround (digest[0]); - key[1] = swap_workaround (digest[1]); - key[2] = swap_workaround (digest[2]); - key[3] = swap_workaround (digest[3]); + key[0] = swap32 (digest[0]); + key[1] = swap32 (digest[1]); + key[2] = swap32 (digest[2]); + key[3] = swap32 (digest[3]); if (version == 3) { @@ -396,10 +396,10 @@ static void m09800m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -421,10 +421,10 @@ static void m09800m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); @@ -559,10 +559,10 @@ static void m09800s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 key[4]; - key[0] = swap_workaround (digest[0]); - key[1] = swap_workaround 
(digest[1]); - key[2] = swap_workaround (digest[2]); - key[3] = swap_workaround (digest[3]); + key[0] = swap32 (digest[0]); + key[1] = swap32 (digest[1]); + key[2] = swap32 (digest[2]); + key[3] = swap32 (digest[3]); if (version == 3) { @@ -577,10 +577,10 @@ static void m09800s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -602,10 +602,10 @@ static void m09800s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); diff --git a/OpenCL/m09810_a0.cl b/OpenCL/m09810_a0.cl index 45fb4ee..c03ee81 100644 --- a/OpenCL/m09810_a0.cl +++ b/OpenCL/m09810_a0.cl @@ -373,10 +373,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09810_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -400,10 +400,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09810_m04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - 
digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); @@ -539,10 +539,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09810_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -566,10 +566,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09810_s04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); diff --git a/OpenCL/m09810_a1.cl b/OpenCL/m09810_a1.cl index 8458139..14e1e34 100644 --- a/OpenCL/m09810_a1.cl +++ b/OpenCL/m09810_a1.cl @@ -399,10 +399,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09810_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -426,10 +426,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09810_m04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] 
= swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); @@ -593,10 +593,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09810_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -620,10 +620,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09810_s04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); diff --git a/OpenCL/m09810_a3.cl b/OpenCL/m09810_a3.cl index 8d69709..af4886e 100644 --- a/OpenCL/m09810_a3.cl +++ b/OpenCL/m09810_a3.cl @@ -322,10 +322,10 @@ static void m09810m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -349,10 +349,10 @@ static void m09810m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ sha1_transform (w0_t, w1_t, 
w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); @@ -431,10 +431,10 @@ static void m09810s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (out[0]); - w0_t[1] = swap_workaround (out[1]); - w0_t[2] = swap_workaround (out[2]); - w0_t[3] = swap_workaround (out[3]); + w0_t[0] = swap32 (out[0]); + w0_t[1] = swap32 (out[1]); + w0_t[2] = swap32 (out[2]); + w0_t[3] = swap32 (out[3]); w1_t[0] = 0x80000000; w1_t[1] = 0; w1_t[2] = 0; @@ -458,10 +458,10 @@ static void m09810s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); rc4_next_16 (rc4_key, 16, j, digest, out); diff --git a/OpenCL/m09820_a0.cl b/OpenCL/m09820_a0.cl index fd6d1ad..1ead01e 100644 --- a/OpenCL/m09820_a0.cl +++ b/OpenCL/m09820_a0.cl @@ -251,16 +251,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_m04 (__glo w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = 
swap_workaround (w3_t[1]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -299,8 +299,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_m04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = swap_workaround (digest[0]); - u32 b = swap_workaround (digest[1]) & 0xff; + u32 a = swap32 (digest[0]); + u32 b = swap32 (digest[1]) & 0xff; const u32 r0 = a; const u32 r1 = b; @@ -432,16 +432,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_s04 (__glo w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -480,8 +480,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_s04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = swap_workaround (digest[0]); - u32 b = swap_workaround (digest[1]) & 0xff; + u32 a = swap32 (digest[0]); + u32 b = swap32 (digest[1]) & 0xff; const u32 r0 = a; const u32 r1 = b; diff --git a/OpenCL/m09820_a1.cl b/OpenCL/m09820_a1.cl 
index a3fea7e..2e4cae6 100644 --- a/OpenCL/m09820_a1.cl +++ b/OpenCL/m09820_a1.cl @@ -301,16 +301,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_m04 (__glo w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -349,8 +349,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_m04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = swap_workaround (digest[0]); - u32 b = swap_workaround (digest[1]) & 0xff; + u32 a = swap32 (digest[0]); + u32 b = swap32 (digest[1]) & 0xff; const u32 r0 = a; const u32 r1 = b; @@ -534,16 +534,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_s04 (__glo w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = swap_workaround (w1_t[0]); - w1_t[1] = swap_workaround (w1_t[1]); - w1_t[2] = swap_workaround (w1_t[2]); - w1_t[3] = swap_workaround (w1_t[3]); - w2_t[0] = swap_workaround (w2_t[0]); - w2_t[1] = swap_workaround (w2_t[1]); - w2_t[2] = swap_workaround (w2_t[2]); - w2_t[3] = swap_workaround (w2_t[3]); - w3_t[0] = swap_workaround (w3_t[0]); - w3_t[1] = swap_workaround (w3_t[1]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 
(w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; @@ -582,8 +582,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_s04 (__glo sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = swap_workaround (digest[0]); - u32 b = swap_workaround (digest[1]) & 0xff; + u32 a = swap32 (digest[0]); + u32 b = swap32 (digest[1]) & 0xff; const u32 r0 = a; const u32 r1 = b; diff --git a/OpenCL/m09820_a3.cl b/OpenCL/m09820_a3.cl index c9b32c4..46bf9eb 100644 --- a/OpenCL/m09820_a3.cl +++ b/OpenCL/m09820_a3.cl @@ -241,8 +241,8 @@ static void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = swap_workaround (digest[0]); - u32 b = swap_workaround (digest[1]) & 0xff; + u32 a = swap32 (digest[0]); + u32 b = swap32 (digest[1]) & 0xff; const u32 r0 = a; const u32 r1 = b; @@ -358,8 +358,8 @@ static void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = swap_workaround (digest[0]); - u32 b = swap_workaround (digest[1]) & 0xff; + u32 a = swap32 (digest[0]); + u32 b = swap32 (digest[1]) & 0xff; const u32 r0 = a; const u32 r1 = b; diff --git a/OpenCL/m10300.cl b/OpenCL/m10300.cl index a5266aa..ca528ab 100644 --- a/OpenCL/m10300.cl +++ b/OpenCL/m10300.cl @@ -17,17 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE2 -#define COMPARE_M "check_multi_vect2_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { @@ 
-254,24 +245,24 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10300_init (__gl // swaps needed - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); w3[3] = pw_salt_len * 8; u32 digest[5]; @@ -301,22 +292,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10300_loop (__gl u32 word_buf0[4]; - word_buf0[0] = swap_workaround (pws[gid].i[0]); - word_buf0[1] = swap_workaround (pws[gid].i[1]); - word_buf0[2] = swap_workaround (pws[gid].i[2]); - word_buf0[3] = swap_workaround (pws[gid].i[3]); + word_buf0[0] = swap32 (pws[gid].i[0]); + word_buf0[1] = swap32 (pws[gid].i[1]); + word_buf0[2] = swap32 (pws[gid].i[2]); + word_buf0[3] = swap32 (pws[gid].i[3]); u32 word_buf1[4]; - word_buf1[0] = swap_workaround (pws[gid].i[4]); - word_buf1[1] = swap_workaround (pws[gid].i[5]); - word_buf1[2] = swap_workaround (pws[gid].i[6]); - word_buf1[3] = swap_workaround (pws[gid].i[7]); + word_buf1[0] = swap32 (pws[gid].i[4]); + word_buf1[1] = swap32 (pws[gid].i[5]); + word_buf1[2] = swap32 (pws[gid].i[6]); + word_buf1[3] = swap32 (pws[gid].i[7]); u32 word_buf2[2]; - word_buf2[0] = swap_workaround 
(pws[gid].i[8]); - word_buf2[1] = swap_workaround (pws[gid].i[9]); + word_buf2[0] = swap32 (pws[gid].i[8]); + word_buf2[1] = swap32 (pws[gid].i[9]); const u32 pw_len = pws[gid].pw_len; diff --git a/OpenCL/m10500.cl b/OpenCL/m10500.cl index 3048b19..bb032d8 100644 --- a/OpenCL/m10500.cl +++ b/OpenCL/m10500.cl @@ -17,13 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 padding[8] = { diff --git a/OpenCL/m10700.cl b/OpenCL/m10700.cl index cebdaa4..abdead1 100644 --- a/OpenCL/m10700.cl +++ b/OpenCL/m10700.cl @@ -17,19 +17,14 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif - -#ifdef VECT_SIZE4 -#define COMPARE_M "check_multi_vect4_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" typedef struct { union { - u32 dgst32[16]; + u32 dgst32[16]; u64 dgst64[8]; }; @@ -37,7 +32,7 @@ typedef struct union { - u32 W32[32]; + u32 W32[32]; u64 W64[16]; }; @@ -76,22 +71,22 @@ static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], u32 g = digest[6]; u32 h = digest[7]; - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); - u32 we_t = swap_workaround (w3[2]); - u32 wf_t = swap_workaround (w3[3]); + u32 w0_t = swap32 (w0[0]); + 
u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); + u32 we_t = swap32 (w3[2]); + u32 wf_t = swap32 (w3[3]); #define ROUND256_EXPAND() \ { \ @@ -186,22 +181,22 @@ static void sha384_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], u64 g = digest[6]; u64 h = digest[7]; - u64 w0_t = swap_workaround (w0[0]); - u64 w1_t = swap_workaround (w0[1]); - u64 w2_t = swap_workaround (w0[2]); - u64 w3_t = swap_workaround (w0[3]); - u64 w4_t = swap_workaround (w1[0]); - u64 w5_t = swap_workaround (w1[1]); - u64 w6_t = swap_workaround (w1[2]); - u64 w7_t = swap_workaround (w1[3]); - u64 w8_t = swap_workaround (w2[0]); - u64 w9_t = swap_workaround (w2[1]); - u64 wa_t = swap_workaround (w2[2]); - u64 wb_t = swap_workaround (w2[3]); - u64 wc_t = swap_workaround (w3[0]); - u64 wd_t = swap_workaround (w3[1]); - u64 we_t = swap_workaround (w3[2]); - u64 wf_t = swap_workaround (w3[3]); + u64 w0_t = swap32 (w0[0]); + u64 w1_t = swap32 (w0[1]); + u64 w2_t = swap32 (w0[2]); + u64 w3_t = swap32 (w0[3]); + u64 w4_t = swap32 (w1[0]); + u64 w5_t = swap32 (w1[1]); + u64 w6_t = swap32 (w1[2]); + u64 w7_t = swap32 (w1[3]); + u64 w8_t = swap32 (w2[0]); + u64 w9_t = swap32 (w2[1]); + u64 wa_t = swap32 (w2[2]); + u64 wb_t = swap32 (w2[3]); + u64 wc_t = swap32 (w3[0]); + u64 wd_t = swap32 (w3[1]); + u64 we_t = swap32 (w3[2]); + u64 wf_t = swap32 (w3[3]); #define ROUND384_EXPAND() \ { \ @@ -296,22 +291,22 @@ static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], u64 g = digest[6]; u64 h = digest[7]; - u64 w0_t = swap_workaround (w0[0]); - u64 w1_t = swap_workaround (w0[1]); - u64 w2_t = swap_workaround (w0[2]); - u64 w3_t = swap_workaround 
(w0[3]); - u64 w4_t = swap_workaround (w1[0]); - u64 w5_t = swap_workaround (w1[1]); - u64 w6_t = swap_workaround (w1[2]); - u64 w7_t = swap_workaround (w1[3]); - u64 w8_t = swap_workaround (w2[0]); - u64 w9_t = swap_workaround (w2[1]); - u64 wa_t = swap_workaround (w2[2]); - u64 wb_t = swap_workaround (w2[3]); - u64 wc_t = swap_workaround (w3[0]); - u64 wd_t = swap_workaround (w3[1]); - u64 we_t = swap_workaround (w3[2]); - u64 wf_t = swap_workaround (w3[3]); + u64 w0_t = swap32 (w0[0]); + u64 w1_t = swap32 (w0[1]); + u64 w2_t = swap32 (w0[2]); + u64 w3_t = swap32 (w0[3]); + u64 w4_t = swap32 (w1[0]); + u64 w5_t = swap32 (w1[1]); + u64 w6_t = swap32 (w1[2]); + u64 w7_t = swap32 (w1[3]); + u64 w8_t = swap32 (w2[0]); + u64 w9_t = swap32 (w2[1]); + u64 wa_t = swap32 (w2[2]); + u64 wb_t = swap32 (w2[3]); + u64 wc_t = swap32 (w3[0]); + u64 wd_t = swap32 (w3[1]); + u64 we_t = swap32 (w3[2]); + u64 wf_t = swap32 (w3[3]); #define ROUND512_EXPAND() \ { \ @@ -720,10 +715,10 @@ __constant u32 rcon[] = static void AES128_ExpandKey (u32 *userkey, u32 *rek, __local u32 s_te0[256], __local u32 s_te1[256], __local u32 s_te2[256], __local u32 s_te3[256], __local u32 s_te4[256]) { - rek[0] = swap_workaround (userkey[0]); - rek[1] = swap_workaround (userkey[1]); - rek[2] = swap_workaround (userkey[2]); - rek[3] = swap_workaround (userkey[3]); + rek[0] = swap32 (userkey[0]); + rek[1] = swap32 (userkey[1]); + rek[2] = swap32 (userkey[2]); + rek[3] = swap32 (userkey[3]); for (u32 i = 0, j = 0; i < 10; i += 1, j += 4) { @@ -748,10 +743,10 @@ static void AES128_encrypt (const u32 *in, u32 *out, const u32 *rek, __local u32 { u32 in_swap[4]; - in_swap[0] = swap_workaround (in[0]); - in_swap[1] = swap_workaround (in[1]); - in_swap[2] = swap_workaround (in[2]); - in_swap[3] = swap_workaround (in[3]); + in_swap[0] = swap32 (in[0]); + in_swap[1] = swap32 (in[1]); + in_swap[2] = swap32 (in[2]); + in_swap[3] = swap32 (in[3]); u32 s0 = in_swap[0] ^ rek[0]; u32 s1 = in_swap[1] ^ rek[1]; @@ -824,10 
+819,10 @@ static void AES128_encrypt (const u32 *in, u32 *out, const u32 *rek, __local u32 ^ (s_te4[(t2 >> 0) & 0xff] & 0x000000ff) ^ rek[43]; - out[0] = swap_workaround (out[0]); - out[1] = swap_workaround (out[1]); - out[2] = swap_workaround (out[2]); - out[3] = swap_workaround (out[3]); + out[0] = swap32 (out[0]); + out[1] = swap32 (out[1]); + out[2] = swap32 (out[2]); + out[3] = swap32 (out[3]); } static void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) @@ -1203,6 +1198,7 @@ static void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, co u32 i; + #ifdef IS_AMD for (i = 0; i < pd; i++) sc[idx++] = pw[i]; sc[idx++] = pw[i] | amd_bytealign (bl[0], 0, pm4); @@ -1210,6 +1206,19 @@ static void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, co sc[idx++] = amd_bytealign (sc[0], bl[i - 1], pm4); for (i = 1; i < 4; i++) sc[idx++] = amd_bytealign (sc[i], sc[i - 1], pm4); sc[idx++] = amd_bytealign ( 0, sc[i - 1], pm4); + #endif + + #ifdef IS_NV + int selector = (0x76543210 >> (pm4 * 4)) & 0xffff; + + for (i = 0; i < pd; i++) sc[idx++] = pw[i]; + sc[idx++] = pw[i] + | __byte_perm ( 0, bl[0], selector); + for (i = 1; i < bd; i++) sc[idx++] = __byte_perm (bl[i - 1], bl[i], selector); + sc[idx++] = __byte_perm (bl[i - 1], sc[0], selector); + for (i = 1; i < 4; i++) sc[idx++] = __byte_perm (sc[i - 1], sc[i], selector); + sc[idx++] = __byte_perm (sc[i - 1], 0, selector); + #endif } } @@ -1220,10 +1229,21 @@ static void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, const const u32 om = m % 4; const u32 od = m / 4; + #ifdef IS_AMD pt[0] = amd_bytealign (sc[od + 1], sc[od + 0], om); pt[1] = amd_bytealign (sc[od + 2], sc[od + 1], om); pt[2] = amd_bytealign (sc[od + 3], sc[od + 2], om); pt[3] = amd_bytealign (sc[od + 4], sc[od + 3], om); + #endif + + #ifdef IS_NV + int selector = (0x76543210 >> (om * 4)) & 0xffff; + + pt[0] = __byte_perm (sc[od + 0], sc[od + 1], 
selector); + pt[1] = __byte_perm (sc[od + 1], sc[od + 2], selector); + pt[2] = __byte_perm (sc[od + 2], sc[od + 3], selector); + pt[3] = __byte_perm (sc[od + 3], sc[od + 4], selector); + #endif } static void make_w_with_offset (ctx_t *ctx, const u32 W_len, const u32 offset, const u32 *sc, const u32 pwbl_len, u32 *iv, const u32 *rek, __local u32 s_te0[256], __local u32 s_te1[256], __local u32 s_te2[256], __local u32 s_te3[256], __local u32 s_te4[256]) @@ -1374,7 +1394,7 @@ static u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, __local u32 s_ ctx->W64[12] = 0; ctx->W64[13] = 0; ctx->W64[14] = 0; - ctx->W64[15] = swap_workaround ((u64) (final_len * 8)); + ctx->W64[15] = swap32 ((u64) (final_len * 8)); ex = ctx->W64[7] >> 56; break; case BLSZ512: make_w_with_offset (ctx, 64, offset, sc, pwbl_len, iv, rek, s_te0, s_te1, s_te2, s_te3, s_te4); @@ -1385,7 +1405,7 @@ static u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, __local u32 s_ ctx->W64[12] = 0; ctx->W64[13] = 0; ctx->W64[14] = 0; - ctx->W64[15] = swap_workaround ((u64) (final_len * 8)); + ctx->W64[15] = swap32 ((u64) (final_len * 8)); ex = ctx->W64[7] >> 56; break; } @@ -1410,7 +1430,7 @@ static u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, __local u32 s_ ctx->W32[12] = 0; ctx->W32[13] = 0; ctx->W32[14] = 0; - ctx->W32[15] = swap_workaround (final_len * 8); + ctx->W32[15] = swap32 (final_len * 8); break; case BLSZ384: ex = ctx->W64[15] >> 56; ctx->W64[ 0] = 0x80; @@ -1428,7 +1448,7 @@ static u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, __local u32 s_ ctx->W64[12] = 0; ctx->W64[13] = 0; ctx->W64[14] = 0; - ctx->W64[15] = swap_workaround ((u64) (final_len * 8)); + ctx->W64[15] = swap32 ((u64) (final_len * 8)); break; case BLSZ512: ex = ctx->W64[15] >> 56; ctx->W64[ 0] = 0x80; @@ -1446,7 +1466,7 @@ static u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, __local u32 s_ ctx->W64[12] = 0; ctx->W64[13] = 0; ctx->W64[14] = 0; - ctx->W64[15] = swap_workaround ((u64) 
(final_len * 8)); + ctx->W64[15] = swap32 ((u64) (final_len * 8)); break; } } @@ -1454,14 +1474,14 @@ static u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, __local u32 s_ switch (ctx->dgst_len) { case BLSZ256: sha256_transform (&ctx->W32[ 0], &ctx->W32[ 4], &ctx->W32[ 8], &ctx->W32[12], ctx->dgst32); - ctx->dgst32[ 0] = swap_workaround (ctx->dgst32[0]); - ctx->dgst32[ 1] = swap_workaround (ctx->dgst32[1]); - ctx->dgst32[ 2] = swap_workaround (ctx->dgst32[2]); - ctx->dgst32[ 3] = swap_workaround (ctx->dgst32[3]); - ctx->dgst32[ 4] = swap_workaround (ctx->dgst32[4]); - ctx->dgst32[ 5] = swap_workaround (ctx->dgst32[5]); - ctx->dgst32[ 6] = swap_workaround (ctx->dgst32[6]); - ctx->dgst32[ 7] = swap_workaround (ctx->dgst32[7]); + ctx->dgst32[ 0] = swap32 (ctx->dgst32[0]); + ctx->dgst32[ 1] = swap32 (ctx->dgst32[1]); + ctx->dgst32[ 2] = swap32 (ctx->dgst32[2]); + ctx->dgst32[ 3] = swap32 (ctx->dgst32[3]); + ctx->dgst32[ 4] = swap32 (ctx->dgst32[4]); + ctx->dgst32[ 5] = swap32 (ctx->dgst32[5]); + ctx->dgst32[ 6] = swap32 (ctx->dgst32[6]); + ctx->dgst32[ 7] = swap32 (ctx->dgst32[7]); ctx->dgst32[ 8] = 0; ctx->dgst32[ 9] = 0; ctx->dgst32[10] = 0; @@ -1472,24 +1492,24 @@ static u32 do_round (const u32 *pw, const u32 pw_len, ctx_t *ctx, __local u32 s_ ctx->dgst32[15] = 0; break; case BLSZ384: sha384_transform (&ctx->W64[ 0], &ctx->W64[ 4], &ctx->W64[ 8], &ctx->W64[12], ctx->dgst64); - ctx->dgst64[0] = swap_workaround (ctx->dgst64[0]); - ctx->dgst64[1] = swap_workaround (ctx->dgst64[1]); - ctx->dgst64[2] = swap_workaround (ctx->dgst64[2]); - ctx->dgst64[3] = swap_workaround (ctx->dgst64[3]); - ctx->dgst64[4] = swap_workaround (ctx->dgst64[4]); - ctx->dgst64[5] = swap_workaround (ctx->dgst64[5]); + ctx->dgst64[0] = swap32 (ctx->dgst64[0]); + ctx->dgst64[1] = swap32 (ctx->dgst64[1]); + ctx->dgst64[2] = swap32 (ctx->dgst64[2]); + ctx->dgst64[3] = swap32 (ctx->dgst64[3]); + ctx->dgst64[4] = swap32 (ctx->dgst64[4]); + ctx->dgst64[5] = swap32 (ctx->dgst64[5]); 
ctx->dgst64[6] = 0; ctx->dgst64[7] = 0; break; case BLSZ512: sha512_transform (&ctx->W64[ 0], &ctx->W64[ 4], &ctx->W64[ 8], &ctx->W64[12], ctx->dgst64); - ctx->dgst64[0] = swap_workaround (ctx->dgst64[0]); - ctx->dgst64[1] = swap_workaround (ctx->dgst64[1]); - ctx->dgst64[2] = swap_workaround (ctx->dgst64[2]); - ctx->dgst64[3] = swap_workaround (ctx->dgst64[3]); - ctx->dgst64[4] = swap_workaround (ctx->dgst64[4]); - ctx->dgst64[5] = swap_workaround (ctx->dgst64[5]); - ctx->dgst64[6] = swap_workaround (ctx->dgst64[6]); - ctx->dgst64[7] = swap_workaround (ctx->dgst64[7]); + ctx->dgst64[0] = swap32 (ctx->dgst64[0]); + ctx->dgst64[1] = swap32 (ctx->dgst64[1]); + ctx->dgst64[2] = swap32 (ctx->dgst64[2]); + ctx->dgst64[3] = swap32 (ctx->dgst64[3]); + ctx->dgst64[4] = swap32 (ctx->dgst64[4]); + ctx->dgst64[5] = swap32 (ctx->dgst64[5]); + ctx->dgst64[6] = swap32 (ctx->dgst64[6]); + ctx->dgst64[7] = swap32 (ctx->dgst64[7]); break; } @@ -1566,7 +1586,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10700_init (__gl append_0x80_2x4 (block0, block1, block_len); - block3[3] = swap_workaround (block_len * 8); + block3[3] = swap32 (block_len * 8); u32 digest[8]; @@ -1581,14 +1601,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10700_init (__gl sha256_transform (block0, block1, block2, block3, digest); - digest[0] = swap_workaround (digest[0]); - digest[1] = swap_workaround (digest[1]); - digest[2] = swap_workaround (digest[2]); - digest[3] = swap_workaround (digest[3]); - digest[4] = swap_workaround (digest[4]); - digest[5] = swap_workaround (digest[5]); - digest[6] = swap_workaround (digest[6]); - digest[7] = swap_workaround (digest[7]); + digest[0] = swap32 (digest[0]); + digest[1] = swap32 (digest[1]); + digest[2] = swap32 (digest[2]); + digest[3] = swap32 (digest[3]); + digest[4] = swap32 (digest[4]); + digest[5] = swap32 (digest[5]); + digest[6] = swap32 (digest[6]); + digest[7] = swap32 (digest[7]); tmps[gid].dgst32[0] = digest[0]; 
tmps[gid].dgst32[1] = digest[1]; @@ -1721,10 +1741,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10700_comp (__gl * digest */ - const u32 r0 = swap_workaround (tmps[gid].dgst32[DGST_R0]); - const u32 r1 = swap_workaround (tmps[gid].dgst32[DGST_R1]); - const u32 r2 = swap_workaround (tmps[gid].dgst32[DGST_R2]); - const u32 r3 = swap_workaround (tmps[gid].dgst32[DGST_R3]); + const u32 r0 = swap32 (tmps[gid].dgst32[DGST_R0]); + const u32 r1 = swap32 (tmps[gid].dgst32[DGST_R1]); + const u32 r2 = swap32 (tmps[gid].dgst32[DGST_R2]); + const u32 r3 = swap32 (tmps[gid].dgst32[DGST_R3]); #define il_pos 0 diff --git a/OpenCL/m10800_a0.cl b/OpenCL/m10800_a0.cl index c3110d8..7db896f 100644 --- a/OpenCL/m10800_a0.cl +++ b/OpenCL/m10800_a0.cl @@ -221,20 +221,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = out_len * 8; @@ -359,20 +359,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround 
(w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = out_len * 8; diff --git a/OpenCL/m10800_a1.cl b/OpenCL/m10800_a1.cl index e367cdf..0b26a46 100644 --- a/OpenCL/m10800_a1.cl +++ b/OpenCL/m10800_a1.cl @@ -269,20 +269,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_m04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = 
swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_len * 8; @@ -457,20 +457,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_s04 (__glo u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap_workaround (w0[0]); - w0_t[1] = swap_workaround (w0[1]); - w0_t[2] = swap_workaround (w0[2]); - w0_t[3] = swap_workaround (w0[3]); - w1_t[0] = swap_workaround (w1[0]); - w1_t[1] = swap_workaround (w1[1]); - w1_t[2] = swap_workaround (w1[2]); - w1_t[3] = swap_workaround (w1[3]); - w2_t[0] = swap_workaround (w2[0]); - w2_t[1] = swap_workaround (w2[1]); - w2_t[2] = swap_workaround (w2[2]); - w2_t[3] = swap_workaround (w2[3]); - w3_t[0] = swap_workaround (w3[0]); - w3_t[1] = swap_workaround (w3[1]); + w0_t[0] = swap32 (w0[0]); + w0_t[1] = swap32 (w0[1]); + w0_t[2] = swap32 (w0[2]); + w0_t[3] = swap32 (w0[3]); + w1_t[0] = swap32 (w1[0]); + w1_t[1] = swap32 (w1[1]); + w1_t[2] = swap32 (w1[2]); + w1_t[3] = swap32 (w1[3]); + w2_t[0] = swap32 (w2[0]); + w2_t[1] = swap32 (w2[1]); + w2_t[2] = swap32 (w2[2]); + w2_t[3] = swap32 (w2[3]); + w3_t[0] = swap32 (w3[0]); + w3_t[1] = swap32 (w3[1]); w3_t[2] = 0; w3_t[3] = pw_len * 8; diff --git a/OpenCL/m10900.cl b/OpenCL/m10900.cl index de3b193..96c4195 100644 --- a/OpenCL/m10900.cl +++ b/OpenCL/m10900.cl @@ -241,31 +241,31 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10900_init (__gl u32 w0[4]; - w0[0] = swap_workaround (pws[gid].i[ 0]); - w0[1] = swap_workaround (pws[gid].i[ 1]); - w0[2] = swap_workaround (pws[gid].i[ 2]); - w0[3] = swap_workaround (pws[gid].i[ 3]); + w0[0] = swap32 (pws[gid].i[ 0]); + w0[1] = swap32 (pws[gid].i[ 1]); + w0[2] = swap32 (pws[gid].i[ 2]); + w0[3] = swap32 (pws[gid].i[ 3]); u32 w1[4]; - w1[0] = swap_workaround (pws[gid].i[ 4]); - w1[1] = swap_workaround (pws[gid].i[ 5]); - w1[2] = swap_workaround (pws[gid].i[ 6]); - w1[3] = swap_workaround (pws[gid].i[ 7]); + w1[0] = swap32 (pws[gid].i[ 4]); + w1[1] = 
swap32 (pws[gid].i[ 5]); + w1[2] = swap32 (pws[gid].i[ 6]); + w1[3] = swap32 (pws[gid].i[ 7]); u32 w2[4]; - w2[0] = swap_workaround (pws[gid].i[ 8]); - w2[1] = swap_workaround (pws[gid].i[ 9]); - w2[2] = swap_workaround (pws[gid].i[10]); - w2[3] = swap_workaround (pws[gid].i[11]); + w2[0] = swap32 (pws[gid].i[ 8]); + w2[1] = swap32 (pws[gid].i[ 9]); + w2[2] = swap32 (pws[gid].i[10]); + w2[3] = swap32 (pws[gid].i[11]); u32 w3[4]; - w3[0] = swap_workaround (pws[gid].i[12]); - w3[1] = swap_workaround (pws[gid].i[13]); - w3[2] = swap_workaround (pws[gid].i[14]); - w3[3] = swap_workaround (pws[gid].i[15]); + w3[0] = swap32 (pws[gid].i[12]); + w3[1] = swap32 (pws[gid].i[13]); + w3[2] = swap32 (pws[gid].i[14]); + w3[3] = swap32 (pws[gid].i[15]); /** * salt @@ -278,20 +278,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10900_init (__gl u32 esalt_buf2[4]; u32 esalt_buf3[4]; - esalt_buf0[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0]); - esalt_buf0[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]); - esalt_buf0[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2]); - esalt_buf0[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]); - esalt_buf1[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4]); - esalt_buf1[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]); - esalt_buf1[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6]); - esalt_buf1[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]); - esalt_buf2[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8]); - esalt_buf2[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]); - esalt_buf2[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[10]); - esalt_buf2[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[11]); - esalt_buf3[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[12]); - esalt_buf3[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[13]); + esalt_buf0[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 0]); + esalt_buf0[1] = swap32 
(esalt_bufs[salt_pos].salt_buf[ 1]); + esalt_buf0[2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 2]); + esalt_buf0[3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); + esalt_buf1[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 4]); + esalt_buf1[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); + esalt_buf1[2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 6]); + esalt_buf1[3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); + esalt_buf2[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 8]); + esalt_buf2[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); + esalt_buf2[2] = swap32 (esalt_bufs[salt_pos].salt_buf[10]); + esalt_buf2[3] = swap32 (esalt_bufs[salt_pos].salt_buf[11]); + esalt_buf3[0] = swap32 (esalt_bufs[salt_pos].salt_buf[12]); + esalt_buf3[1] = swap32 (esalt_bufs[salt_pos].salt_buf[13]); esalt_buf3[2] = 0; esalt_buf3[3] = (64 + salt_len + 4) * 8; diff --git a/OpenCL/m11200_a0.cl b/OpenCL/m11200_a0.cl index de3eac6..cbec710 100644 --- a/OpenCL/m11200_a0.cl +++ b/OpenCL/m11200_a0.cl @@ -60,11 +60,11 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_m04 (__glo u32 salt_buf[5]; - salt_buf[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); - salt_buf[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[2]); - salt_buf[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[3]); - salt_buf[4] = swap_workaround (salt_bufs[salt_pos].salt_buf[4]); + salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]); /** * loop @@ -108,20 +108,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_m04 (__glo * sha1 ($pass) */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround 
(w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; @@ -565,11 +565,11 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_s04 (__glo u32 salt_buf[5]; - salt_buf[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); - salt_buf[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[2]); - salt_buf[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[3]); - salt_buf[4] = swap_workaround (salt_bufs[salt_pos].salt_buf[4]); + salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]); /** * loop @@ -613,20 +613,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_s04 (__glo * sha1 ($pass) */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = 
swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; diff --git a/OpenCL/m11200_a1.cl b/OpenCL/m11200_a1.cl index 7279d14..bf7c8bc 100644 --- a/OpenCL/m11200_a1.cl +++ b/OpenCL/m11200_a1.cl @@ -79,11 +79,11 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_m04 (__glo u32 salt_buf[5]; - salt_buf[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); - salt_buf[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[2]); - salt_buf[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[3]); - salt_buf[4] = swap_workaround (salt_bufs[salt_pos].salt_buf[4]); + salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]); /** * loop @@ -162,20 +162,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_m04 (__glo * sha1 ($pass) */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = 
swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -640,11 +640,11 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_s04 (__glo u32 salt_buf[5]; - salt_buf[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); - salt_buf[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[2]); - salt_buf[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[3]); - salt_buf[4] = swap_workaround (salt_bufs[salt_pos].salt_buf[4]); + salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]); /** * loop @@ -723,20 +723,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_s04 (__glo * sha1 ($pass) */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 
wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; diff --git a/OpenCL/m11200_a3.cl b/OpenCL/m11200_a3.cl index 3670a92..5b4ff60 100644 --- a/OpenCL/m11200_a3.cl +++ b/OpenCL/m11200_a3.cl @@ -35,11 +35,11 @@ static void m11200m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf[5]; - salt_buf[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); - salt_buf[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[2]); - salt_buf[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[3]); - salt_buf[4] = swap_workaround (salt_bufs[salt_pos].salt_buf[4]); + salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]); /** * loop @@ -483,11 +483,11 @@ static void m11200s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf[5]; - salt_buf[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); - salt_buf[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[2]); - salt_buf[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[3]); - salt_buf[4] = swap_workaround (salt_bufs[salt_pos].salt_buf[4]); + salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = swap32 
(salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]); /** * loop diff --git a/OpenCL/m11300.cl b/OpenCL/m11300.cl index a786027..cbf2363 100644 --- a/OpenCL/m11300.cl +++ b/OpenCL/m11300.cl @@ -1102,22 +1102,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11300_init (__gl * init */ - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u64 w[16]; @@ -1358,10 +1358,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11300_comp (__gl { u32 data[4]; - data[0] = swap_workaround (esalt_bufs[salt_pos].cry_master_buf[(i / 4) + 0]); - data[1] = swap_workaround (esalt_bufs[salt_pos].cry_master_buf[(i / 4) + 1]); - data[2] = swap_workaround (esalt_bufs[salt_pos].cry_master_buf[(i / 4) + 2]); - data[3] = swap_workaround (esalt_bufs[salt_pos].cry_master_buf[(i / 4) + 3]); + data[0] = swap32 (esalt_bufs[salt_pos].cry_master_buf[(i / 4) + 0]); + data[1] = swap32 
(esalt_bufs[salt_pos].cry_master_buf[(i / 4) + 1]); + data[2] = swap32 (esalt_bufs[salt_pos].cry_master_buf[(i / 4) + 2]); + data[3] = swap32 (esalt_bufs[salt_pos].cry_master_buf[(i / 4) + 3]); AES256_decrypt (data, out, rk, s_td0, s_td1, s_td2, s_td3, s_td4); diff --git a/OpenCL/m11400_a0.cl b/OpenCL/m11400_a0.cl index 03ca34c..23d9ba5 100644 --- a/OpenCL/m11400_a0.cl +++ b/OpenCL/m11400_a0.cl @@ -22,28 +22,19 @@ #define COMPARE_S "check_single_comp4.c" #define COMPARE_M "check_multi_comp4.c" -#ifdef VECT_SIZE1 #define uint_to_hex_lower8(i) l_bin2asc[(i)] -#endif - -#ifdef VECT_SIZE2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#endif - -#ifdef VECT_SIZE4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#endif static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 append3[4], const u32 append_len) { const u32 mod = block_len & 3; const u32 div = block_len / 4; + #ifdef IS_AMD const int offset_minus_4 = 4 - mod; u32 append0_t[4]; - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); + append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); @@ -103,6 +94,49 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const append4_t[2] = 0; append4_t[3] = 0; } + #endif + + #ifdef IS_NV + + const int offset_minus_4 = 4 - mod; + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + u32 append0_t[4]; + + append0_t[0] = __byte_perm ( 0, append0[0], selector); + append0_t[1] = __byte_perm (append0[0], append0[1], selector); + append0_t[2] = __byte_perm (append0[1], append0[2], selector); + append0_t[3] = __byte_perm 
(append0[2], append0[3], selector); + + u32 append1_t[4]; + + append1_t[0] = __byte_perm (append0[3], append1[0], selector); + append1_t[1] = __byte_perm (append1[0], append1[1], selector); + append1_t[2] = __byte_perm (append1[1], append1[2], selector); + append1_t[3] = __byte_perm (append1[2], append1[3], selector); + + u32 append2_t[4]; + + append2_t[0] = __byte_perm (append1[3], append2[0], selector); + append2_t[1] = __byte_perm (append2[0], append2[1], selector); + append2_t[2] = __byte_perm (append2[1], append2[2], selector); + append2_t[3] = __byte_perm (append2[2], append2[3], selector); + + u32 append3_t[4]; + + append3_t[0] = __byte_perm (append2[3], append3[0], selector); + append3_t[1] = __byte_perm (append3[0], append3[1], selector); + append3_t[2] = __byte_perm (append3[1], append3[2], selector); + append3_t[3] = __byte_perm (append3[2], append3[3], selector); + + u32 append4_t[4]; + + append4_t[0] = __byte_perm (append3[3], 0, selector); + append4_t[1] = 0; + append4_t[2] = 0; + append4_t[3] = 0; + #endif switch (div) { diff --git a/OpenCL/m11400_a1.cl b/OpenCL/m11400_a1.cl index 272d48d..8a467ae 100644 --- a/OpenCL/m11400_a1.cl +++ b/OpenCL/m11400_a1.cl @@ -20,28 +20,19 @@ #define COMPARE_S "check_single_comp4.c" #define COMPARE_M "check_multi_comp4.c" -#ifdef VECT_SIZE1 #define uint_to_hex_lower8(i) l_bin2asc[(i)] -#endif - -#ifdef VECT_SIZE2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#endif - -#ifdef VECT_SIZE4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#endif static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 append3[4], const u32 append_len) { const u32 mod = block_len & 3; const u32 div = block_len / 4; + #ifdef IS_AMD const int offset_minus_4 = 4 - mod; u32 append0_t[4]; - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); + 
append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); @@ -101,6 +92,49 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const append4_t[2] = 0; append4_t[3] = 0; } + #endif + + #ifdef IS_NV + + const int offset_minus_4 = 4 - mod; + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + u32 append0_t[4]; + + append0_t[0] = __byte_perm ( 0, append0[0], selector); + append0_t[1] = __byte_perm (append0[0], append0[1], selector); + append0_t[2] = __byte_perm (append0[1], append0[2], selector); + append0_t[3] = __byte_perm (append0[2], append0[3], selector); + + u32 append1_t[4]; + + append1_t[0] = __byte_perm (append0[3], append1[0], selector); + append1_t[1] = __byte_perm (append1[0], append1[1], selector); + append1_t[2] = __byte_perm (append1[1], append1[2], selector); + append1_t[3] = __byte_perm (append1[2], append1[3], selector); + + u32 append2_t[4]; + + append2_t[0] = __byte_perm (append1[3], append2[0], selector); + append2_t[1] = __byte_perm (append2[0], append2[1], selector); + append2_t[2] = __byte_perm (append2[1], append2[2], selector); + append2_t[3] = __byte_perm (append2[2], append2[3], selector); + + u32 append3_t[4]; + + append3_t[0] = __byte_perm (append2[3], append3[0], selector); + append3_t[1] = __byte_perm (append3[0], append3[1], selector); + append3_t[2] = __byte_perm (append3[1], append3[2], selector); + append3_t[3] = __byte_perm (append3[2], append3[3], selector); + + u32 append4_t[4]; + + append4_t[0] = __byte_perm (append3[3], 0, selector); + append4_t[1] = 0; + append4_t[2] = 0; + append4_t[3] = 0; + #endif switch (div) { diff --git a/OpenCL/m11400_a3.cl b/OpenCL/m11400_a3.cl index 6043c1d..0113fc0 100644 --- a/OpenCL/m11400_a3.cl +++ b/OpenCL/m11400_a3.cl @@ -20,32 +20,19 @@ 
#define COMPARE_S "check_single_comp4.c" #define COMPARE_M "check_multi_comp4.c" -#ifdef VECT_SIZE1 #define uint_to_hex_lower8(i) l_bin2asc[(i)] -#endif - -#ifdef VECT_SIZE1 -#define uint_to_hex_lower8(i) l_bin2asc[(i)] -#endif - -#ifdef VECT_SIZE2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#endif - -#ifdef VECT_SIZE4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#endif static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 append3[4], const u32 append_len) { const u32 mod = block_len & 3; const u32 div = block_len / 4; + #ifdef IS_AMD const int offset_minus_4 = 4 - mod; u32 append0_t[4]; - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); + append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); @@ -105,6 +92,49 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const append4_t[2] = 0; append4_t[3] = 0; } + #endif + + #ifdef IS_NV + + const int offset_minus_4 = 4 - mod; + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + u32 append0_t[4]; + + append0_t[0] = __byte_perm ( 0, append0[0], selector); + append0_t[1] = __byte_perm (append0[0], append0[1], selector); + append0_t[2] = __byte_perm (append0[1], append0[2], selector); + append0_t[3] = __byte_perm (append0[2], append0[3], selector); + + u32 append1_t[4]; + + append1_t[0] = __byte_perm (append0[3], append1[0], selector); + append1_t[1] = __byte_perm (append1[0], append1[1], selector); + append1_t[2] = __byte_perm (append1[1], append1[2], selector); + append1_t[3] = __byte_perm (append1[2], append1[3], selector); + + u32 append2_t[4]; + + 
append2_t[0] = __byte_perm (append1[3], append2[0], selector); + append2_t[1] = __byte_perm (append2[0], append2[1], selector); + append2_t[2] = __byte_perm (append2[1], append2[2], selector); + append2_t[3] = __byte_perm (append2[2], append2[3], selector); + + u32 append3_t[4]; + + append3_t[0] = __byte_perm (append2[3], append3[0], selector); + append3_t[1] = __byte_perm (append3[0], append3[1], selector); + append3_t[2] = __byte_perm (append3[1], append3[2], selector); + append3_t[3] = __byte_perm (append3[2], append3[3], selector); + + u32 append4_t[4]; + + append4_t[0] = __byte_perm (append3[3], 0, selector); + append4_t[1] = 0; + append4_t[2] = 0; + append4_t[3] = 0; + #endif switch (div) { diff --git a/OpenCL/m11600.cl b/OpenCL/m11600.cl index 7e64dbe..1ba1fbe 100644 --- a/OpenCL/m11600.cl +++ b/OpenCL/m11600.cl @@ -17,9 +17,8 @@ #include "types_ocl.c" #include "common.c" -#ifdef VECT_SIZE1 -#define COMPARE_M "check_multi_vect1_comp4.c" -#endif +#define COMPARE_S "check_single_comp4.c" +#define COMPARE_M "check_multi_comp4.c" __constant u32 te0[256] = { @@ -925,22 +924,22 @@ static void sha256_transform (const u32 w[16], u32 digest[8]) u32 g = digest[6]; u32 h = digest[7]; - u32 w0_t = swap_workaround (w[ 0]); - u32 w1_t = swap_workaround (w[ 1]); - u32 w2_t = swap_workaround (w[ 2]); - u32 w3_t = swap_workaround (w[ 3]); - u32 w4_t = swap_workaround (w[ 4]); - u32 w5_t = swap_workaround (w[ 5]); - u32 w6_t = swap_workaround (w[ 6]); - u32 w7_t = swap_workaround (w[ 7]); - u32 w8_t = swap_workaround (w[ 8]); - u32 w9_t = swap_workaround (w[ 9]); - u32 wa_t = swap_workaround (w[10]); - u32 wb_t = swap_workaround (w[11]); - u32 wc_t = swap_workaround (w[12]); - u32 wd_t = swap_workaround (w[13]); - u32 we_t = swap_workaround (w[14]); - u32 wf_t = swap_workaround (w[15]); + u32 w0_t = swap32 (w[ 0]); + u32 w1_t = swap32 (w[ 1]); + u32 w2_t = swap32 (w[ 2]); + u32 w3_t = swap32 (w[ 3]); + u32 w4_t = swap32 (w[ 4]); + u32 w5_t = swap32 (w[ 5]); + u32 w6_t = 
swap32 (w[ 6]); + u32 w7_t = swap32 (w[ 7]); + u32 w8_t = swap32 (w[ 8]); + u32 w9_t = swap32 (w[ 9]); + u32 wa_t = swap32 (w[10]); + u32 wb_t = swap32 (w[11]); + u32 wc_t = swap32 (w[12]); + u32 wd_t = swap32 (w[13]); + u32 we_t = swap32 (w[14]); + u32 wf_t = swap32 (w[15]); #define ROUND_EXPAND() \ { \ @@ -1144,6 +1143,17 @@ static u32 memcat8c (u32 block[16], const u32 block_len, const u32 append[2], co u32 tmp1; u32 tmp2; + #ifdef IS_NV + const int offset_minus_4 = 4 - (block_len & 3); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + tmp0 = __byte_perm ( 0, append[0], selector); + tmp1 = __byte_perm (append[0], append[1], selector); + tmp2 = __byte_perm (append[1], 0, selector); + #endif + + #ifdef IS_AMD const int offset_minus_4 = 4 - block_len; tmp0 = amd_bytealign (append[0], 0, offset_minus_4); @@ -1156,6 +1166,7 @@ static u32 memcat8c (u32 block[16], const u32 block_len, const u32 append[2], co tmp1 = tmp2; tmp2 = 0; } + #endif u32 carry[2] = { 0, 0 }; @@ -1259,6 +1270,23 @@ static u32 memcat32c (u32 block[16], const u32 block_len, const u32 append[8], c u32 tmp7; u32 tmp8; + #ifdef IS_NV + const int offset_minus_4 = 4 - (block_len & 3); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + tmp0 = __byte_perm ( 0, append[0], selector); + tmp1 = __byte_perm (append[0], append[1], selector); + tmp2 = __byte_perm (append[1], append[2], selector); + tmp3 = __byte_perm (append[2], append[3], selector); + tmp4 = __byte_perm (append[3], append[4], selector); + tmp5 = __byte_perm (append[4], append[5], selector); + tmp6 = __byte_perm (append[5], append[6], selector); + tmp7 = __byte_perm (append[6], append[7], selector); + tmp8 = __byte_perm (append[7], 0, selector); + #endif + + #ifdef IS_AMD const int offset_minus_4 = 4 - block_len; tmp0 = amd_bytealign (append[0], 0, offset_minus_4); @@ -1283,6 +1311,7 @@ static u32 memcat32c (u32 block[16], const u32 block_len, const u32 append[8], c tmp7 = tmp8; tmp8 = 0; } + 
#endif u32 carry[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -1770,7 +1799,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl u32 block_len = tmps[gid].block_len; u32 final_len = tmps[gid].final_len; - append_0x80_4x4 (block, block_len); + append_0x80_1x16 (block, block_len); if (block_len >= 56) { @@ -1779,7 +1808,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl bzero16 (block); } - block[15] = swap_workaround (final_len * 8); + block[15] = swap32 (final_len * 8); sha256_transform (block, dgst); @@ -1825,10 +1854,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl { u32 data[4]; - data[0] = swap_workaround (esalt_bufs[salt_pos].data_buf[j + 0]); - data[1] = swap_workaround (esalt_bufs[salt_pos].data_buf[j + 1]); - data[2] = swap_workaround (esalt_bufs[salt_pos].data_buf[j + 2]); - data[3] = swap_workaround (esalt_bufs[salt_pos].data_buf[j + 3]); + data[0] = swap32 (esalt_bufs[salt_pos].data_buf[j + 0]); + data[1] = swap32 (esalt_bufs[salt_pos].data_buf[j + 1]); + data[2] = swap32 (esalt_bufs[salt_pos].data_buf[j + 2]); + data[3] = swap32 (esalt_bufs[salt_pos].data_buf[j + 3]); u32 out[4]; @@ -1844,20 +1873,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl iv[2] = data[2]; iv[3] = data[3]; - out[0] = swap_workaround (out[0]); - out[1] = swap_workaround (out[1]); - out[2] = swap_workaround (out[2]); - out[3] = swap_workaround (out[3]); + out[0] = swap32 (out[0]); + out[1] = swap32 (out[1]); + out[2] = swap32 (out[2]); + out[3] = swap32 (out[3]); crc = crc32 (out, 16, crc); } u32 data[4]; - data[0] = swap_workaround (esalt_bufs[salt_pos].data_buf[j + 0]); - data[1] = swap_workaround (esalt_bufs[salt_pos].data_buf[j + 1]); - data[2] = swap_workaround (esalt_bufs[salt_pos].data_buf[j + 2]); - data[3] = swap_workaround (esalt_bufs[salt_pos].data_buf[j + 3]); + data[0] = swap32 (esalt_bufs[salt_pos].data_buf[j + 0]); + data[1] = swap32 
(esalt_bufs[salt_pos].data_buf[j + 1]); + data[2] = swap32 (esalt_bufs[salt_pos].data_buf[j + 2]); + data[3] = swap32 (esalt_bufs[salt_pos].data_buf[j + 3]); u32 out[4]; @@ -1873,10 +1902,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl iv[2] = data[2]; iv[3] = data[3]; - out[0] = swap_workaround (out[0]); - out[1] = swap_workaround (out[1]); - out[2] = swap_workaround (out[2]); - out[3] = swap_workaround (out[3]); + out[0] = swap32 (out[0]); + out[1] = swap32 (out[1]); + out[2] = swap32 (out[2]); + out[3] = swap32 (out[3]); const u32 margin = data_len - unpack_size; diff --git a/OpenCL/m11700_a0.cl b/OpenCL/m11700_a0.cl index 00ffbc7..305424d 100644 --- a/OpenCL/m11700_a0.cl +++ b/OpenCL/m11700_a0.cl @@ -2403,14 +2403,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_m04 (__glo m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2436,7 +2436,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_m04 (__glo z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (out_len * 8)); + z[7] = swap32 ((u64) (out_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); @@ -2590,14 +2590,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_s04 (__glo m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround (m[3]); - m[4] = 
swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2623,7 +2623,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_s04 (__glo z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (out_len * 8)); + z[7] = swap32 ((u64) (out_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); diff --git a/OpenCL/m11700_a1.cl b/OpenCL/m11700_a1.cl index 0d228e4..9dc6cfa 100644 --- a/OpenCL/m11700_a1.cl +++ b/OpenCL/m11700_a1.cl @@ -2457,14 +2457,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_m04 (__glo m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2490,7 +2490,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_m04 (__glo z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (pw_len * 8)); + z[7] = swap32 ((u64) (pw_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); @@ -2700,14 +2700,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_s04 (__glo m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround (m[3]); - 
m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2733,7 +2733,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_s04 (__glo z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (pw_len * 8)); + z[7] = swap32 ((u64) (pw_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); diff --git a/OpenCL/m11700_a3.cl b/OpenCL/m11700_a3.cl index 12187ef..8b5fa93 100644 --- a/OpenCL/m11700_a3.cl +++ b/OpenCL/m11700_a3.cl @@ -2318,14 +2318,14 @@ static void m11700m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2351,7 +2351,7 @@ static void m11700m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (pw_len * 8)); + z[7] = swap32 ((u64) (pw_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); @@ -2413,14 +2413,14 @@ static void m11700s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround 
(m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2446,7 +2446,7 @@ static void m11700s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (pw_len * 8)); + z[7] = swap32 ((u64) (pw_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); diff --git a/OpenCL/m11800_a0.cl b/OpenCL/m11800_a0.cl index 6229099..ed31f18 100644 --- a/OpenCL/m11800_a0.cl +++ b/OpenCL/m11800_a0.cl @@ -2403,14 +2403,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m04 (__glo m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2436,7 +2436,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m04 (__glo z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (out_len * 8)); + z[7] = swap32 ((u64) (out_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); @@ -2590,14 +2590,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s04 (__glo m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = 
swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2623,7 +2623,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s04 (__glo z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (out_len * 8)); + z[7] = swap32 ((u64) (out_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); diff --git a/OpenCL/m11800_a1.cl b/OpenCL/m11800_a1.cl index 74e22b3..64232d3 100644 --- a/OpenCL/m11800_a1.cl +++ b/OpenCL/m11800_a1.cl @@ -2458,14 +2458,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m04 (__glo m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2491,7 +2491,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m04 (__glo z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (pw_len * 8)); + z[7] = swap32 ((u64) (pw_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); @@ -2701,14 +2701,14 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s04 (__glo m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - 
m[3] = swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2734,7 +2734,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s04 (__glo z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (pw_len * 8)); + z[7] = swap32 ((u64) (pw_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); diff --git a/OpenCL/m11800_a3.cl b/OpenCL/m11800_a3.cl index b1e81d7..6a702fc 100644 --- a/OpenCL/m11800_a3.cl +++ b/OpenCL/m11800_a3.cl @@ -2318,14 +2318,14 @@ static void m11800m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround (m[2]); - m[3] = swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2351,7 +2351,7 @@ static void m11800m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (pw_len * 8)); + z[7] = swap32 ((u64) (pw_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); @@ -2413,14 +2413,14 @@ static void m11800s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le m[6] = hl32_to_64 (w[ 3], w[ 2]); m[7] = hl32_to_64 (w[ 1], w[ 0]); - m[0] = swap_workaround (m[0]); - m[1] = swap_workaround (m[1]); - m[2] = swap_workaround 
(m[2]); - m[3] = swap_workaround (m[3]); - m[4] = swap_workaround (m[4]); - m[5] = swap_workaround (m[5]); - m[6] = swap_workaround (m[6]); - m[7] = swap_workaround (m[7]); + m[0] = swap32 (m[0]); + m[1] = swap32 (m[1]); + m[2] = swap32 (m[2]); + m[3] = swap32 (m[3]); + m[4] = swap32 (m[4]); + m[5] = swap32 (m[5]); + m[6] = swap32 (m[6]); + m[7] = swap32 (m[7]); // state buffer (hash) @@ -2446,7 +2446,7 @@ static void m11800s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le z[4] = 0; z[5] = 0; z[6] = 0; - z[7] = swap_workaround ((u64) (pw_len * 8)); + z[7] = swap32 ((u64) (pw_len * 8)); streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); diff --git a/OpenCL/m12000.cl b/OpenCL/m12000.cl index d85fbf4..eeb17ae 100644 --- a/OpenCL/m12000.cl +++ b/OpenCL/m12000.cl @@ -251,31 +251,31 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12000_init (__gl u32 w0[4]; - w0[0] = swap_workaround (pws[gid].i[ 0]); - w0[1] = swap_workaround (pws[gid].i[ 1]); - w0[2] = swap_workaround (pws[gid].i[ 2]); - w0[3] = swap_workaround (pws[gid].i[ 3]); + w0[0] = swap32 (pws[gid].i[ 0]); + w0[1] = swap32 (pws[gid].i[ 1]); + w0[2] = swap32 (pws[gid].i[ 2]); + w0[3] = swap32 (pws[gid].i[ 3]); u32 w1[4]; - w1[0] = swap_workaround (pws[gid].i[ 4]); - w1[1] = swap_workaround (pws[gid].i[ 5]); - w1[2] = swap_workaround (pws[gid].i[ 6]); - w1[3] = swap_workaround (pws[gid].i[ 7]); + w1[0] = swap32 (pws[gid].i[ 4]); + w1[1] = swap32 (pws[gid].i[ 5]); + w1[2] = swap32 (pws[gid].i[ 6]); + w1[3] = swap32 (pws[gid].i[ 7]); u32 w2[4]; - w2[0] = swap_workaround (pws[gid].i[ 8]); - w2[1] = swap_workaround (pws[gid].i[ 9]); - w2[2] = swap_workaround (pws[gid].i[10]); - w2[3] = swap_workaround (pws[gid].i[11]); + w2[0] = swap32 (pws[gid].i[ 8]); + w2[1] = swap32 (pws[gid].i[ 9]); + w2[2] = swap32 (pws[gid].i[10]); + w2[3] = swap32 (pws[gid].i[11]); u32 w3[4]; - w3[0] = swap_workaround (pws[gid].i[12]); - w3[1] = swap_workaround (pws[gid].i[13]); - w3[2] = 
swap_workaround (pws[gid].i[14]); - w3[3] = swap_workaround (pws[gid].i[15]); + w3[0] = swap32 (pws[gid].i[12]); + w3[1] = swap32 (pws[gid].i[13]); + w3[2] = swap32 (pws[gid].i[14]); + w3[3] = swap32 (pws[gid].i[15]); /** * salt @@ -288,20 +288,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12000_init (__gl u32 esalt_buf2[4]; u32 esalt_buf3[4]; - esalt_buf0[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 0]); - esalt_buf0[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 1]); - esalt_buf0[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 2]); - esalt_buf0[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 3]); - esalt_buf1[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 4]); - esalt_buf1[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 5]); - esalt_buf1[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 6]); - esalt_buf1[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 7]); - esalt_buf2[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 8]); - esalt_buf2[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[ 9]); - esalt_buf2[2] = swap_workaround (esalt_bufs[salt_pos].salt_buf[10]); - esalt_buf2[3] = swap_workaround (esalt_bufs[salt_pos].salt_buf[11]); - esalt_buf3[0] = swap_workaround (esalt_bufs[salt_pos].salt_buf[12]); - esalt_buf3[1] = swap_workaround (esalt_bufs[salt_pos].salt_buf[13]); + esalt_buf0[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 0]); + esalt_buf0[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); + esalt_buf0[2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 2]); + esalt_buf0[3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); + esalt_buf1[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 4]); + esalt_buf1[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); + esalt_buf1[2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 6]); + esalt_buf1[3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); + esalt_buf2[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 8]); + esalt_buf2[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); + esalt_buf2[2] 
= swap32 (esalt_bufs[salt_pos].salt_buf[10]); + esalt_buf2[3] = swap32 (esalt_bufs[salt_pos].salt_buf[11]); + esalt_buf3[0] = swap32 (esalt_bufs[salt_pos].salt_buf[12]); + esalt_buf3[1] = swap32 (esalt_bufs[salt_pos].salt_buf[13]); esalt_buf3[2] = 0; esalt_buf3[3] = (64 + salt_len + 4) * 8; diff --git a/OpenCL/m12200.cl b/OpenCL/m12200.cl index 279ab63..298bbc7 100644 --- a/OpenCL/m12200.cl +++ b/OpenCL/m12200.cl @@ -173,22 +173,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12200_init (__gl append_0x80_4x4 (w0, w1, w2, w3, pw_len); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); /** * salt diff --git a/OpenCL/m12300.cl b/OpenCL/m12300.cl index 97e67fe..8d4df81 100644 --- a/OpenCL/m12300.cl +++ b/OpenCL/m12300.cl @@ -246,31 +246,31 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12300_init (__gl u32 w0[4]; - w0[0] = swap_workaround (pws[gid].i[ 0]); - w0[1] = swap_workaround (pws[gid].i[ 1]); - w0[2] = swap_workaround (pws[gid].i[ 2]); - w0[3] = swap_workaround (pws[gid].i[ 3]); + w0[0] = swap32 (pws[gid].i[ 0]); 
+ w0[1] = swap32 (pws[gid].i[ 1]); + w0[2] = swap32 (pws[gid].i[ 2]); + w0[3] = swap32 (pws[gid].i[ 3]); u32 w1[4]; - w1[0] = swap_workaround (pws[gid].i[ 4]); - w1[1] = swap_workaround (pws[gid].i[ 5]); - w1[2] = swap_workaround (pws[gid].i[ 6]); - w1[3] = swap_workaround (pws[gid].i[ 7]); + w1[0] = swap32 (pws[gid].i[ 4]); + w1[1] = swap32 (pws[gid].i[ 5]); + w1[2] = swap32 (pws[gid].i[ 6]); + w1[3] = swap32 (pws[gid].i[ 7]); u32 w2[4]; - w2[0] = swap_workaround (pws[gid].i[ 8]); - w2[1] = swap_workaround (pws[gid].i[ 9]); - w2[2] = swap_workaround (pws[gid].i[10]); - w2[3] = swap_workaround (pws[gid].i[11]); + w2[0] = swap32 (pws[gid].i[ 8]); + w2[1] = swap32 (pws[gid].i[ 9]); + w2[2] = swap32 (pws[gid].i[10]); + w2[3] = swap32 (pws[gid].i[11]); u32 w3[4]; - w3[0] = swap_workaround (pws[gid].i[12]); - w3[1] = swap_workaround (pws[gid].i[13]); - w3[2] = swap_workaround (pws[gid].i[14]); - w3[3] = swap_workaround (pws[gid].i[15]); + w3[0] = swap32 (pws[gid].i[12]); + w3[1] = swap32 (pws[gid].i[13]); + w3[2] = swap32 (pws[gid].i[14]); + w3[3] = swap32 (pws[gid].i[15]); /** * salt diff --git a/OpenCL/m12500.cl b/OpenCL/m12500.cl index 52a6149..e726518 100644 --- a/OpenCL/m12500.cl +++ b/OpenCL/m12500.cl @@ -1227,10 +1227,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12500_comp (__gl u32 ukeyx[4]; - ukeyx[0] = swap_workaround (dgst[0]); - ukeyx[1] = swap_workaround (dgst[1]); - ukeyx[2] = swap_workaround (dgst[2]); - ukeyx[3] = swap_workaround (dgst[3]); + ukeyx[0] = swap32 (dgst[0]); + ukeyx[1] = swap32 (dgst[1]); + ukeyx[2] = swap32 (dgst[2]); + ukeyx[3] = swap32 (dgst[3]); AES128_ExpandKey (ukeyx, rk, s_te0, s_te1, s_te2, s_te3, s_te4); @@ -1322,10 +1322,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12500_comp (__gl PUTCHAR (iv, i, dgst[4] & 0xff); } - out[0] ^= swap_workaround (iv[0]); - out[1] ^= swap_workaround (iv[1]); - out[2] ^= swap_workaround (iv[2]); - out[3] ^= swap_workaround (iv[3]); + out[0] ^= swap32 
(iv[0]); + out[1] ^= swap32 (iv[1]); + out[2] ^= swap32 (iv[2]); + out[3] ^= swap32 (iv[3]); const u32 r0 = out[0]; const u32 r1 = out[1]; diff --git a/OpenCL/m12600_a0.cl b/OpenCL/m12600_a0.cl index 3eab930..33e7cd1 100644 --- a/OpenCL/m12600_a0.cl +++ b/OpenCL/m12600_a0.cl @@ -156,20 +156,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; @@ -309,16 +309,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_m04 (__glo w9_t = uint_to_hex_upper8 ((e >> 8) & 255) << 0 | uint_to_hex_upper8 ((e >> 0) & 255) << 16; - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = 
swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); wa_t = 0x80000000; wb_t = 0; wc_t = 0; @@ -554,20 +554,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = out_len * 8; @@ -708,16 +708,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_s04 (__glo w9_t = uint_to_hex_upper8 ((e >> 8) & 255) << 0 | uint_to_hex_upper8 ((e >> 0) & 255) << 16; - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 
(w8_t); + w9_t = swap32 (w9_t); wa_t = 0x80000000; wb_t = 0; wc_t = 0; diff --git a/OpenCL/m12600_a1.cl b/OpenCL/m12600_a1.cl index 5a1133a..978dfbb 100644 --- a/OpenCL/m12600_a1.cl +++ b/OpenCL/m12600_a1.cl @@ -210,20 +210,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_m04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -364,16 +364,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_m04 (__glo w9_t = uint_to_hex_upper8 ((e >> 8) & 255) << 0 | uint_to_hex_upper8 ((e >> 0) & 255) << 16; - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = 
swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); wa_t = 0x80000000; wb_t = 0; wc_t = 0; @@ -665,20 +665,20 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_s04 (__glo * sha1 */ - u32 w0_t = swap_workaround (w0[0]); - u32 w1_t = swap_workaround (w0[1]); - u32 w2_t = swap_workaround (w0[2]); - u32 w3_t = swap_workaround (w0[3]); - u32 w4_t = swap_workaround (w1[0]); - u32 w5_t = swap_workaround (w1[1]); - u32 w6_t = swap_workaround (w1[2]); - u32 w7_t = swap_workaround (w1[3]); - u32 w8_t = swap_workaround (w2[0]); - u32 w9_t = swap_workaround (w2[1]); - u32 wa_t = swap_workaround (w2[2]); - u32 wb_t = swap_workaround (w2[3]); - u32 wc_t = swap_workaround (w3[0]); - u32 wd_t = swap_workaround (w3[1]); + u32 w0_t = swap32 (w0[0]); + u32 w1_t = swap32 (w0[1]); + u32 w2_t = swap32 (w0[2]); + u32 w3_t = swap32 (w0[3]); + u32 w4_t = swap32 (w1[0]); + u32 w5_t = swap32 (w1[1]); + u32 w6_t = swap32 (w1[2]); + u32 w7_t = swap32 (w1[3]); + u32 w8_t = swap32 (w2[0]); + u32 w9_t = swap32 (w2[1]); + u32 wa_t = swap32 (w2[2]); + u32 wb_t = swap32 (w2[3]); + u32 wc_t = swap32 (w3[0]); + u32 wd_t = swap32 (w3[1]); u32 we_t = 0; u32 wf_t = pw_len * 8; @@ -819,16 +819,16 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_s04 (__glo w9_t = uint_to_hex_upper8 ((e >> 8) & 255) << 0 | uint_to_hex_upper8 ((e >> 0) & 255) << 16; - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); wa_t = 0x80000000; wb_t = 0; wc_t = 0; diff 
--git a/OpenCL/m12600_a3.cl b/OpenCL/m12600_a3.cl index 1dfc85e..14addfa 100644 --- a/OpenCL/m12600_a3.cl +++ b/OpenCL/m12600_a3.cl @@ -225,16 +225,16 @@ static void m12600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w9_t = uint_to_hex_upper8 ((e >> 8) & 255) << 0 | uint_to_hex_upper8 ((e >> 0) & 255) << 16; - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); wa_t = 0x80000000; wb_t = 0; wc_t = 0; @@ -533,16 +533,16 @@ static void m12600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w9_t = uint_to_hex_upper8 ((e >> 8) & 255) << 0 | uint_to_hex_upper8 ((e >> 0) & 255) << 16; - w0_t = swap_workaround (w0_t); - w1_t = swap_workaround (w1_t); - w2_t = swap_workaround (w2_t); - w3_t = swap_workaround (w3_t); - w4_t = swap_workaround (w4_t); - w5_t = swap_workaround (w5_t); - w6_t = swap_workaround (w6_t); - w7_t = swap_workaround (w7_t); - w8_t = swap_workaround (w8_t); - w9_t = swap_workaround (w9_t); + w0_t = swap32 (w0_t); + w1_t = swap32 (w1_t); + w2_t = swap32 (w2_t); + w3_t = swap32 (w3_t); + w4_t = swap32 (w4_t); + w5_t = swap32 (w5_t); + w6_t = swap32 (w6_t); + w7_t = swap32 (w7_t); + w8_t = swap32 (w8_t); + w9_t = swap32 (w9_t); wa_t = 0x80000000; wb_t = 0; wc_t = 0; diff --git a/OpenCL/m12700.cl b/OpenCL/m12700.cl index 2c56350..214ceb0 100644 --- a/OpenCL/m12700.cl +++ b/OpenCL/m12700.cl @@ -1170,22 +1170,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12700_init (__gl * pads */ - w0[0] = 
swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[5]; u32 opad[5]; @@ -1558,10 +1558,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12700_comp (__gl out[2] ^= iv[2]; out[3] ^= iv[3]; - out[0] = swap_workaround (out[0]); - out[1] = swap_workaround (out[1]); - out[2] = swap_workaround (out[2]); - out[3] = swap_workaround (out[3]); + out[0] = swap32 (out[0]); + out[1] = swap32 (out[1]); + out[2] = swap32 (out[2]); + out[3] = swap32 (out[3]); if ((out[0] & 0xff) != '{') return; diff --git a/OpenCL/m12800.cl b/OpenCL/m12800.cl index a4ec85b..54375f4 100644 --- a/OpenCL/m12800.cl +++ b/OpenCL/m12800.cl @@ -383,10 +383,10 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12800_init (__gl u32 salt_buf0[4]; - salt_buf0[0] = swap_workaround (salt_bufs[salt_pos].salt_buf[0]); - salt_buf0[1] = swap_workaround (salt_bufs[salt_pos].salt_buf[1]); - salt_buf0[2] = swap_workaround (salt_bufs[salt_pos].salt_buf[2]); - salt_buf0[3] = swap_workaround (salt_bufs[salt_pos].salt_buf[3]); + salt_buf0[0] = swap32 
(salt_bufs[salt_pos].salt_buf[0]); + salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); + salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); + salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); u32 salt_buf1[4]; @@ -449,22 +449,22 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12800_init (__gl make_unicode (w1, w2, w3); make_unicode (w0, w0, w1); - w0[0] = swap_workaround (w0[0]); - w0[1] = swap_workaround (w0[1]); - w0[2] = swap_workaround (w0[2]); - w0[3] = swap_workaround (w0[3]); - w1[0] = swap_workaround (w1[0]); - w1[1] = swap_workaround (w1[1]); - w1[2] = swap_workaround (w1[2]); - w1[3] = swap_workaround (w1[3]); - w2[0] = swap_workaround (w2[0]); - w2[1] = swap_workaround (w2[1]); - w2[2] = swap_workaround (w2[2]); - w2[3] = swap_workaround (w2[3]); - w3[0] = swap_workaround (w3[0]); - w3[1] = swap_workaround (w3[1]); - w3[2] = swap_workaround (w3[2]); - w3[3] = swap_workaround (w3[3]); + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); u32 ipad[8]; u32 opad[8]; diff --git a/OpenCL/types_ocl.c b/OpenCL/types_ocl.c index c052384..703151e 100644 --- a/OpenCL/types_ocl.c +++ b/OpenCL/types_ocl.c @@ -35,7 +35,8 @@ static u32 __byte_perm (const u32 a, const u32 b, const u32 c) static u32 swap32 (const u32 v) { - return __byte_perm (v, 0, 0x0123); + return (as_uint (as_uchar4 (v).s3210)); + // return __byte_perm (v, 0, 0x0123); } static u64 swap64 (const u64 v) diff --git a/src/oclHashcat.c b/src/oclHashcat.c index 2ac9e4f..2bdc17c 100644 --- a/src/oclHashcat.c +++ b/src/oclHashcat.c @@ -12936,7 +12936,7 @@ int main (int argc, char **argv) * kernel find */ - char 
build_opts[100]; + char build_opts[1024]; // we don't have sm_* on AMD but it doesn't matter @@ -13257,7 +13257,7 @@ int main (int argc, char **argv) * kernel compile */ - #ifdef BINARY_KERNEL + //#ifdef BINARY_KERNEL if (force_jit_compilation == 0) { @@ -13272,7 +13272,7 @@ int main (int argc, char **argv) sprintf (build_opts, "%s -DSCRYPT_N=%d -DSCRYPT_R=%d -DSCRYPT_P=%d -DSCRYPT_TMTO=%d", build_opts, data.salts_buf[0].scrypt_N, data.salts_buf[0].scrypt_r, data.salts_buf[0].scrypt_p, data.salts_buf[0].scrypt_tmto); } - #endif + //#endif clBuildProgram (device_param->program, 1, &device_param->device, build_opts, NULL, NULL);