* pure scalar functions
*/
-static int hash_comp (const u32 d1[4], __global u32 *d2)
+inline int hash_comp (const u32 d1[4], __global u32 *d2)
{
if (d1[3] > d2[DGST_R3]) return ( 1);
if (d1[3] < d2[DGST_R3]) return (-1);
return (0);
}
-static int find_hash (const u32 digest[4], const u32 digests_cnt, __global digest_t *digests_buf)
+inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global digest_t *digests_buf)
{
for (u32 l = 0, r = digests_cnt; r; r >>= 1)
{
return (-1);
}
-static u32 check_bitmap (__global u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
+inline u32 check_bitmap (__global u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
{
return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f)));
}
-static u32 check (const u32 digest[2], __global u32 *bitmap_s1_a, __global u32 *bitmap_s1_b, __global u32 *bitmap_s1_c, __global u32 *bitmap_s1_d, __global u32 *bitmap_s2_a, __global u32 *bitmap_s2_b, __global u32 *bitmap_s2_c, __global u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
+inline u32 check (const u32 digest[2], __global u32 *bitmap_s1_a, __global u32 *bitmap_s1_b, __global u32 *bitmap_s1_c, __global u32 *bitmap_s1_d, __global u32 *bitmap_s2_a, __global u32 *bitmap_s2_b, __global u32 *bitmap_s2_c, __global u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
{
if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
return (1);
}
-static void mark_hash (__global plain_t *plains_buf, __global u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
+inline void mark_hash (__global plain_t *plains_buf, __global u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
hashes_shown[hash_pos] = 1;
* vector functions
*/
-static void truncate_block (u32x w[4], const u32 len)
+inline void truncate_block (u32x w[4], const u32 len)
{
switch (len)
{
}
}
-static void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
+inline void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
{
#ifdef IS_NV
out2[3] = __byte_perm (in[3], 0, 0x7372);
#endif
}
-static void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
+inline void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
{
#ifdef IS_NV
out[0] = __byte_perm (in1[0], in1[1], 0x6420);
#endif
}
-static void append_0x01_1x4 (u32x w0[4], const u32 offset)
+inline void append_0x01_1x4 (u32x w0[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
+inline void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
+inline void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
+inline void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
+inline void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_1x4 (u32x w0[4], const u32 offset)
+inline void append_0x02_1x4 (u32x w0[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
+inline void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
+inline void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
+inline void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
+inline void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_1x4 (u32x w0[4], const u32 offset)
+inline void append_0x80_1x4 (u32x w0[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
+inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
+inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
+inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
+inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_1x16 (u32x w[16], const u32 offset)
+inline void append_0x80_1x16 (u32x w[16], const u32 offset)
{
switch (offset)
{
}
}
-static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
+inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
#if defined IS_AMD || defined IS_GENERIC
const int offset_mod_4 = offset & 3;
#endif
}
-static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
+inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
#if defined IS_AMD || defined IS_GENERIC
switch (offset / 4)
#endif
}
-static void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len)
+inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len)
{
#if defined cl_amd_media_ops
switch (salt_len)
#endif
}
-static void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len)
+inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len)
{
// would be nice to have optimization based on amd_bytealign as with _le counterpart
}
}
+/**
+ * Overwrite the 4 bytes starting at byte offset salt_len (0..63) of the
+ * 64-byte little-endian buffer w0[]..w3[] with the word wx.  Unaligned
+ * offsets split wx across two adjacent 32-bit words; for offsets 61..63
+ * the bytes that would spill into a (non-existent) w4[0] are dropped
+ * (see the commented-out w4[0] lines).
+ */
+inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
+{
+ #if defined cl_amd_media_ops
+ // AMD path: amd_bytealign (hi, lo, n) extracts 4 bytes from the 64-bit
+ // concatenation hi:lo, replacing the mask/shift/or sequence used below.
+ switch (salt_len)
+ {
+ case 0: w0[0] = wx;
+ break;
+ case 1: w0[0] = amd_bytealign (wx, w0[0] << 24, 3);
+ w0[1] = amd_bytealign (w0[1] >> 8, wx, 3);
+ break;
+ case 2: w0[0] = amd_bytealign (wx, w0[0] << 16, 2);
+ w0[1] = amd_bytealign (w0[1] >> 16, wx, 2);
+ break;
+ case 3: w0[0] = amd_bytealign (wx, w0[0] << 8, 1);
+ w0[1] = amd_bytealign (w0[1] >> 24, wx, 1);
+ break;
+ case 4: w0[1] = wx;
+ break;
+ case 5: w0[1] = amd_bytealign (wx, w0[1] << 24, 3);
+ w0[2] = amd_bytealign (w0[2] >> 8, wx, 3);
+ break;
+ case 6: w0[1] = amd_bytealign (wx, w0[1] << 16, 2);
+ w0[2] = amd_bytealign (w0[2] >> 16, wx, 2);
+ break;
+ case 7: w0[1] = amd_bytealign (wx, w0[1] << 8, 1);
+ w0[2] = amd_bytealign (w0[2] >> 24, wx, 1);
+ break;
+ case 8: w0[2] = wx;
+ break;
+ case 9: w0[2] = amd_bytealign (wx, w0[2] << 24, 3);
+ w0[3] = amd_bytealign (w0[3] >> 8, wx, 3);
+ break;
+ case 10: w0[2] = amd_bytealign (wx, w0[2] << 16, 2);
+ w0[3] = amd_bytealign (w0[3] >> 16, wx, 2);
+ break;
+ case 11: w0[2] = amd_bytealign (wx, w0[2] << 8, 1);
+ w0[3] = amd_bytealign (w0[3] >> 24, wx, 1);
+ break;
+ case 12: w0[3] = wx;
+ break;
+ case 13: w0[3] = amd_bytealign (wx, w0[3] << 24, 3);
+ w1[0] = amd_bytealign (w1[0] >> 8, wx, 3);
+ break;
+ case 14: w0[3] = amd_bytealign (wx, w0[3] << 16, 2);
+ w1[0] = amd_bytealign (w1[0] >> 16, wx, 2);
+ break;
+ case 15: w0[3] = amd_bytealign (wx, w0[3] << 8, 1);
+ w1[0] = amd_bytealign (w1[0] >> 24, wx, 1);
+ break;
+ case 16: w1[0] = wx;
+ break;
+ case 17: w1[0] = amd_bytealign (wx, w1[0] << 24, 3);
+ w1[1] = amd_bytealign (w1[1] >> 8, wx, 3);
+ break;
+ case 18: w1[0] = amd_bytealign (wx, w1[0] << 16, 2);
+ w1[1] = amd_bytealign (w1[1] >> 16, wx, 2);
+ break;
+ case 19: w1[0] = amd_bytealign (wx, w1[0] << 8, 1);
+ w1[1] = amd_bytealign (w1[1] >> 24, wx, 1);
+ break;
+ case 20: w1[1] = wx;
+ break;
+ case 21: w1[1] = amd_bytealign (wx, w1[1] << 24, 3);
+ w1[2] = amd_bytealign (w1[2] >> 8, wx, 3);
+ break;
+ case 22: w1[1] = amd_bytealign (wx, w1[1] << 16, 2);
+ w1[2] = amd_bytealign (w1[2] >> 16, wx, 2);
+ break;
+ case 23: w1[1] = amd_bytealign (wx, w1[1] << 8, 1);
+ w1[2] = amd_bytealign (w1[2] >> 24, wx, 1);
+ break;
+ case 24: w1[2] = wx;
+ break;
+ case 25: w1[2] = amd_bytealign (wx, w1[2] << 24, 3);
+ w1[3] = amd_bytealign (w1[3] >> 8, wx, 3);
+ break;
+ case 26: w1[2] = amd_bytealign (wx, w1[2] << 16, 2);
+ w1[3] = amd_bytealign (w1[3] >> 16, wx, 2);
+ break;
+ case 27: w1[2] = amd_bytealign (wx, w1[2] << 8, 1);
+ w1[3] = amd_bytealign (w1[3] >> 24, wx, 1);
+ break;
+ case 28: w1[3] = wx;
+ break;
+ case 29: w1[3] = amd_bytealign (wx, w1[3] << 24, 3);
+ w2[0] = amd_bytealign (w2[0] >> 8, wx, 3);
+ break;
+ case 30: w1[3] = amd_bytealign (wx, w1[3] << 16, 2);
+ w2[0] = amd_bytealign (w2[0] >> 16, wx, 2);
+ break;
+ case 31: w1[3] = amd_bytealign (wx, w1[3] << 8, 1);
+ w2[0] = amd_bytealign (w2[0] >> 24, wx, 1);
+ break;
+ case 32: w2[0] = wx;
+ break;
+ case 33: w2[0] = amd_bytealign (wx, w2[0] << 24, 3);
+ w2[1] = amd_bytealign (w2[1] >> 8, wx, 3);
+ break;
+ case 34: w2[0] = amd_bytealign (wx, w2[0] << 16, 2);
+ w2[1] = amd_bytealign (w2[1] >> 16, wx, 2);
+ break;
+ case 35: w2[0] = amd_bytealign (wx, w2[0] << 8, 1);
+ w2[1] = amd_bytealign (w2[1] >> 24, wx, 1);
+ break;
+ case 36: w2[1] = wx;
+ break;
+ case 37: w2[1] = amd_bytealign (wx, w2[1] << 24, 3);
+ w2[2] = amd_bytealign (w2[2] >> 8, wx, 3);
+ break;
+ case 38: w2[1] = amd_bytealign (wx, w2[1] << 16, 2);
+ w2[2] = amd_bytealign (w2[2] >> 16, wx, 2);
+ break;
+ case 39: w2[1] = amd_bytealign (wx, w2[1] << 8, 1);
+ w2[2] = amd_bytealign (w2[2] >> 24, wx, 1);
+ break;
+ case 40: w2[2] = wx;
+ break;
+ case 41: w2[2] = amd_bytealign (wx, w2[2] << 24, 3);
+ w2[3] = amd_bytealign (w2[3] >> 8, wx, 3);
+ break;
+ case 42: w2[2] = amd_bytealign (wx, w2[2] << 16, 2);
+ w2[3] = amd_bytealign (w2[3] >> 16, wx, 2);
+ break;
+ case 43: w2[2] = amd_bytealign (wx, w2[2] << 8, 1);
+ w2[3] = amd_bytealign (w2[3] >> 24, wx, 1);
+ break;
+ case 44: w2[3] = wx;
+ break;
+ case 45: w2[3] = amd_bytealign (wx, w2[3] << 24, 3);
+ w3[0] = amd_bytealign (w3[0] >> 8, wx, 3);
+ break;
+ case 46: w2[3] = amd_bytealign (wx, w2[3] << 16, 2);
+ w3[0] = amd_bytealign (w3[0] >> 16, wx, 2);
+ break;
+ case 47: w2[3] = amd_bytealign (wx, w2[3] << 8, 1);
+ w3[0] = amd_bytealign (w3[0] >> 24, wx, 1);
+ break;
+ case 48: w3[0] = wx;
+ break;
+ case 49: w3[0] = amd_bytealign (wx, w3[0] << 24, 3);
+ w3[1] = amd_bytealign (w3[1] >> 8, wx, 3);
+ break;
+ case 50: w3[0] = amd_bytealign (wx, w3[0] << 16, 2);
+ w3[1] = amd_bytealign (w3[1] >> 16, wx, 2);
+ break;
+ case 51: w3[0] = amd_bytealign (wx, w3[0] << 8, 1);
+ w3[1] = amd_bytealign (w3[1] >> 24, wx, 1);
+ break;
+ case 52: w3[1] = wx;
+ break;
+ case 53: w3[1] = amd_bytealign (wx, w3[1] << 24, 3);
+ w3[2] = amd_bytealign (w3[2] >> 8, wx, 3);
+ break;
+ case 54: w3[1] = amd_bytealign (wx, w3[1] << 16, 2);
+ w3[2] = amd_bytealign (w3[2] >> 16, wx, 2);
+ break;
+ case 55: w3[1] = amd_bytealign (wx, w3[1] << 8, 1);
+ w3[2] = amd_bytealign (w3[2] >> 24, wx, 1);
+ break;
+ case 56: w3[2] = wx;
+ break;
+ case 57: w3[2] = amd_bytealign (wx, w3[2] << 24, 3);
+ w3[3] = amd_bytealign (w3[3] >> 8, wx, 3);
+ break;
+ case 58: w3[2] = amd_bytealign (wx, w3[2] << 16, 2);
+ w3[3] = amd_bytealign (w3[3] >> 16, wx, 2);
+ break;
+ case 59: w3[2] = amd_bytealign (wx, w3[2] << 8, 1);
+ w3[3] = amd_bytealign (w3[3] >> 24, wx, 1);
+ break;
+ case 60: w3[3] = wx;
+ break;
+ // offsets 61..63: the tail bytes fall outside the 16-word buffer
+ case 61: w3[3] = amd_bytealign (wx, w3[3] << 24, 3);
+ //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
+ break;
+ case 62: w3[3] = amd_bytealign (wx, w3[3] << 16, 2);
+ //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
+ break;
+ case 63: w3[3] = amd_bytealign (wx, w3[3] << 8, 1);
+ //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
+ break;
+ }
+ #else
+ // Generic path: keep the untouched bytes of each word via a mask and
+ // merge in the (shifted) bytes of wx.
+ switch (salt_len)
+ {
+ case 0: w0[0] = wx;
+ break;
+ case 1: w0[0] = (w0[0] & 0x000000ff) | (wx << 8);
+ w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
+ break;
+ case 2: w0[0] = (w0[0] & 0x0000ffff) | (wx << 16);
+ w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
+ break;
+ case 3: w0[0] = (w0[0] & 0x00ffffff) | (wx << 24);
+ w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
+ break;
+ case 4: w0[1] = wx;
+ break;
+ case 5: w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
+ w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
+ break;
+ case 6: w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
+ w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
+ break;
+ case 7: w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
+ w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
+ break;
+ case 8: w0[2] = wx;
+ break;
+ case 9: w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
+ w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
+ break;
+ case 10: w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
+ w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
+ break;
+ case 11: w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
+ w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
+ break;
+ case 12: w0[3] = wx;
+ break;
+ case 13: w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
+ w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
+ break;
+ case 14: w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
+ w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
+ break;
+ case 15: w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
+ w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
+ break;
+ case 16: w1[0] = wx;
+ break;
+ case 17: w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
+ w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
+ break;
+ case 18: w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
+ w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
+ break;
+ case 19: w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
+ w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
+ break;
+ case 20: w1[1] = wx;
+ break;
+ case 21: w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
+ w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
+ break;
+ case 22: w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
+ w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
+ break;
+ case 23: w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
+ w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
+ break;
+ case 24: w1[2] = wx;
+ break;
+ case 25: w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
+ w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
+ break;
+ case 26: w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
+ w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
+ break;
+ case 27: w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
+ w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
+ break;
+ case 28: w1[3] = wx;
+ break;
+ case 29: w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
+ w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
+ break;
+ case 30: w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
+ w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
+ break;
+ case 31: w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
+ w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
+ break;
+ case 32: w2[0] = wx;
+ break;
+ case 33: w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
+ w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
+ break;
+ case 34: w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
+ w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
+ break;
+ case 35: w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
+ w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
+ break;
+ case 36: w2[1] = wx;
+ break;
+ case 37: w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
+ w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
+ break;
+ case 38: w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
+ w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
+ break;
+ case 39: w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
+ w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
+ break;
+ case 40: w2[2] = wx;
+ break;
+ case 41: w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
+ w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
+ break;
+ case 42: w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
+ w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
+ break;
+ case 43: w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
+ w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
+ break;
+ case 44: w2[3] = wx;
+ break;
+ case 45: w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
+ w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
+ break;
+ case 46: w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
+ w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
+ break;
+ case 47: w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
+ w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
+ break;
+ case 48: w3[0] = wx;
+ break;
+ case 49: w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
+ w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
+ break;
+ case 50: w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
+ w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
+ break;
+ case 51: w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
+ w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
+ break;
+ case 52: w3[1] = wx;
+ break;
+ case 53: w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
+ w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
+ break;
+ case 54: w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
+ w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
+ break;
+ case 55: w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
+ w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
+ break;
+ case 56: w3[2] = wx;
+ break;
+ case 57: w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
+ w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
+ break;
+ case 58: w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
+ w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
+ break;
+ case 59: w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
+ w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
+ break;
+ case 60: w3[3] = wx;
+ break;
+ // offsets 61..63: the tail bytes fall outside the 16-word buffer
+ case 61: w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
+ //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
+ break;
+ case 62: w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
+ //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
+ break;
+ case 63: w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
+ //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
+ break;
+ }
+ #endif
+}
+
+/**
+ * Overwrite the 4 bytes starting at byte offset salt_len (0..63) of the
+ * 64-byte big-endian buffer w0[]..w3[] with the word wx.  Big-endian
+ * counterpart of overwrite_at_le_4x4 (): unaligned offsets split wx
+ * across two adjacent 32-bit words, with shift directions mirrored
+ * relative to the _le variant; for offsets 61..63 the bytes that would
+ * spill into a (non-existent) w4[0] are dropped.
+ */
+inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
+{
+ // would be nice to have optimization based on amd_bytealign as with _le counterpart
+
+ switch (salt_len)
+ {
+ case 0: w0[0] = wx;
+ break;
+ case 1: w0[0] = (w0[0] & 0xff000000) | (wx >> 8);
+ w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
+ break;
+ case 2: w0[0] = (w0[0] & 0xffff0000) | (wx >> 16);
+ w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
+ break;
+ case 3: w0[0] = (w0[0] & 0xffffff00) | (wx >> 24);
+ w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
+ break;
+ case 4: w0[1] = wx;
+ break;
+ case 5: w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
+ w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
+ break;
+ case 6: w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
+ w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
+ break;
+ case 7: w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
+ w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
+ break;
+ case 8: w0[2] = wx;
+ break;
+ case 9: w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
+ w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
+ break;
+ case 10: w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
+ w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
+ break;
+ case 11: w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
+ w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
+ break;
+ case 12: w0[3] = wx;
+ break;
+ case 13: w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
+ w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
+ break;
+ case 14: w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
+ w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
+ break;
+ case 15: w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
+ w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
+ break;
+ case 16: w1[0] = wx;
+ break;
+ case 17: w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
+ w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
+ break;
+ case 18: w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
+ w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
+ break;
+ case 19: w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
+ w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
+ break;
+ case 20: w1[1] = wx;
+ break;
+ case 21: w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
+ w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
+ break;
+ case 22: w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
+ w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
+ break;
+ case 23: w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
+ w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
+ break;
+ case 24: w1[2] = wx;
+ break;
+ case 25: w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
+ w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
+ break;
+ case 26: w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
+ w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
+ break;
+ case 27: w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
+ w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
+ break;
+ case 28: w1[3] = wx;
+ break;
+ case 29: w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
+ w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
+ break;
+ case 30: w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
+ w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
+ break;
+ case 31: w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
+ w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
+ break;
+ case 32: w2[0] = wx;
+ break;
+ case 33: w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
+ w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
+ break;
+ case 34: w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
+ w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
+ break;
+ case 35: w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
+ w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
+ break;
+ case 36: w2[1] = wx;
+ break;
+ case 37: w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
+ w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
+ break;
+ case 38: w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
+ w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
+ break;
+ case 39: w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
+ w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
+ break;
+ case 40: w2[2] = wx;
+ break;
+ case 41: w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
+ w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
+ break;
+ case 42: w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
+ w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
+ break;
+ case 43: w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
+ w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
+ break;
+ case 44: w2[3] = wx;
+ break;
+ case 45: w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
+ w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
+ break;
+ case 46: w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
+ w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
+ break;
+ case 47: w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
+ w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
+ break;
+ case 48: w3[0] = wx;
+ break;
+ case 49: w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
+ w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
+ break;
+ case 50: w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
+ w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
+ break;
+ case 51: w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
+ w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
+ break;
+ case 52: w3[1] = wx;
+ break;
+ case 53: w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
+ w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
+ break;
+ case 54: w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
+ w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
+ break;
+ case 55: w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
+ w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
+ break;
+ case 56: w3[2] = wx;
+ break;
+ case 57: w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
+ w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
+ break;
+ case 58: w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
+ w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
+ break;
+ case 59: w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
+ w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
+ break;
+ case 60: w3[3] = wx;
+ break;
+ // offsets 61..63: the tail bytes fall outside the 16-word buffer
+ case 61: w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
+ //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
+ break;
+ case 62: w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
+ //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
+ break;
+ case 63: w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
+ //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
+ break;
+ }
+}
+
/**
* vector functions as scalar (for outer loop usage)
*/
-static void append_0x80_1x4_S (u32 w0[4], const u32 offset)
+/**
+ * Append a single 0x01 byte at byte position offset (0..31) of the
+ * 32-byte little-endian buffer w0[]..w1[].  At a word-aligned offset
+ * the whole word is replaced (assumed cleared past the data); at an
+ * unaligned offset the 0x01 byte is OR-ed into the existing word.
+ *
+ * Fix: this is the scalar (_S, "for outer loop usage") variant and
+ * must operate on plain u32 words like every other *_S helper in this
+ * section — it was declared with vector u32x parameters, which breaks
+ * scalar callers whenever VECT_SIZE > 1.  The vectorized counterpart
+ * is append_0x01_2x4 () above.
+ */
+inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
+{
+ switch (offset)
+ {
+ case 0:
+ w0[0] = 0x01;
+ break;
+
+ case 1:
+ w0[0] = w0[0] | 0x0100;
+ break;
+
+ case 2:
+ w0[0] = w0[0] | 0x010000;
+ break;
+
+ case 3:
+ w0[0] = w0[0] | 0x01000000;
+ break;
+
+ case 4:
+ w0[1] = 0x01;
+ break;
+
+ case 5:
+ w0[1] = w0[1] | 0x0100;
+ break;
+
+ case 6:
+ w0[1] = w0[1] | 0x010000;
+ break;
+
+ case 7:
+ w0[1] = w0[1] | 0x01000000;
+ break;
+
+ case 8:
+ w0[2] = 0x01;
+ break;
+
+ case 9:
+ w0[2] = w0[2] | 0x0100;
+ break;
+
+ case 10:
+ w0[2] = w0[2] | 0x010000;
+ break;
+
+ case 11:
+ w0[2] = w0[2] | 0x01000000;
+ break;
+
+ case 12:
+ w0[3] = 0x01;
+ break;
+
+ case 13:
+ w0[3] = w0[3] | 0x0100;
+ break;
+
+ case 14:
+ w0[3] = w0[3] | 0x010000;
+ break;
+
+ case 15:
+ w0[3] = w0[3] | 0x01000000;
+ break;
+
+ case 16:
+ w1[0] = 0x01;
+ break;
+
+ case 17:
+ w1[0] = w1[0] | 0x0100;
+ break;
+
+ case 18:
+ w1[0] = w1[0] | 0x010000;
+ break;
+
+ case 19:
+ w1[0] = w1[0] | 0x01000000;
+ break;
+
+ case 20:
+ w1[1] = 0x01;
+ break;
+
+ case 21:
+ w1[1] = w1[1] | 0x0100;
+ break;
+
+ case 22:
+ w1[1] = w1[1] | 0x010000;
+ break;
+
+ case 23:
+ w1[1] = w1[1] | 0x01000000;
+ break;
+
+ case 24:
+ w1[2] = 0x01;
+ break;
+
+ case 25:
+ w1[2] = w1[2] | 0x0100;
+ break;
+
+ case 26:
+ w1[2] = w1[2] | 0x010000;
+ break;
+
+ case 27:
+ w1[2] = w1[2] | 0x01000000;
+ break;
+
+ case 28:
+ w1[3] = 0x01;
+ break;
+
+ case 29:
+ w1[3] = w1[3] | 0x0100;
+ break;
+
+ case 30:
+ w1[3] = w1[3] | 0x010000;
+ break;
+
+ case 31:
+ w1[3] = w1[3] | 0x01000000;
+ break;
+ }
+}
+
+inline void append_0x80_1x4_S (u32 w0[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
+inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
+inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
switch (offset)
{
}
}
-static void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
switch (offset)
{
}
}
-static void truncate_block_S (u32 w[4], const u32 len)
+inline void truncate_block_S (u32 w[4], const u32 len)
{
switch (len)
{
}
}
-static void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
+inline void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
{
#ifdef IS_NV
out2[3] = __byte_perm_S (in[3], 0, 0x7372);
#endif
}
-static void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
+inline void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
{
#ifdef IS_NV
out[0] = __byte_perm_S (in1[0], in1[1], 0x6420);
#endif
}
-static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
#if defined IS_AMD || defined IS_GENERIC
const int offset_mod_4 = offset & 3;
#endif
}
-static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
+inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
#if defined IS_AMD || defined IS_GENERIC
switch (offset / 4)
}
#endif
}
+
+/**
+ * vector functions on scalar types (for inner loop usage)
+ */
+
+// Lane pack/unpack helpers bridging vector (u32x) and scalar (u32) code.
+// PACKVS* extract component 'e' of each u32x element into a plain u32
+// array ("Vector to Scalar"); PACKSV* write a u32 array back into
+// component 'e' of the vectors ("Scalar to Vector").  'e' is an OpenCL
+// vector component suffix (0..9, a..f) pasted onto the .s swizzle via
+// token concatenation, so it must be a literal token, not a variable.
+
+// One 2-word array.
+#define PACKVS2(sn,vn,e) \
+  sn[0] = vn[0].s##e; \
+  sn[1] = vn[1].s##e;
+
+#define PACKSV2(sn,vn,e) \
+  vn[0].s##e = sn[0]; \
+  vn[1].s##e = sn[1];
+
+// Two 4-word arrays (a 32-byte block w0/w1); expands the 4-word variants
+// defined below — fine, since macros resolve at expansion time.
+#define PACKVS24(s0,s1,v0,v1,e) \
+  PACKVS4 (s0, v0, e); \
+  PACKVS4 (s1, v1, e);
+
+#define PACKSV24(s0,s1,v0,v1,e) \
+  PACKSV4 (s0, v0, e); \
+  PACKSV4 (s1, v1, e);
+
+// One 4-word array.
+#define PACKVS4(sn,vn,e) \
+  sn[0] = vn[0].s##e; \
+  sn[1] = vn[1].s##e; \
+  sn[2] = vn[2].s##e; \
+  sn[3] = vn[3].s##e;
+
+#define PACKSV4(sn,vn,e) \
+  vn[0].s##e = sn[0]; \
+  vn[1].s##e = sn[1]; \
+  vn[2].s##e = sn[2]; \
+  vn[3].s##e = sn[3];
+
+// Four 4-word arrays (a full 64-byte block w0..w3).
+#define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
+  PACKVS4 (s0, v0, e); \
+  PACKVS4 (s1, v1, e); \
+  PACKVS4 (s2, v2, e); \
+  PACKVS4 (s3, v3, e);
+
+#define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
+  PACKSV4 (s0, v0, e); \
+  PACKSV4 (s1, v1, e); \
+  PACKSV4 (s2, v2, e); \
+  PACKSV4 (s3, v3, e);
+
+// Vector-width-generic byte shift of the 64-byte block w0..w3 (little
+// endian variant) where every vector lane may carry a different offset.
+// For VECT_SIZE == 1, u32x is plain u32 and the scalar _S routine applies
+// directly.  Otherwise each lane e is unpacked into scalar temporaries,
+// shifted by the scalar routine with that lane's offset.s<e>, and packed
+// back.  The per-lane unrolling is unavoidable: the .s<e> component
+// selector must be a compile-time token (see PACKVS44/PACKSV44).
+inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
+{
+  #if VECT_SIZE == 1
+
+  switch_buffer_by_offset_le_S (w0, w1, w2, w3, offset);
+
+  #else
+
+  // Scalar scratch buffers holding one lane of the 16-word block.
+  u32 t0[4];
+  u32 t1[4];
+  u32 t2[4];
+  u32 t3[4];
+
+  #endif
+
+  #if VECT_SIZE == 2
+
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
+
+  #elif VECT_SIZE == 4
+
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
+
+  #elif VECT_SIZE == 8
+
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
+
+  #elif VECT_SIZE == 16
+
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
+
+  #endif
+}
+
+// Per-lane wrapper around the scalar append_0x01_2x4_S: places the 0x01
+// byte at byte position 'offset' inside the 32-byte block w0/w1, where
+// the offset may differ per vector lane.  Same unpack / scalar-call /
+// repack pattern as switch_buffer_by_offset_le_VV, but over two 4-word
+// arrays (PACKVS24/PACKSV24).
+inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
+{
+  #if VECT_SIZE == 1
+
+  append_0x01_2x4_S (w0, w1, offset);
+
+  #else
+
+  // Scalar scratch buffers holding one lane of the 8-word block.
+  u32 t0[4];
+  u32 t1[4];
+
+  #endif
+
+  #if VECT_SIZE == 2
+
+  PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
+  PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
+
+  #elif VECT_SIZE == 4
+
+  PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
+  PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
+  PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
+  PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
+
+  #elif VECT_SIZE == 8
+
+  PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
+  PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
+  PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
+  PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
+  PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
+  PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
+  PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
+  PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
+
+  #elif VECT_SIZE == 16
+
+  PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
+  PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
+  PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
+  PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
+  PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
+  PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
+  PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
+  PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
+  PACKVS24 (t0, t1, w0, w1, 8); append_0x01_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
+  PACKVS24 (t0, t1, w0, w1, 9); append_0x01_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
+  PACKVS24 (t0, t1, w0, w1, a); append_0x01_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
+  PACKVS24 (t0, t1, w0, w1, b); append_0x01_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
+  PACKVS24 (t0, t1, w0, w1, c); append_0x01_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
+  PACKVS24 (t0, t1, w0, w1, d); append_0x01_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
+  PACKVS24 (t0, t1, w0, w1, e); append_0x01_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
+  PACKVS24 (t0, t1, w0, w1, f); append_0x01_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
+
+  #endif
+}
+
+// Per-lane wrapper around the scalar append_0x80_2x4_S: places the 0x80
+// byte at byte position 'offset' inside the 32-byte block w0/w1, with a
+// possibly different offset per vector lane.  Identical structure to
+// append_0x01_2x4_VV; only the scalar callee differs.
+inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
+{
+  #if VECT_SIZE == 1
+
+  append_0x80_2x4_S (w0, w1, offset);
+
+  #else
+
+  // Scalar scratch buffers holding one lane of the 8-word block.
+  u32 t0[4];
+  u32 t1[4];
+
+  #endif
+
+  #if VECT_SIZE == 2
+
+  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
+  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
+
+  #elif VECT_SIZE == 4
+
+  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
+  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
+  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
+  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
+
+  #elif VECT_SIZE == 8
+
+  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
+  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
+  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
+  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
+  PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
+  PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
+  PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
+  PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
+
+  #elif VECT_SIZE == 16
+
+  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
+  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
+  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
+  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
+  PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
+  PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
+  PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
+  PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
+  PACKVS24 (t0, t1, w0, w1, 8); append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
+  PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
+  PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
+  PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
+  PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
+  PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
+  PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
+  PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
+
+  #endif
+}
+
+// Per-lane wrapper around the scalar append_0x80_4x4_S: places the 0x80
+// byte at byte position 'offset' inside the full 64-byte block w0..w3,
+// with a possibly different offset per vector lane.  Same pattern as the
+// 2x4 variants, using the four-array PACKVS44/PACKSV44 helpers.
+inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
+{
+  #if VECT_SIZE == 1
+
+  append_0x80_4x4_S (w0, w1, w2, w3, offset);
+
+  #else
+
+  // Scalar scratch buffers holding one lane of the 16-word block.
+  u32 t0[4];
+  u32 t1[4];
+  u32 t2[4];
+  u32 t3[4];
+
+  #endif
+
+  #if VECT_SIZE == 2
+
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
+
+  #elif VECT_SIZE == 4
+
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
+
+  #elif VECT_SIZE == 8
+
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
+
+  #elif VECT_SIZE == 16
+
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x80_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x80_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x80_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x80_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x80_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x80_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x80_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
+  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x80_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
+
+  #endif
+}