case 31: w1[3] = amd_bytealign (wx, w1[3] << 8, 1);
w2[0] = amd_bytealign (w2[0] >> 24, wx, 1);
break;
- case 32: w0[0] = wx;
+ case 32: w2[0] = wx;
break;
case 33: w2[0] = amd_bytealign (wx, w2[0] << 24, 3);
w2[1] = amd_bytealign (w2[1] >> 8, wx, 3);
* vector functions as scalar (for outer loop usage)
*/
+static void append_0x01_2x4_S (u32x w0[4], u32x w1[4], const u32 offset)
+{
+ switch (offset)
+ {
+ case 0:
+ w0[0] = 0x01;
+ break;
+
+ case 1:
+ w0[0] = w0[0] | 0x0100;
+ break;
+
+ case 2:
+ w0[0] = w0[0] | 0x010000;
+ break;
+
+ case 3:
+ w0[0] = w0[0] | 0x01000000;
+ break;
+
+ case 4:
+ w0[1] = 0x01;
+ break;
+
+ case 5:
+ w0[1] = w0[1] | 0x0100;
+ break;
+
+ case 6:
+ w0[1] = w0[1] | 0x010000;
+ break;
+
+ case 7:
+ w0[1] = w0[1] | 0x01000000;
+ break;
+
+ case 8:
+ w0[2] = 0x01;
+ break;
+
+ case 9:
+ w0[2] = w0[2] | 0x0100;
+ break;
+
+ case 10:
+ w0[2] = w0[2] | 0x010000;
+ break;
+
+ case 11:
+ w0[2] = w0[2] | 0x01000000;
+ break;
+
+ case 12:
+ w0[3] = 0x01;
+ break;
+
+ case 13:
+ w0[3] = w0[3] | 0x0100;
+ break;
+
+ case 14:
+ w0[3] = w0[3] | 0x010000;
+ break;
+
+ case 15:
+ w0[3] = w0[3] | 0x01000000;
+ break;
+
+ case 16:
+ w1[0] = 0x01;
+ break;
+
+ case 17:
+ w1[0] = w1[0] | 0x0100;
+ break;
+
+ case 18:
+ w1[0] = w1[0] | 0x010000;
+ break;
+
+ case 19:
+ w1[0] = w1[0] | 0x01000000;
+ break;
+
+ case 20:
+ w1[1] = 0x01;
+ break;
+
+ case 21:
+ w1[1] = w1[1] | 0x0100;
+ break;
+
+ case 22:
+ w1[1] = w1[1] | 0x010000;
+ break;
+
+ case 23:
+ w1[1] = w1[1] | 0x01000000;
+ break;
+
+ case 24:
+ w1[2] = 0x01;
+ break;
+
+ case 25:
+ w1[2] = w1[2] | 0x0100;
+ break;
+
+ case 26:
+ w1[2] = w1[2] | 0x010000;
+ break;
+
+ case 27:
+ w1[2] = w1[2] | 0x01000000;
+ break;
+
+ case 28:
+ w1[3] = 0x01;
+ break;
+
+ case 29:
+ w1[3] = w1[3] | 0x0100;
+ break;
+
+ case 30:
+ w1[3] = w1[3] | 0x010000;
+ break;
+
+ case 31:
+ w1[3] = w1[3] | 0x01000000;
+ break;
+ }
+}
+
static void append_0x80_1x4_S (u32 w0[4], const u32 offset)
{
switch (offset)
* vector functions on scalar types (for inner loop usage)
*/
// PACKVS2: copy ("pack") lane `e` of the 2-word vector array `vn` into the
// 2-word scalar array `sn`.
#define PACKVS2(sn,vn,e) \
  sn[0] = vn[0].s##e; \
  sn[1] = vn[1].s##e;

// PACKSV2: inverse of PACKVS2 — write the 2-word scalar array `sn` back
// into lane `e` of the vector array `vn`.
#define PACKSV2(sn,vn,e) \
  vn[0].s##e = sn[0]; \
  vn[1].s##e = sn[1];

// PACKVS24: pack lane `e` of two 4-word vector arrays (v0, v1) into two
// 4-word scalar arrays (s0, s1); expands to PACKVS4 (defined below).
#define PACKVS24(s0,s1,v0,v1,e) \
  PACKVS4 (s0, v0, e); \
  PACKVS4 (s1, v1, e);

// PACKSV24: inverse of PACKVS24 — write the two scalar arrays back into
// lane `e` of the two vector arrays.
#define PACKSV24(s0,s1,v0,v1,e) \
  PACKSV4 (s0, v0, e); \
  PACKSV4 (s1, v1, e);
+
#define PACKVS4(sn,vn,e) \
sn[0] = vn[0].s##e; \
sn[1] = vn[1].s##e; \
#endif
}
/**
 * Append a 0x01 byte into the 32-byte block w0..w1 where `offset` is itself
 * a vector: each SIMD lane may append at a different byte position.
 *
 * For VECT_SIZE > 1 every lane is handled separately: PACKVS24 extracts
 * lane e of w0/w1 into the scalar temporaries t0/t1, the scalar append runs
 * at that lane's offset component (offset.s##e), and PACKSV24 writes the
 * lane back. For VECT_SIZE == 1 the u32x types are plain u32, so the scalar
 * routine is called directly.
 */
static void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x01_2x4_S (w0, w1, offset);

  #else

  // per-lane scalar scratch copies of w0 and w1
  u32 t0[4];
  u32 t1[4];

  #endif

  #if VECT_SIZE == 2

  PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);

  #elif VECT_SIZE == 4

  PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);

  #elif VECT_SIZE == 8

  PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
  PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
  PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
  PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
  PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);

  #elif VECT_SIZE == 16

  PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
  PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
  PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
  PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
  PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
  PACKVS24 (t0, t1, w0, w1, 8); append_0x01_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
  PACKVS24 (t0, t1, w0, w1, 9); append_0x01_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
  PACKVS24 (t0, t1, w0, w1, a); append_0x01_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
  PACKVS24 (t0, t1, w0, w1, b); append_0x01_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
  PACKVS24 (t0, t1, w0, w1, c); append_0x01_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
  PACKVS24 (t0, t1, w0, w1, d); append_0x01_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
  PACKVS24 (t0, t1, w0, w1, e); append_0x01_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
  PACKVS24 (t0, t1, w0, w1, f); append_0x01_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);

  #endif
}
+
/**
 * Append a 0x80 (padding) byte into the 32-byte block w0..w1 where `offset`
 * is itself a vector: each SIMD lane may append at a different byte position.
 *
 * Same lane-dispatch pattern as append_0x01_2x4_VV: for VECT_SIZE > 1 each
 * lane e is unpacked into the scalar temporaries t0/t1 with PACKVS24, the
 * scalar append_0x80_2x4_S runs at that lane's offset component, and
 * PACKSV24 writes the lane back. For VECT_SIZE == 1 the u32x types are
 * plain u32, so the scalar routine is called directly.
 */
static void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x80_2x4_S (w0, w1, offset);

  #else

  // per-lane scalar scratch copies of w0 and w1
  u32 t0[4];
  u32 t1[4];

  #endif

  #if VECT_SIZE == 2

  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);

  #elif VECT_SIZE == 4

  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);

  #elif VECT_SIZE == 8

  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
  PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
  PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
  PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
  PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);

  #elif VECT_SIZE == 16

  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
  PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
  PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
  PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
  PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
  PACKVS24 (t0, t1, w0, w1, 8); append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
  PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
  PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
  PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
  PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
  PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
  PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
  PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);

  #endif
}
+
static void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
{
#if VECT_SIZE == 1