#ifdef IS_NV
switch (num)
{
- case 0: out1[3] = in1[3];
- out1[2] = in1[2];
- out1[1] = in1[1];
- out1[0] = in1[0];
- out0[3] = in0[3];
- out0[2] = in0[2];
+ case 0: out0[0] = in0[0];
out0[1] = in0[1];
- out0[0] = in0[0];
- break;
- case 1: out1[3] = __byte_perm (in1[2], in1[3], 0x6543);
- out1[2] = __byte_perm (in1[1], in1[2], 0x6543);
- out1[1] = __byte_perm (in1[0], in1[1], 0x6543);
- out1[0] = __byte_perm (in0[3], in1[0], 0x6543);
- out0[3] = __byte_perm (in0[2], in0[3], 0x6543);
- out0[2] = __byte_perm (in0[1], in0[2], 0x6543);
- out0[1] = __byte_perm (in0[0], in0[1], 0x6543);
- out0[0] = __byte_perm ( 0, in0[0], 0x6543);
- break;
- case 2: out1[3] = __byte_perm (in1[2], in1[3], 0x5432);
- out1[2] = __byte_perm (in1[1], in1[2], 0x5432);
- out1[1] = __byte_perm (in1[0], in1[1], 0x5432);
- out1[0] = __byte_perm (in0[3], in1[0], 0x5432);
- out0[3] = __byte_perm (in0[2], in0[3], 0x5432);
- out0[2] = __byte_perm (in0[1], in0[2], 0x5432);
- out0[1] = __byte_perm (in0[0], in0[1], 0x5432);
- out0[0] = __byte_perm ( 0, in0[0], 0x5432);
- break;
- case 3: out1[3] = __byte_perm (in1[2], in1[3], 0x4321);
- out1[2] = __byte_perm (in1[1], in1[2], 0x4321);
- out1[1] = __byte_perm (in1[0], in1[1], 0x4321);
- out1[0] = __byte_perm (in0[3], in1[0], 0x4321);
- out0[3] = __byte_perm (in0[2], in0[3], 0x4321);
- out0[2] = __byte_perm (in0[1], in0[2], 0x4321);
- out0[1] = __byte_perm (in0[0], in0[1], 0x4321);
- out0[0] = __byte_perm ( 0, in0[0], 0x4321);
- break;
- case 4: out1[3] = in1[2];
- out1[2] = in1[1];
- out1[1] = in1[0];
- out1[0] = in0[3];
- out0[3] = in0[2];
- out0[2] = in0[1];
- out0[1] = in0[0];
- out0[0] = 0;
+ out0[2] = in0[2];
+ out0[3] = in0[3];
+ out1[0] = in1[0];
+ out1[1] = in1[1];
+ out1[2] = in1[2];
+ out1[3] = in1[3];
break;
- case 5: out1[3] = __byte_perm (in1[1], in1[2], 0x6543);
- out1[2] = __byte_perm (in1[0], in1[1], 0x6543);
- out1[1] = __byte_perm (in0[3], in1[0], 0x6543);
- out1[0] = __byte_perm (in0[2], in0[3], 0x6543);
- out0[3] = __byte_perm (in0[1], in0[2], 0x6543);
- out0[2] = __byte_perm (in0[0], in0[1], 0x6543);
- out0[1] = __byte_perm ( 0, in0[0], 0x6543);
- out0[0] = 0;
+ case 1: out0[0] = __byte_perm (in0[0], in0[1], 0x4321);
+ out0[1] = __byte_perm (in0[1], in0[2], 0x4321);
+ out0[2] = __byte_perm (in0[2], in0[3], 0x4321);
+ out0[3] = __byte_perm (in0[3], in1[0], 0x4321);
+ out1[0] = __byte_perm (in1[0], in1[1], 0x4321);
+ out1[1] = __byte_perm (in1[1], in1[2], 0x4321);
+ out1[2] = __byte_perm (in1[2], in1[3], 0x4321);
+ out1[3] = __byte_perm (in1[3], 0, 0x4321);
break;
- case 6: out1[3] = __byte_perm (in1[1], in1[2], 0x5432);
- out1[2] = __byte_perm (in1[0], in1[1], 0x5432);
- out1[1] = __byte_perm (in0[3], in1[0], 0x5432);
- out1[0] = __byte_perm (in0[2], in0[3], 0x5432);
- out0[3] = __byte_perm (in0[1], in0[2], 0x5432);
- out0[2] = __byte_perm (in0[0], in0[1], 0x5432);
- out0[1] = __byte_perm ( 0, in0[0], 0x5432);
- out0[0] = 0;
+ case 2: out0[0] = __byte_perm (in0[0], in0[1], 0x5432);
+ out0[1] = __byte_perm (in0[1], in0[2], 0x5432);
+ out0[2] = __byte_perm (in0[2], in0[3], 0x5432);
+ out0[3] = __byte_perm (in0[3], in1[0], 0x5432);
+ out1[0] = __byte_perm (in1[0], in1[1], 0x5432);
+ out1[1] = __byte_perm (in1[1], in1[2], 0x5432);
+ out1[2] = __byte_perm (in1[2], in1[3], 0x5432);
+ out1[3] = __byte_perm (in1[3], 0, 0x5432);
break;
- case 7: out1[3] = __byte_perm (in1[1], in1[2], 0x4321);
- out1[2] = __byte_perm (in1[0], in1[1], 0x4321);
- out1[1] = __byte_perm (in0[3], in1[0], 0x4321);
- out1[0] = __byte_perm (in0[2], in0[3], 0x4321);
- out0[3] = __byte_perm (in0[1], in0[2], 0x4321);
- out0[2] = __byte_perm (in0[0], in0[1], 0x4321);
- out0[1] = __byte_perm ( 0, in0[0], 0x4321);
- out0[0] = 0;
+ case 3: out0[0] = __byte_perm (in0[0], in0[1], 0x6543);
+ out0[1] = __byte_perm (in0[1], in0[2], 0x6543);
+ out0[2] = __byte_perm (in0[2], in0[3], 0x6543);
+ out0[3] = __byte_perm (in0[3], in1[0], 0x6543);
+ out1[0] = __byte_perm (in1[0], in1[1], 0x6543);
+ out1[1] = __byte_perm (in1[1], in1[2], 0x6543);
+ out1[2] = __byte_perm (in1[2], in1[3], 0x6543);
+ out1[3] = __byte_perm (in1[3], 0, 0x6543);
break;
- case 8: out1[3] = in1[1];
- out1[2] = in1[0];
- out1[1] = in0[3];
- out1[0] = in0[2];
- out0[3] = in0[1];
- out0[2] = in0[0];
- out0[1] = 0;
- out0[0] = 0;
+ case 4: out0[0] = in0[1];
+ out0[1] = in0[2];
+ out0[2] = in0[3];
+ out0[3] = in1[0];
+ out1[0] = in1[1];
+ out1[1] = in1[2];
+ out1[2] = in1[3];
+ out1[3] = 0;
break;
- case 9: out1[3] = __byte_perm (in1[0], in1[1], 0x6543);
- out1[2] = __byte_perm (in0[3], in1[0], 0x6543);
- out1[1] = __byte_perm (in0[2], in0[3], 0x6543);
- out1[0] = __byte_perm (in0[1], in0[2], 0x6543);
- out0[3] = __byte_perm (in0[0], in0[1], 0x6543);
- out0[2] = __byte_perm ( 0, in0[0], 0x6543);
- out0[1] = 0;
- out0[0] = 0;
+ case 5: out0[0] = __byte_perm (in0[1], in0[2], 0x4321);
+ out0[1] = __byte_perm (in0[2], in0[3], 0x4321);
+ out0[2] = __byte_perm (in0[3], in1[0], 0x4321);
+ out0[3] = __byte_perm (in1[0], in1[1], 0x4321);
+ out1[0] = __byte_perm (in1[1], in1[2], 0x4321);
+ out1[1] = __byte_perm (in1[2], in1[3], 0x4321);
+ out1[2] = __byte_perm (in1[3], 0, 0x4321);
+ out1[3] = 0;
break;
- case 10: out1[3] = __byte_perm (in1[0], in1[1], 0x5432);
- out1[2] = __byte_perm (in0[3], in1[0], 0x5432);
- out1[1] = __byte_perm (in0[2], in0[3], 0x5432);
- out1[0] = __byte_perm (in0[1], in0[2], 0x5432);
- out0[3] = __byte_perm (in0[0], in0[1], 0x5432);
- out0[2] = __byte_perm ( 0, in0[0], 0x5432);
- out0[1] = 0;
- out0[0] = 0;
+ case 6: out0[0] = __byte_perm (in0[1], in0[2], 0x5432);
+ out0[1] = __byte_perm (in0[2], in0[3], 0x5432);
+ out0[2] = __byte_perm (in0[3], in1[0], 0x5432);
+ out0[3] = __byte_perm (in1[0], in1[1], 0x5432);
+ out1[0] = __byte_perm (in1[1], in1[2], 0x5432);
+ out1[1] = __byte_perm (in1[2], in1[3], 0x5432);
+ out1[2] = __byte_perm (in1[3], 0, 0x5432);
+ out1[3] = 0;
break;
- case 11: out1[3] = __byte_perm (in1[0], in1[1], 0x4321);
- out1[2] = __byte_perm (in0[3], in1[0], 0x4321);
- out1[1] = __byte_perm (in0[2], in0[3], 0x4321);
- out1[0] = __byte_perm (in0[1], in0[2], 0x4321);
- out0[3] = __byte_perm (in0[0], in0[1], 0x4321);
- out0[2] = __byte_perm ( 0, in0[0], 0x4321);
- out0[1] = 0;
- out0[0] = 0;
+ case 7: out0[0] = __byte_perm (in0[1], in0[2], 0x6543);
+ out0[1] = __byte_perm (in0[2], in0[3], 0x6543);
+ out0[2] = __byte_perm (in0[3], in1[0], 0x6543);
+ out0[3] = __byte_perm (in1[0], in1[1], 0x6543);
+ out1[0] = __byte_perm (in1[1], in1[2], 0x6543);
+ out1[1] = __byte_perm (in1[2], in1[3], 0x6543);
+ out1[2] = __byte_perm (in1[3], 0, 0x6543);
+ out1[3] = 0;
break;
- case 12: out1[3] = in1[0];
- out1[2] = in0[3];
- out1[1] = in0[2];
- out1[0] = in0[1];
- out0[3] = in0[0];
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ case 8: out0[0] = in0[2];
+ out0[1] = in0[3];
+ out0[2] = in1[0];
+ out0[3] = in1[1];
+ out1[0] = in1[2];
+ out1[1] = in1[3];
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 13: out1[3] = __byte_perm (in0[3], in1[0], 0x6543);
- out1[2] = __byte_perm (in0[2], in0[3], 0x6543);
- out1[1] = __byte_perm (in0[1], in0[2], 0x6543);
- out1[0] = __byte_perm (in0[0], in0[1], 0x6543);
- out0[3] = __byte_perm ( 0, in0[0], 0x6543);
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ case 9: out0[0] = __byte_perm (in0[2], in0[3], 0x4321);
+ out0[1] = __byte_perm (in0[3], in1[0], 0x4321);
+ out0[2] = __byte_perm (in1[0], in1[1], 0x4321);
+ out0[3] = __byte_perm (in1[1], in1[2], 0x4321);
+ out1[0] = __byte_perm (in1[2], in1[3], 0x4321);
+ out1[1] = __byte_perm (in1[3], 0, 0x4321);
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 14: out1[3] = __byte_perm (in0[3], in1[0], 0x5432);
- out1[2] = __byte_perm (in0[2], in0[3], 0x5432);
- out1[1] = __byte_perm (in0[1], in0[2], 0x5432);
- out1[0] = __byte_perm (in0[0], in0[1], 0x5432);
- out0[3] = __byte_perm ( 0, in0[0], 0x5432);
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ case 10: out0[0] = __byte_perm (in0[2], in0[3], 0x5432);
+ out0[1] = __byte_perm (in0[3], in1[0], 0x5432);
+ out0[2] = __byte_perm (in1[0], in1[1], 0x5432);
+ out0[3] = __byte_perm (in1[1], in1[2], 0x5432);
+ out1[0] = __byte_perm (in1[2], in1[3], 0x5432);
+ out1[1] = __byte_perm (in1[3], 0, 0x5432);
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 15: out1[3] = __byte_perm (in0[3], in1[0], 0x4321);
- out1[2] = __byte_perm (in0[2], in0[3], 0x4321);
- out1[1] = __byte_perm (in0[1], in0[2], 0x4321);
- out1[0] = __byte_perm (in0[0], in0[1], 0x4321);
- out0[3] = __byte_perm ( 0, in0[0], 0x4321);
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ case 11: out0[0] = __byte_perm (in0[2], in0[3], 0x6543);
+ out0[1] = __byte_perm (in0[3], in1[0], 0x6543);
+ out0[2] = __byte_perm (in1[0], in1[1], 0x6543);
+ out0[3] = __byte_perm (in1[1], in1[2], 0x6543);
+ out1[0] = __byte_perm (in1[2], in1[3], 0x6543);
+ out1[1] = __byte_perm (in1[3], 0, 0x6543);
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 16: out1[3] = in0[3];
- out1[2] = in0[2];
- out1[1] = in0[1];
- out1[0] = in0[0];
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ case 12: out0[0] = in0[3];
+ out0[1] = in1[0];
+ out0[2] = in1[1];
+ out0[3] = in1[2];
+ out1[0] = in1[3];
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 17: out1[3] = __byte_perm (in0[2], in0[3], 0x6543);
- out1[2] = __byte_perm (in0[1], in0[2], 0x6543);
- out1[1] = __byte_perm (in0[0], in0[1], 0x6543);
- out1[0] = __byte_perm ( 0, in0[0], 0x6543);
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ case 13:
+ out0[0] = __byte_perm (in0[3], in1[0], 0x4321);
+ out0[1] = __byte_perm (in1[0], in1[1], 0x4321);
+ out0[2] = __byte_perm (in1[1], in1[2], 0x4321);
+ out0[3] = __byte_perm (in1[2], in1[3], 0x4321);
+ out1[0] = __byte_perm (in1[3], 0, 0x4321);
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 18: out1[3] = __byte_perm (in0[2], in0[3], 0x5432);
- out1[2] = __byte_perm (in0[1], in0[2], 0x5432);
- out1[1] = __byte_perm (in0[0], in0[1], 0x5432);
- out1[0] = __byte_perm ( 0, in0[0], 0x5432);
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ case 14: out0[0] = __byte_perm (in0[3], in1[0], 0x5432);
+ out0[1] = __byte_perm (in1[0], in1[1], 0x5432);
+ out0[2] = __byte_perm (in1[1], in1[2], 0x5432);
+ out0[3] = __byte_perm (in1[2], in1[3], 0x5432);
+ out1[0] = __byte_perm (in1[3], 0, 0x5432);
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 19: out1[3] = __byte_perm (in0[2], in0[3], 0x4321);
- out1[2] = __byte_perm (in0[1], in0[2], 0x4321);
- out1[1] = __byte_perm (in0[0], in0[1], 0x4321);
- out1[0] = __byte_perm ( 0, in0[0], 0x4321);
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ case 15: out0[0] = __byte_perm (in0[3], in1[0], 0x6543);
+ out0[1] = __byte_perm (in1[0], in1[1], 0x6543);
+ out0[2] = __byte_perm (in1[1], in1[2], 0x6543);
+ out0[3] = __byte_perm (in1[2], in1[3], 0x6543);
+ out1[0] = __byte_perm (in1[3], 0, 0x6543);
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 20: out1[3] = in0[2];
- out1[2] = in0[1];
- out1[1] = in0[0];
+ case 16: out0[0] = in1[0];
+ out0[1] = in1[1];
+ out0[2] = in1[2];
+ out0[3] = in1[3];
out1[0] = 0;
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 21: out1[3] = __byte_perm (in0[1], in0[2], 0x6543);
- out1[2] = __byte_perm (in0[0], in0[1], 0x6543);
- out1[1] = __byte_perm ( 0, in0[0], 0x6543);
+ case 17: out0[0] = __byte_perm (in1[0], in1[1], 0x4321);
+ out0[1] = __byte_perm (in1[1], in1[2], 0x4321);
+ out0[2] = __byte_perm (in1[2], in1[3], 0x4321);
+ out0[3] = __byte_perm (in1[3], 0, 0x4321);
out1[0] = 0;
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 22: out1[3] = __byte_perm (in0[1], in0[2], 0x5432);
- out1[2] = __byte_perm (in0[0], in0[1], 0x5432);
- out1[1] = __byte_perm ( 0, in0[0], 0x5432);
+ case 18: out0[0] = __byte_perm (in1[0], in1[1], 0x5432);
+ out0[1] = __byte_perm (in1[1], in1[2], 0x5432);
+ out0[2] = __byte_perm (in1[2], in1[3], 0x5432);
+ out0[3] = __byte_perm (in1[3], 0, 0x5432);
out1[0] = 0;
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 23: out1[3] = __byte_perm (in0[1], in0[2], 0x4321);
- out1[2] = __byte_perm (in0[0], in0[1], 0x4321);
- out1[1] = __byte_perm ( 0, in0[0], 0x4321);
+ case 19: out0[0] = __byte_perm (in1[0], in1[1], 0x6543);
+ out0[1] = __byte_perm (in1[1], in1[2], 0x6543);
+ out0[2] = __byte_perm (in1[2], in1[3], 0x6543);
+ out0[3] = __byte_perm (in1[3], 0, 0x6543);
out1[0] = 0;
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
- break;
- case 24: out1[3] = in0[1];
- out1[2] = in0[0];
out1[1] = 0;
- out1[0] = 0;
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 25: out1[3] = __byte_perm (in0[0], in0[1], 0x6543);
- out1[2] = __byte_perm ( 0, in0[0], 0x6543);
- out1[1] = 0;
- out1[0] = 0;
+ case 20: out0[0] = in1[1];
+ out0[1] = in1[2];
+ out0[2] = in1[3];
out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
- break;
- case 26: out1[3] = __byte_perm (in0[0], in0[1], 0x5432);
- out1[2] = __byte_perm ( 0, in0[0], 0x5432);
- out1[1] = 0;
out1[0] = 0;
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
- break;
- case 27: out1[3] = __byte_perm (in0[0], in0[1], 0x4321);
- out1[2] = __byte_perm ( 0, in0[0], 0x4321);
out1[1] = 0;
- out1[0] = 0;
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
- break;
- case 28: out1[3] = in0[0];
out1[2] = 0;
- out1[1] = 0;
- out1[0] = 0;
- out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ out1[3] = 0;
break;
- case 29: out1[3] = __byte_perm ( 0, in0[0], 0x6543);
- out1[2] = 0;
- out1[1] = 0;
- out1[0] = 0;
+ case 21: out0[0] = __byte_perm (in1[1], in1[2], 0x4321);
+ out0[1] = __byte_perm (in1[2], in1[3], 0x4321);
+ out0[2] = __byte_perm (in1[3], 0, 0x4321);
out0[3] = 0;
- out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
- break;
- case 30: out1[3] = __byte_perm ( 0, in0[0], 0x5432);
+ out1[0] = 0;
+ out1[1] = 0;
out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 22: out0[0] = __byte_perm (in1[1], in1[2], 0x5432);
+ out0[1] = __byte_perm (in1[2], in1[3], 0x5432);
+ out0[2] = __byte_perm (in1[3], 0, 0x5432);
+ out0[3] = 0;
+ out1[0] = 0;
out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 23: out0[0] = __byte_perm (in1[1], in1[2], 0x6543);
+ out0[1] = __byte_perm (in1[2], in1[3], 0x6543);
+ out0[2] = __byte_perm (in1[3], 0, 0x6543);
+ out0[3] = 0;
out1[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 24: out0[0] = in1[2];
+ out0[1] = in1[3];
+ out0[2] = 0;
out0[3] = 0;
+ out1[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 25: out0[0] = __byte_perm (in1[2], in1[3], 0x4321);
+ out0[1] = __byte_perm (in1[3], 0, 0x4321);
out0[2] = 0;
- out0[1] = 0;
- out0[0] = 0;
+ out0[3] = 0;
+ out1[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
- case 31: out1[3] = __byte_perm ( 0, in0[0], 0x4321);
+ case 26: out0[0] = __byte_perm (in1[2], in1[3], 0x5432);
+ out0[1] = __byte_perm (in1[3], 0, 0x5432);
+ out0[2] = 0;
+ out0[3] = 0;
+ out1[0] = 0;
+ out1[1] = 0;
out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 27: out0[0] = __byte_perm (in1[2], in1[3], 0x6543);
+ out0[1] = __byte_perm (in1[3], 0, 0x6543);
+ out0[2] = 0;
+ out0[3] = 0;
+ out1[0] = 0;
out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 28: out0[0] = in1[3];
+ out0[1] = 0;
+ out0[2] = 0;
+ out0[3] = 0;
out1[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 29: out0[0] = __byte_perm (in1[3], 0, 0x4321);
+ out0[1] = 0;
+ out0[2] = 0;
out0[3] = 0;
+ out1[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 30: out0[0] = __byte_perm (in1[3], 0, 0x5432);
+ out0[1] = 0;
out0[2] = 0;
+ out0[3] = 0;
+ out1[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
+ break;
+ case 31: out0[0] = __byte_perm (in1[3], 0, 0x6543);
out0[1] = 0;
- out0[0] = 0;
+ out0[2] = 0;
+ out0[3] = 0;
+ out1[0] = 0;
+ out1[1] = 0;
+ out1[2] = 0;
+ out1[3] = 0;
break;
}
#endif
#ifdef IS_NV
switch (num)
{
- case 0: out0[0] = in0[0];
- out0[1] = in0[1];
- out0[2] = in0[2];
- out0[3] = in0[3];
- out1[0] = in1[0];
- out1[1] = in1[1];
+ case 0: out1[3] = in1[3];
out1[2] = in1[2];
- out1[3] = in1[3];
- break;
- case 1: out0[0] = __byte_perm (in0[0], in0[1], 0x4321);
- out0[1] = __byte_perm (in0[1], in0[2], 0x4321);
- out0[2] = __byte_perm (in0[2], in0[3], 0x4321);
- out0[3] = __byte_perm (in0[3], in1[0], 0x4321);
- out1[0] = __byte_perm (in1[0], in1[1], 0x4321);
- out1[1] = __byte_perm (in1[1], in1[2], 0x4321);
- out1[2] = __byte_perm (in1[2], in1[3], 0x4321);
- out1[3] = __byte_perm (in1[3], 0, 0x4321);
- break;
- case 2: out0[0] = __byte_perm (in0[0], in0[1], 0x5432);
- out0[1] = __byte_perm (in0[1], in0[2], 0x5432);
- out0[2] = __byte_perm (in0[2], in0[3], 0x5432);
- out0[3] = __byte_perm (in0[3], in1[0], 0x5432);
- out1[0] = __byte_perm (in1[0], in1[1], 0x5432);
- out1[1] = __byte_perm (in1[1], in1[2], 0x5432);
- out1[2] = __byte_perm (in1[2], in1[3], 0x5432);
- out1[3] = __byte_perm (in1[3], 0, 0x5432);
- break;
- case 3: out0[0] = __byte_perm (in0[0], in0[1], 0x6543);
- out0[1] = __byte_perm (in0[1], in0[2], 0x6543);
- out0[2] = __byte_perm (in0[2], in0[3], 0x6543);
- out0[3] = __byte_perm (in0[3], in1[0], 0x6543);
- out1[0] = __byte_perm (in1[0], in1[1], 0x6543);
- out1[1] = __byte_perm (in1[1], in1[2], 0x6543);
- out1[2] = __byte_perm (in1[2], in1[3], 0x6543);
- out1[3] = __byte_perm (in1[3], 0, 0x6543);
- break;
- case 4: out0[0] = in0[1];
- out0[1] = in0[2];
- out0[2] = in0[3];
- out0[3] = in1[0];
- out1[0] = in1[1];
- out1[1] = in1[2];
- out1[2] = in1[3];
- out1[3] = 0;
- break;
- case 5: out0[0] = __byte_perm (in0[1], in0[2], 0x4321);
- out0[1] = __byte_perm (in0[2], in0[3], 0x4321);
- out0[2] = __byte_perm (in0[3], in1[0], 0x4321);
- out0[3] = __byte_perm (in1[0], in1[1], 0x4321);
- out1[0] = __byte_perm (in1[1], in1[2], 0x4321);
- out1[1] = __byte_perm (in1[2], in1[3], 0x4321);
- out1[2] = __byte_perm (in1[3], 0, 0x4321);
- out1[3] = 0;
- break;
- case 6: out0[0] = __byte_perm (in0[1], in0[2], 0x5432);
- out0[1] = __byte_perm (in0[2], in0[3], 0x5432);
- out0[2] = __byte_perm (in0[3], in1[0], 0x5432);
- out0[3] = __byte_perm (in1[0], in1[1], 0x5432);
- out1[0] = __byte_perm (in1[1], in1[2], 0x5432);
- out1[1] = __byte_perm (in1[2], in1[3], 0x5432);
- out1[2] = __byte_perm (in1[3], 0, 0x5432);
- out1[3] = 0;
- break;
- case 7: out0[0] = __byte_perm (in0[1], in0[2], 0x6543);
- out0[1] = __byte_perm (in0[2], in0[3], 0x6543);
- out0[2] = __byte_perm (in0[3], in1[0], 0x6543);
- out0[3] = __byte_perm (in1[0], in1[1], 0x6543);
- out1[0] = __byte_perm (in1[1], in1[2], 0x6543);
- out1[1] = __byte_perm (in1[2], in1[3], 0x6543);
- out1[2] = __byte_perm (in1[3], 0, 0x6543);
- out1[3] = 0;
- break;
- case 8: out0[0] = in0[2];
- out0[1] = in0[3];
- out0[2] = in1[0];
- out0[3] = in1[1];
- out1[0] = in1[2];
- out1[1] = in1[3];
- out1[2] = 0;
- out1[3] = 0;
- break;
- case 9: out0[0] = __byte_perm (in0[2], in0[3], 0x4321);
- out0[1] = __byte_perm (in0[3], in1[0], 0x4321);
- out0[2] = __byte_perm (in1[0], in1[1], 0x4321);
- out0[3] = __byte_perm (in1[1], in1[2], 0x4321);
- out1[0] = __byte_perm (in1[2], in1[3], 0x4321);
- out1[1] = __byte_perm (in1[3], 0, 0x4321);
- out1[2] = 0;
- out1[3] = 0;
- break;
- case 10: out0[0] = __byte_perm (in0[2], in0[3], 0x5432);
- out0[1] = __byte_perm (in0[3], in1[0], 0x5432);
- out0[2] = __byte_perm (in1[0], in1[1], 0x5432);
- out0[3] = __byte_perm (in1[1], in1[2], 0x5432);
- out1[0] = __byte_perm (in1[2], in1[3], 0x5432);
- out1[1] = __byte_perm (in1[3], 0, 0x5432);
- out1[2] = 0;
- out1[3] = 0;
- break;
- case 11: out0[0] = __byte_perm (in0[2], in0[3], 0x6543);
- out0[1] = __byte_perm (in0[3], in1[0], 0x6543);
- out0[2] = __byte_perm (in1[0], in1[1], 0x6543);
- out0[3] = __byte_perm (in1[1], in1[2], 0x6543);
- out1[0] = __byte_perm (in1[2], in1[3], 0x6543);
- out1[1] = __byte_perm (in1[3], 0, 0x6543);
- out1[2] = 0;
- out1[3] = 0;
- break;
- case 12: out0[0] = in0[3];
- out0[1] = in1[0];
- out0[2] = in1[1];
- out0[3] = in1[2];
- out1[0] = in1[3];
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ out1[1] = in1[1];
+ out1[0] = in1[0];
+ out0[3] = in0[3];
+ out0[2] = in0[2];
+ out0[1] = in0[1];
+ out0[0] = in0[0];
break;
- case 13:
- out0[0] = __byte_perm (in0[3], in1[0], 0x4321);
- out0[1] = __byte_perm (in1[0], in1[1], 0x4321);
- out0[2] = __byte_perm (in1[1], in1[2], 0x4321);
- out0[3] = __byte_perm (in1[2], in1[3], 0x4321);
- out1[0] = __byte_perm (in1[3], 0, 0x4321);
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 1: out1[3] = __byte_perm (in1[2], in1[3], 0x6543);
+ out1[2] = __byte_perm (in1[1], in1[2], 0x6543);
+ out1[1] = __byte_perm (in1[0], in1[1], 0x6543);
+ out1[0] = __byte_perm (in0[3], in1[0], 0x6543);
+ out0[3] = __byte_perm (in0[2], in0[3], 0x6543);
+ out0[2] = __byte_perm (in0[1], in0[2], 0x6543);
+ out0[1] = __byte_perm (in0[0], in0[1], 0x6543);
+ out0[0] = __byte_perm ( 0, in0[0], 0x6543);
break;
- case 14: out0[0] = __byte_perm (in0[3], in1[0], 0x5432);
- out0[1] = __byte_perm (in1[0], in1[1], 0x5432);
- out0[2] = __byte_perm (in1[1], in1[2], 0x5432);
- out0[3] = __byte_perm (in1[2], in1[3], 0x5432);
- out1[0] = __byte_perm (in1[3], 0, 0x5432);
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 2: out1[3] = __byte_perm (in1[2], in1[3], 0x5432);
+ out1[2] = __byte_perm (in1[1], in1[2], 0x5432);
+ out1[1] = __byte_perm (in1[0], in1[1], 0x5432);
+ out1[0] = __byte_perm (in0[3], in1[0], 0x5432);
+ out0[3] = __byte_perm (in0[2], in0[3], 0x5432);
+ out0[2] = __byte_perm (in0[1], in0[2], 0x5432);
+ out0[1] = __byte_perm (in0[0], in0[1], 0x5432);
+ out0[0] = __byte_perm ( 0, in0[0], 0x5432);
break;
- case 15: out0[0] = __byte_perm (in0[3], in1[0], 0x6543);
- out0[1] = __byte_perm (in1[0], in1[1], 0x6543);
- out0[2] = __byte_perm (in1[1], in1[2], 0x6543);
- out0[3] = __byte_perm (in1[2], in1[3], 0x6543);
- out1[0] = __byte_perm (in1[3], 0, 0x6543);
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 3: out1[3] = __byte_perm (in1[2], in1[3], 0x4321);
+ out1[2] = __byte_perm (in1[1], in1[2], 0x4321);
+ out1[1] = __byte_perm (in1[0], in1[1], 0x4321);
+ out1[0] = __byte_perm (in0[3], in1[0], 0x4321);
+ out0[3] = __byte_perm (in0[2], in0[3], 0x4321);
+ out0[2] = __byte_perm (in0[1], in0[2], 0x4321);
+ out0[1] = __byte_perm (in0[0], in0[1], 0x4321);
+ out0[0] = __byte_perm ( 0, in0[0], 0x4321);
break;
- case 16: out0[0] = in1[0];
- out0[1] = in1[1];
- out0[2] = in1[2];
- out0[3] = in1[3];
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 4: out1[3] = in1[2];
+ out1[2] = in1[1];
+ out1[1] = in1[0];
+ out1[0] = in0[3];
+ out0[3] = in0[2];
+ out0[2] = in0[1];
+ out0[1] = in0[0];
+ out0[0] = 0;
break;
- case 17: out0[0] = __byte_perm (in1[0], in1[1], 0x4321);
- out0[1] = __byte_perm (in1[1], in1[2], 0x4321);
- out0[2] = __byte_perm (in1[2], in1[3], 0x4321);
- out0[3] = __byte_perm (in1[3], 0, 0x4321);
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 5: out1[3] = __byte_perm (in1[1], in1[2], 0x6543);
+ out1[2] = __byte_perm (in1[0], in1[1], 0x6543);
+ out1[1] = __byte_perm (in0[3], in1[0], 0x6543);
+ out1[0] = __byte_perm (in0[2], in0[3], 0x6543);
+ out0[3] = __byte_perm (in0[1], in0[2], 0x6543);
+ out0[2] = __byte_perm (in0[0], in0[1], 0x6543);
+ out0[1] = __byte_perm ( 0, in0[0], 0x6543);
+ out0[0] = 0;
break;
- case 18: out0[0] = __byte_perm (in1[0], in1[1], 0x5432);
- out0[1] = __byte_perm (in1[1], in1[2], 0x5432);
- out0[2] = __byte_perm (in1[2], in1[3], 0x5432);
- out0[3] = __byte_perm (in1[3], 0, 0x5432);
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 6: out1[3] = __byte_perm (in1[1], in1[2], 0x5432);
+ out1[2] = __byte_perm (in1[0], in1[1], 0x5432);
+ out1[1] = __byte_perm (in0[3], in1[0], 0x5432);
+ out1[0] = __byte_perm (in0[2], in0[3], 0x5432);
+ out0[3] = __byte_perm (in0[1], in0[2], 0x5432);
+ out0[2] = __byte_perm (in0[0], in0[1], 0x5432);
+ out0[1] = __byte_perm ( 0, in0[0], 0x5432);
+ out0[0] = 0;
break;
- case 19: out0[0] = __byte_perm (in1[0], in1[1], 0x6543);
- out0[1] = __byte_perm (in1[1], in1[2], 0x6543);
- out0[2] = __byte_perm (in1[2], in1[3], 0x6543);
- out0[3] = __byte_perm (in1[3], 0, 0x6543);
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 7: out1[3] = __byte_perm (in1[1], in1[2], 0x4321);
+ out1[2] = __byte_perm (in1[0], in1[1], 0x4321);
+ out1[1] = __byte_perm (in0[3], in1[0], 0x4321);
+ out1[0] = __byte_perm (in0[2], in0[3], 0x4321);
+ out0[3] = __byte_perm (in0[1], in0[2], 0x4321);
+ out0[2] = __byte_perm (in0[0], in0[1], 0x4321);
+ out0[1] = __byte_perm ( 0, in0[0], 0x4321);
+ out0[0] = 0;
break;
- case 20: out0[0] = in1[1];
- out0[1] = in1[2];
- out0[2] = in1[3];
- out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 8: out1[3] = in1[1];
+ out1[2] = in1[0];
+ out1[1] = in0[3];
+ out1[0] = in0[2];
+ out0[3] = in0[1];
+ out0[2] = in0[0];
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 21: out0[0] = __byte_perm (in1[1], in1[2], 0x4321);
- out0[1] = __byte_perm (in1[2], in1[3], 0x4321);
- out0[2] = __byte_perm (in1[3], 0, 0x4321);
- out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 9: out1[3] = __byte_perm (in1[0], in1[1], 0x6543);
+ out1[2] = __byte_perm (in0[3], in1[0], 0x6543);
+ out1[1] = __byte_perm (in0[2], in0[3], 0x6543);
+ out1[0] = __byte_perm (in0[1], in0[2], 0x6543);
+ out0[3] = __byte_perm (in0[0], in0[1], 0x6543);
+ out0[2] = __byte_perm ( 0, in0[0], 0x6543);
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 22: out0[0] = __byte_perm (in1[1], in1[2], 0x5432);
- out0[1] = __byte_perm (in1[2], in1[3], 0x5432);
- out0[2] = __byte_perm (in1[3], 0, 0x5432);
- out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 10: out1[3] = __byte_perm (in1[0], in1[1], 0x5432);
+ out1[2] = __byte_perm (in0[3], in1[0], 0x5432);
+ out1[1] = __byte_perm (in0[2], in0[3], 0x5432);
+ out1[0] = __byte_perm (in0[1], in0[2], 0x5432);
+ out0[3] = __byte_perm (in0[0], in0[1], 0x5432);
+ out0[2] = __byte_perm ( 0, in0[0], 0x5432);
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 23: out0[0] = __byte_perm (in1[1], in1[2], 0x6543);
- out0[1] = __byte_perm (in1[2], in1[3], 0x6543);
- out0[2] = __byte_perm (in1[3], 0, 0x6543);
- out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ case 11: out1[3] = __byte_perm (in1[0], in1[1], 0x4321);
+ out1[2] = __byte_perm (in0[3], in1[0], 0x4321);
+ out1[1] = __byte_perm (in0[2], in0[3], 0x4321);
+ out1[0] = __byte_perm (in0[1], in0[2], 0x4321);
+ out0[3] = __byte_perm (in0[0], in0[1], 0x4321);
+ out0[2] = __byte_perm ( 0, in0[0], 0x4321);
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 24: out0[0] = in1[2];
- out0[1] = in1[3];
+ case 12: out1[3] = in1[0];
+ out1[2] = in0[3];
+ out1[1] = in0[2];
+ out1[0] = in0[1];
+ out0[3] = in0[0];
out0[2] = 0;
- out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 25: out0[0] = __byte_perm (in1[2], in1[3], 0x4321);
- out0[1] = __byte_perm (in1[3], 0, 0x4321);
+ case 13: out1[3] = __byte_perm (in0[3], in1[0], 0x6543);
+ out1[2] = __byte_perm (in0[2], in0[3], 0x6543);
+ out1[1] = __byte_perm (in0[1], in0[2], 0x6543);
+ out1[0] = __byte_perm (in0[0], in0[1], 0x6543);
+ out0[3] = __byte_perm ( 0, in0[0], 0x6543);
out0[2] = 0;
- out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 26: out0[0] = __byte_perm (in1[2], in1[3], 0x5432);
- out0[1] = __byte_perm (in1[3], 0, 0x5432);
+ case 14: out1[3] = __byte_perm (in0[3], in1[0], 0x5432);
+ out1[2] = __byte_perm (in0[2], in0[3], 0x5432);
+ out1[1] = __byte_perm (in0[1], in0[2], 0x5432);
+ out1[0] = __byte_perm (in0[0], in0[1], 0x5432);
+ out0[3] = __byte_perm ( 0, in0[0], 0x5432);
out0[2] = 0;
- out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 27: out0[0] = __byte_perm (in1[2], in1[3], 0x6543);
- out0[1] = __byte_perm (in1[3], 0, 0x6543);
+ case 15: out1[3] = __byte_perm (in0[3], in1[0], 0x4321);
+ out1[2] = __byte_perm (in0[2], in0[3], 0x4321);
+ out1[1] = __byte_perm (in0[1], in0[2], 0x4321);
+ out1[0] = __byte_perm (in0[0], in0[1], 0x4321);
+ out0[3] = __byte_perm ( 0, in0[0], 0x4321);
out0[2] = 0;
- out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
- break;
- case 28: out0[0] = in1[3];
out0[1] = 0;
- out0[2] = 0;
+ out0[0] = 0;
+ break;
+ case 16: out1[3] = in0[3];
+ out1[2] = in0[2];
+ out1[1] = in0[1];
+ out1[0] = in0[0];
out0[3] = 0;
- out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 29: out0[0] = __byte_perm (in1[3], 0, 0x4321);
+ case 17: out1[3] = __byte_perm (in0[2], in0[3], 0x6543);
+ out1[2] = __byte_perm (in0[1], in0[2], 0x6543);
+ out1[1] = __byte_perm (in0[0], in0[1], 0x6543);
+ out1[0] = __byte_perm ( 0, in0[0], 0x6543);
+ out0[3] = 0;
+ out0[2] = 0;
out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 18: out1[3] = __byte_perm (in0[2], in0[3], 0x5432);
+ out1[2] = __byte_perm (in0[1], in0[2], 0x5432);
+ out1[1] = __byte_perm (in0[0], in0[1], 0x5432);
+ out1[0] = __byte_perm ( 0, in0[0], 0x5432);
+ out0[3] = 0;
out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 19: out1[3] = __byte_perm (in0[2], in0[3], 0x4321);
+ out1[2] = __byte_perm (in0[1], in0[2], 0x4321);
+ out1[1] = __byte_perm (in0[0], in0[1], 0x4321);
+ out1[0] = __byte_perm ( 0, in0[0], 0x4321);
out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 20: out1[3] = in0[2];
+ out1[2] = in0[1];
+ out1[1] = in0[0];
out1[0] = 0;
- out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 30: out0[0] = __byte_perm (in1[3], 0, 0x5432);
+ case 21: out1[3] = __byte_perm (in0[1], in0[2], 0x6543);
+ out1[2] = __byte_perm (in0[0], in0[1], 0x6543);
+ out1[1] = __byte_perm ( 0, in0[0], 0x6543);
+ out1[0] = 0;
+ out0[3] = 0;
+ out0[2] = 0;
out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 22: out1[3] = __byte_perm (in0[1], in0[2], 0x5432);
+ out1[2] = __byte_perm (in0[0], in0[1], 0x5432);
+ out1[1] = __byte_perm ( 0, in0[0], 0x5432);
+ out1[0] = 0;
+ out0[3] = 0;
out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 23: out1[3] = __byte_perm (in0[1], in0[2], 0x4321);
+ out1[2] = __byte_perm (in0[0], in0[1], 0x4321);
+ out1[1] = __byte_perm ( 0, in0[0], 0x4321);
+ out1[0] = 0;
out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 24: out1[3] = in0[1];
+ out1[2] = in0[0];
+ out1[1] = 0;
out1[0] = 0;
+ out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 25: out1[3] = __byte_perm (in0[0], in0[1], 0x6543);
+ out1[2] = __byte_perm ( 0, in0[0], 0x6543);
out1[1] = 0;
- out1[2] = 0;
- out1[3] = 0;
+ out1[0] = 0;
+ out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
break;
- case 31: out0[0] = __byte_perm (in1[3], 0, 0x6543);
+ case 26: out1[3] = __byte_perm (in0[0], in0[1], 0x5432);
+ out1[2] = __byte_perm ( 0, in0[0], 0x5432);
+ out1[1] = 0;
+ out1[0] = 0;
+ out0[3] = 0;
+ out0[2] = 0;
out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 27: out1[3] = __byte_perm (in0[0], in0[1], 0x4321);
+ out1[2] = __byte_perm ( 0, in0[0], 0x4321);
+ out1[1] = 0;
+ out1[0] = 0;
+ out0[3] = 0;
out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 28: out1[3] = in0[0];
+ out1[2] = 0;
+ out1[1] = 0;
+ out1[0] = 0;
out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 29: out1[3] = __byte_perm ( 0, in0[0], 0x6543);
+ out1[2] = 0;
+ out1[1] = 0;
out1[0] = 0;
+ out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 30: out1[3] = __byte_perm ( 0, in0[0], 0x5432);
+ out1[2] = 0;
out1[1] = 0;
+ out1[0] = 0;
+ out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
+ break;
+ case 31: out1[3] = __byte_perm ( 0, in0[0], 0x4321);
out1[2] = 0;
- out1[3] = 0;
+ out1[1] = 0;
+ out1[0] = 0;
+ out0[3] = 0;
+ out0[2] = 0;
+ out0[1] = 0;
+ out0[0] = 0;
break;
}
#endif
dst1[3] = __byte_perm (src_r0[2], src_r0[3], 0x5432);
break;
- case 19:
- dst1[0] = __byte_perm (src_l1[0], src_r0[0], 0x4210);
- dst1[1] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
- dst1[2] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
- dst1[3] = __byte_perm (src_r0[2], src_r0[3], 0x4321);
- break;
-
- case 20:
- dst1[1] = src_r0[0];
- dst1[2] = src_r0[1];
- dst1[3] = src_r0[2];
- break;
-
- case 21:
- dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x6540);
- dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
- dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x6543);
- break;
-
- case 22:
- dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x5410);
- dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
- dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x5432);
- break;
-
- case 23:
- dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x4210);
- dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
- dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
- break;
-
- case 24:
- dst1[2] = src_r0[0];
- dst1[3] = src_r0[1];
- break;
-
- case 25:
- dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x6540);
- dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
- break;
-
- case 26:
- dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x5410);
- dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
- break;
-
- case 27:
- dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x4210);
- dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
- break;
-
- case 28:
- dst1[3] = src_r0[0];
- break;
-
- case 29:
- dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x6540);
- break;
-
- case 30:
- dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x5410);
- break;
-
- case 31:
- dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x4210);
- break;
- }
- #endif
-
- #if defined IS_AMD || defined IS_GENERIC
- switch (offset)
- {
- case 0:
- dst0[0] = src_r0[0];
- dst0[1] = src_r0[1];
- dst0[2] = src_r0[2];
- dst0[3] = src_r0[3];
- dst1[0] = src_r1[0];
- dst1[1] = src_r1[1];
- dst1[2] = src_r1[2];
- dst1[3] = src_r1[3];
- break;
-
- case 1:
- dst0[0] = src_l0[0]
- | src_r0[0] << 8;
- dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 3);
- dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 3);
- dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 3);
- dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 3);
- dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 3);
- dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 3);
- dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 3);
- break;
-
- case 2:
- dst0[0] = src_l0[0]
- | src_r0[0] << 16;
- dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 2);
- dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 2);
- dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 2);
- dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 2);
- dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 2);
- dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 2);
- dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 2);
- break;
-
- case 3:
- dst0[0] = src_l0[0]
- | src_r0[0] << 24;
- dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 1);
- dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 1);
- dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 1);
- dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 1);
- dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 1);
- dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 1);
- dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 1);
- break;
-
- case 4:
- dst0[1] = src_r0[0];
- dst0[2] = src_r0[1];
- dst0[3] = src_r0[2];
- dst1[0] = src_r0[3];
- dst1[1] = src_r1[0];
- dst1[2] = src_r1[1];
- dst1[3] = src_r1[2];
- break;
-
- case 5:
- dst0[1] = src_l0[1]
- | src_r0[0] << 8;
- dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 3);
- dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 3);
- dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 3);
- dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 3);
- dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 3);
- dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 3);
- break;
-
- case 6:
- dst0[1] = src_l0[1]
- | src_r0[0] << 16;
- dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 2);
- dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 2);
- dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 2);
- dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 2);
- dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 2);
- dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 2);
- break;
-
- case 7:
- dst0[1] = src_l0[1]
- | src_r0[0] << 24;
- dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 1);
- dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 1);
- dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 1);
- dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 1);
- dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 1);
- dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 1);
- break;
-
- case 8:
- dst0[2] = src_r0[0];
- dst0[3] = src_r0[1];
- dst1[0] = src_r0[2];
- dst1[1] = src_r0[3];
- dst1[2] = src_r1[0];
- dst1[3] = src_r1[1];
- break;
-
- case 9:
- dst0[2] = src_l0[2]
- | src_r0[0] << 8;
- dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 3);
- dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 3);
- dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 3);
- dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 3);
- dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 3);
- break;
-
- case 10:
- dst0[2] = src_l0[2]
- | src_r0[0] << 16;
- dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 2);
- dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 2);
- dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 2);
- dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 2);
- dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 2);
- break;
-
- case 11:
- dst0[2] = src_l0[2]
- | src_r0[0] << 24;
- dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 1);
- dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 1);
- dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 1);
- dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 1);
- dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 1);
- break;
-
- case 12:
- dst0[3] = src_r0[0];
- dst1[0] = src_r0[1];
- dst1[1] = src_r0[2];
- dst1[2] = src_r0[3];
- dst1[3] = src_r1[0];
- break;
-
- case 13:
- dst0[3] = src_l0[3]
- | src_r0[0] << 8;
- dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 3);
- dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 3);
- dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 3);
- dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 3);
- break;
-
- case 14:
- dst0[3] = src_l0[3]
- | src_r0[0] << 16;
- dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 2);
- dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 2);
- dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 2);
- dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 2);
- break;
-
- case 15:
- dst0[3] = src_l0[3]
- | src_r0[0] << 24;
- dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 1);
- dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 1);
- dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 1);
- dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 1);
- break;
-
- case 16:
- dst1[0] = src_r0[0];
- dst1[1] = src_r0[1];
- dst1[2] = src_r0[2];
- dst1[3] = src_r0[3];
- break;
-
- case 17:
- dst1[0] = src_l1[0]
- | src_r0[0] << 8;
- dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 3);
- dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 3);
- dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 3);
- break;
-
- case 18:
- dst1[0] = src_l1[0]
- | src_r0[0] << 16;
- dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 2);
- dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 2);
- dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 2);
- break;
-
- case 19:
- dst1[0] = src_l1[0]
- | src_r0[0] << 24;
- dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 1);
- dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 1);
- dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 1);
+ case 19:
+ dst1[0] = __byte_perm (src_l1[0], src_r0[0], 0x4210);
+ dst1[1] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
+ dst1[2] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
+ dst1[3] = __byte_perm (src_r0[2], src_r0[3], 0x4321);
break;
case 20:
break;
case 21:
- dst1[1] = src_l1[1]
- | src_r0[0] << 8;
- dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 3);
- dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 3);
+ dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x6540);
+ dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
+ dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x6543);
break;
case 22:
- dst1[1] = src_l1[1]
- | src_r0[0] << 16;
- dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 2);
- dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 2);
+ dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x5410);
+ dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
+ dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x5432);
break;
case 23:
- dst1[1] = src_l1[1]
- | src_r0[0] << 24;
- dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 1);
- dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 1);
+ dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x4210);
+ dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
+ dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
break;
case 24:
break;
case 25:
- dst1[2] = src_l1[2]
- | src_r0[0] << 8;
- dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 3);
+ dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x6540);
+ dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
break;
case 26:
- dst1[2] = src_l1[2]
- | src_r0[0] << 16;
- dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 2);
+ dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x5410);
+ dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
break;
case 27:
- dst1[2] = src_l1[2]
- | src_r0[0] << 24;
- dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 1);
+ dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x4210);
+ dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
break;
case 28:
break;
case 29:
- dst1[3] = src_l1[3]
- | src_r0[0] << 8;
+ dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x6540);
break;
case 30:
- dst1[3] = src_l1[3]
- | src_r0[0] << 16;
+ dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x5410);
+ break;
+
+ case 31:
+ dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x4210);
break;
+ }
+ #endif
+ #if defined IS_AMD || defined IS_GENERIC
+ switch (offset)
+ {
case 31:
- dst1[3] = src_l1[3]
- | src_r0[0] << 24;
+ dst1[3] = src_l1[3] | src_r0[0] << 24;
+ break;
+ case 30:
+ dst1[3] = src_l1[3] | src_r0[0] << 16;
+ break;
+ case 29:
+ dst1[3] = src_l1[3] | src_r0[0] << 8;
+ break;
+ case 28:
+ dst1[3] = src_r0[0];
+ break;
+ case 27:
+ dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 1);
+ dst1[2] = src_l1[2] | src_r0[0] << 24;
+ break;
+ case 26:
+ dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 2);
+ dst1[2] = src_l1[2] | src_r0[0] << 16;
+ break;
+ case 25:
+ dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 3);
+ dst1[2] = src_l1[2] | src_r0[0] << 8;
+ break;
+ case 24:
+ dst1[3] = src_r0[1];
+ dst1[2] = src_r0[0];
+ break;
+ case 23:
+ dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 1);
+ dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 1);
+ dst1[1] = src_l1[1] | src_r0[0] << 24;
+ break;
+ case 22:
+ dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 2);
+ dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 2);
+ dst1[1] = src_l1[1] | src_r0[0] << 16;
+ break;
+ case 21:
+ dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 3);
+ dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 3);
+ dst1[1] = src_l1[1] | src_r0[0] << 8;
+ break;
+ case 20:
+ dst1[3] = src_r0[2];
+ dst1[2] = src_r0[1];
+ dst1[1] = src_r0[0];
+ break;
+ case 19:
+ dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 1);
+ dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 1);
+ dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 1);
+ dst1[0] = src_l1[0] | src_r0[0] << 24;
+ break;
+ case 18:
+ dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 2);
+ dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 2);
+ dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 2);
+ dst1[0] = src_l1[0] | src_r0[0] << 16;
+ break;
+ case 17:
+ dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 3);
+ dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 3);
+ dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 3);
+ dst1[0] = src_l1[0] | src_r0[0] << 8;
+ break;
+ case 16:
+ dst1[3] = src_r0[3];
+ dst1[2] = src_r0[2];
+ dst1[1] = src_r0[1];
+ dst1[0] = src_r0[0];
+ break;
+ case 15:
+ dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 1);
+ dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 1);
+ dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 1);
+ dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 1);
+ dst0[3] = src_l0[3] | src_r0[0] << 24;
+ break;
+ case 14:
+ dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 2);
+ dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 2);
+ dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 2);
+ dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 2);
+ dst0[3] = src_l0[3] | src_r0[0] << 16;
+ break;
+ case 13:
+ dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 3);
+ dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 3);
+ dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 3);
+ dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 3);
+ dst0[3] = src_l0[3] | src_r0[0] << 8;
+ break;
+ case 12:
+ dst1[3] = src_r1[0];
+ dst1[2] = src_r0[3];
+ dst1[1] = src_r0[2];
+ dst1[0] = src_r0[1];
+ dst0[3] = src_r0[0];
+ break;
+ case 11:
+ dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 1);
+ dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 1);
+ dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 1);
+ dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 1);
+ dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 1);
+ dst0[2] = src_l0[2] | src_r0[0] << 24;
+ break;
+ case 10:
+ dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 2);
+ dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 2);
+ dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 2);
+ dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 2);
+ dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 2);
+ dst0[2] = src_l0[2] | src_r0[0] << 16;
+ break;
+ case 9:
+ dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 3);
+ dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 3);
+ dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 3);
+ dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 3);
+ dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 3);
+ dst0[2] = src_l0[2] | src_r0[0] << 8;
+ break;
+ case 8:
+ dst1[3] = src_r1[1];
+ dst1[2] = src_r1[0];
+ dst1[1] = src_r0[3];
+ dst1[0] = src_r0[2];
+ dst0[3] = src_r0[1];
+ dst0[2] = src_r0[0];
+ break;
+ case 7:
+ dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 1);
+ dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 1);
+ dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 1);
+ dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 1);
+ dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 1);
+ dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 1);
+ dst0[1] = src_l0[1] | src_r0[0] << 24;
+ break;
+ case 6:
+ dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 2);
+ dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 2);
+ dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 2);
+ dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 2);
+ dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 2);
+ dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 2);
+ dst0[1] = src_l0[1] | src_r0[0] << 16;
+ break;
+ case 5:
+ dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 3);
+ dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 3);
+ dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 3);
+ dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 3);
+ dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 3);
+ dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 3);
+ dst0[1] = src_l0[1] | src_r0[0] << 8;
+ break;
+ case 4:
+ dst1[3] = src_r1[2];
+ dst1[2] = src_r1[1];
+ dst1[1] = src_r1[0];
+ dst1[0] = src_r0[3];
+ dst0[3] = src_r0[2];
+ dst0[2] = src_r0[1];
+ dst0[1] = src_r0[0];
+ break;
+ case 3:
+ dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 1);
+ dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 1);
+ dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 1);
+ dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 1);
+ dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 1);
+ dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 1);
+ dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 1);
+ dst0[0] = src_l0[0] | src_r0[0] << 24;
+ break;
+ case 2:
+ dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 2);
+ dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 2);
+ dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 2);
+ dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 2);
+ dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 2);
+ dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 2);
+ dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 2);
+ dst0[0] = src_l0[0] | src_r0[0] << 16;
+ break;
+ case 1:
+ dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 3);
+ dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 3);
+ dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 3);
+ dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 3);
+ dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 3);
+ dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 3);
+ dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 3);
+ dst0[0] = src_l0[0] | src_r0[0] << 8;
+ break;
+ case 0:
+ dst1[3] = src_r1[3];
+ dst1[2] = src_r1[2];
+ dst1[1] = src_r1[1];
+ dst1[0] = src_r1[0];
+ dst0[3] = src_r0[3];
+ dst0[2] = src_r0[2];
+ dst0[1] = src_r0[1];
+ dst0[0] = src_r0[0];
break;
}
#endif
u32 out_len = in_len;
- u32 tib40[4];
- u32 tib41[4];
-
- tib40[0] = buf0[0];
- tib40[1] = buf0[1];
- tib40[2] = buf0[2];
- tib40[3] = buf0[3];
- tib41[0] = buf1[0];
- tib41[1] = buf1[1];
- tib41[2] = buf1[2];
- tib41[3] = buf1[3];
-
- append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
+ append_block8 (out_len, buf0, buf1, buf0, buf1, buf0, buf1);
out_len += in_len;
return out_len;
}
-u32 apply_rules (__global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len)
+u32 apply_rules (const __global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) // cmds is const-qualified: the rule commands are not written through this pointer
{
u32 out_len = len; // the result starts out as the incoming length
return out_len; // NOTE(review): cmds, buf0 and buf1 are not touched in the lines visible here - confirm whether the rule dispatch sits between these lines in the full file
}
+
+u32 apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x w0[4], u32x w1[4])
+{ // Fills w0/w1 from the password in pw_buf0/pw_buf1 by calling apply_rules once per vector lane (rule il_pos + lane); returns the length reported by apply_rules.
+ #if VECT_SIZE == 1
+ // Single-lane build: copy the password straight into w0/w1 and hand those buffers to apply_rules with the one rule at il_pos.
+ w0[0] = pw_buf0[0];
+ w0[1] = pw_buf0[1];
+ w0[2] = pw_buf0[2];
+ w0[3] = pw_buf0[3];
+ w1[0] = pw_buf1[0];
+ w1[1] = pw_buf1[1];
+ w1[2] = pw_buf1[2];
+ w1[3] = pw_buf1[3];
+
+ return apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len);
+
+ #else
+
+ u32 out_len = 0;
+ // One iteration per lane; iteration i uses the rule at rules_buf[il_pos + i].
+ #pragma unroll
+ for (int i = 0; i < VECT_SIZE; i++)
+ {
+ u32 tmp0[4];
+ u32 tmp1[4];
+ // Every lane starts from a fresh copy of the unmodified password words.
+ tmp0[0] = pw_buf0[0];
+ tmp0[1] = pw_buf0[1];
+ tmp0[2] = pw_buf0[2];
+ tmp0[3] = pw_buf0[3];
+ tmp1[0] = pw_buf1[0];
+ tmp1[1] = pw_buf1[1];
+ tmp1[2] = pw_buf1[2];
+ tmp1[3] = pw_buf1[3];
+ // tmp0/tmp1 are copied into the lane afterwards, so apply_rules presumably rewrites them in place - confirm against its full body.
+ out_len = apply_rules (rules_buf[il_pos + i].cmds, tmp0, tmp1, pw_len);
+
+ // it's guaranteed to have out_len always the same for each call in the loop
+ // Store the scalar words into component .s<i> of each vector word; only the cases enabled for this VECT_SIZE are compiled in.
+ switch (i)
+ {
+ #if VECT_SIZE >= 2
+ case 0:
+ w0[0].s0 = tmp0[0];
+ w0[1].s0 = tmp0[1];
+ w0[2].s0 = tmp0[2];
+ w0[3].s0 = tmp0[3];
+ w1[0].s0 = tmp1[0];
+ w1[1].s0 = tmp1[1];
+ w1[2].s0 = tmp1[2];
+ w1[3].s0 = tmp1[3];
+ break;
+
+ case 1:
+ w0[0].s1 = tmp0[0];
+ w0[1].s1 = tmp0[1];
+ w0[2].s1 = tmp0[2];
+ w0[3].s1 = tmp0[3];
+ w1[0].s1 = tmp1[0];
+ w1[1].s1 = tmp1[1];
+ w1[2].s1 = tmp1[2];
+ w1[3].s1 = tmp1[3];
+ break;
+ #endif
+
+ #if VECT_SIZE >= 4
+ case 2:
+ w0[0].s2 = tmp0[0];
+ w0[1].s2 = tmp0[1];
+ w0[2].s2 = tmp0[2];
+ w0[3].s2 = tmp0[3];
+ w1[0].s2 = tmp1[0];
+ w1[1].s2 = tmp1[1];
+ w1[2].s2 = tmp1[2];
+ w1[3].s2 = tmp1[3];
+ break;
+
+ case 3:
+ w0[0].s3 = tmp0[0];
+ w0[1].s3 = tmp0[1];
+ w0[2].s3 = tmp0[2];
+ w0[3].s3 = tmp0[3];
+ w1[0].s3 = tmp1[0];
+ w1[1].s3 = tmp1[1];
+ w1[2].s3 = tmp1[2];
+ w1[3].s3 = tmp1[3];
+ break;
+ #endif
+
+ #if VECT_SIZE >= 8
+ case 4:
+ w0[0].s4 = tmp0[0];
+ w0[1].s4 = tmp0[1];
+ w0[2].s4 = tmp0[2];
+ w0[3].s4 = tmp0[3];
+ w1[0].s4 = tmp1[0];
+ w1[1].s4 = tmp1[1];
+ w1[2].s4 = tmp1[2];
+ w1[3].s4 = tmp1[3];
+ break;
+
+ case 5:
+ w0[0].s5 = tmp0[0];
+ w0[1].s5 = tmp0[1];
+ w0[2].s5 = tmp0[2];
+ w0[3].s5 = tmp0[3];
+ w1[0].s5 = tmp1[0];
+ w1[1].s5 = tmp1[1];
+ w1[2].s5 = tmp1[2];
+ w1[3].s5 = tmp1[3];
+ break;
+
+ case 6:
+ w0[0].s6 = tmp0[0];
+ w0[1].s6 = tmp0[1];
+ w0[2].s6 = tmp0[2];
+ w0[3].s6 = tmp0[3];
+ w1[0].s6 = tmp1[0];
+ w1[1].s6 = tmp1[1];
+ w1[2].s6 = tmp1[2];
+ w1[3].s6 = tmp1[3];
+ break;
+
+ case 7:
+ w0[0].s7 = tmp0[0];
+ w0[1].s7 = tmp0[1];
+ w0[2].s7 = tmp0[2];
+ w0[3].s7 = tmp0[3];
+ w1[0].s7 = tmp1[0];
+ w1[1].s7 = tmp1[1];
+ w1[2].s7 = tmp1[2];
+ w1[3].s7 = tmp1[3];
+ break;
+ #endif
+ } // NOTE(review): there is no case for i >= 8, so lanes above .s7 would never be written - confirm VECT_SIZE never exceeds 8.
+ }
+ // Length from the last apply_rules call (see the guarantee noted inside the loop).
+ return out_len;
+
+ #endif
+}