Fix bug in rule-engine for NV: left shifts and right shifts were switched
author jsteube <jens.steube@gmail.com>
Sat, 16 Jan 2016 12:34:54 +0000 (13:34 +0100)
committer jsteube <jens.steube@gmail.com>
Sat, 16 Jan 2016 12:34:54 +0000 (13:34 +0100)
OpenCL/rp.c

index 741de40..d702956 100644 (file)
@@ -175,293 +175,294 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32
   #ifdef IS_NV
   switch (num)
   {
-    case  0:  out1[3] = in1[3];
-              out1[2] = in1[2];
-              out1[1] = in1[1];
-              out1[0] = in1[0];
-              out0[3] = in0[3];
-              out0[2] = in0[2];
+    case  0:  out0[0] = in0[0];
               out0[1] = in0[1];
-              out0[0] = in0[0];
-              break;
-    case  1:  out1[3] = __byte_perm (in1[2], in1[3], 0x6543);
-              out1[2] = __byte_perm (in1[1], in1[2], 0x6543);
-              out1[1] = __byte_perm (in1[0], in1[1], 0x6543);
-              out1[0] = __byte_perm (in0[3], in1[0], 0x6543);
-              out0[3] = __byte_perm (in0[2], in0[3], 0x6543);
-              out0[2] = __byte_perm (in0[1], in0[2], 0x6543);
-              out0[1] = __byte_perm (in0[0], in0[1], 0x6543);
-              out0[0] = __byte_perm (     0, in0[0], 0x6543);
-              break;
-    case  2:  out1[3] = __byte_perm (in1[2], in1[3], 0x5432);
-              out1[2] = __byte_perm (in1[1], in1[2], 0x5432);
-              out1[1] = __byte_perm (in1[0], in1[1], 0x5432);
-              out1[0] = __byte_perm (in0[3], in1[0], 0x5432);
-              out0[3] = __byte_perm (in0[2], in0[3], 0x5432);
-              out0[2] = __byte_perm (in0[1], in0[2], 0x5432);
-              out0[1] = __byte_perm (in0[0], in0[1], 0x5432);
-              out0[0] = __byte_perm (     0, in0[0], 0x5432);
-              break;
-    case  3:  out1[3] = __byte_perm (in1[2], in1[3], 0x4321);
-              out1[2] = __byte_perm (in1[1], in1[2], 0x4321);
-              out1[1] = __byte_perm (in1[0], in1[1], 0x4321);
-              out1[0] = __byte_perm (in0[3], in1[0], 0x4321);
-              out0[3] = __byte_perm (in0[2], in0[3], 0x4321);
-              out0[2] = __byte_perm (in0[1], in0[2], 0x4321);
-              out0[1] = __byte_perm (in0[0], in0[1], 0x4321);
-              out0[0] = __byte_perm (     0, in0[0], 0x4321);
-              break;
-    case  4:  out1[3] = in1[2];
-              out1[2] = in1[1];
-              out1[1] = in1[0];
-              out1[0] = in0[3];
-              out0[3] = in0[2];
-              out0[2] = in0[1];
-              out0[1] = in0[0];
-              out0[0] = 0;
+              out0[2] = in0[2];
+              out0[3] = in0[3];
+              out1[0] = in1[0];
+              out1[1] = in1[1];
+              out1[2] = in1[2];
+              out1[3] = in1[3];
               break;
-    case  5:  out1[3] = __byte_perm (in1[1], in1[2], 0x6543);
-              out1[2] = __byte_perm (in1[0], in1[1], 0x6543);
-              out1[1] = __byte_perm (in0[3], in1[0], 0x6543);
-              out1[0] = __byte_perm (in0[2], in0[3], 0x6543);
-              out0[3] = __byte_perm (in0[1], in0[2], 0x6543);
-              out0[2] = __byte_perm (in0[0], in0[1], 0x6543);
-              out0[1] = __byte_perm (     0, in0[0], 0x6543);
-              out0[0] = 0;
+    case  1:  out0[0] = __byte_perm (in0[0], in0[1], 0x4321);
+              out0[1] = __byte_perm (in0[1], in0[2], 0x4321);
+              out0[2] = __byte_perm (in0[2], in0[3], 0x4321);
+              out0[3] = __byte_perm (in0[3], in1[0], 0x4321);
+              out1[0] = __byte_perm (in1[0], in1[1], 0x4321);
+              out1[1] = __byte_perm (in1[1], in1[2], 0x4321);
+              out1[2] = __byte_perm (in1[2], in1[3], 0x4321);
+              out1[3] = __byte_perm (in1[3],      0, 0x4321);
               break;
-    case  6:  out1[3] = __byte_perm (in1[1], in1[2], 0x5432);
-              out1[2] = __byte_perm (in1[0], in1[1], 0x5432);
-              out1[1] = __byte_perm (in0[3], in1[0], 0x5432);
-              out1[0] = __byte_perm (in0[2], in0[3], 0x5432);
-              out0[3] = __byte_perm (in0[1], in0[2], 0x5432);
-              out0[2] = __byte_perm (in0[0], in0[1], 0x5432);
-              out0[1] = __byte_perm (     0, in0[0], 0x5432);
-              out0[0] = 0;
+    case  2:  out0[0] = __byte_perm (in0[0], in0[1], 0x5432);
+              out0[1] = __byte_perm (in0[1], in0[2], 0x5432);
+              out0[2] = __byte_perm (in0[2], in0[3], 0x5432);
+              out0[3] = __byte_perm (in0[3], in1[0], 0x5432);
+              out1[0] = __byte_perm (in1[0], in1[1], 0x5432);
+              out1[1] = __byte_perm (in1[1], in1[2], 0x5432);
+              out1[2] = __byte_perm (in1[2], in1[3], 0x5432);
+              out1[3] = __byte_perm (in1[3],      0, 0x5432);
               break;
-    case  7:  out1[3] = __byte_perm (in1[1], in1[2], 0x4321);
-              out1[2] = __byte_perm (in1[0], in1[1], 0x4321);
-              out1[1] = __byte_perm (in0[3], in1[0], 0x4321);
-              out1[0] = __byte_perm (in0[2], in0[3], 0x4321);
-              out0[3] = __byte_perm (in0[1], in0[2], 0x4321);
-              out0[2] = __byte_perm (in0[0], in0[1], 0x4321);
-              out0[1] = __byte_perm (     0, in0[0], 0x4321);
-              out0[0] = 0;
+    case  3:  out0[0] = __byte_perm (in0[0], in0[1], 0x6543);
+              out0[1] = __byte_perm (in0[1], in0[2], 0x6543);
+              out0[2] = __byte_perm (in0[2], in0[3], 0x6543);
+              out0[3] = __byte_perm (in0[3], in1[0], 0x6543);
+              out1[0] = __byte_perm (in1[0], in1[1], 0x6543);
+              out1[1] = __byte_perm (in1[1], in1[2], 0x6543);
+              out1[2] = __byte_perm (in1[2], in1[3], 0x6543);
+              out1[3] = __byte_perm (in1[3],      0, 0x6543);
               break;
-    case  8:  out1[3] = in1[1];
-              out1[2] = in1[0];
-              out1[1] = in0[3];
-              out1[0] = in0[2];
-              out0[3] = in0[1];
-              out0[2] = in0[0];
-              out0[1] = 0;
-              out0[0] = 0;
+    case  4:  out0[0] = in0[1];
+              out0[1] = in0[2];
+              out0[2] = in0[3];
+              out0[3] = in1[0];
+              out1[0] = in1[1];
+              out1[1] = in1[2];
+              out1[2] = in1[3];
+              out1[3] = 0;
               break;
-    case  9:  out1[3] = __byte_perm (in1[0], in1[1], 0x6543);
-              out1[2] = __byte_perm (in0[3], in1[0], 0x6543);
-              out1[1] = __byte_perm (in0[2], in0[3], 0x6543);
-              out1[0] = __byte_perm (in0[1], in0[2], 0x6543);
-              out0[3] = __byte_perm (in0[0], in0[1], 0x6543);
-              out0[2] = __byte_perm (     0, in0[0], 0x6543);
-              out0[1] = 0;
-              out0[0] = 0;
+    case  5:  out0[0] = __byte_perm (in0[1], in0[2], 0x4321);
+              out0[1] = __byte_perm (in0[2], in0[3], 0x4321);
+              out0[2] = __byte_perm (in0[3], in1[0], 0x4321);
+              out0[3] = __byte_perm (in1[0], in1[1], 0x4321);
+              out1[0] = __byte_perm (in1[1], in1[2], 0x4321);
+              out1[1] = __byte_perm (in1[2], in1[3], 0x4321);
+              out1[2] = __byte_perm (in1[3],      0, 0x4321);
+              out1[3] = 0;
               break;
-    case 10:  out1[3] = __byte_perm (in1[0], in1[1], 0x5432);
-              out1[2] = __byte_perm (in0[3], in1[0], 0x5432);
-              out1[1] = __byte_perm (in0[2], in0[3], 0x5432);
-              out1[0] = __byte_perm (in0[1], in0[2], 0x5432);
-              out0[3] = __byte_perm (in0[0], in0[1], 0x5432);
-              out0[2] = __byte_perm (     0, in0[0], 0x5432);
-              out0[1] = 0;
-              out0[0] = 0;
+    case  6:  out0[0] = __byte_perm (in0[1], in0[2], 0x5432);
+              out0[1] = __byte_perm (in0[2], in0[3], 0x5432);
+              out0[2] = __byte_perm (in0[3], in1[0], 0x5432);
+              out0[3] = __byte_perm (in1[0], in1[1], 0x5432);
+              out1[0] = __byte_perm (in1[1], in1[2], 0x5432);
+              out1[1] = __byte_perm (in1[2], in1[3], 0x5432);
+              out1[2] = __byte_perm (in1[3],      0, 0x5432);
+              out1[3] = 0;
               break;
-    case 11:  out1[3] = __byte_perm (in1[0], in1[1], 0x4321);
-              out1[2] = __byte_perm (in0[3], in1[0], 0x4321);
-              out1[1] = __byte_perm (in0[2], in0[3], 0x4321);
-              out1[0] = __byte_perm (in0[1], in0[2], 0x4321);
-              out0[3] = __byte_perm (in0[0], in0[1], 0x4321);
-              out0[2] = __byte_perm (     0, in0[0], 0x4321);
-              out0[1] = 0;
-              out0[0] = 0;
+    case  7:  out0[0] = __byte_perm (in0[1], in0[2], 0x6543);
+              out0[1] = __byte_perm (in0[2], in0[3], 0x6543);
+              out0[2] = __byte_perm (in0[3], in1[0], 0x6543);
+              out0[3] = __byte_perm (in1[0], in1[1], 0x6543);
+              out1[0] = __byte_perm (in1[1], in1[2], 0x6543);
+              out1[1] = __byte_perm (in1[2], in1[3], 0x6543);
+              out1[2] = __byte_perm (in1[3],      0, 0x6543);
+              out1[3] = 0;
               break;
-    case 12:  out1[3] = in1[0];
-              out1[2] = in0[3];
-              out1[1] = in0[2];
-              out1[0] = in0[1];
-              out0[3] = in0[0];
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+    case  8:  out0[0] = in0[2];
+              out0[1] = in0[3];
+              out0[2] = in1[0];
+              out0[3] = in1[1];
+              out1[0] = in1[2];
+              out1[1] = in1[3];
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 13:  out1[3] = __byte_perm (in0[3], in1[0], 0x6543);
-              out1[2] = __byte_perm (in0[2], in0[3], 0x6543);
-              out1[1] = __byte_perm (in0[1], in0[2], 0x6543);
-              out1[0] = __byte_perm (in0[0], in0[1], 0x6543);
-              out0[3] = __byte_perm (     0, in0[0], 0x6543);
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+    case  9:  out0[0] = __byte_perm (in0[2], in0[3], 0x4321);
+              out0[1] = __byte_perm (in0[3], in1[0], 0x4321);
+              out0[2] = __byte_perm (in1[0], in1[1], 0x4321);
+              out0[3] = __byte_perm (in1[1], in1[2], 0x4321);
+              out1[0] = __byte_perm (in1[2], in1[3], 0x4321);
+              out1[1] = __byte_perm (in1[3],      0, 0x4321);
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 14:  out1[3] = __byte_perm (in0[3], in1[0], 0x5432);
-              out1[2] = __byte_perm (in0[2], in0[3], 0x5432);
-              out1[1] = __byte_perm (in0[1], in0[2], 0x5432);
-              out1[0] = __byte_perm (in0[0], in0[1], 0x5432);
-              out0[3] = __byte_perm (     0, in0[0], 0x5432);
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+    case 10:  out0[0] = __byte_perm (in0[2], in0[3], 0x5432);
+              out0[1] = __byte_perm (in0[3], in1[0], 0x5432);
+              out0[2] = __byte_perm (in1[0], in1[1], 0x5432);
+              out0[3] = __byte_perm (in1[1], in1[2], 0x5432);
+              out1[0] = __byte_perm (in1[2], in1[3], 0x5432);
+              out1[1] = __byte_perm (in1[3],      0, 0x5432);
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 15:  out1[3] = __byte_perm (in0[3], in1[0], 0x4321);
-              out1[2] = __byte_perm (in0[2], in0[3], 0x4321);
-              out1[1] = __byte_perm (in0[1], in0[2], 0x4321);
-              out1[0] = __byte_perm (in0[0], in0[1], 0x4321);
-              out0[3] = __byte_perm (     0, in0[0], 0x4321);
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+    case 11:  out0[0] = __byte_perm (in0[2], in0[3], 0x6543);
+              out0[1] = __byte_perm (in0[3], in1[0], 0x6543);
+              out0[2] = __byte_perm (in1[0], in1[1], 0x6543);
+              out0[3] = __byte_perm (in1[1], in1[2], 0x6543);
+              out1[0] = __byte_perm (in1[2], in1[3], 0x6543);
+              out1[1] = __byte_perm (in1[3],      0, 0x6543);
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 16:  out1[3] = in0[3];
-              out1[2] = in0[2];
-              out1[1] = in0[1];
-              out1[0] = in0[0];
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+    case 12:  out0[0] = in0[3];
+              out0[1] = in1[0];
+              out0[2] = in1[1];
+              out0[3] = in1[2];
+              out1[0] = in1[3];
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 17:  out1[3] = __byte_perm (in0[2], in0[3], 0x6543);
-              out1[2] = __byte_perm (in0[1], in0[2], 0x6543);
-              out1[1] = __byte_perm (in0[0], in0[1], 0x6543);
-              out1[0] = __byte_perm (     0, in0[0], 0x6543);
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+    case 13:
+              out0[0] = __byte_perm (in0[3], in1[0], 0x4321);
+              out0[1] = __byte_perm (in1[0], in1[1], 0x4321);
+              out0[2] = __byte_perm (in1[1], in1[2], 0x4321);
+              out0[3] = __byte_perm (in1[2], in1[3], 0x4321);
+              out1[0] = __byte_perm (in1[3],      0, 0x4321);
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 18:  out1[3] = __byte_perm (in0[2], in0[3], 0x5432);
-              out1[2] = __byte_perm (in0[1], in0[2], 0x5432);
-              out1[1] = __byte_perm (in0[0], in0[1], 0x5432);
-              out1[0] = __byte_perm (     0, in0[0], 0x5432);
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+    case 14:  out0[0] = __byte_perm (in0[3], in1[0], 0x5432);
+              out0[1] = __byte_perm (in1[0], in1[1], 0x5432);
+              out0[2] = __byte_perm (in1[1], in1[2], 0x5432);
+              out0[3] = __byte_perm (in1[2], in1[3], 0x5432);
+              out1[0] = __byte_perm (in1[3],      0, 0x5432);
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 19:  out1[3] = __byte_perm (in0[2], in0[3], 0x4321);
-              out1[2] = __byte_perm (in0[1], in0[2], 0x4321);
-              out1[1] = __byte_perm (in0[0], in0[1], 0x4321);
-              out1[0] = __byte_perm (     0, in0[0], 0x4321);
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+    case 15:  out0[0] = __byte_perm (in0[3], in1[0], 0x6543);
+              out0[1] = __byte_perm (in1[0], in1[1], 0x6543);
+              out0[2] = __byte_perm (in1[1], in1[2], 0x6543);
+              out0[3] = __byte_perm (in1[2], in1[3], 0x6543);
+              out1[0] = __byte_perm (in1[3],      0, 0x6543);
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 20:  out1[3] = in0[2];
-              out1[2] = in0[1];
-              out1[1] = in0[0];
+    case 16:  out0[0] = in1[0];
+              out0[1] = in1[1];
+              out0[2] = in1[2];
+              out0[3] = in1[3];
               out1[0] = 0;
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 21:  out1[3] = __byte_perm (in0[1], in0[2], 0x6543);
-              out1[2] = __byte_perm (in0[0], in0[1], 0x6543);
-              out1[1] = __byte_perm (     0, in0[0], 0x6543);
+    case 17:  out0[0] = __byte_perm (in1[0], in1[1], 0x4321);
+              out0[1] = __byte_perm (in1[1], in1[2], 0x4321);
+              out0[2] = __byte_perm (in1[2], in1[3], 0x4321);
+              out0[3] = __byte_perm (in1[3],      0, 0x4321);
               out1[0] = 0;
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 22:  out1[3] = __byte_perm (in0[1], in0[2], 0x5432);
-              out1[2] = __byte_perm (in0[0], in0[1], 0x5432);
-              out1[1] = __byte_perm (     0, in0[0], 0x5432);
+    case 18:  out0[0] = __byte_perm (in1[0], in1[1], 0x5432);
+              out0[1] = __byte_perm (in1[1], in1[2], 0x5432);
+              out0[2] = __byte_perm (in1[2], in1[3], 0x5432);
+              out0[3] = __byte_perm (in1[3],      0, 0x5432);
               out1[0] = 0;
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 23:  out1[3] = __byte_perm (in0[1], in0[2], 0x4321);
-              out1[2] = __byte_perm (in0[0], in0[1], 0x4321);
-              out1[1] = __byte_perm (     0, in0[0], 0x4321);
+    case 19:  out0[0] = __byte_perm (in1[0], in1[1], 0x6543);
+              out0[1] = __byte_perm (in1[1], in1[2], 0x6543);
+              out0[2] = __byte_perm (in1[2], in1[3], 0x6543);
+              out0[3] = __byte_perm (in1[3],      0, 0x6543);
               out1[0] = 0;
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
-              break;
-    case 24:  out1[3] = in0[1];
-              out1[2] = in0[0];
               out1[1] = 0;
-              out1[0] = 0;
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 25:  out1[3] = __byte_perm (in0[0], in0[1], 0x6543);
-              out1[2] = __byte_perm (     0, in0[0], 0x6543);
-              out1[1] = 0;
-              out1[0] = 0;
+    case 20:  out0[0] = in1[1];
+              out0[1] = in1[2];
+              out0[2] = in1[3];
               out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
-              break;
-    case 26:  out1[3] = __byte_perm (in0[0], in0[1], 0x5432);
-              out1[2] = __byte_perm (     0, in0[0], 0x5432);
-              out1[1] = 0;
               out1[0] = 0;
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
-              break;
-    case 27:  out1[3] = __byte_perm (in0[0], in0[1], 0x4321);
-              out1[2] = __byte_perm (     0, in0[0], 0x4321);
               out1[1] = 0;
-              out1[0] = 0;
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
-              break;
-    case 28:  out1[3] = in0[0];
               out1[2] = 0;
-              out1[1] = 0;
-              out1[0] = 0;
-              out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+              out1[3] = 0;
               break;
-    case 29:  out1[3] = __byte_perm (     0, in0[0], 0x6543);
-              out1[2] = 0;
-              out1[1] = 0;
-              out1[0] = 0;
+    case 21:  out0[0] = __byte_perm (in1[1], in1[2], 0x4321);
+              out0[1] = __byte_perm (in1[2], in1[3], 0x4321);
+              out0[2] = __byte_perm (in1[3],      0, 0x4321);
               out0[3] = 0;
-              out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
-              break;
-    case 30:  out1[3] = __byte_perm (     0, in0[0], 0x5432);
+              out1[0] = 0;
+              out1[1] = 0;
               out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 22:  out0[0] = __byte_perm (in1[1], in1[2], 0x5432);
+              out0[1] = __byte_perm (in1[2], in1[3], 0x5432);
+              out0[2] = __byte_perm (in1[3],      0, 0x5432);
+              out0[3] = 0;
+              out1[0] = 0;
               out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 23:  out0[0] = __byte_perm (in1[1], in1[2], 0x6543);
+              out0[1] = __byte_perm (in1[2], in1[3], 0x6543);
+              out0[2] = __byte_perm (in1[3],      0, 0x6543);
+              out0[3] = 0;
               out1[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 24:  out0[0] = in1[2];
+              out0[1] = in1[3];
+              out0[2] = 0;
               out0[3] = 0;
+              out1[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 25:  out0[0] = __byte_perm (in1[2], in1[3], 0x4321);
+              out0[1] = __byte_perm (in1[3],      0, 0x4321);
               out0[2] = 0;
-              out0[1] = 0;
-              out0[0] = 0;
+              out0[3] = 0;
+              out1[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
-    case 31:  out1[3] = __byte_perm (     0, in0[0], 0x4321);
+    case 26:  out0[0] = __byte_perm (in1[2], in1[3], 0x5432);
+              out0[1] = __byte_perm (in1[3],      0, 0x5432);
+              out0[2] = 0;
+              out0[3] = 0;
+              out1[0] = 0;
+              out1[1] = 0;
               out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 27:  out0[0] = __byte_perm (in1[2], in1[3], 0x6543);
+              out0[1] = __byte_perm (in1[3],      0, 0x6543);
+              out0[2] = 0;
+              out0[3] = 0;
+              out1[0] = 0;
               out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 28:  out0[0] = in1[3];
+              out0[1] = 0;
+              out0[2] = 0;
+              out0[3] = 0;
               out1[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 29:  out0[0] = __byte_perm (in1[3],     0, 0x4321);
+              out0[1] = 0;
+              out0[2] = 0;
               out0[3] = 0;
+              out1[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 30:  out0[0] =  __byte_perm (in1[3],     0, 0x5432);
+              out0[1] = 0;
               out0[2] = 0;
+              out0[3] = 0;
+              out1[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
+              break;
+    case 31:  out0[0] =  __byte_perm (in1[3],     0, 0x6543);
               out0[1] = 0;
-              out0[0] = 0;
+              out0[2] = 0;
+              out0[3] = 0;
+              out1[0] = 0;
+              out1[1] = 0;
+              out1[2] = 0;
+              out1[3] = 0;
               break;
   }
   #endif
@@ -766,294 +767,293 @@ static void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32
   #ifdef IS_NV
   switch (num)
   {
-    case  0:  out0[0] = in0[0];
-              out0[1] = in0[1];
-              out0[2] = in0[2];
-              out0[3] = in0[3];
-              out1[0] = in1[0];
-              out1[1] = in1[1];
+    case  0:  out1[3] = in1[3];
               out1[2] = in1[2];
-              out1[3] = in1[3];
-              break;
-    case  1:  out0[0] = __byte_perm (in0[0], in0[1], 0x4321);
-              out0[1] = __byte_perm (in0[1], in0[2], 0x4321);
-              out0[2] = __byte_perm (in0[2], in0[3], 0x4321);
-              out0[3] = __byte_perm (in0[3], in1[0], 0x4321);
-              out1[0] = __byte_perm (in1[0], in1[1], 0x4321);
-              out1[1] = __byte_perm (in1[1], in1[2], 0x4321);
-              out1[2] = __byte_perm (in1[2], in1[3], 0x4321);
-              out1[3] = __byte_perm (in1[3],      0, 0x4321);
-              break;
-    case  2:  out0[0] = __byte_perm (in0[0], in0[1], 0x5432);
-              out0[1] = __byte_perm (in0[1], in0[2], 0x5432);
-              out0[2] = __byte_perm (in0[2], in0[3], 0x5432);
-              out0[3] = __byte_perm (in0[3], in1[0], 0x5432);
-              out1[0] = __byte_perm (in1[0], in1[1], 0x5432);
-              out1[1] = __byte_perm (in1[1], in1[2], 0x5432);
-              out1[2] = __byte_perm (in1[2], in1[3], 0x5432);
-              out1[3] = __byte_perm (in1[3],      0, 0x5432);
-              break;
-    case  3:  out0[0] = __byte_perm (in0[0], in0[1], 0x6543);
-              out0[1] = __byte_perm (in0[1], in0[2], 0x6543);
-              out0[2] = __byte_perm (in0[2], in0[3], 0x6543);
-              out0[3] = __byte_perm (in0[3], in1[0], 0x6543);
-              out1[0] = __byte_perm (in1[0], in1[1], 0x6543);
-              out1[1] = __byte_perm (in1[1], in1[2], 0x6543);
-              out1[2] = __byte_perm (in1[2], in1[3], 0x6543);
-              out1[3] = __byte_perm (in1[3],      0, 0x6543);
-              break;
-    case  4:  out0[0] = in0[1];
-              out0[1] = in0[2];
-              out0[2] = in0[3];
-              out0[3] = in1[0];
-              out1[0] = in1[1];
-              out1[1] = in1[2];
-              out1[2] = in1[3];
-              out1[3] = 0;
-              break;
-    case  5:  out0[0] = __byte_perm (in0[1], in0[2], 0x4321);
-              out0[1] = __byte_perm (in0[2], in0[3], 0x4321);
-              out0[2] = __byte_perm (in0[3], in1[0], 0x4321);
-              out0[3] = __byte_perm (in1[0], in1[1], 0x4321);
-              out1[0] = __byte_perm (in1[1], in1[2], 0x4321);
-              out1[1] = __byte_perm (in1[2], in1[3], 0x4321);
-              out1[2] = __byte_perm (in1[3],      0, 0x4321);
-              out1[3] = 0;
-              break;
-    case  6:  out0[0] = __byte_perm (in0[1], in0[2], 0x5432);
-              out0[1] = __byte_perm (in0[2], in0[3], 0x5432);
-              out0[2] = __byte_perm (in0[3], in1[0], 0x5432);
-              out0[3] = __byte_perm (in1[0], in1[1], 0x5432);
-              out1[0] = __byte_perm (in1[1], in1[2], 0x5432);
-              out1[1] = __byte_perm (in1[2], in1[3], 0x5432);
-              out1[2] = __byte_perm (in1[3],      0, 0x5432);
-              out1[3] = 0;
-              break;
-    case  7:  out0[0] = __byte_perm (in0[1], in0[2], 0x6543);
-              out0[1] = __byte_perm (in0[2], in0[3], 0x6543);
-              out0[2] = __byte_perm (in0[3], in1[0], 0x6543);
-              out0[3] = __byte_perm (in1[0], in1[1], 0x6543);
-              out1[0] = __byte_perm (in1[1], in1[2], 0x6543);
-              out1[1] = __byte_perm (in1[2], in1[3], 0x6543);
-              out1[2] = __byte_perm (in1[3],      0, 0x6543);
-              out1[3] = 0;
-              break;
-    case  8:  out0[0] = in0[2];
-              out0[1] = in0[3];
-              out0[2] = in1[0];
-              out0[3] = in1[1];
-              out1[0] = in1[2];
-              out1[1] = in1[3];
-              out1[2] = 0;
-              out1[3] = 0;
-              break;
-    case  9:  out0[0] = __byte_perm (in0[2], in0[3], 0x4321);
-              out0[1] = __byte_perm (in0[3], in1[0], 0x4321);
-              out0[2] = __byte_perm (in1[0], in1[1], 0x4321);
-              out0[3] = __byte_perm (in1[1], in1[2], 0x4321);
-              out1[0] = __byte_perm (in1[2], in1[3], 0x4321);
-              out1[1] = __byte_perm (in1[3],      0, 0x4321);
-              out1[2] = 0;
-              out1[3] = 0;
-              break;
-    case 10:  out0[0] = __byte_perm (in0[2], in0[3], 0x5432);
-              out0[1] = __byte_perm (in0[3], in1[0], 0x5432);
-              out0[2] = __byte_perm (in1[0], in1[1], 0x5432);
-              out0[3] = __byte_perm (in1[1], in1[2], 0x5432);
-              out1[0] = __byte_perm (in1[2], in1[3], 0x5432);
-              out1[1] = __byte_perm (in1[3],      0, 0x5432);
-              out1[2] = 0;
-              out1[3] = 0;
-              break;
-    case 11:  out0[0] = __byte_perm (in0[2], in0[3], 0x6543);
-              out0[1] = __byte_perm (in0[3], in1[0], 0x6543);
-              out0[2] = __byte_perm (in1[0], in1[1], 0x6543);
-              out0[3] = __byte_perm (in1[1], in1[2], 0x6543);
-              out1[0] = __byte_perm (in1[2], in1[3], 0x6543);
-              out1[1] = __byte_perm (in1[3],      0, 0x6543);
-              out1[2] = 0;
-              out1[3] = 0;
-              break;
-    case 12:  out0[0] = in0[3];
-              out0[1] = in1[0];
-              out0[2] = in1[1];
-              out0[3] = in1[2];
-              out1[0] = in1[3];
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
-              break;
-    case 13:
-              out0[0] = __byte_perm (in0[3], in1[0], 0x4321);
-              out0[1] = __byte_perm (in1[0], in1[1], 0x4321);
-              out0[2] = __byte_perm (in1[1], in1[2], 0x4321);
-              out0[3] = __byte_perm (in1[2], in1[3], 0x4321);
-              out1[0] = __byte_perm (in1[3],      0, 0x4321);
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+              out1[1] = in1[1];
+              out1[0] = in1[0];
+              out0[3] = in0[3];
+              out0[2] = in0[2];
+              out0[1] = in0[1];
+              out0[0] = in0[0];
               break;
-    case 14:  out0[0] = __byte_perm (in0[3], in1[0], 0x5432);
-              out0[1] = __byte_perm (in1[0], in1[1], 0x5432);
-              out0[2] = __byte_perm (in1[1], in1[2], 0x5432);
-              out0[3] = __byte_perm (in1[2], in1[3], 0x5432);
-              out1[0] = __byte_perm (in1[3],      0, 0x5432);
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  1:  out1[3] = __byte_perm (in1[2], in1[3], 0x6543);
+              out1[2] = __byte_perm (in1[1], in1[2], 0x6543);
+              out1[1] = __byte_perm (in1[0], in1[1], 0x6543);
+              out1[0] = __byte_perm (in0[3], in1[0], 0x6543);
+              out0[3] = __byte_perm (in0[2], in0[3], 0x6543);
+              out0[2] = __byte_perm (in0[1], in0[2], 0x6543);
+              out0[1] = __byte_perm (in0[0], in0[1], 0x6543);
+              out0[0] = __byte_perm (     0, in0[0], 0x6543);
               break;
-    case 15:  out0[0] = __byte_perm (in0[3], in1[0], 0x6543);
-              out0[1] = __byte_perm (in1[0], in1[1], 0x6543);
-              out0[2] = __byte_perm (in1[1], in1[2], 0x6543);
-              out0[3] = __byte_perm (in1[2], in1[3], 0x6543);
-              out1[0] = __byte_perm (in1[3],      0, 0x6543);
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  2:  out1[3] = __byte_perm (in1[2], in1[3], 0x5432);
+              out1[2] = __byte_perm (in1[1], in1[2], 0x5432);
+              out1[1] = __byte_perm (in1[0], in1[1], 0x5432);
+              out1[0] = __byte_perm (in0[3], in1[0], 0x5432);
+              out0[3] = __byte_perm (in0[2], in0[3], 0x5432);
+              out0[2] = __byte_perm (in0[1], in0[2], 0x5432);
+              out0[1] = __byte_perm (in0[0], in0[1], 0x5432);
+              out0[0] = __byte_perm (     0, in0[0], 0x5432);
               break;
-    case 16:  out0[0] = in1[0];
-              out0[1] = in1[1];
-              out0[2] = in1[2];
-              out0[3] = in1[3];
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  3:  out1[3] = __byte_perm (in1[2], in1[3], 0x4321);
+              out1[2] = __byte_perm (in1[1], in1[2], 0x4321);
+              out1[1] = __byte_perm (in1[0], in1[1], 0x4321);
+              out1[0] = __byte_perm (in0[3], in1[0], 0x4321);
+              out0[3] = __byte_perm (in0[2], in0[3], 0x4321);
+              out0[2] = __byte_perm (in0[1], in0[2], 0x4321);
+              out0[1] = __byte_perm (in0[0], in0[1], 0x4321);
+              out0[0] = __byte_perm (     0, in0[0], 0x4321);
               break;
-    case 17:  out0[0] = __byte_perm (in1[0], in1[1], 0x4321);
-              out0[1] = __byte_perm (in1[1], in1[2], 0x4321);
-              out0[2] = __byte_perm (in1[2], in1[3], 0x4321);
-              out0[3] = __byte_perm (in1[3],      0, 0x4321);
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  4:  out1[3] = in1[2];
+              out1[2] = in1[1];
+              out1[1] = in1[0];
+              out1[0] = in0[3];
+              out0[3] = in0[2];
+              out0[2] = in0[1];
+              out0[1] = in0[0];
+              out0[0] = 0;
               break;
-    case 18:  out0[0] = __byte_perm (in1[0], in1[1], 0x5432);
-              out0[1] = __byte_perm (in1[1], in1[2], 0x5432);
-              out0[2] = __byte_perm (in1[2], in1[3], 0x5432);
-              out0[3] = __byte_perm (in1[3],      0, 0x5432);
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  5:  out1[3] = __byte_perm (in1[1], in1[2], 0x6543);
+              out1[2] = __byte_perm (in1[0], in1[1], 0x6543);
+              out1[1] = __byte_perm (in0[3], in1[0], 0x6543);
+              out1[0] = __byte_perm (in0[2], in0[3], 0x6543);
+              out0[3] = __byte_perm (in0[1], in0[2], 0x6543);
+              out0[2] = __byte_perm (in0[0], in0[1], 0x6543);
+              out0[1] = __byte_perm (     0, in0[0], 0x6543);
+              out0[0] = 0;
               break;
-    case 19:  out0[0] = __byte_perm (in1[0], in1[1], 0x6543);
-              out0[1] = __byte_perm (in1[1], in1[2], 0x6543);
-              out0[2] = __byte_perm (in1[2], in1[3], 0x6543);
-              out0[3] = __byte_perm (in1[3],      0, 0x6543);
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  6:  out1[3] = __byte_perm (in1[1], in1[2], 0x5432);
+              out1[2] = __byte_perm (in1[0], in1[1], 0x5432);
+              out1[1] = __byte_perm (in0[3], in1[0], 0x5432);
+              out1[0] = __byte_perm (in0[2], in0[3], 0x5432);
+              out0[3] = __byte_perm (in0[1], in0[2], 0x5432);
+              out0[2] = __byte_perm (in0[0], in0[1], 0x5432);
+              out0[1] = __byte_perm (     0, in0[0], 0x5432);
+              out0[0] = 0;
               break;
-    case 20:  out0[0] = in1[1];
-              out0[1] = in1[2];
-              out0[2] = in1[3];
-              out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  7:  out1[3] = __byte_perm (in1[1], in1[2], 0x4321);
+              out1[2] = __byte_perm (in1[0], in1[1], 0x4321);
+              out1[1] = __byte_perm (in0[3], in1[0], 0x4321);
+              out1[0] = __byte_perm (in0[2], in0[3], 0x4321);
+              out0[3] = __byte_perm (in0[1], in0[2], 0x4321);
+              out0[2] = __byte_perm (in0[0], in0[1], 0x4321);
+              out0[1] = __byte_perm (     0, in0[0], 0x4321);
+              out0[0] = 0;
               break;
-    case 21:  out0[0] = __byte_perm (in1[1], in1[2], 0x4321);
-              out0[1] = __byte_perm (in1[2], in1[3], 0x4321);
-              out0[2] = __byte_perm (in1[3],      0, 0x4321);
-              out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  8:  out1[3] = in1[1];
+              out1[2] = in1[0];
+              out1[1] = in0[3];
+              out1[0] = in0[2];
+              out0[3] = in0[1];
+              out0[2] = in0[0];
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 22:  out0[0] = __byte_perm (in1[1], in1[2], 0x5432);
-              out0[1] = __byte_perm (in1[2], in1[3], 0x5432);
-              out0[2] = __byte_perm (in1[3],      0, 0x5432);
-              out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case  9:  out1[3] = __byte_perm (in1[0], in1[1], 0x6543);
+              out1[2] = __byte_perm (in0[3], in1[0], 0x6543);
+              out1[1] = __byte_perm (in0[2], in0[3], 0x6543);
+              out1[0] = __byte_perm (in0[1], in0[2], 0x6543);
+              out0[3] = __byte_perm (in0[0], in0[1], 0x6543);
+              out0[2] = __byte_perm (     0, in0[0], 0x6543);
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 23:  out0[0] = __byte_perm (in1[1], in1[2], 0x6543);
-              out0[1] = __byte_perm (in1[2], in1[3], 0x6543);
-              out0[2] = __byte_perm (in1[3],      0, 0x6543);
-              out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case 10:  out1[3] = __byte_perm (in1[0], in1[1], 0x5432);
+              out1[2] = __byte_perm (in0[3], in1[0], 0x5432);
+              out1[1] = __byte_perm (in0[2], in0[3], 0x5432);
+              out1[0] = __byte_perm (in0[1], in0[2], 0x5432);
+              out0[3] = __byte_perm (in0[0], in0[1], 0x5432);
+              out0[2] = __byte_perm (     0, in0[0], 0x5432);
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 24:  out0[0] = in1[2];
-              out0[1] = in1[3];
-              out0[2] = 0;
-              out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+    case 11:  out1[3] = __byte_perm (in1[0], in1[1], 0x4321);
+              out1[2] = __byte_perm (in0[3], in1[0], 0x4321);
+              out1[1] = __byte_perm (in0[2], in0[3], 0x4321);
+              out1[0] = __byte_perm (in0[1], in0[2], 0x4321);
+              out0[3] = __byte_perm (in0[0], in0[1], 0x4321);
+              out0[2] = __byte_perm (     0, in0[0], 0x4321);
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 25:  out0[0] = __byte_perm (in1[2], in1[3], 0x4321);
-              out0[1] = __byte_perm (in1[3],      0, 0x4321);
+    case 12:  out1[3] = in1[0];
+              out1[2] = in0[3];
+              out1[1] = in0[2];
+              out1[0] = in0[1];
+              out0[3] = in0[0];
               out0[2] = 0;
-              out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 26:  out0[0] = __byte_perm (in1[2], in1[3], 0x5432);
-              out0[1] = __byte_perm (in1[3],      0, 0x5432);
+    case 13:  out1[3] = __byte_perm (in0[3], in1[0], 0x6543);
+              out1[2] = __byte_perm (in0[2], in0[3], 0x6543);
+              out1[1] = __byte_perm (in0[1], in0[2], 0x6543);
+              out1[0] = __byte_perm (in0[0], in0[1], 0x6543);
+              out0[3] = __byte_perm (     0, in0[0], 0x6543);
               out0[2] = 0;
-              out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 27:  out0[0] = __byte_perm (in1[2], in1[3], 0x6543);
-              out0[1] = __byte_perm (in1[3],      0, 0x6543);
+    case 14:  out1[3] = __byte_perm (in0[3], in1[0], 0x5432);
+              out1[2] = __byte_perm (in0[2], in0[3], 0x5432);
+              out1[1] = __byte_perm (in0[1], in0[2], 0x5432);
+              out1[0] = __byte_perm (in0[0], in0[1], 0x5432);
+              out0[3] = __byte_perm (     0, in0[0], 0x5432);
               out0[2] = 0;
-              out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
-              break;
-    case 28:  out0[0] = in1[3];
               out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 15:  out1[3] = __byte_perm (in0[3], in1[0], 0x4321);
+              out1[2] = __byte_perm (in0[2], in0[3], 0x4321);
+              out1[1] = __byte_perm (in0[1], in0[2], 0x4321);
+              out1[0] = __byte_perm (in0[0], in0[1], 0x4321);
+              out0[3] = __byte_perm (     0, in0[0], 0x4321);
               out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 16:  out1[3] = in0[3];
+              out1[2] = in0[2];
+              out1[1] = in0[1];
+              out1[0] = in0[0];
               out0[3] = 0;
-              out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 29:  out0[0] = __byte_perm (in1[3],     0, 0x4321);
+    case 17:  out1[3] = __byte_perm (in0[2], in0[3], 0x6543);
+              out1[2] = __byte_perm (in0[1], in0[2], 0x6543);
+              out1[1] = __byte_perm (in0[0], in0[1], 0x6543);
+              out1[0] = __byte_perm (     0, in0[0], 0x6543);
+              out0[3] = 0;
+              out0[2] = 0;
               out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 18:  out1[3] = __byte_perm (in0[2], in0[3], 0x5432);
+              out1[2] = __byte_perm (in0[1], in0[2], 0x5432);
+              out1[1] = __byte_perm (in0[0], in0[1], 0x5432);
+              out1[0] = __byte_perm (     0, in0[0], 0x5432);
+              out0[3] = 0;
               out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 19:  out1[3] = __byte_perm (in0[2], in0[3], 0x4321);
+              out1[2] = __byte_perm (in0[1], in0[2], 0x4321);
+              out1[1] = __byte_perm (in0[0], in0[1], 0x4321);
+              out1[0] = __byte_perm (     0, in0[0], 0x4321);
               out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 20:  out1[3] = in0[2];
+              out1[2] = in0[1];
+              out1[1] = in0[0];
               out1[0] = 0;
-              out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+              out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 30:  out0[0] =  __byte_perm (in1[3],     0, 0x5432);
+    case 21:  out1[3] = __byte_perm (in0[1], in0[2], 0x6543);
+              out1[2] = __byte_perm (in0[0], in0[1], 0x6543);
+              out1[1] = __byte_perm (     0, in0[0], 0x6543);
+              out1[0] = 0;
+              out0[3] = 0;
+              out0[2] = 0;
               out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 22:  out1[3] = __byte_perm (in0[1], in0[2], 0x5432);
+              out1[2] = __byte_perm (in0[0], in0[1], 0x5432);
+              out1[1] = __byte_perm (     0, in0[0], 0x5432);
+              out1[0] = 0;
+              out0[3] = 0;
               out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 23:  out1[3] = __byte_perm (in0[1], in0[2], 0x4321);
+              out1[2] = __byte_perm (in0[0], in0[1], 0x4321);
+              out1[1] = __byte_perm (     0, in0[0], 0x4321);
+              out1[0] = 0;
               out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 24:  out1[3] = in0[1];
+              out1[2] = in0[0];
+              out1[1] = 0;
               out1[0] = 0;
+              out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 25:  out1[3] = __byte_perm (in0[0], in0[1], 0x6543);
+              out1[2] = __byte_perm (     0, in0[0], 0x6543);
               out1[1] = 0;
-              out1[2] = 0;
-              out1[3] = 0;
+              out1[0] = 0;
+              out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
               break;
-    case 31:  out0[0] =  __byte_perm (in1[3],     0, 0x6543);
+    case 26:  out1[3] = __byte_perm (in0[0], in0[1], 0x5432);
+              out1[2] = __byte_perm (     0, in0[0], 0x5432);
+              out1[1] = 0;
+              out1[0] = 0;
+              out0[3] = 0;
+              out0[2] = 0;
               out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 27:  out1[3] = __byte_perm (in0[0], in0[1], 0x4321);
+              out1[2] = __byte_perm (     0, in0[0], 0x4321);
+              out1[1] = 0;
+              out1[0] = 0;
+              out0[3] = 0;
               out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 28:  out1[3] = in0[0];
+              out1[2] = 0;
+              out1[1] = 0;
+              out1[0] = 0;
               out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 29:  out1[3] = __byte_perm (     0, in0[0], 0x6543);
+              out1[2] = 0;
+              out1[1] = 0;
               out1[0] = 0;
+              out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 30:  out1[3] = __byte_perm (     0, in0[0], 0x5432);
+              out1[2] = 0;
               out1[1] = 0;
+              out1[0] = 0;
+              out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
+              break;
+    case 31:  out1[3] = __byte_perm (     0, in0[0], 0x4321);
               out1[2] = 0;
-              out1[3] = 0;
+              out1[1] = 0;
+              out1[0] = 0;
+              out0[3] = 0;
+              out0[2] = 0;
+              out0[1] = 0;
+              out0[0] = 0;
               break;
   }
   #endif