+#ifdef IS_NV
+
+#define KXX_DECL /* expands to nothing when IS_NV is defined; NOTE(review): use sites are outside this chunk - presumably a declaration qualifier, confirm */
+#define sXXX_DECL /* likewise expands to nothing when IS_NV is defined */
+
+#if CUDA_ARCH >= 500
+
+//
+// Bitslice DES S-boxes with LOP3.LUT instructions
+// For NVIDIA Maxwell architecture and CUDA 7.5 RC
+// by DeepLearningJohnDoe, version 0.1.6, 2015/07/19
+//
+// Gate counts: 25 24 25 18 25 24 24 23
+// Average: 23.5
+// Depth: 8 7 7 6 8 10 10 8
+// Average: 8
+//
+// Note that an implementation of the same S-box function with a lower gate count isn't necessarily faster.
+//
+// These Boolean expressions corresponding to DES S-boxes were
+// discovered by <deeplearningjohndoe at gmail.com>
+//
+// This file itself is Copyright (c) 2015 by <deeplearningjohndoe at gmail.com>
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+//
+// The underlying mathematical formulas are NOT copyrighted.
+//
+
+#define LUT(a,b,c,d,e) u32 a; asm ("lop3.b32 %0, %1, %2, %3, "#e";" : "=r"(a): "r"(b), "r"(c), "r"(d)); /* declares a new u32 `a` and sets it with one lop3.b32: a bitwise 3-input function of b, c, d chosen by the immediate e. e is stringized into the PTX text, so it must be a literal; the expansion is a declaration plus a statement ending in ';', so use it only at statement level and once per name per scope. */
+
+static void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 1, bitslice form: 25 LUT() gates, each one lop3.b32 instruction.
+   * a1..a6 are the six input words; all gates are bitwise, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * Every LUT() line declares a new u32 named by its first argument.
+   * x1..x4 are the four results; they are XORed into *out1..*out4, not
+   * assigned. a5 is used only in the final gate of each result.
+   * NOTE(review): the long hex names look like truth tables of the
+   * intermediate values - presumably emitted by the generator; not verified.
+   */
+ LUT(xAA55AA5500550055, a1, a4, a6, 0xC1)
+ LUT(xA55AA55AF0F5F0F5, a3, a6, xAA55AA5500550055, 0x9E)
+ LUT(x5F5F5F5FA5A5A5A5, a1, a3, a6, 0xD6)
+ LUT(xF5A0F5A0A55AA55A, a4, xAA55AA5500550055, x5F5F5F5FA5A5A5A5, 0x56)
+ LUT(x947A947AD1E7D1E7, a2, xA55AA55AF0F5F0F5, xF5A0F5A0A55AA55A, 0x6C)
+ LUT(x5FFF5FFFFFFAFFFA, a6, xAA55AA5500550055, x5F5F5F5FA5A5A5A5, 0x7B)
+ LUT(xB96CB96C69936993, a2, xF5A0F5A0A55AA55A, x5FFF5FFFFFFAFFFA, 0xD6)
+ LUT(x3, a5, x947A947AD1E7D1E7, xB96CB96C69936993, 0x6A)
+ LUT(x55EE55EE55EE55EE, a1, a2, a4, 0x7A)
+ LUT(x084C084CB77BB77B, a2, a6, xF5A0F5A0A55AA55A, 0xC9)
+ LUT(x9C329C32E295E295, x947A947AD1E7D1E7, x55EE55EE55EE55EE, x084C084CB77BB77B, 0x72)
+ LUT(xA51EA51E50E050E0, a3, a6, x55EE55EE55EE55EE, 0x29)
+ LUT(x4AD34AD3BE3CBE3C, a2, x947A947AD1E7D1E7, xA51EA51E50E050E0, 0x95)
+ LUT(x2, a5, x9C329C32E295E295, x4AD34AD3BE3CBE3C, 0xC6)
+ LUT(xD955D95595D195D1, a1, a2, x9C329C32E295E295, 0xD2)
+ LUT(x8058805811621162, x947A947AD1E7D1E7, x55EE55EE55EE55EE, x084C084CB77BB77B, 0x90)
+ LUT(x7D0F7D0FC4B3C4B3, xA51EA51E50E050E0, xD955D95595D195D1, x8058805811621162, 0x76)
+ LUT(x0805080500010001, a3, xAA55AA5500550055, xD955D95595D195D1, 0x80)
+ LUT(x4A964A96962D962D, xB96CB96C69936993, x4AD34AD3BE3CBE3C, x0805080500010001, 0xA6)
+ LUT(x4, a5, x7D0F7D0FC4B3C4B3, x4A964A96962D962D, 0xA6)
+ LUT(x148014807B087B08, a1, xAA55AA5500550055, x947A947AD1E7D1E7, 0x21)
+ LUT(x94D894D86B686B68, xA55AA55AF0F5F0F5, x8058805811621162, x148014807B087B08, 0x6A)
+ LUT(x5555555540044004, a1, a6, x084C084CB77BB77B, 0x70)
+ LUT(xAFB4AFB4BF5BBF5B, x5F5F5F5FA5A5A5A5, xA51EA51E50E050E0, x5555555540044004, 0x97)
+ LUT(x1, a5, x94D894D86B686B68, xAFB4AFB4BF5BBF5B, 0x6C)
+ // XOR the four results into the caller-provided words.
+ *out1 ^= x1;
+ *out2 ^= x2;
+ *out3 ^= x3;
+ *out4 ^= x4;
+}
+
+static void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 2, bitslice form: 24 LUT() gates, each one lop3.b32 instruction.
+   * a1..a6 are the six input words; all gates are bitwise, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * Every LUT() line declares a new u32 named by its first argument.
+   * x1..x4 are the four results; they are XORed into *out1..*out4, not
+   * assigned. a4 is used only in the final gate of each result.
+   * NOTE(review): the long hex names look like truth tables of the
+   * intermediate values - presumably emitted by the generator; not verified.
+   */
+ LUT(xEEEEEEEE99999999, a1, a2, a6, 0x97)
+ LUT(xFFFFEEEE66666666, a5, a6, xEEEEEEEE99999999, 0x67)
+ LUT(x5555FFFFFFFF0000, a1, a5, a6, 0x76)
+ LUT(x6666DDDD5555AAAA, a2, xFFFFEEEE66666666, x5555FFFFFFFF0000, 0x69)
+ LUT(x6969D3D35353ACAC, a3, xFFFFEEEE66666666, x6666DDDD5555AAAA, 0x6A)
+ LUT(xCFCF3030CFCF3030, a2, a3, a5, 0x65)
+ LUT(xE4E4EEEE9999F0F0, a3, xEEEEEEEE99999999, x5555FFFFFFFF0000, 0x8D)
+ LUT(xE5E5BABACDCDB0B0, a1, xCFCF3030CFCF3030, xE4E4EEEE9999F0F0, 0xCA)
+ LUT(x3, a4, x6969D3D35353ACAC, xE5E5BABACDCDB0B0, 0xC6)
+ LUT(x3333CCCC00000000, a2, a5, a6, 0x14)
+ LUT(xCCCCDDDDFFFF0F0F, a5, xE4E4EEEE9999F0F0, x3333CCCC00000000, 0xB5)
+ LUT(x00000101F0F0F0F0, a3, a6, xFFFFEEEE66666666, 0x1C)
+ LUT(x9A9A64646A6A9595, a1, xCFCF3030CFCF3030, x00000101F0F0F0F0, 0x96)
+ LUT(x2, a4, xCCCCDDDDFFFF0F0F, x9A9A64646A6A9595, 0x6A)
+ LUT(x3333BBBB3333FFFF, a1, a2, x6666DDDD5555AAAA, 0xDE)
+ LUT(x1414141441410000, a1, a3, xE4E4EEEE9999F0F0, 0x90)
+ LUT(x7F7FF3F3F5F53939, x6969D3D35353ACAC, x9A9A64646A6A9595, x3333BBBB3333FFFF, 0x79)
+ LUT(x9494E3E34B4B3939, a5, x1414141441410000, x7F7FF3F3F5F53939, 0x29)
+ LUT(x1, a4, x3333BBBB3333FFFF, x9494E3E34B4B3939, 0xA6)
+ LUT(xB1B1BBBBCCCCA5A5, a1, a1, xE4E4EEEE9999F0F0, 0x4A) // a1 is passed twice: effectively a two-input gate
+ LUT(xFFFFECECEEEEDDDD, a2, x3333CCCC00000000, x9A9A64646A6A9595, 0xEF)
+ LUT(xB1B1A9A9DCDC8787, xE5E5BABACDCDB0B0, xB1B1BBBBCCCCA5A5, xFFFFECECEEEEDDDD, 0x8D)
+ LUT(xFFFFCCCCEEEE4444, a2, a5, xFFFFEEEE66666666, 0x2B)
+ LUT(x4, a4, xB1B1A9A9DCDC8787, xFFFFCCCCEEEE4444, 0x6C)
+ // XOR the four results into the caller-provided words.
+ *out1 ^= x1;
+ *out2 ^= x2;
+ *out3 ^= x3;
+ *out4 ^= x4;
+}
+
+static void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 3, bitslice form: 25 LUT() gates, each one lop3.b32 instruction.
+   * a1..a6 are the six input words; all gates are bitwise, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * Every LUT() line declares a new u32 named by its first argument.
+   * x1..x4 are the four results; they are XORed into *out1..*out4, not
+   * assigned. a2 is used only in the final gate of each result.
+   * NOTE(review): the long hex names look like truth tables of the
+   * intermediate values - presumably emitted by the generator; not verified.
+   */
+ LUT(xA50FA50FA50FA50F, a1, a3, a4, 0xC9)
+ LUT(xF0F00F0FF0F0F0F0, a3, a5, a6, 0x4B)
+ LUT(xAF0FA0AAAF0FAF0F, a1, xA50FA50FA50FA50F, xF0F00F0FF0F0F0F0, 0x4D)
+ LUT(x5AA5A55A5AA55AA5, a1, a4, xF0F00F0FF0F0F0F0, 0x69)
+ LUT(xAA005FFFAA005FFF, a3, a5, xA50FA50FA50FA50F, 0xD6)
+ LUT(x5AA5A55A0F5AFAA5, a6, x5AA5A55A5AA55AA5, xAA005FFFAA005FFF, 0x9C)
+ LUT(x1, a2, xAF0FA0AAAF0FAF0F, x5AA5A55A0F5AFAA5, 0xA6)
+ LUT(xAA55AA5500AA00AA, a1, a4, a6, 0x49)
+ LUT(xFAFAA50FFAFAA50F, a1, a5, xA50FA50FA50FA50F, 0x9B)
+ LUT(x50AF0F5AFA50A5A5, a1, xAA55AA5500AA00AA, xFAFAA50FFAFAA50F, 0x66)
+ LUT(xAFAFAFAFFAFAFAFA, a1, a3, a6, 0x6F)
+ LUT(xAFAFFFFFFFFAFAFF, a4, x50AF0F5AFA50A5A5, xAFAFAFAFFAFAFAFA, 0xEB)
+ LUT(x4, a2, x50AF0F5AFA50A5A5, xAFAFFFFFFFFAFAFF, 0x6C)
+ LUT(x500F500F500F500F, a1, a3, a4, 0x98)
+ LUT(xF0505A0505A5050F, x5AA5A55A0F5AFAA5, xAA55AA5500AA00AA, xAFAFAFAFFAFAFAFA, 0x1D)
+ LUT(xF0505A05AA55AAFF, a6, x500F500F500F500F, xF0505A0505A5050F, 0x9A)
+ LUT(xFF005F55FF005F55, a1, a4, xAA005FFFAA005FFF, 0xB2)
+ LUT(xA55F5AF0A55F5AF0, a5, xA50FA50FA50FA50F, x5AA5A55A5AA55AA5, 0x3D)
+ LUT(x5A5F05A5A55F5AF0, a6, xFF005F55FF005F55, xA55F5AF0A55F5AF0, 0xA6)
+ LUT(x3, a2, xF0505A05AA55AAFF, x5A5F05A5A55F5AF0, 0xA6)
+ LUT(x0F0F0F0FA5A5A5A5, a1, a3, a6, 0xC6)
+ LUT(x5FFFFF5FFFA0FFA0, x5AA5A55A5AA55AA5, xAFAFAFAFFAFAFAFA, x0F0F0F0FA5A5A5A5, 0xDB)
+ LUT(xF5555AF500A05FFF, a5, xFAFAA50FFAFAA50F, xF0505A0505A5050F, 0xB9)
+ LUT(x05A5AAF55AFA55A5, xF0505A05AA55AAFF, x0F0F0F0FA5A5A5A5, xF5555AF500A05FFF, 0x9B)
+ LUT(x2, a2, x5FFFFF5FFFA0FFA0, x05A5AAF55AFA55A5, 0xA6)
+ // XOR the four results into the caller-provided words.
+ *out1 ^= x1;
+ *out2 ^= x2;
+ *out3 ^= x3;
+ *out4 ^= x4;
+}
+
+static void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 4, bitslice form: 18 LUT() gates, each one lop3.b32 instruction.
+   * a1..a6 are the six input words; all gates are bitwise, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * Every LUT() line declares a new u32 named by its first argument.
+   * x1..x4 are the four results; they are XORed into *out1..*out4, not
+   * assigned. a6 is used only in the final gate of each result.
+   * NOTE(review): the long hex names look like truth tables of the
+   * intermediate values - presumably emitted by the generator; not verified.
+   */
+ LUT(x55F055F055F055F0, a1, a3, a4, 0x72)
+ LUT(xA500F5F0A500F5F0, a3, a5, x55F055F055F055F0, 0xAD)
+ LUT(xF50AF50AF50AF50A, a1, a3, a4, 0x59)
+ LUT(xF5FA0FFFF5FA0FFF, a3, a5, xF50AF50AF50AF50A, 0xE7)
+ LUT(x61C8F93C61C8F93C, a2, xA500F5F0A500F5F0, xF5FA0FFFF5FA0FFF, 0xC6)
+ LUT(x9999666699996666, a1, a2, a5, 0x69)
+ LUT(x22C022C022C022C0, a2, a4, x55F055F055F055F0, 0x18)
+ LUT(xB35C94A6B35C94A6, xF5FA0FFFF5FA0FFF, x9999666699996666, x22C022C022C022C0, 0x63)
+ LUT(x4, a6, x61C8F93C61C8F93C, xB35C94A6B35C94A6, 0x6A)
+ LUT(x4848484848484848, a1, a2, a3, 0x12)
+ LUT(x55500AAA55500AAA, a1, a5, xF5FA0FFFF5FA0FFF, 0x28)
+ LUT(x3C90B3D63C90B3D6, x61C8F93C61C8F93C, x4848484848484848, x55500AAA55500AAA, 0x1E)
+ LUT(x8484333384843333, a1, x9999666699996666, x4848484848484848, 0x14)
+ LUT(x4452F1AC4452F1AC, xF50AF50AF50AF50A, xF5FA0FFFF5FA0FFF, xB35C94A6B35C94A6, 0x78)
+ LUT(x9586CA379586CA37, x55500AAA55500AAA, x8484333384843333, x4452F1AC4452F1AC, 0xD6)
+ LUT(x2, a6, x3C90B3D63C90B3D6, x9586CA379586CA37, 0x6A)
+ LUT(x1, a6, x3C90B3D63C90B3D6, x9586CA379586CA37, 0xA9) // same operands as x2, different immediate
+ LUT(x3, a6, x61C8F93C61C8F93C, xB35C94A6B35C94A6, 0x56) // same operands as x4, different immediate
+ // XOR the four results into the caller-provided words.
+ *out1 ^= x1;
+ *out2 ^= x2;
+ *out3 ^= x3;
+ *out4 ^= x4;
+}
+
+static void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 5, bitslice form: 25 LUT() gates, each one lop3.b32 instruction.
+   * a1..a6 are the six input words; all gates are bitwise, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * Every LUT() line declares a new u32 named by its first argument.
+   * x1..x4 are the four results; they are XORed into *out1..*out4, not
+   * assigned. a4 is used only in the final gate of each result.
+   * NOTE(review): the long hex names look like truth tables of the
+   * intermediate values - presumably emitted by the generator; not verified.
+   */
+ LUT(xA0A0A0A0FFFFFFFF, a1, a3, a6, 0xAB)
+ LUT(xFFFF00005555FFFF, a1, a5, a6, 0xB9)
+ LUT(xB3B320207777FFFF, a2, xA0A0A0A0FFFFFFFF, xFFFF00005555FFFF, 0xE8)
+ LUT(x50505A5A5A5A5050, a1, a3, xFFFF00005555FFFF, 0x34)
+ LUT(xA2A2FFFF2222FFFF, a1, a5, xB3B320207777FFFF, 0xCE)
+ LUT(x2E2E6969A4A46363, a2, x50505A5A5A5A5050, xA2A2FFFF2222FFFF, 0x29)
+ LUT(x3, a4, xB3B320207777FFFF, x2E2E6969A4A46363, 0xA6)
+ LUT(xA5A50A0AA5A50A0A, a1, a3, a5, 0x49)
+ LUT(x969639396969C6C6, a2, a6, xA5A50A0AA5A50A0A, 0x96)
+ LUT(x1B1B1B1B1B1B1B1B, a1, a2, a3, 0xCA)
+ LUT(xBFBFBFBFF6F6F9F9, a3, xA0A0A0A0FFFFFFFF, x969639396969C6C6, 0x7E)
+ LUT(x5B5BA4A4B8B81D1D, xFFFF00005555FFFF, x1B1B1B1B1B1B1B1B, xBFBFBFBFF6F6F9F9, 0x96)
+ LUT(x2, a4, x969639396969C6C6, x5B5BA4A4B8B81D1D, 0xCA)
+ LUT(x5555BBBBFFFF5555, a1, a2, xFFFF00005555FFFF, 0xE5)
+ LUT(x6D6D9C9C95956969, x50505A5A5A5A5050, xA2A2FFFF2222FFFF, x969639396969C6C6, 0x97)
+ LUT(x1A1A67676A6AB4B4, xA5A50A0AA5A50A0A, x5555BBBBFFFF5555, x6D6D9C9C95956969, 0x47)
+ LUT(xA0A0FFFFAAAA0000, a3, xFFFF00005555FFFF, xA5A50A0AA5A50A0A, 0x3B)
+ LUT(x36369C9CC1C1D6D6, x969639396969C6C6, x6D6D9C9C95956969, xA0A0FFFFAAAA0000, 0xD9)
+ LUT(x1, a4, x1A1A67676A6AB4B4, x36369C9CC1C1D6D6, 0xCA)
+ LUT(x5555F0F0F5F55555, a1, a3, xFFFF00005555FFFF, 0xB1)
+ LUT(x79790202DCDC0808, xA2A2FFFF2222FFFF, xA5A50A0AA5A50A0A, x969639396969C6C6, 0x47)
+ LUT(x6C6CF2F229295D5D, xBFBFBFBFF6F6F9F9, x5555F0F0F5F55555, x79790202DCDC0808, 0x6E)
+ LUT(xA3A3505010101A1A, a2, xA2A2FFFF2222FFFF, x36369C9CC1C1D6D6, 0x94)
+ LUT(x7676C7C74F4FC7C7, a1, x2E2E6969A4A46363, xA3A3505010101A1A, 0xD9)
+ LUT(x4, a4, x6C6CF2F229295D5D, x7676C7C74F4FC7C7, 0xC6)
+ // XOR the four results into the caller-provided words.
+ *out1 ^= x1;
+ *out2 ^= x2;
+ *out3 ^= x3;
+ *out4 ^= x4;
+}
+
+static void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 6, bitslice form: 24 LUT() gates, each one lop3.b32 instruction.
+   * a1..a6 are the six input words; all gates are bitwise, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * Every LUT() line declares a new u32 named by its first argument.
+   * x1..x4 are the four results; they are XORed into *out1..*out4, not
+   * assigned. a6 is used directly only in the final gate of each result.
+   * NOTE(review): the long hex names look like truth tables of the
+   * intermediate values - presumably emitted by the generator; not verified.
+   */
+ LUT(x5050F5F55050F5F5, a1, a3, a5, 0xB2)
+ LUT(x6363C6C66363C6C6, a1, a2, x5050F5F55050F5F5, 0x66)
+ LUT(xAAAA5555AAAA5555, a1, a1, a5, 0xA9) // a1 is passed twice: effectively a two-input gate
+ LUT(x3A3A65653A3A6565, a3, x6363C6C66363C6C6, xAAAA5555AAAA5555, 0xA9)
+ LUT(x5963A3C65963A3C6, a4, x6363C6C66363C6C6, x3A3A65653A3A6565, 0xC6)
+ LUT(xE7E76565E7E76565, a5, x6363C6C66363C6C6, x3A3A65653A3A6565, 0xAD)
+ LUT(x455D45DF455D45DF, a1, a4, xE7E76565E7E76565, 0xE4)
+ LUT(x4, a6, x5963A3C65963A3C6, x455D45DF455D45DF, 0x6C)
+ LUT(x1101220211012202, a2, xAAAA5555AAAA5555, x5963A3C65963A3C6, 0x20)
+ LUT(xF00F0FF0F00F0FF0, a3, a4, a5, 0x69)
+ LUT(x16E94A9716E94A97, xE7E76565E7E76565, x1101220211012202, xF00F0FF0F00F0FF0, 0x9E)
+ LUT(x2992922929929229, a1, a2, xF00F0FF0F00F0FF0, 0x49)
+ LUT(xAFAF9823AFAF9823, a5, x5050F5F55050F5F5, x2992922929929229, 0x93)
+ LUT(x3, a6, x16E94A9716E94A97, xAFAF9823AFAF9823, 0x6C)
+ LUT(x4801810248018102, a4, x5963A3C65963A3C6, x1101220211012202, 0xA4)
+ LUT(x5EE8FFFD5EE8FFFD, a5, x16E94A9716E94A97, x4801810248018102, 0x76)
+ LUT(xF0FF00FFF0FF00FF, a3, a4, a5, 0xCD)
+ LUT(x942D9A67942D9A67, x3A3A65653A3A6565, x5EE8FFFD5EE8FFFD, xF0FF00FFF0FF00FF, 0x86)
+ LUT(x1, a6, x5EE8FFFD5EE8FFFD, x942D9A67942D9A67, 0xA6)
+ LUT(x6A40D4ED6F4DD4EE, a2, x4, xAFAF9823AFAF9823, 0x2D) // result x4 is reused here as a gate input
+ LUT(x6CA89C7869A49C79, x1101220211012202, x16E94A9716E94A97, x6A40D4ED6F4DD4EE, 0x26)
+ LUT(xD6DE73F9D6DE73F9, a3, x6363C6C66363C6C6, x455D45DF455D45DF, 0x6B)
+ LUT(x925E63E1965A63E1, x3A3A65653A3A6565, x6CA89C7869A49C79, xD6DE73F9D6DE73F9, 0xA2)
+ LUT(x2, a6, x6CA89C7869A49C79, x925E63E1965A63E1, 0xCA)
+ // XOR the four results into the caller-provided words.
+ *out1 ^= x1;
+ *out2 ^= x2;
+ *out3 ^= x3;
+ *out4 ^= x4;
+}
+
+static void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 7, bitslice form: 24 LUT() gates, each one lop3.b32 instruction.
+   * a1..a6 are the six input words; all gates are bitwise, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * Every LUT() line declares a new u32 named by its first argument.
+   * x1..x4 are the four results; they are XORed into *out1..*out4, not
+   * assigned. a6 is used only in the final gate of each result.
+   * NOTE(review): the long hex names look like truth tables of the
+   * intermediate values - presumably emitted by the generator; not verified.
+   */
+ LUT(x88AA88AA88AA88AA, a1, a2, a4, 0x0B)
+ LUT(xAAAAFF00AAAAFF00, a1, a4, a5, 0x27)
+ LUT(xADAFF8A5ADAFF8A5, a3, x88AA88AA88AA88AA, xAAAAFF00AAAAFF00, 0x9E)
+ LUT(x0A0AF5F50A0AF5F5, a1, a3, a5, 0xA6)
+ LUT(x6B69C5DC6B69C5DC, a2, xADAFF8A5ADAFF8A5, x0A0AF5F50A0AF5F5, 0x6B)
+ LUT(x1C69B2DC1C69B2DC, a4, x88AA88AA88AA88AA, x6B69C5DC6B69C5DC, 0xA9)
+ LUT(x1, a6, xADAFF8A5ADAFF8A5, x1C69B2DC1C69B2DC, 0x6A)
+ LUT(x9C9C9C9C9C9C9C9C, a1, a2, a3, 0x63)
+ LUT(xE6E63BFDE6E63BFD, a2, xAAAAFF00AAAAFF00, x0A0AF5F50A0AF5F5, 0xE7)
+ LUT(x6385639E6385639E, a4, x9C9C9C9C9C9C9C9C, xE6E63BFDE6E63BFD, 0x93)
+ LUT(x5959C4CE5959C4CE, a2, x6B69C5DC6B69C5DC, xE6E63BFDE6E63BFD, 0x5D)
+ LUT(x5B53F53B5B53F53B, a4, x0A0AF5F50A0AF5F5, x5959C4CE5959C4CE, 0x6E)
+ LUT(x3, a6, x6385639E6385639E, x5B53F53B5B53F53B, 0xC6)
+ LUT(xFAF505FAFAF505FA, a3, a4, x0A0AF5F50A0AF5F5, 0x6D)
+ LUT(x6A65956A6A65956A, a3, x9C9C9C9C9C9C9C9C, xFAF505FAFAF505FA, 0xA6)
+ LUT(x8888CCCC8888CCCC, a1, a2, a5, 0x23)
+ LUT(x94E97A9494E97A94, x1C69B2DC1C69B2DC, x6A65956A6A65956A, x8888CCCC8888CCCC, 0x72)
+ LUT(x4, a6, x6A65956A6A65956A, x94E97A9494E97A94, 0xAC)
+ LUT(xA050A050A050A050, a1, a3, a4, 0x21)
+ LUT(xC1B87A2BC1B87A2B, xAAAAFF00AAAAFF00, x5B53F53B5B53F53B, x94E97A9494E97A94, 0xA4)
+ LUT(xE96016B7E96016B7, x8888CCCC8888CCCC, xA050A050A050A050, xC1B87A2BC1B87A2B, 0x96)
+ LUT(xE3CF1FD5E3CF1FD5, x88AA88AA88AA88AA, x6A65956A6A65956A, xE96016B7E96016B7, 0x3E)
+ LUT(x6776675B6776675B, xADAFF8A5ADAFF8A5, x94E97A9494E97A94, xE3CF1FD5E3CF1FD5, 0x6B)
+ LUT(x2, a6, xE96016B7E96016B7, x6776675B6776675B, 0xC6)
+ // XOR the four results into the caller-provided words.
+ *out1 ^= x1;
+ *out2 ^= x2;
+ *out3 ^= x3;
+ *out4 ^= x4;
+}
+
+static void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 8, bitslice form: 23 LUT() gates, each one lop3.b32 instruction.
+   * a1..a6 are the six input words; all gates are bitwise, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * Every LUT() line declares a new u32 named by its first argument.
+   * x1..x4 are the four results; they are XORed into *out1..*out4, not
+   * assigned. a6 is used only in the final gate of each result.
+   * NOTE(review): the long hex names look like truth tables of the
+   * intermediate values - presumably emitted by the generator; not verified.
+   */
+ LUT(xEEEE3333EEEE3333, a1, a2, a5, 0x9D)
+ LUT(xBBBBBBBBBBBBBBBB, a1, a1, a2, 0x83) // a1 is passed twice: effectively a two-input gate
+ LUT(xDDDDAAAADDDDAAAA, a1, a2, a5, 0x5B)
+ LUT(x29295A5A29295A5A, a3, xBBBBBBBBBBBBBBBB, xDDDDAAAADDDDAAAA, 0x85)
+ LUT(xC729695AC729695A, a4, xEEEE3333EEEE3333, x29295A5A29295A5A, 0xA6)
+ LUT(x3BF77B7B3BF77B7B, a2, a5, xC729695AC729695A, 0xF9)
+ LUT(x2900FF002900FF00, a4, a5, x29295A5A29295A5A, 0x0E)
+ LUT(x56B3803F56B3803F, xBBBBBBBBBBBBBBBB, x3BF77B7B3BF77B7B, x2900FF002900FF00, 0x61)
+ LUT(x4, a6, xC729695AC729695A, x56B3803F56B3803F, 0x6C)
+ LUT(xFBFBFBFBFBFBFBFB, a1, a2, a3, 0xDF)
+ LUT(x3012B7B73012B7B7, a2, a5, xC729695AC729695A, 0xD4)
+ LUT(x34E9B34C34E9B34C, a4, xFBFBFBFBFBFBFBFB, x3012B7B73012B7B7, 0x69)
+ LUT(xBFEAEBBEBFEAEBBE, a1, x29295A5A29295A5A, x34E9B34C34E9B34C, 0x6F)
+ LUT(xFFAEAFFEFFAEAFFE, a3, xBBBBBBBBBBBBBBBB, xBFEAEBBEBFEAEBBE, 0xB9)
+ LUT(x2, a6, x34E9B34C34E9B34C, xFFAEAFFEFFAEAFFE, 0xC6)
+ LUT(xCFDE88BBCFDE88BB, a2, xDDDDAAAADDDDAAAA, x34E9B34C34E9B34C, 0x5C)
+ LUT(x3055574530555745, a1, xC729695AC729695A, xCFDE88BBCFDE88BB, 0x71)
+ LUT(x99DDEEEE99DDEEEE, a4, xBBBBBBBBBBBBBBBB, xDDDDAAAADDDDAAAA, 0xB9)
+ LUT(x693CD926693CD926, x3BF77B7B3BF77B7B, x34E9B34C34E9B34C, x99DDEEEE99DDEEEE, 0x69)
+ LUT(x3, a6, x3055574530555745, x693CD926693CD926, 0x6A)
+ LUT(x9955EE559955EE55, a1, a4, x99DDEEEE99DDEEEE, 0xE2)
+ LUT(x9D48FA949D48FA94, x3BF77B7B3BF77B7B, xBFEAEBBEBFEAEBBE, x9955EE559955EE55, 0x9C)
+ LUT(x1, a6, xC729695AC729695A, x9D48FA949D48FA94, 0x39)
+ // XOR the four results into the caller-provided words.
+ *out1 ^= x1;
+ *out2 ^= x2;
+ *out3 ^= x3;
+ *out4 ^= x4;
+}
+
+#else /* CUDA_ARCH below 500 (or not defined): use the plain-gate S-boxes that follow instead of the lop3.b32 ones */
+
+/*
+ * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC
+ * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates.
+ *
+ * Gate counts: 49 44 46 33 48 46 46 41
+ * Average: 44.125
+ *
+ * Several same-gate-count expressions for each S-box are included (for use on
+ * different CPUs/GPUs).
+ *
+ * These Boolean expressions corresponding to DES S-boxes have been generated
+ * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
+ * John the Ripper password cracker: http://www.openwall.com/john/
+ * Being mathematical formulas, they are not copyrighted and are free for reuse
+ * by anyone.
+ *
+ * This file (a specific representation of the S-box expressions, surrounding
+ * logic) is Copyright (c) 2011 by Solar Designer <solar at openwall.com>.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted. (This is a heavily cut-down "BSD license".)
+ *
+ * The effort has been sponsored by Rapid7: http://www.rapid7.com
+ */
+
+static void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 1, bitslice form: 49 gates (AND, OR, XOR, NOT, AND-NOT), one
+   * per assignment below. Only bitwise operators are used, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * a1..a6 are the input words. Each result is XORed into its output word
+   * (not assigned) as soon as it is ready, in the order out3, out1, out2, out4.
+   * a2 is used only in the last gate pair of each output: xK0 mixes a2 in,
+   * xK1 is the value XORed into *out(K+1).
+   * NOTE(review): the hex names look like truth tables of the intermediate
+   * values - presumably emitted by the generator; not verified here.
+   */
+ u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969,
+ x25202160;
+ u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93;
+ u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69;
+ u32 x0A0A0000, x0AD80096, x00999900, x0AD99996;
+ u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC;
+ u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0;
+ u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A;
+ u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205;
+ u32 x00, x01, x10, x11, x20, x21, x30, x31;
+
+ x55005500 = a1 & ~a5;
+ x5A0F5A0F = a4 ^ x55005500;
+ x3333FFFF = a3 | a6;
+ x66666666 = a1 ^ a3;
+ x22226666 = x3333FFFF & x66666666;
+ x2D2D6969 = a4 ^ x22226666;
+ x25202160 = x2D2D6969 & ~x5A0F5A0F;
+
+ x00FFFF00 = a5 ^ a6;
+ x33CCCC33 = a3 ^ x00FFFF00;
+ x4803120C = x5A0F5A0F & ~x33CCCC33;
+ x2222FFFF = a6 | x22226666;
+ x6A21EDF3 = x4803120C ^ x2222FFFF;
+ x4A01CC93 = x6A21EDF3 & ~x25202160;
+
+ x5555FFFF = a1 | a6;
+ x7F75FFFF = x6A21EDF3 | x5555FFFF;
+ x00D20096 = a5 & ~x2D2D6969;
+ x7FA7FF69 = x7F75FFFF ^ x00D20096;
+
+ x0A0A0000 = a4 & ~x5555FFFF;
+ x0AD80096 = x00D20096 ^ x0A0A0000;
+ x00999900 = x00FFFF00 & ~x66666666;
+ x0AD99996 = x0AD80096 | x00999900;
+
+ x22332233 = a3 & ~x55005500;
+ x257AA5F0 = x5A0F5A0F ^ x7F75FFFF;
+ x054885C0 = x257AA5F0 & ~x22332233;
+ xFAB77A3F = ~x054885C0;
+ x2221EDF3 = x3333FFFF & x6A21EDF3;
+ xD89697CC = xFAB77A3F ^ x2221EDF3;
+ x20 = x7FA7FF69 & ~a2;
+ x21 = x20 ^ xD89697CC;
+ *out3 ^= x21; /* third output finished first */
+
+ x05B77AC0 = x00FFFF00 ^ x054885C0;
+ x05F77AD6 = x00D20096 | x05B77AC0;
+ x36C48529 = x3333FFFF ^ x05F77AD6;
+ x6391D07C = a1 ^ x36C48529;
+ xBB0747B0 = xD89697CC ^ x6391D07C;
+ x00 = x25202160 | a2;
+ x01 = x00 ^ xBB0747B0;
+ *out1 ^= x01;
+
+ x4C460000 = x3333FFFF ^ x7F75FFFF;
+ x4EDF9996 = x0AD99996 | x4C460000;
+ x2D4E49EA = x6391D07C ^ x4EDF9996;
+ xBBFFFFB0 = x00FFFF00 | xBB0747B0;
+ x96B1B65A = x2D4E49EA ^ xBBFFFFB0;
+ x10 = x4A01CC93 | a2;
+ x11 = x10 ^ x96B1B65A;
+ *out2 ^= x11;
+
+ x5AFF5AFF = a5 | x5A0F5A0F;
+ x52B11215 = x5AFF5AFF & ~x2D4E49EA;
+ x4201C010 = x4A01CC93 & x6391D07C;
+ x10B0D205 = x52B11215 ^ x4201C010;
+ x30 = x10B0D205 | a2;
+ x31 = x30 ^ x0AD99996;
+ *out4 ^= x31;
+}
+
+static void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 2, bitslice form: 44 gates (AND, OR, XOR, NOT, AND-NOT), one
+   * per assignment below. Only bitwise operators are used, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * a1..a6 are the input words. Each result is XORed into its output word
+   * (not assigned) as soon as it is ready, in the order out2, out1, out3, out4.
+   * a4 is used only in the last gate pair of each output: xK0 mixes a4 in,
+   * xK1 is the value XORed into *out(K+1).
+   * NOTE(review): the hex names look like truth tables of the intermediate
+   * values - presumably emitted by the generator; not verified here.
+   */
+ u32 x33CC33CC;
+ u32 x55550000, x00AA00FF, x33BB33FF;
+ u32 x33CC0000, x11441144, x11BB11BB, x003311BB;
+ u32 x00000F0F, x336600FF, x332200FF, x332200F0;
+ u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95;
+ u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39;
+ u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53;
+ u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F;
+ u32 x0A451047, xBBDFDD7B, xB19ACD3C;
+ u32 x00, x01, x10, x11, x20, x21, x30, x31;
+
+ x33CC33CC = a2 ^ a5;
+
+ x55550000 = a1 & ~a6;
+ x00AA00FF = a5 & ~x55550000;
+ x33BB33FF = a2 | x00AA00FF;
+
+ x33CC0000 = x33CC33CC & ~a6;
+ x11441144 = a1 & x33CC33CC;
+ x11BB11BB = a5 ^ x11441144;
+ x003311BB = x11BB11BB & ~x33CC0000;
+
+ x00000F0F = a3 & a6;
+ x336600FF = x00AA00FF ^ x33CC0000;
+ x332200FF = x33BB33FF & x336600FF;
+ x332200F0 = x332200FF & ~x00000F0F;
+
+ x0302000F = a3 & x332200FF;
+ xAAAAAAAA = ~a1;
+ xA9A8AAA5 = x0302000F ^ xAAAAAAAA;
+ x33CCCC33 = a6 ^ x33CC33CC;
+ x33CCC030 = x33CCCC33 & ~x00000F0F;
+ x9A646A95 = xA9A8AAA5 ^ x33CCC030;
+ x10 = a4 & ~x332200F0;
+ x11 = x10 ^ x9A646A95;
+ *out2 ^= x11; /* second output finished first */
+
+ x00333303 = a2 & ~x33CCC030;
+ x118822B8 = x11BB11BB ^ x00333303;
+ xA8208805 = xA9A8AAA5 & ~x118822B8;
+ x3CC3C33C = a3 ^ x33CCCC33;
+ x94E34B39 = xA8208805 ^ x3CC3C33C;
+ x00 = x33BB33FF & ~a4;
+ x01 = x00 ^ x94E34B39;
+ *out1 ^= x01;
+
+ x0331330C = x0302000F ^ x00333303;
+ x3FF3F33C = x3CC3C33C | x0331330C;
+ xA9DF596A = x33BB33FF ^ x9A646A95;
+ xA9DF5F6F = x00000F0F | xA9DF596A;
+ x962CAC53 = x3FF3F33C ^ xA9DF5F6F;
+
+ xA9466A6A = x332200FF ^ x9A646A95;
+ x3DA52153 = x94E34B39 ^ xA9466A6A;
+ x29850143 = xA9DF5F6F & x3DA52153;
+ x33C0330C = x33CC33CC & x3FF3F33C;
+ x1A45324F = x29850143 ^ x33C0330C;
+ x20 = x1A45324F | a4;
+ x21 = x20 ^ x962CAC53;
+ *out3 ^= x21;
+
+ x0A451047 = x1A45324F & ~x118822B8;
+ xBBDFDD7B = x33CCCC33 | xA9DF596A;
+ xB19ACD3C = x0A451047 ^ xBBDFDD7B;
+ x30 = x003311BB | a4;
+ x31 = x30 ^ xB19ACD3C;
+ *out4 ^= x31;
+}
+
+static void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 3, bitslice form: 46 gates (AND, OR, XOR, NOT, AND-NOT), one
+   * per assignment below. Only bitwise operators are used, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * a1..a6 are the input words. Each result is XORed into its output word
+   * (not assigned) as soon as it is ready, in the order out4, out2, out1, out3.
+   * a5 is used only in the last gate pair of each output: xK0 mixes a5 in,
+   * xK1 is the value XORed into *out(K+1).
+   * NOTE(review): the hex names look like truth tables of the intermediate
+   * values - presumably emitted by the generator; not verified here.
+   */
+ u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4;
+ u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00;
+ u32 x00005EF4, x00FF5EFF, x00555455, x3C699796;
+ u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F;
+ u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8;
+ u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A;
+ u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356;
+ u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B;
+ u32 x00, x01, x10, x11, x20, x21, x30, x31;
+
+ x44444444 = a1 & ~a2;
+ x0F0FF0F0 = a3 ^ a6;
+ x4F4FF4F4 = x44444444 | x0F0FF0F0;
+ x00FFFF00 = a4 ^ a6;
+ x00AAAA00 = x00FFFF00 & ~a1;
+ x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00;
+
+ x3C3CC3C3 = a2 ^ x0F0FF0F0;
+ x3C3C0000 = x3C3CC3C3 & ~a6;
+ x7373F4F4 = x4F4FF4F4 ^ x3C3C0000;
+ x0C840A00 = x4FE55EF4 & ~x7373F4F4;
+
+ x00005EF4 = a6 & x4FE55EF4;
+ x00FF5EFF = a4 | x00005EF4;
+ x00555455 = a1 & x00FF5EFF;
+ x3C699796 = x3C3CC3C3 ^ x00555455;
+ x30 = x4FE55EF4 & ~a5;
+ x31 = x30 ^ x3C699796;
+ *out4 ^= x31; /* fourth output finished first */
+
+ x000FF000 = x0F0FF0F0 & x00FFFF00;
+ x55AA55AA = a1 ^ a4;
+ x26D9A15E = x7373F4F4 ^ x55AA55AA;
+ x2FDFAF5F = a3 | x26D9A15E;
+ x2FD00F5F = x2FDFAF5F & ~x000FF000;
+
+ x55AAFFAA = x00AAAA00 | x55AA55AA;
+ x28410014 = x3C699796 & ~x55AAFFAA;
+ x000000FF = a4 & a6;
+ x000000CC = x000000FF & ~a2;
+ x284100D8 = x28410014 ^ x000000CC;
+
+ x204100D0 = x7373F4F4 & x284100D8;
+ x3C3CC3FF = x3C3CC3C3 | x000000FF;
+ x1C3CC32F = x3C3CC3FF & ~x204100D0;
+ x4969967A = a1 ^ x1C3CC32F;
+ x10 = x2FD00F5F & a5;
+ x11 = x10 ^ x4969967A;
+ *out2 ^= x11;
+
+ x4CC44CC4 = x4FE55EF4 & ~a2;
+ x40C040C0 = x4CC44CC4 & ~a3;
+ xC3C33C3C = ~x3C3CC3C3;
+ x9669C396 = x55AAFFAA ^ xC3C33C3C;
+ xD6A98356 = x40C040C0 ^ x9669C396;
+ x00 = a5 & ~x0C840A00;
+ x01 = x00 ^ xD6A98356;
+ *out1 ^= x01;
+
+ xD6E9C3D6 = x40C040C0 | x9669C396;
+ x4CEEEEC4 = x00AAAA00 | x4CC44CC4;
+ x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4;
+ x001A000B = a4 & ~x4FE55EF4;
+ x9A1F2D1B = x9A072D12 | x001A000B;
+ x20 = a5 & ~x284100D8;
+ x21 = x20 ^ x9A1F2D1B;
+ *out3 ^= x21;
+}
+
+static void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 4, bitslice form: 33 gates (AND, OR, XOR, NOT, AND-NOT), one
+   * per assignment below. Only bitwise operators are used, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * a1..a6 are the input words. Each result is XORed into its output word
+   * (not assigned) as soon as it is ready, in the order out1, out2, out3, out4.
+   * a6 is used only in the last gate pair of each output: xK0 mixes a6 in,
+   * xK1 is the value XORed into *out(K+1).
+   * NOTE(review): the hex names look like truth tables of the intermediate
+   * values - presumably emitted by the generator; not verified here.
+   */
+ u32 x5A5A5A5A, x0F0FF0F0;
+ u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F,
+ x52FBCA0F, x61C8F93C;
+ u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6;
+ u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1;
+ u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6;
+ u32 x00, x01, x10, x11, x20, x21, x30, x31;
+
+ x5A5A5A5A = a1 ^ a3;
+ x0F0FF0F0 = a3 ^ a5;
+ x33FF33FF = a2 | a4;
+ x33FFCC00 = a5 ^ x33FF33FF;
+ x0C0030F0 = x0F0FF0F0 & ~x33FFCC00;
+ x0C0CC0C0 = x0F0FF0F0 & ~a2;
+ x0CF3C03F = a4 ^ x0C0CC0C0;
+ x5EFBDA7F = x5A5A5A5A | x0CF3C03F;
+ x52FBCA0F = x5EFBDA7F & ~x0C0030F0;
+ x61C8F93C = a2 ^ x52FBCA0F;
+
+ x00C0C03C = x0CF3C03F & x61C8F93C;
+ x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C;
+ x3B92A366 = x5A5A5A5A ^ x61C8F93C;
+ x30908326 = x3B92A366 & ~x0F0F30C0;
+ x3C90B3D6 = x0C0030F0 ^ x30908326;
+
+ x33CC33CC = a2 ^ a4;
+ x0C0CFFFF = a5 | x0C0CC0C0;
+ x379E5C99 = x3B92A366 ^ x0C0CFFFF;
+ x04124C11 = x379E5C99 & ~x33CC33CC;
+ x56E9861E = x52FBCA0F ^ x04124C11;
+ x00 = a6 & ~x3C90B3D6;
+ x01 = x00 ^ x56E9861E;
+ *out1 ^= x01;
+
+ xA91679E1 = ~x56E9861E; /* out2 is built from the complement of the value just used for out1 */
+ x10 = x3C90B3D6 & ~a6;
+ x11 = x10 ^ xA91679E1;
+ *out2 ^= x11;
+
+ x9586CA37 = x3C90B3D6 ^ xA91679E1;
+ x8402C833 = x9586CA37 & ~x33CC33CC;
+ x84C2C83F = x00C0C03C | x8402C833;
+ xB35C94A6 = x379E5C99 ^ x84C2C83F;
+ x20 = x61C8F93C | a6;
+ x21 = x20 ^ xB35C94A6;
+ *out3 ^= x21;
+
+ x30 = a6 & x61C8F93C; /* out4 reuses x61C8F93C and xB35C94A6 from out3; only the a6 gate differs */
+ x31 = x30 ^ xB35C94A6;
+ *out4 ^= x31;
+}
+
+static void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 5, bitslice form: 48 gates (AND, OR, XOR, NOT, AND-NOT), one
+   * per assignment below. Only bitwise operators are used, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * a1..a6 are the input words. Each result is XORed into its output word
+   * (not assigned) as soon as it is ready, in the order out3, out4, out1, out2.
+   * a2 is used only in the last gate pair of each output: xK0 mixes a2 in,
+   * xK1 is the value XORed into *out(K+1).
+   * NOTE(review): the hex names look like truth tables of the intermediate
+   * values - presumably emitted by the generator; not verified here.
+   */
+ u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F;
+ u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B;
+ u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7;
+ u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF;
+ u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A;
+ u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2;
+ u32 x22222222, x16BCEE97, x0F080B04, x19B4E593;
+ u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D;
+ u32 x00, x01, x10, x11, x20, x21, x30, x31;
+
+ x77777777 = a1 | a3;
+ x77770000 = x77777777 & ~a6;
+ x22225555 = a1 ^ x77770000;
+ x11116666 = a3 ^ x22225555;
+ x1F1F6F6F = a4 | x11116666;
+
+ x70700000 = x77770000 & ~a4;
+ x43433333 = a3 ^ x70700000;
+ x00430033 = a5 & x43433333;
+ x55557777 = a1 | x11116666;
+ x55167744 = x00430033 ^ x55557777;
+ x5A19784B = a4 ^ x55167744;
+
+ x5A1987B4 = a6 ^ x5A19784B;
+ x7A3BD7F5 = x22225555 | x5A1987B4;
+ x003B00F5 = a5 & x7A3BD7F5;
+ x221955A0 = x22225555 ^ x003B00F5;
+ x05050707 = a4 & x55557777;
+ x271C52A7 = x221955A0 ^ x05050707;
+
+ x2A2A82A0 = x7A3BD7F5 & ~a1;
+ x6969B193 = x43433333 ^ x2A2A82A0;
+ x1FE06F90 = a5 ^ x1F1F6F6F;
+ x16804E00 = x1FE06F90 & ~x6969B193;
+ xE97FB1FF = ~x16804E00;
+ x20 = xE97FB1FF & ~a2;
+ x21 = x20 ^ x5A19784B;
+ *out3 ^= x21; /* third output finished first */
+
+ x43403302 = x43433333 & ~x003B00F5;
+ x35CAED30 = x2A2A82A0 ^ x1FE06F90;
+ x37DEFFB7 = x271C52A7 | x35CAED30;
+ x349ECCB5 = x37DEFFB7 & ~x43403302;
+ x0B01234A = x1F1F6F6F & ~x349ECCB5;
+
+ x101884B4 = x5A1987B4 & x349ECCB5;
+ x0FF8EB24 = x1FE06F90 ^ x101884B4;
+ x41413333 = x43433333 & x55557777;
+ x4FF9FB37 = x0FF8EB24 | x41413333;
+ x4FC2FBC2 = x003B00F5 ^ x4FF9FB37;
+ x30 = x4FC2FBC2 & a2;
+ x31 = x30 ^ x271C52A7;
+ *out4 ^= x31;
+
+ x22222222 = a1 ^ x77777777;
+ x16BCEE97 = x349ECCB5 ^ x22222222;
+ x0F080B04 = a4 & x0FF8EB24;
+ x19B4E593 = x16BCEE97 ^ x0F080B04;
+ x00 = x0B01234A | a2;
+ x01 = x00 ^ x19B4E593;
+ *out1 ^= x01;
+
+ x5C5C5C5C = x1F1F6F6F ^ x43433333;
+ x4448184C = x5C5C5C5C & ~x19B4E593;
+ x2DDABE71 = x22225555 ^ x0FF8EB24;
+ x6992A63D = x4448184C ^ x2DDABE71;
+ x10 = x1F1F6F6F & a2;
+ x11 = x10 ^ x6992A63D;
+ *out2 ^= x11;
+}
+
+static void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 6, bitslice form: 46 gates (AND, OR, XOR, NOT, AND-NOT), one
+   * per assignment below. Only bitwise operators are used, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * a1..a6 are the input words. Each result is XORed into its output word
+   * (not assigned) as soon as it is ready, in the order out4, out3, out2, out1.
+   * a4 is used only in the last gate pair of each output: xK0 mixes a4 in,
+   * xK1 is the value XORed into *out(K+1).
+   * NOTE(review): the hex names look like truth tables of the intermediate
+   * values - presumably emitted by the generator; not verified here.
+   */
+ u32 x33CC33CC;
+ u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099;
+ u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6;
+ u32 x09030C06, x09030000, x336622FF, x3A6522FF;
+ u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD;
+ u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B;
+ u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479;
+ u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5;
+ u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67;
+ u32 x00, x01, x10, x11, x20, x21, x30, x31;
+
+ x33CC33CC = a2 ^ a5;
+
+ x3333FFFF = a2 | a6;
+ x11115555 = a1 & x3333FFFF;
+ x22DD6699 = x33CC33CC ^ x11115555;
+ x22DD9966 = a6 ^ x22DD6699;
+ x00220099 = a5 & ~x22DD9966;
+
+ x00551144 = a1 & x22DD9966;
+ x33662277 = a2 ^ x00551144;
+ x5A5A5A5A = a1 ^ a3;
+ x7B7E7A7F = x33662277 | x5A5A5A5A;
+ x59A31CE6 = x22DD6699 ^ x7B7E7A7F;
+
+ x09030C06 = a3 & x59A31CE6;
+ x09030000 = x09030C06 & ~a6;
+ x336622FF = x00220099 | x33662277;
+ x3A6522FF = x09030000 ^ x336622FF;
+ x30 = x3A6522FF & a4;
+ x31 = x30 ^ x59A31CE6;
+ *out4 ^= x31; /* fourth output finished first */
+
+ x484D494C = a2 ^ x7B7E7A7F;
+ x0000B6B3 = a6 & ~x484D494C;
+ x0F0FB9BC = a3 ^ x0000B6B3;
+ x00FC00F9 = a5 & ~x09030C06;
+ x0FFFB9FD = x0F0FB9BC | x00FC00F9;
+
+ x5DF75DF7 = a1 | x59A31CE6;
+ x116600F7 = x336622FF & x5DF75DF7;
+ x1E69B94B = x0F0FB9BC ^ x116600F7;
+ x1668B94B = x1E69B94B & ~x09030000;
+ x20 = x00220099 | a4;
+ x21 = x20 ^ x1668B94B;
+ *out3 ^= x21;
+
+ x7B7B7B7B = a2 | x5A5A5A5A;
+ x411E5984 = x3A6522FF ^ x7B7B7B7B;
+ x1FFFFDFD = x11115555 | x0FFFB9FD;
+ x5EE1A479 = x411E5984 ^ x1FFFFDFD;
+
+ x3CB4DFD2 = x22DD6699 ^ x1E69B94B;
+ x004B002D = a5 & ~x3CB4DFD2;
+ xB7B2B6B3 = ~x484D494C;
+ xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3;
+ xCC82CDE5 = x004B002D ^ xCCC9CDC8;
+ x10 = xCC82CDE5 & ~a4;
+ x11 = x10 ^ x5EE1A479;
+ *out2 ^= x11;
+
+ x0055EEBB = a6 ^ x00551144;
+ x5A5AECE9 = a1 ^ x0F0FB9BC;
+ x0050ECA9 = x0055EEBB & x5A5AECE9;
+ xC5CAC1CE = x09030C06 ^ xCCC9CDC8;
+ xC59A2D67 = x0050ECA9 ^ xC5CAC1CE;
+ x00 = x0FFFB9FD & ~a4;
+ x01 = x00 ^ xC59A2D67;
+ *out1 ^= x01;
+}
+
+static void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 7, bitslice form: 46 gates (AND, OR, XOR, NOT, AND-NOT), one
+   * per assignment below. Only bitwise operators are used, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * a1..a6 are the input words. Each result is XORed into its output word
+   * (not assigned) as soon as it is ready, in the order out4, out1, out3, out2.
+   * a1 is used only in the last gate pair of each output: xK0 mixes a1 in,
+   * xK1 is the value XORed into *out(K+1).
+   * NOTE(review): the hex names look like truth tables of the intermediate
+   * values - presumably emitted by the generator; not verified here.
+   */
+ u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841;
+ u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78;
+ u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D;
+ u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B;
+ u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB;
+ u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867;
+ u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD;
+ u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A;
+ u32 x00, x01, x10, x11, x20, x21, x30, x31;
+
+ x0FF00FF0 = a4 ^ a5;
+ x3CC33CC3 = a3 ^ x0FF00FF0;
+ x00003CC3 = a6 & x3CC33CC3;
+ x0F000F00 = a4 & x0FF00FF0;
+ x5A555A55 = a2 ^ x0F000F00;
+ x00001841 = x00003CC3 & x5A555A55;
+
+ x00000F00 = a6 & x0F000F00;
+ x33333C33 = a3 ^ x00000F00;
+ x7B777E77 = x5A555A55 | x33333C33;
+ x0FF0F00F = a6 ^ x0FF00FF0;
+ x74878E78 = x7B777E77 ^ x0FF0F00F;
+ x30 = a1 & ~x00001841;
+ x31 = x30 ^ x74878E78;
+ *out4 ^= x31; /* fourth output finished first */
+
+ x003C003C = a5 & ~x3CC33CC3;
+ x5A7D5A7D = x5A555A55 | x003C003C;
+ x333300F0 = x00003CC3 ^ x33333C33;
+ x694E5A8D = x5A7D5A7D ^ x333300F0;
+
+ x0FF0CCCC = x00003CC3 ^ x0FF0F00F;
+ x000F0303 = a4 & ~x0FF0CCCC;
+ x5A505854 = x5A555A55 & ~x000F0303;
+ x33CC000F = a5 ^ x333300F0;
+ x699C585B = x5A505854 ^ x33CC000F;
+
+ x7F878F78 = x0F000F00 | x74878E78;
+ x21101013 = a3 & x699C585B;
+ x7F979F7B = x7F878F78 | x21101013;
+ x30030CC0 = x3CC33CC3 & ~x0FF0F00F;
+ x4F9493BB = x7F979F7B ^ x30030CC0;
+ x00 = x4F9493BB & ~a1;
+ x01 = x00 ^ x694E5A8D;
+ *out1 ^= x01;
+
+ x6F9CDBFB = x699C585B | x4F9493BB;
+ x0000DBFB = a6 & x6F9CDBFB;
+ x00005151 = a2 & x0000DBFB;
+ x26DAC936 = x694E5A8D ^ x4F9493BB;
+ x26DA9867 = x00005151 ^ x26DAC936;
+
+ x27DA9877 = x21101013 | x26DA9867;
+ x27DA438C = x0000DBFB ^ x27DA9877;
+ x2625C9C9 = a5 ^ x26DAC936;
+ x27FFCBCD = x27DA438C | x2625C9C9;
+ x20 = x27FFCBCD & a1;
+ x21 = x20 ^ x699C585B;
+ *out3 ^= x21;
+
+ x27FF1036 = x0000DBFB ^ x27FFCBCD;
+ x27FF103E = x003C003C | x27FF1036;
+ xB06B6C44 = ~x4F9493BB;
+ x97947C7A = x27FF103E ^ xB06B6C44;
+ x10 = x97947C7A & ~a1;
+ x11 = x10 ^ x26DA9867;
+ *out2 ^= x11;
+}
+
+static void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
+{ /*
+   * DES S-box 8, bitslice form: 41 gates (AND, OR, XOR, NOT, AND-NOT), one
+   * per assignment below. Only bitwise operators are used, so every bit
+   * position of the u32 words is evaluated independently of the others.
+   * a1..a6 are the input words. Each result is XORed into its output word
+   * (not assigned) as soon as it is ready, in the order out2, out3, out4, out1.
+   * a6 is used only in the last gate pair of each output: xK0 mixes a6 in,
+   * xK1 is the value XORed into *out(K+1).
+   * NOTE(review): the hex names look like truth tables of the intermediate
+   * values - presumably emitted by the generator; not verified here.
+   */
+ u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001;
+ u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745;
+ u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3;
+ u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A;
+ u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926;
+ u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F;
+ u32 xF700A600, x61008000, x03B7856B, x62B7056B;
+ u32 x00, x01, x10, x11, x20, x21, x30, x31;
+
+ x0C0C0C0C = a3 & ~a2;
+ x0000F0F0 = a5 & ~a3;
+ x00FFF00F = a4 ^ x0000F0F0;
+ x00555005 = a1 & x00FFF00F;
+ x00515001 = x00555005 & ~x0C0C0C0C;
+
+ x33000330 = a2 & ~x00FFF00F;
+ x77555775 = a1 | x33000330;
+ x30303030 = a2 & ~a3;
+ x3030CFCF = a5 ^ x30303030;
+ x30104745 = x77555775 & x3030CFCF;
+ x30555745 = x00555005 | x30104745;
+
+ xFF000FF0 = ~x00FFF00F;
+ xCF1048B5 = x30104745 ^ xFF000FF0;
+ x080A080A = a3 & ~x77555775;
+ xC71A40BF = xCF1048B5 ^ x080A080A;
+ xCB164CB3 = x0C0C0C0C ^ xC71A40BF;
+ x10 = x00515001 | a6;
+ x11 = x10 ^ xCB164CB3;
+ *out2 ^= x11; /* second output finished first */
+
+ x9E4319E6 = a1 ^ xCB164CB3;
+ x000019E6 = a5 & x9E4319E6;
+ xF429738C = a2 ^ xC71A40BF;
+ xF4296A6A = x000019E6 ^ xF429738C;
+ xC729695A = x33000330 ^ xF4296A6A;
+
+ xC47C3D2F = x30555745 ^ xF4296A6A;
+ xF77F3F3F = a2 | xC47C3D2F;
+ x9E43E619 = a5 ^ x9E4319E6;
+ x693CD926 = xF77F3F3F ^ x9E43E619;
+ x20 = x30555745 & a6;
+ x21 = x20 ^ x693CD926;
+ *out3 ^= x21;
+
+ xF719A695 = x3030CFCF ^ xC729695A;
+ xF4FF73FF = a4 | xF429738C;
+ x03E6D56A = xF719A695 ^ xF4FF73FF;
+ x56B3803F = a1 ^ x03E6D56A;
+ x30 = x56B3803F & a6;
+ x31 = x30 ^ xC729695A;
+ *out4 ^= x31;
+
+ xF700A600 = xF719A695 & ~a4;
+ x61008000 = x693CD926 & xF700A600;
+ x03B7856B = x00515001 ^ x03E6D56A;
+ x62B7056B = x61008000 ^ x03B7856B;
+ x00 = x62B7056B | a6;
+ x01 = x00 ^ xC729695A; /* same xC729695A term as out4; only the a6 gate differs */
+ *out1 ^= x01;
+}
+
+#endif /* CUDA_ARCH >= 500 */
+#endif /* IS_NV */
+
+#ifdef IS_AMD
+