/* Source: hashcat.git, amd/gpu_serpent256_amd.c (initial commit)         */
/*                                                                        */
/* This is an independent implementation of the encryption algorithm:    */
/*                                                                        */
/*         Serpent by Ross Anderson, Eli Biham and Lars Knudsen           */
/*                                                                        */
/* which is a candidate algorithm in the Advanced Encryption Standard     */
/* programme of the US National Institute of Standards and Technology.    */
/*                                                                        */
/* Copyright in this implementation is held by Dr B R Gladman but I       */
/* hereby give permission for its free direct or derivative use subject   */
/* to acknowledgment of its origin and compliance with any conditions     */
/* that the originators of the algorithm place on its exploitation.       */
/*                                                                        */
/* Dr Brian Gladman (gladman@seven77.demon.co.uk) 14th January 1999       */
/*                                                                        */
/* -------------------------------------------------------------------- */
/*                                                                        */
/* Cleaned and optimized for GPU use with oclHashcat by Jens Steube       */

/* 15 terms */

/*
 * Bitsliced Serpent S-box 0.
 *
 * Bit i of (a,b,c,d) forms one 4-bit input (a is the least significant
 * bit) and bit i of (e,f,g,h) receives the 4-bit output, so all bit
 * positions of a word are substituted in parallel. The lookup table is
 *   { 3, 8, 15, 1, 10, 6, 5, 11, 14, 13, 4, 2, 7, 0, 9, 12 }.
 *
 * The scratch variables t1..t14 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define sb0(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = a ^ d; \
    t2  = a & d; \
    t3  = c ^ t1; \
    t6  = b & t1; \
    t4  = b ^ t3; \
    t10 = ~t3; \
    h   = t2 ^ t4; \
    t7  = a ^ t6; \
    t14 = ~t7; \
    t8  = c | t7; \
    t11 = t3 ^ t7; \
    g   = t4 ^ t8; \
    t12 = h & t11; \
    f   = t10 ^ t12; \
    e   = t12 ^ t14; \
  } while (0)

/* 15 terms */

/*
 * Bitsliced inverse of Serpent S-box 0 (undoes sb0).
 *
 * Bit i of (a,b,c,d) is one 4-bit input (a is the least significant bit);
 * bit i of (e,f,g,h) receives the 4-bit output.
 *
 * The scratch variables t1..t14 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define ib0(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = ~a; \
    t2  = a ^ b; \
    t3  = t1 | t2; \
    t4  = d ^ t3; \
    t7  = d & t2; \
    t5  = c ^ t4; \
    t8  = t1 ^ t7; \
    g   = t2 ^ t5; \
    t11 = a & t4; \
    t9  = g & t8; \
    t14 = t5 ^ t8; \
    f   = t4 ^ t9; \
    t12 = t5 | f; \
    h   = t11 ^ t12; \
    e   = h ^ t14; \
  } while (0)

/* 14 terms! */

/*
 * Bitsliced Serpent S-box 1.
 *
 * Bit i of (a,b,c,d) forms one 4-bit input (a is the least significant
 * bit) and bit i of (e,f,g,h) receives the 4-bit output. The lookup table is
 *   { 15, 12, 2, 7, 9, 0, 5, 10, 1, 11, 14, 8, 6, 13, 3, 4 }.
 *
 * The scratch variables t1..t13 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define sb1(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = ~a; \
    t2  = b ^ t1; \
    t3  = a | t2; \
    t4  = d | t2; \
    t5  = c ^ t3; \
    g   = d ^ t5; \
    t7  = b ^ t4; \
    t8  = t2 ^ g; \
    t9  = t5 & t7; \
    h   = t8 ^ t9; \
    t11 = t5 ^ t7; \
    f   = h ^ t11; \
    t13 = t8 & t11; \
    e   = t5 ^ t13; \
  } while (0)

/* 17 terms */

/*
 * Bitsliced inverse of Serpent S-box 1 (undoes sb1).
 *
 * Bit i of (a,b,c,d) is one 4-bit input (a is the least significant bit);
 * bit i of (e,f,g,h) receives the 4-bit output.
 *
 * The scratch variables t1..t16 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define ib1(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = a ^ d; \
    t2  = a & b; \
    t3  = b ^ c; \
    t4  = a ^ t3; \
    t5  = b | d; \
    t7  = c | t1; \
    h   = t4 ^ t5; \
    t8  = b ^ t7; \
    t11 = ~t2; \
    t9  = t4 & t8; \
    f   = t1 ^ t9; \
    t13 = t9 ^ t11; \
    t12 = h & f; \
    g   = t12 ^ t13; \
    t15 = a & d; \
    t16 = c ^ t13; \
    e   = t15 ^ t16; \
  } while (0)

/* 16 terms */

/*
 * Bitsliced Serpent S-box 2.
 *
 * Bit i of (a,b,c,d) forms one 4-bit input (a is the least significant
 * bit) and bit i of (e,f,g,h) receives the 4-bit output. The lookup table is
 *   { 8, 6, 7, 9, 3, 12, 10, 15, 13, 1, 14, 4, 0, 11, 5, 2 }.
 *
 * The scratch variables t1..t15 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define sb2(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = ~a; \
    t2  = b ^ d; \
    t3  = c & t1; \
    t13 = d | t1; \
    e   = t2 ^ t3; \
    t5  = c ^ t1; \
    t6  = c ^ e; \
    t7  = b & t6; \
    t10 = e | t5; \
    h   = t5 ^ t7; \
    t9  = d | t7; \
    t11 = t9 & t10; \
    t14 = t2 ^ h; \
    g   = a ^ t11; \
    t15 = g ^ t13; \
    f   = t14 ^ t15; \
  } while (0)

/* 16 terms */

/*
 * Bitsliced inverse of Serpent S-box 2 (undoes sb2).
 *
 * Bit i of (a,b,c,d) is one 4-bit input (a is the least significant bit);
 * bit i of (e,f,g,h) receives the 4-bit output.
 *
 * The scratch variables t1..t15 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define ib2(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = b ^ d; \
    t2  = ~t1; \
    t3  = a ^ c; \
    t4  = c ^ t1; \
    t7  = a | t2; \
    t5  = b & t4; \
    t8  = d ^ t7; \
    t11 = ~t4; \
    e   = t3 ^ t5; \
    t9  = t3 | t8; \
    t14 = d & t11; \
    h   = t1 ^ t9; \
    t12 = e | h; \
    f   = t11 ^ t12; \
    t15 = t3 ^ t12; \
    g   = t14 ^ t15; \
  } while (0)

/* 17 terms */

/*
 * Bitsliced Serpent S-box 3.
 *
 * Bit i of (a,b,c,d) forms one 4-bit input (a is the least significant
 * bit) and bit i of (e,f,g,h) receives the 4-bit output. The lookup table is
 *   { 0, 15, 11, 8, 12, 9, 6, 3, 13, 1, 2, 4, 10, 7, 5, 14 }.
 *
 * The scratch variables t1..t16 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define sb3(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = a ^ c; \
    t2  = d ^ t1; \
    t3  = a & t2; \
    t4  = d ^ t3; \
    t5  = b & t4; \
    g   = t2 ^ t5; \
    t7  = a | g; \
    t8  = b | d; \
    t11 = a | d; \
    t9  = t4 & t7; \
    f   = t8 ^ t9; \
    t12 = b ^ t11; \
    t13 = g ^ t9; \
    t15 = t3 ^ t8; \
    h   = t12 ^ t13; \
    t16 = c & t15; \
    e   = t12 ^ t16; \
  } while (0)

/* 16 term solution that performs less well than 17 term one
   in my environment (PPro/PII)

#define sb3(a,b,c,d,e,f,g,h) \
    t1 = a ^ b; \
    t2 = a & c; \
    t3 = a | d; \
    t4 = c ^ d; \
    t5 = t1 & t3; \
    t6 = t2 | t5; \
    g = t4 ^ t6; \
    t8 = b ^ t3; \
    t9 = t6 ^ t8; \
    t10 = t4 & t9; \
    e = t1 ^ t10; \
    t12 = g & e; \
    f = t9 ^ t12; \
    t14 = b | d; \
    t15 = t4 ^ t12; \
    h = t14 ^ t15
*/

/* 17 terms */

/*
 * Bitsliced inverse of Serpent S-box 3 (undoes sb3).
 *
 * Bit i of (a,b,c,d) is one 4-bit input (a is the least significant bit);
 * bit i of (e,f,g,h) receives the 4-bit output.
 *
 * The scratch variables t1..t16 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define ib3(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = b ^ c; \
    t2  = b | c; \
    t3  = a ^ c; \
    t7  = a ^ d; \
    t4  = t2 ^ t3; \
    t5  = d | t4; \
    t9  = t2 ^ t7; \
    e   = t1 ^ t5; \
    t8  = t1 | t5; \
    t11 = a & t4; \
    g   = t8 ^ t9; \
    t12 = e | t9; \
    f   = t11 ^ t12; \
    t14 = a & g; \
    t15 = t2 ^ t14; \
    t16 = e & t15; \
    h   = t4 ^ t16; \
  } while (0)

/* 15 terms */

/*
 * Bitsliced Serpent S-box 4.
 *
 * Bit i of (a,b,c,d) forms one 4-bit input (a is the least significant
 * bit) and bit i of (e,f,g,h) receives the 4-bit output. The lookup table is
 *   { 1, 15, 8, 3, 12, 0, 11, 6, 2, 5, 4, 10, 9, 14, 7, 13 }.
 *
 * The scratch variables t1..t14 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define sb4(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = a ^ d; \
    t2  = d & t1; \
    t3  = c ^ t2; \
    t4  = b | t3; \
    h   = t1 ^ t4; \
    t6  = ~b; \
    t7  = t1 | t6; \
    e   = t3 ^ t7; \
    t9  = a & e; \
    t10 = t1 ^ t6; \
    t11 = t4 & t10; \
    g   = t9 ^ t11; \
    t13 = a ^ t3; \
    t14 = t10 & g; \
    f   = t13 ^ t14; \
  } while (0)

/* 17 terms */

/*
 * Bitsliced inverse of Serpent S-box 4 (undoes sb4).
 *
 * Bit i of (a,b,c,d) is one 4-bit input (a is the least significant bit);
 * bit i of (e,f,g,h) receives the 4-bit output.
 *
 * The scratch variables t1..t16 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define ib4(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = c ^ d; \
    t2  = c | d; \
    t3  = b ^ t2; \
    t4  = a & t3; \
    f   = t1 ^ t4; \
    t6  = a ^ d; \
    t7  = b | d; \
    t8  = t6 & t7; \
    h   = t3 ^ t8; \
    t10 = ~a; \
    t11 = c ^ h; \
    t12 = t10 | t11; \
    e   = t3 ^ t12; \
    t14 = c | t4; \
    t15 = t7 ^ t14; \
    t16 = h | t10; \
    g   = t15 ^ t16; \
  } while (0)

/* 16 terms */

/*
 * Bitsliced Serpent S-box 5.
 *
 * Bit i of (a,b,c,d) forms one 4-bit input (a is the least significant
 * bit) and bit i of (e,f,g,h) receives the 4-bit output. The lookup table is
 *   { 15, 5, 2, 11, 4, 10, 9, 12, 0, 3, 14, 8, 13, 6, 7, 1 }.
 *
 * The scratch variables t1..t15 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define sb5(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = ~a; \
    t2  = a ^ b; \
    t3  = a ^ d; \
    t4  = c ^ t1; \
    t5  = t2 | t3; \
    e   = t4 ^ t5; \
    t7  = d & e; \
    t8  = t2 ^ e; \
    t10 = t1 | e; \
    f   = t7 ^ t8; \
    t11 = t2 | t7; \
    t12 = t3 ^ t10; \
    t14 = b ^ t7; \
    g   = t11 ^ t12; \
    t15 = f & t12; \
    h   = t14 ^ t15; \
  } while (0)

/* 16 terms */

/*
 * Bitsliced inverse of Serpent S-box 5 (undoes sb5).
 *
 * Bit i of (a,b,c,d) is one 4-bit input (a is the least significant bit);
 * bit i of (e,f,g,h) receives the 4-bit output.
 *
 * The scratch variables t1..t15 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define ib5(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = ~c; \
    t2  = b & t1; \
    t3  = d ^ t2; \
    t4  = a & t3; \
    t5  = b ^ t1; \
    h   = t4 ^ t5; \
    t7  = b | h; \
    t8  = a & t7; \
    f   = t3 ^ t8; \
    t10 = a | d; \
    t11 = t1 ^ t7; \
    e   = t10 ^ t11; \
    t13 = a ^ c; \
    t14 = b & t10; \
    t15 = t4 | t13; \
    g   = t14 ^ t15; \
  } while (0)

/* 15 terms */

/*
 * Bitsliced Serpent S-box 6.
 *
 * Bit i of (a,b,c,d) forms one 4-bit input (a is the least significant
 * bit) and bit i of (e,f,g,h) receives the 4-bit output. The lookup table is
 *   { 7, 2, 12, 5, 8, 4, 6, 11, 14, 9, 1, 15, 13, 3, 10, 0 }.
 *
 * The scratch variables t1..t14 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define sb6(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = ~a; \
    t2  = a ^ d; \
    t3  = b ^ t2; \
    t4  = t1 | t2; \
    t5  = c ^ t4; \
    f   = b ^ t5; \
    t13 = ~t5; \
    t7  = t2 | f; \
    t8  = d ^ t7; \
    t9  = t5 & t8; \
    g   = t3 ^ t9; \
    t11 = t5 ^ t8; \
    e   = g ^ t11; \
    t14 = t3 & t11; \
    h   = t13 ^ t14; \
  } while (0)

/* 15 terms */

/*
 * Bitsliced inverse of Serpent S-box 6 (undoes sb6).
 *
 * Bit i of (a,b,c,d) is one 4-bit input (a is the least significant bit);
 * bit i of (e,f,g,h) receives the 4-bit output.
 *
 * The scratch variables t1..t14 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define ib6(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = ~a; \
    t2  = a ^ b; \
    t3  = c ^ t2; \
    t4  = c | t1; \
    t5  = d ^ t4; \
    t13 = d & t1; \
    f   = t3 ^ t5; \
    t7  = t3 & t5; \
    t8  = t2 ^ t7; \
    t9  = b | t8; \
    h   = t5 ^ t9; \
    t11 = b | h; \
    e   = t8 ^ t11; \
    t14 = t3 ^ t11; \
    g   = t13 ^ t14; \
  } while (0)

/* 17 terms */

/*
 * Bitsliced Serpent S-box 7.
 *
 * Bit i of (a,b,c,d) forms one 4-bit input (a is the least significant
 * bit) and bit i of (e,f,g,h) receives the 4-bit output. The lookup table is
 *   { 1, 13, 15, 0, 14, 8, 2, 11, 7, 4, 12, 10, 9, 3, 5, 6 }.
 *
 * The scratch variables t1..t16 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define sb7(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = ~c; \
    t2  = b ^ c; \
    t3  = b | t1; \
    t4  = d ^ t3; \
    t5  = a & t4; \
    t7  = a ^ d; \
    h   = t2 ^ t5; \
    t8  = b ^ t5; \
    t9  = t2 | t8; \
    t11 = d & t3; \
    f   = t7 ^ t9; \
    t12 = t5 ^ f; \
    t15 = t1 | t4; \
    t13 = h & t12; \
    g   = t11 ^ t13; \
    t16 = t12 ^ g; \
    e   = t15 ^ t16; \
  } while (0)

/* 17 terms */

/*
 * Bitsliced inverse of Serpent S-box 7 (undoes sb7).
 *
 * Bit i of (a,b,c,d) is one 4-bit input (a is the least significant bit);
 * bit i of (e,f,g,h) receives the 4-bit output.
 *
 * The scratch variables t1..t16 used here must be declared by the caller.
 * Arguments must be plain variables, and the outputs e..h must not alias
 * the inputs a..d because outputs are stored while inputs are still read.
 * The do/while(0) wrapper makes the expansion a single statement.
 */
#define ib7(a,b,c,d,e,f,g,h) \
  do \
  { \
    t1  = a & b; \
    t2  = a | b; \
    t3  = c | t1; \
    t4  = d & t2; \
    h   = t3 ^ t4; \
    t6  = ~d; \
    t7  = b ^ t4; \
    t8  = h ^ t6; \
    t11 = c ^ t7; \
    t9  = t7 | t8; \
    f   = a ^ t9; \
    t12 = d | f; \
    e   = t11 ^ t12; \
    t14 = a & h; \
    t15 = t3 ^ f; \
    t16 = e ^ t14; \
    g   = t15 ^ t16; \
  } while (0)

/*
 * XOR the four key-schedule words of group r, ks[4*r + 8 .. 4*r + 11],
 * into a, b, c, d. An array named `ks` must be in scope at the call site.
 * r is parenthesized so that an expression such as `i + 1` indexes the
 * intended group; the do/while(0) wrapper makes this a single statement.
 */
#define k_xor(r,a,b,c,d) \
  do \
  { \
    a ^= ks[4 * (r) +  8]; \
    b ^= ks[4 * (r) +  9]; \
    c ^= ks[4 * (r) + 10]; \
    d ^= ks[4 * (r) + 11]; \
  } while (0)

/*
 * Load the four key-schedule words of group r, ks[4*r + 8 .. 4*r + 11],
 * into a, b, c, d (despite the name, this reads from ks). An array named
 * `ks` must be in scope at the call site. r is parenthesized so that an
 * expression argument indexes the intended group; the do/while(0) wrapper
 * makes this a single statement.
 */
#define k_set(r,a,b,c,d) \
  do \
  { \
    a = ks[4 * (r) +  8]; \
    b = ks[4 * (r) +  9]; \
    c = ks[4 * (r) + 10]; \
    d = ks[4 * (r) + 11]; \
  } while (0)

/*
 * Store a, b, c, d into the four key-schedule words of group r,
 * ks[4*r + 8 .. 4*r + 11] (despite the name, this writes to ks). A
 * writable array named `ks` must be in scope at the call site. r is
 * parenthesized so that an expression argument indexes the intended
 * group; the do/while(0) wrapper makes this a single statement.
 */
#define k_get(r,a,b,c,d) \
  do \
  { \
    ks[4 * (r) +  8] = a; \
    ks[4 * (r) +  9] = b; \
    ks[4 * (r) + 10] = c; \
    ks[4 * (r) + 11] = d; \
  } while (0)

/* the linear transformation and its inverse */

/*
 * Forward linear mixing step applied in place to the four words
 * a, b, c, d using rotl32(), shifts and XORs; irot undoes it.
 * rotl32() is expected to be provided elsewhere (32-bit rotate left).
 * Each argument is read and written several times, so pass plain
 * variables only. The do/while(0) wrapper makes this a single statement.
 */
#define rot(a,b,c,d) \
  do \
  { \
    a = rotl32(a, 13); \
    c = rotl32(c, 3); \
    d ^= c ^ (a << 3); \
    b ^= a ^ c; \
    d = rotl32(d, 7); \
    b = rotl32(b, 1); \
    a ^= b ^ d; \
    c ^= d ^ (b << 7); \
    a = rotl32(a, 5); \
    c = rotl32(c, 22); \
  } while (0)

/*
 * Inverse of the linear mixing step `rot`: performs the same operations
 * in reverse order with right rotations, in place on a, b, c, d.
 * rotr32() is expected to be provided elsewhere (32-bit rotate right).
 * Each argument is read and written several times, so pass plain
 * variables only. The do/while(0) wrapper makes this a single statement.
 */
#define irot(a,b,c,d) \
  do \
  { \
    c = rotr32(c, 22); \
    a = rotr32(a, 5); \
    c ^= d ^ (b << 7); \
    a ^= b ^ d; \
    d = rotr32(d, 7); \
    b = rotr32(b, 1); \
    d ^= c ^ (a << 3); \
    b ^= a ^ c; \
    c = rotr32(c, 3); \
    a = rotr32(a, 13); \
  } while (0)

404 static void serpent256_set_key (u32 *ks, const u32 *ukey)
405 {
406 #pragma unroll
407 for (int i = 0; i < 8; i++)
408 {
409 ks[i] = ukey[i];
410 }
411
412 #pragma unroll
413 for (int i = 0; i < 132; i++)
414 {
415 ks[i + 8] = rotl32 (ks[i + 7] ^ ks[i + 5] ^ ks[i + 3] ^ ks[i + 0] ^ 0x9e3779b9 ^ i, 11);
416 }
417
418 u32 a,b,c,d,e,f,g,h;
419 u32 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16;
420
421 k_set( 0,a,b,c,d); sb3(a,b,c,d,e,f,g,h); k_get( 0,e,f,g,h);
422 k_set( 1,a,b,c,d); sb2(a,b,c,d,e,f,g,h); k_get( 1,e,f,g,h);
423 k_set( 2,a,b,c,d); sb1(a,b,c,d,e,f,g,h); k_get( 2,e,f,g,h);
424 k_set( 3,a,b,c,d); sb0(a,b,c,d,e,f,g,h); k_get( 3,e,f,g,h);
425 k_set( 4,a,b,c,d); sb7(a,b,c,d,e,f,g,h); k_get( 4,e,f,g,h);
426 k_set( 5,a,b,c,d); sb6(a,b,c,d,e,f,g,h); k_get( 5,e,f,g,h);
427 k_set( 6,a,b,c,d); sb5(a,b,c,d,e,f,g,h); k_get( 6,e,f,g,h);
428 k_set( 7,a,b,c,d); sb4(a,b,c,d,e,f,g,h); k_get( 7,e,f,g,h);
429 k_set( 8,a,b,c,d); sb3(a,b,c,d,e,f,g,h); k_get( 8,e,f,g,h);
430 k_set( 9,a,b,c,d); sb2(a,b,c,d,e,f,g,h); k_get( 9,e,f,g,h);
431 k_set(10,a,b,c,d); sb1(a,b,c,d,e,f,g,h); k_get(10,e,f,g,h);
432 k_set(11,a,b,c,d); sb0(a,b,c,d,e,f,g,h); k_get(11,e,f,g,h);
433 k_set(12,a,b,c,d); sb7(a,b,c,d,e,f,g,h); k_get(12,e,f,g,h);
434 k_set(13,a,b,c,d); sb6(a,b,c,d,e,f,g,h); k_get(13,e,f,g,h);
435 k_set(14,a,b,c,d); sb5(a,b,c,d,e,f,g,h); k_get(14,e,f,g,h);
436 k_set(15,a,b,c,d); sb4(a,b,c,d,e,f,g,h); k_get(15,e,f,g,h);
437 k_set(16,a,b,c,d); sb3(a,b,c,d,e,f,g,h); k_get(16,e,f,g,h);
438 k_set(17,a,b,c,d); sb2(a,b,c,d,e,f,g,h); k_get(17,e,f,g,h);
439 k_set(18,a,b,c,d); sb1(a,b,c,d,e,f,g,h); k_get(18,e,f,g,h);
440 k_set(19,a,b,c,d); sb0(a,b,c,d,e,f,g,h); k_get(19,e,f,g,h);
441 k_set(20,a,b,c,d); sb7(a,b,c,d,e,f,g,h); k_get(20,e,f,g,h);
442 k_set(21,a,b,c,d); sb6(a,b,c,d,e,f,g,h); k_get(21,e,f,g,h);
443 k_set(22,a,b,c,d); sb5(a,b,c,d,e,f,g,h); k_get(22,e,f,g,h);
444 k_set(23,a,b,c,d); sb4(a,b,c,d,e,f,g,h); k_get(23,e,f,g,h);
445 k_set(24,a,b,c,d); sb3(a,b,c,d,e,f,g,h); k_get(24,e,f,g,h);
446 k_set(25,a,b,c,d); sb2(a,b,c,d,e,f,g,h); k_get(25,e,f,g,h);
447 k_set(26,a,b,c,d); sb1(a,b,c,d,e,f,g,h); k_get(26,e,f,g,h);
448 k_set(27,a,b,c,d); sb0(a,b,c,d,e,f,g,h); k_get(27,e,f,g,h);
449 k_set(28,a,b,c,d); sb7(a,b,c,d,e,f,g,h); k_get(28,e,f,g,h);
450 k_set(29,a,b,c,d); sb6(a,b,c,d,e,f,g,h); k_get(29,e,f,g,h);
451 k_set(30,a,b,c,d); sb5(a,b,c,d,e,f,g,h); k_get(30,e,f,g,h);
452 k_set(31,a,b,c,d); sb4(a,b,c,d,e,f,g,h); k_get(31,e,f,g,h);
453 k_set(32,a,b,c,d); sb3(a,b,c,d,e,f,g,h); k_get(32,e,f,g,h);
454 }
455
456 static void serpent256_encrypt (const u32 *ks, const u32 *in, u32 *out)
457 {
458 u32 a,b,c,d,e,f,g,h;
459 u32 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16;
460
461 a = in[0];
462 b = in[1];
463 c = in[2];
464 d = in[3];
465
466 k_xor( 0,a,b,c,d); sb0(a,b,c,d,e,f,g,h); rot(e,f,g,h);
467 k_xor( 1,e,f,g,h); sb1(e,f,g,h,a,b,c,d); rot(a,b,c,d);
468 k_xor( 2,a,b,c,d); sb2(a,b,c,d,e,f,g,h); rot(e,f,g,h);
469 k_xor( 3,e,f,g,h); sb3(e,f,g,h,a,b,c,d); rot(a,b,c,d);
470 k_xor( 4,a,b,c,d); sb4(a,b,c,d,e,f,g,h); rot(e,f,g,h);
471 k_xor( 5,e,f,g,h); sb5(e,f,g,h,a,b,c,d); rot(a,b,c,d);
472 k_xor( 6,a,b,c,d); sb6(a,b,c,d,e,f,g,h); rot(e,f,g,h);
473 k_xor( 7,e,f,g,h); sb7(e,f,g,h,a,b,c,d); rot(a,b,c,d);
474 k_xor( 8,a,b,c,d); sb0(a,b,c,d,e,f,g,h); rot(e,f,g,h);
475 k_xor( 9,e,f,g,h); sb1(e,f,g,h,a,b,c,d); rot(a,b,c,d);
476 k_xor(10,a,b,c,d); sb2(a,b,c,d,e,f,g,h); rot(e,f,g,h);
477 k_xor(11,e,f,g,h); sb3(e,f,g,h,a,b,c,d); rot(a,b,c,d);
478 k_xor(12,a,b,c,d); sb4(a,b,c,d,e,f,g,h); rot(e,f,g,h);
479 k_xor(13,e,f,g,h); sb5(e,f,g,h,a,b,c,d); rot(a,b,c,d);
480 k_xor(14,a,b,c,d); sb6(a,b,c,d,e,f,g,h); rot(e,f,g,h);
481 k_xor(15,e,f,g,h); sb7(e,f,g,h,a,b,c,d); rot(a,b,c,d);
482 k_xor(16,a,b,c,d); sb0(a,b,c,d,e,f,g,h); rot(e,f,g,h);
483 k_xor(17,e,f,g,h); sb1(e,f,g,h,a,b,c,d); rot(a,b,c,d);
484 k_xor(18,a,b,c,d); sb2(a,b,c,d,e,f,g,h); rot(e,f,g,h);
485 k_xor(19,e,f,g,h); sb3(e,f,g,h,a,b,c,d); rot(a,b,c,d);
486 k_xor(20,a,b,c,d); sb4(a,b,c,d,e,f,g,h); rot(e,f,g,h);
487 k_xor(21,e,f,g,h); sb5(e,f,g,h,a,b,c,d); rot(a,b,c,d);
488 k_xor(22,a,b,c,d); sb6(a,b,c,d,e,f,g,h); rot(e,f,g,h);
489 k_xor(23,e,f,g,h); sb7(e,f,g,h,a,b,c,d); rot(a,b,c,d);
490 k_xor(24,a,b,c,d); sb0(a,b,c,d,e,f,g,h); rot(e,f,g,h);
491 k_xor(25,e,f,g,h); sb1(e,f,g,h,a,b,c,d); rot(a,b,c,d);
492 k_xor(26,a,b,c,d); sb2(a,b,c,d,e,f,g,h); rot(e,f,g,h);
493 k_xor(27,e,f,g,h); sb3(e,f,g,h,a,b,c,d); rot(a,b,c,d);
494 k_xor(28,a,b,c,d); sb4(a,b,c,d,e,f,g,h); rot(e,f,g,h);
495 k_xor(29,e,f,g,h); sb5(e,f,g,h,a,b,c,d); rot(a,b,c,d);
496 k_xor(30,a,b,c,d); sb6(a,b,c,d,e,f,g,h); rot(e,f,g,h);
497 k_xor(31,e,f,g,h); sb7(e,f,g,h,a,b,c,d);
498 k_xor(32,a,b,c,d);
499
500 out[0] = a;
501 out[1] = b;
502 out[2] = c;
503 out[3] = d;
504 }
505
506 static void serpent256_decrypt (const u32 *ks, const u32 *in, u32 *out)
507 {
508 u32 a,b,c,d,e,f,g,h;
509 u32 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16;
510
511 a = in[0];
512 b = in[1];
513 c = in[2];
514 d = in[3];
515
516 k_xor(32,a,b,c,d);
517 ib7(a,b,c,d,e,f,g,h); k_xor(31,e,f,g,h);
518 irot(e,f,g,h); ib6(e,f,g,h,a,b,c,d); k_xor(30,a,b,c,d);
519 irot(a,b,c,d); ib5(a,b,c,d,e,f,g,h); k_xor(29,e,f,g,h);
520 irot(e,f,g,h); ib4(e,f,g,h,a,b,c,d); k_xor(28,a,b,c,d);
521 irot(a,b,c,d); ib3(a,b,c,d,e,f,g,h); k_xor(27,e,f,g,h);
522 irot(e,f,g,h); ib2(e,f,g,h,a,b,c,d); k_xor(26,a,b,c,d);
523 irot(a,b,c,d); ib1(a,b,c,d,e,f,g,h); k_xor(25,e,f,g,h);
524 irot(e,f,g,h); ib0(e,f,g,h,a,b,c,d); k_xor(24,a,b,c,d);
525 irot(a,b,c,d); ib7(a,b,c,d,e,f,g,h); k_xor(23,e,f,g,h);
526 irot(e,f,g,h); ib6(e,f,g,h,a,b,c,d); k_xor(22,a,b,c,d);
527 irot(a,b,c,d); ib5(a,b,c,d,e,f,g,h); k_xor(21,e,f,g,h);
528 irot(e,f,g,h); ib4(e,f,g,h,a,b,c,d); k_xor(20,a,b,c,d);
529 irot(a,b,c,d); ib3(a,b,c,d,e,f,g,h); k_xor(19,e,f,g,h);
530 irot(e,f,g,h); ib2(e,f,g,h,a,b,c,d); k_xor(18,a,b,c,d);
531 irot(a,b,c,d); ib1(a,b,c,d,e,f,g,h); k_xor(17,e,f,g,h);
532 irot(e,f,g,h); ib0(e,f,g,h,a,b,c,d); k_xor(16,a,b,c,d);
533 irot(a,b,c,d); ib7(a,b,c,d,e,f,g,h); k_xor(15,e,f,g,h);
534 irot(e,f,g,h); ib6(e,f,g,h,a,b,c,d); k_xor(14,a,b,c,d);
535 irot(a,b,c,d); ib5(a,b,c,d,e,f,g,h); k_xor(13,e,f,g,h);
536 irot(e,f,g,h); ib4(e,f,g,h,a,b,c,d); k_xor(12,a,b,c,d);
537 irot(a,b,c,d); ib3(a,b,c,d,e,f,g,h); k_xor(11,e,f,g,h);
538 irot(e,f,g,h); ib2(e,f,g,h,a,b,c,d); k_xor(10,a,b,c,d);
539 irot(a,b,c,d); ib1(a,b,c,d,e,f,g,h); k_xor( 9,e,f,g,h);
540 irot(e,f,g,h); ib0(e,f,g,h,a,b,c,d); k_xor( 8,a,b,c,d);
541 irot(a,b,c,d); ib7(a,b,c,d,e,f,g,h); k_xor( 7,e,f,g,h);
542 irot(e,f,g,h); ib6(e,f,g,h,a,b,c,d); k_xor( 6,a,b,c,d);
543 irot(a,b,c,d); ib5(a,b,c,d,e,f,g,h); k_xor( 5,e,f,g,h);
544 irot(e,f,g,h); ib4(e,f,g,h,a,b,c,d); k_xor( 4,a,b,c,d);
545 irot(a,b,c,d); ib3(a,b,c,d,e,f,g,h); k_xor( 3,e,f,g,h);
546 irot(e,f,g,h); ib2(e,f,g,h,a,b,c,d); k_xor( 2,a,b,c,d);
547 irot(a,b,c,d); ib1(a,b,c,d,e,f,g,h); k_xor( 1,e,f,g,h);
548 irot(e,f,g,h); ib0(e,f,g,h,a,b,c,d); k_xor( 0,a,b,c,d);
549
550 out[0] = a;
551 out[1] = b;
552 out[2] = c;
553 out[3] = d;
554 }
555
556 static void serpent256_decrypt_xts (const u32 *ukey1, const u32 *ukey2, const u32 *in, u32 *out)
557 {
558 u32 T[4] = { 0 };
559 u32 Z[4] = { 0 };
560
561 out[0] = in[0];
562 out[1] = in[1];
563 out[2] = in[2];
564 out[3] = in[3];
565
566 u32 ks[140];
567
568 serpent256_set_key (ks, ukey2);
569 serpent256_encrypt (ks, Z, T);
570
571 out[0] ^= T[0];
572 out[1] ^= T[1];
573 out[2] ^= T[2];
574 out[3] ^= T[3];
575
576 serpent256_set_key (ks, ukey1);
577 serpent256_decrypt (ks, out, out);
578
579 out[0] ^= T[0];
580 out[1] ^= T[1];
581 out[2] ^= T[2];
582 out[3] ^= T[3];
583 }