Prepare new SIMD code for kernel, -m 0, 10, 20, 1000 should work in -a 3 mode and...
[hashcat.git] / OpenCL / types_ocl.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
// Fixed-width unsigned aliases used throughout the kernels.
// (OpenCL C: char = 8, short = 16, int = 32, long = 64 bits.)
typedef unsigned char  u8;
typedef unsigned short u16;
typedef unsigned int   u32;
typedef unsigned long  u64;
10
// Without NEW_SIMD_CODE the vector width is forced back to 1 (scalar),
// overriding whatever VECT_SIZE was set to before this point.
#if !defined (NEW_SIMD_CODE)
#undef  VECT_SIZE
#define VECT_SIZE 1
#endif
15
16 #if VECT_SIZE == 1
17 typedef uint u32x;
18 typedef ulong u64x;
19 #endif
20
21 #if VECT_SIZE == 2
22 typedef uint2 u32x;
23 typedef ulong2 u64x;
24 #endif
25
26 #if VECT_SIZE == 4
27 typedef uint4 u32x;
28 typedef ulong4 u64x;
29 #endif
30
31 #if VECT_SIZE == 8
32 typedef uint8 u32x;
33 typedef ulong8 u64x;
34 #endif
35
// Identity macro: expands to its argument unchanged.
// Marked for removal by the original author ("this one needs to die").
#define allx(r) r
38
39 static inline u32 l32_from_64 (u64 a)
40 {
41 const u32 r = (uint) (a);
42
43 return r;
44 }
45
46 static inline u32 h32_from_64 (u64 a)
47 {
48 a >>= 32;
49
50 const u32 r = (uint) (a);
51
52 return r;
53 }
54
55 static inline u64 hl32_to_64 (const u32 a, const u32 b)
56 {
57 return as_ulong ((uint2) (b, a));
58 }
59
60 #ifdef IS_AMD
61 static inline u32 swap32 (const u32 v)
62 {
63 return (as_uint (as_uchar4 (v).s3210));
64 }
65
66 static inline u64 swap64 (const u64 v)
67 {
68 return (as_ulong (as_uchar8 (v).s76543210));
69 }
70 #endif
71
72 #ifdef IS_NV
73 static inline u32 swap32 (const u32 v)
74 {
75 u32 r;
76
77 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));
78
79 return r;
80 }
81
82 static inline u64 swap64 (const u64 v)
83 {
84 u32 il;
85 u32 ir;
86
87 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));
88
89 u32 tl;
90 u32 tr;
91
92 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
93 asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));
94
95 u64 r;
96
97 asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));
98
99 return r;
100 }
101 #endif
102
103 #ifdef IS_GENERIC
104 static inline u32 swap32 (const u32 v)
105 {
106 return (as_uint (as_uchar4 (v).s3210));
107 }
108
109 static inline u64 swap64 (const u64 v)
110 {
111 return (as_ulong (as_uchar8 (v).s76543210));
112 }
113 #endif
114
115 #ifdef IS_AMD
116 static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
117 {
118 return amd_bfe (a, b, c);
119 }
120 #endif
121
122 #ifdef IS_NV
123 static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
124 {
125 u32 r;
126
127 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
128
129 return r;
130 }
131
132 static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
133 {
134 u32x r;
135
136 #if VECT_SIZE == 1
137 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) );
138 #endif
139
140 #if VECT_SIZE == 2
141 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
142 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
143 #endif
144
145 #if VECT_SIZE == 4
146 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
147 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
148 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
149 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
150 #endif
151
152 #if VECT_SIZE == 8
153 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
154 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
155 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
156 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
157 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
158 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
159 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
160 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
161 #endif
162
163 return r;
164 }
165
166 static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
167 {
168 u32 r;
169
170 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
171
172 return r;
173 }
174
175 #if CUDA_ARCH >= 350
176 static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
177 {
178 u32 r;
179
180 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));
181
182 return r;
183 }
184 #else
185 static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
186 {
187 return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
188 }
189 #endif
190 #endif
191
192 #ifdef IS_GENERIC
193 static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
194 {
195 #define BIT(x) (1 << (x))
196 #define BIT_MASK(x) (BIT (x) - 1)
197 #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
198
199 return BFE (a, b, c);
200 }
201
202 static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
203 {
204 const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);
205
206 return (u32) (tmp);
207 }
208
209 static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
210 {
211 #if VECT_SIZE == 1
212 const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);
213
214 return (u32x) (tmp);
215 #endif
216
217 #if VECT_SIZE == 2
218 const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);
219
220 return (u32x) (tmp.s0, tmp.s1);
221 #endif
222
223 #if VECT_SIZE == 4
224 const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);
225
226 return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);
227 #endif
228
229 #if VECT_SIZE == 8
230 const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);
231
232 return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
233 #endif
234 }
235 #endif
236
237 #ifdef IS_AMD
238 static inline u32x rotr32 (const u32x a, const u32 n)
239 {
240 return rotate (a, 32 - n);
241 }
242
243 static inline u32x rotl32 (const u32x a, const u32 n)
244 {
245 return rotate (a, n);
246 }
247
248 static inline u64 rotr64 (const u64 a, const u32 n)
249 {
250 uint2 a2 = as_uint2 (a);
251
252 uint2 t;
253
254 t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32)
255 : amd_bitalign (a2.s1, a2.s0, n);
256 t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32)
257 : amd_bitalign (a2.s0, a2.s1, n);
258
259 return as_ulong (t);
260 }
261
262 static inline u64 rotl64 (const u64 a, const u32 n)
263 {
264 return rotr64 (a, 64 - n);
265 }
266 #endif
267
268 #ifdef IS_NV
269 static inline u32x rotr32 (const u32x a, const u32 n)
270 {
271 return rotate (a, 32 - n);
272 }
273
274 static inline u32x rotl32 (const u32x a, const u32 n)
275 {
276 return rotate (a, n);
277 }
278
279 static inline u64 rotr64 (const u64 a, const u32 n)
280 {
281 u32 il;
282 u32 ir;
283
284 asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));
285
286 u32 tl;
287 u32 tr;
288
289 if (n >= 32)
290 {
291 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
292 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
293 }
294 else
295 {
296 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
297 asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
298 }
299
300 u64 r;
301
302 asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));
303
304 return r;
305 }
306
307 static inline u64 rotl64 (const u64 a, const u32 n)
308 {
309 return rotr64 (a, 64 - n);
310 }
311 #endif
312
313 #ifdef IS_GENERIC
314
315 static inline u32x rotr32 (const u32x a, const u32x n)
316 {
317 return rotate (a, 32 - n);
318 }
319
320 static inline u32x rotl32 (const u32x a, const u32x n)
321 {
322 return rotate (a, n);
323 }
324
325 static inline u64 rotr64 (const u64 a, const u32 n)
326 {
327 return rotate (a, (u64) 64 - n);
328 }
329
330 static inline u64 rotl64 (const u64 a, const u32 n)
331 {
332 return rotate (a, (u64) n);
333 }
334 #endif
335
336 #ifdef IS_NV
337 #if CUDA_ARCH >= 500
338 static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
339 {
340 u32x r;
341
342 #if VECT_SIZE == 1
343 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
344 #endif
345
346 #if VECT_SIZE == 2
347 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
348 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
349 #endif
350
351 #if VECT_SIZE == 4
352 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
353 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
354 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
355 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
356 #endif
357
358 #if VECT_SIZE == 8
359 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
360 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
361 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
362 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
363 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
364 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
365 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
366 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
367 #endif
368
369 return r;
370 }
371
372 static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
373 {
374 u32x r;
375
376 #if VECT_SIZE == 1
377 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
378 #endif
379
380 #if VECT_SIZE == 2
381 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
382 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
383 #endif
384
385 #if VECT_SIZE == 4
386 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
387 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
388 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
389 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
390 #endif
391
392 #if VECT_SIZE == 8
393 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
394 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
395 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
396 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
397 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
398 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
399 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
400 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
401 #endif
402
403 return r;
404 }
405
406 static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
407 {
408 u32x r;
409
410 #if VECT_SIZE == 1
411 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
412 #endif
413
414 #if VECT_SIZE == 2
415 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
416 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
417 #endif
418
419 #if VECT_SIZE == 4
420 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
421 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
422 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
423 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
424 #endif
425
426 #if VECT_SIZE == 8
427 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
428 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
429 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
430 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
431 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
432 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
433 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
434 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
435 #endif
436
437 return r;
438 }
439
440 static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
441 {
442 u32x r;
443
444 #if VECT_SIZE == 1
445 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
446 #endif
447
448 #if VECT_SIZE == 2
449 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
450 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
451 #endif
452
453 #if VECT_SIZE == 4
454 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
455 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
456 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
457 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
458 #endif
459
460 #if VECT_SIZE == 8
461 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
462 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
463 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
464 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
465 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
466 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
467 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
468 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
469 #endif
470
471 return r;
472 }
473
474 static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
475 {
476 u32x r;
477
478 #if VECT_SIZE == 1
479 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
480 #endif
481
482 #if VECT_SIZE == 2
483 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
484 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
485 #endif
486
487 #if VECT_SIZE == 4
488 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
489 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
490 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
491 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
492 #endif
493
494 #if VECT_SIZE == 8
495 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
496 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
497 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
498 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
499 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
500 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
501 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
502 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
503 #endif
504
505 return r;
506 }
507
508 static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
509 {
510 u32x r;
511
512 #if VECT_SIZE == 1
513 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
514 #endif
515
516 #if VECT_SIZE == 2
517 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
518 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
519 #endif
520
521 #if VECT_SIZE == 4
522 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
523 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
524 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
525 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
526 #endif
527
528 #if VECT_SIZE == 8
529 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
530 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
531 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
532 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
533 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
534 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
535 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
536 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
537 #endif
538
539 return r;
540 }
541
542 static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
543 {
544 u32x r;
545
546 #if VECT_SIZE == 1
547 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
548 #endif
549
550 #if VECT_SIZE == 2
551 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
552 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
553 #endif
554
555 #if VECT_SIZE == 4
556 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
557 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
558 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
559 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
560 #endif
561
562 #if VECT_SIZE == 8
563 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
564 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
565 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
566 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
567 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
568 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
569 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
570 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
571 #endif
572
573 return r;
574 }
575
576 #endif
577 #endif
578
/**
 * Digest record. Its size (in 32-bit words) is chosen at compile time by
 * whichever algorithm macro is defined; the first matching branch wins.
 * If none of the listed macros is defined the struct has no members.
 *
 * The original chain listed _SCRYPT_ and _LOTUS8_ twice. The second
 * occurrences could never be selected (and the second _SCRYPT_ size, 4,
 * contradicted the first, 8), so they have been removed. The effective
 * size for every macro is unchanged.
 */
typedef struct
{
  #if   defined _DES_
  u32 digest_buf[4];
  #elif defined _MD4_
  u32 digest_buf[4];
  #elif defined _MD5_
  u32 digest_buf[4];
  #elif defined _MD5H_
  u32 digest_buf[4];
  #elif defined _SHA1_
  u32 digest_buf[5];
  #elif defined _BCRYPT_
  u32 digest_buf[6];
  #elif defined _SHA256_
  u32 digest_buf[8];
  #elif defined _SHA384_
  u32 digest_buf[16];
  #elif defined _SHA512_
  u32 digest_buf[16];
  #elif defined _KECCAK_
  u32 digest_buf[50];
  #elif defined _RIPEMD160_
  u32 digest_buf[5];
  #elif defined _WHIRLPOOL_
  u32 digest_buf[16];
  #elif defined _GOST_
  u32 digest_buf[8];
  #elif defined _GOST2012_256_
  u32 digest_buf[8];
  #elif defined _GOST2012_512_
  u32 digest_buf[16];
  #elif defined _SAPB_
  u32 digest_buf[4];
  #elif defined _SAPG_
  u32 digest_buf[5];
  #elif defined _MYSQL323_
  u32 digest_buf[4];
  #elif defined _LOTUS5_
  u32 digest_buf[4];
  #elif defined _LOTUS6_
  u32 digest_buf[4];
  #elif defined _SCRYPT_
  u32 digest_buf[8];
  #elif defined _LOTUS8_
  u32 digest_buf[4];
  #elif defined _OFFICE2007_
  u32 digest_buf[4];
  #elif defined _OFFICE2010_
  u32 digest_buf[4];
  #elif defined _OFFICE2013_
  u32 digest_buf[4];
  #elif defined _OLDOFFICE01_
  u32 digest_buf[4];
  #elif defined _OLDOFFICE34_
  u32 digest_buf[4];
  #elif defined _SIPHASH_
  u32 digest_buf[4];
  #elif defined _PBKDF2_MD5_
  u32 digest_buf[32];
  #elif defined _PBKDF2_SHA1_
  u32 digest_buf[32];
  #elif defined _PBKDF2_SHA256_
  u32 digest_buf[32];
  #elif defined _PBKDF2_SHA512_
  u32 digest_buf[32];
  #elif defined _PDF17L8_
  u32 digest_buf[8];
  #elif defined _CRC32_
  u32 digest_buf[4];
  #elif defined _SEVEN_ZIP_
  u32 digest_buf[4];
  #elif defined _ANDROIDFDE_
  u32 digest_buf[4];
  #elif defined _DCC2_
  u32 digest_buf[4];
  #elif defined _WPA_
  u32 digest_buf[4];
  #elif defined _MD5_SHA1_
  u32 digest_buf[4];
  #elif defined _SHA1_MD5_
  u32 digest_buf[5];
  #elif defined _NETNTLMV2_
  u32 digest_buf[4];
  #elif defined _KRB5PA_
  u32 digest_buf[4];
  #elif defined _CLOUDKEY_
  u32 digest_buf[8];
  #elif defined _PSAFE2_
  u32 digest_buf[5];
  #elif defined _RAR3_
  u32 digest_buf[4];
  #elif defined _SHA256_SHA1_
  u32 digest_buf[8];
  #elif defined _MS_DRSR_
  u32 digest_buf[8];
  #elif defined _ANDROIDFDE_SAMSUNG_
  u32 digest_buf[8];
  #elif defined _RAR5_
  u32 digest_buf[4];
  #endif

} digest_t;
686
687 typedef struct
688 {
689 u32 salt_buf[16];
690 u32 salt_buf_pc[8];
691
692 u32 salt_len;
693 u32 salt_iter;
694 u32 salt_sign[2];
695
696 u32 keccak_mdlen;
697 u32 truecrypt_mdlen;
698
699 u32 digests_cnt;
700 u32 digests_done;
701
702 u32 digests_offset;
703
704 u32 scrypt_N;
705 u32 scrypt_r;
706 u32 scrypt_p;
707 u32 scrypt_tmto;
708 u32 scrypt_phy;
709
710 } salt_t;
711
712 typedef struct
713 {
714 int V;
715 int R;
716 int P;
717
718 int enc_md;
719
720 u32 id_buf[8];
721 u32 u_buf[32];
722 u32 o_buf[32];
723
724 int id_len;
725 int o_len;
726 int u_len;
727
728 u32 rc4key[2];
729 u32 rc4data[2];
730
731 } pdf_t;
732
733 typedef struct
734 {
735 u32 pke[25];
736 u32 eapol[64];
737 int eapol_size;
738 int keyver;
739
740 } wpa_t;
741
742 typedef struct
743 {
744 u32 cry_master_buf[64];
745 u32 ckey_buf[64];
746 u32 public_key_buf[64];
747
748 u32 cry_master_len;
749 u32 ckey_len;
750 u32 public_key_len;
751
752 } bitcoin_wallet_t;
753
754 typedef struct
755 {
756 u32 salt_buf[30];
757 u32 salt_len;
758
759 u32 esalt_buf[38];
760 u32 esalt_len;
761
762 } sip_t;
763
764 typedef struct
765 {
766 u32 data[384];
767
768 } androidfde_t;
769
770 typedef struct
771 {
772 u32 nr_buf[16];
773 u32 nr_len;
774
775 u32 msg_buf[128];
776 u32 msg_len;
777
778 } ikepsk_t;
779
780 typedef struct
781 {
782 u32 user_len;
783 u32 domain_len;
784 u32 srvchall_len;
785 u32 clichall_len;
786
787 u32 userdomain_buf[64];
788 u32 chall_buf[256];
789
790 } netntlm_t;
791
792 typedef struct
793 {
794 u32 user[16];
795 u32 realm[16];
796 u32 salt[32];
797 u32 timestamp[16];
798 u32 checksum[4];
799
800 } krb5pa_t;
801
802 typedef struct
803 {
804 u32 salt_buf[16];
805 u32 data_buf[112];
806 u32 keyfile_buf[16];
807
808 } tc_t;
809
810 typedef struct
811 {
812 u32 salt_buf[16];
813
814 } pbkdf2_md5_t;
815
816 typedef struct
817 {
818 u32 salt_buf[16];
819
820 } pbkdf2_sha1_t;
821
822 typedef struct
823 {
824 u32 salt_buf[16];
825
826 } pbkdf2_sha256_t;
827
828 typedef struct
829 {
830 u32 salt_buf[32];
831
832 } pbkdf2_sha512_t;
833
834 typedef struct
835 {
836 u32 salt_buf[128];
837 u32 salt_len;
838
839 } rakp_t;
840
841 typedef struct
842 {
843 u32 data_len;
844 u32 data_buf[512];
845
846 } cloudkey_t;
847
848 typedef struct
849 {
850 u32 encryptedVerifier[4];
851 u32 encryptedVerifierHash[5];
852
853 u32 keySize;
854
855 } office2007_t;
856
857 typedef struct
858 {
859 u32 encryptedVerifier[4];
860 u32 encryptedVerifierHash[8];
861
862 } office2010_t;
863
864 typedef struct
865 {
866 u32 encryptedVerifier[4];
867 u32 encryptedVerifierHash[8];
868
869 } office2013_t;
870
871 typedef struct
872 {
873 u32 version;
874 u32 encryptedVerifier[4];
875 u32 encryptedVerifierHash[4];
876 u32 rc4key[2];
877
878 } oldoffice01_t;
879
880 typedef struct
881 {
882 u32 version;
883 u32 encryptedVerifier[4];
884 u32 encryptedVerifierHash[5];
885 u32 rc4key[2];
886
887 } oldoffice34_t;
888
889 typedef struct
890 {
891 u32 digest[4];
892 u32 out[4];
893
894 } pdf14_tmp_t;
895
896 typedef struct
897 {
898 union
899 {
900 u32 dgst32[16];
901 u64 dgst64[8];
902 };
903
904 u32 dgst_len;
905 u32 W_len;
906
907 } pdf17l8_tmp_t;
908
909 typedef struct
910 {
911 u32 digest_buf[4];
912
913 } phpass_tmp_t;
914
915 typedef struct
916 {
917 u32 digest_buf[4];
918
919 } md5crypt_tmp_t;
920
921 typedef struct
922 {
923 u32 alt_result[8];
924
925 u32 p_bytes[4];
926 u32 s_bytes[4];
927
928 } sha256crypt_tmp_t;
929
930 typedef struct
931 {
932 u64 l_alt_result[8];
933
934 u64 l_p_bytes[2];
935 u64 l_s_bytes[2];
936
937 } sha512crypt_tmp_t;
938
939 typedef struct
940 {
941 u32 ipad[5];
942 u32 opad[5];
943
944 u32 dgst[10];
945 u32 out[10];
946
947 } wpa_tmp_t;
948
949 typedef struct
950 {
951 u64 dgst[8];
952
953 } bitcoin_wallet_tmp_t;
954
955 typedef struct
956 {
957 u32 ipad[5];
958 u32 opad[5];
959
960 u32 dgst[5];
961 u32 out[4];
962
963 } dcc2_tmp_t;
964
965 typedef struct
966 {
967 u32 E[18];
968
969 u32 P[18];
970
971 u32 S0[256];
972 u32 S1[256];
973 u32 S2[256];
974 u32 S3[256];
975
976 } bcrypt_tmp_t;
977
978 typedef struct
979 {
980 u32 digest[2];
981
982 u32 P[18];
983
984 u32 S0[256];
985 u32 S1[256];
986 u32 S2[256];
987 u32 S3[256];
988
989 } pwsafe2_tmp_t;
990
991 typedef struct
992 {
993 u32 digest_buf[8];
994
995 } pwsafe3_tmp_t;
996
997 typedef struct
998 {
999 u32 digest_buf[5];
1000
1001 } androidpin_tmp_t;
1002
1003 typedef struct
1004 {
1005 u32 ipad[5];
1006 u32 opad[5];
1007
1008 u32 dgst[10];
1009 u32 out[10];
1010
1011 } androidfde_tmp_t;
1012
1013 typedef struct
1014 {
1015 u32 ipad[16];
1016 u32 opad[16];
1017
1018 u32 dgst[64];
1019 u32 out[64];
1020
1021 } tc_tmp_t;
1022
1023 typedef struct
1024 {
1025 u64 ipad[8];
1026 u64 opad[8];
1027
1028 u64 dgst[32];
1029 u64 out[32];
1030
1031 } tc64_tmp_t;
1032
1033 typedef struct
1034 {
1035 u32 ipad[4];
1036 u32 opad[4];
1037
1038 u32 dgst[32];
1039 u32 out[32];
1040
1041 } pbkdf2_md5_tmp_t;
1042
1043 typedef struct
1044 {
1045 u32 ipad[5];
1046 u32 opad[5];
1047
1048 u32 dgst[32];
1049 u32 out[32];
1050
1051 } pbkdf2_sha1_tmp_t;
1052
1053 typedef struct
1054 {
1055 u32 ipad[8];
1056 u32 opad[8];
1057
1058 u32 dgst[32];
1059 u32 out[32];
1060
1061 } pbkdf2_sha256_tmp_t;
1062
1063 typedef struct
1064 {
1065 u64 ipad[8];
1066 u64 opad[8];
1067
1068 u64 dgst[16];
1069 u64 out[16];
1070
1071 } pbkdf2_sha512_tmp_t;
1072
1073 typedef struct
1074 {
1075 u64 out[8];
1076
1077 } ecryptfs_tmp_t;
1078
1079 typedef struct
1080 {
1081 u64 ipad[8];
1082 u64 opad[8];
1083
1084 u64 dgst[16];
1085 u64 out[16];
1086
1087 } oraclet_tmp_t;
1088
1089 typedef struct
1090 {
1091 u32 ipad[5];
1092 u32 opad[5];
1093
1094 u32 dgst[5];
1095 u32 out[5];
1096
1097 } agilekey_tmp_t;
1098
1099 typedef struct
1100 {
1101 u32 ipad[5];
1102 u32 opad[5];
1103
1104 u32 dgst1[5];
1105 u32 out1[5];
1106
1107 u32 dgst2[5];
1108 u32 out2[5];
1109
1110 } mywallet_tmp_t;
1111
1112 typedef struct
1113 {
1114 u32 ipad[5];
1115 u32 opad[5];
1116
1117 u32 dgst[5];
1118 u32 out[5];
1119
1120 } sha1aix_tmp_t;
1121
1122 typedef struct
1123 {
1124 u32 ipad[8];
1125 u32 opad[8];
1126
1127 u32 dgst[8];
1128 u32 out[8];
1129
1130 } sha256aix_tmp_t;
1131
1132 typedef struct
1133 {
1134 u64 ipad[8];
1135 u64 opad[8];
1136
1137 u64 dgst[8];
1138 u64 out[8];
1139
1140 } sha512aix_tmp_t;
1141
1142 typedef struct
1143 {
1144 u32 ipad[8];
1145 u32 opad[8];
1146
1147 u32 dgst[8];
1148 u32 out[8];
1149
1150 } lastpass_tmp_t;
1151
1152 typedef struct
1153 {
1154 u64 digest_buf[8];
1155
1156 } drupal7_tmp_t;
1157
1158 typedef struct
1159 {
1160 u32 ipad[5];
1161 u32 opad[5];
1162
1163 u32 dgst[5];
1164 u32 out[5];
1165
1166 } lotus8_tmp_t;
1167
1168 typedef struct
1169 {
1170 u32 out[5];
1171
1172 } office2007_tmp_t;
1173
1174 typedef struct
1175 {
1176 u32 out[5];
1177
1178 } office2010_tmp_t;
1179
1180 typedef struct
1181 {
1182 u64 out[8];
1183
1184 } office2013_tmp_t;
1185
1186 typedef struct
1187 {
1188 u32 digest_buf[5];
1189
1190 } saph_sha1_tmp_t;
1191
1192 typedef struct
1193 {
1194 u32 block[16];
1195
1196 u32 dgst[8];
1197
1198 u32 block_len;
1199 u32 final_len;
1200
1201 } seven_zip_tmp_t;
1202
1203 typedef struct
1204 {
1205 u32 Kc[16];
1206 u32 Kd[16];
1207
1208 u32 iv[2];
1209
1210 } bsdicrypt_tmp_t;
1211
1212 typedef struct
1213 {
1214 u32 dgst[17][5];
1215
1216 } rar3_tmp_t;
1217
1218 typedef struct
1219 {
1220 u32 user[16];
1221
1222 } cram_md5_t;
1223
1224 typedef struct
1225 {
1226 u32 iv_buf[4];
1227 u32 iv_len;
1228
1229 u32 salt_buf[4];
1230 u32 salt_len;
1231
1232 u32 crc;
1233
1234 u32 data_buf[96];
1235 u32 data_len;
1236
1237 u32 unpack_size;
1238
1239 } seven_zip_t;
1240
1241 typedef struct
1242 {
1243 u32 key;
1244 u64 val;
1245
1246 } hcstat_table_t;
1247
1248 typedef struct
1249 {
1250 u32 cs_buf[0x100];
1251 u32 cs_len;
1252
1253 } cs_t;
1254
1255 typedef struct
1256 {
1257 u32 cmds[0x100];
1258
1259 } kernel_rule_t;
1260
1261 typedef struct
1262 {
1263 u32 gidvid;
1264 u32 il_pos;
1265
1266 } plain_t;
1267
1268 typedef struct
1269 {
1270 u32 i[64];
1271
1272 u32 pw_len;
1273
1274 u32 alignment_placeholder_1;
1275 u32 alignment_placeholder_2;
1276 u32 alignment_placeholder_3;
1277
1278 } pw_t;
1279
1280 typedef struct
1281 {
1282 u32 i;
1283
1284 } bf_t;
1285
1286 typedef struct
1287 {
1288 u32 i[8];
1289
1290 u32 pw_len;
1291
1292 } comb_t;
1293
1294 typedef struct
1295 {
1296 u32 b[32];
1297
1298 } bs_word_t;
1299
1300 typedef struct
1301 {
1302 uint4 P[64];
1303
1304 } scrypt_tmp_t;