Initial commit
[hashcat.git] / nv / rp_nv.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
6 __device__ static u32x generate_cmask (u32x buf)
7 {
8 const u32x rmask = ((buf & 0x40404040) >> 1)
9 & ~((buf & 0x80808080) >> 2);
10
11 const u32x hmask = (buf & 0x1f1f1f1f) + 0x05050505;
12 const u32x lmask = (buf & 0x1f1f1f1f) + 0x1f1f1f1f;
13
14 return rmask & ~hmask & lmask;
15 }
16
17 __device__ static void truncate_right (u32x w0[4], u32x w1[4], const u32 len)
18 {
19 const u32 tmp = (1 << ((len % 4) * 8)) - 1;
20
21 switch (len / 4)
22 {
23 case 0: w0[0] &= tmp;
24 w0[1] = 0;
25 w0[2] = 0;
26 w0[3] = 0;
27 w1[0] = 0;
28 w1[1] = 0;
29 w1[2] = 0;
30 w1[3] = 0;
31 break;
32 case 1: w0[1] &= tmp;
33 w0[2] = 0;
34 w0[3] = 0;
35 w1[0] = 0;
36 w1[1] = 0;
37 w1[2] = 0;
38 w1[3] = 0;
39 break;
40 case 2: w0[2] &= tmp;
41 w0[3] = 0;
42 w1[0] = 0;
43 w1[1] = 0;
44 w1[2] = 0;
45 w1[3] = 0;
46 break;
47 case 3: w0[3] &= tmp;
48 w1[0] = 0;
49 w1[1] = 0;
50 w1[2] = 0;
51 w1[3] = 0;
52 break;
53 case 4: w1[0] &= tmp;
54 w1[1] = 0;
55 w1[2] = 0;
56 w1[3] = 0;
57 break;
58 case 5: w1[1] &= tmp;
59 w1[2] = 0;
60 w1[3] = 0;
61 break;
62 case 6: w1[2] &= tmp;
63 w1[3] = 0;
64 break;
65 case 7: w1[3] &= tmp;
66 break;
67 }
68 }
69
70 __device__ static void truncate_left (u32x w0[4], u32x w1[4], const u32 len)
71 {
72 const u32 tmp = ~((1 << ((len % 4) * 8)) - 1);
73
74 switch (len / 4)
75 {
76 case 0: w0[0] &= tmp;
77 break;
78 case 1: w0[0] = 0;
79 w0[1] &= tmp;
80 break;
81 case 2: w0[0] = 0;
82 w0[1] = 0;
83 w0[2] &= tmp;
84 break;
85 case 3: w0[0] = 0;
86 w0[1] = 0;
87 w0[2] = 0;
88 w0[3] &= tmp;
89 break;
90 case 4: w0[0] = 0;
91 w0[1] = 0;
92 w0[2] = 0;
93 w0[3] = 0;
94 w1[0] &= tmp;
95 break;
96 case 5: w0[0] = 0;
97 w0[1] = 0;
98 w0[2] = 0;
99 w0[3] = 0;
100 w1[0] = 0;
101 w1[1] &= tmp;
102 break;
103 case 6: w0[0] = 0;
104 w0[1] = 0;
105 w0[2] = 0;
106 w0[3] = 0;
107 w1[0] = 0;
108 w1[1] = 0;
109 w1[2] &= tmp;
110 break;
111 case 7: w0[0] = 0;
112 w0[1] = 0;
113 w0[2] = 0;
114 w0[3] = 0;
115 w1[0] = 0;
116 w1[1] = 0;
117 w1[2] = 0;
118 w1[3] &= tmp;
119 break;
120 }
121 }
122
123 __device__ static void lshift_block (const u32x in0[4], const u32x in1[4], u32x out0[4], u32x out1[4])
124 {
125 #if __CUDA_ARCH__ >= 200
126
127 out0[0] = __byte_perm (in0[0], in0[1], 0x4321);
128 out0[1] = __byte_perm (in0[1], in0[2], 0x4321);
129 out0[2] = __byte_perm (in0[2], in0[3], 0x4321);
130 out0[3] = __byte_perm (in0[3], in1[0], 0x4321);
131 out1[0] = __byte_perm (in1[0], in1[1], 0x4321);
132 out1[1] = __byte_perm (in1[1], in1[2], 0x4321);
133 out1[2] = __byte_perm (in1[2], in1[3], 0x4321);
134 out1[3] = __byte_perm (in1[3], 0, 0x4321);
135
136 #else
137
138 out0[0] = in0[0] >> 8 | in0[1] << 24;
139 out0[1] = in0[1] >> 8 | in0[2] << 24;
140 out0[2] = in0[2] >> 8 | in0[3] << 24;
141 out0[3] = in0[3] >> 8 | in1[0] << 24;
142 out1[0] = in1[0] >> 8 | in1[1] << 24;
143 out1[1] = in1[1] >> 8 | in1[2] << 24;
144 out1[2] = in1[2] >> 8 | in1[3] << 24;
145 out1[3] = in1[3] >> 8;
146
147 #endif
148 }
149
150 __device__ static void rshift_block (const u32x in0[4], const u32x in1[4], u32x out0[4], u32x out1[4])
151 {
152 #if __CUDA_ARCH__ >= 200
153
154 out1[3] = __byte_perm (in1[2], in1[3], 0x6543);
155 out1[2] = __byte_perm (in1[1], in1[2], 0x6543);
156 out1[1] = __byte_perm (in1[0], in1[1], 0x6543);
157 out1[0] = __byte_perm (in0[3], in1[0], 0x6543);
158 out0[3] = __byte_perm (in0[2], in0[3], 0x6543);
159 out0[2] = __byte_perm (in0[1], in0[2], 0x6543);
160 out0[1] = __byte_perm (in0[0], in0[1], 0x6543);
161 out0[0] = __byte_perm ( 0, in0[0], 0x6543);
162
163 #else
164
165 out1[3] = in1[3] << 8 | in1[2] >> 24;
166 out1[2] = in1[2] << 8 | in1[1] >> 24;
167 out1[1] = in1[1] << 8 | in1[0] >> 24;
168 out1[0] = in1[0] << 8 | in0[3] >> 24;
169 out0[3] = in0[3] << 8 | in0[2] >> 24;
170 out0[2] = in0[2] << 8 | in0[1] >> 24;
171 out0[1] = in0[1] << 8 | in0[0] >> 24;
172 out0[0] = in0[0] << 8;
173
174 #endif
175 }
176
177 __device__ static void rshift_block_N (const u32x in0[4], const u32x in1[4], u32x out0[4], u32x out1[4], const u32 num)
178 {
179 #if __CUDA_ARCH__ >= 200
180
181 switch (num)
182 {
183 case 0: out1[3] = in1[3];
184 out1[2] = in1[2];
185 out1[1] = in1[1];
186 out1[0] = in1[0];
187 out0[3] = in0[3];
188 out0[2] = in0[2];
189 out0[1] = in0[1];
190 out0[0] = in0[0];
191 break;
192 case 1: out1[3] = __byte_perm (in1[2], in1[3], 0x6543);
193 out1[2] = __byte_perm (in1[1], in1[2], 0x6543);
194 out1[1] = __byte_perm (in1[0], in1[1], 0x6543);
195 out1[0] = __byte_perm (in0[3], in1[0], 0x6543);
196 out0[3] = __byte_perm (in0[2], in0[3], 0x6543);
197 out0[2] = __byte_perm (in0[1], in0[2], 0x6543);
198 out0[1] = __byte_perm (in0[0], in0[1], 0x6543);
199 out0[0] = __byte_perm ( 0, in0[0], 0x6543);
200 break;
201 case 2: out1[3] = __byte_perm (in1[2], in1[3], 0x5432);
202 out1[2] = __byte_perm (in1[1], in1[2], 0x5432);
203 out1[1] = __byte_perm (in1[0], in1[1], 0x5432);
204 out1[0] = __byte_perm (in0[3], in1[0], 0x5432);
205 out0[3] = __byte_perm (in0[2], in0[3], 0x5432);
206 out0[2] = __byte_perm (in0[1], in0[2], 0x5432);
207 out0[1] = __byte_perm (in0[0], in0[1], 0x5432);
208 out0[0] = __byte_perm ( 0, in0[0], 0x5432);
209 break;
210 case 3: out1[3] = __byte_perm (in1[2], in1[3], 0x4321);
211 out1[2] = __byte_perm (in1[1], in1[2], 0x4321);
212 out1[1] = __byte_perm (in1[0], in1[1], 0x4321);
213 out1[0] = __byte_perm (in0[3], in1[0], 0x4321);
214 out0[3] = __byte_perm (in0[2], in0[3], 0x4321);
215 out0[2] = __byte_perm (in0[1], in0[2], 0x4321);
216 out0[1] = __byte_perm (in0[0], in0[1], 0x4321);
217 out0[0] = __byte_perm ( 0, in0[0], 0x4321);
218 break;
219 case 4: out1[3] = in1[2];
220 out1[2] = in1[1];
221 out1[1] = in1[0];
222 out1[0] = in0[3];
223 out0[3] = in0[2];
224 out0[2] = in0[1];
225 out0[1] = in0[0];
226 out0[0] = 0;
227 break;
228 case 5: out1[3] = __byte_perm (in1[1], in1[2], 0x6543);
229 out1[2] = __byte_perm (in1[0], in1[1], 0x6543);
230 out1[1] = __byte_perm (in0[3], in1[0], 0x6543);
231 out1[0] = __byte_perm (in0[2], in0[3], 0x6543);
232 out0[3] = __byte_perm (in0[1], in0[2], 0x6543);
233 out0[2] = __byte_perm (in0[0], in0[1], 0x6543);
234 out0[1] = __byte_perm ( 0, in0[0], 0x6543);
235 out0[0] = 0;
236 break;
237 case 6: out1[3] = __byte_perm (in1[1], in1[2], 0x5432);
238 out1[2] = __byte_perm (in1[0], in1[1], 0x5432);
239 out1[1] = __byte_perm (in0[3], in1[0], 0x5432);
240 out1[0] = __byte_perm (in0[2], in0[3], 0x5432);
241 out0[3] = __byte_perm (in0[1], in0[2], 0x5432);
242 out0[2] = __byte_perm (in0[0], in0[1], 0x5432);
243 out0[1] = __byte_perm ( 0, in0[0], 0x5432);
244 out0[0] = 0;
245 break;
246 case 7: out1[3] = __byte_perm (in1[1], in1[2], 0x4321);
247 out1[2] = __byte_perm (in1[0], in1[1], 0x4321);
248 out1[1] = __byte_perm (in0[3], in1[0], 0x4321);
249 out1[0] = __byte_perm (in0[2], in0[3], 0x4321);
250 out0[3] = __byte_perm (in0[1], in0[2], 0x4321);
251 out0[2] = __byte_perm (in0[0], in0[1], 0x4321);
252 out0[1] = __byte_perm ( 0, in0[0], 0x4321);
253 out0[0] = 0;
254 break;
255 case 8: out1[3] = in1[1];
256 out1[2] = in1[0];
257 out1[1] = in0[3];
258 out1[0] = in0[2];
259 out0[3] = in0[1];
260 out0[2] = in0[0];
261 out0[1] = 0;
262 out0[0] = 0;
263 break;
264 case 9: out1[3] = __byte_perm (in1[0], in1[1], 0x6543);
265 out1[2] = __byte_perm (in0[3], in1[0], 0x6543);
266 out1[1] = __byte_perm (in0[2], in0[3], 0x6543);
267 out1[0] = __byte_perm (in0[1], in0[2], 0x6543);
268 out0[3] = __byte_perm (in0[0], in0[1], 0x6543);
269 out0[2] = __byte_perm ( 0, in0[0], 0x6543);
270 out0[1] = 0;
271 out0[0] = 0;
272 break;
273 case 10: out1[3] = __byte_perm (in1[0], in1[1], 0x5432);
274 out1[2] = __byte_perm (in0[3], in1[0], 0x5432);
275 out1[1] = __byte_perm (in0[2], in0[3], 0x5432);
276 out1[0] = __byte_perm (in0[1], in0[2], 0x5432);
277 out0[3] = __byte_perm (in0[0], in0[1], 0x5432);
278 out0[2] = __byte_perm ( 0, in0[0], 0x5432);
279 out0[1] = 0;
280 out0[0] = 0;
281 break;
282 case 11: out1[3] = __byte_perm (in1[0], in1[1], 0x4321);
283 out1[2] = __byte_perm (in0[3], in1[0], 0x4321);
284 out1[1] = __byte_perm (in0[2], in0[3], 0x4321);
285 out1[0] = __byte_perm (in0[1], in0[2], 0x4321);
286 out0[3] = __byte_perm (in0[0], in0[1], 0x4321);
287 out0[2] = __byte_perm ( 0, in0[0], 0x4321);
288 out0[1] = 0;
289 out0[0] = 0;
290 break;
291 case 12: out1[3] = in1[0];
292 out1[2] = in0[3];
293 out1[1] = in0[2];
294 out1[0] = in0[1];
295 out0[3] = in0[0];
296 out0[2] = 0;
297 out0[1] = 0;
298 out0[0] = 0;
299 break;
300 case 13: out1[3] = __byte_perm (in0[3], in1[0], 0x6543);
301 out1[2] = __byte_perm (in0[2], in0[3], 0x6543);
302 out1[1] = __byte_perm (in0[1], in0[2], 0x6543);
303 out1[0] = __byte_perm (in0[0], in0[1], 0x6543);
304 out0[3] = __byte_perm ( 0, in0[0], 0x6543);
305 out0[2] = 0;
306 out0[1] = 0;
307 out0[0] = 0;
308 break;
309 case 14: out1[3] = __byte_perm (in0[3], in1[0], 0x5432);
310 out1[2] = __byte_perm (in0[2], in0[3], 0x5432);
311 out1[1] = __byte_perm (in0[1], in0[2], 0x5432);
312 out1[0] = __byte_perm (in0[0], in0[1], 0x5432);
313 out0[3] = __byte_perm ( 0, in0[0], 0x5432);
314 out0[2] = 0;
315 out0[1] = 0;
316 out0[0] = 0;
317 break;
318 case 15: out1[3] = __byte_perm (in0[3], in1[0], 0x4321);
319 out1[2] = __byte_perm (in0[2], in0[3], 0x4321);
320 out1[1] = __byte_perm (in0[1], in0[2], 0x4321);
321 out1[0] = __byte_perm (in0[0], in0[1], 0x4321);
322 out0[3] = __byte_perm ( 0, in0[0], 0x4321);
323 out0[2] = 0;
324 out0[1] = 0;
325 out0[0] = 0;
326 break;
327 case 16: out1[3] = in0[3];
328 out1[2] = in0[2];
329 out1[1] = in0[1];
330 out1[0] = in0[0];
331 out0[3] = 0;
332 out0[2] = 0;
333 out0[1] = 0;
334 out0[0] = 0;
335 break;
336 case 17: out1[3] = __byte_perm (in0[2], in0[3], 0x6543);
337 out1[2] = __byte_perm (in0[1], in0[2], 0x6543);
338 out1[1] = __byte_perm (in0[0], in0[1], 0x6543);
339 out1[0] = __byte_perm ( 0, in0[0], 0x6543);
340 out0[3] = 0;
341 out0[2] = 0;
342 out0[1] = 0;
343 out0[0] = 0;
344 break;
345 case 18: out1[3] = __byte_perm (in0[2], in0[3], 0x5432);
346 out1[2] = __byte_perm (in0[1], in0[2], 0x5432);
347 out1[1] = __byte_perm (in0[0], in0[1], 0x5432);
348 out1[0] = __byte_perm ( 0, in0[0], 0x5432);
349 out0[3] = 0;
350 out0[2] = 0;
351 out0[1] = 0;
352 out0[0] = 0;
353 break;
354 case 19: out1[3] = __byte_perm (in0[2], in0[3], 0x4321);
355 out1[2] = __byte_perm (in0[1], in0[2], 0x4321);
356 out1[1] = __byte_perm (in0[0], in0[1], 0x4321);
357 out1[0] = __byte_perm ( 0, in0[0], 0x4321);
358 out0[3] = 0;
359 out0[2] = 0;
360 out0[1] = 0;
361 out0[0] = 0;
362 break;
363 case 20: out1[3] = in0[2];
364 out1[2] = in0[1];
365 out1[1] = in0[0];
366 out1[0] = 0;
367 out0[3] = 0;
368 out0[2] = 0;
369 out0[1] = 0;
370 out0[0] = 0;
371 break;
372 case 21: out1[3] = __byte_perm (in0[1], in0[2], 0x6543);
373 out1[2] = __byte_perm (in0[0], in0[1], 0x6543);
374 out1[1] = __byte_perm ( 0, in0[0], 0x6543);
375 out1[0] = 0;
376 out0[3] = 0;
377 out0[2] = 0;
378 out0[1] = 0;
379 out0[0] = 0;
380 break;
381 case 22: out1[3] = __byte_perm (in0[1], in0[2], 0x5432);
382 out1[2] = __byte_perm (in0[0], in0[1], 0x5432);
383 out1[1] = __byte_perm ( 0, in0[0], 0x5432);
384 out1[0] = 0;
385 out0[3] = 0;
386 out0[2] = 0;
387 out0[1] = 0;
388 out0[0] = 0;
389 break;
390 case 23: out1[3] = __byte_perm (in0[1], in0[2], 0x4321);
391 out1[2] = __byte_perm (in0[0], in0[1], 0x4321);
392 out1[1] = __byte_perm ( 0, in0[0], 0x4321);
393 out1[0] = 0;
394 out0[3] = 0;
395 out0[2] = 0;
396 out0[1] = 0;
397 out0[0] = 0;
398 break;
399 case 24: out1[3] = in0[1];
400 out1[2] = in0[0];
401 out1[1] = 0;
402 out1[0] = 0;
403 out0[3] = 0;
404 out0[2] = 0;
405 out0[1] = 0;
406 out0[0] = 0;
407 break;
408 case 25: out1[3] = __byte_perm (in0[0], in0[1], 0x6543);
409 out1[2] = __byte_perm ( 0, in0[0], 0x6543);
410 out1[1] = 0;
411 out1[0] = 0;
412 out0[3] = 0;
413 out0[2] = 0;
414 out0[1] = 0;
415 out0[0] = 0;
416 break;
417 case 26: out1[3] = __byte_perm (in0[0], in0[1], 0x5432);
418 out1[2] = __byte_perm ( 0, in0[0], 0x5432);
419 out1[1] = 0;
420 out1[0] = 0;
421 out0[3] = 0;
422 out0[2] = 0;
423 out0[1] = 0;
424 out0[0] = 0;
425 break;
426 case 27: out1[3] = __byte_perm (in0[0], in0[1], 0x4321);
427 out1[2] = __byte_perm ( 0, in0[0], 0x4321);
428 out1[1] = 0;
429 out1[0] = 0;
430 out0[3] = 0;
431 out0[2] = 0;
432 out0[1] = 0;
433 out0[0] = 0;
434 break;
435 case 28: out1[3] = in0[0];
436 out1[2] = 0;
437 out1[1] = 0;
438 out1[0] = 0;
439 out0[3] = 0;
440 out0[2] = 0;
441 out0[1] = 0;
442 out0[0] = 0;
443 break;
444 case 29: out1[3] = __byte_perm ( 0, in0[0], 0x6543);
445 out1[2] = 0;
446 out1[1] = 0;
447 out1[0] = 0;
448 out0[3] = 0;
449 out0[2] = 0;
450 out0[1] = 0;
451 out0[0] = 0;
452 break;
453 case 30: out1[3] = __byte_perm ( 0, in0[0], 0x5432);
454 out1[2] = 0;
455 out1[1] = 0;
456 out1[0] = 0;
457 out0[3] = 0;
458 out0[2] = 0;
459 out0[1] = 0;
460 out0[0] = 0;
461 break;
462 case 31: out1[3] = __byte_perm ( 0, in0[0], 0x4321);
463 out1[2] = 0;
464 out1[1] = 0;
465 out1[0] = 0;
466 out0[3] = 0;
467 out0[2] = 0;
468 out0[1] = 0;
469 out0[0] = 0;
470 break;
471 }
472
473 #else
474
475 switch (num)
476 {
477 case 0: out1[3] = in1[3];
478 out1[2] = in1[2];
479 out1[1] = in1[1];
480 out1[0] = in1[0];
481 out0[3] = in0[3];
482 out0[2] = in0[2];
483 out0[1] = in0[1];
484 out0[0] = in0[0];
485 break;
486 case 1: out1[3] = in1[3] << 8 | in1[2] >> 24;
487 out1[2] = in1[2] << 8 | in1[1] >> 24;
488 out1[1] = in1[1] << 8 | in1[0] >> 24;
489 out1[0] = in1[0] << 8 | in0[3] >> 24;
490 out0[3] = in0[3] << 8 | in0[2] >> 24;
491 out0[2] = in0[2] << 8 | in0[1] >> 24;
492 out0[1] = in0[1] << 8 | in0[0] >> 24;
493 out0[0] = in0[0] << 8;
494 break;
495 case 2: out1[3] = in1[3] << 16 | in1[2] >> 16;
496 out1[2] = in1[2] << 16 | in1[1] >> 16;
497 out1[1] = in1[1] << 16 | in1[0] >> 16;
498 out1[0] = in1[0] << 16 | in0[3] >> 16;
499 out0[3] = in0[3] << 16 | in0[2] >> 16;
500 out0[2] = in0[2] << 16 | in0[1] >> 16;
501 out0[1] = in0[1] << 16 | in0[0] >> 16;
502 out0[0] = in0[0] << 16;
503 break;
504 case 3: out1[3] = in1[3] << 24 | in1[2] >> 8;
505 out1[2] = in1[2] << 24 | in1[1] >> 8;
506 out1[1] = in1[1] << 24 | in1[0] >> 8;
507 out1[0] = in1[0] << 24 | in0[3] >> 8;
508 out0[3] = in0[3] << 24 | in0[2] >> 8;
509 out0[2] = in0[2] << 24 | in0[1] >> 8;
510 out0[1] = in0[1] << 24 | in0[0] >> 8;
511 out0[0] = in0[0] << 24;
512 break;
513 case 4: out1[3] = in1[2];
514 out1[2] = in1[1];
515 out1[1] = in1[0];
516 out1[0] = in0[3];
517 out0[3] = in0[2];
518 out0[2] = in0[1];
519 out0[1] = in0[0];
520 out0[0] = 0;
521 break;
522 case 5: out1[3] = in1[2] << 8 | in1[1] >> 24;
523 out1[2] = in1[1] << 8 | in1[0] >> 24;
524 out1[1] = in1[0] << 8 | in0[3] >> 24;
525 out1[0] = in0[3] << 8 | in0[2] >> 24;
526 out0[3] = in0[2] << 8 | in0[1] >> 24;
527 out0[2] = in0[1] << 8 | in0[0] >> 24;
528 out0[1] = in0[0] << 8;
529 out0[0] = 0;
530 break;
531 case 6: out1[3] = in1[2] << 16 | in1[1] >> 16;
532 out1[2] = in1[1] << 16 | in1[0] >> 16;
533 out1[1] = in1[0] << 16 | in0[3] >> 16;
534 out1[0] = in0[3] << 16 | in0[2] >> 16;
535 out0[3] = in0[2] << 16 | in0[1] >> 16;
536 out0[2] = in0[1] << 16 | in0[0] >> 16;
537 out0[1] = in0[0] << 16;
538 out0[0] = 0;
539 break;
540 case 7: out1[3] = in1[2] << 24 | in1[1] >> 8;
541 out1[2] = in1[1] << 24 | in1[0] >> 8;
542 out1[1] = in1[0] << 24 | in0[3] >> 8;
543 out1[0] = in0[3] << 24 | in0[2] >> 8;
544 out0[3] = in0[2] << 24 | in0[1] >> 8;
545 out0[2] = in0[1] << 24 | in0[0] >> 8;
546 out0[1] = in0[0] << 24;
547 out0[0] = 0;
548 break;
549 case 8: out1[3] = in1[1];
550 out1[2] = in1[0];
551 out1[1] = in0[3];
552 out1[0] = in0[2];
553 out0[3] = in0[1];
554 out0[2] = in0[0];
555 out0[1] = 0;
556 out0[0] = 0;
557 break;
558 case 9: out1[3] = in1[1] << 8 | in1[0] >> 24;
559 out1[2] = in1[0] << 8 | in0[3] >> 24;
560 out1[1] = in0[3] << 8 | in0[2] >> 24;
561 out1[0] = in0[2] << 8 | in0[1] >> 24;
562 out0[3] = in0[1] << 8 | in0[0] >> 24;
563 out0[2] = in0[0] << 8;
564 out0[1] = 0;
565 out0[0] = 0;
566 break;
567 case 10: out1[3] = in1[1] << 16 | in1[0] >> 16;
568 out1[2] = in1[0] << 16 | in0[3] >> 16;
569 out1[1] = in0[3] << 16 | in0[2] >> 16;
570 out1[0] = in0[2] << 16 | in0[1] >> 16;
571 out0[3] = in0[1] << 16 | in0[0] >> 16;
572 out0[2] = in0[0] << 16;
573 out0[1] = 0;
574 out0[0] = 0;
575 break;
576 case 11: out1[3] = in1[1] << 24 | in1[0] >> 8;
577 out1[2] = in1[0] << 24 | in0[3] >> 8;
578 out1[1] = in0[3] << 24 | in0[2] >> 8;
579 out1[0] = in0[2] << 24 | in0[1] >> 8;
580 out0[3] = in0[1] << 24 | in0[0] >> 8;
581 out0[2] = in0[0] << 24;
582 out0[1] = 0;
583 out0[0] = 0;
584 break;
585 case 12: out1[3] = in1[0];
586 out1[2] = in0[3];
587 out1[1] = in0[2];
588 out1[0] = in0[1];
589 out0[3] = in0[0];
590 out0[2] = 0;
591 out0[1] = 0;
592 out0[0] = 0;
593 break;
594 case 13: out1[3] = in1[0] << 8 | in0[3] >> 24;
595 out1[2] = in0[3] << 8 | in0[2] >> 24;
596 out1[1] = in0[2] << 8 | in0[1] >> 24;
597 out1[0] = in0[1] << 8 | in0[0] >> 24;
598 out0[3] = in0[0] << 8;
599 out0[2] = 0;
600 out0[1] = 0;
601 out0[0] = 0;
602 break;
603 case 14: out1[3] = in1[0] << 16 | in0[3] >> 16;
604 out1[2] = in0[3] << 16 | in0[2] >> 16;
605 out1[1] = in0[2] << 16 | in0[1] >> 16;
606 out1[0] = in0[1] << 16 | in0[0] >> 16;
607 out0[3] = in0[0] << 16;
608 out0[2] = 0;
609 out0[1] = 0;
610 out0[0] = 0;
611 break;
612 case 15: out1[3] = in1[0] << 24 | in0[3] >> 8;
613 out1[2] = in0[3] << 24 | in0[2] >> 8;
614 out1[1] = in0[2] << 24 | in0[1] >> 8;
615 out1[0] = in0[1] << 24 | in0[0] >> 8;
616 out0[3] = in0[0] << 24;
617 out0[2] = 0;
618 out0[1] = 0;
619 out0[0] = 0;
620 break;
621 case 16: out1[3] = in0[3];
622 out1[2] = in0[2];
623 out1[1] = in0[1];
624 out1[0] = in0[0];
625 out0[3] = 0;
626 out0[2] = 0;
627 out0[1] = 0;
628 out0[0] = 0;
629 break;
630 case 17: out1[3] = in0[3] << 8 | in0[2] >> 24;
631 out1[2] = in0[2] << 8 | in0[1] >> 24;
632 out1[1] = in0[1] << 8 | in0[0] >> 24;
633 out1[0] = in0[0] << 8;
634 out0[3] = 0;
635 out0[2] = 0;
636 out0[1] = 0;
637 out0[0] = 0;
638 break;
639 case 18: out1[3] = in0[3] << 16 | in0[2] >> 16;
640 out1[2] = in0[2] << 16 | in0[1] >> 16;
641 out1[1] = in0[1] << 16 | in0[0] >> 16;
642 out1[0] = in0[0] << 16;
643 out0[3] = 0;
644 out0[2] = 0;
645 out0[1] = 0;
646 out0[0] = 0;
647 break;
648 case 19: out1[3] = in0[3] << 24 | in0[2] >> 8;
649 out1[2] = in0[2] << 24 | in0[1] >> 8;
650 out1[1] = in0[1] << 24 | in0[0] >> 8;
651 out1[0] = in0[0] << 24;
652 out0[3] = 0;
653 out0[2] = 0;
654 out0[1] = 0;
655 out0[0] = 0;
656 break;
657 case 20: out1[3] = in0[2];
658 out1[2] = in0[1];
659 out1[1] = in0[0];
660 out1[0] = 0;
661 out0[3] = 0;
662 out0[2] = 0;
663 out0[1] = 0;
664 out0[0] = 0;
665 break;
666 case 21: out1[3] = in0[2] << 8 | in0[1] >> 24;
667 out1[2] = in0[1] << 8 | in0[0] >> 24;
668 out1[1] = in0[0] << 8;
669 out1[0] = 0;
670 out0[3] = 0;
671 out0[2] = 0;
672 out0[1] = 0;
673 out0[0] = 0;
674 break;
675 case 22: out1[3] = in0[2] << 16 | in0[1] >> 16;
676 out1[2] = in0[1] << 16 | in0[0] >> 16;
677 out1[1] = in0[0] << 16;
678 out1[0] = 0;
679 out0[3] = 0;
680 out0[2] = 0;
681 out0[1] = 0;
682 out0[0] = 0;
683 break;
684 case 23: out1[3] = in0[2] << 24 | in0[1] >> 8;
685 out1[2] = in0[1] << 24 | in0[0] >> 8;
686 out1[1] = in0[0] << 24;
687 out1[0] = 0;
688 out0[3] = 0;
689 out0[2] = 0;
690 out0[1] = 0;
691 out0[0] = 0;
692 break;
693 case 24: out1[3] = in0[1];
694 out1[2] = in0[0];
695 out1[1] = 0;
696 out1[0] = 0;
697 out0[3] = 0;
698 out0[2] = 0;
699 out0[1] = 0;
700 out0[0] = 0;
701 break;
702 case 25: out1[3] = in0[1] << 8 | in0[0] >> 24;
703 out1[2] = in0[0] << 8;
704 out1[1] = 0;
705 out1[0] = 0;
706 out0[3] = 0;
707 out0[2] = 0;
708 out0[1] = 0;
709 out0[0] = 0;
710 break;
711 case 26: out1[3] = in0[1] << 16 | in0[0] >> 16;
712 out1[2] = in0[0] << 16;
713 out1[1] = 0;
714 out1[0] = 0;
715 out0[3] = 0;
716 out0[2] = 0;
717 out0[1] = 0;
718 out0[0] = 0;
719 break;
720 case 27: out1[3] = in0[1] << 24 | in0[0] >> 8;
721 out1[2] = in0[0] << 24;
722 out1[1] = 0;
723 out1[0] = 0;
724 out0[3] = 0;
725 out0[2] = 0;
726 out0[1] = 0;
727 out0[0] = 0;
728 break;
729 case 28: out1[3] = in0[0];
730 out1[2] = 0;
731 out1[1] = 0;
732 out1[0] = 0;
733 out0[3] = 0;
734 out0[2] = 0;
735 out0[1] = 0;
736 out0[0] = 0;
737 break;
738 case 29: out1[3] = in0[0] << 8;
739 out1[2] = 0;
740 out1[1] = 0;
741 out1[0] = 0;
742 out0[3] = 0;
743 out0[2] = 0;
744 out0[1] = 0;
745 out0[0] = 0;
746 break;
747 case 30: out1[3] = in0[0] << 16;
748 out1[2] = 0;
749 out1[1] = 0;
750 out1[0] = 0;
751 out0[3] = 0;
752 out0[2] = 0;
753 out0[1] = 0;
754 out0[0] = 0;
755 break;
756 case 31: out1[3] = in0[0] << 24;
757 out1[2] = 0;
758 out1[1] = 0;
759 out1[0] = 0;
760 out0[3] = 0;
761 out0[2] = 0;
762 out0[1] = 0;
763 out0[0] = 0;
764 break;
765 }
766
767 #endif
768 }
769
770 __device__ static void lshift_block_N (const u32x in0[4], const u32x in1[4], u32x out0[4], u32x out1[4], const u32 num)
771 {
772 #if __CUDA_ARCH__ >= 200
773
774 switch (num)
775 {
776 case 0: out0[0] = in0[0];
777 out0[1] = in0[1];
778 out0[2] = in0[2];
779 out0[3] = in0[3];
780 out1[0] = in1[0];
781 out1[1] = in1[1];
782 out1[2] = in1[2];
783 out1[3] = in1[3];
784 break;
785 case 1: out0[0] = __byte_perm (in0[0], in0[1], 0x4321);
786 out0[1] = __byte_perm (in0[1], in0[2], 0x4321);
787 out0[2] = __byte_perm (in0[2], in0[3], 0x4321);
788 out0[3] = __byte_perm (in0[3], in1[0], 0x4321);
789 out1[0] = __byte_perm (in1[0], in1[1], 0x4321);
790 out1[1] = __byte_perm (in1[1], in1[2], 0x4321);
791 out1[2] = __byte_perm (in1[2], in1[3], 0x4321);
792 out1[3] = __byte_perm (in1[3], 0, 0x4321);
793 break;
794 case 2: out0[0] = __byte_perm (in0[0], in0[1], 0x5432);
795 out0[1] = __byte_perm (in0[1], in0[2], 0x5432);
796 out0[2] = __byte_perm (in0[2], in0[3], 0x5432);
797 out0[3] = __byte_perm (in0[3], in1[0], 0x5432);
798 out1[0] = __byte_perm (in1[0], in1[1], 0x5432);
799 out1[1] = __byte_perm (in1[1], in1[2], 0x5432);
800 out1[2] = __byte_perm (in1[2], in1[3], 0x5432);
801 out1[3] = __byte_perm (in1[3], 0, 0x5432);
802 break;
803 case 3: out0[0] = __byte_perm (in0[0], in0[1], 0x6543);
804 out0[1] = __byte_perm (in0[1], in0[2], 0x6543);
805 out0[2] = __byte_perm (in0[2], in0[3], 0x6543);
806 out0[3] = __byte_perm (in0[3], in1[0], 0x6543);
807 out1[0] = __byte_perm (in1[0], in1[1], 0x6543);
808 out1[1] = __byte_perm (in1[1], in1[2], 0x6543);
809 out1[2] = __byte_perm (in1[2], in1[3], 0x6543);
810 out1[3] = __byte_perm (in1[3], 0, 0x6543);
811 break;
812 case 4: out0[0] = in0[1];
813 out0[1] = in0[2];
814 out0[2] = in0[3];
815 out0[3] = in1[0];
816 out1[0] = in1[1];
817 out1[1] = in1[2];
818 out1[2] = in1[3];
819 out1[3] = 0;
820 break;
821 case 5: out0[0] = __byte_perm (in0[1], in0[2], 0x4321);
822 out0[1] = __byte_perm (in0[2], in0[3], 0x4321);
823 out0[2] = __byte_perm (in0[3], in1[0], 0x4321);
824 out0[3] = __byte_perm (in1[0], in1[1], 0x4321);
825 out1[0] = __byte_perm (in1[1], in1[2], 0x4321);
826 out1[1] = __byte_perm (in1[2], in1[3], 0x4321);
827 out1[2] = __byte_perm (in1[3], 0, 0x4321);
828 out1[3] = 0;
829 break;
830 case 6: out0[0] = __byte_perm (in0[1], in0[2], 0x5432);
831 out0[1] = __byte_perm (in0[2], in0[3], 0x5432);
832 out0[2] = __byte_perm (in0[3], in1[0], 0x5432);
833 out0[3] = __byte_perm (in1[0], in1[1], 0x5432);
834 out1[0] = __byte_perm (in1[1], in1[2], 0x5432);
835 out1[1] = __byte_perm (in1[2], in1[3], 0x5432);
836 out1[2] = __byte_perm (in1[3], 0, 0x5432);
837 out1[3] = 0;
838 break;
839 case 7: out0[0] = __byte_perm (in0[1], in0[2], 0x6543);
840 out0[1] = __byte_perm (in0[2], in0[3], 0x6543);
841 out0[2] = __byte_perm (in0[3], in1[0], 0x6543);
842 out0[3] = __byte_perm (in1[0], in1[1], 0x6543);
843 out1[0] = __byte_perm (in1[1], in1[2], 0x6543);
844 out1[1] = __byte_perm (in1[2], in1[3], 0x6543);
845 out1[2] = __byte_perm (in1[3], 0, 0x6543);
846 out1[3] = 0;
847 break;
848 case 8: out0[0] = in0[2];
849 out0[1] = in0[3];
850 out0[2] = in1[0];
851 out0[3] = in1[1];
852 out1[0] = in1[2];
853 out1[1] = in1[3];
854 out1[2] = 0;
855 out1[3] = 0;
856 break;
857 case 9: out0[0] = __byte_perm (in0[2], in0[3], 0x4321);
858 out0[1] = __byte_perm (in0[3], in1[0], 0x4321);
859 out0[2] = __byte_perm (in1[0], in1[1], 0x4321);
860 out0[3] = __byte_perm (in1[1], in1[2], 0x4321);
861 out1[0] = __byte_perm (in1[2], in1[3], 0x4321);
862 out1[1] = __byte_perm (in1[3], 0, 0x4321);
863 out1[2] = 0;
864 out1[3] = 0;
865 break;
866 case 10: out0[0] = __byte_perm (in0[2], in0[3], 0x5432);
867 out0[1] = __byte_perm (in0[3], in1[0], 0x5432);
868 out0[2] = __byte_perm (in1[0], in1[1], 0x5432);
869 out0[3] = __byte_perm (in1[1], in1[2], 0x5432);
870 out1[0] = __byte_perm (in1[2], in1[3], 0x5432);
871 out1[1] = __byte_perm (in1[3], 0, 0x5432);
872 out1[2] = 0;
873 out1[3] = 0;
874 break;
875 case 11: out0[0] = __byte_perm (in0[2], in0[3], 0x6543);
876 out0[1] = __byte_perm (in0[3], in1[0], 0x6543);
877 out0[2] = __byte_perm (in1[0], in1[1], 0x6543);
878 out0[3] = __byte_perm (in1[1], in1[2], 0x6543);
879 out1[0] = __byte_perm (in1[2], in1[3], 0x6543);
880 out1[1] = __byte_perm (in1[3], 0, 0x6543);
881 out1[2] = 0;
882 out1[3] = 0;
883 break;
884 case 12: out0[0] = in0[3];
885 out0[1] = in1[0];
886 out0[2] = in1[1];
887 out0[3] = in1[2];
888 out1[0] = in1[3];
889 out1[1] = 0;
890 out1[2] = 0;
891 out1[3] = 0;
892 break;
893 case 13:
894 out0[0] = __byte_perm (in0[3], in1[0], 0x4321);
895 out0[1] = __byte_perm (in1[0], in1[1], 0x4321);
896 out0[2] = __byte_perm (in1[1], in1[2], 0x4321);
897 out0[3] = __byte_perm (in1[2], in1[3], 0x4321);
898 out1[0] = __byte_perm (in1[3], 0, 0x4321);
899 out1[1] = 0;
900 out1[2] = 0;
901 out1[3] = 0;
902 break;
903 case 14: out0[0] = __byte_perm (in0[3], in1[0], 0x5432);
904 out0[1] = __byte_perm (in1[0], in1[1], 0x5432);
905 out0[2] = __byte_perm (in1[1], in1[2], 0x5432);
906 out0[3] = __byte_perm (in1[2], in1[3], 0x5432);
907 out1[0] = __byte_perm (in1[3], 0, 0x5432);
908 out1[1] = 0;
909 out1[2] = 0;
910 out1[3] = 0;
911 break;
912 case 15: out0[0] = __byte_perm (in0[3], in1[0], 0x6543);
913 out0[1] = __byte_perm (in1[0], in1[1], 0x6543);
914 out0[2] = __byte_perm (in1[1], in1[2], 0x6543);
915 out0[3] = __byte_perm (in1[2], in1[3], 0x6543);
916 out1[0] = __byte_perm (in1[3], 0, 0x6543);
917 out1[1] = 0;
918 out1[2] = 0;
919 out1[3] = 0;
920 break;
921 case 16: out0[0] = in1[0];
922 out0[1] = in1[1];
923 out0[2] = in1[2];
924 out0[3] = in1[3];
925 out1[0] = 0;
926 out1[1] = 0;
927 out1[2] = 0;
928 out1[3] = 0;
929 break;
930 case 17: out0[0] = __byte_perm (in1[0], in1[1], 0x4321);
931 out0[1] = __byte_perm (in1[1], in1[2], 0x4321);
932 out0[2] = __byte_perm (in1[2], in1[3], 0x4321);
933 out0[3] = __byte_perm (in1[3], 0, 0x4321);
934 out1[0] = 0;
935 out1[1] = 0;
936 out1[2] = 0;
937 out1[3] = 0;
938 break;
939 case 18: out0[0] = __byte_perm (in1[0], in1[1], 0x5432);
940 out0[1] = __byte_perm (in1[1], in1[2], 0x5432);
941 out0[2] = __byte_perm (in1[2], in1[3], 0x5432);
942 out0[3] = __byte_perm (in1[3], 0, 0x5432);
943 out1[0] = 0;
944 out1[1] = 0;
945 out1[2] = 0;
946 out1[3] = 0;
947 break;
948 case 19: out0[0] = __byte_perm (in1[0], in1[1], 0x6543);
949 out0[1] = __byte_perm (in1[1], in1[2], 0x6543);
950 out0[2] = __byte_perm (in1[2], in1[3], 0x6543);
951 out0[3] = __byte_perm (in1[3], 0, 0x6543);
952 out1[0] = 0;
953 out1[1] = 0;
954 out1[2] = 0;
955 out1[3] = 0;
956 break;
957 case 20: out0[0] = in1[1];
958 out0[1] = in1[2];
959 out0[2] = in1[3];
960 out0[3] = 0;
961 out1[0] = 0;
962 out1[1] = 0;
963 out1[2] = 0;
964 out1[3] = 0;
965 break;
966 case 21: out0[0] = __byte_perm (in1[1], in1[2], 0x4321);
967 out0[1] = __byte_perm (in1[2], in1[3], 0x4321);
968 out0[2] = __byte_perm (in1[3], 0, 0x4321);
969 out0[3] = 0;
970 out1[0] = 0;
971 out1[1] = 0;
972 out1[2] = 0;
973 out1[3] = 0;
974 break;
975 case 22: out0[0] = __byte_perm (in1[1], in1[2], 0x5432);
976 out0[1] = __byte_perm (in1[2], in1[3], 0x5432);
977 out0[2] = __byte_perm (in1[3], 0, 0x5432);
978 out0[3] = 0;
979 out1[0] = 0;
980 out1[1] = 0;
981 out1[2] = 0;
982 out1[3] = 0;
983 break;
984 case 23: out0[0] = __byte_perm (in1[1], in1[2], 0x6543);
985 out0[1] = __byte_perm (in1[2], in1[3], 0x6543);
986 out0[2] = __byte_perm (in1[3], 0, 0x6543);
987 out0[3] = 0;
988 out1[0] = 0;
989 out1[1] = 0;
990 out1[2] = 0;
991 out1[3] = 0;
992 break;
993 case 24: out0[0] = in1[2];
994 out0[1] = in1[3];
995 out0[2] = 0;
996 out0[3] = 0;
997 out1[0] = 0;
998 out1[1] = 0;
999 out1[2] = 0;
1000 out1[3] = 0;
1001 break;
1002 case 25: out0[0] = __byte_perm (in1[2], in1[3], 0x4321);
1003 out0[1] = __byte_perm (in1[3], 0, 0x4321);
1004 out0[2] = 0;
1005 out0[3] = 0;
1006 out1[0] = 0;
1007 out1[1] = 0;
1008 out1[2] = 0;
1009 out1[3] = 0;
1010 break;
1011 case 26: out0[0] = __byte_perm (in1[2], in1[3], 0x5432);
1012 out0[1] = __byte_perm (in1[3], 0, 0x5432);
1013 out0[2] = 0;
1014 out0[3] = 0;
1015 out1[0] = 0;
1016 out1[1] = 0;
1017 out1[2] = 0;
1018 out1[3] = 0;
1019 break;
1020 case 27: out0[0] = __byte_perm (in1[2], in1[3], 0x6543);
1021 out0[1] = __byte_perm (in1[3], 0, 0x6543);
1022 out0[2] = 0;
1023 out0[3] = 0;
1024 out1[0] = 0;
1025 out1[1] = 0;
1026 out1[2] = 0;
1027 out1[3] = 0;
1028 break;
1029 case 28: out0[0] = in1[3];
1030 out0[1] = 0;
1031 out0[2] = 0;
1032 out0[3] = 0;
1033 out1[0] = 0;
1034 out1[1] = 0;
1035 out1[2] = 0;
1036 out1[3] = 0;
1037 break;
1038 case 29: out0[0] = __byte_perm (in1[3], 0, 0x4321);
1039 out0[1] = 0;
1040 out0[2] = 0;
1041 out0[3] = 0;
1042 out1[0] = 0;
1043 out1[1] = 0;
1044 out1[2] = 0;
1045 out1[3] = 0;
1046 break;
1047 case 30: out0[0] = __byte_perm (in1[3], 0, 0x5432);
1048 out0[1] = 0;
1049 out0[2] = 0;
1050 out0[3] = 0;
1051 out1[0] = 0;
1052 out1[1] = 0;
1053 out1[2] = 0;
1054 out1[3] = 0;
1055 break;
1056 case 31: out0[0] = __byte_perm (in1[3], 0, 0x6543);
1057 out0[1] = 0;
1058 out0[2] = 0;
1059 out0[3] = 0;
1060 out1[0] = 0;
1061 out1[1] = 0;
1062 out1[2] = 0;
1063 out1[3] = 0;
1064 break;
1065 }
1066
1067 #else
1068
1069 switch (num)
1070 {
1071 case 0: out0[0] = in0[0];
1072 out0[1] = in0[1];
1073 out0[2] = in0[2];
1074 out0[3] = in0[3];
1075 out1[0] = in1[0];
1076 out1[1] = in1[1];
1077 out1[2] = in1[2];
1078 out1[3] = in1[3];
1079 break;
1080 case 1: out0[0] = in0[0] >> 8 | in0[1] << 24;
1081 out0[1] = in0[1] >> 8 | in0[2] << 24;
1082 out0[2] = in0[2] >> 8 | in0[3] << 24;
1083 out0[3] = in0[3] >> 8 | in1[0] << 24;
1084 out1[0] = in1[0] >> 8 | in1[1] << 24;
1085 out1[1] = in1[1] >> 8 | in1[2] << 24;
1086 out1[2] = in1[2] >> 8 | in1[3] << 24;
1087 out1[3] = in1[3] >> 8;
1088 break;
1089 case 2: out0[0] = in0[0] >> 16 | in0[1] << 16;
1090 out0[1] = in0[1] >> 16 | in0[2] << 16;
1091 out0[2] = in0[2] >> 16 | in0[3] << 16;
1092 out0[3] = in0[3] >> 16 | in1[0] << 16;
1093 out1[0] = in1[0] >> 16 | in1[1] << 16;
1094 out1[1] = in1[1] >> 16 | in1[2] << 16;
1095 out1[2] = in1[2] >> 16 | in1[3] << 16;
1096 out1[3] = in1[3] >> 16;
1097 break;
1098 case 3: out0[0] = in0[0] >> 24 | in0[1] << 8;
1099 out0[1] = in0[1] >> 24 | in0[2] << 8;
1100 out0[2] = in0[2] >> 24 | in0[3] << 8;
1101 out0[3] = in0[3] >> 24 | in1[0] << 8;
1102 out1[0] = in1[0] >> 24 | in1[1] << 8;
1103 out1[1] = in1[1] >> 24 | in1[2] << 8;
1104 out1[2] = in1[2] >> 24 | in1[3] << 8;
1105 out1[3] = in1[3] >> 24;
1106 break;
1107 case 4: out0[0] = in0[1];
1108 out0[1] = in0[2];
1109 out0[2] = in0[3];
1110 out0[3] = in1[0];
1111 out1[0] = in1[1];
1112 out1[1] = in1[2];
1113 out1[2] = in1[3];
1114 out1[3] = 0;
1115 break;
1116 case 5: out0[0] = in0[1] >> 8 | in0[2] << 24;
1117 out0[1] = in0[2] >> 8 | in0[3] << 24;
1118 out0[2] = in0[3] >> 8 | in1[0] << 24;
1119 out0[3] = in1[0] >> 8 | in1[1] << 24;
1120 out1[0] = in1[1] >> 8 | in1[2] << 24;
1121 out1[1] = in1[2] >> 8 | in1[3] << 24;
1122 out1[2] = in1[3] >> 8;
1123 out1[3] = 0;
1124 break;
1125 case 6: out0[0] = in0[1] >> 16 | in0[2] << 16;
1126 out0[1] = in0[2] >> 16 | in0[3] << 16;
1127 out0[2] = in0[3] >> 16 | in1[0] << 16;
1128 out0[3] = in1[0] >> 16 | in1[1] << 16;
1129 out1[0] = in1[1] >> 16 | in1[2] << 16;
1130 out1[1] = in1[2] >> 16 | in1[3] << 16;
1131 out1[2] = in1[3] >> 16;
1132 out1[3] = 0;
1133 break;
1134 case 7: out0[0] = in0[1] >> 24 | in0[2] << 8;
1135 out0[1] = in0[2] >> 24 | in0[3] << 8;
1136 out0[2] = in0[3] >> 24 | in1[0] << 8;
1137 out0[3] = in1[0] >> 24 | in1[1] << 8;
1138 out1[0] = in1[1] >> 24 | in1[2] << 8;
1139 out1[1] = in1[2] >> 24 | in1[3] << 8;
1140 out1[2] = in1[3] >> 24;
1141 out1[3] = 0;
1142 break;
1143 case 8: out0[0] = in0[2];
1144 out0[1] = in0[3];
1145 out0[2] = in1[0];
1146 out0[3] = in1[1];
1147 out1[0] = in1[2];
1148 out1[1] = in1[3];
1149 out1[2] = 0;
1150 out1[3] = 0;
1151 break;
1152 case 9: out0[0] = in0[2] >> 8 | in0[3] << 24;
1153 out0[1] = in0[3] >> 8 | in1[0] << 24;
1154 out0[2] = in1[0] >> 8 | in1[1] << 24;
1155 out0[3] = in1[1] >> 8 | in1[2] << 24;
1156 out1[0] = in1[2] >> 8 | in1[3] << 24;
1157 out1[1] = in1[3] >> 8;
1158 out1[2] = 0;
1159 out1[3] = 0;
1160 break;
1161 case 10: out0[0] = in0[2] >> 16 | in0[3] << 16;
1162 out0[1] = in0[3] >> 16 | in1[0] << 16;
1163 out0[2] = in1[0] >> 16 | in1[1] << 16;
1164 out0[3] = in1[1] >> 16 | in1[2] << 16;
1165 out1[0] = in1[2] >> 16 | in1[3] << 16;
1166 out1[1] = in1[3] >> 16;
1167 out1[2] = 0;
1168 out1[3] = 0;
1169 break;
1170 case 11: out0[0] = in0[2] >> 24 | in0[3] << 8;
1171 out0[1] = in0[3] >> 24 | in1[0] << 8;
1172 out0[2] = in1[0] >> 24 | in1[1] << 8;
1173 out0[3] = in1[1] >> 24 | in1[2] << 8;
1174 out1[0] = in1[2] >> 24 | in1[3] << 8;
1175 out1[1] = in1[3] >> 24;
1176 out1[2] = 0;
1177 out1[3] = 0;
1178 break;
1179 case 12: out0[0] = in0[3];
1180 out0[1] = in1[0];
1181 out0[2] = in1[1];
1182 out0[3] = in1[2];
1183 out1[0] = in1[3];
1184 out1[1] = 0;
1185 out1[2] = 0;
1186 out1[3] = 0;
1187 break;
1188 case 13:
1189 out0[0] = in0[3] >> 8 | in1[0] << 24;
1190 out0[1] = in1[0] >> 8 | in1[1] << 24;
1191 out0[2] = in1[1] >> 8 | in1[2] << 24;
1192 out0[3] = in1[2] >> 8 | in1[3] << 24;
1193 out1[0] = in1[3] >> 8;
1194 out1[1] = 0;
1195 out1[2] = 0;
1196 out1[3] = 0;
1197 break;
1198 case 14: out0[0] = in0[3] >> 16 | in1[0] << 16;
1199 out0[1] = in1[0] >> 16 | in1[1] << 16;
1200 out0[2] = in1[1] >> 16 | in1[2] << 16;
1201 out0[3] = in1[2] >> 16 | in1[3] << 16;
1202 out1[0] = in1[3] >> 16;
1203 out1[1] = 0;
1204 out1[2] = 0;
1205 out1[3] = 0;
1206 break;
1207 case 15: out0[0] = in0[3] >> 24 | in1[0] << 8;
1208 out0[1] = in1[0] >> 24 | in1[1] << 8;
1209 out0[2] = in1[1] >> 24 | in1[2] << 8;
1210 out0[3] = in1[2] >> 24 | in1[3] << 8;
1211 out1[0] = in1[3] >> 24;
1212 out1[1] = 0;
1213 out1[2] = 0;
1214 out1[3] = 0;
1215 break;
1216 case 16: out0[0] = in1[0];
1217 out0[1] = in1[1];
1218 out0[2] = in1[2];
1219 out0[3] = in1[3];
1220 out1[0] = 0;
1221 out1[1] = 0;
1222 out1[2] = 0;
1223 out1[3] = 0;
1224 break;
1225 case 17: out0[0] = in1[0] >> 8 | in1[1] << 24;
1226 out0[1] = in1[1] >> 8 | in1[2] << 24;
1227 out0[2] = in1[2] >> 8 | in1[3] << 24;
1228 out0[3] = in1[3] >> 8;
1229 out1[0] = 0;
1230 out1[1] = 0;
1231 out1[2] = 0;
1232 out1[3] = 0;
1233 break;
1234 case 18: out0[0] = in1[0] >> 16 | in1[1] << 16;
1235 out0[1] = in1[1] >> 16 | in1[2] << 16;
1236 out0[2] = in1[2] >> 16 | in1[3] << 16;
1237 out0[3] = in1[3] >> 16;
1238 out1[0] = 0;
1239 out1[1] = 0;
1240 out1[2] = 0;
1241 out1[3] = 0;
1242 break;
1243 case 19: out0[0] = in1[0] >> 24 | in1[1] << 8;
1244 out0[1] = in1[1] >> 24 | in1[2] << 8;
1245 out0[2] = in1[2] >> 24 | in1[3] << 8;
1246 out0[3] = in1[3] >> 24;
1247 out1[0] = 0;
1248 out1[1] = 0;
1249 out1[2] = 0;
1250 out1[3] = 0;
1251 break;
1252 case 20: out0[0] = in1[1];
1253 out0[1] = in1[2];
1254 out0[2] = in1[3];
1255 out0[3] = 0;
1256 out1[0] = 0;
1257 out1[1] = 0;
1258 out1[2] = 0;
1259 out1[3] = 0;
1260 break;
1261 case 21: out0[0] = in1[1] >> 8 | in1[2] << 24;
1262 out0[1] = in1[2] >> 8 | in1[3] << 24;
1263 out0[2] = in1[3] >> 8;
1264 out0[3] = 0;
1265 out1[0] = 0;
1266 out1[1] = 0;
1267 out1[2] = 0;
1268 out1[3] = 0;
1269 break;
1270 case 22: out0[0] = in1[1] >> 16 | in1[2] << 16;
1271 out0[1] = in1[2] >> 16 | in1[3] << 16;
1272 out0[2] = in1[3] >> 16;
1273 out0[3] = 0;
1274 out1[0] = 0;
1275 out1[1] = 0;
1276 out1[2] = 0;
1277 out1[3] = 0;
1278 break;
1279 case 23: out0[0] = in1[1] >> 24 | in1[2] << 8;
1280 out0[1] = in1[2] >> 24 | in1[3] << 8;
1281 out0[2] = in1[3] >> 24;
1282 out0[3] = 0;
1283 out1[0] = 0;
1284 out1[1] = 0;
1285 out1[2] = 0;
1286 out1[3] = 0;
1287 break;
1288 case 24: out0[0] = in1[2];
1289 out0[1] = in1[3];
1290 out0[2] = 0;
1291 out0[3] = 0;
1292 out1[0] = 0;
1293 out1[1] = 0;
1294 out1[2] = 0;
1295 out1[3] = 0;
1296 break;
1297 case 25: out0[0] = in1[2] >> 8 | in1[3] << 24;
1298 out0[1] = in1[3] >> 8;
1299 out0[2] = 0;
1300 out0[3] = 0;
1301 out1[0] = 0;
1302 out1[1] = 0;
1303 out1[2] = 0;
1304 out1[3] = 0;
1305 break;
1306 case 26: out0[0] = in1[2] >> 16 | in1[3] << 16;
1307 out0[1] = in1[3] >> 16;
1308 out0[2] = 0;
1309 out0[3] = 0;
1310 out1[0] = 0;
1311 out1[1] = 0;
1312 out1[2] = 0;
1313 out1[3] = 0;
1314 break;
1315 case 27: out0[0] = in1[2] >> 24 | in1[3] << 8;
1316 out0[1] = in1[3] >> 24;
1317 out0[2] = 0;
1318 out0[3] = 0;
1319 out1[0] = 0;
1320 out1[1] = 0;
1321 out1[2] = 0;
1322 out1[3] = 0;
1323 break;
1324 case 28: out0[0] = in1[3];
1325 out0[1] = 0;
1326 out0[2] = 0;
1327 out0[3] = 0;
1328 out1[0] = 0;
1329 out1[1] = 0;
1330 out1[2] = 0;
1331 out1[3] = 0;
1332 break;
1333 case 29: out0[0] = in1[3] >> 8;
1334 out0[1] = 0;
1335 out0[2] = 0;
1336 out0[3] = 0;
1337 out1[0] = 0;
1338 out1[1] = 0;
1339 out1[2] = 0;
1340 out1[3] = 0;
1341 break;
1342 case 30: out0[0] = in1[3] >> 16;
1343 out0[1] = 0;
1344 out0[2] = 0;
1345 out0[3] = 0;
1346 out1[0] = 0;
1347 out1[1] = 0;
1348 out1[2] = 0;
1349 out1[3] = 0;
1350 break;
1351 case 31: out0[0] = in1[3] >> 24;
1352 out0[1] = 0;
1353 out0[2] = 0;
1354 out0[3] = 0;
1355 out1[0] = 0;
1356 out1[1] = 0;
1357 out1[2] = 0;
1358 out1[3] = 0;
1359 break;
1360 }
1361
1362 #endif
1363 }
1364
1365 __device__ static void append_block1 (const u32 offset, u32x dst0[4], u32x dst1[4], const u32x src_r0)
1366 {
1367 u32x tmp[2];
1368
1369 switch (offset & 3)
1370 {
1371 case 0: tmp[0] = src_r0;
1372 tmp[1] = 0;
1373 break;
1374 case 1: tmp[0] = src_r0 << 8;
1375 tmp[1] = src_r0 >> 24;
1376 break;
1377 case 2: tmp[0] = src_r0 << 16;
1378 tmp[1] = src_r0 >> 16;
1379 break;
1380 case 3: tmp[0] = src_r0 << 24;
1381 tmp[1] = src_r0 >> 8;
1382 break;
1383 }
1384
1385 switch (offset / 4)
1386 {
1387 case 0: dst0[0] |= tmp[0];
1388 dst0[1] = tmp[1];
1389 break;
1390 case 1: dst0[1] |= tmp[0];
1391 dst0[2] = tmp[1];
1392 break;
1393 case 2: dst0[2] |= tmp[0];
1394 dst0[3] = tmp[1];
1395 break;
1396 case 3: dst0[3] |= tmp[0];
1397 dst1[0] = tmp[1];
1398 break;
1399 case 4: dst1[0] |= tmp[0];
1400 dst1[1] = tmp[1];
1401 break;
1402 case 5: dst1[1] |= tmp[0];
1403 dst1[2] = tmp[1];
1404 break;
1405 case 6: dst1[2] |= tmp[0];
1406 dst1[3] = tmp[1];
1407 break;
1408 case 7: dst1[3] |= tmp[0];
1409 break;
1410 }
1411 }
1412
1413 __device__ static void append_block8 (const u32 offset, u32x dst0[4], u32x dst1[4], const u32x src_l0[4], const u32x src_l1[4], const u32x src_r0[4], const u32x src_r1[4])
1414 {
1415 #if __CUDA_ARCH__ >= 200
1416
1417 switch (offset)
1418 {
1419 case 0:
1420 dst0[0] = src_r0[0];
1421 dst0[1] = src_r0[1];
1422 dst0[2] = src_r0[2];
1423 dst0[3] = src_r0[3];
1424 dst1[0] = src_r1[0];
1425 dst1[1] = src_r1[1];
1426 dst1[2] = src_r1[2];
1427 dst1[3] = src_r1[3];
1428 break;
1429
1430 case 1:
1431 dst0[0] = __byte_perm (src_l0[0], src_r0[0], 0x6540);
1432 dst0[1] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
1433 dst0[2] = __byte_perm (src_r0[1], src_r0[2], 0x6543);
1434 dst0[3] = __byte_perm (src_r0[2], src_r0[3], 0x6543);
1435 dst1[0] = __byte_perm (src_r0[3], src_r1[0], 0x6543);
1436 dst1[1] = __byte_perm (src_r1[0], src_r1[1], 0x6543);
1437 dst1[2] = __byte_perm (src_r1[1], src_r1[2], 0x6543);
1438 dst1[3] = __byte_perm (src_r1[2], src_r1[3], 0x6543);
1439 break;
1440
1441 case 2:
1442 dst0[0] = __byte_perm (src_l0[0], src_r0[0], 0x5410);
1443 dst0[1] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
1444 dst0[2] = __byte_perm (src_r0[1], src_r0[2], 0x5432);
1445 dst0[3] = __byte_perm (src_r0[2], src_r0[3], 0x5432);
1446 dst1[0] = __byte_perm (src_r0[3], src_r1[0], 0x5432);
1447 dst1[1] = __byte_perm (src_r1[0], src_r1[1], 0x5432);
1448 dst1[2] = __byte_perm (src_r1[1], src_r1[2], 0x5432);
1449 dst1[3] = __byte_perm (src_r1[2], src_r1[3], 0x5432);
1450 break;
1451
1452 case 3:
1453 dst0[0] = __byte_perm (src_l0[0], src_r0[0], 0x4210);
1454 dst0[1] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
1455 dst0[2] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
1456 dst0[3] = __byte_perm (src_r0[2], src_r0[3], 0x4321);
1457 dst1[0] = __byte_perm (src_r0[3], src_r1[0], 0x4321);
1458 dst1[1] = __byte_perm (src_r1[0], src_r1[1], 0x4321);
1459 dst1[2] = __byte_perm (src_r1[1], src_r1[2], 0x4321);
1460 dst1[3] = __byte_perm (src_r1[2], src_r1[3], 0x4321);
1461 break;
1462
1463 case 4:
1464 dst0[1] = src_r0[0];
1465 dst0[2] = src_r0[1];
1466 dst0[3] = src_r0[2];
1467 dst1[0] = src_r0[3];
1468 dst1[1] = src_r1[0];
1469 dst1[2] = src_r1[1];
1470 dst1[3] = src_r1[2];
1471 break;
1472
1473 case 5:
1474 dst0[1] = __byte_perm (src_l0[1], src_r0[0], 0x6540);
1475 dst0[2] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
1476 dst0[3] = __byte_perm (src_r0[1], src_r0[2], 0x6543);
1477 dst1[0] = __byte_perm (src_r0[2], src_r0[3], 0x6543);
1478 dst1[1] = __byte_perm (src_r0[3], src_r1[0], 0x6543);
1479 dst1[2] = __byte_perm (src_r1[0], src_r1[1], 0x6543);
1480 dst1[3] = __byte_perm (src_r1[1], src_r1[2], 0x6543);
1481 break;
1482
1483 case 6:
1484 dst0[1] = __byte_perm (src_l0[1], src_r0[0], 0x5410);
1485 dst0[2] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
1486 dst0[3] = __byte_perm (src_r0[1], src_r0[2], 0x5432);
1487 dst1[0] = __byte_perm (src_r0[2], src_r0[3], 0x5432);
1488 dst1[1] = __byte_perm (src_r0[3], src_r1[0], 0x5432);
1489 dst1[2] = __byte_perm (src_r1[0], src_r1[1], 0x5432);
1490 dst1[3] = __byte_perm (src_r1[1], src_r1[2], 0x5432);
1491 break;
1492
1493 case 7:
1494 dst0[1] = __byte_perm (src_l0[1], src_r0[0], 0x4210);
1495 dst0[2] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
1496 dst0[3] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
1497 dst1[0] = __byte_perm (src_r0[2], src_r0[3], 0x4321);
1498 dst1[1] = __byte_perm (src_r0[3], src_r1[0], 0x4321);
1499 dst1[2] = __byte_perm (src_r1[0], src_r1[1], 0x4321);
1500 dst1[3] = __byte_perm (src_r1[1], src_r1[2], 0x4321);
1501 break;
1502
1503 case 8:
1504 dst0[2] = src_r0[0];
1505 dst0[3] = src_r0[1];
1506 dst1[0] = src_r0[2];
1507 dst1[1] = src_r0[3];
1508 dst1[2] = src_r1[0];
1509 dst1[3] = src_r1[1];
1510 break;
1511
1512 case 9:
1513 dst0[2] = __byte_perm (src_l0[2], src_r0[0], 0x6540);
1514 dst0[3] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
1515 dst1[0] = __byte_perm (src_r0[1], src_r0[2], 0x6543);
1516 dst1[1] = __byte_perm (src_r0[2], src_r0[3], 0x6543);
1517 dst1[2] = __byte_perm (src_r0[3], src_r1[0], 0x6543);
1518 dst1[3] = __byte_perm (src_r1[0], src_r1[1], 0x6543);
1519 break;
1520
1521 case 10:
1522 dst0[2] = __byte_perm (src_l0[2], src_r0[0], 0x5410);
1523 dst0[3] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
1524 dst1[0] = __byte_perm (src_r0[1], src_r0[2], 0x5432);
1525 dst1[1] = __byte_perm (src_r0[2], src_r0[3], 0x5432);
1526 dst1[2] = __byte_perm (src_r0[3], src_r1[0], 0x5432);
1527 dst1[3] = __byte_perm (src_r1[0], src_r1[1], 0x5432);
1528 break;
1529
1530 case 11:
1531 dst0[2] = __byte_perm (src_l0[2], src_r0[0], 0x4210);
1532 dst0[3] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
1533 dst1[0] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
1534 dst1[1] = __byte_perm (src_r0[2], src_r0[3], 0x4321);
1535 dst1[2] = __byte_perm (src_r0[3], src_r1[0], 0x4321);
1536 dst1[3] = __byte_perm (src_r1[0], src_r1[1], 0x4321);
1537 break;
1538
1539 case 12:
1540 dst0[3] = src_r0[0];
1541 dst1[0] = src_r0[1];
1542 dst1[1] = src_r0[2];
1543 dst1[2] = src_r0[3];
1544 dst1[3] = src_r1[0];
1545 break;
1546
1547 case 13:
1548 dst0[3] = __byte_perm (src_l0[3], src_r0[0], 0x6540);
1549 dst1[0] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
1550 dst1[1] = __byte_perm (src_r0[1], src_r0[2], 0x6543);
1551 dst1[2] = __byte_perm (src_r0[2], src_r0[3], 0x6543);
1552 dst1[3] = __byte_perm (src_r0[3], src_r1[0], 0x6543);
1553 break;
1554
1555 case 14:
1556 dst0[3] = __byte_perm (src_l0[3], src_r0[0], 0x5410);
1557 dst1[0] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
1558 dst1[1] = __byte_perm (src_r0[1], src_r0[2], 0x5432);
1559 dst1[2] = __byte_perm (src_r0[2], src_r0[3], 0x5432);
1560 dst1[3] = __byte_perm (src_r0[3], src_r1[0], 0x5432);
1561 break;
1562
1563 case 15:
1564 dst0[3] = __byte_perm (src_l0[3], src_r0[0], 0x4210);
1565 dst1[0] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
1566 dst1[1] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
1567 dst1[2] = __byte_perm (src_r0[2], src_r0[3], 0x4321);
1568 dst1[3] = __byte_perm (src_r0[3], src_r1[0], 0x4321);
1569 break;
1570
1571 case 16:
1572 dst1[0] = src_r0[0];
1573 dst1[1] = src_r0[1];
1574 dst1[2] = src_r0[2];
1575 dst1[3] = src_r0[3];
1576 break;
1577
1578 case 17:
1579 dst1[0] = __byte_perm (src_l1[0], src_r0[0], 0x6540);
1580 dst1[1] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
1581 dst1[2] = __byte_perm (src_r0[1], src_r0[2], 0x6543);
1582 dst1[3] = __byte_perm (src_r0[2], src_r0[3], 0x6543);
1583 break;
1584
1585 case 18:
1586 dst1[0] = __byte_perm (src_l1[0], src_r0[0], 0x5410);
1587 dst1[1] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
1588 dst1[2] = __byte_perm (src_r0[1], src_r0[2], 0x5432);
1589 dst1[3] = __byte_perm (src_r0[2], src_r0[3], 0x5432);
1590 break;
1591
1592 case 19:
1593 dst1[0] = __byte_perm (src_l1[0], src_r0[0], 0x4210);
1594 dst1[1] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
1595 dst1[2] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
1596 dst1[3] = __byte_perm (src_r0[2], src_r0[3], 0x4321);
1597 break;
1598
1599 case 20:
1600 dst1[1] = src_r0[0];
1601 dst1[2] = src_r0[1];
1602 dst1[3] = src_r0[2];
1603 break;
1604
1605 case 21:
1606 dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x6540);
1607 dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
1608 dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x6543);
1609 break;
1610
1611 case 22:
1612 dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x5410);
1613 dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
1614 dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x5432);
1615 break;
1616
1617 case 23:
1618 dst1[1] = __byte_perm (src_l1[1], src_r0[0], 0x4210);
1619 dst1[2] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
1620 dst1[3] = __byte_perm (src_r0[1], src_r0[2], 0x4321);
1621 break;
1622
1623 case 24:
1624 dst1[2] = src_r0[0];
1625 dst1[3] = src_r0[1];
1626 break;
1627
1628 case 25:
1629 dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x6540);
1630 dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x6543);
1631 break;
1632
1633 case 26:
1634 dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x5410);
1635 dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x5432);
1636 break;
1637
1638 case 27:
1639 dst1[2] = __byte_perm (src_l1[2], src_r0[0], 0x4210);
1640 dst1[3] = __byte_perm (src_r0[0], src_r0[1], 0x4321);
1641 break;
1642
1643 case 28:
1644 dst1[3] = src_r0[0];
1645 break;
1646
1647 case 29:
1648 dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x6540);
1649 break;
1650
1651 case 30:
1652 dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x5410);
1653 break;
1654
1655 case 31:
1656 dst1[3] = __byte_perm (src_l1[3], src_r0[0], 0x4210);
1657 break;
1658 }
1659
1660 #else
1661
1662 switch (offset)
1663 {
1664 case 0:
1665 dst0[0] = src_r0[0];
1666 dst0[1] = src_r0[1];
1667 dst0[2] = src_r0[2];
1668 dst0[3] = src_r0[3];
1669 dst1[0] = src_r1[0];
1670 dst1[1] = src_r1[1];
1671 dst1[2] = src_r1[2];
1672 dst1[3] = src_r1[3];
1673 break;
1674
1675 case 1:
1676 dst0[0] = src_l0[0] | src_r0[0] << 8;
1677 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
1678 dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
1679 dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
1680 dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8;
1681 dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8;
1682 dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8;
1683 dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8;
1684 break;
1685
1686 case 2:
1687 dst0[0] = src_l0[0] | src_r0[0] << 16;
1688 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
1689 dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
1690 dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
1691 dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16;
1692 dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16;
1693 dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16;
1694 dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16;
1695 break;
1696
1697 case 3:
1698 dst0[0] = src_l0[0] | src_r0[0] << 24;
1699 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
1700 dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
1701 dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
1702 dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24;
1703 dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24;
1704 dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24;
1705 dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24;
1706 break;
1707
1708 case 4:
1709 dst0[1] = src_r0[0];
1710 dst0[2] = src_r0[1];
1711 dst0[3] = src_r0[2];
1712 dst1[0] = src_r0[3];
1713 dst1[1] = src_r1[0];
1714 dst1[2] = src_r1[1];
1715 dst1[3] = src_r1[2];
1716 break;
1717
1718 case 5:
1719 dst0[1] = src_l0[1] | src_r0[0] << 8;
1720 dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
1721 dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
1722 dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
1723 dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8;
1724 dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8;
1725 dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8;
1726 break;
1727
1728 case 6:
1729 dst0[1] = src_l0[1] | src_r0[0] << 16;
1730 dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
1731 dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
1732 dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
1733 dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16;
1734 dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16;
1735 dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16;
1736 break;
1737
1738 case 7:
1739 dst0[1] = src_l0[1] | src_r0[0] << 24;
1740 dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
1741 dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
1742 dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
1743 dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24;
1744 dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24;
1745 dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24;
1746 break;
1747
1748 case 8:
1749 dst0[2] = src_r0[0];
1750 dst0[3] = src_r0[1];
1751 dst1[0] = src_r0[2];
1752 dst1[1] = src_r0[3];
1753 dst1[2] = src_r1[0];
1754 dst1[3] = src_r1[1];
1755 break;
1756
1757 case 9:
1758 dst0[2] = src_l0[2] | src_r0[0] << 8;
1759 dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
1760 dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
1761 dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
1762 dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8;
1763 dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8;
1764 break;
1765
1766 case 10:
1767 dst0[2] = src_l0[2] | src_r0[0] << 16;
1768 dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
1769 dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
1770 dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
1771 dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16;
1772 dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16;
1773 break;
1774
1775 case 11:
1776 dst0[2] = src_l0[2] | src_r0[0] << 24;
1777 dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
1778 dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
1779 dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
1780 dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24;
1781 dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24;
1782 break;
1783
1784 case 12:
1785 dst0[3] = src_r0[0];
1786 dst1[0] = src_r0[1];
1787 dst1[1] = src_r0[2];
1788 dst1[2] = src_r0[3];
1789 dst1[3] = src_r1[0];
1790 break;
1791
1792 case 13:
1793 dst0[3] = src_l0[3] | src_r0[0] << 8;
1794 dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
1795 dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
1796 dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
1797 dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8;
1798 break;
1799
1800 case 14:
1801 dst0[3] = src_l0[3] | src_r0[0] << 16;
1802 dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
1803 dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
1804 dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
1805 dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16;
1806 break;
1807
1808 case 15:
1809 dst0[3] = src_l0[3] | src_r0[0] << 24;
1810 dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
1811 dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
1812 dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
1813 dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24;
1814 break;
1815
1816 case 16:
1817 dst1[0] = src_r0[0];
1818 dst1[1] = src_r0[1];
1819 dst1[2] = src_r0[2];
1820 dst1[3] = src_r0[3];
1821 break;
1822
1823 case 17:
1824 dst1[0] = src_l1[0] | src_r0[0] << 8;
1825 dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
1826 dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
1827 dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
1828 break;
1829
1830 case 18:
1831 dst1[0] = src_l1[0] | src_r0[0] << 16;
1832 dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
1833 dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
1834 dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
1835 break;
1836
1837 case 19:
1838 dst1[0] = src_l1[0] | src_r0[0] << 24;
1839 dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
1840 dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
1841 dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
1842 break;
1843
1844 case 20:
1845 dst1[1] = src_r0[0];
1846 dst1[2] = src_r0[1];
1847 dst1[3] = src_r0[2];
1848 break;
1849
1850 case 21:
1851 dst1[1] = src_l1[1] | src_r0[0] << 8;
1852 dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
1853 dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
1854 break;
1855
1856 case 22:
1857 dst1[1] = src_l1[1] | src_r0[0] << 16;
1858 dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
1859 dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
1860 break;
1861
1862 case 23:
1863 dst1[1] = src_l1[1] | src_r0[0] << 24;
1864 dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
1865 dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
1866 break;
1867
1868 case 24:
1869 dst1[2] = src_r0[0];
1870 dst1[3] = src_r0[1];
1871 break;
1872
1873 case 25:
1874 dst1[2] = src_l1[2] | src_r0[0] << 8;
1875 dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
1876 break;
1877
1878 case 26:
1879 dst1[2] = src_l1[2] | src_r0[0] << 16;
1880 dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
1881 break;
1882
1883 case 27:
1884 dst1[2] = src_l1[2] | src_r0[0] << 24;
1885 dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
1886 break;
1887
1888 case 28:
1889 dst1[3] = src_r0[0];
1890 break;
1891
1892 case 29:
1893 dst1[3] = src_l1[3] | src_r0[0] << 8;
1894 break;
1895
1896 case 30:
1897 dst1[3] = src_l1[3] | src_r0[0] << 16;
1898 break;
1899
1900 case 31:
1901 dst1[3] = src_l1[3] | src_r0[0] << 24;
1902 break;
1903 }
1904
1905 #endif
1906 }
1907
1908 __device__ static void reverse_block (u32x in0[4], u32x in1[4], u32x out0[4], u32x out1[4], const u32 len)
1909 {
1910 rshift_block_N (in0, in1, out0, out1, 32 - len);
1911
1912 u32x tib40[4];
1913 u32x tib41[4];
1914
1915 tib40[0] = out1[3];
1916 tib40[1] = out1[2];
1917 tib40[2] = out1[1];
1918 tib40[3] = out1[0];
1919 tib41[0] = out0[3];
1920 tib41[1] = out0[2];
1921 tib41[2] = out0[1];
1922 tib41[3] = out0[0];
1923
1924 out0[0] = swap_workaround (tib40[0]);
1925 out0[1] = swap_workaround (tib40[1]);
1926 out0[2] = swap_workaround (tib40[2]);
1927 out0[3] = swap_workaround (tib40[3]);
1928 out1[0] = swap_workaround (tib41[0]);
1929 out1[1] = swap_workaround (tib41[1]);
1930 out1[2] = swap_workaround (tib41[2]);
1931 out1[3] = swap_workaround (tib41[3]);
1932 }
1933
1934 __device__ static u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
1935 {
1936 buf0[0] |= (generate_cmask (buf0[0]));
1937 buf0[1] |= (generate_cmask (buf0[1]));
1938 buf0[2] |= (generate_cmask (buf0[2]));
1939 buf0[3] |= (generate_cmask (buf0[3]));
1940 buf1[0] |= (generate_cmask (buf1[0]));
1941 buf1[1] |= (generate_cmask (buf1[1]));
1942 buf1[2] |= (generate_cmask (buf1[2]));
1943 buf1[3] |= (generate_cmask (buf1[3]));
1944
1945 return in_len;
1946 }
1947
1948 __device__ static u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
1949 {
1950 buf0[0] &= ~(generate_cmask (buf0[0]));
1951 buf0[1] &= ~(generate_cmask (buf0[1]));
1952 buf0[2] &= ~(generate_cmask (buf0[2]));
1953 buf0[3] &= ~(generate_cmask (buf0[3]));
1954 buf1[0] &= ~(generate_cmask (buf1[0]));
1955 buf1[1] &= ~(generate_cmask (buf1[1]));
1956 buf1[2] &= ~(generate_cmask (buf1[2]));
1957 buf1[3] &= ~(generate_cmask (buf1[3]));
1958
1959 return in_len;
1960 }
1961
1962 __device__ static u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
1963 {
1964 rule_op_mangle_lrest (p0, p1, buf0, buf1, in_len);
1965
1966 buf0[0] &= ~(0x00000020 & generate_cmask (buf0[0]));
1967
1968 return in_len;
1969 }
1970
1971 __device__ static u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
1972 {
1973 rule_op_mangle_urest (p0, p1, buf0, buf1, in_len);
1974
1975 buf0[0] |= (0x00000020 & generate_cmask (buf0[0]));
1976
1977 return in_len;
1978 }
1979
1980 __device__ static u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
1981 {
1982 buf0[0] ^= (generate_cmask (buf0[0]));
1983 buf0[1] ^= (generate_cmask (buf0[1]));
1984 buf0[2] ^= (generate_cmask (buf0[2]));
1985 buf0[3] ^= (generate_cmask (buf0[3]));
1986 buf1[0] ^= (generate_cmask (buf1[0]));
1987 buf1[1] ^= (generate_cmask (buf1[1]));
1988 buf1[2] ^= (generate_cmask (buf1[2]));
1989 buf1[3] ^= (generate_cmask (buf1[3]));
1990
1991 return in_len;
1992 }
1993
1994 __device__ static u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
1995 {
1996 if (p0 >= in_len) return (in_len);
1997
1998 const u32 tmp = 0x20 << ((p0 & 3) * 8);
1999
2000 switch (p0 / 4)
2001 {
2002 case 0: buf0[0] ^= (tmp & generate_cmask (buf0[0])); break;
2003 case 1: buf0[1] ^= (tmp & generate_cmask (buf0[1])); break;
2004 case 2: buf0[2] ^= (tmp & generate_cmask (buf0[2])); break;
2005 case 3: buf0[3] ^= (tmp & generate_cmask (buf0[3])); break;
2006 case 4: buf1[0] ^= (tmp & generate_cmask (buf1[0])); break;
2007 case 5: buf1[1] ^= (tmp & generate_cmask (buf1[1])); break;
2008 case 6: buf1[2] ^= (tmp & generate_cmask (buf1[2])); break;
2009 case 7: buf1[3] ^= (tmp & generate_cmask (buf1[3])); break;
2010 }
2011
2012 return in_len;
2013 }
2014
2015 __device__ static u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2016 {
2017 reverse_block (buf0, buf1, buf0, buf1, in_len);
2018
2019 return in_len;
2020 }
2021
2022 __device__ static u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2023 {
2024 if ((in_len + in_len) >= 32) return (in_len);
2025
2026 u32 out_len = in_len;
2027
2028 u32x tib40[4];
2029 u32x tib41[4];
2030
2031 tib40[0] = buf0[0];
2032 tib40[1] = buf0[1];
2033 tib40[2] = buf0[2];
2034 tib40[3] = buf0[3];
2035 tib41[0] = buf1[0];
2036 tib41[1] = buf1[1];
2037 tib41[2] = buf1[2];
2038 tib41[3] = buf1[3];
2039
2040 append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
2041
2042 out_len += in_len;
2043
2044 return out_len;
2045 }
2046
2047 __device__ static u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2048 {
2049 if (((in_len * p0) + in_len) >= 32) return (in_len);
2050
2051 u32 out_len = in_len;
2052
2053 u32x tib40[4];
2054 u32x tib41[4];
2055
2056 tib40[0] = buf0[0];
2057 tib40[1] = buf0[1];
2058 tib40[2] = buf0[2];
2059 tib40[3] = buf0[3];
2060 tib41[0] = buf1[0];
2061 tib41[1] = buf1[1];
2062 tib41[2] = buf1[2];
2063 tib41[3] = buf1[3];
2064
2065 for (u32 i = 0; i < p0; i++)
2066 {
2067 append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
2068
2069 out_len += in_len;
2070 }
2071
2072 return out_len;
2073 }
2074
2075 __device__ static u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2076 {
2077 if ((in_len + in_len) >= 32) return (in_len);
2078
2079 u32 out_len = in_len;
2080
2081 u32x tib40[4];
2082 u32x tib41[4];
2083
2084 reverse_block (buf0, buf1, tib40, tib41, out_len);
2085
2086 append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
2087
2088 out_len += in_len;
2089
2090 return out_len;
2091 }
2092
2093 __device__ static u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2094 {
2095 if ((in_len + 1) >= 32) return (in_len);
2096
2097 u32 out_len = in_len;
2098
2099 append_block1 (out_len, buf0, buf1, p0);
2100
2101 out_len++;
2102
2103 return out_len;
2104 }
2105
2106 __device__ static u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2107 {
2108 if ((in_len + 1) >= 32) return (in_len);
2109
2110 u32 out_len = in_len;
2111
2112 rshift_block (buf0, buf1, buf0, buf1);
2113
2114 buf0[0] = buf0[0] | p0;
2115
2116 out_len++;
2117
2118 return out_len;
2119 }
2120
2121 __device__ static u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2122 {
2123 if (in_len == 0) return (in_len);
2124
2125 const u32 in_len1 = in_len - 1;
2126
2127 const u32 sh = (in_len1 & 3) * 8;
2128
2129 const u32x tmp = (buf0[0] & 0xff) << sh;
2130
2131 lshift_block (buf0, buf1, buf0, buf1);
2132
2133 switch (in_len1 / 4)
2134 {
2135 case 0: buf0[0] |= tmp; break;
2136 case 1: buf0[1] |= tmp; break;
2137 case 2: buf0[2] |= tmp; break;
2138 case 3: buf0[3] |= tmp; break;
2139 case 4: buf1[0] |= tmp; break;
2140 case 5: buf1[1] |= tmp; break;
2141 case 6: buf1[2] |= tmp; break;
2142 case 7: buf1[3] |= tmp; break;
2143 }
2144
2145 return in_len;
2146 }
2147
2148 __device__ static u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2149 {
2150 if (in_len == 0) return (in_len);
2151
2152 const u32 in_len1 = in_len - 1;
2153
2154 const u32 sh = (in_len1 & 3) * 8;
2155
2156 u32x tmp = 0;
2157
2158 switch (in_len1 / 4)
2159 {
2160 case 0: tmp = (buf0[0] >> sh) & 0xff; break;
2161 case 1: tmp = (buf0[1] >> sh) & 0xff; break;
2162 case 2: tmp = (buf0[2] >> sh) & 0xff; break;
2163 case 3: tmp = (buf0[3] >> sh) & 0xff; break;
2164 case 4: tmp = (buf1[0] >> sh) & 0xff; break;
2165 case 5: tmp = (buf1[1] >> sh) & 0xff; break;
2166 case 6: tmp = (buf1[2] >> sh) & 0xff; break;
2167 case 7: tmp = (buf1[3] >> sh) & 0xff; break;
2168 }
2169
2170 rshift_block (buf0, buf1, buf0, buf1);
2171
2172 buf0[0] |= tmp;
2173
2174 truncate_right (buf0, buf1, in_len);
2175
2176 return in_len;
2177 }
2178
2179 __device__ static u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2180 {
2181 if (in_len == 0) return (in_len);
2182
2183 const u32 in_len1 = in_len - 1;
2184
2185 lshift_block (buf0, buf1, buf0, buf1);
2186
2187 return in_len1;
2188 }
2189
2190 __device__ static u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2191 {
2192 if (in_len == 0) return (in_len);
2193
2194 const u32 in_len1 = in_len - 1;
2195
2196 const u32 tmp = (1 << ((in_len1 & 3) * 8)) - 1;
2197
2198 switch (in_len1 / 4)
2199 {
2200 case 0: buf0[0] &= tmp; break;
2201 case 1: buf0[1] &= tmp; break;
2202 case 2: buf0[2] &= tmp; break;
2203 case 3: buf0[3] &= tmp; break;
2204 case 4: buf1[0] &= tmp; break;
2205 case 5: buf1[1] &= tmp; break;
2206 case 6: buf1[2] &= tmp; break;
2207 case 7: buf1[3] &= tmp; break;
2208 }
2209
2210 return in_len1;
2211 }
2212
2213 __device__ static u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2214 {
2215 if (p0 >= in_len) return (in_len);
2216
2217 u32 out_len = in_len;
2218
2219 u32x tib40[4];
2220 u32x tib41[4];
2221
2222 lshift_block (buf0, buf1, tib40, tib41);
2223
2224 const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
2225 const u32 mr = ~ml;
2226
2227 switch (p0 / 4)
2228 {
2229 case 0: buf0[0] = (buf0[0] & ml)
2230 | (tib40[0] & mr);
2231 buf0[1] = tib40[1];
2232 buf0[2] = tib40[2];
2233 buf0[3] = tib40[3];
2234 buf1[0] = tib41[0];
2235 buf1[1] = tib41[1];
2236 buf1[2] = tib41[2];
2237 buf1[3] = tib41[3];
2238 break;
2239 case 1: buf0[1] = (buf0[1] & ml)
2240 | (tib40[1] & mr);
2241 buf0[2] = tib40[2];
2242 buf0[3] = tib40[3];
2243 buf1[0] = tib41[0];
2244 buf1[1] = tib41[1];
2245 buf1[2] = tib41[2];
2246 buf1[3] = tib41[3];
2247 break;
2248 case 2: buf0[2] = (buf0[2] & ml)
2249 | (tib40[2] & mr);
2250 buf0[3] = tib40[3];
2251 buf1[0] = tib41[0];
2252 buf1[1] = tib41[1];
2253 buf1[2] = tib41[2];
2254 buf1[3] = tib41[3];
2255 break;
2256 case 3: buf0[3] = (buf0[3] & ml)
2257 | (tib40[3] & mr);
2258 buf1[0] = tib41[0];
2259 buf1[1] = tib41[1];
2260 buf1[2] = tib41[2];
2261 buf1[3] = tib41[3];
2262 break;
2263 case 4: buf1[0] = (buf1[0] & ml)
2264 | (tib41[0] & mr);
2265 buf1[1] = tib41[1];
2266 buf1[2] = tib41[2];
2267 buf1[3] = tib41[3];
2268 break;
2269 case 5: buf1[1] = (buf1[1] & ml)
2270 | (tib41[1] & mr);
2271 buf1[2] = tib41[2];
2272 buf1[3] = tib41[3];
2273 break;
2274 case 6: buf1[2] = (buf1[2] & ml)
2275 | (tib41[2] & mr);
2276 buf1[3] = tib41[3];
2277 break;
2278 case 7: buf1[3] = (buf1[3] & ml)
2279 | (tib41[3] & mr);
2280 break;
2281 }
2282
2283 out_len--;
2284
2285 return out_len;
2286 }
2287
2288 __device__ static u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2289 {
2290 if (p0 >= in_len) return (in_len);
2291
2292 if ((p0 + p1) > in_len) return (in_len);
2293
2294 u32 out_len = p1;
2295
2296 lshift_block_N (buf0, buf1, buf0, buf1, p0);
2297
2298 truncate_right (buf0, buf1, out_len);
2299
2300 return out_len;
2301 }
2302
2303 __device__ static u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2304 {
2305 if (p0 >= in_len) return (in_len);
2306
2307 if ((p0 + p1) > in_len) return (in_len);
2308
2309 u32 out_len = in_len;
2310
2311 u32x tib40[4];
2312 u32x tib41[4];
2313
2314 tib40[0] = 0;
2315 tib40[1] = 0;
2316 tib40[2] = 0;
2317 tib40[3] = 0;
2318 tib41[0] = 0;
2319 tib41[1] = 0;
2320 tib41[2] = 0;
2321 tib41[3] = 0;
2322
2323 lshift_block_N (buf0, buf1, tib40, tib41, p1);
2324
2325 const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
2326 const u32 mr = ~ml;
2327
2328 switch (p0 / 4)
2329 {
2330 case 0: buf0[0] = (buf0[0] & ml)
2331 | (tib40[0] & mr);
2332 buf0[1] = tib40[1];
2333 buf0[2] = tib40[2];
2334 buf0[3] = tib40[3];
2335 buf1[0] = tib41[0];
2336 buf1[1] = tib41[1];
2337 buf1[2] = tib41[2];
2338 buf1[3] = tib41[3];
2339 break;
2340 case 1: buf0[1] = (buf0[1] & ml)
2341 | (tib40[1] & mr);
2342 buf0[2] = tib40[2];
2343 buf0[3] = tib40[3];
2344 buf1[0] = tib41[0];
2345 buf1[1] = tib41[1];
2346 buf1[2] = tib41[2];
2347 buf1[3] = tib41[3];
2348 break;
2349 case 2: buf0[2] = (buf0[2] & ml)
2350 | (tib40[2] & mr);
2351 buf0[3] = tib40[3];
2352 buf1[0] = tib41[0];
2353 buf1[1] = tib41[1];
2354 buf1[2] = tib41[2];
2355 buf1[3] = tib41[3];
2356 break;
2357 case 3: buf0[3] = (buf0[3] & ml)
2358 | (tib40[3] & mr);
2359 buf1[0] = tib41[0];
2360 buf1[1] = tib41[1];
2361 buf1[2] = tib41[2];
2362 buf1[3] = tib41[3];
2363 break;
2364 case 4: buf1[0] = (buf1[0] & ml)
2365 | (tib41[0] & mr);
2366 buf1[1] = tib41[1];
2367 buf1[2] = tib41[2];
2368 buf1[3] = tib41[3];
2369 break;
2370 case 5: buf1[1] = (buf1[1] & ml)
2371 | (tib41[1] & mr);
2372 buf1[2] = tib41[2];
2373 buf1[3] = tib41[3];
2374 break;
2375 case 6: buf1[2] = (buf1[2] & ml)
2376 | (tib41[2] & mr);
2377 buf1[3] = tib41[3];
2378 break;
2379 case 7: buf1[3] = (buf1[3] & ml)
2380 | (tib41[3] & mr);
2381 break;
2382 }
2383
2384 out_len -= p1;
2385
2386 return out_len;
2387 }
2388
2389 __device__ static u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2390 {
2391 if (p0 > in_len) return (in_len);
2392
2393 if ((in_len + 1) >= 32) return (in_len);
2394
2395 u32 out_len = in_len;
2396
2397 u32x tib40[4];
2398 u32x tib41[4];
2399
2400 rshift_block (buf0, buf1, tib40, tib41);
2401
2402 const u32 p1n = p1 << ((p0 & 3) * 8);
2403
2404 const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
2405
2406 const u32 mr = 0xffffff00 << ((p0 & 3) * 8);
2407
2408 switch (p0 / 4)
2409 {
2410 case 0: buf0[0] = (buf0[0] & ml) | p1n | (tib40[0] & mr);
2411 buf0[1] = tib40[1];
2412 buf0[2] = tib40[2];
2413 buf0[3] = tib40[3];
2414 buf1[0] = tib41[0];
2415 buf1[1] = tib41[1];
2416 buf1[2] = tib41[2];
2417 buf1[3] = tib41[3];
2418 break;
2419 case 1: buf0[1] = (buf0[1] & ml) | p1n | (tib40[1] & mr);
2420 buf0[2] = tib40[2];
2421 buf0[3] = tib40[3];
2422 buf1[0] = tib41[0];
2423 buf1[1] = tib41[1];
2424 buf1[2] = tib41[2];
2425 buf1[3] = tib41[3];
2426 break;
2427 case 2: buf0[2] = (buf0[2] & ml) | p1n | (tib40[2] & mr);
2428 buf0[3] = tib40[3];
2429 buf1[0] = tib41[0];
2430 buf1[1] = tib41[1];
2431 buf1[2] = tib41[2];
2432 buf1[3] = tib41[3];
2433 break;
2434 case 3: buf0[3] = (buf0[3] & ml) | p1n | (tib40[3] & mr);
2435 buf1[0] = tib41[0];
2436 buf1[1] = tib41[1];
2437 buf1[2] = tib41[2];
2438 buf1[3] = tib41[3];
2439 break;
2440 case 4: buf1[0] = (buf1[0] & ml) | p1n | (tib41[0] & mr);
2441 buf1[1] = tib41[1];
2442 buf1[2] = tib41[2];
2443 buf1[3] = tib41[3];
2444 break;
2445 case 5: buf1[1] = (buf1[1] & ml) | p1n | (tib41[1] & mr);
2446 buf1[2] = tib41[2];
2447 buf1[3] = tib41[3];
2448 break;
2449 case 6: buf1[2] = (buf1[2] & ml) | p1n | (tib41[2] & mr);
2450 buf1[3] = tib41[3];
2451 break;
2452 case 7: buf1[3] = (buf1[3] & ml) | p1n | (tib41[3] & mr);
2453 break;
2454 }
2455
2456 out_len++;
2457
2458 return out_len;
2459 }
2460
2461 __device__ static u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2462 {
2463 if (p0 >= in_len) return (in_len);
2464
2465 const u32 p1n = p1 << ((p0 & 3) * 8);
2466
2467 const u32 m = ~(0xff << ((p0 & 3) * 8));
2468
2469 switch (p0 / 4)
2470 {
2471 case 0: buf0[0] = (buf0[0] & m) | p1n; break;
2472 case 1: buf0[1] = (buf0[1] & m) | p1n; break;
2473 case 2: buf0[2] = (buf0[2] & m) | p1n; break;
2474 case 3: buf0[3] = (buf0[3] & m) | p1n; break;
2475 case 4: buf1[0] = (buf1[0] & m) | p1n; break;
2476 case 5: buf1[1] = (buf1[1] & m) | p1n; break;
2477 case 6: buf1[2] = (buf1[2] & m) | p1n; break;
2478 case 7: buf1[3] = (buf1[3] & m) | p1n; break;
2479 }
2480
2481 return in_len;
2482 }
2483
2484 __device__ static u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2485 {
2486 if (p0 >= in_len) return (in_len);
2487
2488 truncate_right (buf0, buf1, p0);
2489
2490 return p0;
2491 }
2492
2493 __device__ static u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2494 {
2495
2496
2497 #ifdef VECT_SIZE1
2498
2499 for (u32 i = 0; i < in_len; i++)
2500 {
2501 switch (i)
2502 {
2503 case 0: if ((__byte_perm (buf0[0], 0, 0x6540)) == p0) buf0[0] = __byte_perm (p1, buf0[0], 0x7650);
2504 break;
2505 case 1: if ((__byte_perm (buf0[0], 0, 0x6541)) == p0) buf0[0] = __byte_perm (p1, buf0[0], 0x7604);
2506 break;
2507 case 2: if ((__byte_perm (buf0[0], 0, 0x6542)) == p0) buf0[0] = __byte_perm (p1, buf0[0], 0x7054);
2508 break;
2509 case 3: if ((__byte_perm (buf0[0], 0, 0x6543)) == p0) buf0[0] = __byte_perm (p1, buf0[0], 0x0654);
2510 break;
2511 case 4: if ((__byte_perm (buf0[1], 0, 0x6540)) == p0) buf0[1] = __byte_perm (p1, buf0[1], 0x7650);
2512 break;
2513 case 5: if ((__byte_perm (buf0[1], 0, 0x6541)) == p0) buf0[1] = __byte_perm (p1, buf0[1], 0x7604);
2514 break;
2515 case 6: if ((__byte_perm (buf0[1], 0, 0x6542)) == p0) buf0[1] = __byte_perm (p1, buf0[1], 0x7054);
2516 break;
2517 case 7: if ((__byte_perm (buf0[1], 0, 0x6543)) == p0) buf0[1] = __byte_perm (p1, buf0[1], 0x0654);
2518 break;
2519 case 8: if ((__byte_perm (buf0[2], 0, 0x6540)) == p0) buf0[2] = __byte_perm (p1, buf0[2], 0x7650);
2520 break;
2521 case 9: if ((__byte_perm (buf0[2], 0, 0x6541)) == p0) buf0[2] = __byte_perm (p1, buf0[2], 0x7604);
2522 break;
2523 case 10: if ((__byte_perm (buf0[2], 0, 0x6542)) == p0) buf0[2] = __byte_perm (p1, buf0[2], 0x7054);
2524 break;
2525 case 11: if ((__byte_perm (buf0[2], 0, 0x6543)) == p0) buf0[2] = __byte_perm (p1, buf0[2], 0x0654);
2526 break;
2527 case 12: if ((__byte_perm (buf0[3], 0, 0x6540)) == p0) buf0[3] = __byte_perm (p1, buf0[3], 0x7650);
2528 break;
2529 case 13: if ((__byte_perm (buf0[3], 0, 0x6541)) == p0) buf0[3] = __byte_perm (p1, buf0[3], 0x7604);
2530 break;
2531 case 14: if ((__byte_perm (buf0[3], 0, 0x6542)) == p0) buf0[3] = __byte_perm (p1, buf0[3], 0x7054);
2532 break;
2533 case 15: if ((__byte_perm (buf0[3], 0, 0x6543)) == p0) buf0[3] = __byte_perm (p1, buf0[3], 0x0654);
2534 break;
2535 case 16: if ((__byte_perm (buf1[0], 0, 0x6540)) == p0) buf1[0] = __byte_perm (p1, buf1[0], 0x7650);
2536 break;
2537 case 17: if ((__byte_perm (buf1[0], 0, 0x6541)) == p0) buf1[0] = __byte_perm (p1, buf1[0], 0x7604);
2538 break;
2539 case 18: if ((__byte_perm (buf1[0], 0, 0x6542)) == p0) buf1[0] = __byte_perm (p1, buf1[0], 0x7054);
2540 break;
2541 case 19: if ((__byte_perm (buf1[0], 0, 0x6543)) == p0) buf1[0] = __byte_perm (p1, buf1[0], 0x0654);
2542 break;
2543 case 20: if ((__byte_perm (buf1[1], 0, 0x6540)) == p0) buf1[1] = __byte_perm (p1, buf1[1], 0x7650);
2544 break;
2545 case 21: if ((__byte_perm (buf1[1], 0, 0x6541)) == p0) buf1[1] = __byte_perm (p1, buf1[1], 0x7604);
2546 break;
2547 case 22: if ((__byte_perm (buf1[1], 0, 0x6542)) == p0) buf1[1] = __byte_perm (p1, buf1[1], 0x7054);
2548 break;
2549 case 23: if ((__byte_perm (buf1[1], 0, 0x6543)) == p0) buf1[1] = __byte_perm (p1, buf1[1], 0x0654);
2550 break;
2551 case 24: if ((__byte_perm (buf1[2], 0, 0x6540)) == p0) buf1[2] = __byte_perm (p1, buf1[2], 0x7650);
2552 break;
2553 case 25: if ((__byte_perm (buf1[2], 0, 0x6541)) == p0) buf1[2] = __byte_perm (p1, buf1[2], 0x7604);
2554 break;
2555 case 26: if ((__byte_perm (buf1[2], 0, 0x6542)) == p0) buf1[2] = __byte_perm (p1, buf1[2], 0x7054);
2556 break;
2557 case 27: if ((__byte_perm (buf1[2], 0, 0x6543)) == p0) buf1[2] = __byte_perm (p1, buf1[2], 0x0654);
2558 break;
2559 case 28: if ((__byte_perm (buf1[3], 0, 0x6540)) == p0) buf1[3] = __byte_perm (p1, buf1[3], 0x7650);
2560 break;
2561 case 29: if ((__byte_perm (buf1[3], 0, 0x6541)) == p0) buf1[3] = __byte_perm (p1, buf1[3], 0x7604);
2562 break;
2563 case 30: if ((__byte_perm (buf1[3], 0, 0x6542)) == p0) buf1[3] = __byte_perm (p1, buf1[3], 0x7054);
2564 break;
2565 case 31: if ((__byte_perm (buf1[3], 0, 0x6543)) == p0) buf1[3] = __byte_perm (p1, buf1[3], 0x0654);
2566 break;
2567 }
2568 }
2569
2570 #endif
2571
2572 #ifdef VECT_SIZE2
2573
2574 for (u32 i = 0; i < in_len; i++)
2575 {
2576 switch (i)
2577 {
2578 case 0: if ((__byte_perm (buf0[0].x, 0, 0x6540)) == p0) buf0[0].x = __byte_perm (p1, buf0[0].x, 0x7650);
2579 if ((__byte_perm (buf0[0].y, 0, 0x6540)) == p0) buf0[0].y = __byte_perm (p1, buf0[0].y, 0x7650);
2580 break;
2581 case 1: if ((__byte_perm (buf0[0].x, 0, 0x6541)) == p0) buf0[0].x = __byte_perm (p1, buf0[0].x, 0x7604);
2582 if ((__byte_perm (buf0[0].y, 0, 0x6541)) == p0) buf0[0].y = __byte_perm (p1, buf0[0].y, 0x7604);
2583 break;
2584 case 2: if ((__byte_perm (buf0[0].x, 0, 0x6542)) == p0) buf0[0].x = __byte_perm (p1, buf0[0].x, 0x7054);
2585 if ((__byte_perm (buf0[0].y, 0, 0x6542)) == p0) buf0[0].y = __byte_perm (p1, buf0[0].y, 0x7054);
2586 break;
2587 case 3: if ((__byte_perm (buf0[0].x, 0, 0x6543)) == p0) buf0[0].x = __byte_perm (p1, buf0[0].x, 0x0654);
2588 if ((__byte_perm (buf0[0].y, 0, 0x6543)) == p0) buf0[0].y = __byte_perm (p1, buf0[0].y, 0x0654);
2589 break;
2590 case 4: if ((__byte_perm (buf0[1].x, 0, 0x6540)) == p0) buf0[1].x = __byte_perm (p1, buf0[1].x, 0x7650);
2591 if ((__byte_perm (buf0[1].y, 0, 0x6540)) == p0) buf0[1].y = __byte_perm (p1, buf0[1].y, 0x7650);
2592 break;
2593 case 5: if ((__byte_perm (buf0[1].x, 0, 0x6541)) == p0) buf0[1].x = __byte_perm (p1, buf0[1].x, 0x7604);
2594 if ((__byte_perm (buf0[1].y, 0, 0x6541)) == p0) buf0[1].y = __byte_perm (p1, buf0[1].y, 0x7604);
2595 break;
2596 case 6: if ((__byte_perm (buf0[1].x, 0, 0x6542)) == p0) buf0[1].x = __byte_perm (p1, buf0[1].x, 0x7054);
2597 if ((__byte_perm (buf0[1].y, 0, 0x6542)) == p0) buf0[1].y = __byte_perm (p1, buf0[1].y, 0x7054);
2598 break;
2599 case 7: if ((__byte_perm (buf0[1].x, 0, 0x6543)) == p0) buf0[1].x = __byte_perm (p1, buf0[1].x, 0x0654);
2600 if ((__byte_perm (buf0[1].y, 0, 0x6543)) == p0) buf0[1].y = __byte_perm (p1, buf0[1].y, 0x0654);
2601 break;
2602 case 8: if ((__byte_perm (buf0[2].x, 0, 0x6540)) == p0) buf0[2].x = __byte_perm (p1, buf0[2].x, 0x7650);
2603 if ((__byte_perm (buf0[2].y, 0, 0x6540)) == p0) buf0[2].y = __byte_perm (p1, buf0[2].y, 0x7650);
2604 break;
2605 case 9: if ((__byte_perm (buf0[2].x, 0, 0x6541)) == p0) buf0[2].x = __byte_perm (p1, buf0[2].x, 0x7604);
2606 if ((__byte_perm (buf0[2].y, 0, 0x6541)) == p0) buf0[2].y = __byte_perm (p1, buf0[2].y, 0x7604);
2607 break;
2608 case 10: if ((__byte_perm (buf0[2].x, 0, 0x6542)) == p0) buf0[2].x = __byte_perm (p1, buf0[2].x, 0x7054);
2609 if ((__byte_perm (buf0[2].y, 0, 0x6542)) == p0) buf0[2].y = __byte_perm (p1, buf0[2].y, 0x7054);
2610 break;
2611 case 11: if ((__byte_perm (buf0[2].x, 0, 0x6543)) == p0) buf0[2].x = __byte_perm (p1, buf0[2].x, 0x0654);
2612 if ((__byte_perm (buf0[2].y, 0, 0x6543)) == p0) buf0[2].y = __byte_perm (p1, buf0[2].y, 0x0654);
2613 break;
2614 case 12: if ((__byte_perm (buf0[3].x, 0, 0x6540)) == p0) buf0[3].x = __byte_perm (p1, buf0[3].x, 0x7650);
2615 if ((__byte_perm (buf0[3].y, 0, 0x6540)) == p0) buf0[3].y = __byte_perm (p1, buf0[3].y, 0x7650);
2616 break;
2617 case 13: if ((__byte_perm (buf0[3].x, 0, 0x6541)) == p0) buf0[3].x = __byte_perm (p1, buf0[3].x, 0x7604);
2618 if ((__byte_perm (buf0[3].y, 0, 0x6541)) == p0) buf0[3].y = __byte_perm (p1, buf0[3].y, 0x7604);
2619 break;
2620 case 14: if ((__byte_perm (buf0[3].x, 0, 0x6542)) == p0) buf0[3].x = __byte_perm (p1, buf0[3].x, 0x7054);
2621 if ((__byte_perm (buf0[3].y, 0, 0x6542)) == p0) buf0[3].y = __byte_perm (p1, buf0[3].y, 0x7054);
2622 break;
2623 case 15: if ((__byte_perm (buf0[3].x, 0, 0x6543)) == p0) buf0[3].x = __byte_perm (p1, buf0[3].x, 0x0654);
2624 if ((__byte_perm (buf0[3].y, 0, 0x6543)) == p0) buf0[3].y = __byte_perm (p1, buf0[3].y, 0x0654);
2625 break;
2626 case 16: if ((__byte_perm (buf1[0].x, 0, 0x6540)) == p0) buf1[0].x = __byte_perm (p1, buf1[0].x, 0x7650);
2627 if ((__byte_perm (buf1[0].y, 0, 0x6540)) == p0) buf1[0].y = __byte_perm (p1, buf1[0].y, 0x7650);
2628 break;
2629 case 17: if ((__byte_perm (buf1[0].x, 0, 0x6541)) == p0) buf1[0].x = __byte_perm (p1, buf1[0].x, 0x7604);
2630 if ((__byte_perm (buf1[0].y, 0, 0x6541)) == p0) buf1[0].y = __byte_perm (p1, buf1[0].y, 0x7604);
2631 break;
2632 case 18: if ((__byte_perm (buf1[0].x, 0, 0x6542)) == p0) buf1[0].x = __byte_perm (p1, buf1[0].x, 0x7054);
2633 if ((__byte_perm (buf1[0].y, 0, 0x6542)) == p0) buf1[0].y = __byte_perm (p1, buf1[0].y, 0x7054);
2634 break;
2635 case 19: if ((__byte_perm (buf1[0].x, 0, 0x6543)) == p0) buf1[0].x = __byte_perm (p1, buf1[0].x, 0x0654);
2636 if ((__byte_perm (buf1[0].y, 0, 0x6543)) == p0) buf1[0].y = __byte_perm (p1, buf1[0].y, 0x0654);
2637 break;
2638 case 20: if ((__byte_perm (buf1[1].x, 0, 0x6540)) == p0) buf1[1].x = __byte_perm (p1, buf1[1].x, 0x7650);
2639 if ((__byte_perm (buf1[1].y, 0, 0x6540)) == p0) buf1[1].y = __byte_perm (p1, buf1[1].y, 0x7650);
2640 break;
2641 case 21: if ((__byte_perm (buf1[1].x, 0, 0x6541)) == p0) buf1[1].x = __byte_perm (p1, buf1[1].x, 0x7604);
2642 if ((__byte_perm (buf1[1].y, 0, 0x6541)) == p0) buf1[1].y = __byte_perm (p1, buf1[1].y, 0x7604);
2643 break;
2644 case 22: if ((__byte_perm (buf1[1].x, 0, 0x6542)) == p0) buf1[1].x = __byte_perm (p1, buf1[1].x, 0x7054);
2645 if ((__byte_perm (buf1[1].y, 0, 0x6542)) == p0) buf1[1].y = __byte_perm (p1, buf1[1].y, 0x7054);
2646 break;
2647 case 23: if ((__byte_perm (buf1[1].x, 0, 0x6543)) == p0) buf1[1].x = __byte_perm (p1, buf1[1].x, 0x0654);
2648 if ((__byte_perm (buf1[1].y, 0, 0x6543)) == p0) buf1[1].y = __byte_perm (p1, buf1[1].y, 0x0654);
2649 break;
2650 case 24: if ((__byte_perm (buf1[2].x, 0, 0x6540)) == p0) buf1[2].x = __byte_perm (p1, buf1[2].x, 0x7650);
2651 if ((__byte_perm (buf1[2].y, 0, 0x6540)) == p0) buf1[2].y = __byte_perm (p1, buf1[2].y, 0x7650);
2652 break;
2653 case 25: if ((__byte_perm (buf1[2].x, 0, 0x6541)) == p0) buf1[2].x = __byte_perm (p1, buf1[2].x, 0x7604);
2654 if ((__byte_perm (buf1[2].y, 0, 0x6541)) == p0) buf1[2].y = __byte_perm (p1, buf1[2].y, 0x7604);
2655 break;
2656 case 26: if ((__byte_perm (buf1[2].x, 0, 0x6542)) == p0) buf1[2].x = __byte_perm (p1, buf1[2].x, 0x7054);
2657 if ((__byte_perm (buf1[2].y, 0, 0x6542)) == p0) buf1[2].y = __byte_perm (p1, buf1[2].y, 0x7054);
2658 break;
2659 case 27: if ((__byte_perm (buf1[2].x, 0, 0x6543)) == p0) buf1[2].x = __byte_perm (p1, buf1[2].x, 0x0654);
2660 if ((__byte_perm (buf1[2].y, 0, 0x6543)) == p0) buf1[2].y = __byte_perm (p1, buf1[2].y, 0x0654);
2661 break;
2662 case 28: if ((__byte_perm (buf1[3].x, 0, 0x6540)) == p0) buf1[3].x = __byte_perm (p1, buf1[3].x, 0x7650);
2663 if ((__byte_perm (buf1[3].y, 0, 0x6540)) == p0) buf1[3].y = __byte_perm (p1, buf1[3].y, 0x7650);
2664 break;
2665 case 29: if ((__byte_perm (buf1[3].x, 0, 0x6541)) == p0) buf1[3].x = __byte_perm (p1, buf1[3].x, 0x7604);
2666 if ((__byte_perm (buf1[3].y, 0, 0x6541)) == p0) buf1[3].y = __byte_perm (p1, buf1[3].y, 0x7604);
2667 break;
2668 case 30: if ((__byte_perm (buf1[3].x, 0, 0x6542)) == p0) buf1[3].x = __byte_perm (p1, buf1[3].x, 0x7054);
2669 if ((__byte_perm (buf1[3].y, 0, 0x6542)) == p0) buf1[3].y = __byte_perm (p1, buf1[3].y, 0x7054);
2670 break;
2671 case 31: if ((__byte_perm (buf1[3].x, 0, 0x6543)) == p0) buf1[3].x = __byte_perm (p1, buf1[3].x, 0x0654);
2672 if ((__byte_perm (buf1[3].y, 0, 0x6543)) == p0) buf1[3].y = __byte_perm (p1, buf1[3].y, 0x0654);
2673 break;
2674 }
2675 }
2676
2677 #endif
2678
2679 #ifdef VECT_SIZE4
2680
2681 for (u32 i = 0; i < in_len; i++)
2682 {
2683 switch (i)
2684 {
2685 case 0: if ((__byte_perm (buf0[0].x, 0, 0x6540)) == p0) buf0[0].x = __byte_perm (p1, buf0[0].x, 0x7650);
2686 if ((__byte_perm (buf0[0].y, 0, 0x6540)) == p0) buf0[0].y = __byte_perm (p1, buf0[0].y, 0x7650);
2687 if ((__byte_perm (buf0[0].z, 0, 0x6540)) == p0) buf0[0].z = __byte_perm (p1, buf0[0].z, 0x7650);
2688 if ((__byte_perm (buf0[0].w, 0, 0x6540)) == p0) buf0[0].w = __byte_perm (p1, buf0[0].w, 0x7650);
2689 break;
2690 case 1: if ((__byte_perm (buf0[0].x, 0, 0x6541)) == p0) buf0[0].x = __byte_perm (p1, buf0[0].x, 0x7604);
2691 if ((__byte_perm (buf0[0].y, 0, 0x6541)) == p0) buf0[0].y = __byte_perm (p1, buf0[0].y, 0x7604);
2692 if ((__byte_perm (buf0[0].z, 0, 0x6541)) == p0) buf0[0].z = __byte_perm (p1, buf0[0].z, 0x7604);
2693 if ((__byte_perm (buf0[0].w, 0, 0x6541)) == p0) buf0[0].w = __byte_perm (p1, buf0[0].w, 0x7604);
2694 break;
2695 case 2: if ((__byte_perm (buf0[0].x, 0, 0x6542)) == p0) buf0[0].x = __byte_perm (p1, buf0[0].x, 0x7054);
2696 if ((__byte_perm (buf0[0].y, 0, 0x6542)) == p0) buf0[0].y = __byte_perm (p1, buf0[0].y, 0x7054);
2697 if ((__byte_perm (buf0[0].z, 0, 0x6542)) == p0) buf0[0].z = __byte_perm (p1, buf0[0].z, 0x7054);
2698 if ((__byte_perm (buf0[0].w, 0, 0x6542)) == p0) buf0[0].w = __byte_perm (p1, buf0[0].w, 0x7054);
2699 break;
2700 case 3: if ((__byte_perm (buf0[0].x, 0, 0x6543)) == p0) buf0[0].x = __byte_perm (p1, buf0[0].x, 0x0654);
2701 if ((__byte_perm (buf0[0].y, 0, 0x6543)) == p0) buf0[0].y = __byte_perm (p1, buf0[0].y, 0x0654);
2702 if ((__byte_perm (buf0[0].z, 0, 0x6543)) == p0) buf0[0].z = __byte_perm (p1, buf0[0].z, 0x0654);
2703 if ((__byte_perm (buf0[0].w, 0, 0x6543)) == p0) buf0[0].w = __byte_perm (p1, buf0[0].w, 0x0654);
2704 break;
2705 case 4: if ((__byte_perm (buf0[1].x, 0, 0x6540)) == p0) buf0[1].x = __byte_perm (p1, buf0[1].x, 0x7650);
2706 if ((__byte_perm (buf0[1].y, 0, 0x6540)) == p0) buf0[1].y = __byte_perm (p1, buf0[1].y, 0x7650);
2707 if ((__byte_perm (buf0[1].z, 0, 0x6540)) == p0) buf0[1].z = __byte_perm (p1, buf0[1].z, 0x7650);
2708 if ((__byte_perm (buf0[1].w, 0, 0x6540)) == p0) buf0[1].w = __byte_perm (p1, buf0[1].w, 0x7650);
2709 break;
2710 case 5: if ((__byte_perm (buf0[1].x, 0, 0x6541)) == p0) buf0[1].x = __byte_perm (p1, buf0[1].x, 0x7604);
2711 if ((__byte_perm (buf0[1].y, 0, 0x6541)) == p0) buf0[1].y = __byte_perm (p1, buf0[1].y, 0x7604);
2712 if ((__byte_perm (buf0[1].z, 0, 0x6541)) == p0) buf0[1].z = __byte_perm (p1, buf0[1].z, 0x7604);
2713 if ((__byte_perm (buf0[1].w, 0, 0x6541)) == p0) buf0[1].w = __byte_perm (p1, buf0[1].w, 0x7604);
2714 break;
2715 case 6: if ((__byte_perm (buf0[1].x, 0, 0x6542)) == p0) buf0[1].x = __byte_perm (p1, buf0[1].x, 0x7054);
2716 if ((__byte_perm (buf0[1].y, 0, 0x6542)) == p0) buf0[1].y = __byte_perm (p1, buf0[1].y, 0x7054);
2717 if ((__byte_perm (buf0[1].z, 0, 0x6542)) == p0) buf0[1].z = __byte_perm (p1, buf0[1].z, 0x7054);
2718 if ((__byte_perm (buf0[1].w, 0, 0x6542)) == p0) buf0[1].w = __byte_perm (p1, buf0[1].w, 0x7054);
2719 break;
2720 case 7: if ((__byte_perm (buf0[1].x, 0, 0x6543)) == p0) buf0[1].x = __byte_perm (p1, buf0[1].x, 0x0654);
2721 if ((__byte_perm (buf0[1].y, 0, 0x6543)) == p0) buf0[1].y = __byte_perm (p1, buf0[1].y, 0x0654);
2722 if ((__byte_perm (buf0[1].z, 0, 0x6543)) == p0) buf0[1].z = __byte_perm (p1, buf0[1].z, 0x0654);
2723 if ((__byte_perm (buf0[1].w, 0, 0x6543)) == p0) buf0[1].w = __byte_perm (p1, buf0[1].w, 0x0654);
2724 break;
2725 case 8: if ((__byte_perm (buf0[2].x, 0, 0x6540)) == p0) buf0[2].x = __byte_perm (p1, buf0[2].x, 0x7650);
2726 if ((__byte_perm (buf0[2].y, 0, 0x6540)) == p0) buf0[2].y = __byte_perm (p1, buf0[2].y, 0x7650);
2727 if ((__byte_perm (buf0[2].z, 0, 0x6540)) == p0) buf0[2].z = __byte_perm (p1, buf0[2].z, 0x7650);
2728 if ((__byte_perm (buf0[2].w, 0, 0x6540)) == p0) buf0[2].w = __byte_perm (p1, buf0[2].w, 0x7650);
2729 break;
2730 case 9: if ((__byte_perm (buf0[2].x, 0, 0x6541)) == p0) buf0[2].x = __byte_perm (p1, buf0[2].x, 0x7604);
2731 if ((__byte_perm (buf0[2].y, 0, 0x6541)) == p0) buf0[2].y = __byte_perm (p1, buf0[2].y, 0x7604);
2732 if ((__byte_perm (buf0[2].z, 0, 0x6541)) == p0) buf0[2].z = __byte_perm (p1, buf0[2].z, 0x7604);
2733 if ((__byte_perm (buf0[2].w, 0, 0x6541)) == p0) buf0[2].w = __byte_perm (p1, buf0[2].w, 0x7604);
2734 break;
2735 case 10: if ((__byte_perm (buf0[2].x, 0, 0x6542)) == p0) buf0[2].x = __byte_perm (p1, buf0[2].x, 0x7054);
2736 if ((__byte_perm (buf0[2].y, 0, 0x6542)) == p0) buf0[2].y = __byte_perm (p1, buf0[2].y, 0x7054);
2737 if ((__byte_perm (buf0[2].z, 0, 0x6542)) == p0) buf0[2].z = __byte_perm (p1, buf0[2].z, 0x7054);
2738 if ((__byte_perm (buf0[2].w, 0, 0x6542)) == p0) buf0[2].w = __byte_perm (p1, buf0[2].w, 0x7054);
2739 break;
2740 case 11: if ((__byte_perm (buf0[2].x, 0, 0x6543)) == p0) buf0[2].x = __byte_perm (p1, buf0[2].x, 0x0654);
2741 if ((__byte_perm (buf0[2].y, 0, 0x6543)) == p0) buf0[2].y = __byte_perm (p1, buf0[2].y, 0x0654);
2742 if ((__byte_perm (buf0[2].z, 0, 0x6543)) == p0) buf0[2].z = __byte_perm (p1, buf0[2].z, 0x0654);
2743 if ((__byte_perm (buf0[2].w, 0, 0x6543)) == p0) buf0[2].w = __byte_perm (p1, buf0[2].w, 0x0654);
2744 break;
2745 case 12: if ((__byte_perm (buf0[3].x, 0, 0x6540)) == p0) buf0[3].x = __byte_perm (p1, buf0[3].x, 0x7650);
2746 if ((__byte_perm (buf0[3].y, 0, 0x6540)) == p0) buf0[3].y = __byte_perm (p1, buf0[3].y, 0x7650);
2747 if ((__byte_perm (buf0[3].z, 0, 0x6540)) == p0) buf0[3].z = __byte_perm (p1, buf0[3].z, 0x7650);
2748 if ((__byte_perm (buf0[3].w, 0, 0x6540)) == p0) buf0[3].w = __byte_perm (p1, buf0[3].w, 0x7650);
2749 break;
2750 case 13: if ((__byte_perm (buf0[3].x, 0, 0x6541)) == p0) buf0[3].x = __byte_perm (p1, buf0[3].x, 0x7604);
2751 if ((__byte_perm (buf0[3].y, 0, 0x6541)) == p0) buf0[3].y = __byte_perm (p1, buf0[3].y, 0x7604);
2752 if ((__byte_perm (buf0[3].z, 0, 0x6541)) == p0) buf0[3].z = __byte_perm (p1, buf0[3].z, 0x7604);
2753 if ((__byte_perm (buf0[3].w, 0, 0x6541)) == p0) buf0[3].w = __byte_perm (p1, buf0[3].w, 0x7604);
2754 break;
2755 case 14: if ((__byte_perm (buf0[3].x, 0, 0x6542)) == p0) buf0[3].x = __byte_perm (p1, buf0[3].x, 0x7054);
2756 if ((__byte_perm (buf0[3].y, 0, 0x6542)) == p0) buf0[3].y = __byte_perm (p1, buf0[3].y, 0x7054);
2757 if ((__byte_perm (buf0[3].z, 0, 0x6542)) == p0) buf0[3].z = __byte_perm (p1, buf0[3].z, 0x7054);
2758 if ((__byte_perm (buf0[3].w, 0, 0x6542)) == p0) buf0[3].w = __byte_perm (p1, buf0[3].w, 0x7054);
2759 break;
2760 case 15: if ((__byte_perm (buf0[3].x, 0, 0x6543)) == p0) buf0[3].x = __byte_perm (p1, buf0[3].x, 0x0654);
2761 if ((__byte_perm (buf0[3].y, 0, 0x6543)) == p0) buf0[3].y = __byte_perm (p1, buf0[3].y, 0x0654);
2762 if ((__byte_perm (buf0[3].z, 0, 0x6543)) == p0) buf0[3].z = __byte_perm (p1, buf0[3].z, 0x0654);
2763 if ((__byte_perm (buf0[3].w, 0, 0x6543)) == p0) buf0[3].w = __byte_perm (p1, buf0[3].w, 0x0654);
2764 break;
2765 case 16: if ((__byte_perm (buf1[0].x, 0, 0x6540)) == p0) buf1[0].x = __byte_perm (p1, buf1[0].x, 0x7650);
2766 if ((__byte_perm (buf1[0].y, 0, 0x6540)) == p0) buf1[0].y = __byte_perm (p1, buf1[0].y, 0x7650);
2767 if ((__byte_perm (buf1[0].z, 0, 0x6540)) == p0) buf1[0].z = __byte_perm (p1, buf1[0].z, 0x7650);
2768 if ((__byte_perm (buf1[0].w, 0, 0x6540)) == p0) buf1[0].w = __byte_perm (p1, buf1[0].w, 0x7650);
2769 break;
2770 case 17: if ((__byte_perm (buf1[0].x, 0, 0x6541)) == p0) buf1[0].x = __byte_perm (p1, buf1[0].x, 0x7604);
2771 if ((__byte_perm (buf1[0].y, 0, 0x6541)) == p0) buf1[0].y = __byte_perm (p1, buf1[0].y, 0x7604);
2772 if ((__byte_perm (buf1[0].z, 0, 0x6541)) == p0) buf1[0].z = __byte_perm (p1, buf1[0].z, 0x7604);
2773 if ((__byte_perm (buf1[0].w, 0, 0x6541)) == p0) buf1[0].w = __byte_perm (p1, buf1[0].w, 0x7604);
2774 break;
2775 case 18: if ((__byte_perm (buf1[0].x, 0, 0x6542)) == p0) buf1[0].x = __byte_perm (p1, buf1[0].x, 0x7054);
2776 if ((__byte_perm (buf1[0].y, 0, 0x6542)) == p0) buf1[0].y = __byte_perm (p1, buf1[0].y, 0x7054);
2777 if ((__byte_perm (buf1[0].z, 0, 0x6542)) == p0) buf1[0].z = __byte_perm (p1, buf1[0].z, 0x7054);
2778 if ((__byte_perm (buf1[0].w, 0, 0x6542)) == p0) buf1[0].w = __byte_perm (p1, buf1[0].w, 0x7054);
2779 break;
2780 case 19: if ((__byte_perm (buf1[0].x, 0, 0x6543)) == p0) buf1[0].x = __byte_perm (p1, buf1[0].x, 0x0654);
2781 if ((__byte_perm (buf1[0].y, 0, 0x6543)) == p0) buf1[0].y = __byte_perm (p1, buf1[0].y, 0x0654);
2782 if ((__byte_perm (buf1[0].z, 0, 0x6543)) == p0) buf1[0].z = __byte_perm (p1, buf1[0].z, 0x0654);
2783 if ((__byte_perm (buf1[0].w, 0, 0x6543)) == p0) buf1[0].w = __byte_perm (p1, buf1[0].w, 0x0654);
2784 break;
2785 case 20: if ((__byte_perm (buf1[1].x, 0, 0x6540)) == p0) buf1[1].x = __byte_perm (p1, buf1[1].x, 0x7650);
2786 if ((__byte_perm (buf1[1].y, 0, 0x6540)) == p0) buf1[1].y = __byte_perm (p1, buf1[1].y, 0x7650);
2787 if ((__byte_perm (buf1[1].z, 0, 0x6540)) == p0) buf1[1].z = __byte_perm (p1, buf1[1].z, 0x7650);
2788 if ((__byte_perm (buf1[1].w, 0, 0x6540)) == p0) buf1[1].w = __byte_perm (p1, buf1[1].w, 0x7650);
2789 break;
2790 case 21: if ((__byte_perm (buf1[1].x, 0, 0x6541)) == p0) buf1[1].x = __byte_perm (p1, buf1[1].x, 0x7604);
2791 if ((__byte_perm (buf1[1].y, 0, 0x6541)) == p0) buf1[1].y = __byte_perm (p1, buf1[1].y, 0x7604);
2792 if ((__byte_perm (buf1[1].z, 0, 0x6541)) == p0) buf1[1].z = __byte_perm (p1, buf1[1].z, 0x7604);
2793 if ((__byte_perm (buf1[1].w, 0, 0x6541)) == p0) buf1[1].w = __byte_perm (p1, buf1[1].w, 0x7604);
2794 break;
2795 case 22: if ((__byte_perm (buf1[1].x, 0, 0x6542)) == p0) buf1[1].x = __byte_perm (p1, buf1[1].x, 0x7054);
2796 if ((__byte_perm (buf1[1].y, 0, 0x6542)) == p0) buf1[1].y = __byte_perm (p1, buf1[1].y, 0x7054);
2797 if ((__byte_perm (buf1[1].z, 0, 0x6542)) == p0) buf1[1].z = __byte_perm (p1, buf1[1].z, 0x7054);
2798 if ((__byte_perm (buf1[1].w, 0, 0x6542)) == p0) buf1[1].w = __byte_perm (p1, buf1[1].w, 0x7054);
2799 break;
2800 case 23: if ((__byte_perm (buf1[1].x, 0, 0x6543)) == p0) buf1[1].x = __byte_perm (p1, buf1[1].x, 0x0654);
2801 if ((__byte_perm (buf1[1].y, 0, 0x6543)) == p0) buf1[1].y = __byte_perm (p1, buf1[1].y, 0x0654);
2802 if ((__byte_perm (buf1[1].z, 0, 0x6543)) == p0) buf1[1].z = __byte_perm (p1, buf1[1].z, 0x0654);
2803 if ((__byte_perm (buf1[1].w, 0, 0x6543)) == p0) buf1[1].w = __byte_perm (p1, buf1[1].w, 0x0654);
2804 break;
2805 case 24: if ((__byte_perm (buf1[2].x, 0, 0x6540)) == p0) buf1[2].x = __byte_perm (p1, buf1[2].x, 0x7650);
2806 if ((__byte_perm (buf1[2].y, 0, 0x6540)) == p0) buf1[2].y = __byte_perm (p1, buf1[2].y, 0x7650);
2807 if ((__byte_perm (buf1[2].z, 0, 0x6540)) == p0) buf1[2].z = __byte_perm (p1, buf1[2].z, 0x7650);
2808 if ((__byte_perm (buf1[2].w, 0, 0x6540)) == p0) buf1[2].w = __byte_perm (p1, buf1[2].w, 0x7650);
2809 break;
2810 case 25: if ((__byte_perm (buf1[2].x, 0, 0x6541)) == p0) buf1[2].x = __byte_perm (p1, buf1[2].x, 0x7604);
2811 if ((__byte_perm (buf1[2].y, 0, 0x6541)) == p0) buf1[2].y = __byte_perm (p1, buf1[2].y, 0x7604);
2812 if ((__byte_perm (buf1[2].z, 0, 0x6541)) == p0) buf1[2].z = __byte_perm (p1, buf1[2].z, 0x7604);
2813 if ((__byte_perm (buf1[2].w, 0, 0x6541)) == p0) buf1[2].w = __byte_perm (p1, buf1[2].w, 0x7604);
2814 break;
2815 case 26: if ((__byte_perm (buf1[2].x, 0, 0x6542)) == p0) buf1[2].x = __byte_perm (p1, buf1[2].x, 0x7054);
2816 if ((__byte_perm (buf1[2].y, 0, 0x6542)) == p0) buf1[2].y = __byte_perm (p1, buf1[2].y, 0x7054);
2817 if ((__byte_perm (buf1[2].z, 0, 0x6542)) == p0) buf1[2].z = __byte_perm (p1, buf1[2].z, 0x7054);
2818 if ((__byte_perm (buf1[2].w, 0, 0x6542)) == p0) buf1[2].w = __byte_perm (p1, buf1[2].w, 0x7054);
2819 break;
2820 case 27: if ((__byte_perm (buf1[2].x, 0, 0x6543)) == p0) buf1[2].x = __byte_perm (p1, buf1[2].x, 0x0654);
2821 if ((__byte_perm (buf1[2].y, 0, 0x6543)) == p0) buf1[2].y = __byte_perm (p1, buf1[2].y, 0x0654);
2822 if ((__byte_perm (buf1[2].z, 0, 0x6543)) == p0) buf1[2].z = __byte_perm (p1, buf1[2].z, 0x0654);
2823 if ((__byte_perm (buf1[2].w, 0, 0x6543)) == p0) buf1[2].w = __byte_perm (p1, buf1[2].w, 0x0654);
2824 break;
2825 case 28: if ((__byte_perm (buf1[3].x, 0, 0x6540)) == p0) buf1[3].x = __byte_perm (p1, buf1[3].x, 0x7650);
2826 if ((__byte_perm (buf1[3].y, 0, 0x6540)) == p0) buf1[3].y = __byte_perm (p1, buf1[3].y, 0x7650);
2827 if ((__byte_perm (buf1[3].z, 0, 0x6540)) == p0) buf1[3].z = __byte_perm (p1, buf1[3].z, 0x7650);
2828 if ((__byte_perm (buf1[3].w, 0, 0x6540)) == p0) buf1[3].w = __byte_perm (p1, buf1[3].w, 0x7650);
2829 break;
2830 case 29: if ((__byte_perm (buf1[3].x, 0, 0x6541)) == p0) buf1[3].x = __byte_perm (p1, buf1[3].x, 0x7604);
2831 if ((__byte_perm (buf1[3].y, 0, 0x6541)) == p0) buf1[3].y = __byte_perm (p1, buf1[3].y, 0x7604);
2832 if ((__byte_perm (buf1[3].z, 0, 0x6541)) == p0) buf1[3].z = __byte_perm (p1, buf1[3].z, 0x7604);
2833 if ((__byte_perm (buf1[3].w, 0, 0x6541)) == p0) buf1[3].w = __byte_perm (p1, buf1[3].w, 0x7604);
2834 break;
2835 case 30: if ((__byte_perm (buf1[3].x, 0, 0x6542)) == p0) buf1[3].x = __byte_perm (p1, buf1[3].x, 0x7054);
2836 if ((__byte_perm (buf1[3].y, 0, 0x6542)) == p0) buf1[3].y = __byte_perm (p1, buf1[3].y, 0x7054);
2837 if ((__byte_perm (buf1[3].z, 0, 0x6542)) == p0) buf1[3].z = __byte_perm (p1, buf1[3].z, 0x7054);
2838 if ((__byte_perm (buf1[3].w, 0, 0x6542)) == p0) buf1[3].w = __byte_perm (p1, buf1[3].w, 0x7054);
2839 break;
2840 case 31: if ((__byte_perm (buf1[3].x, 0, 0x6543)) == p0) buf1[3].x = __byte_perm (p1, buf1[3].x, 0x0654);
2841 if ((__byte_perm (buf1[3].y, 0, 0x6543)) == p0) buf1[3].y = __byte_perm (p1, buf1[3].y, 0x0654);
2842 if ((__byte_perm (buf1[3].z, 0, 0x6543)) == p0) buf1[3].z = __byte_perm (p1, buf1[3].z, 0x0654);
2843 if ((__byte_perm (buf1[3].w, 0, 0x6543)) == p0) buf1[3].w = __byte_perm (p1, buf1[3].w, 0x0654);
2844 break;
2845 }
2846 }
2847
2848 #endif
2849
2850
2851 return in_len;
2852 }
2853
2854 __device__ static u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2855 {
2856 // TODO
2857 return in_len;
2858 }
2859
2860 __device__ static u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2861 {
2862 // TODO
2863 return in_len;
2864 }
2865
2866 __device__ static u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
2867 {
2868 if ( in_len == 0) return (in_len);
2869 if ((in_len + p0) >= 32) return (in_len);
2870
2871 u32 out_len = in_len;
2872
2873 const u32x tmp = buf0[0] & 0xFF;
2874
2875 rshift_block_N (buf0, buf1, buf0, buf1, p0);
2876
2877 #if __CUDA_ARCH__ >= 200
2878
2879 switch (p0)
2880 {
2881 case 1: buf0[0] |= tmp;
2882 break;
2883 case 2: buf0[0] |= __byte_perm (tmp, 0, 0x5400);
2884 break;
2885 case 3: buf0[0] |= __byte_perm (tmp, 0, 0x4000);
2886 break;
2887 case 4: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2888 break;
2889 case 5: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2890 buf0[1] |= tmp;
2891 break;
2892 case 6: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2893 buf0[1] |= __byte_perm (tmp, 0, 0x5400);
2894 break;
2895 case 7: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2896 buf0[1] |= __byte_perm (tmp, 0, 0x4000);
2897 break;
2898 case 8: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2899 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2900 break;
2901 case 9: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2902 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2903 buf0[2] |= tmp;
2904 break;
2905 case 10: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2906 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2907 buf0[2] |= __byte_perm (tmp, 0, 0x5400);
2908 break;
2909 case 11: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2910 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2911 buf0[2] |= __byte_perm (tmp, 0, 0x4000);
2912 break;
2913 case 12: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2914 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2915 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2916 break;
2917 case 13: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2918 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2919 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2920 buf0[3] |= tmp;
2921 break;
2922 case 14: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2923 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2924 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2925 buf0[3] |= __byte_perm (tmp, 0, 0x5400);
2926 break;
2927 case 15: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2928 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2929 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2930 buf0[3] |= __byte_perm (tmp, 0, 0x4000);
2931 break;
2932 case 16: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2933 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2934 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2935 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2936 break;
2937 case 17: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2938 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2939 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2940 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2941 buf1[0] |= tmp;
2942 break;
2943 case 18: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2944 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2945 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2946 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2947 buf1[0] |= __byte_perm (tmp, 0, 0x5400);
2948 break;
2949 case 19: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2950 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2951 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2952 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2953 buf1[0] |= __byte_perm (tmp, 0, 0x4000);
2954 break;
2955 case 20: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2956 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2957 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2958 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2959 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
2960 break;
2961 case 21: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2962 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2963 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2964 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2965 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
2966 buf1[1] |= tmp;
2967 break;
2968 case 22: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2969 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2970 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2971 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2972 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
2973 buf1[1] |= __byte_perm (tmp, 0, 0x5400);
2974 break;
2975 case 23: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2976 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2977 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2978 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2979 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
2980 buf1[1] |= __byte_perm (tmp, 0, 0x4000);
2981 break;
2982 case 24: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2983 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2984 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2985 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2986 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
2987 buf1[1] |= __byte_perm (tmp, 0, 0x0000);
2988 break;
2989 case 25: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2990 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2991 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
2992 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
2993 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
2994 buf1[1] |= __byte_perm (tmp, 0, 0x0000);
2995 buf1[2] |= tmp;
2996 break;
2997 case 26: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
2998 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
2999 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
3000 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
3001 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
3002 buf1[1] |= __byte_perm (tmp, 0, 0x0000);
3003 buf1[2] |= __byte_perm (tmp, 0, 0x5400);
3004 break;
3005 case 27: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
3006 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
3007 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
3008 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
3009 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
3010 buf1[1] |= __byte_perm (tmp, 0, 0x0000);
3011 buf1[2] |= __byte_perm (tmp, 0, 0x4000);
3012 break;
3013 case 28: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
3014 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
3015 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
3016 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
3017 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
3018 buf1[1] |= __byte_perm (tmp, 0, 0x0000);
3019 buf1[2] |= __byte_perm (tmp, 0, 0x0000);
3020 break;
3021 case 29: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
3022 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
3023 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
3024 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
3025 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
3026 buf1[1] |= __byte_perm (tmp, 0, 0x0000);
3027 buf1[2] |= __byte_perm (tmp, 0, 0x0000);
3028 buf1[3] |= tmp;
3029 break;
3030 case 30: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
3031 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
3032 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
3033 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
3034 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
3035 buf1[1] |= __byte_perm (tmp, 0, 0x0000);
3036 buf1[2] |= __byte_perm (tmp, 0, 0x0000);
3037 buf1[3] |= __byte_perm (tmp, 0, 0x5400);
3038 break;
3039 case 31: buf0[0] |= __byte_perm (tmp, 0, 0x0000);
3040 buf0[1] |= __byte_perm (tmp, 0, 0x0000);
3041 buf0[2] |= __byte_perm (tmp, 0, 0x0000);
3042 buf0[3] |= __byte_perm (tmp, 0, 0x0000);
3043 buf1[0] |= __byte_perm (tmp, 0, 0x0000);
3044 buf1[1] |= __byte_perm (tmp, 0, 0x0000);
3045 buf1[2] |= __byte_perm (tmp, 0, 0x0000);
3046 buf1[3] |= __byte_perm (tmp, 0, 0x4000);
3047 break;
3048 }
3049
3050 #else
3051
3052 switch (p0)
3053 {
3054 case 1: buf0[0] |= tmp << 0;
3055 break;
3056 case 2: buf0[0] |= tmp << 0 | tmp << 8;
3057 break;
3058 case 3: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16;
3059 break;
3060 case 4: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3061 break;
3062 case 5: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3063 buf0[1] |= tmp << 0;
3064 break;
3065 case 6: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3066 buf0[1] |= tmp << 0 | tmp << 8;
3067 break;
3068 case 7: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3069 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16;
3070 break;
3071 case 8: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3072 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3073 break;
3074 case 9: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3075 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3076 buf0[2] |= tmp << 0;
3077 break;
3078 case 10: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3079 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3080 buf0[2] |= tmp << 0 | tmp << 8;
3081 break;
3082 case 11: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3083 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3084 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16;
3085 break;
3086 case 12: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3087 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3088 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3089 break;
3090 case 13: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3091 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3092 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3093 buf0[3] |= tmp << 0;
3094 break;
3095 case 14: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3096 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3097 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3098 buf0[3] |= tmp << 0 | tmp << 8;
3099 break;
3100 case 15: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3101 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3102 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3103 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16;
3104 break;
3105 case 16: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3106 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3107 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3108 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3109 break;
3110 case 17: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3111 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3112 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3113 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3114 buf1[0] |= tmp << 0;
3115 break;
3116 case 18: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3117 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3118 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3119 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3120 buf1[0] |= tmp << 0 | tmp << 8;
3121 break;
3122 case 19: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3123 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3124 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3125 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3126 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16;
3127 break;
3128 case 20: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3129 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3130 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3131 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3132 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3133 break;
3134 case 21: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3135 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3136 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3137 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3138 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3139 buf1[1] |= tmp << 0;
3140 break;
3141 case 22: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3142 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3143 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3144 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3145 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3146 buf1[1] |= tmp << 0 | tmp << 8;
3147 break;
3148 case 23: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3149 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3150 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3151 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3152 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3153 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16;
3154 break;
3155 case 24: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3156 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3157 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3158 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3159 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3160 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3161 break;
3162 case 25: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3163 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3164 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3165 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3166 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3167 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3168 buf1[2] |= tmp << 0;
3169 break;
3170 case 26: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3171 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3172 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3173 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3174 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3175 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3176 buf1[2] |= tmp << 0 | tmp << 8;
3177 break;
3178 case 27: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3179 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3180 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3181 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3182 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3183 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3184 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16;
3185 break;
3186 case 28: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3187 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3188 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3189 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3190 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3191 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3192 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3193 break;
3194 case 29: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3195 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3196 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3197 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3198 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3199 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3200 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3201 buf1[3] |= tmp << 0;
3202 break;
3203 case 30: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3204 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3205 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3206 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3207 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3208 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3209 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3210 buf1[3] |= tmp << 0 | tmp << 8;
3211 break;
3212 case 31: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3213 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3214 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3215 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3216 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3217 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3218 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
3219 buf1[3] |= tmp << 0 | tmp << 8 | tmp << 16;
3220 break;
3221 }
3222
3223 #endif
3224
3225 out_len += p0;
3226
3227 return out_len;
3228 }
3229
3230 __device__ static u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
3231 {
3232 if ( in_len == 0) return (in_len);
3233 if ((in_len + p0) >= 32) return (in_len);
3234
3235 const u32 in_len1 = in_len - 1;
3236
3237 const u32 sh = (in_len1 & 3) * 8;
3238
3239 u32x tmp = 0;
3240
3241 switch (in_len1 / 4)
3242 {
3243 case 0: tmp = (buf0[0] >> sh) & 0xff; break;
3244 case 1: tmp = (buf0[1] >> sh) & 0xff; break;
3245 case 2: tmp = (buf0[2] >> sh) & 0xff; break;
3246 case 3: tmp = (buf0[3] >> sh) & 0xff; break;
3247 case 4: tmp = (buf1[0] >> sh) & 0xff; break;
3248 case 5: tmp = (buf1[1] >> sh) & 0xff; break;
3249 case 6: tmp = (buf1[2] >> sh) & 0xff; break;
3250 case 7: tmp = (buf1[3] >> sh) & 0xff; break;
3251 }
3252
3253 u32 out_len = in_len;
3254
3255 for (u32 i = 0; i < p0; i++)
3256 {
3257 append_block1 (out_len, buf0, buf1, tmp);
3258
3259 out_len++;
3260 }
3261
3262 return out_len;
3263 }
3264
3265 __device__ static u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
3266 {
3267 if ( in_len == 0) return (in_len);
3268 if ((in_len + in_len) >= 32) return (in_len);
3269
3270 u32 out_len = in_len;
3271
3272 u32x tib40[4];
3273 u32x tib41[4];
3274
3275 #if __CUDA_ARCH__ >= 200
3276
3277 tib40[0] = __byte_perm (buf0[0], 0, 0x1100);
3278 tib40[1] = __byte_perm (buf0[0], 0, 0x3322);
3279 tib40[2] = __byte_perm (buf0[1], 0, 0x1100);
3280 tib40[3] = __byte_perm (buf0[1], 0, 0x3322);
3281 tib41[0] = __byte_perm (buf0[2], 0, 0x1100);
3282 tib41[1] = __byte_perm (buf0[2], 0, 0x3322);
3283 tib41[2] = __byte_perm (buf0[3], 0, 0x1100);
3284 tib41[3] = __byte_perm (buf0[3], 0, 0x3322);
3285
3286 buf0[0] = tib40[0];
3287 buf0[1] = tib40[1];
3288 buf0[2] = tib40[2];
3289 buf0[3] = tib40[3];
3290 buf1[0] = tib41[0];
3291 buf1[1] = tib41[1];
3292 buf1[2] = tib41[2];
3293 buf1[3] = tib41[3];
3294
3295 #else
3296
3297 tib40[0] = ((buf0[0] & 0x000000FF) << 0) | ((buf0[0] & 0x0000FF00) << 8);
3298 tib40[1] = ((buf0[0] & 0x00FF0000) >> 16) | ((buf0[0] & 0xFF000000) >> 8);
3299 tib40[2] = ((buf0[1] & 0x000000FF) << 0) | ((buf0[1] & 0x0000FF00) << 8);
3300 tib40[3] = ((buf0[1] & 0x00FF0000) >> 16) | ((buf0[1] & 0xFF000000) >> 8);
3301 tib41[0] = ((buf0[2] & 0x000000FF) << 0) | ((buf0[2] & 0x0000FF00) << 8);
3302 tib41[1] = ((buf0[2] & 0x00FF0000) >> 16) | ((buf0[2] & 0xFF000000) >> 8);
3303 tib41[2] = ((buf0[3] & 0x000000FF) << 0) | ((buf0[3] & 0x0000FF00) << 8);
3304 tib41[3] = ((buf0[3] & 0x00FF0000) >> 16) | ((buf0[3] & 0xFF000000) >> 8);
3305
3306 buf0[0] = tib40[0] | (tib40[0] << 8);
3307 buf0[1] = tib40[1] | (tib40[1] << 8);
3308 buf0[2] = tib40[2] | (tib40[2] << 8);
3309 buf0[3] = tib40[3] | (tib40[3] << 8);
3310 buf1[0] = tib41[0] | (tib41[0] << 8);
3311 buf1[1] = tib41[1] | (tib41[1] << 8);
3312 buf1[2] = tib41[2] | (tib41[2] << 8);
3313 buf1[3] = tib41[3] | (tib41[3] << 8);
3314
3315 #endif
3316
3317 out_len = out_len + out_len;
3318
3319 return out_len;
3320 }
3321
3322 __device__ static u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
3323 {
3324 if (in_len < 2) return (in_len);
3325
3326 #if __CUDA_ARCH__ >= 200
3327
3328 buf0[0] = __byte_perm (buf0[0], 0, 0x3201);
3329
3330 #else
3331
3332 buf0[0] = (buf0[0] & 0xFFFF0000) | ((buf0[0] << 8) & 0x0000FF00) | ((buf0[0] >> 8) & 0x000000FF);
3333
3334 #endif
3335
3336 return in_len;
3337 }
3338
3339 __device__ static u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
3340 {
3341 if (in_len < 2) return (in_len);
3342
3343 #if __CUDA_ARCH__ >= 200
3344
3345 switch (in_len)
3346 {
3347 case 2: buf0[0] = __byte_perm (buf0[0], 0, 0x5401);
3348 break;
3349 case 3: buf0[0] = __byte_perm (buf0[0], 0, 0x4120);
3350 break;
3351 case 4: buf0[0] = __byte_perm (buf0[0], 0, 0x2310);
3352 break;
3353 case 5: buf0[1] = __byte_perm (buf0[1], buf0[0], 0x7210);
3354 buf0[0] = __byte_perm (buf0[0], buf0[1], 0x4210);
3355 buf0[1] = __byte_perm (buf0[1], 0, 0x6543);
3356 break;
3357 case 6: buf0[1] = __byte_perm (buf0[1], 0, 0x5401);
3358 break;
3359 case 7: buf0[1] = __byte_perm (buf0[1], 0, 0x4120);
3360 break;
3361 case 8: buf0[1] = __byte_perm (buf0[1], 0, 0x2310);
3362 break;
3363 case 9: buf0[2] = __byte_perm (buf0[2], buf0[1], 0x7210);
3364 buf0[1] = __byte_perm (buf0[1], buf0[2], 0x4210);
3365 buf0[2] = __byte_perm (buf0[2], 0, 0x6543);
3366 break;
3367 case 10: buf0[2] = __byte_perm (buf0[2], 0, 0x5401);
3368 break;
3369 case 11: buf0[2] = __byte_perm (buf0[2], 0, 0x4120);
3370 break;
3371 case 12: buf0[2] = __byte_perm (buf0[2], 0, 0x2310);
3372 break;
3373 case 13: buf0[3] = __byte_perm (buf0[3], buf0[2], 0x7210);
3374 buf0[2] = __byte_perm (buf0[2], buf0[3], 0x4210);
3375 buf0[3] = __byte_perm (buf0[3], 0, 0x6543);
3376 break;
3377 case 14: buf0[3] = __byte_perm (buf0[3], 0, 0x5401);
3378 break;
3379 case 15: buf0[3] = __byte_perm (buf0[3], 0, 0x4120);
3380 break;
3381 case 16: buf0[3] = __byte_perm (buf0[3], 0, 0x2310);
3382 break;
3383 case 17: buf1[0] = __byte_perm (buf1[0], buf0[3], 0x7210);
3384 buf0[3] = __byte_perm (buf0[3], buf1[0], 0x4210);
3385 buf1[0] = __byte_perm (buf1[0], 0, 0x6543);
3386 break;
3387 case 18: buf1[0] = __byte_perm (buf1[0], 0, 0x5401);
3388 break;
3389 case 19: buf1[0] = __byte_perm (buf1[0], 0, 0x4120);
3390 break;
3391 case 20: buf1[0] = __byte_perm (buf1[0], 0, 0x2310);
3392 break;
3393 case 21: buf1[1] = __byte_perm (buf1[1], buf1[0], 0x7210);
3394 buf1[0] = __byte_perm (buf1[0], buf1[1], 0x4210);
3395 buf1[1] = __byte_perm (buf1[1], 0, 0x6543);
3396 break;
3397 case 22: buf1[1] = __byte_perm (buf1[1], 0, 0x5401);
3398 break;
3399 case 23: buf1[1] = __byte_perm (buf1[1], 0, 0x4120);
3400 break;
3401 case 24: buf1[1] = __byte_perm (buf1[1], 0, 0x2310);
3402 break;
3403 case 25: buf1[2] = __byte_perm (buf1[2], buf1[1], 0x7210);
3404 buf1[1] = __byte_perm (buf1[1], buf1[2], 0x4210);
3405 buf1[2] = __byte_perm (buf1[2], 0, 0x6543);
3406 break;
3407 case 26: buf1[2] = __byte_perm (buf1[2], 0, 0x5401);
3408 break;
3409 case 27: buf1[2] = __byte_perm (buf1[2], 0, 0x4120);
3410 break;
3411 case 28: buf1[2] = __byte_perm (buf1[2], 0, 0x2310);
3412 break;
3413 case 29: buf1[3] = __byte_perm (buf1[3], buf1[2], 0x7210);
3414 buf1[2] = __byte_perm (buf1[2], buf1[3], 0x4210);
3415 buf1[3] = __byte_perm (buf1[3], 0, 0x6543);
3416 break;
3417 case 30: buf1[3] = __byte_perm (buf1[3], 0, 0x5401);
3418 break;
3419 case 31: buf1[3] = __byte_perm (buf1[3], 0, 0x4120);
3420 break;
3421 }
3422
3423 #else
3424
3425 switch (in_len)
3426 {
3427 case 2: buf0[0] = ((buf0[0] << 8) & 0x0000FF00) | ((buf0[0] >> 8) & 0x000000FF);
3428 break;
3429 case 3: buf0[0] = (buf0[0] & 0x000000FF) | ((buf0[0] << 8) & 0x00FF0000) | ((buf0[0] >> 8) & 0x0000FF00);
3430 break;
3431 case 4: buf0[0] = (buf0[0] & 0x0000FFFF) | ((buf0[0] << 8) & 0xFF000000) | ((buf0[0] >> 8) & 0x00FF0000);
3432 break;
3433 case 5: buf0[1] = (buf0[0] & 0xFF000000) | buf0[1];
3434 buf0[0] = (buf0[0] & 0x00FFFFFF) | (buf0[1] << 24);
3435 buf0[1] = (buf0[1] >> 24);
3436 break;
3437 case 6: buf0[1] = ((buf0[1] << 8) & 0x0000FF00) | ((buf0[1] >> 8) & 0x000000FF);
3438 break;
3439 case 7: buf0[1] = (buf0[1] & 0x000000FF) | ((buf0[1] << 8) & 0x00FF0000) | ((buf0[1] >> 8) & 0x0000FF00);
3440 break;
3441 case 8: buf0[1] = (buf0[1] & 0x0000FFFF) | ((buf0[1] << 8) & 0xFF000000) | ((buf0[1] >> 8) & 0x00FF0000);
3442 break;
3443 case 9: buf0[2] = (buf0[1] & 0xFF000000) | buf0[2];
3444 buf0[1] = (buf0[1] & 0x00FFFFFF) | (buf0[2] << 24);
3445 buf0[2] = (buf0[2] >> 24);
3446 break;
3447 case 10: buf0[2] = ((buf0[2] << 8) & 0x0000FF00) | ((buf0[2] >> 8) & 0x000000FF);
3448 break;
3449 case 11: buf0[2] = (buf0[2] & 0x000000FF) | ((buf0[2] << 8) & 0x00FF0000) | ((buf0[2] >> 8) & 0x0000FF00);
3450 break;
3451 case 12: buf0[2] = (buf0[2] & 0x0000FFFF) | ((buf0[2] << 8) & 0xFF000000) | ((buf0[2] >> 8) & 0x00FF0000);
3452 break;
3453 case 13: buf0[3] = (buf0[2] & 0xFF000000) | buf0[3];
3454 buf0[2] = (buf0[2] & 0x00FFFFFF) | (buf0[3] << 24);
3455 buf0[3] = (buf0[3] >> 24);
3456 break;
3457 case 14: buf0[3] = ((buf0[3] << 8) & 0x0000FF00) | ((buf0[3] >> 8) & 0x000000FF);
3458 break;
3459 case 15: buf0[3] = (buf0[3] & 0x000000FF) | ((buf0[3] << 8) & 0x00FF0000) | ((buf0[3] >> 8) & 0x0000FF00);
3460 break;
3461 case 16: buf0[3] = (buf0[3] & 0x0000FFFF) | ((buf0[3] << 8) & 0xFF000000) | ((buf0[3] >> 8) & 0x00FF0000);
3462 break;
3463 case 17: buf1[0] = (buf0[3] & 0xFF000000) | buf1[0];
3464 buf0[3] = (buf0[3] & 0x00FFFFFF) | (buf1[0] << 24);
3465 buf1[0] = (buf1[0] >> 24);
3466 break;
3467 case 18: buf1[0] = ((buf1[0] << 8) & 0x0000FF00) | ((buf1[0] >> 8) & 0x000000FF);
3468 break;
3469 case 19: buf1[0] = (buf1[0] & 0x000000FF) | ((buf1[0] << 8) & 0x00FF0000) | ((buf1[0] >> 8) & 0x0000FF00);
3470 break;
3471 case 20: buf1[0] = (buf1[0] & 0x0000FFFF) | ((buf1[0] << 8) & 0xFF000000) | ((buf1[0] >> 8) & 0x00FF0000);
3472 break;
3473 case 21: buf1[1] = (buf1[0] & 0xFF000000) | buf1[1];
3474 buf1[0] = (buf1[0] & 0x00FFFFFF) | (buf1[1] << 24);
3475 buf1[1] = (buf1[1] >> 24);
3476 break;
3477 case 22: buf1[1] = ((buf1[1] << 8) & 0x0000FF00) | ((buf1[1] >> 8) & 0x000000FF);
3478 break;
3479 case 23: buf1[1] = (buf1[1] & 0x000000FF) | ((buf1[1] << 8) & 0x00FF0000) | ((buf1[1] >> 8) & 0x0000FF00);
3480 break;
3481 case 24: buf1[1] = (buf1[1] & 0x0000FFFF) | ((buf1[1] << 8) & 0xFF000000) | ((buf1[1] >> 8) & 0x00FF0000);
3482 break;
3483 case 25: buf1[2] = (buf1[1] & 0xFF000000) | buf1[2];
3484 buf1[1] = (buf1[1] & 0x00FFFFFF) | (buf1[2] << 24);
3485 buf1[2] = (buf1[2] >> 24);
3486 break;
3487 case 26: buf1[2] = ((buf1[2] << 8) & 0x0000FF00) | ((buf1[2] >> 8) & 0x000000FF);
3488 break;
3489 case 27: buf1[2] = (buf1[2] & 0x000000FF) | ((buf1[2] << 8) & 0x00FF0000) | ((buf1[2] >> 8) & 0x0000FF00);
3490 break;
3491 case 28: buf1[2] = (buf1[2] & 0x0000FFFF) | ((buf1[2] << 8) & 0xFF000000) | ((buf1[2] >> 8) & 0x00FF0000);
3492 break;
3493 case 29: buf1[3] = (buf1[2] & 0xFF000000) | buf1[3];
3494 buf1[2] = (buf1[2] & 0x00FFFFFF) | (buf1[3] << 24);
3495 buf1[3] = (buf1[3] >> 24);
3496 break;
3497 case 30: buf1[3] = ((buf1[3] << 8) & 0x0000FF00) | ((buf1[3] >> 8) & 0x000000FF);
3498 break;
3499 case 31: buf1[3] = (buf1[3] & 0x000000FF) | ((buf1[3] << 8) & 0x00FF0000) | ((buf1[3] >> 8) & 0x0000FF00);
3500 break;
3501 }
3502
3503 #endif
3504
3505 return in_len;
3506 }
3507
3508 __device__ static u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
3509 {
3510 if (p0 >= in_len) return (in_len);
3511 if (p1 >= in_len) return (in_len);
3512
3513 u32x tmp0 = 0;
3514 u32x tmp1 = 0;
3515
3516 #if __CUDA_ARCH__ >= 200
3517
3518 switch (p0)
3519 {
3520 case 0: tmp0 = __byte_perm (buf0[0], 0, 0x6540);
3521 break;
3522 case 1: tmp0 = __byte_perm (buf0[0], 0, 0x6541);
3523 break;
3524 case 2: tmp0 = __byte_perm (buf0[0], 0, 0x6542);
3525 break;
3526 case 3: tmp0 = __byte_perm (buf0[0], 0, 0x6543);
3527 break;
3528 case 4: tmp0 = __byte_perm (buf0[1], 0, 0x6540);
3529 break;
3530 case 5: tmp0 = __byte_perm (buf0[1], 0, 0x6541);
3531 break;
3532 case 6: tmp0 = __byte_perm (buf0[1], 0, 0x6542);
3533 break;
3534 case 7: tmp0 = __byte_perm (buf0[1], 0, 0x6543);
3535 break;
3536 case 8: tmp0 = __byte_perm (buf0[2], 0, 0x6540);
3537 break;
3538 case 9: tmp0 = __byte_perm (buf0[2], 0, 0x6541);
3539 break;
3540 case 10: tmp0 = __byte_perm (buf0[2], 0, 0x6542);
3541 break;
3542 case 11: tmp0 = __byte_perm (buf0[2], 0, 0x6543);
3543 break;
3544 case 12: tmp0 = __byte_perm (buf0[3], 0, 0x6540);
3545 break;
3546 case 13: tmp0 = __byte_perm (buf0[3], 0, 0x6541);
3547 break;
3548 case 14: tmp0 = __byte_perm (buf0[3], 0, 0x6542);
3549 break;
3550 case 15: tmp0 = __byte_perm (buf0[3], 0, 0x6543);
3551 break;
3552 case 16: tmp0 = __byte_perm (buf1[0], 0, 0x6540);
3553 break;
3554 case 17: tmp0 = __byte_perm (buf1[0], 0, 0x6541);
3555 break;
3556 case 18: tmp0 = __byte_perm (buf1[0], 0, 0x6542);
3557 break;
3558 case 19: tmp0 = __byte_perm (buf1[0], 0, 0x6543);
3559 break;
3560 case 20: tmp0 = __byte_perm (buf1[1], 0, 0x6540);
3561 break;
3562 case 21: tmp0 = __byte_perm (buf1[1], 0, 0x6541);
3563 break;
3564 case 22: tmp0 = __byte_perm (buf1[1], 0, 0x6542);
3565 break;
3566 case 23: tmp0 = __byte_perm (buf1[1], 0, 0x6543);
3567 break;
3568 case 24: tmp0 = __byte_perm (buf1[2], 0, 0x6540);
3569 break;
3570 case 25: tmp0 = __byte_perm (buf1[2], 0, 0x6541);
3571 break;
3572 case 26: tmp0 = __byte_perm (buf1[2], 0, 0x6542);
3573 break;
3574 case 27: tmp0 = __byte_perm (buf1[2], 0, 0x6543);
3575 break;
3576 case 28: tmp0 = __byte_perm (buf1[3], 0, 0x6540);
3577 break;
3578 case 29: tmp0 = __byte_perm (buf1[3], 0, 0x6541);
3579 break;
3580 case 30: tmp0 = __byte_perm (buf1[3], 0, 0x6542);
3581 break;
3582 case 31: tmp0 = __byte_perm (buf1[3], 0, 0x6543);
3583 break;
3584 }
3585
3586 switch (p1)
3587 {
3588 case 0: tmp1 = __byte_perm (buf0[0], 0, 0x6540);
3589 buf0[0] = __byte_perm (tmp0, buf0[0], 0x7650);
3590 break;
3591 case 1: tmp1 = __byte_perm (buf0[0], 0, 0x6541);
3592 buf0[0] = __byte_perm (tmp0, buf0[0], 0x7604);
3593 break;
3594 case 2: tmp1 = __byte_perm (buf0[0], 0, 0x6542);
3595 buf0[0] = __byte_perm (tmp0, buf0[0], 0x7054);
3596 break;
3597 case 3: tmp1 = __byte_perm (buf0[0], 0, 0x6543);
3598 buf0[0] = __byte_perm (tmp0, buf0[0], 0x0654);
3599 break;
3600 case 4: tmp1 = __byte_perm (buf0[1], 0, 0x6540);
3601 buf0[1] = __byte_perm (tmp0, buf0[1], 0x7650);
3602 break;
3603 case 5: tmp1 = __byte_perm (buf0[1], 0, 0x6541);
3604 buf0[1] = __byte_perm (tmp0, buf0[1], 0x7604);
3605 break;
3606 case 6: tmp1 = __byte_perm (buf0[1], 0, 0x6542);
3607 buf0[1] = __byte_perm (tmp0, buf0[1], 0x7054);
3608 break;
3609 case 7: tmp1 = __byte_perm (buf0[1], 0, 0x6543);
3610 buf0[1] = __byte_perm (tmp0, buf0[1], 0x0654);
3611 break;
3612 case 8: tmp1 = __byte_perm (buf0[2], 0, 0x6540);
3613 buf0[2] = __byte_perm (tmp0, buf0[2], 0x7650);
3614 break;
3615 case 9: tmp1 = __byte_perm (buf0[2], 0, 0x6541);
3616 buf0[2] = __byte_perm (tmp0, buf0[2], 0x7604);
3617 break;
3618 case 10: tmp1 = __byte_perm (buf0[2], 0, 0x6542);
3619 buf0[2] = __byte_perm (tmp0, buf0[2], 0x7054);
3620 break;
3621 case 11: tmp1 = __byte_perm (buf0[2], 0, 0x6543);
3622 buf0[2] = __byte_perm (tmp0, buf0[2], 0x0654);
3623 break;
3624 case 12: tmp1 = __byte_perm (buf0[3], 0, 0x6540);
3625 buf0[3] = __byte_perm (tmp0, buf0[3], 0x7650);
3626 break;
3627 case 13: tmp1 = __byte_perm (buf0[3], 0, 0x6541);
3628 buf0[3] = __byte_perm (tmp0, buf0[3], 0x7604);
3629 break;
3630 case 14: tmp1 = __byte_perm (buf0[3], 0, 0x6542);
3631 buf0[3] = __byte_perm (tmp0, buf0[3], 0x7054);
3632 break;
3633 case 15: tmp1 = __byte_perm (buf0[3], 0, 0x6543);
3634 buf0[3] = __byte_perm (tmp0, buf0[3], 0x0654);
3635 break;
3636 case 16: tmp1 = __byte_perm (buf1[0], 0, 0x6540);
3637 buf1[0] = __byte_perm (tmp0, buf1[0], 0x7650);
3638 break;
3639 case 17: tmp1 = __byte_perm (buf1[0], 0, 0x6541);
3640 buf1[0] = __byte_perm (tmp0, buf1[0], 0x7604);
3641 break;
3642 case 18: tmp1 = __byte_perm (buf1[0], 0, 0x6542);
3643 buf1[0] = __byte_perm (tmp0, buf1[0], 0x7054);
3644 break;
3645 case 19: tmp1 = __byte_perm (buf1[0], 0, 0x6543);
3646 buf1[0] = __byte_perm (tmp0, buf1[0], 0x0654);
3647 break;
3648 case 20: tmp1 = __byte_perm (buf1[1], 0, 0x6540);
3649 buf1[1] = __byte_perm (tmp0, buf1[1], 0x7650);
3650 break;
3651 case 21: tmp1 = __byte_perm (buf1[1], 0, 0x6541);
3652 buf1[1] = __byte_perm (tmp0, buf1[1], 0x7604);
3653 break;
3654 case 22: tmp1 = __byte_perm (buf1[1], 0, 0x6542);
3655 buf1[1] = __byte_perm (tmp0, buf1[1], 0x7054);
3656 break;
3657 case 23: tmp1 = __byte_perm (buf1[1], 0, 0x6543);
3658 buf1[1] = __byte_perm (tmp0, buf1[1], 0x0654);
3659 break;
3660 case 24: tmp1 = __byte_perm (buf1[2], 0, 0x6540);
3661 buf1[2] = __byte_perm (tmp0, buf1[2], 0x7650);
3662 break;
3663 case 25: tmp1 = __byte_perm (buf1[2], 0, 0x6541);
3664 buf1[2] = __byte_perm (tmp0, buf1[2], 0x7604);
3665 break;
3666 case 26: tmp1 = __byte_perm (buf1[2], 0, 0x6542);
3667 buf1[2] = __byte_perm (tmp0, buf1[2], 0x7054);
3668 break;
3669 case 27: tmp1 = __byte_perm (buf1[2], 0, 0x6543);
3670 buf1[2] = __byte_perm (tmp0, buf1[2], 0x0654);
3671 break;
3672 case 28: tmp1 = __byte_perm (buf1[3], 0, 0x6540);
3673 buf1[3] = __byte_perm (tmp0, buf1[3], 0x7650);
3674 break;
3675 case 29: tmp1 = __byte_perm (buf1[3], 0, 0x6541);
3676 buf1[3] = __byte_perm (tmp0, buf1[3], 0x7604);
3677 break;
3678 case 30: tmp1 = __byte_perm (buf1[3], 0, 0x6542);
3679 buf1[3] = __byte_perm (tmp0, buf1[3], 0x7054);
3680 break;
3681 case 31: tmp1 = __byte_perm (buf1[3], 0, 0x6543);
3682 buf1[3] = __byte_perm (tmp0, buf1[3], 0x0654);
3683 break;
3684 }
3685
3686 switch (p0)
3687 {
3688 case 0: buf0[0] = __byte_perm (tmp1, buf0[0], 0x7650);
3689 break;
3690 case 1: buf0[0] = __byte_perm (tmp1, buf0[0], 0x7604);
3691 break;
3692 case 2: buf0[0] = __byte_perm (tmp1, buf0[0], 0x7054);
3693 break;
3694 case 3: buf0[0] = __byte_perm (tmp1, buf0[0], 0x0654);
3695 break;
3696 case 4: buf0[1] = __byte_perm (tmp1, buf0[1], 0x7650);
3697 break;
3698 case 5: buf0[1] = __byte_perm (tmp1, buf0[1], 0x7604);
3699 break;
3700 case 6: buf0[1] = __byte_perm (tmp1, buf0[1], 0x7054);
3701 break;
3702 case 7: buf0[1] = __byte_perm (tmp1, buf0[1], 0x0654);
3703 break;
3704 case 8: buf0[2] = __byte_perm (tmp1, buf0[2], 0x7650);
3705 break;
3706 case 9: buf0[2] = __byte_perm (tmp1, buf0[2], 0x7604);
3707 break;
3708 case 10: buf0[2] = __byte_perm (tmp1, buf0[2], 0x7054);
3709 break;
3710 case 11: buf0[2] = __byte_perm (tmp1, buf0[2], 0x0654);
3711 break;
3712 case 12: buf0[3] = __byte_perm (tmp1, buf0[3], 0x7650);
3713 break;
3714 case 13: buf0[3] = __byte_perm (tmp1, buf0[3], 0x7604);
3715 break;
3716 case 14: buf0[3] = __byte_perm (tmp1, buf0[3], 0x7054);
3717 break;
3718 case 15: buf0[3] = __byte_perm (tmp1, buf0[3], 0x0654);
3719 break;
3720 case 16: buf1[0] = __byte_perm (tmp1, buf1[0], 0x7650);
3721 break;
3722 case 17: buf1[0] = __byte_perm (tmp1, buf1[0], 0x7604);
3723 break;
3724 case 18: buf1[0] = __byte_perm (tmp1, buf1[0], 0x7054);
3725 break;
3726 case 19: buf1[0] = __byte_perm (tmp1, buf1[0], 0x0654);
3727 break;
3728 case 20: buf1[1] = __byte_perm (tmp1, buf1[1], 0x7650);
3729 break;
3730 case 21: buf1[1] = __byte_perm (tmp1, buf1[1], 0x7604);
3731 break;
3732 case 22: buf1[1] = __byte_perm (tmp1, buf1[1], 0x7054);
3733 break;
3734 case 23: buf1[1] = __byte_perm (tmp1, buf1[1], 0x0654);
3735 break;
3736 case 24: buf1[2] = __byte_perm (tmp1, buf1[2], 0x7650);
3737 break;
3738 case 25: buf1[2] = __byte_perm (tmp1, buf1[2], 0x7604);
3739 break;
3740 case 26: buf1[2] = __byte_perm (tmp1, buf1[2], 0x7054);
3741 break;
3742 case 27: buf1[2] = __byte_perm (tmp1, buf1[2], 0x0654);
3743 break;
3744 case 28: buf1[3] = __byte_perm (tmp1, buf1[3], 0x7650);
3745 break;
3746 case 29: buf1[3] = __byte_perm (tmp1, buf1[3], 0x7604);
3747 break;
3748 case 30: buf1[3] = __byte_perm (tmp1, buf1[3], 0x7054);
3749 break;
3750 case 31: buf1[3] = __byte_perm (tmp1, buf1[3], 0x0654);
3751 break;
3752 }
3753
3754 #else
3755
3756 switch (p0)
3757 {
3758 case 0: tmp0 = (buf0[0] >> 0) & 0xFF;
3759 break;
3760 case 1: tmp0 = (buf0[0] >> 8) & 0xFF;
3761 break;
3762 case 2: tmp0 = (buf0[0] >> 16) & 0xFF;
3763 break;
3764 case 3: tmp0 = (buf0[0] >> 24) & 0xFF;
3765 break;
3766 case 4: tmp0 = (buf0[1] >> 0) & 0xFF;
3767 break;
3768 case 5: tmp0 = (buf0[1] >> 8) & 0xFF;
3769 break;
3770 case 6: tmp0 = (buf0[1] >> 16) & 0xFF;
3771 break;
3772 case 7: tmp0 = (buf0[1] >> 24) & 0xFF;
3773 break;
3774 case 8: tmp0 = (buf0[2] >> 0) & 0xFF;
3775 break;
3776 case 9: tmp0 = (buf0[2] >> 8) & 0xFF;
3777 break;
3778 case 10: tmp0 = (buf0[2] >> 16) & 0xFF;
3779 break;
3780 case 11: tmp0 = (buf0[2] >> 24) & 0xFF;
3781 break;
3782 case 12: tmp0 = (buf0[3] >> 0) & 0xFF;
3783 break;
3784 case 13: tmp0 = (buf0[3] >> 8) & 0xFF;
3785 break;
3786 case 14: tmp0 = (buf0[3] >> 16) & 0xFF;
3787 break;
3788 case 15: tmp0 = (buf0[3] >> 24) & 0xFF;
3789 break;
3790 case 16: tmp0 = (buf1[0] >> 0) & 0xFF;
3791 break;
3792 case 17: tmp0 = (buf1[0] >> 8) & 0xFF;
3793 break;
3794 case 18: tmp0 = (buf1[0] >> 16) & 0xFF;
3795 break;
3796 case 19: tmp0 = (buf1[0] >> 24) & 0xFF;
3797 break;
3798 case 20: tmp0 = (buf1[1] >> 0) & 0xFF;
3799 break;
3800 case 21: tmp0 = (buf1[1] >> 8) & 0xFF;
3801 break;
3802 case 22: tmp0 = (buf1[1] >> 16) & 0xFF;
3803 break;
3804 case 23: tmp0 = (buf1[1] >> 24) & 0xFF;
3805 break;
3806 case 24: tmp0 = (buf1[2] >> 0) & 0xFF;
3807 break;
3808 case 25: tmp0 = (buf1[2] >> 8) & 0xFF;
3809 break;
3810 case 26: tmp0 = (buf1[2] >> 16) & 0xFF;
3811 break;
3812 case 27: tmp0 = (buf1[2] >> 24) & 0xFF;
3813 break;
3814 case 28: tmp0 = (buf1[3] >> 0) & 0xFF;
3815 break;
3816 case 29: tmp0 = (buf1[3] >> 8) & 0xFF;
3817 break;
3818 case 30: tmp0 = (buf1[3] >> 16) & 0xFF;
3819 break;
3820 case 31: tmp0 = (buf1[3] >> 24) & 0xFF;
3821 break;
3822 }
3823
3824 switch (p1)
3825 {
3826 case 0: tmp1 = (buf0[0] >> 0) & 0xff;
3827 buf0[0] = (buf0[0] & 0xffffff00) | tmp0 << 0;
3828 break;
3829 case 1: tmp1 = (buf0[0] >> 8) & 0xff;
3830 buf0[0] = (buf0[0] & 0xffff00ff) | tmp0 << 8;
3831 break;
3832 case 2: tmp1 = (buf0[0] >> 16) & 0xff;
3833 buf0[0] = (buf0[0] & 0xff00ffff) | tmp0 << 16;
3834 break;
3835 case 3: tmp1 = (buf0[0] >> 24) & 0xff;
3836 buf0[0] = (buf0[0] & 0x00ffffff) | tmp0 << 24;
3837 break;
3838 case 4: tmp1 = (buf0[1] >> 0) & 0xff;
3839 buf0[1] = (buf0[1] & 0xffffff00) | tmp0 << 0;
3840 break;
3841 case 5: tmp1 = (buf0[1] >> 8) & 0xff;
3842 buf0[1] = (buf0[1] & 0xffff00ff) | tmp0 << 8;
3843 break;
3844 case 6: tmp1 = (buf0[1] >> 16) & 0xff;
3845 buf0[1] = (buf0[1] & 0xff00ffff) | tmp0 << 16;
3846 break;
3847 case 7: tmp1 = (buf0[1] >> 24) & 0xff;
3848 buf0[1] = (buf0[1] & 0x00ffffff) | tmp0 << 24;
3849 break;
3850 case 8: tmp1 = (buf0[2] >> 0) & 0xff;
3851 buf0[2] = (buf0[2] & 0xffffff00) | tmp0 << 0;
3852 break;
3853 case 9: tmp1 = (buf0[2] >> 8) & 0xff;
3854 buf0[2] = (buf0[2] & 0xffff00ff) | tmp0 << 8;
3855 break;
3856 case 10: tmp1 = (buf0[2] >> 16) & 0xff;
3857 buf0[2] = (buf0[2] & 0xff00ffff) | tmp0 << 16;
3858 break;
3859 case 11: tmp1 = (buf0[2] >> 24) & 0xff;
3860 buf0[2] = (buf0[2] & 0x00ffffff) | tmp0 << 24;
3861 break;
3862 case 12: tmp1 = (buf0[3] >> 0) & 0xff;
3863 buf0[3] = (buf0[3] & 0xffffff00) | tmp0 << 0;
3864 break;
3865 case 13: tmp1 = (buf0[3] >> 8) & 0xff;
3866 buf0[3] = (buf0[3] & 0xffff00ff) | tmp0 << 8;
3867 break;
3868 case 14: tmp1 = (buf0[3] >> 16) & 0xff;
3869 buf0[3] = (buf0[3] & 0xff00ffff) | tmp0 << 16;
3870 break;
3871 case 15: tmp1 = (buf0[3] >> 24) & 0xff;
3872 buf0[3] = (buf0[3] & 0x00ffffff) | tmp0 << 24;
3873 break;
3874 case 16: tmp1 = (buf1[0] >> 0) & 0xff;
3875 buf1[0] = (buf1[0] & 0xffffff00) | tmp0 << 0;
3876 break;
3877 case 17: tmp1 = (buf1[0] >> 8) & 0xff;
3878 buf1[0] = (buf1[0] & 0xffff00ff) | tmp0 << 8;
3879 break;
3880 case 18: tmp1 = (buf1[0] >> 16) & 0xff;
3881 buf1[0] = (buf1[0] & 0xff00ffff) | tmp0 << 16;
3882 break;
3883 case 19: tmp1 = (buf1[0] >> 24) & 0xff;
3884 buf1[0] = (buf1[0] & 0x00ffffff) | tmp0 << 24;
3885 break;
3886 case 20: tmp1 = (buf1[1] >> 0) & 0xff;
3887 buf1[1] = (buf1[1] & 0xffffff00) | tmp0 << 0;
3888 break;
3889 case 21: tmp1 = (buf1[1] >> 8) & 0xff;
3890 buf1[1] = (buf1[1] & 0xffff00ff) | tmp0 << 8;
3891 break;
3892 case 22: tmp1 = (buf1[1] >> 16) & 0xff;
3893 buf1[1] = (buf1[1] & 0xff00ffff) | tmp0 << 16;
3894 break;
3895 case 23: tmp1 = (buf1[1] >> 24) & 0xff;
3896 buf1[1] = (buf1[1] & 0x00ffffff) | tmp0 << 24;
3897 break;
3898 case 24: tmp1 = (buf1[2] >> 0) & 0xff;
3899 buf1[2] = (buf1[2] & 0xffffff00) | tmp0 << 0;
3900 break;
3901 case 25: tmp1 = (buf1[2] >> 8) & 0xff;
3902 buf1[2] = (buf1[2] & 0xffff00ff) | tmp0 << 8;
3903 break;
3904 case 26: tmp1 = (buf1[2] >> 16) & 0xff;
3905 buf1[2] = (buf1[2] & 0xff00ffff) | tmp0 << 16;
3906 break;
3907 case 27: tmp1 = (buf1[2] >> 24) & 0xff;
3908 buf1[2] = (buf1[2] & 0x00ffffff) | tmp0 << 24;
3909 break;
3910 case 28: tmp1 = (buf1[3] >> 0) & 0xff;
3911 buf1[3] = (buf1[3] & 0xffffff00) | tmp0 << 0;
3912 break;
3913 case 29: tmp1 = (buf1[3] >> 8) & 0xff;
3914 buf1[3] = (buf1[3] & 0xffff00ff) | tmp0 << 8;
3915 break;
3916 case 30: tmp1 = (buf1[3] >> 16) & 0xff;
3917 buf1[3] = (buf1[3] & 0xff00ffff) | tmp0 << 16;
3918 break;
3919 case 31: tmp1 = (buf1[3] >> 24) & 0xff;
3920 buf1[3] = (buf1[3] & 0x00ffffff) | tmp0 << 24;
3921 break;
3922 }
3923
3924 switch (p0)
3925 {
3926 case 0: buf0[0] = (buf0[0] & 0xffffff00) | tmp1 << 0;
3927 break;
3928 case 1: buf0[0] = (buf0[0] & 0xffff00ff) | tmp1 << 8;
3929 break;
3930 case 2: buf0[0] = (buf0[0] & 0xff00ffff) | tmp1 << 16;
3931 break;
3932 case 3: buf0[0] = (buf0[0] & 0x00ffffff) | tmp1 << 24;
3933 break;
3934 case 4: buf0[1] = (buf0[1] & 0xffffff00) | tmp1 << 0;
3935 break;
3936 case 5: buf0[1] = (buf0[1] & 0xffff00ff) | tmp1 << 8;
3937 break;
3938 case 6: buf0[1] = (buf0[1] & 0xff00ffff) | tmp1 << 16;
3939 break;
3940 case 7: buf0[1] = (buf0[1] & 0x00ffffff) | tmp1 << 24;
3941 break;
3942 case 8: buf0[2] = (buf0[2] & 0xffffff00) | tmp1 << 0;
3943 break;
3944 case 9: buf0[2] = (buf0[2] & 0xffff00ff) | tmp1 << 8;
3945 break;
3946 case 10: buf0[2] = (buf0[2] & 0xff00ffff) | tmp1 << 16;
3947 break;
3948 case 11: buf0[2] = (buf0[2] & 0x00ffffff) | tmp1 << 24;
3949 break;
3950 case 12: buf0[3] = (buf0[3] & 0xffffff00) | tmp1 << 0;
3951 break;
3952 case 13: buf0[3] = (buf0[3] & 0xffff00ff) | tmp1 << 8;
3953 break;
3954 case 14: buf0[3] = (buf0[3] & 0xff00ffff) | tmp1 << 16;
3955 break;
3956 case 15: buf0[3] = (buf0[3] & 0x00ffffff) | tmp1 << 24;
3957 break;
3958 case 16: buf1[0] = (buf1[0] & 0xffffff00) | tmp1 << 0;
3959 break;
3960 case 17: buf1[0] = (buf1[0] & 0xffff00ff) | tmp1 << 8;
3961 break;
3962 case 18: buf1[0] = (buf1[0] & 0xff00ffff) | tmp1 << 16;
3963 break;
3964 case 19: buf1[0] = (buf1[0] & 0x00ffffff) | tmp1 << 24;
3965 break;
3966 case 20: buf1[1] = (buf1[1] & 0xffffff00) | tmp1 << 0;
3967 break;
3968 case 21: buf1[1] = (buf1[1] & 0xffff00ff) | tmp1 << 8;
3969 break;
3970 case 22: buf1[1] = (buf1[1] & 0xff00ffff) | tmp1 << 16;
3971 break;
3972 case 23: buf1[1] = (buf1[1] & 0x00ffffff) | tmp1 << 24;
3973 break;
3974 case 24: buf1[2] = (buf1[2] & 0xffffff00) | tmp1 << 0;
3975 break;
3976 case 25: buf1[2] = (buf1[2] & 0xffff00ff) | tmp1 << 8;
3977 break;
3978 case 26: buf1[2] = (buf1[2] & 0xff00ffff) | tmp1 << 16;
3979 break;
3980 case 27: buf1[2] = (buf1[2] & 0x00ffffff) | tmp1 << 24;
3981 break;
3982 case 28: buf1[3] = (buf1[3] & 0xffffff00) | tmp1 << 0;
3983 break;
3984 case 29: buf1[3] = (buf1[3] & 0xffff00ff) | tmp1 << 8;
3985 break;
3986 case 30: buf1[3] = (buf1[3] & 0xff00ffff) | tmp1 << 16;
3987 break;
3988 case 31: buf1[3] = (buf1[3] & 0x00ffffff) | tmp1 << 24;
3989 break;
3990 }
3991
3992 #endif
3993
3994 return in_len;
3995 }
3996
3997 __device__ static u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
3998 {
3999 if (p0 >= in_len) return (in_len);
4000
4001 const u32 mr = 0xff << ((p0 & 3) * 8);
4002 const u32 ml = ~mr;
4003
4004 switch (p0 / 4)
4005 {
4006 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) << 1) & mr); break;
4007 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) << 1) & mr); break;
4008 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) << 1) & mr); break;
4009 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) << 1) & mr); break;
4010 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) << 1) & mr); break;
4011 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) << 1) & mr); break;
4012 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) << 1) & mr); break;
4013 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) << 1) & mr); break;
4014 }
4015
4016 return in_len;
4017 }
4018
4019 __device__ static u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4020 {
4021 if (p0 >= in_len) return (in_len);
4022
4023 const u32 mr = 0xff << ((p0 & 3) * 8);
4024 const u32 ml = ~mr;
4025
4026 switch (p0 / 4)
4027 {
4028 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) >> 1) & mr); break;
4029 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) >> 1) & mr); break;
4030 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) >> 1) & mr); break;
4031 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) >> 1) & mr); break;
4032 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) >> 1) & mr); break;
4033 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) >> 1) & mr); break;
4034 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) >> 1) & mr); break;
4035 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) >> 1) & mr); break;
4036 }
4037
4038 return in_len;
4039 }
4040
4041 __device__ static u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4042 {
4043 if (p0 >= in_len) return (in_len);
4044
4045 const u32 mr = 0xff << ((p0 & 3) * 8);
4046 const u32 ml = ~mr;
4047
4048 const u32 n = 0x01010101 & mr;
4049
4050 switch (p0 / 4)
4051 {
4052 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) + n) & mr); break;
4053 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) + n) & mr); break;
4054 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) + n) & mr); break;
4055 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) + n) & mr); break;
4056 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) + n) & mr); break;
4057 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) + n) & mr); break;
4058 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) + n) & mr); break;
4059 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) + n) & mr); break;
4060 }
4061
4062 return in_len;
4063 }
4064
4065 __device__ static u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4066 {
4067 if (p0 >= in_len) return (in_len);
4068
4069 const u32 mr = 0xff << ((p0 & 3) * 8);
4070 const u32 ml = ~mr;
4071
4072 const u32 n = 0x01010101 & mr;
4073
4074 switch (p0 / 4)
4075 {
4076 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) - n) & mr); break;
4077 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) - n) & mr); break;
4078 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) - n) & mr); break;
4079 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) - n) & mr); break;
4080 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) - n) & mr); break;
4081 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) - n) & mr); break;
4082 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) - n) & mr); break;
4083 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) - n) & mr); break;
4084 }
4085
4086 return in_len;
4087 }
4088
4089 __device__ static u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4090 {
4091 if ((p0 + 1) >= in_len) return (in_len);
4092
4093 u32x tib40[4];
4094 u32x tib41[4];
4095
4096 lshift_block (buf0, buf1, tib40, tib41);
4097
4098 const u32 mr = 0xff << ((p0 & 3) * 8);
4099 const u32 ml = ~mr;
4100
4101 switch (p0 / 4)
4102 {
4103 case 0: buf0[0] = (buf0[0] & ml) | (tib40[0] & mr); break;
4104 case 1: buf0[1] = (buf0[1] & ml) | (tib40[1] & mr); break;
4105 case 2: buf0[2] = (buf0[2] & ml) | (tib40[2] & mr); break;
4106 case 3: buf0[3] = (buf0[3] & ml) | (tib40[3] & mr); break;
4107 case 4: buf1[0] = (buf1[0] & ml) | (tib41[0] & mr); break;
4108 case 5: buf1[1] = (buf1[1] & ml) | (tib41[1] & mr); break;
4109 case 6: buf1[2] = (buf1[2] & ml) | (tib41[2] & mr); break;
4110 case 7: buf1[3] = (buf1[3] & ml) | (tib41[3] & mr); break;
4111 }
4112
4113 return in_len;
4114 }
4115
4116 __device__ static u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4117 {
4118 if (p0 == 0) return (in_len);
4119
4120 if (p0 >= in_len) return (in_len);
4121
4122 u32x tib40[4];
4123 u32x tib41[4];
4124
4125 rshift_block (buf0, buf1, tib40, tib41);
4126
4127 const u32 mr = 0xff << ((p0 & 3) * 8);
4128 const u32 ml = ~mr;
4129
4130 switch (p0 / 4)
4131 {
4132 case 0: buf0[0] = (buf0[0] & ml) | (tib40[0] & mr); break;
4133 case 1: buf0[1] = (buf0[1] & ml) | (tib40[1] & mr); break;
4134 case 2: buf0[2] = (buf0[2] & ml) | (tib40[2] & mr); break;
4135 case 3: buf0[3] = (buf0[3] & ml) | (tib40[3] & mr); break;
4136 case 4: buf1[0] = (buf1[0] & ml) | (tib41[0] & mr); break;
4137 case 5: buf1[1] = (buf1[1] & ml) | (tib41[1] & mr); break;
4138 case 6: buf1[2] = (buf1[2] & ml) | (tib41[2] & mr); break;
4139 case 7: buf1[3] = (buf1[3] & ml) | (tib41[3] & mr); break;
4140 }
4141
4142 return in_len;
4143 }
4144
4145 __device__ static u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4146 {
4147 if (p0 > in_len) return (in_len);
4148
4149 if ((in_len + p0) >= 32) return (in_len);
4150
4151 u32 out_len = in_len;
4152
4153 u32x tib40[4];
4154 u32x tib41[4];
4155
4156 tib40[0] = buf0[0];
4157 tib40[1] = buf0[1];
4158 tib40[2] = buf0[2];
4159 tib40[3] = buf0[3];
4160 tib41[0] = buf1[0];
4161 tib41[1] = buf1[1];
4162 tib41[2] = buf1[2];
4163 tib41[3] = buf1[3];
4164
4165 truncate_right (tib40, tib41, p0);
4166
4167 rshift_block_N (buf0, buf1, buf0, buf1, p0);
4168
4169 buf0[0] |= tib40[0];
4170 buf0[1] |= tib40[1];
4171 buf0[2] |= tib40[2];
4172 buf0[3] |= tib40[3];
4173 buf1[0] |= tib41[0];
4174 buf1[1] |= tib41[1];
4175 buf1[2] |= tib41[2];
4176 buf1[3] |= tib41[3];
4177
4178 out_len += p0;
4179
4180 return out_len;
4181 }
4182
4183 __device__ static u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4184 {
4185 if (p0 > in_len) return (in_len);
4186
4187 if ((in_len + p0) >= 32) return (in_len);
4188
4189 u32 out_len = in_len;
4190
4191 u32x tib40[4];
4192 u32x tib41[4];
4193
4194 rshift_block_N (buf0, buf1, tib40, tib41, p0);
4195
4196 truncate_left (tib40, tib41, out_len);
4197
4198 buf0[0] |= tib40[0];
4199 buf0[1] |= tib40[1];
4200 buf0[2] |= tib40[2];
4201 buf0[3] |= tib40[3];
4202 buf1[0] |= tib41[0];
4203 buf1[1] |= tib41[1];
4204 buf1[2] |= tib41[2];
4205 buf1[3] |= tib41[3];
4206
4207 out_len += p0;
4208
4209 return out_len;
4210 }
4211
4212 __device__ static u32 rule_op_mangle_title (const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4213 {
4214 buf0[0] |= (generate_cmask (buf0[0]));
4215 buf0[1] |= (generate_cmask (buf0[1]));
4216 buf0[2] |= (generate_cmask (buf0[2]));
4217 buf0[3] |= (generate_cmask (buf0[3]));
4218 buf1[0] |= (generate_cmask (buf1[0]));
4219 buf1[1] |= (generate_cmask (buf1[1]));
4220 buf1[2] |= (generate_cmask (buf1[2]));
4221 buf1[3] |= (generate_cmask (buf1[3]));
4222
4223 buf0[0] &= ~(0x00000020 & generate_cmask (buf0[0]));
4224
4225
4226
4227 for (u32 i = 0; i < in_len; i++)
4228 {
4229 u32x tmp0;
4230 u32x tmp1;
4231
4232 switch (i)
4233 {
4234 case 0: tmp0 = __byte_perm (buf0[0], 0, 0x6540);
4235 tmp1 = ~(0x00002000 & generate_cmask (buf0[0])); break;
4236 case 1: tmp0 = __byte_perm (buf0[0], 0, 0x6541);
4237 tmp1 = ~(0x00200000 & generate_cmask (buf0[0])); break;
4238 case 2: tmp0 = __byte_perm (buf0[0], 0, 0x6542);
4239 tmp1 = ~(0x20000000 & generate_cmask (buf0[0])); break;
4240 case 3: tmp0 = __byte_perm (buf0[0], 0, 0x6543);
4241 tmp1 = ~(0x00000020 & generate_cmask (buf0[1])); break;
4242 case 4: tmp0 = __byte_perm (buf0[1], 0, 0x6540);
4243 tmp1 = ~(0x00002000 & generate_cmask (buf0[1])); break;
4244 case 5: tmp0 = __byte_perm (buf0[1], 0, 0x6541);
4245 tmp1 = ~(0x00200000 & generate_cmask (buf0[1])); break;
4246 case 6: tmp0 = __byte_perm (buf0[1], 0, 0x6542);
4247 tmp1 = ~(0x20000000 & generate_cmask (buf0[1])); break;
4248 case 7: tmp0 = __byte_perm (buf0[1], 0, 0x6543);
4249 tmp1 = ~(0x00000020 & generate_cmask (buf0[2])); break;
4250 case 8: tmp0 = __byte_perm (buf0[2], 0, 0x6540);
4251 tmp1 = ~(0x00002000 & generate_cmask (buf0[2])); break;
4252 case 9: tmp0 = __byte_perm (buf0[2], 0, 0x6541);
4253 tmp1 = ~(0x00200000 & generate_cmask (buf0[2])); break;
4254 case 10: tmp0 = __byte_perm (buf0[2], 0, 0x6542);
4255 tmp1 = ~(0x20000000 & generate_cmask (buf0[2])); break;
4256 case 11: tmp0 = __byte_perm (buf0[2], 0, 0x6543);
4257 tmp1 = ~(0x00000020 & generate_cmask (buf0[3])); break;
4258 case 12: tmp0 = __byte_perm (buf0[3], 0, 0x6540);
4259 tmp1 = ~(0x00002000 & generate_cmask (buf0[3])); break;
4260 case 13: tmp0 = __byte_perm (buf0[3], 0, 0x6541);
4261 tmp1 = ~(0x00200000 & generate_cmask (buf0[3])); break;
4262 case 14: tmp0 = __byte_perm (buf0[3], 0, 0x6542);
4263 tmp1 = ~(0x20000000 & generate_cmask (buf0[3])); break;
4264 case 15: tmp0 = __byte_perm (buf0[3], 0, 0x6543);
4265 tmp1 = ~(0x00000020 & generate_cmask (buf1[0])); break;
4266 case 16: tmp0 = __byte_perm (buf1[0], 0, 0x6540);
4267 tmp1 = ~(0x00002000 & generate_cmask (buf1[0])); break;
4268 case 17: tmp0 = __byte_perm (buf1[0], 0, 0x6541);
4269 tmp1 = ~(0x00200000 & generate_cmask (buf1[0])); break;
4270 case 18: tmp0 = __byte_perm (buf1[0], 0, 0x6542);
4271 tmp1 = ~(0x20000000 & generate_cmask (buf1[0])); break;
4272 case 19: tmp0 = __byte_perm (buf1[0], 0, 0x6543);
4273 tmp1 = ~(0x00000020 & generate_cmask (buf1[1])); break;
4274 case 20: tmp0 = __byte_perm (buf1[1], 0, 0x6540);
4275 tmp1 = ~(0x00002000 & generate_cmask (buf1[1])); break;
4276 case 21: tmp0 = __byte_perm (buf1[1], 0, 0x6541);
4277 tmp1 = ~(0x00200000 & generate_cmask (buf1[1])); break;
4278 case 22: tmp0 = __byte_perm (buf1[1], 0, 0x6542);
4279 tmp1 = ~(0x20000000 & generate_cmask (buf1[1])); break;
4280 case 23: tmp0 = __byte_perm (buf1[1], 0, 0x6543);
4281 tmp1 = ~(0x00000020 & generate_cmask (buf1[2])); break;
4282 case 24: tmp0 = __byte_perm (buf1[2], 0, 0x6540);
4283 tmp1 = ~(0x00002000 & generate_cmask (buf1[2])); break;
4284 case 25: tmp0 = __byte_perm (buf1[2], 0, 0x6541);
4285 tmp1 = ~(0x00200000 & generate_cmask (buf1[2])); break;
4286 case 26: tmp0 = __byte_perm (buf1[2], 0, 0x6542);
4287 tmp1 = ~(0x20000000 & generate_cmask (buf1[2])); break;
4288 case 27: tmp0 = __byte_perm (buf1[2], 0, 0x6543);
4289 tmp1 = ~(0x00000020 & generate_cmask (buf1[3])); break;
4290 case 28: tmp0 = __byte_perm (buf1[3], 0, 0x6540);
4291 tmp1 = ~(0x00002000 & generate_cmask (buf1[3])); break;
4292 case 29: tmp0 = __byte_perm (buf1[3], 0, 0x6541);
4293 tmp1 = ~(0x00200000 & generate_cmask (buf1[3])); break;
4294 case 30: tmp0 = __byte_perm (buf1[3], 0, 0x6542);
4295 tmp1 = ~(0x20000000 & generate_cmask (buf1[3])); break;
4296 }
4297
4298 #ifdef VECT_SIZE1
4299 if (i < 3)
4300 {
4301 if (tmp0 == ' ') buf0[0] &= tmp1 ;
4302 }
4303 else if (i < 7)
4304 {
4305 if (tmp0 == ' ') buf0[1] &= tmp1 ;
4306 }
4307 else if (i < 11)
4308 {
4309 if (tmp0 == ' ') buf0[2] &= tmp1 ;
4310 }
4311 else if (i < 15)
4312 {
4313 if (tmp0 == ' ') buf0[3] &= tmp1 ;
4314 }
4315 else if (i < 19)
4316 {
4317 if (tmp0 == ' ') buf1[0] &= tmp1 ;
4318 }
4319 else if (i < 23)
4320 {
4321 if (tmp0 == ' ') buf1[1] &= tmp1 ;
4322 }
4323 else if (i < 27)
4324 {
4325 if (tmp0 == ' ') buf1[2] &= tmp1 ;
4326 }
4327 else if (i < 31)
4328 {
4329 if (tmp0 == ' ') buf1[3] &= tmp1 ;
4330 }
4331 #endif
4332
4333 #ifdef VECT_SIZE2
4334 if (i < 3)
4335 {
4336 if (tmp0.x == ' ') buf0[0].x &= tmp1.x;
4337 if (tmp0.y == ' ') buf0[0].y &= tmp1.y;
4338 }
4339 else if (i < 7)
4340 {
4341 if (tmp0.x == ' ') buf0[1].x &= tmp1.x;
4342 if (tmp0.y == ' ') buf0[1].y &= tmp1.y;
4343 }
4344 else if (i < 11)
4345 {
4346 if (tmp0.x == ' ') buf0[2].x &= tmp1.x;
4347 if (tmp0.y == ' ') buf0[2].y &= tmp1.y;
4348 }
4349 else if (i < 15)
4350 {
4351 if (tmp0.x == ' ') buf0[3].x &= tmp1.x;
4352 if (tmp0.y == ' ') buf0[3].y &= tmp1.y;
4353 }
4354 else if (i < 19)
4355 {
4356 if (tmp0.x == ' ') buf1[0].x &= tmp1.x;
4357 if (tmp0.y == ' ') buf1[0].y &= tmp1.y;
4358 }
4359 else if (i < 23)
4360 {
4361 if (tmp0.x == ' ') buf1[1].x &= tmp1.x;
4362 if (tmp0.y == ' ') buf1[1].y &= tmp1.y;
4363 }
4364 else if (i < 27)
4365 {
4366 if (tmp0.x == ' ') buf1[2].x &= tmp1.x;
4367 if (tmp0.y == ' ') buf1[2].y &= tmp1.y;
4368 }
4369 else if (i < 31)
4370 {
4371 if (tmp0.x == ' ') buf1[3].x &= tmp1.x;
4372 if (tmp0.y == ' ') buf1[3].y &= tmp1.y;
4373 }
4374 #endif
4375
4376 #ifdef VECT_SIZE4
4377 if (i < 3)
4378 {
4379 if (tmp0.x == ' ') buf0[0].x &= tmp1.x;
4380 if (tmp0.y == ' ') buf0[0].y &= tmp1.y;
4381 if (tmp0.z == ' ') buf0[0].z &= tmp1.z;
4382 if (tmp0.w == ' ') buf0[0].w &= tmp1.w;
4383 }
4384 else if (i < 7)
4385 {
4386 if (tmp0.x == ' ') buf0[1].x &= tmp1.x;
4387 if (tmp0.y == ' ') buf0[1].y &= tmp1.y;
4388 if (tmp0.z == ' ') buf0[1].z &= tmp1.z;
4389 if (tmp0.w == ' ') buf0[1].w &= tmp1.w;
4390 }
4391 else if (i < 11)
4392 {
4393 if (tmp0.x == ' ') buf0[2].x &= tmp1.x;
4394 if (tmp0.y == ' ') buf0[2].y &= tmp1.y;
4395 if (tmp0.z == ' ') buf0[2].z &= tmp1.z;
4396 if (tmp0.w == ' ') buf0[2].w &= tmp1.w;
4397 }
4398 else if (i < 15)
4399 {
4400 if (tmp0.x == ' ') buf0[3].x &= tmp1.x;
4401 if (tmp0.y == ' ') buf0[3].y &= tmp1.y;
4402 if (tmp0.z == ' ') buf0[3].z &= tmp1.z;
4403 if (tmp0.w == ' ') buf0[3].w &= tmp1.w;
4404 }
4405 else if (i < 19)
4406 {
4407 if (tmp0.x == ' ') buf1[0].x &= tmp1.x;
4408 if (tmp0.y == ' ') buf1[0].y &= tmp1.y;
4409 if (tmp0.z == ' ') buf1[0].z &= tmp1.z;
4410 if (tmp0.w == ' ') buf1[0].w &= tmp1.w;
4411 }
4412 else if (i < 23)
4413 {
4414 if (tmp0.x == ' ') buf1[1].x &= tmp1.x;
4415 if (tmp0.y == ' ') buf1[1].y &= tmp1.y;
4416 if (tmp0.z == ' ') buf1[1].z &= tmp1.z;
4417 if (tmp0.w == ' ') buf1[1].w &= tmp1.w;
4418 }
4419 else if (i < 27)
4420 {
4421 if (tmp0.x == ' ') buf1[2].x &= tmp1.x;
4422 if (tmp0.y == ' ') buf1[2].y &= tmp1.y;
4423 if (tmp0.z == ' ') buf1[2].z &= tmp1.z;
4424 if (tmp0.w == ' ') buf1[2].w &= tmp1.w;
4425 }
4426 else if (i < 31)
4427 {
4428 if (tmp0.x == ' ') buf1[3].x &= tmp1.x;
4429 if (tmp0.y == ' ') buf1[3].y &= tmp1.y;
4430 if (tmp0.z == ' ') buf1[3].z &= tmp1.z;
4431 if (tmp0.w == ' ') buf1[3].w &= tmp1.w;
4432 }
4433 #endif
4434 }
4435
4436
4437
4438 return in_len;
4439 }
4440
4441 __device__ static u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32x buf0[4], u32x buf1[4], const u32 in_len)
4442 {
4443 u32 out_len = in_len;
4444
4445 switch (name)
4446 {
4447 case RULE_OP_MANGLE_LREST: out_len = rule_op_mangle_lrest (p0, p1, buf0, buf1, out_len); break;
4448 case RULE_OP_MANGLE_UREST: out_len = rule_op_mangle_urest (p0, p1, buf0, buf1, out_len); break;
4449 case RULE_OP_MANGLE_LREST_UFIRST: out_len = rule_op_mangle_lrest_ufirst (p0, p1, buf0, buf1, out_len); break;
4450 case RULE_OP_MANGLE_UREST_LFIRST: out_len = rule_op_mangle_urest_lfirst (p0, p1, buf0, buf1, out_len); break;
4451 case RULE_OP_MANGLE_TREST: out_len = rule_op_mangle_trest (p0, p1, buf0, buf1, out_len); break;
4452 case RULE_OP_MANGLE_TOGGLE_AT: out_len = rule_op_mangle_toggle_at (p0, p1, buf0, buf1, out_len); break;
4453 case RULE_OP_MANGLE_REVERSE: out_len = rule_op_mangle_reverse (p0, p1, buf0, buf1, out_len); break;
4454 case RULE_OP_MANGLE_DUPEWORD: out_len = rule_op_mangle_dupeword (p0, p1, buf0, buf1, out_len); break;
4455 case RULE_OP_MANGLE_DUPEWORD_TIMES: out_len = rule_op_mangle_dupeword_times (p0, p1, buf0, buf1, out_len); break;
4456 case RULE_OP_MANGLE_REFLECT: out_len = rule_op_mangle_reflect (p0, p1, buf0, buf1, out_len); break;
4457 case RULE_OP_MANGLE_APPEND: out_len = rule_op_mangle_append (p0, p1, buf0, buf1, out_len); break;
4458 case RULE_OP_MANGLE_PREPEND: out_len = rule_op_mangle_prepend (p0, p1, buf0, buf1, out_len); break;
4459 case RULE_OP_MANGLE_ROTATE_LEFT: out_len = rule_op_mangle_rotate_left (p0, p1, buf0, buf1, out_len); break;
4460 case RULE_OP_MANGLE_ROTATE_RIGHT: out_len = rule_op_mangle_rotate_right (p0, p1, buf0, buf1, out_len); break;
4461 case RULE_OP_MANGLE_DELETE_FIRST: out_len = rule_op_mangle_delete_first (p0, p1, buf0, buf1, out_len); break;
4462 case RULE_OP_MANGLE_DELETE_LAST: out_len = rule_op_mangle_delete_last (p0, p1, buf0, buf1, out_len); break;
4463 case RULE_OP_MANGLE_DELETE_AT: out_len = rule_op_mangle_delete_at (p0, p1, buf0, buf1, out_len); break;
4464 case RULE_OP_MANGLE_EXTRACT: out_len = rule_op_mangle_extract (p0, p1, buf0, buf1, out_len); break;
4465 case RULE_OP_MANGLE_OMIT: out_len = rule_op_mangle_omit (p0, p1, buf0, buf1, out_len); break;
4466 case RULE_OP_MANGLE_INSERT: out_len = rule_op_mangle_insert (p0, p1, buf0, buf1, out_len); break;
4467 case RULE_OP_MANGLE_OVERSTRIKE: out_len = rule_op_mangle_overstrike (p0, p1, buf0, buf1, out_len); break;
4468 case RULE_OP_MANGLE_TRUNCATE_AT: out_len = rule_op_mangle_truncate_at (p0, p1, buf0, buf1, out_len); break;
4469 case RULE_OP_MANGLE_REPLACE: out_len = rule_op_mangle_replace (p0, p1, buf0, buf1, out_len); break;
4470 //case RULE_OP_MANGLE_PURGECHAR: out_len = rule_op_mangle_purgechar (p0, p1, buf0, buf1, out_len); break;
4471 //case RULE_OP_MANGLE_TOGGLECASE_REC: out_len = rule_op_mangle_togglecase_rec (p0, p1, buf0, buf1, out_len); break;
4472 case RULE_OP_MANGLE_DUPECHAR_FIRST: out_len = rule_op_mangle_dupechar_first (p0, p1, buf0, buf1, out_len); break;
4473 case RULE_OP_MANGLE_DUPECHAR_LAST: out_len = rule_op_mangle_dupechar_last (p0, p1, buf0, buf1, out_len); break;
4474 case RULE_OP_MANGLE_DUPECHAR_ALL: out_len = rule_op_mangle_dupechar_all (p0, p1, buf0, buf1, out_len); break;
4475 case RULE_OP_MANGLE_SWITCH_FIRST: out_len = rule_op_mangle_switch_first (p0, p1, buf0, buf1, out_len); break;
4476 case RULE_OP_MANGLE_SWITCH_LAST: out_len = rule_op_mangle_switch_last (p0, p1, buf0, buf1, out_len); break;
4477 case RULE_OP_MANGLE_SWITCH_AT: out_len = rule_op_mangle_switch_at (p0, p1, buf0, buf1, out_len); break;
4478 case RULE_OP_MANGLE_CHR_SHIFTL: out_len = rule_op_mangle_chr_shiftl (p0, p1, buf0, buf1, out_len); break;
4479 case RULE_OP_MANGLE_CHR_SHIFTR: out_len = rule_op_mangle_chr_shiftr (p0, p1, buf0, buf1, out_len); break;
4480 case RULE_OP_MANGLE_CHR_INCR: out_len = rule_op_mangle_chr_incr (p0, p1, buf0, buf1, out_len); break;
4481 case RULE_OP_MANGLE_CHR_DECR: out_len = rule_op_mangle_chr_decr (p0, p1, buf0, buf1, out_len); break;
4482 case RULE_OP_MANGLE_REPLACE_NP1: out_len = rule_op_mangle_replace_np1 (p0, p1, buf0, buf1, out_len); break;
4483 case RULE_OP_MANGLE_REPLACE_NM1: out_len = rule_op_mangle_replace_nm1 (p0, p1, buf0, buf1, out_len); break;
4484 case RULE_OP_MANGLE_DUPEBLOCK_FIRST: out_len = rule_op_mangle_dupeblock_first (p0, p1, buf0, buf1, out_len); break;
4485 case RULE_OP_MANGLE_DUPEBLOCK_LAST: out_len = rule_op_mangle_dupeblock_last (p0, p1, buf0, buf1, out_len); break;
4486 case RULE_OP_MANGLE_TITLE: out_len = rule_op_mangle_title (p0, p1, buf0, buf1, out_len); break;
4487 }
4488
4489 return out_len;
4490 }
4491
4492 __device__ static u32 apply_rules (u32 *cmds, u32x buf0[4], u32x buf1[4], const u32 len)
4493 {
4494 u32 out_len = len;
4495
4496 for (u32 i = 0; cmds[i] != 0; i++)
4497 {
4498 const u32 cmd = cmds[i];
4499
4500 const u32 name = (cmd >> 0) & 0xff;
4501 const u32 p0 = (cmd >> 8) & 0xff;
4502 const u32 p1 = (cmd >> 16) & 0xff;
4503
4504 out_len = apply_rule (name, p0, p1, buf0, buf1, out_len);
4505 }
4506
4507 return out_len;
4508 }