2 * Author......: Jens Steube <jens.steube@gmail.com>
6 __device__
static u32x
generate_cmask (u32x buf
)
8 const u32x rmask
= ((buf
& 0x40404040) >> 1)
9 & ~((buf
& 0x80808080) >> 2);
11 const u32x hmask
= (buf
& 0x1f1f1f1f) + 0x05050505;
12 const u32x lmask
= (buf
& 0x1f1f1f1f) + 0x1f1f1f1f;
14 return rmask
& ~hmask
& lmask
;
17 __device__
static void truncate_right (u32x w0
[4], u32x w1
[4], const u32 len
)
19 const u32 tmp
= (1 << ((len
% 4) * 8)) - 1;
70 __device__
static void truncate_left (u32x w0
[4], u32x w1
[4], const u32 len
)
72 const u32 tmp
= ~((1 << ((len
% 4) * 8)) - 1);
123 __device__
static void lshift_block (const u32x in0
[4], const u32x in1
[4], u32x out0
[4], u32x out1
[4])
125 #if __CUDA_ARCH__ >= 200
127 out0
[0] = __byte_perm (in0
[0], in0
[1], 0x4321);
128 out0
[1] = __byte_perm (in0
[1], in0
[2], 0x4321);
129 out0
[2] = __byte_perm (in0
[2], in0
[3], 0x4321);
130 out0
[3] = __byte_perm (in0
[3], in1
[0], 0x4321);
131 out1
[0] = __byte_perm (in1
[0], in1
[1], 0x4321);
132 out1
[1] = __byte_perm (in1
[1], in1
[2], 0x4321);
133 out1
[2] = __byte_perm (in1
[2], in1
[3], 0x4321);
134 out1
[3] = __byte_perm (in1
[3], 0, 0x4321);
138 out0
[0] = in0
[0] >> 8 | in0
[1] << 24;
139 out0
[1] = in0
[1] >> 8 | in0
[2] << 24;
140 out0
[2] = in0
[2] >> 8 | in0
[3] << 24;
141 out0
[3] = in0
[3] >> 8 | in1
[0] << 24;
142 out1
[0] = in1
[0] >> 8 | in1
[1] << 24;
143 out1
[1] = in1
[1] >> 8 | in1
[2] << 24;
144 out1
[2] = in1
[2] >> 8 | in1
[3] << 24;
145 out1
[3] = in1
[3] >> 8;
150 __device__
static void rshift_block (const u32x in0
[4], const u32x in1
[4], u32x out0
[4], u32x out1
[4])
152 #if __CUDA_ARCH__ >= 200
154 out1
[3] = __byte_perm (in1
[2], in1
[3], 0x6543);
155 out1
[2] = __byte_perm (in1
[1], in1
[2], 0x6543);
156 out1
[1] = __byte_perm (in1
[0], in1
[1], 0x6543);
157 out1
[0] = __byte_perm (in0
[3], in1
[0], 0x6543);
158 out0
[3] = __byte_perm (in0
[2], in0
[3], 0x6543);
159 out0
[2] = __byte_perm (in0
[1], in0
[2], 0x6543);
160 out0
[1] = __byte_perm (in0
[0], in0
[1], 0x6543);
161 out0
[0] = __byte_perm ( 0, in0
[0], 0x6543);
165 out1
[3] = in1
[3] << 8 | in1
[2] >> 24;
166 out1
[2] = in1
[2] << 8 | in1
[1] >> 24;
167 out1
[1] = in1
[1] << 8 | in1
[0] >> 24;
168 out1
[0] = in1
[0] << 8 | in0
[3] >> 24;
169 out0
[3] = in0
[3] << 8 | in0
[2] >> 24;
170 out0
[2] = in0
[2] << 8 | in0
[1] >> 24;
171 out0
[1] = in0
[1] << 8 | in0
[0] >> 24;
172 out0
[0] = in0
[0] << 8;
177 __device__
static void rshift_block_N (const u32x in0
[4], const u32x in1
[4], u32x out0
[4], u32x out1
[4], const u32 num
)
179 #if __CUDA_ARCH__ >= 200
183 case 0: out1
[3] = in1
[3];
192 case 1: out1
[3] = __byte_perm (in1
[2], in1
[3], 0x6543);
193 out1
[2] = __byte_perm (in1
[1], in1
[2], 0x6543);
194 out1
[1] = __byte_perm (in1
[0], in1
[1], 0x6543);
195 out1
[0] = __byte_perm (in0
[3], in1
[0], 0x6543);
196 out0
[3] = __byte_perm (in0
[2], in0
[3], 0x6543);
197 out0
[2] = __byte_perm (in0
[1], in0
[2], 0x6543);
198 out0
[1] = __byte_perm (in0
[0], in0
[1], 0x6543);
199 out0
[0] = __byte_perm ( 0, in0
[0], 0x6543);
201 case 2: out1
[3] = __byte_perm (in1
[2], in1
[3], 0x5432);
202 out1
[2] = __byte_perm (in1
[1], in1
[2], 0x5432);
203 out1
[1] = __byte_perm (in1
[0], in1
[1], 0x5432);
204 out1
[0] = __byte_perm (in0
[3], in1
[0], 0x5432);
205 out0
[3] = __byte_perm (in0
[2], in0
[3], 0x5432);
206 out0
[2] = __byte_perm (in0
[1], in0
[2], 0x5432);
207 out0
[1] = __byte_perm (in0
[0], in0
[1], 0x5432);
208 out0
[0] = __byte_perm ( 0, in0
[0], 0x5432);
210 case 3: out1
[3] = __byte_perm (in1
[2], in1
[3], 0x4321);
211 out1
[2] = __byte_perm (in1
[1], in1
[2], 0x4321);
212 out1
[1] = __byte_perm (in1
[0], in1
[1], 0x4321);
213 out1
[0] = __byte_perm (in0
[3], in1
[0], 0x4321);
214 out0
[3] = __byte_perm (in0
[2], in0
[3], 0x4321);
215 out0
[2] = __byte_perm (in0
[1], in0
[2], 0x4321);
216 out0
[1] = __byte_perm (in0
[0], in0
[1], 0x4321);
217 out0
[0] = __byte_perm ( 0, in0
[0], 0x4321);
219 case 4: out1
[3] = in1
[2];
228 case 5: out1
[3] = __byte_perm (in1
[1], in1
[2], 0x6543);
229 out1
[2] = __byte_perm (in1
[0], in1
[1], 0x6543);
230 out1
[1] = __byte_perm (in0
[3], in1
[0], 0x6543);
231 out1
[0] = __byte_perm (in0
[2], in0
[3], 0x6543);
232 out0
[3] = __byte_perm (in0
[1], in0
[2], 0x6543);
233 out0
[2] = __byte_perm (in0
[0], in0
[1], 0x6543);
234 out0
[1] = __byte_perm ( 0, in0
[0], 0x6543);
237 case 6: out1
[3] = __byte_perm (in1
[1], in1
[2], 0x5432);
238 out1
[2] = __byte_perm (in1
[0], in1
[1], 0x5432);
239 out1
[1] = __byte_perm (in0
[3], in1
[0], 0x5432);
240 out1
[0] = __byte_perm (in0
[2], in0
[3], 0x5432);
241 out0
[3] = __byte_perm (in0
[1], in0
[2], 0x5432);
242 out0
[2] = __byte_perm (in0
[0], in0
[1], 0x5432);
243 out0
[1] = __byte_perm ( 0, in0
[0], 0x5432);
246 case 7: out1
[3] = __byte_perm (in1
[1], in1
[2], 0x4321);
247 out1
[2] = __byte_perm (in1
[0], in1
[1], 0x4321);
248 out1
[1] = __byte_perm (in0
[3], in1
[0], 0x4321);
249 out1
[0] = __byte_perm (in0
[2], in0
[3], 0x4321);
250 out0
[3] = __byte_perm (in0
[1], in0
[2], 0x4321);
251 out0
[2] = __byte_perm (in0
[0], in0
[1], 0x4321);
252 out0
[1] = __byte_perm ( 0, in0
[0], 0x4321);
255 case 8: out1
[3] = in1
[1];
264 case 9: out1
[3] = __byte_perm (in1
[0], in1
[1], 0x6543);
265 out1
[2] = __byte_perm (in0
[3], in1
[0], 0x6543);
266 out1
[1] = __byte_perm (in0
[2], in0
[3], 0x6543);
267 out1
[0] = __byte_perm (in0
[1], in0
[2], 0x6543);
268 out0
[3] = __byte_perm (in0
[0], in0
[1], 0x6543);
269 out0
[2] = __byte_perm ( 0, in0
[0], 0x6543);
273 case 10: out1
[3] = __byte_perm (in1
[0], in1
[1], 0x5432);
274 out1
[2] = __byte_perm (in0
[3], in1
[0], 0x5432);
275 out1
[1] = __byte_perm (in0
[2], in0
[3], 0x5432);
276 out1
[0] = __byte_perm (in0
[1], in0
[2], 0x5432);
277 out0
[3] = __byte_perm (in0
[0], in0
[1], 0x5432);
278 out0
[2] = __byte_perm ( 0, in0
[0], 0x5432);
282 case 11: out1
[3] = __byte_perm (in1
[0], in1
[1], 0x4321);
283 out1
[2] = __byte_perm (in0
[3], in1
[0], 0x4321);
284 out1
[1] = __byte_perm (in0
[2], in0
[3], 0x4321);
285 out1
[0] = __byte_perm (in0
[1], in0
[2], 0x4321);
286 out0
[3] = __byte_perm (in0
[0], in0
[1], 0x4321);
287 out0
[2] = __byte_perm ( 0, in0
[0], 0x4321);
291 case 12: out1
[3] = in1
[0];
300 case 13: out1
[3] = __byte_perm (in0
[3], in1
[0], 0x6543);
301 out1
[2] = __byte_perm (in0
[2], in0
[3], 0x6543);
302 out1
[1] = __byte_perm (in0
[1], in0
[2], 0x6543);
303 out1
[0] = __byte_perm (in0
[0], in0
[1], 0x6543);
304 out0
[3] = __byte_perm ( 0, in0
[0], 0x6543);
309 case 14: out1
[3] = __byte_perm (in0
[3], in1
[0], 0x5432);
310 out1
[2] = __byte_perm (in0
[2], in0
[3], 0x5432);
311 out1
[1] = __byte_perm (in0
[1], in0
[2], 0x5432);
312 out1
[0] = __byte_perm (in0
[0], in0
[1], 0x5432);
313 out0
[3] = __byte_perm ( 0, in0
[0], 0x5432);
318 case 15: out1
[3] = __byte_perm (in0
[3], in1
[0], 0x4321);
319 out1
[2] = __byte_perm (in0
[2], in0
[3], 0x4321);
320 out1
[1] = __byte_perm (in0
[1], in0
[2], 0x4321);
321 out1
[0] = __byte_perm (in0
[0], in0
[1], 0x4321);
322 out0
[3] = __byte_perm ( 0, in0
[0], 0x4321);
327 case 16: out1
[3] = in0
[3];
336 case 17: out1
[3] = __byte_perm (in0
[2], in0
[3], 0x6543);
337 out1
[2] = __byte_perm (in0
[1], in0
[2], 0x6543);
338 out1
[1] = __byte_perm (in0
[0], in0
[1], 0x6543);
339 out1
[0] = __byte_perm ( 0, in0
[0], 0x6543);
345 case 18: out1
[3] = __byte_perm (in0
[2], in0
[3], 0x5432);
346 out1
[2] = __byte_perm (in0
[1], in0
[2], 0x5432);
347 out1
[1] = __byte_perm (in0
[0], in0
[1], 0x5432);
348 out1
[0] = __byte_perm ( 0, in0
[0], 0x5432);
354 case 19: out1
[3] = __byte_perm (in0
[2], in0
[3], 0x4321);
355 out1
[2] = __byte_perm (in0
[1], in0
[2], 0x4321);
356 out1
[1] = __byte_perm (in0
[0], in0
[1], 0x4321);
357 out1
[0] = __byte_perm ( 0, in0
[0], 0x4321);
363 case 20: out1
[3] = in0
[2];
372 case 21: out1
[3] = __byte_perm (in0
[1], in0
[2], 0x6543);
373 out1
[2] = __byte_perm (in0
[0], in0
[1], 0x6543);
374 out1
[1] = __byte_perm ( 0, in0
[0], 0x6543);
381 case 22: out1
[3] = __byte_perm (in0
[1], in0
[2], 0x5432);
382 out1
[2] = __byte_perm (in0
[0], in0
[1], 0x5432);
383 out1
[1] = __byte_perm ( 0, in0
[0], 0x5432);
390 case 23: out1
[3] = __byte_perm (in0
[1], in0
[2], 0x4321);
391 out1
[2] = __byte_perm (in0
[0], in0
[1], 0x4321);
392 out1
[1] = __byte_perm ( 0, in0
[0], 0x4321);
399 case 24: out1
[3] = in0
[1];
408 case 25: out1
[3] = __byte_perm (in0
[0], in0
[1], 0x6543);
409 out1
[2] = __byte_perm ( 0, in0
[0], 0x6543);
417 case 26: out1
[3] = __byte_perm (in0
[0], in0
[1], 0x5432);
418 out1
[2] = __byte_perm ( 0, in0
[0], 0x5432);
426 case 27: out1
[3] = __byte_perm (in0
[0], in0
[1], 0x4321);
427 out1
[2] = __byte_perm ( 0, in0
[0], 0x4321);
435 case 28: out1
[3] = in0
[0];
444 case 29: out1
[3] = __byte_perm ( 0, in0
[0], 0x6543);
453 case 30: out1
[3] = __byte_perm ( 0, in0
[0], 0x5432);
462 case 31: out1
[3] = __byte_perm ( 0, in0
[0], 0x4321);
477 case 0: out1
[3] = in1
[3];
486 case 1: out1
[3] = in1
[3] << 8 | in1
[2] >> 24;
487 out1
[2] = in1
[2] << 8 | in1
[1] >> 24;
488 out1
[1] = in1
[1] << 8 | in1
[0] >> 24;
489 out1
[0] = in1
[0] << 8 | in0
[3] >> 24;
490 out0
[3] = in0
[3] << 8 | in0
[2] >> 24;
491 out0
[2] = in0
[2] << 8 | in0
[1] >> 24;
492 out0
[1] = in0
[1] << 8 | in0
[0] >> 24;
493 out0
[0] = in0
[0] << 8;
495 case 2: out1
[3] = in1
[3] << 16 | in1
[2] >> 16;
496 out1
[2] = in1
[2] << 16 | in1
[1] >> 16;
497 out1
[1] = in1
[1] << 16 | in1
[0] >> 16;
498 out1
[0] = in1
[0] << 16 | in0
[3] >> 16;
499 out0
[3] = in0
[3] << 16 | in0
[2] >> 16;
500 out0
[2] = in0
[2] << 16 | in0
[1] >> 16;
501 out0
[1] = in0
[1] << 16 | in0
[0] >> 16;
502 out0
[0] = in0
[0] << 16;
504 case 3: out1
[3] = in1
[3] << 24 | in1
[2] >> 8;
505 out1
[2] = in1
[2] << 24 | in1
[1] >> 8;
506 out1
[1] = in1
[1] << 24 | in1
[0] >> 8;
507 out1
[0] = in1
[0] << 24 | in0
[3] >> 8;
508 out0
[3] = in0
[3] << 24 | in0
[2] >> 8;
509 out0
[2] = in0
[2] << 24 | in0
[1] >> 8;
510 out0
[1] = in0
[1] << 24 | in0
[0] >> 8;
511 out0
[0] = in0
[0] << 24;
513 case 4: out1
[3] = in1
[2];
522 case 5: out1
[3] = in1
[2] << 8 | in1
[1] >> 24;
523 out1
[2] = in1
[1] << 8 | in1
[0] >> 24;
524 out1
[1] = in1
[0] << 8 | in0
[3] >> 24;
525 out1
[0] = in0
[3] << 8 | in0
[2] >> 24;
526 out0
[3] = in0
[2] << 8 | in0
[1] >> 24;
527 out0
[2] = in0
[1] << 8 | in0
[0] >> 24;
528 out0
[1] = in0
[0] << 8;
531 case 6: out1
[3] = in1
[2] << 16 | in1
[1] >> 16;
532 out1
[2] = in1
[1] << 16 | in1
[0] >> 16;
533 out1
[1] = in1
[0] << 16 | in0
[3] >> 16;
534 out1
[0] = in0
[3] << 16 | in0
[2] >> 16;
535 out0
[3] = in0
[2] << 16 | in0
[1] >> 16;
536 out0
[2] = in0
[1] << 16 | in0
[0] >> 16;
537 out0
[1] = in0
[0] << 16;
540 case 7: out1
[3] = in1
[2] << 24 | in1
[1] >> 8;
541 out1
[2] = in1
[1] << 24 | in1
[0] >> 8;
542 out1
[1] = in1
[0] << 24 | in0
[3] >> 8;
543 out1
[0] = in0
[3] << 24 | in0
[2] >> 8;
544 out0
[3] = in0
[2] << 24 | in0
[1] >> 8;
545 out0
[2] = in0
[1] << 24 | in0
[0] >> 8;
546 out0
[1] = in0
[0] << 24;
549 case 8: out1
[3] = in1
[1];
558 case 9: out1
[3] = in1
[1] << 8 | in1
[0] >> 24;
559 out1
[2] = in1
[0] << 8 | in0
[3] >> 24;
560 out1
[1] = in0
[3] << 8 | in0
[2] >> 24;
561 out1
[0] = in0
[2] << 8 | in0
[1] >> 24;
562 out0
[3] = in0
[1] << 8 | in0
[0] >> 24;
563 out0
[2] = in0
[0] << 8;
567 case 10: out1
[3] = in1
[1] << 16 | in1
[0] >> 16;
568 out1
[2] = in1
[0] << 16 | in0
[3] >> 16;
569 out1
[1] = in0
[3] << 16 | in0
[2] >> 16;
570 out1
[0] = in0
[2] << 16 | in0
[1] >> 16;
571 out0
[3] = in0
[1] << 16 | in0
[0] >> 16;
572 out0
[2] = in0
[0] << 16;
576 case 11: out1
[3] = in1
[1] << 24 | in1
[0] >> 8;
577 out1
[2] = in1
[0] << 24 | in0
[3] >> 8;
578 out1
[1] = in0
[3] << 24 | in0
[2] >> 8;
579 out1
[0] = in0
[2] << 24 | in0
[1] >> 8;
580 out0
[3] = in0
[1] << 24 | in0
[0] >> 8;
581 out0
[2] = in0
[0] << 24;
585 case 12: out1
[3] = in1
[0];
594 case 13: out1
[3] = in1
[0] << 8 | in0
[3] >> 24;
595 out1
[2] = in0
[3] << 8 | in0
[2] >> 24;
596 out1
[1] = in0
[2] << 8 | in0
[1] >> 24;
597 out1
[0] = in0
[1] << 8 | in0
[0] >> 24;
598 out0
[3] = in0
[0] << 8;
603 case 14: out1
[3] = in1
[0] << 16 | in0
[3] >> 16;
604 out1
[2] = in0
[3] << 16 | in0
[2] >> 16;
605 out1
[1] = in0
[2] << 16 | in0
[1] >> 16;
606 out1
[0] = in0
[1] << 16 | in0
[0] >> 16;
607 out0
[3] = in0
[0] << 16;
612 case 15: out1
[3] = in1
[0] << 24 | in0
[3] >> 8;
613 out1
[2] = in0
[3] << 24 | in0
[2] >> 8;
614 out1
[1] = in0
[2] << 24 | in0
[1] >> 8;
615 out1
[0] = in0
[1] << 24 | in0
[0] >> 8;
616 out0
[3] = in0
[0] << 24;
621 case 16: out1
[3] = in0
[3];
630 case 17: out1
[3] = in0
[3] << 8 | in0
[2] >> 24;
631 out1
[2] = in0
[2] << 8 | in0
[1] >> 24;
632 out1
[1] = in0
[1] << 8 | in0
[0] >> 24;
633 out1
[0] = in0
[0] << 8;
639 case 18: out1
[3] = in0
[3] << 16 | in0
[2] >> 16;
640 out1
[2] = in0
[2] << 16 | in0
[1] >> 16;
641 out1
[1] = in0
[1] << 16 | in0
[0] >> 16;
642 out1
[0] = in0
[0] << 16;
648 case 19: out1
[3] = in0
[3] << 24 | in0
[2] >> 8;
649 out1
[2] = in0
[2] << 24 | in0
[1] >> 8;
650 out1
[1] = in0
[1] << 24 | in0
[0] >> 8;
651 out1
[0] = in0
[0] << 24;
657 case 20: out1
[3] = in0
[2];
666 case 21: out1
[3] = in0
[2] << 8 | in0
[1] >> 24;
667 out1
[2] = in0
[1] << 8 | in0
[0] >> 24;
668 out1
[1] = in0
[0] << 8;
675 case 22: out1
[3] = in0
[2] << 16 | in0
[1] >> 16;
676 out1
[2] = in0
[1] << 16 | in0
[0] >> 16;
677 out1
[1] = in0
[0] << 16;
684 case 23: out1
[3] = in0
[2] << 24 | in0
[1] >> 8;
685 out1
[2] = in0
[1] << 24 | in0
[0] >> 8;
686 out1
[1] = in0
[0] << 24;
693 case 24: out1
[3] = in0
[1];
702 case 25: out1
[3] = in0
[1] << 8 | in0
[0] >> 24;
703 out1
[2] = in0
[0] << 8;
711 case 26: out1
[3] = in0
[1] << 16 | in0
[0] >> 16;
712 out1
[2] = in0
[0] << 16;
720 case 27: out1
[3] = in0
[1] << 24 | in0
[0] >> 8;
721 out1
[2] = in0
[0] << 24;
729 case 28: out1
[3] = in0
[0];
738 case 29: out1
[3] = in0
[0] << 8;
747 case 30: out1
[3] = in0
[0] << 16;
756 case 31: out1
[3] = in0
[0] << 24;
770 __device__
static void lshift_block_N (const u32x in0
[4], const u32x in1
[4], u32x out0
[4], u32x out1
[4], const u32 num
)
772 #if __CUDA_ARCH__ >= 200
776 case 0: out0
[0] = in0
[0];
785 case 1: out0
[0] = __byte_perm (in0
[0], in0
[1], 0x4321);
786 out0
[1] = __byte_perm (in0
[1], in0
[2], 0x4321);
787 out0
[2] = __byte_perm (in0
[2], in0
[3], 0x4321);
788 out0
[3] = __byte_perm (in0
[3], in1
[0], 0x4321);
789 out1
[0] = __byte_perm (in1
[0], in1
[1], 0x4321);
790 out1
[1] = __byte_perm (in1
[1], in1
[2], 0x4321);
791 out1
[2] = __byte_perm (in1
[2], in1
[3], 0x4321);
792 out1
[3] = __byte_perm (in1
[3], 0, 0x4321);
794 case 2: out0
[0] = __byte_perm (in0
[0], in0
[1], 0x5432);
795 out0
[1] = __byte_perm (in0
[1], in0
[2], 0x5432);
796 out0
[2] = __byte_perm (in0
[2], in0
[3], 0x5432);
797 out0
[3] = __byte_perm (in0
[3], in1
[0], 0x5432);
798 out1
[0] = __byte_perm (in1
[0], in1
[1], 0x5432);
799 out1
[1] = __byte_perm (in1
[1], in1
[2], 0x5432);
800 out1
[2] = __byte_perm (in1
[2], in1
[3], 0x5432);
801 out1
[3] = __byte_perm (in1
[3], 0, 0x5432);
803 case 3: out0
[0] = __byte_perm (in0
[0], in0
[1], 0x6543);
804 out0
[1] = __byte_perm (in0
[1], in0
[2], 0x6543);
805 out0
[2] = __byte_perm (in0
[2], in0
[3], 0x6543);
806 out0
[3] = __byte_perm (in0
[3], in1
[0], 0x6543);
807 out1
[0] = __byte_perm (in1
[0], in1
[1], 0x6543);
808 out1
[1] = __byte_perm (in1
[1], in1
[2], 0x6543);
809 out1
[2] = __byte_perm (in1
[2], in1
[3], 0x6543);
810 out1
[3] = __byte_perm (in1
[3], 0, 0x6543);
812 case 4: out0
[0] = in0
[1];
821 case 5: out0
[0] = __byte_perm (in0
[1], in0
[2], 0x4321);
822 out0
[1] = __byte_perm (in0
[2], in0
[3], 0x4321);
823 out0
[2] = __byte_perm (in0
[3], in1
[0], 0x4321);
824 out0
[3] = __byte_perm (in1
[0], in1
[1], 0x4321);
825 out1
[0] = __byte_perm (in1
[1], in1
[2], 0x4321);
826 out1
[1] = __byte_perm (in1
[2], in1
[3], 0x4321);
827 out1
[2] = __byte_perm (in1
[3], 0, 0x4321);
830 case 6: out0
[0] = __byte_perm (in0
[1], in0
[2], 0x5432);
831 out0
[1] = __byte_perm (in0
[2], in0
[3], 0x5432);
832 out0
[2] = __byte_perm (in0
[3], in1
[0], 0x5432);
833 out0
[3] = __byte_perm (in1
[0], in1
[1], 0x5432);
834 out1
[0] = __byte_perm (in1
[1], in1
[2], 0x5432);
835 out1
[1] = __byte_perm (in1
[2], in1
[3], 0x5432);
836 out1
[2] = __byte_perm (in1
[3], 0, 0x5432);
839 case 7: out0
[0] = __byte_perm (in0
[1], in0
[2], 0x6543);
840 out0
[1] = __byte_perm (in0
[2], in0
[3], 0x6543);
841 out0
[2] = __byte_perm (in0
[3], in1
[0], 0x6543);
842 out0
[3] = __byte_perm (in1
[0], in1
[1], 0x6543);
843 out1
[0] = __byte_perm (in1
[1], in1
[2], 0x6543);
844 out1
[1] = __byte_perm (in1
[2], in1
[3], 0x6543);
845 out1
[2] = __byte_perm (in1
[3], 0, 0x6543);
848 case 8: out0
[0] = in0
[2];
857 case 9: out0
[0] = __byte_perm (in0
[2], in0
[3], 0x4321);
858 out0
[1] = __byte_perm (in0
[3], in1
[0], 0x4321);
859 out0
[2] = __byte_perm (in1
[0], in1
[1], 0x4321);
860 out0
[3] = __byte_perm (in1
[1], in1
[2], 0x4321);
861 out1
[0] = __byte_perm (in1
[2], in1
[3], 0x4321);
862 out1
[1] = __byte_perm (in1
[3], 0, 0x4321);
866 case 10: out0
[0] = __byte_perm (in0
[2], in0
[3], 0x5432);
867 out0
[1] = __byte_perm (in0
[3], in1
[0], 0x5432);
868 out0
[2] = __byte_perm (in1
[0], in1
[1], 0x5432);
869 out0
[3] = __byte_perm (in1
[1], in1
[2], 0x5432);
870 out1
[0] = __byte_perm (in1
[2], in1
[3], 0x5432);
871 out1
[1] = __byte_perm (in1
[3], 0, 0x5432);
875 case 11: out0
[0] = __byte_perm (in0
[2], in0
[3], 0x6543);
876 out0
[1] = __byte_perm (in0
[3], in1
[0], 0x6543);
877 out0
[2] = __byte_perm (in1
[0], in1
[1], 0x6543);
878 out0
[3] = __byte_perm (in1
[1], in1
[2], 0x6543);
879 out1
[0] = __byte_perm (in1
[2], in1
[3], 0x6543);
880 out1
[1] = __byte_perm (in1
[3], 0, 0x6543);
884 case 12: out0
[0] = in0
[3];
894 out0
[0] = __byte_perm (in0
[3], in1
[0], 0x4321);
895 out0
[1] = __byte_perm (in1
[0], in1
[1], 0x4321);
896 out0
[2] = __byte_perm (in1
[1], in1
[2], 0x4321);
897 out0
[3] = __byte_perm (in1
[2], in1
[3], 0x4321);
898 out1
[0] = __byte_perm (in1
[3], 0, 0x4321);
903 case 14: out0
[0] = __byte_perm (in0
[3], in1
[0], 0x5432);
904 out0
[1] = __byte_perm (in1
[0], in1
[1], 0x5432);
905 out0
[2] = __byte_perm (in1
[1], in1
[2], 0x5432);
906 out0
[3] = __byte_perm (in1
[2], in1
[3], 0x5432);
907 out1
[0] = __byte_perm (in1
[3], 0, 0x5432);
912 case 15: out0
[0] = __byte_perm (in0
[3], in1
[0], 0x6543);
913 out0
[1] = __byte_perm (in1
[0], in1
[1], 0x6543);
914 out0
[2] = __byte_perm (in1
[1], in1
[2], 0x6543);
915 out0
[3] = __byte_perm (in1
[2], in1
[3], 0x6543);
916 out1
[0] = __byte_perm (in1
[3], 0, 0x6543);
921 case 16: out0
[0] = in1
[0];
930 case 17: out0
[0] = __byte_perm (in1
[0], in1
[1], 0x4321);
931 out0
[1] = __byte_perm (in1
[1], in1
[2], 0x4321);
932 out0
[2] = __byte_perm (in1
[2], in1
[3], 0x4321);
933 out0
[3] = __byte_perm (in1
[3], 0, 0x4321);
939 case 18: out0
[0] = __byte_perm (in1
[0], in1
[1], 0x5432);
940 out0
[1] = __byte_perm (in1
[1], in1
[2], 0x5432);
941 out0
[2] = __byte_perm (in1
[2], in1
[3], 0x5432);
942 out0
[3] = __byte_perm (in1
[3], 0, 0x5432);
948 case 19: out0
[0] = __byte_perm (in1
[0], in1
[1], 0x6543);
949 out0
[1] = __byte_perm (in1
[1], in1
[2], 0x6543);
950 out0
[2] = __byte_perm (in1
[2], in1
[3], 0x6543);
951 out0
[3] = __byte_perm (in1
[3], 0, 0x6543);
957 case 20: out0
[0] = in1
[1];
966 case 21: out0
[0] = __byte_perm (in1
[1], in1
[2], 0x4321);
967 out0
[1] = __byte_perm (in1
[2], in1
[3], 0x4321);
968 out0
[2] = __byte_perm (in1
[3], 0, 0x4321);
975 case 22: out0
[0] = __byte_perm (in1
[1], in1
[2], 0x5432);
976 out0
[1] = __byte_perm (in1
[2], in1
[3], 0x5432);
977 out0
[2] = __byte_perm (in1
[3], 0, 0x5432);
984 case 23: out0
[0] = __byte_perm (in1
[1], in1
[2], 0x6543);
985 out0
[1] = __byte_perm (in1
[2], in1
[3], 0x6543);
986 out0
[2] = __byte_perm (in1
[3], 0, 0x6543);
993 case 24: out0
[0] = in1
[2];
1002 case 25: out0
[0] = __byte_perm (in1
[2], in1
[3], 0x4321);
1003 out0
[1] = __byte_perm (in1
[3], 0, 0x4321);
1011 case 26: out0
[0] = __byte_perm (in1
[2], in1
[3], 0x5432);
1012 out0
[1] = __byte_perm (in1
[3], 0, 0x5432);
1020 case 27: out0
[0] = __byte_perm (in1
[2], in1
[3], 0x6543);
1021 out0
[1] = __byte_perm (in1
[3], 0, 0x6543);
1029 case 28: out0
[0] = in1
[3];
1038 case 29: out0
[0] = __byte_perm (in1
[3], 0, 0x4321);
1047 case 30: out0
[0] = __byte_perm (in1
[3], 0, 0x5432);
1056 case 31: out0
[0] = __byte_perm (in1
[3], 0, 0x6543);
1071 case 0: out0
[0] = in0
[0];
1080 case 1: out0
[0] = in0
[0] >> 8 | in0
[1] << 24;
1081 out0
[1] = in0
[1] >> 8 | in0
[2] << 24;
1082 out0
[2] = in0
[2] >> 8 | in0
[3] << 24;
1083 out0
[3] = in0
[3] >> 8 | in1
[0] << 24;
1084 out1
[0] = in1
[0] >> 8 | in1
[1] << 24;
1085 out1
[1] = in1
[1] >> 8 | in1
[2] << 24;
1086 out1
[2] = in1
[2] >> 8 | in1
[3] << 24;
1087 out1
[3] = in1
[3] >> 8;
1089 case 2: out0
[0] = in0
[0] >> 16 | in0
[1] << 16;
1090 out0
[1] = in0
[1] >> 16 | in0
[2] << 16;
1091 out0
[2] = in0
[2] >> 16 | in0
[3] << 16;
1092 out0
[3] = in0
[3] >> 16 | in1
[0] << 16;
1093 out1
[0] = in1
[0] >> 16 | in1
[1] << 16;
1094 out1
[1] = in1
[1] >> 16 | in1
[2] << 16;
1095 out1
[2] = in1
[2] >> 16 | in1
[3] << 16;
1096 out1
[3] = in1
[3] >> 16;
1098 case 3: out0
[0] = in0
[0] >> 24 | in0
[1] << 8;
1099 out0
[1] = in0
[1] >> 24 | in0
[2] << 8;
1100 out0
[2] = in0
[2] >> 24 | in0
[3] << 8;
1101 out0
[3] = in0
[3] >> 24 | in1
[0] << 8;
1102 out1
[0] = in1
[0] >> 24 | in1
[1] << 8;
1103 out1
[1] = in1
[1] >> 24 | in1
[2] << 8;
1104 out1
[2] = in1
[2] >> 24 | in1
[3] << 8;
1105 out1
[3] = in1
[3] >> 24;
1107 case 4: out0
[0] = in0
[1];
1116 case 5: out0
[0] = in0
[1] >> 8 | in0
[2] << 24;
1117 out0
[1] = in0
[2] >> 8 | in0
[3] << 24;
1118 out0
[2] = in0
[3] >> 8 | in1
[0] << 24;
1119 out0
[3] = in1
[0] >> 8 | in1
[1] << 24;
1120 out1
[0] = in1
[1] >> 8 | in1
[2] << 24;
1121 out1
[1] = in1
[2] >> 8 | in1
[3] << 24;
1122 out1
[2] = in1
[3] >> 8;
1125 case 6: out0
[0] = in0
[1] >> 16 | in0
[2] << 16;
1126 out0
[1] = in0
[2] >> 16 | in0
[3] << 16;
1127 out0
[2] = in0
[3] >> 16 | in1
[0] << 16;
1128 out0
[3] = in1
[0] >> 16 | in1
[1] << 16;
1129 out1
[0] = in1
[1] >> 16 | in1
[2] << 16;
1130 out1
[1] = in1
[2] >> 16 | in1
[3] << 16;
1131 out1
[2] = in1
[3] >> 16;
1134 case 7: out0
[0] = in0
[1] >> 24 | in0
[2] << 8;
1135 out0
[1] = in0
[2] >> 24 | in0
[3] << 8;
1136 out0
[2] = in0
[3] >> 24 | in1
[0] << 8;
1137 out0
[3] = in1
[0] >> 24 | in1
[1] << 8;
1138 out1
[0] = in1
[1] >> 24 | in1
[2] << 8;
1139 out1
[1] = in1
[2] >> 24 | in1
[3] << 8;
1140 out1
[2] = in1
[3] >> 24;
1143 case 8: out0
[0] = in0
[2];
1152 case 9: out0
[0] = in0
[2] >> 8 | in0
[3] << 24;
1153 out0
[1] = in0
[3] >> 8 | in1
[0] << 24;
1154 out0
[2] = in1
[0] >> 8 | in1
[1] << 24;
1155 out0
[3] = in1
[1] >> 8 | in1
[2] << 24;
1156 out1
[0] = in1
[2] >> 8 | in1
[3] << 24;
1157 out1
[1] = in1
[3] >> 8;
1161 case 10: out0
[0] = in0
[2] >> 16 | in0
[3] << 16;
1162 out0
[1] = in0
[3] >> 16 | in1
[0] << 16;
1163 out0
[2] = in1
[0] >> 16 | in1
[1] << 16;
1164 out0
[3] = in1
[1] >> 16 | in1
[2] << 16;
1165 out1
[0] = in1
[2] >> 16 | in1
[3] << 16;
1166 out1
[1] = in1
[3] >> 16;
1170 case 11: out0
[0] = in0
[2] >> 24 | in0
[3] << 8;
1171 out0
[1] = in0
[3] >> 24 | in1
[0] << 8;
1172 out0
[2] = in1
[0] >> 24 | in1
[1] << 8;
1173 out0
[3] = in1
[1] >> 24 | in1
[2] << 8;
1174 out1
[0] = in1
[2] >> 24 | in1
[3] << 8;
1175 out1
[1] = in1
[3] >> 24;
1179 case 12: out0
[0] = in0
[3];
1189 out0
[0] = in0
[3] >> 8 | in1
[0] << 24;
1190 out0
[1] = in1
[0] >> 8 | in1
[1] << 24;
1191 out0
[2] = in1
[1] >> 8 | in1
[2] << 24;
1192 out0
[3] = in1
[2] >> 8 | in1
[3] << 24;
1193 out1
[0] = in1
[3] >> 8;
1198 case 14: out0
[0] = in0
[3] >> 16 | in1
[0] << 16;
1199 out0
[1] = in1
[0] >> 16 | in1
[1] << 16;
1200 out0
[2] = in1
[1] >> 16 | in1
[2] << 16;
1201 out0
[3] = in1
[2] >> 16 | in1
[3] << 16;
1202 out1
[0] = in1
[3] >> 16;
1207 case 15: out0
[0] = in0
[3] >> 24 | in1
[0] << 8;
1208 out0
[1] = in1
[0] >> 24 | in1
[1] << 8;
1209 out0
[2] = in1
[1] >> 24 | in1
[2] << 8;
1210 out0
[3] = in1
[2] >> 24 | in1
[3] << 8;
1211 out1
[0] = in1
[3] >> 24;
1216 case 16: out0
[0] = in1
[0];
1225 case 17: out0
[0] = in1
[0] >> 8 | in1
[1] << 24;
1226 out0
[1] = in1
[1] >> 8 | in1
[2] << 24;
1227 out0
[2] = in1
[2] >> 8 | in1
[3] << 24;
1228 out0
[3] = in1
[3] >> 8;
1234 case 18: out0
[0] = in1
[0] >> 16 | in1
[1] << 16;
1235 out0
[1] = in1
[1] >> 16 | in1
[2] << 16;
1236 out0
[2] = in1
[2] >> 16 | in1
[3] << 16;
1237 out0
[3] = in1
[3] >> 16;
1243 case 19: out0
[0] = in1
[0] >> 24 | in1
[1] << 8;
1244 out0
[1] = in1
[1] >> 24 | in1
[2] << 8;
1245 out0
[2] = in1
[2] >> 24 | in1
[3] << 8;
1246 out0
[3] = in1
[3] >> 24;
1252 case 20: out0
[0] = in1
[1];
1261 case 21: out0
[0] = in1
[1] >> 8 | in1
[2] << 24;
1262 out0
[1] = in1
[2] >> 8 | in1
[3] << 24;
1263 out0
[2] = in1
[3] >> 8;
1270 case 22: out0
[0] = in1
[1] >> 16 | in1
[2] << 16;
1271 out0
[1] = in1
[2] >> 16 | in1
[3] << 16;
1272 out0
[2] = in1
[3] >> 16;
1279 case 23: out0
[0] = in1
[1] >> 24 | in1
[2] << 8;
1280 out0
[1] = in1
[2] >> 24 | in1
[3] << 8;
1281 out0
[2] = in1
[3] >> 24;
1288 case 24: out0
[0] = in1
[2];
1297 case 25: out0
[0] = in1
[2] >> 8 | in1
[3] << 24;
1298 out0
[1] = in1
[3] >> 8;
1306 case 26: out0
[0] = in1
[2] >> 16 | in1
[3] << 16;
1307 out0
[1] = in1
[3] >> 16;
1315 case 27: out0
[0] = in1
[2] >> 24 | in1
[3] << 8;
1316 out0
[1] = in1
[3] >> 24;
1324 case 28: out0
[0] = in1
[3];
1333 case 29: out0
[0] = in1
[3] >> 8;
1342 case 30: out0
[0] = in1
[3] >> 16;
1351 case 31: out0
[0] = in1
[3] >> 24;
1365 __device__
static void append_block1 (const u32 offset
, u32x dst0
[4], u32x dst1
[4], const u32x src_r0
)
1371 case 0: tmp
[0] = src_r0
;
1374 case 1: tmp
[0] = src_r0
<< 8;
1375 tmp
[1] = src_r0
>> 24;
1377 case 2: tmp
[0] = src_r0
<< 16;
1378 tmp
[1] = src_r0
>> 16;
1380 case 3: tmp
[0] = src_r0
<< 24;
1381 tmp
[1] = src_r0
>> 8;
1387 case 0: dst0
[0] |= tmp
[0];
1390 case 1: dst0
[1] |= tmp
[0];
1393 case 2: dst0
[2] |= tmp
[0];
1396 case 3: dst0
[3] |= tmp
[0];
1399 case 4: dst1
[0] |= tmp
[0];
1402 case 5: dst1
[1] |= tmp
[0];
1405 case 6: dst1
[2] |= tmp
[0];
1408 case 7: dst1
[3] |= tmp
[0];
1413 __device__
static void append_block8 (const u32 offset
, u32x dst0
[4], u32x dst1
[4], const u32x src_l0
[4], const u32x src_l1
[4], const u32x src_r0
[4], const u32x src_r1
[4])
1415 #if __CUDA_ARCH__ >= 200
1420 dst0
[0] = src_r0
[0];
1421 dst0
[1] = src_r0
[1];
1422 dst0
[2] = src_r0
[2];
1423 dst0
[3] = src_r0
[3];
1424 dst1
[0] = src_r1
[0];
1425 dst1
[1] = src_r1
[1];
1426 dst1
[2] = src_r1
[2];
1427 dst1
[3] = src_r1
[3];
1431 dst0
[0] = __byte_perm (src_l0
[0], src_r0
[0], 0x6540);
1432 dst0
[1] = __byte_perm (src_r0
[0], src_r0
[1], 0x6543);
1433 dst0
[2] = __byte_perm (src_r0
[1], src_r0
[2], 0x6543);
1434 dst0
[3] = __byte_perm (src_r0
[2], src_r0
[3], 0x6543);
1435 dst1
[0] = __byte_perm (src_r0
[3], src_r1
[0], 0x6543);
1436 dst1
[1] = __byte_perm (src_r1
[0], src_r1
[1], 0x6543);
1437 dst1
[2] = __byte_perm (src_r1
[1], src_r1
[2], 0x6543);
1438 dst1
[3] = __byte_perm (src_r1
[2], src_r1
[3], 0x6543);
1442 dst0
[0] = __byte_perm (src_l0
[0], src_r0
[0], 0x5410);
1443 dst0
[1] = __byte_perm (src_r0
[0], src_r0
[1], 0x5432);
1444 dst0
[2] = __byte_perm (src_r0
[1], src_r0
[2], 0x5432);
1445 dst0
[3] = __byte_perm (src_r0
[2], src_r0
[3], 0x5432);
1446 dst1
[0] = __byte_perm (src_r0
[3], src_r1
[0], 0x5432);
1447 dst1
[1] = __byte_perm (src_r1
[0], src_r1
[1], 0x5432);
1448 dst1
[2] = __byte_perm (src_r1
[1], src_r1
[2], 0x5432);
1449 dst1
[3] = __byte_perm (src_r1
[2], src_r1
[3], 0x5432);
1453 dst0
[0] = __byte_perm (src_l0
[0], src_r0
[0], 0x4210);
1454 dst0
[1] = __byte_perm (src_r0
[0], src_r0
[1], 0x4321);
1455 dst0
[2] = __byte_perm (src_r0
[1], src_r0
[2], 0x4321);
1456 dst0
[3] = __byte_perm (src_r0
[2], src_r0
[3], 0x4321);
1457 dst1
[0] = __byte_perm (src_r0
[3], src_r1
[0], 0x4321);
1458 dst1
[1] = __byte_perm (src_r1
[0], src_r1
[1], 0x4321);
1459 dst1
[2] = __byte_perm (src_r1
[1], src_r1
[2], 0x4321);
1460 dst1
[3] = __byte_perm (src_r1
[2], src_r1
[3], 0x4321);
1464 dst0
[1] = src_r0
[0];
1465 dst0
[2] = src_r0
[1];
1466 dst0
[3] = src_r0
[2];
1467 dst1
[0] = src_r0
[3];
1468 dst1
[1] = src_r1
[0];
1469 dst1
[2] = src_r1
[1];
1470 dst1
[3] = src_r1
[2];
1474 dst0
[1] = __byte_perm (src_l0
[1], src_r0
[0], 0x6540);
1475 dst0
[2] = __byte_perm (src_r0
[0], src_r0
[1], 0x6543);
1476 dst0
[3] = __byte_perm (src_r0
[1], src_r0
[2], 0x6543);
1477 dst1
[0] = __byte_perm (src_r0
[2], src_r0
[3], 0x6543);
1478 dst1
[1] = __byte_perm (src_r0
[3], src_r1
[0], 0x6543);
1479 dst1
[2] = __byte_perm (src_r1
[0], src_r1
[1], 0x6543);
1480 dst1
[3] = __byte_perm (src_r1
[1], src_r1
[2], 0x6543);
1484 dst0
[1] = __byte_perm (src_l0
[1], src_r0
[0], 0x5410);
1485 dst0
[2] = __byte_perm (src_r0
[0], src_r0
[1], 0x5432);
1486 dst0
[3] = __byte_perm (src_r0
[1], src_r0
[2], 0x5432);
1487 dst1
[0] = __byte_perm (src_r0
[2], src_r0
[3], 0x5432);
1488 dst1
[1] = __byte_perm (src_r0
[3], src_r1
[0], 0x5432);
1489 dst1
[2] = __byte_perm (src_r1
[0], src_r1
[1], 0x5432);
1490 dst1
[3] = __byte_perm (src_r1
[1], src_r1
[2], 0x5432);
1494 dst0
[1] = __byte_perm (src_l0
[1], src_r0
[0], 0x4210);
1495 dst0
[2] = __byte_perm (src_r0
[0], src_r0
[1], 0x4321);
1496 dst0
[3] = __byte_perm (src_r0
[1], src_r0
[2], 0x4321);
1497 dst1
[0] = __byte_perm (src_r0
[2], src_r0
[3], 0x4321);
1498 dst1
[1] = __byte_perm (src_r0
[3], src_r1
[0], 0x4321);
1499 dst1
[2] = __byte_perm (src_r1
[0], src_r1
[1], 0x4321);
1500 dst1
[3] = __byte_perm (src_r1
[1], src_r1
[2], 0x4321);
1504 dst0
[2] = src_r0
[0];
1505 dst0
[3] = src_r0
[1];
1506 dst1
[0] = src_r0
[2];
1507 dst1
[1] = src_r0
[3];
1508 dst1
[2] = src_r1
[0];
1509 dst1
[3] = src_r1
[1];
1513 dst0
[2] = __byte_perm (src_l0
[2], src_r0
[0], 0x6540);
1514 dst0
[3] = __byte_perm (src_r0
[0], src_r0
[1], 0x6543);
1515 dst1
[0] = __byte_perm (src_r0
[1], src_r0
[2], 0x6543);
1516 dst1
[1] = __byte_perm (src_r0
[2], src_r0
[3], 0x6543);
1517 dst1
[2] = __byte_perm (src_r0
[3], src_r1
[0], 0x6543);
1518 dst1
[3] = __byte_perm (src_r1
[0], src_r1
[1], 0x6543);
1522 dst0
[2] = __byte_perm (src_l0
[2], src_r0
[0], 0x5410);
1523 dst0
[3] = __byte_perm (src_r0
[0], src_r0
[1], 0x5432);
1524 dst1
[0] = __byte_perm (src_r0
[1], src_r0
[2], 0x5432);
1525 dst1
[1] = __byte_perm (src_r0
[2], src_r0
[3], 0x5432);
1526 dst1
[2] = __byte_perm (src_r0
[3], src_r1
[0], 0x5432);
1527 dst1
[3] = __byte_perm (src_r1
[0], src_r1
[1], 0x5432);
1531 dst0
[2] = __byte_perm (src_l0
[2], src_r0
[0], 0x4210);
1532 dst0
[3] = __byte_perm (src_r0
[0], src_r0
[1], 0x4321);
1533 dst1
[0] = __byte_perm (src_r0
[1], src_r0
[2], 0x4321);
1534 dst1
[1] = __byte_perm (src_r0
[2], src_r0
[3], 0x4321);
1535 dst1
[2] = __byte_perm (src_r0
[3], src_r1
[0], 0x4321);
1536 dst1
[3] = __byte_perm (src_r1
[0], src_r1
[1], 0x4321);
1540 dst0
[3] = src_r0
[0];
1541 dst1
[0] = src_r0
[1];
1542 dst1
[1] = src_r0
[2];
1543 dst1
[2] = src_r0
[3];
1544 dst1
[3] = src_r1
[0];
1548 dst0
[3] = __byte_perm (src_l0
[3], src_r0
[0], 0x6540);
1549 dst1
[0] = __byte_perm (src_r0
[0], src_r0
[1], 0x6543);
1550 dst1
[1] = __byte_perm (src_r0
[1], src_r0
[2], 0x6543);
1551 dst1
[2] = __byte_perm (src_r0
[2], src_r0
[3], 0x6543);
1552 dst1
[3] = __byte_perm (src_r0
[3], src_r1
[0], 0x6543);
1556 dst0
[3] = __byte_perm (src_l0
[3], src_r0
[0], 0x5410);
1557 dst1
[0] = __byte_perm (src_r0
[0], src_r0
[1], 0x5432);
1558 dst1
[1] = __byte_perm (src_r0
[1], src_r0
[2], 0x5432);
1559 dst1
[2] = __byte_perm (src_r0
[2], src_r0
[3], 0x5432);
1560 dst1
[3] = __byte_perm (src_r0
[3], src_r1
[0], 0x5432);
1564 dst0
[3] = __byte_perm (src_l0
[3], src_r0
[0], 0x4210);
1565 dst1
[0] = __byte_perm (src_r0
[0], src_r0
[1], 0x4321);
1566 dst1
[1] = __byte_perm (src_r0
[1], src_r0
[2], 0x4321);
1567 dst1
[2] = __byte_perm (src_r0
[2], src_r0
[3], 0x4321);
1568 dst1
[3] = __byte_perm (src_r0
[3], src_r1
[0], 0x4321);
1572 dst1
[0] = src_r0
[0];
1573 dst1
[1] = src_r0
[1];
1574 dst1
[2] = src_r0
[2];
1575 dst1
[3] = src_r0
[3];
1579 dst1
[0] = __byte_perm (src_l1
[0], src_r0
[0], 0x6540);
1580 dst1
[1] = __byte_perm (src_r0
[0], src_r0
[1], 0x6543);
1581 dst1
[2] = __byte_perm (src_r0
[1], src_r0
[2], 0x6543);
1582 dst1
[3] = __byte_perm (src_r0
[2], src_r0
[3], 0x6543);
1586 dst1
[0] = __byte_perm (src_l1
[0], src_r0
[0], 0x5410);
1587 dst1
[1] = __byte_perm (src_r0
[0], src_r0
[1], 0x5432);
1588 dst1
[2] = __byte_perm (src_r0
[1], src_r0
[2], 0x5432);
1589 dst1
[3] = __byte_perm (src_r0
[2], src_r0
[3], 0x5432);
1593 dst1
[0] = __byte_perm (src_l1
[0], src_r0
[0], 0x4210);
1594 dst1
[1] = __byte_perm (src_r0
[0], src_r0
[1], 0x4321);
1595 dst1
[2] = __byte_perm (src_r0
[1], src_r0
[2], 0x4321);
1596 dst1
[3] = __byte_perm (src_r0
[2], src_r0
[3], 0x4321);
1600 dst1
[1] = src_r0
[0];
1601 dst1
[2] = src_r0
[1];
1602 dst1
[3] = src_r0
[2];
1606 dst1
[1] = __byte_perm (src_l1
[1], src_r0
[0], 0x6540);
1607 dst1
[2] = __byte_perm (src_r0
[0], src_r0
[1], 0x6543);
1608 dst1
[3] = __byte_perm (src_r0
[1], src_r0
[2], 0x6543);
1612 dst1
[1] = __byte_perm (src_l1
[1], src_r0
[0], 0x5410);
1613 dst1
[2] = __byte_perm (src_r0
[0], src_r0
[1], 0x5432);
1614 dst1
[3] = __byte_perm (src_r0
[1], src_r0
[2], 0x5432);
1618 dst1
[1] = __byte_perm (src_l1
[1], src_r0
[0], 0x4210);
1619 dst1
[2] = __byte_perm (src_r0
[0], src_r0
[1], 0x4321);
1620 dst1
[3] = __byte_perm (src_r0
[1], src_r0
[2], 0x4321);
1624 dst1
[2] = src_r0
[0];
1625 dst1
[3] = src_r0
[1];
1629 dst1
[2] = __byte_perm (src_l1
[2], src_r0
[0], 0x6540);
1630 dst1
[3] = __byte_perm (src_r0
[0], src_r0
[1], 0x6543);
1634 dst1
[2] = __byte_perm (src_l1
[2], src_r0
[0], 0x5410);
1635 dst1
[3] = __byte_perm (src_r0
[0], src_r0
[1], 0x5432);
1639 dst1
[2] = __byte_perm (src_l1
[2], src_r0
[0], 0x4210);
1640 dst1
[3] = __byte_perm (src_r0
[0], src_r0
[1], 0x4321);
1644 dst1
[3] = src_r0
[0];
1648 dst1
[3] = __byte_perm (src_l1
[3], src_r0
[0], 0x6540);
1652 dst1
[3] = __byte_perm (src_l1
[3], src_r0
[0], 0x5410);
1656 dst1
[3] = __byte_perm (src_l1
[3], src_r0
[0], 0x4210);
1665 dst0
[0] = src_r0
[0];
1666 dst0
[1] = src_r0
[1];
1667 dst0
[2] = src_r0
[2];
1668 dst0
[3] = src_r0
[3];
1669 dst1
[0] = src_r1
[0];
1670 dst1
[1] = src_r1
[1];
1671 dst1
[2] = src_r1
[2];
1672 dst1
[3] = src_r1
[3];
1676 dst0
[0] = src_l0
[0] | src_r0
[0] << 8;
1677 dst0
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
1678 dst0
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
1679 dst0
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
1680 dst1
[0] = src_r0
[3] >> 24 | src_r1
[0] << 8;
1681 dst1
[1] = src_r1
[0] >> 24 | src_r1
[1] << 8;
1682 dst1
[2] = src_r1
[1] >> 24 | src_r1
[2] << 8;
1683 dst1
[3] = src_r1
[2] >> 24 | src_r1
[3] << 8;
1687 dst0
[0] = src_l0
[0] | src_r0
[0] << 16;
1688 dst0
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
1689 dst0
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
1690 dst0
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
1691 dst1
[0] = src_r0
[3] >> 16 | src_r1
[0] << 16;
1692 dst1
[1] = src_r1
[0] >> 16 | src_r1
[1] << 16;
1693 dst1
[2] = src_r1
[1] >> 16 | src_r1
[2] << 16;
1694 dst1
[3] = src_r1
[2] >> 16 | src_r1
[3] << 16;
1698 dst0
[0] = src_l0
[0] | src_r0
[0] << 24;
1699 dst0
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
1700 dst0
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
1701 dst0
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
1702 dst1
[0] = src_r0
[3] >> 8 | src_r1
[0] << 24;
1703 dst1
[1] = src_r1
[0] >> 8 | src_r1
[1] << 24;
1704 dst1
[2] = src_r1
[1] >> 8 | src_r1
[2] << 24;
1705 dst1
[3] = src_r1
[2] >> 8 | src_r1
[3] << 24;
1709 dst0
[1] = src_r0
[0];
1710 dst0
[2] = src_r0
[1];
1711 dst0
[3] = src_r0
[2];
1712 dst1
[0] = src_r0
[3];
1713 dst1
[1] = src_r1
[0];
1714 dst1
[2] = src_r1
[1];
1715 dst1
[3] = src_r1
[2];
1719 dst0
[1] = src_l0
[1] | src_r0
[0] << 8;
1720 dst0
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
1721 dst0
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
1722 dst1
[0] = src_r0
[2] >> 24 | src_r0
[3] << 8;
1723 dst1
[1] = src_r0
[3] >> 24 | src_r1
[0] << 8;
1724 dst1
[2] = src_r1
[0] >> 24 | src_r1
[1] << 8;
1725 dst1
[3] = src_r1
[1] >> 24 | src_r1
[2] << 8;
1729 dst0
[1] = src_l0
[1] | src_r0
[0] << 16;
1730 dst0
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
1731 dst0
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
1732 dst1
[0] = src_r0
[2] >> 16 | src_r0
[3] << 16;
1733 dst1
[1] = src_r0
[3] >> 16 | src_r1
[0] << 16;
1734 dst1
[2] = src_r1
[0] >> 16 | src_r1
[1] << 16;
1735 dst1
[3] = src_r1
[1] >> 16 | src_r1
[2] << 16;
1739 dst0
[1] = src_l0
[1] | src_r0
[0] << 24;
1740 dst0
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
1741 dst0
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
1742 dst1
[0] = src_r0
[2] >> 8 | src_r0
[3] << 24;
1743 dst1
[1] = src_r0
[3] >> 8 | src_r1
[0] << 24;
1744 dst1
[2] = src_r1
[0] >> 8 | src_r1
[1] << 24;
1745 dst1
[3] = src_r1
[1] >> 8 | src_r1
[2] << 24;
1749 dst0
[2] = src_r0
[0];
1750 dst0
[3] = src_r0
[1];
1751 dst1
[0] = src_r0
[2];
1752 dst1
[1] = src_r0
[3];
1753 dst1
[2] = src_r1
[0];
1754 dst1
[3] = src_r1
[1];
1758 dst0
[2] = src_l0
[2] | src_r0
[0] << 8;
1759 dst0
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
1760 dst1
[0] = src_r0
[1] >> 24 | src_r0
[2] << 8;
1761 dst1
[1] = src_r0
[2] >> 24 | src_r0
[3] << 8;
1762 dst1
[2] = src_r0
[3] >> 24 | src_r1
[0] << 8;
1763 dst1
[3] = src_r1
[0] >> 24 | src_r1
[1] << 8;
1767 dst0
[2] = src_l0
[2] | src_r0
[0] << 16;
1768 dst0
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
1769 dst1
[0] = src_r0
[1] >> 16 | src_r0
[2] << 16;
1770 dst1
[1] = src_r0
[2] >> 16 | src_r0
[3] << 16;
1771 dst1
[2] = src_r0
[3] >> 16 | src_r1
[0] << 16;
1772 dst1
[3] = src_r1
[0] >> 16 | src_r1
[1] << 16;
1776 dst0
[2] = src_l0
[2] | src_r0
[0] << 24;
1777 dst0
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
1778 dst1
[0] = src_r0
[1] >> 8 | src_r0
[2] << 24;
1779 dst1
[1] = src_r0
[2] >> 8 | src_r0
[3] << 24;
1780 dst1
[2] = src_r0
[3] >> 8 | src_r1
[0] << 24;
1781 dst1
[3] = src_r1
[0] >> 8 | src_r1
[1] << 24;
1785 dst0
[3] = src_r0
[0];
1786 dst1
[0] = src_r0
[1];
1787 dst1
[1] = src_r0
[2];
1788 dst1
[2] = src_r0
[3];
1789 dst1
[3] = src_r1
[0];
1793 dst0
[3] = src_l0
[3] | src_r0
[0] << 8;
1794 dst1
[0] = src_r0
[0] >> 24 | src_r0
[1] << 8;
1795 dst1
[1] = src_r0
[1] >> 24 | src_r0
[2] << 8;
1796 dst1
[2] = src_r0
[2] >> 24 | src_r0
[3] << 8;
1797 dst1
[3] = src_r0
[3] >> 24 | src_r1
[0] << 8;
1801 dst0
[3] = src_l0
[3] | src_r0
[0] << 16;
1802 dst1
[0] = src_r0
[0] >> 16 | src_r0
[1] << 16;
1803 dst1
[1] = src_r0
[1] >> 16 | src_r0
[2] << 16;
1804 dst1
[2] = src_r0
[2] >> 16 | src_r0
[3] << 16;
1805 dst1
[3] = src_r0
[3] >> 16 | src_r1
[0] << 16;
1809 dst0
[3] = src_l0
[3] | src_r0
[0] << 24;
1810 dst1
[0] = src_r0
[0] >> 8 | src_r0
[1] << 24;
1811 dst1
[1] = src_r0
[1] >> 8 | src_r0
[2] << 24;
1812 dst1
[2] = src_r0
[2] >> 8 | src_r0
[3] << 24;
1813 dst1
[3] = src_r0
[3] >> 8 | src_r1
[0] << 24;
1817 dst1
[0] = src_r0
[0];
1818 dst1
[1] = src_r0
[1];
1819 dst1
[2] = src_r0
[2];
1820 dst1
[3] = src_r0
[3];
1824 dst1
[0] = src_l1
[0] | src_r0
[0] << 8;
1825 dst1
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
1826 dst1
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
1827 dst1
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
1831 dst1
[0] = src_l1
[0] | src_r0
[0] << 16;
1832 dst1
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
1833 dst1
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
1834 dst1
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
1838 dst1
[0] = src_l1
[0] | src_r0
[0] << 24;
1839 dst1
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
1840 dst1
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
1841 dst1
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
1845 dst1
[1] = src_r0
[0];
1846 dst1
[2] = src_r0
[1];
1847 dst1
[3] = src_r0
[2];
1851 dst1
[1] = src_l1
[1] | src_r0
[0] << 8;
1852 dst1
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
1853 dst1
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
1857 dst1
[1] = src_l1
[1] | src_r0
[0] << 16;
1858 dst1
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
1859 dst1
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
1863 dst1
[1] = src_l1
[1] | src_r0
[0] << 24;
1864 dst1
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
1865 dst1
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
1869 dst1
[2] = src_r0
[0];
1870 dst1
[3] = src_r0
[1];
1874 dst1
[2] = src_l1
[2] | src_r0
[0] << 8;
1875 dst1
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
1879 dst1
[2] = src_l1
[2] | src_r0
[0] << 16;
1880 dst1
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
1884 dst1
[2] = src_l1
[2] | src_r0
[0] << 24;
1885 dst1
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
1889 dst1
[3] = src_r0
[0];
1893 dst1
[3] = src_l1
[3] | src_r0
[0] << 8;
1897 dst1
[3] = src_l1
[3] | src_r0
[0] << 16;
1901 dst1
[3] = src_l1
[3] | src_r0
[0] << 24;
1908 __device__
static void reverse_block (u32x in0
[4], u32x in1
[4], u32x out0
[4], u32x out1
[4], const u32 len
)
1910 rshift_block_N (in0
, in1
, out0
, out1
, 32 - len
);
1924 out0
[0] = swap_workaround (tib40
[0]);
1925 out0
[1] = swap_workaround (tib40
[1]);
1926 out0
[2] = swap_workaround (tib40
[2]);
1927 out0
[3] = swap_workaround (tib40
[3]);
1928 out1
[0] = swap_workaround (tib41
[0]);
1929 out1
[1] = swap_workaround (tib41
[1]);
1930 out1
[2] = swap_workaround (tib41
[2]);
1931 out1
[3] = swap_workaround (tib41
[3]);
1934 __device__
static u32
rule_op_mangle_lrest (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
1936 buf0
[0] |= (generate_cmask (buf0
[0]));
1937 buf0
[1] |= (generate_cmask (buf0
[1]));
1938 buf0
[2] |= (generate_cmask (buf0
[2]));
1939 buf0
[3] |= (generate_cmask (buf0
[3]));
1940 buf1
[0] |= (generate_cmask (buf1
[0]));
1941 buf1
[1] |= (generate_cmask (buf1
[1]));
1942 buf1
[2] |= (generate_cmask (buf1
[2]));
1943 buf1
[3] |= (generate_cmask (buf1
[3]));
1948 __device__
static u32
rule_op_mangle_urest (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
1950 buf0
[0] &= ~(generate_cmask (buf0
[0]));
1951 buf0
[1] &= ~(generate_cmask (buf0
[1]));
1952 buf0
[2] &= ~(generate_cmask (buf0
[2]));
1953 buf0
[3] &= ~(generate_cmask (buf0
[3]));
1954 buf1
[0] &= ~(generate_cmask (buf1
[0]));
1955 buf1
[1] &= ~(generate_cmask (buf1
[1]));
1956 buf1
[2] &= ~(generate_cmask (buf1
[2]));
1957 buf1
[3] &= ~(generate_cmask (buf1
[3]));
1962 __device__
static u32
rule_op_mangle_lrest_ufirst (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
1964 rule_op_mangle_lrest (p0
, p1
, buf0
, buf1
, in_len
);
1966 buf0
[0] &= ~(0x00000020 & generate_cmask (buf0
[0]));
1971 __device__
static u32
rule_op_mangle_urest_lfirst (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
1973 rule_op_mangle_urest (p0
, p1
, buf0
, buf1
, in_len
);
1975 buf0
[0] |= (0x00000020 & generate_cmask (buf0
[0]));
1980 __device__
static u32
rule_op_mangle_trest (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
1982 buf0
[0] ^= (generate_cmask (buf0
[0]));
1983 buf0
[1] ^= (generate_cmask (buf0
[1]));
1984 buf0
[2] ^= (generate_cmask (buf0
[2]));
1985 buf0
[3] ^= (generate_cmask (buf0
[3]));
1986 buf1
[0] ^= (generate_cmask (buf1
[0]));
1987 buf1
[1] ^= (generate_cmask (buf1
[1]));
1988 buf1
[2] ^= (generate_cmask (buf1
[2]));
1989 buf1
[3] ^= (generate_cmask (buf1
[3]));
1994 __device__
static u32
rule_op_mangle_toggle_at (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
1996 if (p0
>= in_len
) return (in_len
);
1998 const u32 tmp
= 0x20 << ((p0
& 3) * 8);
2002 case 0: buf0
[0] ^= (tmp
& generate_cmask (buf0
[0])); break;
2003 case 1: buf0
[1] ^= (tmp
& generate_cmask (buf0
[1])); break;
2004 case 2: buf0
[2] ^= (tmp
& generate_cmask (buf0
[2])); break;
2005 case 3: buf0
[3] ^= (tmp
& generate_cmask (buf0
[3])); break;
2006 case 4: buf1
[0] ^= (tmp
& generate_cmask (buf1
[0])); break;
2007 case 5: buf1
[1] ^= (tmp
& generate_cmask (buf1
[1])); break;
2008 case 6: buf1
[2] ^= (tmp
& generate_cmask (buf1
[2])); break;
2009 case 7: buf1
[3] ^= (tmp
& generate_cmask (buf1
[3])); break;
2015 __device__
static u32
rule_op_mangle_reverse (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2017 reverse_block (buf0
, buf1
, buf0
, buf1
, in_len
);
2022 __device__
static u32
rule_op_mangle_dupeword (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2024 if ((in_len
+ in_len
) >= 32) return (in_len
);
2026 u32 out_len
= in_len
;
2040 append_block8 (out_len
, buf0
, buf1
, buf0
, buf1
, tib40
, tib41
);
2047 __device__
static u32
rule_op_mangle_dupeword_times (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2049 if (((in_len
* p0
) + in_len
) >= 32) return (in_len
);
2051 u32 out_len
= in_len
;
2065 for (u32 i
= 0; i
< p0
; i
++)
2067 append_block8 (out_len
, buf0
, buf1
, buf0
, buf1
, tib40
, tib41
);
2075 __device__
static u32
rule_op_mangle_reflect (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2077 if ((in_len
+ in_len
) >= 32) return (in_len
);
2079 u32 out_len
= in_len
;
2084 reverse_block (buf0
, buf1
, tib40
, tib41
, out_len
);
2086 append_block8 (out_len
, buf0
, buf1
, buf0
, buf1
, tib40
, tib41
);
2093 __device__
static u32
rule_op_mangle_append (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2095 if ((in_len
+ 1) >= 32) return (in_len
);
2097 u32 out_len
= in_len
;
2099 append_block1 (out_len
, buf0
, buf1
, p0
);
2106 __device__
static u32
rule_op_mangle_prepend (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2108 if ((in_len
+ 1) >= 32) return (in_len
);
2110 u32 out_len
= in_len
;
2112 rshift_block (buf0
, buf1
, buf0
, buf1
);
2114 buf0
[0] = buf0
[0] | p0
;
2121 __device__
static u32
rule_op_mangle_rotate_left (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2123 if (in_len
== 0) return (in_len
);
2125 const u32 in_len1
= in_len
- 1;
2127 const u32 sh
= (in_len1
& 3) * 8;
2129 const u32x tmp
= (buf0
[0] & 0xff) << sh
;
2131 lshift_block (buf0
, buf1
, buf0
, buf1
);
2133 switch (in_len1
/ 4)
2135 case 0: buf0
[0] |= tmp
; break;
2136 case 1: buf0
[1] |= tmp
; break;
2137 case 2: buf0
[2] |= tmp
; break;
2138 case 3: buf0
[3] |= tmp
; break;
2139 case 4: buf1
[0] |= tmp
; break;
2140 case 5: buf1
[1] |= tmp
; break;
2141 case 6: buf1
[2] |= tmp
; break;
2142 case 7: buf1
[3] |= tmp
; break;
2148 __device__
static u32
rule_op_mangle_rotate_right (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2150 if (in_len
== 0) return (in_len
);
2152 const u32 in_len1
= in_len
- 1;
2154 const u32 sh
= (in_len1
& 3) * 8;
2158 switch (in_len1
/ 4)
2160 case 0: tmp
= (buf0
[0] >> sh
) & 0xff; break;
2161 case 1: tmp
= (buf0
[1] >> sh
) & 0xff; break;
2162 case 2: tmp
= (buf0
[2] >> sh
) & 0xff; break;
2163 case 3: tmp
= (buf0
[3] >> sh
) & 0xff; break;
2164 case 4: tmp
= (buf1
[0] >> sh
) & 0xff; break;
2165 case 5: tmp
= (buf1
[1] >> sh
) & 0xff; break;
2166 case 6: tmp
= (buf1
[2] >> sh
) & 0xff; break;
2167 case 7: tmp
= (buf1
[3] >> sh
) & 0xff; break;
2170 rshift_block (buf0
, buf1
, buf0
, buf1
);
2174 truncate_right (buf0
, buf1
, in_len
);
2179 __device__
static u32
rule_op_mangle_delete_first (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2181 if (in_len
== 0) return (in_len
);
2183 const u32 in_len1
= in_len
- 1;
2185 lshift_block (buf0
, buf1
, buf0
, buf1
);
2190 __device__
static u32
rule_op_mangle_delete_last (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2192 if (in_len
== 0) return (in_len
);
2194 const u32 in_len1
= in_len
- 1;
2196 const u32 tmp
= (1 << ((in_len1
& 3) * 8)) - 1;
2198 switch (in_len1
/ 4)
2200 case 0: buf0
[0] &= tmp
; break;
2201 case 1: buf0
[1] &= tmp
; break;
2202 case 2: buf0
[2] &= tmp
; break;
2203 case 3: buf0
[3] &= tmp
; break;
2204 case 4: buf1
[0] &= tmp
; break;
2205 case 5: buf1
[1] &= tmp
; break;
2206 case 6: buf1
[2] &= tmp
; break;
2207 case 7: buf1
[3] &= tmp
; break;
2213 __device__
static u32
rule_op_mangle_delete_at (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2215 if (p0
>= in_len
) return (in_len
);
2217 u32 out_len
= in_len
;
2222 lshift_block (buf0
, buf1
, tib40
, tib41
);
2224 const u32 ml
= (1 << ((p0
& 3) * 8)) - 1;
2229 case 0: buf0
[0] = (buf0
[0] & ml
)
2239 case 1: buf0
[1] = (buf0
[1] & ml
)
2248 case 2: buf0
[2] = (buf0
[2] & ml
)
2256 case 3: buf0
[3] = (buf0
[3] & ml
)
2263 case 4: buf1
[0] = (buf1
[0] & ml
)
2269 case 5: buf1
[1] = (buf1
[1] & ml
)
2274 case 6: buf1
[2] = (buf1
[2] & ml
)
2278 case 7: buf1
[3] = (buf1
[3] & ml
)
2288 __device__
static u32
rule_op_mangle_extract (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2290 if (p0
>= in_len
) return (in_len
);
2292 if ((p0
+ p1
) > in_len
) return (in_len
);
2296 lshift_block_N (buf0
, buf1
, buf0
, buf1
, p0
);
2298 truncate_right (buf0
, buf1
, out_len
);
2303 __device__
static u32
rule_op_mangle_omit (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2305 if (p0
>= in_len
) return (in_len
);
2307 if ((p0
+ p1
) > in_len
) return (in_len
);
2309 u32 out_len
= in_len
;
2323 lshift_block_N (buf0
, buf1
, tib40
, tib41
, p1
);
2325 const u32 ml
= (1 << ((p0
& 3) * 8)) - 1;
2330 case 0: buf0
[0] = (buf0
[0] & ml
)
2340 case 1: buf0
[1] = (buf0
[1] & ml
)
2349 case 2: buf0
[2] = (buf0
[2] & ml
)
2357 case 3: buf0
[3] = (buf0
[3] & ml
)
2364 case 4: buf1
[0] = (buf1
[0] & ml
)
2370 case 5: buf1
[1] = (buf1
[1] & ml
)
2375 case 6: buf1
[2] = (buf1
[2] & ml
)
2379 case 7: buf1
[3] = (buf1
[3] & ml
)
2389 __device__
static u32
rule_op_mangle_insert (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2391 if (p0
> in_len
) return (in_len
);
2393 if ((in_len
+ 1) >= 32) return (in_len
);
2395 u32 out_len
= in_len
;
2400 rshift_block (buf0
, buf1
, tib40
, tib41
);
2402 const u32 p1n
= p1
<< ((p0
& 3) * 8);
2404 const u32 ml
= (1 << ((p0
& 3) * 8)) - 1;
2406 const u32 mr
= 0xffffff00 << ((p0
& 3) * 8);
2410 case 0: buf0
[0] = (buf0
[0] & ml
) | p1n
| (tib40
[0] & mr
);
2419 case 1: buf0
[1] = (buf0
[1] & ml
) | p1n
| (tib40
[1] & mr
);
2427 case 2: buf0
[2] = (buf0
[2] & ml
) | p1n
| (tib40
[2] & mr
);
2434 case 3: buf0
[3] = (buf0
[3] & ml
) | p1n
| (tib40
[3] & mr
);
2440 case 4: buf1
[0] = (buf1
[0] & ml
) | p1n
| (tib41
[0] & mr
);
2445 case 5: buf1
[1] = (buf1
[1] & ml
) | p1n
| (tib41
[1] & mr
);
2449 case 6: buf1
[2] = (buf1
[2] & ml
) | p1n
| (tib41
[2] & mr
);
2452 case 7: buf1
[3] = (buf1
[3] & ml
) | p1n
| (tib41
[3] & mr
);
2461 __device__
static u32
rule_op_mangle_overstrike (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2463 if (p0
>= in_len
) return (in_len
);
2465 const u32 p1n
= p1
<< ((p0
& 3) * 8);
2467 const u32 m
= ~(0xff << ((p0
& 3) * 8));
2471 case 0: buf0
[0] = (buf0
[0] & m
) | p1n
; break;
2472 case 1: buf0
[1] = (buf0
[1] & m
) | p1n
; break;
2473 case 2: buf0
[2] = (buf0
[2] & m
) | p1n
; break;
2474 case 3: buf0
[3] = (buf0
[3] & m
) | p1n
; break;
2475 case 4: buf1
[0] = (buf1
[0] & m
) | p1n
; break;
2476 case 5: buf1
[1] = (buf1
[1] & m
) | p1n
; break;
2477 case 6: buf1
[2] = (buf1
[2] & m
) | p1n
; break;
2478 case 7: buf1
[3] = (buf1
[3] & m
) | p1n
; break;
2484 __device__
static u32
rule_op_mangle_truncate_at (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2486 if (p0
>= in_len
) return (in_len
);
2488 truncate_right (buf0
, buf1
, p0
);
2493 __device__
static u32
rule_op_mangle_replace (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2499 for (u32 i
= 0; i
< in_len
; i
++)
2503 case 0: if ((__byte_perm (buf0
[0], 0, 0x6540)) == p0
) buf0
[0] = __byte_perm (p1
, buf0
[0], 0x7650);
2505 case 1: if ((__byte_perm (buf0
[0], 0, 0x6541)) == p0
) buf0
[0] = __byte_perm (p1
, buf0
[0], 0x7604);
2507 case 2: if ((__byte_perm (buf0
[0], 0, 0x6542)) == p0
) buf0
[0] = __byte_perm (p1
, buf0
[0], 0x7054);
2509 case 3: if ((__byte_perm (buf0
[0], 0, 0x6543)) == p0
) buf0
[0] = __byte_perm (p1
, buf0
[0], 0x0654);
2511 case 4: if ((__byte_perm (buf0
[1], 0, 0x6540)) == p0
) buf0
[1] = __byte_perm (p1
, buf0
[1], 0x7650);
2513 case 5: if ((__byte_perm (buf0
[1], 0, 0x6541)) == p0
) buf0
[1] = __byte_perm (p1
, buf0
[1], 0x7604);
2515 case 6: if ((__byte_perm (buf0
[1], 0, 0x6542)) == p0
) buf0
[1] = __byte_perm (p1
, buf0
[1], 0x7054);
2517 case 7: if ((__byte_perm (buf0
[1], 0, 0x6543)) == p0
) buf0
[1] = __byte_perm (p1
, buf0
[1], 0x0654);
2519 case 8: if ((__byte_perm (buf0
[2], 0, 0x6540)) == p0
) buf0
[2] = __byte_perm (p1
, buf0
[2], 0x7650);
2521 case 9: if ((__byte_perm (buf0
[2], 0, 0x6541)) == p0
) buf0
[2] = __byte_perm (p1
, buf0
[2], 0x7604);
2523 case 10: if ((__byte_perm (buf0
[2], 0, 0x6542)) == p0
) buf0
[2] = __byte_perm (p1
, buf0
[2], 0x7054);
2525 case 11: if ((__byte_perm (buf0
[2], 0, 0x6543)) == p0
) buf0
[2] = __byte_perm (p1
, buf0
[2], 0x0654);
2527 case 12: if ((__byte_perm (buf0
[3], 0, 0x6540)) == p0
) buf0
[3] = __byte_perm (p1
, buf0
[3], 0x7650);
2529 case 13: if ((__byte_perm (buf0
[3], 0, 0x6541)) == p0
) buf0
[3] = __byte_perm (p1
, buf0
[3], 0x7604);
2531 case 14: if ((__byte_perm (buf0
[3], 0, 0x6542)) == p0
) buf0
[3] = __byte_perm (p1
, buf0
[3], 0x7054);
2533 case 15: if ((__byte_perm (buf0
[3], 0, 0x6543)) == p0
) buf0
[3] = __byte_perm (p1
, buf0
[3], 0x0654);
2535 case 16: if ((__byte_perm (buf1
[0], 0, 0x6540)) == p0
) buf1
[0] = __byte_perm (p1
, buf1
[0], 0x7650);
2537 case 17: if ((__byte_perm (buf1
[0], 0, 0x6541)) == p0
) buf1
[0] = __byte_perm (p1
, buf1
[0], 0x7604);
2539 case 18: if ((__byte_perm (buf1
[0], 0, 0x6542)) == p0
) buf1
[0] = __byte_perm (p1
, buf1
[0], 0x7054);
2541 case 19: if ((__byte_perm (buf1
[0], 0, 0x6543)) == p0
) buf1
[0] = __byte_perm (p1
, buf1
[0], 0x0654);
2543 case 20: if ((__byte_perm (buf1
[1], 0, 0x6540)) == p0
) buf1
[1] = __byte_perm (p1
, buf1
[1], 0x7650);
2545 case 21: if ((__byte_perm (buf1
[1], 0, 0x6541)) == p0
) buf1
[1] = __byte_perm (p1
, buf1
[1], 0x7604);
2547 case 22: if ((__byte_perm (buf1
[1], 0, 0x6542)) == p0
) buf1
[1] = __byte_perm (p1
, buf1
[1], 0x7054);
2549 case 23: if ((__byte_perm (buf1
[1], 0, 0x6543)) == p0
) buf1
[1] = __byte_perm (p1
, buf1
[1], 0x0654);
2551 case 24: if ((__byte_perm (buf1
[2], 0, 0x6540)) == p0
) buf1
[2] = __byte_perm (p1
, buf1
[2], 0x7650);
2553 case 25: if ((__byte_perm (buf1
[2], 0, 0x6541)) == p0
) buf1
[2] = __byte_perm (p1
, buf1
[2], 0x7604);
2555 case 26: if ((__byte_perm (buf1
[2], 0, 0x6542)) == p0
) buf1
[2] = __byte_perm (p1
, buf1
[2], 0x7054);
2557 case 27: if ((__byte_perm (buf1
[2], 0, 0x6543)) == p0
) buf1
[2] = __byte_perm (p1
, buf1
[2], 0x0654);
2559 case 28: if ((__byte_perm (buf1
[3], 0, 0x6540)) == p0
) buf1
[3] = __byte_perm (p1
, buf1
[3], 0x7650);
2561 case 29: if ((__byte_perm (buf1
[3], 0, 0x6541)) == p0
) buf1
[3] = __byte_perm (p1
, buf1
[3], 0x7604);
2563 case 30: if ((__byte_perm (buf1
[3], 0, 0x6542)) == p0
) buf1
[3] = __byte_perm (p1
, buf1
[3], 0x7054);
2565 case 31: if ((__byte_perm (buf1
[3], 0, 0x6543)) == p0
) buf1
[3] = __byte_perm (p1
, buf1
[3], 0x0654);
2574 for (u32 i
= 0; i
< in_len
; i
++)
2578 case 0: if ((__byte_perm (buf0
[0].x
, 0, 0x6540)) == p0
) buf0
[0].x
= __byte_perm (p1
, buf0
[0].x
, 0x7650);
2579 if ((__byte_perm (buf0
[0].y
, 0, 0x6540)) == p0
) buf0
[0].y
= __byte_perm (p1
, buf0
[0].y
, 0x7650);
2581 case 1: if ((__byte_perm (buf0
[0].x
, 0, 0x6541)) == p0
) buf0
[0].x
= __byte_perm (p1
, buf0
[0].x
, 0x7604);
2582 if ((__byte_perm (buf0
[0].y
, 0, 0x6541)) == p0
) buf0
[0].y
= __byte_perm (p1
, buf0
[0].y
, 0x7604);
2584 case 2: if ((__byte_perm (buf0
[0].x
, 0, 0x6542)) == p0
) buf0
[0].x
= __byte_perm (p1
, buf0
[0].x
, 0x7054);
2585 if ((__byte_perm (buf0
[0].y
, 0, 0x6542)) == p0
) buf0
[0].y
= __byte_perm (p1
, buf0
[0].y
, 0x7054);
2587 case 3: if ((__byte_perm (buf0
[0].x
, 0, 0x6543)) == p0
) buf0
[0].x
= __byte_perm (p1
, buf0
[0].x
, 0x0654);
2588 if ((__byte_perm (buf0
[0].y
, 0, 0x6543)) == p0
) buf0
[0].y
= __byte_perm (p1
, buf0
[0].y
, 0x0654);
2590 case 4: if ((__byte_perm (buf0
[1].x
, 0, 0x6540)) == p0
) buf0
[1].x
= __byte_perm (p1
, buf0
[1].x
, 0x7650);
2591 if ((__byte_perm (buf0
[1].y
, 0, 0x6540)) == p0
) buf0
[1].y
= __byte_perm (p1
, buf0
[1].y
, 0x7650);
2593 case 5: if ((__byte_perm (buf0
[1].x
, 0, 0x6541)) == p0
) buf0
[1].x
= __byte_perm (p1
, buf0
[1].x
, 0x7604);
2594 if ((__byte_perm (buf0
[1].y
, 0, 0x6541)) == p0
) buf0
[1].y
= __byte_perm (p1
, buf0
[1].y
, 0x7604);
2596 case 6: if ((__byte_perm (buf0
[1].x
, 0, 0x6542)) == p0
) buf0
[1].x
= __byte_perm (p1
, buf0
[1].x
, 0x7054);
2597 if ((__byte_perm (buf0
[1].y
, 0, 0x6542)) == p0
) buf0
[1].y
= __byte_perm (p1
, buf0
[1].y
, 0x7054);
2599 case 7: if ((__byte_perm (buf0
[1].x
, 0, 0x6543)) == p0
) buf0
[1].x
= __byte_perm (p1
, buf0
[1].x
, 0x0654);
2600 if ((__byte_perm (buf0
[1].y
, 0, 0x6543)) == p0
) buf0
[1].y
= __byte_perm (p1
, buf0
[1].y
, 0x0654);
2602 case 8: if ((__byte_perm (buf0
[2].x
, 0, 0x6540)) == p0
) buf0
[2].x
= __byte_perm (p1
, buf0
[2].x
, 0x7650);
2603 if ((__byte_perm (buf0
[2].y
, 0, 0x6540)) == p0
) buf0
[2].y
= __byte_perm (p1
, buf0
[2].y
, 0x7650);
2605 case 9: if ((__byte_perm (buf0
[2].x
, 0, 0x6541)) == p0
) buf0
[2].x
= __byte_perm (p1
, buf0
[2].x
, 0x7604);
2606 if ((__byte_perm (buf0
[2].y
, 0, 0x6541)) == p0
) buf0
[2].y
= __byte_perm (p1
, buf0
[2].y
, 0x7604);
2608 case 10: if ((__byte_perm (buf0
[2].x
, 0, 0x6542)) == p0
) buf0
[2].x
= __byte_perm (p1
, buf0
[2].x
, 0x7054);
2609 if ((__byte_perm (buf0
[2].y
, 0, 0x6542)) == p0
) buf0
[2].y
= __byte_perm (p1
, buf0
[2].y
, 0x7054);
2611 case 11: if ((__byte_perm (buf0
[2].x
, 0, 0x6543)) == p0
) buf0
[2].x
= __byte_perm (p1
, buf0
[2].x
, 0x0654);
2612 if ((__byte_perm (buf0
[2].y
, 0, 0x6543)) == p0
) buf0
[2].y
= __byte_perm (p1
, buf0
[2].y
, 0x0654);
2614 case 12: if ((__byte_perm (buf0
[3].x
, 0, 0x6540)) == p0
) buf0
[3].x
= __byte_perm (p1
, buf0
[3].x
, 0x7650);
2615 if ((__byte_perm (buf0
[3].y
, 0, 0x6540)) == p0
) buf0
[3].y
= __byte_perm (p1
, buf0
[3].y
, 0x7650);
2617 case 13: if ((__byte_perm (buf0
[3].x
, 0, 0x6541)) == p0
) buf0
[3].x
= __byte_perm (p1
, buf0
[3].x
, 0x7604);
2618 if ((__byte_perm (buf0
[3].y
, 0, 0x6541)) == p0
) buf0
[3].y
= __byte_perm (p1
, buf0
[3].y
, 0x7604);
2620 case 14: if ((__byte_perm (buf0
[3].x
, 0, 0x6542)) == p0
) buf0
[3].x
= __byte_perm (p1
, buf0
[3].x
, 0x7054);
2621 if ((__byte_perm (buf0
[3].y
, 0, 0x6542)) == p0
) buf0
[3].y
= __byte_perm (p1
, buf0
[3].y
, 0x7054);
2623 case 15: if ((__byte_perm (buf0
[3].x
, 0, 0x6543)) == p0
) buf0
[3].x
= __byte_perm (p1
, buf0
[3].x
, 0x0654);
2624 if ((__byte_perm (buf0
[3].y
, 0, 0x6543)) == p0
) buf0
[3].y
= __byte_perm (p1
, buf0
[3].y
, 0x0654);
2626 case 16: if ((__byte_perm (buf1
[0].x
, 0, 0x6540)) == p0
) buf1
[0].x
= __byte_perm (p1
, buf1
[0].x
, 0x7650);
2627 if ((__byte_perm (buf1
[0].y
, 0, 0x6540)) == p0
) buf1
[0].y
= __byte_perm (p1
, buf1
[0].y
, 0x7650);
2629 case 17: if ((__byte_perm (buf1
[0].x
, 0, 0x6541)) == p0
) buf1
[0].x
= __byte_perm (p1
, buf1
[0].x
, 0x7604);
2630 if ((__byte_perm (buf1
[0].y
, 0, 0x6541)) == p0
) buf1
[0].y
= __byte_perm (p1
, buf1
[0].y
, 0x7604);
2632 case 18: if ((__byte_perm (buf1
[0].x
, 0, 0x6542)) == p0
) buf1
[0].x
= __byte_perm (p1
, buf1
[0].x
, 0x7054);
2633 if ((__byte_perm (buf1
[0].y
, 0, 0x6542)) == p0
) buf1
[0].y
= __byte_perm (p1
, buf1
[0].y
, 0x7054);
2635 case 19: if ((__byte_perm (buf1
[0].x
, 0, 0x6543)) == p0
) buf1
[0].x
= __byte_perm (p1
, buf1
[0].x
, 0x0654);
2636 if ((__byte_perm (buf1
[0].y
, 0, 0x6543)) == p0
) buf1
[0].y
= __byte_perm (p1
, buf1
[0].y
, 0x0654);
2638 case 20: if ((__byte_perm (buf1
[1].x
, 0, 0x6540)) == p0
) buf1
[1].x
= __byte_perm (p1
, buf1
[1].x
, 0x7650);
2639 if ((__byte_perm (buf1
[1].y
, 0, 0x6540)) == p0
) buf1
[1].y
= __byte_perm (p1
, buf1
[1].y
, 0x7650);
2641 case 21: if ((__byte_perm (buf1
[1].x
, 0, 0x6541)) == p0
) buf1
[1].x
= __byte_perm (p1
, buf1
[1].x
, 0x7604);
2642 if ((__byte_perm (buf1
[1].y
, 0, 0x6541)) == p0
) buf1
[1].y
= __byte_perm (p1
, buf1
[1].y
, 0x7604);
2644 case 22: if ((__byte_perm (buf1
[1].x
, 0, 0x6542)) == p0
) buf1
[1].x
= __byte_perm (p1
, buf1
[1].x
, 0x7054);
2645 if ((__byte_perm (buf1
[1].y
, 0, 0x6542)) == p0
) buf1
[1].y
= __byte_perm (p1
, buf1
[1].y
, 0x7054);
2647 case 23: if ((__byte_perm (buf1
[1].x
, 0, 0x6543)) == p0
) buf1
[1].x
= __byte_perm (p1
, buf1
[1].x
, 0x0654);
2648 if ((__byte_perm (buf1
[1].y
, 0, 0x6543)) == p0
) buf1
[1].y
= __byte_perm (p1
, buf1
[1].y
, 0x0654);
2650 case 24: if ((__byte_perm (buf1
[2].x
, 0, 0x6540)) == p0
) buf1
[2].x
= __byte_perm (p1
, buf1
[2].x
, 0x7650);
2651 if ((__byte_perm (buf1
[2].y
, 0, 0x6540)) == p0
) buf1
[2].y
= __byte_perm (p1
, buf1
[2].y
, 0x7650);
2653 case 25: if ((__byte_perm (buf1
[2].x
, 0, 0x6541)) == p0
) buf1
[2].x
= __byte_perm (p1
, buf1
[2].x
, 0x7604);
2654 if ((__byte_perm (buf1
[2].y
, 0, 0x6541)) == p0
) buf1
[2].y
= __byte_perm (p1
, buf1
[2].y
, 0x7604);
2656 case 26: if ((__byte_perm (buf1
[2].x
, 0, 0x6542)) == p0
) buf1
[2].x
= __byte_perm (p1
, buf1
[2].x
, 0x7054);
2657 if ((__byte_perm (buf1
[2].y
, 0, 0x6542)) == p0
) buf1
[2].y
= __byte_perm (p1
, buf1
[2].y
, 0x7054);
2659 case 27: if ((__byte_perm (buf1
[2].x
, 0, 0x6543)) == p0
) buf1
[2].x
= __byte_perm (p1
, buf1
[2].x
, 0x0654);
2660 if ((__byte_perm (buf1
[2].y
, 0, 0x6543)) == p0
) buf1
[2].y
= __byte_perm (p1
, buf1
[2].y
, 0x0654);
2662 case 28: if ((__byte_perm (buf1
[3].x
, 0, 0x6540)) == p0
) buf1
[3].x
= __byte_perm (p1
, buf1
[3].x
, 0x7650);
2663 if ((__byte_perm (buf1
[3].y
, 0, 0x6540)) == p0
) buf1
[3].y
= __byte_perm (p1
, buf1
[3].y
, 0x7650);
2665 case 29: if ((__byte_perm (buf1
[3].x
, 0, 0x6541)) == p0
) buf1
[3].x
= __byte_perm (p1
, buf1
[3].x
, 0x7604);
2666 if ((__byte_perm (buf1
[3].y
, 0, 0x6541)) == p0
) buf1
[3].y
= __byte_perm (p1
, buf1
[3].y
, 0x7604);
2668 case 30: if ((__byte_perm (buf1
[3].x
, 0, 0x6542)) == p0
) buf1
[3].x
= __byte_perm (p1
, buf1
[3].x
, 0x7054);
2669 if ((__byte_perm (buf1
[3].y
, 0, 0x6542)) == p0
) buf1
[3].y
= __byte_perm (p1
, buf1
[3].y
, 0x7054);
2671 case 31: if ((__byte_perm (buf1
[3].x
, 0, 0x6543)) == p0
) buf1
[3].x
= __byte_perm (p1
, buf1
[3].x
, 0x0654);
2672 if ((__byte_perm (buf1
[3].y
, 0, 0x6543)) == p0
) buf1
[3].y
= __byte_perm (p1
, buf1
[3].y
, 0x0654);
2681 for (u32 i
= 0; i
< in_len
; i
++)
2685 case 0: if ((__byte_perm (buf0
[0].x
, 0, 0x6540)) == p0
) buf0
[0].x
= __byte_perm (p1
, buf0
[0].x
, 0x7650);
2686 if ((__byte_perm (buf0
[0].y
, 0, 0x6540)) == p0
) buf0
[0].y
= __byte_perm (p1
, buf0
[0].y
, 0x7650);
2687 if ((__byte_perm (buf0
[0].z
, 0, 0x6540)) == p0
) buf0
[0].z
= __byte_perm (p1
, buf0
[0].z
, 0x7650);
2688 if ((__byte_perm (buf0
[0].w
, 0, 0x6540)) == p0
) buf0
[0].w
= __byte_perm (p1
, buf0
[0].w
, 0x7650);
2690 case 1: if ((__byte_perm (buf0
[0].x
, 0, 0x6541)) == p0
) buf0
[0].x
= __byte_perm (p1
, buf0
[0].x
, 0x7604);
2691 if ((__byte_perm (buf0
[0].y
, 0, 0x6541)) == p0
) buf0
[0].y
= __byte_perm (p1
, buf0
[0].y
, 0x7604);
2692 if ((__byte_perm (buf0
[0].z
, 0, 0x6541)) == p0
) buf0
[0].z
= __byte_perm (p1
, buf0
[0].z
, 0x7604);
2693 if ((__byte_perm (buf0
[0].w
, 0, 0x6541)) == p0
) buf0
[0].w
= __byte_perm (p1
, buf0
[0].w
, 0x7604);
2695 case 2: if ((__byte_perm (buf0
[0].x
, 0, 0x6542)) == p0
) buf0
[0].x
= __byte_perm (p1
, buf0
[0].x
, 0x7054);
2696 if ((__byte_perm (buf0
[0].y
, 0, 0x6542)) == p0
) buf0
[0].y
= __byte_perm (p1
, buf0
[0].y
, 0x7054);
2697 if ((__byte_perm (buf0
[0].z
, 0, 0x6542)) == p0
) buf0
[0].z
= __byte_perm (p1
, buf0
[0].z
, 0x7054);
2698 if ((__byte_perm (buf0
[0].w
, 0, 0x6542)) == p0
) buf0
[0].w
= __byte_perm (p1
, buf0
[0].w
, 0x7054);
2700 case 3: if ((__byte_perm (buf0
[0].x
, 0, 0x6543)) == p0
) buf0
[0].x
= __byte_perm (p1
, buf0
[0].x
, 0x0654);
2701 if ((__byte_perm (buf0
[0].y
, 0, 0x6543)) == p0
) buf0
[0].y
= __byte_perm (p1
, buf0
[0].y
, 0x0654);
2702 if ((__byte_perm (buf0
[0].z
, 0, 0x6543)) == p0
) buf0
[0].z
= __byte_perm (p1
, buf0
[0].z
, 0x0654);
2703 if ((__byte_perm (buf0
[0].w
, 0, 0x6543)) == p0
) buf0
[0].w
= __byte_perm (p1
, buf0
[0].w
, 0x0654);
2705 case 4: if ((__byte_perm (buf0
[1].x
, 0, 0x6540)) == p0
) buf0
[1].x
= __byte_perm (p1
, buf0
[1].x
, 0x7650);
2706 if ((__byte_perm (buf0
[1].y
, 0, 0x6540)) == p0
) buf0
[1].y
= __byte_perm (p1
, buf0
[1].y
, 0x7650);
2707 if ((__byte_perm (buf0
[1].z
, 0, 0x6540)) == p0
) buf0
[1].z
= __byte_perm (p1
, buf0
[1].z
, 0x7650);
2708 if ((__byte_perm (buf0
[1].w
, 0, 0x6540)) == p0
) buf0
[1].w
= __byte_perm (p1
, buf0
[1].w
, 0x7650);
2710 case 5: if ((__byte_perm (buf0
[1].x
, 0, 0x6541)) == p0
) buf0
[1].x
= __byte_perm (p1
, buf0
[1].x
, 0x7604);
2711 if ((__byte_perm (buf0
[1].y
, 0, 0x6541)) == p0
) buf0
[1].y
= __byte_perm (p1
, buf0
[1].y
, 0x7604);
2712 if ((__byte_perm (buf0
[1].z
, 0, 0x6541)) == p0
) buf0
[1].z
= __byte_perm (p1
, buf0
[1].z
, 0x7604);
2713 if ((__byte_perm (buf0
[1].w
, 0, 0x6541)) == p0
) buf0
[1].w
= __byte_perm (p1
, buf0
[1].w
, 0x7604);
2715 case 6: if ((__byte_perm (buf0
[1].x
, 0, 0x6542)) == p0
) buf0
[1].x
= __byte_perm (p1
, buf0
[1].x
, 0x7054);
2716 if ((__byte_perm (buf0
[1].y
, 0, 0x6542)) == p0
) buf0
[1].y
= __byte_perm (p1
, buf0
[1].y
, 0x7054);
2717 if ((__byte_perm (buf0
[1].z
, 0, 0x6542)) == p0
) buf0
[1].z
= __byte_perm (p1
, buf0
[1].z
, 0x7054);
2718 if ((__byte_perm (buf0
[1].w
, 0, 0x6542)) == p0
) buf0
[1].w
= __byte_perm (p1
, buf0
[1].w
, 0x7054);
2720 case 7: if ((__byte_perm (buf0
[1].x
, 0, 0x6543)) == p0
) buf0
[1].x
= __byte_perm (p1
, buf0
[1].x
, 0x0654);
2721 if ((__byte_perm (buf0
[1].y
, 0, 0x6543)) == p0
) buf0
[1].y
= __byte_perm (p1
, buf0
[1].y
, 0x0654);
2722 if ((__byte_perm (buf0
[1].z
, 0, 0x6543)) == p0
) buf0
[1].z
= __byte_perm (p1
, buf0
[1].z
, 0x0654);
2723 if ((__byte_perm (buf0
[1].w
, 0, 0x6543)) == p0
) buf0
[1].w
= __byte_perm (p1
, buf0
[1].w
, 0x0654);
2725 case 8: if ((__byte_perm (buf0
[2].x
, 0, 0x6540)) == p0
) buf0
[2].x
= __byte_perm (p1
, buf0
[2].x
, 0x7650);
2726 if ((__byte_perm (buf0
[2].y
, 0, 0x6540)) == p0
) buf0
[2].y
= __byte_perm (p1
, buf0
[2].y
, 0x7650);
2727 if ((__byte_perm (buf0
[2].z
, 0, 0x6540)) == p0
) buf0
[2].z
= __byte_perm (p1
, buf0
[2].z
, 0x7650);
2728 if ((__byte_perm (buf0
[2].w
, 0, 0x6540)) == p0
) buf0
[2].w
= __byte_perm (p1
, buf0
[2].w
, 0x7650);
2730 case 9: if ((__byte_perm (buf0
[2].x
, 0, 0x6541)) == p0
) buf0
[2].x
= __byte_perm (p1
, buf0
[2].x
, 0x7604);
2731 if ((__byte_perm (buf0
[2].y
, 0, 0x6541)) == p0
) buf0
[2].y
= __byte_perm (p1
, buf0
[2].y
, 0x7604);
2732 if ((__byte_perm (buf0
[2].z
, 0, 0x6541)) == p0
) buf0
[2].z
= __byte_perm (p1
, buf0
[2].z
, 0x7604);
2733 if ((__byte_perm (buf0
[2].w
, 0, 0x6541)) == p0
) buf0
[2].w
= __byte_perm (p1
, buf0
[2].w
, 0x7604);
2735 case 10: if ((__byte_perm (buf0
[2].x
, 0, 0x6542)) == p0
) buf0
[2].x
= __byte_perm (p1
, buf0
[2].x
, 0x7054);
2736 if ((__byte_perm (buf0
[2].y
, 0, 0x6542)) == p0
) buf0
[2].y
= __byte_perm (p1
, buf0
[2].y
, 0x7054);
2737 if ((__byte_perm (buf0
[2].z
, 0, 0x6542)) == p0
) buf0
[2].z
= __byte_perm (p1
, buf0
[2].z
, 0x7054);
2738 if ((__byte_perm (buf0
[2].w
, 0, 0x6542)) == p0
) buf0
[2].w
= __byte_perm (p1
, buf0
[2].w
, 0x7054);
2740 case 11: if ((__byte_perm (buf0
[2].x
, 0, 0x6543)) == p0
) buf0
[2].x
= __byte_perm (p1
, buf0
[2].x
, 0x0654);
2741 if ((__byte_perm (buf0
[2].y
, 0, 0x6543)) == p0
) buf0
[2].y
= __byte_perm (p1
, buf0
[2].y
, 0x0654);
2742 if ((__byte_perm (buf0
[2].z
, 0, 0x6543)) == p0
) buf0
[2].z
= __byte_perm (p1
, buf0
[2].z
, 0x0654);
2743 if ((__byte_perm (buf0
[2].w
, 0, 0x6543)) == p0
) buf0
[2].w
= __byte_perm (p1
, buf0
[2].w
, 0x0654);
2745 case 12: if ((__byte_perm (buf0
[3].x
, 0, 0x6540)) == p0
) buf0
[3].x
= __byte_perm (p1
, buf0
[3].x
, 0x7650);
2746 if ((__byte_perm (buf0
[3].y
, 0, 0x6540)) == p0
) buf0
[3].y
= __byte_perm (p1
, buf0
[3].y
, 0x7650);
2747 if ((__byte_perm (buf0
[3].z
, 0, 0x6540)) == p0
) buf0
[3].z
= __byte_perm (p1
, buf0
[3].z
, 0x7650);
2748 if ((__byte_perm (buf0
[3].w
, 0, 0x6540)) == p0
) buf0
[3].w
= __byte_perm (p1
, buf0
[3].w
, 0x7650);
2750 case 13: if ((__byte_perm (buf0
[3].x
, 0, 0x6541)) == p0
) buf0
[3].x
= __byte_perm (p1
, buf0
[3].x
, 0x7604);
2751 if ((__byte_perm (buf0
[3].y
, 0, 0x6541)) == p0
) buf0
[3].y
= __byte_perm (p1
, buf0
[3].y
, 0x7604);
2752 if ((__byte_perm (buf0
[3].z
, 0, 0x6541)) == p0
) buf0
[3].z
= __byte_perm (p1
, buf0
[3].z
, 0x7604);
2753 if ((__byte_perm (buf0
[3].w
, 0, 0x6541)) == p0
) buf0
[3].w
= __byte_perm (p1
, buf0
[3].w
, 0x7604);
2755 case 14: if ((__byte_perm (buf0
[3].x
, 0, 0x6542)) == p0
) buf0
[3].x
= __byte_perm (p1
, buf0
[3].x
, 0x7054);
2756 if ((__byte_perm (buf0
[3].y
, 0, 0x6542)) == p0
) buf0
[3].y
= __byte_perm (p1
, buf0
[3].y
, 0x7054);
2757 if ((__byte_perm (buf0
[3].z
, 0, 0x6542)) == p0
) buf0
[3].z
= __byte_perm (p1
, buf0
[3].z
, 0x7054);
2758 if ((__byte_perm (buf0
[3].w
, 0, 0x6542)) == p0
) buf0
[3].w
= __byte_perm (p1
, buf0
[3].w
, 0x7054);
2760 case 15: if ((__byte_perm (buf0
[3].x
, 0, 0x6543)) == p0
) buf0
[3].x
= __byte_perm (p1
, buf0
[3].x
, 0x0654);
2761 if ((__byte_perm (buf0
[3].y
, 0, 0x6543)) == p0
) buf0
[3].y
= __byte_perm (p1
, buf0
[3].y
, 0x0654);
2762 if ((__byte_perm (buf0
[3].z
, 0, 0x6543)) == p0
) buf0
[3].z
= __byte_perm (p1
, buf0
[3].z
, 0x0654);
2763 if ((__byte_perm (buf0
[3].w
, 0, 0x6543)) == p0
) buf0
[3].w
= __byte_perm (p1
, buf0
[3].w
, 0x0654);
2765 case 16: if ((__byte_perm (buf1
[0].x
, 0, 0x6540)) == p0
) buf1
[0].x
= __byte_perm (p1
, buf1
[0].x
, 0x7650);
2766 if ((__byte_perm (buf1
[0].y
, 0, 0x6540)) == p0
) buf1
[0].y
= __byte_perm (p1
, buf1
[0].y
, 0x7650);
2767 if ((__byte_perm (buf1
[0].z
, 0, 0x6540)) == p0
) buf1
[0].z
= __byte_perm (p1
, buf1
[0].z
, 0x7650);
2768 if ((__byte_perm (buf1
[0].w
, 0, 0x6540)) == p0
) buf1
[0].w
= __byte_perm (p1
, buf1
[0].w
, 0x7650);
2770 case 17: if ((__byte_perm (buf1
[0].x
, 0, 0x6541)) == p0
) buf1
[0].x
= __byte_perm (p1
, buf1
[0].x
, 0x7604);
2771 if ((__byte_perm (buf1
[0].y
, 0, 0x6541)) == p0
) buf1
[0].y
= __byte_perm (p1
, buf1
[0].y
, 0x7604);
2772 if ((__byte_perm (buf1
[0].z
, 0, 0x6541)) == p0
) buf1
[0].z
= __byte_perm (p1
, buf1
[0].z
, 0x7604);
2773 if ((__byte_perm (buf1
[0].w
, 0, 0x6541)) == p0
) buf1
[0].w
= __byte_perm (p1
, buf1
[0].w
, 0x7604);
2775 case 18: if ((__byte_perm (buf1
[0].x
, 0, 0x6542)) == p0
) buf1
[0].x
= __byte_perm (p1
, buf1
[0].x
, 0x7054);
2776 if ((__byte_perm (buf1
[0].y
, 0, 0x6542)) == p0
) buf1
[0].y
= __byte_perm (p1
, buf1
[0].y
, 0x7054);
2777 if ((__byte_perm (buf1
[0].z
, 0, 0x6542)) == p0
) buf1
[0].z
= __byte_perm (p1
, buf1
[0].z
, 0x7054);
2778 if ((__byte_perm (buf1
[0].w
, 0, 0x6542)) == p0
) buf1
[0].w
= __byte_perm (p1
, buf1
[0].w
, 0x7054);
2780 case 19: if ((__byte_perm (buf1
[0].x
, 0, 0x6543)) == p0
) buf1
[0].x
= __byte_perm (p1
, buf1
[0].x
, 0x0654);
2781 if ((__byte_perm (buf1
[0].y
, 0, 0x6543)) == p0
) buf1
[0].y
= __byte_perm (p1
, buf1
[0].y
, 0x0654);
2782 if ((__byte_perm (buf1
[0].z
, 0, 0x6543)) == p0
) buf1
[0].z
= __byte_perm (p1
, buf1
[0].z
, 0x0654);
2783 if ((__byte_perm (buf1
[0].w
, 0, 0x6543)) == p0
) buf1
[0].w
= __byte_perm (p1
, buf1
[0].w
, 0x0654);
2785 case 20: if ((__byte_perm (buf1
[1].x
, 0, 0x6540)) == p0
) buf1
[1].x
= __byte_perm (p1
, buf1
[1].x
, 0x7650);
2786 if ((__byte_perm (buf1
[1].y
, 0, 0x6540)) == p0
) buf1
[1].y
= __byte_perm (p1
, buf1
[1].y
, 0x7650);
2787 if ((__byte_perm (buf1
[1].z
, 0, 0x6540)) == p0
) buf1
[1].z
= __byte_perm (p1
, buf1
[1].z
, 0x7650);
2788 if ((__byte_perm (buf1
[1].w
, 0, 0x6540)) == p0
) buf1
[1].w
= __byte_perm (p1
, buf1
[1].w
, 0x7650);
2790 case 21: if ((__byte_perm (buf1
[1].x
, 0, 0x6541)) == p0
) buf1
[1].x
= __byte_perm (p1
, buf1
[1].x
, 0x7604);
2791 if ((__byte_perm (buf1
[1].y
, 0, 0x6541)) == p0
) buf1
[1].y
= __byte_perm (p1
, buf1
[1].y
, 0x7604);
2792 if ((__byte_perm (buf1
[1].z
, 0, 0x6541)) == p0
) buf1
[1].z
= __byte_perm (p1
, buf1
[1].z
, 0x7604);
2793 if ((__byte_perm (buf1
[1].w
, 0, 0x6541)) == p0
) buf1
[1].w
= __byte_perm (p1
, buf1
[1].w
, 0x7604);
2795 case 22: if ((__byte_perm (buf1
[1].x
, 0, 0x6542)) == p0
) buf1
[1].x
= __byte_perm (p1
, buf1
[1].x
, 0x7054);
2796 if ((__byte_perm (buf1
[1].y
, 0, 0x6542)) == p0
) buf1
[1].y
= __byte_perm (p1
, buf1
[1].y
, 0x7054);
2797 if ((__byte_perm (buf1
[1].z
, 0, 0x6542)) == p0
) buf1
[1].z
= __byte_perm (p1
, buf1
[1].z
, 0x7054);
2798 if ((__byte_perm (buf1
[1].w
, 0, 0x6542)) == p0
) buf1
[1].w
= __byte_perm (p1
, buf1
[1].w
, 0x7054);
2800 case 23: if ((__byte_perm (buf1
[1].x
, 0, 0x6543)) == p0
) buf1
[1].x
= __byte_perm (p1
, buf1
[1].x
, 0x0654);
2801 if ((__byte_perm (buf1
[1].y
, 0, 0x6543)) == p0
) buf1
[1].y
= __byte_perm (p1
, buf1
[1].y
, 0x0654);
2802 if ((__byte_perm (buf1
[1].z
, 0, 0x6543)) == p0
) buf1
[1].z
= __byte_perm (p1
, buf1
[1].z
, 0x0654);
2803 if ((__byte_perm (buf1
[1].w
, 0, 0x6543)) == p0
) buf1
[1].w
= __byte_perm (p1
, buf1
[1].w
, 0x0654);
2805 case 24: if ((__byte_perm (buf1
[2].x
, 0, 0x6540)) == p0
) buf1
[2].x
= __byte_perm (p1
, buf1
[2].x
, 0x7650);
2806 if ((__byte_perm (buf1
[2].y
, 0, 0x6540)) == p0
) buf1
[2].y
= __byte_perm (p1
, buf1
[2].y
, 0x7650);
2807 if ((__byte_perm (buf1
[2].z
, 0, 0x6540)) == p0
) buf1
[2].z
= __byte_perm (p1
, buf1
[2].z
, 0x7650);
2808 if ((__byte_perm (buf1
[2].w
, 0, 0x6540)) == p0
) buf1
[2].w
= __byte_perm (p1
, buf1
[2].w
, 0x7650);
2810 case 25: if ((__byte_perm (buf1
[2].x
, 0, 0x6541)) == p0
) buf1
[2].x
= __byte_perm (p1
, buf1
[2].x
, 0x7604);
2811 if ((__byte_perm (buf1
[2].y
, 0, 0x6541)) == p0
) buf1
[2].y
= __byte_perm (p1
, buf1
[2].y
, 0x7604);
2812 if ((__byte_perm (buf1
[2].z
, 0, 0x6541)) == p0
) buf1
[2].z
= __byte_perm (p1
, buf1
[2].z
, 0x7604);
2813 if ((__byte_perm (buf1
[2].w
, 0, 0x6541)) == p0
) buf1
[2].w
= __byte_perm (p1
, buf1
[2].w
, 0x7604);
2815 case 26: if ((__byte_perm (buf1
[2].x
, 0, 0x6542)) == p0
) buf1
[2].x
= __byte_perm (p1
, buf1
[2].x
, 0x7054);
2816 if ((__byte_perm (buf1
[2].y
, 0, 0x6542)) == p0
) buf1
[2].y
= __byte_perm (p1
, buf1
[2].y
, 0x7054);
2817 if ((__byte_perm (buf1
[2].z
, 0, 0x6542)) == p0
) buf1
[2].z
= __byte_perm (p1
, buf1
[2].z
, 0x7054);
2818 if ((__byte_perm (buf1
[2].w
, 0, 0x6542)) == p0
) buf1
[2].w
= __byte_perm (p1
, buf1
[2].w
, 0x7054);
2820 case 27: if ((__byte_perm (buf1
[2].x
, 0, 0x6543)) == p0
) buf1
[2].x
= __byte_perm (p1
, buf1
[2].x
, 0x0654);
2821 if ((__byte_perm (buf1
[2].y
, 0, 0x6543)) == p0
) buf1
[2].y
= __byte_perm (p1
, buf1
[2].y
, 0x0654);
2822 if ((__byte_perm (buf1
[2].z
, 0, 0x6543)) == p0
) buf1
[2].z
= __byte_perm (p1
, buf1
[2].z
, 0x0654);
2823 if ((__byte_perm (buf1
[2].w
, 0, 0x6543)) == p0
) buf1
[2].w
= __byte_perm (p1
, buf1
[2].w
, 0x0654);
2825 case 28: if ((__byte_perm (buf1
[3].x
, 0, 0x6540)) == p0
) buf1
[3].x
= __byte_perm (p1
, buf1
[3].x
, 0x7650);
2826 if ((__byte_perm (buf1
[3].y
, 0, 0x6540)) == p0
) buf1
[3].y
= __byte_perm (p1
, buf1
[3].y
, 0x7650);
2827 if ((__byte_perm (buf1
[3].z
, 0, 0x6540)) == p0
) buf1
[3].z
= __byte_perm (p1
, buf1
[3].z
, 0x7650);
2828 if ((__byte_perm (buf1
[3].w
, 0, 0x6540)) == p0
) buf1
[3].w
= __byte_perm (p1
, buf1
[3].w
, 0x7650);
2830 case 29: if ((__byte_perm (buf1
[3].x
, 0, 0x6541)) == p0
) buf1
[3].x
= __byte_perm (p1
, buf1
[3].x
, 0x7604);
2831 if ((__byte_perm (buf1
[3].y
, 0, 0x6541)) == p0
) buf1
[3].y
= __byte_perm (p1
, buf1
[3].y
, 0x7604);
2832 if ((__byte_perm (buf1
[3].z
, 0, 0x6541)) == p0
) buf1
[3].z
= __byte_perm (p1
, buf1
[3].z
, 0x7604);
2833 if ((__byte_perm (buf1
[3].w
, 0, 0x6541)) == p0
) buf1
[3].w
= __byte_perm (p1
, buf1
[3].w
, 0x7604);
2835 case 30: if ((__byte_perm (buf1
[3].x
, 0, 0x6542)) == p0
) buf1
[3].x
= __byte_perm (p1
, buf1
[3].x
, 0x7054);
2836 if ((__byte_perm (buf1
[3].y
, 0, 0x6542)) == p0
) buf1
[3].y
= __byte_perm (p1
, buf1
[3].y
, 0x7054);
2837 if ((__byte_perm (buf1
[3].z
, 0, 0x6542)) == p0
) buf1
[3].z
= __byte_perm (p1
, buf1
[3].z
, 0x7054);
2838 if ((__byte_perm (buf1
[3].w
, 0, 0x6542)) == p0
) buf1
[3].w
= __byte_perm (p1
, buf1
[3].w
, 0x7054);
2840 case 31: if ((__byte_perm (buf1
[3].x
, 0, 0x6543)) == p0
) buf1
[3].x
= __byte_perm (p1
, buf1
[3].x
, 0x0654);
2841 if ((__byte_perm (buf1
[3].y
, 0, 0x6543)) == p0
) buf1
[3].y
= __byte_perm (p1
, buf1
[3].y
, 0x0654);
2842 if ((__byte_perm (buf1
[3].z
, 0, 0x6543)) == p0
) buf1
[3].z
= __byte_perm (p1
, buf1
[3].z
, 0x0654);
2843 if ((__byte_perm (buf1
[3].w
, 0, 0x6543)) == p0
) buf1
[3].w
= __byte_perm (p1
, buf1
[3].w
, 0x0654);
2854 __device__
static u32
rule_op_mangle_purgechar (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2860 __device__
static u32
rule_op_mangle_togglecase_rec (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2866 __device__
static u32
rule_op_mangle_dupechar_first (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
2868 if ( in_len
== 0) return (in_len
);
2869 if ((in_len
+ p0
) >= 32) return (in_len
);
2871 u32 out_len
= in_len
;
2873 const u32x tmp
= buf0
[0] & 0xFF;
2875 rshift_block_N (buf0
, buf1
, buf0
, buf1
, p0
);
2877 #if __CUDA_ARCH__ >= 200
2881 case 1: buf0
[0] |= tmp
;
2883 case 2: buf0
[0] |= __byte_perm (tmp
, 0, 0x5400);
2885 case 3: buf0
[0] |= __byte_perm (tmp
, 0, 0x4000);
2887 case 4: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2889 case 5: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2892 case 6: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2893 buf0
[1] |= __byte_perm (tmp
, 0, 0x5400);
2895 case 7: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2896 buf0
[1] |= __byte_perm (tmp
, 0, 0x4000);
2898 case 8: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2899 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2901 case 9: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2902 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2905 case 10: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2906 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2907 buf0
[2] |= __byte_perm (tmp
, 0, 0x5400);
2909 case 11: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2910 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2911 buf0
[2] |= __byte_perm (tmp
, 0, 0x4000);
2913 case 12: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2914 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2915 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2917 case 13: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2918 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2919 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2922 case 14: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2923 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2924 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2925 buf0
[3] |= __byte_perm (tmp
, 0, 0x5400);
2927 case 15: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2928 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2929 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2930 buf0
[3] |= __byte_perm (tmp
, 0, 0x4000);
2932 case 16: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2933 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2934 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2935 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2937 case 17: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2938 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2939 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2940 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2943 case 18: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2944 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2945 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2946 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2947 buf1
[0] |= __byte_perm (tmp
, 0, 0x5400);
2949 case 19: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2950 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2951 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2952 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2953 buf1
[0] |= __byte_perm (tmp
, 0, 0x4000);
2955 case 20: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2956 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2957 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2958 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2959 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
2961 case 21: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2962 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2963 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2964 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2965 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
2968 case 22: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2969 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2970 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2971 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2972 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
2973 buf1
[1] |= __byte_perm (tmp
, 0, 0x5400);
2975 case 23: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2976 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2977 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2978 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2979 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
2980 buf1
[1] |= __byte_perm (tmp
, 0, 0x4000);
2982 case 24: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2983 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2984 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2985 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2986 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
2987 buf1
[1] |= __byte_perm (tmp
, 0, 0x0000);
2989 case 25: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2990 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2991 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
2992 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
2993 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
2994 buf1
[1] |= __byte_perm (tmp
, 0, 0x0000);
2997 case 26: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
2998 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
2999 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
3000 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
3001 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
3002 buf1
[1] |= __byte_perm (tmp
, 0, 0x0000);
3003 buf1
[2] |= __byte_perm (tmp
, 0, 0x5400);
3005 case 27: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
3006 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
3007 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
3008 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
3009 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
3010 buf1
[1] |= __byte_perm (tmp
, 0, 0x0000);
3011 buf1
[2] |= __byte_perm (tmp
, 0, 0x4000);
3013 case 28: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
3014 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
3015 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
3016 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
3017 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
3018 buf1
[1] |= __byte_perm (tmp
, 0, 0x0000);
3019 buf1
[2] |= __byte_perm (tmp
, 0, 0x0000);
3021 case 29: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
3022 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
3023 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
3024 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
3025 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
3026 buf1
[1] |= __byte_perm (tmp
, 0, 0x0000);
3027 buf1
[2] |= __byte_perm (tmp
, 0, 0x0000);
3030 case 30: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
3031 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
3032 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
3033 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
3034 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
3035 buf1
[1] |= __byte_perm (tmp
, 0, 0x0000);
3036 buf1
[2] |= __byte_perm (tmp
, 0, 0x0000);
3037 buf1
[3] |= __byte_perm (tmp
, 0, 0x5400);
3039 case 31: buf0
[0] |= __byte_perm (tmp
, 0, 0x0000);
3040 buf0
[1] |= __byte_perm (tmp
, 0, 0x0000);
3041 buf0
[2] |= __byte_perm (tmp
, 0, 0x0000);
3042 buf0
[3] |= __byte_perm (tmp
, 0, 0x0000);
3043 buf1
[0] |= __byte_perm (tmp
, 0, 0x0000);
3044 buf1
[1] |= __byte_perm (tmp
, 0, 0x0000);
3045 buf1
[2] |= __byte_perm (tmp
, 0, 0x0000);
3046 buf1
[3] |= __byte_perm (tmp
, 0, 0x4000);
3054 case 1: buf0
[0] |= tmp
<< 0;
3056 case 2: buf0
[0] |= tmp
<< 0 | tmp
<< 8;
3058 case 3: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16;
3060 case 4: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3062 case 5: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3063 buf0
[1] |= tmp
<< 0;
3065 case 6: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3066 buf0
[1] |= tmp
<< 0 | tmp
<< 8;
3068 case 7: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3069 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16;
3071 case 8: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3072 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3074 case 9: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3075 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3076 buf0
[2] |= tmp
<< 0;
3078 case 10: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3079 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3080 buf0
[2] |= tmp
<< 0 | tmp
<< 8;
3082 case 11: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3083 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3084 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16;
3086 case 12: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3087 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3088 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3090 case 13: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3091 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3092 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3093 buf0
[3] |= tmp
<< 0;
3095 case 14: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3096 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3097 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3098 buf0
[3] |= tmp
<< 0 | tmp
<< 8;
3100 case 15: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3101 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3102 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3103 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16;
3105 case 16: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3106 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3107 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3108 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3110 case 17: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3111 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3112 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3113 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3114 buf1
[0] |= tmp
<< 0;
3116 case 18: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3117 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3118 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3119 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3120 buf1
[0] |= tmp
<< 0 | tmp
<< 8;
3122 case 19: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3123 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3124 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3125 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3126 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16;
3128 case 20: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3129 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3130 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3131 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3132 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3134 case 21: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3135 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3136 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3137 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3138 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3139 buf1
[1] |= tmp
<< 0;
3141 case 22: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3142 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3143 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3144 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3145 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3146 buf1
[1] |= tmp
<< 0 | tmp
<< 8;
3148 case 23: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3149 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3150 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3151 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3152 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3153 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16;
3155 case 24: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3156 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3157 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3158 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3159 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3160 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3162 case 25: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3163 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3164 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3165 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3166 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3167 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3168 buf1
[2] |= tmp
<< 0;
3170 case 26: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3171 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3172 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3173 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3174 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3175 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3176 buf1
[2] |= tmp
<< 0 | tmp
<< 8;
3178 case 27: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3179 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3180 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3181 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3182 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3183 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3184 buf1
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16;
3186 case 28: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3187 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3188 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3189 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3190 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3191 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3192 buf1
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3194 case 29: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3195 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3196 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3197 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3198 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3199 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3200 buf1
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3201 buf1
[3] |= tmp
<< 0;
3203 case 30: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3204 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3205 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3206 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3207 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3208 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3209 buf1
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3210 buf1
[3] |= tmp
<< 0 | tmp
<< 8;
3212 case 31: buf0
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3213 buf0
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3214 buf0
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3215 buf0
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3216 buf1
[0] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3217 buf1
[1] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3218 buf1
[2] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16 | tmp
<< 24;
3219 buf1
[3] |= tmp
<< 0 | tmp
<< 8 | tmp
<< 16;
3230 __device__
static u32
rule_op_mangle_dupechar_last (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
3232 if ( in_len
== 0) return (in_len
);
3233 if ((in_len
+ p0
) >= 32) return (in_len
);
3235 const u32 in_len1
= in_len
- 1;
3237 const u32 sh
= (in_len1
& 3) * 8;
3241 switch (in_len1
/ 4)
3243 case 0: tmp
= (buf0
[0] >> sh
) & 0xff; break;
3244 case 1: tmp
= (buf0
[1] >> sh
) & 0xff; break;
3245 case 2: tmp
= (buf0
[2] >> sh
) & 0xff; break;
3246 case 3: tmp
= (buf0
[3] >> sh
) & 0xff; break;
3247 case 4: tmp
= (buf1
[0] >> sh
) & 0xff; break;
3248 case 5: tmp
= (buf1
[1] >> sh
) & 0xff; break;
3249 case 6: tmp
= (buf1
[2] >> sh
) & 0xff; break;
3250 case 7: tmp
= (buf1
[3] >> sh
) & 0xff; break;
3253 u32 out_len
= in_len
;
3255 for (u32 i
= 0; i
< p0
; i
++)
3257 append_block1 (out_len
, buf0
, buf1
, tmp
);
3265 __device__
static u32
rule_op_mangle_dupechar_all (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
3267 if ( in_len
== 0) return (in_len
);
3268 if ((in_len
+ in_len
) >= 32) return (in_len
);
3270 u32 out_len
= in_len
;
3275 #if __CUDA_ARCH__ >= 200
3277 tib40
[0] = __byte_perm (buf0
[0], 0, 0x1100);
3278 tib40
[1] = __byte_perm (buf0
[0], 0, 0x3322);
3279 tib40
[2] = __byte_perm (buf0
[1], 0, 0x1100);
3280 tib40
[3] = __byte_perm (buf0
[1], 0, 0x3322);
3281 tib41
[0] = __byte_perm (buf0
[2], 0, 0x1100);
3282 tib41
[1] = __byte_perm (buf0
[2], 0, 0x3322);
3283 tib41
[2] = __byte_perm (buf0
[3], 0, 0x1100);
3284 tib41
[3] = __byte_perm (buf0
[3], 0, 0x3322);
3297 tib40
[0] = ((buf0
[0] & 0x000000FF) << 0) | ((buf0
[0] & 0x0000FF00) << 8);
3298 tib40
[1] = ((buf0
[0] & 0x00FF0000) >> 16) | ((buf0
[0] & 0xFF000000) >> 8);
3299 tib40
[2] = ((buf0
[1] & 0x000000FF) << 0) | ((buf0
[1] & 0x0000FF00) << 8);
3300 tib40
[3] = ((buf0
[1] & 0x00FF0000) >> 16) | ((buf0
[1] & 0xFF000000) >> 8);
3301 tib41
[0] = ((buf0
[2] & 0x000000FF) << 0) | ((buf0
[2] & 0x0000FF00) << 8);
3302 tib41
[1] = ((buf0
[2] & 0x00FF0000) >> 16) | ((buf0
[2] & 0xFF000000) >> 8);
3303 tib41
[2] = ((buf0
[3] & 0x000000FF) << 0) | ((buf0
[3] & 0x0000FF00) << 8);
3304 tib41
[3] = ((buf0
[3] & 0x00FF0000) >> 16) | ((buf0
[3] & 0xFF000000) >> 8);
3306 buf0
[0] = tib40
[0] | (tib40
[0] << 8);
3307 buf0
[1] = tib40
[1] | (tib40
[1] << 8);
3308 buf0
[2] = tib40
[2] | (tib40
[2] << 8);
3309 buf0
[3] = tib40
[3] | (tib40
[3] << 8);
3310 buf1
[0] = tib41
[0] | (tib41
[0] << 8);
3311 buf1
[1] = tib41
[1] | (tib41
[1] << 8);
3312 buf1
[2] = tib41
[2] | (tib41
[2] << 8);
3313 buf1
[3] = tib41
[3] | (tib41
[3] << 8);
3317 out_len
= out_len
+ out_len
;
3322 __device__
static u32
rule_op_mangle_switch_first (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
3324 if (in_len
< 2) return (in_len
);
3326 #if __CUDA_ARCH__ >= 200
3328 buf0
[0] = __byte_perm (buf0
[0], 0, 0x3201);
3332 buf0
[0] = (buf0
[0] & 0xFFFF0000) | ((buf0
[0] << 8) & 0x0000FF00) | ((buf0
[0] >> 8) & 0x000000FF);
3339 __device__
static u32
rule_op_mangle_switch_last (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
3341 if (in_len
< 2) return (in_len
);
3343 #if __CUDA_ARCH__ >= 200
3347 case 2: buf0
[0] = __byte_perm (buf0
[0], 0, 0x5401);
3349 case 3: buf0
[0] = __byte_perm (buf0
[0], 0, 0x4120);
3351 case 4: buf0
[0] = __byte_perm (buf0
[0], 0, 0x2310);
3353 case 5: buf0
[1] = __byte_perm (buf0
[1], buf0
[0], 0x7210);
3354 buf0
[0] = __byte_perm (buf0
[0], buf0
[1], 0x4210);
3355 buf0
[1] = __byte_perm (buf0
[1], 0, 0x6543);
3357 case 6: buf0
[1] = __byte_perm (buf0
[1], 0, 0x5401);
3359 case 7: buf0
[1] = __byte_perm (buf0
[1], 0, 0x4120);
3361 case 8: buf0
[1] = __byte_perm (buf0
[1], 0, 0x2310);
3363 case 9: buf0
[2] = __byte_perm (buf0
[2], buf0
[1], 0x7210);
3364 buf0
[1] = __byte_perm (buf0
[1], buf0
[2], 0x4210);
3365 buf0
[2] = __byte_perm (buf0
[2], 0, 0x6543);
3367 case 10: buf0
[2] = __byte_perm (buf0
[2], 0, 0x5401);
3369 case 11: buf0
[2] = __byte_perm (buf0
[2], 0, 0x4120);
3371 case 12: buf0
[2] = __byte_perm (buf0
[2], 0, 0x2310);
3373 case 13: buf0
[3] = __byte_perm (buf0
[3], buf0
[2], 0x7210);
3374 buf0
[2] = __byte_perm (buf0
[2], buf0
[3], 0x4210);
3375 buf0
[3] = __byte_perm (buf0
[3], 0, 0x6543);
3377 case 14: buf0
[3] = __byte_perm (buf0
[3], 0, 0x5401);
3379 case 15: buf0
[3] = __byte_perm (buf0
[3], 0, 0x4120);
3381 case 16: buf0
[3] = __byte_perm (buf0
[3], 0, 0x2310);
3383 case 17: buf1
[0] = __byte_perm (buf1
[0], buf0
[3], 0x7210);
3384 buf0
[3] = __byte_perm (buf0
[3], buf1
[0], 0x4210);
3385 buf1
[0] = __byte_perm (buf1
[0], 0, 0x6543);
3387 case 18: buf1
[0] = __byte_perm (buf1
[0], 0, 0x5401);
3389 case 19: buf1
[0] = __byte_perm (buf1
[0], 0, 0x4120);
3391 case 20: buf1
[0] = __byte_perm (buf1
[0], 0, 0x2310);
3393 case 21: buf1
[1] = __byte_perm (buf1
[1], buf1
[0], 0x7210);
3394 buf1
[0] = __byte_perm (buf1
[0], buf1
[1], 0x4210);
3395 buf1
[1] = __byte_perm (buf1
[1], 0, 0x6543);
3397 case 22: buf1
[1] = __byte_perm (buf1
[1], 0, 0x5401);
3399 case 23: buf1
[1] = __byte_perm (buf1
[1], 0, 0x4120);
3401 case 24: buf1
[1] = __byte_perm (buf1
[1], 0, 0x2310);
3403 case 25: buf1
[2] = __byte_perm (buf1
[2], buf1
[1], 0x7210);
3404 buf1
[1] = __byte_perm (buf1
[1], buf1
[2], 0x4210);
3405 buf1
[2] = __byte_perm (buf1
[2], 0, 0x6543);
3407 case 26: buf1
[2] = __byte_perm (buf1
[2], 0, 0x5401);
3409 case 27: buf1
[2] = __byte_perm (buf1
[2], 0, 0x4120);
3411 case 28: buf1
[2] = __byte_perm (buf1
[2], 0, 0x2310);
3413 case 29: buf1
[3] = __byte_perm (buf1
[3], buf1
[2], 0x7210);
3414 buf1
[2] = __byte_perm (buf1
[2], buf1
[3], 0x4210);
3415 buf1
[3] = __byte_perm (buf1
[3], 0, 0x6543);
3417 case 30: buf1
[3] = __byte_perm (buf1
[3], 0, 0x5401);
3419 case 31: buf1
[3] = __byte_perm (buf1
[3], 0, 0x4120);
3427 case 2: buf0
[0] = ((buf0
[0] << 8) & 0x0000FF00) | ((buf0
[0] >> 8) & 0x000000FF);
3429 case 3: buf0
[0] = (buf0
[0] & 0x000000FF) | ((buf0
[0] << 8) & 0x00FF0000) | ((buf0
[0] >> 8) & 0x0000FF00);
3431 case 4: buf0
[0] = (buf0
[0] & 0x0000FFFF) | ((buf0
[0] << 8) & 0xFF000000) | ((buf0
[0] >> 8) & 0x00FF0000);
3433 case 5: buf0
[1] = (buf0
[0] & 0xFF000000) | buf0
[1];
3434 buf0
[0] = (buf0
[0] & 0x00FFFFFF) | (buf0
[1] << 24);
3435 buf0
[1] = (buf0
[1] >> 24);
3437 case 6: buf0
[1] = ((buf0
[1] << 8) & 0x0000FF00) | ((buf0
[1] >> 8) & 0x000000FF);
3439 case 7: buf0
[1] = (buf0
[1] & 0x000000FF) | ((buf0
[1] << 8) & 0x00FF0000) | ((buf0
[1] >> 8) & 0x0000FF00);
3441 case 8: buf0
[1] = (buf0
[1] & 0x0000FFFF) | ((buf0
[1] << 8) & 0xFF000000) | ((buf0
[1] >> 8) & 0x00FF0000);
3443 case 9: buf0
[2] = (buf0
[1] & 0xFF000000) | buf0
[2];
3444 buf0
[1] = (buf0
[1] & 0x00FFFFFF) | (buf0
[2] << 24);
3445 buf0
[2] = (buf0
[2] >> 24);
3447 case 10: buf0
[2] = ((buf0
[2] << 8) & 0x0000FF00) | ((buf0
[2] >> 8) & 0x000000FF);
3449 case 11: buf0
[2] = (buf0
[2] & 0x000000FF) | ((buf0
[2] << 8) & 0x00FF0000) | ((buf0
[2] >> 8) & 0x0000FF00);
3451 case 12: buf0
[2] = (buf0
[2] & 0x0000FFFF) | ((buf0
[2] << 8) & 0xFF000000) | ((buf0
[2] >> 8) & 0x00FF0000);
3453 case 13: buf0
[3] = (buf0
[2] & 0xFF000000) | buf0
[3];
3454 buf0
[2] = (buf0
[2] & 0x00FFFFFF) | (buf0
[3] << 24);
3455 buf0
[3] = (buf0
[3] >> 24);
3457 case 14: buf0
[3] = ((buf0
[3] << 8) & 0x0000FF00) | ((buf0
[3] >> 8) & 0x000000FF);
3459 case 15: buf0
[3] = (buf0
[3] & 0x000000FF) | ((buf0
[3] << 8) & 0x00FF0000) | ((buf0
[3] >> 8) & 0x0000FF00);
3461 case 16: buf0
[3] = (buf0
[3] & 0x0000FFFF) | ((buf0
[3] << 8) & 0xFF000000) | ((buf0
[3] >> 8) & 0x00FF0000);
3463 case 17: buf1
[0] = (buf0
[3] & 0xFF000000) | buf1
[0];
3464 buf0
[3] = (buf0
[3] & 0x00FFFFFF) | (buf1
[0] << 24);
3465 buf1
[0] = (buf1
[0] >> 24);
3467 case 18: buf1
[0] = ((buf1
[0] << 8) & 0x0000FF00) | ((buf1
[0] >> 8) & 0x000000FF);
3469 case 19: buf1
[0] = (buf1
[0] & 0x000000FF) | ((buf1
[0] << 8) & 0x00FF0000) | ((buf1
[0] >> 8) & 0x0000FF00);
3471 case 20: buf1
[0] = (buf1
[0] & 0x0000FFFF) | ((buf1
[0] << 8) & 0xFF000000) | ((buf1
[0] >> 8) & 0x00FF0000);
3473 case 21: buf1
[1] = (buf1
[0] & 0xFF000000) | buf1
[1];
3474 buf1
[0] = (buf1
[0] & 0x00FFFFFF) | (buf1
[1] << 24);
3475 buf1
[1] = (buf1
[1] >> 24);
3477 case 22: buf1
[1] = ((buf1
[1] << 8) & 0x0000FF00) | ((buf1
[1] >> 8) & 0x000000FF);
3479 case 23: buf1
[1] = (buf1
[1] & 0x000000FF) | ((buf1
[1] << 8) & 0x00FF0000) | ((buf1
[1] >> 8) & 0x0000FF00);
3481 case 24: buf1
[1] = (buf1
[1] & 0x0000FFFF) | ((buf1
[1] << 8) & 0xFF000000) | ((buf1
[1] >> 8) & 0x00FF0000);
3483 case 25: buf1
[2] = (buf1
[1] & 0xFF000000) | buf1
[2];
3484 buf1
[1] = (buf1
[1] & 0x00FFFFFF) | (buf1
[2] << 24);
3485 buf1
[2] = (buf1
[2] >> 24);
3487 case 26: buf1
[2] = ((buf1
[2] << 8) & 0x0000FF00) | ((buf1
[2] >> 8) & 0x000000FF);
3489 case 27: buf1
[2] = (buf1
[2] & 0x000000FF) | ((buf1
[2] << 8) & 0x00FF0000) | ((buf1
[2] >> 8) & 0x0000FF00);
3491 case 28: buf1
[2] = (buf1
[2] & 0x0000FFFF) | ((buf1
[2] << 8) & 0xFF000000) | ((buf1
[2] >> 8) & 0x00FF0000);
3493 case 29: buf1
[3] = (buf1
[2] & 0xFF000000) | buf1
[3];
3494 buf1
[2] = (buf1
[2] & 0x00FFFFFF) | (buf1
[3] << 24);
3495 buf1
[3] = (buf1
[3] >> 24);
3497 case 30: buf1
[3] = ((buf1
[3] << 8) & 0x0000FF00) | ((buf1
[3] >> 8) & 0x000000FF);
3499 case 31: buf1
[3] = (buf1
[3] & 0x000000FF) | ((buf1
[3] << 8) & 0x00FF0000) | ((buf1
[3] >> 8) & 0x0000FF00);
3508 __device__
static u32
rule_op_mangle_switch_at (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
3510 if (p0
>= in_len
) return (in_len
);
3511 if (p1
>= in_len
) return (in_len
);
3516 #if __CUDA_ARCH__ >= 200
3520 case 0: tmp0
= __byte_perm (buf0
[0], 0, 0x6540);
3522 case 1: tmp0
= __byte_perm (buf0
[0], 0, 0x6541);
3524 case 2: tmp0
= __byte_perm (buf0
[0], 0, 0x6542);
3526 case 3: tmp0
= __byte_perm (buf0
[0], 0, 0x6543);
3528 case 4: tmp0
= __byte_perm (buf0
[1], 0, 0x6540);
3530 case 5: tmp0
= __byte_perm (buf0
[1], 0, 0x6541);
3532 case 6: tmp0
= __byte_perm (buf0
[1], 0, 0x6542);
3534 case 7: tmp0
= __byte_perm (buf0
[1], 0, 0x6543);
3536 case 8: tmp0
= __byte_perm (buf0
[2], 0, 0x6540);
3538 case 9: tmp0
= __byte_perm (buf0
[2], 0, 0x6541);
3540 case 10: tmp0
= __byte_perm (buf0
[2], 0, 0x6542);
3542 case 11: tmp0
= __byte_perm (buf0
[2], 0, 0x6543);
3544 case 12: tmp0
= __byte_perm (buf0
[3], 0, 0x6540);
3546 case 13: tmp0
= __byte_perm (buf0
[3], 0, 0x6541);
3548 case 14: tmp0
= __byte_perm (buf0
[3], 0, 0x6542);
3550 case 15: tmp0
= __byte_perm (buf0
[3], 0, 0x6543);
3552 case 16: tmp0
= __byte_perm (buf1
[0], 0, 0x6540);
3554 case 17: tmp0
= __byte_perm (buf1
[0], 0, 0x6541);
3556 case 18: tmp0
= __byte_perm (buf1
[0], 0, 0x6542);
3558 case 19: tmp0
= __byte_perm (buf1
[0], 0, 0x6543);
3560 case 20: tmp0
= __byte_perm (buf1
[1], 0, 0x6540);
3562 case 21: tmp0
= __byte_perm (buf1
[1], 0, 0x6541);
3564 case 22: tmp0
= __byte_perm (buf1
[1], 0, 0x6542);
3566 case 23: tmp0
= __byte_perm (buf1
[1], 0, 0x6543);
3568 case 24: tmp0
= __byte_perm (buf1
[2], 0, 0x6540);
3570 case 25: tmp0
= __byte_perm (buf1
[2], 0, 0x6541);
3572 case 26: tmp0
= __byte_perm (buf1
[2], 0, 0x6542);
3574 case 27: tmp0
= __byte_perm (buf1
[2], 0, 0x6543);
3576 case 28: tmp0
= __byte_perm (buf1
[3], 0, 0x6540);
3578 case 29: tmp0
= __byte_perm (buf1
[3], 0, 0x6541);
3580 case 30: tmp0
= __byte_perm (buf1
[3], 0, 0x6542);
3582 case 31: tmp0
= __byte_perm (buf1
[3], 0, 0x6543);
3588 case 0: tmp1
= __byte_perm (buf0
[0], 0, 0x6540);
3589 buf0
[0] = __byte_perm (tmp0
, buf0
[0], 0x7650);
3591 case 1: tmp1
= __byte_perm (buf0
[0], 0, 0x6541);
3592 buf0
[0] = __byte_perm (tmp0
, buf0
[0], 0x7604);
3594 case 2: tmp1
= __byte_perm (buf0
[0], 0, 0x6542);
3595 buf0
[0] = __byte_perm (tmp0
, buf0
[0], 0x7054);
3597 case 3: tmp1
= __byte_perm (buf0
[0], 0, 0x6543);
3598 buf0
[0] = __byte_perm (tmp0
, buf0
[0], 0x0654);
3600 case 4: tmp1
= __byte_perm (buf0
[1], 0, 0x6540);
3601 buf0
[1] = __byte_perm (tmp0
, buf0
[1], 0x7650);
3603 case 5: tmp1
= __byte_perm (buf0
[1], 0, 0x6541);
3604 buf0
[1] = __byte_perm (tmp0
, buf0
[1], 0x7604);
3606 case 6: tmp1
= __byte_perm (buf0
[1], 0, 0x6542);
3607 buf0
[1] = __byte_perm (tmp0
, buf0
[1], 0x7054);
3609 case 7: tmp1
= __byte_perm (buf0
[1], 0, 0x6543);
3610 buf0
[1] = __byte_perm (tmp0
, buf0
[1], 0x0654);
3612 case 8: tmp1
= __byte_perm (buf0
[2], 0, 0x6540);
3613 buf0
[2] = __byte_perm (tmp0
, buf0
[2], 0x7650);
3615 case 9: tmp1
= __byte_perm (buf0
[2], 0, 0x6541);
3616 buf0
[2] = __byte_perm (tmp0
, buf0
[2], 0x7604);
3618 case 10: tmp1
= __byte_perm (buf0
[2], 0, 0x6542);
3619 buf0
[2] = __byte_perm (tmp0
, buf0
[2], 0x7054);
3621 case 11: tmp1
= __byte_perm (buf0
[2], 0, 0x6543);
3622 buf0
[2] = __byte_perm (tmp0
, buf0
[2], 0x0654);
3624 case 12: tmp1
= __byte_perm (buf0
[3], 0, 0x6540);
3625 buf0
[3] = __byte_perm (tmp0
, buf0
[3], 0x7650);
3627 case 13: tmp1
= __byte_perm (buf0
[3], 0, 0x6541);
3628 buf0
[3] = __byte_perm (tmp0
, buf0
[3], 0x7604);
3630 case 14: tmp1
= __byte_perm (buf0
[3], 0, 0x6542);
3631 buf0
[3] = __byte_perm (tmp0
, buf0
[3], 0x7054);
3633 case 15: tmp1
= __byte_perm (buf0
[3], 0, 0x6543);
3634 buf0
[3] = __byte_perm (tmp0
, buf0
[3], 0x0654);
3636 case 16: tmp1
= __byte_perm (buf1
[0], 0, 0x6540);
3637 buf1
[0] = __byte_perm (tmp0
, buf1
[0], 0x7650);
3639 case 17: tmp1
= __byte_perm (buf1
[0], 0, 0x6541);
3640 buf1
[0] = __byte_perm (tmp0
, buf1
[0], 0x7604);
3642 case 18: tmp1
= __byte_perm (buf1
[0], 0, 0x6542);
3643 buf1
[0] = __byte_perm (tmp0
, buf1
[0], 0x7054);
3645 case 19: tmp1
= __byte_perm (buf1
[0], 0, 0x6543);
3646 buf1
[0] = __byte_perm (tmp0
, buf1
[0], 0x0654);
3648 case 20: tmp1
= __byte_perm (buf1
[1], 0, 0x6540);
3649 buf1
[1] = __byte_perm (tmp0
, buf1
[1], 0x7650);
3651 case 21: tmp1
= __byte_perm (buf1
[1], 0, 0x6541);
3652 buf1
[1] = __byte_perm (tmp0
, buf1
[1], 0x7604);
3654 case 22: tmp1
= __byte_perm (buf1
[1], 0, 0x6542);
3655 buf1
[1] = __byte_perm (tmp0
, buf1
[1], 0x7054);
3657 case 23: tmp1
= __byte_perm (buf1
[1], 0, 0x6543);
3658 buf1
[1] = __byte_perm (tmp0
, buf1
[1], 0x0654);
3660 case 24: tmp1
= __byte_perm (buf1
[2], 0, 0x6540);
3661 buf1
[2] = __byte_perm (tmp0
, buf1
[2], 0x7650);
3663 case 25: tmp1
= __byte_perm (buf1
[2], 0, 0x6541);
3664 buf1
[2] = __byte_perm (tmp0
, buf1
[2], 0x7604);
3666 case 26: tmp1
= __byte_perm (buf1
[2], 0, 0x6542);
3667 buf1
[2] = __byte_perm (tmp0
, buf1
[2], 0x7054);
3669 case 27: tmp1
= __byte_perm (buf1
[2], 0, 0x6543);
3670 buf1
[2] = __byte_perm (tmp0
, buf1
[2], 0x0654);
3672 case 28: tmp1
= __byte_perm (buf1
[3], 0, 0x6540);
3673 buf1
[3] = __byte_perm (tmp0
, buf1
[3], 0x7650);
3675 case 29: tmp1
= __byte_perm (buf1
[3], 0, 0x6541);
3676 buf1
[3] = __byte_perm (tmp0
, buf1
[3], 0x7604);
3678 case 30: tmp1
= __byte_perm (buf1
[3], 0, 0x6542);
3679 buf1
[3] = __byte_perm (tmp0
, buf1
[3], 0x7054);
3681 case 31: tmp1
= __byte_perm (buf1
[3], 0, 0x6543);
3682 buf1
[3] = __byte_perm (tmp0
, buf1
[3], 0x0654);
3688 case 0: buf0
[0] = __byte_perm (tmp1
, buf0
[0], 0x7650);
3690 case 1: buf0
[0] = __byte_perm (tmp1
, buf0
[0], 0x7604);
3692 case 2: buf0
[0] = __byte_perm (tmp1
, buf0
[0], 0x7054);
3694 case 3: buf0
[0] = __byte_perm (tmp1
, buf0
[0], 0x0654);
3696 case 4: buf0
[1] = __byte_perm (tmp1
, buf0
[1], 0x7650);
3698 case 5: buf0
[1] = __byte_perm (tmp1
, buf0
[1], 0x7604);
3700 case 6: buf0
[1] = __byte_perm (tmp1
, buf0
[1], 0x7054);
3702 case 7: buf0
[1] = __byte_perm (tmp1
, buf0
[1], 0x0654);
3704 case 8: buf0
[2] = __byte_perm (tmp1
, buf0
[2], 0x7650);
3706 case 9: buf0
[2] = __byte_perm (tmp1
, buf0
[2], 0x7604);
3708 case 10: buf0
[2] = __byte_perm (tmp1
, buf0
[2], 0x7054);
3710 case 11: buf0
[2] = __byte_perm (tmp1
, buf0
[2], 0x0654);
3712 case 12: buf0
[3] = __byte_perm (tmp1
, buf0
[3], 0x7650);
3714 case 13: buf0
[3] = __byte_perm (tmp1
, buf0
[3], 0x7604);
3716 case 14: buf0
[3] = __byte_perm (tmp1
, buf0
[3], 0x7054);
3718 case 15: buf0
[3] = __byte_perm (tmp1
, buf0
[3], 0x0654);
3720 case 16: buf1
[0] = __byte_perm (tmp1
, buf1
[0], 0x7650);
3722 case 17: buf1
[0] = __byte_perm (tmp1
, buf1
[0], 0x7604);
3724 case 18: buf1
[0] = __byte_perm (tmp1
, buf1
[0], 0x7054);
3726 case 19: buf1
[0] = __byte_perm (tmp1
, buf1
[0], 0x0654);
3728 case 20: buf1
[1] = __byte_perm (tmp1
, buf1
[1], 0x7650);
3730 case 21: buf1
[1] = __byte_perm (tmp1
, buf1
[1], 0x7604);
3732 case 22: buf1
[1] = __byte_perm (tmp1
, buf1
[1], 0x7054);
3734 case 23: buf1
[1] = __byte_perm (tmp1
, buf1
[1], 0x0654);
3736 case 24: buf1
[2] = __byte_perm (tmp1
, buf1
[2], 0x7650);
3738 case 25: buf1
[2] = __byte_perm (tmp1
, buf1
[2], 0x7604);
3740 case 26: buf1
[2] = __byte_perm (tmp1
, buf1
[2], 0x7054);
3742 case 27: buf1
[2] = __byte_perm (tmp1
, buf1
[2], 0x0654);
3744 case 28: buf1
[3] = __byte_perm (tmp1
, buf1
[3], 0x7650);
3746 case 29: buf1
[3] = __byte_perm (tmp1
, buf1
[3], 0x7604);
3748 case 30: buf1
[3] = __byte_perm (tmp1
, buf1
[3], 0x7054);
3750 case 31: buf1
[3] = __byte_perm (tmp1
, buf1
[3], 0x0654);
3758 case 0: tmp0
= (buf0
[0] >> 0) & 0xFF;
3760 case 1: tmp0
= (buf0
[0] >> 8) & 0xFF;
3762 case 2: tmp0
= (buf0
[0] >> 16) & 0xFF;
3764 case 3: tmp0
= (buf0
[0] >> 24) & 0xFF;
3766 case 4: tmp0
= (buf0
[1] >> 0) & 0xFF;
3768 case 5: tmp0
= (buf0
[1] >> 8) & 0xFF;
3770 case 6: tmp0
= (buf0
[1] >> 16) & 0xFF;
3772 case 7: tmp0
= (buf0
[1] >> 24) & 0xFF;
3774 case 8: tmp0
= (buf0
[2] >> 0) & 0xFF;
3776 case 9: tmp0
= (buf0
[2] >> 8) & 0xFF;
3778 case 10: tmp0
= (buf0
[2] >> 16) & 0xFF;
3780 case 11: tmp0
= (buf0
[2] >> 24) & 0xFF;
3782 case 12: tmp0
= (buf0
[3] >> 0) & 0xFF;
3784 case 13: tmp0
= (buf0
[3] >> 8) & 0xFF;
3786 case 14: tmp0
= (buf0
[3] >> 16) & 0xFF;
3788 case 15: tmp0
= (buf0
[3] >> 24) & 0xFF;
3790 case 16: tmp0
= (buf1
[0] >> 0) & 0xFF;
3792 case 17: tmp0
= (buf1
[0] >> 8) & 0xFF;
3794 case 18: tmp0
= (buf1
[0] >> 16) & 0xFF;
3796 case 19: tmp0
= (buf1
[0] >> 24) & 0xFF;
3798 case 20: tmp0
= (buf1
[1] >> 0) & 0xFF;
3800 case 21: tmp0
= (buf1
[1] >> 8) & 0xFF;
3802 case 22: tmp0
= (buf1
[1] >> 16) & 0xFF;
3804 case 23: tmp0
= (buf1
[1] >> 24) & 0xFF;
3806 case 24: tmp0
= (buf1
[2] >> 0) & 0xFF;
3808 case 25: tmp0
= (buf1
[2] >> 8) & 0xFF;
3810 case 26: tmp0
= (buf1
[2] >> 16) & 0xFF;
3812 case 27: tmp0
= (buf1
[2] >> 24) & 0xFF;
3814 case 28: tmp0
= (buf1
[3] >> 0) & 0xFF;
3816 case 29: tmp0
= (buf1
[3] >> 8) & 0xFF;
3818 case 30: tmp0
= (buf1
[3] >> 16) & 0xFF;
3820 case 31: tmp0
= (buf1
[3] >> 24) & 0xFF;
3826 case 0: tmp1
= (buf0
[0] >> 0) & 0xff;
3827 buf0
[0] = (buf0
[0] & 0xffffff00) | tmp0
<< 0;
3829 case 1: tmp1
= (buf0
[0] >> 8) & 0xff;
3830 buf0
[0] = (buf0
[0] & 0xffff00ff) | tmp0
<< 8;
3832 case 2: tmp1
= (buf0
[0] >> 16) & 0xff;
3833 buf0
[0] = (buf0
[0] & 0xff00ffff) | tmp0
<< 16;
3835 case 3: tmp1
= (buf0
[0] >> 24) & 0xff;
3836 buf0
[0] = (buf0
[0] & 0x00ffffff) | tmp0
<< 24;
3838 case 4: tmp1
= (buf0
[1] >> 0) & 0xff;
3839 buf0
[1] = (buf0
[1] & 0xffffff00) | tmp0
<< 0;
3841 case 5: tmp1
= (buf0
[1] >> 8) & 0xff;
3842 buf0
[1] = (buf0
[1] & 0xffff00ff) | tmp0
<< 8;
3844 case 6: tmp1
= (buf0
[1] >> 16) & 0xff;
3845 buf0
[1] = (buf0
[1] & 0xff00ffff) | tmp0
<< 16;
3847 case 7: tmp1
= (buf0
[1] >> 24) & 0xff;
3848 buf0
[1] = (buf0
[1] & 0x00ffffff) | tmp0
<< 24;
3850 case 8: tmp1
= (buf0
[2] >> 0) & 0xff;
3851 buf0
[2] = (buf0
[2] & 0xffffff00) | tmp0
<< 0;
3853 case 9: tmp1
= (buf0
[2] >> 8) & 0xff;
3854 buf0
[2] = (buf0
[2] & 0xffff00ff) | tmp0
<< 8;
3856 case 10: tmp1
= (buf0
[2] >> 16) & 0xff;
3857 buf0
[2] = (buf0
[2] & 0xff00ffff) | tmp0
<< 16;
3859 case 11: tmp1
= (buf0
[2] >> 24) & 0xff;
3860 buf0
[2] = (buf0
[2] & 0x00ffffff) | tmp0
<< 24;
3862 case 12: tmp1
= (buf0
[3] >> 0) & 0xff;
3863 buf0
[3] = (buf0
[3] & 0xffffff00) | tmp0
<< 0;
3865 case 13: tmp1
= (buf0
[3] >> 8) & 0xff;
3866 buf0
[3] = (buf0
[3] & 0xffff00ff) | tmp0
<< 8;
3868 case 14: tmp1
= (buf0
[3] >> 16) & 0xff;
3869 buf0
[3] = (buf0
[3] & 0xff00ffff) | tmp0
<< 16;
3871 case 15: tmp1
= (buf0
[3] >> 24) & 0xff;
3872 buf0
[3] = (buf0
[3] & 0x00ffffff) | tmp0
<< 24;
3874 case 16: tmp1
= (buf1
[0] >> 0) & 0xff;
3875 buf1
[0] = (buf1
[0] & 0xffffff00) | tmp0
<< 0;
3877 case 17: tmp1
= (buf1
[0] >> 8) & 0xff;
3878 buf1
[0] = (buf1
[0] & 0xffff00ff) | tmp0
<< 8;
3880 case 18: tmp1
= (buf1
[0] >> 16) & 0xff;
3881 buf1
[0] = (buf1
[0] & 0xff00ffff) | tmp0
<< 16;
3883 case 19: tmp1
= (buf1
[0] >> 24) & 0xff;
3884 buf1
[0] = (buf1
[0] & 0x00ffffff) | tmp0
<< 24;
3886 case 20: tmp1
= (buf1
[1] >> 0) & 0xff;
3887 buf1
[1] = (buf1
[1] & 0xffffff00) | tmp0
<< 0;
3889 case 21: tmp1
= (buf1
[1] >> 8) & 0xff;
3890 buf1
[1] = (buf1
[1] & 0xffff00ff) | tmp0
<< 8;
3892 case 22: tmp1
= (buf1
[1] >> 16) & 0xff;
3893 buf1
[1] = (buf1
[1] & 0xff00ffff) | tmp0
<< 16;
3895 case 23: tmp1
= (buf1
[1] >> 24) & 0xff;
3896 buf1
[1] = (buf1
[1] & 0x00ffffff) | tmp0
<< 24;
3898 case 24: tmp1
= (buf1
[2] >> 0) & 0xff;
3899 buf1
[2] = (buf1
[2] & 0xffffff00) | tmp0
<< 0;
3901 case 25: tmp1
= (buf1
[2] >> 8) & 0xff;
3902 buf1
[2] = (buf1
[2] & 0xffff00ff) | tmp0
<< 8;
3904 case 26: tmp1
= (buf1
[2] >> 16) & 0xff;
3905 buf1
[2] = (buf1
[2] & 0xff00ffff) | tmp0
<< 16;
3907 case 27: tmp1
= (buf1
[2] >> 24) & 0xff;
3908 buf1
[2] = (buf1
[2] & 0x00ffffff) | tmp0
<< 24;
3910 case 28: tmp1
= (buf1
[3] >> 0) & 0xff;
3911 buf1
[3] = (buf1
[3] & 0xffffff00) | tmp0
<< 0;
3913 case 29: tmp1
= (buf1
[3] >> 8) & 0xff;
3914 buf1
[3] = (buf1
[3] & 0xffff00ff) | tmp0
<< 8;
3916 case 30: tmp1
= (buf1
[3] >> 16) & 0xff;
3917 buf1
[3] = (buf1
[3] & 0xff00ffff) | tmp0
<< 16;
3919 case 31: tmp1
= (buf1
[3] >> 24) & 0xff;
3920 buf1
[3] = (buf1
[3] & 0x00ffffff) | tmp0
<< 24;
3926 case 0: buf0
[0] = (buf0
[0] & 0xffffff00) | tmp1
<< 0;
3928 case 1: buf0
[0] = (buf0
[0] & 0xffff00ff) | tmp1
<< 8;
3930 case 2: buf0
[0] = (buf0
[0] & 0xff00ffff) | tmp1
<< 16;
3932 case 3: buf0
[0] = (buf0
[0] & 0x00ffffff) | tmp1
<< 24;
3934 case 4: buf0
[1] = (buf0
[1] & 0xffffff00) | tmp1
<< 0;
3936 case 5: buf0
[1] = (buf0
[1] & 0xffff00ff) | tmp1
<< 8;
3938 case 6: buf0
[1] = (buf0
[1] & 0xff00ffff) | tmp1
<< 16;
3940 case 7: buf0
[1] = (buf0
[1] & 0x00ffffff) | tmp1
<< 24;
3942 case 8: buf0
[2] = (buf0
[2] & 0xffffff00) | tmp1
<< 0;
3944 case 9: buf0
[2] = (buf0
[2] & 0xffff00ff) | tmp1
<< 8;
3946 case 10: buf0
[2] = (buf0
[2] & 0xff00ffff) | tmp1
<< 16;
3948 case 11: buf0
[2] = (buf0
[2] & 0x00ffffff) | tmp1
<< 24;
3950 case 12: buf0
[3] = (buf0
[3] & 0xffffff00) | tmp1
<< 0;
3952 case 13: buf0
[3] = (buf0
[3] & 0xffff00ff) | tmp1
<< 8;
3954 case 14: buf0
[3] = (buf0
[3] & 0xff00ffff) | tmp1
<< 16;
3956 case 15: buf0
[3] = (buf0
[3] & 0x00ffffff) | tmp1
<< 24;
3958 case 16: buf1
[0] = (buf1
[0] & 0xffffff00) | tmp1
<< 0;
3960 case 17: buf1
[0] = (buf1
[0] & 0xffff00ff) | tmp1
<< 8;
3962 case 18: buf1
[0] = (buf1
[0] & 0xff00ffff) | tmp1
<< 16;
3964 case 19: buf1
[0] = (buf1
[0] & 0x00ffffff) | tmp1
<< 24;
3966 case 20: buf1
[1] = (buf1
[1] & 0xffffff00) | tmp1
<< 0;
3968 case 21: buf1
[1] = (buf1
[1] & 0xffff00ff) | tmp1
<< 8;
3970 case 22: buf1
[1] = (buf1
[1] & 0xff00ffff) | tmp1
<< 16;
3972 case 23: buf1
[1] = (buf1
[1] & 0x00ffffff) | tmp1
<< 24;
3974 case 24: buf1
[2] = (buf1
[2] & 0xffffff00) | tmp1
<< 0;
3976 case 25: buf1
[2] = (buf1
[2] & 0xffff00ff) | tmp1
<< 8;
3978 case 26: buf1
[2] = (buf1
[2] & 0xff00ffff) | tmp1
<< 16;
3980 case 27: buf1
[2] = (buf1
[2] & 0x00ffffff) | tmp1
<< 24;
3982 case 28: buf1
[3] = (buf1
[3] & 0xffffff00) | tmp1
<< 0;
3984 case 29: buf1
[3] = (buf1
[3] & 0xffff00ff) | tmp1
<< 8;
3986 case 30: buf1
[3] = (buf1
[3] & 0xff00ffff) | tmp1
<< 16;
3988 case 31: buf1
[3] = (buf1
[3] & 0x00ffffff) | tmp1
<< 24;
3997 __device__
static u32
rule_op_mangle_chr_shiftl (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
3999 if (p0
>= in_len
) return (in_len
);
4001 const u32 mr
= 0xff << ((p0
& 3) * 8);
4006 case 0: buf0
[0] = (buf0
[0] & ml
) | (((buf0
[0] & mr
) << 1) & mr
); break;
4007 case 1: buf0
[1] = (buf0
[1] & ml
) | (((buf0
[1] & mr
) << 1) & mr
); break;
4008 case 2: buf0
[2] = (buf0
[2] & ml
) | (((buf0
[2] & mr
) << 1) & mr
); break;
4009 case 3: buf0
[3] = (buf0
[3] & ml
) | (((buf0
[3] & mr
) << 1) & mr
); break;
4010 case 4: buf1
[0] = (buf1
[0] & ml
) | (((buf1
[0] & mr
) << 1) & mr
); break;
4011 case 5: buf1
[1] = (buf1
[1] & ml
) | (((buf1
[1] & mr
) << 1) & mr
); break;
4012 case 6: buf1
[2] = (buf1
[2] & ml
) | (((buf1
[2] & mr
) << 1) & mr
); break;
4013 case 7: buf1
[3] = (buf1
[3] & ml
) | (((buf1
[3] & mr
) << 1) & mr
); break;
4019 __device__
static u32
rule_op_mangle_chr_shiftr (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4021 if (p0
>= in_len
) return (in_len
);
4023 const u32 mr
= 0xff << ((p0
& 3) * 8);
4028 case 0: buf0
[0] = (buf0
[0] & ml
) | (((buf0
[0] & mr
) >> 1) & mr
); break;
4029 case 1: buf0
[1] = (buf0
[1] & ml
) | (((buf0
[1] & mr
) >> 1) & mr
); break;
4030 case 2: buf0
[2] = (buf0
[2] & ml
) | (((buf0
[2] & mr
) >> 1) & mr
); break;
4031 case 3: buf0
[3] = (buf0
[3] & ml
) | (((buf0
[3] & mr
) >> 1) & mr
); break;
4032 case 4: buf1
[0] = (buf1
[0] & ml
) | (((buf1
[0] & mr
) >> 1) & mr
); break;
4033 case 5: buf1
[1] = (buf1
[1] & ml
) | (((buf1
[1] & mr
) >> 1) & mr
); break;
4034 case 6: buf1
[2] = (buf1
[2] & ml
) | (((buf1
[2] & mr
) >> 1) & mr
); break;
4035 case 7: buf1
[3] = (buf1
[3] & ml
) | (((buf1
[3] & mr
) >> 1) & mr
); break;
4041 __device__
static u32
rule_op_mangle_chr_incr (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4043 if (p0
>= in_len
) return (in_len
);
4045 const u32 mr
= 0xff << ((p0
& 3) * 8);
4048 const u32 n
= 0x01010101 & mr
;
4052 case 0: buf0
[0] = (buf0
[0] & ml
) | (((buf0
[0] & mr
) + n
) & mr
); break;
4053 case 1: buf0
[1] = (buf0
[1] & ml
) | (((buf0
[1] & mr
) + n
) & mr
); break;
4054 case 2: buf0
[2] = (buf0
[2] & ml
) | (((buf0
[2] & mr
) + n
) & mr
); break;
4055 case 3: buf0
[3] = (buf0
[3] & ml
) | (((buf0
[3] & mr
) + n
) & mr
); break;
4056 case 4: buf1
[0] = (buf1
[0] & ml
) | (((buf1
[0] & mr
) + n
) & mr
); break;
4057 case 5: buf1
[1] = (buf1
[1] & ml
) | (((buf1
[1] & mr
) + n
) & mr
); break;
4058 case 6: buf1
[2] = (buf1
[2] & ml
) | (((buf1
[2] & mr
) + n
) & mr
); break;
4059 case 7: buf1
[3] = (buf1
[3] & ml
) | (((buf1
[3] & mr
) + n
) & mr
); break;
4065 __device__
static u32
rule_op_mangle_chr_decr (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4067 if (p0
>= in_len
) return (in_len
);
4069 const u32 mr
= 0xff << ((p0
& 3) * 8);
4072 const u32 n
= 0x01010101 & mr
;
4076 case 0: buf0
[0] = (buf0
[0] & ml
) | (((buf0
[0] & mr
) - n
) & mr
); break;
4077 case 1: buf0
[1] = (buf0
[1] & ml
) | (((buf0
[1] & mr
) - n
) & mr
); break;
4078 case 2: buf0
[2] = (buf0
[2] & ml
) | (((buf0
[2] & mr
) - n
) & mr
); break;
4079 case 3: buf0
[3] = (buf0
[3] & ml
) | (((buf0
[3] & mr
) - n
) & mr
); break;
4080 case 4: buf1
[0] = (buf1
[0] & ml
) | (((buf1
[0] & mr
) - n
) & mr
); break;
4081 case 5: buf1
[1] = (buf1
[1] & ml
) | (((buf1
[1] & mr
) - n
) & mr
); break;
4082 case 6: buf1
[2] = (buf1
[2] & ml
) | (((buf1
[2] & mr
) - n
) & mr
); break;
4083 case 7: buf1
[3] = (buf1
[3] & ml
) | (((buf1
[3] & mr
) - n
) & mr
); break;
4089 __device__
static u32
rule_op_mangle_replace_np1 (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4091 if ((p0
+ 1) >= in_len
) return (in_len
);
4096 lshift_block (buf0
, buf1
, tib40
, tib41
);
4098 const u32 mr
= 0xff << ((p0
& 3) * 8);
4103 case 0: buf0
[0] = (buf0
[0] & ml
) | (tib40
[0] & mr
); break;
4104 case 1: buf0
[1] = (buf0
[1] & ml
) | (tib40
[1] & mr
); break;
4105 case 2: buf0
[2] = (buf0
[2] & ml
) | (tib40
[2] & mr
); break;
4106 case 3: buf0
[3] = (buf0
[3] & ml
) | (tib40
[3] & mr
); break;
4107 case 4: buf1
[0] = (buf1
[0] & ml
) | (tib41
[0] & mr
); break;
4108 case 5: buf1
[1] = (buf1
[1] & ml
) | (tib41
[1] & mr
); break;
4109 case 6: buf1
[2] = (buf1
[2] & ml
) | (tib41
[2] & mr
); break;
4110 case 7: buf1
[3] = (buf1
[3] & ml
) | (tib41
[3] & mr
); break;
4116 __device__
static u32
rule_op_mangle_replace_nm1 (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4118 if (p0
== 0) return (in_len
);
4120 if (p0
>= in_len
) return (in_len
);
4125 rshift_block (buf0
, buf1
, tib40
, tib41
);
4127 const u32 mr
= 0xff << ((p0
& 3) * 8);
4132 case 0: buf0
[0] = (buf0
[0] & ml
) | (tib40
[0] & mr
); break;
4133 case 1: buf0
[1] = (buf0
[1] & ml
) | (tib40
[1] & mr
); break;
4134 case 2: buf0
[2] = (buf0
[2] & ml
) | (tib40
[2] & mr
); break;
4135 case 3: buf0
[3] = (buf0
[3] & ml
) | (tib40
[3] & mr
); break;
4136 case 4: buf1
[0] = (buf1
[0] & ml
) | (tib41
[0] & mr
); break;
4137 case 5: buf1
[1] = (buf1
[1] & ml
) | (tib41
[1] & mr
); break;
4138 case 6: buf1
[2] = (buf1
[2] & ml
) | (tib41
[2] & mr
); break;
4139 case 7: buf1
[3] = (buf1
[3] & ml
) | (tib41
[3] & mr
); break;
4145 __device__
static u32
rule_op_mangle_dupeblock_first (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4147 if (p0
> in_len
) return (in_len
);
4149 if ((in_len
+ p0
) >= 32) return (in_len
);
4151 u32 out_len
= in_len
;
4165 truncate_right (tib40
, tib41
, p0
);
4167 rshift_block_N (buf0
, buf1
, buf0
, buf1
, p0
);
4169 buf0
[0] |= tib40
[0];
4170 buf0
[1] |= tib40
[1];
4171 buf0
[2] |= tib40
[2];
4172 buf0
[3] |= tib40
[3];
4173 buf1
[0] |= tib41
[0];
4174 buf1
[1] |= tib41
[1];
4175 buf1
[2] |= tib41
[2];
4176 buf1
[3] |= tib41
[3];
4183 __device__
static u32
rule_op_mangle_dupeblock_last (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4185 if (p0
> in_len
) return (in_len
);
4187 if ((in_len
+ p0
) >= 32) return (in_len
);
4189 u32 out_len
= in_len
;
4194 rshift_block_N (buf0
, buf1
, tib40
, tib41
, p0
);
4196 truncate_left (tib40
, tib41
, out_len
);
4198 buf0
[0] |= tib40
[0];
4199 buf0
[1] |= tib40
[1];
4200 buf0
[2] |= tib40
[2];
4201 buf0
[3] |= tib40
[3];
4202 buf1
[0] |= tib41
[0];
4203 buf1
[1] |= tib41
[1];
4204 buf1
[2] |= tib41
[2];
4205 buf1
[3] |= tib41
[3];
4212 __device__
static u32
rule_op_mangle_title (const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4214 buf0
[0] |= (generate_cmask (buf0
[0]));
4215 buf0
[1] |= (generate_cmask (buf0
[1]));
4216 buf0
[2] |= (generate_cmask (buf0
[2]));
4217 buf0
[3] |= (generate_cmask (buf0
[3]));
4218 buf1
[0] |= (generate_cmask (buf1
[0]));
4219 buf1
[1] |= (generate_cmask (buf1
[1]));
4220 buf1
[2] |= (generate_cmask (buf1
[2]));
4221 buf1
[3] |= (generate_cmask (buf1
[3]));
4223 buf0
[0] &= ~(0x00000020 & generate_cmask (buf0
[0]));
4227 for (u32 i
= 0; i
< in_len
; i
++)
4234 case 0: tmp0
= __byte_perm (buf0
[0], 0, 0x6540);
4235 tmp1
= ~(0x00002000 & generate_cmask (buf0
[0])); break;
4236 case 1: tmp0
= __byte_perm (buf0
[0], 0, 0x6541);
4237 tmp1
= ~(0x00200000 & generate_cmask (buf0
[0])); break;
4238 case 2: tmp0
= __byte_perm (buf0
[0], 0, 0x6542);
4239 tmp1
= ~(0x20000000 & generate_cmask (buf0
[0])); break;
4240 case 3: tmp0
= __byte_perm (buf0
[0], 0, 0x6543);
4241 tmp1
= ~(0x00000020 & generate_cmask (buf0
[1])); break;
4242 case 4: tmp0
= __byte_perm (buf0
[1], 0, 0x6540);
4243 tmp1
= ~(0x00002000 & generate_cmask (buf0
[1])); break;
4244 case 5: tmp0
= __byte_perm (buf0
[1], 0, 0x6541);
4245 tmp1
= ~(0x00200000 & generate_cmask (buf0
[1])); break;
4246 case 6: tmp0
= __byte_perm (buf0
[1], 0, 0x6542);
4247 tmp1
= ~(0x20000000 & generate_cmask (buf0
[1])); break;
4248 case 7: tmp0
= __byte_perm (buf0
[1], 0, 0x6543);
4249 tmp1
= ~(0x00000020 & generate_cmask (buf0
[2])); break;
4250 case 8: tmp0
= __byte_perm (buf0
[2], 0, 0x6540);
4251 tmp1
= ~(0x00002000 & generate_cmask (buf0
[2])); break;
4252 case 9: tmp0
= __byte_perm (buf0
[2], 0, 0x6541);
4253 tmp1
= ~(0x00200000 & generate_cmask (buf0
[2])); break;
4254 case 10: tmp0
= __byte_perm (buf0
[2], 0, 0x6542);
4255 tmp1
= ~(0x20000000 & generate_cmask (buf0
[2])); break;
4256 case 11: tmp0
= __byte_perm (buf0
[2], 0, 0x6543);
4257 tmp1
= ~(0x00000020 & generate_cmask (buf0
[3])); break;
4258 case 12: tmp0
= __byte_perm (buf0
[3], 0, 0x6540);
4259 tmp1
= ~(0x00002000 & generate_cmask (buf0
[3])); break;
4260 case 13: tmp0
= __byte_perm (buf0
[3], 0, 0x6541);
4261 tmp1
= ~(0x00200000 & generate_cmask (buf0
[3])); break;
4262 case 14: tmp0
= __byte_perm (buf0
[3], 0, 0x6542);
4263 tmp1
= ~(0x20000000 & generate_cmask (buf0
[3])); break;
4264 case 15: tmp0
= __byte_perm (buf0
[3], 0, 0x6543);
4265 tmp1
= ~(0x00000020 & generate_cmask (buf1
[0])); break;
4266 case 16: tmp0
= __byte_perm (buf1
[0], 0, 0x6540);
4267 tmp1
= ~(0x00002000 & generate_cmask (buf1
[0])); break;
4268 case 17: tmp0
= __byte_perm (buf1
[0], 0, 0x6541);
4269 tmp1
= ~(0x00200000 & generate_cmask (buf1
[0])); break;
4270 case 18: tmp0
= __byte_perm (buf1
[0], 0, 0x6542);
4271 tmp1
= ~(0x20000000 & generate_cmask (buf1
[0])); break;
4272 case 19: tmp0
= __byte_perm (buf1
[0], 0, 0x6543);
4273 tmp1
= ~(0x00000020 & generate_cmask (buf1
[1])); break;
4274 case 20: tmp0
= __byte_perm (buf1
[1], 0, 0x6540);
4275 tmp1
= ~(0x00002000 & generate_cmask (buf1
[1])); break;
4276 case 21: tmp0
= __byte_perm (buf1
[1], 0, 0x6541);
4277 tmp1
= ~(0x00200000 & generate_cmask (buf1
[1])); break;
4278 case 22: tmp0
= __byte_perm (buf1
[1], 0, 0x6542);
4279 tmp1
= ~(0x20000000 & generate_cmask (buf1
[1])); break;
4280 case 23: tmp0
= __byte_perm (buf1
[1], 0, 0x6543);
4281 tmp1
= ~(0x00000020 & generate_cmask (buf1
[2])); break;
4282 case 24: tmp0
= __byte_perm (buf1
[2], 0, 0x6540);
4283 tmp1
= ~(0x00002000 & generate_cmask (buf1
[2])); break;
4284 case 25: tmp0
= __byte_perm (buf1
[2], 0, 0x6541);
4285 tmp1
= ~(0x00200000 & generate_cmask (buf1
[2])); break;
4286 case 26: tmp0
= __byte_perm (buf1
[2], 0, 0x6542);
4287 tmp1
= ~(0x20000000 & generate_cmask (buf1
[2])); break;
4288 case 27: tmp0
= __byte_perm (buf1
[2], 0, 0x6543);
4289 tmp1
= ~(0x00000020 & generate_cmask (buf1
[3])); break;
4290 case 28: tmp0
= __byte_perm (buf1
[3], 0, 0x6540);
4291 tmp1
= ~(0x00002000 & generate_cmask (buf1
[3])); break;
4292 case 29: tmp0
= __byte_perm (buf1
[3], 0, 0x6541);
4293 tmp1
= ~(0x00200000 & generate_cmask (buf1
[3])); break;
4294 case 30: tmp0
= __byte_perm (buf1
[3], 0, 0x6542);
4295 tmp1
= ~(0x20000000 & generate_cmask (buf1
[3])); break;
4301 if (tmp0
== ' ') buf0
[0] &= tmp1
;
4305 if (tmp0
== ' ') buf0
[1] &= tmp1
;
4309 if (tmp0
== ' ') buf0
[2] &= tmp1
;
4313 if (tmp0
== ' ') buf0
[3] &= tmp1
;
4317 if (tmp0
== ' ') buf1
[0] &= tmp1
;
4321 if (tmp0
== ' ') buf1
[1] &= tmp1
;
4325 if (tmp0
== ' ') buf1
[2] &= tmp1
;
4329 if (tmp0
== ' ') buf1
[3] &= tmp1
;
4336 if (tmp0
.x
== ' ') buf0
[0].x
&= tmp1
.x
;
4337 if (tmp0
.y
== ' ') buf0
[0].y
&= tmp1
.y
;
4341 if (tmp0
.x
== ' ') buf0
[1].x
&= tmp1
.x
;
4342 if (tmp0
.y
== ' ') buf0
[1].y
&= tmp1
.y
;
4346 if (tmp0
.x
== ' ') buf0
[2].x
&= tmp1
.x
;
4347 if (tmp0
.y
== ' ') buf0
[2].y
&= tmp1
.y
;
4351 if (tmp0
.x
== ' ') buf0
[3].x
&= tmp1
.x
;
4352 if (tmp0
.y
== ' ') buf0
[3].y
&= tmp1
.y
;
4356 if (tmp0
.x
== ' ') buf1
[0].x
&= tmp1
.x
;
4357 if (tmp0
.y
== ' ') buf1
[0].y
&= tmp1
.y
;
4361 if (tmp0
.x
== ' ') buf1
[1].x
&= tmp1
.x
;
4362 if (tmp0
.y
== ' ') buf1
[1].y
&= tmp1
.y
;
4366 if (tmp0
.x
== ' ') buf1
[2].x
&= tmp1
.x
;
4367 if (tmp0
.y
== ' ') buf1
[2].y
&= tmp1
.y
;
4371 if (tmp0
.x
== ' ') buf1
[3].x
&= tmp1
.x
;
4372 if (tmp0
.y
== ' ') buf1
[3].y
&= tmp1
.y
;
4379 if (tmp0
.x
== ' ') buf0
[0].x
&= tmp1
.x
;
4380 if (tmp0
.y
== ' ') buf0
[0].y
&= tmp1
.y
;
4381 if (tmp0
.z
== ' ') buf0
[0].z
&= tmp1
.z
;
4382 if (tmp0
.w
== ' ') buf0
[0].w
&= tmp1
.w
;
4386 if (tmp0
.x
== ' ') buf0
[1].x
&= tmp1
.x
;
4387 if (tmp0
.y
== ' ') buf0
[1].y
&= tmp1
.y
;
4388 if (tmp0
.z
== ' ') buf0
[1].z
&= tmp1
.z
;
4389 if (tmp0
.w
== ' ') buf0
[1].w
&= tmp1
.w
;
4393 if (tmp0
.x
== ' ') buf0
[2].x
&= tmp1
.x
;
4394 if (tmp0
.y
== ' ') buf0
[2].y
&= tmp1
.y
;
4395 if (tmp0
.z
== ' ') buf0
[2].z
&= tmp1
.z
;
4396 if (tmp0
.w
== ' ') buf0
[2].w
&= tmp1
.w
;
4400 if (tmp0
.x
== ' ') buf0
[3].x
&= tmp1
.x
;
4401 if (tmp0
.y
== ' ') buf0
[3].y
&= tmp1
.y
;
4402 if (tmp0
.z
== ' ') buf0
[3].z
&= tmp1
.z
;
4403 if (tmp0
.w
== ' ') buf0
[3].w
&= tmp1
.w
;
4407 if (tmp0
.x
== ' ') buf1
[0].x
&= tmp1
.x
;
4408 if (tmp0
.y
== ' ') buf1
[0].y
&= tmp1
.y
;
4409 if (tmp0
.z
== ' ') buf1
[0].z
&= tmp1
.z
;
4410 if (tmp0
.w
== ' ') buf1
[0].w
&= tmp1
.w
;
4414 if (tmp0
.x
== ' ') buf1
[1].x
&= tmp1
.x
;
4415 if (tmp0
.y
== ' ') buf1
[1].y
&= tmp1
.y
;
4416 if (tmp0
.z
== ' ') buf1
[1].z
&= tmp1
.z
;
4417 if (tmp0
.w
== ' ') buf1
[1].w
&= tmp1
.w
;
4421 if (tmp0
.x
== ' ') buf1
[2].x
&= tmp1
.x
;
4422 if (tmp0
.y
== ' ') buf1
[2].y
&= tmp1
.y
;
4423 if (tmp0
.z
== ' ') buf1
[2].z
&= tmp1
.z
;
4424 if (tmp0
.w
== ' ') buf1
[2].w
&= tmp1
.w
;
4428 if (tmp0
.x
== ' ') buf1
[3].x
&= tmp1
.x
;
4429 if (tmp0
.y
== ' ') buf1
[3].y
&= tmp1
.y
;
4430 if (tmp0
.z
== ' ') buf1
[3].z
&= tmp1
.z
;
4431 if (tmp0
.w
== ' ') buf1
[3].w
&= tmp1
.w
;
4441 __device__
static u32
apply_rule (const u32 name
, const u32 p0
, const u32 p1
, u32x buf0
[4], u32x buf1
[4], const u32 in_len
)
4443 u32 out_len
= in_len
;
4447 case RULE_OP_MANGLE_LREST
: out_len
= rule_op_mangle_lrest (p0
, p1
, buf0
, buf1
, out_len
); break;
4448 case RULE_OP_MANGLE_UREST
: out_len
= rule_op_mangle_urest (p0
, p1
, buf0
, buf1
, out_len
); break;
4449 case RULE_OP_MANGLE_LREST_UFIRST
: out_len
= rule_op_mangle_lrest_ufirst (p0
, p1
, buf0
, buf1
, out_len
); break;
4450 case RULE_OP_MANGLE_UREST_LFIRST
: out_len
= rule_op_mangle_urest_lfirst (p0
, p1
, buf0
, buf1
, out_len
); break;
4451 case RULE_OP_MANGLE_TREST
: out_len
= rule_op_mangle_trest (p0
, p1
, buf0
, buf1
, out_len
); break;
4452 case RULE_OP_MANGLE_TOGGLE_AT
: out_len
= rule_op_mangle_toggle_at (p0
, p1
, buf0
, buf1
, out_len
); break;
4453 case RULE_OP_MANGLE_REVERSE
: out_len
= rule_op_mangle_reverse (p0
, p1
, buf0
, buf1
, out_len
); break;
4454 case RULE_OP_MANGLE_DUPEWORD
: out_len
= rule_op_mangle_dupeword (p0
, p1
, buf0
, buf1
, out_len
); break;
4455 case RULE_OP_MANGLE_DUPEWORD_TIMES
: out_len
= rule_op_mangle_dupeword_times (p0
, p1
, buf0
, buf1
, out_len
); break;
4456 case RULE_OP_MANGLE_REFLECT
: out_len
= rule_op_mangle_reflect (p0
, p1
, buf0
, buf1
, out_len
); break;
4457 case RULE_OP_MANGLE_APPEND
: out_len
= rule_op_mangle_append (p0
, p1
, buf0
, buf1
, out_len
); break;
4458 case RULE_OP_MANGLE_PREPEND
: out_len
= rule_op_mangle_prepend (p0
, p1
, buf0
, buf1
, out_len
); break;
4459 case RULE_OP_MANGLE_ROTATE_LEFT
: out_len
= rule_op_mangle_rotate_left (p0
, p1
, buf0
, buf1
, out_len
); break;
4460 case RULE_OP_MANGLE_ROTATE_RIGHT
: out_len
= rule_op_mangle_rotate_right (p0
, p1
, buf0
, buf1
, out_len
); break;
4461 case RULE_OP_MANGLE_DELETE_FIRST
: out_len
= rule_op_mangle_delete_first (p0
, p1
, buf0
, buf1
, out_len
); break;
4462 case RULE_OP_MANGLE_DELETE_LAST
: out_len
= rule_op_mangle_delete_last (p0
, p1
, buf0
, buf1
, out_len
); break;
4463 case RULE_OP_MANGLE_DELETE_AT
: out_len
= rule_op_mangle_delete_at (p0
, p1
, buf0
, buf1
, out_len
); break;
4464 case RULE_OP_MANGLE_EXTRACT
: out_len
= rule_op_mangle_extract (p0
, p1
, buf0
, buf1
, out_len
); break;
4465 case RULE_OP_MANGLE_OMIT
: out_len
= rule_op_mangle_omit (p0
, p1
, buf0
, buf1
, out_len
); break;
4466 case RULE_OP_MANGLE_INSERT
: out_len
= rule_op_mangle_insert (p0
, p1
, buf0
, buf1
, out_len
); break;
4467 case RULE_OP_MANGLE_OVERSTRIKE
: out_len
= rule_op_mangle_overstrike (p0
, p1
, buf0
, buf1
, out_len
); break;
4468 case RULE_OP_MANGLE_TRUNCATE_AT
: out_len
= rule_op_mangle_truncate_at (p0
, p1
, buf0
, buf1
, out_len
); break;
4469 case RULE_OP_MANGLE_REPLACE
: out_len
= rule_op_mangle_replace (p0
, p1
, buf0
, buf1
, out_len
); break;
4470 //case RULE_OP_MANGLE_PURGECHAR: out_len = rule_op_mangle_purgechar (p0, p1, buf0, buf1, out_len); break;
4471 //case RULE_OP_MANGLE_TOGGLECASE_REC: out_len = rule_op_mangle_togglecase_rec (p0, p1, buf0, buf1, out_len); break;
4472 case RULE_OP_MANGLE_DUPECHAR_FIRST
: out_len
= rule_op_mangle_dupechar_first (p0
, p1
, buf0
, buf1
, out_len
); break;
4473 case RULE_OP_MANGLE_DUPECHAR_LAST
: out_len
= rule_op_mangle_dupechar_last (p0
, p1
, buf0
, buf1
, out_len
); break;
4474 case RULE_OP_MANGLE_DUPECHAR_ALL
: out_len
= rule_op_mangle_dupechar_all (p0
, p1
, buf0
, buf1
, out_len
); break;
4475 case RULE_OP_MANGLE_SWITCH_FIRST
: out_len
= rule_op_mangle_switch_first (p0
, p1
, buf0
, buf1
, out_len
); break;
4476 case RULE_OP_MANGLE_SWITCH_LAST
: out_len
= rule_op_mangle_switch_last (p0
, p1
, buf0
, buf1
, out_len
); break;
4477 case RULE_OP_MANGLE_SWITCH_AT
: out_len
= rule_op_mangle_switch_at (p0
, p1
, buf0
, buf1
, out_len
); break;
4478 case RULE_OP_MANGLE_CHR_SHIFTL
: out_len
= rule_op_mangle_chr_shiftl (p0
, p1
, buf0
, buf1
, out_len
); break;
4479 case RULE_OP_MANGLE_CHR_SHIFTR
: out_len
= rule_op_mangle_chr_shiftr (p0
, p1
, buf0
, buf1
, out_len
); break;
4480 case RULE_OP_MANGLE_CHR_INCR
: out_len
= rule_op_mangle_chr_incr (p0
, p1
, buf0
, buf1
, out_len
); break;
4481 case RULE_OP_MANGLE_CHR_DECR
: out_len
= rule_op_mangle_chr_decr (p0
, p1
, buf0
, buf1
, out_len
); break;
4482 case RULE_OP_MANGLE_REPLACE_NP1
: out_len
= rule_op_mangle_replace_np1 (p0
, p1
, buf0
, buf1
, out_len
); break;
4483 case RULE_OP_MANGLE_REPLACE_NM1
: out_len
= rule_op_mangle_replace_nm1 (p0
, p1
, buf0
, buf1
, out_len
); break;
4484 case RULE_OP_MANGLE_DUPEBLOCK_FIRST
: out_len
= rule_op_mangle_dupeblock_first (p0
, p1
, buf0
, buf1
, out_len
); break;
4485 case RULE_OP_MANGLE_DUPEBLOCK_LAST
: out_len
= rule_op_mangle_dupeblock_last (p0
, p1
, buf0
, buf1
, out_len
); break;
4486 case RULE_OP_MANGLE_TITLE
: out_len
= rule_op_mangle_title (p0
, p1
, buf0
, buf1
, out_len
); break;
4492 __device__
static u32
apply_rules (u32
*cmds
, u32x buf0
[4], u32x buf1
[4], const u32 len
)
4496 for (u32 i
= 0; cmds
[i
] != 0; i
++)
4498 const u32 cmd
= cmds
[i
];
4500 const u32 name
= (cmd
>> 0) & 0xff;
4501 const u32 p0
= (cmd
>> 8) & 0xff;
4502 const u32 p1
= (cmd
>> 16) & 0xff;
4504 out_len
= apply_rule (name
, p0
, p1
, buf0
, buf1
, out_len
);