Initial commit
[hashcat.git] / nv / common_nv.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
/**
 * Three-way comparison of two 4-word digests.
 *
 * Words are compared from the most significant position (d1[3] against
 * d2[DGST_R3]) down to the least significant (d1[0] against d2[DGST_R0]);
 * the first differing pair decides the result.
 *
 * Returns 1 if d1 is greater, -1 if d1 is smaller, 0 if all four pairs match.
 */
__device__ static int device_memcmp (const u32 d1[4], const u32 *d2)
{
  if (d1[3] != d2[DGST_R3]) return ((d1[3] > d2[DGST_R3]) ? 1 : -1);
  if (d1[2] != d2[DGST_R2]) return ((d1[2] > d2[DGST_R2]) ? 1 : -1);
  if (d1[1] != d2[DGST_R1]) return ((d1[1] > d2[DGST_R1]) ? 1 : -1);
  if (d1[0] != d2[DGST_R0]) return ((d1[0] > d2[DGST_R0]) ? 1 : -1);

  return (0);
}
19
/**
 * Binary search for `digest` among the `digests_cnt` entries of digests_buf.
 *
 * Entries are compared with device_memcmp, so the search only works if
 * digests_buf is ordered consistently with that comparison
 * (presumably sorted by the host -- confirm against the caller).
 *
 * Returns the index of the matching entry, or -1 if none matches.
 */
__device__ static int find_hash (const u32 digest[4], const u32 digests_cnt, const digest_t *digests_buf)
{
  u32 lo   = 0;           // first index of the window still to be searched
  u32 span = digests_cnt; // number of entries in that window

  while (span != 0)
  {
    const u32 half = span / 2;
    const u32 mid  = lo + half;

    const int order = device_memcmp (digest, digests_buf[mid].digest_buf);

    if (order == 0) return (mid);

    if (order > 0)
    {
      // target lies above mid: drop the lower half and mid itself
      lo = mid + 1;

      span--;
    }

    span >>= 1;
  }

  return (-1);
}
42
/**
 * Tests a single bit of a bitmap for the given digest word.
 *
 * The 32-bit bitmap word is selected by (digest >> bitmap_shift) & bitmap_mask,
 * the bit inside that word by the low 5 bits of digest.
 *
 * Returns a non-zero value (the isolated bit) if the bit is set, 0 otherwise.
 */
__device__ static u32 check_bitmap (const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
{
  // unsigned constant: a signed `1 << 31` would shift into the sign bit
  const u32 bit = 1u << (digest & 0x1f);

  return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & bit);
}
47
/**
 * Bitmap pre-filter for a 4-word digest.
 *
 * Each digest word i is looked up via check_bitmap in its own bitmap of the
 * first set (bitmap_s1_a..d, using bitmap_shift1) and then of the second set
 * (bitmap_s2_a..d, using bitmap_shift2). The function stops at the first
 * lookup that misses.
 *
 * Note: the digest parameter is declared with 4 elements because all four
 * words (digest[0]..digest[3]) are read; the parameter type is still a plain
 * pointer, so existing callers are unaffected.
 *
 * Returns 1 if all eight lookups hit, 0 otherwise.
 */
__device__ static u32 check (const u32 digest[4], const u32 *bitmap_s1_a, const u32 *bitmap_s1_b, const u32 *bitmap_s1_c, const u32 *bitmap_s1_d, const u32 *bitmap_s2_a, const u32 *bitmap_s2_b, const u32 *bitmap_s2_c, const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
{
  // first bitmap set
  if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
  if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
  if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0);
  if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0);

  // second bitmap set
  if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0);
  if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0);
  if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0);
  if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0);

  return (1);
}
62
63 #ifdef VECT_SIZE1
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 1, element 0): gidvid receives gid, il_pos is stored as is.
 */
__device__ static void mark_hash_s0 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = gid;
  slot->il_pos = il_pos;
}
71
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 1, element 0, "warp" variant): gidvid receives gid unchanged
 * and the element index is folded into il_pos, which for width 1 leaves
 * il_pos unchanged.
 */
__device__ static void mark_hash_s0_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = gid;
  slot->il_pos = il_pos;
}
79 #endif
80
81 #ifdef VECT_SIZE2
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 2, element 0): gidvid receives 2 * gid + 0, il_pos is
 * stored as is.
 */
__device__ static void mark_hash_s0 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = 2 * gid;
  slot->il_pos = il_pos;
}
89
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 2, element 1): gidvid receives 2 * gid + 1, il_pos is
 * stored as is.
 */
__device__ static void mark_hash_s1 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = 2 * gid + 1;
  slot->il_pos = il_pos;
}
97
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 2, element 0, "warp" variant): gidvid receives gid unchanged,
 * il_pos receives 2 * il_pos + 0.
 */
__device__ static void mark_hash_s0_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = gid;
  slot->il_pos = 2 * il_pos;
}
105
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 2, element 1, "warp" variant): gidvid receives gid unchanged,
 * il_pos receives 2 * il_pos + 1.
 */
__device__ static void mark_hash_s1_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = gid;
  slot->il_pos = 2 * il_pos + 1;
}
113 #endif
114
115 #ifdef VECT_SIZE4
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 4, element 0): gidvid receives 4 * gid + 0, il_pos is
 * stored as is.
 */
__device__ static void mark_hash_s0 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = 4 * gid;
  slot->il_pos = il_pos;
}
123
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 4, element 1): gidvid receives 4 * gid + 1, il_pos is
 * stored as is.
 */
__device__ static void mark_hash_s1 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = 4 * gid + 1;
  slot->il_pos = il_pos;
}
131
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 4, element 2): gidvid receives 4 * gid + 2, il_pos is
 * stored as is.
 */
__device__ static void mark_hash_s2 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = 4 * gid + 2;
  slot->il_pos = il_pos;
}
139
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 4, element 3): gidvid receives 4 * gid + 3, il_pos is
 * stored as is.
 */
__device__ static void mark_hash_s3 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = 4 * gid + 3;
  slot->il_pos = il_pos;
}
147
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 4, element 0, "warp" variant): gidvid receives gid unchanged,
 * il_pos receives 4 * il_pos + 0.
 */
__device__ static void mark_hash_s0_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = gid;
  slot->il_pos = 4 * il_pos;
}
155
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 4, element 1, "warp" variant): gidvid receives gid unchanged,
 * il_pos receives 4 * il_pos + 1.
 */
__device__ static void mark_hash_s1_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = gid;
  slot->il_pos = 4 * il_pos + 1;
}
163
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 4, element 2, "warp" variant): gidvid receives gid unchanged,
 * il_pos receives 4 * il_pos + 2.
 */
__device__ static void mark_hash_s2_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = gid;
  slot->il_pos = 4 * il_pos + 2;
}
171
/**
 * Marks digest slot hash_pos as shown and records where it was found
 * (vector width 4, element 3, "warp" variant): gidvid receives gid unchanged,
 * il_pos receives 4 * il_pos + 3.
 */
__device__ static void mark_hash_s3_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  plain_t *slot = plains_buf + hash_pos;

  hashes_shown[hash_pos] = 1;

  slot->gidvid = gid;
  slot->il_pos = 4 * il_pos + 3;
}
179 #endif
180
181 /**
182 * scalar
183 */
184
/**
 * Reverses the byte order of a 32-bit word.
 *
 * For __CUDA_ARCH__ >= 200 the __byte_perm intrinsic is used with selector
 * 0x0123 (result bytes 0..3 taken from source bytes 3..0); otherwise the
 * four bytes are moved with shifts and masks.
 */
__device__ static u32 swap_workaround (const u32 v)
{
  #if __CUDA_ARCH__ >= 200
  const u32 reversed = __byte_perm (v, 0, 0x0123);

  #else
  const u32 byte0_to_3 = (v & 0x000000FF) << 24;
  const u32 byte1_to_2 = (v & 0x0000FF00) <<  8;
  const u32 byte2_to_1 = (v & 0x00FF0000) >>  8;
  const u32 byte3_to_0 = (v & 0xFF000000) >> 24;

  const u32 reversed = byte0_to_3 | byte1_to_2 | byte2_to_1 | byte3_to_0;

  #endif

  return reversed;
}
195
/**
 * Reverses the byte order of a 64-bit word.
 *
 * Done in three stages: swap neighbouring bytes, then neighbouring 16-bit
 * groups, then the two 32-bit halves.
 */
__device__ static u64 swap_workaround (const u64 v)
{
  u64 x = v;

  x = ((x & 0x00ff00ff00ff00ffull) <<  8) | ((x >>  8) & 0x00ff00ff00ff00ffull);
  x = ((x & 0x0000ffff0000ffffull) << 16) | ((x >> 16) & 0x0000ffff0000ffffull);

  return ((x << 32) | (x >> 32));
}
207
/**
 * Zeroes every byte of the 16-byte block w[0..3] from byte position `len`
 * onwards (byte 0 is the low byte of w[0]).
 *
 * The word containing position `len` keeps only its lowest (len % 4) bytes,
 * all following words are cleared. For len >= 16 the block is left untouched.
 */
__device__ static void truncate_block (u32 w[4], const u32 len)
{
  // mask keeping the low (len % 4) bytes: 0, 0xFF, 0xFFFF or 0xFFFFFF
  const u32 keep_mask = (1u << ((len & 3) * 8)) - 1;

  // one case per word so that every array index stays a compile-time constant
  switch (len / 4)
  {
    case 0:
      w[0] &= keep_mask;
      w[1]  = 0;
      w[2]  = 0;
      w[3]  = 0;
      break;

    case 1:
      w[1] &= keep_mask;
      w[2]  = 0;
      w[3]  = 0;
      break;

    case 2:
      w[2] &= keep_mask;
      w[3]  = 0;
      break;

    case 3:
      w[3] &= keep_mask;
      break;
  }
}
270
/**
 * Widens the 16 bytes of `in` to 16-bit units with a zero high byte.
 *
 * Each input word yields two output words: the lower two input bytes land in
 * bytes 0 and 2 of the first, the upper two in bytes 0 and 2 of the second;
 * bytes 1 and 3 are zero. in[0], in[1] fill out1[0..3]; in[2], in[3] fill
 * out2[0..3].
 *
 * Outputs are written from the highest word down to the lowest; this order
 * is kept deliberately so the result is unchanged if buffers overlap.
 */
__device__ static void make_unicode (const u32 in[4], u32 out1[4], u32 out2[4])
{
  #if __CUDA_ARCH__ >= 200
  // selector nibble 7 picks byte 3 of the second operand, which is 0 here
  const u32 sel_upper = 0x7372; // source bytes 2 and 3 -> result bytes 0 and 2
  const u32 sel_lower = 0x7170; // source bytes 0 and 1 -> result bytes 0 and 2

  out2[3] = __byte_perm (in[3], 0, sel_upper);
  out2[2] = __byte_perm (in[3], 0, sel_lower);
  out2[1] = __byte_perm (in[2], 0, sel_upper);
  out2[0] = __byte_perm (in[2], 0, sel_lower);
  out1[3] = __byte_perm (in[1], 0, sel_upper);
  out1[2] = __byte_perm (in[1], 0, sel_lower);
  out1[1] = __byte_perm (in[0], 0, sel_upper);
  out1[0] = __byte_perm (in[0], 0, sel_lower);
  #else
  out2[3] = ((in[3] & 0xFF000000) >> 8) | ((in[3] & 0x00FF0000) >> 16);
  out2[2] = ((in[3] & 0x0000FF00) << 8) |  (in[3] & 0x000000FF);
  out2[1] = ((in[2] & 0xFF000000) >> 8) | ((in[2] & 0x00FF0000) >> 16);
  out2[0] = ((in[2] & 0x0000FF00) << 8) |  (in[2] & 0x000000FF);
  out1[3] = ((in[1] & 0xFF000000) >> 8) | ((in[1] & 0x00FF0000) >> 16);
  out1[2] = ((in[1] & 0x0000FF00) << 8) |  (in[1] & 0x000000FF);
  out1[1] = ((in[0] & 0xFF000000) >> 8) | ((in[0] & 0x00FF0000) >> 16);
  out1[0] = ((in[0] & 0x0000FF00) << 8) |  (in[0] & 0x000000FF);
  #endif
}
293
/**
 * Packs the low byte of every 16-bit unit of in1/in2 into `out`.
 *
 * From each pair of input words, bytes 0 and 2 of the first word become
 * bytes 0 and 1 of the output word and bytes 0 and 2 of the second word
 * become bytes 2 and 3; input bytes 1 and 3 are discarded.
 * in1[0..3] produce out[0], out[1]; in2[0..3] produce out[2], out[3].
 *
 * Outputs are written from out[0] up to out[3]; this order is kept
 * deliberately so the result is unchanged if buffers overlap.
 */
__device__ static void undo_unicode (const u32 in1[4], const u32 in2[4], u32 out[4])
{
  #if __CUDA_ARCH__ >= 200
  // bytes 0 and 2 of the first operand, then bytes 0 and 2 of the second
  const u32 sel_even_bytes = 0x6420;

  out[0] = __byte_perm (in1[0], in1[1], sel_even_bytes);
  out[1] = __byte_perm (in1[2], in1[3], sel_even_bytes);
  out[2] = __byte_perm (in2[0], in2[1], sel_even_bytes);
  out[3] = __byte_perm (in2[2], in2[3], sel_even_bytes);
  #else
  out[0] = ( in1[0]        & 0x000000ff) | ((in1[0] >>  8) & 0x0000ff00)
         | ((in1[1] << 16) & 0x00ff0000) | ((in1[1] <<  8) & 0xff000000);
  out[1] = ( in1[2]        & 0x000000ff) | ((in1[2] >>  8) & 0x0000ff00)
         | ((in1[3] << 16) & 0x00ff0000) | ((in1[3] <<  8) & 0xff000000);
  out[2] = ( in2[0]        & 0x000000ff) | ((in2[0] >>  8) & 0x0000ff00)
         | ((in2[1] << 16) & 0x00ff0000) | ((in2[1] <<  8) & 0xff000000);
  out[3] = ( in2[2]        & 0x000000ff) | ((in2[2] >>  8) & 0x0000ff00)
         | ((in2[3] << 16) & 0x00ff0000) | ((in2[3] <<  8) & 0xff000000);
  #endif
}
312
/**
 * Puts the byte 0x01 at byte position `offset` of the 16-byte buffer w0
 * (byte 0 is the low byte of w0[0]).
 *
 * At a word-aligned offset (offset % 4 == 0) the whole word is overwritten
 * with 0x01; at any other offset the byte is ORed into the existing word.
 * Offsets >= 16 leave the buffer untouched.
 */
__device__ static void append_0x01_1 (u32 w0[4], const u32 offset)
{
  const u32 shift   = (offset & 3) * 8;   // bit position of the byte inside its word
  const u32 mark    = 0x01u << shift;
  const bool aligned = (shift == 0);      // aligned offsets replace the word instead of ORing

  // one case per word so that every array index stays a compile-time constant
  switch (offset / 4)
  {
    case 0: w0[0] = aligned ? mark : (w0[0] | mark); break;
    case 1: w0[1] = aligned ? mark : (w0[1] | mark); break;
    case 2: w0[2] = aligned ? mark : (w0[2] | mark); break;
    case 3: w0[3] = aligned ? mark : (w0[3] | mark); break;
  }
}
382
/**
 * Puts the byte 0x01 at byte position `offset` of the 32-byte buffer formed
 * by w0 followed by w1 (byte 0 is the low byte of w0[0]).
 *
 * At a word-aligned offset (offset % 4 == 0) the whole word is overwritten
 * with 0x01; at any other offset the byte is ORed into the existing word.
 * Offsets >= 32 leave the buffers untouched.
 */
__device__ static void append_0x01_2 (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 shift   = (offset & 3) * 8;   // bit position of the byte inside its word
  const u32 mark    = 0x01u << shift;
  const bool aligned = (shift == 0);      // aligned offsets replace the word instead of ORing

  // one case per word so that every array index stays a compile-time constant
  switch (offset / 4)
  {
    case 0: w0[0] = aligned ? mark : (w0[0] | mark); break;
    case 1: w0[1] = aligned ? mark : (w0[1] | mark); break;
    case 2: w0[2] = aligned ? mark : (w0[2] | mark); break;
    case 3: w0[3] = aligned ? mark : (w0[3] | mark); break;

    case 4: w1[0] = aligned ? mark : (w1[0] | mark); break;
    case 5: w1[1] = aligned ? mark : (w1[1] | mark); break;
    case 6: w1[2] = aligned ? mark : (w1[2] | mark); break;
    case 7: w1[3] = aligned ? mark : (w1[3] | mark); break;
  }
}
516
/**
 * Puts the byte 0x01 at byte position `offset` of the 48-byte buffer formed
 * by w0, w1 and w2 in that order (byte 0 is the low byte of w0[0]).
 *
 * At a word-aligned offset (offset % 4 == 0) the whole word is overwritten
 * with 0x01; at any other offset the byte is ORed into the existing word.
 * Offsets >= 48 leave the buffers untouched.
 */
__device__ static void append_0x01_3 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 shift   = (offset & 3) * 8;   // bit position of the byte inside its word
  const u32 mark    = 0x01u << shift;
  const bool aligned = (shift == 0);      // aligned offsets replace the word instead of ORing

  // one case per word so that every array index stays a compile-time constant
  switch (offset / 4)
  {
    case  0: w0[0] = aligned ? mark : (w0[0] | mark); break;
    case  1: w0[1] = aligned ? mark : (w0[1] | mark); break;
    case  2: w0[2] = aligned ? mark : (w0[2] | mark); break;
    case  3: w0[3] = aligned ? mark : (w0[3] | mark); break;

    case  4: w1[0] = aligned ? mark : (w1[0] | mark); break;
    case  5: w1[1] = aligned ? mark : (w1[1] | mark); break;
    case  6: w1[2] = aligned ? mark : (w1[2] | mark); break;
    case  7: w1[3] = aligned ? mark : (w1[3] | mark); break;

    case  8: w2[0] = aligned ? mark : (w2[0] | mark); break;
    case  9: w2[1] = aligned ? mark : (w2[1] | mark); break;
    case 10: w2[2] = aligned ? mark : (w2[2] | mark); break;
    case 11: w2[3] = aligned ? mark : (w2[3] | mark); break;
  }
}
714
/**
 * Puts the byte 0x01 at byte position `offset` of the 64-byte buffer formed
 * by w0, w1, w2 and w3 in that order (byte 0 is the low byte of w0[0]).
 *
 * At a word-aligned offset (offset % 4 == 0) the whole word is overwritten
 * with 0x01; at any other offset the byte is ORed into the existing word.
 * Offsets >= 64 leave the buffers untouched.
 */
__device__ static void append_0x01_4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 shift   = (offset & 3) * 8;   // bit position of the byte inside its word
  const u32 mark    = 0x01u << shift;
  const bool aligned = (shift == 0);      // aligned offsets replace the word instead of ORing

  // one case per word so that every array index stays a compile-time constant
  switch (offset / 4)
  {
    case  0: w0[0] = aligned ? mark : (w0[0] | mark); break;
    case  1: w0[1] = aligned ? mark : (w0[1] | mark); break;
    case  2: w0[2] = aligned ? mark : (w0[2] | mark); break;
    case  3: w0[3] = aligned ? mark : (w0[3] | mark); break;

    case  4: w1[0] = aligned ? mark : (w1[0] | mark); break;
    case  5: w1[1] = aligned ? mark : (w1[1] | mark); break;
    case  6: w1[2] = aligned ? mark : (w1[2] | mark); break;
    case  7: w1[3] = aligned ? mark : (w1[3] | mark); break;

    case  8: w2[0] = aligned ? mark : (w2[0] | mark); break;
    case  9: w2[1] = aligned ? mark : (w2[1] | mark); break;
    case 10: w2[2] = aligned ? mark : (w2[2] | mark); break;
    case 11: w2[3] = aligned ? mark : (w2[3] | mark); break;

    case 12: w3[0] = aligned ? mark : (w3[0] | mark); break;
    case 13: w3[1] = aligned ? mark : (w3[1] | mark); break;
    case 14: w3[2] = aligned ? mark : (w3[2] | mark); break;
    case 15: w3[3] = aligned ? mark : (w3[3] | mark); break;
  }
}
976
/**
 * Puts the byte 0x01 at byte position `offset` of the 128-byte buffer formed
 * by w0 .. w7 in that order (byte 0 is the low byte of w0[0]).
 *
 * At a word-aligned offset (offset % 4 == 0) the whole word is overwritten
 * with 0x01; at any other offset the byte is ORed into the existing word.
 * Offsets >= 128 leave the buffers untouched.
 */
__device__ static void append_0x01_8 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{
  const u32 shift   = (offset & 3) * 8;   // bit position of the byte inside its word
  const u32 mark    = 0x01u << shift;
  const bool aligned = (shift == 0);      // aligned offsets replace the word instead of ORing

  // one case per word so that every array index stays a compile-time constant
  switch (offset / 4)
  {
    case  0: w0[0] = aligned ? mark : (w0[0] | mark); break;
    case  1: w0[1] = aligned ? mark : (w0[1] | mark); break;
    case  2: w0[2] = aligned ? mark : (w0[2] | mark); break;
    case  3: w0[3] = aligned ? mark : (w0[3] | mark); break;

    case  4: w1[0] = aligned ? mark : (w1[0] | mark); break;
    case  5: w1[1] = aligned ? mark : (w1[1] | mark); break;
    case  6: w1[2] = aligned ? mark : (w1[2] | mark); break;
    case  7: w1[3] = aligned ? mark : (w1[3] | mark); break;

    case  8: w2[0] = aligned ? mark : (w2[0] | mark); break;
    case  9: w2[1] = aligned ? mark : (w2[1] | mark); break;
    case 10: w2[2] = aligned ? mark : (w2[2] | mark); break;
    case 11: w2[3] = aligned ? mark : (w2[3] | mark); break;

    case 12: w3[0] = aligned ? mark : (w3[0] | mark); break;
    case 13: w3[1] = aligned ? mark : (w3[1] | mark); break;
    case 14: w3[2] = aligned ? mark : (w3[2] | mark); break;
    case 15: w3[3] = aligned ? mark : (w3[3] | mark); break;

    case 16: w4[0] = aligned ? mark : (w4[0] | mark); break;
    case 17: w4[1] = aligned ? mark : (w4[1] | mark); break;
    case 18: w4[2] = aligned ? mark : (w4[2] | mark); break;
    case 19: w4[3] = aligned ? mark : (w4[3] | mark); break;

    case 20: w5[0] = aligned ? mark : (w5[0] | mark); break;
    case 21: w5[1] = aligned ? mark : (w5[1] | mark); break;
    case 22: w5[2] = aligned ? mark : (w5[2] | mark); break;
    case 23: w5[3] = aligned ? mark : (w5[3] | mark); break;

    case 24: w6[0] = aligned ? mark : (w6[0] | mark); break;
    case 25: w6[1] = aligned ? mark : (w6[1] | mark); break;
    case 26: w6[2] = aligned ? mark : (w6[2] | mark); break;
    case 27: w6[3] = aligned ? mark : (w6[3] | mark); break;

    case 28: w7[0] = aligned ? mark : (w7[0] | mark); break;
    case 29: w7[1] = aligned ? mark : (w7[1] | mark); break;
    case 30: w7[2] = aligned ? mark : (w7[2] | mark); break;
    case 31: w7[3] = aligned ? mark : (w7[3] | mark); break;
  }
}
1494
/**
 * Puts the byte 0x02 at byte position `offset` of the 16-byte buffer w0
 * (byte 0 is the low byte of w0[0]).
 *
 * At a word-aligned offset (offset % 4 == 0) the whole word is overwritten
 * with 0x02; at any other offset the byte is ORed into the existing word.
 * Offsets >= 16 leave the buffer untouched.
 */
__device__ static void append_0x02_1 (u32 w0[4], const u32 offset)
{
  const u32 shift   = (offset & 3) * 8;   // bit position of the byte inside its word
  const u32 mark    = 0x02u << shift;
  const bool aligned = (shift == 0);      // aligned offsets replace the word instead of ORing

  // one case per word so that every array index stays a compile-time constant
  switch (offset / 4)
  {
    case 0: w0[0] = aligned ? mark : (w0[0] | mark); break;
    case 1: w0[1] = aligned ? mark : (w0[1] | mark); break;
    case 2: w0[2] = aligned ? mark : (w0[2] | mark); break;
    case 3: w0[3] = aligned ? mark : (w0[3] | mark); break;
  }
}
1564
/**
 * Puts the byte 0x02 at byte position `offset` of the 32-byte buffer formed
 * by w0 followed by w1 (byte 0 is the low byte of w0[0]).
 *
 * At a word-aligned offset (offset % 4 == 0) the whole word is overwritten
 * with 0x02; at any other offset the byte is ORed into the existing word.
 * Offsets >= 32 leave the buffers untouched.
 */
__device__ static void append_0x02_2 (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 shift   = (offset & 3) * 8;   // bit position of the byte inside its word
  const u32 mark    = 0x02u << shift;
  const bool aligned = (shift == 0);      // aligned offsets replace the word instead of ORing

  // one case per word so that every array index stays a compile-time constant
  switch (offset / 4)
  {
    case 0: w0[0] = aligned ? mark : (w0[0] | mark); break;
    case 1: w0[1] = aligned ? mark : (w0[1] | mark); break;
    case 2: w0[2] = aligned ? mark : (w0[2] | mark); break;
    case 3: w0[3] = aligned ? mark : (w0[3] | mark); break;

    case 4: w1[0] = aligned ? mark : (w1[0] | mark); break;
    case 5: w1[1] = aligned ? mark : (w1[1] | mark); break;
    case 6: w1[2] = aligned ? mark : (w1[2] | mark); break;
    case 7: w1[3] = aligned ? mark : (w1[3] | mark); break;
  }
}
1698
/**
 * Puts the byte 0x02 at byte position `offset` of the 48-byte buffer formed
 * by w0, w1 and w2 in that order (byte 0 is the low byte of w0[0]).
 *
 * At a word-aligned offset (offset % 4 == 0) the whole word is overwritten
 * with 0x02; at any other offset the byte is ORed into the existing word.
 * Offsets >= 48 leave the buffers untouched.
 */
__device__ static void append_0x02_3 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 shift   = (offset & 3) * 8;   // bit position of the byte inside its word
  const u32 mark    = 0x02u << shift;
  const bool aligned = (shift == 0);      // aligned offsets replace the word instead of ORing

  // one case per word so that every array index stays a compile-time constant
  switch (offset / 4)
  {
    case  0: w0[0] = aligned ? mark : (w0[0] | mark); break;
    case  1: w0[1] = aligned ? mark : (w0[1] | mark); break;
    case  2: w0[2] = aligned ? mark : (w0[2] | mark); break;
    case  3: w0[3] = aligned ? mark : (w0[3] | mark); break;

    case  4: w1[0] = aligned ? mark : (w1[0] | mark); break;
    case  5: w1[1] = aligned ? mark : (w1[1] | mark); break;
    case  6: w1[2] = aligned ? mark : (w1[2] | mark); break;
    case  7: w1[3] = aligned ? mark : (w1[3] | mark); break;

    case  8: w2[0] = aligned ? mark : (w2[0] | mark); break;
    case  9: w2[1] = aligned ? mark : (w2[1] | mark); break;
    case 10: w2[2] = aligned ? mark : (w2[2] | mark); break;
    case 11: w2[3] = aligned ? mark : (w2[3] | mark); break;
  }
}
1896
/**
 * Place the marker byte 0x02 at byte position `offset` of the 64-byte
 * buffer spanned by w0..w3 (four u32 words each; byte 0 of a word is its
 * least significant byte).
 *
 * w0..w3: the four 4-word arrays, modified in place.
 * offset: byte position, 0..63. Any other value leaves all arrays
 *         untouched.
 *
 * When `offset` is a multiple of four the selected word is overwritten
 * with 0x02 (its previous content is discarded); for every other position
 * the marker is OR-ed into the existing word.
 */
__device__ static void append_0x02_4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  // position of the marker inside its word (0 = least significant byte)
  const u32 lane = offset & 3;

  // marker value already shifted into that byte
  const u32 mark = (u32) 0x02 << (lane * 8);

  // one case per 32-bit word, array indices are kept as literal constants
  switch (offset >> 2)
  {
    case  0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case  1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case  2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case  3: w0[3] = lane ? (w0[3] | mark) : mark; break;

    case  4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case  5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case  6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case  7: w1[3] = lane ? (w1[3] | mark) : mark; break;

    case  8: w2[0] = lane ? (w2[0] | mark) : mark; break;
    case  9: w2[1] = lane ? (w2[1] | mark) : mark; break;
    case 10: w2[2] = lane ? (w2[2] | mark) : mark; break;
    case 11: w2[3] = lane ? (w2[3] | mark) : mark; break;

    case 12: w3[0] = lane ? (w3[0] | mark) : mark; break;
    case 13: w3[1] = lane ? (w3[1] | mark) : mark; break;
    case 14: w3[2] = lane ? (w3[2] | mark) : mark; break;
    case 15: w3[3] = lane ? (w3[3] | mark) : mark; break;
  }
}
2158
/**
 * Place the marker byte 0x02 at byte position `offset` of the 128-byte
 * buffer spanned by w0..w7 (four u32 words each; byte 0 of a word is its
 * least significant byte).
 *
 * w0..w7: the eight 4-word arrays, modified in place.
 * offset: byte position, 0..127. Any other value leaves all arrays
 *         untouched.
 *
 * When `offset` is a multiple of four the selected word is overwritten
 * with 0x02 (its previous content is discarded); for every other position
 * the marker is OR-ed into the existing word.
 */
__device__ static void append_0x02_8 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{
  // position of the marker inside its word (0 = least significant byte)
  const u32 lane = offset & 3;

  // marker value already shifted into that byte
  const u32 mark = (u32) 0x02 << (lane * 8);

  // one case per 32-bit word, array indices are kept as literal constants
  switch (offset >> 2)
  {
    case  0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case  1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case  2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case  3: w0[3] = lane ? (w0[3] | mark) : mark; break;

    case  4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case  5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case  6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case  7: w1[3] = lane ? (w1[3] | mark) : mark; break;

    case  8: w2[0] = lane ? (w2[0] | mark) : mark; break;
    case  9: w2[1] = lane ? (w2[1] | mark) : mark; break;
    case 10: w2[2] = lane ? (w2[2] | mark) : mark; break;
    case 11: w2[3] = lane ? (w2[3] | mark) : mark; break;

    case 12: w3[0] = lane ? (w3[0] | mark) : mark; break;
    case 13: w3[1] = lane ? (w3[1] | mark) : mark; break;
    case 14: w3[2] = lane ? (w3[2] | mark) : mark; break;
    case 15: w3[3] = lane ? (w3[3] | mark) : mark; break;

    case 16: w4[0] = lane ? (w4[0] | mark) : mark; break;
    case 17: w4[1] = lane ? (w4[1] | mark) : mark; break;
    case 18: w4[2] = lane ? (w4[2] | mark) : mark; break;
    case 19: w4[3] = lane ? (w4[3] | mark) : mark; break;

    case 20: w5[0] = lane ? (w5[0] | mark) : mark; break;
    case 21: w5[1] = lane ? (w5[1] | mark) : mark; break;
    case 22: w5[2] = lane ? (w5[2] | mark) : mark; break;
    case 23: w5[3] = lane ? (w5[3] | mark) : mark; break;

    case 24: w6[0] = lane ? (w6[0] | mark) : mark; break;
    case 25: w6[1] = lane ? (w6[1] | mark) : mark; break;
    case 26: w6[2] = lane ? (w6[2] | mark) : mark; break;
    case 27: w6[3] = lane ? (w6[3] | mark) : mark; break;

    case 28: w7[0] = lane ? (w7[0] | mark) : mark; break;
    case 29: w7[1] = lane ? (w7[1] | mark) : mark; break;
    case 30: w7[2] = lane ? (w7[2] | mark) : mark; break;
    case 31: w7[3] = lane ? (w7[3] | mark) : mark; break;
  }
}
2676
/**
 * Place the marker byte 0x80 at byte position `offset` of the 16-byte
 * buffer w0 (four u32 words; byte 0 of a word is its least significant
 * byte).
 *
 * w0:     the 4-word array, modified in place.
 * offset: byte position, 0..15. Any other value leaves w0 untouched.
 *
 * When `offset` is a multiple of four the selected word is overwritten
 * with 0x80 (its previous content is discarded); for every other position
 * the marker is OR-ed into the existing word.
 */
__device__ static void append_0x80_1 (u32 w0[4], const u32 offset)
{
  // position of the marker inside its word (0 = least significant byte)
  const u32 lane = offset & 3;

  // marker value already shifted into that byte
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per 32-bit word, array indices are kept as literal constants
  switch (offset >> 2)
  {
    case 0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case 1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case 2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case 3: w0[3] = lane ? (w0[3] | mark) : mark; break;
  }
}
2746
/**
 * Place the marker byte 0x80 at byte position `offset` of the 32-byte
 * buffer spanned by w0 and w1 (four u32 words each; byte 0 of a word is
 * its least significant byte).
 *
 * w0, w1: the two 4-word arrays, modified in place.
 * offset: byte position, 0..31. Any other value leaves both arrays
 *         untouched.
 *
 * When `offset` is a multiple of four the selected word is overwritten
 * with 0x80 (its previous content is discarded); for every other position
 * the marker is OR-ed into the existing word.
 */
__device__ static void append_0x80_2 (u32 w0[4], u32 w1[4], const u32 offset)
{
  // position of the marker inside its word (0 = least significant byte)
  const u32 lane = offset & 3;

  // marker value already shifted into that byte
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per 32-bit word, array indices are kept as literal constants
  switch (offset >> 2)
  {
    case 0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case 1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case 2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case 3: w0[3] = lane ? (w0[3] | mark) : mark; break;

    case 4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case 5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case 6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case 7: w1[3] = lane ? (w1[3] | mark) : mark; break;
  }
}
2880
/**
 * Place the marker byte 0x80 at byte position `offset` of the 48-byte
 * buffer spanned by w0, w1 and w2 (four u32 words each; byte 0 of a word
 * is its least significant byte).
 *
 * w0, w1, w2: the three 4-word arrays, modified in place.
 * offset:     byte position, 0..47. Any other value leaves all arrays
 *             untouched.
 *
 * When `offset` is a multiple of four the selected word is overwritten
 * with 0x80 (its previous content is discarded); for every other position
 * the marker is OR-ed into the existing word.
 */
__device__ static void append_0x80_3 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  // position of the marker inside its word (0 = least significant byte)
  const u32 lane = offset & 3;

  // marker value already shifted into that byte
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per 32-bit word, array indices are kept as literal constants
  switch (offset >> 2)
  {
    case  0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case  1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case  2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case  3: w0[3] = lane ? (w0[3] | mark) : mark; break;

    case  4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case  5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case  6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case  7: w1[3] = lane ? (w1[3] | mark) : mark; break;

    case  8: w2[0] = lane ? (w2[0] | mark) : mark; break;
    case  9: w2[1] = lane ? (w2[1] | mark) : mark; break;
    case 10: w2[2] = lane ? (w2[2] | mark) : mark; break;
    case 11: w2[3] = lane ? (w2[3] | mark) : mark; break;
  }
}
3078
/**
 * Place the marker byte 0x80 at byte position `offset` of the 64-byte
 * buffer spanned by w0..w3 (four u32 words each; byte 0 of a word is its
 * least significant byte).
 *
 * w0..w3: the four 4-word arrays, modified in place.
 * offset: byte position, 0..63. Any other value leaves all arrays
 *         untouched.
 *
 * When `offset` is a multiple of four the selected word is overwritten
 * with 0x80 (its previous content is discarded); for every other position
 * the marker is OR-ed into the existing word.
 */
__device__ static void append_0x80_4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  // position of the marker inside its word (0 = least significant byte)
  const u32 lane = offset & 3;

  // marker value already shifted into that byte
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per 32-bit word, array indices are kept as literal constants
  switch (offset >> 2)
  {
    case  0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case  1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case  2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case  3: w0[3] = lane ? (w0[3] | mark) : mark; break;

    case  4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case  5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case  6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case  7: w1[3] = lane ? (w1[3] | mark) : mark; break;

    case  8: w2[0] = lane ? (w2[0] | mark) : mark; break;
    case  9: w2[1] = lane ? (w2[1] | mark) : mark; break;
    case 10: w2[2] = lane ? (w2[2] | mark) : mark; break;
    case 11: w2[3] = lane ? (w2[3] | mark) : mark; break;

    case 12: w3[0] = lane ? (w3[0] | mark) : mark; break;
    case 13: w3[1] = lane ? (w3[1] | mark) : mark; break;
    case 14: w3[2] = lane ? (w3[2] | mark) : mark; break;
    case 15: w3[3] = lane ? (w3[3] | mark) : mark; break;
  }
}
3340
/**
 * Place the marker byte 0x80 at byte position `offset` of the 128-byte
 * buffer spanned by w0..w7 (four u32 words each; byte 0 of a word is its
 * least significant byte).
 *
 * w0..w7: the eight 4-word arrays, modified in place.
 * offset: byte position, 0..127. Any other value leaves all arrays
 *         untouched.
 *
 * When `offset` is a multiple of four the selected word is overwritten
 * with 0x80 (its previous content is discarded); for every other position
 * the marker is OR-ed into the existing word.
 */
__device__ static void append_0x80_8 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{
  // position of the marker inside its word (0 = least significant byte)
  const u32 lane = offset & 3;

  // marker value already shifted into that byte
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per 32-bit word, array indices are kept as literal constants
  switch (offset >> 2)
  {
    case  0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case  1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case  2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case  3: w0[3] = lane ? (w0[3] | mark) : mark; break;

    case  4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case  5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case  6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case  7: w1[3] = lane ? (w1[3] | mark) : mark; break;

    case  8: w2[0] = lane ? (w2[0] | mark) : mark; break;
    case  9: w2[1] = lane ? (w2[1] | mark) : mark; break;
    case 10: w2[2] = lane ? (w2[2] | mark) : mark; break;
    case 11: w2[3] = lane ? (w2[3] | mark) : mark; break;

    case 12: w3[0] = lane ? (w3[0] | mark) : mark; break;
    case 13: w3[1] = lane ? (w3[1] | mark) : mark; break;
    case 14: w3[2] = lane ? (w3[2] | mark) : mark; break;
    case 15: w3[3] = lane ? (w3[3] | mark) : mark; break;

    case 16: w4[0] = lane ? (w4[0] | mark) : mark; break;
    case 17: w4[1] = lane ? (w4[1] | mark) : mark; break;
    case 18: w4[2] = lane ? (w4[2] | mark) : mark; break;
    case 19: w4[3] = lane ? (w4[3] | mark) : mark; break;

    case 20: w5[0] = lane ? (w5[0] | mark) : mark; break;
    case 21: w5[1] = lane ? (w5[1] | mark) : mark; break;
    case 22: w5[2] = lane ? (w5[2] | mark) : mark; break;
    case 23: w5[3] = lane ? (w5[3] | mark) : mark; break;

    case 24: w6[0] = lane ? (w6[0] | mark) : mark; break;
    case 25: w6[1] = lane ? (w6[1] | mark) : mark; break;
    case 26: w6[2] = lane ? (w6[2] | mark) : mark; break;
    case 27: w6[3] = lane ? (w6[3] | mark) : mark; break;

    case 28: w7[0] = lane ? (w7[0] | mark) : mark; break;
    case 29: w7[1] = lane ? (w7[1] | mark) : mark; break;
    case 30: w7[2] = lane ? (w7[2] | mark) : mark; break;
    case 31: w7[3] = lane ? (w7[3] | mark) : mark; break;
  }
}
3858
/**
 * Stores the byte 0x80 at byte position `offset` of the 16-word buffer `w`.
 * Byte b (0..3) of a word occupies bits 8*b .. 8*b+7 of that word.
 *
 * - offset is a multiple of 4: the target word is replaced by 0x80
 *   (its previous content is discarded).
 * - otherwise: 0x80 is OR-ed into the selected byte lane and all other
 *   bits of the target word are kept.
 * - offset >= 64: nothing is written.
 *
 * The word is selected with a switch so that every subscript of `w`
 * remains a compile-time constant.
 */
__device__ static void append_0x80_4 (u32 w[16], const u32 offset)
{
  const u32 lane   = offset & 3;                  // byte lane inside the word
  const u32 marker = (u32) 0x80 << (lane * 8);    // 0x80 moved into that lane

  // lane == 0 overwrites the word, any other lane merges into it
  switch (offset >> 2)
  {
    case  0: w[ 0] = (lane ? w[ 0] : 0) | marker; break;
    case  1: w[ 1] = (lane ? w[ 1] : 0) | marker; break;
    case  2: w[ 2] = (lane ? w[ 2] : 0) | marker; break;
    case  3: w[ 3] = (lane ? w[ 3] : 0) | marker; break;
    case  4: w[ 4] = (lane ? w[ 4] : 0) | marker; break;
    case  5: w[ 5] = (lane ? w[ 5] : 0) | marker; break;
    case  6: w[ 6] = (lane ? w[ 6] : 0) | marker; break;
    case  7: w[ 7] = (lane ? w[ 7] : 0) | marker; break;
    case  8: w[ 8] = (lane ? w[ 8] : 0) | marker; break;
    case  9: w[ 9] = (lane ? w[ 9] : 0) | marker; break;
    case 10: w[10] = (lane ? w[10] : 0) | marker; break;
    case 11: w[11] = (lane ? w[11] : 0) | marker; break;
    case 12: w[12] = (lane ? w[12] : 0) | marker; break;
    case 13: w[13] = (lane ? w[13] : 0) | marker; break;
    case 14: w[14] = (lane ? w[14] : 0) | marker; break;
    case 15: w[15] = (lane ? w[15] : 0) | marker; break;
  }
}
4120
/**
 * Stores the byte 0x80 at byte position `offset` of the 32-word buffer `w`.
 * Byte b (0..3) of a word occupies bits 8*b .. 8*b+7 of that word.
 *
 * - offset is a multiple of 4: the target word is replaced by 0x80
 *   (its previous content is discarded).
 * - otherwise: 0x80 is OR-ed into the selected byte lane and all other
 *   bits of the target word are kept.
 * - offset >= 128: nothing is written.
 *
 * The word is selected with a switch so that every subscript of `w`
 * remains a compile-time constant.
 */
__device__ static void append_0x80_8 (u32 w[32], const u32 offset)
{
  const u32 lane   = offset & 3;                  // byte lane inside the word
  const u32 marker = (u32) 0x80 << (lane * 8);    // 0x80 moved into that lane

  // lane == 0 overwrites the word, any other lane merges into it
  switch (offset >> 2)
  {
    case  0: w[ 0] = (lane ? w[ 0] : 0) | marker; break;
    case  1: w[ 1] = (lane ? w[ 1] : 0) | marker; break;
    case  2: w[ 2] = (lane ? w[ 2] : 0) | marker; break;
    case  3: w[ 3] = (lane ? w[ 3] : 0) | marker; break;
    case  4: w[ 4] = (lane ? w[ 4] : 0) | marker; break;
    case  5: w[ 5] = (lane ? w[ 5] : 0) | marker; break;
    case  6: w[ 6] = (lane ? w[ 6] : 0) | marker; break;
    case  7: w[ 7] = (lane ? w[ 7] : 0) | marker; break;
    case  8: w[ 8] = (lane ? w[ 8] : 0) | marker; break;
    case  9: w[ 9] = (lane ? w[ 9] : 0) | marker; break;
    case 10: w[10] = (lane ? w[10] : 0) | marker; break;
    case 11: w[11] = (lane ? w[11] : 0) | marker; break;
    case 12: w[12] = (lane ? w[12] : 0) | marker; break;
    case 13: w[13] = (lane ? w[13] : 0) | marker; break;
    case 14: w[14] = (lane ? w[14] : 0) | marker; break;
    case 15: w[15] = (lane ? w[15] : 0) | marker; break;
    case 16: w[16] = (lane ? w[16] : 0) | marker; break;
    case 17: w[17] = (lane ? w[17] : 0) | marker; break;
    case 18: w[18] = (lane ? w[18] : 0) | marker; break;
    case 19: w[19] = (lane ? w[19] : 0) | marker; break;
    case 20: w[20] = (lane ? w[20] : 0) | marker; break;
    case 21: w[21] = (lane ? w[21] : 0) | marker; break;
    case 22: w[22] = (lane ? w[22] : 0) | marker; break;
    case 23: w[23] = (lane ? w[23] : 0) | marker; break;
    case 24: w[24] = (lane ? w[24] : 0) | marker; break;
    case 25: w[25] = (lane ? w[25] : 0) | marker; break;
    case 26: w[26] = (lane ? w[26] : 0) | marker; break;
    case 27: w[27] = (lane ? w[27] : 0) | marker; break;
    case 28: w[28] = (lane ? w[28] : 0) | marker; break;
    case 29: w[29] = (lane ? w[29] : 0) | marker; break;
    case 30: w[30] = (lane ? w[30] : 0) | marker; break;
    case 31: w[31] = (lane ? w[31] : 0) | marker; break;
  }
}
4638
/**
 * Places the words of `src_r0` into the 2-word destination `dst0`, starting
 * at byte position `offset` (byte b of a word occupies bits 8*b .. 8*b+7).
 *
 * - Unaligned offset (offset % 4 != 0): the first written word is
 *   src_l0[word] OR-ed with the low part of src_r0[0]; following words are
 *   stitched together from neighbouring src_r0 words.
 * - Aligned offset: src_r0 words are copied as-is; src_l0 is not read.
 * - Words of dst0 in front of the insertion point are never written, and
 *   data that would land beyond dst0[1] is dropped.
 * - offset == 0 and offset >= 8 write nothing.
 */
__device__ static void device_memcat2L (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2])
{
  const u32 word = offset >> 2;         // first destination word touched
  const u32 lsh  = (offset & 3) * 8;    // bit shift inside that word

  if (lsh == 0)
  {
    // word-aligned: plain copy (word 0 and words past the end are no-ops)
    if (word == 1)
    {
      dst0[1] = src_r0[0];
    }

    return;
  }

  const u32 rsh = 32 - lsh;             // 8, 16 or 24 - never a full-width shift

  switch (word)
  {
    case 0:
      dst0[0] = src_l0[0] | (src_r0[0] << lsh);
      dst0[1] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      break;

    case 1:
      dst0[1] = src_l0[1] | (src_r0[0] << lsh);
      break;
  }
}
4675
/**
 * Places the words of `src_r0` into the 4-word destination `dst0`, starting
 * at byte position `offset` (byte b of a word occupies bits 8*b .. 8*b+7).
 *
 * - Unaligned offset (offset % 4 != 0): the first written word is
 *   src_l0[word] OR-ed with the low part of src_r0[0]; following words are
 *   stitched together from neighbouring src_r0 words.
 * - Aligned offset: src_r0 words are copied as-is; src_l0 is not read.
 * - Words of dst0 in front of the insertion point are never written, and
 *   data that would land beyond dst0[3] is dropped.
 * - offset == 0 and offset >= 16 write nothing.
 */
__device__ static void device_memcat4L (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4])
{
  const u32 word = offset >> 2;         // first destination word touched
  const u32 lsh  = (offset & 3) * 8;    // bit shift inside that word

  if (lsh == 0)
  {
    // word-aligned: plain copy (word 0 and words past the end are no-ops)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        break;

      case 3:
        dst0[3] = src_r0[0];
        break;
    }

    return;
  }

  const u32 rsh = 32 - lsh;             // 8, 16 or 24 - never a full-width shift

  switch (word)
  {
    case 0:
      dst0[0] = src_l0[0] | (src_r0[0] << lsh);
      dst0[1] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst0[2] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst0[3] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      break;

    case 1:
      dst0[1] = src_l0[1] | (src_r0[0] << lsh);
      dst0[2] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst0[3] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      break;

    case 2:
      dst0[2] = src_l0[2] | (src_r0[0] << lsh);
      dst0[3] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      break;

    case 3:
      dst0[3] = src_l0[3] | (src_r0[0] << lsh);
      break;
  }
}
4762
/**
 * Places the four words of `src_r0` into the 8-word destination formed by
 * `dst0` (words 0..3) followed by `dst1` (words 4..7), starting at byte
 * position `offset` (byte b of a word occupies bits 8*b .. 8*b+7).
 *
 * - Unaligned offset (offset % 4 != 0): the first written word is the
 *   matching word of src_l0/src_l1 OR-ed with the low part of src_r0[0];
 *   following words are stitched together from neighbouring src_r0 words,
 *   and the high part of src_r0[3] goes into one extra word if it fits.
 * - Aligned offset: src_r0 words are copied as-is; src_l0/src_l1 are not read.
 * - Destination words in front of the insertion point are never written,
 *   and data that would land beyond dst1[3] is dropped.
 * - offset == 0 and offset >= 32 write nothing.
 */
__device__ static void device_memcat8L (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4])
{
  const u32 word = offset >> 2;         // first destination word touched (0..7)
  const u32 lsh  = (offset & 3) * 8;    // bit shift inside that word

  if (lsh == 0)
  {
    // word-aligned: plain copy (word 0 and words past the end are no-ops)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        dst1[0] = src_r0[3];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        dst1[0] = src_r0[2];
        dst1[1] = src_r0[3];
        break;

      case 3:
        dst0[3] = src_r0[0];
        dst1[0] = src_r0[1];
        dst1[1] = src_r0[2];
        dst1[2] = src_r0[3];
        break;

      case 4:
        dst1[0] = src_r0[0];
        dst1[1] = src_r0[1];
        dst1[2] = src_r0[2];
        dst1[3] = src_r0[3];
        break;

      case 5:
        dst1[1] = src_r0[0];
        dst1[2] = src_r0[1];
        dst1[3] = src_r0[2];
        break;

      case 6:
        dst1[2] = src_r0[0];
        dst1[3] = src_r0[1];
        break;

      case 7:
        dst1[3] = src_r0[0];
        break;
    }

    return;
  }

  const u32 rsh = 32 - lsh;             // 8, 16 or 24 - never a full-width shift

  switch (word)
  {
    case 0:
      dst0[0] = src_l0[0] | (src_r0[0] << lsh);
      dst0[1] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst0[2] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst0[3] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst1[0] = src_r0[3] >> rsh;
      break;

    case 1:
      dst0[1] = src_l0[1] | (src_r0[0] << lsh);
      dst0[2] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst0[3] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst1[0] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst1[1] = src_r0[3] >> rsh;
      break;

    case 2:
      dst0[2] = src_l0[2] | (src_r0[0] << lsh);
      dst0[3] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst1[0] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst1[1] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst1[2] = src_r0[3] >> rsh;
      break;

    case 3:
      dst0[3] = src_l0[3] | (src_r0[0] << lsh);
      dst1[0] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst1[1] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst1[2] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst1[3] = src_r0[3] >> rsh;
      break;

    case 4:
      dst1[0] = src_l1[0] | (src_r0[0] << lsh);
      dst1[1] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst1[2] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst1[3] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      break;

    case 5:
      dst1[1] = src_l1[1] | (src_r0[0] << lsh);
      dst1[2] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst1[3] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      break;

    case 6:
      dst1[2] = src_l1[2] | (src_r0[0] << lsh);
      dst1[3] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      break;

    case 7:
      dst1[3] = src_l1[3] | (src_r0[0] << lsh);
      break;
  }
}
4973
/**
 * Places the four words of `src_r0` into the 12-word destination formed by
 * `dst0` (words 0..3), `dst1` (words 4..7) and `dst2` (words 8..11),
 * starting at byte position `offset` (byte b of a word occupies bits
 * 8*b .. 8*b+7).
 *
 * - Unaligned offset (offset % 4 != 0): the first written word is the
 *   matching word of src_l0/src_l1/src_l2 OR-ed with the low part of
 *   src_r0[0]; following words are stitched together from neighbouring
 *   src_r0 words, and the high part of src_r0[3] goes into one extra word
 *   if it fits.
 * - Aligned offset: src_r0 words are copied as-is; the src_l arrays are
 *   not read.
 * - Destination words in front of the insertion point are never written,
 *   and data that would land beyond dst2[3] is dropped.
 * - offset == 0 and offset >= 48 write nothing.
 */
__device__ static void device_memcat12L (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4])
{
  const u32 word = offset >> 2;         // first destination word touched (0..11)
  const u32 lsh  = (offset & 3) * 8;    // bit shift inside that word

  if (lsh == 0)
  {
    // word-aligned: plain copy (word 0 and words past the end are no-ops)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        dst1[0] = src_r0[3];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        dst1[0] = src_r0[2];
        dst1[1] = src_r0[3];
        break;

      case 3:
        dst0[3] = src_r0[0];
        dst1[0] = src_r0[1];
        dst1[1] = src_r0[2];
        dst1[2] = src_r0[3];
        break;

      case 4:
        dst1[0] = src_r0[0];
        dst1[1] = src_r0[1];
        dst1[2] = src_r0[2];
        dst1[3] = src_r0[3];
        break;

      case 5:
        dst1[1] = src_r0[0];
        dst1[2] = src_r0[1];
        dst1[3] = src_r0[2];
        dst2[0] = src_r0[3];
        break;

      case 6:
        dst1[2] = src_r0[0];
        dst1[3] = src_r0[1];
        dst2[0] = src_r0[2];
        dst2[1] = src_r0[3];
        break;

      case 7:
        dst1[3] = src_r0[0];
        dst2[0] = src_r0[1];
        dst2[1] = src_r0[2];
        dst2[2] = src_r0[3];
        break;

      case 8:
        dst2[0] = src_r0[0];
        dst2[1] = src_r0[1];
        dst2[2] = src_r0[2];
        dst2[3] = src_r0[3];
        break;

      case 9:
        dst2[1] = src_r0[0];
        dst2[2] = src_r0[1];
        dst2[3] = src_r0[2];
        break;

      case 10:
        dst2[2] = src_r0[0];
        dst2[3] = src_r0[1];
        break;

      case 11:
        dst2[3] = src_r0[0];
        break;
    }

    return;
  }

  const u32 rsh = 32 - lsh;             // 8, 16 or 24 - never a full-width shift

  switch (word)
  {
    case 0:
      dst0[0] = src_l0[0] | (src_r0[0] << lsh);
      dst0[1] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst0[2] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst0[3] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst1[0] = src_r0[3] >> rsh;
      break;

    case 1:
      dst0[1] = src_l0[1] | (src_r0[0] << lsh);
      dst0[2] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst0[3] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst1[0] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst1[1] = src_r0[3] >> rsh;
      break;

    case 2:
      dst0[2] = src_l0[2] | (src_r0[0] << lsh);
      dst0[3] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst1[0] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst1[1] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst1[2] = src_r0[3] >> rsh;
      break;

    case 3:
      dst0[3] = src_l0[3] | (src_r0[0] << lsh);
      dst1[0] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst1[1] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst1[2] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst1[3] = src_r0[3] >> rsh;
      break;

    case 4:
      dst1[0] = src_l1[0] | (src_r0[0] << lsh);
      dst1[1] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst1[2] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst1[3] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst2[0] = src_r0[3] >> rsh;
      break;

    case 5:
      dst1[1] = src_l1[1] | (src_r0[0] << lsh);
      dst1[2] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst1[3] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst2[0] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst2[1] = src_r0[3] >> rsh;
      break;

    case 6:
      dst1[2] = src_l1[2] | (src_r0[0] << lsh);
      dst1[3] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst2[0] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst2[1] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst2[2] = src_r0[3] >> rsh;
      break;

    case 7:
      dst1[3] = src_l1[3] | (src_r0[0] << lsh);
      dst2[0] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst2[1] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst2[2] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      dst2[3] = src_r0[3] >> rsh;
      break;

    case 8:
      dst2[0] = src_l2[0] | (src_r0[0] << lsh);
      dst2[1] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst2[2] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      dst2[3] = (src_r0[2] >> rsh) | (src_r0[3] << lsh);
      break;

    case 9:
      dst2[1] = src_l2[1] | (src_r0[0] << lsh);
      dst2[2] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      dst2[3] = (src_r0[1] >> rsh) | (src_r0[2] << lsh);
      break;

    case 10:
      dst2[2] = src_l2[2] | (src_r0[0] << lsh);
      dst2[3] = (src_r0[0] >> rsh) | (src_r0[1] << lsh);
      break;

    case 11:
      dst2[3] = src_l2[3] | (src_r0[0] << lsh);
      break;
  }
}
5308
5309 __device__ static void device_memcat12L (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4])
5310 {
5311 switch (offset)
5312 {
5313 case 0:
5314 dst0[0] = src_r0[0];
5315 dst0[1] = src_r0[1];
5316 dst0[2] = src_r0[2];
5317 dst0[3] = src_r0[3];
5318 dst1[0] = src_r1[0];
5319 dst1[1] = src_r1[1];
5320 dst1[2] = src_r1[2];
5321 dst1[3] = src_r1[3];
5322 break;
5323
5324 case 1:
5325 dst0[0] = src_l0[0] | src_r0[0] << 8;
5326 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
5327 dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
5328 dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
5329 dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8;
5330 dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8;
5331 dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8;
5332 dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8;
5333 dst2[0] = src_r1[3] >> 24;
5334 break;
5335
5336 case 2:
5337 dst0[0] = src_l0[0] | src_r0[0] << 16;
5338 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
5339 dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
5340 dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
5341 dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16;
5342 dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16;
5343 dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16;
5344 dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16;
5345 dst2[0] = src_r1[3] >> 16;
5346 break;
5347
5348 case 3:
5349 dst0[0] = src_l0[0] | src_r0[0] << 24;
5350 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
5351 dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
5352 dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
5353 dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24;
5354 dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24;
5355 dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24;
5356 dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24;
5357 dst2[0] = src_r1[3] >> 8;
5358 break;
5359
5360 case 4:
5361 dst0[1] = src_r0[0];
5362 dst0[2] = src_r0[1];
5363 dst0[3] = src_r0[2];
5364 dst1[0] = src_r0[3];
5365 dst1[1] = src_r1[0];
5366 dst1[2] = src_r1[1];
5367 dst1[3] = src_r1[2];
5368 dst2[0] = src_r1[3];
5369 break;
5370
5371 case 5:
5372 dst0[1] = src_l0[1] | src_r0[0] << 8;
5373 dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
5374 dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
5375 dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
5376 dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8;
5377 dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8;
5378 dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8;
5379 dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8;
5380 dst2[1] = src_r1[3] >> 24;
5381 break;
5382
5383 case 6:
5384 dst0[1] = src_l0[1] | src_r0[0] << 16;
5385 dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
5386 dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
5387 dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
5388 dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16;
5389 dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16;
5390 dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16;
5391 dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16;
5392 dst2[1] = src_r1[3] >> 16;
5393 break;
5394
5395 case 7:
5396 dst0[1] = src_l0[1] | src_r0[0] << 24;
5397 dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
5398 dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
5399 dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
5400 dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24;
5401 dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24;
5402 dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24;
5403 dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24;
5404 dst2[1] = src_r1[3] >> 8;
5405 break;
5406
5407 case 8:
5408 dst0[2] = src_r0[0];
5409 dst0[3] = src_r0[1];
5410 dst1[0] = src_r0[2];
5411 dst1[1] = src_r0[3];
5412 dst1[2] = src_r1[0];
5413 dst1[3] = src_r1[1];
5414 dst2[0] = src_r1[2];
5415 dst2[1] = src_r1[3];
5416 break;
5417
5418 case 9:
5419 dst0[2] = src_l0[2] | src_r0[0] << 8;
5420 dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
5421 dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
5422 dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
5423 dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8;
5424 dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8;
5425 dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8;
5426 dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8;
5427 dst2[2] = src_r1[3] >> 24;
5428 break;
5429
5430 case 10:
5431 dst0[2] = src_l0[2] | src_r0[0] << 16;
5432 dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
5433 dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
5434 dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
5435 dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16;
5436 dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16;
5437 dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16;
5438 dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16;
5439 dst2[2] = src_r1[3] >> 16;
5440 break;
5441
5442 case 11:
5443 dst0[2] = src_l0[2] | src_r0[0] << 24;
5444 dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
5445 dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
5446 dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
5447 dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24;
5448 dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24;
5449 dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24;
5450 dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24;
5451 dst2[2] = src_r1[3] >> 8;
5452 break;
5453
5454 case 12:
5455 dst0[3] = src_r0[0];
5456 dst1[0] = src_r0[1];
5457 dst1[1] = src_r0[2];
5458 dst1[2] = src_r0[3];
5459 dst1[3] = src_r1[0];
5460 dst2[0] = src_r1[1];
5461 dst2[1] = src_r1[2];
5462 dst2[2] = src_r1[3];
5463 break;
5464
5465 case 13:
5466 dst0[3] = src_l0[3] | src_r0[0] << 8;
5467 dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
5468 dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
5469 dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
5470 dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8;
5471 dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8;
5472 dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8;
5473 dst2[2] = src_r1[2] >> 24 | src_r1[3] << 8;
5474 dst2[3] = src_r1[3] >> 24;
5475 break;
5476
5477 case 14:
5478 dst0[3] = src_l0[3] | src_r0[0] << 16;
5479 dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
5480 dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
5481 dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
5482 dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16;
5483 dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16;
5484 dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16;
5485 dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16;
5486 dst2[3] = src_r1[3] >> 16;
5487 break;
5488
5489 case 15:
5490 dst0[3] = src_l0[3] | src_r0[0] << 24;
5491 dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
5492 dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
5493 dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
5494 dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24;
5495 dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24;
5496 dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24;
5497 dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24;
5498 dst2[3] = src_r1[3] >> 8;
5499 break;
5500
5501 case 16:
5502 dst1[0] = src_r0[0];
5503 dst1[1] = src_r0[1];
5504 dst1[2] = src_r0[2];
5505 dst1[3] = src_r0[3];
5506 dst2[0] = src_r1[0];
5507 dst2[1] = src_r1[1];
5508 dst2[2] = src_r1[2];
5509 dst2[3] = src_r1[3];
5510 break;
5511
5512 case 17:
5513 dst1[0] = src_l1[0] | src_r0[0] << 8;
5514 dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
5515 dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
5516 dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
5517 dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8;
5518 dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8;
5519 dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8;
5520 dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8;
5521 break;
5522
5523 case 18:
5524 dst1[0] = src_l1[0] | src_r0[0] << 16;
5525 dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
5526 dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
5527 dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
5528 dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16;
5529 dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16;
5530 dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16;
5531 dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16;
5532 break;
5533
5534 case 19:
5535 dst1[0] = src_l1[0] | src_r0[0] << 24;
5536 dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
5537 dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
5538 dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
5539 dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24;
5540 dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24;
5541 dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24;
5542 dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24;
5543 break;
5544
5545 case 20:
5546 dst1[1] = src_r1[0];
5547 dst1[2] = src_r0[1];
5548 dst1[3] = src_r0[2];
5549 dst2[0] = src_r0[3];
5550 dst2[1] = src_r1[0];
5551 dst2[2] = src_r1[1];
5552 dst2[3] = src_r1[2];
5553 break;
5554
5555 case 21:
5556 dst1[1] = src_l1[1] | src_r0[0] << 8;
5557 dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
5558 dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
5559 dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8;
5560 dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8;
5561 dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8;
5562 dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8;
5563 break;
5564
5565 case 22:
5566 dst1[1] = src_l1[1] | src_r0[0] << 16;
5567 dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
5568 dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
5569 dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
5570 dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16;
5571 dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16;
5572 dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16;
5573 break;
5574
5575 case 23:
5576 dst1[1] = src_l1[1] | src_r0[0] << 24;
5577 dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
5578 dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
5579 dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24;
5580 dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24;
5581 dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24;
5582 dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24;
5583 break;
5584
5585 case 24:
5586 dst1[2] = src_r1[0];
5587 dst1[3] = src_r0[1];
5588 dst2[0] = src_r0[2];
5589 dst2[1] = src_r0[3];
5590 dst2[2] = src_r1[0];
5591 dst2[3] = src_r1[1];
5592 break;
5593
5594 case 25:
5595 dst1[2] = src_l1[2] | src_r0[0] << 8;
5596 dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
5597 dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8;
5598 dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8;
5599 dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8;
5600 dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8;
5601 break;
5602
5603 case 26:
5604 dst1[2] = src_l1[2] | src_r0[0] << 16;
5605 dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
5606 dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
5607 dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
5608 dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16;
5609 dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16;
5610 break;
5611
5612 case 27:
5613 dst1[2] = src_l1[2] | src_r0[0] << 24;
5614 dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
5615 dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24;
5616 dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24;
5617 dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24;
5618 dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24;
5619 break;
5620
5621 case 28:
5622 dst1[3] = src_r1[0];
5623 dst2[0] = src_r0[1];
5624 dst2[1] = src_r0[2];
5625 dst2[2] = src_r0[3];
5626 dst2[3] = src_r1[0];
5627 break;
5628
5629 case 29:
5630 dst1[3] = src_l1[3] | src_r0[0] << 8;
5631 dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8;
5632 dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8;
5633 dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8;
5634 dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8;
5635 break;
5636
5637 case 30:
5638 dst1[3] = src_l1[3] | src_r0[0] << 16;
5639 dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
5640 dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
5641 dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
5642 dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16;
5643 break;
5644
5645 case 31:
5646 dst1[3] = src_l1[3] | src_r0[0] << 24;
5647 dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24;
5648 dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24;
5649 dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24;
5650 dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24;
5651 break;
5652
5653 case 32:
5654 dst2[0] = src_r0[0];
5655 dst2[1] = src_r0[1];
5656 dst2[2] = src_r0[2];
5657 dst2[3] = src_r0[3];
5658 break;
5659
5660 case 33:
5661 dst2[0] = src_l2[0] | src_r0[0] << 8;
5662 dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8;
5663 dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8;
5664 dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8;
5665 break;
5666
5667 case 34:
5668 dst2[0] = src_l2[0] | src_r0[0] << 16;
5669 dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
5670 dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
5671 dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
5672 break;
5673
5674 case 35:
5675 dst2[0] = src_l2[0] | src_r0[0] << 24;
5676 dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24;
5677 dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24;
5678 dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24;
5679 break;
5680
5681 case 36:
5682 dst2[1] = src_r0[0];
5683 dst2[2] = src_r0[1];
5684 dst2[3] = src_r0[2];
5685 break;
5686
5687 case 37:
5688 dst2[1] = src_l2[1] | src_r0[0] << 8;
5689 dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8;
5690 dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8;
5691 break;
5692
5693 case 38:
5694 dst2[1] = src_l2[1] | src_r0[0] << 16;
5695 dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
5696 dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
5697 break;
5698
5699 case 39:
5700 dst2[1] = src_l2[1] | src_r0[0] << 24;
5701 dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24;
5702 dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24;
5703 break;
5704
5705 case 40:
5706 dst2[2] = src_r0[0];
5707 dst2[3] = src_r0[1];
5708 break;
5709
5710 case 41:
5711 dst2[2] = src_l2[2] | src_r0[0] << 8;
5712 dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8;
5713 break;
5714
5715 case 42:
5716 dst2[2] = src_l2[2] | src_r0[0] << 16;
5717 dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
5718 break;
5719
5720 case 43:
5721 dst2[2] = src_l2[2] | src_r0[0] << 24;
5722 dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24;
5723 break;
5724
5725 case 44:
5726 dst2[3] = src_r0[0];
5727 break;
5728
5729 case 45:
5730 dst2[3] = src_l2[3] | src_r0[0] << 8;
5731 break;
5732
5733 case 46:
5734 dst2[3] = src_l2[3] | src_r0[0] << 16;
5735 break;
5736
5737 case 47:
5738 dst2[3] = src_l2[3] | src_r0[0] << 24;
5739 break;
5740 }
5741 }
5742
/**
 * Append nine 32-bit words to the buffer made of w0, w1, w2 and w3, starting
 * at byte position `offset`.
 *
 * The appended words are, in order, append0[0..3], append1[0..3], append2[0].
 * The destination is addressed as one run of words w0[0], w0[1], ... w3[0];
 * byte `offset` sits in word (offset / 4), at bit ((offset % 4) * 8), i.e.
 * bytes are counted from the least significant byte of each word.
 *
 *  - offset is a multiple of 4: the nine words are stored as they are,
 *    replacing whatever the destination words held.
 *  - otherwise: the first appended word, shifted into place, is OR-ed into
 *    the word containing `offset`; the following words are overwritten with
 *    the shifted stream, and one additional word receives the bits shifted
 *    out of append2[0].
 *
 * Only offsets 0..15 are handled; any other value leaves the buffer
 * untouched. Destination words in front of the one containing `offset` and
 * behind the last stored word are never written.
 */
__device__ static void memcat16_9 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
{
  if (offset > 15) return;

  // flat view of every destination word this function is able to reach
  u32 * const dst[13] =
  {
    &w0[0], &w0[1], &w0[2], &w0[3],
    &w1[0], &w1[1], &w1[2], &w1[3],
    &w2[0], &w2[1], &w2[2], &w2[3],
    &w3[0]
  };

  // flat view of the nine words to append
  const u32 * const src[9] =
  {
    &append0[0], &append0[1], &append0[2], &append0[3],
    &append1[0], &append1[1], &append1[2], &append1[3],
    &append2[0]
  };

  const u32 first = offset >> 2;        // index of the word holding `offset`
  const u32 lsh   = (offset & 3) << 3;  // bit position inside that word

  if (lsh == 0)
  {
    // word aligned: plain copy, the first destination word is replaced
    for (u32 i = 0; i < 9; i++)
    {
      *dst[first + i] = *src[i];
    }

    return;
  }

  // lsh is 8, 16 or 24 here, so rsh is 24, 16 or 8
  const u32 rsh = 32 - lsh;

  // keep what is already stored in the partially used word
  *dst[first] = *dst[first] | (*src[0] << lsh);

  // every further word: upper bits of the previous source word, lower bits of the current one
  for (u32 i = 1; i < 9; i++)
  {
    *dst[first + i] = (*src[i - 1] >> rsh) | (*src[i] << lsh);
  }

  // bits of the last source word that did not fit
  *dst[first + 9] = *src[8] >> rsh;
}
5952
/**
 * Append eight 32-bit words to the buffer made of w0, w1, w2 and w3, starting
 * at byte position `offset`.
 *
 * The appended words are, in order, append0[0..3] and append1[0..3].
 * The destination is addressed as one run of words w0[0], w0[1], ... w3[1];
 * byte `offset` sits in word (offset / 4), at bit ((offset % 4) * 8), i.e.
 * bytes are counted from the least significant byte of each word.
 *
 *  - offset is a multiple of 4: the words are stored as they are, replacing
 *    whatever the destination words held.
 *  - otherwise: the first appended word, shifted into place, is OR-ed into
 *    the word containing `offset`; the following words are overwritten with
 *    the shifted stream, followed by one word holding the bits shifted out
 *    of append1[3].
 *
 * Only w0[0] .. w3[1] (14 words) are ever written: whatever part of the
 * appended data would land in w3[2] or w3[3] is dropped, and the source words
 * that would only contribute there are not read.
 * NOTE(review): presumably callers keep w3[2]/w3[3] for their own use - confirm.
 *
 * Only offsets 0..32 are handled; any other value leaves the buffer
 * untouched.
 */
__device__ static void memcat32_8 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset)
{
  if (offset > 32) return;

  // number of destination words that may be written (w0[0] .. w3[1])
  const u32 dst_cnt = 14;

  // flat view of every destination word this function is able to reach
  u32 * const dst[14] =
  {
    &w0[0], &w0[1], &w0[2], &w0[3],
    &w1[0], &w1[1], &w1[2], &w1[3],
    &w2[0], &w2[1], &w2[2], &w2[3],
    &w3[0], &w3[1]
  };

  // flat view of the eight words to append
  const u32 * const src[8] =
  {
    &append0[0], &append0[1], &append0[2], &append0[3],
    &append1[0], &append1[1], &append1[2], &append1[3]
  };

  const u32 first = offset >> 2;        // index of the word holding `offset`
  const u32 lsh   = (offset & 3) << 3;  // bit position inside that word

  if (lsh == 0)
  {
    // word aligned: plain copy, cut off at the last writable word
    for (u32 i = 0, pos = first; (i < 8) && (pos < dst_cnt); i++, pos++)
    {
      *dst[pos] = *src[i];
    }

    return;
  }

  // lsh is 8, 16 or 24 here, so rsh is 24, 16 or 8
  const u32 rsh = 32 - lsh;

  // keep what is already stored in the partially used word
  // (first is at most 7 for unaligned offsets, so it is always writable)
  *dst[first] = *dst[first] | (*src[0] << lsh);

  // every further word: upper bits of the previous source word, lower bits of
  // the current one; stop at the last writable word
  for (u32 i = 1, pos = first + 1; (i < 8) && (pos < dst_cnt); i++, pos++)
  {
    *dst[pos] = (*src[i - 1] >> rsh) | (*src[i] << lsh);
  }

  // bits of the last source word that did not fit, if that word is still writable
  if ((first + 8) < dst_cnt)
  {
    *dst[first + 8] = *src[7] >> rsh;
  }
}
6333
6334 __device__ static void memcat32_9 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
6335 {
6336 switch (offset)
6337 {
6338 case 0:
6339 w0[0] = append0[0];
6340 w0[1] = append0[1];
6341 w0[2] = append0[2];
6342 w0[3] = append0[3];
6343 w1[0] = append1[0];
6344 w1[1] = append1[1];
6345 w1[2] = append1[2];
6346 w1[3] = append1[3];
6347 w2[0] = append2[0];
6348 break;
6349
6350 case 1:
6351 w0[0] = w0[0] | append0[0] << 8;
6352 w0[1] = append0[0] >> 24 | append0[1] << 8;
6353 w0[2] = append0[1] >> 24 | append0[2] << 8;
6354 w0[3] = append0[2] >> 24 | append0[3] << 8;
6355 w1[0] = append0[3] >> 24 | append1[0] << 8;
6356 w1[1] = append1[0] >> 24 | append1[1] << 8;
6357 w1[2] = append1[1] >> 24 | append1[2] << 8;
6358 w1[3] = append1[2] >> 24 | append1[3] << 8;
6359 w2[0] = append1[3] >> 24 | append2[0] << 8;
6360 w2[1] = append2[0] >> 24;
6361 break;
6362
6363 case 2:
6364 w0[0] = w0[0] | append0[0] << 16;
6365 w0[1] = append0[0] >> 16 | append0[1] << 16;
6366 w0[2] = append0[1] >> 16 | append0[2] << 16;
6367 w0[3] = append0[2] >> 16 | append0[3] << 16;
6368 w1[0] = append0[3] >> 16 | append1[0] << 16;
6369 w1[1] = append1[0] >> 16 | append1[1] << 16;
6370 w1[2] = append1[1] >> 16 | append1[2] << 16;
6371 w1[3] = append1[2] >> 16 | append1[3] << 16;
6372 w2[0] = append1[3] >> 16 | append2[0] << 16;
6373 w2[1] = append2[0] >> 16;
6374 break;
6375
6376 case 3:
6377 w0[0] = w0[0] | append0[0] << 24;
6378 w0[1] = append0[0] >> 8 | append0[1] << 24;
6379 w0[2] = append0[1] >> 8 | append0[2] << 24;
6380 w0[3] = append0[2] >> 8 | append0[3] << 24;
6381 w1[0] = append0[3] >> 8 | append1[0] << 24;
6382 w1[1] = append1[0] >> 8 | append1[1] << 24;
6383 w1[2] = append1[1] >> 8 | append1[2] << 24;
6384 w1[3] = append1[2] >> 8 | append1[3] << 24;
6385 w2[0] = append1[3] >> 8 | append2[0] << 24;
6386 w2[1] = append2[0] >> 8;
6387 break;
6388
6389 case 4:
6390 w0[1] = append0[0];
6391 w0[2] = append0[1];
6392 w0[3] = append0[2];
6393 w1[0] = append0[3];
6394 w1[1] = append1[0];
6395 w1[2] = append1[1];
6396 w1[3] = append1[2];
6397 w2[0] = append1[3];
6398 w2[1] = append2[0];
6399 break;
6400
6401 case 5:
6402 w0[1] = w0[1] | append0[0] << 8;
6403 w0[2] = append0[0] >> 24 | append0[1] << 8;
6404 w0[3] = append0[1] >> 24 | append0[2] << 8;
6405 w1[0] = append0[2] >> 24 | append0[3] << 8;
6406 w1[1] = append0[3] >> 24 | append1[0] << 8;
6407 w1[2] = append1[0] >> 24 | append1[1] << 8;
6408 w1[3] = append1[1] >> 24 | append1[2] << 8;
6409 w2[0] = append1[2] >> 24 | append1[3] << 8;
6410 w2[1] = append1[3] >> 24 | append2[0] << 8;
6411 w2[2] = append2[0] >> 24;
6412 break;
6413
6414 case 6:
6415 w0[1] = w0[1] | append0[0] << 16;
6416 w0[2] = append0[0] >> 16 | append0[1] << 16;
6417 w0[3] = append0[1] >> 16 | append0[2] << 16;
6418 w1[0] = append0[2] >> 16 | append0[3] << 16;
6419 w1[1] = append0[3] >> 16 | append1[0] << 16;
6420 w1[2] = append1[0] >> 16 | append1[1] << 16;
6421 w1[3] = append1[1] >> 16 | append1[2] << 16;
6422 w2[0] = append1[2] >> 16 | append1[3] << 16;
6423 w2[1] = append1[3] >> 16 | append2[0] << 16;
6424 w2[2] = append2[0] >> 16;
6425 break;
6426
6427 case 7:
6428 w0[1] = w0[1] | append0[0] << 24;
6429 w0[2] = append0[0] >> 8 | append0[1] << 24;
6430 w0[3] = append0[1] >> 8 | append0[2] << 24;
6431 w1[0] = append0[2] >> 8 | append0[3] << 24;
6432 w1[1] = append0[3] >> 8 | append1[0] << 24;
6433 w1[2] = append1[0] >> 8 | append1[1] << 24;
6434 w1[3] = append1[1] >> 8 | append1[2] << 24;
6435 w2[0] = append1[2] >> 8 | append1[3] << 24;
6436 w2[1] = append1[3] >> 8 | append2[0] << 24;
6437 w2[2] = append2[0] >> 8;
6438 break;
6439
6440 case 8:
6441 w0[2] = append0[0];
6442 w0[3] = append0[1];
6443 w1[0] = append0[2];
6444 w1[1] = append0[3];
6445 w1[2] = append1[0];
6446 w1[3] = append1[1];
6447 w2[0] = append1[2];
6448 w2[1] = append1[3];
6449 w2[2] = append2[0];
6450 break;
6451
6452 case 9:
6453 w0[2] = w0[2] | append0[0] << 8;
6454 w0[3] = append0[0] >> 24 | append0[1] << 8;
6455 w1[0] = append0[1] >> 24 | append0[2] << 8;
6456 w1[1] = append0[2] >> 24 | append0[3] << 8;
6457 w1[2] = append0[3] >> 24 | append1[0] << 8;
6458 w1[3] = append1[0] >> 24 | append1[1] << 8;
6459 w2[0] = append1[1] >> 24 | append1[2] << 8;
6460 w2[1] = append1[2] >> 24 | append1[3] << 8;
6461 w2[2] = append1[3] >> 24 | append2[0] << 8;
6462 w2[3] = append2[0] >> 24;
6463 break;
6464
6465 case 10:
6466 w0[2] = w0[2] | append0[0] << 16;
6467 w0[3] = append0[0] >> 16 | append0[1] << 16;
6468 w1[0] = append0[1] >> 16 | append0[2] << 16;
6469 w1[1] = append0[2] >> 16 | append0[3] << 16;
6470 w1[2] = append0[3] >> 16 | append1[0] << 16;
6471 w1[3] = append1[0] >> 16 | append1[1] << 16;
6472 w2[0] = append1[1] >> 16 | append1[2] << 16;
6473 w2[1] = append1[2] >> 16 | append1[3] << 16;
6474 w2[2] = append1[3] >> 16 | append2[0] << 16;
6475 w2[3] = append2[0] >> 16;
6476 break;
6477
6478 case 11:
6479 w0[2] = w0[2] | append0[0] << 24;
6480 w0[3] = append0[0] >> 8 | append0[1] << 24;
6481 w1[0] = append0[1] >> 8 | append0[2] << 24;
6482 w1[1] = append0[2] >> 8 | append0[3] << 24;
6483 w1[2] = append0[3] >> 8 | append1[0] << 24;
6484 w1[3] = append1[0] >> 8 | append1[1] << 24;
6485 w2[0] = append1[1] >> 8 | append1[2] << 24;
6486 w2[1] = append1[2] >> 8 | append1[3] << 24;
6487 w2[2] = append1[3] >> 8 | append2[0] << 24;
6488 w2[3] = append2[0] >> 8;
6489 break;
6490
6491 case 12:
6492 w0[3] = append0[0];
6493 w1[0] = append0[1];
6494 w1[1] = append0[2];
6495 w1[2] = append0[3];
6496 w1[3] = append1[0];
6497 w2[0] = append1[1];
6498 w2[1] = append1[2];
6499 w2[2] = append1[3];
6500 w2[3] = append2[0];
6501 break;
6502
6503 case 13:
6504 w0[3] = w0[3] | append0[0] << 8;
6505 w1[0] = append0[0] >> 24 | append0[1] << 8;
6506 w1[1] = append0[1] >> 24 | append0[2] << 8;
6507 w1[2] = append0[2] >> 24 | append0[3] << 8;
6508 w1[3] = append0[3] >> 24 | append1[0] << 8;
6509 w2[0] = append1[0] >> 24 | append1[1] << 8;
6510 w2[1] = append1[1] >> 24 | append1[2] << 8;
6511 w2[2] = append1[2] >> 24 | append1[3] << 8;
6512 w2[3] = append1[3] >> 24 | append2[0] << 8;
6513 w3[0] = append2[0] >> 24;
6514 break;
6515
6516 case 14:
6517 w0[3] = w0[3] | append0[0] << 16;
6518 w1[0] = append0[0] >> 16 | append0[1] << 16;
6519 w1[1] = append0[1] >> 16 | append0[2] << 16;
6520 w1[2] = append0[2] >> 16 | append0[3] << 16;
6521 w1[3] = append0[3] >> 16 | append1[0] << 16;
6522 w2[0] = append1[0] >> 16 | append1[1] << 16;
6523 w2[1] = append1[1] >> 16 | append1[2] << 16;
6524 w2[2] = append1[2] >> 16 | append1[3] << 16;
6525 w2[3] = append1[3] >> 16 | append2[0] << 16;
6526 w3[0] = append2[0] >> 16;
6527 break;
6528
6529 case 15:
6530 w0[3] = w0[3] | append0[0] << 24;
6531 w1[0] = append0[0] >> 8 | append0[1] << 24;
6532 w1[1] = append0[1] >> 8 | append0[2] << 24;
6533 w1[2] = append0[2] >> 8 | append0[3] << 24;
6534 w1[3] = append0[3] >> 8 | append1[0] << 24;
6535 w2[0] = append1[0] >> 8 | append1[1] << 24;
6536 w2[1] = append1[1] >> 8 | append1[2] << 24;
6537 w2[2] = append1[2] >> 8 | append1[3] << 24;
6538 w2[3] = append1[3] >> 8 | append2[0] << 24;
6539 w3[0] = append2[0] >> 8;
6540 break;
6541
6542 case 16:
6543 w1[0] = append0[0];
6544 w1[1] = append0[1];
6545 w1[2] = append0[2];
6546 w1[3] = append0[3];
6547 w2[0] = append1[0];
6548 w2[1] = append1[1];
6549 w2[2] = append1[2];
6550 w2[3] = append1[3];
6551 w3[0] = append2[0];
6552 break;
6553
6554 case 17:
6555 w1[0] = w1[0] | append0[0] << 8;
6556 w1[1] = append0[0] >> 24 | append0[1] << 8;
6557 w1[2] = append0[1] >> 24 | append0[2] << 8;
6558 w1[3] = append0[2] >> 24 | append0[3] << 8;
6559 w2[0] = append0[3] >> 24 | append1[0] << 8;
6560 w2[1] = append1[0] >> 24 | append1[1] << 8;
6561 w2[2] = append1[1] >> 24 | append1[2] << 8;
6562 w2[3] = append1[2] >> 24 | append1[3] << 8;
6563 w3[0] = append1[3] >> 24 | append2[0] << 8;
6564 w3[1] = append2[0] >> 24;
6565 break;
6566
6567 case 18:
6568 w1[0] = w1[0] | append0[0] << 16;
6569 w1[1] = append0[0] >> 16 | append0[1] << 16;
6570 w1[2] = append0[1] >> 16 | append0[2] << 16;
6571 w1[3] = append0[2] >> 16 | append0[3] << 16;
6572 w2[0] = append0[3] >> 16 | append1[0] << 16;
6573 w2[1] = append1[0] >> 16 | append1[1] << 16;
6574 w2[2] = append1[1] >> 16 | append1[2] << 16;
6575 w2[3] = append1[2] >> 16 | append1[3] << 16;
6576 w3[0] = append1[3] >> 16 | append2[0] << 16;
6577 w3[1] = append2[0] >> 16;
6578 break;
6579
6580 case 19:
6581 w1[0] = w1[0] | append0[0] << 24;
6582 w1[1] = append0[0] >> 8 | append0[1] << 24;
6583 w1[2] = append0[1] >> 8 | append0[2] << 24;
6584 w1[3] = append0[2] >> 8 | append0[3] << 24;
6585 w2[0] = append0[3] >> 8 | append1[0] << 24;
6586 w2[1] = append1[0] >> 8 | append1[1] << 24;
6587 w2[2] = append1[1] >> 8 | append1[2] << 24;
6588 w2[3] = append1[2] >> 8 | append1[3] << 24;
6589 w3[0] = append1[3] >> 8 | append2[0] << 24;
6590 w3[1] = append2[0] >> 8;
6591 break;
6592
6593 case 20:
6594 w1[1] = append0[0];
6595 w1[2] = append0[1];
6596 w1[3] = append0[2];
6597 w2[0] = append0[3];
6598 w2[1] = append1[0];
6599 w2[2] = append1[1];
6600 w2[3] = append1[2];
6601 w3[0] = append1[3];
6602 w3[1] = append2[0];
6603 break;
6604
6605 case 21:
6606 w1[1] = w1[1] | append0[0] << 8;
6607 w1[2] = append0[0] >> 24 | append0[1] << 8;
6608 w1[3] = append0[1] >> 24 | append0[2] << 8;
6609 w2[0] = append0[2] >> 24 | append0[3] << 8;
6610 w2[1] = append0[3] >> 24 | append1[0] << 8;
6611 w2[2] = append1[0] >> 24 | append1[1] << 8;
6612 w2[3] = append1[1] >> 24 | append1[2] << 8;
6613 w3[0] = append1[2] >> 24 | append1[3] << 8;
6614 w3[1] = append1[3] >> 24 | append2[0] << 8;
6615 break;
6616
6617 case 22:
6618 w1[1] = w1[1] | append0[0] << 16;
6619 w1[2] = append0[0] >> 16 | append0[1] << 16;
6620 w1[3] = append0[1] >> 16 | append0[2] << 16;
6621 w2[0] = append0[2] >> 16 | append0[3] << 16;
6622 w2[1] = append0[3] >> 16 | append1[0] << 16;
6623 w2[2] = append1[0] >> 16 | append1[1] << 16;
6624 w2[3] = append1[1] >> 16 | append1[2] << 16;
6625 w3[0] = append1[2] >> 16 | append1[3] << 16;
6626 w3[1] = append1[3] >> 16 | append2[0] << 16;
6627 break;
6628
6629 case 23:
6630 w1[1] = w1[1] | append0[0] << 24;
6631 w1[2] = append0[0] >> 8 | append0[1] << 24;
6632 w1[3] = append0[1] >> 8 | append0[2] << 24;
6633 w2[0] = append0[2] >> 8 | append0[3] << 24;
6634 w2[1] = append0[3] >> 8 | append1[0] << 24;
6635 w2[2] = append1[0] >> 8 | append1[1] << 24;
6636 w2[3] = append1[1] >> 8 | append1[2] << 24;
6637 w3[0] = append1[2] >> 8 | append1[3] << 24;
6638 w3[1] = append1[3] >> 8 | append2[0] << 24;
6639 break;
6640
6641 case 24:
6642 w1[2] = append0[0];
6643 w1[3] = append0[1];
6644 w2[0] = append0[2];
6645 w2[1] = append0[3];
6646 w2[2] = append1[0];
6647 w2[3] = append1[1];
6648 w3[0] = append1[2];
6649 w3[1] = append1[3];
6650 break;
6651
6652 case 25:
6653 w1[2] = w1[2] | append0[0] << 8;
6654 w1[3] = append0[0] >> 24 | append0[1] << 8;
6655 w2[0] = append0[1] >> 24 | append0[2] << 8;
6656 w2[1] = append0[2] >> 24 | append0[3] << 8;
6657 w2[2] = append0[3] >> 24 | append1[0] << 8;
6658 w2[3] = append1[0] >> 24 | append1[1] << 8;
6659 w3[0] = append1[1] >> 24 | append1[2] << 8;
6660 w3[1] = append1[2] >> 24 | append1[3] << 8;
6661 break;
6662
6663 case 26:
6664 w1[2] = w1[2] | append0[0] << 16;
6665 w1[3] = append0[0] >> 16 | append0[1] << 16;
6666 w2[0] = append0[1] >> 16 | append0[2] << 16;
6667 w2[1] = append0[2] >> 16 | append0[3] << 16;
6668 w2[2] = append0[3] >> 16 | append1[0] << 16;
6669 w2[3] = append1[0] >> 16 | append1[1] << 16;
6670 w3[0] = append1[1] >> 16 | append1[2] << 16;
6671 w3[1] = append1[2] >> 16 | append1[3] << 16;
6672 break;
6673
6674 case 27:
6675 w1[2] = w1[2] | append0[0] << 24;
6676 w1[3] = append0[0] >> 8 | append0[1] << 24;
6677 w2[0] = append0[1] >> 8 | append0[2] << 24;
6678 w2[1] = append0[2] >> 8 | append0[3] << 24;
6679 w2[2] = append0[3] >> 8 | append1[0] << 24;
6680 w2[3] = append1[0] >> 8 | append1[1] << 24;
6681 w3[0] = append1[1] >> 8 | append1[2] << 24;
6682 w3[1] = append1[2] >> 8 | append1[3] << 24;
6683 break;
6684
6685 case 28:
6686 w1[3] = append0[0];
6687 w2[0] = append0[1];
6688 w2[1] = append0[2];
6689 w2[2] = append0[3];
6690 w2[3] = append1[0];
6691 w3[0] = append1[1];
6692 w3[1] = append1[2];
6693 break;
6694
6695 case 29:
6696 w1[3] = w1[3] | append0[0] << 8;
6697 w2[0] = append0[0] >> 24 | append0[1] << 8;
6698 w2[1] = append0[1] >> 24 | append0[2] << 8;
6699 w2[2] = append0[2] >> 24 | append0[3] << 8;
6700 w2[3] = append0[3] >> 24 | append1[0] << 8;
6701 w3[0] = append1[0] >> 24 | append1[1] << 8;
6702 w3[1] = append1[1] >> 24 | append1[2] << 8;
6703 break;
6704
6705 case 30:
6706 w1[3] = w1[3] | append0[0] << 16;
6707 w2[0] = append0[0] >> 16 | append0[1] << 16;
6708 w2[1] = append0[1] >> 16 | append0[2] << 16;
6709 w2[2] = append0[2] >> 16 | append0[3] << 16;
6710 w2[3] = append0[3] >> 16 | append1[0] << 16;
6711 w3[0] = append1[0] >> 16 | append1[1] << 16;
6712 w3[1] = append1[1] >> 16 | append1[2] << 16;
6713 break;
6714
6715 case 31:
6716 w1[3] = w1[3] | append0[0] << 24;
6717 w2[0] = append0[0] >> 8 | append0[1] << 24;
6718 w2[1] = append0[1] >> 8 | append0[2] << 24;
6719 w2[2] = append0[2] >> 8 | append0[3] << 24;
6720 w2[3] = append0[3] >> 8 | append1[0] << 24;
6721 w3[0] = append1[0] >> 8 | append1[1] << 24;
6722 w3[1] = append1[1] >> 8 | append1[2] << 24;
6723 break;
6724
6725 case 32:
6726 w2[0] = append0[0];
6727 w2[1] = append0[1];
6728 w2[2] = append0[2];
6729 w2[3] = append0[3];
6730 w3[0] = append1[0];
6731 w3[1] = append1[1];
6732 break;
6733 }
6734 }
6735
/**
 * Shift the buffer held in w0..w3 up by `offset` bytes and zero-fill the
 * vacated low positions. Inside a word, byte 0 is the least significant byte.
 *
 * offset / 4 is a whole-word move, offset % 4 a byte move inside the words.
 * A word move above 13 is not handled: the buffer is left exactly as it was.
 *
 * The two build variants do not cover the same range:
 *  - __CUDA_ARCH__ >= 200: w0[0]..w3[1] (14 words) are shifted as one unit,
 *    data pushed past w3[1] is dropped, w3[2] and w3[3] are never touched.
 *  - otherwise: only w0 and w1 (8 words) are read; the 8 shifted words plus
 *    one carry word are stored starting at word offset / 4, everything below
 *    is zeroed and every word above the stored range keeps its old value.
 */
__device__ static void switch_buffer_by_offset (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 word_shift = offset / 4;

  // nothing is defined for word moves beyond 13, keep the buffer untouched

  if (word_shift > 13) return;

  #if __CUDA_ARCH__ >= 200

  // selector picks, out of the pair (lower word, upper word), the four bytes
  // that end up in the upper word after moving (offset % 4) bytes up;
  // for offset % 4 == 0 it is 0x7654, which simply returns the upper word

  const int selector = (0x76543210 >> ((4 - (offset % 4)) * 4)) & 0xffff;

  // step 1: byte move across all 14 words, highest word first so that every
  // source word is still unmodified when it is read

  w3[1] = __byte_perm (w3[0], w3[1], selector);
  w3[0] = __byte_perm (w2[3], w3[0], selector);
  w2[3] = __byte_perm (w2[2], w2[3], selector);
  w2[2] = __byte_perm (w2[1], w2[2], selector);
  w2[1] = __byte_perm (w2[0], w2[1], selector);
  w2[0] = __byte_perm (w1[3], w2[0], selector);
  w1[3] = __byte_perm (w1[2], w1[3], selector);
  w1[2] = __byte_perm (w1[1], w1[2], selector);
  w1[1] = __byte_perm (w1[0], w1[1], selector);
  w1[0] = __byte_perm (w0[3], w1[0], selector);
  w0[3] = __byte_perm (w0[2], w0[3], selector);
  w0[2] = __byte_perm (w0[1], w0[2], selector);
  w0[1] = __byte_perm (w0[0], w0[1], selector);
  w0[0] = __byte_perm (    0, w0[0], selector);

  // step 2: whole-word move, composed from moves by 8, 4, 2 and 1 words
  // (word_shift <= 13 here); each move runs highest word first and
  // zero-fills the words it vacates

  if (word_shift & 8)
  {
    w3[1] = w1[1];
    w3[0] = w1[0];
    w2[3] = w0[3];
    w2[2] = w0[2];
    w2[1] = w0[1];
    w2[0] = w0[0];
    w1[3] = 0;
    w1[2] = 0;
    w1[1] = 0;
    w1[0] = 0;
    w0[3] = 0;
    w0[2] = 0;
    w0[1] = 0;
    w0[0] = 0;
  }

  if (word_shift & 4)
  {
    w3[1] = w2[1];
    w3[0] = w2[0];
    w2[3] = w1[3];
    w2[2] = w1[2];
    w2[1] = w1[1];
    w2[0] = w1[0];
    w1[3] = w0[3];
    w1[2] = w0[2];
    w1[1] = w0[1];
    w1[0] = w0[0];
    w0[3] = 0;
    w0[2] = 0;
    w0[1] = 0;
    w0[0] = 0;
  }

  if (word_shift & 2)
  {
    w3[1] = w2[3];
    w3[0] = w2[2];
    w2[3] = w2[1];
    w2[2] = w2[0];
    w2[1] = w1[3];
    w2[0] = w1[2];
    w1[3] = w1[1];
    w1[2] = w1[0];
    w1[1] = w0[3];
    w1[0] = w0[2];
    w0[3] = w0[1];
    w0[2] = w0[0];
    w0[1] = 0;
    w0[0] = 0;
  }

  if (word_shift & 1)
  {
    w3[1] = w3[0];
    w3[0] = w2[3];
    w2[3] = w2[2];
    w2[2] = w2[1];
    w2[1] = w2[0];
    w2[0] = w1[3];
    w1[3] = w1[2];
    w1[2] = w1[1];
    w1[1] = w1[0];
    w1[0] = w0[3];
    w0[3] = w0[2];
    w0[2] = w0[1];
    w0[1] = w0[0];
    w0[0] = 0;
  }

  #else

  // step 1: byte move of w0/w1 into a scratch copy; shifted[8] receives the
  // bytes pushed out of w1[3]

  const u32 up_bits = (offset % 4) * 8;

  u32 shifted[9];

  if (up_bits == 0)
  {
    shifted[0] = w0[0];
    shifted[1] = w0[1];
    shifted[2] = w0[2];
    shifted[3] = w0[3];
    shifted[4] = w1[0];
    shifted[5] = w1[1];
    shifted[6] = w1[2];
    shifted[7] = w1[3];
    shifted[8] = 0;
  }
  else
  {
    const u32 down_bits = 32 - up_bits;

    shifted[0] =                      w0[0] << up_bits;
    shifted[1] = w0[0] >> down_bits | w0[1] << up_bits;
    shifted[2] = w0[1] >> down_bits | w0[2] << up_bits;
    shifted[3] = w0[2] >> down_bits | w0[3] << up_bits;
    shifted[4] = w0[3] >> down_bits | w1[0] << up_bits;
    shifted[5] = w1[0] >> down_bits | w1[1] << up_bits;
    shifted[6] = w1[1] >> down_bits | w1[2] << up_bits;
    shifted[7] = w1[2] >> down_bits | w1[3] << up_bits;
    shifted[8] = w1[3] >> down_bits;
  }

  // step 2: store the scratch copy starting at word `word_shift`.
  // Words below the start are zeroed, words past the 9 scratch words are
  // deliberately not written at all.

  #define SBO_PLACE_WORD(dst, idx)                                          \
    if ((idx) < word_shift)           { dst = 0;                          } \
    else if ((idx) - word_shift <= 8) { dst = shifted[(idx) - word_shift]; }

  SBO_PLACE_WORD (w0[0],  0)
  SBO_PLACE_WORD (w0[1],  1)
  SBO_PLACE_WORD (w0[2],  2)
  SBO_PLACE_WORD (w0[3],  3)
  SBO_PLACE_WORD (w1[0],  4)
  SBO_PLACE_WORD (w1[1],  5)
  SBO_PLACE_WORD (w1[2],  6)
  SBO_PLACE_WORD (w1[3],  7)
  SBO_PLACE_WORD (w2[0],  8)
  SBO_PLACE_WORD (w2[1],  9)
  SBO_PLACE_WORD (w2[2], 10)
  SBO_PLACE_WORD (w2[3], 11)
  SBO_PLACE_WORD (w3[0], 12)
  SBO_PLACE_WORD (w3[1], 13)
  SBO_PLACE_WORD (w3[2], 14)
  SBO_PLACE_WORD (w3[3], 15)

  #undef SBO_PLACE_WORD

  #endif
}
7300
/**
 * Big-endian counterpart of switch_buffer_by_offset: moves the content of
 * w0[0]..w3[1] (14 words) `offset` bytes towards higher word positions and
 * zero-fills what is vacated. Inside a word, byte 0 is the most significant
 * byte, so bytes travel from the low end of one word into the high end of
 * the next one.
 *
 * offset / 4 is a whole-word move, offset % 4 a byte move inside the words.
 * A word move above 13 is not handled: the buffer is left exactly as it was.
 * w3[2] and w3[3] are never touched; data pushed past w3[1] is dropped.
 */
__device__ static void switch_buffer_by_offset_be (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 word_shift = offset / 4;

  // nothing is defined for word moves beyond 13, keep the buffer untouched

  if (word_shift > 13) return;

  // selector picks, out of the pair (current word, previous word), the four
  // bytes of the current word after the byte move; for offset % 4 == 0 it is
  // 0x3210, which simply returns the current word

  const int selector = (0x76543210 >> ((offset % 4) * 4)) & 0xffff;

  // step 1: byte move across all 14 words, highest word first so that every
  // source word is still unmodified when it is read

  w3[1] = __byte_perm (w3[1], w3[0], selector);
  w3[0] = __byte_perm (w3[0], w2[3], selector);
  w2[3] = __byte_perm (w2[3], w2[2], selector);
  w2[2] = __byte_perm (w2[2], w2[1], selector);
  w2[1] = __byte_perm (w2[1], w2[0], selector);
  w2[0] = __byte_perm (w2[0], w1[3], selector);
  w1[3] = __byte_perm (w1[3], w1[2], selector);
  w1[2] = __byte_perm (w1[2], w1[1], selector);
  w1[1] = __byte_perm (w1[1], w1[0], selector);
  w1[0] = __byte_perm (w1[0], w0[3], selector);
  w0[3] = __byte_perm (w0[3], w0[2], selector);
  w0[2] = __byte_perm (w0[2], w0[1], selector);
  w0[1] = __byte_perm (w0[1], w0[0], selector);
  w0[0] = __byte_perm (w0[0],     0, selector);

  // step 2: whole-word move, composed from moves by 8, 4, 2 and 1 words
  // (word_shift <= 13 here); each move runs highest word first and
  // zero-fills the words it vacates

  if (word_shift & 8)
  {
    w3[1] = w1[1];
    w3[0] = w1[0];
    w2[3] = w0[3];
    w2[2] = w0[2];
    w2[1] = w0[1];
    w2[0] = w0[0];
    w1[3] = 0;
    w1[2] = 0;
    w1[1] = 0;
    w1[0] = 0;
    w0[3] = 0;
    w0[2] = 0;
    w0[1] = 0;
    w0[0] = 0;
  }

  if (word_shift & 4)
  {
    w3[1] = w2[1];
    w3[0] = w2[0];
    w2[3] = w1[3];
    w2[2] = w1[2];
    w2[1] = w1[1];
    w2[0] = w1[0];
    w1[3] = w0[3];
    w1[2] = w0[2];
    w1[1] = w0[1];
    w1[0] = w0[0];
    w0[3] = 0;
    w0[2] = 0;
    w0[1] = 0;
    w0[0] = 0;
  }

  if (word_shift & 2)
  {
    w3[1] = w2[3];
    w3[0] = w2[2];
    w2[3] = w2[1];
    w2[2] = w2[0];
    w2[1] = w1[3];
    w2[0] = w1[2];
    w1[3] = w1[1];
    w1[2] = w1[0];
    w1[1] = w0[3];
    w1[0] = w0[2];
    w0[3] = w0[1];
    w0[2] = w0[0];
    w0[1] = 0;
    w0[0] = 0;
  }

  if (word_shift & 1)
  {
    w3[1] = w3[0];
    w3[0] = w2[3];
    w2[3] = w2[2];
    w2[2] = w2[1];
    w2[1] = w2[0];
    w2[0] = w1[3];
    w1[3] = w1[2];
    w1[2] = w1[1];
    w1[1] = w1[0];
    w1[0] = w0[3];
    w0[3] = w0[2];
    w0[2] = w0[1];
    w0[1] = w0[0];
    w0[0] = 0;
  }
}
7546
7547 /**
7548 * vector
7549 */
7550
7551 #ifndef VECT_SIZE1
/**
 * Reverse the byte order of the 32-bit value(s) held in v.
 *
 * On devices with __CUDA_ARCH__ >= 200 a single __byte_perm does the job;
 * otherwise the four bytes are moved with shifts and masks.
 */
__device__ static u32x swap_workaround (const u32x v)
{
  #if __CUDA_ARCH__ >= 200

  // selector 0x0123: result byte 0 <- byte 3, byte 1 <- byte 2,
  //                  result byte 2 <- byte 1, byte 3 <- byte 0
  return __byte_perm (v, 0, 0x0123);

  #else

  // each term carries exactly one source byte at its mirrored position,
  // so the terms never overlap and can simply be added
  const u32x lowest_to_top   = v << 24;
  const u32x second_to_third = (v & 0x0000FF00) << 8;
  const u32x third_to_second = (v & 0x00FF0000) >> 8;
  const u32x top_to_lowest   = v >> 24;

  return lowest_to_top + second_to_third + third_to_second + top_to_lowest;

  #endif
}
7562
/**
 * Reverse the byte order of the 64-bit value(s) held in v.
 *
 * Every source byte is isolated with a mask and shifted to its mirrored
 * position; the eight partial results are then OR-ed together.
 */
__device__ static u64x swap_workaround (const u64x v)
{
  // named after the destination byte (0 = least significant)
  const u64x dst0 = (v & 0xff00000000000000) >> 56;
  const u64x dst1 = (v & 0x00ff000000000000) >> 40;
  const u64x dst2 = (v & 0x0000ff0000000000) >> 24;
  const u64x dst3 = (v & 0x000000ff00000000) >>  8;
  const u64x dst4 = (v & 0x00000000ff000000) <<  8;
  const u64x dst5 = (v & 0x0000000000ff0000) << 24;
  const u64x dst6 = (v & 0x000000000000ff00) << 40;
  const u64x dst7 = (v & 0x00000000000000ff) << 56;

  return (dst0 | dst1 | dst2 | dst3 | dst4 | dst5 | dst6 | dst7);
}
7574
/**
 * Keep the first len bytes of the 16-byte block w and clear the rest.
 *
 * Byte k of the block is stored in bits 8*(k&3)..8*(k&3)+7 of w[k>>2].
 * The word containing byte position len keeps only its low (len & 3) bytes,
 * every following word is cleared. For len >= 16 the block is left untouched.
 */
__device__ static void truncate_block (u32x w[4], const u32 len)
{
  // number of bytes that survive in the partially kept word
  const u32 lane = len & 3;

  // 0x00000000, 0x000000FF, 0x0000FFFF or 0x00FFFFFF
  const u32 keep = (1 << (lane * 8)) - 1;

  // one case per word; word indices stay compile-time constants
  switch (len >> 2)
  {
    case 0: w[0] &= keep;
            w[1] &= 0;
            w[2] &= 0;
            w[3] &= 0;
            break;

    case 1: w[1] &= keep;
            w[2] &= 0;
            w[3] &= 0;
            break;

    case 2: w[2] &= keep;
            w[3] &= 0;
            break;

    case 3: w[3] &= keep;
            break;
  }
}
7637
/**
 * Widen the 16 bytes of in to 16 bits each, zero-filling the upper byte.
 *
 * Each input word yields two output words: bytes 0 and 1 go to the lower
 * output word (at bits 0..7 and 16..23), bytes 2 and 3 go to the upper one.
 * in[0..1] end up in out1[0..3], in[2..3] in out2[0..3].
 */
__device__ static void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
{
  #if __CUDA_ARCH__ >= 200

  // selector nibble 7 picks the top byte of the zero operand, i.e. 0x00
  #define MAKE_UNICODE_LO(x) __byte_perm ((x), 0, 0x7170)
  #define MAKE_UNICODE_HI(x) __byte_perm ((x), 0, 0x7372)

  #else

  #define MAKE_UNICODE_LO(x) ((((x) << 8) & 0x00FF0000) | ((x) & 0x000000FF))
  #define MAKE_UNICODE_HI(x) ((((x) >> 8) & 0x00FF0000) | (((x) >> 16) & 0x000000FF))

  #endif

  // NOTE(review): the stores run from the highest word down to the lowest,
  // exactly as before; presumably this keeps the function usable when out1
  // aliases in - confirm with callers before reordering.
  out2[3] = MAKE_UNICODE_HI (in[3]);
  out2[2] = MAKE_UNICODE_LO (in[3]);
  out2[1] = MAKE_UNICODE_HI (in[2]);
  out2[0] = MAKE_UNICODE_LO (in[2]);
  out1[3] = MAKE_UNICODE_HI (in[1]);
  out1[2] = MAKE_UNICODE_LO (in[1]);
  out1[1] = MAKE_UNICODE_HI (in[0]);
  out1[0] = MAKE_UNICODE_LO (in[0]);

  #undef MAKE_UNICODE_LO
  #undef MAKE_UNICODE_HI
}
7660
/**
 * Put the byte 0x01 at byte position offset of the 16-byte buffer w0.
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of word k>>2. If offset is
 * word-aligned the target word is overwritten with 0x01 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 16 leave the buffer untouched.
 */
__device__ static void append_0x01_1 (u32x w0[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x01 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X01_1_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X01_1_AT (w0[0]); break;
    case  1: APPEND_0X01_1_AT (w0[1]); break;
    case  2: APPEND_0X01_1_AT (w0[2]); break;
    case  3: APPEND_0X01_1_AT (w0[3]); break;
  }

  #undef APPEND_0X01_1_AT
}
7730
/**
 * Put the byte 0x01 at byte position offset of the 32-byte buffer formed
 * by w0 (bytes 0..15) and w1 (bytes 16..31).
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of its word. If offset is
 * word-aligned the target word is overwritten with 0x01 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 32 leave the buffer untouched.
 */
__device__ static void append_0x01_2 (u32x w0[4], u32x w1[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x01 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X01_2_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X01_2_AT (w0[0]); break;
    case  1: APPEND_0X01_2_AT (w0[1]); break;
    case  2: APPEND_0X01_2_AT (w0[2]); break;
    case  3: APPEND_0X01_2_AT (w0[3]); break;

    case  4: APPEND_0X01_2_AT (w1[0]); break;
    case  5: APPEND_0X01_2_AT (w1[1]); break;
    case  6: APPEND_0X01_2_AT (w1[2]); break;
    case  7: APPEND_0X01_2_AT (w1[3]); break;
  }

  #undef APPEND_0X01_2_AT
}
7864
/**
 * Put the byte 0x01 at byte position offset of the 48-byte buffer formed
 * by w0 (bytes 0..15), w1 (bytes 16..31) and w2 (bytes 32..47).
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of its word. If offset is
 * word-aligned the target word is overwritten with 0x01 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 48 leave the buffer untouched.
 */
__device__ static void append_0x01_3 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x01 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X01_3_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X01_3_AT (w0[0]); break;
    case  1: APPEND_0X01_3_AT (w0[1]); break;
    case  2: APPEND_0X01_3_AT (w0[2]); break;
    case  3: APPEND_0X01_3_AT (w0[3]); break;

    case  4: APPEND_0X01_3_AT (w1[0]); break;
    case  5: APPEND_0X01_3_AT (w1[1]); break;
    case  6: APPEND_0X01_3_AT (w1[2]); break;
    case  7: APPEND_0X01_3_AT (w1[3]); break;

    case  8: APPEND_0X01_3_AT (w2[0]); break;
    case  9: APPEND_0X01_3_AT (w2[1]); break;
    case 10: APPEND_0X01_3_AT (w2[2]); break;
    case 11: APPEND_0X01_3_AT (w2[3]); break;
  }

  #undef APPEND_0X01_3_AT
}
8062
/**
 * Put the byte 0x01 at byte position offset of the 64-byte buffer formed
 * by w0 (bytes 0..15), w1 (16..31), w2 (32..47) and w3 (48..63).
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of its word. If offset is
 * word-aligned the target word is overwritten with 0x01 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 64 leave the buffer untouched.
 */
__device__ static void append_0x01_4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x01 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X01_4_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X01_4_AT (w0[0]); break;
    case  1: APPEND_0X01_4_AT (w0[1]); break;
    case  2: APPEND_0X01_4_AT (w0[2]); break;
    case  3: APPEND_0X01_4_AT (w0[3]); break;

    case  4: APPEND_0X01_4_AT (w1[0]); break;
    case  5: APPEND_0X01_4_AT (w1[1]); break;
    case  6: APPEND_0X01_4_AT (w1[2]); break;
    case  7: APPEND_0X01_4_AT (w1[3]); break;

    case  8: APPEND_0X01_4_AT (w2[0]); break;
    case  9: APPEND_0X01_4_AT (w2[1]); break;
    case 10: APPEND_0X01_4_AT (w2[2]); break;
    case 11: APPEND_0X01_4_AT (w2[3]); break;

    case 12: APPEND_0X01_4_AT (w3[0]); break;
    case 13: APPEND_0X01_4_AT (w3[1]); break;
    case 14: APPEND_0X01_4_AT (w3[2]); break;
    case 15: APPEND_0X01_4_AT (w3[3]); break;
  }

  #undef APPEND_0X01_4_AT
}
8324
/**
 * Put the byte 0x01 at byte position offset of the 128-byte buffer formed
 * by w0..w7, where wN covers bytes 16*N .. 16*N+15.
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of its word. If offset is
 * word-aligned the target word is overwritten with 0x01 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 128 leave the buffer untouched.
 */
__device__ static void append_0x01_8 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x01 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X01_8_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X01_8_AT (w0[0]); break;
    case  1: APPEND_0X01_8_AT (w0[1]); break;
    case  2: APPEND_0X01_8_AT (w0[2]); break;
    case  3: APPEND_0X01_8_AT (w0[3]); break;

    case  4: APPEND_0X01_8_AT (w1[0]); break;
    case  5: APPEND_0X01_8_AT (w1[1]); break;
    case  6: APPEND_0X01_8_AT (w1[2]); break;
    case  7: APPEND_0X01_8_AT (w1[3]); break;

    case  8: APPEND_0X01_8_AT (w2[0]); break;
    case  9: APPEND_0X01_8_AT (w2[1]); break;
    case 10: APPEND_0X01_8_AT (w2[2]); break;
    case 11: APPEND_0X01_8_AT (w2[3]); break;

    case 12: APPEND_0X01_8_AT (w3[0]); break;
    case 13: APPEND_0X01_8_AT (w3[1]); break;
    case 14: APPEND_0X01_8_AT (w3[2]); break;
    case 15: APPEND_0X01_8_AT (w3[3]); break;

    case 16: APPEND_0X01_8_AT (w4[0]); break;
    case 17: APPEND_0X01_8_AT (w4[1]); break;
    case 18: APPEND_0X01_8_AT (w4[2]); break;
    case 19: APPEND_0X01_8_AT (w4[3]); break;

    case 20: APPEND_0X01_8_AT (w5[0]); break;
    case 21: APPEND_0X01_8_AT (w5[1]); break;
    case 22: APPEND_0X01_8_AT (w5[2]); break;
    case 23: APPEND_0X01_8_AT (w5[3]); break;

    case 24: APPEND_0X01_8_AT (w6[0]); break;
    case 25: APPEND_0X01_8_AT (w6[1]); break;
    case 26: APPEND_0X01_8_AT (w6[2]); break;
    case 27: APPEND_0X01_8_AT (w6[3]); break;

    case 28: APPEND_0X01_8_AT (w7[0]); break;
    case 29: APPEND_0X01_8_AT (w7[1]); break;
    case 30: APPEND_0X01_8_AT (w7[2]); break;
    case 31: APPEND_0X01_8_AT (w7[3]); break;
  }

  #undef APPEND_0X01_8_AT
}
8842
/**
 * Put the byte 0x02 at byte position offset of the 16-byte buffer w0.
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of word k>>2. If offset is
 * word-aligned the target word is overwritten with 0x02 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 16 leave the buffer untouched.
 */
__device__ static void append_0x02_1 (u32x w0[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x02 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X02_1_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X02_1_AT (w0[0]); break;
    case  1: APPEND_0X02_1_AT (w0[1]); break;
    case  2: APPEND_0X02_1_AT (w0[2]); break;
    case  3: APPEND_0X02_1_AT (w0[3]); break;
  }

  #undef APPEND_0X02_1_AT
}
8912
/**
 * Put the byte 0x02 at byte position offset of the 32-byte buffer formed
 * by w0 (bytes 0..15) and w1 (bytes 16..31).
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of its word. If offset is
 * word-aligned the target word is overwritten with 0x02 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 32 leave the buffer untouched.
 */
__device__ static void append_0x02_2 (u32x w0[4], u32x w1[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x02 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X02_2_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X02_2_AT (w0[0]); break;
    case  1: APPEND_0X02_2_AT (w0[1]); break;
    case  2: APPEND_0X02_2_AT (w0[2]); break;
    case  3: APPEND_0X02_2_AT (w0[3]); break;

    case  4: APPEND_0X02_2_AT (w1[0]); break;
    case  5: APPEND_0X02_2_AT (w1[1]); break;
    case  6: APPEND_0X02_2_AT (w1[2]); break;
    case  7: APPEND_0X02_2_AT (w1[3]); break;
  }

  #undef APPEND_0X02_2_AT
}
9046
/**
 * Put the byte 0x02 at byte position offset of the 48-byte buffer formed
 * by w0 (bytes 0..15), w1 (bytes 16..31) and w2 (bytes 32..47).
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of its word. If offset is
 * word-aligned the target word is overwritten with 0x02 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 48 leave the buffer untouched.
 */
__device__ static void append_0x02_3 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x02 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X02_3_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X02_3_AT (w0[0]); break;
    case  1: APPEND_0X02_3_AT (w0[1]); break;
    case  2: APPEND_0X02_3_AT (w0[2]); break;
    case  3: APPEND_0X02_3_AT (w0[3]); break;

    case  4: APPEND_0X02_3_AT (w1[0]); break;
    case  5: APPEND_0X02_3_AT (w1[1]); break;
    case  6: APPEND_0X02_3_AT (w1[2]); break;
    case  7: APPEND_0X02_3_AT (w1[3]); break;

    case  8: APPEND_0X02_3_AT (w2[0]); break;
    case  9: APPEND_0X02_3_AT (w2[1]); break;
    case 10: APPEND_0X02_3_AT (w2[2]); break;
    case 11: APPEND_0X02_3_AT (w2[3]); break;
  }

  #undef APPEND_0X02_3_AT
}
9244
/**
 * Put the byte 0x02 at byte position offset of the 64-byte buffer formed
 * by w0 (bytes 0..15), w1 (16..31), w2 (32..47) and w3 (48..63).
 *
 * Byte k lives in bits 8*(k&3)..8*(k&3)+7 of its word. If offset is
 * word-aligned the target word is overwritten with 0x02 (its previous
 * contents are discarded); otherwise the byte is OR-ed into the word.
 * Offsets >= 64 leave the buffer untouched.
 */
__device__ static void append_0x02_4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
  // byte lane inside the target word and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x02 << (lane * 8);

  // aligned offsets replace the word, all others merge into it
  #define APPEND_0X02_4_AT(word) do { if (lane == 0) (word) = mark; else (word) = (word) | mark; } while (0)

  // one case per word; word indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: APPEND_0X02_4_AT (w0[0]); break;
    case  1: APPEND_0X02_4_AT (w0[1]); break;
    case  2: APPEND_0X02_4_AT (w0[2]); break;
    case  3: APPEND_0X02_4_AT (w0[3]); break;

    case  4: APPEND_0X02_4_AT (w1[0]); break;
    case  5: APPEND_0X02_4_AT (w1[1]); break;
    case  6: APPEND_0X02_4_AT (w1[2]); break;
    case  7: APPEND_0X02_4_AT (w1[3]); break;

    case  8: APPEND_0X02_4_AT (w2[0]); break;
    case  9: APPEND_0X02_4_AT (w2[1]); break;
    case 10: APPEND_0X02_4_AT (w2[2]); break;
    case 11: APPEND_0X02_4_AT (w2[3]); break;

    case 12: APPEND_0X02_4_AT (w3[0]); break;
    case 13: APPEND_0X02_4_AT (w3[1]); break;
    case 14: APPEND_0X02_4_AT (w3[2]); break;
    case 15: APPEND_0X02_4_AT (w3[3]); break;
  }

  #undef APPEND_0X02_4_AT
}
9506
/**
 * Store the byte 0x02 at byte position `offset` (0..127) of the 128-byte
 * buffer spread over w0..w7 (16 bytes per array, 4 bytes per word). Byte 0
 * of a word is its least significant byte.
 *
 * When `offset` is a multiple of 4 the target word is overwritten with 0x02,
 * so its other three bytes become zero. Otherwise 0x02 is OR-ed into the
 * matching byte and the rest of the word is kept. Offsets above 127 leave
 * every array untouched.
 */
__device__ static void append_0x02_8 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
  // byte lane inside the 32-bit word, and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x02u << (lane * 8);

  // one case per 32-bit word; array indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: if (lane == 0) w0[0] = mark; else w0[0] = w0[0] | mark; break;
    case  1: if (lane == 0) w0[1] = mark; else w0[1] = w0[1] | mark; break;
    case  2: if (lane == 0) w0[2] = mark; else w0[2] = w0[2] | mark; break;
    case  3: if (lane == 0) w0[3] = mark; else w0[3] = w0[3] | mark; break;

    case  4: if (lane == 0) w1[0] = mark; else w1[0] = w1[0] | mark; break;
    case  5: if (lane == 0) w1[1] = mark; else w1[1] = w1[1] | mark; break;
    case  6: if (lane == 0) w1[2] = mark; else w1[2] = w1[2] | mark; break;
    case  7: if (lane == 0) w1[3] = mark; else w1[3] = w1[3] | mark; break;

    case  8: if (lane == 0) w2[0] = mark; else w2[0] = w2[0] | mark; break;
    case  9: if (lane == 0) w2[1] = mark; else w2[1] = w2[1] | mark; break;
    case 10: if (lane == 0) w2[2] = mark; else w2[2] = w2[2] | mark; break;
    case 11: if (lane == 0) w2[3] = mark; else w2[3] = w2[3] | mark; break;

    case 12: if (lane == 0) w3[0] = mark; else w3[0] = w3[0] | mark; break;
    case 13: if (lane == 0) w3[1] = mark; else w3[1] = w3[1] | mark; break;
    case 14: if (lane == 0) w3[2] = mark; else w3[2] = w3[2] | mark; break;
    case 15: if (lane == 0) w3[3] = mark; else w3[3] = w3[3] | mark; break;

    case 16: if (lane == 0) w4[0] = mark; else w4[0] = w4[0] | mark; break;
    case 17: if (lane == 0) w4[1] = mark; else w4[1] = w4[1] | mark; break;
    case 18: if (lane == 0) w4[2] = mark; else w4[2] = w4[2] | mark; break;
    case 19: if (lane == 0) w4[3] = mark; else w4[3] = w4[3] | mark; break;

    case 20: if (lane == 0) w5[0] = mark; else w5[0] = w5[0] | mark; break;
    case 21: if (lane == 0) w5[1] = mark; else w5[1] = w5[1] | mark; break;
    case 22: if (lane == 0) w5[2] = mark; else w5[2] = w5[2] | mark; break;
    case 23: if (lane == 0) w5[3] = mark; else w5[3] = w5[3] | mark; break;

    case 24: if (lane == 0) w6[0] = mark; else w6[0] = w6[0] | mark; break;
    case 25: if (lane == 0) w6[1] = mark; else w6[1] = w6[1] | mark; break;
    case 26: if (lane == 0) w6[2] = mark; else w6[2] = w6[2] | mark; break;
    case 27: if (lane == 0) w6[3] = mark; else w6[3] = w6[3] | mark; break;

    case 28: if (lane == 0) w7[0] = mark; else w7[0] = w7[0] | mark; break;
    case 29: if (lane == 0) w7[1] = mark; else w7[1] = w7[1] | mark; break;
    case 30: if (lane == 0) w7[2] = mark; else w7[2] = w7[2] | mark; break;
    case 31: if (lane == 0) w7[3] = mark; else w7[3] = w7[3] | mark; break;
  }
}
10024
/**
 * Store the byte 0x80 at byte position `offset` (0..15) of the 16-byte
 * buffer held in w0. Byte 0 of a word is its least significant byte.
 *
 * When `offset` is a multiple of 4 the target word is overwritten with 0x80,
 * so its other three bytes become zero. Otherwise 0x80 is OR-ed into the
 * matching byte and the rest of the word is kept. Offsets above 15 leave
 * the buffer untouched.
 */
__device__ static void append_0x80_1 (u32x w0[4], const u32 offset)
{
  // byte lane inside the 32-bit word, and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);

  // one case per 32-bit word; array indices stay compile-time constants
  switch (offset >> 2)
  {
    case 0: if (lane == 0) w0[0] = mark; else w0[0] = w0[0] | mark; break;
    case 1: if (lane == 0) w0[1] = mark; else w0[1] = w0[1] | mark; break;
    case 2: if (lane == 0) w0[2] = mark; else w0[2] = w0[2] | mark; break;
    case 3: if (lane == 0) w0[3] = mark; else w0[3] = w0[3] | mark; break;
  }
}
10094
/**
 * Store the byte 0x80 at byte position `offset` (0..31) of the 32-byte
 * buffer spread over w0 and w1 (16 bytes per array, 4 bytes per word).
 * Byte 0 of a word is its least significant byte.
 *
 * When `offset` is a multiple of 4 the target word is overwritten with 0x80,
 * so its other three bytes become zero. Otherwise 0x80 is OR-ed into the
 * matching byte and the rest of the word is kept. Offsets above 31 leave
 * both arrays untouched.
 */
__device__ static void append_0x80_2 (u32x w0[4], u32x w1[4], const u32 offset)
{
  // byte lane inside the 32-bit word, and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);

  // one case per 32-bit word; array indices stay compile-time constants
  switch (offset >> 2)
  {
    case 0: if (lane == 0) w0[0] = mark; else w0[0] = w0[0] | mark; break;
    case 1: if (lane == 0) w0[1] = mark; else w0[1] = w0[1] | mark; break;
    case 2: if (lane == 0) w0[2] = mark; else w0[2] = w0[2] | mark; break;
    case 3: if (lane == 0) w0[3] = mark; else w0[3] = w0[3] | mark; break;

    case 4: if (lane == 0) w1[0] = mark; else w1[0] = w1[0] | mark; break;
    case 5: if (lane == 0) w1[1] = mark; else w1[1] = w1[1] | mark; break;
    case 6: if (lane == 0) w1[2] = mark; else w1[2] = w1[2] | mark; break;
    case 7: if (lane == 0) w1[3] = mark; else w1[3] = w1[3] | mark; break;
  }
}
10228
/**
 * Store the byte 0x80 at byte position `offset` (0..47) of the 48-byte
 * buffer spread over w0..w2 (16 bytes per array, 4 bytes per word).
 * Byte 0 of a word is its least significant byte.
 *
 * When `offset` is a multiple of 4 the target word is overwritten with 0x80,
 * so its other three bytes become zero. Otherwise 0x80 is OR-ed into the
 * matching byte and the rest of the word is kept. Offsets above 47 leave
 * every array untouched.
 */
__device__ static void append_0x80_3 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
  // byte lane inside the 32-bit word, and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);

  // one case per 32-bit word; array indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: if (lane == 0) w0[0] = mark; else w0[0] = w0[0] | mark; break;
    case  1: if (lane == 0) w0[1] = mark; else w0[1] = w0[1] | mark; break;
    case  2: if (lane == 0) w0[2] = mark; else w0[2] = w0[2] | mark; break;
    case  3: if (lane == 0) w0[3] = mark; else w0[3] = w0[3] | mark; break;

    case  4: if (lane == 0) w1[0] = mark; else w1[0] = w1[0] | mark; break;
    case  5: if (lane == 0) w1[1] = mark; else w1[1] = w1[1] | mark; break;
    case  6: if (lane == 0) w1[2] = mark; else w1[2] = w1[2] | mark; break;
    case  7: if (lane == 0) w1[3] = mark; else w1[3] = w1[3] | mark; break;

    case  8: if (lane == 0) w2[0] = mark; else w2[0] = w2[0] | mark; break;
    case  9: if (lane == 0) w2[1] = mark; else w2[1] = w2[1] | mark; break;
    case 10: if (lane == 0) w2[2] = mark; else w2[2] = w2[2] | mark; break;
    case 11: if (lane == 0) w2[3] = mark; else w2[3] = w2[3] | mark; break;
  }
}
10426
/**
 * Store the byte 0x80 at byte position `offset` (0..63) of the 64-byte
 * buffer spread over w0..w3 (16 bytes per array, 4 bytes per word).
 * Byte 0 of a word is its least significant byte.
 *
 * When `offset` is a multiple of 4 the target word is overwritten with 0x80,
 * so its other three bytes become zero. Otherwise 0x80 is OR-ed into the
 * matching byte and the rest of the word is kept. Offsets above 63 leave
 * every array untouched.
 */
__device__ static void append_0x80_4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
  // byte lane inside the 32-bit word, and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);

  // one case per 32-bit word; array indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: if (lane == 0) w0[0] = mark; else w0[0] = w0[0] | mark; break;
    case  1: if (lane == 0) w0[1] = mark; else w0[1] = w0[1] | mark; break;
    case  2: if (lane == 0) w0[2] = mark; else w0[2] = w0[2] | mark; break;
    case  3: if (lane == 0) w0[3] = mark; else w0[3] = w0[3] | mark; break;

    case  4: if (lane == 0) w1[0] = mark; else w1[0] = w1[0] | mark; break;
    case  5: if (lane == 0) w1[1] = mark; else w1[1] = w1[1] | mark; break;
    case  6: if (lane == 0) w1[2] = mark; else w1[2] = w1[2] | mark; break;
    case  7: if (lane == 0) w1[3] = mark; else w1[3] = w1[3] | mark; break;

    case  8: if (lane == 0) w2[0] = mark; else w2[0] = w2[0] | mark; break;
    case  9: if (lane == 0) w2[1] = mark; else w2[1] = w2[1] | mark; break;
    case 10: if (lane == 0) w2[2] = mark; else w2[2] = w2[2] | mark; break;
    case 11: if (lane == 0) w2[3] = mark; else w2[3] = w2[3] | mark; break;

    case 12: if (lane == 0) w3[0] = mark; else w3[0] = w3[0] | mark; break;
    case 13: if (lane == 0) w3[1] = mark; else w3[1] = w3[1] | mark; break;
    case 14: if (lane == 0) w3[2] = mark; else w3[2] = w3[2] | mark; break;
    case 15: if (lane == 0) w3[3] = mark; else w3[3] = w3[3] | mark; break;
  }
}
10688
/**
 * Store the byte 0x80 at byte position `offset` (0..127) of the 128-byte
 * buffer spread over w0..w7 (16 bytes per array, 4 bytes per word). Byte 0
 * of a word is its least significant byte.
 *
 * When `offset` is a multiple of 4 the target word is overwritten with 0x80,
 * so its other three bytes become zero. Otherwise 0x80 is OR-ed into the
 * matching byte and the rest of the word is kept. Offsets above 127 leave
 * every array untouched.
 */
__device__ static void append_0x80_8 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
  // byte lane inside the 32-bit word, and the marker moved into that lane
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);

  // one case per 32-bit word; array indices stay compile-time constants
  switch (offset >> 2)
  {
    case  0: if (lane == 0) w0[0] = mark; else w0[0] = w0[0] | mark; break;
    case  1: if (lane == 0) w0[1] = mark; else w0[1] = w0[1] | mark; break;
    case  2: if (lane == 0) w0[2] = mark; else w0[2] = w0[2] | mark; break;
    case  3: if (lane == 0) w0[3] = mark; else w0[3] = w0[3] | mark; break;

    case  4: if (lane == 0) w1[0] = mark; else w1[0] = w1[0] | mark; break;
    case  5: if (lane == 0) w1[1] = mark; else w1[1] = w1[1] | mark; break;
    case  6: if (lane == 0) w1[2] = mark; else w1[2] = w1[2] | mark; break;
    case  7: if (lane == 0) w1[3] = mark; else w1[3] = w1[3] | mark; break;

    case  8: if (lane == 0) w2[0] = mark; else w2[0] = w2[0] | mark; break;
    case  9: if (lane == 0) w2[1] = mark; else w2[1] = w2[1] | mark; break;
    case 10: if (lane == 0) w2[2] = mark; else w2[2] = w2[2] | mark; break;
    case 11: if (lane == 0) w2[3] = mark; else w2[3] = w2[3] | mark; break;

    case 12: if (lane == 0) w3[0] = mark; else w3[0] = w3[0] | mark; break;
    case 13: if (lane == 0) w3[1] = mark; else w3[1] = w3[1] | mark; break;
    case 14: if (lane == 0) w3[2] = mark; else w3[2] = w3[2] | mark; break;
    case 15: if (lane == 0) w3[3] = mark; else w3[3] = w3[3] | mark; break;

    case 16: if (lane == 0) w4[0] = mark; else w4[0] = w4[0] | mark; break;
    case 17: if (lane == 0) w4[1] = mark; else w4[1] = w4[1] | mark; break;
    case 18: if (lane == 0) w4[2] = mark; else w4[2] = w4[2] | mark; break;
    case 19: if (lane == 0) w4[3] = mark; else w4[3] = w4[3] | mark; break;

    case 20: if (lane == 0) w5[0] = mark; else w5[0] = w5[0] | mark; break;
    case 21: if (lane == 0) w5[1] = mark; else w5[1] = w5[1] | mark; break;
    case 22: if (lane == 0) w5[2] = mark; else w5[2] = w5[2] | mark; break;
    case 23: if (lane == 0) w5[3] = mark; else w5[3] = w5[3] | mark; break;

    case 24: if (lane == 0) w6[0] = mark; else w6[0] = w6[0] | mark; break;
    case 25: if (lane == 0) w6[1] = mark; else w6[1] = w6[1] | mark; break;
    case 26: if (lane == 0) w6[2] = mark; else w6[2] = w6[2] | mark; break;
    case 27: if (lane == 0) w6[3] = mark; else w6[3] = w6[3] | mark; break;

    case 28: if (lane == 0) w7[0] = mark; else w7[0] = w7[0] | mark; break;
    case 29: if (lane == 0) w7[1] = mark; else w7[1] = w7[1] | mark; break;
    case 30: if (lane == 0) w7[2] = mark; else w7[2] = w7[2] | mark; break;
    case 31: if (lane == 0) w7[3] = mark; else w7[3] = w7[3] | mark; break;
  }
}
11206
/**
 * Place the words of src_r0 at byte position `offset` (1..7) inside the
 * 8-byte destination dst0; right-hand operand given as plain u32 words.
 *
 * Only words from index `offset / 4` upward are written. The word that
 * contains byte `offset` is built by OR-ing the matching src_l0 word with
 * the shifted start of src_r0 (for offset 4 it is just src_r0[0]).
 * Bytes of src_r0 that do not fit into dst0 are dropped. An `offset` of 0
 * or above 7 writes nothing.
 *
 * NOTE(review): because of the OR, callers presumably pass a src_l0 whose
 * bytes from `offset` onward are zero - confirm against the call sites.
 */
__device__ static void device_memcat2L (const u32 offset, u32x dst0[2], u32x src_l0[2], u32 src_r0[2])
{
  // nothing is written for these offsets
  if (offset == 0) return;
  if (offset >  7) return;

  // left shift that moves src_r0 up to the target byte lane (0, 8, 16 or 24)
  const u32 up = (offset & 3) * 8;

  if (offset < 4)
  {
    // right shift that recovers the bytes pushed out of the previous word
    const u32 down = 32 - up;

    dst0[0] = src_l0[0] | (src_r0[0] << up);
    dst0[1] = (src_r0[0] >> down) | (src_r0[1] << up);

    return;
  }

  if (offset == 4)
  {
    // word aligned: plain copy, src_l0[1] is not consulted
    dst0[1] = src_r0[0];

    return;
  }

  // offsets 5..7: only the tail of the second word receives data
  dst0[1] = src_l0[1] | (src_r0[0] << up);
}
11243
/**
 * Place the words of src_r0 at byte position `offset` (1..7) inside the
 * 8-byte destination dst0; right-hand operand given as u32x words.
 *
 * Only words from index `offset / 4` upward are written. The word that
 * contains byte `offset` is built by OR-ing the matching src_l0 word with
 * the shifted start of src_r0 (for offset 4 it is just src_r0[0]).
 * Bytes of src_r0 that do not fit into dst0 are dropped. An `offset` of 0
 * or above 7 writes nothing.
 *
 * NOTE(review): because of the OR, callers presumably pass a src_l0 whose
 * bytes from `offset` onward are zero - confirm against the call sites.
 */
__device__ static void device_memcat2L (const u32 offset, u32x dst0[2], u32x src_l0[2], u32x src_r0[2])
{
  // nothing is written for these offsets
  if (offset == 0) return;
  if (offset >  7) return;

  // left shift that moves src_r0 up to the target byte lane (0, 8, 16 or 24)
  const u32 up = (offset & 3) * 8;

  if (offset < 4)
  {
    // right shift that recovers the bytes pushed out of the previous word
    const u32 down = 32 - up;

    dst0[0] = src_l0[0] | (src_r0[0] << up);
    dst0[1] = (src_r0[0] >> down) | (src_r0[1] << up);

    return;
  }

  if (offset == 4)
  {
    // word aligned: plain copy, src_l0[1] is not consulted
    dst0[1] = src_r0[0];

    return;
  }

  // offsets 5..7: only the tail of the second word receives data
  dst0[1] = src_l0[1] | (src_r0[0] << up);
}
11280
/**
 * Place the words of src_r0 at byte position `offset` (1..15) inside the
 * 16-byte destination dst0; right-hand operand given as plain u32 words.
 *
 * Only words from index `offset / 4` upward are written. When `offset` is
 * word aligned (4, 8, 12) the src_r0 words are copied as they are; otherwise
 * the first written word is the matching src_l0 word OR-ed with the shifted
 * start of src_r0, and each following word is stitched from two neighbouring
 * src_r0 words. Bytes of src_r0 that do not fit into dst0 are dropped.
 * An `offset` of 0 or above 15 writes nothing.
 *
 * NOTE(review): because of the OR, callers presumably pass a src_l0 whose
 * bytes from `offset` onward are zero - confirm against the call sites.
 */
__device__ static void device_memcat4L (const u32 offset, u32x dst0[4], u32x src_l0[4], u32 src_r0[4])
{
  const u32 lane = offset & 3;  // byte position inside the first written word
  const u32 up   = lane * 8;    // left shift into that lane
  const u32 down = 32 - up;     // matching right shift; used only when lane != 0

  // one case per first written word; array indices stay compile-time constants
  switch (offset >> 2)
  {
    case 0:
      if (lane != 0)
      {
        dst0[0] = src_l0[0] | (src_r0[0] << up);
        dst0[1] = (src_r0[0] >> down) | (src_r0[1] << up);
        dst0[2] = (src_r0[1] >> down) | (src_r0[2] << up);
        dst0[3] = (src_r0[2] >> down) | (src_r0[3] << up);
      }
      break;

    case 1:
      if (lane == 0)
      {
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
      }
      else
      {
        dst0[1] = src_l0[1] | (src_r0[0] << up);
        dst0[2] = (src_r0[0] >> down) | (src_r0[1] << up);
        dst0[3] = (src_r0[1] >> down) | (src_r0[2] << up);
      }
      break;

    case 2:
      if (lane == 0)
      {
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
      }
      else
      {
        dst0[2] = src_l0[2] | (src_r0[0] << up);
        dst0[3] = (src_r0[0] >> down) | (src_r0[1] << up);
      }
      break;

    case 3:
      if (lane == 0)
      {
        dst0[3] = src_r0[0];
      }
      else
      {
        dst0[3] = src_l0[3] | (src_r0[0] << up);
      }
      break;
  }
}
11367
/**
 * Place the words of src_r0 at byte position `offset` (1..15) inside the
 * 16-byte destination dst0; right-hand operand given as u32x words.
 *
 * Only words from index `offset / 4` upward are written. When `offset` is
 * word aligned (4, 8, 12) the src_r0 words are copied as they are; otherwise
 * the first written word is the matching src_l0 word OR-ed with the shifted
 * start of src_r0, and each following word is stitched from two neighbouring
 * src_r0 words. Bytes of src_r0 that do not fit into dst0 are dropped.
 * An `offset` of 0 or above 15 writes nothing.
 *
 * NOTE(review): because of the OR, callers presumably pass a src_l0 whose
 * bytes from `offset` onward are zero - confirm against the call sites.
 */
__device__ static void device_memcat4L (const u32 offset, u32x dst0[4], u32x src_l0[4], u32x src_r0[4])
{
  const u32 lane = offset & 3;  // byte position inside the first written word
  const u32 up   = lane * 8;    // left shift into that lane
  const u32 down = 32 - up;     // matching right shift; used only when lane != 0

  // one case per first written word; array indices stay compile-time constants
  switch (offset >> 2)
  {
    case 0:
      if (lane != 0)
      {
        dst0[0] = src_l0[0] | (src_r0[0] << up);
        dst0[1] = (src_r0[0] >> down) | (src_r0[1] << up);
        dst0[2] = (src_r0[1] >> down) | (src_r0[2] << up);
        dst0[3] = (src_r0[2] >> down) | (src_r0[3] << up);
      }
      break;

    case 1:
      if (lane == 0)
      {
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
      }
      else
      {
        dst0[1] = src_l0[1] | (src_r0[0] << up);
        dst0[2] = (src_r0[0] >> down) | (src_r0[1] << up);
        dst0[3] = (src_r0[1] >> down) | (src_r0[2] << up);
      }
      break;

    case 2:
      if (lane == 0)
      {
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
      }
      else
      {
        dst0[2] = src_l0[2] | (src_r0[0] << up);
        dst0[3] = (src_r0[0] >> down) | (src_r0[1] << up);
      }
      break;

    case 3:
      if (lane == 0)
      {
        dst0[3] = src_r0[0];
      }
      else
      {
        dst0[3] = src_l0[3] | (src_r0[0] << up);
      }
      break;
  }
}
11454
/**
 * Splice the four words of src_r0 into an eight-word buffer at byte
 * position `offset`.
 *
 * The eight destination words are dst0[0..3] followed by dst1[0..3]; the
 * matching left-hand words are src_l0[0..3] followed by src_l1[0..3].
 *
 *   offset / 4       selects the first destination word that is written
 *   (offset % 4) * 8 is the left shift, in bits, applied to src_r0
 *
 * When the offset is not a multiple of four, the first written word is
 * OR-ed with the left-hand word at the same index, and the bits shifted out
 * of src_r0[3] go into one extra trailing word. When it is a multiple of
 * four, the words of src_r0 are stored as they are and the left-hand word is
 * not consulted. Words that would land past dst1[3] are dropped. Words
 * outside the written range are not touched.
 *
 * Offsets 0 and above 31 write nothing.
 * NOTE(review): the device_memcat12L overload that also takes src_r1 does
 * copy the right-hand words for offset 0 - confirm callers never pass 0 here.
 * NOTE(review): word indices are computed at run time - check the generated
 * code if register allocation of the caller's buffers matters.
 */
__device__ static void device_memcat8L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x src_l0[4], u32x src_l1[4], u32 src_r0[4])
{
  if ((offset == 0) || (offset > 31)) return;

  u32x *out_blk[2]  = { dst0,   dst1   };
  u32x *left_blk[2] = { src_l0, src_l1 };

  const u32 first = offset / 4;

  const int lsh = (int) ((offset % 4) * 8);
  const int rsh = 32 - lsh; // only used when lsh != 0, so never a shift by 32

  const bool aligned = (lsh == 0);

  // an unaligned splice touches one word more than an aligned one
  const u32 span = aligned ? 4 : 5;

  // words are produced from lowest to highest index, one store per step
  for (u32 i = 0; i < span; i++)
  {
    const u32 pos = first + i;

    if (pos > 7) break;

    u32x *out = &out_blk[pos / 4][pos % 4];

    if (aligned)
    {
      *out = src_r0[i];
    }
    else if (i == 0)
    {
      u32x *left = &left_blk[pos / 4][pos % 4];

      *out = *left | src_r0[0] << lsh;
    }
    else if (i == 4)
    {
      *out = src_r0[3] >> rsh;
    }
    else
    {
      *out = src_r0[i - 1] >> rsh | src_r0[i] << lsh;
    }
  }
}
11665
/**
 * Splice the four words of src_r0 into an eight-word buffer at byte
 * position `offset`. This overload takes the right-hand words as u32x.
 *
 * The eight destination words are dst0[0..3] followed by dst1[0..3]; the
 * matching left-hand words are src_l0[0..3] followed by src_l1[0..3].
 *
 *   offset / 4       selects the first destination word that is written
 *   (offset % 4) * 8 is the left shift, in bits, applied to src_r0
 *
 * When the offset is not a multiple of four, the first written word is
 * OR-ed with the left-hand word at the same index, and the bits shifted out
 * of src_r0[3] go into one extra trailing word. When it is a multiple of
 * four, the words of src_r0 are stored as they are and the left-hand word is
 * not consulted. Words that would land past dst1[3] are dropped. Words
 * outside the written range are not touched.
 *
 * Offsets 0 and above 31 write nothing.
 * NOTE(review): the device_memcat12L overload that also takes src_r1 does
 * copy the right-hand words for offset 0 - confirm callers never pass 0 here.
 * NOTE(review): word indices are computed at run time - check the generated
 * code if register allocation of the caller's buffers matters.
 */
__device__ static void device_memcat8L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x src_l0[4], u32x src_l1[4], u32x src_r0[4])
{
  if ((offset == 0) || (offset > 31)) return;

  u32x *out_blk[2]  = { dst0,   dst1   };
  u32x *left_blk[2] = { src_l0, src_l1 };

  const u32 first = offset / 4;

  const int lsh = (int) ((offset % 4) * 8);
  const int rsh = 32 - lsh; // only used when lsh != 0, so never a shift by 32

  const bool aligned = (lsh == 0);

  // an unaligned splice touches one word more than an aligned one
  const u32 span = aligned ? 4 : 5;

  // words are produced from lowest to highest index, one store per step
  for (u32 i = 0; i < span; i++)
  {
    const u32 pos = first + i;

    if (pos > 7) break;

    u32x *out = &out_blk[pos / 4][pos % 4];

    if (aligned)
    {
      *out = src_r0[i];
    }
    else if (i == 0)
    {
      u32x *left = &left_blk[pos / 4][pos % 4];

      *out = *left | src_r0[0] << lsh;
    }
    else if (i == 4)
    {
      *out = src_r0[3] >> rsh;
    }
    else
    {
      *out = src_r0[i - 1] >> rsh | src_r0[i] << lsh;
    }
  }
}
11876
/**
 * Splice the four words of src_r0 into a twelve-word buffer at byte
 * position `offset`.
 *
 * The twelve destination words are dst0[0..3], dst1[0..3] and dst2[0..3] in
 * that order; the matching left-hand words are src_l0, src_l1 and src_l2.
 *
 *   offset / 4       selects the first destination word that is written
 *   (offset % 4) * 8 is the left shift, in bits, applied to src_r0
 *
 * When the offset is not a multiple of four, the first written word is
 * OR-ed with the left-hand word at the same index, and the bits shifted out
 * of src_r0[3] go into one extra trailing word. When it is a multiple of
 * four, the words of src_r0 are stored as they are and the left-hand word is
 * not consulted. Words that would land past dst2[3] are dropped. Words
 * outside the written range are not touched.
 *
 * Offsets 0 and above 47 write nothing.
 * NOTE(review): the device_memcat12L overload that also takes src_r1 does
 * copy the right-hand words for offset 0 - confirm callers never pass 0 here.
 * NOTE(review): word indices are computed at run time - check the generated
 * code if register allocation of the caller's buffers matters.
 */
__device__ static void device_memcat12L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x dst2[4], u32x src_l0[4], u32x src_l1[4], u32x src_l2[4], u32 src_r0[4])
{
  if ((offset == 0) || (offset > 47)) return;

  u32x *out_blk[3]  = { dst0,   dst1,   dst2   };
  u32x *left_blk[3] = { src_l0, src_l1, src_l2 };

  const u32 first = offset / 4;

  const int lsh = (int) ((offset % 4) * 8);
  const int rsh = 32 - lsh; // only used when lsh != 0, so never a shift by 32

  const bool aligned = (lsh == 0);

  // an unaligned splice touches one word more than an aligned one
  const u32 span = aligned ? 4 : 5;

  // words are produced from lowest to highest index, one store per step
  for (u32 i = 0; i < span; i++)
  {
    const u32 pos = first + i;

    if (pos > 11) break;

    u32x *out = &out_blk[pos / 4][pos % 4];

    if (aligned)
    {
      *out = src_r0[i];
    }
    else if (i == 0)
    {
      u32x *left = &left_blk[pos / 4][pos % 4];

      *out = *left | src_r0[0] << lsh;
    }
    else if (i == 4)
    {
      *out = src_r0[3] >> rsh;
    }
    else
    {
      *out = src_r0[i - 1] >> rsh | src_r0[i] << lsh;
    }
  }
}
12211
/**
 * Splice the four words of src_r0 into a twelve-word buffer at byte
 * position `offset`. This overload takes the right-hand words as u32x.
 *
 * The twelve destination words are dst0[0..3], dst1[0..3] and dst2[0..3] in
 * that order; the matching left-hand words are src_l0, src_l1 and src_l2.
 *
 *   offset / 4       selects the first destination word that is written
 *   (offset % 4) * 8 is the left shift, in bits, applied to src_r0
 *
 * When the offset is not a multiple of four, the first written word is
 * OR-ed with the left-hand word at the same index, and the bits shifted out
 * of src_r0[3] go into one extra trailing word. When it is a multiple of
 * four, the words of src_r0 are stored as they are and the left-hand word is
 * not consulted. Words that would land past dst2[3] are dropped. Words
 * outside the written range are not touched.
 *
 * Offsets 0 and above 47 write nothing.
 * NOTE(review): the device_memcat12L overload that also takes src_r1 does
 * copy the right-hand words for offset 0 - confirm callers never pass 0 here.
 * NOTE(review): word indices are computed at run time - check the generated
 * code if register allocation of the caller's buffers matters.
 */
__device__ static void device_memcat12L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x dst2[4], u32x src_l0[4], u32x src_l1[4], u32x src_l2[4], u32x src_r0[4])
{
  if ((offset == 0) || (offset > 47)) return;

  u32x *out_blk[3]  = { dst0,   dst1,   dst2   };
  u32x *left_blk[3] = { src_l0, src_l1, src_l2 };

  const u32 first = offset / 4;

  const int lsh = (int) ((offset % 4) * 8);
  const int rsh = 32 - lsh; // only used when lsh != 0, so never a shift by 32

  const bool aligned = (lsh == 0);

  // an unaligned splice touches one word more than an aligned one
  const u32 span = aligned ? 4 : 5;

  // words are produced from lowest to highest index, one store per step
  for (u32 i = 0; i < span; i++)
  {
    const u32 pos = first + i;

    if (pos > 11) break;

    u32x *out = &out_blk[pos / 4][pos % 4];

    if (aligned)
    {
      *out = src_r0[i];
    }
    else if (i == 0)
    {
      u32x *left = &left_blk[pos / 4][pos % 4];

      *out = *left | src_r0[0] << lsh;
    }
    else if (i == 4)
    {
      *out = src_r0[3] >> rsh;
    }
    else
    {
      *out = src_r0[i - 1] >> rsh | src_r0[i] << lsh;
    }
  }
}
12546
/**
 * Write the eight 32-bit words src_r0[0..3], src_r1[0..3] into the 12-word
 * destination dst0 | dst1 | dst2, starting at byte position `offset`.
 *
 * Layout, as established by the shift amounts below: one byte of offset
 * corresponds to a left shift of 8 bits, i.e. byte 0 of a word sits in its
 * lowest 8 bits.
 *
 * - offset % 4 == 0: source words are copied verbatim starting at destination
 *   word offset / 4; src_l* is not read.
 * - offset % 4 != 0: the first touched destination word becomes
 *   src_l*[k] | (src_r0[0] << shift); each following word combines the spilled
 *   high bytes of the previous source word with the low bytes of the next one.
 *   When it still fits, one extra trailing word receives the high bytes of
 *   src_r1[3].
 * - Anything that would land past dst2[3] is dropped.
 * - Destination words before the offset, and words after the last written
 *   one, are left untouched. Offsets above 47 do nothing.
 *
 * NOTE(review): the OR with src_l* presumably relies on the bytes of the left
 * word at/after `offset` being zero - confirm against callers.
 *
 * The explicit switch keeps every array index and shift amount a compile-time
 * constant.
 */
__device__ static void device_memcat12L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x dst2[4], u32x src_l0[4], u32x src_l1[4], u32x src_l2[4], u32x src_r0[4], u32x src_r1[4])
{
  switch (offset)
  {
    // ---- destination starts in dst0[0] ----

    case 0:
      dst0[0] = src_r0[0];
      dst0[1] = src_r0[1];
      dst0[2] = src_r0[2];
      dst0[3] = src_r0[3];
      dst1[0] = src_r1[0];
      dst1[1] = src_r1[1];
      dst1[2] = src_r1[2];
      dst1[3] = src_r1[3];
      break;

    case 1:
      dst0[0] = src_l0[0]       | src_r0[0] <<  8;
      dst0[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst0[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[0] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst1[1] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst1[2] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst1[3] = src_r1[2] >> 24 | src_r1[3] <<  8;
      dst2[0] = src_r1[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0]       | src_r0[0] << 16;
      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[0] = src_r1[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0]       | src_r0[0] << 24;
      dst0[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst0[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[0] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst1[1] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst1[2] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst1[3] = src_r1[2] >>  8 | src_r1[3] << 24;
      dst2[0] = src_r1[3] >>  8;
      break;

    // ---- destination starts in dst0[1] ----

    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      dst1[1] = src_r1[0];
      dst1[2] = src_r1[1];
      dst1[3] = src_r1[2];
      dst2[0] = src_r1[3];
      break;

    case 5:
      dst0[1] = src_l0[1]       | src_r0[0] <<  8;
      dst0[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[0] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[1] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst1[2] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst1[3] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst2[0] = src_r1[2] >> 24 | src_r1[3] <<  8;
      dst2[1] = src_r1[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1]       | src_r0[0] << 16;
      dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[1] = src_r1[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1]       | src_r0[0] << 24;
      dst0[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[0] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[1] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst1[2] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst1[3] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst2[0] = src_r1[2] >>  8 | src_r1[3] << 24;
      dst2[1] = src_r1[3] >>  8;
      break;

    // ---- destination starts in dst0[2] ----

    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      dst1[2] = src_r1[0];
      dst1[3] = src_r1[1];
      dst2[0] = src_r1[2];
      dst2[1] = src_r1[3];
      break;

    case 9:
      dst0[2] = src_l0[2]       | src_r0[0] <<  8;
      dst0[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[0] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[1] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[2] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst1[3] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst2[0] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst2[1] = src_r1[2] >> 24 | src_r1[3] <<  8;
      dst2[2] = src_r1[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2]       | src_r0[0] << 16;
      dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[2] = src_r1[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2]       | src_r0[0] << 24;
      dst0[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[0] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[1] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[2] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst1[3] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst2[0] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst2[1] = src_r1[2] >>  8 | src_r1[3] << 24;
      dst2[2] = src_r1[3] >>  8;
      break;

    // ---- destination starts in dst0[3] ----

    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      dst1[3] = src_r1[0];
      dst2[0] = src_r1[1];
      dst2[1] = src_r1[2];
      dst2[2] = src_r1[3];
      break;

    case 13:
      dst0[3] = src_l0[3]       | src_r0[0] <<  8;
      dst1[0] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[1] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[2] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[3] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst2[0] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst2[1] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst2[2] = src_r1[2] >> 24 | src_r1[3] <<  8;
      dst2[3] = src_r1[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3]       | src_r0[0] << 16;
      dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[3] = src_r1[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3]       | src_r0[0] << 24;
      dst1[0] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[1] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[2] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[3] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst2[0] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst2[1] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst2[2] = src_r1[2] >>  8 | src_r1[3] << 24;
      dst2[3] = src_r1[3] >>  8;
      break;

    // ---- destination starts in dst1[0]; from here on the tail is truncated ----

    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      dst2[0] = src_r1[0];
      dst2[1] = src_r1[1];
      dst2[2] = src_r1[2];
      dst2[3] = src_r1[3];
      break;

    case 17:
      dst1[0] = src_l1[0]       | src_r0[0] <<  8;
      dst1[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[0] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst2[1] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst2[2] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst2[3] = src_r1[2] >> 24 | src_r1[3] <<  8;
      break;

    case 18:
      dst1[0] = src_l1[0]       | src_r0[0] << 16;
      dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16;
      break;

    case 19:
      dst1[0] = src_l1[0]       | src_r0[0] << 24;
      dst1[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[0] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst2[1] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst2[2] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst2[3] = src_r1[2] >>  8 | src_r1[3] << 24;
      break;

    // ---- destination starts in dst1[1] ----

    case 20:
      // the appended data begins with src_r0[0]; src_r1[0] only appears four
      // words later, at dst2[1]
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      dst2[0] = src_r0[3];
      dst2[1] = src_r1[0];
      dst2[2] = src_r1[1];
      dst2[3] = src_r1[2];
      break;

    case 21:
      dst1[1] = src_l1[1]       | src_r0[0] <<  8;
      dst1[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[0] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[1] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst2[2] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst2[3] = src_r1[1] >> 24 | src_r1[2] <<  8;
      break;

    case 22:
      dst1[1] = src_l1[1]       | src_r0[0] << 16;
      dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16;
      break;

    case 23:
      dst1[1] = src_l1[1]       | src_r0[0] << 24;
      dst1[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[0] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[1] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst2[2] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst2[3] = src_r1[1] >>  8 | src_r1[2] << 24;
      break;

    // ---- destination starts in dst1[2] ----

    case 24:
      // first appended word is src_r0[0]; src_r1[0] is stored at dst2[2]
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      dst2[0] = src_r0[2];
      dst2[1] = src_r0[3];
      dst2[2] = src_r1[0];
      dst2[3] = src_r1[1];
      break;

    case 25:
      dst1[2] = src_l1[2]       | src_r0[0] <<  8;
      dst1[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[0] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[1] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[2] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst2[3] = src_r1[0] >> 24 | src_r1[1] <<  8;
      break;

    case 26:
      dst1[2] = src_l1[2]       | src_r0[0] << 16;
      dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16;
      break;

    case 27:
      dst1[2] = src_l1[2]       | src_r0[0] << 24;
      dst1[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[0] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[1] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[2] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst2[3] = src_r1[0] >>  8 | src_r1[1] << 24;
      break;

    // ---- destination starts in dst1[3] ----

    case 28:
      // first appended word is src_r0[0]; src_r1[0] is stored at dst2[3]
      dst1[3] = src_r0[0];
      dst2[0] = src_r0[1];
      dst2[1] = src_r0[2];
      dst2[2] = src_r0[3];
      dst2[3] = src_r1[0];
      break;

    case 29:
      dst1[3] = src_l1[3]       | src_r0[0] <<  8;
      dst2[0] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[1] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[2] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[3] = src_r0[3] >> 24 | src_r1[0] <<  8;
      break;

    case 30:
      dst1[3] = src_l1[3]       | src_r0[0] << 16;
      dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16;
      break;

    case 31:
      dst1[3] = src_l1[3]       | src_r0[0] << 24;
      dst2[0] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[1] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[2] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[3] = src_r0[3] >>  8 | src_r1[0] << 24;
      break;

    // ---- destination starts in dst2[0]; only src_r0 still fits ----

    case 32:
      dst2[0] = src_r0[0];
      dst2[1] = src_r0[1];
      dst2[2] = src_r0[2];
      dst2[3] = src_r0[3];
      break;

    case 33:
      dst2[0] = src_l2[0]       | src_r0[0] <<  8;
      dst2[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      break;

    case 34:
      dst2[0] = src_l2[0]       | src_r0[0] << 16;
      dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      break;

    case 35:
      dst2[0] = src_l2[0]       | src_r0[0] << 24;
      dst2[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      break;

    // ---- destination starts in dst2[1] ----

    case 36:
      dst2[1] = src_r0[0];
      dst2[2] = src_r0[1];
      dst2[3] = src_r0[2];
      break;

    case 37:
      dst2[1] = src_l2[1]       | src_r0[0] <<  8;
      dst2[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      break;

    case 38:
      dst2[1] = src_l2[1]       | src_r0[0] << 16;
      dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      break;

    case 39:
      dst2[1] = src_l2[1]       | src_r0[0] << 24;
      dst2[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      break;

    // ---- destination starts in dst2[2] ----

    case 40:
      dst2[2] = src_r0[0];
      dst2[3] = src_r0[1];
      break;

    case 41:
      dst2[2] = src_l2[2]       | src_r0[0] <<  8;
      dst2[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      break;

    case 42:
      dst2[2] = src_l2[2]       | src_r0[0] << 16;
      dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      break;

    case 43:
      dst2[2] = src_l2[2]       | src_r0[0] << 24;
      dst2[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      break;

    // ---- destination starts in dst2[3]; a single word is left ----

    case 44:
      dst2[3] = src_r0[0];
      break;

    case 45:
      dst2[3] = src_l2[3] | src_r0[0] <<  8;
      break;

    case 46:
      dst2[3] = src_l2[3] | src_r0[0] << 16;
      break;

    case 47:
      dst2[3] = src_l2[3] | src_r0[0] << 24;
      break;
  }
}
12980
/**
 * Insert the nine 32-bit words append0[0..3], append1[0..3], append2[0] into
 * the 16-word buffer w0 | w1 | w2 | w3 at byte position `offset`.
 *
 * Only offsets 0..15 are handled; any other value leaves the buffer untouched.
 *
 * - offset % 4 == 0: exactly nine words, starting at word offset / 4, are
 *   overwritten with the appended words.
 * - offset % 4 != 0: the first touched word is OR-ed with the low bytes of
 *   append0[0], the next eight words are overwritten with the byte-shifted
 *   data, and one further word is overwritten with the high bytes spilled out
 *   of append2[0].
 *
 * Implemented in two steps: (1) shift the nine words by offset % 4 bytes into
 * s0..s9, (2) store them starting at word offset / 4. All shift amounts and
 * array indices stay compile-time constants.
 */
__device__ static void memcat16_9 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
{
  if (offset > 15) return;

  const u32 byte_rem = offset & 3;
  const u32 word_pos = offset >> 2;

  // s0 = head word, s1..s8 = body words, s9 = spill word (unaligned only)
  u32 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;

  switch (byte_rem)
  {
    case 0:
      s0 = append0[0];
      s1 = append0[1];
      s2 = append0[2];
      s3 = append0[3];
      s4 = append1[0];
      s5 = append1[1];
      s6 = append1[2];
      s7 = append1[3];
      s8 = append2[0];
      // never stored for word-aligned offsets; assigned only so that it is
      // not left indeterminate
      s9 = append2[0];
      break;

    case 1:
      s0 =                        append0[0] <<  8;
      s1 = append0[0] >> 24 | append0[1] <<  8;
      s2 = append0[1] >> 24 | append0[2] <<  8;
      s3 = append0[2] >> 24 | append0[3] <<  8;
      s4 = append0[3] >> 24 | append1[0] <<  8;
      s5 = append1[0] >> 24 | append1[1] <<  8;
      s6 = append1[1] >> 24 | append1[2] <<  8;
      s7 = append1[2] >> 24 | append1[3] <<  8;
      s8 = append1[3] >> 24 | append2[0] <<  8;
      s9 = append2[0] >> 24;
      break;

    case 2:
      s0 =                        append0[0] << 16;
      s1 = append0[0] >> 16 | append0[1] << 16;
      s2 = append0[1] >> 16 | append0[2] << 16;
      s3 = append0[2] >> 16 | append0[3] << 16;
      s4 = append0[3] >> 16 | append1[0] << 16;
      s5 = append1[0] >> 16 | append1[1] << 16;
      s6 = append1[1] >> 16 | append1[2] << 16;
      s7 = append1[2] >> 16 | append1[3] << 16;
      s8 = append1[3] >> 16 | append2[0] << 16;
      s9 = append2[0] >> 16;
      break;

    default: // byte_rem == 3
      s0 =                        append0[0] << 24;
      s1 = append0[0] >>  8 | append0[1] << 24;
      s2 = append0[1] >>  8 | append0[2] << 24;
      s3 = append0[2] >>  8 | append0[3] << 24;
      s4 = append0[3] >>  8 | append1[0] << 24;
      s5 = append1[0] >>  8 | append1[1] << 24;
      s6 = append1[1] >>  8 | append1[2] << 24;
      s7 = append1[2] >>  8 | append1[3] << 24;
      s8 = append1[3] >>  8 | append2[0] << 24;
      s9 = append2[0] >>  8;
      break;
  }

  switch (word_pos)
  {
    case 0:
      if (byte_rem)
      {
        w0[0] = w0[0] | s0;
        w2[1] = s9;
      }
      else
      {
        w0[0] = s0;
      }

      w0[1] = s1;
      w0[2] = s2;
      w0[3] = s3;
      w1[0] = s4;
      w1[1] = s5;
      w1[2] = s6;
      w1[3] = s7;
      w2[0] = s8;
      break;

    case 1:
      if (byte_rem)
      {
        w0[1] = w0[1] | s0;
        w2[2] = s9;
      }
      else
      {
        w0[1] = s0;
      }

      w0[2] = s1;
      w0[3] = s2;
      w1[0] = s3;
      w1[1] = s4;
      w1[2] = s5;
      w1[3] = s6;
      w2[0] = s7;
      w2[1] = s8;
      break;

    case 2:
      if (byte_rem)
      {
        w0[2] = w0[2] | s0;
        w2[3] = s9;
      }
      else
      {
        w0[2] = s0;
      }

      w0[3] = s1;
      w1[0] = s2;
      w1[1] = s3;
      w1[2] = s4;
      w1[3] = s5;
      w2[0] = s6;
      w2[1] = s7;
      w2[2] = s8;
      break;

    default: // word_pos == 3
      if (byte_rem)
      {
        w0[3] = w0[3] | s0;
        w3[0] = s9;
      }
      else
      {
        w0[3] = s0;
      }

      w1[0] = s1;
      w1[1] = s2;
      w1[2] = s3;
      w1[3] = s4;
      w2[0] = s5;
      w2[1] = s6;
      w2[2] = s7;
      w2[3] = s8;
      break;
  }
}
13190
/**
 * Vector-append overload: insert the nine u32x words append0[0..3],
 * append1[0..3], append2[0] into the 16-word buffer w0 | w1 | w2 | w3 at byte
 * position `offset`.
 *
 * Only offsets 0..15 are handled; any other value leaves the buffer untouched.
 *
 * - offset % 4 == 0: exactly nine words, starting at word offset / 4, are
 *   overwritten with the appended words.
 * - offset % 4 != 0: the first touched word is OR-ed with the low bytes of
 *   append0[0], the next eight words are overwritten with the byte-shifted
 *   data, and one further word is overwritten with the high bytes spilled out
 *   of append2[0].
 *
 * Implemented in two steps: (1) shift the nine words by offset % 4 bytes into
 * s0..s9, (2) store them starting at word offset / 4. All shift amounts and
 * array indices stay compile-time constants.
 */
__device__ static void memcat16_9 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32 offset)
{
  if (offset > 15) return;

  const u32 byte_rem = offset & 3;
  const u32 word_pos = offset >> 2;

  // s0 = head word, s1..s8 = body words, s9 = spill word (unaligned only)
  u32x s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;

  switch (byte_rem)
  {
    case 0:
      s0 = append0[0];
      s1 = append0[1];
      s2 = append0[2];
      s3 = append0[3];
      s4 = append1[0];
      s5 = append1[1];
      s6 = append1[2];
      s7 = append1[3];
      s8 = append2[0];
      // never stored for word-aligned offsets; assigned only so that it is
      // not left indeterminate
      s9 = append2[0];
      break;

    case 1:
      s0 =                        append0[0] <<  8;
      s1 = append0[0] >> 24 | append0[1] <<  8;
      s2 = append0[1] >> 24 | append0[2] <<  8;
      s3 = append0[2] >> 24 | append0[3] <<  8;
      s4 = append0[3] >> 24 | append1[0] <<  8;
      s5 = append1[0] >> 24 | append1[1] <<  8;
      s6 = append1[1] >> 24 | append1[2] <<  8;
      s7 = append1[2] >> 24 | append1[3] <<  8;
      s8 = append1[3] >> 24 | append2[0] <<  8;
      s9 = append2[0] >> 24;
      break;

    case 2:
      s0 =                        append0[0] << 16;
      s1 = append0[0] >> 16 | append0[1] << 16;
      s2 = append0[1] >> 16 | append0[2] << 16;
      s3 = append0[2] >> 16 | append0[3] << 16;
      s4 = append0[3] >> 16 | append1[0] << 16;
      s5 = append1[0] >> 16 | append1[1] << 16;
      s6 = append1[1] >> 16 | append1[2] << 16;
      s7 = append1[2] >> 16 | append1[3] << 16;
      s8 = append1[3] >> 16 | append2[0] << 16;
      s9 = append2[0] >> 16;
      break;

    default: // byte_rem == 3
      s0 =                        append0[0] << 24;
      s1 = append0[0] >>  8 | append0[1] << 24;
      s2 = append0[1] >>  8 | append0[2] << 24;
      s3 = append0[2] >>  8 | append0[3] << 24;
      s4 = append0[3] >>  8 | append1[0] << 24;
      s5 = append1[0] >>  8 | append1[1] << 24;
      s6 = append1[1] >>  8 | append1[2] << 24;
      s7 = append1[2] >>  8 | append1[3] << 24;
      s8 = append1[3] >>  8 | append2[0] << 24;
      s9 = append2[0] >>  8;
      break;
  }

  switch (word_pos)
  {
    case 0:
      if (byte_rem)
      {
        w0[0] = w0[0] | s0;
        w2[1] = s9;
      }
      else
      {
        w0[0] = s0;
      }

      w0[1] = s1;
      w0[2] = s2;
      w0[3] = s3;
      w1[0] = s4;
      w1[1] = s5;
      w1[2] = s6;
      w1[3] = s7;
      w2[0] = s8;
      break;

    case 1:
      if (byte_rem)
      {
        w0[1] = w0[1] | s0;
        w2[2] = s9;
      }
      else
      {
        w0[1] = s0;
      }

      w0[2] = s1;
      w0[3] = s2;
      w1[0] = s3;
      w1[1] = s4;
      w1[2] = s5;
      w1[3] = s6;
      w2[0] = s7;
      w2[1] = s8;
      break;

    case 2:
      if (byte_rem)
      {
        w0[2] = w0[2] | s0;
        w2[3] = s9;
      }
      else
      {
        w0[2] = s0;
      }

      w0[3] = s1;
      w1[0] = s2;
      w1[1] = s3;
      w1[2] = s4;
      w1[3] = s5;
      w2[0] = s6;
      w2[1] = s7;
      w2[2] = s8;
      break;

    default: // word_pos == 3
      if (byte_rem)
      {
        w0[3] = w0[3] | s0;
        w3[0] = s9;
      }
      else
      {
        w0[3] = s0;
      }

      w1[0] = s1;
      w1[1] = s2;
      w1[2] = s3;
      w1[3] = s4;
      w2[0] = s5;
      w2[1] = s6;
      w2[2] = s7;
      w2[3] = s8;
      break;
  }
}
13400
13401 __device__ static void memcat32_8 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 append0[4], const u32 append1[4], const u32 offset)
13402 {
13403 switch (offset)
13404 {
13405 case 0:
13406 w0[0] = append0[0];
13407 w0[1] = append0[1];
13408 w0[2] = append0[2];
13409 w0[3] = append0[3];
13410 w1[0] = append1[0];
13411 w1[1] = append1[1];
13412 w1[2] = append1[2];
13413 w1[3] = append1[3];
13414 break;
13415
13416 case 1:
13417 w0[0] = w0[0] | append0[0] << 8;
13418 w0[1] = append0[0] >> 24 | append0[1] << 8;
13419 w0[2] = append0[1] >> 24 | append0[2] << 8;
13420 w0[3] = append0[2] >> 24 | append0[3] << 8;
13421 w1[0] = append0[3] >> 24 | append1[0] << 8;
13422 w1[1] = append1[0] >> 24 | append1[1] << 8;
13423 w1[2] = append1[1] >> 24 | append1[2] << 8;
13424 w1[3] = append1[2] >> 24 | append1[3] << 8;
13425 w2[0] = append1[3] >> 24;
13426 break;
13427
13428 case 2:
13429 w0[0] = w0[0] | append0[0] << 16;
13430 w0[1] = append0[0] >> 16 | append0[1] << 16;
13431 w0[2] = append0[1] >> 16 | append0[2] << 16;
13432 w0[3] = append0[2] >> 16 | append0[3] << 16;
13433 w1[0] = append0[3] >> 16 | append1[0] << 16;
13434 w1[1] = append1[0] >> 16 | append1[1] << 16;
13435 w1[2] = append1[1] >> 16 | append1[2] << 16;
13436 w1[3] = append1[2] >> 16 | append1[3] << 16;
13437 w2[0] = append1[3] >> 16;
13438 break;
13439
13440 case 3:
13441 w0[0] = w0[0] | append0[0] << 24;
13442 w0[1] = append0[0] >> 8 | append0[1] << 24;
13443 w0[2] = append0[1] >> 8 | append0[2] << 24;
13444 w0[3] = append0[2] >> 8 | append0[3] << 24;
13445 w1[0] = append0[3] >> 8 | append1[0] << 24;
13446 w1[1] = append1[0] >> 8 | append1[1] << 24;
13447 w1[2] = append1[1] >> 8 | append1[2] << 24;
13448 w1[3] = append1[2] >> 8 | append1[3] << 24;
13449 w2[0] = append1[3] >> 8;
13450 break;
13451
13452 case 4:
13453 w0[1] = append0[0];
13454 w0[2] = append0[1];
13455 w0[3] = append0[2];
13456 w1[0] = append0[3];
13457 w1[1] = append1[0];
13458 w1[2] = append1[1];
13459 w1[3] = append1[2];
13460 w2[0] = append1[3];
13461 break;
13462
13463 case 5:
13464 w0[1] = w0[1] | append0[0] << 8;
13465 w0[2] = append0[0] >> 24 | append0[1] << 8;
13466 w0[3] = append0[1] >> 24 | append0[2] << 8;
13467 w1[0] = append0[2] >> 24 | append0[3] << 8;
13468 w1[1] = append0[3] >> 24 | append1[0] << 8;
13469 w1[2] = append1[0] >> 24 | append1[1] << 8;
13470 w1[3] = append1[1] >> 24 | append1[2] << 8;
13471 w2[0] = append1[2] >> 24 | append1[3] << 8;
13472 w2[1] = append1[3] >> 24;
13473 break;
13474
13475 case 6:
13476 w0[1] = w0[1] | append0[0] << 16;
13477 w0[2] = append0[0] >> 16 | append0[1] << 16;
13478 w0[3] = append0[1] >> 16 | append0[2] << 16;
13479 w1[0] = append0[2] >> 16 | append0[3] << 16;
13480 w1[1] = append0[3] >> 16 | append1[0] << 16;
13481 w1[2] = append1[0] >> 16 | append1[1] << 16;
13482 w1[3] = append1[1] >> 16 | append1[2] << 16;
13483 w2[0] = append1[2] >> 16 | append1[3] << 16;
13484 w2[1] = append1[3] >> 16;
13485 break;
13486
13487 case 7:
13488 w0[1] = w0[1] | append0[0] << 24;
13489 w0[2] = append0[0] >> 8 | append0[1] << 24;
13490 w0[3] = append0[1] >> 8 | append0[2] << 24;
13491 w1[0] = append0[2] >> 8 | append0[3] << 24;
13492 w1[1] = append0[3] >> 8 | append1[0] << 24;
13493 w1[2] = append1[0] >> 8 | append1[1] << 24;
13494 w1[3] = append1[1] >> 8 | append1[2] << 24;
13495 w2[0] = append1[2] >> 8 | append1[3] << 24;
13496 w2[1] = append1[3] >> 8;
13497 break;
13498
13499 case 8:
13500 w0[2] = append0[0];
13501 w0[3] = append0[1];
13502 w1[0] = append0[2];
13503 w1[1] = append0[3];
13504 w1[2] = append1[0];
13505 w1[3] = append1[1];
13506 w2[0] = append1[2];
13507 w2[1] = append1[3];
13508 break;
13509
13510 case 9:
13511 w0[2] = w0[2] | append0[0] << 8;
13512 w0[3] = append0[0] >> 24 | append0[1] << 8;
13513 w1[0] = append0[1] >> 24 | append0[2] << 8;
13514 w1[1] = append0[2] >> 24 | append0[3] << 8;
13515 w1[2] = append0[3] >> 24 | append1[0] << 8;
13516 w1[3] = append1[0] >> 24 | append1[1] << 8;
13517 w2[0] = append1[1] >> 24 | append1[2] << 8;
13518 w2[1] = append1[2] >> 24 | append1[3] << 8;
13519 w2[2] = append1[3] >> 24;
13520 break;
13521
13522 case 10:
13523 w0[2] = w0[2] | append0[0] << 16;
13524 w0[3] = append0[0] >> 16 | append0[1] << 16;
13525 w1[0] = append0[1] >> 16 | append0[2] << 16;
13526 w1[1] = append0[2] >> 16 | append0[3] << 16;
13527 w1[2] = append0[3] >> 16 | append1[0] << 16;
13528 w1[3] = append1[0] >> 16 | append1[1] << 16;
13529 w2[0] = append1[1] >> 16 | append1[2] << 16;
13530 w2[1] = append1[2] >> 16 | append1[3] << 16;
13531 w2[2] = append1[3] >> 16;
13532 break;
13533
13534 case 11:
13535 w0[2] = w0[2] | append0[0] << 24;
13536 w0[3] = append0[0] >> 8 | append0[1] << 24;
13537 w1[0] = append0[1] >> 8 | append0[2] << 24;
13538 w1[1] = append0[2] >> 8 | append0[3] << 24;
13539 w1[2] = append0[3] >> 8 | append1[0] << 24;
13540 w1[3] = append1[0] >> 8 | append1[1] << 24;
13541 w2[0] = append1[1] >> 8 | append1[2] << 24;
13542 w2[1] = append1[2] >> 8 | append1[3] << 24;
13543 w2[2] = append1[3] >> 8;
13544 break;
13545
13546 case 12:
13547 w0[3] = append0[0];
13548 w1[0] = append0[1];
13549 w1[1] = append0[2];
13550 w1[2] = append0[3];
13551 w1[3] = append1[0];
13552 w2[0] = append1[1];
13553 w2[1] = append1[2];
13554 w2[2] = append1[3];
13555 break;
13556
13557 case 13:
13558 w0[3] = w0[3] | append0[0] << 8;
13559 w1[0] = append0[0] >> 24 | append0[1] << 8;
13560 w1[1] = append0[1] >> 24 | append0[2] << 8;
13561 w1[2] = append0[2] >> 24 | append0[3] << 8;
13562 w1[3] = append0[3] >> 24 | append1[0] << 8;
13563 w2[0] = append1[0] >> 24 | append1[1] << 8;
13564 w2[1] = append1[1] >> 24 | append1[2] << 8;
13565 w2[2] = append1[2] >> 24 | append1[3] << 8;
13566 w2[3] = append1[3] >> 24;
13567 break;
13568
13569 case 14:
13570 w0[3] = w0[3] | append0[0] << 16;
13571 w1[0] = append0[0] >> 16 | append0[1] << 16;
13572 w1[1] = append0[1] >> 16 | append0[2] << 16;
13573 w1[2] = append0[2] >> 16 | append0[3] << 16;
13574 w1[3] = append0[3] >> 16 | append1[0] << 16;
13575 w2[0] = append1[0] >> 16 | append1[1] << 16;
13576 w2[1] = append1[1] >> 16 | append1[2] << 16;
13577 w2[2] = append1[2] >> 16 | append1[3] << 16;
13578 w2[3] = append1[3] >> 16;
13579 break;
13580
13581 case 15:
13582 w0[3] = w0[3] | append0[0] << 24;
13583 w1[0] = append0[0] >> 8 | append0[1] << 24;
13584 w1[1] = append0[1] >> 8 | append0[2] << 24;
13585 w1[2] = append0[2] >> 8 | append0[3] << 24;
13586 w1[3] = append0[3] >> 8 | append1[0] << 24;
13587 w2[0] = append1[0] >> 8 | append1[1] << 24;
13588 w2[1] = append1[1] >> 8 | append1[2] << 24;
13589 w2[2] = append1[2] >> 8 | append1[3] << 24;
13590 w2[3] = append1[3] >> 8;
13591 break;
13592
13593 case 16:
13594 w1[0] = append0[0];
13595 w1[1] = append0[1];
13596 w1[2] = append0[2];
13597 w1[3] = append0[3];
13598 w2[0] = append1[0];
13599 w2[1] = append1[1];
13600 w2[2] = append1[2];
13601 w2[3] = append1[3];
13602 break;
13603
13604 case 17:
13605 w1[0] = w1[0] | append0[0] << 8;
13606 w1[1] = append0[0] >> 24 | append0[1] << 8;
13607 w1[2] = append0[1] >> 24 | append0[2] << 8;
13608 w1[3] = append0[2] >> 24 | append0[3] << 8;
13609 w2[0] = append0[3] >> 24 | append1[0] << 8;
13610 w2[1] = append1[0] >> 24 | append1[1] << 8;
13611 w2[2] = append1[1] >> 24 | append1[2] << 8;
13612 w2[3] = append1[2] >> 24 | append1[3] << 8;
13613 w3[0] = append1[3] >> 24;
13614 break;
13615
13616 case 18:
13617 w1[0] = w1[0] | append0[0] << 16;
13618 w1[1] = append0[0] >> 16 | append0[1] << 16;
13619 w1[2] = append0[1] >> 16 | append0[2] << 16;
13620 w1[3] = append0[2] >> 16 | append0[3] << 16;
13621 w2[0] = append0[3] >> 16 | append1[0] << 16;
13622 w2[1] = append1[0] >> 16 | append1[1] << 16;
13623 w2[2] = append1[1] >> 16 | append1[2] << 16;
13624 w2[3] = append1[2] >> 16 | append1[3] << 16;
13625 w3[0] = append1[3] >> 16;
13626 break;
13627
13628 case 19:
13629 w1[0] = w1[0] | append0[0] << 24;
13630 w1[1] = append0[0] >> 8 | append0[1] << 24;
13631 w1[2] = append0[1] >> 8 | append0[2] << 24;
13632 w1[3] = append0[2] >> 8 | append0[3] << 24;
13633 w2[0] = append0[3] >> 8 | append1[0] << 24;
13634 w2[1] = append1[0] >> 8 | append1[1] << 24;
13635 w2[2] = append1[1] >> 8 | append1[2] << 24;
13636 w2[3] = append1[2] >> 8 | append1[3] << 24;
13637 w3[0] = append1[3] >> 8;
13638 break;
13639
13640 case 20:
13641 w1[1] = append0[0];
13642 w1[2] = append0[1];
13643 w1[3] = append0[2];
13644 w2[0] = append0[3];
13645 w2[1] = append1[0];
13646 w2[2] = append1[1];
13647 w2[3] = append1[2];
13648 w3[0] = append1[3];
13649 break;
13650
13651 case 21:
13652 w1[1] = w1[1] | append0[0] << 8;
13653 w1[2] = append0[0] >> 24 | append0[1] << 8;
13654 w1[3] = append0[1] >> 24 | append0[2] << 8;
13655 w2[0] = append0[2] >> 24 | append0[3] << 8;
13656 w2[1] = append0[3] >> 24 | append1[0] << 8;
13657 w2[2] = append1[0] >> 24 | append1[1] << 8;
13658 w2[3] = append1[1] >> 24 | append1[2] << 8;
13659 w3[0] = append1[2] >> 24 | append1[3] << 8;
13660 w3[1] = append1[3] >> 24;
13661 break;
13662
13663 case 22:
13664 w1[1] = w1[1] | append0[0] << 16;
13665 w1[2] = append0[0] >> 16 | append0[1] << 16;
13666 w1[3] = append0[1] >> 16 | append0[2] << 16;
13667 w2[0] = append0[2] >> 16 | append0[3] << 16;
13668 w2[1] = append0[3] >> 16 | append1[0] << 16;
13669 w2[2] = append1[0] >> 16 | append1[1] << 16;
13670 w2[3] = append1[1] >> 16 | append1[2] << 16;
13671 w3[0] = append1[2] >> 16 | append1[3] << 16;
13672 w3[1] = append1[3] >> 16;
13673 break;
13674
13675 case 23:
13676 w1[1] = w1[1] | append0[0] << 24;
13677 w1[2] = append0[0] >> 8 | append0[1] << 24;
13678 w1[3] = append0[1] >> 8 | append0[2] << 24;
13679 w2[0] = append0[2] >> 8 | append0[3] << 24;
13680 w2[1] = append0[3] >> 8 | append1[0] << 24;
13681 w2[2] = append1[0] >> 8 | append1[1] << 24;
13682 w2[3] = append1[1] >> 8 | append1[2] << 24;
13683 w3[0] = append1[2] >> 8 | append1[3] << 24;
13684 w3[1] = append1[3] >> 8;
13685 break;
13686
13687 case 24:
13688 w1[2] = append0[0];
13689 w1[3] = append0[1];
13690 w2[0] = append0[2];
13691 w2[1] = append0[3];
13692 w2[2] = append1[0];
13693 w2[3] = append1[1];
13694 w3[0] = append1[2];
13695 w3[1] = append1[3];
13696 break;
13697
13698 case 25:
13699 w1[2] = w1[2] | append0[0] << 8;
13700 w1[3] = append0[0] >> 24 | append0[1] << 8;
13701 w2[0] = append0[1] >> 24 | append0[2] << 8;
13702 w2[1] = append0[2] >> 24 | append0[3] << 8;
13703 w2[2] = append0[3] >> 24 | append1[0] << 8;
13704 w2[3] = append1[0] >> 24 | append1[1] << 8;
13705 w3[0] = append1[1] >> 24 | append1[2] << 8;
13706 w3[1] = append1[2] >> 24 | append1[3] << 8;
13707 break;
13708
13709 case 26:
13710 w1[2] = w1[2] | append0[0] << 16;
13711 w1[3] = append0[0] >> 16 | append0[1] << 16;
13712 w2[0] = append0[1] >> 16 | append0[2] << 16;
13713 w2[1] = append0[2] >> 16 | append0[3] << 16;
13714 w2[2] = append0[3] >> 16 | append1[0] << 16;
13715 w2[3] = append1[0] >> 16 | append1[1] << 16;
13716 w3[0] = append1[1] >> 16 | append1[2] << 16;
13717 w3[1] = append1[2] >> 16 | append1[3] << 16;
13718 break;
13719
13720 case 27:
13721 w1[2] = w1[2] | append0[0] << 24;
13722 w1[3] = append0[0] >> 8 | append0[1] << 24;
13723 w2[0] = append0[1] >> 8 | append0[2] << 24;
13724 w2[1] = append0[2] >> 8 | append0[3] << 24;
13725 w2[2] = append0[3] >> 8 | append1[0] << 24;
13726 w2[3] = append1[0] >> 8 | append1[1] << 24;
13727 w3[0] = append1[1] >> 8 | append1[2] << 24;
13728 w3[1] = append1[2] >> 8 | append1[3] << 24;
13729 break;
13730
13731 case 28:
13732 w1[3] = append0[0];
13733 w2[0] = append0[1];
13734 w2[1] = append0[2];
13735 w2[2] = append0[3];
13736 w2[3] = append1[0];
13737 w3[0] = append1[1];
13738 w3[1] = append1[2];
13739 break;
13740
13741 case 29:
13742 w1[3] = w1[3] | append0[0] << 8;
13743 w2[0] = append0[0] >> 24 | append0[1] << 8;
13744 w2[1] = append0[1] >> 24 | append0[2] << 8;
13745 w2[2] = append0[2] >> 24 | append0[3] << 8;
13746 w2[3] = append0[3] >> 24 | append1[0] << 8;
13747 w3[0] = append1[0] >> 24 | append1[1] << 8;
13748 w3[1] = append1[1] >> 24 | append1[2] << 8;
13749 break;
13750
13751 case 30:
13752 w1[3] = w1[3] | append0[0] << 16;
13753 w2[0] = append0[0] >> 16 | append0[1] << 16;
13754 w2[1] = append0[1] >> 16 | append0[2] << 16;
13755 w2[2] = append0[2] >> 16 | append0[3] << 16;
13756 w2[3] = append0[3] >> 16 | append1[0] << 16;
13757 w3[0] = append1[0] >> 16 | append1[1] << 16;
13758 w3[1] = append1[1] >> 16 | append1[2] << 16;
13759 break;
13760
13761 case 31:
13762 w1[3] = w1[3] | append0[0] << 24;
13763 w2[0] = append0[0] >> 8 | append0[1] << 24;
13764 w2[1] = append0[1] >> 8 | append0[2] << 24;
13765 w2[2] = append0[2] >> 8 | append0[3] << 24;
13766 w2[3] = append0[3] >> 8 | append1[0] << 24;
13767 w3[0] = append1[0] >> 8 | append1[1] << 24;
13768 w3[1] = append1[1] >> 8 | append1[2] << 24;
13769 break;
13770
13771 case 32:
13772 w2[0] = append0[0];
13773 w2[1] = append0[1];
13774 w2[2] = append0[2];
13775 w2[3] = append0[3];
13776 w3[0] = append1[0];
13777 w3[1] = append1[1];
13778 break;
13779 }
13780 }
13781
/**
 * Places nine 32-bit words -- append0[0..3], append1[0..3] and append2[0] --
 * into the word buffer w0|w1|w2|w3, starting at byte position `offset`.
 * Byte 0 of a word is its least-significant byte.
 *
 * - Only offsets 0..32 are handled; any other value leaves the buffers as is.
 * - If `offset` is a multiple of 4, every touched word is overwritten.
 * - Otherwise the first touched word keeps its current content and gets the
 *   leading bytes OR-ed in; all following words are overwritten, including
 *   one extra word that receives the spilled high bytes of append2[0].
 * - Nothing is ever stored past w3[1]; whatever would land in w3[2] or
 *   w3[3] is dropped (presumably those words are reserved by the callers
 *   -- TODO confirm).
 * - append2[1..3] are never read.
 */
__device__ static void memcat32_9 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
{
  // offsets above 32 are not handled at all
  if (offset > 32) return;

  const u32 word_pos = offset / 4;
  const u32 byte_pos = offset % 4;

  const u32 aligned = (byte_pos == 0);

  /**
   * step 1: shift the nine input words by the sub-word byte position.
   * t[0] is the part that goes into the first destination word,
   * t[9] is the spill-over of append2[0] (unaligned offsets only).
   */

  u32 t[10];

  if (aligned)
  {
    t[0] = append0[0];
    t[1] = append0[1];
    t[2] = append0[2];
    t[3] = append0[3];
    t[4] = append1[0];
    t[5] = append1[1];
    t[6] = append1[2];
    t[7] = append1[3];
    t[8] = append2[0];
    t[9] = 0; // never stored for aligned offsets
  }
  else
  {
    const u32 up   = byte_pos * 8; // 8, 16 or 24
    const u32 down = 32 - up;      // 24, 16 or 8

    t[0] =                      append0[0] << up;
    t[1] = append0[0] >> down | append0[1] << up;
    t[2] = append0[1] >> down | append0[2] << up;
    t[3] = append0[2] >> down | append0[3] << up;
    t[4] = append0[3] >> down | append1[0] << up;
    t[5] = append1[0] >> down | append1[1] << up;
    t[6] = append1[1] >> down | append1[2] << up;
    t[7] = append1[2] >> down | append1[3] << up;
    t[8] = append1[3] >> down | append2[0] << up;
    t[9] = append2[0] >> down;
  }

  /**
   * step 2: store the shifted words at the destination word position.
   * The first word is merged (unaligned) or replaced (aligned); the
   * spill-over word t[9] is stored only for unaligned offsets and only
   * while it still fits in front of w3[2].
   */

  switch (word_pos)
  {
    case 0:
      if (aligned) w0[0] = t[0]; else w0[0] = w0[0] | t[0];

      w0[1] = t[1];
      w0[2] = t[2];
      w0[3] = t[3];
      w1[0] = t[4];
      w1[1] = t[5];
      w1[2] = t[6];
      w1[3] = t[7];
      w2[0] = t[8];

      if (!aligned) w2[1] = t[9];

      break;

    case 1:
      if (aligned) w0[1] = t[0]; else w0[1] = w0[1] | t[0];

      w0[2] = t[1];
      w0[3] = t[2];
      w1[0] = t[3];
      w1[1] = t[4];
      w1[2] = t[5];
      w1[3] = t[6];
      w2[0] = t[7];
      w2[1] = t[8];

      if (!aligned) w2[2] = t[9];

      break;

    case 2:
      if (aligned) w0[2] = t[0]; else w0[2] = w0[2] | t[0];

      w0[3] = t[1];
      w1[0] = t[2];
      w1[1] = t[3];
      w1[2] = t[4];
      w1[3] = t[5];
      w2[0] = t[6];
      w2[1] = t[7];
      w2[2] = t[8];

      if (!aligned) w2[3] = t[9];

      break;

    case 3:
      if (aligned) w0[3] = t[0]; else w0[3] = w0[3] | t[0];

      w1[0] = t[1];
      w1[1] = t[2];
      w1[2] = t[3];
      w1[3] = t[4];
      w2[0] = t[5];
      w2[1] = t[6];
      w2[2] = t[7];
      w2[3] = t[8];

      if (!aligned) w3[0] = t[9];

      break;

    case 4:
      if (aligned) w1[0] = t[0]; else w1[0] = w1[0] | t[0];

      w1[1] = t[1];
      w1[2] = t[2];
      w1[3] = t[3];
      w2[0] = t[4];
      w2[1] = t[5];
      w2[2] = t[6];
      w2[3] = t[7];
      w3[0] = t[8];

      if (!aligned) w3[1] = t[9];

      break;

    case 5:
      if (aligned) w1[1] = t[0]; else w1[1] = w1[1] | t[0];

      w1[2] = t[1];
      w1[3] = t[2];
      w2[0] = t[3];
      w2[1] = t[4];
      w2[2] = t[5];
      w2[3] = t[6];
      w3[0] = t[7];
      w3[1] = t[8];

      // t[9] would land in w3[2]: dropped

      break;

    case 6:
      if (aligned) w1[2] = t[0]; else w1[2] = w1[2] | t[0];

      w1[3] = t[1];
      w2[0] = t[2];
      w2[1] = t[3];
      w2[2] = t[4];
      w2[3] = t[5];
      w3[0] = t[6];
      w3[1] = t[7];

      // t[8] and t[9] are dropped

      break;

    case 7:
      if (aligned) w1[3] = t[0]; else w1[3] = w1[3] | t[0];

      w2[0] = t[1];
      w2[1] = t[2];
      w2[2] = t[3];
      w2[3] = t[4];
      w3[0] = t[5];
      w3[1] = t[6];

      // t[7], t[8] and t[9] are dropped

      break;

    case 8:
      // only reachable with offset == 32, which is word-aligned
      w2[0] = t[0];
      w2[1] = t[1];
      w2[2] = t[2];
      w2[3] = t[3];
      w3[0] = t[4];
      w3[1] = t[5];

      // t[6] .. t[9] are dropped

      break;
  }
}
14183
14184 __device__ static void switch_buffer_by_offset (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
14185 {
14186 #if __CUDA_ARCH__ >= 200
14187
14188 const int offset_minus_4 = 4 - (offset % 4);
14189
14190 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
14191
14192 switch (offset / 4)
14193 {
14194 case 0:
14195 w3[1] = __byte_perm (w3[0], w3[1], selector);
14196 w3[0] = __byte_perm (w2[3], w3[0], selector);
14197 w2[3] = __byte_perm (w2[2], w2[3], selector);
14198 w2[2] = __byte_perm (w2[1], w2[2], selector);
14199 w2[1] = __byte_perm (w2[0], w2[1], selector);
14200 w2[0] = __byte_perm (w1[3], w2[0], selector);
14201 w1[3] = __byte_perm (w1[2], w1[3], selector);
14202 w1[2] = __byte_perm (w1[1], w1[2], selector);
14203 w1[1] = __byte_perm (w1[0], w1[1], selector);
14204 w1[0] = __byte_perm (w0[3], w1[0], selector);
14205 w0[3] = __byte_perm (w0[2], w0[3], selector);
14206 w0[2] = __byte_perm (w0[1], w0[2], selector);
14207 w0[1] = __byte_perm (w0[0], w0[1], selector);
14208 w0[0] = __byte_perm ( 0, w0[0], selector);
14209
14210 break;
14211
14212 case 1:
14213 w3[1] = __byte_perm (w2[3], w3[0], selector);
14214 w3[0] = __byte_perm (w2[2], w2[3], selector);
14215 w2[3] = __byte_perm (w2[1], w2[2], selector);
14216 w2[2] = __byte_perm (w2[0], w2[1], selector);
14217 w2[1] = __byte_perm (w1[3], w2[0], selector);
14218 w2[0] = __byte_perm (w1[2], w1[3], selector);
14219 w1[3] = __byte_perm (w1[1], w1[2], selector);
14220 w1[2] = __byte_perm (w1[0], w1[1], selector);
14221 w1[1] = __byte_perm (w0[3], w1[0], selector);
14222 w1[0] = __byte_perm (w0[2], w0[3], selector);
14223 w0[3] = __byte_perm (w0[1], w0[2], selector);
14224 w0[2] = __byte_perm (w0[0], w0[1], selector);
14225 w0[1] = __byte_perm ( 0, w0[0], selector);
14226 w0[0] = 0;
14227
14228 break;
14229
14230 case 2:
14231 w3[1] = __byte_perm (w2[2], w2[3], selector);
14232 w3[0] = __byte_perm (w2[1], w2[2], selector);
14233 w2[3] = __byte_perm (w2[0], w2[1], selector);
14234 w2[2] = __byte_perm (w1[3], w2[0], selector);
14235 w2[1] = __byte_perm (w1[2], w1[3], selector);
14236 w2[0] = __byte_perm (w1[1], w1[2], selector);
14237 w1[3] = __byte_perm (w1[0], w1[1], selector);
14238 w1[2] = __byte_perm (w0[3], w1[0], selector);
14239 w1[1] = __byte_perm (w0[2], w0[3], selector);
14240 w1[0] = __byte_perm (w0[1], w0[2], selector);
14241 w0[3] = __byte_perm (w0[0], w0[1], selector);
14242 w0[2] = __byte_perm ( 0, w0[0], selector);
14243 w0[1] = 0;
14244 w0[0] = 0;
14245
14246 break;
14247
14248 case 3:
14249 w3[1] = __byte_perm (w2[1], w2[2], selector);
14250 w3[0] = __byte_perm (w2[0], w2[1], selector);
14251 w2[3] = __byte_perm (w1[3], w2[0], selector);
14252 w2[2] = __byte_perm (w1[2], w1[3], selector);
14253 w2[1] = __byte_perm (w1[1], w1[2], selector);
14254 w2[0] = __byte_perm (w1[0], w1[1], selector);
14255 w1[3] = __byte_perm (w0[3], w1[0], selector);
14256 w1[2] = __byte_perm (w0[2], w0[3], selector);
14257 w1[1] = __byte_perm (w0[1], w0[2], selector);
14258 w1[0] = __byte_perm (w0[0], w0[1], selector);
14259 w0[3] = __byte_perm ( 0, w0[0], selector);
14260 w0[2] = 0;
14261 w0[1] = 0;
14262 w0[0] = 0;
14263
14264 break;
14265
14266 case 4:
14267 w3[1] = __byte_perm (w2[0], w2[1], selector);
14268 w3[0] = __byte_perm (w1[3], w2[0], selector);
14269 w2[3] = __byte_perm (w1[2], w1[3], selector);
14270 w2[2] = __byte_perm (w1[1], w1[2], selector);
14271 w2[1] = __byte_perm (w1[0], w1[1], selector);
14272 w2[0] = __byte_perm (w0[3], w1[0], selector);
14273 w1[3] = __byte_perm (w0[2], w0[3], selector);
14274 w1[2] = __byte_perm (w0[1], w0[2], selector);
14275 w1[1] = __byte_perm (w0[0], w0[1], selector);
14276 w1[0] = __byte_perm ( 0, w0[0], selector);
14277 w0[3] = 0;
14278 w0[2] = 0;
14279 w0[1] = 0;
14280 w0[0] = 0;
14281
14282 break;
14283
14284 case 5:
14285 w3[1] = __byte_perm (w1[3], w2[0], selector);
14286 w3[0] = __byte_perm (w1[2], w1[3], selector);
14287 w2[3] = __byte_perm (w1[1], w1[2], selector);
14288 w2[2] = __byte_perm (w1[0], w1[1], selector);
14289 w2[1] = __byte_perm (w0[3], w1[0], selector);
14290 w2[0] = __byte_perm (w0[2], w0[3], selector);
14291 w1[3] = __byte_perm (w0[1], w0[2], selector);
14292 w1[2] = __byte_perm (w0[0], w0[1], selector);
14293 w1[1] = __byte_perm ( 0, w0[0], selector);
14294 w1[0] = 0;
14295 w0[3] = 0;
14296 w0[2] = 0;
14297 w0[1] = 0;
14298 w0[0] = 0;
14299
14300 break;
14301
14302 case 6:
14303 w3[1] = __byte_perm (w1[2], w1[3], selector);
14304 w3[0] = __byte_perm (w1[1], w1[2], selector);
14305 w2[3] = __byte_perm (w1[0], w1[1], selector);
14306 w2[2] = __byte_perm (w0[3], w1[0], selector);
14307 w2[1] = __byte_perm (w0[2], w0[3], selector);
14308 w2[0] = __byte_perm (w0[1], w0[2], selector);
14309 w1[3] = __byte_perm (w0[0], w0[1], selector);
14310 w1[2] = __byte_perm ( 0, w0[0], selector);
14311 w1[1] = 0;
14312 w1[0] = 0;
14313 w0[3] = 0;
14314 w0[2] = 0;
14315 w0[1] = 0;
14316 w0[0] = 0;
14317
14318 break;
14319
14320 case 7:
14321 w3[1] = __byte_perm (w1[1], w1[2], selector);
14322 w3[0] = __byte_perm (w1[0], w1[1], selector);
14323 w2[3] = __byte_perm (w0[3], w1[0], selector);
14324 w2[2] = __byte_perm (w0[2], w0[3], selector);
14325 w2[1] = __byte_perm (w0[1], w0[2], selector);
14326 w2[0] = __byte_perm (w0[0], w0[1], selector);
14327 w1[3] = __byte_perm ( 0, w0[0], selector);
14328 w1[2] = 0;
14329 w1[1] = 0;
14330 w1[0] = 0;
14331 w0[3] = 0;
14332 w0[2] = 0;
14333 w0[1] = 0;
14334 w0[0] = 0;
14335
14336 break;
14337
14338 case 8:
14339 w3[1] = __byte_perm (w1[0], w1[1], selector);
14340 w3[0] = __byte_perm (w0[3], w1[0], selector);
14341 w2[3] = __byte_perm (w0[2], w0[3], selector);
14342 w2[2] = __byte_perm (w0[1], w0[2], selector);
14343 w2[1] = __byte_perm (w0[0], w0[1], selector);
14344 w2[0] = __byte_perm ( 0, w0[0], selector);
14345 w1[3] = 0;
14346 w1[2] = 0;
14347 w1[1] = 0;
14348 w1[0] = 0;
14349 w0[3] = 0;
14350 w0[2] = 0;
14351 w0[1] = 0;
14352 w0[0] = 0;
14353
14354 break;
14355
14356 case 9:
14357 w3[1] = __byte_perm (w0[3], w1[0], selector);
14358 w3[0] = __byte_perm (w0[2], w0[3], selector);
14359 w2[3] = __byte_perm (w0[1], w0[2], selector);
14360 w2[2] = __byte_perm (w0[0], w0[1], selector);
14361 w2[1] = __byte_perm ( 0, w0[0], selector);
14362 w2[0] = 0;
14363 w1[3] = 0;
14364 w1[2] = 0;
14365 w1[1] = 0;
14366 w1[0] = 0;
14367 w0[3] = 0;
14368 w0[2] = 0;
14369 w0[1] = 0;
14370 w0[0] = 0;
14371
14372 break;
14373
14374 case 10:
14375 w3[1] = __byte_perm (w0[2], w0[3], selector);
14376 w3[0] = __byte_perm (w0[1], w0[2], selector);
14377 w2[3] = __byte_perm (w0[0], w0[1], selector);
14378 w2[2] = __byte_perm ( 0, w0[0], selector);
14379 w2[1] = 0;
14380 w2[0] = 0;
14381 w1[3] = 0;
14382 w1[2] = 0;
14383 w1[1] = 0;
14384 w1[0] = 0;
14385 w0[3] = 0;
14386 w0[2] = 0;
14387 w0[1] = 0;
14388 w0[0] = 0;
14389
14390 break;
14391
14392 case 11:
14393 w3[1] = __byte_perm (w0[1], w0[2], selector);
14394 w3[0] = __byte_perm (w0[0], w0[1], selector);
14395 w2[3] = __byte_perm ( 0, w0[0], selector);
14396 w2[2] = 0;
14397 w2[1] = 0;
14398 w2[0] = 0;
14399 w1[3] = 0;
14400 w1[2] = 0;
14401 w1[1] = 0;
14402 w1[0] = 0;
14403 w0[3] = 0;
14404 w0[2] = 0;
14405 w0[1] = 0;
14406 w0[0] = 0;
14407
14408 break;
14409
14410 case 12:
14411 w3[1] = __byte_perm (w0[0], w0[1], selector);
14412 w3[0] = __byte_perm ( 0, w0[0], selector);
14413 w2[3] = 0;
14414 w2[2] = 0;
14415 w2[1] = 0;
14416 w2[0] = 0;
14417 w1[3] = 0;
14418 w1[2] = 0;
14419 w1[1] = 0;
14420 w1[0] = 0;
14421 w0[3] = 0;
14422 w0[2] = 0;
14423 w0[1] = 0;
14424 w0[0] = 0;
14425
14426 break;
14427
14428 case 13:
14429 w3[1] = __byte_perm ( 0, w0[0], selector);
14430 w3[0] = 0;
14431 w2[3] = 0;
14432 w2[2] = 0;
14433 w2[1] = 0;
14434 w2[0] = 0;
14435 w1[3] = 0;
14436 w1[2] = 0;
14437 w1[1] = 0;
14438 w1[0] = 0;
14439 w0[3] = 0;
14440 w0[2] = 0;
14441 w0[1] = 0;
14442 w0[0] = 0;
14443
14444 break;
14445 }
14446
14447 #else
14448
14449 u32x tmp0[4];
14450 u32x tmp1[4];
14451 u32x tmp2[1];
14452
14453 switch (offset % 4)
14454 {
14455 case 0:
14456 tmp0[0] = w0[0];
14457 tmp0[1] = w0[1];
14458 tmp0[2] = w0[2];
14459 tmp0[3] = w0[3];
14460 tmp1[0] = w1[0];
14461 tmp1[1] = w1[1];
14462 tmp1[2] = w1[2];
14463 tmp1[3] = w1[3];
14464 tmp2[0] = 0;
14465 break;
14466
14467 case 1:
14468 tmp0[0] = w0[0] << 8;
14469 tmp0[1] = w0[0] >> 24 | w0[1] << 8;
14470 tmp0[2] = w0[1] >> 24 | w0[2] << 8;
14471 tmp0[3] = w0[2] >> 24 | w0[3] << 8;
14472 tmp1[0] = w0[3] >> 24 | w1[0] << 8;
14473 tmp1[1] = w1[0] >> 24 | w1[1] << 8;
14474 tmp1[2] = w1[1] >> 24 | w1[2] << 8;
14475 tmp1[3] = w1[2] >> 24 | w1[3] << 8;
14476 tmp2[0] = w1[3] >> 24;
14477 break;
14478
14479 case 2:
14480 tmp0[0] = w0[0] << 16;
14481 tmp0[1] = w0[0] >> 16 | w0[1] << 16;
14482 tmp0[2] = w0[1] >> 16 | w0[2] << 16;
14483 tmp0[3] = w0[2] >> 16 | w0[3] << 16;
14484 tmp1[0] = w0[3] >> 16 | w1[0] << 16;
14485 tmp1[1] = w1[0] >> 16 | w1[1] << 16;
14486 tmp1[2] = w1[1] >> 16 | w1[2] << 16;
14487 tmp1[3] = w1[2] >> 16 | w1[3] << 16;
14488 tmp2[0] = w1[3] >> 16;
14489 break;
14490
14491 case 3:
14492 tmp0[0] = w0[0] << 24;
14493 tmp0[1] = w0[0] >> 8 | w0[1] << 24;
14494 tmp0[2] = w0[1] >> 8 | w0[2] << 24;
14495 tmp0[3] = w0[2] >> 8 | w0[3] << 24;
14496 tmp1[0] = w0[3] >> 8 | w1[0] << 24;
14497 tmp1[1] = w1[0] >> 8 | w1[1] << 24;
14498 tmp1[2] = w1[1] >> 8 | w1[2] << 24;
14499 tmp1[3] = w1[2] >> 8 | w1[3] << 24;
14500 tmp2[0] = w1[3] >> 8;
14501 break;
14502 }
14503
14504 switch (offset / 4)
14505 {
14506 case 0:
14507 w0[0] = tmp0[0];
14508 w0[1] = tmp0[1];
14509 w0[2] = tmp0[2];
14510 w0[3] = tmp0[3];
14511 w1[0] = tmp1[0];
14512 w1[1] = tmp1[1];
14513 w1[2] = tmp1[2];
14514 w1[3] = tmp1[3];
14515 w2[0] = tmp2[0];
14516 break;
14517
14518 case 1:
14519 w0[0] = 0;
14520 w0[1] = tmp0[0];
14521 w0[2] = tmp0[1];
14522 w0[3] = tmp0[2];
14523 w1[0] = tmp0[3];
14524 w1[1] = tmp1[0];
14525 w1[2] = tmp1[1];
14526 w1[3] = tmp1[2];
14527 w2[0] = tmp1[3];
14528 w2[1] = tmp2[0];
14529 break;
14530
14531 case 2:
14532 w0[0] = 0;
14533 w0[1] = 0;
14534 w0[2] = tmp0[0];
14535 w0[3] = tmp0[1];
14536 w1[0] = tmp0[2];
14537 w1[1] = tmp0[3];
14538 w1[2] = tmp1[0];
14539 w1[3] = tmp1[1];
14540 w2[0] = tmp1[2];
14541 w2[1] = tmp1[3];
14542 w2[2] = tmp2[0];
14543 break;
14544
14545 case 3:
14546 w0[0] = 0;
14547 w0[1] = 0;
14548 w0[2] = 0;
14549 w0[3] = tmp0[0];
14550 w1[0] = tmp0[1];
14551 w1[1] = tmp0[2];
14552 w1[2] = tmp0[3];
14553 w1[3] = tmp1[0];
14554 w2[0] = tmp1[1];
14555 w2[1] = tmp1[2];
14556 w2[2] = tmp1[3];
14557 w2[3] = tmp2[0];
14558 break;
14559
14560 case 4:
14561 w0[0] = 0;
14562 w0[1] = 0;
14563 w0[2] = 0;
14564 w0[3] = 0;
14565 w1[0] = tmp0[0];
14566 w1[1] = tmp0[1];
14567 w1[2] = tmp0[2];
14568 w1[3] = tmp0[3];
14569 w2[0] = tmp1[0];
14570 w2[1] = tmp1[1];
14571 w2[2] = tmp1[2];
14572 w2[3] = tmp1[3];
14573 w3[0] = tmp2[0];
14574 break;
14575
14576 case 5:
14577 w0[0] = 0;
14578 w0[1] = 0;
14579 w0[2] = 0;
14580 w0[3] = 0;
14581 w1[0] = 0;
14582 w1[1] = tmp0[0];
14583 w1[2] = tmp0[1];
14584 w1[3] = tmp0[2];
14585 w2[0] = tmp0[3];
14586 w2[1] = tmp1[0];
14587 w2[2] = tmp1[1];
14588 w2[3] = tmp1[2];
14589 w3[0] = tmp1[3];
14590 w3[1] = tmp2[0];
14591 break;
14592
14593 case 6:
14594 w0[0] = 0;
14595 w0[1] = 0;
14596 w0[2] = 0;
14597 w0[3] = 0;
14598 w1[0] = 0;
14599 w1[1] = 0;
14600 w1[2] = tmp0[0];
14601 w1[3] = tmp0[1];
14602 w2[0] = tmp0[2];
14603 w2[1] = tmp0[3];
14604 w2[2] = tmp1[0];
14605 w2[3] = tmp1[1];
14606 w3[0] = tmp1[2];
14607 w3[1] = tmp1[3];
14608 w3[2] = tmp2[0];
14609 break;
14610
14611 case 7:
14612 w0[0] = 0;
14613 w0[1] = 0;
14614 w0[2] = 0;
14615 w0[3] = 0;
14616 w1[0] = 0;
14617 w1[1] = 0;
14618 w1[2] = 0;
14619 w1[3] = tmp0[0];
14620 w2[0] = tmp0[1];
14621 w2[1] = tmp0[2];
14622 w2[2] = tmp0[3];
14623 w2[3] = tmp1[0];
14624 w3[0] = tmp1[1];
14625 w3[1] = tmp1[2];
14626 w3[2] = tmp1[3];
14627 w3[3] = tmp2[0];
14628 break;
14629
14630 case 8:
14631 w0[0] = 0;
14632 w0[1] = 0;
14633 w0[2] = 0;
14634 w0[3] = 0;
14635 w1[0] = 0;
14636 w1[1] = 0;
14637 w1[2] = 0;
14638 w1[3] = 0;
14639 w2[0] = tmp0[0];
14640 w2[1] = tmp0[1];
14641 w2[2] = tmp0[2];
14642 w2[3] = tmp0[3];
14643 w3[0] = tmp1[0];
14644 w3[1] = tmp1[1];
14645 w3[2] = tmp1[2];
14646 w3[3] = tmp1[3];
14647 break;
14648
14649 case 9:
14650 w0[0] = 0;
14651 w0[1] = 0;
14652 w0[2] = 0;
14653 w0[3] = 0;
14654 w1[0] = 0;
14655 w1[1] = 0;
14656 w1[2] = 0;
14657 w1[3] = 0;
14658 w2[0] = 0;
14659 w2[1] = tmp0[0];
14660 w2[2] = tmp0[1];
14661 w2[3] = tmp0[2];
14662 w3[0] = tmp0[3];
14663 w3[1] = tmp1[0];
14664 w3[2] = tmp1[1];
14665 w3[3] = tmp1[2];
14666 break;
14667
14668 case 10:
14669 w0[0] = 0;
14670 w0[1] = 0;
14671 w0[2] = 0;
14672 w0[3] = 0;
14673 w1[0] = 0;
14674 w1[1] = 0;
14675 w1[2] = 0;
14676 w1[3] = 0;
14677 w2[0] = 0;
14678 w2[1] = 0;
14679 w2[2] = tmp0[0];
14680 w2[3] = tmp0[1];
14681 w3[0] = tmp0[2];
14682 w3[1] = tmp0[3];
14683 w3[2] = tmp1[0];
14684 w3[3] = tmp1[1];
14685 break;
14686
14687 case 11:
14688 w0[0] = 0;
14689 w0[1] = 0;
14690 w0[2] = 0;
14691 w0[3] = 0;
14692 w1[0] = 0;
14693 w1[1] = 0;
14694 w1[2] = 0;
14695 w1[3] = 0;
14696 w2[0] = 0;
14697 w2[1] = 0;
14698 w2[2] = 0;
14699 w2[3] = tmp0[0];
14700 w3[0] = tmp0[1];
14701 w3[1] = tmp0[2];
14702 w3[2] = tmp0[3];
14703 w3[3] = tmp1[0];
14704 break;
14705
14706 case 12:
14707 w0[0] = 0;
14708 w0[1] = 0;
14709 w0[2] = 0;
14710 w0[3] = 0;
14711 w1[0] = 0;
14712 w1[1] = 0;
14713 w1[2] = 0;
14714 w1[3] = 0;
14715 w2[0] = 0;
14716 w2[1] = 0;
14717 w2[2] = 0;
14718 w2[3] = 0;
14719 w3[0] = tmp0[0];
14720 w3[1] = tmp0[1];
14721 w3[2] = tmp0[2];
14722 w3[3] = tmp0[3];
14723 break;
14724
14725 case 13:
14726 w0[0] = 0;
14727 w0[1] = 0;
14728 w0[2] = 0;
14729 w0[3] = 0;
14730 w1[0] = 0;
14731 w1[1] = 0;
14732 w1[2] = 0;
14733 w1[3] = 0;
14734 w2[0] = 0;
14735 w2[1] = 0;
14736 w2[2] = 0;
14737 w2[3] = 0;
14738 w3[0] = 0;
14739 w3[1] = tmp0[0];
14740 w3[2] = tmp0[1];
14741 w3[3] = tmp0[2];
14742 break;
14743
14744 }
14745
14746 #endif
14747 }
14748
/**
 * Moves the 14 words w0[0..3], w1[0..3], w2[0..3], w3[0..1] up by `offset`
 * bytes, in place. Presumably the "_be" suffix means the words are held in
 * big-endian byte order - confirm against the callers.
 *
 * The move is split into two parts:
 *   - offset & 3 : sub-word shift, done with __byte_perm on each word and
 *                  its lower neighbour
 *   - offset / 4 : whole-word shift, vacated low words are set to 0
 *
 * w3[2] and w3[3] are never written. If offset / 4 is larger than 13 none of
 * the buffers is modified.
 */
__device__ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
  // offset & 3 == 0 -> 0x3210, 1 -> 0x4321, 2 -> 0x5432, 3 -> 0x6543
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;

  // stage 1: sub-word shift. Every result combines a word (first operand)
  // with the word directly below it (second operand); the lowest word is
  // combined with 0. Results go to temporaries, so the order of the stores
  // in stage 2 does not matter.

  const u32x s00 = __byte_perm (w0[0],     0, selector);
  const u32x s01 = __byte_perm (w0[1], w0[0], selector);
  const u32x s02 = __byte_perm (w0[2], w0[1], selector);
  const u32x s03 = __byte_perm (w0[3], w0[2], selector);
  const u32x s04 = __byte_perm (w1[0], w0[3], selector);
  const u32x s05 = __byte_perm (w1[1], w1[0], selector);
  const u32x s06 = __byte_perm (w1[2], w1[1], selector);
  const u32x s07 = __byte_perm (w1[3], w1[2], selector);
  const u32x s08 = __byte_perm (w2[0], w1[3], selector);
  const u32x s09 = __byte_perm (w2[1], w2[0], selector);
  const u32x s10 = __byte_perm (w2[2], w2[1], selector);
  const u32x s11 = __byte_perm (w2[3], w2[2], selector);
  const u32x s12 = __byte_perm (w3[0], w2[3], selector);
  const u32x s13 = __byte_perm (w3[1], w3[0], selector);

  // stage 2: whole-word shift. The helper stores 14 values into
  // w0[0] .. w3[1] using constant indices only.

  #define SWITCH_BE_STORE(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1) \
    do                                                                            \
    {                                                                             \
      w0[0] = (a0); w0[1] = (a1); w0[2] = (a2); w0[3] = (a3);                     \
      w1[0] = (b0); w1[1] = (b1); w1[2] = (b2); w1[3] = (b3);                     \
      w2[0] = (c0); w2[1] = (c1); w2[2] = (c2); w2[3] = (c3);                     \
      w3[0] = (d0); w3[1] = (d1);                                                 \
    } while (0)

  switch (offset / 4)
  {
    //                       w0[0] [1] [2] [3]  w1[0] [1] [2] [3]  w2[0] [1] [2] [3]  w3[0] [1]
    case  0: SWITCH_BE_STORE (s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s10, s11, s12, s13); break;
    case  1: SWITCH_BE_STORE (  0, s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s10, s11, s12); break;
    case  2: SWITCH_BE_STORE (  0,   0, s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s10, s11); break;
    case  3: SWITCH_BE_STORE (  0,   0,   0, s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s10); break;
    case  4: SWITCH_BE_STORE (  0,   0,   0,   0, s00, s01, s02, s03, s04, s05, s06, s07, s08, s09); break;
    case  5: SWITCH_BE_STORE (  0,   0,   0,   0,   0, s00, s01, s02, s03, s04, s05, s06, s07, s08); break;
    case  6: SWITCH_BE_STORE (  0,   0,   0,   0,   0,   0, s00, s01, s02, s03, s04, s05, s06, s07); break;
    case  7: SWITCH_BE_STORE (  0,   0,   0,   0,   0,   0,   0, s00, s01, s02, s03, s04, s05, s06); break;
    case  8: SWITCH_BE_STORE (  0,   0,   0,   0,   0,   0,   0,   0, s00, s01, s02, s03, s04, s05); break;
    case  9: SWITCH_BE_STORE (  0,   0,   0,   0,   0,   0,   0,   0,   0, s00, s01, s02, s03, s04); break;
    case 10: SWITCH_BE_STORE (  0,   0,   0,   0,   0,   0,   0,   0,   0,   0, s00, s01, s02, s03); break;
    case 11: SWITCH_BE_STORE (  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, s00, s01, s02); break;
    case 12: SWITCH_BE_STORE (  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, s00, s01); break;
    case 13: SWITCH_BE_STORE (  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, s00); break;
  }

  #undef SWITCH_BE_STORE
}
14994
14995 #endif
14996
/**
 * Tells whether vector component `element` may be used for the iteration
 * `il_pos`.
 *
 * Returns 1 in every case except one: on VECT_SIZE2 / VECT_SIZE4 builds, when
 * il_pos + 1 equals bf_loops, components at index >= (bfs_cnt modulo the
 * vector width) yield 0. A remainder of 0 counts as the full width, so then
 * all components are accepted.
 *
 * On VECT_SIZE1 builds the arguments are ignored and the result is always 1.
 *
 * NOTE(review): presumably bfs_cnt is the total number of candidates and the
 * check masks out the unused tail of the last vector - confirm with callers.
 */
__device__ static u32 check_vector_accessible (const u32 il_pos, const u32 bf_loops, const u32 bfs_cnt, const u32 element)
{
  #ifndef VECT_SIZE1

  #ifdef VECT_SIZE2
  const u32 vect_width = 2;
  #endif

  #ifdef VECT_SIZE4
  const u32 vect_width = 4;
  #endif

  if (bf_loops == (il_pos + 1))
  {
    const u32 remainder = bfs_cnt % vect_width;

    // a remainder of 0 means every component is in use
    const u32 usable = (remainder != 0) ? remainder : vect_width;

    if (usable <= element) return 0;
  }

  #endif

  return 1;
}