2 * Author......: Jens Steube <jens.steube@gmail.com>
/**
 * Three-way comparison of a computed digest against a stored digest.
 *
 * Words are compared starting with d1[3] and ending with d1[0]; each d1[n] is
 * compared against d2[DGST_Rn]. The first pair that differs decides the result.
 *
 * d1 : four digest words
 * d2 : stored digest words, indexed through the DGST_R0..DGST_R3 macros
 *
 * Returns 1 if d1 is greater, -1 if d1 is smaller.
 * NOTE(review): the return for the all-equal case is not visible in this
 * extract; 0 is assumed because find_hash tests the result against 0 - confirm.
 */
__device__ static int device_memcmp (const u32 d1[4], const u32 *d2)
{
  u32 lhs;
  u32 rhs;

  lhs = d1[3]; rhs = d2[DGST_R3];
  if (lhs != rhs) return (lhs > rhs) ? 1 : -1;

  lhs = d1[2]; rhs = d2[DGST_R2];
  if (lhs != rhs) return (lhs > rhs) ? 1 : -1;

  lhs = d1[1]; rhs = d2[DGST_R1];
  if (lhs != rhs) return (lhs > rhs) ? 1 : -1;

  lhs = d1[0]; rhs = d2[DGST_R0];
  if (lhs != rhs) return (lhs > rhs) ? 1 : -1;

  return 0;
}
/**
 * Looks up `digest` in digests_buf[0 .. digests_cnt - 1] using device_memcmp.
 *
 * digest      : four digest words to look for
 * digests_cnt : number of entries in digests_buf
 * digests_buf : entries whose digest_buf member is compared against `digest`
 *
 * Returns the index of the entry for which device_memcmp yields 0.
 *
 * NOTE(review): only the loop header, the device_memcmp call and the
 * "cmp == 0" return are visible in this extract. The midpoint computation,
 * the step taken when cmp > 0 and the not-found return (-1) are restored here
 * as a halving search, which also presumes digests_buf is sorted in
 * device_memcmp order - confirm against the upstream file.
 */
__device__ static int find_hash (const u32 digest[4], const u32 digests_cnt, const digest_t *digests_buf)
{
  u32 base = 0;

  for (u32 span = digests_cnt; span; span >>= 1)
  {
    const u32 half  = span >> 1;
    const u32 probe = base + half;

    const int cmp = device_memcmp (digest, digests_buf[probe].digest_buf);

    if (cmp == 0) return (int) probe;

    if (cmp > 0)
    {
      // continue in the upper part, just past the probed entry
      base = probe + 1;

      span--;
    }
  }

  return -1;
}
/**
 * Tests a single bit of a bitmap.
 *
 * bitmap       : array of 32-bit bitmap words
 * bitmap_mask  : mask applied to the shifted digest to form the word index
 * bitmap_shift : right shift applied to the digest before masking
 * digest       : value being tested; its low 5 bits select the bit in the word
 *
 * Returns the selected bit isolated in place (non-zero) if it is set, else 0.
 */
__device__ static u32 check_bitmap (const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
{
  const u32 word_idx = (digest >> bitmap_shift) & bitmap_mask;

  // unsigned literal: shifting a signed 1 by 31 would shift into the sign bit
  const u32 bit = 1u << (digest & 0x1f);

  return (bitmap[word_idx] & bit);
}
/**
 * Runs the four digest words through two sets of four bitmaps.
 *
 * digest[n] is tested with check_bitmap against bitmap_s1_{a,b,c,d} using
 * bitmap_shift1 and then against bitmap_s2_{a,b,c,d} using bitmap_shift2.
 * The function returns 0 as soon as any of the eight tests yields 0.
 *
 * digest : four words are read (indices 0..3). The declared extent was [2],
 *          which did not match the indices used; as an array parameter it is
 *          the same pointer type either way, so callers are unaffected.
 *
 * NOTE(review): the value returned when all eight tests pass is not visible
 * in this extract; 1 is assumed - confirm against the upstream file.
 */
__device__ static u32 check (const u32 digest[4], const u32 *bitmap_s1_a, const u32 *bitmap_s1_b, const u32 *bitmap_s1_c, const u32 *bitmap_s1_d, const u32 *bitmap_s2_a, const u32 *bitmap_s2_b, const u32 *bitmap_s2_c, const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
{
  // first bitmap set, first shift
  if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
  if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
  if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0);
  if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0);

  // second bitmap set, second shift
  if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0);
  if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0);
  if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0);
  if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0);

  return (1);
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored in gidvid as (gid * 1) + 0, i.e. unchanged
 * il_pos       : stored unchanged
 *
 * NOTE(review): this name is defined several times in this file with
 * different multipliers; presumably a preprocessor guard outside this extract
 * selects one of them - confirm.
 */
__device__ static void mark_hash_s0 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = il_pos;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored unchanged in gidvid
 * il_pos       : stored as (il_pos * 1) + 0, i.e. unchanged
 *
 * NOTE(review): this name is defined several times in this file with
 * different multipliers; presumably a preprocessor guard outside this extract
 * selects one of them - confirm.
 */
__device__ static void mark_hash_s0_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = il_pos;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored in gidvid as (gid * 2) + 0
 * il_pos       : stored unchanged
 *
 * NOTE(review): this name is defined several times in this file with
 * different multipliers; presumably a preprocessor guard outside this extract
 * selects one of them - confirm.
 */
__device__ static void mark_hash_s0 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid << 1;
  slot->il_pos = il_pos;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored in gidvid as (gid * 2) + 1
 * il_pos       : stored unchanged
 *
 * NOTE(review): this name is defined twice in this file with different
 * multipliers; presumably a preprocessor guard outside this extract selects
 * one of them - confirm.
 */
__device__ static void mark_hash_s1 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = (gid << 1) | 1;
  slot->il_pos = il_pos;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored unchanged in gidvid
 * il_pos       : stored as (il_pos * 2) + 0
 *
 * NOTE(review): this name is defined several times in this file with
 * different multipliers; presumably a preprocessor guard outside this extract
 * selects one of them - confirm.
 */
__device__ static void mark_hash_s0_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = il_pos << 1;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored unchanged in gidvid
 * il_pos       : stored as (il_pos * 2) + 1
 *
 * NOTE(review): this name is defined twice in this file with different
 * multipliers; presumably a preprocessor guard outside this extract selects
 * one of them - confirm.
 */
__device__ static void mark_hash_s1_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = (il_pos << 1) | 1;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored in gidvid as (gid * 4) + 0
 * il_pos       : stored unchanged
 *
 * NOTE(review): this name is defined several times in this file with
 * different multipliers; presumably a preprocessor guard outside this extract
 * selects one of them - confirm.
 */
__device__ static void mark_hash_s0 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid << 2;
  slot->il_pos = il_pos;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored in gidvid as (gid * 4) + 1
 * il_pos       : stored unchanged
 *
 * NOTE(review): this name is defined twice in this file with different
 * multipliers; presumably a preprocessor guard outside this extract selects
 * one of them - confirm.
 */
__device__ static void mark_hash_s1 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = (gid << 2) | 1;
  slot->il_pos = il_pos;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored in gidvid as (gid * 4) + 2
 * il_pos       : stored unchanged
 */
__device__ static void mark_hash_s2 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = (gid << 2) | 2;
  slot->il_pos = il_pos;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored in gidvid as (gid * 4) + 3
 * il_pos       : stored unchanged
 */
__device__ static void mark_hash_s3 (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = (gid << 2) | 3;
  slot->il_pos = il_pos;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored unchanged in gidvid
 * il_pos       : stored as (il_pos * 4) + 0
 *
 * NOTE(review): this name is defined several times in this file with
 * different multipliers; presumably a preprocessor guard outside this extract
 * selects one of them - confirm.
 */
__device__ static void mark_hash_s0_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = il_pos << 2;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored unchanged in gidvid
 * il_pos       : stored as (il_pos * 4) + 1
 *
 * NOTE(review): this name is defined twice in this file with different
 * multipliers; presumably a preprocessor guard outside this extract selects
 * one of them - confirm.
 */
__device__ static void mark_hash_s1_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = (il_pos << 2) | 1;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored unchanged in gidvid
 * il_pos       : stored as (il_pos * 4) + 2
 */
__device__ static void mark_hash_s2_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = (il_pos << 2) | 2;
}
/**
 * Flags slot `hash_pos` as shown and records the origin of the match.
 *
 * plains_buf   : result records; entry hash_pos is overwritten
 * hashes_shown : flags; entry hash_pos is set to 1
 * gid          : stored unchanged in gidvid
 * il_pos       : stored as (il_pos * 4) + 3
 */
__device__ static void mark_hash_s3_warp (plain_t *plains_buf, u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = (il_pos << 2) | 3;
}
/**
 * Reverses the byte order of a 32-bit value.
 *
 * When __CUDA_ARCH__ is 200 or higher the __byte_perm intrinsic is used with
 * selector 0x0123; otherwise the same result is assembled from shifts and
 * masks.
 *
 * NOTE(review): the #else / #endif lines separating the two variants are not
 * visible in this extract and are restored here.
 */
__device__ static u32 swap_workaround (const u32 v)
{
  #if __CUDA_ARCH__ >= 200

  return __byte_perm (v, 0, 0x0123);

  #else

  const u32 b0_to_b3 = v << 24;
  const u32 b1_to_b2 = (v & 0x0000FF00) << 8;
  const u32 b2_to_b1 = (v & 0x00FF0000) >> 8;
  const u32 b3_to_b0 = v >> 24;

  // the four parts occupy disjoint bytes, so OR equals the original sum
  return b0_to_b3 | b1_to_b2 | b2_to_b1 | b3_to_b0;

  #endif
}
/**
 * Reverses the byte order of a 64-bit value.
 *
 * Done in three rounds: neighbouring bytes are exchanged, then neighbouring
 * 16-bit groups, then the two 32-bit halves.
 */
__device__ static u64 swap_workaround (const u64 v)
{
  u64 r = v;

  r = ((r & 0x00ff00ff00ff00ffull) <<  8) | ((r >>  8) & 0x00ff00ff00ff00ffull);
  r = ((r & 0x0000ffff0000ffffull) << 16) | ((r >> 16) & 0x0000ffff0000ffffull);
  r = (r << 32) | (r >> 32);

  return r;
}
/**
 * Keeps the first `len` bytes of the 16-byte buffer w[0..3] (byte k of a word
 * lives in bits 8k..8k+7) and clears the rest.
 *
 * The word containing byte `len` is masked down to its low (len % 4) bytes,
 * matching the visible statements (0x000000FF / 0x0000FFFF / 0x00FFFFFF).
 *
 * NOTE(review): the visible extract only contains the masking of the partial
 * word for len % 4 != 0. Clearing of the words after it, the cases where len
 * is a multiple of 4, and leaving the buffer untouched for len >= 16 are
 * restored from the gaps in the embedded line numbering - confirm against the
 * upstream file.
 */
__device__ static void truncate_block (u32 w[4], const u32 len)
{
  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    const u32 first = i * 4;          // byte offset of the first byte of w[i]

    if (len <= first)
    {
      w[i] = 0;                       // whole word lies beyond the kept prefix
    }
    else if (len < first + 4)
    {
      // 1..3 bytes of this word are kept
      w[i] &= 0xFFFFFFFFu >> ((first + 4 - len) * 8);
    }
  }
}
/**
 * Widens each of the 16 input bytes to 16 bits with a zero upper byte.
 *
 * in[0], in[1] fill out1[0..3]; in[2], in[3] fill out2[0..3]. For an input
 * word with bytes b0..b3 (b0 least significant) the lower output word becomes
 * b0 | b1 << 16 and the upper output word b2 | b3 << 16.
 *
 * Stores are issued in the same order as before (out2[3] first, out1[0] last)
 * and every store re-reads its input word.
 *
 * NOTE(review): the #else / #endif lines separating the __byte_perm variant
 * from the shift/mask variant are not visible in this extract and are
 * restored here.
 */
__device__ static void make_unicode (const u32 in[4], u32 out1[4], u32 out2[4])
{
  #pragma unroll
  for (int i = 3; i >= 0; i--)
  {
    u32 *dst = (i & 2) ? out2 : out1;

    const int hi = ((i & 1) << 1) | 1;  // 3 for odd i, 1 for even i
    const int lo = hi - 1;

    #if __CUDA_ARCH__ >= 200
    dst[hi] = __byte_perm (in[i], 0, 0x7372);
    dst[lo] = __byte_perm (in[i], 0, 0x7170);
    #else
    dst[hi] = ((in[i] >> 8) & 0x00FF0000) | ((in[i] >> 16) & 0x000000FF);
    dst[lo] = ((in[i] << 8) & 0x00FF0000) | ((in[i] >>  0) & 0x000000FF);
    #endif
  }
}
/**
 * Packs bytes 0 and 2 of each input word back into dense 32-bit words.
 *
 * out[0] is built from in1[0], in1[1]; out[1] from in1[2], in1[3];
 * out[2] from in2[0], in2[1]; out[3] from in2[2], in2[3]. Bytes 1 and 3 of
 * the inputs are dropped.
 *
 * NOTE(review): the #else / #endif lines separating the __byte_perm variant
 * from the shift/mask variant are not visible in this extract and are
 * restored here.
 */
__device__ static void undo_unicode (const u32 in1[4], const u32 in2[4], u32 out[4])
{
  #pragma unroll
  for (int i = 0; i < 4; i++)
  {
    const u32 *src = (i & 2) ? in2 : in1;

    const u32 a = src[((i & 1) << 1) + 0];
    const u32 b = src[((i & 1) << 1) + 1];

    #if __CUDA_ARCH__ >= 200
    out[i] = __byte_perm (a, b, 0x6420);
    #else
    out[i] = ((a & 0x000000ff) >>  0) | ((a & 0x00ff0000) >>  8)
           | ((b & 0x000000ff) << 16) | ((b & 0x00ff0000) <<  8);
    #endif
  }
}
/**
 * Puts the byte 0x01 at byte position `offset` of the 16-byte buffer w0[0..3]
 * (byte k of a word lives in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x01), and offsets
 * >= 16 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x01_1 (u32 w0[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x01u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i) w0[i] = (w0[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x01 at byte position `offset` of the 32-byte buffer formed
 * by w0[0..3] followed by w1[0..3] (byte k of a word lives in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x01), and offsets
 * >= 32 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x01_2 (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x01u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i + 0) w0[i] = (w0[i] & keep) | mark;
    if (word == i + 4) w1[i] = (w1[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x01 at byte position `offset` of the 48-byte buffer formed
 * by w0, w1, w2 (four words each, in that order; byte k of a word lives in
 * bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x01), and offsets
 * >= 48 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x01_3 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x01u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i + 0) w0[i] = (w0[i] & keep) | mark;
    if (word == i + 4) w1[i] = (w1[i] & keep) | mark;
    if (word == i + 8) w2[i] = (w2[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x01 at byte position `offset` of the 64-byte buffer formed
 * by w0, w1, w2, w3 (four words each, in that order; byte k of a word lives
 * in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x01), and offsets
 * >= 64 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x01_4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x01u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i +  0) w0[i] = (w0[i] & keep) | mark;
    if (word == i +  4) w1[i] = (w1[i] & keep) | mark;
    if (word == i +  8) w2[i] = (w2[i] & keep) | mark;
    if (word == i + 12) w3[i] = (w3[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x01 at byte position `offset` of the 128-byte buffer formed
 * by w0 .. w7 (four words each, in that order; byte k of a word lives in
 * bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x01), and offsets
 * >= 128 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x01_8 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x01u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i +  0) w0[i] = (w0[i] & keep) | mark;
    if (word == i +  4) w1[i] = (w1[i] & keep) | mark;
    if (word == i +  8) w2[i] = (w2[i] & keep) | mark;
    if (word == i + 12) w3[i] = (w3[i] & keep) | mark;
    if (word == i + 16) w4[i] = (w4[i] & keep) | mark;
    if (word == i + 20) w5[i] = (w5[i] & keep) | mark;
    if (word == i + 24) w6[i] = (w6[i] & keep) | mark;
    if (word == i + 28) w7[i] = (w7[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x02 at byte position `offset` of the 16-byte buffer w0[0..3]
 * (byte k of a word lives in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x02), and offsets
 * >= 16 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x02_1 (u32 w0[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x02u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i) w0[i] = (w0[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x02 at byte position `offset` of the 32-byte buffer formed
 * by w0[0..3] followed by w1[0..3] (byte k of a word lives in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x02), and offsets
 * >= 32 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x02_2 (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x02u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i + 0) w0[i] = (w0[i] & keep) | mark;
    if (word == i + 4) w1[i] = (w1[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x02 at byte position `offset` of the 48-byte buffer formed
 * by w0, w1, w2 (four words each, in that order; byte k of a word lives in
 * bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x02), and offsets
 * >= 48 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x02_3 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x02u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i + 0) w0[i] = (w0[i] & keep) | mark;
    if (word == i + 4) w1[i] = (w1[i] & keep) | mark;
    if (word == i + 8) w2[i] = (w2[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x02 at byte position `offset` of the 64-byte buffer formed
 * by w0, w1, w2, w3 (four words each, in that order; byte k of a word lives
 * in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x02), and offsets
 * >= 64 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x02_4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x02u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i +  0) w0[i] = (w0[i] & keep) | mark;
    if (word == i +  4) w1[i] = (w1[i] & keep) | mark;
    if (word == i +  8) w2[i] = (w2[i] & keep) | mark;
    if (word == i + 12) w3[i] = (w3[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x02 at byte position `offset` of the 128-byte buffer formed
 * by w0 .. w7 (four words each, in that order; byte k of a word lives in
 * bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x02), and offsets
 * >= 128 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x02_8 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x02u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i +  0) w0[i] = (w0[i] & keep) | mark;
    if (word == i +  4) w1[i] = (w1[i] & keep) | mark;
    if (word == i +  8) w2[i] = (w2[i] & keep) | mark;
    if (word == i + 12) w3[i] = (w3[i] & keep) | mark;
    if (word == i + 16) w4[i] = (w4[i] & keep) | mark;
    if (word == i + 20) w5[i] = (w5[i] & keep) | mark;
    if (word == i + 24) w6[i] = (w6[i] & keep) | mark;
    if (word == i + 28) w7[i] = (w7[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x80 at byte position `offset` of the 16-byte buffer w0[0..3]
 * (byte k of a word lives in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x80), and offsets
 * >= 16 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x80_1 (u32 w0[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i) w0[i] = (w0[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x80 at byte position `offset` of the 32-byte buffer formed
 * by w0[0..3] followed by w1[0..3] (byte k of a word lives in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x80), and offsets
 * >= 32 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x80_2 (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i + 0) w0[i] = (w0[i] & keep) | mark;
    if (word == i + 4) w1[i] = (w1[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x80 at byte position `offset` of the 48-byte buffer formed
 * by w0, w1, w2 (four words each, in that order; byte k of a word lives in
 * bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x80), and offsets
 * >= 48 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x80_3 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i + 0) w0[i] = (w0[i] & keep) | mark;
    if (word == i + 4) w1[i] = (w1[i] & keep) | mark;
    if (word == i + 8) w2[i] = (w2[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x80 at byte position `offset` of the 64-byte buffer formed
 * by w0, w1, w2, w3 (four words each, in that order; byte k of a word lives
 * in bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x80), and offsets
 * >= 64 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x80_4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i +  0) w0[i] = (w0[i] & keep) | mark;
    if (word == i +  4) w1[i] = (w1[i] & keep) | mark;
    if (word == i +  8) w2[i] = (w2[i] & keep) | mark;
    if (word == i + 12) w3[i] = (w3[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x80 at byte position `offset` of the 128-byte buffer formed
 * by w0 .. w7 (four words each, in that order; byte k of a word lives in
 * bits 8k..8k+7).
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x80), and offsets
 * >= 128 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x80_8 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 4; i++)
  {
    if (word == i +  0) w0[i] = (w0[i] & keep) | mark;
    if (word == i +  4) w1[i] = (w1[i] & keep) | mark;
    if (word == i +  8) w2[i] = (w2[i] & keep) | mark;
    if (word == i + 12) w3[i] = (w3[i] & keep) | mark;
    if (word == i + 16) w4[i] = (w4[i] & keep) | mark;
    if (word == i + 20) w5[i] = (w5[i] & keep) | mark;
    if (word == i + 24) w6[i] = (w6[i] & keep) | mark;
    if (word == i + 28) w7[i] = (w7[i] & keep) | mark;
  }
}
/**
 * Puts the byte 0x80 at byte position `offset` of the 64-byte buffer w[0..15]
 * (byte k of a word lives in bits 8k..8k+7). This overload takes the buffer
 * as one 16-word array.
 *
 * offset / 4 selects the word and offset % 4 the byte lane. For lanes 1..3
 * the byte is OR-ed into the existing word, as in the per-offset statements
 * this replaces.
 *
 * NOTE(review): statements for word-aligned offsets (lane 0) are missing from
 * this extract. They are restored as a plain store (word = 0x80), and offsets
 * >= 64 are treated as a no-op, following the gaps in the embedded line
 * numbering - confirm against the upstream file.
 */
__device__ static void append_0x80_4 (u32 w[16], const u32 offset)
{
  const u32 word = offset >> 2;
  const u32 lane = offset & 3;
  const u32 mark = 0x80u << (lane * 8);
  const u32 keep = lane ? 0xFFFFFFFFu : 0u;   // lane 0 replaces the word

  // fixed trip count and unroll request keep every index a compile-time constant
  #pragma unroll
  for (u32 i = 0; i < 16; i++)
  {
    if (word == i) w[i] = (w[i] & keep) | mark;
  }
}
// append_0x80_8: sets a 0x80 marker byte inside the 32-word buffer `w`.
// Every statement visible here ORs 0x80, shifted left by 8, 16 or 24 bits,
// into one of w[0]..w[31]; the target word index rises by one after each
// group of three statements.
// NOTE(review): the control flow choosing a statement from `offset`, and the
// statement for the unshifted byte of each word, are not visible in this
// view - presumably `offset` is a byte index with word = offset / 4 and
// byte = offset % 4. Confirm against the complete file before relying on it.
4121 __device__
static void append_0x80_8 (u32 w
[32], const u32 offset
)
4130 w
[ 0] = w
[ 0] | 0x8000;
4134 w
[ 0] = w
[ 0] | 0x800000;
4138 w
[ 0] = w
[ 0] | 0x80000000;
4146 w
[ 1] = w
[ 1] | 0x8000;
4150 w
[ 1] = w
[ 1] | 0x800000;
4154 w
[ 1] = w
[ 1] | 0x80000000;
4162 w
[ 2] = w
[ 2] | 0x8000;
4166 w
[ 2] = w
[ 2] | 0x800000;
4170 w
[ 2] = w
[ 2] | 0x80000000;
4178 w
[ 3] = w
[ 3] | 0x8000;
4182 w
[ 3] = w
[ 3] | 0x800000;
4186 w
[ 3] = w
[ 3] | 0x80000000;
4194 w
[ 4] = w
[ 4] | 0x8000;
4198 w
[ 4] = w
[ 4] | 0x800000;
4202 w
[ 4] = w
[ 4] | 0x80000000;
4210 w
[ 5] = w
[ 5] | 0x8000;
4214 w
[ 5] = w
[ 5] | 0x800000;
4218 w
[ 5] = w
[ 5] | 0x80000000;
4226 w
[ 6] = w
[ 6] | 0x8000;
4230 w
[ 6] = w
[ 6] | 0x800000;
4234 w
[ 6] = w
[ 6] | 0x80000000;
4242 w
[ 7] = w
[ 7] | 0x8000;
4246 w
[ 7] = w
[ 7] | 0x800000;
4250 w
[ 7] = w
[ 7] | 0x80000000;
4258 w
[ 8] = w
[ 8] | 0x8000;
4262 w
[ 8] = w
[ 8] | 0x800000;
4266 w
[ 8] = w
[ 8] | 0x80000000;
4274 w
[ 9] = w
[ 9] | 0x8000;
4278 w
[ 9] = w
[ 9] | 0x800000;
4282 w
[ 9] = w
[ 9] | 0x80000000;
4290 w
[10] = w
[10] | 0x8000;
4294 w
[10] = w
[10] | 0x800000;
4298 w
[10] = w
[10] | 0x80000000;
4306 w
[11] = w
[11] | 0x8000;
4310 w
[11] = w
[11] | 0x800000;
4314 w
[11] = w
[11] | 0x80000000;
4322 w
[12] = w
[12] | 0x8000;
4326 w
[12] = w
[12] | 0x800000;
4330 w
[12] = w
[12] | 0x80000000;
4338 w
[13] = w
[13] | 0x8000;
4342 w
[13] = w
[13] | 0x800000;
4346 w
[13] = w
[13] | 0x80000000;
4354 w
[14] = w
[14] | 0x8000;
4358 w
[14] = w
[14] | 0x800000;
4362 w
[14] = w
[14] | 0x80000000;
4370 w
[15] = w
[15] | 0x8000;
4374 w
[15] = w
[15] | 0x800000;
4378 w
[15] = w
[15] | 0x80000000;
4386 w
[16] = w
[16] | 0x8000;
4390 w
[16] = w
[16] | 0x800000;
4394 w
[16] = w
[16] | 0x80000000;
4402 w
[17] = w
[17] | 0x8000;
4406 w
[17] = w
[17] | 0x800000;
4410 w
[17] = w
[17] | 0x80000000;
4418 w
[18] = w
[18] | 0x8000;
4422 w
[18] = w
[18] | 0x800000;
4426 w
[18] = w
[18] | 0x80000000;
4434 w
[19] = w
[19] | 0x8000;
4438 w
[19] = w
[19] | 0x800000;
4442 w
[19] = w
[19] | 0x80000000;
4450 w
[20] = w
[20] | 0x8000;
4454 w
[20] = w
[20] | 0x800000;
4458 w
[20] = w
[20] | 0x80000000;
4466 w
[21] = w
[21] | 0x8000;
4470 w
[21] = w
[21] | 0x800000;
4474 w
[21] = w
[21] | 0x80000000;
4482 w
[22] = w
[22] | 0x8000;
4486 w
[22] = w
[22] | 0x800000;
4490 w
[22] = w
[22] | 0x80000000;
4498 w
[23] = w
[23] | 0x8000;
4502 w
[23] = w
[23] | 0x800000;
4506 w
[23] = w
[23] | 0x80000000;
4514 w
[24] = w
[24] | 0x8000;
4518 w
[24] = w
[24] | 0x800000;
4522 w
[24] = w
[24] | 0x80000000;
4530 w
[25] = w
[25] | 0x8000;
4534 w
[25] = w
[25] | 0x800000;
4538 w
[25] = w
[25] | 0x80000000;
4546 w
[26] = w
[26] | 0x8000;
4550 w
[26] = w
[26] | 0x800000;
4554 w
[26] = w
[26] | 0x80000000;
4562 w
[27] = w
[27] | 0x8000;
4566 w
[27] = w
[27] | 0x800000;
4570 w
[27] = w
[27] | 0x80000000;
4578 w
[28] = w
[28] | 0x8000;
4582 w
[28] = w
[28] | 0x800000;
4586 w
[28] = w
[28] | 0x80000000;
4594 w
[29] = w
[29] | 0x8000;
4598 w
[29] = w
[29] | 0x800000;
4602 w
[29] = w
[29] | 0x80000000;
4610 w
[30] = w
[30] | 0x8000;
4614 w
[30] = w
[30] | 0x800000;
4618 w
[30] = w
[30] | 0x80000000;
4626 w
[31] = w
[31] | 0x8000;
4630 w
[31] = w
[31] | 0x800000;
4634 w
[31] = w
[31] | 0x80000000;
/**
 * Splice src_r0 into the two-word buffer dst0 at byte position `offset`.
 *
 * Byte 0 of a word is its least significant byte: the right-hand data is
 * shifted up by 8 bits per offset byte. Handled offsets are 1..7.
 *  - offset inside word n (offset % 4 != 0): dst0[n] becomes src_l0[n] OR-ed
 *    with the low bytes of src_r0, later words take the carried-over bytes.
 *  - offset on a word boundary (4): src_r0 is copied starting at dst0[1].
 * Words below offset / 4 are never written and any other offset (including
 * 0) writes nothing. Right-hand bytes that do not fit into dst0 are dropped.
 *
 * NOTE(review): assumes src_l0[n] is zero above its first offset % 4 bytes,
 * otherwise the OR mixes both inputs - confirm against callers.
 */
__device__
static void device_memcat2L (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2])
{
  switch (offset)
  {
    // right-hand data starts inside dst0[0]
    case 1:
      dst0[0] = src_l0[0] | (src_r0[0] << 8);
      dst0[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      break;

    case 2:
      dst0[0] = src_l0[0] | (src_r0[0] << 16);
      dst0[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      break;

    case 3:
      dst0[0] = src_l0[0] | (src_r0[0] << 24);
      dst0[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      break;

    // right-hand data starts at / inside dst0[1]
    case 4:
      dst0[1] = src_r0[0];
      break;

    case 5:
      dst0[1] = src_l0[1] | (src_r0[0] << 8);
      break;

    case 6:
      dst0[1] = src_l0[1] | (src_r0[0] << 16);
      break;

    case 7:
      dst0[1] = src_l0[1] | (src_r0[0] << 24);
      break;
  }
}
/**
 * Splice src_r0 into the four-word buffer dst0 at byte position `offset`.
 *
 * Byte 0 of a word is its least significant byte. Handled offsets are 1..15.
 *  - offset inside word n (offset % 4 != 0): dst0[n] becomes src_l0[n] OR-ed
 *    with the low bytes of src_r0, later words take the carried-over bytes.
 *  - offset on a word boundary (4, 8, 12): src_r0 is copied word by word
 *    starting at dst0[offset / 4].
 * Words below offset / 4 are never written and any other offset (including
 * 0) writes nothing. Right-hand bytes that do not fit into dst0 are dropped.
 *
 * NOTE(review): assumes src_l0[n] is zero above its first offset % 4 bytes,
 * otherwise the OR mixes both inputs - confirm against callers.
 */
__device__
static void device_memcat4L (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4])
{
  switch (offset)
  {
    // right-hand data starts inside dst0[0]
    case 1:
      dst0[0] = src_l0[0] | (src_r0[0] << 8);
      dst0[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst0[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst0[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      break;

    case 2:
      dst0[0] = src_l0[0] | (src_r0[0] << 16);
      dst0[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst0[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      break;

    case 3:
      dst0[0] = src_l0[0] | (src_r0[0] << 24);
      dst0[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst0[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst0[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      break;

    // right-hand data starts at / inside dst0[1]
    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      break;

    case 5:
      dst0[1] = src_l0[1] | (src_r0[0] << 8);
      dst0[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst0[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      break;

    case 6:
      dst0[1] = src_l0[1] | (src_r0[0] << 16);
      dst0[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      break;

    case 7:
      dst0[1] = src_l0[1] | (src_r0[0] << 24);
      dst0[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst0[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      break;

    // right-hand data starts at / inside dst0[2]
    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      break;

    case 9:
      dst0[2] = src_l0[2] | (src_r0[0] << 8);
      dst0[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      break;

    case 10:
      dst0[2] = src_l0[2] | (src_r0[0] << 16);
      dst0[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      break;

    case 11:
      dst0[2] = src_l0[2] | (src_r0[0] << 24);
      dst0[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      break;

    // right-hand data starts at / inside dst0[3]
    case 12:
      dst0[3] = src_r0[0];
      break;

    case 13:
      dst0[3] = src_l0[3] | (src_r0[0] << 8);
      break;

    case 14:
      dst0[3] = src_l0[3] | (src_r0[0] << 16);
      break;

    case 15:
      dst0[3] = src_l0[3] | (src_r0[0] << 24);
      break;
  }
}
/**
 * Splice the four words of src_r0 into the eight-word buffer dst0|dst1 at
 * byte position `offset`.
 *
 * Byte 0 of a word is its least significant byte. Handled offsets are 1..31;
 * word n of the buffer is dst0[n] for n < 4 and dst1[n - 4] otherwise, and
 * src_l0|src_l1 are indexed the same way.
 *  - offset inside word n (offset % 4 != 0): word n becomes the src_l word n
 *    OR-ed with the low bytes of src_r0, later words take the carried bytes.
 *  - offset on a word boundary: src_r0 is copied word by word from word
 *    offset / 4 on.
 * Words below offset / 4 are never written and any other offset (including
 * 0) writes nothing. Right-hand bytes beyond dst1[3] are dropped.
 *
 * NOTE(review): assumes the src_l word is zero above its first offset % 4
 * bytes, otherwise the OR mixes both inputs - confirm against callers.
 */
__device__
static void device_memcat8L (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4])
{
  switch (offset)
  {
    // right-hand data starts inside dst0[0]
    case 1:
      dst0[0] = src_l0[0] | (src_r0[0] << 8);
      dst0[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst0[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst0[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[0] = src_r0[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0] | (src_r0[0] << 16);
      dst0[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst0[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[0] = src_r0[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0] | (src_r0[0] << 24);
      dst0[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst0[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst0[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[0] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[1]
    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      break;

    case 5:
      dst0[1] = src_l0[1] | (src_r0[0] << 8);
      dst0[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst0[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[0] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[1] = src_r0[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1] | (src_r0[0] << 16);
      dst0[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[0] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[1] = src_r0[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1] | (src_r0[0] << 24);
      dst0[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst0[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[0] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[1] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[2]
    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      break;

    case 9:
      dst0[2] = src_l0[2] | (src_r0[0] << 8);
      dst0[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[0] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[1] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[2] = src_r0[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2] | (src_r0[0] << 16);
      dst0[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[0] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[1] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[2] = src_r0[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2] | (src_r0[0] << 24);
      dst0[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[0] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[1] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[2] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[3]
    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      break;

    case 13:
      dst0[3] = src_l0[3] | (src_r0[0] << 8);
      dst1[0] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[1] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[2] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[3] = src_r0[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3] | (src_r0[0] << 16);
      dst1[0] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[1] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[2] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[3] = src_r0[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3] | (src_r0[0] << 24);
      dst1[0] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[1] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[2] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[3] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst1[0]
    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      break;

    case 17:
      dst1[0] = src_l1[0] | (src_r0[0] << 8);
      dst1[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      break;

    case 18:
      dst1[0] = src_l1[0] | (src_r0[0] << 16);
      dst1[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      break;

    case 19:
      dst1[0] = src_l1[0] | (src_r0[0] << 24);
      dst1[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      break;

    // right-hand data starts at / inside dst1[1]
    case 20:
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      break;

    case 21:
      dst1[1] = src_l1[1] | (src_r0[0] << 8);
      dst1[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      break;

    case 22:
      dst1[1] = src_l1[1] | (src_r0[0] << 16);
      dst1[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      break;

    case 23:
      dst1[1] = src_l1[1] | (src_r0[0] << 24);
      dst1[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      break;

    // right-hand data starts at / inside dst1[2]
    case 24:
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      break;

    case 25:
      dst1[2] = src_l1[2] | (src_r0[0] << 8);
      dst1[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      break;

    case 26:
      dst1[2] = src_l1[2] | (src_r0[0] << 16);
      dst1[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      break;

    case 27:
      dst1[2] = src_l1[2] | (src_r0[0] << 24);
      dst1[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      break;

    // right-hand data starts at / inside dst1[3]
    case 28:
      dst1[3] = src_r0[0];
      break;

    case 29:
      dst1[3] = src_l1[3] | (src_r0[0] << 8);
      break;

    case 30:
      dst1[3] = src_l1[3] | (src_r0[0] << 16);
      break;

    case 31:
      dst1[3] = src_l1[3] | (src_r0[0] << 24);
      break;
  }
}
/**
 * Splice the four words of src_r0 into the twelve-word buffer
 * dst0|dst1|dst2 at byte position `offset`.
 *
 * Byte 0 of a word is its least significant byte. Handled offsets are 1..47;
 * word n of the buffer lives in dst0 (n < 4), dst1 (n < 8) or dst2, and
 * src_l0|src_l1|src_l2 are indexed the same way.
 *  - offset inside word n (offset % 4 != 0): word n becomes the src_l word n
 *    OR-ed with the low bytes of src_r0, later words take the carried bytes.
 *  - offset on a word boundary: src_r0 is copied word by word from word
 *    offset / 4 on.
 * Words below offset / 4 are never written and any other offset (including
 * 0) writes nothing. Right-hand bytes beyond dst2[3] are dropped.
 *
 * NOTE(review): assumes the src_l word is zero above its first offset % 4
 * bytes, otherwise the OR mixes both inputs - confirm against callers.
 */
__device__
static void device_memcat12L (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4])
{
  switch (offset)
  {
    // right-hand data starts inside dst0[0]
    case 1:
      dst0[0] = src_l0[0] | (src_r0[0] << 8);
      dst0[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst0[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst0[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[0] = src_r0[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0] | (src_r0[0] << 16);
      dst0[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst0[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[0] = src_r0[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0] | (src_r0[0] << 24);
      dst0[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst0[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst0[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[0] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[1]
    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      break;

    case 5:
      dst0[1] = src_l0[1] | (src_r0[0] << 8);
      dst0[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst0[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[0] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[1] = src_r0[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1] | (src_r0[0] << 16);
      dst0[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[0] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[1] = src_r0[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1] | (src_r0[0] << 24);
      dst0[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst0[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[0] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[1] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[2]
    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      break;

    case 9:
      dst0[2] = src_l0[2] | (src_r0[0] << 8);
      dst0[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[0] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[1] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[2] = src_r0[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2] | (src_r0[0] << 16);
      dst0[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[0] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[1] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[2] = src_r0[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2] | (src_r0[0] << 24);
      dst0[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[0] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[1] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[2] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[3]
    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      break;

    case 13:
      dst0[3] = src_l0[3] | (src_r0[0] << 8);
      dst1[0] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[1] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[2] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[3] = src_r0[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3] | (src_r0[0] << 16);
      dst1[0] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[1] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[2] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[3] = src_r0[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3] | (src_r0[0] << 24);
      dst1[0] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[1] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[2] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[3] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst1[0]
    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      break;

    case 17:
      dst1[0] = src_l1[0] | (src_r0[0] << 8);
      dst1[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst2[0] = src_r0[3] >> 24;
      break;

    case 18:
      dst1[0] = src_l1[0] | (src_r0[0] << 16);
      dst1[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[0] = src_r0[3] >> 16;
      break;

    case 19:
      dst1[0] = src_l1[0] | (src_r0[0] << 24);
      dst1[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst2[0] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst1[1]
    case 20:
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      dst2[0] = src_r0[3];
      break;

    case 21:
      dst1[1] = src_l1[1] | (src_r0[0] << 8);
      dst1[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst2[0] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst2[1] = src_r0[3] >> 24;
      break;

    case 22:
      dst1[1] = src_l1[1] | (src_r0[0] << 16);
      dst1[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[0] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[1] = src_r0[3] >> 16;
      break;

    case 23:
      dst1[1] = src_l1[1] | (src_r0[0] << 24);
      dst1[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst2[0] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst2[1] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst1[2]
    case 24:
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      dst2[0] = src_r0[2];
      dst2[1] = src_r0[3];
      break;

    case 25:
      dst1[2] = src_l1[2] | (src_r0[0] << 8);
      dst1[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst2[0] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst2[1] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst2[2] = src_r0[3] >> 24;
      break;

    case 26:
      dst1[2] = src_l1[2] | (src_r0[0] << 16);
      dst1[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[0] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[1] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[2] = src_r0[3] >> 16;
      break;

    case 27:
      dst1[2] = src_l1[2] | (src_r0[0] << 24);
      dst1[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst2[0] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst2[1] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst2[2] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst1[3]
    case 28:
      dst1[3] = src_r0[0];
      dst2[0] = src_r0[1];
      dst2[1] = src_r0[2];
      dst2[2] = src_r0[3];
      break;

    case 29:
      dst1[3] = src_l1[3] | (src_r0[0] << 8);
      dst2[0] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst2[1] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst2[2] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst2[3] = src_r0[3] >> 24;
      break;

    case 30:
      dst1[3] = src_l1[3] | (src_r0[0] << 16);
      dst2[0] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[1] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[2] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[3] = src_r0[3] >> 16;
      break;

    case 31:
      dst1[3] = src_l1[3] | (src_r0[0] << 24);
      dst2[0] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst2[1] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst2[2] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst2[3] = src_r0[3] >> 8;
      break;

    // right-hand data starts at / inside dst2[0]
    case 32:
      dst2[0] = src_r0[0];
      dst2[1] = src_r0[1];
      dst2[2] = src_r0[2];
      dst2[3] = src_r0[3];
      break;

    case 33:
      dst2[0] = src_l2[0] | (src_r0[0] << 8);
      dst2[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst2[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst2[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      break;

    case 34:
      dst2[0] = src_l2[0] | (src_r0[0] << 16);
      dst2[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      break;

    case 35:
      dst2[0] = src_l2[0] | (src_r0[0] << 24);
      dst2[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst2[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst2[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      break;

    // right-hand data starts at / inside dst2[1]
    case 36:
      dst2[1] = src_r0[0];
      dst2[2] = src_r0[1];
      dst2[3] = src_r0[2];
      break;

    case 37:
      dst2[1] = src_l2[1] | (src_r0[0] << 8);
      dst2[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst2[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      break;

    case 38:
      dst2[1] = src_l2[1] | (src_r0[0] << 16);
      dst2[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      break;

    case 39:
      dst2[1] = src_l2[1] | (src_r0[0] << 24);
      dst2[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst2[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      break;

    // right-hand data starts at / inside dst2[2]
    case 40:
      dst2[2] = src_r0[0];
      dst2[3] = src_r0[1];
      break;

    case 41:
      dst2[2] = src_l2[2] | (src_r0[0] << 8);
      dst2[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      break;

    case 42:
      dst2[2] = src_l2[2] | (src_r0[0] << 16);
      dst2[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      break;

    case 43:
      dst2[2] = src_l2[2] | (src_r0[0] << 24);
      dst2[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      break;

    // right-hand data starts at / inside dst2[3]
    case 44:
      dst2[3] = src_r0[0];
      break;

    case 45:
      dst2[3] = src_l2[3] | (src_r0[0] << 8);
      break;

    case 46:
      dst2[3] = src_l2[3] | (src_r0[0] << 16);
      break;

    case 47:
      dst2[3] = src_l2[3] | (src_r0[0] << 24);
      break;
  }
}
/**
 * Splice the eight words src_r0|src_r1 into the twelve-word buffer
 * dst0|dst1|dst2 at byte position `offset`.
 *
 * Byte 0 of a word is its least significant byte. Handled offsets are 0..47;
 * word n of the buffer lives in dst0 (n < 4), dst1 (n < 8) or dst2, and
 * src_l0|src_l1|src_l2 are indexed the same way.
 *  - offset inside word n (offset % 4 != 0): word n becomes the src_l word n
 *    OR-ed with the low bytes of src_r0, later words take the carried bytes.
 *  - offset on a word boundary: the right-hand words are copied one by one
 *    from word offset / 4 on, always beginning with src_r0[0].
 * Words below offset / 4 are never written and offsets above 47 write
 * nothing. Right-hand bytes beyond dst2[3] are dropped.
 *
 * Fix: the word-aligned cases 20, 24 and 28 used to store src_r1[0] in their
 * first destination word, losing src_r0[0] and duplicating src_r1[0]. They
 * now start with src_r0[0] like every other aligned case.
 *
 * NOTE(review): assumes the src_l word is zero above its first offset % 4
 * bytes, otherwise the OR mixes both inputs - confirm against callers.
 */
__device__
static void device_memcat12L (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4])
{
  switch (offset)
  {
    // right-hand data starts at / inside dst0[0]
    case 0:
      dst0[0] = src_r0[0];
      dst0[1] = src_r0[1];
      dst0[2] = src_r0[2];
      dst0[3] = src_r0[3];
      dst1[0] = src_r1[0];
      dst1[1] = src_r1[1];
      dst1[2] = src_r1[2];
      dst1[3] = src_r1[3];
      break;

    case 1:
      dst0[0] = src_l0[0] | (src_r0[0] << 8);
      dst0[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst0[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst0[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[0] = (src_r0[3] >> 24) | (src_r1[0] << 8);
      dst1[1] = (src_r1[0] >> 24) | (src_r1[1] << 8);
      dst1[2] = (src_r1[1] >> 24) | (src_r1[2] << 8);
      dst1[3] = (src_r1[2] >> 24) | (src_r1[3] << 8);
      dst2[0] = src_r1[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0] | (src_r0[0] << 16);
      dst0[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst0[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[0] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst1[1] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst1[2] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst1[3] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      dst2[0] = src_r1[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0] | (src_r0[0] << 24);
      dst0[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst0[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst0[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[0] = (src_r0[3] >> 8) | (src_r1[0] << 24);
      dst1[1] = (src_r1[0] >> 8) | (src_r1[1] << 24);
      dst1[2] = (src_r1[1] >> 8) | (src_r1[2] << 24);
      dst1[3] = (src_r1[2] >> 8) | (src_r1[3] << 24);
      dst2[0] = src_r1[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[1]
    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      dst1[1] = src_r1[0];
      dst1[2] = src_r1[1];
      dst1[3] = src_r1[2];
      dst2[0] = src_r1[3];
      break;

    case 5:
      dst0[1] = src_l0[1] | (src_r0[0] << 8);
      dst0[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst0[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[0] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[1] = (src_r0[3] >> 24) | (src_r1[0] << 8);
      dst1[2] = (src_r1[0] >> 24) | (src_r1[1] << 8);
      dst1[3] = (src_r1[1] >> 24) | (src_r1[2] << 8);
      dst2[0] = (src_r1[2] >> 24) | (src_r1[3] << 8);
      dst2[1] = src_r1[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1] | (src_r0[0] << 16);
      dst0[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[0] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[1] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst1[2] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst1[3] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst2[0] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      dst2[1] = src_r1[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1] | (src_r0[0] << 24);
      dst0[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst0[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[0] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[1] = (src_r0[3] >> 8) | (src_r1[0] << 24);
      dst1[2] = (src_r1[0] >> 8) | (src_r1[1] << 24);
      dst1[3] = (src_r1[1] >> 8) | (src_r1[2] << 24);
      dst2[0] = (src_r1[2] >> 8) | (src_r1[3] << 24);
      dst2[1] = src_r1[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[2]
    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      dst1[2] = src_r1[0];
      dst1[3] = src_r1[1];
      dst2[0] = src_r1[2];
      dst2[1] = src_r1[3];
      break;

    case 9:
      dst0[2] = src_l0[2] | (src_r0[0] << 8);
      dst0[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[0] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[1] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[2] = (src_r0[3] >> 24) | (src_r1[0] << 8);
      dst1[3] = (src_r1[0] >> 24) | (src_r1[1] << 8);
      dst2[0] = (src_r1[1] >> 24) | (src_r1[2] << 8);
      dst2[1] = (src_r1[2] >> 24) | (src_r1[3] << 8);
      dst2[2] = src_r1[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2] | (src_r0[0] << 16);
      dst0[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[0] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[1] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[2] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst1[3] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst2[0] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst2[1] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      dst2[2] = src_r1[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2] | (src_r0[0] << 24);
      dst0[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[0] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[1] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[2] = (src_r0[3] >> 8) | (src_r1[0] << 24);
      dst1[3] = (src_r1[0] >> 8) | (src_r1[1] << 24);
      dst2[0] = (src_r1[1] >> 8) | (src_r1[2] << 24);
      dst2[1] = (src_r1[2] >> 8) | (src_r1[3] << 24);
      dst2[2] = src_r1[3] >> 8;
      break;

    // right-hand data starts at / inside dst0[3]
    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      dst1[3] = src_r1[0];
      dst2[0] = src_r1[1];
      dst2[1] = src_r1[2];
      dst2[2] = src_r1[3];
      break;

    case 13:
      dst0[3] = src_l0[3] | (src_r0[0] << 8);
      dst1[0] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[1] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[2] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst1[3] = (src_r0[3] >> 24) | (src_r1[0] << 8);
      dst2[0] = (src_r1[0] >> 24) | (src_r1[1] << 8);
      dst2[1] = (src_r1[1] >> 24) | (src_r1[2] << 8);
      dst2[2] = (src_r1[2] >> 24) | (src_r1[3] << 8);
      dst2[3] = src_r1[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3] | (src_r0[0] << 16);
      dst1[0] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[1] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[2] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[3] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst2[0] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst2[1] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst2[2] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      dst2[3] = src_r1[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3] | (src_r0[0] << 24);
      dst1[0] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[1] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[2] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst1[3] = (src_r0[3] >> 8) | (src_r1[0] << 24);
      dst2[0] = (src_r1[0] >> 8) | (src_r1[1] << 24);
      dst2[1] = (src_r1[1] >> 8) | (src_r1[2] << 24);
      dst2[2] = (src_r1[2] >> 8) | (src_r1[3] << 24);
      dst2[3] = src_r1[3] >> 8;
      break;

    // right-hand data starts at / inside dst1[0]
    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      dst2[0] = src_r1[0];
      dst2[1] = src_r1[1];
      dst2[2] = src_r1[2];
      dst2[3] = src_r1[3];
      break;

    case 17:
      dst1[0] = src_l1[0] | (src_r0[0] << 8);
      dst1[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst1[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst2[0] = (src_r0[3] >> 24) | (src_r1[0] << 8);
      dst2[1] = (src_r1[0] >> 24) | (src_r1[1] << 8);
      dst2[2] = (src_r1[1] >> 24) | (src_r1[2] << 8);
      dst2[3] = (src_r1[2] >> 24) | (src_r1[3] << 8);
      break;

    case 18:
      dst1[0] = src_l1[0] | (src_r0[0] << 16);
      dst1[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[0] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst2[1] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst2[2] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst2[3] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      break;

    case 19:
      dst1[0] = src_l1[0] | (src_r0[0] << 24);
      dst1[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst1[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst2[0] = (src_r0[3] >> 8) | (src_r1[0] << 24);
      dst2[1] = (src_r1[0] >> 8) | (src_r1[1] << 24);
      dst2[2] = (src_r1[1] >> 8) | (src_r1[2] << 24);
      dst2[3] = (src_r1[2] >> 8) | (src_r1[3] << 24);
      break;

    // right-hand data starts at / inside dst1[1]
    case 20:
      dst1[1] = src_r0[0]; // was src_r1[0]: first right-hand word goes here
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      dst2[0] = src_r0[3];
      dst2[1] = src_r1[0];
      dst2[2] = src_r1[1];
      dst2[3] = src_r1[2];
      break;

    case 21:
      dst1[1] = src_l1[1] | (src_r0[0] << 8);
      dst1[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst1[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst2[0] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst2[1] = (src_r0[3] >> 24) | (src_r1[0] << 8);
      dst2[2] = (src_r1[0] >> 24) | (src_r1[1] << 8);
      dst2[3] = (src_r1[1] >> 24) | (src_r1[2] << 8);
      break;

    case 22:
      dst1[1] = src_l1[1] | (src_r0[0] << 16);
      dst1[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[0] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[1] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst2[2] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst2[3] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      break;

    case 23:
      dst1[1] = src_l1[1] | (src_r0[0] << 24);
      dst1[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst1[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst2[0] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst2[1] = (src_r0[3] >> 8) | (src_r1[0] << 24);
      dst2[2] = (src_r1[0] >> 8) | (src_r1[1] << 24);
      dst2[3] = (src_r1[1] >> 8) | (src_r1[2] << 24);
      break;

    // right-hand data starts at / inside dst1[2]
    case 24:
      dst1[2] = src_r0[0]; // was src_r1[0]: first right-hand word goes here
      dst1[3] = src_r0[1];
      dst2[0] = src_r0[2];
      dst2[1] = src_r0[3];
      dst2[2] = src_r1[0];
      dst2[3] = src_r1[1];
      break;

    case 25:
      dst1[2] = src_l1[2] | (src_r0[0] << 8);
      dst1[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst2[0] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst2[1] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst2[2] = (src_r0[3] >> 24) | (src_r1[0] << 8);
      dst2[3] = (src_r1[0] >> 24) | (src_r1[1] << 8);
      break;

    case 26:
      dst1[2] = src_l1[2] | (src_r0[0] << 16);
      dst1[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[0] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[1] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[2] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst2[3] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      break;

    case 27:
      dst1[2] = src_l1[2] | (src_r0[0] << 24);
      dst1[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst2[0] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst2[1] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst2[2] = (src_r0[3] >> 8) | (src_r1[0] << 24);
      dst2[3] = (src_r1[0] >> 8) | (src_r1[1] << 24);
      break;

    // right-hand data starts at / inside dst1[3]
    case 28:
      dst1[3] = src_r0[0]; // was src_r1[0]: first right-hand word goes here
      dst2[0] = src_r0[1];
      dst2[1] = src_r0[2];
      dst2[2] = src_r0[3];
      dst2[3] = src_r1[0];
      break;

    case 29:
      dst1[3] = src_l1[3] | (src_r0[0] << 8);
      dst2[0] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst2[1] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst2[2] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      dst2[3] = (src_r0[3] >> 24) | (src_r1[0] << 8);
      break;

    case 30:
      dst1[3] = src_l1[3] | (src_r0[0] << 16);
      dst2[0] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[1] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[2] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[3] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      break;

    case 31:
      dst1[3] = src_l1[3] | (src_r0[0] << 24);
      dst2[0] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst2[1] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst2[2] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      dst2[3] = (src_r0[3] >> 8) | (src_r1[0] << 24);
      break;

    // right-hand data starts at / inside dst2[0]
    case 32:
      dst2[0] = src_r0[0];
      dst2[1] = src_r0[1];
      dst2[2] = src_r0[2];
      dst2[3] = src_r0[3];
      break;

    case 33:
      dst2[0] = src_l2[0] | (src_r0[0] << 8);
      dst2[1] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst2[2] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      dst2[3] = (src_r0[2] >> 24) | (src_r0[3] << 8);
      break;

    case 34:
      dst2[0] = src_l2[0] | (src_r0[0] << 16);
      dst2[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      break;

    case 35:
      dst2[0] = src_l2[0] | (src_r0[0] << 24);
      dst2[1] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst2[2] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      dst2[3] = (src_r0[2] >> 8) | (src_r0[3] << 24);
      break;

    // right-hand data starts at / inside dst2[1]
    case 36:
      dst2[1] = src_r0[0];
      dst2[2] = src_r0[1];
      dst2[3] = src_r0[2];
      break;

    case 37:
      dst2[1] = src_l2[1] | (src_r0[0] << 8);
      dst2[2] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      dst2[3] = (src_r0[1] >> 24) | (src_r0[2] << 8);
      break;

    case 38:
      dst2[1] = src_l2[1] | (src_r0[0] << 16);
      dst2[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      break;

    case 39:
      dst2[1] = src_l2[1] | (src_r0[0] << 24);
      dst2[2] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      dst2[3] = (src_r0[1] >> 8) | (src_r0[2] << 24);
      break;

    // right-hand data starts at / inside dst2[2]
    case 40:
      dst2[2] = src_r0[0];
      dst2[3] = src_r0[1];
      break;

    case 41:
      dst2[2] = src_l2[2] | (src_r0[0] << 8);
      dst2[3] = (src_r0[0] >> 24) | (src_r0[1] << 8);
      break;

    case 42:
      dst2[2] = src_l2[2] | (src_r0[0] << 16);
      dst2[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      break;

    case 43:
      dst2[2] = src_l2[2] | (src_r0[0] << 24);
      dst2[3] = (src_r0[0] >> 8) | (src_r0[1] << 24);
      break;

    // right-hand data starts at / inside dst2[3]
    case 44:
      dst2[3] = src_r0[0];
      break;

    case 45:
      dst2[3] = src_l2[3] | (src_r0[0] << 8);
      break;

    case 46:
      dst2[3] = src_l2[3] | (src_r0[0] << 16);
      break;

    case 47:
      dst2[3] = src_l2[3] | (src_r0[0] << 24);
      break;
  }
}
/*
 * memcat16_9: merge nine 32-bit words (append0[0..3], append1[0..3] and
 * append2[0]) into the buffer w0..w3 at a position that is not word aligned.
 *
 * Parameters:
 *   w0, w1, w2, w3             buffer words, modified in place
 *   append0, append1, append2  read-only words to merge in
 *   offset                     not referenced by any visible statement
 *
 * Every visible group follows the same pattern: the first destination word is
 * OR-ed with append0[0] shifted left by 8, 16 or 24 bits; each following word
 * is assigned the bits carried out of the previous append word (shifted right
 * by 24, 16 or 8) combined with the next append word shifted left by the same
 * amount; the final statement stores only the bits carried out of append2[0].
 * Only the first destination word keeps its previous content, all later words
 * are overwritten.
 *
 * NOTE(review): the braces and the logic that selects one group from `offset`
 * are not visible in this extract - presumably a switch with one case per
 * byte position; confirm against the full file.
 * NOTE(review): the shift directions suggest a little-endian byte layout - confirm.
 */
5743 __device__
static void memcat16_9 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 append2
[4], const u32 offset
)
/* Merge starts inside w0[0] and ends in w2[1]; groups for shifts of 8, 16 and 24 bits. */
5760 w0
[0] = w0
[0] | append0
[0] << 8;
5761 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
5762 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
5763 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
5764 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
5765 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
5766 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
5767 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
5768 w2
[0] = append1
[3] >> 24 | append2
[0] << 8;
5769 w2
[1] = append2
[0] >> 24;
5773 w0
[0] = w0
[0] | append0
[0] << 16;
5774 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
5775 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
5776 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
5777 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
5778 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
5779 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
5780 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
5781 w2
[0] = append1
[3] >> 16 | append2
[0] << 16;
5782 w2
[1] = append2
[0] >> 16;
5786 w0
[0] = w0
[0] | append0
[0] << 24;
5787 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
5788 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
5789 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
5790 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
5791 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
5792 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
5793 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
5794 w2
[0] = append1
[3] >> 8 | append2
[0] << 24;
5795 w2
[1] = append2
[0] >> 8;
/* Merge starts inside w0[1] and ends in w2[2]; groups for shifts of 8, 16 and 24 bits. */
5811 w0
[1] = w0
[1] | append0
[0] << 8;
5812 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
5813 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
5814 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
5815 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
5816 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
5817 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
5818 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
5819 w2
[1] = append1
[3] >> 24 | append2
[0] << 8;
5820 w2
[2] = append2
[0] >> 24;
5824 w0
[1] = w0
[1] | append0
[0] << 16;
5825 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
5826 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
5827 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
5828 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
5829 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
5830 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
5831 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
5832 w2
[1] = append1
[3] >> 16 | append2
[0] << 16;
5833 w2
[2] = append2
[0] >> 16;
5837 w0
[1] = w0
[1] | append0
[0] << 24;
5838 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
5839 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
5840 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
5841 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
5842 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
5843 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
5844 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
5845 w2
[1] = append1
[3] >> 8 | append2
[0] << 24;
5846 w2
[2] = append2
[0] >> 8;
/* Merge starts inside w0[2] and ends in w2[3]; groups for shifts of 8, 16 and 24 bits. */
5862 w0
[2] = w0
[2] | append0
[0] << 8;
5863 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
5864 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
5865 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
5866 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
5867 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
5868 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
5869 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
5870 w2
[2] = append1
[3] >> 24 | append2
[0] << 8;
5871 w2
[3] = append2
[0] >> 24;
5875 w0
[2] = w0
[2] | append0
[0] << 16;
5876 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
5877 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
5878 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
5879 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
5880 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
5881 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
5882 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
5883 w2
[2] = append1
[3] >> 16 | append2
[0] << 16;
5884 w2
[3] = append2
[0] >> 16;
5888 w0
[2] = w0
[2] | append0
[0] << 24;
5889 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
5890 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
5891 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
5892 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
5893 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
5894 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
5895 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
5896 w2
[2] = append1
[3] >> 8 | append2
[0] << 24;
5897 w2
[3] = append2
[0] >> 8;
/* Merge starts inside w0[3] and ends in w3[0]; groups for shifts of 8, 16 and 24 bits. */
5913 w0
[3] = w0
[3] | append0
[0] << 8;
5914 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
5915 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
5916 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
5917 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
5918 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
5919 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
5920 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
5921 w2
[3] = append1
[3] >> 24 | append2
[0] << 8;
5922 w3
[0] = append2
[0] >> 24;
5926 w0
[3] = w0
[3] | append0
[0] << 16;
5927 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
5928 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
5929 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
5930 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
5931 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
5932 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
5933 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
5934 w2
[3] = append1
[3] >> 16 | append2
[0] << 16;
5935 w3
[0] = append2
[0] >> 16;
5939 w0
[3] = w0
[3] | append0
[0] << 24;
5940 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
5941 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
5942 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
5943 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
5944 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
5945 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
5946 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
5947 w2
[3] = append1
[3] >> 8 | append2
[0] << 24;
5948 w3
[0] = append2
[0] >> 8;
/*
 * memcat32_8: merge eight 32-bit words (append0[0..3] and append1[0..3]) into
 * the buffer w0..w3 at a position that is not word aligned.
 *
 * Parameters:
 *   w0, w1, w2, w3    buffer words, modified in place
 *   append0, append1  read-only words to merge in
 *   offset            not referenced by any visible statement
 *
 * Every visible group follows the same pattern: the first destination word is
 * OR-ed with append0[0] shifted left by 8, 16 or 24 bits; each following word
 * is assigned the bits carried out of the previous append word (shifted right
 * by 24, 16 or 8) combined with the next append word shifted left by the same
 * amount. Only the first destination word keeps its previous content.
 *
 * In the visible statements no group writes beyond w3[1]; the groups that
 * start in w1[2] and w1[3] therefore do not store all bits of append1.
 *
 * NOTE(review): the braces and the logic that selects one group from `offset`
 * are not visible in this extract - presumably a switch with one case per
 * byte position; confirm against the full file.
 * NOTE(review): w3[2] and w3[3] look intentionally untouched - confirm with callers.
 */
5953 __device__
static void memcat32_8 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 offset
)
/* Merge starts inside w0[0] and ends in w2[0]; groups for shifts of 8, 16 and 24 bits. */
5969 w0
[0] = w0
[0] | append0
[0] << 8;
5970 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
5971 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
5972 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
5973 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
5974 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
5975 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
5976 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
5977 w2
[0] = append1
[3] >> 24;
5981 w0
[0] = w0
[0] | append0
[0] << 16;
5982 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
5983 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
5984 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
5985 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
5986 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
5987 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
5988 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
5989 w2
[0] = append1
[3] >> 16;
5993 w0
[0] = w0
[0] | append0
[0] << 24;
5994 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
5995 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
5996 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
5997 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
5998 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
5999 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
6000 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
6001 w2
[0] = append1
[3] >> 8;
/* Merge starts inside w0[1] and ends in w2[1]; groups for shifts of 8, 16 and 24 bits. */
6016 w0
[1] = w0
[1] | append0
[0] << 8;
6017 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
6018 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
6019 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
6020 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
6021 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
6022 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
6023 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
6024 w2
[1] = append1
[3] >> 24;
6028 w0
[1] = w0
[1] | append0
[0] << 16;
6029 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
6030 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
6031 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
6032 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
6033 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
6034 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
6035 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
6036 w2
[1] = append1
[3] >> 16;
6040 w0
[1] = w0
[1] | append0
[0] << 24;
6041 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
6042 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
6043 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
6044 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
6045 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
6046 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
6047 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
6048 w2
[1] = append1
[3] >> 8;
/* Merge starts inside w0[2] and ends in w2[2]; groups for shifts of 8, 16 and 24 bits. */
6063 w0
[2] = w0
[2] | append0
[0] << 8;
6064 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
6065 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
6066 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
6067 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
6068 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
6069 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
6070 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
6071 w2
[2] = append1
[3] >> 24;
6075 w0
[2] = w0
[2] | append0
[0] << 16;
6076 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
6077 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
6078 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
6079 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
6080 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
6081 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
6082 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
6083 w2
[2] = append1
[3] >> 16;
6087 w0
[2] = w0
[2] | append0
[0] << 24;
6088 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
6089 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
6090 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
6091 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
6092 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
6093 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
6094 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
6095 w2
[2] = append1
[3] >> 8;
/* Merge starts inside w0[3] and ends in w2[3]; groups for shifts of 8, 16 and 24 bits. */
6110 w0
[3] = w0
[3] | append0
[0] << 8;
6111 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
6112 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
6113 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
6114 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
6115 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
6116 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
6117 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
6118 w2
[3] = append1
[3] >> 24;
6122 w0
[3] = w0
[3] | append0
[0] << 16;
6123 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
6124 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
6125 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
6126 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
6127 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
6128 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
6129 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
6130 w2
[3] = append1
[3] >> 16;
6134 w0
[3] = w0
[3] | append0
[0] << 24;
6135 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
6136 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
6137 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
6138 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
6139 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
6140 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
6141 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
6142 w2
[3] = append1
[3] >> 8;
/* Merge starts inside w1[0] and ends in w3[0]; w0 is not written by these groups. */
6157 w1
[0] = w1
[0] | append0
[0] << 8;
6158 w1
[1] = append0
[0] >> 24 | append0
[1] << 8;
6159 w1
[2] = append0
[1] >> 24 | append0
[2] << 8;
6160 w1
[3] = append0
[2] >> 24 | append0
[3] << 8;
6161 w2
[0] = append0
[3] >> 24 | append1
[0] << 8;
6162 w2
[1] = append1
[0] >> 24 | append1
[1] << 8;
6163 w2
[2] = append1
[1] >> 24 | append1
[2] << 8;
6164 w2
[3] = append1
[2] >> 24 | append1
[3] << 8;
6165 w3
[0] = append1
[3] >> 24;
6169 w1
[0] = w1
[0] | append0
[0] << 16;
6170 w1
[1] = append0
[0] >> 16 | append0
[1] << 16;
6171 w1
[2] = append0
[1] >> 16 | append0
[2] << 16;
6172 w1
[3] = append0
[2] >> 16 | append0
[3] << 16;
6173 w2
[0] = append0
[3] >> 16 | append1
[0] << 16;
6174 w2
[1] = append1
[0] >> 16 | append1
[1] << 16;
6175 w2
[2] = append1
[1] >> 16 | append1
[2] << 16;
6176 w2
[3] = append1
[2] >> 16 | append1
[3] << 16;
6177 w3
[0] = append1
[3] >> 16;
6181 w1
[0] = w1
[0] | append0
[0] << 24;
6182 w1
[1] = append0
[0] >> 8 | append0
[1] << 24;
6183 w1
[2] = append0
[1] >> 8 | append0
[2] << 24;
6184 w1
[3] = append0
[2] >> 8 | append0
[3] << 24;
6185 w2
[0] = append0
[3] >> 8 | append1
[0] << 24;
6186 w2
[1] = append1
[0] >> 8 | append1
[1] << 24;
6187 w2
[2] = append1
[1] >> 8 | append1
[2] << 24;
6188 w2
[3] = append1
[2] >> 8 | append1
[3] << 24;
6189 w3
[0] = append1
[3] >> 8;
/* Merge starts inside w1[1] and ends in w3[1]. */
6204 w1
[1] = w1
[1] | append0
[0] << 8;
6205 w1
[2] = append0
[0] >> 24 | append0
[1] << 8;
6206 w1
[3] = append0
[1] >> 24 | append0
[2] << 8;
6207 w2
[0] = append0
[2] >> 24 | append0
[3] << 8;
6208 w2
[1] = append0
[3] >> 24 | append1
[0] << 8;
6209 w2
[2] = append1
[0] >> 24 | append1
[1] << 8;
6210 w2
[3] = append1
[1] >> 24 | append1
[2] << 8;
6211 w3
[0] = append1
[2] >> 24 | append1
[3] << 8;
6212 w3
[1] = append1
[3] >> 24;
6216 w1
[1] = w1
[1] | append0
[0] << 16;
6217 w1
[2] = append0
[0] >> 16 | append0
[1] << 16;
6218 w1
[3] = append0
[1] >> 16 | append0
[2] << 16;
6219 w2
[0] = append0
[2] >> 16 | append0
[3] << 16;
6220 w2
[1] = append0
[3] >> 16 | append1
[0] << 16;
6221 w2
[2] = append1
[0] >> 16 | append1
[1] << 16;
6222 w2
[3] = append1
[1] >> 16 | append1
[2] << 16;
6223 w3
[0] = append1
[2] >> 16 | append1
[3] << 16;
6224 w3
[1] = append1
[3] >> 16;
6228 w1
[1] = w1
[1] | append0
[0] << 24;
6229 w1
[2] = append0
[0] >> 8 | append0
[1] << 24;
6230 w1
[3] = append0
[1] >> 8 | append0
[2] << 24;
6231 w2
[0] = append0
[2] >> 8 | append0
[3] << 24;
6232 w2
[1] = append0
[3] >> 8 | append1
[0] << 24;
6233 w2
[2] = append1
[0] >> 8 | append1
[1] << 24;
6234 w2
[3] = append1
[1] >> 8 | append1
[2] << 24;
6235 w3
[0] = append1
[2] >> 8 | append1
[3] << 24;
6236 w3
[1] = append1
[3] >> 8;
/* Merge starts inside w1[2]; the visible statements stop at w3[1], so the bits carried out of append1[3] are not stored. */
6251 w1
[2] = w1
[2] | append0
[0] << 8;
6252 w1
[3] = append0
[0] >> 24 | append0
[1] << 8;
6253 w2
[0] = append0
[1] >> 24 | append0
[2] << 8;
6254 w2
[1] = append0
[2] >> 24 | append0
[3] << 8;
6255 w2
[2] = append0
[3] >> 24 | append1
[0] << 8;
6256 w2
[3] = append1
[0] >> 24 | append1
[1] << 8;
6257 w3
[0] = append1
[1] >> 24 | append1
[2] << 8;
6258 w3
[1] = append1
[2] >> 24 | append1
[3] << 8;
6262 w1
[2] = w1
[2] | append0
[0] << 16;
6263 w1
[3] = append0
[0] >> 16 | append0
[1] << 16;
6264 w2
[0] = append0
[1] >> 16 | append0
[2] << 16;
6265 w2
[1] = append0
[2] >> 16 | append0
[3] << 16;
6266 w2
[2] = append0
[3] >> 16 | append1
[0] << 16;
6267 w2
[3] = append1
[0] >> 16 | append1
[1] << 16;
6268 w3
[0] = append1
[1] >> 16 | append1
[2] << 16;
6269 w3
[1] = append1
[2] >> 16 | append1
[3] << 16;
6273 w1
[2] = w1
[2] | append0
[0] << 24;
6274 w1
[3] = append0
[0] >> 8 | append0
[1] << 24;
6275 w2
[0] = append0
[1] >> 8 | append0
[2] << 24;
6276 w2
[1] = append0
[2] >> 8 | append0
[3] << 24;
6277 w2
[2] = append0
[3] >> 8 | append1
[0] << 24;
6278 w2
[3] = append1
[0] >> 8 | append1
[1] << 24;
6279 w3
[0] = append1
[1] >> 8 | append1
[2] << 24;
6280 w3
[1] = append1
[2] >> 8 | append1
[3] << 24;
/* Merge starts inside w1[3]; the visible statements stop at w3[1] and never read append1[3]. */
6294 w1
[3] = w1
[3] | append0
[0] << 8;
6295 w2
[0] = append0
[0] >> 24 | append0
[1] << 8;
6296 w2
[1] = append0
[1] >> 24 | append0
[2] << 8;
6297 w2
[2] = append0
[2] >> 24 | append0
[3] << 8;
6298 w2
[3] = append0
[3] >> 24 | append1
[0] << 8;
6299 w3
[0] = append1
[0] >> 24 | append1
[1] << 8;
6300 w3
[1] = append1
[1] >> 24 | append1
[2] << 8;
6304 w1
[3] = w1
[3] | append0
[0] << 16;
6305 w2
[0] = append0
[0] >> 16 | append0
[1] << 16;
6306 w2
[1] = append0
[1] >> 16 | append0
[2] << 16;
6307 w2
[2] = append0
[2] >> 16 | append0
[3] << 16;
6308 w2
[3] = append0
[3] >> 16 | append1
[0] << 16;
6309 w3
[0] = append1
[0] >> 16 | append1
[1] << 16;
6310 w3
[1] = append1
[1] >> 16 | append1
[2] << 16;
6314 w1
[3] = w1
[3] | append0
[0] << 24;
6315 w2
[0] = append0
[0] >> 8 | append0
[1] << 24;
6316 w2
[1] = append0
[1] >> 8 | append0
[2] << 24;
6317 w2
[2] = append0
[2] >> 8 | append0
[3] << 24;
6318 w2
[3] = append0
[3] >> 8 | append1
[0] << 24;
6319 w3
[0] = append1
[0] >> 8 | append1
[1] << 24;
6320 w3
[1] = append1
[1] >> 8 | append1
[2] << 24;
/*
 * memcat32_9: merge nine 32-bit words (append0[0..3], append1[0..3] and
 * append2[0]) into the buffer w0..w3 at a position that is not word aligned.
 *
 * Parameters:
 *   w0, w1, w2, w3             buffer words, modified in place
 *   append0, append1, append2  read-only words to merge in
 *   offset                     not referenced by any visible statement
 *
 * Every visible group follows the same pattern: the first destination word is
 * OR-ed with append0[0] shifted left by 8, 16 or 24 bits; each following word
 * is assigned the bits carried out of the previous append word (shifted right
 * by 24, 16 or 8) combined with the next append word shifted left by the same
 * amount. Only the first destination word keeps its previous content.
 *
 * In the visible statements no group writes beyond w3[1]; the groups that
 * start in w1[1] or later therefore do not store all of the append data.
 *
 * NOTE(review): the braces and the logic that selects one group from `offset`
 * are not visible in this extract - presumably a switch with one case per
 * byte position; confirm against the full file.
 * NOTE(review): w3[2] and w3[3] look intentionally untouched - confirm with callers.
 */
6334 __device__
static void memcat32_9 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 append2
[4], const u32 offset
)
/* Merge starts inside w0[0] and ends in w2[1]; groups for shifts of 8, 16 and 24 bits. */
6351 w0
[0] = w0
[0] | append0
[0] << 8;
6352 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
6353 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
6354 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
6355 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
6356 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
6357 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
6358 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
6359 w2
[0] = append1
[3] >> 24 | append2
[0] << 8;
6360 w2
[1] = append2
[0] >> 24;
6364 w0
[0] = w0
[0] | append0
[0] << 16;
6365 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
6366 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
6367 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
6368 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
6369 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
6370 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
6371 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
6372 w2
[0] = append1
[3] >> 16 | append2
[0] << 16;
6373 w2
[1] = append2
[0] >> 16;
6377 w0
[0] = w0
[0] | append0
[0] << 24;
6378 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
6379 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
6380 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
6381 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
6382 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
6383 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
6384 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
6385 w2
[0] = append1
[3] >> 8 | append2
[0] << 24;
6386 w2
[1] = append2
[0] >> 8;
/* Merge starts inside w0[1] and ends in w2[2]; groups for shifts of 8, 16 and 24 bits. */
6402 w0
[1] = w0
[1] | append0
[0] << 8;
6403 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
6404 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
6405 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
6406 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
6407 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
6408 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
6409 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
6410 w2
[1] = append1
[3] >> 24 | append2
[0] << 8;
6411 w2
[2] = append2
[0] >> 24;
6415 w0
[1] = w0
[1] | append0
[0] << 16;
6416 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
6417 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
6418 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
6419 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
6420 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
6421 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
6422 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
6423 w2
[1] = append1
[3] >> 16 | append2
[0] << 16;
6424 w2
[2] = append2
[0] >> 16;
6428 w0
[1] = w0
[1] | append0
[0] << 24;
6429 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
6430 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
6431 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
6432 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
6433 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
6434 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
6435 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
6436 w2
[1] = append1
[3] >> 8 | append2
[0] << 24;
6437 w2
[2] = append2
[0] >> 8;
/* Merge starts inside w0[2] and ends in w2[3]; groups for shifts of 8, 16 and 24 bits. */
6453 w0
[2] = w0
[2] | append0
[0] << 8;
6454 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
6455 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
6456 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
6457 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
6458 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
6459 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
6460 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
6461 w2
[2] = append1
[3] >> 24 | append2
[0] << 8;
6462 w2
[3] = append2
[0] >> 24;
6466 w0
[2] = w0
[2] | append0
[0] << 16;
6467 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
6468 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
6469 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
6470 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
6471 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
6472 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
6473 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
6474 w2
[2] = append1
[3] >> 16 | append2
[0] << 16;
6475 w2
[3] = append2
[0] >> 16;
6479 w0
[2] = w0
[2] | append0
[0] << 24;
6480 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
6481 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
6482 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
6483 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
6484 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
6485 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
6486 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
6487 w2
[2] = append1
[3] >> 8 | append2
[0] << 24;
6488 w2
[3] = append2
[0] >> 8;
/* Merge starts inside w0[3] and ends in w3[0]; groups for shifts of 8, 16 and 24 bits. */
6504 w0
[3] = w0
[3] | append0
[0] << 8;
6505 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
6506 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
6507 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
6508 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
6509 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
6510 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
6511 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
6512 w2
[3] = append1
[3] >> 24 | append2
[0] << 8;
6513 w3
[0] = append2
[0] >> 24;
6517 w0
[3] = w0
[3] | append0
[0] << 16;
6518 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
6519 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
6520 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
6521 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
6522 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
6523 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
6524 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
6525 w2
[3] = append1
[3] >> 16 | append2
[0] << 16;
6526 w3
[0] = append2
[0] >> 16;
6530 w0
[3] = w0
[3] | append0
[0] << 24;
6531 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
6532 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
6533 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
6534 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
6535 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
6536 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
6537 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
6538 w2
[3] = append1
[3] >> 8 | append2
[0] << 24;
6539 w3
[0] = append2
[0] >> 8;
/* Merge starts inside w1[0] and ends in w3[1]; w0 is not written by these groups. */
6555 w1
[0] = w1
[0] | append0
[0] << 8;
6556 w1
[1] = append0
[0] >> 24 | append0
[1] << 8;
6557 w1
[2] = append0
[1] >> 24 | append0
[2] << 8;
6558 w1
[3] = append0
[2] >> 24 | append0
[3] << 8;
6559 w2
[0] = append0
[3] >> 24 | append1
[0] << 8;
6560 w2
[1] = append1
[0] >> 24 | append1
[1] << 8;
6561 w2
[2] = append1
[1] >> 24 | append1
[2] << 8;
6562 w2
[3] = append1
[2] >> 24 | append1
[3] << 8;
6563 w3
[0] = append1
[3] >> 24 | append2
[0] << 8;
6564 w3
[1] = append2
[0] >> 24;
6568 w1
[0] = w1
[0] | append0
[0] << 16;
6569 w1
[1] = append0
[0] >> 16 | append0
[1] << 16;
6570 w1
[2] = append0
[1] >> 16 | append0
[2] << 16;
6571 w1
[3] = append0
[2] >> 16 | append0
[3] << 16;
6572 w2
[0] = append0
[3] >> 16 | append1
[0] << 16;
6573 w2
[1] = append1
[0] >> 16 | append1
[1] << 16;
6574 w2
[2] = append1
[1] >> 16 | append1
[2] << 16;
6575 w2
[3] = append1
[2] >> 16 | append1
[3] << 16;
6576 w3
[0] = append1
[3] >> 16 | append2
[0] << 16;
6577 w3
[1] = append2
[0] >> 16;
6581 w1
[0] = w1
[0] | append0
[0] << 24;
6582 w1
[1] = append0
[0] >> 8 | append0
[1] << 24;
6583 w1
[2] = append0
[1] >> 8 | append0
[2] << 24;
6584 w1
[3] = append0
[2] >> 8 | append0
[3] << 24;
6585 w2
[0] = append0
[3] >> 8 | append1
[0] << 24;
6586 w2
[1] = append1
[0] >> 8 | append1
[1] << 24;
6587 w2
[2] = append1
[1] >> 8 | append1
[2] << 24;
6588 w2
[3] = append1
[2] >> 8 | append1
[3] << 24;
6589 w3
[0] = append1
[3] >> 8 | append2
[0] << 24;
6590 w3
[1] = append2
[0] >> 8;
/* Merge starts inside w1[1]; the visible statements stop at w3[1], so the bits carried out of append2[0] are not stored. */
6606 w1
[1] = w1
[1] | append0
[0] << 8;
6607 w1
[2] = append0
[0] >> 24 | append0
[1] << 8;
6608 w1
[3] = append0
[1] >> 24 | append0
[2] << 8;
6609 w2
[0] = append0
[2] >> 24 | append0
[3] << 8;
6610 w2
[1] = append0
[3] >> 24 | append1
[0] << 8;
6611 w2
[2] = append1
[0] >> 24 | append1
[1] << 8;
6612 w2
[3] = append1
[1] >> 24 | append1
[2] << 8;
6613 w3
[0] = append1
[2] >> 24 | append1
[3] << 8;
6614 w3
[1] = append1
[3] >> 24 | append2
[0] << 8;
6618 w1
[1] = w1
[1] | append0
[0] << 16;
6619 w1
[2] = append0
[0] >> 16 | append0
[1] << 16;
6620 w1
[3] = append0
[1] >> 16 | append0
[2] << 16;
6621 w2
[0] = append0
[2] >> 16 | append0
[3] << 16;
6622 w2
[1] = append0
[3] >> 16 | append1
[0] << 16;
6623 w2
[2] = append1
[0] >> 16 | append1
[1] << 16;
6624 w2
[3] = append1
[1] >> 16 | append1
[2] << 16;
6625 w3
[0] = append1
[2] >> 16 | append1
[3] << 16;
6626 w3
[1] = append1
[3] >> 16 | append2
[0] << 16;
6630 w1
[1] = w1
[1] | append0
[0] << 24;
6631 w1
[2] = append0
[0] >> 8 | append0
[1] << 24;
6632 w1
[3] = append0
[1] >> 8 | append0
[2] << 24;
6633 w2
[0] = append0
[2] >> 8 | append0
[3] << 24;
6634 w2
[1] = append0
[3] >> 8 | append1
[0] << 24;
6635 w2
[2] = append1
[0] >> 8 | append1
[1] << 24;
6636 w2
[3] = append1
[1] >> 8 | append1
[2] << 24;
6637 w3
[0] = append1
[2] >> 8 | append1
[3] << 24;
6638 w3
[1] = append1
[3] >> 8 | append2
[0] << 24;
/* Merge starts inside w1[2]; the visible statements stop at w3[1] and never read append2. */
6653 w1
[2] = w1
[2] | append0
[0] << 8;
6654 w1
[3] = append0
[0] >> 24 | append0
[1] << 8;
6655 w2
[0] = append0
[1] >> 24 | append0
[2] << 8;
6656 w2
[1] = append0
[2] >> 24 | append0
[3] << 8;
6657 w2
[2] = append0
[3] >> 24 | append1
[0] << 8;
6658 w2
[3] = append1
[0] >> 24 | append1
[1] << 8;
6659 w3
[0] = append1
[1] >> 24 | append1
[2] << 8;
6660 w3
[1] = append1
[2] >> 24 | append1
[3] << 8;
6664 w1
[2] = w1
[2] | append0
[0] << 16;
6665 w1
[3] = append0
[0] >> 16 | append0
[1] << 16;
6666 w2
[0] = append0
[1] >> 16 | append0
[2] << 16;
6667 w2
[1] = append0
[2] >> 16 | append0
[3] << 16;
6668 w2
[2] = append0
[3] >> 16 | append1
[0] << 16;
6669 w2
[3] = append1
[0] >> 16 | append1
[1] << 16;
6670 w3
[0] = append1
[1] >> 16 | append1
[2] << 16;
6671 w3
[1] = append1
[2] >> 16 | append1
[3] << 16;
6675 w1
[2] = w1
[2] | append0
[0] << 24;
6676 w1
[3] = append0
[0] >> 8 | append0
[1] << 24;
6677 w2
[0] = append0
[1] >> 8 | append0
[2] << 24;
6678 w2
[1] = append0
[2] >> 8 | append0
[3] << 24;
6679 w2
[2] = append0
[3] >> 8 | append1
[0] << 24;
6680 w2
[3] = append1
[0] >> 8 | append1
[1] << 24;
6681 w3
[0] = append1
[1] >> 8 | append1
[2] << 24;
6682 w3
[1] = append1
[2] >> 8 | append1
[3] << 24;
/* Merge starts inside w1[3]; the visible statements stop at w3[1] and never read append1[3] or append2. */
6696 w1
[3] = w1
[3] | append0
[0] << 8;
6697 w2
[0] = append0
[0] >> 24 | append0
[1] << 8;
6698 w2
[1] = append0
[1] >> 24 | append0
[2] << 8;
6699 w2
[2] = append0
[2] >> 24 | append0
[3] << 8;
6700 w2
[3] = append0
[3] >> 24 | append1
[0] << 8;
6701 w3
[0] = append1
[0] >> 24 | append1
[1] << 8;
6702 w3
[1] = append1
[1] >> 24 | append1
[2] << 8;
6706 w1
[3] = w1
[3] | append0
[0] << 16;
6707 w2
[0] = append0
[0] >> 16 | append0
[1] << 16;
6708 w2
[1] = append0
[1] >> 16 | append0
[2] << 16;
6709 w2
[2] = append0
[2] >> 16 | append0
[3] << 16;
6710 w2
[3] = append0
[3] >> 16 | append1
[0] << 16;
6711 w3
[0] = append1
[0] >> 16 | append1
[1] << 16;
6712 w3
[1] = append1
[1] >> 16 | append1
[2] << 16;
6716 w1
[3] = w1
[3] | append0
[0] << 24;
6717 w2
[0] = append0
[0] >> 8 | append0
[1] << 24;
6718 w2
[1] = append0
[1] >> 8 | append0
[2] << 24;
6719 w2
[2] = append0
[2] >> 8 | append0
[3] << 24;
6720 w2
[3] = append0
[3] >> 8 | append1
[0] << 24;
6721 w3
[0] = append1
[0] >> 8 | append1
[1] << 24;
6722 w3
[1] = append1
[1] >> 8 | append1
[2] << 24;
/*
 * switch_buffer_by_offset: move the contents of the buffer w0..w3 towards
 * higher word indices, in place.
 *
 * Parameters:
 *   w0, w1, w2, w3  buffer words, modified in place
 *   offset          only offset % 4 is used by the visible statements, to
 *                   build the __byte_perm selector
 *
 * Two implementations are visible: one built on __byte_perm() that is guarded
 * by __CUDA_ARCH__ >= 200, and one built on plain shifts into tmp0/tmp1/tmp2.
 *
 * NOTE(review): braces, case labels, the declarations of tmp0..tmp2 and the
 * closing preprocessor directive are not visible in this extract - confirm
 * against the full file before relying on this description.
 */
6736 __device__
static void switch_buffer_by_offset (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6738 #if __CUDA_ARCH__ >= 200
/* offset % 4 yields 0..3, so offset_minus_4 is in 1..4 and the selector is
   four nibbles cut out of 0x76543210 (0x7654 when offset % 4 is 0). */
6740 const int offset_minus_4
= 4 - (offset
% 4);
6742 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
/* Each group below rebuilds the buffer from w3[1] downwards, so every source
   word is read before the same group overwrites it. Each destination word is
   taken from a pair of neighbouring source words, with a literal 0 standing
   in for the word below w0[0]. From one group to the next the source words
   sit one index further below their destination.
   NOTE(review): presumably one group per value of offset / 4 - confirm. */
6747 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
6748 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
6749 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
6750 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
6751 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
6752 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
6753 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
6754 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
6755 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
6756 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
6757 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
6758 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
6759 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
6760 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
6765 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
6766 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
6767 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
6768 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
6769 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
6770 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
6771 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
6772 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
6773 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
6774 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
6775 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
6776 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
6777 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
6783 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
6784 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
6785 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
6786 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
6787 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
6788 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
6789 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
6790 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
6791 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
6792 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
6793 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
6794 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
6801 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
6802 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
6803 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
6804 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
6805 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
6806 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
6807 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
6808 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
6809 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
6810 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
6811 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
6819 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
6820 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
6821 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
6822 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
6823 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
6824 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
6825 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
6826 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
6827 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
6828 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
6837 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
6838 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
6839 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
6840 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
6841 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
6842 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
6843 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
6844 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
6845 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
6855 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
6856 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
6857 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
6858 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
6859 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
6860 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
6861 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
6862 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
6873 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
6874 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
6875 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
6876 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
6877 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
6878 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
6879 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
6891 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
6892 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
6893 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
6894 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
6895 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
6896 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
6909 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
6910 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
6911 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
6912 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
6913 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
6927 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
6928 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
6929 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
6930 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
6945 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
6946 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
6947 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
6963 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
6964 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
6981 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
/* Shift-based variant: w0 and w1 are shifted left by 8, 16 or 24 bits into
   tmp0, tmp1 and tmp2[0], with the bits shifted out of each word carried
   into the next one. w2 and w3 are not read by these statements.
   NOTE(review): presumably the fallback for __CUDA_ARCH__ < 200; the code
   that copies tmp0..tmp2 back into w0..w3 is not visible here - confirm. */
7020 tmp0
[0] = w0
[0] << 8;
7021 tmp0
[1] = w0
[0] >> 24 | w0
[1] << 8;
7022 tmp0
[2] = w0
[1] >> 24 | w0
[2] << 8;
7023 tmp0
[3] = w0
[2] >> 24 | w0
[3] << 8;
7024 tmp1
[0] = w0
[3] >> 24 | w1
[0] << 8;
7025 tmp1
[1] = w1
[0] >> 24 | w1
[1] << 8;
7026 tmp1
[2] = w1
[1] >> 24 | w1
[2] << 8;
7027 tmp1
[3] = w1
[2] >> 24 | w1
[3] << 8;
7028 tmp2
[0] = w1
[3] >> 24;
7032 tmp0
[0] = w0
[0] << 16;
7033 tmp0
[1] = w0
[0] >> 16 | w0
[1] << 16;
7034 tmp0
[2] = w0
[1] >> 16 | w0
[2] << 16;
7035 tmp0
[3] = w0
[2] >> 16 | w0
[3] << 16;
7036 tmp1
[0] = w0
[3] >> 16 | w1
[0] << 16;
7037 tmp1
[1] = w1
[0] >> 16 | w1
[1] << 16;
7038 tmp1
[2] = w1
[1] >> 16 | w1
[2] << 16;
7039 tmp1
[3] = w1
[2] >> 16 | w1
[3] << 16;
7040 tmp2
[0] = w1
[3] >> 16;
7044 tmp0
[0] = w0
[0] << 24;
7045 tmp0
[1] = w0
[0] >> 8 | w0
[1] << 24;
7046 tmp0
[2] = w0
[1] >> 8 | w0
[2] << 24;
7047 tmp0
[3] = w0
[2] >> 8 | w0
[3] << 24;
7048 tmp1
[0] = w0
[3] >> 8 | w1
[0] << 24;
7049 tmp1
[1] = w1
[0] >> 8 | w1
[1] << 24;
7050 tmp1
[2] = w1
[1] >> 8 | w1
[2] << 24;
7051 tmp1
[3] = w1
[2] >> 8 | w1
[3] << 24;
7052 tmp2
[0] = w1
[3] >> 8;
// switch_buffer_by_offset_be: rewrites the 32-bit words of w0..w3 in place.
// Every visible statement builds one destination word with __byte_perm from
// two source words (or from one source word and the constant 0).
//
// NOTE(review): the statements come in 14 groups of 14, 13, ... 1 stores. In
// each successive group the sources sit one word further below the
// destination. They look like the arms of a switch on the whole-word part of
// `offset`, but the switch/case/break lines, and any clearing of the words
// below the lowest destination, are not visible in this excerpt - confirm
// against the full file.
// No visible statement writes w3[2] or w3[3].
7301 __device__
static void switch_buffer_by_offset_be (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
// selector is 0x3210, 0x4321, 0x5432 or 0x6543 for (offset & 3) == 0, 1, 2, 3.
// With __byte_perm (a, b, selector) that yields a unchanged, or a shifted right
// by 1, 2 or 3 bytes with the low bytes of b filling the vacated high bytes.
7303 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
// Group 1: first operand is the destination word itself, second operand the
// word below it; the lowest word written is w0[0].
7308 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
7309 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
7310 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
7311 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
7312 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
7313 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
7314 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
7315 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
7316 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
7317 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
7318 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
7319 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
7320 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7321 w0
[0] = __byte_perm (w0
[0], 0, selector
);
// Group 2: sources start 1 word below the destination; lowest word written is w0[1].
7325 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
7326 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
7327 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
7328 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
7329 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
7330 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
7331 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
7332 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
7333 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
7334 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
7335 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
7336 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
7337 w0
[1] = __byte_perm (w0
[0], 0, selector
);
// Group 3: sources start 2 words below the destination; lowest word written is w0[2].
7342 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
7343 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
7344 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
7345 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
7346 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
7347 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
7348 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
7349 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
7350 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
7351 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
7352 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
7353 w0
[2] = __byte_perm (w0
[0], 0, selector
);
// Group 4: sources start 3 words below the destination; lowest word written is w0[3].
7359 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
7360 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
7361 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
7362 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
7363 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
7364 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
7365 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
7366 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
7367 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
7368 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
7369 w0
[3] = __byte_perm (w0
[0], 0, selector
);
// Group 5: sources start 4 words below the destination; lowest word written is w1[0].
7376 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
7377 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
7378 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
7379 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
7380 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
7381 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
7382 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
7383 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
7384 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7385 w1
[0] = __byte_perm (w0
[0], 0, selector
);
// Group 6: sources start 5 words below the destination; lowest word written is w1[1].
7393 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
7394 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
7395 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
7396 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
7397 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
7398 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
7399 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
7400 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
7401 w1
[1] = __byte_perm (w0
[0], 0, selector
);
// Group 7: sources start 6 words below the destination; lowest word written is w1[2].
7410 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
7411 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
7412 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
7413 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
7414 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
7415 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
7416 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
7417 w1
[2] = __byte_perm (w0
[0], 0, selector
);
// Group 8: sources start 7 words below the destination; lowest word written is w1[3].
7427 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
7428 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
7429 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
7430 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
7431 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
7432 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
7433 w1
[3] = __byte_perm (w0
[0], 0, selector
);
// Group 9: sources start 8 words below the destination; lowest word written is w2[0].
7444 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
7445 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
7446 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
7447 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
7448 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7449 w2
[0] = __byte_perm (w0
[0], 0, selector
);
// Group 10: sources start 9 words below the destination; lowest word written is w2[1].
7461 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
7462 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
7463 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
7464 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
7465 w2
[1] = __byte_perm (w0
[0], 0, selector
);
// Group 11: sources start 10 words below the destination; lowest word written is w2[2].
7478 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
7479 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
7480 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
7481 w2
[2] = __byte_perm (w0
[0], 0, selector
);
// Group 12: sources start 11 words below the destination; lowest word written is w2[3].
7495 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
7496 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
7497 w2
[3] = __byte_perm (w0
[0], 0, selector
);
// Group 13: sources start 12 words below the destination; lowest word written is w3[0].
7512 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7513 w3
[0] = __byte_perm (w0
[0], 0, selector
);
// Group 14: only w3[1] is written, from w0[0] and 0.
7529 w3
[1] = __byte_perm (w0
[0], 0, selector
);
// swap_workaround (32-bit): returns v with its four bytes in reversed order.
// Targets with __CUDA_ARCH__ >= 200 use a single __byte_perm; everything else
// takes the shift-and-mask path.
__device__
static u32x swap_workaround (const u32x v)
{
  #if __CUDA_ARCH__ < 200

  // Move every byte into its mirrored lane. The four lanes do not overlap,
  // so adding them simply merges them.
  const u32x lane3 = v << 24;
  const u32x lane2 = (v & 0x0000FF00) << 8;
  const u32x lane1 = (v & 0x00FF0000) >> 8;
  const u32x lane0 = v >> 24;

  return lane3 + lane2 + lane1 + lane0;

  #else

  // Selector 0x0123 picks source bytes 3, 2, 1, 0 for result bytes 0, 1, 2, 3.
  return __byte_perm (v, 0, 0x0123);

  #endif
}
// swap_workaround (64-bit): returns v with its eight bytes in reversed order.
__device__
static u64x swap_workaround (const u64x v)
{
  // The four most significant bytes move down into the low half ...
  const u64x moved_down = ((v & 0xff00000000000000) >> 56)
                        | ((v & 0x00ff000000000000) >> 40)
                        | ((v & 0x0000ff0000000000) >> 24)
                        | ((v & 0x000000ff00000000) >>  8);

  // ... and the four least significant bytes move up into the high half.
  const u64x moved_up = ((v & 0x00000000ff000000) <<  8)
                      | ((v & 0x0000000000ff0000) << 24)
                      | ((v & 0x000000000000ff00) << 40)
                      | ((v & 0x00000000000000ff) << 56);

  return (moved_down | moved_up);
}
// truncate_block: masks one word of w so that only its low 1, 2 or 3 bytes
// survive. In the visible arms, len 1..3 mask w[0], len 5..7 mask w[1],
// len 9..11 mask w[2] and len 13..15 mask w[3]. That is, word len / 4 keeps
// its low len % 4 bytes.
//
// NOTE(review): the enclosing switch, the arms for len 0, 4, 8 and 12, and
// whatever happens to the words above the masked one are not visible in this
// excerpt - confirm against the full file.
7575 __device__
static void truncate_block (u32x w
[4], const u32 len
)
7584 case 1: w
[0] &= 0x000000FF;
7589 case 2: w
[0] &= 0x0000FFFF;
7594 case 3: w
[0] &= 0x00FFFFFF;
7603 case 5: w
[1] &= 0x000000FF;
7607 case 6: w
[1] &= 0x0000FFFF;
7611 case 7: w
[1] &= 0x00FFFFFF;
7618 case 9: w
[2] &= 0x000000FF;
7621 case 10: w
[2] &= 0x0000FFFF;
7624 case 11: w
[2] &= 0x00FFFFFF;
7629 case 13: w
[3] &= 0x000000FF;
7631 case 14: w
[3] &= 0x0000FFFF;
7633 case 15: w
[3] &= 0x00FFFFFF;
// Per-word expanders used only by make_unicode below. Each takes one 32-bit
// input word and returns two of its bytes, each in the low byte of its own
// 16-bit half (the high byte of each half is zero).
//   LO: input bytes 0 and 1        HI: input bytes 2 and 3
#if __CUDA_ARCH__ >= 200
#define MAKE_UNICODE_LO(x) __byte_perm ((x), 0, 0x7170)
#define MAKE_UNICODE_HI(x) __byte_perm ((x), 0, 0x7372)
#else
#define MAKE_UNICODE_LO(x) ((((x) << 8) & 0x00FF0000) | (((x) >>  0) & 0x000000FF))
#define MAKE_UNICODE_HI(x) ((((x) >> 8) & 0x00FF0000) | (((x) >> 16) & 0x000000FF))
#endif

// make_unicode: widens the 16 bytes of in[0..3] to 32 bytes by giving every
// input byte its own 16-bit slot. in[0] and in[1] fill out1[0..3]; in[2] and
// in[3] fill out2[0..3].
//
// Stores run from out2[3] down to out1[0]. In that order every in[] word has
// been read for the last time before the out1 word with the same index is
// stored, so keep the order if out1 may be the same array as in.
__device__
static void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
{
  out2[3] = MAKE_UNICODE_HI (in[3]);
  out2[2] = MAKE_UNICODE_LO (in[3]);
  out2[1] = MAKE_UNICODE_HI (in[2]);
  out2[0] = MAKE_UNICODE_LO (in[2]);

  out1[3] = MAKE_UNICODE_HI (in[1]);
  out1[2] = MAKE_UNICODE_LO (in[1]);
  out1[1] = MAKE_UNICODE_HI (in[0]);
  out1[0] = MAKE_UNICODE_LO (in[0]);
}

#undef MAKE_UNICODE_LO
#undef MAKE_UNICODE_HI
// append_0x01_1: ORs the byte value 0x01 into one byte lane of one word of w0.
// The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31) of
// w0[0]..w0[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
7661 __device__
static void append_0x01_1 (u32x w0
[4], const u32 offset
)
7670 w0
[0] = w0
[0] | 0x0100;
7674 w0
[0] = w0
[0] | 0x010000;
7678 w0
[0] = w0
[0] | 0x01000000;
7686 w0
[1] = w0
[1] | 0x0100;
7690 w0
[1] = w0
[1] | 0x010000;
7694 w0
[1] = w0
[1] | 0x01000000;
7702 w0
[2] = w0
[2] | 0x0100;
7706 w0
[2] = w0
[2] | 0x010000;
7710 w0
[2] = w0
[2] | 0x01000000;
7718 w0
[3] = w0
[3] | 0x0100;
7722 w0
[3] = w0
[3] | 0x010000;
7726 w0
[3] = w0
[3] | 0x01000000;
// append_0x01_2: ORs the byte value 0x01 into one byte lane of one word of w0
// or w1. The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31) of
// w0[0]..w1[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
7731 __device__
static void append_0x01_2 (u32x w0
[4], u32x w1
[4], const u32 offset
)
7740 w0
[0] = w0
[0] | 0x0100;
7744 w0
[0] = w0
[0] | 0x010000;
7748 w0
[0] = w0
[0] | 0x01000000;
7756 w0
[1] = w0
[1] | 0x0100;
7760 w0
[1] = w0
[1] | 0x010000;
7764 w0
[1] = w0
[1] | 0x01000000;
7772 w0
[2] = w0
[2] | 0x0100;
7776 w0
[2] = w0
[2] | 0x010000;
7780 w0
[2] = w0
[2] | 0x01000000;
7788 w0
[3] = w0
[3] | 0x0100;
7792 w0
[3] = w0
[3] | 0x010000;
7796 w0
[3] = w0
[3] | 0x01000000;
7804 w1
[0] = w1
[0] | 0x0100;
7808 w1
[0] = w1
[0] | 0x010000;
7812 w1
[0] = w1
[0] | 0x01000000;
7820 w1
[1] = w1
[1] | 0x0100;
7824 w1
[1] = w1
[1] | 0x010000;
7828 w1
[1] = w1
[1] | 0x01000000;
7836 w1
[2] = w1
[2] | 0x0100;
7840 w1
[2] = w1
[2] | 0x010000;
7844 w1
[2] = w1
[2] | 0x01000000;
7852 w1
[3] = w1
[3] | 0x0100;
7856 w1
[3] = w1
[3] | 0x010000;
7860 w1
[3] = w1
[3] | 0x01000000;
// append_0x01_3: ORs the byte value 0x01 into one byte lane of one word of
// w0, w1 or w2. The visible statements cover lanes 1..3 (bits 8-15, 16-23,
// 24-31) of w0[0]..w2[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
7865 __device__
static void append_0x01_3 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
7874 w0
[0] = w0
[0] | 0x0100;
7878 w0
[0] = w0
[0] | 0x010000;
7882 w0
[0] = w0
[0] | 0x01000000;
7890 w0
[1] = w0
[1] | 0x0100;
7894 w0
[1] = w0
[1] | 0x010000;
7898 w0
[1] = w0
[1] | 0x01000000;
7906 w0
[2] = w0
[2] | 0x0100;
7910 w0
[2] = w0
[2] | 0x010000;
7914 w0
[2] = w0
[2] | 0x01000000;
7922 w0
[3] = w0
[3] | 0x0100;
7926 w0
[3] = w0
[3] | 0x010000;
7930 w0
[3] = w0
[3] | 0x01000000;
7938 w1
[0] = w1
[0] | 0x0100;
7942 w1
[0] = w1
[0] | 0x010000;
7946 w1
[0] = w1
[0] | 0x01000000;
7954 w1
[1] = w1
[1] | 0x0100;
7958 w1
[1] = w1
[1] | 0x010000;
7962 w1
[1] = w1
[1] | 0x01000000;
7970 w1
[2] = w1
[2] | 0x0100;
7974 w1
[2] = w1
[2] | 0x010000;
7978 w1
[2] = w1
[2] | 0x01000000;
7986 w1
[3] = w1
[3] | 0x0100;
7990 w1
[3] = w1
[3] | 0x010000;
7994 w1
[3] = w1
[3] | 0x01000000;
8002 w2
[0] = w2
[0] | 0x0100;
8006 w2
[0] = w2
[0] | 0x010000;
8010 w2
[0] = w2
[0] | 0x01000000;
8018 w2
[1] = w2
[1] | 0x0100;
8022 w2
[1] = w2
[1] | 0x010000;
8026 w2
[1] = w2
[1] | 0x01000000;
8034 w2
[2] = w2
[2] | 0x0100;
8038 w2
[2] = w2
[2] | 0x010000;
8042 w2
[2] = w2
[2] | 0x01000000;
8050 w2
[3] = w2
[3] | 0x0100;
8054 w2
[3] = w2
[3] | 0x010000;
8058 w2
[3] = w2
[3] | 0x01000000;
// append_0x01_4: ORs the byte value 0x01 into one byte lane of one word of
// w0, w1, w2 or w3. The visible statements cover lanes 1..3 (bits 8-15, 16-23,
// 24-31) of w0[0]..w3[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
8063 __device__
static void append_0x01_4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
8072 w0
[0] = w0
[0] | 0x0100;
8076 w0
[0] = w0
[0] | 0x010000;
8080 w0
[0] = w0
[0] | 0x01000000;
8088 w0
[1] = w0
[1] | 0x0100;
8092 w0
[1] = w0
[1] | 0x010000;
8096 w0
[1] = w0
[1] | 0x01000000;
8104 w0
[2] = w0
[2] | 0x0100;
8108 w0
[2] = w0
[2] | 0x010000;
8112 w0
[2] = w0
[2] | 0x01000000;
8120 w0
[3] = w0
[3] | 0x0100;
8124 w0
[3] = w0
[3] | 0x010000;
8128 w0
[3] = w0
[3] | 0x01000000;
8136 w1
[0] = w1
[0] | 0x0100;
8140 w1
[0] = w1
[0] | 0x010000;
8144 w1
[0] = w1
[0] | 0x01000000;
8152 w1
[1] = w1
[1] | 0x0100;
8156 w1
[1] = w1
[1] | 0x010000;
8160 w1
[1] = w1
[1] | 0x01000000;
8168 w1
[2] = w1
[2] | 0x0100;
8172 w1
[2] = w1
[2] | 0x010000;
8176 w1
[2] = w1
[2] | 0x01000000;
8184 w1
[3] = w1
[3] | 0x0100;
8188 w1
[3] = w1
[3] | 0x010000;
8192 w1
[3] = w1
[3] | 0x01000000;
8200 w2
[0] = w2
[0] | 0x0100;
8204 w2
[0] = w2
[0] | 0x010000;
8208 w2
[0] = w2
[0] | 0x01000000;
8216 w2
[1] = w2
[1] | 0x0100;
8220 w2
[1] = w2
[1] | 0x010000;
8224 w2
[1] = w2
[1] | 0x01000000;
8232 w2
[2] = w2
[2] | 0x0100;
8236 w2
[2] = w2
[2] | 0x010000;
8240 w2
[2] = w2
[2] | 0x01000000;
8248 w2
[3] = w2
[3] | 0x0100;
8252 w2
[3] = w2
[3] | 0x010000;
8256 w2
[3] = w2
[3] | 0x01000000;
8264 w3
[0] = w3
[0] | 0x0100;
8268 w3
[0] = w3
[0] | 0x010000;
8272 w3
[0] = w3
[0] | 0x01000000;
8280 w3
[1] = w3
[1] | 0x0100;
8284 w3
[1] = w3
[1] | 0x010000;
8288 w3
[1] = w3
[1] | 0x01000000;
8296 w3
[2] = w3
[2] | 0x0100;
8300 w3
[2] = w3
[2] | 0x010000;
8304 w3
[2] = w3
[2] | 0x01000000;
8312 w3
[3] = w3
[3] | 0x0100;
8316 w3
[3] = w3
[3] | 0x010000;
8320 w3
[3] = w3
[3] | 0x01000000;
// append_0x01_8: ORs the byte value 0x01 into one byte lane of one word of
// w0..w7. The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31)
// of w0[0]..w7[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
8325 __device__
static void append_0x01_8 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
8334 w0
[0] = w0
[0] | 0x0100;
8338 w0
[0] = w0
[0] | 0x010000;
8342 w0
[0] = w0
[0] | 0x01000000;
8350 w0
[1] = w0
[1] | 0x0100;
8354 w0
[1] = w0
[1] | 0x010000;
8358 w0
[1] = w0
[1] | 0x01000000;
8366 w0
[2] = w0
[2] | 0x0100;
8370 w0
[2] = w0
[2] | 0x010000;
8374 w0
[2] = w0
[2] | 0x01000000;
8382 w0
[3] = w0
[3] | 0x0100;
8386 w0
[3] = w0
[3] | 0x010000;
8390 w0
[3] = w0
[3] | 0x01000000;
8398 w1
[0] = w1
[0] | 0x0100;
8402 w1
[0] = w1
[0] | 0x010000;
8406 w1
[0] = w1
[0] | 0x01000000;
8414 w1
[1] = w1
[1] | 0x0100;
8418 w1
[1] = w1
[1] | 0x010000;
8422 w1
[1] = w1
[1] | 0x01000000;
8430 w1
[2] = w1
[2] | 0x0100;
8434 w1
[2] = w1
[2] | 0x010000;
8438 w1
[2] = w1
[2] | 0x01000000;
8446 w1
[3] = w1
[3] | 0x0100;
8450 w1
[3] = w1
[3] | 0x010000;
8454 w1
[3] = w1
[3] | 0x01000000;
8462 w2
[0] = w2
[0] | 0x0100;
8466 w2
[0] = w2
[0] | 0x010000;
8470 w2
[0] = w2
[0] | 0x01000000;
8478 w2
[1] = w2
[1] | 0x0100;
8482 w2
[1] = w2
[1] | 0x010000;
8486 w2
[1] = w2
[1] | 0x01000000;
8494 w2
[2] = w2
[2] | 0x0100;
8498 w2
[2] = w2
[2] | 0x010000;
8502 w2
[2] = w2
[2] | 0x01000000;
8510 w2
[3] = w2
[3] | 0x0100;
8514 w2
[3] = w2
[3] | 0x010000;
8518 w2
[3] = w2
[3] | 0x01000000;
8526 w3
[0] = w3
[0] | 0x0100;
8530 w3
[0] = w3
[0] | 0x010000;
8534 w3
[0] = w3
[0] | 0x01000000;
8542 w3
[1] = w3
[1] | 0x0100;
8546 w3
[1] = w3
[1] | 0x010000;
8550 w3
[1] = w3
[1] | 0x01000000;
8558 w3
[2] = w3
[2] | 0x0100;
8562 w3
[2] = w3
[2] | 0x010000;
8566 w3
[2] = w3
[2] | 0x01000000;
8574 w3
[3] = w3
[3] | 0x0100;
8578 w3
[3] = w3
[3] | 0x010000;
8582 w3
[3] = w3
[3] | 0x01000000;
8590 w4
[0] = w4
[0] | 0x0100;
8594 w4
[0] = w4
[0] | 0x010000;
8598 w4
[0] = w4
[0] | 0x01000000;
8606 w4
[1] = w4
[1] | 0x0100;
8610 w4
[1] = w4
[1] | 0x010000;
8614 w4
[1] = w4
[1] | 0x01000000;
8622 w4
[2] = w4
[2] | 0x0100;
8626 w4
[2] = w4
[2] | 0x010000;
8630 w4
[2] = w4
[2] | 0x01000000;
8638 w4
[3] = w4
[3] | 0x0100;
8642 w4
[3] = w4
[3] | 0x010000;
8646 w4
[3] = w4
[3] | 0x01000000;
8654 w5
[0] = w5
[0] | 0x0100;
8658 w5
[0] = w5
[0] | 0x010000;
8662 w5
[0] = w5
[0] | 0x01000000;
8670 w5
[1] = w5
[1] | 0x0100;
8674 w5
[1] = w5
[1] | 0x010000;
8678 w5
[1] = w5
[1] | 0x01000000;
8686 w5
[2] = w5
[2] | 0x0100;
8690 w5
[2] = w5
[2] | 0x010000;
8694 w5
[2] = w5
[2] | 0x01000000;
8702 w5
[3] = w5
[3] | 0x0100;
8706 w5
[3] = w5
[3] | 0x010000;
8710 w5
[3] = w5
[3] | 0x01000000;
8718 w6
[0] = w6
[0] | 0x0100;
8722 w6
[0] = w6
[0] | 0x010000;
8726 w6
[0] = w6
[0] | 0x01000000;
8734 w6
[1] = w6
[1] | 0x0100;
8738 w6
[1] = w6
[1] | 0x010000;
8742 w6
[1] = w6
[1] | 0x01000000;
8750 w6
[2] = w6
[2] | 0x0100;
8754 w6
[2] = w6
[2] | 0x010000;
8758 w6
[2] = w6
[2] | 0x01000000;
8766 w6
[3] = w6
[3] | 0x0100;
8770 w6
[3] = w6
[3] | 0x010000;
8774 w6
[3] = w6
[3] | 0x01000000;
8782 w7
[0] = w7
[0] | 0x0100;
8786 w7
[0] = w7
[0] | 0x010000;
8790 w7
[0] = w7
[0] | 0x01000000;
8798 w7
[1] = w7
[1] | 0x0100;
8802 w7
[1] = w7
[1] | 0x010000;
8806 w7
[1] = w7
[1] | 0x01000000;
8814 w7
[2] = w7
[2] | 0x0100;
8818 w7
[2] = w7
[2] | 0x010000;
8822 w7
[2] = w7
[2] | 0x01000000;
8830 w7
[3] = w7
[3] | 0x0100;
8834 w7
[3] = w7
[3] | 0x010000;
8838 w7
[3] = w7
[3] | 0x01000000;
// append_0x02_1: ORs the byte value 0x02 into one byte lane of one word of w0.
// The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31) of
// w0[0]..w0[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
8843 __device__
static void append_0x02_1 (u32x w0
[4], const u32 offset
)
8852 w0
[0] = w0
[0] | 0x0200;
8856 w0
[0] = w0
[0] | 0x020000;
8860 w0
[0] = w0
[0] | 0x02000000;
8868 w0
[1] = w0
[1] | 0x0200;
8872 w0
[1] = w0
[1] | 0x020000;
8876 w0
[1] = w0
[1] | 0x02000000;
8884 w0
[2] = w0
[2] | 0x0200;
8888 w0
[2] = w0
[2] | 0x020000;
8892 w0
[2] = w0
[2] | 0x02000000;
8900 w0
[3] = w0
[3] | 0x0200;
8904 w0
[3] = w0
[3] | 0x020000;
8908 w0
[3] = w0
[3] | 0x02000000;
// append_0x02_2: ORs the byte value 0x02 into one byte lane of one word of w0
// or w1. The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31) of
// w0[0]..w1[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
8913 __device__
static void append_0x02_2 (u32x w0
[4], u32x w1
[4], const u32 offset
)
8922 w0
[0] = w0
[0] | 0x0200;
8926 w0
[0] = w0
[0] | 0x020000;
8930 w0
[0] = w0
[0] | 0x02000000;
8938 w0
[1] = w0
[1] | 0x0200;
8942 w0
[1] = w0
[1] | 0x020000;
8946 w0
[1] = w0
[1] | 0x02000000;
8954 w0
[2] = w0
[2] | 0x0200;
8958 w0
[2] = w0
[2] | 0x020000;
8962 w0
[2] = w0
[2] | 0x02000000;
8970 w0
[3] = w0
[3] | 0x0200;
8974 w0
[3] = w0
[3] | 0x020000;
8978 w0
[3] = w0
[3] | 0x02000000;
8986 w1
[0] = w1
[0] | 0x0200;
8990 w1
[0] = w1
[0] | 0x020000;
8994 w1
[0] = w1
[0] | 0x02000000;
9002 w1
[1] = w1
[1] | 0x0200;
9006 w1
[1] = w1
[1] | 0x020000;
9010 w1
[1] = w1
[1] | 0x02000000;
9018 w1
[2] = w1
[2] | 0x0200;
9022 w1
[2] = w1
[2] | 0x020000;
9026 w1
[2] = w1
[2] | 0x02000000;
9034 w1
[3] = w1
[3] | 0x0200;
9038 w1
[3] = w1
[3] | 0x020000;
9042 w1
[3] = w1
[3] | 0x02000000;
// append_0x02_3: ORs the byte value 0x02 into one byte lane of one word of
// w0, w1 or w2. The visible statements cover lanes 1..3 (bits 8-15, 16-23,
// 24-31) of w0[0]..w2[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
9047 __device__
static void append_0x02_3 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
9056 w0
[0] = w0
[0] | 0x0200;
9060 w0
[0] = w0
[0] | 0x020000;
9064 w0
[0] = w0
[0] | 0x02000000;
9072 w0
[1] = w0
[1] | 0x0200;
9076 w0
[1] = w0
[1] | 0x020000;
9080 w0
[1] = w0
[1] | 0x02000000;
9088 w0
[2] = w0
[2] | 0x0200;
9092 w0
[2] = w0
[2] | 0x020000;
9096 w0
[2] = w0
[2] | 0x02000000;
9104 w0
[3] = w0
[3] | 0x0200;
9108 w0
[3] = w0
[3] | 0x020000;
9112 w0
[3] = w0
[3] | 0x02000000;
9120 w1
[0] = w1
[0] | 0x0200;
9124 w1
[0] = w1
[0] | 0x020000;
9128 w1
[0] = w1
[0] | 0x02000000;
9136 w1
[1] = w1
[1] | 0x0200;
9140 w1
[1] = w1
[1] | 0x020000;
9144 w1
[1] = w1
[1] | 0x02000000;
9152 w1
[2] = w1
[2] | 0x0200;
9156 w1
[2] = w1
[2] | 0x020000;
9160 w1
[2] = w1
[2] | 0x02000000;
9168 w1
[3] = w1
[3] | 0x0200;
9172 w1
[3] = w1
[3] | 0x020000;
9176 w1
[3] = w1
[3] | 0x02000000;
9184 w2
[0] = w2
[0] | 0x0200;
9188 w2
[0] = w2
[0] | 0x020000;
9192 w2
[0] = w2
[0] | 0x02000000;
9200 w2
[1] = w2
[1] | 0x0200;
9204 w2
[1] = w2
[1] | 0x020000;
9208 w2
[1] = w2
[1] | 0x02000000;
9216 w2
[2] = w2
[2] | 0x0200;
9220 w2
[2] = w2
[2] | 0x020000;
9224 w2
[2] = w2
[2] | 0x02000000;
9232 w2
[3] = w2
[3] | 0x0200;
9236 w2
[3] = w2
[3] | 0x020000;
9240 w2
[3] = w2
[3] | 0x02000000;
// append_0x02_4: ORs the byte value 0x02 into one byte lane of one word of
// w0, w1, w2 or w3. The visible statements cover lanes 1..3 (bits 8-15, 16-23,
// 24-31) of w0[0]..w3[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
9245 __device__
static void append_0x02_4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
9254 w0
[0] = w0
[0] | 0x0200;
9258 w0
[0] = w0
[0] | 0x020000;
9262 w0
[0] = w0
[0] | 0x02000000;
9270 w0
[1] = w0
[1] | 0x0200;
9274 w0
[1] = w0
[1] | 0x020000;
9278 w0
[1] = w0
[1] | 0x02000000;
9286 w0
[2] = w0
[2] | 0x0200;
9290 w0
[2] = w0
[2] | 0x020000;
9294 w0
[2] = w0
[2] | 0x02000000;
9302 w0
[3] = w0
[3] | 0x0200;
9306 w0
[3] = w0
[3] | 0x020000;
9310 w0
[3] = w0
[3] | 0x02000000;
9318 w1
[0] = w1
[0] | 0x0200;
9322 w1
[0] = w1
[0] | 0x020000;
9326 w1
[0] = w1
[0] | 0x02000000;
9334 w1
[1] = w1
[1] | 0x0200;
9338 w1
[1] = w1
[1] | 0x020000;
9342 w1
[1] = w1
[1] | 0x02000000;
9350 w1
[2] = w1
[2] | 0x0200;
9354 w1
[2] = w1
[2] | 0x020000;
9358 w1
[2] = w1
[2] | 0x02000000;
9366 w1
[3] = w1
[3] | 0x0200;
9370 w1
[3] = w1
[3] | 0x020000;
9374 w1
[3] = w1
[3] | 0x02000000;
9382 w2
[0] = w2
[0] | 0x0200;
9386 w2
[0] = w2
[0] | 0x020000;
9390 w2
[0] = w2
[0] | 0x02000000;
9398 w2
[1] = w2
[1] | 0x0200;
9402 w2
[1] = w2
[1] | 0x020000;
9406 w2
[1] = w2
[1] | 0x02000000;
9414 w2
[2] = w2
[2] | 0x0200;
9418 w2
[2] = w2
[2] | 0x020000;
9422 w2
[2] = w2
[2] | 0x02000000;
9430 w2
[3] = w2
[3] | 0x0200;
9434 w2
[3] = w2
[3] | 0x020000;
9438 w2
[3] = w2
[3] | 0x02000000;
9446 w3
[0] = w3
[0] | 0x0200;
9450 w3
[0] = w3
[0] | 0x020000;
9454 w3
[0] = w3
[0] | 0x02000000;
9462 w3
[1] = w3
[1] | 0x0200;
9466 w3
[1] = w3
[1] | 0x020000;
9470 w3
[1] = w3
[1] | 0x02000000;
9478 w3
[2] = w3
[2] | 0x0200;
9482 w3
[2] = w3
[2] | 0x020000;
9486 w3
[2] = w3
[2] | 0x02000000;
9494 w3
[3] = w3
[3] | 0x0200;
9498 w3
[3] = w3
[3] | 0x020000;
9502 w3
[3] = w3
[3] | 0x02000000;
// append_0x02_8: ORs the byte value 0x02 into one byte lane of one word of
// w0..w7. The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31)
// of w0[0]..w7[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
9507 __device__
static void append_0x02_8 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
9516 w0
[0] = w0
[0] | 0x0200;
9520 w0
[0] = w0
[0] | 0x020000;
9524 w0
[0] = w0
[0] | 0x02000000;
9532 w0
[1] = w0
[1] | 0x0200;
9536 w0
[1] = w0
[1] | 0x020000;
9540 w0
[1] = w0
[1] | 0x02000000;
9548 w0
[2] = w0
[2] | 0x0200;
9552 w0
[2] = w0
[2] | 0x020000;
9556 w0
[2] = w0
[2] | 0x02000000;
9564 w0
[3] = w0
[3] | 0x0200;
9568 w0
[3] = w0
[3] | 0x020000;
9572 w0
[3] = w0
[3] | 0x02000000;
9580 w1
[0] = w1
[0] | 0x0200;
9584 w1
[0] = w1
[0] | 0x020000;
9588 w1
[0] = w1
[0] | 0x02000000;
9596 w1
[1] = w1
[1] | 0x0200;
9600 w1
[1] = w1
[1] | 0x020000;
9604 w1
[1] = w1
[1] | 0x02000000;
9612 w1
[2] = w1
[2] | 0x0200;
9616 w1
[2] = w1
[2] | 0x020000;
9620 w1
[2] = w1
[2] | 0x02000000;
9628 w1
[3] = w1
[3] | 0x0200;
9632 w1
[3] = w1
[3] | 0x020000;
9636 w1
[3] = w1
[3] | 0x02000000;
9644 w2
[0] = w2
[0] | 0x0200;
9648 w2
[0] = w2
[0] | 0x020000;
9652 w2
[0] = w2
[0] | 0x02000000;
9660 w2
[1] = w2
[1] | 0x0200;
9664 w2
[1] = w2
[1] | 0x020000;
9668 w2
[1] = w2
[1] | 0x02000000;
9676 w2
[2] = w2
[2] | 0x0200;
9680 w2
[2] = w2
[2] | 0x020000;
9684 w2
[2] = w2
[2] | 0x02000000;
9692 w2
[3] = w2
[3] | 0x0200;
9696 w2
[3] = w2
[3] | 0x020000;
9700 w2
[3] = w2
[3] | 0x02000000;
9708 w3
[0] = w3
[0] | 0x0200;
9712 w3
[0] = w3
[0] | 0x020000;
9716 w3
[0] = w3
[0] | 0x02000000;
9724 w3
[1] = w3
[1] | 0x0200;
9728 w3
[1] = w3
[1] | 0x020000;
9732 w3
[1] = w3
[1] | 0x02000000;
9740 w3
[2] = w3
[2] | 0x0200;
9744 w3
[2] = w3
[2] | 0x020000;
9748 w3
[2] = w3
[2] | 0x02000000;
9756 w3
[3] = w3
[3] | 0x0200;
9760 w3
[3] = w3
[3] | 0x020000;
9764 w3
[3] = w3
[3] | 0x02000000;
9772 w4
[0] = w4
[0] | 0x0200;
9776 w4
[0] = w4
[0] | 0x020000;
9780 w4
[0] = w4
[0] | 0x02000000;
9788 w4
[1] = w4
[1] | 0x0200;
9792 w4
[1] = w4
[1] | 0x020000;
9796 w4
[1] = w4
[1] | 0x02000000;
9804 w4
[2] = w4
[2] | 0x0200;
9808 w4
[2] = w4
[2] | 0x020000;
9812 w4
[2] = w4
[2] | 0x02000000;
9820 w4
[3] = w4
[3] | 0x0200;
9824 w4
[3] = w4
[3] | 0x020000;
9828 w4
[3] = w4
[3] | 0x02000000;
9836 w5
[0] = w5
[0] | 0x0200;
9840 w5
[0] = w5
[0] | 0x020000;
9844 w5
[0] = w5
[0] | 0x02000000;
9852 w5
[1] = w5
[1] | 0x0200;
9856 w5
[1] = w5
[1] | 0x020000;
9860 w5
[1] = w5
[1] | 0x02000000;
9868 w5
[2] = w5
[2] | 0x0200;
9872 w5
[2] = w5
[2] | 0x020000;
9876 w5
[2] = w5
[2] | 0x02000000;
9884 w5
[3] = w5
[3] | 0x0200;
9888 w5
[3] = w5
[3] | 0x020000;
9892 w5
[3] = w5
[3] | 0x02000000;
9900 w6
[0] = w6
[0] | 0x0200;
9904 w6
[0] = w6
[0] | 0x020000;
9908 w6
[0] = w6
[0] | 0x02000000;
9916 w6
[1] = w6
[1] | 0x0200;
9920 w6
[1] = w6
[1] | 0x020000;
9924 w6
[1] = w6
[1] | 0x02000000;
9932 w6
[2] = w6
[2] | 0x0200;
9936 w6
[2] = w6
[2] | 0x020000;
9940 w6
[2] = w6
[2] | 0x02000000;
9948 w6
[3] = w6
[3] | 0x0200;
9952 w6
[3] = w6
[3] | 0x020000;
9956 w6
[3] = w6
[3] | 0x02000000;
9964 w7
[0] = w7
[0] | 0x0200;
9968 w7
[0] = w7
[0] | 0x020000;
9972 w7
[0] = w7
[0] | 0x02000000;
9980 w7
[1] = w7
[1] | 0x0200;
9984 w7
[1] = w7
[1] | 0x020000;
9988 w7
[1] = w7
[1] | 0x02000000;
9996 w7
[2] = w7
[2] | 0x0200;
10000 w7
[2] = w7
[2] | 0x020000;
10004 w7
[2] = w7
[2] | 0x02000000;
10012 w7
[3] = w7
[3] | 0x0200;
10016 w7
[3] = w7
[3] | 0x020000;
10020 w7
[3] = w7
[3] | 0x02000000;
// append_0x80_1: ORs the byte value 0x80 into one byte lane of one word of w0.
// The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31) of
// w0[0]..w0[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
10025 __device__
static void append_0x80_1 (u32x w0
[4], const u32 offset
)
10034 w0
[0] = w0
[0] | 0x8000;
10038 w0
[0] = w0
[0] | 0x800000;
10042 w0
[0] = w0
[0] | 0x80000000;
10050 w0
[1] = w0
[1] | 0x8000;
10054 w0
[1] = w0
[1] | 0x800000;
10058 w0
[1] = w0
[1] | 0x80000000;
10066 w0
[2] = w0
[2] | 0x8000;
10070 w0
[2] = w0
[2] | 0x800000;
10074 w0
[2] = w0
[2] | 0x80000000;
10082 w0
[3] = w0
[3] | 0x8000;
10086 w0
[3] = w0
[3] | 0x800000;
10090 w0
[3] = w0
[3] | 0x80000000;
// append_0x80_2: ORs the byte value 0x80 into one byte lane of one word of w0
// or w1. The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31) of
// w0[0]..w1[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
10095 __device__
static void append_0x80_2 (u32x w0
[4], u32x w1
[4], const u32 offset
)
10104 w0
[0] = w0
[0] | 0x8000;
10108 w0
[0] = w0
[0] | 0x800000;
10112 w0
[0] = w0
[0] | 0x80000000;
10120 w0
[1] = w0
[1] | 0x8000;
10124 w0
[1] = w0
[1] | 0x800000;
10128 w0
[1] = w0
[1] | 0x80000000;
10136 w0
[2] = w0
[2] | 0x8000;
10140 w0
[2] = w0
[2] | 0x800000;
10144 w0
[2] = w0
[2] | 0x80000000;
10152 w0
[3] = w0
[3] | 0x8000;
10156 w0
[3] = w0
[3] | 0x800000;
10160 w0
[3] = w0
[3] | 0x80000000;
10168 w1
[0] = w1
[0] | 0x8000;
10172 w1
[0] = w1
[0] | 0x800000;
10176 w1
[0] = w1
[0] | 0x80000000;
10184 w1
[1] = w1
[1] | 0x8000;
10188 w1
[1] = w1
[1] | 0x800000;
10192 w1
[1] = w1
[1] | 0x80000000;
10200 w1
[2] = w1
[2] | 0x8000;
10204 w1
[2] = w1
[2] | 0x800000;
10208 w1
[2] = w1
[2] | 0x80000000;
10216 w1
[3] = w1
[3] | 0x8000;
10220 w1
[3] = w1
[3] | 0x800000;
10224 w1
[3] = w1
[3] | 0x80000000;
// append_0x80_3: ORs the byte value 0x80 into one byte lane of one word of
// w0, w1 or w2. The visible statements cover lanes 1..3 (bits 8-15, 16-23,
// 24-31) of w0[0]..w2[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
10229 __device__
static void append_0x80_3 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
10238 w0
[0] = w0
[0] | 0x8000;
10242 w0
[0] = w0
[0] | 0x800000;
10246 w0
[0] = w0
[0] | 0x80000000;
10254 w0
[1] = w0
[1] | 0x8000;
10258 w0
[1] = w0
[1] | 0x800000;
10262 w0
[1] = w0
[1] | 0x80000000;
10270 w0
[2] = w0
[2] | 0x8000;
10274 w0
[2] = w0
[2] | 0x800000;
10278 w0
[2] = w0
[2] | 0x80000000;
10286 w0
[3] = w0
[3] | 0x8000;
10290 w0
[3] = w0
[3] | 0x800000;
10294 w0
[3] = w0
[3] | 0x80000000;
10302 w1
[0] = w1
[0] | 0x8000;
10306 w1
[0] = w1
[0] | 0x800000;
10310 w1
[0] = w1
[0] | 0x80000000;
10318 w1
[1] = w1
[1] | 0x8000;
10322 w1
[1] = w1
[1] | 0x800000;
10326 w1
[1] = w1
[1] | 0x80000000;
10334 w1
[2] = w1
[2] | 0x8000;
10338 w1
[2] = w1
[2] | 0x800000;
10342 w1
[2] = w1
[2] | 0x80000000;
10350 w1
[3] = w1
[3] | 0x8000;
10354 w1
[3] = w1
[3] | 0x800000;
10358 w1
[3] = w1
[3] | 0x80000000;
10366 w2
[0] = w2
[0] | 0x8000;
10370 w2
[0] = w2
[0] | 0x800000;
10374 w2
[0] = w2
[0] | 0x80000000;
10382 w2
[1] = w2
[1] | 0x8000;
10386 w2
[1] = w2
[1] | 0x800000;
10390 w2
[1] = w2
[1] | 0x80000000;
10398 w2
[2] = w2
[2] | 0x8000;
10402 w2
[2] = w2
[2] | 0x800000;
10406 w2
[2] = w2
[2] | 0x80000000;
10414 w2
[3] = w2
[3] | 0x8000;
10418 w2
[3] = w2
[3] | 0x800000;
10422 w2
[3] = w2
[3] | 0x80000000;
// append_0x80_4: ORs the byte value 0x80 into one byte lane of one word of
// w0, w1, w2 or w3. The visible statements cover lanes 1..3 (bits 8-15, 16-23,
// 24-31) of w0[0]..w3[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
10427 __device__
static void append_0x80_4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
10436 w0
[0] = w0
[0] | 0x8000;
10440 w0
[0] = w0
[0] | 0x800000;
10444 w0
[0] = w0
[0] | 0x80000000;
10452 w0
[1] = w0
[1] | 0x8000;
10456 w0
[1] = w0
[1] | 0x800000;
10460 w0
[1] = w0
[1] | 0x80000000;
10468 w0
[2] = w0
[2] | 0x8000;
10472 w0
[2] = w0
[2] | 0x800000;
10476 w0
[2] = w0
[2] | 0x80000000;
10484 w0
[3] = w0
[3] | 0x8000;
10488 w0
[3] = w0
[3] | 0x800000;
10492 w0
[3] = w0
[3] | 0x80000000;
10500 w1
[0] = w1
[0] | 0x8000;
10504 w1
[0] = w1
[0] | 0x800000;
10508 w1
[0] = w1
[0] | 0x80000000;
10516 w1
[1] = w1
[1] | 0x8000;
10520 w1
[1] = w1
[1] | 0x800000;
10524 w1
[1] = w1
[1] | 0x80000000;
10532 w1
[2] = w1
[2] | 0x8000;
10536 w1
[2] = w1
[2] | 0x800000;
10540 w1
[2] = w1
[2] | 0x80000000;
10548 w1
[3] = w1
[3] | 0x8000;
10552 w1
[3] = w1
[3] | 0x800000;
10556 w1
[3] = w1
[3] | 0x80000000;
10564 w2
[0] = w2
[0] | 0x8000;
10568 w2
[0] = w2
[0] | 0x800000;
10572 w2
[0] = w2
[0] | 0x80000000;
10580 w2
[1] = w2
[1] | 0x8000;
10584 w2
[1] = w2
[1] | 0x800000;
10588 w2
[1] = w2
[1] | 0x80000000;
10596 w2
[2] = w2
[2] | 0x8000;
10600 w2
[2] = w2
[2] | 0x800000;
10604 w2
[2] = w2
[2] | 0x80000000;
10612 w2
[3] = w2
[3] | 0x8000;
10616 w2
[3] = w2
[3] | 0x800000;
10620 w2
[3] = w2
[3] | 0x80000000;
10628 w3
[0] = w3
[0] | 0x8000;
10632 w3
[0] = w3
[0] | 0x800000;
10636 w3
[0] = w3
[0] | 0x80000000;
10644 w3
[1] = w3
[1] | 0x8000;
10648 w3
[1] = w3
[1] | 0x800000;
10652 w3
[1] = w3
[1] | 0x80000000;
10660 w3
[2] = w3
[2] | 0x8000;
10664 w3
[2] = w3
[2] | 0x800000;
10668 w3
[2] = w3
[2] | 0x80000000;
10676 w3
[3] = w3
[3] | 0x8000;
10680 w3
[3] = w3
[3] | 0x800000;
10684 w3
[3] = w3
[3] | 0x80000000;
// append_0x80_8: ORs the byte value 0x80 into one byte lane of one word of
// w0..w7. The visible statements cover lanes 1..3 (bits 8-15, 16-23, 24-31)
// of w0[0]..w7[3].
//
// NOTE(review): presumably exactly one statement is selected by `offset`, but
// the selecting switch/case lines and the lane-0 statements are not visible in
// this excerpt - confirm against the full file.
10689 __device__
static void append_0x80_8 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
10698 w0
[0] = w0
[0] | 0x8000;
10702 w0
[0] = w0
[0] | 0x800000;
10706 w0
[0] = w0
[0] | 0x80000000;
10714 w0
[1] = w0
[1] | 0x8000;
10718 w0
[1] = w0
[1] | 0x800000;
10722 w0
[1] = w0
[1] | 0x80000000;
10730 w0
[2] = w0
[2] | 0x8000;
10734 w0
[2] = w0
[2] | 0x800000;
10738 w0
[2] = w0
[2] | 0x80000000;
10746 w0
[3] = w0
[3] | 0x8000;
10750 w0
[3] = w0
[3] | 0x800000;
10754 w0
[3] = w0
[3] | 0x80000000;
10762 w1
[0] = w1
[0] | 0x8000;
10766 w1
[0] = w1
[0] | 0x800000;
10770 w1
[0] = w1
[0] | 0x80000000;
10778 w1
[1] = w1
[1] | 0x8000;
10782 w1
[1] = w1
[1] | 0x800000;
10786 w1
[1] = w1
[1] | 0x80000000;
10794 w1
[2] = w1
[2] | 0x8000;
10798 w1
[2] = w1
[2] | 0x800000;
10802 w1
[2] = w1
[2] | 0x80000000;
10810 w1
[3] = w1
[3] | 0x8000;
10814 w1
[3] = w1
[3] | 0x800000;
10818 w1
[3] = w1
[3] | 0x80000000;
10826 w2
[0] = w2
[0] | 0x8000;
10830 w2
[0] = w2
[0] | 0x800000;
10834 w2
[0] = w2
[0] | 0x80000000;
10842 w2
[1] = w2
[1] | 0x8000;
10846 w2
[1] = w2
[1] | 0x800000;
10850 w2
[1] = w2
[1] | 0x80000000;
10858 w2
[2] = w2
[2] | 0x8000;
10862 w2
[2] = w2
[2] | 0x800000;
10866 w2
[2] = w2
[2] | 0x80000000;
10874 w2
[3] = w2
[3] | 0x8000;
10878 w2
[3] = w2
[3] | 0x800000;
10882 w2
[3] = w2
[3] | 0x80000000;
10890 w3
[0] = w3
[0] | 0x8000;
10894 w3
[0] = w3
[0] | 0x800000;
10898 w3
[0] = w3
[0] | 0x80000000;
10906 w3
[1] = w3
[1] | 0x8000;
10910 w3
[1] = w3
[1] | 0x800000;
10914 w3
[1] = w3
[1] | 0x80000000;
10922 w3
[2] = w3
[2] | 0x8000;
10926 w3
[2] = w3
[2] | 0x800000;
10930 w3
[2] = w3
[2] | 0x80000000;
10938 w3
[3] = w3
[3] | 0x8000;
10942 w3
[3] = w3
[3] | 0x800000;
10946 w3
[3] = w3
[3] | 0x80000000;
10954 w4
[0] = w4
[0] | 0x8000;
10958 w4
[0] = w4
[0] | 0x800000;
10962 w4
[0] = w4
[0] | 0x80000000;
10970 w4
[1] = w4
[1] | 0x8000;
10974 w4
[1] = w4
[1] | 0x800000;
10978 w4
[1] = w4
[1] | 0x80000000;
10986 w4
[2] = w4
[2] | 0x8000;
10990 w4
[2] = w4
[2] | 0x800000;
10994 w4
[2] = w4
[2] | 0x80000000;
11002 w4
[3] = w4
[3] | 0x8000;
11006 w4
[3] = w4
[3] | 0x800000;
11010 w4
[3] = w4
[3] | 0x80000000;
11018 w5
[0] = w5
[0] | 0x8000;
11022 w5
[0] = w5
[0] | 0x800000;
11026 w5
[0] = w5
[0] | 0x80000000;
11034 w5
[1] = w5
[1] | 0x8000;
11038 w5
[1] = w5
[1] | 0x800000;
11042 w5
[1] = w5
[1] | 0x80000000;
11050 w5
[2] = w5
[2] | 0x8000;
11054 w5
[2] = w5
[2] | 0x800000;
11058 w5
[2] = w5
[2] | 0x80000000;
11066 w5
[3] = w5
[3] | 0x8000;
11070 w5
[3] = w5
[3] | 0x800000;
11074 w5
[3] = w5
[3] | 0x80000000;
11082 w6
[0] = w6
[0] | 0x8000;
11086 w6
[0] = w6
[0] | 0x800000;
11090 w6
[0] = w6
[0] | 0x80000000;
11098 w6
[1] = w6
[1] | 0x8000;
11102 w6
[1] = w6
[1] | 0x800000;
11106 w6
[1] = w6
[1] | 0x80000000;
11114 w6
[2] = w6
[2] | 0x8000;
11118 w6
[2] = w6
[2] | 0x800000;
11122 w6
[2] = w6
[2] | 0x80000000;
11130 w6
[3] = w6
[3] | 0x8000;
11134 w6
[3] = w6
[3] | 0x800000;
11138 w6
[3] = w6
[3] | 0x80000000;
11146 w7
[0] = w7
[0] | 0x8000;
11150 w7
[0] = w7
[0] | 0x800000;
11154 w7
[0] = w7
[0] | 0x80000000;
11162 w7
[1] = w7
[1] | 0x8000;
11166 w7
[1] = w7
[1] | 0x800000;
11170 w7
[1] = w7
[1] | 0x80000000;
11178 w7
[2] = w7
[2] | 0x8000;
11182 w7
[2] = w7
[2] | 0x800000;
11186 w7
[2] = w7
[2] | 0x80000000;
11194 w7
[3] = w7
[3] | 0x8000;
11198 w7
[3] = w7
[3] | 0x800000;
11202 w7
[3] = w7
[3] | 0x80000000;
// device_memcat2L (scalar u32 right operand): writes into dst0 a merge of
// src_l0 with src_r0 moved up by a whole number of bytes.
//
// The visible statements form seven groups:
//   - src_r0 shifted left by 8, 16 or 24 bits, OR-ed onto src_l0[0], with the
//     bits shifted out of src_r0[0] carried into dst0[1];
//   - src_r0[0] copied unchanged into dst0[1];
//   - src_r0[0] shifted left by 8, 16 or 24 bits and OR-ed onto src_l0[1].
// In the last four groups no visible statement writes dst0[0].
//
// NOTE(review): presumably the groups are the arms for offset 1..7 of a switch
// on `offset`; the switch/case/break lines are not visible in this excerpt -
// confirm against the full file.
11207 __device__
static void device_memcat2L (const u32 offset
, u32x dst0
[2], u32x src_l0
[2], u32 src_r0
[2])
11212 dst0
[0] = src_l0
[0] | src_r0
[0] << 8;
11213 dst0
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
11217 dst0
[0] = src_l0
[0] | src_r0
[0] << 16;
11218 dst0
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
11222 dst0
[0] = src_l0
[0] | src_r0
[0] << 24;
11223 dst0
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
11227 dst0
[1] = src_r0
[0];
11231 dst0
[1] = src_l0
[1] | src_r0
[0] << 8;
11235 dst0
[1] = src_l0
[1] | src_r0
[0] << 16;
11239 dst0
[1] = src_l0
[1] | src_r0
[0] << 24;
// device_memcat2L (u32x right operand): same visible statements as the
// overload taking `u32 src_r0[2]`; only the type of src_r0 differs.
// Writes into dst0 a merge of src_l0 with src_r0 moved up by a whole number
// of bytes.
//
// The visible statements form seven groups:
//   - src_r0 shifted left by 8, 16 or 24 bits, OR-ed onto src_l0[0], with the
//     bits shifted out of src_r0[0] carried into dst0[1];
//   - src_r0[0] copied unchanged into dst0[1];
//   - src_r0[0] shifted left by 8, 16 or 24 bits and OR-ed onto src_l0[1].
// In the last four groups no visible statement writes dst0[0].
//
// NOTE(review): presumably the groups are the arms for offset 1..7 of a switch
// on `offset`; the switch/case/break lines are not visible in this excerpt -
// confirm against the full file.
11244 __device__
static void device_memcat2L (const u32 offset
, u32x dst0
[2], u32x src_l0
[2], u32x src_r0
[2])
11249 dst0
[0] = src_l0
[0] | src_r0
[0] << 8;
11250 dst0
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
11254 dst0
[0] = src_l0
[0] | src_r0
[0] << 16;
11255 dst0
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
11259 dst0
[0] = src_l0
[0] | src_r0
[0] << 24;
11260 dst0
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
11264 dst0
[1] = src_r0
[0];
11268 dst0
[1] = src_l0
[1] | src_r0
[0] << 8;
11272 dst0
[1] = src_l0
[1] | src_r0
[0] << 16;
11276 dst0
[1] = src_l0
[1] | src_r0
[0] << 24;
/**
 * Write the four scalar words of src_r0 into dst0 starting at byte position
 * `offset`, merging with src_l0 in the first word that is touched.
 *
 * With word = offset / 4 and shl = 8 * (offset % 4):
 *   - shl == 0: src_r0[] is copied word for word into dst0[word..].
 *   - shl != 0: dst0[word] = src_l0[word] | src_r0[0] << shl, and every
 *     following word is stitched together from two neighbouring src_r0 words.
 * Output that would land past dst0[3] is dropped, dst0 words below `word`
 * are not written, and offset 0 or offset >= 16 leaves dst0 untouched.
 *
 * NOTE(review): the case labels of the original switch are not visible in
 * this view of the file; the 1..15 byte-offset numbering is inferred from the
 * shift pattern - confirm against the full source.
 */
__device__ static void device_memcat4L (const u32 offset, u32x dst0[4], u32x src_l0[4], u32 src_r0[4])
{
  const u32 word = offset >> 2;       // first destination word that is written
  const u32 shl  = (offset & 3) << 3; // bit position inside that word: 0, 8, 16, 24

  if (shl == 0)
  {
    // word-aligned: plain copy (word 0, i.e. offset 0, intentionally does nothing)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        break;

      case 3:
        dst0[3] = src_r0[0];
        break;
    }
  }
  else
  {
    const u32 shr = 32 - shl; // bits carried over into the following word

    switch (word)
    {
      case 0:
        dst0[0] = src_l0[0] | (src_r0[0] << shl);
        dst0[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst0[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        break;

      case 1:
        dst0[1] = src_l0[1] | (src_r0[0] << shl);
        dst0[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        break;

      case 2:
        dst0[2] = src_l0[2] | (src_r0[0] << shl);
        dst0[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        break;

      case 3:
        dst0[3] = src_l0[3] | (src_r0[0] << shl);
        break;
    }
  }
}
/**
 * Write the four u32x words of src_r0 into dst0 starting at byte position
 * `offset`, merging with src_l0 in the first word that is touched.
 *
 * With word = offset / 4 and shl = 8 * (offset % 4):
 *   - shl == 0: src_r0[] is copied word for word into dst0[word..].
 *   - shl != 0: dst0[word] = src_l0[word] | src_r0[0] << shl, and every
 *     following word is stitched together from two neighbouring src_r0 words.
 * Output that would land past dst0[3] is dropped, dst0 words below `word`
 * are not written, and offset 0 or offset >= 16 leaves dst0 untouched.
 *
 * NOTE(review): the case labels of the original switch are not visible in
 * this view of the file; the 1..15 byte-offset numbering is inferred from the
 * shift pattern - confirm against the full source.
 */
__device__ static void device_memcat4L (const u32 offset, u32x dst0[4], u32x src_l0[4], u32x src_r0[4])
{
  const u32 word = offset >> 2;       // first destination word that is written
  const u32 shl  = (offset & 3) << 3; // bit position inside that word: 0, 8, 16, 24

  if (shl == 0)
  {
    // word-aligned: plain copy (word 0, i.e. offset 0, intentionally does nothing)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        break;

      case 3:
        dst0[3] = src_r0[0];
        break;
    }
  }
  else
  {
    const u32 shr = 32 - shl; // bits carried over into the following word

    switch (word)
    {
      case 0:
        dst0[0] = src_l0[0] | (src_r0[0] << shl);
        dst0[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst0[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        break;

      case 1:
        dst0[1] = src_l0[1] | (src_r0[0] << shl);
        dst0[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        break;

      case 2:
        dst0[2] = src_l0[2] | (src_r0[0] << shl);
        dst0[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        break;

      case 3:
        dst0[3] = src_l0[3] | (src_r0[0] << shl);
        break;
    }
  }
}
/**
 * Write the four scalar words of src_r0 into the eight-word destination
 * dst0[0..3], dst1[0..3] starting at byte position `offset`, merging with
 * the matching src_l0 / src_l1 word in the first word that is touched.
 *
 * With word = offset / 4 and shl = 8 * (offset % 4):
 *   - shl == 0: src_r0[] is copied word for word, starting at `word`.
 *   - shl != 0: the first word is src_l?[word] | src_r0[0] << shl, every
 *     following word is stitched together from two neighbouring src_r0
 *     words, and a fifth word receives the bits shifted out of src_r0[3].
 * Output that would land past dst1[3] is dropped, destination words below
 * `word` are not written, and offset 0 or offset >= 32 changes nothing.
 *
 * NOTE(review): the case labels of the original switch are not visible in
 * this view of the file; the 1..31 byte-offset numbering is inferred from the
 * shift pattern - confirm against the full source.
 */
__device__ static void device_memcat8L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x src_l0[4], u32x src_l1[4], u32 src_r0[4])
{
  const u32 word = offset >> 2;       // first destination word that is written
  const u32 shl  = (offset & 3) << 3; // bit position inside that word: 0, 8, 16, 24

  if (shl == 0)
  {
    // word-aligned: plain copy (word 0, i.e. offset 0, intentionally does nothing)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        dst1[0] = src_r0[3];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        dst1[0] = src_r0[2];
        dst1[1] = src_r0[3];
        break;

      case 3:
        dst0[3] = src_r0[0];
        dst1[0] = src_r0[1];
        dst1[1] = src_r0[2];
        dst1[2] = src_r0[3];
        break;

      case 4:
        dst1[0] = src_r0[0];
        dst1[1] = src_r0[1];
        dst1[2] = src_r0[2];
        dst1[3] = src_r0[3];
        break;

      case 5:
        dst1[1] = src_r0[0];
        dst1[2] = src_r0[1];
        dst1[3] = src_r0[2];
        break;

      case 6:
        dst1[2] = src_r0[0];
        dst1[3] = src_r0[1];
        break;

      case 7:
        dst1[3] = src_r0[0];
        break;
    }
  }
  else
  {
    const u32 shr = 32 - shl; // bits carried over into the following word

    switch (word)
    {
      case 0:
        dst0[0] = src_l0[0] | (src_r0[0] << shl);
        dst0[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst0[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[0] = src_r0[3] >> shr;
        break;

      case 1:
        dst0[1] = src_l0[1] | (src_r0[0] << shl);
        dst0[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[0] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[1] = src_r0[3] >> shr;
        break;

      case 2:
        dst0[2] = src_l0[2] | (src_r0[0] << shl);
        dst0[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[0] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[1] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[2] = src_r0[3] >> shr;
        break;

      case 3:
        dst0[3] = src_l0[3] | (src_r0[0] << shl);
        dst1[0] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[1] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[2] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[3] = src_r0[3] >> shr;
        break;

      case 4:
        dst1[0] = src_l1[0] | (src_r0[0] << shl);
        dst1[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        break;

      case 5:
        dst1[1] = src_l1[1] | (src_r0[0] << shl);
        dst1[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        break;

      case 6:
        dst1[2] = src_l1[2] | (src_r0[0] << shl);
        dst1[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        break;

      case 7:
        dst1[3] = src_l1[3] | (src_r0[0] << shl);
        break;
    }
  }
}
/**
 * Write the four u32x words of src_r0 into the eight-word destination
 * dst0[0..3], dst1[0..3] starting at byte position `offset`, merging with
 * the matching src_l0 / src_l1 word in the first word that is touched.
 *
 * With word = offset / 4 and shl = 8 * (offset % 4):
 *   - shl == 0: src_r0[] is copied word for word, starting at `word`.
 *   - shl != 0: the first word is src_l?[word] | src_r0[0] << shl, every
 *     following word is stitched together from two neighbouring src_r0
 *     words, and a fifth word receives the bits shifted out of src_r0[3].
 * Output that would land past dst1[3] is dropped, destination words below
 * `word` are not written, and offset 0 or offset >= 32 changes nothing.
 *
 * NOTE(review): the case labels of the original switch are not visible in
 * this view of the file; the 1..31 byte-offset numbering is inferred from the
 * shift pattern - confirm against the full source.
 */
__device__ static void device_memcat8L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x src_l0[4], u32x src_l1[4], u32x src_r0[4])
{
  const u32 word = offset >> 2;       // first destination word that is written
  const u32 shl  = (offset & 3) << 3; // bit position inside that word: 0, 8, 16, 24

  if (shl == 0)
  {
    // word-aligned: plain copy (word 0, i.e. offset 0, intentionally does nothing)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        dst1[0] = src_r0[3];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        dst1[0] = src_r0[2];
        dst1[1] = src_r0[3];
        break;

      case 3:
        dst0[3] = src_r0[0];
        dst1[0] = src_r0[1];
        dst1[1] = src_r0[2];
        dst1[2] = src_r0[3];
        break;

      case 4:
        dst1[0] = src_r0[0];
        dst1[1] = src_r0[1];
        dst1[2] = src_r0[2];
        dst1[3] = src_r0[3];
        break;

      case 5:
        dst1[1] = src_r0[0];
        dst1[2] = src_r0[1];
        dst1[3] = src_r0[2];
        break;

      case 6:
        dst1[2] = src_r0[0];
        dst1[3] = src_r0[1];
        break;

      case 7:
        dst1[3] = src_r0[0];
        break;
    }
  }
  else
  {
    const u32 shr = 32 - shl; // bits carried over into the following word

    switch (word)
    {
      case 0:
        dst0[0] = src_l0[0] | (src_r0[0] << shl);
        dst0[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst0[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[0] = src_r0[3] >> shr;
        break;

      case 1:
        dst0[1] = src_l0[1] | (src_r0[0] << shl);
        dst0[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[0] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[1] = src_r0[3] >> shr;
        break;

      case 2:
        dst0[2] = src_l0[2] | (src_r0[0] << shl);
        dst0[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[0] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[1] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[2] = src_r0[3] >> shr;
        break;

      case 3:
        dst0[3] = src_l0[3] | (src_r0[0] << shl);
        dst1[0] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[1] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[2] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[3] = src_r0[3] >> shr;
        break;

      case 4:
        dst1[0] = src_l1[0] | (src_r0[0] << shl);
        dst1[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        break;

      case 5:
        dst1[1] = src_l1[1] | (src_r0[0] << shl);
        dst1[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        break;

      case 6:
        dst1[2] = src_l1[2] | (src_r0[0] << shl);
        dst1[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        break;

      case 7:
        dst1[3] = src_l1[3] | (src_r0[0] << shl);
        break;
    }
  }
}
/**
 * Write the four scalar words of src_r0 into the twelve-word destination
 * dst0[0..3], dst1[0..3], dst2[0..3] starting at byte position `offset`,
 * merging with the matching src_l0 / src_l1 / src_l2 word in the first word
 * that is touched.
 *
 * With word = offset / 4 and shl = 8 * (offset % 4):
 *   - shl == 0: src_r0[] is copied word for word, starting at `word`.
 *   - shl != 0: the first word is src_l?[word] | src_r0[0] << shl, every
 *     following word is stitched together from two neighbouring src_r0
 *     words, and a fifth word receives the bits shifted out of src_r0[3].
 * Output that would land past dst2[3] is dropped, destination words below
 * `word` are not written, and offset 0 or offset >= 48 changes nothing.
 *
 * NOTE(review): the case labels of the original switch are not visible in
 * this view of the file; the 1..47 byte-offset numbering is inferred from the
 * shift pattern - confirm against the full source.
 */
__device__ static void device_memcat12L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x dst2[4], u32x src_l0[4], u32x src_l1[4], u32x src_l2[4], u32 src_r0[4])
{
  const u32 word = offset >> 2;       // first destination word that is written
  const u32 shl  = (offset & 3) << 3; // bit position inside that word: 0, 8, 16, 24

  if (shl == 0)
  {
    // word-aligned: plain copy (word 0, i.e. offset 0, intentionally does nothing)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        dst1[0] = src_r0[3];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        dst1[0] = src_r0[2];
        dst1[1] = src_r0[3];
        break;

      case 3:
        dst0[3] = src_r0[0];
        dst1[0] = src_r0[1];
        dst1[1] = src_r0[2];
        dst1[2] = src_r0[3];
        break;

      case 4:
        dst1[0] = src_r0[0];
        dst1[1] = src_r0[1];
        dst1[2] = src_r0[2];
        dst1[3] = src_r0[3];
        break;

      case 5:
        dst1[1] = src_r0[0];
        dst1[2] = src_r0[1];
        dst1[3] = src_r0[2];
        dst2[0] = src_r0[3];
        break;

      case 6:
        dst1[2] = src_r0[0];
        dst1[3] = src_r0[1];
        dst2[0] = src_r0[2];
        dst2[1] = src_r0[3];
        break;

      case 7:
        dst1[3] = src_r0[0];
        dst2[0] = src_r0[1];
        dst2[1] = src_r0[2];
        dst2[2] = src_r0[3];
        break;

      case 8:
        dst2[0] = src_r0[0];
        dst2[1] = src_r0[1];
        dst2[2] = src_r0[2];
        dst2[3] = src_r0[3];
        break;

      case 9:
        dst2[1] = src_r0[0];
        dst2[2] = src_r0[1];
        dst2[3] = src_r0[2];
        break;

      case 10:
        dst2[2] = src_r0[0];
        dst2[3] = src_r0[1];
        break;

      case 11:
        dst2[3] = src_r0[0];
        break;
    }
  }
  else
  {
    const u32 shr = 32 - shl; // bits carried over into the following word

    switch (word)
    {
      case 0:
        dst0[0] = src_l0[0] | (src_r0[0] << shl);
        dst0[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst0[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[0] = src_r0[3] >> shr;
        break;

      case 1:
        dst0[1] = src_l0[1] | (src_r0[0] << shl);
        dst0[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[0] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[1] = src_r0[3] >> shr;
        break;

      case 2:
        dst0[2] = src_l0[2] | (src_r0[0] << shl);
        dst0[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[0] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[1] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[2] = src_r0[3] >> shr;
        break;

      case 3:
        dst0[3] = src_l0[3] | (src_r0[0] << shl);
        dst1[0] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[1] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[2] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[3] = src_r0[3] >> shr;
        break;

      case 4:
        dst1[0] = src_l1[0] | (src_r0[0] << shl);
        dst1[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst2[0] = src_r0[3] >> shr;
        break;

      case 5:
        dst1[1] = src_l1[1] | (src_r0[0] << shl);
        dst1[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst2[0] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst2[1] = src_r0[3] >> shr;
        break;

      case 6:
        dst1[2] = src_l1[2] | (src_r0[0] << shl);
        dst1[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst2[0] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst2[1] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst2[2] = src_r0[3] >> shr;
        break;

      case 7:
        dst1[3] = src_l1[3] | (src_r0[0] << shl);
        dst2[0] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst2[1] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst2[2] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst2[3] = src_r0[3] >> shr;
        break;

      case 8:
        dst2[0] = src_l2[0] | (src_r0[0] << shl);
        dst2[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst2[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst2[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        break;

      case 9:
        dst2[1] = src_l2[1] | (src_r0[0] << shl);
        dst2[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst2[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        break;

      case 10:
        dst2[2] = src_l2[2] | (src_r0[0] << shl);
        dst2[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        break;

      case 11:
        dst2[3] = src_l2[3] | (src_r0[0] << shl);
        break;
    }
  }
}
/**
 * Write the four u32x words of src_r0 into the twelve-word destination
 * dst0[0..3], dst1[0..3], dst2[0..3] starting at byte position `offset`,
 * merging with the matching src_l0 / src_l1 / src_l2 word in the first word
 * that is touched.
 *
 * With word = offset / 4 and shl = 8 * (offset % 4):
 *   - shl == 0: src_r0[] is copied word for word, starting at `word`.
 *   - shl != 0: the first word is src_l?[word] | src_r0[0] << shl, every
 *     following word is stitched together from two neighbouring src_r0
 *     words, and a fifth word receives the bits shifted out of src_r0[3].
 * Output that would land past dst2[3] is dropped, destination words below
 * `word` are not written, and offset 0 or offset >= 48 changes nothing.
 *
 * NOTE(review): the case labels of the original switch are not visible in
 * this view of the file; the 1..47 byte-offset numbering is inferred from the
 * shift pattern - confirm against the full source.
 */
__device__ static void device_memcat12L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x dst2[4], u32x src_l0[4], u32x src_l1[4], u32x src_l2[4], u32x src_r0[4])
{
  const u32 word = offset >> 2;       // first destination word that is written
  const u32 shl  = (offset & 3) << 3; // bit position inside that word: 0, 8, 16, 24

  if (shl == 0)
  {
    // word-aligned: plain copy (word 0, i.e. offset 0, intentionally does nothing)
    switch (word)
    {
      case 1:
        dst0[1] = src_r0[0];
        dst0[2] = src_r0[1];
        dst0[3] = src_r0[2];
        dst1[0] = src_r0[3];
        break;

      case 2:
        dst0[2] = src_r0[0];
        dst0[3] = src_r0[1];
        dst1[0] = src_r0[2];
        dst1[1] = src_r0[3];
        break;

      case 3:
        dst0[3] = src_r0[0];
        dst1[0] = src_r0[1];
        dst1[1] = src_r0[2];
        dst1[2] = src_r0[3];
        break;

      case 4:
        dst1[0] = src_r0[0];
        dst1[1] = src_r0[1];
        dst1[2] = src_r0[2];
        dst1[3] = src_r0[3];
        break;

      case 5:
        dst1[1] = src_r0[0];
        dst1[2] = src_r0[1];
        dst1[3] = src_r0[2];
        dst2[0] = src_r0[3];
        break;

      case 6:
        dst1[2] = src_r0[0];
        dst1[3] = src_r0[1];
        dst2[0] = src_r0[2];
        dst2[1] = src_r0[3];
        break;

      case 7:
        dst1[3] = src_r0[0];
        dst2[0] = src_r0[1];
        dst2[1] = src_r0[2];
        dst2[2] = src_r0[3];
        break;

      case 8:
        dst2[0] = src_r0[0];
        dst2[1] = src_r0[1];
        dst2[2] = src_r0[2];
        dst2[3] = src_r0[3];
        break;

      case 9:
        dst2[1] = src_r0[0];
        dst2[2] = src_r0[1];
        dst2[3] = src_r0[2];
        break;

      case 10:
        dst2[2] = src_r0[0];
        dst2[3] = src_r0[1];
        break;

      case 11:
        dst2[3] = src_r0[0];
        break;
    }
  }
  else
  {
    const u32 shr = 32 - shl; // bits carried over into the following word

    switch (word)
    {
      case 0:
        dst0[0] = src_l0[0] | (src_r0[0] << shl);
        dst0[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst0[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[0] = src_r0[3] >> shr;
        break;

      case 1:
        dst0[1] = src_l0[1] | (src_r0[0] << shl);
        dst0[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst0[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[0] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[1] = src_r0[3] >> shr;
        break;

      case 2:
        dst0[2] = src_l0[2] | (src_r0[0] << shl);
        dst0[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[0] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[1] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[2] = src_r0[3] >> shr;
        break;

      case 3:
        dst0[3] = src_l0[3] | (src_r0[0] << shl);
        dst1[0] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[1] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[2] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst1[3] = src_r0[3] >> shr;
        break;

      case 4:
        dst1[0] = src_l1[0] | (src_r0[0] << shl);
        dst1[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst1[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst2[0] = src_r0[3] >> shr;
        break;

      case 5:
        dst1[1] = src_l1[1] | (src_r0[0] << shl);
        dst1[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst1[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst2[0] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst2[1] = src_r0[3] >> shr;
        break;

      case 6:
        dst1[2] = src_l1[2] | (src_r0[0] << shl);
        dst1[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst2[0] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst2[1] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst2[2] = src_r0[3] >> shr;
        break;

      case 7:
        dst1[3] = src_l1[3] | (src_r0[0] << shl);
        dst2[0] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst2[1] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst2[2] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        dst2[3] = src_r0[3] >> shr;
        break;

      case 8:
        dst2[0] = src_l2[0] | (src_r0[0] << shl);
        dst2[1] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst2[2] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        dst2[3] = (src_r0[2] >> shr) | (src_r0[3] << shl);
        break;

      case 9:
        dst2[1] = src_l2[1] | (src_r0[0] << shl);
        dst2[2] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        dst2[3] = (src_r0[1] >> shr) | (src_r0[2] << shl);
        break;

      case 10:
        dst2[2] = src_l2[2] | (src_r0[0] << shl);
        dst2[3] = (src_r0[0] >> shr) | (src_r0[1] << shl);
        break;

      case 11:
        dst2[3] = src_l2[3] | (src_r0[0] << shl);
        break;
    }
  }
}
/**
 * Append the eight 32-bit words src_r0[0..3], src_r1[0..3] to a 12-word
 * destination (dst0[0..3], dst1[0..3], dst2[0..3]) at byte position `offset`.
 *
 * Byte k of a word occupies bits 8k..8k+7, so an offset that is not a
 * multiple of four is handled by shifting every source word left by
 * 8 * (offset % 4) bits and carrying the spilled bits into the next word.
 *
 * offset  : byte position of the first appended byte; cases 0..47 are handled,
 *           any other value leaves the destination untouched.
 * dst0-2  : output words. Only words from index offset / 4 up to the end of
 *           the appended data are written; words before that are not touched
 *           (presumably the caller has already filled them - confirm at the
 *           call sites).
 * src_l0-2: left-hand data. Only the single word that contains the byte
 *           `offset` is read, and only when offset is not word-aligned; it is
 *           OR-ed with the first shifted source word. This assumes its bytes
 *           at and above the offset are zero - TODO confirm.
 * src_r0-1: data to append. Words that would land beyond dst2[3] are dropped.
 */
__device__ static void device_memcat12L (const u32 offset, u32x dst0[4], u32x dst1[4], u32x dst2[4], u32x src_l0[4], u32x src_l1[4], u32x src_l2[4], u32x src_r0[4], u32x src_r1[4])
{
  switch (offset)
  {
    case 0:
      dst0[0] = src_r0[0];
      dst0[1] = src_r0[1];
      dst0[2] = src_r0[2];
      dst0[3] = src_r0[3];
      dst1[0] = src_r1[0];
      dst1[1] = src_r1[1];
      dst1[2] = src_r1[2];
      dst1[3] = src_r1[3];
      break;

    case 1:
      dst0[0] = src_l0[0]         | (src_r0[0] <<  8);
      dst0[1] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst0[2] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst0[3] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      dst1[0] = (src_r0[3] >> 24) | (src_r1[0] <<  8);
      dst1[1] = (src_r1[0] >> 24) | (src_r1[1] <<  8);
      dst1[2] = (src_r1[1] >> 24) | (src_r1[2] <<  8);
      dst1[3] = (src_r1[2] >> 24) | (src_r1[3] <<  8);
      dst2[0] = (src_r1[3] >> 24);
      break;

    case 2:
      dst0[0] = src_l0[0]         | (src_r0[0] << 16);
      dst0[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst0[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[0] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst1[1] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst1[2] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst1[3] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      dst2[0] = (src_r1[3] >> 16);
      break;

    case 3:
      dst0[0] = src_l0[0]         | (src_r0[0] << 24);
      dst0[1] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst0[2] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst0[3] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      dst1[0] = (src_r0[3] >>  8) | (src_r1[0] << 24);
      dst1[1] = (src_r1[0] >>  8) | (src_r1[1] << 24);
      dst1[2] = (src_r1[1] >>  8) | (src_r1[2] << 24);
      dst1[3] = (src_r1[2] >>  8) | (src_r1[3] << 24);
      dst2[0] = (src_r1[3] >>  8);
      break;

    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      dst1[1] = src_r1[0];
      dst1[2] = src_r1[1];
      dst1[3] = src_r1[2];
      dst2[0] = src_r1[3];
      break;

    case 5:
      dst0[1] = src_l0[1]         | (src_r0[0] <<  8);
      dst0[2] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst0[3] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst1[0] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      dst1[1] = (src_r0[3] >> 24) | (src_r1[0] <<  8);
      dst1[2] = (src_r1[0] >> 24) | (src_r1[1] <<  8);
      dst1[3] = (src_r1[1] >> 24) | (src_r1[2] <<  8);
      dst2[0] = (src_r1[2] >> 24) | (src_r1[3] <<  8);
      dst2[1] = (src_r1[3] >> 24);
      break;

    case 6:
      dst0[1] = src_l0[1]         | (src_r0[0] << 16);
      dst0[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst0[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[0] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[1] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst1[2] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst1[3] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst2[0] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      dst2[1] = (src_r1[3] >> 16);
      break;

    case 7:
      dst0[1] = src_l0[1]         | (src_r0[0] << 24);
      dst0[2] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst0[3] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst1[0] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      dst1[1] = (src_r0[3] >>  8) | (src_r1[0] << 24);
      dst1[2] = (src_r1[0] >>  8) | (src_r1[1] << 24);
      dst1[3] = (src_r1[1] >>  8) | (src_r1[2] << 24);
      dst2[0] = (src_r1[2] >>  8) | (src_r1[3] << 24);
      dst2[1] = (src_r1[3] >>  8);
      break;

    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      dst1[2] = src_r1[0];
      dst1[3] = src_r1[1];
      dst2[0] = src_r1[2];
      dst2[1] = src_r1[3];
      break;

    case 9:
      dst0[2] = src_l0[2]         | (src_r0[0] <<  8);
      dst0[3] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst1[0] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst1[1] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      dst1[2] = (src_r0[3] >> 24) | (src_r1[0] <<  8);
      dst1[3] = (src_r1[0] >> 24) | (src_r1[1] <<  8);
      dst2[0] = (src_r1[1] >> 24) | (src_r1[2] <<  8);
      dst2[1] = (src_r1[2] >> 24) | (src_r1[3] <<  8);
      dst2[2] = (src_r1[3] >> 24);
      break;

    case 10:
      dst0[2] = src_l0[2]         | (src_r0[0] << 16);
      dst0[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[0] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[1] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[2] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst1[3] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst2[0] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst2[1] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      dst2[2] = (src_r1[3] >> 16);
      break;

    case 11:
      dst0[2] = src_l0[2]         | (src_r0[0] << 24);
      dst0[3] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst1[0] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst1[1] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      dst1[2] = (src_r0[3] >>  8) | (src_r1[0] << 24);
      dst1[3] = (src_r1[0] >>  8) | (src_r1[1] << 24);
      dst2[0] = (src_r1[1] >>  8) | (src_r1[2] << 24);
      dst2[1] = (src_r1[2] >>  8) | (src_r1[3] << 24);
      dst2[2] = (src_r1[3] >>  8);
      break;

    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      dst1[3] = src_r1[0];
      dst2[0] = src_r1[1];
      dst2[1] = src_r1[2];
      dst2[2] = src_r1[3];
      break;

    case 13:
      dst0[3] = src_l0[3]         | (src_r0[0] <<  8);
      dst1[0] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst1[1] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst1[2] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      dst1[3] = (src_r0[3] >> 24) | (src_r1[0] <<  8);
      dst2[0] = (src_r1[0] >> 24) | (src_r1[1] <<  8);
      dst2[1] = (src_r1[1] >> 24) | (src_r1[2] <<  8);
      dst2[2] = (src_r1[2] >> 24) | (src_r1[3] <<  8);
      dst2[3] = (src_r1[3] >> 24);
      break;

    case 14:
      dst0[3] = src_l0[3]         | (src_r0[0] << 16);
      dst1[0] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[1] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[2] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst1[3] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst2[0] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst2[1] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst2[2] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      dst2[3] = (src_r1[3] >> 16);
      break;

    case 15:
      dst0[3] = src_l0[3]         | (src_r0[0] << 24);
      dst1[0] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst1[1] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst1[2] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      dst1[3] = (src_r0[3] >>  8) | (src_r1[0] << 24);
      dst2[0] = (src_r1[0] >>  8) | (src_r1[1] << 24);
      dst2[1] = (src_r1[1] >>  8) | (src_r1[2] << 24);
      dst2[2] = (src_r1[2] >>  8) | (src_r1[3] << 24);
      dst2[3] = (src_r1[3] >>  8);
      break;

    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      dst2[0] = src_r1[0];
      dst2[1] = src_r1[1];
      dst2[2] = src_r1[2];
      dst2[3] = src_r1[3];
      break;

    // From here on the appended data no longer fits completely: the trailing
    // source words that would fall behind dst2[3] are not stored.

    case 17:
      dst1[0] = src_l1[0]         | (src_r0[0] <<  8);
      dst1[1] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst1[2] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst1[3] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      dst2[0] = (src_r0[3] >> 24) | (src_r1[0] <<  8);
      dst2[1] = (src_r1[0] >> 24) | (src_r1[1] <<  8);
      dst2[2] = (src_r1[1] >> 24) | (src_r1[2] <<  8);
      dst2[3] = (src_r1[2] >> 24) | (src_r1[3] <<  8);
      break;

    case 18:
      dst1[0] = src_l1[0]         | (src_r0[0] << 16);
      dst1[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst1[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[0] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst2[1] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst2[2] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      dst2[3] = (src_r1[2] >> 16) | (src_r1[3] << 16);
      break;

    case 19:
      dst1[0] = src_l1[0]         | (src_r0[0] << 24);
      dst1[1] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst1[2] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst1[3] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      dst2[0] = (src_r0[3] >>  8) | (src_r1[0] << 24);
      dst2[1] = (src_r1[0] >>  8) | (src_r1[1] << 24);
      dst2[2] = (src_r1[1] >>  8) | (src_r1[2] << 24);
      dst2[3] = (src_r1[2] >>  8) | (src_r1[3] << 24);
      break;

    case 20:
      // The first appended word is src_r0[0]; src_r1[0] belongs four words
      // later (dst2[1] below).
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      dst2[0] = src_r0[3];
      dst2[1] = src_r1[0];
      dst2[2] = src_r1[1];
      dst2[3] = src_r1[2];
      break;

    case 21:
      dst1[1] = src_l1[1]         | (src_r0[0] <<  8);
      dst1[2] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst1[3] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst2[0] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      dst2[1] = (src_r0[3] >> 24) | (src_r1[0] <<  8);
      dst2[2] = (src_r1[0] >> 24) | (src_r1[1] <<  8);
      dst2[3] = (src_r1[1] >> 24) | (src_r1[2] <<  8);
      break;

    case 22:
      dst1[1] = src_l1[1]         | (src_r0[0] << 16);
      dst1[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst1[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[0] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[1] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst2[2] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      dst2[3] = (src_r1[1] >> 16) | (src_r1[2] << 16);
      break;

    case 23:
      dst1[1] = src_l1[1]         | (src_r0[0] << 24);
      dst1[2] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst1[3] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst2[0] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      dst2[1] = (src_r0[3] >>  8) | (src_r1[0] << 24);
      dst2[2] = (src_r1[0] >>  8) | (src_r1[1] << 24);
      dst2[3] = (src_r1[1] >>  8) | (src_r1[2] << 24);
      break;

    case 24:
      // The first appended word is src_r0[0]; src_r1[0] lands in dst2[2].
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      dst2[0] = src_r0[2];
      dst2[1] = src_r0[3];
      dst2[2] = src_r1[0];
      dst2[3] = src_r1[1];
      break;

    case 25:
      dst1[2] = src_l1[2]         | (src_r0[0] <<  8);
      dst1[3] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst2[0] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst2[1] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      dst2[2] = (src_r0[3] >> 24) | (src_r1[0] <<  8);
      dst2[3] = (src_r1[0] >> 24) | (src_r1[1] <<  8);
      break;

    case 26:
      dst1[2] = src_l1[2]         | (src_r0[0] << 16);
      dst1[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[0] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[1] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[2] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      dst2[3] = (src_r1[0] >> 16) | (src_r1[1] << 16);
      break;

    case 27:
      dst1[2] = src_l1[2]         | (src_r0[0] << 24);
      dst1[3] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst2[0] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst2[1] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      dst2[2] = (src_r0[3] >>  8) | (src_r1[0] << 24);
      dst2[3] = (src_r1[0] >>  8) | (src_r1[1] << 24);
      break;

    case 28:
      // The first appended word is src_r0[0]; src_r1[0] lands in dst2[3].
      dst1[3] = src_r0[0];
      dst2[0] = src_r0[1];
      dst2[1] = src_r0[2];
      dst2[2] = src_r0[3];
      dst2[3] = src_r1[0];
      break;

    case 29:
      dst1[3] = src_l1[3]         | (src_r0[0] <<  8);
      dst2[0] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst2[1] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst2[2] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      dst2[3] = (src_r0[3] >> 24) | (src_r1[0] <<  8);
      break;

    case 30:
      dst1[3] = src_l1[3]         | (src_r0[0] << 16);
      dst2[0] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[1] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[2] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      dst2[3] = (src_r0[3] >> 16) | (src_r1[0] << 16);
      break;

    case 31:
      dst1[3] = src_l1[3]         | (src_r0[0] << 24);
      dst2[0] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst2[1] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst2[2] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      dst2[3] = (src_r0[3] >>  8) | (src_r1[0] << 24);
      break;

    // From offset 32 on only src_r0 contributes; src_r1 is not read at all.

    case 32:
      dst2[0] = src_r0[0];
      dst2[1] = src_r0[1];
      dst2[2] = src_r0[2];
      dst2[3] = src_r0[3];
      break;

    case 33:
      dst2[0] = src_l2[0]         | (src_r0[0] <<  8);
      dst2[1] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst2[2] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      dst2[3] = (src_r0[2] >> 24) | (src_r0[3] <<  8);
      break;

    case 34:
      dst2[0] = src_l2[0]         | (src_r0[0] << 16);
      dst2[1] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[2] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      dst2[3] = (src_r0[2] >> 16) | (src_r0[3] << 16);
      break;

    case 35:
      dst2[0] = src_l2[0]         | (src_r0[0] << 24);
      dst2[1] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst2[2] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      dst2[3] = (src_r0[2] >>  8) | (src_r0[3] << 24);
      break;

    case 36:
      dst2[1] = src_r0[0];
      dst2[2] = src_r0[1];
      dst2[3] = src_r0[2];
      break;

    case 37:
      dst2[1] = src_l2[1]         | (src_r0[0] <<  8);
      dst2[2] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      dst2[3] = (src_r0[1] >> 24) | (src_r0[2] <<  8);
      break;

    case 38:
      dst2[1] = src_l2[1]         | (src_r0[0] << 16);
      dst2[2] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      dst2[3] = (src_r0[1] >> 16) | (src_r0[2] << 16);
      break;

    case 39:
      dst2[1] = src_l2[1]         | (src_r0[0] << 24);
      dst2[2] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      dst2[3] = (src_r0[1] >>  8) | (src_r0[2] << 24);
      break;

    case 40:
      dst2[2] = src_r0[0];
      dst2[3] = src_r0[1];
      break;

    case 41:
      dst2[2] = src_l2[2]         | (src_r0[0] <<  8);
      dst2[3] = (src_r0[0] >> 24) | (src_r0[1] <<  8);
      break;

    case 42:
      dst2[2] = src_l2[2]         | (src_r0[0] << 16);
      dst2[3] = (src_r0[0] >> 16) | (src_r0[1] << 16);
      break;

    case 43:
      dst2[2] = src_l2[2]         | (src_r0[0] << 24);
      dst2[3] = (src_r0[0] >>  8) | (src_r0[1] << 24);
      break;

    case 44:
      dst2[3] = src_r0[0];
      break;

    case 45:
      dst2[3] = src_l2[3] | (src_r0[0] <<  8);
      break;

    case 46:
      dst2[3] = src_l2[3] | (src_r0[0] << 16);
      break;

    case 47:
      dst2[3] = src_l2[3] | (src_r0[0] << 24);
      break;
  }
}
/**
 * Store the nine 32-bit words append0[0..3], append1[0..3], append2[0] into
 * the word buffer w0[0..3], w1[0..3], w2[0..3], w3[0..3], beginning at byte
 * position `offset`.
 *
 * Cases 0..15 are handled; any other offset leaves the buffer untouched.
 * When offset is a multiple of four the words are copied as they are. For the
 * other offsets every word is shifted left by 8 * (offset % 4) bits, the
 * spilled high bits are carried into the following word, and the first
 * touched word keeps its current content in the low bits (it is OR-ed, not
 * overwritten). Words in front of offset / 4 are never written.
 *
 * Only element 0 of append2 is read. This overload takes the appended data as
 * plain u32 words.
 */
__device__ static void memcat16_9 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
{
  switch (offset)
  {
    case 0:
    {
      w0[0] = append0[0];
      w0[1] = append0[1];
      w0[2] = append0[2];
      w0[3] = append0[3];
      w1[0] = append1[0];
      w1[1] = append1[1];
      w1[2] = append1[2];
      w1[3] = append1[3];
      w2[0] = append2[0];
      break;
    }

    case 1:
    {
      w0[0] = w0[0]               | (append0[0] <<  8);
      w0[1] = (append0[0] >> 24) | (append0[1] <<  8);
      w0[2] = (append0[1] >> 24) | (append0[2] <<  8);
      w0[3] = (append0[2] >> 24) | (append0[3] <<  8);
      w1[0] = (append0[3] >> 24) | (append1[0] <<  8);
      w1[1] = (append1[0] >> 24) | (append1[1] <<  8);
      w1[2] = (append1[1] >> 24) | (append1[2] <<  8);
      w1[3] = (append1[2] >> 24) | (append1[3] <<  8);
      w2[0] = (append1[3] >> 24) | (append2[0] <<  8);
      w2[1] = (append2[0] >> 24);
      break;
    }

    case 2:
    {
      w0[0] = w0[0]               | (append0[0] << 16);
      w0[1] = (append0[0] >> 16) | (append0[1] << 16);
      w0[2] = (append0[1] >> 16) | (append0[2] << 16);
      w0[3] = (append0[2] >> 16) | (append0[3] << 16);
      w1[0] = (append0[3] >> 16) | (append1[0] << 16);
      w1[1] = (append1[0] >> 16) | (append1[1] << 16);
      w1[2] = (append1[1] >> 16) | (append1[2] << 16);
      w1[3] = (append1[2] >> 16) | (append1[3] << 16);
      w2[0] = (append1[3] >> 16) | (append2[0] << 16);
      w2[1] = (append2[0] >> 16);
      break;
    }

    case 3:
    {
      w0[0] = w0[0]               | (append0[0] << 24);
      w0[1] = (append0[0] >>  8) | (append0[1] << 24);
      w0[2] = (append0[1] >>  8) | (append0[2] << 24);
      w0[3] = (append0[2] >>  8) | (append0[3] << 24);
      w1[0] = (append0[3] >>  8) | (append1[0] << 24);
      w1[1] = (append1[0] >>  8) | (append1[1] << 24);
      w1[2] = (append1[1] >>  8) | (append1[2] << 24);
      w1[3] = (append1[2] >>  8) | (append1[3] << 24);
      w2[0] = (append1[3] >>  8) | (append2[0] << 24);
      w2[1] = (append2[0] >>  8);
      break;
    }

    case 4:
    {
      w0[1] = append0[0];
      w0[2] = append0[1];
      w0[3] = append0[2];
      w1[0] = append0[3];
      w1[1] = append1[0];
      w1[2] = append1[1];
      w1[3] = append1[2];
      w2[0] = append1[3];
      w2[1] = append2[0];
      break;
    }

    case 5:
    {
      w0[1] = w0[1]               | (append0[0] <<  8);
      w0[2] = (append0[0] >> 24) | (append0[1] <<  8);
      w0[3] = (append0[1] >> 24) | (append0[2] <<  8);
      w1[0] = (append0[2] >> 24) | (append0[3] <<  8);
      w1[1] = (append0[3] >> 24) | (append1[0] <<  8);
      w1[2] = (append1[0] >> 24) | (append1[1] <<  8);
      w1[3] = (append1[1] >> 24) | (append1[2] <<  8);
      w2[0] = (append1[2] >> 24) | (append1[3] <<  8);
      w2[1] = (append1[3] >> 24) | (append2[0] <<  8);
      w2[2] = (append2[0] >> 24);
      break;
    }

    case 6:
    {
      w0[1] = w0[1]               | (append0[0] << 16);
      w0[2] = (append0[0] >> 16) | (append0[1] << 16);
      w0[3] = (append0[1] >> 16) | (append0[2] << 16);
      w1[0] = (append0[2] >> 16) | (append0[3] << 16);
      w1[1] = (append0[3] >> 16) | (append1[0] << 16);
      w1[2] = (append1[0] >> 16) | (append1[1] << 16);
      w1[3] = (append1[1] >> 16) | (append1[2] << 16);
      w2[0] = (append1[2] >> 16) | (append1[3] << 16);
      w2[1] = (append1[3] >> 16) | (append2[0] << 16);
      w2[2] = (append2[0] >> 16);
      break;
    }

    case 7:
    {
      w0[1] = w0[1]               | (append0[0] << 24);
      w0[2] = (append0[0] >>  8) | (append0[1] << 24);
      w0[3] = (append0[1] >>  8) | (append0[2] << 24);
      w1[0] = (append0[2] >>  8) | (append0[3] << 24);
      w1[1] = (append0[3] >>  8) | (append1[0] << 24);
      w1[2] = (append1[0] >>  8) | (append1[1] << 24);
      w1[3] = (append1[1] >>  8) | (append1[2] << 24);
      w2[0] = (append1[2] >>  8) | (append1[3] << 24);
      w2[1] = (append1[3] >>  8) | (append2[0] << 24);
      w2[2] = (append2[0] >>  8);
      break;
    }

    case 8:
    {
      w0[2] = append0[0];
      w0[3] = append0[1];
      w1[0] = append0[2];
      w1[1] = append0[3];
      w1[2] = append1[0];
      w1[3] = append1[1];
      w2[0] = append1[2];
      w2[1] = append1[3];
      w2[2] = append2[0];
      break;
    }

    case 9:
    {
      w0[2] = w0[2]               | (append0[0] <<  8);
      w0[3] = (append0[0] >> 24) | (append0[1] <<  8);
      w1[0] = (append0[1] >> 24) | (append0[2] <<  8);
      w1[1] = (append0[2] >> 24) | (append0[3] <<  8);
      w1[2] = (append0[3] >> 24) | (append1[0] <<  8);
      w1[3] = (append1[0] >> 24) | (append1[1] <<  8);
      w2[0] = (append1[1] >> 24) | (append1[2] <<  8);
      w2[1] = (append1[2] >> 24) | (append1[3] <<  8);
      w2[2] = (append1[3] >> 24) | (append2[0] <<  8);
      w2[3] = (append2[0] >> 24);
      break;
    }

    case 10:
    {
      w0[2] = w0[2]               | (append0[0] << 16);
      w0[3] = (append0[0] >> 16) | (append0[1] << 16);
      w1[0] = (append0[1] >> 16) | (append0[2] << 16);
      w1[1] = (append0[2] >> 16) | (append0[3] << 16);
      w1[2] = (append0[3] >> 16) | (append1[0] << 16);
      w1[3] = (append1[0] >> 16) | (append1[1] << 16);
      w2[0] = (append1[1] >> 16) | (append1[2] << 16);
      w2[1] = (append1[2] >> 16) | (append1[3] << 16);
      w2[2] = (append1[3] >> 16) | (append2[0] << 16);
      w2[3] = (append2[0] >> 16);
      break;
    }

    case 11:
    {
      w0[2] = w0[2]               | (append0[0] << 24);
      w0[3] = (append0[0] >>  8) | (append0[1] << 24);
      w1[0] = (append0[1] >>  8) | (append0[2] << 24);
      w1[1] = (append0[2] >>  8) | (append0[3] << 24);
      w1[2] = (append0[3] >>  8) | (append1[0] << 24);
      w1[3] = (append1[0] >>  8) | (append1[1] << 24);
      w2[0] = (append1[1] >>  8) | (append1[2] << 24);
      w2[1] = (append1[2] >>  8) | (append1[3] << 24);
      w2[2] = (append1[3] >>  8) | (append2[0] << 24);
      w2[3] = (append2[0] >>  8);
      break;
    }

    case 12:
    {
      w0[3] = append0[0];
      w1[0] = append0[1];
      w1[1] = append0[2];
      w1[2] = append0[3];
      w1[3] = append1[0];
      w2[0] = append1[1];
      w2[1] = append1[2];
      w2[2] = append1[3];
      w2[3] = append2[0];
      break;
    }

    // For offsets 13..15 the last carried bits spill into w3[0].

    case 13:
    {
      w0[3] = w0[3]               | (append0[0] <<  8);
      w1[0] = (append0[0] >> 24) | (append0[1] <<  8);
      w1[1] = (append0[1] >> 24) | (append0[2] <<  8);
      w1[2] = (append0[2] >> 24) | (append0[3] <<  8);
      w1[3] = (append0[3] >> 24) | (append1[0] <<  8);
      w2[0] = (append1[0] >> 24) | (append1[1] <<  8);
      w2[1] = (append1[1] >> 24) | (append1[2] <<  8);
      w2[2] = (append1[2] >> 24) | (append1[3] <<  8);
      w2[3] = (append1[3] >> 24) | (append2[0] <<  8);
      w3[0] = (append2[0] >> 24);
      break;
    }

    case 14:
    {
      w0[3] = w0[3]               | (append0[0] << 16);
      w1[0] = (append0[0] >> 16) | (append0[1] << 16);
      w1[1] = (append0[1] >> 16) | (append0[2] << 16);
      w1[2] = (append0[2] >> 16) | (append0[3] << 16);
      w1[3] = (append0[3] >> 16) | (append1[0] << 16);
      w2[0] = (append1[0] >> 16) | (append1[1] << 16);
      w2[1] = (append1[1] >> 16) | (append1[2] << 16);
      w2[2] = (append1[2] >> 16) | (append1[3] << 16);
      w2[3] = (append1[3] >> 16) | (append2[0] << 16);
      w3[0] = (append2[0] >> 16);
      break;
    }

    case 15:
    {
      w0[3] = w0[3]               | (append0[0] << 24);
      w1[0] = (append0[0] >>  8) | (append0[1] << 24);
      w1[1] = (append0[1] >>  8) | (append0[2] << 24);
      w1[2] = (append0[2] >>  8) | (append0[3] << 24);
      w1[3] = (append0[3] >>  8) | (append1[0] << 24);
      w2[0] = (append1[0] >>  8) | (append1[1] << 24);
      w2[1] = (append1[1] >>  8) | (append1[2] << 24);
      w2[2] = (append1[2] >>  8) | (append1[3] << 24);
      w2[3] = (append1[3] >>  8) | (append2[0] << 24);
      w3[0] = (append2[0] >>  8);
      break;
    }
  }
}
/**
 * Overload of memcat16_9 whose appended data is given as u32x words.
 *
 * Stores the nine 32-bit words append0[0..3], append1[0..3], append2[0] into
 * the word buffer w0[0..3], w1[0..3], w2[0..3], w3[0..3], beginning at byte
 * position `offset`.
 *
 * Cases 0..15 are handled; any other offset leaves the buffer untouched.
 * When offset is a multiple of four the words are copied as they are. For the
 * other offsets every word is shifted left by 8 * (offset % 4) bits, the
 * spilled high bits are carried into the following word, and the first
 * touched word keeps its current content in the low bits (it is OR-ed, not
 * overwritten). Words in front of offset / 4 are never written.
 *
 * Only element 0 of append2 is read.
 */
__device__ static void memcat16_9 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32 offset)
{
  switch (offset)
  {
    case 0:
    {
      w0[0] = append0[0];
      w0[1] = append0[1];
      w0[2] = append0[2];
      w0[3] = append0[3];
      w1[0] = append1[0];
      w1[1] = append1[1];
      w1[2] = append1[2];
      w1[3] = append1[3];
      w2[0] = append2[0];
      break;
    }

    case 1:
    {
      w0[0] = w0[0]               | (append0[0] <<  8);
      w0[1] = (append0[0] >> 24) | (append0[1] <<  8);
      w0[2] = (append0[1] >> 24) | (append0[2] <<  8);
      w0[3] = (append0[2] >> 24) | (append0[3] <<  8);
      w1[0] = (append0[3] >> 24) | (append1[0] <<  8);
      w1[1] = (append1[0] >> 24) | (append1[1] <<  8);
      w1[2] = (append1[1] >> 24) | (append1[2] <<  8);
      w1[3] = (append1[2] >> 24) | (append1[3] <<  8);
      w2[0] = (append1[3] >> 24) | (append2[0] <<  8);
      w2[1] = (append2[0] >> 24);
      break;
    }

    case 2:
    {
      w0[0] = w0[0]               | (append0[0] << 16);
      w0[1] = (append0[0] >> 16) | (append0[1] << 16);
      w0[2] = (append0[1] >> 16) | (append0[2] << 16);
      w0[3] = (append0[2] >> 16) | (append0[3] << 16);
      w1[0] = (append0[3] >> 16) | (append1[0] << 16);
      w1[1] = (append1[0] >> 16) | (append1[1] << 16);
      w1[2] = (append1[1] >> 16) | (append1[2] << 16);
      w1[3] = (append1[2] >> 16) | (append1[3] << 16);
      w2[0] = (append1[3] >> 16) | (append2[0] << 16);
      w2[1] = (append2[0] >> 16);
      break;
    }

    case 3:
    {
      w0[0] = w0[0]               | (append0[0] << 24);
      w0[1] = (append0[0] >>  8) | (append0[1] << 24);
      w0[2] = (append0[1] >>  8) | (append0[2] << 24);
      w0[3] = (append0[2] >>  8) | (append0[3] << 24);
      w1[0] = (append0[3] >>  8) | (append1[0] << 24);
      w1[1] = (append1[0] >>  8) | (append1[1] << 24);
      w1[2] = (append1[1] >>  8) | (append1[2] << 24);
      w1[3] = (append1[2] >>  8) | (append1[3] << 24);
      w2[0] = (append1[3] >>  8) | (append2[0] << 24);
      w2[1] = (append2[0] >>  8);
      break;
    }

    case 4:
    {
      w0[1] = append0[0];
      w0[2] = append0[1];
      w0[3] = append0[2];
      w1[0] = append0[3];
      w1[1] = append1[0];
      w1[2] = append1[1];
      w1[3] = append1[2];
      w2[0] = append1[3];
      w2[1] = append2[0];
      break;
    }

    case 5:
    {
      w0[1] = w0[1]               | (append0[0] <<  8);
      w0[2] = (append0[0] >> 24) | (append0[1] <<  8);
      w0[3] = (append0[1] >> 24) | (append0[2] <<  8);
      w1[0] = (append0[2] >> 24) | (append0[3] <<  8);
      w1[1] = (append0[3] >> 24) | (append1[0] <<  8);
      w1[2] = (append1[0] >> 24) | (append1[1] <<  8);
      w1[3] = (append1[1] >> 24) | (append1[2] <<  8);
      w2[0] = (append1[2] >> 24) | (append1[3] <<  8);
      w2[1] = (append1[3] >> 24) | (append2[0] <<  8);
      w2[2] = (append2[0] >> 24);
      break;
    }

    case 6:
    {
      w0[1] = w0[1]               | (append0[0] << 16);
      w0[2] = (append0[0] >> 16) | (append0[1] << 16);
      w0[3] = (append0[1] >> 16) | (append0[2] << 16);
      w1[0] = (append0[2] >> 16) | (append0[3] << 16);
      w1[1] = (append0[3] >> 16) | (append1[0] << 16);
      w1[2] = (append1[0] >> 16) | (append1[1] << 16);
      w1[3] = (append1[1] >> 16) | (append1[2] << 16);
      w2[0] = (append1[2] >> 16) | (append1[3] << 16);
      w2[1] = (append1[3] >> 16) | (append2[0] << 16);
      w2[2] = (append2[0] >> 16);
      break;
    }

    case 7:
    {
      w0[1] = w0[1]               | (append0[0] << 24);
      w0[2] = (append0[0] >>  8) | (append0[1] << 24);
      w0[3] = (append0[1] >>  8) | (append0[2] << 24);
      w1[0] = (append0[2] >>  8) | (append0[3] << 24);
      w1[1] = (append0[3] >>  8) | (append1[0] << 24);
      w1[2] = (append1[0] >>  8) | (append1[1] << 24);
      w1[3] = (append1[1] >>  8) | (append1[2] << 24);
      w2[0] = (append1[2] >>  8) | (append1[3] << 24);
      w2[1] = (append1[3] >>  8) | (append2[0] << 24);
      w2[2] = (append2[0] >>  8);
      break;
    }

    case 8:
    {
      w0[2] = append0[0];
      w0[3] = append0[1];
      w1[0] = append0[2];
      w1[1] = append0[3];
      w1[2] = append1[0];
      w1[3] = append1[1];
      w2[0] = append1[2];
      w2[1] = append1[3];
      w2[2] = append2[0];
      break;
    }

    case 9:
    {
      w0[2] = w0[2]               | (append0[0] <<  8);
      w0[3] = (append0[0] >> 24) | (append0[1] <<  8);
      w1[0] = (append0[1] >> 24) | (append0[2] <<  8);
      w1[1] = (append0[2] >> 24) | (append0[3] <<  8);
      w1[2] = (append0[3] >> 24) | (append1[0] <<  8);
      w1[3] = (append1[0] >> 24) | (append1[1] <<  8);
      w2[0] = (append1[1] >> 24) | (append1[2] <<  8);
      w2[1] = (append1[2] >> 24) | (append1[3] <<  8);
      w2[2] = (append1[3] >> 24) | (append2[0] <<  8);
      w2[3] = (append2[0] >> 24);
      break;
    }

    case 10:
    {
      w0[2] = w0[2]               | (append0[0] << 16);
      w0[3] = (append0[0] >> 16) | (append0[1] << 16);
      w1[0] = (append0[1] >> 16) | (append0[2] << 16);
      w1[1] = (append0[2] >> 16) | (append0[3] << 16);
      w1[2] = (append0[3] >> 16) | (append1[0] << 16);
      w1[3] = (append1[0] >> 16) | (append1[1] << 16);
      w2[0] = (append1[1] >> 16) | (append1[2] << 16);
      w2[1] = (append1[2] >> 16) | (append1[3] << 16);
      w2[2] = (append1[3] >> 16) | (append2[0] << 16);
      w2[3] = (append2[0] >> 16);
      break;
    }

    case 11:
    {
      w0[2] = w0[2]               | (append0[0] << 24);
      w0[3] = (append0[0] >>  8) | (append0[1] << 24);
      w1[0] = (append0[1] >>  8) | (append0[2] << 24);
      w1[1] = (append0[2] >>  8) | (append0[3] << 24);
      w1[2] = (append0[3] >>  8) | (append1[0] << 24);
      w1[3] = (append1[0] >>  8) | (append1[1] << 24);
      w2[0] = (append1[1] >>  8) | (append1[2] << 24);
      w2[1] = (append1[2] >>  8) | (append1[3] << 24);
      w2[2] = (append1[3] >>  8) | (append2[0] << 24);
      w2[3] = (append2[0] >>  8);
      break;
    }

    case 12:
    {
      w0[3] = append0[0];
      w1[0] = append0[1];
      w1[1] = append0[2];
      w1[2] = append0[3];
      w1[3] = append1[0];
      w2[0] = append1[1];
      w2[1] = append1[2];
      w2[2] = append1[3];
      w2[3] = append2[0];
      break;
    }

    // For offsets 13..15 the last carried bits spill into w3[0].

    case 13:
    {
      w0[3] = w0[3]               | (append0[0] <<  8);
      w1[0] = (append0[0] >> 24) | (append0[1] <<  8);
      w1[1] = (append0[1] >> 24) | (append0[2] <<  8);
      w1[2] = (append0[2] >> 24) | (append0[3] <<  8);
      w1[3] = (append0[3] >> 24) | (append1[0] <<  8);
      w2[0] = (append1[0] >> 24) | (append1[1] <<  8);
      w2[1] = (append1[1] >> 24) | (append1[2] <<  8);
      w2[2] = (append1[2] >> 24) | (append1[3] <<  8);
      w2[3] = (append1[3] >> 24) | (append2[0] <<  8);
      w3[0] = (append2[0] >> 24);
      break;
    }

    case 14:
    {
      w0[3] = w0[3]               | (append0[0] << 16);
      w1[0] = (append0[0] >> 16) | (append0[1] << 16);
      w1[1] = (append0[1] >> 16) | (append0[2] << 16);
      w1[2] = (append0[2] >> 16) | (append0[3] << 16);
      w1[3] = (append0[3] >> 16) | (append1[0] << 16);
      w2[0] = (append1[0] >> 16) | (append1[1] << 16);
      w2[1] = (append1[1] >> 16) | (append1[2] << 16);
      w2[2] = (append1[2] >> 16) | (append1[3] << 16);
      w2[3] = (append1[3] >> 16) | (append2[0] << 16);
      w3[0] = (append2[0] >> 16);
      break;
    }

    case 15:
    {
      w0[3] = w0[3]               | (append0[0] << 24);
      w1[0] = (append0[0] >>  8) | (append0[1] << 24);
      w1[1] = (append0[1] >>  8) | (append0[2] << 24);
      w1[2] = (append0[2] >>  8) | (append0[3] << 24);
      w1[3] = (append0[3] >>  8) | (append1[0] << 24);
      w2[0] = (append1[0] >>  8) | (append1[1] << 24);
      w2[1] = (append1[1] >>  8) | (append1[2] << 24);
      w2[2] = (append1[2] >>  8) | (append1[3] << 24);
      w2[3] = (append1[3] >>  8) | (append2[0] << 24);
      w3[0] = (append2[0] >>  8);
      break;
    }
  }
}
/**
 * memcat32_8: store the eight 32-bit words append0[0..3], append1[0..3]
 * (32 bytes) into the buffer formed by w0, w1, w2, w3, starting at byte
 * position `offset`.
 *
 * The buffer is addressed as the flat word sequence w0[0] .. w3[1].
 * w3[2] and w3[3] are never written; source data that would land beyond
 * w3[1] is dropped. Byte k of a word occupies bits 8k .. 8k+7.
 *
 *  - offset is a multiple of 4: the target words are overwritten.
 *  - otherwise: the first target word keeps its current bits and gets the
 *    low bytes of append0[0] OR-ed in above them; every following target
 *    word is overwritten.
 *  - offset > 32: nothing is written.
 */
__device__
static void memcat32_8 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 append0[4], const u32 append1[4], const u32 offset)
{
  // only offsets 0..32 are handled; anything else leaves the buffer as is
  if (offset > 32) return;

  const u32 src_words = 8;
  const u32 dst_words = 14; // w0[0] .. w3[1]

  // flat view of the writable destination words
  u32x * const dst[14] =
  {
    &w0[0], &w0[1], &w0[2], &w0[3],
    &w1[0], &w1[1], &w1[2], &w1[3],
    &w2[0], &w2[1], &w2[2], &w2[3],
    &w3[0], &w3[1]
  };

  const u32 first = offset / 4;       // index of the first word touched
  const u32 lsh   = (offset % 4) * 8; // bit shift inside that word

  if (lsh == 0)
  {
    // word aligned: plain copy, clipped at w3[1]
    for (u32 i = 0; i < src_words; i++)
    {
      const u32 pos = first + i;

      if (pos >= dst_words) break;

      *dst[pos] = (i < 4) ? append0[i] : append1[i - 4];
    }

    return;
  }

  const u32 rsh = 32 - lsh;

  // first word: keep what is already there, add the low bytes of the source
  *dst[first] = *dst[first] | (append0[0] << lsh);

  // middle words: high bytes of the previous source word, low bytes of the current one
  for (u32 i = 1; i < src_words; i++)
  {
    const u32 pos = first + i;

    if (pos >= dst_words) break;

    const u32 prev = (i - 1 < 4) ? append0[i - 1] : append1[i - 5];
    const u32 curr = (i     < 4) ? append0[i]     : append1[i - 4];

    *dst[pos] = (prev >> rsh) | (curr << lsh);
  }

  // last word: the bytes shifted out of append1[3], if they still fit
  const u32 last = first + src_words;

  if (last < dst_words)
  {
    *dst[last] = append1[3] >> rsh;
  }
}
/**
 * memcat32_9: store the nine 32-bit words append0[0..3], append1[0..3],
 * append2[0] (36 bytes) into the buffer formed by w0, w1, w2, w3, starting
 * at byte position `offset`. Only element 0 of append2 is read.
 *
 * The buffer is addressed as the flat word sequence w0[0] .. w3[1].
 * w3[2] and w3[3] are never written; source data that would land beyond
 * w3[1] is dropped. Byte k of a word occupies bits 8k .. 8k+7.
 *
 *  - offset is a multiple of 4: the target words are overwritten.
 *  - otherwise: the first target word keeps its current bits and gets the
 *    low bytes of append0[0] OR-ed in above them; every following target
 *    word is overwritten.
 *  - offset > 32: nothing is written.
 */
__device__
static void memcat32_9 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
{
  // only offsets 0..32 are handled; anything else leaves the buffer as is
  if (offset > 32) return;

  const u32 src_words = 9;
  const u32 dst_words = 14; // w0[0] .. w3[1]

  // flat view of the writable destination words
  u32x * const dst[14] =
  {
    &w0[0], &w0[1], &w0[2], &w0[3],
    &w1[0], &w1[1], &w1[2], &w1[3],
    &w2[0], &w2[1], &w2[2], &w2[3],
    &w3[0], &w3[1]
  };

  const u32 first = offset / 4;       // index of the first word touched
  const u32 lsh   = (offset % 4) * 8; // bit shift inside that word

  if (lsh == 0)
  {
    // word aligned: plain copy, clipped at w3[1]
    for (u32 i = 0; i < src_words; i++)
    {
      const u32 pos = first + i;

      if (pos >= dst_words) break;

      *dst[pos] = (i < 4) ? append0[i] : (i < 8) ? append1[i - 4] : append2[0];
    }

    return;
  }

  const u32 rsh = 32 - lsh;

  // first word: keep what is already there, add the low bytes of the source
  *dst[first] = *dst[first] | (append0[0] << lsh);

  // middle words: high bytes of the previous source word, low bytes of the current one
  for (u32 i = 1; i < src_words; i++)
  {
    const u32 pos = first + i;

    if (pos >= dst_words) break;

    const u32 prev = (i - 1 < 4) ? append0[i - 1] : append1[i - 5];
    const u32 curr = (i < 4) ? append0[i] : (i < 8) ? append1[i - 4] : append2[0];

    *dst[pos] = (prev >> rsh) | (curr << lsh);
  }

  // last word: the bytes shifted out of append2[0], if they still fit
  const u32 last = first + src_words;

  if (last < dst_words)
  {
    *dst[last] = append2[0] >> rsh;
  }
}
// switch_buffer_by_offset: displaces the data held in the word blocks
// w0, w1, w2, w3 toward higher positions by `offset` bytes. In the
// __byte_perm path below the result is written back into the same arrays,
// highest word first, so every source word is read before it is replaced.
// The highest word written in the visible statements is w3[1].
// NOTE(review): braces, case labels, break statements and some statements
// (the embedded original line numbers skip) are not visible in this listing;
// comments below describe only what the visible statements do.
14184 __device__
static void switch_buffer_by_offset (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
// Variant compiled when __CUDA_ARCH__ >= 200: byte shuffling via __byte_perm.
14186 #if __CUDA_ARCH__ >= 200
// Despite its name this holds 4 - (offset % 4), i.e. a value in 1..4.
14188 const int offset_minus_4
= 4 - (offset
% 4);
// Four selector nibbles taken from 0x76543210. Assuming the documented CUDA
// __byte_perm semantics (first argument = bytes 0..3, second = bytes 4..7),
// with r = offset % 4 the result is the top r bytes of the first argument
// followed by the low 4 - r bytes of the second; for r == 0 the selector is
// 0x7654 and the second argument is returned unchanged.
14190 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
// Dispatch on the whole-word part of the offset. Each statement group below
// displaces the data by one more whole word than the group before it
// (presumably case 0, 1, 2, ... - the labels are not visible). The lowest
// word written in each group uses 0 as its first __byte_perm argument.
14192 switch (offset
/ 4)
// group 1: every word is combined with its lower neighbour
14195 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
14196 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
14197 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
14198 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
14199 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
14200 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
14201 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
14202 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
14203 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
14204 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
14205 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
14206 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
14207 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
14208 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
// group 2: sources are one word lower than in group 1; lowest word written is w0[1]
14213 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
14214 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
14215 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
14216 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
14217 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
14218 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
14219 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
14220 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
14221 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
14222 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
14223 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
14224 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
14225 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
// group 3: lowest word written is w0[2]
14231 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
14232 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
14233 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
14234 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
14235 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
14236 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
14237 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
14238 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
14239 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
14240 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
14241 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
14242 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
// group 4: lowest word written is w0[3]
14249 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
14250 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
14251 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
14252 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
14253 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
14254 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
14255 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
14256 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
14257 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
14258 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
14259 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
// group 5: lowest word written is w1[0]
14267 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
14268 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
14269 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
14270 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
14271 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
14272 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
14273 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
14274 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
14275 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
14276 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
// group 6: lowest word written is w1[1]
14285 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
14286 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
14287 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
14288 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
14289 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
14290 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
14291 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
14292 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
14293 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
// group 7: lowest word written is w1[2]
14303 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
14304 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
14305 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
14306 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
14307 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
14308 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
14309 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
14310 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
// group 8: lowest word written is w1[3]
14321 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
14322 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
14323 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
14324 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
14325 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
14326 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
14327 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
// group 9: lowest word written is w2[0]
14339 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
14340 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
14341 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
14342 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
14343 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
14344 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
// group 10: lowest word written is w2[1]
14357 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
14358 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
14359 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
14360 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
14361 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
// group 11: lowest word written is w2[2]
14375 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
14376 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
14377 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
14378 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
// group 12: lowest word written is w2[3]
14393 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
14394 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
14395 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
// group 13: lowest word written is w3[0]
14411 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
14412 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
// group 14: only w3[1] is written
14429 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
// Shift-based variant (presumably the #else branch of the #if above; the
// directive and the tmp0/tmp1/tmp2 declarations are not visible - confirm).
// Step 1: dispatch on the sub-word part of the offset and build, in
// tmp0[0..3], tmp1[0..3], tmp2[0], the words w0[0..3], w1[0..3] shifted left
// by 8, 16 or 24 bits, carrying the bits shifted out of each word into the
// next one. Only w0 and w1 are read here.
14453 switch (offset
% 4)
// shift by 8 bits
14468 tmp0
[0] = w0
[0] << 8;
14469 tmp0
[1] = w0
[0] >> 24 | w0
[1] << 8;
14470 tmp0
[2] = w0
[1] >> 24 | w0
[2] << 8;
14471 tmp0
[3] = w0
[2] >> 24 | w0
[3] << 8;
14472 tmp1
[0] = w0
[3] >> 24 | w1
[0] << 8;
14473 tmp1
[1] = w1
[0] >> 24 | w1
[1] << 8;
14474 tmp1
[2] = w1
[1] >> 24 | w1
[2] << 8;
14475 tmp1
[3] = w1
[2] >> 24 | w1
[3] << 8;
14476 tmp2
[0] = w1
[3] >> 24;
// shift by 16 bits
14480 tmp0
[0] = w0
[0] << 16;
14481 tmp0
[1] = w0
[0] >> 16 | w0
[1] << 16;
14482 tmp0
[2] = w0
[1] >> 16 | w0
[2] << 16;
14483 tmp0
[3] = w0
[2] >> 16 | w0
[3] << 16;
14484 tmp1
[0] = w0
[3] >> 16 | w1
[0] << 16;
14485 tmp1
[1] = w1
[0] >> 16 | w1
[1] << 16;
14486 tmp1
[2] = w1
[1] >> 16 | w1
[2] << 16;
14487 tmp1
[3] = w1
[2] >> 16 | w1
[3] << 16;
14488 tmp2
[0] = w1
[3] >> 16;
// shift by 24 bits
14492 tmp0
[0] = w0
[0] << 24;
14493 tmp0
[1] = w0
[0] >> 8 | w0
[1] << 24;
14494 tmp0
[2] = w0
[1] >> 8 | w0
[2] << 24;
14495 tmp0
[3] = w0
[2] >> 8 | w0
[3] << 24;
14496 tmp1
[0] = w0
[3] >> 8 | w1
[0] << 24;
14497 tmp1
[1] = w1
[0] >> 8 | w1
[1] << 24;
14498 tmp1
[2] = w1
[1] >> 8 | w1
[2] << 24;
14499 tmp1
[3] = w1
[2] >> 8 | w1
[3] << 24;
14500 tmp2
[0] = w1
[3] >> 8;
// Step 2: dispatch on the whole-word part of the offset. The body of this
// switch is not visible in this listing; presumably it stores tmp0/tmp1/tmp2
// back into w0..w3 at the selected word position - confirm.
14504 switch (offset
/ 4)
// switch_buffer_by_offset_be: rewrites the word buffers w0..w3 in place by
// merging pairs of 32-bit words with __byte_perm.
//
//   w0, w1, w2, w3 - 4-word arrays that are both read and overwritten; the
//                    visible statements only ever touch w3[0] and w3[1],
//                    never w3[2] or w3[3]
//   offset         - (offset & 3) picks the byte selector, (offset / 4) is the
//                    value the switch below dispatches on
//
// Returns nothing; the result is left in the argument arrays.
//
// NOTE(review): the "_be" suffix suggests a big-endian word layout and
// `offset` looks like a byte count - neither is established by the lines
// visible here, confirm against the callers.
// NOTE(review): the embedded original line numbers jump between the
// assignment groups below, so braces, case labels, breaks and possibly other
// statements are missing from this excerpt. In particular no visible
// statement assigns the words below the lowest one written in each group -
// check the full file before relying on their contents.
14749 __device__
static void switch_buffer_by_offset_be (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
// Nibble pattern for __byte_perm: with x supplying bytes 0-3 and y supplying
// bytes 4-7 of the pair, the low 16 bits of 0x76543210 shifted right by
// (offset & 3) nibbles select the four consecutive bytes starting at byte
// (offset & 3). For (offset & 3) == 0 this is 0x3210, i.e. x comes back
// unchanged.
14751 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
// Dispatch on the whole-word part of offset. Each group below moves data a
// different number of words upwards.
14753 switch (offset
/ 4)
// Group 1 of 14 (presumably the `case 0:` arm - labels are not visible here,
// confirm): every word from w3[1] down to w0[0] is recombined with the word
// directly below it; the lowest word takes a literal 0 as its partner.
// Assignments run from the highest word downwards so that each source word is
// read before it is overwritten; the later groups keep the same order.
14756 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
14757 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
14758 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
14759 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
14760 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
14761 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
14762 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
14763 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
14764 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
14765 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
14766 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
14767 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
14768 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
14769 w0
[0] = __byte_perm (w0
[0], 0, selector
);
// Group 2 (presumably `case 1:`): each destination is built from the words
// one and two positions below it; w0[1] is the lowest word written.
14773 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
14774 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
14775 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
14776 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
14777 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
14778 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
14779 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
14780 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
14781 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
14782 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
14783 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
14784 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
14785 w0
[1] = __byte_perm (w0
[0], 0, selector
);
// Group 3 (presumably `case 2:`): sources are two and three words below the
// destination; w0[2] is the lowest word written.
14790 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
14791 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
14792 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
14793 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
14794 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
14795 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
14796 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
14797 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
14798 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
14799 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
14800 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
14801 w0
[2] = __byte_perm (w0
[0], 0, selector
);
// Group 4 (presumably `case 3:`): sources are three and four words below the
// destination; w0[3] is the lowest word written.
14807 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
14808 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
14809 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
14810 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
14811 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
14812 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
14813 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
14814 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
14815 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
14816 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
14817 w0
[3] = __byte_perm (w0
[0], 0, selector
);
// Group 5 (presumably `case 4:`): sources are four and five words below the
// destination; w1[0] is the lowest word written.
14824 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
14825 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
14826 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
14827 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
14828 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
14829 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
14830 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
14831 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
14832 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
14833 w1
[0] = __byte_perm (w0
[0], 0, selector
);
// Group 6 (presumably `case 5:`): sources are five and six words below the
// destination; w1[1] is the lowest word written.
14841 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
14842 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
14843 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
14844 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
14845 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
14846 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
14847 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
14848 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
14849 w1
[1] = __byte_perm (w0
[0], 0, selector
);
// Group 7 (presumably `case 6:`): sources are six and seven words below the
// destination; w1[2] is the lowest word written.
14858 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
14859 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
14860 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
14861 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
14862 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
14863 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
14864 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
14865 w1
[2] = __byte_perm (w0
[0], 0, selector
);
// Group 8 (presumably `case 7:`): sources are seven and eight words below the
// destination; w1[3] is the lowest word written.
14875 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
14876 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
14877 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
14878 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
14879 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
14880 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
14881 w1
[3] = __byte_perm (w0
[0], 0, selector
);
// Group 9 (presumably `case 8:`): sources are eight and nine words below the
// destination; w2[0] is the lowest word written.
14892 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
14893 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
14894 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
14895 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
14896 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
14897 w2
[0] = __byte_perm (w0
[0], 0, selector
);
// Group 10 (presumably `case 9:`): sources are nine and ten words below the
// destination; w2[1] is the lowest word written.
14909 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
14910 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
14911 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
14912 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
14913 w2
[1] = __byte_perm (w0
[0], 0, selector
);
// Group 11 (presumably `case 10:`): sources are ten and eleven words below
// the destination; w2[2] is the lowest word written.
14926 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
14927 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
14928 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
14929 w2
[2] = __byte_perm (w0
[0], 0, selector
);
// Group 12 (presumably `case 11:`): sources are eleven and twelve words below
// the destination; w2[3] is the lowest word written.
14943 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
14944 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
14945 w2
[3] = __byte_perm (w0
[0], 0, selector
);
// Group 13 (presumably `case 12:`): sources are twelve and thirteen words
// below the destination; w3[0] is the lowest word written.
14960 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
14961 w3
[0] = __byte_perm (w0
[0], 0, selector
);
// Group 14 (presumably `case 13:`): only w3[1] is written, from w0[0] paired
// with a literal 0.
14977 w3
[1] = __byte_perm (w0
[0], 0, selector
);
14997 __device__
static u32
check_vector_accessible (const u32 il_pos
, const u32 bf_loops
, const u32 bfs_cnt
, const u32 element
)
15001 // nothing to do here
15005 if ((il_pos
+ 1) == bf_loops
)
15008 u32 bfs_over
= bfs_cnt
% 2;
15010 if (bfs_over
== 0) bfs_over
= 2;
15014 u32 bfs_over
= bfs_cnt
% 4;
15016 if (bfs_over
== 0) bfs_over
= 4;
15019 if (element
>= bfs_over
) return 0;