2 * Author......: Jens Steube <jens.steube@gmail.com>
6 static int device_memcmp (const u32 d1
[4], __global u32
*d2
)
8 if (d1
[3] > d2
[DGST_R3
]) return ( 1);
9 if (d1
[3] < d2
[DGST_R3
]) return (-1);
10 if (d1
[2] > d2
[DGST_R2
]) return ( 1);
11 if (d1
[2] < d2
[DGST_R2
]) return (-1);
12 if (d1
[1] > d2
[DGST_R1
]) return ( 1);
13 if (d1
[1] < d2
[DGST_R1
]) return (-1);
14 if (d1
[0] > d2
[DGST_R0
]) return ( 1);
15 if (d1
[0] < d2
[DGST_R0
]) return (-1);
20 static int find_hash (const u32 digest
[4], const u32 digests_cnt
, __global digest_t
*digests_buf
)
22 for (u32 l
= 0, r
= digests_cnt
; r
; r
>>= 1)
28 const int cmp
= device_memcmp (digest
, digests_buf
[c
].digest_buf
);
37 if (cmp
== 0) return (c
);
43 static u32
check_bitmap (__global u32
*bitmap
, const u32 bitmap_mask
, const u32 bitmap_shift
, const u32 digest
)
45 return (bitmap
[(digest
>> bitmap_shift
) & bitmap_mask
] & (1 << (digest
& 0x1f)));
48 static u32
check (const u32 digest
[2], __global u32
*bitmap_s1_a
, __global u32
*bitmap_s1_b
, __global u32
*bitmap_s1_c
, __global u32
*bitmap_s1_d
, __global u32
*bitmap_s2_a
, __global u32
*bitmap_s2_b
, __global u32
*bitmap_s2_c
, __global u32
*bitmap_s2_d
, const u32 bitmap_mask
, const u32 bitmap_shift1
, const u32 bitmap_shift2
)
50 if (check_bitmap (bitmap_s1_a
, bitmap_mask
, bitmap_shift1
, digest
[0]) == 0) return (0);
51 if (check_bitmap (bitmap_s1_b
, bitmap_mask
, bitmap_shift1
, digest
[1]) == 0) return (0);
52 if (check_bitmap (bitmap_s1_c
, bitmap_mask
, bitmap_shift1
, digest
[2]) == 0) return (0);
53 if (check_bitmap (bitmap_s1_d
, bitmap_mask
, bitmap_shift1
, digest
[3]) == 0) return (0);
55 if (check_bitmap (bitmap_s2_a
, bitmap_mask
, bitmap_shift2
, digest
[0]) == 0) return (0);
56 if (check_bitmap (bitmap_s2_b
, bitmap_mask
, bitmap_shift2
, digest
[1]) == 0) return (0);
57 if (check_bitmap (bitmap_s2_c
, bitmap_mask
, bitmap_shift2
, digest
[2]) == 0) return (0);
58 if (check_bitmap (bitmap_s2_d
, bitmap_mask
, bitmap_shift2
, digest
[3]) == 0) return (0);
63 static void mark_hash (__global plain_t
*plains_buf
, __global u32
*hashes_shown
, const int hash_pos
, const u32 gid
, const u32 il_pos
)
65 hashes_shown
[hash_pos
] = 1;
67 plains_buf
[hash_pos
].gidvid
= (gid
* 1) + 0;
68 plains_buf
[hash_pos
].il_pos
= il_pos
;
71 static void truncate_block (u32 w
[4], const u32 len
)
80 case 1: w
[0] &= 0x000000FF;
85 case 2: w
[0] &= 0x0000FFFF;
90 case 3: w
[0] &= 0x00FFFFFF;
99 case 5: w
[1] &= 0x000000FF;
103 case 6: w
[1] &= 0x0000FFFF;
107 case 7: w
[1] &= 0x00FFFFFF;
114 case 9: w
[2] &= 0x000000FF;
117 case 10: w
[2] &= 0x0000FFFF;
120 case 11: w
[2] &= 0x00FFFFFF;
125 case 13: w
[3] &= 0x000000FF;
127 case 14: w
[3] &= 0x0000FFFF;
129 case 15: w
[3] &= 0x00FFFFFF;
134 static void make_unicode (const u32 in
[4], u32 out1
[4], u32 out2
[4])
137 out2
[3] = __byte_perm (in
[3], 0, 0x7372);
138 out2
[2] = __byte_perm (in
[3], 0, 0x7170);
139 out2
[1] = __byte_perm (in
[2], 0, 0x7372);
140 out2
[0] = __byte_perm (in
[2], 0, 0x7170);
141 out1
[3] = __byte_perm (in
[1], 0, 0x7372);
142 out1
[2] = __byte_perm (in
[1], 0, 0x7170);
143 out1
[1] = __byte_perm (in
[0], 0, 0x7372);
144 out1
[0] = __byte_perm (in
[0], 0, 0x7170);
148 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
149 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
150 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
151 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
152 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
153 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
154 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
155 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
159 static void undo_unicode (const u32 in1
[4], const u32 in2
[4], u32 out
[4])
162 out
[0] = __byte_perm (in1
[0], in1
[1], 0x6420);
163 out
[1] = __byte_perm (in1
[2], in1
[3], 0x6420);
164 out
[2] = __byte_perm (in2
[0], in2
[1], 0x6420);
165 out
[3] = __byte_perm (in2
[2], in2
[3], 0x6420);
169 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
170 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
171 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
172 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
173 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
174 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
175 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
176 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
180 // before: append_0x01_1
181 static void append_0x01_1x4 (u32 w0
[4], const u32 offset
)
190 w0
[0] = w0
[0] | 0x0100;
194 w0
[0] = w0
[0] | 0x010000;
198 w0
[0] = w0
[0] | 0x01000000;
206 w0
[1] = w0
[1] | 0x0100;
210 w0
[1] = w0
[1] | 0x010000;
214 w0
[1] = w0
[1] | 0x01000000;
222 w0
[2] = w0
[2] | 0x0100;
226 w0
[2] = w0
[2] | 0x010000;
230 w0
[2] = w0
[2] | 0x01000000;
238 w0
[3] = w0
[3] | 0x0100;
242 w0
[3] = w0
[3] | 0x010000;
246 w0
[3] = w0
[3] | 0x01000000;
251 // before: append_0x01_2
252 static void append_0x01_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
261 w0
[0] = w0
[0] | 0x0100;
265 w0
[0] = w0
[0] | 0x010000;
269 w0
[0] = w0
[0] | 0x01000000;
277 w0
[1] = w0
[1] | 0x0100;
281 w0
[1] = w0
[1] | 0x010000;
285 w0
[1] = w0
[1] | 0x01000000;
293 w0
[2] = w0
[2] | 0x0100;
297 w0
[2] = w0
[2] | 0x010000;
301 w0
[2] = w0
[2] | 0x01000000;
309 w0
[3] = w0
[3] | 0x0100;
313 w0
[3] = w0
[3] | 0x010000;
317 w0
[3] = w0
[3] | 0x01000000;
325 w1
[0] = w1
[0] | 0x0100;
329 w1
[0] = w1
[0] | 0x010000;
333 w1
[0] = w1
[0] | 0x01000000;
341 w1
[1] = w1
[1] | 0x0100;
345 w1
[1] = w1
[1] | 0x010000;
349 w1
[1] = w1
[1] | 0x01000000;
357 w1
[2] = w1
[2] | 0x0100;
361 w1
[2] = w1
[2] | 0x010000;
365 w1
[2] = w1
[2] | 0x01000000;
373 w1
[3] = w1
[3] | 0x0100;
377 w1
[3] = w1
[3] | 0x010000;
381 w1
[3] = w1
[3] | 0x01000000;
386 // before: append_0x01_3
387 static void append_0x01_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
396 w0
[0] = w0
[0] | 0x0100;
400 w0
[0] = w0
[0] | 0x010000;
404 w0
[0] = w0
[0] | 0x01000000;
412 w0
[1] = w0
[1] | 0x0100;
416 w0
[1] = w0
[1] | 0x010000;
420 w0
[1] = w0
[1] | 0x01000000;
428 w0
[2] = w0
[2] | 0x0100;
432 w0
[2] = w0
[2] | 0x010000;
436 w0
[2] = w0
[2] | 0x01000000;
444 w0
[3] = w0
[3] | 0x0100;
448 w0
[3] = w0
[3] | 0x010000;
452 w0
[3] = w0
[3] | 0x01000000;
460 w1
[0] = w1
[0] | 0x0100;
464 w1
[0] = w1
[0] | 0x010000;
468 w1
[0] = w1
[0] | 0x01000000;
476 w1
[1] = w1
[1] | 0x0100;
480 w1
[1] = w1
[1] | 0x010000;
484 w1
[1] = w1
[1] | 0x01000000;
492 w1
[2] = w1
[2] | 0x0100;
496 w1
[2] = w1
[2] | 0x010000;
500 w1
[2] = w1
[2] | 0x01000000;
508 w1
[3] = w1
[3] | 0x0100;
512 w1
[3] = w1
[3] | 0x010000;
516 w1
[3] = w1
[3] | 0x01000000;
524 w2
[0] = w2
[0] | 0x0100;
528 w2
[0] = w2
[0] | 0x010000;
532 w2
[0] = w2
[0] | 0x01000000;
540 w2
[1] = w2
[1] | 0x0100;
544 w2
[1] = w2
[1] | 0x010000;
548 w2
[1] = w2
[1] | 0x01000000;
556 w2
[2] = w2
[2] | 0x0100;
560 w2
[2] = w2
[2] | 0x010000;
564 w2
[2] = w2
[2] | 0x01000000;
572 w2
[3] = w2
[3] | 0x0100;
576 w2
[3] = w2
[3] | 0x010000;
580 w2
[3] = w2
[3] | 0x01000000;
585 // before: append_0x01_4
586 static void append_0x01_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
595 w0
[0] = w0
[0] | 0x0100;
599 w0
[0] = w0
[0] | 0x010000;
603 w0
[0] = w0
[0] | 0x01000000;
611 w0
[1] = w0
[1] | 0x0100;
615 w0
[1] = w0
[1] | 0x010000;
619 w0
[1] = w0
[1] | 0x01000000;
627 w0
[2] = w0
[2] | 0x0100;
631 w0
[2] = w0
[2] | 0x010000;
635 w0
[2] = w0
[2] | 0x01000000;
643 w0
[3] = w0
[3] | 0x0100;
647 w0
[3] = w0
[3] | 0x010000;
651 w0
[3] = w0
[3] | 0x01000000;
659 w1
[0] = w1
[0] | 0x0100;
663 w1
[0] = w1
[0] | 0x010000;
667 w1
[0] = w1
[0] | 0x01000000;
675 w1
[1] = w1
[1] | 0x0100;
679 w1
[1] = w1
[1] | 0x010000;
683 w1
[1] = w1
[1] | 0x01000000;
691 w1
[2] = w1
[2] | 0x0100;
695 w1
[2] = w1
[2] | 0x010000;
699 w1
[2] = w1
[2] | 0x01000000;
707 w1
[3] = w1
[3] | 0x0100;
711 w1
[3] = w1
[3] | 0x010000;
715 w1
[3] = w1
[3] | 0x01000000;
723 w2
[0] = w2
[0] | 0x0100;
727 w2
[0] = w2
[0] | 0x010000;
731 w2
[0] = w2
[0] | 0x01000000;
739 w2
[1] = w2
[1] | 0x0100;
743 w2
[1] = w2
[1] | 0x010000;
747 w2
[1] = w2
[1] | 0x01000000;
755 w2
[2] = w2
[2] | 0x0100;
759 w2
[2] = w2
[2] | 0x010000;
763 w2
[2] = w2
[2] | 0x01000000;
771 w2
[3] = w2
[3] | 0x0100;
775 w2
[3] = w2
[3] | 0x010000;
779 w2
[3] = w2
[3] | 0x01000000;
787 w3
[0] = w3
[0] | 0x0100;
791 w3
[0] = w3
[0] | 0x010000;
795 w3
[0] = w3
[0] | 0x01000000;
803 w3
[1] = w3
[1] | 0x0100;
807 w3
[1] = w3
[1] | 0x010000;
811 w3
[1] = w3
[1] | 0x01000000;
819 w3
[2] = w3
[2] | 0x0100;
823 w3
[2] = w3
[2] | 0x010000;
827 w3
[2] = w3
[2] | 0x01000000;
835 w3
[3] = w3
[3] | 0x0100;
839 w3
[3] = w3
[3] | 0x010000;
843 w3
[3] = w3
[3] | 0x01000000;
848 // before: append_0x01_8
849 static void append_0x01_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
858 w0
[0] = w0
[0] | 0x0100;
862 w0
[0] = w0
[0] | 0x010000;
866 w0
[0] = w0
[0] | 0x01000000;
874 w0
[1] = w0
[1] | 0x0100;
878 w0
[1] = w0
[1] | 0x010000;
882 w0
[1] = w0
[1] | 0x01000000;
890 w0
[2] = w0
[2] | 0x0100;
894 w0
[2] = w0
[2] | 0x010000;
898 w0
[2] = w0
[2] | 0x01000000;
906 w0
[3] = w0
[3] | 0x0100;
910 w0
[3] = w0
[3] | 0x010000;
914 w0
[3] = w0
[3] | 0x01000000;
922 w1
[0] = w1
[0] | 0x0100;
926 w1
[0] = w1
[0] | 0x010000;
930 w1
[0] = w1
[0] | 0x01000000;
938 w1
[1] = w1
[1] | 0x0100;
942 w1
[1] = w1
[1] | 0x010000;
946 w1
[1] = w1
[1] | 0x01000000;
954 w1
[2] = w1
[2] | 0x0100;
958 w1
[2] = w1
[2] | 0x010000;
962 w1
[2] = w1
[2] | 0x01000000;
970 w1
[3] = w1
[3] | 0x0100;
974 w1
[3] = w1
[3] | 0x010000;
978 w1
[3] = w1
[3] | 0x01000000;
986 w2
[0] = w2
[0] | 0x0100;
990 w2
[0] = w2
[0] | 0x010000;
994 w2
[0] = w2
[0] | 0x01000000;
1002 w2
[1] = w2
[1] | 0x0100;
1006 w2
[1] = w2
[1] | 0x010000;
1010 w2
[1] = w2
[1] | 0x01000000;
1018 w2
[2] = w2
[2] | 0x0100;
1022 w2
[2] = w2
[2] | 0x010000;
1026 w2
[2] = w2
[2] | 0x01000000;
1034 w2
[3] = w2
[3] | 0x0100;
1038 w2
[3] = w2
[3] | 0x010000;
1042 w2
[3] = w2
[3] | 0x01000000;
1050 w3
[0] = w3
[0] | 0x0100;
1054 w3
[0] = w3
[0] | 0x010000;
1058 w3
[0] = w3
[0] | 0x01000000;
1066 w3
[1] = w3
[1] | 0x0100;
1070 w3
[1] = w3
[1] | 0x010000;
1074 w3
[1] = w3
[1] | 0x01000000;
1082 w3
[2] = w3
[2] | 0x0100;
1086 w3
[2] = w3
[2] | 0x010000;
1090 w3
[2] = w3
[2] | 0x01000000;
1098 w3
[3] = w3
[3] | 0x0100;
1102 w3
[3] = w3
[3] | 0x010000;
1106 w3
[3] = w3
[3] | 0x01000000;
1114 w4
[0] = w4
[0] | 0x0100;
1118 w4
[0] = w4
[0] | 0x010000;
1122 w4
[0] = w4
[0] | 0x01000000;
1130 w4
[1] = w4
[1] | 0x0100;
1134 w4
[1] = w4
[1] | 0x010000;
1138 w4
[1] = w4
[1] | 0x01000000;
1146 w4
[2] = w4
[2] | 0x0100;
1150 w4
[2] = w4
[2] | 0x010000;
1154 w4
[2] = w4
[2] | 0x01000000;
1162 w4
[3] = w4
[3] | 0x0100;
1166 w4
[3] = w4
[3] | 0x010000;
1170 w4
[3] = w4
[3] | 0x01000000;
1178 w5
[0] = w5
[0] | 0x0100;
1182 w5
[0] = w5
[0] | 0x010000;
1186 w5
[0] = w5
[0] | 0x01000000;
1194 w5
[1] = w5
[1] | 0x0100;
1198 w5
[1] = w5
[1] | 0x010000;
1202 w5
[1] = w5
[1] | 0x01000000;
1210 w5
[2] = w5
[2] | 0x0100;
1214 w5
[2] = w5
[2] | 0x010000;
1218 w5
[2] = w5
[2] | 0x01000000;
1226 w5
[3] = w5
[3] | 0x0100;
1230 w5
[3] = w5
[3] | 0x010000;
1234 w5
[3] = w5
[3] | 0x01000000;
1242 w6
[0] = w6
[0] | 0x0100;
1246 w6
[0] = w6
[0] | 0x010000;
1250 w6
[0] = w6
[0] | 0x01000000;
1258 w6
[1] = w6
[1] | 0x0100;
1262 w6
[1] = w6
[1] | 0x010000;
1266 w6
[1] = w6
[1] | 0x01000000;
1274 w6
[2] = w6
[2] | 0x0100;
1278 w6
[2] = w6
[2] | 0x010000;
1282 w6
[2] = w6
[2] | 0x01000000;
1290 w6
[3] = w6
[3] | 0x0100;
1294 w6
[3] = w6
[3] | 0x010000;
1298 w6
[3] = w6
[3] | 0x01000000;
1306 w7
[0] = w7
[0] | 0x0100;
1310 w7
[0] = w7
[0] | 0x010000;
1314 w7
[0] = w7
[0] | 0x01000000;
1322 w7
[1] = w7
[1] | 0x0100;
1326 w7
[1] = w7
[1] | 0x010000;
1330 w7
[1] = w7
[1] | 0x01000000;
1338 w7
[2] = w7
[2] | 0x0100;
1342 w7
[2] = w7
[2] | 0x010000;
1346 w7
[2] = w7
[2] | 0x01000000;
1354 w7
[3] = w7
[3] | 0x0100;
1358 w7
[3] = w7
[3] | 0x010000;
1362 w7
[3] = w7
[3] | 0x01000000;
1367 // before: append_0x02_1
1368 static void append_0x02_1x4 (u32 w0
[4], const u32 offset
)
1377 w0
[0] = w0
[0] | 0x0200;
1381 w0
[0] = w0
[0] | 0x020000;
1385 w0
[0] = w0
[0] | 0x02000000;
1393 w0
[1] = w0
[1] | 0x0200;
1397 w0
[1] = w0
[1] | 0x020000;
1401 w0
[1] = w0
[1] | 0x02000000;
1409 w0
[2] = w0
[2] | 0x0200;
1413 w0
[2] = w0
[2] | 0x020000;
1417 w0
[2] = w0
[2] | 0x02000000;
1425 w0
[3] = w0
[3] | 0x0200;
1429 w0
[3] = w0
[3] | 0x020000;
1433 w0
[3] = w0
[3] | 0x02000000;
1438 // before: append_0x02_2
1439 static void append_0x02_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
1448 w0
[0] = w0
[0] | 0x0200;
1452 w0
[0] = w0
[0] | 0x020000;
1456 w0
[0] = w0
[0] | 0x02000000;
1464 w0
[1] = w0
[1] | 0x0200;
1468 w0
[1] = w0
[1] | 0x020000;
1472 w0
[1] = w0
[1] | 0x02000000;
1480 w0
[2] = w0
[2] | 0x0200;
1484 w0
[2] = w0
[2] | 0x020000;
1488 w0
[2] = w0
[2] | 0x02000000;
1496 w0
[3] = w0
[3] | 0x0200;
1500 w0
[3] = w0
[3] | 0x020000;
1504 w0
[3] = w0
[3] | 0x02000000;
1512 w1
[0] = w1
[0] | 0x0200;
1516 w1
[0] = w1
[0] | 0x020000;
1520 w1
[0] = w1
[0] | 0x02000000;
1528 w1
[1] = w1
[1] | 0x0200;
1532 w1
[1] = w1
[1] | 0x020000;
1536 w1
[1] = w1
[1] | 0x02000000;
1544 w1
[2] = w1
[2] | 0x0200;
1548 w1
[2] = w1
[2] | 0x020000;
1552 w1
[2] = w1
[2] | 0x02000000;
1560 w1
[3] = w1
[3] | 0x0200;
1564 w1
[3] = w1
[3] | 0x020000;
1568 w1
[3] = w1
[3] | 0x02000000;
1573 // before: append_0x02_3
1574 static void append_0x02_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
1583 w0
[0] = w0
[0] | 0x0200;
1587 w0
[0] = w0
[0] | 0x020000;
1591 w0
[0] = w0
[0] | 0x02000000;
1599 w0
[1] = w0
[1] | 0x0200;
1603 w0
[1] = w0
[1] | 0x020000;
1607 w0
[1] = w0
[1] | 0x02000000;
1615 w0
[2] = w0
[2] | 0x0200;
1619 w0
[2] = w0
[2] | 0x020000;
1623 w0
[2] = w0
[2] | 0x02000000;
1631 w0
[3] = w0
[3] | 0x0200;
1635 w0
[3] = w0
[3] | 0x020000;
1639 w0
[3] = w0
[3] | 0x02000000;
1647 w1
[0] = w1
[0] | 0x0200;
1651 w1
[0] = w1
[0] | 0x020000;
1655 w1
[0] = w1
[0] | 0x02000000;
1663 w1
[1] = w1
[1] | 0x0200;
1667 w1
[1] = w1
[1] | 0x020000;
1671 w1
[1] = w1
[1] | 0x02000000;
1679 w1
[2] = w1
[2] | 0x0200;
1683 w1
[2] = w1
[2] | 0x020000;
1687 w1
[2] = w1
[2] | 0x02000000;
1695 w1
[3] = w1
[3] | 0x0200;
1699 w1
[3] = w1
[3] | 0x020000;
1703 w1
[3] = w1
[3] | 0x02000000;
1711 w2
[0] = w2
[0] | 0x0200;
1715 w2
[0] = w2
[0] | 0x020000;
1719 w2
[0] = w2
[0] | 0x02000000;
1727 w2
[1] = w2
[1] | 0x0200;
1731 w2
[1] = w2
[1] | 0x020000;
1735 w2
[1] = w2
[1] | 0x02000000;
1743 w2
[2] = w2
[2] | 0x0200;
1747 w2
[2] = w2
[2] | 0x020000;
1751 w2
[2] = w2
[2] | 0x02000000;
1759 w2
[3] = w2
[3] | 0x0200;
1763 w2
[3] = w2
[3] | 0x020000;
1767 w2
[3] = w2
[3] | 0x02000000;
1772 // before: append_0x02_4
1773 static void append_0x02_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
1782 w0
[0] = w0
[0] | 0x0200;
1786 w0
[0] = w0
[0] | 0x020000;
1790 w0
[0] = w0
[0] | 0x02000000;
1798 w0
[1] = w0
[1] | 0x0200;
1802 w0
[1] = w0
[1] | 0x020000;
1806 w0
[1] = w0
[1] | 0x02000000;
1814 w0
[2] = w0
[2] | 0x0200;
1818 w0
[2] = w0
[2] | 0x020000;
1822 w0
[2] = w0
[2] | 0x02000000;
1830 w0
[3] = w0
[3] | 0x0200;
1834 w0
[3] = w0
[3] | 0x020000;
1838 w0
[3] = w0
[3] | 0x02000000;
1846 w1
[0] = w1
[0] | 0x0200;
1850 w1
[0] = w1
[0] | 0x020000;
1854 w1
[0] = w1
[0] | 0x02000000;
1862 w1
[1] = w1
[1] | 0x0200;
1866 w1
[1] = w1
[1] | 0x020000;
1870 w1
[1] = w1
[1] | 0x02000000;
1878 w1
[2] = w1
[2] | 0x0200;
1882 w1
[2] = w1
[2] | 0x020000;
1886 w1
[2] = w1
[2] | 0x02000000;
1894 w1
[3] = w1
[3] | 0x0200;
1898 w1
[3] = w1
[3] | 0x020000;
1902 w1
[3] = w1
[3] | 0x02000000;
1910 w2
[0] = w2
[0] | 0x0200;
1914 w2
[0] = w2
[0] | 0x020000;
1918 w2
[0] = w2
[0] | 0x02000000;
1926 w2
[1] = w2
[1] | 0x0200;
1930 w2
[1] = w2
[1] | 0x020000;
1934 w2
[1] = w2
[1] | 0x02000000;
1942 w2
[2] = w2
[2] | 0x0200;
1946 w2
[2] = w2
[2] | 0x020000;
1950 w2
[2] = w2
[2] | 0x02000000;
1958 w2
[3] = w2
[3] | 0x0200;
1962 w2
[3] = w2
[3] | 0x020000;
1966 w2
[3] = w2
[3] | 0x02000000;
1974 w3
[0] = w3
[0] | 0x0200;
1978 w3
[0] = w3
[0] | 0x020000;
1982 w3
[0] = w3
[0] | 0x02000000;
1990 w3
[1] = w3
[1] | 0x0200;
1994 w3
[1] = w3
[1] | 0x020000;
1998 w3
[1] = w3
[1] | 0x02000000;
2006 w3
[2] = w3
[2] | 0x0200;
2010 w3
[2] = w3
[2] | 0x020000;
2014 w3
[2] = w3
[2] | 0x02000000;
2022 w3
[3] = w3
[3] | 0x0200;
2026 w3
[3] = w3
[3] | 0x020000;
2030 w3
[3] = w3
[3] | 0x02000000;
2035 // before: append_0x02_8
2036 static void append_0x02_8 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
2045 w0
[0] = w0
[0] | 0x0200;
2049 w0
[0] = w0
[0] | 0x020000;
2053 w0
[0] = w0
[0] | 0x02000000;
2061 w0
[1] = w0
[1] | 0x0200;
2065 w0
[1] = w0
[1] | 0x020000;
2069 w0
[1] = w0
[1] | 0x02000000;
2077 w0
[2] = w0
[2] | 0x0200;
2081 w0
[2] = w0
[2] | 0x020000;
2085 w0
[2] = w0
[2] | 0x02000000;
2093 w0
[3] = w0
[3] | 0x0200;
2097 w0
[3] = w0
[3] | 0x020000;
2101 w0
[3] = w0
[3] | 0x02000000;
2109 w1
[0] = w1
[0] | 0x0200;
2113 w1
[0] = w1
[0] | 0x020000;
2117 w1
[0] = w1
[0] | 0x02000000;
2125 w1
[1] = w1
[1] | 0x0200;
2129 w1
[1] = w1
[1] | 0x020000;
2133 w1
[1] = w1
[1] | 0x02000000;
2141 w1
[2] = w1
[2] | 0x0200;
2145 w1
[2] = w1
[2] | 0x020000;
2149 w1
[2] = w1
[2] | 0x02000000;
2157 w1
[3] = w1
[3] | 0x0200;
2161 w1
[3] = w1
[3] | 0x020000;
2165 w1
[3] = w1
[3] | 0x02000000;
2173 w2
[0] = w2
[0] | 0x0200;
2177 w2
[0] = w2
[0] | 0x020000;
2181 w2
[0] = w2
[0] | 0x02000000;
2189 w2
[1] = w2
[1] | 0x0200;
2193 w2
[1] = w2
[1] | 0x020000;
2197 w2
[1] = w2
[1] | 0x02000000;
2205 w2
[2] = w2
[2] | 0x0200;
2209 w2
[2] = w2
[2] | 0x020000;
2213 w2
[2] = w2
[2] | 0x02000000;
2221 w2
[3] = w2
[3] | 0x0200;
2225 w2
[3] = w2
[3] | 0x020000;
2229 w2
[3] = w2
[3] | 0x02000000;
2237 w3
[0] = w3
[0] | 0x0200;
2241 w3
[0] = w3
[0] | 0x020000;
2245 w3
[0] = w3
[0] | 0x02000000;
2253 w3
[1] = w3
[1] | 0x0200;
2257 w3
[1] = w3
[1] | 0x020000;
2261 w3
[1] = w3
[1] | 0x02000000;
2269 w3
[2] = w3
[2] | 0x0200;
2273 w3
[2] = w3
[2] | 0x020000;
2277 w3
[2] = w3
[2] | 0x02000000;
2285 w3
[3] = w3
[3] | 0x0200;
2289 w3
[3] = w3
[3] | 0x020000;
2293 w3
[3] = w3
[3] | 0x02000000;
2301 w4
[0] = w4
[0] | 0x0200;
2305 w4
[0] = w4
[0] | 0x020000;
2309 w4
[0] = w4
[0] | 0x02000000;
2317 w4
[1] = w4
[1] | 0x0200;
2321 w4
[1] = w4
[1] | 0x020000;
2325 w4
[1] = w4
[1] | 0x02000000;
2333 w4
[2] = w4
[2] | 0x0200;
2337 w4
[2] = w4
[2] | 0x020000;
2341 w4
[2] = w4
[2] | 0x02000000;
2349 w4
[3] = w4
[3] | 0x0200;
2353 w4
[3] = w4
[3] | 0x020000;
2357 w4
[3] = w4
[3] | 0x02000000;
2365 w5
[0] = w5
[0] | 0x0200;
2369 w5
[0] = w5
[0] | 0x020000;
2373 w5
[0] = w5
[0] | 0x02000000;
2381 w5
[1] = w5
[1] | 0x0200;
2385 w5
[1] = w5
[1] | 0x020000;
2389 w5
[1] = w5
[1] | 0x02000000;
2397 w5
[2] = w5
[2] | 0x0200;
2401 w5
[2] = w5
[2] | 0x020000;
2405 w5
[2] = w5
[2] | 0x02000000;
2413 w5
[3] = w5
[3] | 0x0200;
2417 w5
[3] = w5
[3] | 0x020000;
2421 w5
[3] = w5
[3] | 0x02000000;
2429 w6
[0] = w6
[0] | 0x0200;
2433 w6
[0] = w6
[0] | 0x020000;
2437 w6
[0] = w6
[0] | 0x02000000;
2445 w6
[1] = w6
[1] | 0x0200;
2449 w6
[1] = w6
[1] | 0x020000;
2453 w6
[1] = w6
[1] | 0x02000000;
2461 w6
[2] = w6
[2] | 0x0200;
2465 w6
[2] = w6
[2] | 0x020000;
2469 w6
[2] = w6
[2] | 0x02000000;
2477 w6
[3] = w6
[3] | 0x0200;
2481 w6
[3] = w6
[3] | 0x020000;
2485 w6
[3] = w6
[3] | 0x02000000;
2493 w7
[0] = w7
[0] | 0x0200;
2497 w7
[0] = w7
[0] | 0x020000;
2501 w7
[0] = w7
[0] | 0x02000000;
2509 w7
[1] = w7
[1] | 0x0200;
2513 w7
[1] = w7
[1] | 0x020000;
2517 w7
[1] = w7
[1] | 0x02000000;
2525 w7
[2] = w7
[2] | 0x0200;
2529 w7
[2] = w7
[2] | 0x020000;
2533 w7
[2] = w7
[2] | 0x02000000;
2541 w7
[3] = w7
[3] | 0x0200;
2545 w7
[3] = w7
[3] | 0x020000;
2549 w7
[3] = w7
[3] | 0x02000000;
2554 // before: append_0x80_1
2555 static void append_0x80_1x4 (u32 w0
[4], const u32 offset
)
2564 w0
[0] = w0
[0] | 0x8000;
2568 w0
[0] = w0
[0] | 0x800000;
2572 w0
[0] = w0
[0] | 0x80000000;
2580 w0
[1] = w0
[1] | 0x8000;
2584 w0
[1] = w0
[1] | 0x800000;
2588 w0
[1] = w0
[1] | 0x80000000;
2596 w0
[2] = w0
[2] | 0x8000;
2600 w0
[2] = w0
[2] | 0x800000;
2604 w0
[2] = w0
[2] | 0x80000000;
2612 w0
[3] = w0
[3] | 0x8000;
2616 w0
[3] = w0
[3] | 0x800000;
2620 w0
[3] = w0
[3] | 0x80000000;
2625 // before: append_0x80_2
2626 static void append_0x80_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
2635 w0
[0] = w0
[0] | 0x8000;
2639 w0
[0] = w0
[0] | 0x800000;
2643 w0
[0] = w0
[0] | 0x80000000;
2651 w0
[1] = w0
[1] | 0x8000;
2655 w0
[1] = w0
[1] | 0x800000;
2659 w0
[1] = w0
[1] | 0x80000000;
2667 w0
[2] = w0
[2] | 0x8000;
2671 w0
[2] = w0
[2] | 0x800000;
2675 w0
[2] = w0
[2] | 0x80000000;
2683 w0
[3] = w0
[3] | 0x8000;
2687 w0
[3] = w0
[3] | 0x800000;
2691 w0
[3] = w0
[3] | 0x80000000;
2699 w1
[0] = w1
[0] | 0x8000;
2703 w1
[0] = w1
[0] | 0x800000;
2707 w1
[0] = w1
[0] | 0x80000000;
2715 w1
[1] = w1
[1] | 0x8000;
2719 w1
[1] = w1
[1] | 0x800000;
2723 w1
[1] = w1
[1] | 0x80000000;
2731 w1
[2] = w1
[2] | 0x8000;
2735 w1
[2] = w1
[2] | 0x800000;
2739 w1
[2] = w1
[2] | 0x80000000;
2747 w1
[3] = w1
[3] | 0x8000;
2751 w1
[3] = w1
[3] | 0x800000;
2755 w1
[3] = w1
[3] | 0x80000000;
2760 // before: append_0x80_2_be
2761 static void append_0x80_2x4_be (u32 w0
[4], u32 w1
[4], const u32 offset
)
2766 w0
[0] |= 0x80000000;
2782 w0
[1] |= 0x80000000;
2798 w0
[2] |= 0x80000000;
2814 w0
[3] |= 0x80000000;
2830 w1
[0] |= 0x80000000;
2846 w1
[1] |= 0x80000000;
2862 w1
[2] |= 0x80000000;
2878 w1
[3] |= 0x80000000;
2895 // before: append_0x80_3
2896 static void append_0x80_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
2905 w0
[0] = w0
[0] | 0x8000;
2909 w0
[0] = w0
[0] | 0x800000;
2913 w0
[0] = w0
[0] | 0x80000000;
2921 w0
[1] = w0
[1] | 0x8000;
2925 w0
[1] = w0
[1] | 0x800000;
2929 w0
[1] = w0
[1] | 0x80000000;
2937 w0
[2] = w0
[2] | 0x8000;
2941 w0
[2] = w0
[2] | 0x800000;
2945 w0
[2] = w0
[2] | 0x80000000;
2953 w0
[3] = w0
[3] | 0x8000;
2957 w0
[3] = w0
[3] | 0x800000;
2961 w0
[3] = w0
[3] | 0x80000000;
2969 w1
[0] = w1
[0] | 0x8000;
2973 w1
[0] = w1
[0] | 0x800000;
2977 w1
[0] = w1
[0] | 0x80000000;
2985 w1
[1] = w1
[1] | 0x8000;
2989 w1
[1] = w1
[1] | 0x800000;
2993 w1
[1] = w1
[1] | 0x80000000;
3001 w1
[2] = w1
[2] | 0x8000;
3005 w1
[2] = w1
[2] | 0x800000;
3009 w1
[2] = w1
[2] | 0x80000000;
3017 w1
[3] = w1
[3] | 0x8000;
3021 w1
[3] = w1
[3] | 0x800000;
3025 w1
[3] = w1
[3] | 0x80000000;
3033 w2
[0] = w2
[0] | 0x8000;
3037 w2
[0] = w2
[0] | 0x800000;
3041 w2
[0] = w2
[0] | 0x80000000;
3049 w2
[1] = w2
[1] | 0x8000;
3053 w2
[1] = w2
[1] | 0x800000;
3057 w2
[1] = w2
[1] | 0x80000000;
3065 w2
[2] = w2
[2] | 0x8000;
3069 w2
[2] = w2
[2] | 0x800000;
3073 w2
[2] = w2
[2] | 0x80000000;
3081 w2
[3] = w2
[3] | 0x8000;
3085 w2
[3] = w2
[3] | 0x800000;
3089 w2
[3] = w2
[3] | 0x80000000;
3094 // before: append_0x80_4
3095 static void append_0x80_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
3104 w0
[0] = w0
[0] | 0x8000;
3108 w0
[0] = w0
[0] | 0x800000;
3112 w0
[0] = w0
[0] | 0x80000000;
3120 w0
[1] = w0
[1] | 0x8000;
3124 w0
[1] = w0
[1] | 0x800000;
3128 w0
[1] = w0
[1] | 0x80000000;
3136 w0
[2] = w0
[2] | 0x8000;
3140 w0
[2] = w0
[2] | 0x800000;
3144 w0
[2] = w0
[2] | 0x80000000;
3152 w0
[3] = w0
[3] | 0x8000;
3156 w0
[3] = w0
[3] | 0x800000;
3160 w0
[3] = w0
[3] | 0x80000000;
3168 w1
[0] = w1
[0] | 0x8000;
3172 w1
[0] = w1
[0] | 0x800000;
3176 w1
[0] = w1
[0] | 0x80000000;
3184 w1
[1] = w1
[1] | 0x8000;
3188 w1
[1] = w1
[1] | 0x800000;
3192 w1
[1] = w1
[1] | 0x80000000;
3200 w1
[2] = w1
[2] | 0x8000;
3204 w1
[2] = w1
[2] | 0x800000;
3208 w1
[2] = w1
[2] | 0x80000000;
3216 w1
[3] = w1
[3] | 0x8000;
3220 w1
[3] = w1
[3] | 0x800000;
3224 w1
[3] = w1
[3] | 0x80000000;
3232 w2
[0] = w2
[0] | 0x8000;
3236 w2
[0] = w2
[0] | 0x800000;
3240 w2
[0] = w2
[0] | 0x80000000;
3248 w2
[1] = w2
[1] | 0x8000;
3252 w2
[1] = w2
[1] | 0x800000;
3256 w2
[1] = w2
[1] | 0x80000000;
3264 w2
[2] = w2
[2] | 0x8000;
3268 w2
[2] = w2
[2] | 0x800000;
3272 w2
[2] = w2
[2] | 0x80000000;
3280 w2
[3] = w2
[3] | 0x8000;
3284 w2
[3] = w2
[3] | 0x800000;
3288 w2
[3] = w2
[3] | 0x80000000;
3296 w3
[0] = w3
[0] | 0x8000;
3300 w3
[0] = w3
[0] | 0x800000;
3304 w3
[0] = w3
[0] | 0x80000000;
3312 w3
[1] = w3
[1] | 0x8000;
3316 w3
[1] = w3
[1] | 0x800000;
3320 w3
[1] = w3
[1] | 0x80000000;
3328 w3
[2] = w3
[2] | 0x8000;
3332 w3
[2] = w3
[2] | 0x800000;
3336 w3
[2] = w3
[2] | 0x80000000;
3344 w3
[3] = w3
[3] | 0x8000;
3348 w3
[3] = w3
[3] | 0x800000;
3352 w3
[3] = w3
[3] | 0x80000000;
3357 // before: append_0x80_8
3358 static void append_0x80_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
3367 w0
[0] = w0
[0] | 0x8000;
3371 w0
[0] = w0
[0] | 0x800000;
3375 w0
[0] = w0
[0] | 0x80000000;
3383 w0
[1] = w0
[1] | 0x8000;
3387 w0
[1] = w0
[1] | 0x800000;
3391 w0
[1] = w0
[1] | 0x80000000;
3399 w0
[2] = w0
[2] | 0x8000;
3403 w0
[2] = w0
[2] | 0x800000;
3407 w0
[2] = w0
[2] | 0x80000000;
3415 w0
[3] = w0
[3] | 0x8000;
3419 w0
[3] = w0
[3] | 0x800000;
3423 w0
[3] = w0
[3] | 0x80000000;
3431 w1
[0] = w1
[0] | 0x8000;
3435 w1
[0] = w1
[0] | 0x800000;
3439 w1
[0] = w1
[0] | 0x80000000;
3447 w1
[1] = w1
[1] | 0x8000;
3451 w1
[1] = w1
[1] | 0x800000;
3455 w1
[1] = w1
[1] | 0x80000000;
3463 w1
[2] = w1
[2] | 0x8000;
3467 w1
[2] = w1
[2] | 0x800000;
3471 w1
[2] = w1
[2] | 0x80000000;
3479 w1
[3] = w1
[3] | 0x8000;
3483 w1
[3] = w1
[3] | 0x800000;
3487 w1
[3] = w1
[3] | 0x80000000;
3495 w2
[0] = w2
[0] | 0x8000;
3499 w2
[0] = w2
[0] | 0x800000;
3503 w2
[0] = w2
[0] | 0x80000000;
3511 w2
[1] = w2
[1] | 0x8000;
3515 w2
[1] = w2
[1] | 0x800000;
3519 w2
[1] = w2
[1] | 0x80000000;
3527 w2
[2] = w2
[2] | 0x8000;
3531 w2
[2] = w2
[2] | 0x800000;
3535 w2
[2] = w2
[2] | 0x80000000;
3543 w2
[3] = w2
[3] | 0x8000;
3547 w2
[3] = w2
[3] | 0x800000;
3551 w2
[3] = w2
[3] | 0x80000000;
3559 w3
[0] = w3
[0] | 0x8000;
3563 w3
[0] = w3
[0] | 0x800000;
3567 w3
[0] = w3
[0] | 0x80000000;
3575 w3
[1] = w3
[1] | 0x8000;
3579 w3
[1] = w3
[1] | 0x800000;
3583 w3
[1] = w3
[1] | 0x80000000;
3591 w3
[2] = w3
[2] | 0x8000;
3595 w3
[2] = w3
[2] | 0x800000;
3599 w3
[2] = w3
[2] | 0x80000000;
3607 w3
[3] = w3
[3] | 0x8000;
3611 w3
[3] = w3
[3] | 0x800000;
3615 w3
[3] = w3
[3] | 0x80000000;
3623 w4
[0] = w4
[0] | 0x8000;
3627 w4
[0] = w4
[0] | 0x800000;
3631 w4
[0] = w4
[0] | 0x80000000;
3639 w4
[1] = w4
[1] | 0x8000;
3643 w4
[1] = w4
[1] | 0x800000;
3647 w4
[1] = w4
[1] | 0x80000000;
3655 w4
[2] = w4
[2] | 0x8000;
3659 w4
[2] = w4
[2] | 0x800000;
3663 w4
[2] = w4
[2] | 0x80000000;
3671 w4
[3] = w4
[3] | 0x8000;
3675 w4
[3] = w4
[3] | 0x800000;
3679 w4
[3] = w4
[3] | 0x80000000;
3687 w5
[0] = w5
[0] | 0x8000;
3691 w5
[0] = w5
[0] | 0x800000;
3695 w5
[0] = w5
[0] | 0x80000000;
3703 w5
[1] = w5
[1] | 0x8000;
3707 w5
[1] = w5
[1] | 0x800000;
3711 w5
[1] = w5
[1] | 0x80000000;
3719 w5
[2] = w5
[2] | 0x8000;
3723 w5
[2] = w5
[2] | 0x800000;
3727 w5
[2] = w5
[2] | 0x80000000;
3735 w5
[3] = w5
[3] | 0x8000;
3739 w5
[3] = w5
[3] | 0x800000;
3743 w5
[3] = w5
[3] | 0x80000000;
3751 w6
[0] = w6
[0] | 0x8000;
3755 w6
[0] = w6
[0] | 0x800000;
3759 w6
[0] = w6
[0] | 0x80000000;
3767 w6
[1] = w6
[1] | 0x8000;
3771 w6
[1] = w6
[1] | 0x800000;
3775 w6
[1] = w6
[1] | 0x80000000;
3783 w6
[2] = w6
[2] | 0x8000;
3787 w6
[2] = w6
[2] | 0x800000;
3791 w6
[2] = w6
[2] | 0x80000000;
3799 w6
[3] = w6
[3] | 0x8000;
3803 w6
[3] = w6
[3] | 0x800000;
3807 w6
[3] = w6
[3] | 0x80000000;
3815 w7
[0] = w7
[0] | 0x8000;
3819 w7
[0] = w7
[0] | 0x800000;
3823 w7
[0] = w7
[0] | 0x80000000;
3831 w7
[1] = w7
[1] | 0x8000;
3835 w7
[1] = w7
[1] | 0x800000;
3839 w7
[1] = w7
[1] | 0x80000000;
3847 w7
[2] = w7
[2] | 0x8000;
3851 w7
[2] = w7
[2] | 0x800000;
3855 w7
[2] = w7
[2] | 0x80000000;
3863 w7
[3] = w7
[3] | 0x8000;
3867 w7
[3] = w7
[3] | 0x800000;
3871 w7
[3] = w7
[3] | 0x80000000;
3876 // before: append_0x80_4
3877 static void append_0x80_1x16 (u32 w
[16], const u32 offset
)
3886 w
[ 0] = w
[ 0] | 0x8000;
3890 w
[ 0] = w
[ 0] | 0x800000;
3894 w
[ 0] = w
[ 0] | 0x80000000;
3902 w
[ 1] = w
[ 1] | 0x8000;
3906 w
[ 1] = w
[ 1] | 0x800000;
3910 w
[ 1] = w
[ 1] | 0x80000000;
3918 w
[ 2] = w
[ 2] | 0x8000;
3922 w
[ 2] = w
[ 2] | 0x800000;
3926 w
[ 2] = w
[ 2] | 0x80000000;
3934 w
[ 3] = w
[ 3] | 0x8000;
3938 w
[ 3] = w
[ 3] | 0x800000;
3942 w
[ 3] = w
[ 3] | 0x80000000;
3950 w
[ 4] = w
[ 4] | 0x8000;
3954 w
[ 4] = w
[ 4] | 0x800000;
3958 w
[ 4] = w
[ 4] | 0x80000000;
3966 w
[ 5] = w
[ 5] | 0x8000;
3970 w
[ 5] = w
[ 5] | 0x800000;
3974 w
[ 5] = w
[ 5] | 0x80000000;
3982 w
[ 6] = w
[ 6] | 0x8000;
3986 w
[ 6] = w
[ 6] | 0x800000;
3990 w
[ 6] = w
[ 6] | 0x80000000;
3998 w
[ 7] = w
[ 7] | 0x8000;
4002 w
[ 7] = w
[ 7] | 0x800000;
4006 w
[ 7] = w
[ 7] | 0x80000000;
4014 w
[ 8] = w
[ 8] | 0x8000;
4018 w
[ 8] = w
[ 8] | 0x800000;
4022 w
[ 8] = w
[ 8] | 0x80000000;
4030 w
[ 9] = w
[ 9] | 0x8000;
4034 w
[ 9] = w
[ 9] | 0x800000;
4038 w
[ 9] = w
[ 9] | 0x80000000;
4046 w
[10] = w
[10] | 0x8000;
4050 w
[10] = w
[10] | 0x800000;
4054 w
[10] = w
[10] | 0x80000000;
4062 w
[11] = w
[11] | 0x8000;
4066 w
[11] = w
[11] | 0x800000;
4070 w
[11] = w
[11] | 0x80000000;
4078 w
[12] = w
[12] | 0x8000;
4082 w
[12] = w
[12] | 0x800000;
4086 w
[12] = w
[12] | 0x80000000;
4094 w
[13] = w
[13] | 0x8000;
4098 w
[13] = w
[13] | 0x800000;
4102 w
[13] = w
[13] | 0x80000000;
4110 w
[14] = w
[14] | 0x8000;
4114 w
[14] = w
[14] | 0x800000;
4118 w
[14] = w
[14] | 0x80000000;
4126 w
[15] = w
[15] | 0x8000;
4130 w
[15] = w
[15] | 0x800000;
4134 w
[15] = w
[15] | 0x80000000;
4139 // before: append_0x80_8
4140 static void append_0x80_1x32 (u32 w
[32], const u32 offset
)
4149 w
[ 0] = w
[ 0] | 0x8000;
4153 w
[ 0] = w
[ 0] | 0x800000;
4157 w
[ 0] = w
[ 0] | 0x80000000;
4165 w
[ 1] = w
[ 1] | 0x8000;
4169 w
[ 1] = w
[ 1] | 0x800000;
4173 w
[ 1] = w
[ 1] | 0x80000000;
4181 w
[ 2] = w
[ 2] | 0x8000;
4185 w
[ 2] = w
[ 2] | 0x800000;
4189 w
[ 2] = w
[ 2] | 0x80000000;
4197 w
[ 3] = w
[ 3] | 0x8000;
4201 w
[ 3] = w
[ 3] | 0x800000;
4205 w
[ 3] = w
[ 3] | 0x80000000;
4213 w
[ 4] = w
[ 4] | 0x8000;
4217 w
[ 4] = w
[ 4] | 0x800000;
4221 w
[ 4] = w
[ 4] | 0x80000000;
4229 w
[ 5] = w
[ 5] | 0x8000;
4233 w
[ 5] = w
[ 5] | 0x800000;
4237 w
[ 5] = w
[ 5] | 0x80000000;
4245 w
[ 6] = w
[ 6] | 0x8000;
4249 w
[ 6] = w
[ 6] | 0x800000;
4253 w
[ 6] = w
[ 6] | 0x80000000;
4261 w
[ 7] = w
[ 7] | 0x8000;
4265 w
[ 7] = w
[ 7] | 0x800000;
4269 w
[ 7] = w
[ 7] | 0x80000000;
4277 w
[ 8] = w
[ 8] | 0x8000;
4281 w
[ 8] = w
[ 8] | 0x800000;
4285 w
[ 8] = w
[ 8] | 0x80000000;
4293 w
[ 9] = w
[ 9] | 0x8000;
4297 w
[ 9] = w
[ 9] | 0x800000;
4301 w
[ 9] = w
[ 9] | 0x80000000;
4309 w
[10] = w
[10] | 0x8000;
4313 w
[10] = w
[10] | 0x800000;
4317 w
[10] = w
[10] | 0x80000000;
4325 w
[11] = w
[11] | 0x8000;
4329 w
[11] = w
[11] | 0x800000;
4333 w
[11] = w
[11] | 0x80000000;
4341 w
[12] = w
[12] | 0x8000;
4345 w
[12] = w
[12] | 0x800000;
4349 w
[12] = w
[12] | 0x80000000;
4357 w
[13] = w
[13] | 0x8000;
4361 w
[13] = w
[13] | 0x800000;
4365 w
[13] = w
[13] | 0x80000000;
4373 w
[14] = w
[14] | 0x8000;
4377 w
[14] = w
[14] | 0x800000;
4381 w
[14] = w
[14] | 0x80000000;
4389 w
[15] = w
[15] | 0x8000;
4393 w
[15] = w
[15] | 0x800000;
4397 w
[15] = w
[15] | 0x80000000;
4405 w
[16] = w
[16] | 0x8000;
4409 w
[16] = w
[16] | 0x800000;
4413 w
[16] = w
[16] | 0x80000000;
4421 w
[17] = w
[17] | 0x8000;
4425 w
[17] = w
[17] | 0x800000;
4429 w
[17] = w
[17] | 0x80000000;
4437 w
[18] = w
[18] | 0x8000;
4441 w
[18] = w
[18] | 0x800000;
4445 w
[18] = w
[18] | 0x80000000;
4453 w
[19] = w
[19] | 0x8000;
4457 w
[19] = w
[19] | 0x800000;
4461 w
[19] = w
[19] | 0x80000000;
4469 w
[20] = w
[20] | 0x8000;
4473 w
[20] = w
[20] | 0x800000;
4477 w
[20] = w
[20] | 0x80000000;
4485 w
[21] = w
[21] | 0x8000;
4489 w
[21] = w
[21] | 0x800000;
4493 w
[21] = w
[21] | 0x80000000;
4501 w
[22] = w
[22] | 0x8000;
4505 w
[22] = w
[22] | 0x800000;
4509 w
[22] = w
[22] | 0x80000000;
4517 w
[23] = w
[23] | 0x8000;
4521 w
[23] = w
[23] | 0x800000;
4525 w
[23] = w
[23] | 0x80000000;
4533 w
[24] = w
[24] | 0x8000;
4537 w
[24] = w
[24] | 0x800000;
4541 w
[24] = w
[24] | 0x80000000;
4549 w
[25] = w
[25] | 0x8000;
4553 w
[25] = w
[25] | 0x800000;
4557 w
[25] = w
[25] | 0x80000000;
4565 w
[26] = w
[26] | 0x8000;
4569 w
[26] = w
[26] | 0x800000;
4573 w
[26] = w
[26] | 0x80000000;
4581 w
[27] = w
[27] | 0x8000;
4585 w
[27] = w
[27] | 0x800000;
4589 w
[27] = w
[27] | 0x80000000;
4597 w
[28] = w
[28] | 0x8000;
4601 w
[28] = w
[28] | 0x800000;
4605 w
[28] = w
[28] | 0x80000000;
4613 w
[29] = w
[29] | 0x8000;
4617 w
[29] = w
[29] | 0x800000;
4621 w
[29] = w
[29] | 0x80000000;
4629 w
[30] = w
[30] | 0x8000;
4633 w
[30] = w
[30] | 0x800000;
4637 w
[30] = w
[30] | 0x80000000;
4645 w
[31] = w
[31] | 0x8000;
4649 w
[31] = w
[31] | 0x800000;
4653 w
[31] = w
[31] | 0x80000000;
// before: device_memcat2L
/**
 * Splice the 2-word right operand (src_r0) into a 2-word buffer at byte
 * position `offset` (1..7) and store the result in dst0.
 *
 * Bytes are packed little-endian inside each u32: buffer byte i occupies
 * bits 8*(i%4) .. 8*(i%4)+7 of word i/4.
 *
 * - Only words from index offset/4 upward are written; lower words of dst0
 *   are left as they were.
 * - For a non-aligned offset the boundary word is src_l0[k] OR-ed with the
 *   shifted right operand; src_l0 is not masked, so its bytes at and above
 *   `offset` are presumably expected to be zero (TODO confirm with callers).
 * - For the word-aligned offset 4 the boundary word is taken from src_r0[0]
 *   alone.
 * - Right-operand bytes that do not fit into dst0 are dropped.
 * - Any other offset (including 0) leaves dst0 untouched: there is no
 *   default case.
 */
static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2])
{
  switch (offset)
  {
    case 1:
      dst0[0] = src_l0[0]       | src_r0[0] <<  8;
      dst0[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      break;

    case 2:
      dst0[0] = src_l0[0]       | src_r0[0] << 16;
      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      break;

    case 3:
      dst0[0] = src_l0[0]       | src_r0[0] << 24;
      dst0[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      break;

    case 4:
      dst0[1] = src_r0[0];
      break;

    case 5:
      dst0[1] = src_l0[1]       | src_r0[0] <<  8;
      break;

    case 6:
      dst0[1] = src_l0[1]       | src_r0[0] << 16;
      break;

    case 7:
      dst0[1] = src_l0[1]       | src_r0[0] << 24;
      break;
  }
}
// before: device_memcat4L
/**
 * Splice the 4-word right operand (src_r0) into a 4-word buffer at byte
 * position `offset` (1..15) and store the result in dst0.
 *
 * Bytes are packed little-endian inside each u32: buffer byte i occupies
 * bits 8*(i%4) .. 8*(i%4)+7 of word i/4.
 *
 * - Only words from index offset/4 upward are written; lower words of dst0
 *   are left as they were.
 * - For a non-aligned offset the boundary word is src_l0[k] OR-ed with the
 *   shifted right operand; src_l0 is not masked, so its bytes at and above
 *   `offset` are presumably expected to be zero (TODO confirm with callers).
 * - For word-aligned offsets (4, 8, 12) the boundary word is taken from
 *   src_r0[0] alone.
 * - Right-operand bytes that do not fit into dst0 are dropped.
 * - Any other offset (including 0) leaves dst0 untouched: there is no
 *   default case.
 */
static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4])
{
  switch (offset)
  {
    case 1:
      dst0[0] = src_l0[0]       | src_r0[0] <<  8;
      dst0[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst0[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      break;

    case 2:
      dst0[0] = src_l0[0]       | src_r0[0] << 16;
      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      break;

    case 3:
      dst0[0] = src_l0[0]       | src_r0[0] << 24;
      dst0[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst0[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      break;

    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      break;

    case 5:
      dst0[1] = src_l0[1]       | src_r0[0] <<  8;
      dst0[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      break;

    case 6:
      dst0[1] = src_l0[1]       | src_r0[0] << 16;
      dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      break;

    case 7:
      dst0[1] = src_l0[1]       | src_r0[0] << 24;
      dst0[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      break;

    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      break;

    case 9:
      dst0[2] = src_l0[2]       | src_r0[0] <<  8;
      dst0[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      break;

    case 10:
      dst0[2] = src_l0[2]       | src_r0[0] << 16;
      dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      break;

    case 11:
      dst0[2] = src_l0[2]       | src_r0[0] << 24;
      dst0[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      break;

    case 12:
      dst0[3] = src_r0[0];
      break;

    case 13:
      dst0[3] = src_l0[3]       | src_r0[0] <<  8;
      break;

    case 14:
      dst0[3] = src_l0[3]       | src_r0[0] << 16;
      break;

    case 15:
      dst0[3] = src_l0[3]       | src_r0[0] << 24;
      break;
  }
}
// before: device_memcat8L
/**
 * Splice the 4-word right operand (src_r0) into an 8-word buffer at byte
 * position `offset` (1..31). The buffer is passed as two 4-word halves:
 * dst0/src_l0 hold words 0..3, dst1/src_l1 hold words 4..7.
 *
 * Bytes are packed little-endian inside each u32: buffer byte i occupies
 * bits 8*(i%4) .. 8*(i%4)+7 of word i/4.
 *
 * - Only words from index offset/4 up to the end of the shifted right
 *   operand are written; every other destination word is left as it was.
 * - For a non-aligned offset the boundary word is src_l[k] OR-ed with the
 *   shifted right operand; src_l is not masked, so its bytes at and above
 *   `offset` are presumably expected to be zero (TODO confirm with callers).
 * - For word-aligned offsets the boundary word is taken from src_r0[0] alone.
 * - Right-operand bytes that do not fit into the 8 words are dropped.
 * - Any other offset (including 0) leaves the destination untouched: there
 *   is no default case.
 */
static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4])
{
  switch (offset)
  {
    case 1:
      dst0[0] = src_l0[0]       | src_r0[0] <<  8;
      dst0[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst0[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[0] = src_r0[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0]       | src_r0[0] << 16;
      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[0] = src_r0[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0]       | src_r0[0] << 24;
      dst0[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst0[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[0] = src_r0[3] >>  8;
      break;

    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      break;

    case 5:
      dst0[1] = src_l0[1]       | src_r0[0] <<  8;
      dst0[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[0] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[1] = src_r0[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1]       | src_r0[0] << 16;
      dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[1] = src_r0[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1]       | src_r0[0] << 24;
      dst0[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[0] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[1] = src_r0[3] >>  8;
      break;

    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      break;

    case 9:
      dst0[2] = src_l0[2]       | src_r0[0] <<  8;
      dst0[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[0] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[1] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[2] = src_r0[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2]       | src_r0[0] << 16;
      dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[2] = src_r0[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2]       | src_r0[0] << 24;
      dst0[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[0] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[1] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[2] = src_r0[3] >>  8;
      break;

    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      break;

    case 13:
      dst0[3] = src_l0[3]       | src_r0[0] <<  8;
      dst1[0] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[1] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[2] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[3] = src_r0[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3]       | src_r0[0] << 16;
      dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[3] = src_r0[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3]       | src_r0[0] << 24;
      dst1[0] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[1] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[2] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[3] = src_r0[3] >>  8;
      break;

    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      break;

    case 17:
      dst1[0] = src_l1[0]       | src_r0[0] <<  8;
      dst1[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      break;

    case 18:
      dst1[0] = src_l1[0]       | src_r0[0] << 16;
      dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      break;

    case 19:
      dst1[0] = src_l1[0]       | src_r0[0] << 24;
      dst1[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      break;

    case 20:
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      break;

    case 21:
      dst1[1] = src_l1[1]       | src_r0[0] <<  8;
      dst1[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      break;

    case 22:
      dst1[1] = src_l1[1]       | src_r0[0] << 16;
      dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      break;

    case 23:
      dst1[1] = src_l1[1]       | src_r0[0] << 24;
      dst1[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      break;

    case 24:
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      break;

    case 25:
      dst1[2] = src_l1[2]       | src_r0[0] <<  8;
      dst1[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      break;

    case 26:
      dst1[2] = src_l1[2]       | src_r0[0] << 16;
      dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      break;

    case 27:
      dst1[2] = src_l1[2]       | src_r0[0] << 24;
      dst1[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      break;

    case 28:
      dst1[3] = src_r0[0];
      break;

    case 29:
      dst1[3] = src_l1[3]       | src_r0[0] <<  8;
      break;

    case 30:
      dst1[3] = src_l1[3]       | src_r0[0] << 16;
      break;

    case 31:
      dst1[3] = src_l1[3]       | src_r0[0] << 24;
      break;
  }
}
// before: device_memcat12L
/**
 * Splice the 4-word right operand (src_r0) into a 12-word buffer at byte
 * position `offset` (1..47). The buffer is passed as three 4-word parts:
 * dst0/src_l0 hold words 0..3, dst1/src_l1 words 4..7, dst2/src_l2 words
 * 8..11.
 *
 * Bytes are packed little-endian inside each u32: buffer byte i occupies
 * bits 8*(i%4) .. 8*(i%4)+7 of word i/4.
 *
 * - Only words from index offset/4 up to the end of the shifted right
 *   operand are written; every other destination word is left as it was.
 * - For a non-aligned offset the boundary word is src_l[k] OR-ed with the
 *   shifted right operand; src_l is not masked, so its bytes at and above
 *   `offset` are presumably expected to be zero (TODO confirm with callers).
 * - For word-aligned offsets the boundary word is taken from src_r0[0] alone.
 * - Right-operand bytes that do not fit into the 12 words are dropped.
 * - Any other offset (including 0) leaves the destination untouched: there
 *   is no default case.
 */
static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4])
{
  switch (offset)
  {
    case 1:
      dst0[0] = src_l0[0]       | src_r0[0] <<  8;
      dst0[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst0[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[0] = src_r0[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0]       | src_r0[0] << 16;
      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[0] = src_r0[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0]       | src_r0[0] << 24;
      dst0[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst0[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[0] = src_r0[3] >>  8;
      break;

    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      break;

    case 5:
      dst0[1] = src_l0[1]       | src_r0[0] <<  8;
      dst0[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[0] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[1] = src_r0[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1]       | src_r0[0] << 16;
      dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[1] = src_r0[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1]       | src_r0[0] << 24;
      dst0[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[0] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[1] = src_r0[3] >>  8;
      break;

    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      break;

    case 9:
      dst0[2] = src_l0[2]       | src_r0[0] <<  8;
      dst0[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[0] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[1] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[2] = src_r0[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2]       | src_r0[0] << 16;
      dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[2] = src_r0[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2]       | src_r0[0] << 24;
      dst0[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[0] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[1] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[2] = src_r0[3] >>  8;
      break;

    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      break;

    case 13:
      dst0[3] = src_l0[3]       | src_r0[0] <<  8;
      dst1[0] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[1] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[2] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[3] = src_r0[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3]       | src_r0[0] << 16;
      dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[3] = src_r0[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3]       | src_r0[0] << 24;
      dst1[0] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[1] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[2] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[3] = src_r0[3] >>  8;
      break;

    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      break;

    case 17:
      dst1[0] = src_l1[0]       | src_r0[0] <<  8;
      dst1[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[0] = src_r0[3] >> 24;
      break;

    case 18:
      dst1[0] = src_l1[0]       | src_r0[0] << 16;
      dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[0] = src_r0[3] >> 16;
      break;

    case 19:
      dst1[0] = src_l1[0]       | src_r0[0] << 24;
      dst1[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[0] = src_r0[3] >>  8;
      break;

    case 20:
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      dst2[0] = src_r0[3];
      break;

    case 21:
      dst1[1] = src_l1[1]       | src_r0[0] <<  8;
      dst1[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[0] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[1] = src_r0[3] >> 24;
      break;

    case 22:
      dst1[1] = src_l1[1]       | src_r0[0] << 16;
      dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[1] = src_r0[3] >> 16;
      break;

    case 23:
      dst1[1] = src_l1[1]       | src_r0[0] << 24;
      dst1[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[0] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[1] = src_r0[3] >>  8;
      break;

    case 24:
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      dst2[0] = src_r0[2];
      dst2[1] = src_r0[3];
      break;

    case 25:
      dst1[2] = src_l1[2]       | src_r0[0] <<  8;
      dst1[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[0] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[1] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[2] = src_r0[3] >> 24;
      break;

    case 26:
      dst1[2] = src_l1[2]       | src_r0[0] << 16;
      dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[2] = src_r0[3] >> 16;
      break;

    case 27:
      dst1[2] = src_l1[2]       | src_r0[0] << 24;
      dst1[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[0] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[1] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[2] = src_r0[3] >>  8;
      break;

    case 28:
      dst1[3] = src_r0[0];
      dst2[0] = src_r0[1];
      dst2[1] = src_r0[2];
      dst2[2] = src_r0[3];
      break;

    case 29:
      dst1[3] = src_l1[3]       | src_r0[0] <<  8;
      dst2[0] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[1] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[2] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[3] = src_r0[3] >> 24;
      break;

    case 30:
      dst1[3] = src_l1[3]       | src_r0[0] << 16;
      dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[3] = src_r0[3] >> 16;
      break;

    case 31:
      dst1[3] = src_l1[3]       | src_r0[0] << 24;
      dst2[0] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[1] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[2] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[3] = src_r0[3] >>  8;
      break;

    case 32:
      dst2[0] = src_r0[0];
      dst2[1] = src_r0[1];
      dst2[2] = src_r0[2];
      dst2[3] = src_r0[3];
      break;

    case 33:
      dst2[0] = src_l2[0]       | src_r0[0] <<  8;
      dst2[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      break;

    case 34:
      dst2[0] = src_l2[0]       | src_r0[0] << 16;
      dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      break;

    case 35:
      dst2[0] = src_l2[0]       | src_r0[0] << 24;
      dst2[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      break;

    case 36:
      dst2[1] = src_r0[0];
      dst2[2] = src_r0[1];
      dst2[3] = src_r0[2];
      break;

    case 37:
      dst2[1] = src_l2[1]       | src_r0[0] <<  8;
      dst2[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      break;

    case 38:
      dst2[1] = src_l2[1]       | src_r0[0] << 16;
      dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      break;

    case 39:
      dst2[1] = src_l2[1]       | src_r0[0] << 24;
      dst2[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      break;

    case 40:
      dst2[2] = src_r0[0];
      dst2[3] = src_r0[1];
      break;

    case 41:
      dst2[2] = src_l2[2]       | src_r0[0] <<  8;
      dst2[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      break;

    case 42:
      dst2[2] = src_l2[2]       | src_r0[0] << 16;
      dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      break;

    case 43:
      dst2[2] = src_l2[2]       | src_r0[0] << 24;
      dst2[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      break;

    case 44:
      dst2[3] = src_r0[0];
      break;

    case 45:
      dst2[3] = src_l2[3]       | src_r0[0] <<  8;
      break;

    case 46:
      dst2[3] = src_l2[3]       | src_r0[0] << 16;
      break;

    case 47:
      dst2[3] = src_l2[3]       | src_r0[0] << 24;
      break;
  }
}
// before: device_memcat12L
/**
 * Splice an 8-word right operand (src_r0 = words 0..3, src_r1 = words 4..7)
 * into a 12-word buffer at byte position `offset` (0..47). The buffer is
 * passed as three 4-word parts: dst0/src_l0 hold words 0..3, dst1/src_l1
 * words 4..7, dst2/src_l2 words 8..11.
 *
 * Bytes are packed little-endian inside each u32: buffer byte i occupies
 * bits 8*(i%4) .. 8*(i%4)+7 of word i/4.
 *
 * - Only words from index offset/4 up to the end of the shifted right
 *   operand are written; every other destination word is left as it was.
 * - For a non-aligned offset the boundary word is src_l[k] OR-ed with the
 *   shifted right operand; src_l is not masked, so its bytes at and above
 *   `offset` are presumably expected to be zero (TODO confirm with callers).
 * - For word-aligned offsets (including 0) the boundary word is taken from
 *   src_r0[0] alone.
 * - Right-operand bytes that do not fit into the 12 words are dropped, so
 *   from offset 32 upward src_r1 is not read at all.
 * - Offsets above 47 leave the destination untouched: there is no default
 *   case.
 */
static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4])
{
  switch (offset)
  {
    case 0:
      dst0[0] = src_r0[0];
      dst0[1] = src_r0[1];
      dst0[2] = src_r0[2];
      dst0[3] = src_r0[3];
      dst1[0] = src_r1[0];
      dst1[1] = src_r1[1];
      dst1[2] = src_r1[2];
      dst1[3] = src_r1[3];
      break;

    case 1:
      dst0[0] = src_l0[0]       | src_r0[0] <<  8;
      dst0[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst0[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[0] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst1[1] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst1[2] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst1[3] = src_r1[2] >> 24 | src_r1[3] <<  8;
      dst2[0] = src_r1[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0]       | src_r0[0] << 16;
      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[0] = src_r1[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0]       | src_r0[0] << 24;
      dst0[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst0[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[0] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst1[1] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst1[2] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst1[3] = src_r1[2] >>  8 | src_r1[3] << 24;
      dst2[0] = src_r1[3] >>  8;
      break;

    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      dst1[1] = src_r1[0];
      dst1[2] = src_r1[1];
      dst1[3] = src_r1[2];
      dst2[0] = src_r1[3];
      break;

    case 5:
      dst0[1] = src_l0[1]       | src_r0[0] <<  8;
      dst0[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst0[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[0] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[1] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst1[2] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst1[3] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst2[0] = src_r1[2] >> 24 | src_r1[3] <<  8;
      dst2[1] = src_r1[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1]       | src_r0[0] << 16;
      dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[1] = src_r1[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1]       | src_r0[0] << 24;
      dst0[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst0[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[0] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[1] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst1[2] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst1[3] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst2[0] = src_r1[2] >>  8 | src_r1[3] << 24;
      dst2[1] = src_r1[3] >>  8;
      break;

    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      dst1[2] = src_r1[0];
      dst1[3] = src_r1[1];
      dst2[0] = src_r1[2];
      dst2[1] = src_r1[3];
      break;

    case 9:
      dst0[2] = src_l0[2]       | src_r0[0] <<  8;
      dst0[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[0] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[1] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[2] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst1[3] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst2[0] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst2[1] = src_r1[2] >> 24 | src_r1[3] <<  8;
      dst2[2] = src_r1[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2]       | src_r0[0] << 16;
      dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[2] = src_r1[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2]       | src_r0[0] << 24;
      dst0[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[0] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[1] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[2] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst1[3] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst2[0] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst2[1] = src_r1[2] >>  8 | src_r1[3] << 24;
      dst2[2] = src_r1[3] >>  8;
      break;

    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      dst1[3] = src_r1[0];
      dst2[0] = src_r1[1];
      dst2[1] = src_r1[2];
      dst2[2] = src_r1[3];
      break;

    case 13:
      dst0[3] = src_l0[3]       | src_r0[0] <<  8;
      dst1[0] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[1] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[2] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst1[3] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst2[0] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst2[1] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst2[2] = src_r1[2] >> 24 | src_r1[3] <<  8;
      dst2[3] = src_r1[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3]       | src_r0[0] << 16;
      dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[3] = src_r1[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3]       | src_r0[0] << 24;
      dst1[0] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[1] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[2] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst1[3] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst2[0] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst2[1] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst2[2] = src_r1[2] >>  8 | src_r1[3] << 24;
      dst2[3] = src_r1[3] >>  8;
      break;

    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      dst2[0] = src_r1[0];
      dst2[1] = src_r1[1];
      dst2[2] = src_r1[2];
      dst2[3] = src_r1[3];
      break;

    case 17:
      dst1[0] = src_l1[0]       | src_r0[0] <<  8;
      dst1[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst1[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[0] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst2[1] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst2[2] = src_r1[1] >> 24 | src_r1[2] <<  8;
      dst2[3] = src_r1[2] >> 24 | src_r1[3] <<  8;
      break;

    case 18:
      dst1[0] = src_l1[0]       | src_r0[0] << 16;
      dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16;
      break;

    case 19:
      dst1[0] = src_l1[0]       | src_r0[0] << 24;
      dst1[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst1[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[0] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst2[1] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst2[2] = src_r1[1] >>  8 | src_r1[2] << 24;
      dst2[3] = src_r1[2] >>  8 | src_r1[3] << 24;
      break;

    case 20:
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      dst2[0] = src_r0[3];
      dst2[1] = src_r1[0];
      dst2[2] = src_r1[1];
      dst2[3] = src_r1[2];
      break;

    case 21:
      dst1[1] = src_l1[1]       | src_r0[0] <<  8;
      dst1[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst1[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[0] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[1] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst2[2] = src_r1[0] >> 24 | src_r1[1] <<  8;
      dst2[3] = src_r1[1] >> 24 | src_r1[2] <<  8;
      break;

    case 22:
      dst1[1] = src_l1[1]       | src_r0[0] << 16;
      dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16;
      break;

    case 23:
      dst1[1] = src_l1[1]       | src_r0[0] << 24;
      dst1[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst1[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[0] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[1] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst2[2] = src_r1[0] >>  8 | src_r1[1] << 24;
      dst2[3] = src_r1[1] >>  8 | src_r1[2] << 24;
      break;

    case 24:
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      dst2[0] = src_r0[2];
      dst2[1] = src_r0[3];
      dst2[2] = src_r1[0];
      dst2[3] = src_r1[1];
      break;

    case 25:
      dst1[2] = src_l1[2]       | src_r0[0] <<  8;
      dst1[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[0] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[1] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[2] = src_r0[3] >> 24 | src_r1[0] <<  8;
      dst2[3] = src_r1[0] >> 24 | src_r1[1] <<  8;
      break;

    case 26:
      dst1[2] = src_l1[2]       | src_r0[0] << 16;
      dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16;
      break;

    case 27:
      dst1[2] = src_l1[2]       | src_r0[0] << 24;
      dst1[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[0] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[1] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[2] = src_r0[3] >>  8 | src_r1[0] << 24;
      dst2[3] = src_r1[0] >>  8 | src_r1[1] << 24;
      break;

    case 28:
      dst1[3] = src_r0[0];
      dst2[0] = src_r0[1];
      dst2[1] = src_r0[2];
      dst2[2] = src_r0[3];
      dst2[3] = src_r1[0];
      break;

    case 29:
      dst1[3] = src_l1[3]       | src_r0[0] <<  8;
      dst2[0] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[1] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[2] = src_r0[2] >> 24 | src_r0[3] <<  8;
      dst2[3] = src_r0[3] >> 24 | src_r1[0] <<  8;
      break;

    case 30:
      dst1[3] = src_l1[3]       | src_r0[0] << 16;
      dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16;
      break;

    case 31:
      dst1[3] = src_l1[3]       | src_r0[0] << 24;
      dst2[0] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[1] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[2] = src_r0[2] >>  8 | src_r0[3] << 24;
      dst2[3] = src_r0[3] >>  8 | src_r1[0] << 24;
      break;

    case 32:
      dst2[0] = src_r0[0];
      dst2[1] = src_r0[1];
      dst2[2] = src_r0[2];
      dst2[3] = src_r0[3];
      break;

    case 33:
      dst2[0] = src_l2[0]       | src_r0[0] <<  8;
      dst2[1] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[2] = src_r0[1] >> 24 | src_r0[2] <<  8;
      dst2[3] = src_r0[2] >> 24 | src_r0[3] <<  8;
      break;

    case 34:
      dst2[0] = src_l2[0]       | src_r0[0] << 16;
      dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      break;

    case 35:
      dst2[0] = src_l2[0]       | src_r0[0] << 24;
      dst2[1] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[2] = src_r0[1] >>  8 | src_r0[2] << 24;
      dst2[3] = src_r0[2] >>  8 | src_r0[3] << 24;
      break;

    case 36:
      dst2[1] = src_r0[0];
      dst2[2] = src_r0[1];
      dst2[3] = src_r0[2];
      break;

    case 37:
      dst2[1] = src_l2[1]       | src_r0[0] <<  8;
      dst2[2] = src_r0[0] >> 24 | src_r0[1] <<  8;
      dst2[3] = src_r0[1] >> 24 | src_r0[2] <<  8;
      break;

    case 38:
      dst2[1] = src_l2[1]       | src_r0[0] << 16;
      dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      break;

    case 39:
      dst2[1] = src_l2[1]       | src_r0[0] << 24;
      dst2[2] = src_r0[0] >>  8 | src_r0[1] << 24;
      dst2[3] = src_r0[1] >>  8 | src_r0[2] << 24;
      break;

    case 40:
      dst2[2] = src_r0[0];
      dst2[3] = src_r0[1];
      break;

    case 41:
      dst2[2] = src_l2[2]       | src_r0[0] <<  8;
      dst2[3] = src_r0[0] >> 24 | src_r0[1] <<  8;
      break;

    case 42:
      dst2[2] = src_l2[2]       | src_r0[0] << 16;
      dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      break;

    case 43:
      dst2[2] = src_l2[2]       | src_r0[0] << 24;
      dst2[3] = src_r0[0] >>  8 | src_r0[1] << 24;
      break;

    case 44:
      dst2[3] = src_r0[0];
      break;

    case 45:
      dst2[3] = src_l2[3]       | src_r0[0] <<  8;
      break;

    case 46:
      dst2[3] = src_l2[3]       | src_r0[0] << 16;
      break;

    case 47:
      dst2[3] = src_l2[3]       | src_r0[0] << 24;
      break;
  }
}
5767 // before: memcat16_9
5768 static void memcat_c15_w4x4_a3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 append2
[4], const u32 offset
)
5785 w0
[0] = w0
[0] | append0
[0] << 8;
5786 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
5787 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
5788 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
5789 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
5790 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
5791 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
5792 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
5793 w2
[0] = append1
[3] >> 24 | append2
[0] << 8;
5794 w2
[1] = append2
[0] >> 24;
5798 w0
[0] = w0
[0] | append0
[0] << 16;
5799 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
5800 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
5801 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
5802 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
5803 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
5804 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
5805 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
5806 w2
[0] = append1
[3] >> 16 | append2
[0] << 16;
5807 w2
[1] = append2
[0] >> 16;
5811 w0
[0] = w0
[0] | append0
[0] << 24;
5812 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
5813 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
5814 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
5815 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
5816 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
5817 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
5818 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
5819 w2
[0] = append1
[3] >> 8 | append2
[0] << 24;
5820 w2
[1] = append2
[0] >> 8;
5836 w0
[1] = w0
[1] | append0
[0] << 8;
5837 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
5838 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
5839 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
5840 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
5841 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
5842 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
5843 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
5844 w2
[1] = append1
[3] >> 24 | append2
[0] << 8;
5845 w2
[2] = append2
[0] >> 24;
5849 w0
[1] = w0
[1] | append0
[0] << 16;
5850 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
5851 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
5852 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
5853 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
5854 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
5855 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
5856 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
5857 w2
[1] = append1
[3] >> 16 | append2
[0] << 16;
5858 w2
[2] = append2
[0] >> 16;
5862 w0
[1] = w0
[1] | append0
[0] << 24;
5863 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
5864 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
5865 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
5866 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
5867 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
5868 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
5869 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
5870 w2
[1] = append1
[3] >> 8 | append2
[0] << 24;
5871 w2
[2] = append2
[0] >> 8;
5887 w0
[2] = w0
[2] | append0
[0] << 8;
5888 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
5889 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
5890 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
5891 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
5892 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
5893 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
5894 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
5895 w2
[2] = append1
[3] >> 24 | append2
[0] << 8;
5896 w2
[3] = append2
[0] >> 24;
5900 w0
[2] = w0
[2] | append0
[0] << 16;
5901 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
5902 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
5903 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
5904 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
5905 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
5906 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
5907 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
5908 w2
[2] = append1
[3] >> 16 | append2
[0] << 16;
5909 w2
[3] = append2
[0] >> 16;
5913 w0
[2] = w0
[2] | append0
[0] << 24;
5914 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
5915 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
5916 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
5917 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
5918 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
5919 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
5920 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
5921 w2
[2] = append1
[3] >> 8 | append2
[0] << 24;
5922 w2
[3] = append2
[0] >> 8;
5938 w0
[3] = w0
[3] | append0
[0] << 8;
5939 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
5940 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
5941 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
5942 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
5943 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
5944 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
5945 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
5946 w2
[3] = append1
[3] >> 24 | append2
[0] << 8;
5947 w3
[0] = append2
[0] >> 24;
5951 w0
[3] = w0
[3] | append0
[0] << 16;
5952 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
5953 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
5954 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
5955 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
5956 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
5957 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
5958 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
5959 w2
[3] = append1
[3] >> 16 | append2
[0] << 16;
5960 w3
[0] = append2
[0] >> 16;
5964 w0
[3] = w0
[3] | append0
[0] << 24;
5965 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
5966 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
5967 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
5968 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
5969 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
5970 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
5971 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
5972 w2
[3] = append1
[3] >> 8 | append2
[0] << 24;
5973 w3
[0] = append2
[0] >> 8;
5978 // before: memcat32_8
5979 static void memcat_c32_w4x4_a2x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 offset
)
5995 w0
[0] = w0
[0] | append0
[0] << 8;
5996 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
5997 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
5998 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
5999 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
6000 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
6001 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
6002 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
6003 w2
[0] = append1
[3] >> 24;
6007 w0
[0] = w0
[0] | append0
[0] << 16;
6008 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
6009 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
6010 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
6011 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
6012 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
6013 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
6014 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
6015 w2
[0] = append1
[3] >> 16;
6019 w0
[0] = w0
[0] | append0
[0] << 24;
6020 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
6021 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
6022 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
6023 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
6024 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
6025 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
6026 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
6027 w2
[0] = append1
[3] >> 8;
6042 w0
[1] = w0
[1] | append0
[0] << 8;
6043 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
6044 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
6045 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
6046 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
6047 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
6048 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
6049 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
6050 w2
[1] = append1
[3] >> 24;
6054 w0
[1] = w0
[1] | append0
[0] << 16;
6055 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
6056 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
6057 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
6058 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
6059 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
6060 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
6061 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
6062 w2
[1] = append1
[3] >> 16;
6066 w0
[1] = w0
[1] | append0
[0] << 24;
6067 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
6068 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
6069 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
6070 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
6071 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
6072 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
6073 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
6074 w2
[1] = append1
[3] >> 8;
6089 w0
[2] = w0
[2] | append0
[0] << 8;
6090 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
6091 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
6092 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
6093 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
6094 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
6095 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
6096 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
6097 w2
[2] = append1
[3] >> 24;
6101 w0
[2] = w0
[2] | append0
[0] << 16;
6102 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
6103 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
6104 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
6105 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
6106 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
6107 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
6108 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
6109 w2
[2] = append1
[3] >> 16;
6113 w0
[2] = w0
[2] | append0
[0] << 24;
6114 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
6115 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
6116 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
6117 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
6118 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
6119 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
6120 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
6121 w2
[2] = append1
[3] >> 8;
6136 w0
[3] = w0
[3] | append0
[0] << 8;
6137 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
6138 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
6139 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
6140 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
6141 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
6142 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
6143 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
6144 w2
[3] = append1
[3] >> 24;
6148 w0
[3] = w0
[3] | append0
[0] << 16;
6149 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
6150 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
6151 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
6152 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
6153 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
6154 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
6155 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
6156 w2
[3] = append1
[3] >> 16;
6160 w0
[3] = w0
[3] | append0
[0] << 24;
6161 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
6162 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
6163 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
6164 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
6165 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
6166 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
6167 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
6168 w2
[3] = append1
[3] >> 8;
6183 w1
[0] = w1
[0] | append0
[0] << 8;
6184 w1
[1] = append0
[0] >> 24 | append0
[1] << 8;
6185 w1
[2] = append0
[1] >> 24 | append0
[2] << 8;
6186 w1
[3] = append0
[2] >> 24 | append0
[3] << 8;
6187 w2
[0] = append0
[3] >> 24 | append1
[0] << 8;
6188 w2
[1] = append1
[0] >> 24 | append1
[1] << 8;
6189 w2
[2] = append1
[1] >> 24 | append1
[2] << 8;
6190 w2
[3] = append1
[2] >> 24 | append1
[3] << 8;
6191 w3
[0] = append1
[3] >> 24;
6195 w1
[0] = w1
[0] | append0
[0] << 16;
6196 w1
[1] = append0
[0] >> 16 | append0
[1] << 16;
6197 w1
[2] = append0
[1] >> 16 | append0
[2] << 16;
6198 w1
[3] = append0
[2] >> 16 | append0
[3] << 16;
6199 w2
[0] = append0
[3] >> 16 | append1
[0] << 16;
6200 w2
[1] = append1
[0] >> 16 | append1
[1] << 16;
6201 w2
[2] = append1
[1] >> 16 | append1
[2] << 16;
6202 w2
[3] = append1
[2] >> 16 | append1
[3] << 16;
6203 w3
[0] = append1
[3] >> 16;
6207 w1
[0] = w1
[0] | append0
[0] << 24;
6208 w1
[1] = append0
[0] >> 8 | append0
[1] << 24;
6209 w1
[2] = append0
[1] >> 8 | append0
[2] << 24;
6210 w1
[3] = append0
[2] >> 8 | append0
[3] << 24;
6211 w2
[0] = append0
[3] >> 8 | append1
[0] << 24;
6212 w2
[1] = append1
[0] >> 8 | append1
[1] << 24;
6213 w2
[2] = append1
[1] >> 8 | append1
[2] << 24;
6214 w2
[3] = append1
[2] >> 8 | append1
[3] << 24;
6215 w3
[0] = append1
[3] >> 8;
6230 w1
[1] = w1
[1] | append0
[0] << 8;
6231 w1
[2] = append0
[0] >> 24 | append0
[1] << 8;
6232 w1
[3] = append0
[1] >> 24 | append0
[2] << 8;
6233 w2
[0] = append0
[2] >> 24 | append0
[3] << 8;
6234 w2
[1] = append0
[3] >> 24 | append1
[0] << 8;
6235 w2
[2] = append1
[0] >> 24 | append1
[1] << 8;
6236 w2
[3] = append1
[1] >> 24 | append1
[2] << 8;
6237 w3
[0] = append1
[2] >> 24 | append1
[3] << 8;
6238 w3
[1] = append1
[3] >> 24;
6242 w1
[1] = w1
[1] | append0
[0] << 16;
6243 w1
[2] = append0
[0] >> 16 | append0
[1] << 16;
6244 w1
[3] = append0
[1] >> 16 | append0
[2] << 16;
6245 w2
[0] = append0
[2] >> 16 | append0
[3] << 16;
6246 w2
[1] = append0
[3] >> 16 | append1
[0] << 16;
6247 w2
[2] = append1
[0] >> 16 | append1
[1] << 16;
6248 w2
[3] = append1
[1] >> 16 | append1
[2] << 16;
6249 w3
[0] = append1
[2] >> 16 | append1
[3] << 16;
6250 w3
[1] = append1
[3] >> 16;
6254 w1
[1] = w1
[1] | append0
[0] << 24;
6255 w1
[2] = append0
[0] >> 8 | append0
[1] << 24;
6256 w1
[3] = append0
[1] >> 8 | append0
[2] << 24;
6257 w2
[0] = append0
[2] >> 8 | append0
[3] << 24;
6258 w2
[1] = append0
[3] >> 8 | append1
[0] << 24;
6259 w2
[2] = append1
[0] >> 8 | append1
[1] << 24;
6260 w2
[3] = append1
[1] >> 8 | append1
[2] << 24;
6261 w3
[0] = append1
[2] >> 8 | append1
[3] << 24;
6262 w3
[1] = append1
[3] >> 8;
6277 w1
[2] = w1
[2] | append0
[0] << 8;
6278 w1
[3] = append0
[0] >> 24 | append0
[1] << 8;
6279 w2
[0] = append0
[1] >> 24 | append0
[2] << 8;
6280 w2
[1] = append0
[2] >> 24 | append0
[3] << 8;
6281 w2
[2] = append0
[3] >> 24 | append1
[0] << 8;
6282 w2
[3] = append1
[0] >> 24 | append1
[1] << 8;
6283 w3
[0] = append1
[1] >> 24 | append1
[2] << 8;
6284 w3
[1] = append1
[2] >> 24 | append1
[3] << 8;
6288 w1
[2] = w1
[2] | append0
[0] << 16;
6289 w1
[3] = append0
[0] >> 16 | append0
[1] << 16;
6290 w2
[0] = append0
[1] >> 16 | append0
[2] << 16;
6291 w2
[1] = append0
[2] >> 16 | append0
[3] << 16;
6292 w2
[2] = append0
[3] >> 16 | append1
[0] << 16;
6293 w2
[3] = append1
[0] >> 16 | append1
[1] << 16;
6294 w3
[0] = append1
[1] >> 16 | append1
[2] << 16;
6295 w3
[1] = append1
[2] >> 16 | append1
[3] << 16;
6299 w1
[2] = w1
[2] | append0
[0] << 24;
6300 w1
[3] = append0
[0] >> 8 | append0
[1] << 24;
6301 w2
[0] = append0
[1] >> 8 | append0
[2] << 24;
6302 w2
[1] = append0
[2] >> 8 | append0
[3] << 24;
6303 w2
[2] = append0
[3] >> 8 | append1
[0] << 24;
6304 w2
[3] = append1
[0] >> 8 | append1
[1] << 24;
6305 w3
[0] = append1
[1] >> 8 | append1
[2] << 24;
6306 w3
[1] = append1
[2] >> 8 | append1
[3] << 24;
6320 w1
[3] = w1
[3] | append0
[0] << 8;
6321 w2
[0] = append0
[0] >> 24 | append0
[1] << 8;
6322 w2
[1] = append0
[1] >> 24 | append0
[2] << 8;
6323 w2
[2] = append0
[2] >> 24 | append0
[3] << 8;
6324 w2
[3] = append0
[3] >> 24 | append1
[0] << 8;
6325 w3
[0] = append1
[0] >> 24 | append1
[1] << 8;
6326 w3
[1] = append1
[1] >> 24 | append1
[2] << 8;
6330 w1
[3] = w1
[3] | append0
[0] << 16;
6331 w2
[0] = append0
[0] >> 16 | append0
[1] << 16;
6332 w2
[1] = append0
[1] >> 16 | append0
[2] << 16;
6333 w2
[2] = append0
[2] >> 16 | append0
[3] << 16;
6334 w2
[3] = append0
[3] >> 16 | append1
[0] << 16;
6335 w3
[0] = append1
[0] >> 16 | append1
[1] << 16;
6336 w3
[1] = append1
[1] >> 16 | append1
[2] << 16;
6340 w1
[3] = w1
[3] | append0
[0] << 24;
6341 w2
[0] = append0
[0] >> 8 | append0
[1] << 24;
6342 w2
[1] = append0
[1] >> 8 | append0
[2] << 24;
6343 w2
[2] = append0
[2] >> 8 | append0
[3] << 24;
6344 w2
[3] = append0
[3] >> 8 | append1
[0] << 24;
6345 w3
[0] = append1
[0] >> 8 | append1
[1] << 24;
6346 w3
[1] = append1
[1] >> 8 | append1
[2] << 24;
6360 // before: memcat32_9
6361 static void memcat_c32_w4x4_a3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 append2
[4], const u32 offset
)
6378 w0
[0] = w0
[0] | append0
[0] << 8;
6379 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
6380 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
6381 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
6382 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
6383 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
6384 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
6385 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
6386 w2
[0] = append1
[3] >> 24 | append2
[0] << 8;
6387 w2
[1] = append2
[0] >> 24;
6391 w0
[0] = w0
[0] | append0
[0] << 16;
6392 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
6393 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
6394 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
6395 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
6396 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
6397 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
6398 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
6399 w2
[0] = append1
[3] >> 16 | append2
[0] << 16;
6400 w2
[1] = append2
[0] >> 16;
6404 w0
[0] = w0
[0] | append0
[0] << 24;
6405 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
6406 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
6407 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
6408 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
6409 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
6410 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
6411 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
6412 w2
[0] = append1
[3] >> 8 | append2
[0] << 24;
6413 w2
[1] = append2
[0] >> 8;
6429 w0
[1] = w0
[1] | append0
[0] << 8;
6430 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
6431 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
6432 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
6433 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
6434 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
6435 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
6436 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
6437 w2
[1] = append1
[3] >> 24 | append2
[0] << 8;
6438 w2
[2] = append2
[0] >> 24;
6442 w0
[1] = w0
[1] | append0
[0] << 16;
6443 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
6444 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
6445 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
6446 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
6447 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
6448 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
6449 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
6450 w2
[1] = append1
[3] >> 16 | append2
[0] << 16;
6451 w2
[2] = append2
[0] >> 16;
6455 w0
[1] = w0
[1] | append0
[0] << 24;
6456 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
6457 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
6458 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
6459 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
6460 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
6461 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
6462 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
6463 w2
[1] = append1
[3] >> 8 | append2
[0] << 24;
6464 w2
[2] = append2
[0] >> 8;
6480 w0
[2] = w0
[2] | append0
[0] << 8;
6481 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
6482 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
6483 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
6484 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
6485 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
6486 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
6487 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
6488 w2
[2] = append1
[3] >> 24 | append2
[0] << 8;
6489 w2
[3] = append2
[0] >> 24;
6493 w0
[2] = w0
[2] | append0
[0] << 16;
6494 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
6495 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
6496 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
6497 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
6498 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
6499 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
6500 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
6501 w2
[2] = append1
[3] >> 16 | append2
[0] << 16;
6502 w2
[3] = append2
[0] >> 16;
6506 w0
[2] = w0
[2] | append0
[0] << 24;
6507 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
6508 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
6509 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
6510 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
6511 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
6512 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
6513 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
6514 w2
[2] = append1
[3] >> 8 | append2
[0] << 24;
6515 w2
[3] = append2
[0] >> 8;
6531 w0
[3] = w0
[3] | append0
[0] << 8;
6532 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
6533 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
6534 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
6535 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
6536 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
6537 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
6538 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
6539 w2
[3] = append1
[3] >> 24 | append2
[0] << 8;
6540 w3
[0] = append2
[0] >> 24;
6544 w0
[3] = w0
[3] | append0
[0] << 16;
6545 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
6546 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
6547 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
6548 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
6549 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
6550 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
6551 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
6552 w2
[3] = append1
[3] >> 16 | append2
[0] << 16;
6553 w3
[0] = append2
[0] >> 16;
6557 w0
[3] = w0
[3] | append0
[0] << 24;
6558 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
6559 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
6560 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
6561 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
6562 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
6563 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
6564 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
6565 w2
[3] = append1
[3] >> 8 | append2
[0] << 24;
6566 w3
[0] = append2
[0] >> 8;
6582 w1
[0] = w1
[0] | append0
[0] << 8;
6583 w1
[1] = append0
[0] >> 24 | append0
[1] << 8;
6584 w1
[2] = append0
[1] >> 24 | append0
[2] << 8;
6585 w1
[3] = append0
[2] >> 24 | append0
[3] << 8;
6586 w2
[0] = append0
[3] >> 24 | append1
[0] << 8;
6587 w2
[1] = append1
[0] >> 24 | append1
[1] << 8;
6588 w2
[2] = append1
[1] >> 24 | append1
[2] << 8;
6589 w2
[3] = append1
[2] >> 24 | append1
[3] << 8;
6590 w3
[0] = append1
[3] >> 24 | append2
[0] << 8;
6591 w3
[1] = append2
[0] >> 24;
6595 w1
[0] = w1
[0] | append0
[0] << 16;
6596 w1
[1] = append0
[0] >> 16 | append0
[1] << 16;
6597 w1
[2] = append0
[1] >> 16 | append0
[2] << 16;
6598 w1
[3] = append0
[2] >> 16 | append0
[3] << 16;
6599 w2
[0] = append0
[3] >> 16 | append1
[0] << 16;
6600 w2
[1] = append1
[0] >> 16 | append1
[1] << 16;
6601 w2
[2] = append1
[1] >> 16 | append1
[2] << 16;
6602 w2
[3] = append1
[2] >> 16 | append1
[3] << 16;
6603 w3
[0] = append1
[3] >> 16 | append2
[0] << 16;
6604 w3
[1] = append2
[0] >> 16;
6608 w1
[0] = w1
[0] | append0
[0] << 24;
6609 w1
[1] = append0
[0] >> 8 | append0
[1] << 24;
6610 w1
[2] = append0
[1] >> 8 | append0
[2] << 24;
6611 w1
[3] = append0
[2] >> 8 | append0
[3] << 24;
6612 w2
[0] = append0
[3] >> 8 | append1
[0] << 24;
6613 w2
[1] = append1
[0] >> 8 | append1
[1] << 24;
6614 w2
[2] = append1
[1] >> 8 | append1
[2] << 24;
6615 w2
[3] = append1
[2] >> 8 | append1
[3] << 24;
6616 w3
[0] = append1
[3] >> 8 | append2
[0] << 24;
6617 w3
[1] = append2
[0] >> 8;
6633 w1
[1] = w1
[1] | append0
[0] << 8;
6634 w1
[2] = append0
[0] >> 24 | append0
[1] << 8;
6635 w1
[3] = append0
[1] >> 24 | append0
[2] << 8;
6636 w2
[0] = append0
[2] >> 24 | append0
[3] << 8;
6637 w2
[1] = append0
[3] >> 24 | append1
[0] << 8;
6638 w2
[2] = append1
[0] >> 24 | append1
[1] << 8;
6639 w2
[3] = append1
[1] >> 24 | append1
[2] << 8;
6640 w3
[0] = append1
[2] >> 24 | append1
[3] << 8;
6641 w3
[1] = append1
[3] >> 24 | append2
[0] << 8;
6645 w1
[1] = w1
[1] | append0
[0] << 16;
6646 w1
[2] = append0
[0] >> 16 | append0
[1] << 16;
6647 w1
[3] = append0
[1] >> 16 | append0
[2] << 16;
6648 w2
[0] = append0
[2] >> 16 | append0
[3] << 16;
6649 w2
[1] = append0
[3] >> 16 | append1
[0] << 16;
6650 w2
[2] = append1
[0] >> 16 | append1
[1] << 16;
6651 w2
[3] = append1
[1] >> 16 | append1
[2] << 16;
6652 w3
[0] = append1
[2] >> 16 | append1
[3] << 16;
6653 w3
[1] = append1
[3] >> 16 | append2
[0] << 16;
6657 w1
[1] = w1
[1] | append0
[0] << 24;
6658 w1
[2] = append0
[0] >> 8 | append0
[1] << 24;
6659 w1
[3] = append0
[1] >> 8 | append0
[2] << 24;
6660 w2
[0] = append0
[2] >> 8 | append0
[3] << 24;
6661 w2
[1] = append0
[3] >> 8 | append1
[0] << 24;
6662 w2
[2] = append1
[0] >> 8 | append1
[1] << 24;
6663 w2
[3] = append1
[1] >> 8 | append1
[2] << 24;
6664 w3
[0] = append1
[2] >> 8 | append1
[3] << 24;
6665 w3
[1] = append1
[3] >> 8 | append2
[0] << 24;
6680 w1
[2] = w1
[2] | append0
[0] << 8;
6681 w1
[3] = append0
[0] >> 24 | append0
[1] << 8;
6682 w2
[0] = append0
[1] >> 24 | append0
[2] << 8;
6683 w2
[1] = append0
[2] >> 24 | append0
[3] << 8;
6684 w2
[2] = append0
[3] >> 24 | append1
[0] << 8;
6685 w2
[3] = append1
[0] >> 24 | append1
[1] << 8;
6686 w3
[0] = append1
[1] >> 24 | append1
[2] << 8;
6687 w3
[1] = append1
[2] >> 24 | append1
[3] << 8;
6691 w1
[2] = w1
[2] | append0
[0] << 16;
6692 w1
[3] = append0
[0] >> 16 | append0
[1] << 16;
6693 w2
[0] = append0
[1] >> 16 | append0
[2] << 16;
6694 w2
[1] = append0
[2] >> 16 | append0
[3] << 16;
6695 w2
[2] = append0
[3] >> 16 | append1
[0] << 16;
6696 w2
[3] = append1
[0] >> 16 | append1
[1] << 16;
6697 w3
[0] = append1
[1] >> 16 | append1
[2] << 16;
6698 w3
[1] = append1
[2] >> 16 | append1
[3] << 16;
6702 w1
[2] = w1
[2] | append0
[0] << 24;
6703 w1
[3] = append0
[0] >> 8 | append0
[1] << 24;
6704 w2
[0] = append0
[1] >> 8 | append0
[2] << 24;
6705 w2
[1] = append0
[2] >> 8 | append0
[3] << 24;
6706 w2
[2] = append0
[3] >> 8 | append1
[0] << 24;
6707 w2
[3] = append1
[0] >> 8 | append1
[1] << 24;
6708 w3
[0] = append1
[1] >> 8 | append1
[2] << 24;
6709 w3
[1] = append1
[2] >> 8 | append1
[3] << 24;
6723 w1
[3] = w1
[3] | append0
[0] << 8;
6724 w2
[0] = append0
[0] >> 24 | append0
[1] << 8;
6725 w2
[1] = append0
[1] >> 24 | append0
[2] << 8;
6726 w2
[2] = append0
[2] >> 24 | append0
[3] << 8;
6727 w2
[3] = append0
[3] >> 24 | append1
[0] << 8;
6728 w3
[0] = append1
[0] >> 24 | append1
[1] << 8;
6729 w3
[1] = append1
[1] >> 24 | append1
[2] << 8;
6733 w1
[3] = w1
[3] | append0
[0] << 16;
6734 w2
[0] = append0
[0] >> 16 | append0
[1] << 16;
6735 w2
[1] = append0
[1] >> 16 | append0
[2] << 16;
6736 w2
[2] = append0
[2] >> 16 | append0
[3] << 16;
6737 w2
[3] = append0
[3] >> 16 | append1
[0] << 16;
6738 w3
[0] = append1
[0] >> 16 | append1
[1] << 16;
6739 w3
[1] = append1
[1] >> 16 | append1
[2] << 16;
6743 w1
[3] = w1
[3] | append0
[0] << 24;
6744 w2
[0] = append0
[0] >> 8 | append0
[1] << 24;
6745 w2
[1] = append0
[1] >> 8 | append0
[2] << 24;
6746 w2
[2] = append0
[2] >> 8 | append0
[3] << 24;
6747 w2
[3] = append0
[3] >> 8 | append1
[0] << 24;
6748 w3
[0] = append1
[0] >> 8 | append1
[1] << 24;
6749 w3
[1] = append1
[1] >> 8 | append1
[2] << 24;
6763 static void switch_buffer_by_offset (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6766 const int offset_mod_4
= offset
& 3;
6768 const int offset_minus_4
= 4 - offset
;
6773 w3
[2] = amd_bytealign ( 0, w3
[1], offset_minus_4
);
6774 w3
[1] = amd_bytealign (w3
[1], w3
[0], offset_minus_4
);
6775 w3
[0] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
6776 w2
[3] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
6777 w2
[2] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
6778 w2
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
6779 w2
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
6780 w1
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6781 w1
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6782 w1
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6783 w1
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6784 w0
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6785 w0
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6786 w0
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6787 w0
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6789 if (offset_mod_4
== 0)
6811 w3
[2] = amd_bytealign ( 0, w3
[0], offset_minus_4
);
6812 w3
[1] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
6813 w3
[0] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
6814 w2
[3] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
6815 w2
[2] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
6816 w2
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
6817 w2
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6818 w1
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6819 w1
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6820 w1
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6821 w1
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6822 w0
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6823 w0
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6824 w0
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6827 if (offset_mod_4
== 0)
6848 w3
[2] = amd_bytealign ( 0, w2
[3], offset_minus_4
);
6849 w3
[1] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
6850 w3
[0] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
6851 w2
[3] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
6852 w2
[2] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
6853 w2
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6854 w2
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6855 w1
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6856 w1
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6857 w1
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6858 w1
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6859 w0
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6860 w0
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6864 if (offset_mod_4
== 0)
6884 w3
[2] = amd_bytealign ( 0, w2
[2], offset_minus_4
);
6885 w3
[1] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
6886 w3
[0] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
6887 w2
[3] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
6888 w2
[2] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6889 w2
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6890 w2
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6891 w1
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6892 w1
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6893 w1
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6894 w1
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6895 w0
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6900 if (offset_mod_4
== 0)
6919 w3
[2] = amd_bytealign ( 0, w2
[1], offset_minus_4
);
6920 w3
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
6921 w3
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
6922 w2
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6923 w2
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6924 w2
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6925 w2
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6926 w1
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6927 w1
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6928 w1
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6929 w1
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6935 if (offset_mod_4
== 0)
6953 w3
[2] = amd_bytealign ( 0, w2
[0], offset_minus_4
);
6954 w3
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
6955 w3
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6956 w2
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6957 w2
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6958 w2
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6959 w2
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6960 w1
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6961 w1
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6962 w1
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6969 if (offset_mod_4
== 0)
6986 w3
[2] = amd_bytealign ( 0, w1
[3], offset_minus_4
);
6987 w3
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6988 w3
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6989 w2
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6990 w2
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6991 w2
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6992 w2
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6993 w1
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6994 w1
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
7002 if (offset_mod_4
== 0)
7018 w3
[2] = amd_bytealign ( 0, w1
[2], offset_minus_4
);
7019 w3
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
7020 w3
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
7021 w2
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
7022 w2
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
7023 w2
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
7024 w2
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
7025 w1
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
7034 if (offset_mod_4
== 0)
7049 w3
[2] = amd_bytealign ( 0, w1
[1], offset_minus_4
);
7050 w3
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
7051 w3
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
7052 w2
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
7053 w2
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
7054 w2
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
7055 w2
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
7065 if (offset_mod_4
== 0)
7079 w3
[2] = amd_bytealign ( 0, w1
[0], offset_minus_4
);
7080 w3
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
7081 w3
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
7082 w2
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
7083 w2
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
7084 w2
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
7095 if (offset_mod_4
== 0)
7108 w3
[2] = amd_bytealign ( 0, w0
[3], offset_minus_4
);
7109 w3
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
7110 w3
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
7111 w2
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
7112 w2
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
7124 if (offset_mod_4
== 0)
7136 w3
[2] = amd_bytealign ( 0, w0
[2], offset_minus_4
);
7137 w3
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
7138 w3
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
7139 w2
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
7152 if (offset_mod_4
== 0)
7163 w3
[2] = amd_bytealign ( 0, w0
[1], offset_minus_4
);
7164 w3
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
7165 w3
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
7179 if (offset_mod_4
== 0)
7189 w3
[2] = amd_bytealign ( 0, w0
[0], offset_minus_4
);
7190 w3
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
7205 if (offset_mod_4
== 0)
7216 const int offset_minus_4
= 4 - (offset
% 4);
7218 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
7223 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
7224 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
7225 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
7226 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
7227 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
7228 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
7229 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
7230 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
7231 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
7232 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
7233 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
7234 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
7235 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
7236 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
7241 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
7242 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
7243 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
7244 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
7245 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
7246 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
7247 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
7248 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
7249 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
7250 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
7251 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
7252 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
7253 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
7259 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
7260 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
7261 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
7262 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
7263 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
7264 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
7265 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
7266 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
7267 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
7268 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
7269 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
7270 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
7277 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
7278 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
7279 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
7280 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
7281 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
7282 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
7283 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
7284 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
7285 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
7286 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
7287 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
7295 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
7296 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
7297 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
7298 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
7299 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
7300 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
7301 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
7302 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
7303 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
7304 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
7313 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
7314 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
7315 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
7316 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
7317 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
7318 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
7319 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
7320 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
7321 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
7331 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
7332 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
7333 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
7334 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
7335 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
7336 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
7337 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
7338 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
7349 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
7350 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
7351 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
7352 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
7353 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
7354 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
7355 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
7367 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
7368 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
7369 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
7370 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
7371 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
7372 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
7385 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
7386 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
7387 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
7388 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
7389 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
7403 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
7404 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
7405 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
7406 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
7421 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
7422 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
7423 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
7439 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
7440 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
7457 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
7477 static void switch_buffer_by_offset_be (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
7483 w3
[2] = amd_bytealign (w3
[1], 0, offset
);
7484 w3
[1] = amd_bytealign (w3
[0], w3
[1], offset
);
7485 w3
[0] = amd_bytealign (w2
[3], w3
[0], offset
);
7486 w2
[3] = amd_bytealign (w2
[2], w2
[3], offset
);
7487 w2
[2] = amd_bytealign (w2
[1], w2
[2], offset
);
7488 w2
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
7489 w2
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
7490 w1
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
7491 w1
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
7492 w1
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
7493 w1
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
7494 w0
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
7495 w0
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
7496 w0
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
7497 w0
[0] = amd_bytealign ( 0, w0
[0], offset
);
7501 w3
[2] = amd_bytealign (w3
[0], 0, offset
);
7502 w3
[1] = amd_bytealign (w2
[3], w3
[0], offset
);
7503 w3
[0] = amd_bytealign (w2
[2], w2
[3], offset
);
7504 w2
[3] = amd_bytealign (w2
[1], w2
[2], offset
);
7505 w2
[2] = amd_bytealign (w2
[0], w2
[1], offset
);
7506 w2
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
7507 w2
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
7508 w1
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
7509 w1
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
7510 w1
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
7511 w1
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
7512 w0
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
7513 w0
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
7514 w0
[1] = amd_bytealign ( 0, w0
[0], offset
);
7519 w3
[2] = amd_bytealign (w2
[3], 0, offset
);
7520 w3
[1] = amd_bytealign (w2
[2], w2
[3], offset
);
7521 w3
[0] = amd_bytealign (w2
[1], w2
[2], offset
);
7522 w2
[3] = amd_bytealign (w2
[0], w2
[1], offset
);
7523 w2
[2] = amd_bytealign (w1
[3], w2
[0], offset
);
7524 w2
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
7525 w2
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
7526 w1
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
7527 w1
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
7528 w1
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
7529 w1
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
7530 w0
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
7531 w0
[2] = amd_bytealign ( 0, w0
[0], offset
);
7537 w3
[2] = amd_bytealign (w2
[2], 0, offset
);
7538 w3
[1] = amd_bytealign (w2
[1], w2
[2], offset
);
7539 w3
[0] = amd_bytealign (w2
[0], w2
[1], offset
);
7540 w2
[3] = amd_bytealign (w1
[3], w2
[0], offset
);
7541 w2
[2] = amd_bytealign (w1
[2], w1
[3], offset
);
7542 w2
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
7543 w2
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
7544 w1
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
7545 w1
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
7546 w1
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
7547 w1
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
7548 w0
[3] = amd_bytealign ( 0, w0
[0], offset
);
7555 w3
[2] = amd_bytealign (w2
[1], 0, offset
);
7556 w3
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
7557 w3
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
7558 w2
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
7559 w2
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
7560 w2
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
7561 w2
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
7562 w1
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
7563 w1
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
7564 w1
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
7565 w1
[0] = amd_bytealign ( 0, w0
[0], offset
);
7573 w3
[2] = amd_bytealign (w2
[0], 0, offset
);
7574 w3
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
7575 w3
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
7576 w2
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
7577 w2
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
7578 w2
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
7579 w2
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
7580 w1
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
7581 w1
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
7582 w1
[1] = amd_bytealign ( 0, w0
[0], offset
);
7591 w3
[2] = amd_bytealign (w1
[3], 0, offset
);
7592 w3
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
7593 w3
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
7594 w2
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
7595 w2
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
7596 w2
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
7597 w2
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
7598 w1
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
7599 w1
[2] = amd_bytealign ( 0, w0
[0], offset
);
7609 w3
[2] = amd_bytealign (w1
[2], 0, offset
);
7610 w3
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
7611 w3
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
7612 w2
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
7613 w2
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
7614 w2
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
7615 w2
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
7616 w1
[3] = amd_bytealign ( 0, w0
[0], offset
);
7627 w3
[2] = amd_bytealign (w1
[1], 0, offset
);
7628 w3
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
7629 w3
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
7630 w2
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
7631 w2
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
7632 w2
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
7633 w2
[0] = amd_bytealign ( 0, w0
[0], offset
);
7645 w3
[2] = amd_bytealign (w1
[0], 0, offset
);
7646 w3
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
7647 w3
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
7648 w2
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
7649 w2
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
7650 w2
[1] = amd_bytealign ( 0, w0
[0], offset
);
7663 w3
[2] = amd_bytealign (w0
[3], 0, offset
);
7664 w3
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
7665 w3
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
7666 w2
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
7667 w2
[2] = amd_bytealign ( 0, w0
[0], offset
);
7681 w3
[2] = amd_bytealign (w0
[2], 0, offset
);
7682 w3
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
7683 w3
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
7684 w2
[3] = amd_bytealign ( 0, w0
[0], offset
);
7699 w3
[2] = amd_bytealign (w0
[1], 0, offset
);
7700 w3
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
7701 w3
[0] = amd_bytealign ( 0, w0
[0], offset
);
7717 w3
[2] = amd_bytealign (w0
[0], 0, offset
);
7718 w3
[1] = amd_bytealign ( 0, w0
[0], offset
);
7737 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
7742 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
7743 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
7744 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
7745 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
7746 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
7747 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
7748 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
7749 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
7750 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
7751 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
7752 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
7753 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
7754 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7755 w0
[0] = __byte_perm (w0
[0], 0, selector
);
7759 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
7760 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
7761 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
7762 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
7763 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
7764 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
7765 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
7766 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
7767 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
7768 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
7769 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
7770 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
7771 w0
[1] = __byte_perm (w0
[0], 0, selector
);
7776 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
7777 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
7778 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
7779 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
7780 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
7781 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
7782 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
7783 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
7784 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
7785 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
7786 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
7787 w0
[2] = __byte_perm (w0
[0], 0, selector
);
7793 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
7794 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
7795 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
7796 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
7797 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
7798 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
7799 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
7800 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
7801 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
7802 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
7803 w0
[3] = __byte_perm (w0
[0], 0, selector
);
7810 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
7811 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
7812 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
7813 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
7814 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
7815 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
7816 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
7817 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
7818 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7819 w1
[0] = __byte_perm (w0
[0], 0, selector
);
7827 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
7828 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
7829 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
7830 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
7831 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
7832 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
7833 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
7834 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
7835 w1
[1] = __byte_perm (w0
[0], 0, selector
);
7844 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
7845 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
7846 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
7847 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
7848 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
7849 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
7850 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
7851 w1
[2] = __byte_perm (w0
[0], 0, selector
);
7861 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
7862 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
7863 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
7864 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
7865 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
7866 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
7867 w1
[3] = __byte_perm (w0
[0], 0, selector
);
7878 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
7879 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
7880 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
7881 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
7882 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7883 w2
[0] = __byte_perm (w0
[0], 0, selector
);
7895 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
7896 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
7897 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
7898 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
7899 w2
[1] = __byte_perm (w0
[0], 0, selector
);
7912 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
7913 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
7914 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
7915 w2
[2] = __byte_perm (w0
[0], 0, selector
);
7929 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
7930 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
7931 w2
[3] = __byte_perm (w0
[0], 0, selector
);
7946 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7947 w3
[0] = __byte_perm (w0
[0], 0, selector
);
7963 w3
[1] = __byte_perm (w0
[0], 0, selector
);