2 * Author......: Jens Steube <jens.steube@gmail.com>
// hash_comp: three-way comparison of a candidate digest against a stored one.
//
// d1 - four-word digest to test.
// d2 - digest in __global memory, read through the DGST_R0..DGST_R3 indices
//      (those names are defined outside this listing).
//
// Words are compared pairwise, d1[3] against d2[DGST_R3] first and d1[0]
// against d2[DGST_R0] last. The first unequal pair decides the result:
// 1 when the d1 word is larger, -1 when it is smaller.
// NOTE(review): the return for four equal pairs is not visible in this
// listing; find_hash tests the result against 0, so presumably 0 - confirm.
6 static int hash_comp (const u32 d1
[4], __global u32
*d2
)
// Word 3 has the highest priority in the ordering.
8 if (d1
[3] > d2
[DGST_R3
]) return ( 1);
9 if (d1
[3] < d2
[DGST_R3
]) return (-1);
// Word 3 was equal: fall back to word 2.
10 if (d1
[2] > d2
[DGST_R2
]) return ( 1);
11 if (d1
[2] < d2
[DGST_R2
]) return (-1);
// Words 3 and 2 were equal: fall back to word 1.
12 if (d1
[1] > d2
[DGST_R1
]) return ( 1);
13 if (d1
[1] < d2
[DGST_R1
]) return (-1);
// Only word 0 is left to break the tie.
14 if (d1
[0] > d2
[DGST_R0
]) return ( 1);
15 if (d1
[0] < d2
[DGST_R0
]) return (-1);
// find_hash: looks up a four-word digest in a table of digest_t entries.
//
// digest      - digest to search for.
// digests_cnt - number of entries; used as the initial value of r.
// digests_buf - table in __global memory; each entry is compared through its
//               digest_buf member.
//
// Returns the index c of the entry for which hash_comp reports 0.
// NOTE(review): the lines that compute c from l and r, that move l after a
// non-zero comparison, and that produce the not-found result are missing
// from this listing. The halving of r on every pass looks like a binary
// search over a table sorted in hash_comp order - confirm against the full
// source before relying on that.
20 static int find_hash (const u32 digest
[4], const u32 digests_cnt
, __global digest_t
*digests_buf
)
// l starts at 0 and r at digests_cnt; r is halved each pass and the loop
// ends once r reaches 0.
22 for (u32 l
= 0, r
= digests_cnt
; r
; r
>>= 1)
// Compare the wanted digest with the candidate entry at index c.
28 const int cmp
= hash_comp (digest
, digests_buf
[c
].digest_buf
);
// Exact match: report the position of the entry.
37 if (cmp
== 0) return (c
);
// check_bitmap: tests a single bit of a bitmap stored in __global memory.
//
// bitmap       - array of 32-bit bitmap words.
// bitmap_mask  - mask applied to the shifted digest to form the word index.
// bitmap_shift - right shift applied to digest before masking.
// digest       - value being looked up.
//
// The word index is (digest >> bitmap_shift) & bitmap_mask and the bit
// position inside that word is digest & 0x1f (0..31).
// Returns the selected word masked with that bit: non-zero when the bit is
// set, 0 when it is clear.
// NOTE(review): the literal 1 is a signed int, so bit position 31 shifts
// into the sign bit; that is undefined in ISO C (OpenCL C defines it), and
// 1u would sidestep the question.
43 static u32
check_bitmap (__global u32
*bitmap
, const u32 bitmap_mask
, const u32 bitmap_shift
, const u32 digest
)
45 return (bitmap
[(digest
>> bitmap_shift
) & bitmap_mask
] & (1 << (digest
& 0x1f)));
// check: pre-filter that runs a digest through two sets of four bitmaps.
//
// digest              - digest words; elements [0]..[3] are read.
// bitmap_s1_a .. _d   - first bitmap set, queried with bitmap_shift1.
// bitmap_s2_a .. _d   - second bitmap set, queried with bitmap_shift2.
// bitmap_mask         - index mask shared by all eight lookups.
//
// Word i of the digest is looked up in the i-th bitmap of each set
// (a <-> [0], b <-> [1], c <-> [2], d <-> [3]). Returns 0 as soon as any
// check_bitmap call yields 0.
// NOTE(review): the value returned when all eight lookups pass is not
// visible in this listing (presumably non-zero) - confirm.
// NOTE(review): the parameter is declared as digest[2] although indices 2
// and 3 are accessed. The size in an array parameter is not enforced, so
// this works, but [4] would match the actual usage.
48 static u32
check (const u32 digest
[2], __global u32
*bitmap_s1_a
, __global u32
*bitmap_s1_b
, __global u32
*bitmap_s1_c
, __global u32
*bitmap_s1_d
, __global u32
*bitmap_s2_a
, __global u32
*bitmap_s2_b
, __global u32
*bitmap_s2_c
, __global u32
*bitmap_s2_d
, const u32 bitmap_mask
, const u32 bitmap_shift1
, const u32 bitmap_shift2
)
// First set: every digest word must hit its bitmap using bitmap_shift1.
50 if (check_bitmap (bitmap_s1_a
, bitmap_mask
, bitmap_shift1
, digest
[0]) == 0) return (0);
51 if (check_bitmap (bitmap_s1_b
, bitmap_mask
, bitmap_shift1
, digest
[1]) == 0) return (0);
52 if (check_bitmap (bitmap_s1_c
, bitmap_mask
, bitmap_shift1
, digest
[2]) == 0) return (0);
53 if (check_bitmap (bitmap_s1_d
, bitmap_mask
, bitmap_shift1
, digest
[3]) == 0) return (0);
// Second set: same words, different bitmaps, indexed with bitmap_shift2.
55 if (check_bitmap (bitmap_s2_a
, bitmap_mask
, bitmap_shift2
, digest
[0]) == 0) return (0);
56 if (check_bitmap (bitmap_s2_b
, bitmap_mask
, bitmap_shift2
, digest
[1]) == 0) return (0);
57 if (check_bitmap (bitmap_s2_c
, bitmap_mask
, bitmap_shift2
, digest
[2]) == 0) return (0);
58 if (check_bitmap (bitmap_s2_d
, bitmap_mask
, bitmap_shift2
, digest
[3]) == 0) return (0);
// mark_hash: records a hit for the hash at index hash_pos.
//
// plains_buf   - per-hash result records in __global memory.
// hashes_shown - per-hash flags in __global memory.
// hash_pos     - index of the matched hash in both arrays.
// gid          - value stored in the record's gidvid field.
// il_pos       - value stored in the record's il_pos field.
//
// Writes three __global locations and returns nothing.
63 static void mark_hash (__global plain_t
*plains_buf
, __global u32
*hashes_shown
, const int hash_pos
, const u32 gid
, const u32 il_pos
)
// Flag the hash as found.
65 hashes_shown
[hash_pos
] = 1;
// (gid * 1) + 0 evaluates to gid; the multiplier and addend are literal
// constants in this version of the code.
67 plains_buf
[hash_pos
].gidvid
= (gid
* 1) + 0;
68 plains_buf
[hash_pos
].il_pos
= il_pos
;
// truncate_block: clears bytes of the four-word block w beyond a byte count.
//
// w   - block of four 32-bit words, modified in place.
// len - byte count (presumably the switch operand; the switch header is not
//       visible in this listing - confirm).
//
// The visible cases cover the counts that end inside a word: case n masks
// word n / 4 so that only its low n % 4 bytes survive (0x000000FF keeps one
// byte, 0x0000FFFF two, 0x00FFFFFF three).
// NOTE(review): cases 0, 4, 8 and 12, and whatever each case does to the
// words after the masked one, fall on lines missing from this listing.
71 static void truncate_block (u32 w
[4], const u32 len
)
// Counts 1..3 end inside w[0].
80 case 1: w
[0] &= 0x000000FF;
85 case 2: w
[0] &= 0x0000FFFF;
90 case 3: w
[0] &= 0x00FFFFFF;
// Counts 5..7 end inside w[1].
99 case 5: w
[1] &= 0x000000FF;
103 case 6: w
[1] &= 0x0000FFFF;
107 case 7: w
[1] &= 0x00FFFFFF;
// Counts 9..11 end inside w[2].
114 case 9: w
[2] &= 0x000000FF;
117 case 10: w
[2] &= 0x0000FFFF;
120 case 11: w
[2] &= 0x00FFFFFF;
// Counts 13..15 end inside w[3].
125 case 13: w
[3] &= 0x000000FF;
127 case 14: w
[3] &= 0x0000FFFF;
129 case 15: w
[3] &= 0x00FFFFFF;
// make_unicode: widens the 16 bytes of in to 16-bit units with a zero high
// byte, producing 32 bytes split across out1 and out2.
//
// in   - four source words.
// out1 - receives the expansion of in[0] and in[1].
// out2 - receives the expansion of in[2] and in[3].
//
// Each source word yields two output words: the even output word carries
// source bytes 0 and 1, the odd one source bytes 2 and 3, each byte placed
// in bits 0-7 and 16-23 with bits 8-15 and 24-31 left zero.
// Two alternative bodies are visible: one built on __byte_perm and one on
// shifts and masks.
// NOTE(review): the preprocessor lines choosing between them are missing
// from this listing, and the two are presumably equivalent - confirm
// against the __byte_perm selector semantics.
134 static void make_unicode (const u32 in
[4], u32 out1
[4], u32 out2
[4])
// Variant 1: byte permutation intrinsic, second operand 0.
137 out2
[3] = __byte_perm (in
[3], 0, 0x7372);
138 out2
[2] = __byte_perm (in
[3], 0, 0x7170);
139 out2
[1] = __byte_perm (in
[2], 0, 0x7372);
140 out2
[0] = __byte_perm (in
[2], 0, 0x7170);
141 out1
[3] = __byte_perm (in
[1], 0, 0x7372);
142 out1
[2] = __byte_perm (in
[1], 0, 0x7170);
143 out1
[1] = __byte_perm (in
[0], 0, 0x7372);
144 out1
[0] = __byte_perm (in
[0], 0, 0x7170);
// Variant 2: plain shifts and masks. Odd output words take source bytes 3
// and 2 (>> 8 and >> 16), even output words take source bytes 1 and 0
// (<< 8 and >> 0).
148 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
149 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
150 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
151 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
152 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
153 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
154 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
155 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
// undo_unicode: packs the low byte of every 16-bit unit of in1 and in2 back
// into 16 contiguous bytes in out.
//
// in1 - first four words of widened data; feeds out[0] and out[1].
// in2 - last four words of widened data; feeds out[2] and out[3].
// out - four packed result words.
//
// Each output word is assembled from two input words: bytes 0 and 2 of the
// first become output bytes 0 and 1, bytes 0 and 2 of the second become
// output bytes 2 and 3. In the shift-and-mask body, input bytes 1 and 3 are
// masked away and do not reach the output.
// Two alternative bodies are visible: one built on __byte_perm and one on
// shifts and masks.
// NOTE(review): the preprocessor lines choosing between them are missing
// from this listing, and the two are presumably equivalent - confirm
// against the __byte_perm selector semantics.
159 static void undo_unicode (const u32 in1
[4], const u32 in2
[4], u32 out
[4])
// Variant 1: byte permutation intrinsic over a pair of input words.
162 out
[0] = __byte_perm (in1
[0], in1
[1], 0x6420);
163 out
[1] = __byte_perm (in1
[2], in1
[3], 0x6420);
164 out
[2] = __byte_perm (in2
[0], in2
[1], 0x6420);
165 out
[3] = __byte_perm (in2
[2], in2
[3], 0x6420);
// Variant 2: plain shifts and masks. The first word of each pair fills
// output bits 0-15, the second word fills bits 16-31.
169 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
170 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
171 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
172 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
173 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
174 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
175 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
176 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
180 // before: append_0x01_1
181 static void append_0x01_1x4 (u32 w0
[4], const u32 offset
)
190 w0
[0] = w0
[0] | 0x0100;
194 w0
[0] = w0
[0] | 0x010000;
198 w0
[0] = w0
[0] | 0x01000000;
206 w0
[1] = w0
[1] | 0x0100;
210 w0
[1] = w0
[1] | 0x010000;
214 w0
[1] = w0
[1] | 0x01000000;
222 w0
[2] = w0
[2] | 0x0100;
226 w0
[2] = w0
[2] | 0x010000;
230 w0
[2] = w0
[2] | 0x01000000;
238 w0
[3] = w0
[3] | 0x0100;
242 w0
[3] = w0
[3] | 0x010000;
246 w0
[3] = w0
[3] | 0x01000000;
251 // before: append_0x01_2
252 static void append_0x01_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
261 w0
[0] = w0
[0] | 0x0100;
265 w0
[0] = w0
[0] | 0x010000;
269 w0
[0] = w0
[0] | 0x01000000;
277 w0
[1] = w0
[1] | 0x0100;
281 w0
[1] = w0
[1] | 0x010000;
285 w0
[1] = w0
[1] | 0x01000000;
293 w0
[2] = w0
[2] | 0x0100;
297 w0
[2] = w0
[2] | 0x010000;
301 w0
[2] = w0
[2] | 0x01000000;
309 w0
[3] = w0
[3] | 0x0100;
313 w0
[3] = w0
[3] | 0x010000;
317 w0
[3] = w0
[3] | 0x01000000;
325 w1
[0] = w1
[0] | 0x0100;
329 w1
[0] = w1
[0] | 0x010000;
333 w1
[0] = w1
[0] | 0x01000000;
341 w1
[1] = w1
[1] | 0x0100;
345 w1
[1] = w1
[1] | 0x010000;
349 w1
[1] = w1
[1] | 0x01000000;
357 w1
[2] = w1
[2] | 0x0100;
361 w1
[2] = w1
[2] | 0x010000;
365 w1
[2] = w1
[2] | 0x01000000;
373 w1
[3] = w1
[3] | 0x0100;
377 w1
[3] = w1
[3] | 0x010000;
381 w1
[3] = w1
[3] | 0x01000000;
386 // before: append_0x01_3
387 static void append_0x01_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
396 w0
[0] = w0
[0] | 0x0100;
400 w0
[0] = w0
[0] | 0x010000;
404 w0
[0] = w0
[0] | 0x01000000;
412 w0
[1] = w0
[1] | 0x0100;
416 w0
[1] = w0
[1] | 0x010000;
420 w0
[1] = w0
[1] | 0x01000000;
428 w0
[2] = w0
[2] | 0x0100;
432 w0
[2] = w0
[2] | 0x010000;
436 w0
[2] = w0
[2] | 0x01000000;
444 w0
[3] = w0
[3] | 0x0100;
448 w0
[3] = w0
[3] | 0x010000;
452 w0
[3] = w0
[3] | 0x01000000;
460 w1
[0] = w1
[0] | 0x0100;
464 w1
[0] = w1
[0] | 0x010000;
468 w1
[0] = w1
[0] | 0x01000000;
476 w1
[1] = w1
[1] | 0x0100;
480 w1
[1] = w1
[1] | 0x010000;
484 w1
[1] = w1
[1] | 0x01000000;
492 w1
[2] = w1
[2] | 0x0100;
496 w1
[2] = w1
[2] | 0x010000;
500 w1
[2] = w1
[2] | 0x01000000;
508 w1
[3] = w1
[3] | 0x0100;
512 w1
[3] = w1
[3] | 0x010000;
516 w1
[3] = w1
[3] | 0x01000000;
524 w2
[0] = w2
[0] | 0x0100;
528 w2
[0] = w2
[0] | 0x010000;
532 w2
[0] = w2
[0] | 0x01000000;
540 w2
[1] = w2
[1] | 0x0100;
544 w2
[1] = w2
[1] | 0x010000;
548 w2
[1] = w2
[1] | 0x01000000;
556 w2
[2] = w2
[2] | 0x0100;
560 w2
[2] = w2
[2] | 0x010000;
564 w2
[2] = w2
[2] | 0x01000000;
572 w2
[3] = w2
[3] | 0x0100;
576 w2
[3] = w2
[3] | 0x010000;
580 w2
[3] = w2
[3] | 0x01000000;
585 // before: append_0x01_4
586 static void append_0x01_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
595 w0
[0] = w0
[0] | 0x0100;
599 w0
[0] = w0
[0] | 0x010000;
603 w0
[0] = w0
[0] | 0x01000000;
611 w0
[1] = w0
[1] | 0x0100;
615 w0
[1] = w0
[1] | 0x010000;
619 w0
[1] = w0
[1] | 0x01000000;
627 w0
[2] = w0
[2] | 0x0100;
631 w0
[2] = w0
[2] | 0x010000;
635 w0
[2] = w0
[2] | 0x01000000;
643 w0
[3] = w0
[3] | 0x0100;
647 w0
[3] = w0
[3] | 0x010000;
651 w0
[3] = w0
[3] | 0x01000000;
659 w1
[0] = w1
[0] | 0x0100;
663 w1
[0] = w1
[0] | 0x010000;
667 w1
[0] = w1
[0] | 0x01000000;
675 w1
[1] = w1
[1] | 0x0100;
679 w1
[1] = w1
[1] | 0x010000;
683 w1
[1] = w1
[1] | 0x01000000;
691 w1
[2] = w1
[2] | 0x0100;
695 w1
[2] = w1
[2] | 0x010000;
699 w1
[2] = w1
[2] | 0x01000000;
707 w1
[3] = w1
[3] | 0x0100;
711 w1
[3] = w1
[3] | 0x010000;
715 w1
[3] = w1
[3] | 0x01000000;
723 w2
[0] = w2
[0] | 0x0100;
727 w2
[0] = w2
[0] | 0x010000;
731 w2
[0] = w2
[0] | 0x01000000;
739 w2
[1] = w2
[1] | 0x0100;
743 w2
[1] = w2
[1] | 0x010000;
747 w2
[1] = w2
[1] | 0x01000000;
755 w2
[2] = w2
[2] | 0x0100;
759 w2
[2] = w2
[2] | 0x010000;
763 w2
[2] = w2
[2] | 0x01000000;
771 w2
[3] = w2
[3] | 0x0100;
775 w2
[3] = w2
[3] | 0x010000;
779 w2
[3] = w2
[3] | 0x01000000;
787 w3
[0] = w3
[0] | 0x0100;
791 w3
[0] = w3
[0] | 0x010000;
795 w3
[0] = w3
[0] | 0x01000000;
803 w3
[1] = w3
[1] | 0x0100;
807 w3
[1] = w3
[1] | 0x010000;
811 w3
[1] = w3
[1] | 0x01000000;
819 w3
[2] = w3
[2] | 0x0100;
823 w3
[2] = w3
[2] | 0x010000;
827 w3
[2] = w3
[2] | 0x01000000;
835 w3
[3] = w3
[3] | 0x0100;
839 w3
[3] = w3
[3] | 0x010000;
843 w3
[3] = w3
[3] | 0x01000000;
848 // before: append_0x01_8
849 static void append_0x01_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
858 w0
[0] = w0
[0] | 0x0100;
862 w0
[0] = w0
[0] | 0x010000;
866 w0
[0] = w0
[0] | 0x01000000;
874 w0
[1] = w0
[1] | 0x0100;
878 w0
[1] = w0
[1] | 0x010000;
882 w0
[1] = w0
[1] | 0x01000000;
890 w0
[2] = w0
[2] | 0x0100;
894 w0
[2] = w0
[2] | 0x010000;
898 w0
[2] = w0
[2] | 0x01000000;
906 w0
[3] = w0
[3] | 0x0100;
910 w0
[3] = w0
[3] | 0x010000;
914 w0
[3] = w0
[3] | 0x01000000;
922 w1
[0] = w1
[0] | 0x0100;
926 w1
[0] = w1
[0] | 0x010000;
930 w1
[0] = w1
[0] | 0x01000000;
938 w1
[1] = w1
[1] | 0x0100;
942 w1
[1] = w1
[1] | 0x010000;
946 w1
[1] = w1
[1] | 0x01000000;
954 w1
[2] = w1
[2] | 0x0100;
958 w1
[2] = w1
[2] | 0x010000;
962 w1
[2] = w1
[2] | 0x01000000;
970 w1
[3] = w1
[3] | 0x0100;
974 w1
[3] = w1
[3] | 0x010000;
978 w1
[3] = w1
[3] | 0x01000000;
986 w2
[0] = w2
[0] | 0x0100;
990 w2
[0] = w2
[0] | 0x010000;
994 w2
[0] = w2
[0] | 0x01000000;
1002 w2
[1] = w2
[1] | 0x0100;
1006 w2
[1] = w2
[1] | 0x010000;
1010 w2
[1] = w2
[1] | 0x01000000;
1018 w2
[2] = w2
[2] | 0x0100;
1022 w2
[2] = w2
[2] | 0x010000;
1026 w2
[2] = w2
[2] | 0x01000000;
1034 w2
[3] = w2
[3] | 0x0100;
1038 w2
[3] = w2
[3] | 0x010000;
1042 w2
[3] = w2
[3] | 0x01000000;
1050 w3
[0] = w3
[0] | 0x0100;
1054 w3
[0] = w3
[0] | 0x010000;
1058 w3
[0] = w3
[0] | 0x01000000;
1066 w3
[1] = w3
[1] | 0x0100;
1070 w3
[1] = w3
[1] | 0x010000;
1074 w3
[1] = w3
[1] | 0x01000000;
1082 w3
[2] = w3
[2] | 0x0100;
1086 w3
[2] = w3
[2] | 0x010000;
1090 w3
[2] = w3
[2] | 0x01000000;
1098 w3
[3] = w3
[3] | 0x0100;
1102 w3
[3] = w3
[3] | 0x010000;
1106 w3
[3] = w3
[3] | 0x01000000;
1114 w4
[0] = w4
[0] | 0x0100;
1118 w4
[0] = w4
[0] | 0x010000;
1122 w4
[0] = w4
[0] | 0x01000000;
1130 w4
[1] = w4
[1] | 0x0100;
1134 w4
[1] = w4
[1] | 0x010000;
1138 w4
[1] = w4
[1] | 0x01000000;
1146 w4
[2] = w4
[2] | 0x0100;
1150 w4
[2] = w4
[2] | 0x010000;
1154 w4
[2] = w4
[2] | 0x01000000;
1162 w4
[3] = w4
[3] | 0x0100;
1166 w4
[3] = w4
[3] | 0x010000;
1170 w4
[3] = w4
[3] | 0x01000000;
1178 w5
[0] = w5
[0] | 0x0100;
1182 w5
[0] = w5
[0] | 0x010000;
1186 w5
[0] = w5
[0] | 0x01000000;
1194 w5
[1] = w5
[1] | 0x0100;
1198 w5
[1] = w5
[1] | 0x010000;
1202 w5
[1] = w5
[1] | 0x01000000;
1210 w5
[2] = w5
[2] | 0x0100;
1214 w5
[2] = w5
[2] | 0x010000;
1218 w5
[2] = w5
[2] | 0x01000000;
1226 w5
[3] = w5
[3] | 0x0100;
1230 w5
[3] = w5
[3] | 0x010000;
1234 w5
[3] = w5
[3] | 0x01000000;
1242 w6
[0] = w6
[0] | 0x0100;
1246 w6
[0] = w6
[0] | 0x010000;
1250 w6
[0] = w6
[0] | 0x01000000;
1258 w6
[1] = w6
[1] | 0x0100;
1262 w6
[1] = w6
[1] | 0x010000;
1266 w6
[1] = w6
[1] | 0x01000000;
1274 w6
[2] = w6
[2] | 0x0100;
1278 w6
[2] = w6
[2] | 0x010000;
1282 w6
[2] = w6
[2] | 0x01000000;
1290 w6
[3] = w6
[3] | 0x0100;
1294 w6
[3] = w6
[3] | 0x010000;
1298 w6
[3] = w6
[3] | 0x01000000;
1306 w7
[0] = w7
[0] | 0x0100;
1310 w7
[0] = w7
[0] | 0x010000;
1314 w7
[0] = w7
[0] | 0x01000000;
1322 w7
[1] = w7
[1] | 0x0100;
1326 w7
[1] = w7
[1] | 0x010000;
1330 w7
[1] = w7
[1] | 0x01000000;
1338 w7
[2] = w7
[2] | 0x0100;
1342 w7
[2] = w7
[2] | 0x010000;
1346 w7
[2] = w7
[2] | 0x01000000;
1354 w7
[3] = w7
[3] | 0x0100;
1358 w7
[3] = w7
[3] | 0x010000;
1362 w7
[3] = w7
[3] | 0x01000000;
1367 // before: append_0x02_1
1368 static void append_0x02_1x4 (u32 w0
[4], const u32 offset
)
1377 w0
[0] = w0
[0] | 0x0200;
1381 w0
[0] = w0
[0] | 0x020000;
1385 w0
[0] = w0
[0] | 0x02000000;
1393 w0
[1] = w0
[1] | 0x0200;
1397 w0
[1] = w0
[1] | 0x020000;
1401 w0
[1] = w0
[1] | 0x02000000;
1409 w0
[2] = w0
[2] | 0x0200;
1413 w0
[2] = w0
[2] | 0x020000;
1417 w0
[2] = w0
[2] | 0x02000000;
1425 w0
[3] = w0
[3] | 0x0200;
1429 w0
[3] = w0
[3] | 0x020000;
1433 w0
[3] = w0
[3] | 0x02000000;
1438 // before: append_0x02_2
1439 static void append_0x02_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
1448 w0
[0] = w0
[0] | 0x0200;
1452 w0
[0] = w0
[0] | 0x020000;
1456 w0
[0] = w0
[0] | 0x02000000;
1464 w0
[1] = w0
[1] | 0x0200;
1468 w0
[1] = w0
[1] | 0x020000;
1472 w0
[1] = w0
[1] | 0x02000000;
1480 w0
[2] = w0
[2] | 0x0200;
1484 w0
[2] = w0
[2] | 0x020000;
1488 w0
[2] = w0
[2] | 0x02000000;
1496 w0
[3] = w0
[3] | 0x0200;
1500 w0
[3] = w0
[3] | 0x020000;
1504 w0
[3] = w0
[3] | 0x02000000;
1512 w1
[0] = w1
[0] | 0x0200;
1516 w1
[0] = w1
[0] | 0x020000;
1520 w1
[0] = w1
[0] | 0x02000000;
1528 w1
[1] = w1
[1] | 0x0200;
1532 w1
[1] = w1
[1] | 0x020000;
1536 w1
[1] = w1
[1] | 0x02000000;
1544 w1
[2] = w1
[2] | 0x0200;
1548 w1
[2] = w1
[2] | 0x020000;
1552 w1
[2] = w1
[2] | 0x02000000;
1560 w1
[3] = w1
[3] | 0x0200;
1564 w1
[3] = w1
[3] | 0x020000;
1568 w1
[3] = w1
[3] | 0x02000000;
1573 // before: append_0x02_3
1574 static void append_0x02_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
1583 w0
[0] = w0
[0] | 0x0200;
1587 w0
[0] = w0
[0] | 0x020000;
1591 w0
[0] = w0
[0] | 0x02000000;
1599 w0
[1] = w0
[1] | 0x0200;
1603 w0
[1] = w0
[1] | 0x020000;
1607 w0
[1] = w0
[1] | 0x02000000;
1615 w0
[2] = w0
[2] | 0x0200;
1619 w0
[2] = w0
[2] | 0x020000;
1623 w0
[2] = w0
[2] | 0x02000000;
1631 w0
[3] = w0
[3] | 0x0200;
1635 w0
[3] = w0
[3] | 0x020000;
1639 w0
[3] = w0
[3] | 0x02000000;
1647 w1
[0] = w1
[0] | 0x0200;
1651 w1
[0] = w1
[0] | 0x020000;
1655 w1
[0] = w1
[0] | 0x02000000;
1663 w1
[1] = w1
[1] | 0x0200;
1667 w1
[1] = w1
[1] | 0x020000;
1671 w1
[1] = w1
[1] | 0x02000000;
1679 w1
[2] = w1
[2] | 0x0200;
1683 w1
[2] = w1
[2] | 0x020000;
1687 w1
[2] = w1
[2] | 0x02000000;
1695 w1
[3] = w1
[3] | 0x0200;
1699 w1
[3] = w1
[3] | 0x020000;
1703 w1
[3] = w1
[3] | 0x02000000;
1711 w2
[0] = w2
[0] | 0x0200;
1715 w2
[0] = w2
[0] | 0x020000;
1719 w2
[0] = w2
[0] | 0x02000000;
1727 w2
[1] = w2
[1] | 0x0200;
1731 w2
[1] = w2
[1] | 0x020000;
1735 w2
[1] = w2
[1] | 0x02000000;
1743 w2
[2] = w2
[2] | 0x0200;
1747 w2
[2] = w2
[2] | 0x020000;
1751 w2
[2] = w2
[2] | 0x02000000;
1759 w2
[3] = w2
[3] | 0x0200;
1763 w2
[3] = w2
[3] | 0x020000;
1767 w2
[3] = w2
[3] | 0x02000000;
1772 // before: append_0x02_4
1773 static void append_0x02_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
1782 w0
[0] = w0
[0] | 0x0200;
1786 w0
[0] = w0
[0] | 0x020000;
1790 w0
[0] = w0
[0] | 0x02000000;
1798 w0
[1] = w0
[1] | 0x0200;
1802 w0
[1] = w0
[1] | 0x020000;
1806 w0
[1] = w0
[1] | 0x02000000;
1814 w0
[2] = w0
[2] | 0x0200;
1818 w0
[2] = w0
[2] | 0x020000;
1822 w0
[2] = w0
[2] | 0x02000000;
1830 w0
[3] = w0
[3] | 0x0200;
1834 w0
[3] = w0
[3] | 0x020000;
1838 w0
[3] = w0
[3] | 0x02000000;
1846 w1
[0] = w1
[0] | 0x0200;
1850 w1
[0] = w1
[0] | 0x020000;
1854 w1
[0] = w1
[0] | 0x02000000;
1862 w1
[1] = w1
[1] | 0x0200;
1866 w1
[1] = w1
[1] | 0x020000;
1870 w1
[1] = w1
[1] | 0x02000000;
1878 w1
[2] = w1
[2] | 0x0200;
1882 w1
[2] = w1
[2] | 0x020000;
1886 w1
[2] = w1
[2] | 0x02000000;
1894 w1
[3] = w1
[3] | 0x0200;
1898 w1
[3] = w1
[3] | 0x020000;
1902 w1
[3] = w1
[3] | 0x02000000;
1910 w2
[0] = w2
[0] | 0x0200;
1914 w2
[0] = w2
[0] | 0x020000;
1918 w2
[0] = w2
[0] | 0x02000000;
1926 w2
[1] = w2
[1] | 0x0200;
1930 w2
[1] = w2
[1] | 0x020000;
1934 w2
[1] = w2
[1] | 0x02000000;
1942 w2
[2] = w2
[2] | 0x0200;
1946 w2
[2] = w2
[2] | 0x020000;
1950 w2
[2] = w2
[2] | 0x02000000;
1958 w2
[3] = w2
[3] | 0x0200;
1962 w2
[3] = w2
[3] | 0x020000;
1966 w2
[3] = w2
[3] | 0x02000000;
1974 w3
[0] = w3
[0] | 0x0200;
1978 w3
[0] = w3
[0] | 0x020000;
1982 w3
[0] = w3
[0] | 0x02000000;
1990 w3
[1] = w3
[1] | 0x0200;
1994 w3
[1] = w3
[1] | 0x020000;
1998 w3
[1] = w3
[1] | 0x02000000;
2006 w3
[2] = w3
[2] | 0x0200;
2010 w3
[2] = w3
[2] | 0x020000;
2014 w3
[2] = w3
[2] | 0x02000000;
2022 w3
[3] = w3
[3] | 0x0200;
2026 w3
[3] = w3
[3] | 0x020000;
2030 w3
[3] = w3
[3] | 0x02000000;
2035 // before: append_0x02_8
2036 static void append_0x02_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
2045 w0
[0] = w0
[0] | 0x0200;
2049 w0
[0] = w0
[0] | 0x020000;
2053 w0
[0] = w0
[0] | 0x02000000;
2061 w0
[1] = w0
[1] | 0x0200;
2065 w0
[1] = w0
[1] | 0x020000;
2069 w0
[1] = w0
[1] | 0x02000000;
2077 w0
[2] = w0
[2] | 0x0200;
2081 w0
[2] = w0
[2] | 0x020000;
2085 w0
[2] = w0
[2] | 0x02000000;
2093 w0
[3] = w0
[3] | 0x0200;
2097 w0
[3] = w0
[3] | 0x020000;
2101 w0
[3] = w0
[3] | 0x02000000;
2109 w1
[0] = w1
[0] | 0x0200;
2113 w1
[0] = w1
[0] | 0x020000;
2117 w1
[0] = w1
[0] | 0x02000000;
2125 w1
[1] = w1
[1] | 0x0200;
2129 w1
[1] = w1
[1] | 0x020000;
2133 w1
[1] = w1
[1] | 0x02000000;
2141 w1
[2] = w1
[2] | 0x0200;
2145 w1
[2] = w1
[2] | 0x020000;
2149 w1
[2] = w1
[2] | 0x02000000;
2157 w1
[3] = w1
[3] | 0x0200;
2161 w1
[3] = w1
[3] | 0x020000;
2165 w1
[3] = w1
[3] | 0x02000000;
2173 w2
[0] = w2
[0] | 0x0200;
2177 w2
[0] = w2
[0] | 0x020000;
2181 w2
[0] = w2
[0] | 0x02000000;
2189 w2
[1] = w2
[1] | 0x0200;
2193 w2
[1] = w2
[1] | 0x020000;
2197 w2
[1] = w2
[1] | 0x02000000;
2205 w2
[2] = w2
[2] | 0x0200;
2209 w2
[2] = w2
[2] | 0x020000;
2213 w2
[2] = w2
[2] | 0x02000000;
2221 w2
[3] = w2
[3] | 0x0200;
2225 w2
[3] = w2
[3] | 0x020000;
2229 w2
[3] = w2
[3] | 0x02000000;
2237 w3
[0] = w3
[0] | 0x0200;
2241 w3
[0] = w3
[0] | 0x020000;
2245 w3
[0] = w3
[0] | 0x02000000;
2253 w3
[1] = w3
[1] | 0x0200;
2257 w3
[1] = w3
[1] | 0x020000;
2261 w3
[1] = w3
[1] | 0x02000000;
2269 w3
[2] = w3
[2] | 0x0200;
2273 w3
[2] = w3
[2] | 0x020000;
2277 w3
[2] = w3
[2] | 0x02000000;
2285 w3
[3] = w3
[3] | 0x0200;
2289 w3
[3] = w3
[3] | 0x020000;
2293 w3
[3] = w3
[3] | 0x02000000;
2301 w4
[0] = w4
[0] | 0x0200;
2305 w4
[0] = w4
[0] | 0x020000;
2309 w4
[0] = w4
[0] | 0x02000000;
2317 w4
[1] = w4
[1] | 0x0200;
2321 w4
[1] = w4
[1] | 0x020000;
2325 w4
[1] = w4
[1] | 0x02000000;
2333 w4
[2] = w4
[2] | 0x0200;
2337 w4
[2] = w4
[2] | 0x020000;
2341 w4
[2] = w4
[2] | 0x02000000;
2349 w4
[3] = w4
[3] | 0x0200;
2353 w4
[3] = w4
[3] | 0x020000;
2357 w4
[3] = w4
[3] | 0x02000000;
2365 w5
[0] = w5
[0] | 0x0200;
2369 w5
[0] = w5
[0] | 0x020000;
2373 w5
[0] = w5
[0] | 0x02000000;
2381 w5
[1] = w5
[1] | 0x0200;
2385 w5
[1] = w5
[1] | 0x020000;
2389 w5
[1] = w5
[1] | 0x02000000;
2397 w5
[2] = w5
[2] | 0x0200;
2401 w5
[2] = w5
[2] | 0x020000;
2405 w5
[2] = w5
[2] | 0x02000000;
2413 w5
[3] = w5
[3] | 0x0200;
2417 w5
[3] = w5
[3] | 0x020000;
2421 w5
[3] = w5
[3] | 0x02000000;
2429 w6
[0] = w6
[0] | 0x0200;
2433 w6
[0] = w6
[0] | 0x020000;
2437 w6
[0] = w6
[0] | 0x02000000;
2445 w6
[1] = w6
[1] | 0x0200;
2449 w6
[1] = w6
[1] | 0x020000;
2453 w6
[1] = w6
[1] | 0x02000000;
2461 w6
[2] = w6
[2] | 0x0200;
2465 w6
[2] = w6
[2] | 0x020000;
2469 w6
[2] = w6
[2] | 0x02000000;
2477 w6
[3] = w6
[3] | 0x0200;
2481 w6
[3] = w6
[3] | 0x020000;
2485 w6
[3] = w6
[3] | 0x02000000;
2493 w7
[0] = w7
[0] | 0x0200;
2497 w7
[0] = w7
[0] | 0x020000;
2501 w7
[0] = w7
[0] | 0x02000000;
2509 w7
[1] = w7
[1] | 0x0200;
2513 w7
[1] = w7
[1] | 0x020000;
2517 w7
[1] = w7
[1] | 0x02000000;
2525 w7
[2] = w7
[2] | 0x0200;
2529 w7
[2] = w7
[2] | 0x020000;
2533 w7
[2] = w7
[2] | 0x02000000;
2541 w7
[3] = w7
[3] | 0x0200;
2545 w7
[3] = w7
[3] | 0x020000;
2549 w7
[3] = w7
[3] | 0x02000000;
2554 // before: append_0x80_1
2555 static void append_0x80_1x4 (u32 w0
[4], const u32 offset
)
2564 w0
[0] = w0
[0] | 0x8000;
2568 w0
[0] = w0
[0] | 0x800000;
2572 w0
[0] = w0
[0] | 0x80000000;
2580 w0
[1] = w0
[1] | 0x8000;
2584 w0
[1] = w0
[1] | 0x800000;
2588 w0
[1] = w0
[1] | 0x80000000;
2596 w0
[2] = w0
[2] | 0x8000;
2600 w0
[2] = w0
[2] | 0x800000;
2604 w0
[2] = w0
[2] | 0x80000000;
2612 w0
[3] = w0
[3] | 0x8000;
2616 w0
[3] = w0
[3] | 0x800000;
2620 w0
[3] = w0
[3] | 0x80000000;
2625 // before: append_0x80_2
2626 static void append_0x80_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
2635 w0
[0] = w0
[0] | 0x8000;
2639 w0
[0] = w0
[0] | 0x800000;
2643 w0
[0] = w0
[0] | 0x80000000;
2651 w0
[1] = w0
[1] | 0x8000;
2655 w0
[1] = w0
[1] | 0x800000;
2659 w0
[1] = w0
[1] | 0x80000000;
2667 w0
[2] = w0
[2] | 0x8000;
2671 w0
[2] = w0
[2] | 0x800000;
2675 w0
[2] = w0
[2] | 0x80000000;
2683 w0
[3] = w0
[3] | 0x8000;
2687 w0
[3] = w0
[3] | 0x800000;
2691 w0
[3] = w0
[3] | 0x80000000;
2699 w1
[0] = w1
[0] | 0x8000;
2703 w1
[0] = w1
[0] | 0x800000;
2707 w1
[0] = w1
[0] | 0x80000000;
2715 w1
[1] = w1
[1] | 0x8000;
2719 w1
[1] = w1
[1] | 0x800000;
2723 w1
[1] = w1
[1] | 0x80000000;
2731 w1
[2] = w1
[2] | 0x8000;
2735 w1
[2] = w1
[2] | 0x800000;
2739 w1
[2] = w1
[2] | 0x80000000;
2747 w1
[3] = w1
[3] | 0x8000;
2751 w1
[3] = w1
[3] | 0x800000;
2755 w1
[3] = w1
[3] | 0x80000000;
2760 // before: append_0x80_3
2761 static void append_0x80_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
2770 w0
[0] = w0
[0] | 0x8000;
2774 w0
[0] = w0
[0] | 0x800000;
2778 w0
[0] = w0
[0] | 0x80000000;
2786 w0
[1] = w0
[1] | 0x8000;
2790 w0
[1] = w0
[1] | 0x800000;
2794 w0
[1] = w0
[1] | 0x80000000;
2802 w0
[2] = w0
[2] | 0x8000;
2806 w0
[2] = w0
[2] | 0x800000;
2810 w0
[2] = w0
[2] | 0x80000000;
2818 w0
[3] = w0
[3] | 0x8000;
2822 w0
[3] = w0
[3] | 0x800000;
2826 w0
[3] = w0
[3] | 0x80000000;
2834 w1
[0] = w1
[0] | 0x8000;
2838 w1
[0] = w1
[0] | 0x800000;
2842 w1
[0] = w1
[0] | 0x80000000;
2850 w1
[1] = w1
[1] | 0x8000;
2854 w1
[1] = w1
[1] | 0x800000;
2858 w1
[1] = w1
[1] | 0x80000000;
2866 w1
[2] = w1
[2] | 0x8000;
2870 w1
[2] = w1
[2] | 0x800000;
2874 w1
[2] = w1
[2] | 0x80000000;
2882 w1
[3] = w1
[3] | 0x8000;
2886 w1
[3] = w1
[3] | 0x800000;
2890 w1
[3] = w1
[3] | 0x80000000;
2898 w2
[0] = w2
[0] | 0x8000;
2902 w2
[0] = w2
[0] | 0x800000;
2906 w2
[0] = w2
[0] | 0x80000000;
2914 w2
[1] = w2
[1] | 0x8000;
2918 w2
[1] = w2
[1] | 0x800000;
2922 w2
[1] = w2
[1] | 0x80000000;
2930 w2
[2] = w2
[2] | 0x8000;
2934 w2
[2] = w2
[2] | 0x800000;
2938 w2
[2] = w2
[2] | 0x80000000;
2946 w2
[3] = w2
[3] | 0x8000;
2950 w2
[3] = w2
[3] | 0x800000;
2954 w2
[3] = w2
[3] | 0x80000000;
2959 // before: append_0x80_4
2960 static void append_0x80_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
2969 w0
[0] = w0
[0] | 0x8000;
2973 w0
[0] = w0
[0] | 0x800000;
2977 w0
[0] = w0
[0] | 0x80000000;
2985 w0
[1] = w0
[1] | 0x8000;
2989 w0
[1] = w0
[1] | 0x800000;
2993 w0
[1] = w0
[1] | 0x80000000;
3001 w0
[2] = w0
[2] | 0x8000;
3005 w0
[2] = w0
[2] | 0x800000;
3009 w0
[2] = w0
[2] | 0x80000000;
3017 w0
[3] = w0
[3] | 0x8000;
3021 w0
[3] = w0
[3] | 0x800000;
3025 w0
[3] = w0
[3] | 0x80000000;
3033 w1
[0] = w1
[0] | 0x8000;
3037 w1
[0] = w1
[0] | 0x800000;
3041 w1
[0] = w1
[0] | 0x80000000;
3049 w1
[1] = w1
[1] | 0x8000;
3053 w1
[1] = w1
[1] | 0x800000;
3057 w1
[1] = w1
[1] | 0x80000000;
3065 w1
[2] = w1
[2] | 0x8000;
3069 w1
[2] = w1
[2] | 0x800000;
3073 w1
[2] = w1
[2] | 0x80000000;
3081 w1
[3] = w1
[3] | 0x8000;
3085 w1
[3] = w1
[3] | 0x800000;
3089 w1
[3] = w1
[3] | 0x80000000;
3097 w2
[0] = w2
[0] | 0x8000;
3101 w2
[0] = w2
[0] | 0x800000;
3105 w2
[0] = w2
[0] | 0x80000000;
3113 w2
[1] = w2
[1] | 0x8000;
3117 w2
[1] = w2
[1] | 0x800000;
3121 w2
[1] = w2
[1] | 0x80000000;
3129 w2
[2] = w2
[2] | 0x8000;
3133 w2
[2] = w2
[2] | 0x800000;
3137 w2
[2] = w2
[2] | 0x80000000;
3145 w2
[3] = w2
[3] | 0x8000;
3149 w2
[3] = w2
[3] | 0x800000;
3153 w2
[3] = w2
[3] | 0x80000000;
3161 w3
[0] = w3
[0] | 0x8000;
3165 w3
[0] = w3
[0] | 0x800000;
3169 w3
[0] = w3
[0] | 0x80000000;
3177 w3
[1] = w3
[1] | 0x8000;
3181 w3
[1] = w3
[1] | 0x800000;
3185 w3
[1] = w3
[1] | 0x80000000;
3193 w3
[2] = w3
[2] | 0x8000;
3197 w3
[2] = w3
[2] | 0x800000;
3201 w3
[2] = w3
[2] | 0x80000000;
3209 w3
[3] = w3
[3] | 0x8000;
3213 w3
[3] = w3
[3] | 0x800000;
3217 w3
[3] = w3
[3] | 0x80000000;
3222 // before: append_0x80_8
3223 static void append_0x80_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
3232 w0
[0] = w0
[0] | 0x8000;
3236 w0
[0] = w0
[0] | 0x800000;
3240 w0
[0] = w0
[0] | 0x80000000;
3248 w0
[1] = w0
[1] | 0x8000;
3252 w0
[1] = w0
[1] | 0x800000;
3256 w0
[1] = w0
[1] | 0x80000000;
3264 w0
[2] = w0
[2] | 0x8000;
3268 w0
[2] = w0
[2] | 0x800000;
3272 w0
[2] = w0
[2] | 0x80000000;
3280 w0
[3] = w0
[3] | 0x8000;
3284 w0
[3] = w0
[3] | 0x800000;
3288 w0
[3] = w0
[3] | 0x80000000;
3296 w1
[0] = w1
[0] | 0x8000;
3300 w1
[0] = w1
[0] | 0x800000;
3304 w1
[0] = w1
[0] | 0x80000000;
3312 w1
[1] = w1
[1] | 0x8000;
3316 w1
[1] = w1
[1] | 0x800000;
3320 w1
[1] = w1
[1] | 0x80000000;
3328 w1
[2] = w1
[2] | 0x8000;
3332 w1
[2] = w1
[2] | 0x800000;
3336 w1
[2] = w1
[2] | 0x80000000;
3344 w1
[3] = w1
[3] | 0x8000;
3348 w1
[3] = w1
[3] | 0x800000;
3352 w1
[3] = w1
[3] | 0x80000000;
3360 w2
[0] = w2
[0] | 0x8000;
3364 w2
[0] = w2
[0] | 0x800000;
3368 w2
[0] = w2
[0] | 0x80000000;
3376 w2
[1] = w2
[1] | 0x8000;
3380 w2
[1] = w2
[1] | 0x800000;
3384 w2
[1] = w2
[1] | 0x80000000;
3392 w2
[2] = w2
[2] | 0x8000;
3396 w2
[2] = w2
[2] | 0x800000;
3400 w2
[2] = w2
[2] | 0x80000000;
3408 w2
[3] = w2
[3] | 0x8000;
3412 w2
[3] = w2
[3] | 0x800000;
3416 w2
[3] = w2
[3] | 0x80000000;
3424 w3
[0] = w3
[0] | 0x8000;
3428 w3
[0] = w3
[0] | 0x800000;
3432 w3
[0] = w3
[0] | 0x80000000;
3440 w3
[1] = w3
[1] | 0x8000;
3444 w3
[1] = w3
[1] | 0x800000;
3448 w3
[1] = w3
[1] | 0x80000000;
3456 w3
[2] = w3
[2] | 0x8000;
3460 w3
[2] = w3
[2] | 0x800000;
3464 w3
[2] = w3
[2] | 0x80000000;
3472 w3
[3] = w3
[3] | 0x8000;
3476 w3
[3] = w3
[3] | 0x800000;
3480 w3
[3] = w3
[3] | 0x80000000;
3488 w4
[0] = w4
[0] | 0x8000;
3492 w4
[0] = w4
[0] | 0x800000;
3496 w4
[0] = w4
[0] | 0x80000000;
3504 w4
[1] = w4
[1] | 0x8000;
3508 w4
[1] = w4
[1] | 0x800000;
3512 w4
[1] = w4
[1] | 0x80000000;
3520 w4
[2] = w4
[2] | 0x8000;
3524 w4
[2] = w4
[2] | 0x800000;
3528 w4
[2] = w4
[2] | 0x80000000;
3536 w4
[3] = w4
[3] | 0x8000;
3540 w4
[3] = w4
[3] | 0x800000;
3544 w4
[3] = w4
[3] | 0x80000000;
3552 w5
[0] = w5
[0] | 0x8000;
3556 w5
[0] = w5
[0] | 0x800000;
3560 w5
[0] = w5
[0] | 0x80000000;
3568 w5
[1] = w5
[1] | 0x8000;
3572 w5
[1] = w5
[1] | 0x800000;
3576 w5
[1] = w5
[1] | 0x80000000;
3584 w5
[2] = w5
[2] | 0x8000;
3588 w5
[2] = w5
[2] | 0x800000;
3592 w5
[2] = w5
[2] | 0x80000000;
3600 w5
[3] = w5
[3] | 0x8000;
3604 w5
[3] = w5
[3] | 0x800000;
3608 w5
[3] = w5
[3] | 0x80000000;
3616 w6
[0] = w6
[0] | 0x8000;
3620 w6
[0] = w6
[0] | 0x800000;
3624 w6
[0] = w6
[0] | 0x80000000;
3632 w6
[1] = w6
[1] | 0x8000;
3636 w6
[1] = w6
[1] | 0x800000;
3640 w6
[1] = w6
[1] | 0x80000000;
3648 w6
[2] = w6
[2] | 0x8000;
3652 w6
[2] = w6
[2] | 0x800000;
3656 w6
[2] = w6
[2] | 0x80000000;
3664 w6
[3] = w6
[3] | 0x8000;
3668 w6
[3] = w6
[3] | 0x800000;
3672 w6
[3] = w6
[3] | 0x80000000;
3680 w7
[0] = w7
[0] | 0x8000;
3684 w7
[0] = w7
[0] | 0x800000;
3688 w7
[0] = w7
[0] | 0x80000000;
3696 w7
[1] = w7
[1] | 0x8000;
3700 w7
[1] = w7
[1] | 0x800000;
3704 w7
[1] = w7
[1] | 0x80000000;
3712 w7
[2] = w7
[2] | 0x8000;
3716 w7
[2] = w7
[2] | 0x800000;
3720 w7
[2] = w7
[2] | 0x80000000;
3728 w7
[3] = w7
[3] | 0x8000;
3732 w7
[3] = w7
[3] | 0x800000;
3736 w7
[3] = w7
[3] | 0x80000000;
3741 // before: device_memcat2L
3742 static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset
, u32 dst0
[2], u32 src_l0
[2], u32 src_r0
[2])
3747 dst0
[0] = src_l0
[0] | src_r0
[0] << 8;
3748 dst0
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3752 dst0
[0] = src_l0
[0] | src_r0
[0] << 16;
3753 dst0
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
3757 dst0
[0] = src_l0
[0] | src_r0
[0] << 24;
3758 dst0
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
3762 dst0
[1] = src_r0
[0];
3766 dst0
[1] = src_l0
[1] | src_r0
[0] << 8;
3770 dst0
[1] = src_l0
[1] | src_r0
[0] << 16;
3774 dst0
[1] = src_l0
[1] | src_r0
[0] << 24;
3779 // before: device_memcat4L
3780 static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset
, u32 dst0
[4], u32 src_l0
[4], u32 src_r0
[4])
3785 dst0
[0] = src_l0
[0] | src_r0
[0] << 8;
3786 dst0
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3787 dst0
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
3788 dst0
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
3792 dst0
[0] = src_l0
[0] | src_r0
[0] << 16;
3793 dst0
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
3794 dst0
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
3795 dst0
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
3799 dst0
[0] = src_l0
[0] | src_r0
[0] << 24;
3800 dst0
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
3801 dst0
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
3802 dst0
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
3806 dst0
[1] = src_r0
[0];
3807 dst0
[2] = src_r0
[1];
3808 dst0
[3] = src_r0
[2];
3812 dst0
[1] = src_l0
[1] | src_r0
[0] << 8;
3813 dst0
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3814 dst0
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
3818 dst0
[1] = src_l0
[1] | src_r0
[0] << 16;
3819 dst0
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
3820 dst0
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
3824 dst0
[1] = src_l0
[1] | src_r0
[0] << 24;
3825 dst0
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
3826 dst0
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
3830 dst0
[2] = src_r0
[0];
3831 dst0
[3] = src_r0
[1];
3835 dst0
[2] = src_l0
[2] | src_r0
[0] << 8;
3836 dst0
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3840 dst0
[2] = src_l0
[2] | src_r0
[0] << 16;
3841 dst0
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
3845 dst0
[2] = src_l0
[2] | src_r0
[0] << 24;
3846 dst0
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
3850 dst0
[3] = src_r0
[0];
3854 dst0
[3] = src_l0
[3] | src_r0
[0] << 8;
3858 dst0
[3] = src_l0
[3] | src_r0
[0] << 16;
3862 dst0
[3] = src_l0
[3] | src_r0
[0] << 24;
3867 // before: device_memcat8L
3868 static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset
, u32 dst0
[4], u32 dst1
[4], u32 src_l0
[4], u32 src_l1
[4], u32 src_r0
[4])
3873 dst0
[0] = src_l0
[0] | src_r0
[0] << 8;
3874 dst0
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3875 dst0
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
3876 dst0
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
3877 dst1
[0] = src_r0
[3] >> 24;
3881 dst0
[0] = src_l0
[0] | src_r0
[0] << 16;
3882 dst0
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
3883 dst0
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
3884 dst0
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
3885 dst1
[0] = src_r0
[3] >> 16;
3889 dst0
[0] = src_l0
[0] | src_r0
[0] << 24;
3890 dst0
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
3891 dst0
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
3892 dst0
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
3893 dst1
[0] = src_r0
[3] >> 8;
3897 dst0
[1] = src_r0
[0];
3898 dst0
[2] = src_r0
[1];
3899 dst0
[3] = src_r0
[2];
3900 dst1
[0] = src_r0
[3];
3904 dst0
[1] = src_l0
[1] | src_r0
[0] << 8;
3905 dst0
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3906 dst0
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
3907 dst1
[0] = src_r0
[2] >> 24 | src_r0
[3] << 8;
3908 dst1
[1] = src_r0
[3] >> 24;
3912 dst0
[1] = src_l0
[1] | src_r0
[0] << 16;
3913 dst0
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
3914 dst0
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
3915 dst1
[0] = src_r0
[2] >> 16 | src_r0
[3] << 16;
3916 dst1
[1] = src_r0
[3] >> 16;
3920 dst0
[1] = src_l0
[1] | src_r0
[0] << 24;
3921 dst0
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
3922 dst0
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
3923 dst1
[0] = src_r0
[2] >> 8 | src_r0
[3] << 24;
3924 dst1
[1] = src_r0
[3] >> 8;
3928 dst0
[2] = src_r0
[0];
3929 dst0
[3] = src_r0
[1];
3930 dst1
[0] = src_r0
[2];
3931 dst1
[1] = src_r0
[3];
3935 dst0
[2] = src_l0
[2] | src_r0
[0] << 8;
3936 dst0
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3937 dst1
[0] = src_r0
[1] >> 24 | src_r0
[2] << 8;
3938 dst1
[1] = src_r0
[2] >> 24 | src_r0
[3] << 8;
3939 dst1
[2] = src_r0
[3] >> 24;
3943 dst0
[2] = src_l0
[2] | src_r0
[0] << 16;
3944 dst0
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
3945 dst1
[0] = src_r0
[1] >> 16 | src_r0
[2] << 16;
3946 dst1
[1] = src_r0
[2] >> 16 | src_r0
[3] << 16;
3947 dst1
[2] = src_r0
[3] >> 16;
3951 dst0
[2] = src_l0
[2] | src_r0
[0] << 24;
3952 dst0
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
3953 dst1
[0] = src_r0
[1] >> 8 | src_r0
[2] << 24;
3954 dst1
[1] = src_r0
[2] >> 8 | src_r0
[3] << 24;
3955 dst1
[2] = src_r0
[3] >> 8;
3959 dst0
[3] = src_r0
[0];
3960 dst1
[0] = src_r0
[1];
3961 dst1
[1] = src_r0
[2];
3962 dst1
[2] = src_r0
[3];
3966 dst0
[3] = src_l0
[3] | src_r0
[0] << 8;
3967 dst1
[0] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3968 dst1
[1] = src_r0
[1] >> 24 | src_r0
[2] << 8;
3969 dst1
[2] = src_r0
[2] >> 24 | src_r0
[3] << 8;
3970 dst1
[3] = src_r0
[3] >> 24;
3974 dst0
[3] = src_l0
[3] | src_r0
[0] << 16;
3975 dst1
[0] = src_r0
[0] >> 16 | src_r0
[1] << 16;
3976 dst1
[1] = src_r0
[1] >> 16 | src_r0
[2] << 16;
3977 dst1
[2] = src_r0
[2] >> 16 | src_r0
[3] << 16;
3978 dst1
[3] = src_r0
[3] >> 16;
3982 dst0
[3] = src_l0
[3] | src_r0
[0] << 24;
3983 dst1
[0] = src_r0
[0] >> 8 | src_r0
[1] << 24;
3984 dst1
[1] = src_r0
[1] >> 8 | src_r0
[2] << 24;
3985 dst1
[2] = src_r0
[2] >> 8 | src_r0
[3] << 24;
3986 dst1
[3] = src_r0
[3] >> 8;
3990 dst1
[0] = src_r0
[0];
3991 dst1
[1] = src_r0
[1];
3992 dst1
[2] = src_r0
[2];
3993 dst1
[3] = src_r0
[3];
3997 dst1
[0] = src_l1
[0] | src_r0
[0] << 8;
3998 dst1
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
3999 dst1
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4000 dst1
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4004 dst1
[0] = src_l1
[0] | src_r0
[0] << 16;
4005 dst1
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4006 dst1
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4007 dst1
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4011 dst1
[0] = src_l1
[0] | src_r0
[0] << 24;
4012 dst1
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4013 dst1
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4014 dst1
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4018 dst1
[1] = src_r0
[0];
4019 dst1
[2] = src_r0
[1];
4020 dst1
[3] = src_r0
[2];
4024 dst1
[1] = src_l1
[1] | src_r0
[0] << 8;
4025 dst1
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4026 dst1
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4030 dst1
[1] = src_l1
[1] | src_r0
[0] << 16;
4031 dst1
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4032 dst1
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4036 dst1
[1] = src_l1
[1] | src_r0
[0] << 24;
4037 dst1
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4038 dst1
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4042 dst1
[2] = src_r0
[0];
4043 dst1
[3] = src_r0
[1];
4047 dst1
[2] = src_l1
[2] | src_r0
[0] << 8;
4048 dst1
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4052 dst1
[2] = src_l1
[2] | src_r0
[0] << 16;
4053 dst1
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4057 dst1
[2] = src_l1
[2] | src_r0
[0] << 24;
4058 dst1
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4062 dst1
[3] = src_r0
[0];
4066 dst1
[3] = src_l1
[3] | src_r0
[0] << 8;
4070 dst1
[3] = src_l1
[3] | src_r0
[0] << 16;
4074 dst1
[3] = src_l1
[3] | src_r0
[0] << 24;
4079 // before: device_memcat12L
4080 static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset
, u32 dst0
[4], u32 dst1
[4], u32 dst2
[4], u32 src_l0
[4], u32 src_l1
[4], u32 src_l2
[4], u32 src_r0
[4])
4085 dst0
[0] = src_l0
[0] | src_r0
[0] << 8;
4086 dst0
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4087 dst0
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4088 dst0
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4089 dst1
[0] = src_r0
[3] >> 24;
4093 dst0
[0] = src_l0
[0] | src_r0
[0] << 16;
4094 dst0
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4095 dst0
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4096 dst0
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4097 dst1
[0] = src_r0
[3] >> 16;
4101 dst0
[0] = src_l0
[0] | src_r0
[0] << 24;
4102 dst0
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4103 dst0
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4104 dst0
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4105 dst1
[0] = src_r0
[3] >> 8;
4109 dst0
[1] = src_r0
[0];
4110 dst0
[2] = src_r0
[1];
4111 dst0
[3] = src_r0
[2];
4112 dst1
[0] = src_r0
[3];
4116 dst0
[1] = src_l0
[1] | src_r0
[0] << 8;
4117 dst0
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4118 dst0
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4119 dst1
[0] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4120 dst1
[1] = src_r0
[3] >> 24;
4124 dst0
[1] = src_l0
[1] | src_r0
[0] << 16;
4125 dst0
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4126 dst0
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4127 dst1
[0] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4128 dst1
[1] = src_r0
[3] >> 16;
4132 dst0
[1] = src_l0
[1] | src_r0
[0] << 24;
4133 dst0
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4134 dst0
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4135 dst1
[0] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4136 dst1
[1] = src_r0
[3] >> 8;
4140 dst0
[2] = src_r0
[0];
4141 dst0
[3] = src_r0
[1];
4142 dst1
[0] = src_r0
[2];
4143 dst1
[1] = src_r0
[3];
4147 dst0
[2] = src_l0
[2] | src_r0
[0] << 8;
4148 dst0
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4149 dst1
[0] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4150 dst1
[1] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4151 dst1
[2] = src_r0
[3] >> 24;
4155 dst0
[2] = src_l0
[2] | src_r0
[0] << 16;
4156 dst0
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4157 dst1
[0] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4158 dst1
[1] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4159 dst1
[2] = src_r0
[3] >> 16;
4163 dst0
[2] = src_l0
[2] | src_r0
[0] << 24;
4164 dst0
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4165 dst1
[0] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4166 dst1
[1] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4167 dst1
[2] = src_r0
[3] >> 8;
4171 dst0
[3] = src_r0
[0];
4172 dst1
[0] = src_r0
[1];
4173 dst1
[1] = src_r0
[2];
4174 dst1
[2] = src_r0
[3];
4178 dst0
[3] = src_l0
[3] | src_r0
[0] << 8;
4179 dst1
[0] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4180 dst1
[1] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4181 dst1
[2] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4182 dst1
[3] = src_r0
[3] >> 24;
4186 dst0
[3] = src_l0
[3] | src_r0
[0] << 16;
4187 dst1
[0] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4188 dst1
[1] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4189 dst1
[2] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4190 dst1
[3] = src_r0
[3] >> 16;
4194 dst0
[3] = src_l0
[3] | src_r0
[0] << 24;
4195 dst1
[0] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4196 dst1
[1] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4197 dst1
[2] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4198 dst1
[3] = src_r0
[3] >> 8;
4202 dst1
[0] = src_r0
[0];
4203 dst1
[1] = src_r0
[1];
4204 dst1
[2] = src_r0
[2];
4205 dst1
[3] = src_r0
[3];
4209 dst1
[0] = src_l1
[0] | src_r0
[0] << 8;
4210 dst1
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4211 dst1
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4212 dst1
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4213 dst2
[0] = src_r0
[3] >> 24;
4217 dst1
[0] = src_l1
[0] | src_r0
[0] << 16;
4218 dst1
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4219 dst1
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4220 dst1
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4221 dst2
[0] = src_r0
[3] >> 16;
4225 dst1
[0] = src_l1
[0] | src_r0
[0] << 24;
4226 dst1
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4227 dst1
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4228 dst1
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4229 dst2
[0] = src_r0
[3] >> 8;
4233 dst1
[1] = src_r0
[0];
4234 dst1
[2] = src_r0
[1];
4235 dst1
[3] = src_r0
[2];
4236 dst2
[0] = src_r0
[3];
4240 dst1
[1] = src_l1
[1] | src_r0
[0] << 8;
4241 dst1
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4242 dst1
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4243 dst2
[0] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4244 dst2
[1] = src_r0
[3] >> 24;
4248 dst1
[1] = src_l1
[1] | src_r0
[0] << 16;
4249 dst1
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4250 dst1
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4251 dst2
[0] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4252 dst2
[1] = src_r0
[3] >> 16;
4256 dst1
[1] = src_l1
[1] | src_r0
[0] << 24;
4257 dst1
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4258 dst1
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4259 dst2
[0] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4260 dst2
[1] = src_r0
[3] >> 8;
4264 dst1
[2] = src_r0
[0];
4265 dst1
[3] = src_r0
[1];
4266 dst2
[0] = src_r0
[2];
4267 dst2
[1] = src_r0
[3];
4271 dst1
[2] = src_l1
[2] | src_r0
[0] << 8;
4272 dst1
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4273 dst2
[0] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4274 dst2
[1] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4275 dst2
[2] = src_r0
[3] >> 24;
4279 dst1
[2] = src_l1
[2] | src_r0
[0] << 16;
4280 dst1
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4281 dst2
[0] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4282 dst2
[1] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4283 dst2
[2] = src_r0
[3] >> 16;
4287 dst1
[2] = src_l1
[2] | src_r0
[0] << 24;
4288 dst1
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4289 dst2
[0] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4290 dst2
[1] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4291 dst2
[2] = src_r0
[3] >> 8;
4295 dst1
[3] = src_r0
[0];
4296 dst2
[0] = src_r0
[1];
4297 dst2
[1] = src_r0
[2];
4298 dst2
[2] = src_r0
[3];
4302 dst1
[3] = src_l1
[3] | src_r0
[0] << 8;
4303 dst2
[0] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4304 dst2
[1] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4305 dst2
[2] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4306 dst2
[3] = src_r0
[3] >> 24;
4310 dst1
[3] = src_l1
[3] | src_r0
[0] << 16;
4311 dst2
[0] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4312 dst2
[1] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4313 dst2
[2] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4314 dst2
[3] = src_r0
[3] >> 16;
4318 dst1
[3] = src_l1
[3] | src_r0
[0] << 24;
4319 dst2
[0] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4320 dst2
[1] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4321 dst2
[2] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4322 dst2
[3] = src_r0
[3] >> 8;
4326 dst2
[0] = src_r0
[0];
4327 dst2
[1] = src_r0
[1];
4328 dst2
[2] = src_r0
[2];
4329 dst2
[3] = src_r0
[3];
4333 dst2
[0] = src_l2
[0] | src_r0
[0] << 8;
4334 dst2
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4335 dst2
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4336 dst2
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4340 dst2
[0] = src_l2
[0] | src_r0
[0] << 16;
4341 dst2
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4342 dst2
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4343 dst2
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4347 dst2
[0] = src_l2
[0] | src_r0
[0] << 24;
4348 dst2
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4349 dst2
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4350 dst2
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4354 dst2
[1] = src_r0
[0];
4355 dst2
[2] = src_r0
[1];
4356 dst2
[3] = src_r0
[2];
4360 dst2
[1] = src_l2
[1] | src_r0
[0] << 8;
4361 dst2
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4362 dst2
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4366 dst2
[1] = src_l2
[1] | src_r0
[0] << 16;
4367 dst2
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4368 dst2
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4372 dst2
[1] = src_l2
[1] | src_r0
[0] << 24;
4373 dst2
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4374 dst2
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4378 dst2
[2] = src_r0
[0];
4379 dst2
[3] = src_r0
[1];
4383 dst2
[2] = src_l2
[2] | src_r0
[0] << 8;
4384 dst2
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4388 dst2
[2] = src_l2
[2] | src_r0
[0] << 16;
4389 dst2
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4393 dst2
[2] = src_l2
[2] | src_r0
[0] << 24;
4394 dst2
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4398 dst2
[3] = src_r0
[0];
4402 dst2
[3] = src_l2
[3] | src_r0
[0] << 8;
4406 dst2
[3] = src_l2
[3] | src_r0
[0] << 16;
4410 dst2
[3] = src_l2
[3] | src_r0
[0] << 24;
4415 // before: device_memcat12L
4416 static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset
, u32 dst0
[4], u32 dst1
[4], u32 dst2
[4], u32 src_l0
[4], u32 src_l1
[4], u32 src_l2
[4], u32 src_r0
[4], u32 src_r1
[4])
4421 dst0
[0] = src_r0
[0];
4422 dst0
[1] = src_r0
[1];
4423 dst0
[2] = src_r0
[2];
4424 dst0
[3] = src_r0
[3];
4425 dst1
[0] = src_r1
[0];
4426 dst1
[1] = src_r1
[1];
4427 dst1
[2] = src_r1
[2];
4428 dst1
[3] = src_r1
[3];
4432 dst0
[0] = src_l0
[0] | src_r0
[0] << 8;
4433 dst0
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4434 dst0
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4435 dst0
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4436 dst1
[0] = src_r0
[3] >> 24 | src_r1
[0] << 8;
4437 dst1
[1] = src_r1
[0] >> 24 | src_r1
[1] << 8;
4438 dst1
[2] = src_r1
[1] >> 24 | src_r1
[2] << 8;
4439 dst1
[3] = src_r1
[2] >> 24 | src_r1
[3] << 8;
4440 dst2
[0] = src_r1
[3] >> 24;
4444 dst0
[0] = src_l0
[0] | src_r0
[0] << 16;
4445 dst0
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4446 dst0
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4447 dst0
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4448 dst1
[0] = src_r0
[3] >> 16 | src_r1
[0] << 16;
4449 dst1
[1] = src_r1
[0] >> 16 | src_r1
[1] << 16;
4450 dst1
[2] = src_r1
[1] >> 16 | src_r1
[2] << 16;
4451 dst1
[3] = src_r1
[2] >> 16 | src_r1
[3] << 16;
4452 dst2
[0] = src_r1
[3] >> 16;
4456 dst0
[0] = src_l0
[0] | src_r0
[0] << 24;
4457 dst0
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4458 dst0
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4459 dst0
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4460 dst1
[0] = src_r0
[3] >> 8 | src_r1
[0] << 24;
4461 dst1
[1] = src_r1
[0] >> 8 | src_r1
[1] << 24;
4462 dst1
[2] = src_r1
[1] >> 8 | src_r1
[2] << 24;
4463 dst1
[3] = src_r1
[2] >> 8 | src_r1
[3] << 24;
4464 dst2
[0] = src_r1
[3] >> 8;
4468 dst0
[1] = src_r0
[0];
4469 dst0
[2] = src_r0
[1];
4470 dst0
[3] = src_r0
[2];
4471 dst1
[0] = src_r0
[3];
4472 dst1
[1] = src_r1
[0];
4473 dst1
[2] = src_r1
[1];
4474 dst1
[3] = src_r1
[2];
4475 dst2
[0] = src_r1
[3];
4479 dst0
[1] = src_l0
[1] | src_r0
[0] << 8;
4480 dst0
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4481 dst0
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4482 dst1
[0] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4483 dst1
[1] = src_r0
[3] >> 24 | src_r1
[0] << 8;
4484 dst1
[2] = src_r1
[0] >> 24 | src_r1
[1] << 8;
4485 dst1
[3] = src_r1
[1] >> 24 | src_r1
[2] << 8;
4486 dst2
[0] = src_r1
[2] >> 24 | src_r1
[3] << 8;
4487 dst2
[1] = src_r1
[3] >> 24;
4491 dst0
[1] = src_l0
[1] | src_r0
[0] << 16;
4492 dst0
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4493 dst0
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4494 dst1
[0] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4495 dst1
[1] = src_r0
[3] >> 16 | src_r1
[0] << 16;
4496 dst1
[2] = src_r1
[0] >> 16 | src_r1
[1] << 16;
4497 dst1
[3] = src_r1
[1] >> 16 | src_r1
[2] << 16;
4498 dst2
[0] = src_r1
[2] >> 16 | src_r1
[3] << 16;
4499 dst2
[1] = src_r1
[3] >> 16;
4503 dst0
[1] = src_l0
[1] | src_r0
[0] << 24;
4504 dst0
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4505 dst0
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4506 dst1
[0] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4507 dst1
[1] = src_r0
[3] >> 8 | src_r1
[0] << 24;
4508 dst1
[2] = src_r1
[0] >> 8 | src_r1
[1] << 24;
4509 dst1
[3] = src_r1
[1] >> 8 | src_r1
[2] << 24;
4510 dst2
[0] = src_r1
[2] >> 8 | src_r1
[3] << 24;
4511 dst2
[1] = src_r1
[3] >> 8;
4515 dst0
[2] = src_r0
[0];
4516 dst0
[3] = src_r0
[1];
4517 dst1
[0] = src_r0
[2];
4518 dst1
[1] = src_r0
[3];
4519 dst1
[2] = src_r1
[0];
4520 dst1
[3] = src_r1
[1];
4521 dst2
[0] = src_r1
[2];
4522 dst2
[1] = src_r1
[3];
4526 dst0
[2] = src_l0
[2] | src_r0
[0] << 8;
4527 dst0
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4528 dst1
[0] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4529 dst1
[1] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4530 dst1
[2] = src_r0
[3] >> 24 | src_r1
[0] << 8;
4531 dst1
[3] = src_r1
[0] >> 24 | src_r1
[1] << 8;
4532 dst2
[0] = src_r1
[1] >> 24 | src_r1
[2] << 8;
4533 dst2
[1] = src_r1
[2] >> 24 | src_r1
[3] << 8;
4534 dst2
[2] = src_r1
[3] >> 24;
4538 dst0
[2] = src_l0
[2] | src_r0
[0] << 16;
4539 dst0
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4540 dst1
[0] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4541 dst1
[1] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4542 dst1
[2] = src_r0
[3] >> 16 | src_r1
[0] << 16;
4543 dst1
[3] = src_r1
[0] >> 16 | src_r1
[1] << 16;
4544 dst2
[0] = src_r1
[1] >> 16 | src_r1
[2] << 16;
4545 dst2
[1] = src_r1
[2] >> 16 | src_r1
[3] << 16;
4546 dst2
[2] = src_r1
[3] >> 16;
4550 dst0
[2] = src_l0
[2] | src_r0
[0] << 24;
4551 dst0
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4552 dst1
[0] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4553 dst1
[1] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4554 dst1
[2] = src_r0
[3] >> 8 | src_r1
[0] << 24;
4555 dst1
[3] = src_r1
[0] >> 8 | src_r1
[1] << 24;
4556 dst2
[0] = src_r1
[1] >> 8 | src_r1
[2] << 24;
4557 dst2
[1] = src_r1
[2] >> 8 | src_r1
[3] << 24;
4558 dst2
[2] = src_r1
[3] >> 8;
4562 dst0
[3] = src_r0
[0];
4563 dst1
[0] = src_r0
[1];
4564 dst1
[1] = src_r0
[2];
4565 dst1
[2] = src_r0
[3];
4566 dst1
[3] = src_r1
[0];
4567 dst2
[0] = src_r1
[1];
4568 dst2
[1] = src_r1
[2];
4569 dst2
[2] = src_r1
[3];
4573 dst0
[3] = src_l0
[3] | src_r0
[0] << 8;
4574 dst1
[0] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4575 dst1
[1] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4576 dst1
[2] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4577 dst1
[3] = src_r0
[3] >> 24 | src_r1
[0] << 8;
4578 dst2
[0] = src_r1
[0] >> 24 | src_r1
[1] << 8;
4579 dst2
[1] = src_r1
[1] >> 24 | src_r1
[2] << 8;
4580 dst2
[2] = src_r1
[2] >> 24 | src_r1
[3] << 8;
4581 dst2
[3] = src_r1
[3] >> 24;
4585 dst0
[3] = src_l0
[3] | src_r0
[0] << 16;
4586 dst1
[0] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4587 dst1
[1] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4588 dst1
[2] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4589 dst1
[3] = src_r0
[3] >> 16 | src_r1
[0] << 16;
4590 dst2
[0] = src_r1
[0] >> 16 | src_r1
[1] << 16;
4591 dst2
[1] = src_r1
[1] >> 16 | src_r1
[2] << 16;
4592 dst2
[2] = src_r1
[2] >> 16 | src_r1
[3] << 16;
4593 dst2
[3] = src_r1
[3] >> 16;
4597 dst0
[3] = src_l0
[3] | src_r0
[0] << 24;
4598 dst1
[0] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4599 dst1
[1] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4600 dst1
[2] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4601 dst1
[3] = src_r0
[3] >> 8 | src_r1
[0] << 24;
4602 dst2
[0] = src_r1
[0] >> 8 | src_r1
[1] << 24;
4603 dst2
[1] = src_r1
[1] >> 8 | src_r1
[2] << 24;
4604 dst2
[2] = src_r1
[2] >> 8 | src_r1
[3] << 24;
4605 dst2
[3] = src_r1
[3] >> 8;
4609 dst1
[0] = src_r0
[0];
4610 dst1
[1] = src_r0
[1];
4611 dst1
[2] = src_r0
[2];
4612 dst1
[3] = src_r0
[3];
4613 dst2
[0] = src_r1
[0];
4614 dst2
[1] = src_r1
[1];
4615 dst2
[2] = src_r1
[2];
4616 dst2
[3] = src_r1
[3];
4620 dst1
[0] = src_l1
[0] | src_r0
[0] << 8;
4621 dst1
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4622 dst1
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4623 dst1
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4624 dst2
[0] = src_r0
[3] >> 24 | src_r1
[0] << 8;
4625 dst2
[1] = src_r1
[0] >> 24 | src_r1
[1] << 8;
4626 dst2
[2] = src_r1
[1] >> 24 | src_r1
[2] << 8;
4627 dst2
[3] = src_r1
[2] >> 24 | src_r1
[3] << 8;
4631 dst1
[0] = src_l1
[0] | src_r0
[0] << 16;
4632 dst1
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4633 dst1
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4634 dst1
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4635 dst2
[0] = src_r0
[3] >> 16 | src_r1
[0] << 16;
4636 dst2
[1] = src_r1
[0] >> 16 | src_r1
[1] << 16;
4637 dst2
[2] = src_r1
[1] >> 16 | src_r1
[2] << 16;
4638 dst2
[3] = src_r1
[2] >> 16 | src_r1
[3] << 16;
4642 dst1
[0] = src_l1
[0] | src_r0
[0] << 24;
4643 dst1
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4644 dst1
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4645 dst1
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4646 dst2
[0] = src_r0
[3] >> 8 | src_r1
[0] << 24;
4647 dst2
[1] = src_r1
[0] >> 8 | src_r1
[1] << 24;
4648 dst2
[2] = src_r1
[1] >> 8 | src_r1
[2] << 24;
4649 dst2
[3] = src_r1
[2] >> 8 | src_r1
[3] << 24;
4653 dst1
[1] = src_r1
[0];
4654 dst1
[2] = src_r0
[1];
4655 dst1
[3] = src_r0
[2];
4656 dst2
[0] = src_r0
[3];
4657 dst2
[1] = src_r1
[0];
4658 dst2
[2] = src_r1
[1];
4659 dst2
[3] = src_r1
[2];
4663 dst1
[1] = src_l1
[1] | src_r0
[0] << 8;
4664 dst1
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4665 dst1
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4666 dst2
[0] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4667 dst2
[1] = src_r0
[3] >> 24 | src_r1
[0] << 8;
4668 dst2
[2] = src_r1
[0] >> 24 | src_r1
[1] << 8;
4669 dst2
[3] = src_r1
[1] >> 24 | src_r1
[2] << 8;
4673 dst1
[1] = src_l1
[1] | src_r0
[0] << 16;
4674 dst1
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4675 dst1
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4676 dst2
[0] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4677 dst2
[1] = src_r0
[3] >> 16 | src_r1
[0] << 16;
4678 dst2
[2] = src_r1
[0] >> 16 | src_r1
[1] << 16;
4679 dst2
[3] = src_r1
[1] >> 16 | src_r1
[2] << 16;
4683 dst1
[1] = src_l1
[1] | src_r0
[0] << 24;
4684 dst1
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4685 dst1
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4686 dst2
[0] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4687 dst2
[1] = src_r0
[3] >> 8 | src_r1
[0] << 24;
4688 dst2
[2] = src_r1
[0] >> 8 | src_r1
[1] << 24;
4689 dst2
[3] = src_r1
[1] >> 8 | src_r1
[2] << 24;
4693 dst1
[2] = src_r1
[0];
4694 dst1
[3] = src_r0
[1];
4695 dst2
[0] = src_r0
[2];
4696 dst2
[1] = src_r0
[3];
4697 dst2
[2] = src_r1
[0];
4698 dst2
[3] = src_r1
[1];
4702 dst1
[2] = src_l1
[2] | src_r0
[0] << 8;
4703 dst1
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4704 dst2
[0] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4705 dst2
[1] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4706 dst2
[2] = src_r0
[3] >> 24 | src_r1
[0] << 8;
4707 dst2
[3] = src_r1
[0] >> 24 | src_r1
[1] << 8;
4711 dst1
[2] = src_l1
[2] | src_r0
[0] << 16;
4712 dst1
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4713 dst2
[0] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4714 dst2
[1] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4715 dst2
[2] = src_r0
[3] >> 16 | src_r1
[0] << 16;
4716 dst2
[3] = src_r1
[0] >> 16 | src_r1
[1] << 16;
4720 dst1
[2] = src_l1
[2] | src_r0
[0] << 24;
4721 dst1
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4722 dst2
[0] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4723 dst2
[1] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4724 dst2
[2] = src_r0
[3] >> 8 | src_r1
[0] << 24;
4725 dst2
[3] = src_r1
[0] >> 8 | src_r1
[1] << 24;
4729 dst1
[3] = src_r1
[0];
4730 dst2
[0] = src_r0
[1];
4731 dst2
[1] = src_r0
[2];
4732 dst2
[2] = src_r0
[3];
4733 dst2
[3] = src_r1
[0];
4737 dst1
[3] = src_l1
[3] | src_r0
[0] << 8;
4738 dst2
[0] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4739 dst2
[1] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4740 dst2
[2] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4741 dst2
[3] = src_r0
[3] >> 24 | src_r1
[0] << 8;
4745 dst1
[3] = src_l1
[3] | src_r0
[0] << 16;
4746 dst2
[0] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4747 dst2
[1] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4748 dst2
[2] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4749 dst2
[3] = src_r0
[3] >> 16 | src_r1
[0] << 16;
4753 dst1
[3] = src_l1
[3] | src_r0
[0] << 24;
4754 dst2
[0] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4755 dst2
[1] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4756 dst2
[2] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4757 dst2
[3] = src_r0
[3] >> 8 | src_r1
[0] << 24;
4761 dst2
[0] = src_r0
[0];
4762 dst2
[1] = src_r0
[1];
4763 dst2
[2] = src_r0
[2];
4764 dst2
[3] = src_r0
[3];
4768 dst2
[0] = src_l2
[0] | src_r0
[0] << 8;
4769 dst2
[1] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4770 dst2
[2] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4771 dst2
[3] = src_r0
[2] >> 24 | src_r0
[3] << 8;
4775 dst2
[0] = src_l2
[0] | src_r0
[0] << 16;
4776 dst2
[1] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4777 dst2
[2] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4778 dst2
[3] = src_r0
[2] >> 16 | src_r0
[3] << 16;
4782 dst2
[0] = src_l2
[0] | src_r0
[0] << 24;
4783 dst2
[1] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4784 dst2
[2] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4785 dst2
[3] = src_r0
[2] >> 8 | src_r0
[3] << 24;
4789 dst2
[1] = src_r0
[0];
4790 dst2
[2] = src_r0
[1];
4791 dst2
[3] = src_r0
[2];
4795 dst2
[1] = src_l2
[1] | src_r0
[0] << 8;
4796 dst2
[2] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4797 dst2
[3] = src_r0
[1] >> 24 | src_r0
[2] << 8;
4801 dst2
[1] = src_l2
[1] | src_r0
[0] << 16;
4802 dst2
[2] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4803 dst2
[3] = src_r0
[1] >> 16 | src_r0
[2] << 16;
4807 dst2
[1] = src_l2
[1] | src_r0
[0] << 24;
4808 dst2
[2] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4809 dst2
[3] = src_r0
[1] >> 8 | src_r0
[2] << 24;
4813 dst2
[2] = src_r0
[0];
4814 dst2
[3] = src_r0
[1];
4818 dst2
[2] = src_l2
[2] | src_r0
[0] << 8;
4819 dst2
[3] = src_r0
[0] >> 24 | src_r0
[1] << 8;
4823 dst2
[2] = src_l2
[2] | src_r0
[0] << 16;
4824 dst2
[3] = src_r0
[0] >> 16 | src_r0
[1] << 16;
4828 dst2
[2] = src_l2
[2] | src_r0
[0] << 24;
4829 dst2
[3] = src_r0
[0] >> 8 | src_r0
[1] << 24;
4833 dst2
[3] = src_r0
[0];
4837 dst2
[3] = src_l2
[3] | src_r0
[0] << 8;
4841 dst2
[3] = src_l2
[3] | src_r0
[0] << 16;
4845 dst2
[3] = src_l2
[3] | src_r0
[0] << 24;
4850 // before: memcat16_9
// memcat_c15_w4x4_a3x4: merges words from append0/append1/append2 into the
// word buffers w0..w3 at a sub-word (8/16/24-bit) position.
//
// Every statement group visible below follows the same pattern:
//   - the first statement ORs append0[0], shifted left by 8, 16 or 24 bits,
//     into one existing word of w0 (w0[0], w0[1], w0[2] or w0[3]);
//   - each following word is overwritten with the bits shifted out of the
//     previous append word (right shift by 24, 16 or 8) OR-ed with the next
//     append word shifted left;
//   - the last statement stores only the bits shifted out of append2[0].
// Of append2, only element [0] is read in the visible statements, and w3 is
// written only at index [0], in the last three groups.
//
// NOTE(review): the statement that selects a group (presumably a switch on
// `offset`) and any word-aligned groups are not visible in this excerpt;
// confirm against the full file before relying on this description.
4851 static void memcat_c15_w4x4_a3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 append2
[4], const u32 offset
)
// The next three groups start in w0[0] and differ only in the shift: 8, 16, 24 bits.
4868 w0
[0] = w0
[0] | append0
[0] << 8;
4869 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
4870 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
4871 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
4872 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
4873 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
4874 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
4875 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
4876 w2
[0] = append1
[3] >> 24 | append2
[0] << 8;
4877 w2
[1] = append2
[0] >> 24;
4881 w0
[0] = w0
[0] | append0
[0] << 16;
4882 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
4883 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
4884 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
4885 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
4886 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
4887 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
4888 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
4889 w2
[0] = append1
[3] >> 16 | append2
[0] << 16;
4890 w2
[1] = append2
[0] >> 16;
4894 w0
[0] = w0
[0] | append0
[0] << 24;
4895 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
4896 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
4897 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
4898 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
4899 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
4900 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
4901 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
4902 w2
[0] = append1
[3] >> 8 | append2
[0] << 24;
4903 w2
[1] = append2
[0] >> 8;
// The next three groups start in w0[1]; the final spill lands in w2[2].
4919 w0
[1] = w0
[1] | append0
[0] << 8;
4920 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
4921 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
4922 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
4923 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
4924 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
4925 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
4926 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
4927 w2
[1] = append1
[3] >> 24 | append2
[0] << 8;
4928 w2
[2] = append2
[0] >> 24;
4932 w0
[1] = w0
[1] | append0
[0] << 16;
4933 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
4934 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
4935 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
4936 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
4937 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
4938 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
4939 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
4940 w2
[1] = append1
[3] >> 16 | append2
[0] << 16;
4941 w2
[2] = append2
[0] >> 16;
4945 w0
[1] = w0
[1] | append0
[0] << 24;
4946 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
4947 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
4948 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
4949 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
4950 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
4951 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
4952 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
4953 w2
[1] = append1
[3] >> 8 | append2
[0] << 24;
4954 w2
[2] = append2
[0] >> 8;
// The next three groups start in w0[2]; the final spill lands in w2[3].
4970 w0
[2] = w0
[2] | append0
[0] << 8;
4971 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
4972 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
4973 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
4974 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
4975 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
4976 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
4977 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
4978 w2
[2] = append1
[3] >> 24 | append2
[0] << 8;
4979 w2
[3] = append2
[0] >> 24;
4983 w0
[2] = w0
[2] | append0
[0] << 16;
4984 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
4985 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
4986 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
4987 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
4988 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
4989 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
4990 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
4991 w2
[2] = append1
[3] >> 16 | append2
[0] << 16;
4992 w2
[3] = append2
[0] >> 16;
4996 w0
[2] = w0
[2] | append0
[0] << 24;
4997 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
4998 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
4999 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
5000 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
5001 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
5002 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
5003 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
5004 w2
[2] = append1
[3] >> 8 | append2
[0] << 24;
5005 w2
[3] = append2
[0] >> 8;
// The last three groups start in w0[3]; the final spill is the only write to w3 (w3[0]).
5021 w0
[3] = w0
[3] | append0
[0] << 8;
5022 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
5023 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
5024 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
5025 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
5026 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
5027 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
5028 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
5029 w2
[3] = append1
[3] >> 24 | append2
[0] << 8;
5030 w3
[0] = append2
[0] >> 24;
5034 w0
[3] = w0
[3] | append0
[0] << 16;
5035 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
5036 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
5037 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
5038 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
5039 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
5040 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
5041 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
5042 w2
[3] = append1
[3] >> 16 | append2
[0] << 16;
5043 w3
[0] = append2
[0] >> 16;
5047 w0
[3] = w0
[3] | append0
[0] << 24;
5048 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
5049 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
5050 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
5051 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
5052 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
5053 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
5054 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
5055 w2
[3] = append1
[3] >> 8 | append2
[0] << 24;
5056 w3
[0] = append2
[0] >> 8;
5061 // before: memcat32_8
// memcat_c32_w4x4_a2x4: merges the eight 32-bit words append0[0..3] and
// append1[0..3] into the four 4-word blocks w0..w3 at a position that is
// not word-aligned.
//
// Every group of assignments below has the same shape:
//   - the first statement ORs append0[0], shifted left by 8, 16 or 24 bits,
//     into a word that keeps its previous contents;
//   - each following statement overwrites the next word with the high bits
//     of one append word (>> 24, >> 16 or >> 8) combined with the low bits
//     of the next append word (<< 8, << 16 or << 24).
// Shifts bind tighter than '|', so `a >> 24 | b << 8` is (a >> 24) | (b << 8).
//
// NOTE(review): the statements that choose which group runs (presumably a
// switch on `offset`, plus the word-aligned cases) are not visible in this
// excerpt - confirm against the full file.
// NOTE(review): the groups that start at w1[2] and w1[3] stop at w3[1], so
// the end of append1 is not stored there; w3[2] and w3[3] are never written
// by any visible statement - presumably intentional, confirm with callers.
5062 static void memcat_c32_w4x4_a2x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 offset
)
// Placements that start inside w0[0]: shift 8, then 16, then 24.
5078 w0
[0] = w0
[0] | append0
[0] << 8;
5079 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
5080 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
5081 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
5082 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
5083 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
5084 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
5085 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
5086 w2
[0] = append1
[3] >> 24;
5090 w0
[0] = w0
[0] | append0
[0] << 16;
5091 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
5092 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
5093 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
5094 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
5095 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
5096 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
5097 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
5098 w2
[0] = append1
[3] >> 16;
5102 w0
[0] = w0
[0] | append0
[0] << 24;
5103 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
5104 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
5105 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
5106 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
5107 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
5108 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
5109 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
5110 w2
[0] = append1
[3] >> 8;
// Placements that start inside w0[1]: shift 8, then 16, then 24.
5125 w0
[1] = w0
[1] | append0
[0] << 8;
5126 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
5127 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
5128 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
5129 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
5130 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
5131 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
5132 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
5133 w2
[1] = append1
[3] >> 24;
5137 w0
[1] = w0
[1] | append0
[0] << 16;
5138 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
5139 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
5140 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
5141 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
5142 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
5143 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
5144 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
5145 w2
[1] = append1
[3] >> 16;
5149 w0
[1] = w0
[1] | append0
[0] << 24;
5150 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
5151 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
5152 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
5153 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
5154 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
5155 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
5156 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
5157 w2
[1] = append1
[3] >> 8;
// Placements that start inside w0[2]: shift 8, then 16, then 24.
5172 w0
[2] = w0
[2] | append0
[0] << 8;
5173 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
5174 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
5175 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
5176 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
5177 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
5178 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
5179 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
5180 w2
[2] = append1
[3] >> 24;
5184 w0
[2] = w0
[2] | append0
[0] << 16;
5185 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
5186 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
5187 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
5188 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
5189 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
5190 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
5191 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
5192 w2
[2] = append1
[3] >> 16;
5196 w0
[2] = w0
[2] | append0
[0] << 24;
5197 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
5198 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
5199 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
5200 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
5201 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
5202 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
5203 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
5204 w2
[2] = append1
[3] >> 8;
// Placements that start inside w0[3]: shift 8, then 16, then 24.
5219 w0
[3] = w0
[3] | append0
[0] << 8;
5220 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
5221 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
5222 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
5223 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
5224 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
5225 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
5226 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
5227 w2
[3] = append1
[3] >> 24;
5231 w0
[3] = w0
[3] | append0
[0] << 16;
5232 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
5233 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
5234 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
5235 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
5236 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
5237 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
5238 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
5239 w2
[3] = append1
[3] >> 16;
5243 w0
[3] = w0
[3] | append0
[0] << 24;
5244 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
5245 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
5246 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
5247 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
5248 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
5249 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
5250 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
5251 w2
[3] = append1
[3] >> 8;
// Placements that start inside w1[0]: shift 8, then 16, then 24.
5266 w1
[0] = w1
[0] | append0
[0] << 8;
5267 w1
[1] = append0
[0] >> 24 | append0
[1] << 8;
5268 w1
[2] = append0
[1] >> 24 | append0
[2] << 8;
5269 w1
[3] = append0
[2] >> 24 | append0
[3] << 8;
5270 w2
[0] = append0
[3] >> 24 | append1
[0] << 8;
5271 w2
[1] = append1
[0] >> 24 | append1
[1] << 8;
5272 w2
[2] = append1
[1] >> 24 | append1
[2] << 8;
5273 w2
[3] = append1
[2] >> 24 | append1
[3] << 8;
5274 w3
[0] = append1
[3] >> 24;
5278 w1
[0] = w1
[0] | append0
[0] << 16;
5279 w1
[1] = append0
[0] >> 16 | append0
[1] << 16;
5280 w1
[2] = append0
[1] >> 16 | append0
[2] << 16;
5281 w1
[3] = append0
[2] >> 16 | append0
[3] << 16;
5282 w2
[0] = append0
[3] >> 16 | append1
[0] << 16;
5283 w2
[1] = append1
[0] >> 16 | append1
[1] << 16;
5284 w2
[2] = append1
[1] >> 16 | append1
[2] << 16;
5285 w2
[3] = append1
[2] >> 16 | append1
[3] << 16;
5286 w3
[0] = append1
[3] >> 16;
5290 w1
[0] = w1
[0] | append0
[0] << 24;
5291 w1
[1] = append0
[0] >> 8 | append0
[1] << 24;
5292 w1
[2] = append0
[1] >> 8 | append0
[2] << 24;
5293 w1
[3] = append0
[2] >> 8 | append0
[3] << 24;
5294 w2
[0] = append0
[3] >> 8 | append1
[0] << 24;
5295 w2
[1] = append1
[0] >> 8 | append1
[1] << 24;
5296 w2
[2] = append1
[1] >> 8 | append1
[2] << 24;
5297 w2
[3] = append1
[2] >> 8 | append1
[3] << 24;
5298 w3
[0] = append1
[3] >> 8;
// Placements that start inside w1[1]: shift 8, then 16, then 24.
// The last word written is w3[1], which receives the high bits of append1[3].
5313 w1
[1] = w1
[1] | append0
[0] << 8;
5314 w1
[2] = append0
[0] >> 24 | append0
[1] << 8;
5315 w1
[3] = append0
[1] >> 24 | append0
[2] << 8;
5316 w2
[0] = append0
[2] >> 24 | append0
[3] << 8;
5317 w2
[1] = append0
[3] >> 24 | append1
[0] << 8;
5318 w2
[2] = append1
[0] >> 24 | append1
[1] << 8;
5319 w2
[3] = append1
[1] >> 24 | append1
[2] << 8;
5320 w3
[0] = append1
[2] >> 24 | append1
[3] << 8;
5321 w3
[1] = append1
[3] >> 24;
5325 w1
[1] = w1
[1] | append0
[0] << 16;
5326 w1
[2] = append0
[0] >> 16 | append0
[1] << 16;
5327 w1
[3] = append0
[1] >> 16 | append0
[2] << 16;
5328 w2
[0] = append0
[2] >> 16 | append0
[3] << 16;
5329 w2
[1] = append0
[3] >> 16 | append1
[0] << 16;
5330 w2
[2] = append1
[0] >> 16 | append1
[1] << 16;
5331 w2
[3] = append1
[1] >> 16 | append1
[2] << 16;
5332 w3
[0] = append1
[2] >> 16 | append1
[3] << 16;
5333 w3
[1] = append1
[3] >> 16;
5337 w1
[1] = w1
[1] | append0
[0] << 24;
5338 w1
[2] = append0
[0] >> 8 | append0
[1] << 24;
5339 w1
[3] = append0
[1] >> 8 | append0
[2] << 24;
5340 w2
[0] = append0
[2] >> 8 | append0
[3] << 24;
5341 w2
[1] = append0
[3] >> 8 | append1
[0] << 24;
5342 w2
[2] = append1
[0] >> 8 | append1
[1] << 24;
5343 w2
[3] = append1
[1] >> 8 | append1
[2] << 24;
5344 w3
[0] = append1
[2] >> 8 | append1
[3] << 24;
5345 w3
[1] = append1
[3] >> 8;
// Placements that start inside w1[2]: shift 8, then 16, then 24.
// These groups end at w3[1]; the high bits of append1[3] are not stored.
5360 w1
[2] = w1
[2] | append0
[0] << 8;
5361 w1
[3] = append0
[0] >> 24 | append0
[1] << 8;
5362 w2
[0] = append0
[1] >> 24 | append0
[2] << 8;
5363 w2
[1] = append0
[2] >> 24 | append0
[3] << 8;
5364 w2
[2] = append0
[3] >> 24 | append1
[0] << 8;
5365 w2
[3] = append1
[0] >> 24 | append1
[1] << 8;
5366 w3
[0] = append1
[1] >> 24 | append1
[2] << 8;
5367 w3
[1] = append1
[2] >> 24 | append1
[3] << 8;
5371 w1
[2] = w1
[2] | append0
[0] << 16;
5372 w1
[3] = append0
[0] >> 16 | append0
[1] << 16;
5373 w2
[0] = append0
[1] >> 16 | append0
[2] << 16;
5374 w2
[1] = append0
[2] >> 16 | append0
[3] << 16;
5375 w2
[2] = append0
[3] >> 16 | append1
[0] << 16;
5376 w2
[3] = append1
[0] >> 16 | append1
[1] << 16;
5377 w3
[0] = append1
[1] >> 16 | append1
[2] << 16;
5378 w3
[1] = append1
[2] >> 16 | append1
[3] << 16;
5382 w1
[2] = w1
[2] | append0
[0] << 24;
5383 w1
[3] = append0
[0] >> 8 | append0
[1] << 24;
5384 w2
[0] = append0
[1] >> 8 | append0
[2] << 24;
5385 w2
[1] = append0
[2] >> 8 | append0
[3] << 24;
5386 w2
[2] = append0
[3] >> 8 | append1
[0] << 24;
5387 w2
[3] = append1
[0] >> 8 | append1
[1] << 24;
5388 w3
[0] = append1
[1] >> 8 | append1
[2] << 24;
5389 w3
[1] = append1
[2] >> 8 | append1
[3] << 24;
// Placements that start inside w1[3]: shift 8, then 16, then 24.
// These groups end at w3[1]; append1[3] is never read and the high bits of
// append1[2] are not stored.
5403 w1
[3] = w1
[3] | append0
[0] << 8;
5404 w2
[0] = append0
[0] >> 24 | append0
[1] << 8;
5405 w2
[1] = append0
[1] >> 24 | append0
[2] << 8;
5406 w2
[2] = append0
[2] >> 24 | append0
[3] << 8;
5407 w2
[3] = append0
[3] >> 24 | append1
[0] << 8;
5408 w3
[0] = append1
[0] >> 24 | append1
[1] << 8;
5409 w3
[1] = append1
[1] >> 24 | append1
[2] << 8;
5413 w1
[3] = w1
[3] | append0
[0] << 16;
5414 w2
[0] = append0
[0] >> 16 | append0
[1] << 16;
5415 w2
[1] = append0
[1] >> 16 | append0
[2] << 16;
5416 w2
[2] = append0
[2] >> 16 | append0
[3] << 16;
5417 w2
[3] = append0
[3] >> 16 | append1
[0] << 16;
5418 w3
[0] = append1
[0] >> 16 | append1
[1] << 16;
5419 w3
[1] = append1
[1] >> 16 | append1
[2] << 16;
5423 w1
[3] = w1
[3] | append0
[0] << 24;
5424 w2
[0] = append0
[0] >> 8 | append0
[1] << 24;
5425 w2
[1] = append0
[1] >> 8 | append0
[2] << 24;
5426 w2
[2] = append0
[2] >> 8 | append0
[3] << 24;
5427 w2
[3] = append0
[3] >> 8 | append1
[0] << 24;
5428 w3
[0] = append1
[0] >> 8 | append1
[1] << 24;
5429 w3
[1] = append1
[1] >> 8 | append1
[2] << 24;
5443 // before: memcat32_9
// memcat_c32_w4x4_a3x4: merges append0[0..3], append1[0..3] and the first
// word of append2 into the four 4-word blocks w0..w3 at a position that is
// not word-aligned.
//
// Every group of assignments below has the same shape:
//   - the first statement ORs append0[0], shifted left by 8, 16 or 24 bits,
//     into a word that keeps its previous contents;
//   - each following statement overwrites the next word with the high bits
//     of one append word (>> 24, >> 16 or >> 8) combined with the low bits
//     of the next append word (<< 8, << 16 or << 24).
// Shifts bind tighter than '|', so `a >> 24 | b << 8` is (a >> 24) | (b << 8).
//
// Only append2[0] is read by the visible statements; append2[1..3] are not.
//
// NOTE(review): the statements that choose which group runs (presumably a
// switch on `offset`, plus the word-aligned cases) are not visible in this
// excerpt - confirm against the full file.
// NOTE(review): the groups that start at w1[1], w1[2] and w1[3] stop at
// w3[1], so the end of the appended data is not stored there; w3[2] and
// w3[3] are never written by any visible statement - presumably intentional,
// confirm with callers.
5444 static void memcat_c32_w4x4_a3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 append0
[4], const u32 append1
[4], const u32 append2
[4], const u32 offset
)
// Placements that start inside w0[0]: shift 8, then 16, then 24.
5461 w0
[0] = w0
[0] | append0
[0] << 8;
5462 w0
[1] = append0
[0] >> 24 | append0
[1] << 8;
5463 w0
[2] = append0
[1] >> 24 | append0
[2] << 8;
5464 w0
[3] = append0
[2] >> 24 | append0
[3] << 8;
5465 w1
[0] = append0
[3] >> 24 | append1
[0] << 8;
5466 w1
[1] = append1
[0] >> 24 | append1
[1] << 8;
5467 w1
[2] = append1
[1] >> 24 | append1
[2] << 8;
5468 w1
[3] = append1
[2] >> 24 | append1
[3] << 8;
5469 w2
[0] = append1
[3] >> 24 | append2
[0] << 8;
5470 w2
[1] = append2
[0] >> 24;
5474 w0
[0] = w0
[0] | append0
[0] << 16;
5475 w0
[1] = append0
[0] >> 16 | append0
[1] << 16;
5476 w0
[2] = append0
[1] >> 16 | append0
[2] << 16;
5477 w0
[3] = append0
[2] >> 16 | append0
[3] << 16;
5478 w1
[0] = append0
[3] >> 16 | append1
[0] << 16;
5479 w1
[1] = append1
[0] >> 16 | append1
[1] << 16;
5480 w1
[2] = append1
[1] >> 16 | append1
[2] << 16;
5481 w1
[3] = append1
[2] >> 16 | append1
[3] << 16;
5482 w2
[0] = append1
[3] >> 16 | append2
[0] << 16;
5483 w2
[1] = append2
[0] >> 16;
5487 w0
[0] = w0
[0] | append0
[0] << 24;
5488 w0
[1] = append0
[0] >> 8 | append0
[1] << 24;
5489 w0
[2] = append0
[1] >> 8 | append0
[2] << 24;
5490 w0
[3] = append0
[2] >> 8 | append0
[3] << 24;
5491 w1
[0] = append0
[3] >> 8 | append1
[0] << 24;
5492 w1
[1] = append1
[0] >> 8 | append1
[1] << 24;
5493 w1
[2] = append1
[1] >> 8 | append1
[2] << 24;
5494 w1
[3] = append1
[2] >> 8 | append1
[3] << 24;
5495 w2
[0] = append1
[3] >> 8 | append2
[0] << 24;
5496 w2
[1] = append2
[0] >> 8;
// Placements that start inside w0[1]: shift 8, then 16, then 24.
5512 w0
[1] = w0
[1] | append0
[0] << 8;
5513 w0
[2] = append0
[0] >> 24 | append0
[1] << 8;
5514 w0
[3] = append0
[1] >> 24 | append0
[2] << 8;
5515 w1
[0] = append0
[2] >> 24 | append0
[3] << 8;
5516 w1
[1] = append0
[3] >> 24 | append1
[0] << 8;
5517 w1
[2] = append1
[0] >> 24 | append1
[1] << 8;
5518 w1
[3] = append1
[1] >> 24 | append1
[2] << 8;
5519 w2
[0] = append1
[2] >> 24 | append1
[3] << 8;
5520 w2
[1] = append1
[3] >> 24 | append2
[0] << 8;
5521 w2
[2] = append2
[0] >> 24;
5525 w0
[1] = w0
[1] | append0
[0] << 16;
5526 w0
[2] = append0
[0] >> 16 | append0
[1] << 16;
5527 w0
[3] = append0
[1] >> 16 | append0
[2] << 16;
5528 w1
[0] = append0
[2] >> 16 | append0
[3] << 16;
5529 w1
[1] = append0
[3] >> 16 | append1
[0] << 16;
5530 w1
[2] = append1
[0] >> 16 | append1
[1] << 16;
5531 w1
[3] = append1
[1] >> 16 | append1
[2] << 16;
5532 w2
[0] = append1
[2] >> 16 | append1
[3] << 16;
5533 w2
[1] = append1
[3] >> 16 | append2
[0] << 16;
5534 w2
[2] = append2
[0] >> 16;
5538 w0
[1] = w0
[1] | append0
[0] << 24;
5539 w0
[2] = append0
[0] >> 8 | append0
[1] << 24;
5540 w0
[3] = append0
[1] >> 8 | append0
[2] << 24;
5541 w1
[0] = append0
[2] >> 8 | append0
[3] << 24;
5542 w1
[1] = append0
[3] >> 8 | append1
[0] << 24;
5543 w1
[2] = append1
[0] >> 8 | append1
[1] << 24;
5544 w1
[3] = append1
[1] >> 8 | append1
[2] << 24;
5545 w2
[0] = append1
[2] >> 8 | append1
[3] << 24;
5546 w2
[1] = append1
[3] >> 8 | append2
[0] << 24;
5547 w2
[2] = append2
[0] >> 8;
// Placements that start inside w0[2]: shift 8, then 16, then 24.
5563 w0
[2] = w0
[2] | append0
[0] << 8;
5564 w0
[3] = append0
[0] >> 24 | append0
[1] << 8;
5565 w1
[0] = append0
[1] >> 24 | append0
[2] << 8;
5566 w1
[1] = append0
[2] >> 24 | append0
[3] << 8;
5567 w1
[2] = append0
[3] >> 24 | append1
[0] << 8;
5568 w1
[3] = append1
[0] >> 24 | append1
[1] << 8;
5569 w2
[0] = append1
[1] >> 24 | append1
[2] << 8;
5570 w2
[1] = append1
[2] >> 24 | append1
[3] << 8;
5571 w2
[2] = append1
[3] >> 24 | append2
[0] << 8;
5572 w2
[3] = append2
[0] >> 24;
5576 w0
[2] = w0
[2] | append0
[0] << 16;
5577 w0
[3] = append0
[0] >> 16 | append0
[1] << 16;
5578 w1
[0] = append0
[1] >> 16 | append0
[2] << 16;
5579 w1
[1] = append0
[2] >> 16 | append0
[3] << 16;
5580 w1
[2] = append0
[3] >> 16 | append1
[0] << 16;
5581 w1
[3] = append1
[0] >> 16 | append1
[1] << 16;
5582 w2
[0] = append1
[1] >> 16 | append1
[2] << 16;
5583 w2
[1] = append1
[2] >> 16 | append1
[3] << 16;
5584 w2
[2] = append1
[3] >> 16 | append2
[0] << 16;
5585 w2
[3] = append2
[0] >> 16;
5589 w0
[2] = w0
[2] | append0
[0] << 24;
5590 w0
[3] = append0
[0] >> 8 | append0
[1] << 24;
5591 w1
[0] = append0
[1] >> 8 | append0
[2] << 24;
5592 w1
[1] = append0
[2] >> 8 | append0
[3] << 24;
5593 w1
[2] = append0
[3] >> 8 | append1
[0] << 24;
5594 w1
[3] = append1
[0] >> 8 | append1
[1] << 24;
5595 w2
[0] = append1
[1] >> 8 | append1
[2] << 24;
5596 w2
[1] = append1
[2] >> 8 | append1
[3] << 24;
5597 w2
[2] = append1
[3] >> 8 | append2
[0] << 24;
5598 w2
[3] = append2
[0] >> 8;
// Placements that start inside w0[3]: shift 8, then 16, then 24.
5614 w0
[3] = w0
[3] | append0
[0] << 8;
5615 w1
[0] = append0
[0] >> 24 | append0
[1] << 8;
5616 w1
[1] = append0
[1] >> 24 | append0
[2] << 8;
5617 w1
[2] = append0
[2] >> 24 | append0
[3] << 8;
5618 w1
[3] = append0
[3] >> 24 | append1
[0] << 8;
5619 w2
[0] = append1
[0] >> 24 | append1
[1] << 8;
5620 w2
[1] = append1
[1] >> 24 | append1
[2] << 8;
5621 w2
[2] = append1
[2] >> 24 | append1
[3] << 8;
5622 w2
[3] = append1
[3] >> 24 | append2
[0] << 8;
5623 w3
[0] = append2
[0] >> 24;
5627 w0
[3] = w0
[3] | append0
[0] << 16;
5628 w1
[0] = append0
[0] >> 16 | append0
[1] << 16;
5629 w1
[1] = append0
[1] >> 16 | append0
[2] << 16;
5630 w1
[2] = append0
[2] >> 16 | append0
[3] << 16;
5631 w1
[3] = append0
[3] >> 16 | append1
[0] << 16;
5632 w2
[0] = append1
[0] >> 16 | append1
[1] << 16;
5633 w2
[1] = append1
[1] >> 16 | append1
[2] << 16;
5634 w2
[2] = append1
[2] >> 16 | append1
[3] << 16;
5635 w2
[3] = append1
[3] >> 16 | append2
[0] << 16;
5636 w3
[0] = append2
[0] >> 16;
5640 w0
[3] = w0
[3] | append0
[0] << 24;
5641 w1
[0] = append0
[0] >> 8 | append0
[1] << 24;
5642 w1
[1] = append0
[1] >> 8 | append0
[2] << 24;
5643 w1
[2] = append0
[2] >> 8 | append0
[3] << 24;
5644 w1
[3] = append0
[3] >> 8 | append1
[0] << 24;
5645 w2
[0] = append1
[0] >> 8 | append1
[1] << 24;
5646 w2
[1] = append1
[1] >> 8 | append1
[2] << 24;
5647 w2
[2] = append1
[2] >> 8 | append1
[3] << 24;
5648 w2
[3] = append1
[3] >> 8 | append2
[0] << 24;
5649 w3
[0] = append2
[0] >> 8;
// Placements that start inside w1[0]: shift 8, then 16, then 24.
// The last word written is w3[1], which receives the high bits of append2[0].
5665 w1
[0] = w1
[0] | append0
[0] << 8;
5666 w1
[1] = append0
[0] >> 24 | append0
[1] << 8;
5667 w1
[2] = append0
[1] >> 24 | append0
[2] << 8;
5668 w1
[3] = append0
[2] >> 24 | append0
[3] << 8;
5669 w2
[0] = append0
[3] >> 24 | append1
[0] << 8;
5670 w2
[1] = append1
[0] >> 24 | append1
[1] << 8;
5671 w2
[2] = append1
[1] >> 24 | append1
[2] << 8;
5672 w2
[3] = append1
[2] >> 24 | append1
[3] << 8;
5673 w3
[0] = append1
[3] >> 24 | append2
[0] << 8;
5674 w3
[1] = append2
[0] >> 24;
5678 w1
[0] = w1
[0] | append0
[0] << 16;
5679 w1
[1] = append0
[0] >> 16 | append0
[1] << 16;
5680 w1
[2] = append0
[1] >> 16 | append0
[2] << 16;
5681 w1
[3] = append0
[2] >> 16 | append0
[3] << 16;
5682 w2
[0] = append0
[3] >> 16 | append1
[0] << 16;
5683 w2
[1] = append1
[0] >> 16 | append1
[1] << 16;
5684 w2
[2] = append1
[1] >> 16 | append1
[2] << 16;
5685 w2
[3] = append1
[2] >> 16 | append1
[3] << 16;
5686 w3
[0] = append1
[3] >> 16 | append2
[0] << 16;
5687 w3
[1] = append2
[0] >> 16;
5691 w1
[0] = w1
[0] | append0
[0] << 24;
5692 w1
[1] = append0
[0] >> 8 | append0
[1] << 24;
5693 w1
[2] = append0
[1] >> 8 | append0
[2] << 24;
5694 w1
[3] = append0
[2] >> 8 | append0
[3] << 24;
5695 w2
[0] = append0
[3] >> 8 | append1
[0] << 24;
5696 w2
[1] = append1
[0] >> 8 | append1
[1] << 24;
5697 w2
[2] = append1
[1] >> 8 | append1
[2] << 24;
5698 w2
[3] = append1
[2] >> 8 | append1
[3] << 24;
5699 w3
[0] = append1
[3] >> 8 | append2
[0] << 24;
5700 w3
[1] = append2
[0] >> 8;
// Placements that start inside w1[1]: shift 8, then 16, then 24.
// These groups end at w3[1]; the high bits of append2[0] are not stored.
5716 w1
[1] = w1
[1] | append0
[0] << 8;
5717 w1
[2] = append0
[0] >> 24 | append0
[1] << 8;
5718 w1
[3] = append0
[1] >> 24 | append0
[2] << 8;
5719 w2
[0] = append0
[2] >> 24 | append0
[3] << 8;
5720 w2
[1] = append0
[3] >> 24 | append1
[0] << 8;
5721 w2
[2] = append1
[0] >> 24 | append1
[1] << 8;
5722 w2
[3] = append1
[1] >> 24 | append1
[2] << 8;
5723 w3
[0] = append1
[2] >> 24 | append1
[3] << 8;
5724 w3
[1] = append1
[3] >> 24 | append2
[0] << 8;
5728 w1
[1] = w1
[1] | append0
[0] << 16;
5729 w1
[2] = append0
[0] >> 16 | append0
[1] << 16;
5730 w1
[3] = append0
[1] >> 16 | append0
[2] << 16;
5731 w2
[0] = append0
[2] >> 16 | append0
[3] << 16;
5732 w2
[1] = append0
[3] >> 16 | append1
[0] << 16;
5733 w2
[2] = append1
[0] >> 16 | append1
[1] << 16;
5734 w2
[3] = append1
[1] >> 16 | append1
[2] << 16;
5735 w3
[0] = append1
[2] >> 16 | append1
[3] << 16;
5736 w3
[1] = append1
[3] >> 16 | append2
[0] << 16;
5740 w1
[1] = w1
[1] | append0
[0] << 24;
5741 w1
[2] = append0
[0] >> 8 | append0
[1] << 24;
5742 w1
[3] = append0
[1] >> 8 | append0
[2] << 24;
5743 w2
[0] = append0
[2] >> 8 | append0
[3] << 24;
5744 w2
[1] = append0
[3] >> 8 | append1
[0] << 24;
5745 w2
[2] = append1
[0] >> 8 | append1
[1] << 24;
5746 w2
[3] = append1
[1] >> 8 | append1
[2] << 24;
5747 w3
[0] = append1
[2] >> 8 | append1
[3] << 24;
5748 w3
[1] = append1
[3] >> 8 | append2
[0] << 24;
// Placements that start inside w1[2]: shift 8, then 16, then 24.
// These groups end at w3[1]; append2 is not read and the high bits of
// append1[3] are not stored.
5763 w1
[2] = w1
[2] | append0
[0] << 8;
5764 w1
[3] = append0
[0] >> 24 | append0
[1] << 8;
5765 w2
[0] = append0
[1] >> 24 | append0
[2] << 8;
5766 w2
[1] = append0
[2] >> 24 | append0
[3] << 8;
5767 w2
[2] = append0
[3] >> 24 | append1
[0] << 8;
5768 w2
[3] = append1
[0] >> 24 | append1
[1] << 8;
5769 w3
[0] = append1
[1] >> 24 | append1
[2] << 8;
5770 w3
[1] = append1
[2] >> 24 | append1
[3] << 8;
5774 w1
[2] = w1
[2] | append0
[0] << 16;
5775 w1
[3] = append0
[0] >> 16 | append0
[1] << 16;
5776 w2
[0] = append0
[1] >> 16 | append0
[2] << 16;
5777 w2
[1] = append0
[2] >> 16 | append0
[3] << 16;
5778 w2
[2] = append0
[3] >> 16 | append1
[0] << 16;
5779 w2
[3] = append1
[0] >> 16 | append1
[1] << 16;
5780 w3
[0] = append1
[1] >> 16 | append1
[2] << 16;
5781 w3
[1] = append1
[2] >> 16 | append1
[3] << 16;
5785 w1
[2] = w1
[2] | append0
[0] << 24;
5786 w1
[3] = append0
[0] >> 8 | append0
[1] << 24;
5787 w2
[0] = append0
[1] >> 8 | append0
[2] << 24;
5788 w2
[1] = append0
[2] >> 8 | append0
[3] << 24;
5789 w2
[2] = append0
[3] >> 8 | append1
[0] << 24;
5790 w2
[3] = append1
[0] >> 8 | append1
[1] << 24;
5791 w3
[0] = append1
[1] >> 8 | append1
[2] << 24;
5792 w3
[1] = append1
[2] >> 8 | append1
[3] << 24;
// Placements that start inside w1[3]: shift 8, then 16, then 24.
// These groups end at w3[1]; append1[3] and append2 are never read and the
// high bits of append1[2] are not stored.
5806 w1
[3] = w1
[3] | append0
[0] << 8;
5807 w2
[0] = append0
[0] >> 24 | append0
[1] << 8;
5808 w2
[1] = append0
[1] >> 24 | append0
[2] << 8;
5809 w2
[2] = append0
[2] >> 24 | append0
[3] << 8;
5810 w2
[3] = append0
[3] >> 24 | append1
[0] << 8;
5811 w3
[0] = append1
[0] >> 24 | append1
[1] << 8;
5812 w3
[1] = append1
[1] >> 24 | append1
[2] << 8;
5816 w1
[3] = w1
[3] | append0
[0] << 16;
5817 w2
[0] = append0
[0] >> 16 | append0
[1] << 16;
5818 w2
[1] = append0
[1] >> 16 | append0
[2] << 16;
5819 w2
[2] = append0
[2] >> 16 | append0
[3] << 16;
5820 w2
[3] = append0
[3] >> 16 | append1
[0] << 16;
5821 w3
[0] = append1
[0] >> 16 | append1
[1] << 16;
5822 w3
[1] = append1
[1] >> 16 | append1
[2] << 16;
5826 w1
[3] = w1
[3] | append0
[0] << 24;
5827 w2
[0] = append0
[0] >> 8 | append0
[1] << 24;
5828 w2
[1] = append0
[1] >> 8 | append0
[2] << 24;
5829 w2
[2] = append0
[2] >> 8 | append0
[3] << 24;
5830 w2
[3] = append0
[3] >> 8 | append1
[0] << 24;
5831 w3
[0] = append1
[0] >> 8 | append1
[1] << 24;
5832 w3
[1] = append1
[1] >> 8 | append1
[2] << 24;
5846 static void switch_buffer_by_offset (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
5849 const int offset_mod_4
= offset
& 3;
5851 const int offset_minus_4
= 4 - offset
;
5856 w3
[2] = amd_bytealign ( 0, w3
[1], offset_minus_4
);
5857 w3
[1] = amd_bytealign (w3
[1], w3
[0], offset_minus_4
);
5858 w3
[0] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
5859 w2
[3] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
5860 w2
[2] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
5861 w2
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5862 w2
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5863 w1
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5864 w1
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5865 w1
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5866 w1
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5867 w0
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5868 w0
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5869 w0
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5870 w0
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5872 if (offset_mod_4
== 0)
5894 w3
[2] = amd_bytealign ( 0, w3
[0], offset_minus_4
);
5895 w3
[1] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
5896 w3
[0] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
5897 w2
[3] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
5898 w2
[2] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5899 w2
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5900 w2
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5901 w1
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5902 w1
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5903 w1
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5904 w1
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5905 w0
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5906 w0
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5907 w0
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5910 if (offset_mod_4
== 0)
5931 w3
[2] = amd_bytealign ( 0, w2
[3], offset_minus_4
);
5932 w3
[1] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
5933 w3
[0] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
5934 w2
[3] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5935 w2
[2] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5936 w2
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5937 w2
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5938 w1
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5939 w1
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5940 w1
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5941 w1
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5942 w0
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5943 w0
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5947 if (offset_mod_4
== 0)
5967 w3
[2] = amd_bytealign ( 0, w2
[2], offset_minus_4
);
5968 w3
[1] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
5969 w3
[0] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5970 w2
[3] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5971 w2
[2] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5972 w2
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5973 w2
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5974 w1
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5975 w1
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5976 w1
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5977 w1
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5978 w0
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5983 if (offset_mod_4
== 0)
6002 w3
[2] = amd_bytealign ( 0, w2
[1], offset_minus_4
);
6003 w3
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
6004 w3
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
6005 w2
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6006 w2
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6007 w2
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6008 w2
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6009 w1
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6010 w1
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6011 w1
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6012 w1
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6018 if (offset_mod_4
== 0)
6036 w3
[2] = amd_bytealign ( 0, w2
[0], offset_minus_4
);
6037 w3
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
6038 w3
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6039 w2
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6040 w2
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6041 w2
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6042 w2
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6043 w1
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6044 w1
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6045 w1
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6052 if (offset_mod_4
== 0)
6069 w3
[2] = amd_bytealign ( 0, w1
[3], offset_minus_4
);
6070 w3
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
6071 w3
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6072 w2
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6073 w2
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6074 w2
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6075 w2
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6076 w1
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6077 w1
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6085 if (offset_mod_4
== 0)
6101 w3
[2] = amd_bytealign ( 0, w1
[2], offset_minus_4
);
6102 w3
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
6103 w3
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6104 w2
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6105 w2
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6106 w2
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6107 w2
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6108 w1
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6117 if (offset_mod_4
== 0)
6132 w3
[2] = amd_bytealign ( 0, w1
[1], offset_minus_4
);
6133 w3
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
6134 w3
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6135 w2
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6136 w2
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6137 w2
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6138 w2
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6148 if (offset_mod_4
== 0)
6162 w3
[2] = amd_bytealign ( 0, w1
[0], offset_minus_4
);
6163 w3
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
6164 w3
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6165 w2
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6166 w2
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6167 w2
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6178 if (offset_mod_4
== 0)
6191 w3
[2] = amd_bytealign ( 0, w0
[3], offset_minus_4
);
6192 w3
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
6193 w3
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6194 w2
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6195 w2
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6207 if (offset_mod_4
== 0)
6219 w3
[2] = amd_bytealign ( 0, w0
[2], offset_minus_4
);
6220 w3
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
6221 w3
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6222 w2
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6235 if (offset_mod_4
== 0)
6246 w3
[2] = amd_bytealign ( 0, w0
[1], offset_minus_4
);
6247 w3
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
6248 w3
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6262 if (offset_mod_4
== 0)
6272 w3
[2] = amd_bytealign ( 0, w0
[0], offset_minus_4
);
6273 w3
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
6288 if (offset_mod_4
== 0)
6299 const int offset_minus_4
= 4 - (offset
% 4);
6301 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
6306 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
6307 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
6308 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
6309 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
6310 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
6311 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
6312 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
6313 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
6314 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
6315 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
6316 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
6317 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
6318 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
6319 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
6324 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
6325 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
6326 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
6327 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
6328 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
6329 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
6330 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
6331 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
6332 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
6333 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
6334 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
6335 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
6336 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
6342 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
6343 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
6344 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
6345 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
6346 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
6347 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
6348 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
6349 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
6350 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
6351 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
6352 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
6353 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
6360 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
6361 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
6362 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
6363 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
6364 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
6365 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
6366 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
6367 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
6368 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
6369 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
6370 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
6378 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
6379 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
6380 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
6381 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
6382 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
6383 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
6384 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
6385 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
6386 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
6387 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
6396 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
6397 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
6398 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
6399 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
6400 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
6401 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
6402 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
6403 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
6404 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
6414 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
6415 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
6416 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
6417 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
6418 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
6419 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
6420 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
6421 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
6432 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
6433 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
6434 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
6435 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
6436 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
6437 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
6438 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
6450 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
6451 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
6452 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
6453 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
6454 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
6455 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
6468 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
6469 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
6470 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
6471 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
6472 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
6486 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
6487 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
6488 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
6489 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
6504 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
6505 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
6506 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
6522 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
6523 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
6540 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
/*
 * switch_buffer_by_offset_be
 *
 * Moves the contents of the word buffers w0, w1, w2, w3 (four u32 each)
 * towards higher word indices, in place, by an amount derived from `offset`.
 *
 * Every visible assignment rebuilds one destination word from two
 * neighbouring source words at the same or a lower position (or from one
 * source word and a literal 0 at the edge of the buffer). Within each run
 * the destinations are written from the highest word down, so a source
 * word is always read before the same run overwrites it -- the statement
 * order must be preserved.
 *
 * Two variants are visible below:
 *   - runs built on amd_bytealign (a, b, offset), highest destination w3[2];
 *   - runs built on __byte_perm (a, b, selector), highest destination w3[1].
 *
 * NOTE(review): how one run is picked (presumably by the whole-word part of
 * `offset`, i.e. offset / 4) and how one of the two variants is chosen
 * (presumably vendor-specific preprocessor guards) is not visible in this
 * listing -- confirm against the full file.
 * NOTE(review): the "_be" suffix suggests the words hold big-endian packed
 * bytes; verify against the callers.
 */
6560 static void switch_buffer_by_offset_be (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6566 w3
[2] = amd_bytealign (w3
[1], 0, offset
);
6567 w3
[1] = amd_bytealign (w3
[0], w3
[1], offset
);
6568 w3
[0] = amd_bytealign (w2
[3], w3
[0], offset
);
6569 w2
[3] = amd_bytealign (w2
[2], w2
[3], offset
);
6570 w2
[2] = amd_bytealign (w2
[1], w2
[2], offset
);
6571 w2
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
6572 w2
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
6573 w1
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
6574 w1
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
6575 w1
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
6576 w1
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
6577 w0
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
6578 w0
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
6579 w0
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
6580 w0
[0] = amd_bytealign ( 0, w0
[0], offset
);
6584 w3
[2] = amd_bytealign (w3
[0], 0, offset
);
6585 w3
[1] = amd_bytealign (w2
[3], w3
[0], offset
);
6586 w3
[0] = amd_bytealign (w2
[2], w2
[3], offset
);
6587 w2
[3] = amd_bytealign (w2
[1], w2
[2], offset
);
6588 w2
[2] = amd_bytealign (w2
[0], w2
[1], offset
);
6589 w2
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
6590 w2
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
6591 w1
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
6592 w1
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
6593 w1
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
6594 w1
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
6595 w0
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
6596 w0
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
6597 w0
[1] = amd_bytealign ( 0, w0
[0], offset
);
6602 w3
[2] = amd_bytealign (w2
[3], 0, offset
);
6603 w3
[1] = amd_bytealign (w2
[2], w2
[3], offset
);
6604 w3
[0] = amd_bytealign (w2
[1], w2
[2], offset
);
6605 w2
[3] = amd_bytealign (w2
[0], w2
[1], offset
);
6606 w2
[2] = amd_bytealign (w1
[3], w2
[0], offset
);
6607 w2
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
6608 w2
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
6609 w1
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
6610 w1
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
6611 w1
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
6612 w1
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
6613 w0
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
6614 w0
[2] = amd_bytealign ( 0, w0
[0], offset
);
6620 w3
[2] = amd_bytealign (w2
[2], 0, offset
);
6621 w3
[1] = amd_bytealign (w2
[1], w2
[2], offset
);
6622 w3
[0] = amd_bytealign (w2
[0], w2
[1], offset
);
6623 w2
[3] = amd_bytealign (w1
[3], w2
[0], offset
);
6624 w2
[2] = amd_bytealign (w1
[2], w1
[3], offset
);
6625 w2
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
6626 w2
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
6627 w1
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
6628 w1
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
6629 w1
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
6630 w1
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
6631 w0
[3] = amd_bytealign ( 0, w0
[0], offset
);
6638 w3
[2] = amd_bytealign (w2
[1], 0, offset
);
6639 w3
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
6640 w3
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
6641 w2
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
6642 w2
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
6643 w2
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
6644 w2
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
6645 w1
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
6646 w1
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
6647 w1
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
6648 w1
[0] = amd_bytealign ( 0, w0
[0], offset
);
6656 w3
[2] = amd_bytealign (w2
[0], 0, offset
);
6657 w3
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
6658 w3
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
6659 w2
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
6660 w2
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
6661 w2
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
6662 w2
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
6663 w1
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
6664 w1
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
6665 w1
[1] = amd_bytealign ( 0, w0
[0], offset
);
6674 w3
[2] = amd_bytealign (w1
[3], 0, offset
);
6675 w3
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
6676 w3
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
6677 w2
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
6678 w2
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
6679 w2
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
6680 w2
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
6681 w1
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
6682 w1
[2] = amd_bytealign ( 0, w0
[0], offset
);
6692 w3
[2] = amd_bytealign (w1
[2], 0, offset
);
6693 w3
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
6694 w3
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
6695 w2
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
6696 w2
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
6697 w2
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
6698 w2
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
6699 w1
[3] = amd_bytealign ( 0, w0
[0], offset
);
6710 w3
[2] = amd_bytealign (w1
[1], 0, offset
);
6711 w3
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
6712 w3
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
6713 w2
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
6714 w2
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
6715 w2
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
6716 w2
[0] = amd_bytealign ( 0, w0
[0], offset
);
6728 w3
[2] = amd_bytealign (w1
[0], 0, offset
);
6729 w3
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
6730 w3
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
6731 w2
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
6732 w2
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
6733 w2
[1] = amd_bytealign ( 0, w0
[0], offset
);
6746 w3
[2] = amd_bytealign (w0
[3], 0, offset
);
6747 w3
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
6748 w3
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
6749 w2
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
6750 w2
[2] = amd_bytealign ( 0, w0
[0], offset
);
6764 w3
[2] = amd_bytealign (w0
[2], 0, offset
);
6765 w3
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
6766 w3
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
6767 w2
[3] = amd_bytealign ( 0, w0
[0], offset
);
6782 w3
[2] = amd_bytealign (w0
[1], 0, offset
);
6783 w3
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
6784 w3
[0] = amd_bytealign ( 0, w0
[0], offset
);
6800 w3
[2] = amd_bytealign (w0
[0], 0, offset
);
6801 w3
[1] = amd_bytealign ( 0, w0
[0], offset
);
/*
 * Byte selector shared by all __byte_perm runs that follow. Only
 * offset & 3 enters the computation, giving one of four values:
 *   offset & 3 == 0 -> 0x3210
 *   offset & 3 == 1 -> 0x4321
 *   offset & 3 == 2 -> 0x5432
 *   offset & 3 == 3 -> 0x6543
 * NOTE(review): assuming the CUDA __byte_perm semantics (each selector
 * nibble picks one of the eight bytes of the two operands, 0-3 from the
 * first and 4-7 from the second), these values drop the low offset & 3
 * bytes of the first operand and fill the top with the low bytes of the
 * second -- confirm against the intrinsic's documentation.
 */
6820 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
6825 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
6826 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
6827 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
6828 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
6829 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
6830 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
6831 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
6832 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
6833 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
6834 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
6835 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
6836 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
6837 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
6838 w0
[0] = __byte_perm (w0
[0], 0, selector
);
6842 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
6843 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
6844 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
6845 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
6846 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
6847 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
6848 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
6849 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
6850 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
6851 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
6852 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
6853 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
6854 w0
[1] = __byte_perm (w0
[0], 0, selector
);
6859 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
6860 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
6861 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
6862 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
6863 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
6864 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
6865 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
6866 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
6867 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
6868 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
6869 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
6870 w0
[2] = __byte_perm (w0
[0], 0, selector
);
6876 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
6877 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
6878 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
6879 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
6880 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
6881 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
6882 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
6883 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
6884 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
6885 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
6886 w0
[3] = __byte_perm (w0
[0], 0, selector
);
6893 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
6894 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
6895 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
6896 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
6897 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
6898 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
6899 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
6900 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
6901 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
6902 w1
[0] = __byte_perm (w0
[0], 0, selector
);
6910 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
6911 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
6912 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
6913 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
6914 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
6915 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
6916 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
6917 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
6918 w1
[1] = __byte_perm (w0
[0], 0, selector
);
6927 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
6928 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
6929 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
6930 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
6931 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
6932 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
6933 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
6934 w1
[2] = __byte_perm (w0
[0], 0, selector
);
6944 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
6945 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
6946 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
6947 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
6948 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
6949 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
6950 w1
[3] = __byte_perm (w0
[0], 0, selector
);
6961 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
6962 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
6963 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
6964 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
6965 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
6966 w2
[0] = __byte_perm (w0
[0], 0, selector
);
6978 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
6979 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
6980 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
6981 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
6982 w2
[1] = __byte_perm (w0
[0], 0, selector
);
6995 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
6996 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
6997 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
6998 w2
[2] = __byte_perm (w0
[0], 0, selector
);
7012 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
7013 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
7014 w2
[3] = __byte_perm (w0
[0], 0, selector
);
7029 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
7030 w3
[0] = __byte_perm (w0
[0], 0, selector
);
7046 w3
[1] = __byte_perm (w0
[0], 0, selector
);
7065 /* not needed anymore?
7066 // before: append_0x80_2_be
7067 static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset)
7072 w0[0] |= 0x80000000;
7088 w0[1] |= 0x80000000;
7104 w0[2] |= 0x80000000;
7120 w0[3] |= 0x80000000;
7136 w1[0] |= 0x80000000;
7152 w1[1] |= 0x80000000;
7168 w1[2] |= 0x80000000;
7184 w1[3] |= 0x80000000;
7201 // before: append_0x80_4
7202 static void append_0x80_1x16 (u32 w[16], const u32 offset)
7211 w[ 0] = w[ 0] | 0x8000;
7215 w[ 0] = w[ 0] | 0x800000;
7219 w[ 0] = w[ 0] | 0x80000000;
7227 w[ 1] = w[ 1] | 0x8000;
7231 w[ 1] = w[ 1] | 0x800000;
7235 w[ 1] = w[ 1] | 0x80000000;
7243 w[ 2] = w[ 2] | 0x8000;
7247 w[ 2] = w[ 2] | 0x800000;
7251 w[ 2] = w[ 2] | 0x80000000;
7259 w[ 3] = w[ 3] | 0x8000;
7263 w[ 3] = w[ 3] | 0x800000;
7267 w[ 3] = w[ 3] | 0x80000000;
7275 w[ 4] = w[ 4] | 0x8000;
7279 w[ 4] = w[ 4] | 0x800000;
7283 w[ 4] = w[ 4] | 0x80000000;
7291 w[ 5] = w[ 5] | 0x8000;
7295 w[ 5] = w[ 5] | 0x800000;
7299 w[ 5] = w[ 5] | 0x80000000;
7307 w[ 6] = w[ 6] | 0x8000;
7311 w[ 6] = w[ 6] | 0x800000;
7315 w[ 6] = w[ 6] | 0x80000000;
7323 w[ 7] = w[ 7] | 0x8000;
7327 w[ 7] = w[ 7] | 0x800000;
7331 w[ 7] = w[ 7] | 0x80000000;
7339 w[ 8] = w[ 8] | 0x8000;
7343 w[ 8] = w[ 8] | 0x800000;
7347 w[ 8] = w[ 8] | 0x80000000;
7355 w[ 9] = w[ 9] | 0x8000;
7359 w[ 9] = w[ 9] | 0x800000;
7363 w[ 9] = w[ 9] | 0x80000000;
7371 w[10] = w[10] | 0x8000;
7375 w[10] = w[10] | 0x800000;
7379 w[10] = w[10] | 0x80000000;
7387 w[11] = w[11] | 0x8000;
7391 w[11] = w[11] | 0x800000;
7395 w[11] = w[11] | 0x80000000;
7403 w[12] = w[12] | 0x8000;
7407 w[12] = w[12] | 0x800000;
7411 w[12] = w[12] | 0x80000000;
7419 w[13] = w[13] | 0x8000;
7423 w[13] = w[13] | 0x800000;
7427 w[13] = w[13] | 0x80000000;
7435 w[14] = w[14] | 0x8000;
7439 w[14] = w[14] | 0x800000;
7443 w[14] = w[14] | 0x80000000;
7451 w[15] = w[15] | 0x8000;
7455 w[15] = w[15] | 0x800000;
7459 w[15] = w[15] | 0x80000000;
7464 // before: append_0x80_8
7465 static void append_0x80_1x32 (u32 w[32], const u32 offset)
7474 w[ 0] = w[ 0] | 0x8000;
7478 w[ 0] = w[ 0] | 0x800000;
7482 w[ 0] = w[ 0] | 0x80000000;
7490 w[ 1] = w[ 1] | 0x8000;
7494 w[ 1] = w[ 1] | 0x800000;
7498 w[ 1] = w[ 1] | 0x80000000;
7506 w[ 2] = w[ 2] | 0x8000;
7510 w[ 2] = w[ 2] | 0x800000;
7514 w[ 2] = w[ 2] | 0x80000000;
7522 w[ 3] = w[ 3] | 0x8000;
7526 w[ 3] = w[ 3] | 0x800000;
7530 w[ 3] = w[ 3] | 0x80000000;
7538 w[ 4] = w[ 4] | 0x8000;
7542 w[ 4] = w[ 4] | 0x800000;
7546 w[ 4] = w[ 4] | 0x80000000;
7554 w[ 5] = w[ 5] | 0x8000;
7558 w[ 5] = w[ 5] | 0x800000;
7562 w[ 5] = w[ 5] | 0x80000000;
7570 w[ 6] = w[ 6] | 0x8000;
7574 w[ 6] = w[ 6] | 0x800000;
7578 w[ 6] = w[ 6] | 0x80000000;
7586 w[ 7] = w[ 7] | 0x8000;
7590 w[ 7] = w[ 7] | 0x800000;
7594 w[ 7] = w[ 7] | 0x80000000;
7602 w[ 8] = w[ 8] | 0x8000;
7606 w[ 8] = w[ 8] | 0x800000;
7610 w[ 8] = w[ 8] | 0x80000000;
7618 w[ 9] = w[ 9] | 0x8000;
7622 w[ 9] = w[ 9] | 0x800000;
7626 w[ 9] = w[ 9] | 0x80000000;
7634 w[10] = w[10] | 0x8000;
7638 w[10] = w[10] | 0x800000;
7642 w[10] = w[10] | 0x80000000;
7650 w[11] = w[11] | 0x8000;
7654 w[11] = w[11] | 0x800000;
7658 w[11] = w[11] | 0x80000000;
7666 w[12] = w[12] | 0x8000;
7670 w[12] = w[12] | 0x800000;
7674 w[12] = w[12] | 0x80000000;
7682 w[13] = w[13] | 0x8000;
7686 w[13] = w[13] | 0x800000;
7690 w[13] = w[13] | 0x80000000;
7698 w[14] = w[14] | 0x8000;
7702 w[14] = w[14] | 0x800000;
7706 w[14] = w[14] | 0x80000000;
7714 w[15] = w[15] | 0x8000;
7718 w[15] = w[15] | 0x800000;
7722 w[15] = w[15] | 0x80000000;
7730 w[16] = w[16] | 0x8000;
7734 w[16] = w[16] | 0x800000;
7738 w[16] = w[16] | 0x80000000;
7746 w[17] = w[17] | 0x8000;
7750 w[17] = w[17] | 0x800000;
7754 w[17] = w[17] | 0x80000000;
7762 w[18] = w[18] | 0x8000;
7766 w[18] = w[18] | 0x800000;
7770 w[18] = w[18] | 0x80000000;
7778 w[19] = w[19] | 0x8000;
7782 w[19] = w[19] | 0x800000;
7786 w[19] = w[19] | 0x80000000;
7794 w[20] = w[20] | 0x8000;
7798 w[20] = w[20] | 0x800000;
7802 w[20] = w[20] | 0x80000000;
7810 w[21] = w[21] | 0x8000;
7814 w[21] = w[21] | 0x800000;
7818 w[21] = w[21] | 0x80000000;
7826 w[22] = w[22] | 0x8000;
7830 w[22] = w[22] | 0x800000;
7834 w[22] = w[22] | 0x80000000;
7842 w[23] = w[23] | 0x8000;
7846 w[23] = w[23] | 0x800000;
7850 w[23] = w[23] | 0x80000000;
7858 w[24] = w[24] | 0x8000;
7862 w[24] = w[24] | 0x800000;
7866 w[24] = w[24] | 0x80000000;
7874 w[25] = w[25] | 0x8000;
7878 w[25] = w[25] | 0x800000;
7882 w[25] = w[25] | 0x80000000;
7890 w[26] = w[26] | 0x8000;
7894 w[26] = w[26] | 0x800000;
7898 w[26] = w[26] | 0x80000000;
7906 w[27] = w[27] | 0x8000;
7910 w[27] = w[27] | 0x800000;
7914 w[27] = w[27] | 0x80000000;
7922 w[28] = w[28] | 0x8000;
7926 w[28] = w[28] | 0x800000;
7930 w[28] = w[28] | 0x80000000;
7938 w[29] = w[29] | 0x8000;
7942 w[29] = w[29] | 0x800000;
7946 w[29] = w[29] | 0x80000000;
7954 w[30] = w[30] | 0x8000;
7958 w[30] = w[30] | 0x800000;
7962 w[30] = w[30] | 0x80000000;
7970 w[31] = w[31] | 0x8000;
7974 w[31] = w[31] | 0x800000;
7978 w[31] = w[31] | 0x80000000;