cdef71738485fe1de33ae9d0deff971812526741
2 * Author......: Jens Steube <jens.steube@gmail.com>
6 static int hash_comp (const u32 d1
[4], __global u32
*d2
)
8 if (d1
[3] > d2
[DGST_R3
]) return ( 1);
9 if (d1
[3] < d2
[DGST_R3
]) return (-1);
10 if (d1
[2] > d2
[DGST_R2
]) return ( 1);
11 if (d1
[2] < d2
[DGST_R2
]) return (-1);
12 if (d1
[1] > d2
[DGST_R1
]) return ( 1);
13 if (d1
[1] < d2
[DGST_R1
]) return (-1);
14 if (d1
[0] > d2
[DGST_R0
]) return ( 1);
15 if (d1
[0] < d2
[DGST_R0
]) return (-1);
20 static int find_hash (const u32 digest
[4], const u32 digests_cnt
, __global digest_t
*digests_buf
)
22 for (u32 l
= 0, r
= digests_cnt
; r
; r
>>= 1)
28 const int cmp
= hash_comp (digest
, digests_buf
[c
].digest_buf
);
37 if (cmp
== 0) return (c
);
43 static u32
check_bitmap (__global u32
*bitmap
, const u32 bitmap_mask
, const u32 bitmap_shift
, const u32 digest
)
45 return (bitmap
[(digest
>> bitmap_shift
) & bitmap_mask
] & (1 << (digest
& 0x1f)));
48 static u32
check (const u32 digest
[2], __global u32
*bitmap_s1_a
, __global u32
*bitmap_s1_b
, __global u32
*bitmap_s1_c
, __global u32
*bitmap_s1_d
, __global u32
*bitmap_s2_a
, __global u32
*bitmap_s2_b
, __global u32
*bitmap_s2_c
, __global u32
*bitmap_s2_d
, const u32 bitmap_mask
, const u32 bitmap_shift1
, const u32 bitmap_shift2
)
50 if (check_bitmap (bitmap_s1_a
, bitmap_mask
, bitmap_shift1
, digest
[0]) == 0) return (0);
51 if (check_bitmap (bitmap_s1_b
, bitmap_mask
, bitmap_shift1
, digest
[1]) == 0) return (0);
52 if (check_bitmap (bitmap_s1_c
, bitmap_mask
, bitmap_shift1
, digest
[2]) == 0) return (0);
53 if (check_bitmap (bitmap_s1_d
, bitmap_mask
, bitmap_shift1
, digest
[3]) == 0) return (0);
55 if (check_bitmap (bitmap_s2_a
, bitmap_mask
, bitmap_shift2
, digest
[0]) == 0) return (0);
56 if (check_bitmap (bitmap_s2_b
, bitmap_mask
, bitmap_shift2
, digest
[1]) == 0) return (0);
57 if (check_bitmap (bitmap_s2_c
, bitmap_mask
, bitmap_shift2
, digest
[2]) == 0) return (0);
58 if (check_bitmap (bitmap_s2_d
, bitmap_mask
, bitmap_shift2
, digest
[3]) == 0) return (0);
63 static void mark_hash (__global plain_t
*plains_buf
, __global u32
*hashes_shown
, const int hash_pos
, const u32 gid
, const u32 il_pos
)
65 hashes_shown
[hash_pos
] = 1;
67 plains_buf
[hash_pos
].gidvid
= (gid
* 1) + 0;
68 plains_buf
[hash_pos
].il_pos
= il_pos
;
71 static void truncate_block (u32 w
[4], const u32 len
)
80 case 1: w
[0] &= 0x000000FF;
85 case 2: w
[0] &= 0x0000FFFF;
90 case 3: w
[0] &= 0x00FFFFFF;
99 case 5: w
[1] &= 0x000000FF;
103 case 6: w
[1] &= 0x0000FFFF;
107 case 7: w
[1] &= 0x00FFFFFF;
114 case 9: w
[2] &= 0x000000FF;
117 case 10: w
[2] &= 0x0000FFFF;
120 case 11: w
[2] &= 0x00FFFFFF;
125 case 13: w
[3] &= 0x000000FF;
127 case 14: w
[3] &= 0x0000FFFF;
129 case 15: w
[3] &= 0x00FFFFFF;
134 static void make_unicode (const u32 in
[4], u32 out1
[4], u32 out2
[4])
137 out2
[3] = __byte_perm (in
[3], 0, 0x7372);
138 out2
[2] = __byte_perm (in
[3], 0, 0x7170);
139 out2
[1] = __byte_perm (in
[2], 0, 0x7372);
140 out2
[0] = __byte_perm (in
[2], 0, 0x7170);
141 out1
[3] = __byte_perm (in
[1], 0, 0x7372);
142 out1
[2] = __byte_perm (in
[1], 0, 0x7170);
143 out1
[1] = __byte_perm (in
[0], 0, 0x7372);
144 out1
[0] = __byte_perm (in
[0], 0, 0x7170);
147 #if defined IS_AMD || defined IS_GENERIC
148 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
149 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
150 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
151 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
152 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
153 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
154 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
155 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
159 static void undo_unicode (const u32 in1
[4], const u32 in2
[4], u32 out
[4])
162 out
[0] = __byte_perm (in1
[0], in1
[1], 0x6420);
163 out
[1] = __byte_perm (in1
[2], in1
[3], 0x6420);
164 out
[2] = __byte_perm (in2
[0], in2
[1], 0x6420);
165 out
[3] = __byte_perm (in2
[2], in2
[3], 0x6420);
168 #if defined IS_AMD || defined IS_GENERIC
169 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
170 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
171 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
172 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
173 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
174 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
175 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
176 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
180 static void append_0x01_1x4 (u32 w0
[4], const u32 offset
)
189 w0
[0] = w0
[0] | 0x0100;
193 w0
[0] = w0
[0] | 0x010000;
197 w0
[0] = w0
[0] | 0x01000000;
205 w0
[1] = w0
[1] | 0x0100;
209 w0
[1] = w0
[1] | 0x010000;
213 w0
[1] = w0
[1] | 0x01000000;
221 w0
[2] = w0
[2] | 0x0100;
225 w0
[2] = w0
[2] | 0x010000;
229 w0
[2] = w0
[2] | 0x01000000;
237 w0
[3] = w0
[3] | 0x0100;
241 w0
[3] = w0
[3] | 0x010000;
245 w0
[3] = w0
[3] | 0x01000000;
250 static void append_0x01_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
259 w0
[0] = w0
[0] | 0x0100;
263 w0
[0] = w0
[0] | 0x010000;
267 w0
[0] = w0
[0] | 0x01000000;
275 w0
[1] = w0
[1] | 0x0100;
279 w0
[1] = w0
[1] | 0x010000;
283 w0
[1] = w0
[1] | 0x01000000;
291 w0
[2] = w0
[2] | 0x0100;
295 w0
[2] = w0
[2] | 0x010000;
299 w0
[2] = w0
[2] | 0x01000000;
307 w0
[3] = w0
[3] | 0x0100;
311 w0
[3] = w0
[3] | 0x010000;
315 w0
[3] = w0
[3] | 0x01000000;
323 w1
[0] = w1
[0] | 0x0100;
327 w1
[0] = w1
[0] | 0x010000;
331 w1
[0] = w1
[0] | 0x01000000;
339 w1
[1] = w1
[1] | 0x0100;
343 w1
[1] = w1
[1] | 0x010000;
347 w1
[1] = w1
[1] | 0x01000000;
355 w1
[2] = w1
[2] | 0x0100;
359 w1
[2] = w1
[2] | 0x010000;
363 w1
[2] = w1
[2] | 0x01000000;
371 w1
[3] = w1
[3] | 0x0100;
375 w1
[3] = w1
[3] | 0x010000;
379 w1
[3] = w1
[3] | 0x01000000;
384 static void append_0x01_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
393 w0
[0] = w0
[0] | 0x0100;
397 w0
[0] = w0
[0] | 0x010000;
401 w0
[0] = w0
[0] | 0x01000000;
409 w0
[1] = w0
[1] | 0x0100;
413 w0
[1] = w0
[1] | 0x010000;
417 w0
[1] = w0
[1] | 0x01000000;
425 w0
[2] = w0
[2] | 0x0100;
429 w0
[2] = w0
[2] | 0x010000;
433 w0
[2] = w0
[2] | 0x01000000;
441 w0
[3] = w0
[3] | 0x0100;
445 w0
[3] = w0
[3] | 0x010000;
449 w0
[3] = w0
[3] | 0x01000000;
457 w1
[0] = w1
[0] | 0x0100;
461 w1
[0] = w1
[0] | 0x010000;
465 w1
[0] = w1
[0] | 0x01000000;
473 w1
[1] = w1
[1] | 0x0100;
477 w1
[1] = w1
[1] | 0x010000;
481 w1
[1] = w1
[1] | 0x01000000;
489 w1
[2] = w1
[2] | 0x0100;
493 w1
[2] = w1
[2] | 0x010000;
497 w1
[2] = w1
[2] | 0x01000000;
505 w1
[3] = w1
[3] | 0x0100;
509 w1
[3] = w1
[3] | 0x010000;
513 w1
[3] = w1
[3] | 0x01000000;
521 w2
[0] = w2
[0] | 0x0100;
525 w2
[0] = w2
[0] | 0x010000;
529 w2
[0] = w2
[0] | 0x01000000;
537 w2
[1] = w2
[1] | 0x0100;
541 w2
[1] = w2
[1] | 0x010000;
545 w2
[1] = w2
[1] | 0x01000000;
553 w2
[2] = w2
[2] | 0x0100;
557 w2
[2] = w2
[2] | 0x010000;
561 w2
[2] = w2
[2] | 0x01000000;
569 w2
[3] = w2
[3] | 0x0100;
573 w2
[3] = w2
[3] | 0x010000;
577 w2
[3] = w2
[3] | 0x01000000;
582 static void append_0x01_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
591 w0
[0] = w0
[0] | 0x0100;
595 w0
[0] = w0
[0] | 0x010000;
599 w0
[0] = w0
[0] | 0x01000000;
607 w0
[1] = w0
[1] | 0x0100;
611 w0
[1] = w0
[1] | 0x010000;
615 w0
[1] = w0
[1] | 0x01000000;
623 w0
[2] = w0
[2] | 0x0100;
627 w0
[2] = w0
[2] | 0x010000;
631 w0
[2] = w0
[2] | 0x01000000;
639 w0
[3] = w0
[3] | 0x0100;
643 w0
[3] = w0
[3] | 0x010000;
647 w0
[3] = w0
[3] | 0x01000000;
655 w1
[0] = w1
[0] | 0x0100;
659 w1
[0] = w1
[0] | 0x010000;
663 w1
[0] = w1
[0] | 0x01000000;
671 w1
[1] = w1
[1] | 0x0100;
675 w1
[1] = w1
[1] | 0x010000;
679 w1
[1] = w1
[1] | 0x01000000;
687 w1
[2] = w1
[2] | 0x0100;
691 w1
[2] = w1
[2] | 0x010000;
695 w1
[2] = w1
[2] | 0x01000000;
703 w1
[3] = w1
[3] | 0x0100;
707 w1
[3] = w1
[3] | 0x010000;
711 w1
[3] = w1
[3] | 0x01000000;
719 w2
[0] = w2
[0] | 0x0100;
723 w2
[0] = w2
[0] | 0x010000;
727 w2
[0] = w2
[0] | 0x01000000;
735 w2
[1] = w2
[1] | 0x0100;
739 w2
[1] = w2
[1] | 0x010000;
743 w2
[1] = w2
[1] | 0x01000000;
751 w2
[2] = w2
[2] | 0x0100;
755 w2
[2] = w2
[2] | 0x010000;
759 w2
[2] = w2
[2] | 0x01000000;
767 w2
[3] = w2
[3] | 0x0100;
771 w2
[3] = w2
[3] | 0x010000;
775 w2
[3] = w2
[3] | 0x01000000;
783 w3
[0] = w3
[0] | 0x0100;
787 w3
[0] = w3
[0] | 0x010000;
791 w3
[0] = w3
[0] | 0x01000000;
799 w3
[1] = w3
[1] | 0x0100;
803 w3
[1] = w3
[1] | 0x010000;
807 w3
[1] = w3
[1] | 0x01000000;
815 w3
[2] = w3
[2] | 0x0100;
819 w3
[2] = w3
[2] | 0x010000;
823 w3
[2] = w3
[2] | 0x01000000;
831 w3
[3] = w3
[3] | 0x0100;
835 w3
[3] = w3
[3] | 0x010000;
839 w3
[3] = w3
[3] | 0x01000000;
844 static void append_0x01_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
853 w0
[0] = w0
[0] | 0x0100;
857 w0
[0] = w0
[0] | 0x010000;
861 w0
[0] = w0
[0] | 0x01000000;
869 w0
[1] = w0
[1] | 0x0100;
873 w0
[1] = w0
[1] | 0x010000;
877 w0
[1] = w0
[1] | 0x01000000;
885 w0
[2] = w0
[2] | 0x0100;
889 w0
[2] = w0
[2] | 0x010000;
893 w0
[2] = w0
[2] | 0x01000000;
901 w0
[3] = w0
[3] | 0x0100;
905 w0
[3] = w0
[3] | 0x010000;
909 w0
[3] = w0
[3] | 0x01000000;
917 w1
[0] = w1
[0] | 0x0100;
921 w1
[0] = w1
[0] | 0x010000;
925 w1
[0] = w1
[0] | 0x01000000;
933 w1
[1] = w1
[1] | 0x0100;
937 w1
[1] = w1
[1] | 0x010000;
941 w1
[1] = w1
[1] | 0x01000000;
949 w1
[2] = w1
[2] | 0x0100;
953 w1
[2] = w1
[2] | 0x010000;
957 w1
[2] = w1
[2] | 0x01000000;
965 w1
[3] = w1
[3] | 0x0100;
969 w1
[3] = w1
[3] | 0x010000;
973 w1
[3] = w1
[3] | 0x01000000;
981 w2
[0] = w2
[0] | 0x0100;
985 w2
[0] = w2
[0] | 0x010000;
989 w2
[0] = w2
[0] | 0x01000000;
997 w2
[1] = w2
[1] | 0x0100;
1001 w2
[1] = w2
[1] | 0x010000;
1005 w2
[1] = w2
[1] | 0x01000000;
1013 w2
[2] = w2
[2] | 0x0100;
1017 w2
[2] = w2
[2] | 0x010000;
1021 w2
[2] = w2
[2] | 0x01000000;
1029 w2
[3] = w2
[3] | 0x0100;
1033 w2
[3] = w2
[3] | 0x010000;
1037 w2
[3] = w2
[3] | 0x01000000;
1045 w3
[0] = w3
[0] | 0x0100;
1049 w3
[0] = w3
[0] | 0x010000;
1053 w3
[0] = w3
[0] | 0x01000000;
1061 w3
[1] = w3
[1] | 0x0100;
1065 w3
[1] = w3
[1] | 0x010000;
1069 w3
[1] = w3
[1] | 0x01000000;
1077 w3
[2] = w3
[2] | 0x0100;
1081 w3
[2] = w3
[2] | 0x010000;
1085 w3
[2] = w3
[2] | 0x01000000;
1093 w3
[3] = w3
[3] | 0x0100;
1097 w3
[3] = w3
[3] | 0x010000;
1101 w3
[3] = w3
[3] | 0x01000000;
1109 w4
[0] = w4
[0] | 0x0100;
1113 w4
[0] = w4
[0] | 0x010000;
1117 w4
[0] = w4
[0] | 0x01000000;
1125 w4
[1] = w4
[1] | 0x0100;
1129 w4
[1] = w4
[1] | 0x010000;
1133 w4
[1] = w4
[1] | 0x01000000;
1141 w4
[2] = w4
[2] | 0x0100;
1145 w4
[2] = w4
[2] | 0x010000;
1149 w4
[2] = w4
[2] | 0x01000000;
1157 w4
[3] = w4
[3] | 0x0100;
1161 w4
[3] = w4
[3] | 0x010000;
1165 w4
[3] = w4
[3] | 0x01000000;
1173 w5
[0] = w5
[0] | 0x0100;
1177 w5
[0] = w5
[0] | 0x010000;
1181 w5
[0] = w5
[0] | 0x01000000;
1189 w5
[1] = w5
[1] | 0x0100;
1193 w5
[1] = w5
[1] | 0x010000;
1197 w5
[1] = w5
[1] | 0x01000000;
1205 w5
[2] = w5
[2] | 0x0100;
1209 w5
[2] = w5
[2] | 0x010000;
1213 w5
[2] = w5
[2] | 0x01000000;
1221 w5
[3] = w5
[3] | 0x0100;
1225 w5
[3] = w5
[3] | 0x010000;
1229 w5
[3] = w5
[3] | 0x01000000;
1237 w6
[0] = w6
[0] | 0x0100;
1241 w6
[0] = w6
[0] | 0x010000;
1245 w6
[0] = w6
[0] | 0x01000000;
1253 w6
[1] = w6
[1] | 0x0100;
1257 w6
[1] = w6
[1] | 0x010000;
1261 w6
[1] = w6
[1] | 0x01000000;
1269 w6
[2] = w6
[2] | 0x0100;
1273 w6
[2] = w6
[2] | 0x010000;
1277 w6
[2] = w6
[2] | 0x01000000;
1285 w6
[3] = w6
[3] | 0x0100;
1289 w6
[3] = w6
[3] | 0x010000;
1293 w6
[3] = w6
[3] | 0x01000000;
1301 w7
[0] = w7
[0] | 0x0100;
1305 w7
[0] = w7
[0] | 0x010000;
1309 w7
[0] = w7
[0] | 0x01000000;
1317 w7
[1] = w7
[1] | 0x0100;
1321 w7
[1] = w7
[1] | 0x010000;
1325 w7
[1] = w7
[1] | 0x01000000;
1333 w7
[2] = w7
[2] | 0x0100;
1337 w7
[2] = w7
[2] | 0x010000;
1341 w7
[2] = w7
[2] | 0x01000000;
1349 w7
[3] = w7
[3] | 0x0100;
1353 w7
[3] = w7
[3] | 0x010000;
1357 w7
[3] = w7
[3] | 0x01000000;
1362 static void append_0x02_1x4 (u32 w0
[4], const u32 offset
)
1371 w0
[0] = w0
[0] | 0x0200;
1375 w0
[0] = w0
[0] | 0x020000;
1379 w0
[0] = w0
[0] | 0x02000000;
1387 w0
[1] = w0
[1] | 0x0200;
1391 w0
[1] = w0
[1] | 0x020000;
1395 w0
[1] = w0
[1] | 0x02000000;
1403 w0
[2] = w0
[2] | 0x0200;
1407 w0
[2] = w0
[2] | 0x020000;
1411 w0
[2] = w0
[2] | 0x02000000;
1419 w0
[3] = w0
[3] | 0x0200;
1423 w0
[3] = w0
[3] | 0x020000;
1427 w0
[3] = w0
[3] | 0x02000000;
1432 static void append_0x02_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
1441 w0
[0] = w0
[0] | 0x0200;
1445 w0
[0] = w0
[0] | 0x020000;
1449 w0
[0] = w0
[0] | 0x02000000;
1457 w0
[1] = w0
[1] | 0x0200;
1461 w0
[1] = w0
[1] | 0x020000;
1465 w0
[1] = w0
[1] | 0x02000000;
1473 w0
[2] = w0
[2] | 0x0200;
1477 w0
[2] = w0
[2] | 0x020000;
1481 w0
[2] = w0
[2] | 0x02000000;
1489 w0
[3] = w0
[3] | 0x0200;
1493 w0
[3] = w0
[3] | 0x020000;
1497 w0
[3] = w0
[3] | 0x02000000;
1505 w1
[0] = w1
[0] | 0x0200;
1509 w1
[0] = w1
[0] | 0x020000;
1513 w1
[0] = w1
[0] | 0x02000000;
1521 w1
[1] = w1
[1] | 0x0200;
1525 w1
[1] = w1
[1] | 0x020000;
1529 w1
[1] = w1
[1] | 0x02000000;
1537 w1
[2] = w1
[2] | 0x0200;
1541 w1
[2] = w1
[2] | 0x020000;
1545 w1
[2] = w1
[2] | 0x02000000;
1553 w1
[3] = w1
[3] | 0x0200;
1557 w1
[3] = w1
[3] | 0x020000;
1561 w1
[3] = w1
[3] | 0x02000000;
1566 static void append_0x02_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
1575 w0
[0] = w0
[0] | 0x0200;
1579 w0
[0] = w0
[0] | 0x020000;
1583 w0
[0] = w0
[0] | 0x02000000;
1591 w0
[1] = w0
[1] | 0x0200;
1595 w0
[1] = w0
[1] | 0x020000;
1599 w0
[1] = w0
[1] | 0x02000000;
1607 w0
[2] = w0
[2] | 0x0200;
1611 w0
[2] = w0
[2] | 0x020000;
1615 w0
[2] = w0
[2] | 0x02000000;
1623 w0
[3] = w0
[3] | 0x0200;
1627 w0
[3] = w0
[3] | 0x020000;
1631 w0
[3] = w0
[3] | 0x02000000;
1639 w1
[0] = w1
[0] | 0x0200;
1643 w1
[0] = w1
[0] | 0x020000;
1647 w1
[0] = w1
[0] | 0x02000000;
1655 w1
[1] = w1
[1] | 0x0200;
1659 w1
[1] = w1
[1] | 0x020000;
1663 w1
[1] = w1
[1] | 0x02000000;
1671 w1
[2] = w1
[2] | 0x0200;
1675 w1
[2] = w1
[2] | 0x020000;
1679 w1
[2] = w1
[2] | 0x02000000;
1687 w1
[3] = w1
[3] | 0x0200;
1691 w1
[3] = w1
[3] | 0x020000;
1695 w1
[3] = w1
[3] | 0x02000000;
1703 w2
[0] = w2
[0] | 0x0200;
1707 w2
[0] = w2
[0] | 0x020000;
1711 w2
[0] = w2
[0] | 0x02000000;
1719 w2
[1] = w2
[1] | 0x0200;
1723 w2
[1] = w2
[1] | 0x020000;
1727 w2
[1] = w2
[1] | 0x02000000;
1735 w2
[2] = w2
[2] | 0x0200;
1739 w2
[2] = w2
[2] | 0x020000;
1743 w2
[2] = w2
[2] | 0x02000000;
1751 w2
[3] = w2
[3] | 0x0200;
1755 w2
[3] = w2
[3] | 0x020000;
1759 w2
[3] = w2
[3] | 0x02000000;
1764 static void append_0x02_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
1773 w0
[0] = w0
[0] | 0x0200;
1777 w0
[0] = w0
[0] | 0x020000;
1781 w0
[0] = w0
[0] | 0x02000000;
1789 w0
[1] = w0
[1] | 0x0200;
1793 w0
[1] = w0
[1] | 0x020000;
1797 w0
[1] = w0
[1] | 0x02000000;
1805 w0
[2] = w0
[2] | 0x0200;
1809 w0
[2] = w0
[2] | 0x020000;
1813 w0
[2] = w0
[2] | 0x02000000;
1821 w0
[3] = w0
[3] | 0x0200;
1825 w0
[3] = w0
[3] | 0x020000;
1829 w0
[3] = w0
[3] | 0x02000000;
1837 w1
[0] = w1
[0] | 0x0200;
1841 w1
[0] = w1
[0] | 0x020000;
1845 w1
[0] = w1
[0] | 0x02000000;
1853 w1
[1] = w1
[1] | 0x0200;
1857 w1
[1] = w1
[1] | 0x020000;
1861 w1
[1] = w1
[1] | 0x02000000;
1869 w1
[2] = w1
[2] | 0x0200;
1873 w1
[2] = w1
[2] | 0x020000;
1877 w1
[2] = w1
[2] | 0x02000000;
1885 w1
[3] = w1
[3] | 0x0200;
1889 w1
[3] = w1
[3] | 0x020000;
1893 w1
[3] = w1
[3] | 0x02000000;
1901 w2
[0] = w2
[0] | 0x0200;
1905 w2
[0] = w2
[0] | 0x020000;
1909 w2
[0] = w2
[0] | 0x02000000;
1917 w2
[1] = w2
[1] | 0x0200;
1921 w2
[1] = w2
[1] | 0x020000;
1925 w2
[1] = w2
[1] | 0x02000000;
1933 w2
[2] = w2
[2] | 0x0200;
1937 w2
[2] = w2
[2] | 0x020000;
1941 w2
[2] = w2
[2] | 0x02000000;
1949 w2
[3] = w2
[3] | 0x0200;
1953 w2
[3] = w2
[3] | 0x020000;
1957 w2
[3] = w2
[3] | 0x02000000;
1965 w3
[0] = w3
[0] | 0x0200;
1969 w3
[0] = w3
[0] | 0x020000;
1973 w3
[0] = w3
[0] | 0x02000000;
1981 w3
[1] = w3
[1] | 0x0200;
1985 w3
[1] = w3
[1] | 0x020000;
1989 w3
[1] = w3
[1] | 0x02000000;
1997 w3
[2] = w3
[2] | 0x0200;
2001 w3
[2] = w3
[2] | 0x020000;
2005 w3
[2] = w3
[2] | 0x02000000;
2013 w3
[3] = w3
[3] | 0x0200;
2017 w3
[3] = w3
[3] | 0x020000;
2021 w3
[3] = w3
[3] | 0x02000000;
2026 static void append_0x02_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
2035 w0
[0] = w0
[0] | 0x0200;
2039 w0
[0] = w0
[0] | 0x020000;
2043 w0
[0] = w0
[0] | 0x02000000;
2051 w0
[1] = w0
[1] | 0x0200;
2055 w0
[1] = w0
[1] | 0x020000;
2059 w0
[1] = w0
[1] | 0x02000000;
2067 w0
[2] = w0
[2] | 0x0200;
2071 w0
[2] = w0
[2] | 0x020000;
2075 w0
[2] = w0
[2] | 0x02000000;
2083 w0
[3] = w0
[3] | 0x0200;
2087 w0
[3] = w0
[3] | 0x020000;
2091 w0
[3] = w0
[3] | 0x02000000;
2099 w1
[0] = w1
[0] | 0x0200;
2103 w1
[0] = w1
[0] | 0x020000;
2107 w1
[0] = w1
[0] | 0x02000000;
2115 w1
[1] = w1
[1] | 0x0200;
2119 w1
[1] = w1
[1] | 0x020000;
2123 w1
[1] = w1
[1] | 0x02000000;
2131 w1
[2] = w1
[2] | 0x0200;
2135 w1
[2] = w1
[2] | 0x020000;
2139 w1
[2] = w1
[2] | 0x02000000;
2147 w1
[3] = w1
[3] | 0x0200;
2151 w1
[3] = w1
[3] | 0x020000;
2155 w1
[3] = w1
[3] | 0x02000000;
2163 w2
[0] = w2
[0] | 0x0200;
2167 w2
[0] = w2
[0] | 0x020000;
2171 w2
[0] = w2
[0] | 0x02000000;
2179 w2
[1] = w2
[1] | 0x0200;
2183 w2
[1] = w2
[1] | 0x020000;
2187 w2
[1] = w2
[1] | 0x02000000;
2195 w2
[2] = w2
[2] | 0x0200;
2199 w2
[2] = w2
[2] | 0x020000;
2203 w2
[2] = w2
[2] | 0x02000000;
2211 w2
[3] = w2
[3] | 0x0200;
2215 w2
[3] = w2
[3] | 0x020000;
2219 w2
[3] = w2
[3] | 0x02000000;
2227 w3
[0] = w3
[0] | 0x0200;
2231 w3
[0] = w3
[0] | 0x020000;
2235 w3
[0] = w3
[0] | 0x02000000;
2243 w3
[1] = w3
[1] | 0x0200;
2247 w3
[1] = w3
[1] | 0x020000;
2251 w3
[1] = w3
[1] | 0x02000000;
2259 w3
[2] = w3
[2] | 0x0200;
2263 w3
[2] = w3
[2] | 0x020000;
2267 w3
[2] = w3
[2] | 0x02000000;
2275 w3
[3] = w3
[3] | 0x0200;
2279 w3
[3] = w3
[3] | 0x020000;
2283 w3
[3] = w3
[3] | 0x02000000;
2291 w4
[0] = w4
[0] | 0x0200;
2295 w4
[0] = w4
[0] | 0x020000;
2299 w4
[0] = w4
[0] | 0x02000000;
2307 w4
[1] = w4
[1] | 0x0200;
2311 w4
[1] = w4
[1] | 0x020000;
2315 w4
[1] = w4
[1] | 0x02000000;
2323 w4
[2] = w4
[2] | 0x0200;
2327 w4
[2] = w4
[2] | 0x020000;
2331 w4
[2] = w4
[2] | 0x02000000;
2339 w4
[3] = w4
[3] | 0x0200;
2343 w4
[3] = w4
[3] | 0x020000;
2347 w4
[3] = w4
[3] | 0x02000000;
2355 w5
[0] = w5
[0] | 0x0200;
2359 w5
[0] = w5
[0] | 0x020000;
2363 w5
[0] = w5
[0] | 0x02000000;
2371 w5
[1] = w5
[1] | 0x0200;
2375 w5
[1] = w5
[1] | 0x020000;
2379 w5
[1] = w5
[1] | 0x02000000;
2387 w5
[2] = w5
[2] | 0x0200;
2391 w5
[2] = w5
[2] | 0x020000;
2395 w5
[2] = w5
[2] | 0x02000000;
2403 w5
[3] = w5
[3] | 0x0200;
2407 w5
[3] = w5
[3] | 0x020000;
2411 w5
[3] = w5
[3] | 0x02000000;
2419 w6
[0] = w6
[0] | 0x0200;
2423 w6
[0] = w6
[0] | 0x020000;
2427 w6
[0] = w6
[0] | 0x02000000;
2435 w6
[1] = w6
[1] | 0x0200;
2439 w6
[1] = w6
[1] | 0x020000;
2443 w6
[1] = w6
[1] | 0x02000000;
2451 w6
[2] = w6
[2] | 0x0200;
2455 w6
[2] = w6
[2] | 0x020000;
2459 w6
[2] = w6
[2] | 0x02000000;
2467 w6
[3] = w6
[3] | 0x0200;
2471 w6
[3] = w6
[3] | 0x020000;
2475 w6
[3] = w6
[3] | 0x02000000;
2483 w7
[0] = w7
[0] | 0x0200;
2487 w7
[0] = w7
[0] | 0x020000;
2491 w7
[0] = w7
[0] | 0x02000000;
2499 w7
[1] = w7
[1] | 0x0200;
2503 w7
[1] = w7
[1] | 0x020000;
2507 w7
[1] = w7
[1] | 0x02000000;
2515 w7
[2] = w7
[2] | 0x0200;
2519 w7
[2] = w7
[2] | 0x020000;
2523 w7
[2] = w7
[2] | 0x02000000;
2531 w7
[3] = w7
[3] | 0x0200;
2535 w7
[3] = w7
[3] | 0x020000;
2539 w7
[3] = w7
[3] | 0x02000000;
2544 static void append_0x80_1x4 (u32 w0
[4], const u32 offset
)
2553 w0
[0] = w0
[0] | 0x8000;
2557 w0
[0] = w0
[0] | 0x800000;
2561 w0
[0] = w0
[0] | 0x80000000;
2569 w0
[1] = w0
[1] | 0x8000;
2573 w0
[1] = w0
[1] | 0x800000;
2577 w0
[1] = w0
[1] | 0x80000000;
2585 w0
[2] = w0
[2] | 0x8000;
2589 w0
[2] = w0
[2] | 0x800000;
2593 w0
[2] = w0
[2] | 0x80000000;
2601 w0
[3] = w0
[3] | 0x8000;
2605 w0
[3] = w0
[3] | 0x800000;
2609 w0
[3] = w0
[3] | 0x80000000;
2614 static void append_0x80_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
2623 w0
[0] = w0
[0] | 0x8000;
2627 w0
[0] = w0
[0] | 0x800000;
2631 w0
[0] = w0
[0] | 0x80000000;
2639 w0
[1] = w0
[1] | 0x8000;
2643 w0
[1] = w0
[1] | 0x800000;
2647 w0
[1] = w0
[1] | 0x80000000;
2655 w0
[2] = w0
[2] | 0x8000;
2659 w0
[2] = w0
[2] | 0x800000;
2663 w0
[2] = w0
[2] | 0x80000000;
2671 w0
[3] = w0
[3] | 0x8000;
2675 w0
[3] = w0
[3] | 0x800000;
2679 w0
[3] = w0
[3] | 0x80000000;
2687 w1
[0] = w1
[0] | 0x8000;
2691 w1
[0] = w1
[0] | 0x800000;
2695 w1
[0] = w1
[0] | 0x80000000;
2703 w1
[1] = w1
[1] | 0x8000;
2707 w1
[1] = w1
[1] | 0x800000;
2711 w1
[1] = w1
[1] | 0x80000000;
2719 w1
[2] = w1
[2] | 0x8000;
2723 w1
[2] = w1
[2] | 0x800000;
2727 w1
[2] = w1
[2] | 0x80000000;
2735 w1
[3] = w1
[3] | 0x8000;
2739 w1
[3] = w1
[3] | 0x800000;
2743 w1
[3] = w1
[3] | 0x80000000;
2748 static void append_0x80_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
2757 w0
[0] = w0
[0] | 0x8000;
2761 w0
[0] = w0
[0] | 0x800000;
2765 w0
[0] = w0
[0] | 0x80000000;
2773 w0
[1] = w0
[1] | 0x8000;
2777 w0
[1] = w0
[1] | 0x800000;
2781 w0
[1] = w0
[1] | 0x80000000;
2789 w0
[2] = w0
[2] | 0x8000;
2793 w0
[2] = w0
[2] | 0x800000;
2797 w0
[2] = w0
[2] | 0x80000000;
2805 w0
[3] = w0
[3] | 0x8000;
2809 w0
[3] = w0
[3] | 0x800000;
2813 w0
[3] = w0
[3] | 0x80000000;
2821 w1
[0] = w1
[0] | 0x8000;
2825 w1
[0] = w1
[0] | 0x800000;
2829 w1
[0] = w1
[0] | 0x80000000;
2837 w1
[1] = w1
[1] | 0x8000;
2841 w1
[1] = w1
[1] | 0x800000;
2845 w1
[1] = w1
[1] | 0x80000000;
2853 w1
[2] = w1
[2] | 0x8000;
2857 w1
[2] = w1
[2] | 0x800000;
2861 w1
[2] = w1
[2] | 0x80000000;
2869 w1
[3] = w1
[3] | 0x8000;
2873 w1
[3] = w1
[3] | 0x800000;
2877 w1
[3] = w1
[3] | 0x80000000;
2885 w2
[0] = w2
[0] | 0x8000;
2889 w2
[0] = w2
[0] | 0x800000;
2893 w2
[0] = w2
[0] | 0x80000000;
2901 w2
[1] = w2
[1] | 0x8000;
2905 w2
[1] = w2
[1] | 0x800000;
2909 w2
[1] = w2
[1] | 0x80000000;
2917 w2
[2] = w2
[2] | 0x8000;
2921 w2
[2] = w2
[2] | 0x800000;
2925 w2
[2] = w2
[2] | 0x80000000;
2933 w2
[3] = w2
[3] | 0x8000;
2937 w2
[3] = w2
[3] | 0x800000;
2941 w2
[3] = w2
[3] | 0x80000000;
2946 static void append_0x80_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
2955 w0
[0] = w0
[0] | 0x8000;
2959 w0
[0] = w0
[0] | 0x800000;
2963 w0
[0] = w0
[0] | 0x80000000;
2971 w0
[1] = w0
[1] | 0x8000;
2975 w0
[1] = w0
[1] | 0x800000;
2979 w0
[1] = w0
[1] | 0x80000000;
2987 w0
[2] = w0
[2] | 0x8000;
2991 w0
[2] = w0
[2] | 0x800000;
2995 w0
[2] = w0
[2] | 0x80000000;
3003 w0
[3] = w0
[3] | 0x8000;
3007 w0
[3] = w0
[3] | 0x800000;
3011 w0
[3] = w0
[3] | 0x80000000;
3019 w1
[0] = w1
[0] | 0x8000;
3023 w1
[0] = w1
[0] | 0x800000;
3027 w1
[0] = w1
[0] | 0x80000000;
3035 w1
[1] = w1
[1] | 0x8000;
3039 w1
[1] = w1
[1] | 0x800000;
3043 w1
[1] = w1
[1] | 0x80000000;
3051 w1
[2] = w1
[2] | 0x8000;
3055 w1
[2] = w1
[2] | 0x800000;
3059 w1
[2] = w1
[2] | 0x80000000;
3067 w1
[3] = w1
[3] | 0x8000;
3071 w1
[3] = w1
[3] | 0x800000;
3075 w1
[3] = w1
[3] | 0x80000000;
3083 w2
[0] = w2
[0] | 0x8000;
3087 w2
[0] = w2
[0] | 0x800000;
3091 w2
[0] = w2
[0] | 0x80000000;
3099 w2
[1] = w2
[1] | 0x8000;
3103 w2
[1] = w2
[1] | 0x800000;
3107 w2
[1] = w2
[1] | 0x80000000;
3115 w2
[2] = w2
[2] | 0x8000;
3119 w2
[2] = w2
[2] | 0x800000;
3123 w2
[2] = w2
[2] | 0x80000000;
3131 w2
[3] = w2
[3] | 0x8000;
3135 w2
[3] = w2
[3] | 0x800000;
3139 w2
[3] = w2
[3] | 0x80000000;
3147 w3
[0] = w3
[0] | 0x8000;
3151 w3
[0] = w3
[0] | 0x800000;
3155 w3
[0] = w3
[0] | 0x80000000;
3163 w3
[1] = w3
[1] | 0x8000;
3167 w3
[1] = w3
[1] | 0x800000;
3171 w3
[1] = w3
[1] | 0x80000000;
3179 w3
[2] = w3
[2] | 0x8000;
3183 w3
[2] = w3
[2] | 0x800000;
3187 w3
[2] = w3
[2] | 0x80000000;
3195 w3
[3] = w3
[3] | 0x8000;
3199 w3
[3] = w3
[3] | 0x800000;
3203 w3
[3] = w3
[3] | 0x80000000;
3208 static void append_0x80_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
3217 w0
[0] = w0
[0] | 0x8000;
3221 w0
[0] = w0
[0] | 0x800000;
3225 w0
[0] = w0
[0] | 0x80000000;
3233 w0
[1] = w0
[1] | 0x8000;
3237 w0
[1] = w0
[1] | 0x800000;
3241 w0
[1] = w0
[1] | 0x80000000;
3249 w0
[2] = w0
[2] | 0x8000;
3253 w0
[2] = w0
[2] | 0x800000;
3257 w0
[2] = w0
[2] | 0x80000000;
3265 w0
[3] = w0
[3] | 0x8000;
3269 w0
[3] = w0
[3] | 0x800000;
3273 w0
[3] = w0
[3] | 0x80000000;
3281 w1
[0] = w1
[0] | 0x8000;
3285 w1
[0] = w1
[0] | 0x800000;
3289 w1
[0] = w1
[0] | 0x80000000;
3297 w1
[1] = w1
[1] | 0x8000;
3301 w1
[1] = w1
[1] | 0x800000;
3305 w1
[1] = w1
[1] | 0x80000000;
3313 w1
[2] = w1
[2] | 0x8000;
3317 w1
[2] = w1
[2] | 0x800000;
3321 w1
[2] = w1
[2] | 0x80000000;
3329 w1
[3] = w1
[3] | 0x8000;
3333 w1
[3] = w1
[3] | 0x800000;
3337 w1
[3] = w1
[3] | 0x80000000;
3345 w2
[0] = w2
[0] | 0x8000;
3349 w2
[0] = w2
[0] | 0x800000;
3353 w2
[0] = w2
[0] | 0x80000000;
3361 w2
[1] = w2
[1] | 0x8000;
3365 w2
[1] = w2
[1] | 0x800000;
3369 w2
[1] = w2
[1] | 0x80000000;
3377 w2
[2] = w2
[2] | 0x8000;
3381 w2
[2] = w2
[2] | 0x800000;
3385 w2
[2] = w2
[2] | 0x80000000;
3393 w2
[3] = w2
[3] | 0x8000;
3397 w2
[3] = w2
[3] | 0x800000;
3401 w2
[3] = w2
[3] | 0x80000000;
3409 w3
[0] = w3
[0] | 0x8000;
3413 w3
[0] = w3
[0] | 0x800000;
3417 w3
[0] = w3
[0] | 0x80000000;
3425 w3
[1] = w3
[1] | 0x8000;
3429 w3
[1] = w3
[1] | 0x800000;
3433 w3
[1] = w3
[1] | 0x80000000;
3441 w3
[2] = w3
[2] | 0x8000;
3445 w3
[2] = w3
[2] | 0x800000;
3449 w3
[2] = w3
[2] | 0x80000000;
3457 w3
[3] = w3
[3] | 0x8000;
3461 w3
[3] = w3
[3] | 0x800000;
3465 w3
[3] = w3
[3] | 0x80000000;
3473 w4
[0] = w4
[0] | 0x8000;
3477 w4
[0] = w4
[0] | 0x800000;
3481 w4
[0] = w4
[0] | 0x80000000;
3489 w4
[1] = w4
[1] | 0x8000;
3493 w4
[1] = w4
[1] | 0x800000;
3497 w4
[1] = w4
[1] | 0x80000000;
3505 w4
[2] = w4
[2] | 0x8000;
3509 w4
[2] = w4
[2] | 0x800000;
3513 w4
[2] = w4
[2] | 0x80000000;
3521 w4
[3] = w4
[3] | 0x8000;
3525 w4
[3] = w4
[3] | 0x800000;
3529 w4
[3] = w4
[3] | 0x80000000;
3537 w5
[0] = w5
[0] | 0x8000;
3541 w5
[0] = w5
[0] | 0x800000;
3545 w5
[0] = w5
[0] | 0x80000000;
3553 w5
[1] = w5
[1] | 0x8000;
3557 w5
[1] = w5
[1] | 0x800000;
3561 w5
[1] = w5
[1] | 0x80000000;
3569 w5
[2] = w5
[2] | 0x8000;
3573 w5
[2] = w5
[2] | 0x800000;
3577 w5
[2] = w5
[2] | 0x80000000;
3585 w5
[3] = w5
[3] | 0x8000;
3589 w5
[3] = w5
[3] | 0x800000;
3593 w5
[3] = w5
[3] | 0x80000000;
3601 w6
[0] = w6
[0] | 0x8000;
3605 w6
[0] = w6
[0] | 0x800000;
3609 w6
[0] = w6
[0] | 0x80000000;
3617 w6
[1] = w6
[1] | 0x8000;
3621 w6
[1] = w6
[1] | 0x800000;
3625 w6
[1] = w6
[1] | 0x80000000;
3633 w6
[2] = w6
[2] | 0x8000;
3637 w6
[2] = w6
[2] | 0x800000;
3641 w6
[2] = w6
[2] | 0x80000000;
3649 w6
[3] = w6
[3] | 0x8000;
3653 w6
[3] = w6
[3] | 0x800000;
3657 w6
[3] = w6
[3] | 0x80000000;
3665 w7
[0] = w7
[0] | 0x8000;
3669 w7
[0] = w7
[0] | 0x800000;
3673 w7
[0] = w7
[0] | 0x80000000;
3681 w7
[1] = w7
[1] | 0x8000;
3685 w7
[1] = w7
[1] | 0x800000;
3689 w7
[1] = w7
[1] | 0x80000000;
3697 w7
[2] = w7
[2] | 0x8000;
3701 w7
[2] = w7
[2] | 0x800000;
3705 w7
[2] = w7
[2] | 0x80000000;
3713 w7
[3] = w7
[3] | 0x8000;
3717 w7
[3] = w7
[3] | 0x800000;
3721 w7
[3] = w7
[3] | 0x80000000;
3726 static void append_0x80_1x16 (u32 w
[16], const u32 offset
)
3735 w
[ 0] = w
[ 0] | 0x8000;
3739 w
[ 0] = w
[ 0] | 0x800000;
3743 w
[ 0] = w
[ 0] | 0x80000000;
3751 w
[ 1] = w
[ 1] | 0x8000;
3755 w
[ 1] = w
[ 1] | 0x800000;
3759 w
[ 1] = w
[ 1] | 0x80000000;
3767 w
[ 2] = w
[ 2] | 0x8000;
3771 w
[ 2] = w
[ 2] | 0x800000;
3775 w
[ 2] = w
[ 2] | 0x80000000;
3783 w
[ 3] = w
[ 3] | 0x8000;
3787 w
[ 3] = w
[ 3] | 0x800000;
3791 w
[ 3] = w
[ 3] | 0x80000000;
3799 w
[ 4] = w
[ 4] | 0x8000;
3803 w
[ 4] = w
[ 4] | 0x800000;
3807 w
[ 4] = w
[ 4] | 0x80000000;
3815 w
[ 5] = w
[ 5] | 0x8000;
3819 w
[ 5] = w
[ 5] | 0x800000;
3823 w
[ 5] = w
[ 5] | 0x80000000;
3831 w
[ 6] = w
[ 6] | 0x8000;
3835 w
[ 6] = w
[ 6] | 0x800000;
3839 w
[ 6] = w
[ 6] | 0x80000000;
3847 w
[ 7] = w
[ 7] | 0x8000;
3851 w
[ 7] = w
[ 7] | 0x800000;
3855 w
[ 7] = w
[ 7] | 0x80000000;
3863 w
[ 8] = w
[ 8] | 0x8000;
3867 w
[ 8] = w
[ 8] | 0x800000;
3871 w
[ 8] = w
[ 8] | 0x80000000;
3879 w
[ 9] = w
[ 9] | 0x8000;
3883 w
[ 9] = w
[ 9] | 0x800000;
3887 w
[ 9] = w
[ 9] | 0x80000000;
3895 w
[10] = w
[10] | 0x8000;
3899 w
[10] = w
[10] | 0x800000;
3903 w
[10] = w
[10] | 0x80000000;
3911 w
[11] = w
[11] | 0x8000;
3915 w
[11] = w
[11] | 0x800000;
3919 w
[11] = w
[11] | 0x80000000;
3927 w
[12] = w
[12] | 0x8000;
3931 w
[12] = w
[12] | 0x800000;
3935 w
[12] = w
[12] | 0x80000000;
3943 w
[13] = w
[13] | 0x8000;
3947 w
[13] = w
[13] | 0x800000;
3951 w
[13] = w
[13] | 0x80000000;
3959 w
[14] = w
[14] | 0x8000;
3963 w
[14] = w
[14] | 0x800000;
3967 w
[14] = w
[14] | 0x80000000;
3975 w
[15] = w
[15] | 0x8000;
3979 w
[15] = w
[15] | 0x800000;
3983 w
[15] = w
[15] | 0x80000000;
3988 static void switch_buffer_by_offset (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
3990 #if defined IS_AMD || defined IS_GENERIC
3991 const int offset_mod_4
= offset
& 3;
3993 const int offset_minus_4
= 4 - offset
;
3998 w3
[2] = amd_bytealign ( 0, w3
[1], offset_minus_4
);
3999 w3
[1] = amd_bytealign (w3
[1], w3
[0], offset_minus_4
);
4000 w3
[0] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4001 w2
[3] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4002 w2
[2] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4003 w2
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4004 w2
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4005 w1
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4006 w1
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4007 w1
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4008 w1
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4009 w0
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4010 w0
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4011 w0
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4012 w0
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4014 if (offset_mod_4
== 0)
4036 w3
[2] = amd_bytealign ( 0, w3
[0], offset_minus_4
);
4037 w3
[1] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4038 w3
[0] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4039 w2
[3] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4040 w2
[2] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4041 w2
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4042 w2
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4043 w1
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4044 w1
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4045 w1
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4046 w1
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4047 w0
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4048 w0
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4049 w0
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4052 if (offset_mod_4
== 0)
4073 w3
[2] = amd_bytealign ( 0, w2
[3], offset_minus_4
);
4074 w3
[1] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4075 w3
[0] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4076 w2
[3] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4077 w2
[2] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4078 w2
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4079 w2
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4080 w1
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4081 w1
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4082 w1
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4083 w1
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4084 w0
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4085 w0
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4089 if (offset_mod_4
== 0)
4109 w3
[2] = amd_bytealign ( 0, w2
[2], offset_minus_4
);
4110 w3
[1] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4111 w3
[0] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4112 w2
[3] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4113 w2
[2] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4114 w2
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4115 w2
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4116 w1
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4117 w1
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4118 w1
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4119 w1
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4120 w0
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4125 if (offset_mod_4
== 0)
4144 w3
[2] = amd_bytealign ( 0, w2
[1], offset_minus_4
);
4145 w3
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4146 w3
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4147 w2
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4148 w2
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4149 w2
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4150 w2
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4151 w1
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4152 w1
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4153 w1
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4154 w1
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4160 if (offset_mod_4
== 0)
4178 w3
[2] = amd_bytealign ( 0, w2
[0], offset_minus_4
);
4179 w3
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4180 w3
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4181 w2
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4182 w2
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4183 w2
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4184 w2
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4185 w1
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4186 w1
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4187 w1
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4194 if (offset_mod_4
== 0)
4211 w3
[2] = amd_bytealign ( 0, w1
[3], offset_minus_4
);
4212 w3
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4213 w3
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4214 w2
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4215 w2
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4216 w2
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4217 w2
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4218 w1
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4219 w1
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4227 if (offset_mod_4
== 0)
4243 w3
[2] = amd_bytealign ( 0, w1
[2], offset_minus_4
);
4244 w3
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4245 w3
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4246 w2
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4247 w2
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4248 w2
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4249 w2
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4250 w1
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4259 if (offset_mod_4
== 0)
4274 w3
[2] = amd_bytealign ( 0, w1
[1], offset_minus_4
);
4275 w3
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4276 w3
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4277 w2
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4278 w2
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4279 w2
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4280 w2
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4290 if (offset_mod_4
== 0)
4304 w3
[2] = amd_bytealign ( 0, w1
[0], offset_minus_4
);
4305 w3
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4306 w3
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4307 w2
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4308 w2
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4309 w2
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4320 if (offset_mod_4
== 0)
4333 w3
[2] = amd_bytealign ( 0, w0
[3], offset_minus_4
);
4334 w3
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4335 w3
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4336 w2
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4337 w2
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4349 if (offset_mod_4
== 0)
4361 w3
[2] = amd_bytealign ( 0, w0
[2], offset_minus_4
);
4362 w3
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4363 w3
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4364 w2
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4377 if (offset_mod_4
== 0)
4388 w3
[2] = amd_bytealign ( 0, w0
[1], offset_minus_4
);
4389 w3
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4390 w3
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4404 if (offset_mod_4
== 0)
4414 w3
[2] = amd_bytealign ( 0, w0
[0], offset_minus_4
);
4415 w3
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4430 if (offset_mod_4
== 0)
4441 const int offset_minus_4
= 4 - (offset
% 4);
4443 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
4448 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
4449 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
4450 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
4451 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
4452 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4453 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4454 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4455 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4456 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4457 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4458 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4459 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4460 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4461 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
4466 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
4467 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
4468 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
4469 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
4470 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4471 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4472 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4473 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4474 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4475 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4476 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4477 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4478 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
4484 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
4485 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
4486 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
4487 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
4488 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4489 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4490 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4491 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4492 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4493 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4494 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4495 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
4502 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
4503 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
4504 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
4505 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
4506 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4507 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4508 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4509 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4510 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4511 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4512 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
4520 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4521 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4522 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4523 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4524 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4525 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4526 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4527 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4528 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4529 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
4538 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4539 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4540 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4541 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4542 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4543 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4544 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4545 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4546 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
4556 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4557 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4558 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4559 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4560 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4561 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4562 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4563 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
4574 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4575 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4576 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4577 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4578 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4579 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4580 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
4592 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4593 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4594 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4595 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4596 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4597 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
4610 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4611 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4612 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4613 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4614 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
4628 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4629 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4630 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4631 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
4646 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4647 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4648 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
4664 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4665 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
4682 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
4702 static void switch_buffer_by_offset_be (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
4704 #if defined IS_AMD || defined IS_GENERIC
4708 w3
[2] = amd_bytealign (w3
[1], 0, offset
);
4709 w3
[1] = amd_bytealign (w3
[0], w3
[1], offset
);
4710 w3
[0] = amd_bytealign (w2
[3], w3
[0], offset
);
4711 w2
[3] = amd_bytealign (w2
[2], w2
[3], offset
);
4712 w2
[2] = amd_bytealign (w2
[1], w2
[2], offset
);
4713 w2
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4714 w2
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4715 w1
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4716 w1
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4717 w1
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4718 w1
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4719 w0
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4720 w0
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4721 w0
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4722 w0
[0] = amd_bytealign ( 0, w0
[0], offset
);
4726 w3
[2] = amd_bytealign (w3
[0], 0, offset
);
4727 w3
[1] = amd_bytealign (w2
[3], w3
[0], offset
);
4728 w3
[0] = amd_bytealign (w2
[2], w2
[3], offset
);
4729 w2
[3] = amd_bytealign (w2
[1], w2
[2], offset
);
4730 w2
[2] = amd_bytealign (w2
[0], w2
[1], offset
);
4731 w2
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4732 w2
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4733 w1
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4734 w1
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4735 w1
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4736 w1
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4737 w0
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4738 w0
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4739 w0
[1] = amd_bytealign ( 0, w0
[0], offset
);
4744 w3
[2] = amd_bytealign (w2
[3], 0, offset
);
4745 w3
[1] = amd_bytealign (w2
[2], w2
[3], offset
);
4746 w3
[0] = amd_bytealign (w2
[1], w2
[2], offset
);
4747 w2
[3] = amd_bytealign (w2
[0], w2
[1], offset
);
4748 w2
[2] = amd_bytealign (w1
[3], w2
[0], offset
);
4749 w2
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4750 w2
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4751 w1
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4752 w1
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4753 w1
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4754 w1
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4755 w0
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4756 w0
[2] = amd_bytealign ( 0, w0
[0], offset
);
4762 w3
[2] = amd_bytealign (w2
[2], 0, offset
);
4763 w3
[1] = amd_bytealign (w2
[1], w2
[2], offset
);
4764 w3
[0] = amd_bytealign (w2
[0], w2
[1], offset
);
4765 w2
[3] = amd_bytealign (w1
[3], w2
[0], offset
);
4766 w2
[2] = amd_bytealign (w1
[2], w1
[3], offset
);
4767 w2
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4768 w2
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4769 w1
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4770 w1
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4771 w1
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4772 w1
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4773 w0
[3] = amd_bytealign ( 0, w0
[0], offset
);
4780 w3
[2] = amd_bytealign (w2
[1], 0, offset
);
4781 w3
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4782 w3
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4783 w2
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4784 w2
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4785 w2
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4786 w2
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4787 w1
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4788 w1
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4789 w1
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4790 w1
[0] = amd_bytealign ( 0, w0
[0], offset
);
4798 w3
[2] = amd_bytealign (w2
[0], 0, offset
);
4799 w3
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4800 w3
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4801 w2
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4802 w2
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4803 w2
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4804 w2
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4805 w1
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4806 w1
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4807 w1
[1] = amd_bytealign ( 0, w0
[0], offset
);
4816 w3
[2] = amd_bytealign (w1
[3], 0, offset
);
4817 w3
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4818 w3
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4819 w2
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4820 w2
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4821 w2
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4822 w2
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4823 w1
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4824 w1
[2] = amd_bytealign ( 0, w0
[0], offset
);
4834 w3
[2] = amd_bytealign (w1
[2], 0, offset
);
4835 w3
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4836 w3
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4837 w2
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4838 w2
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4839 w2
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4840 w2
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4841 w1
[3] = amd_bytealign ( 0, w0
[0], offset
);
4852 w3
[2] = amd_bytealign (w1
[1], 0, offset
);
4853 w3
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4854 w3
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4855 w2
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4856 w2
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4857 w2
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4858 w2
[0] = amd_bytealign ( 0, w0
[0], offset
);
4870 w3
[2] = amd_bytealign (w1
[0], 0, offset
);
4871 w3
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4872 w3
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4873 w2
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4874 w2
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4875 w2
[1] = amd_bytealign ( 0, w0
[0], offset
);
4888 w3
[2] = amd_bytealign (w0
[3], 0, offset
);
4889 w3
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4890 w3
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4891 w2
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4892 w2
[2] = amd_bytealign ( 0, w0
[0], offset
);
4906 w3
[2] = amd_bytealign (w0
[2], 0, offset
);
4907 w3
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4908 w3
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4909 w2
[3] = amd_bytealign ( 0, w0
[0], offset
);
4924 w3
[2] = amd_bytealign (w0
[1], 0, offset
);
4925 w3
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4926 w3
[0] = amd_bytealign ( 0, w0
[0], offset
);
4942 w3
[2] = amd_bytealign (w0
[0], 0, offset
);
4943 w3
[1] = amd_bytealign ( 0, w0
[0], offset
);
4962 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
4967 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
4968 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
4969 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
4970 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
4971 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
4972 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
4973 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
4974 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
4975 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
4976 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
4977 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
4978 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
4979 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
4980 w0
[0] = __byte_perm (w0
[0], 0, selector
);
4984 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
4985 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
4986 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
4987 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
4988 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
4989 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
4990 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
4991 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
4992 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
4993 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
4994 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
4995 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
4996 w0
[1] = __byte_perm (w0
[0], 0, selector
);
5001 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
5002 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
5003 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
5004 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
5005 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5006 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5007 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5008 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5009 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5010 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5011 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5012 w0
[2] = __byte_perm (w0
[0], 0, selector
);
5018 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
5019 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
5020 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
5021 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
5022 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5023 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5024 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5025 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5026 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5027 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5028 w0
[3] = __byte_perm (w0
[0], 0, selector
);
5035 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
5036 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
5037 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
5038 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
5039 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5040 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5041 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5042 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5043 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5044 w1
[0] = __byte_perm (w0
[0], 0, selector
);
5052 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
5053 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
5054 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
5055 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5056 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5057 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5058 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5059 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5060 w1
[1] = __byte_perm (w0
[0], 0, selector
);
5069 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5070 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5071 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5072 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5073 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5074 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5075 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5076 w1
[2] = __byte_perm (w0
[0], 0, selector
);
5086 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5087 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5088 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5089 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5090 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5091 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5092 w1
[3] = __byte_perm (w0
[0], 0, selector
);
5103 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5104 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5105 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5106 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5107 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5108 w2
[0] = __byte_perm (w0
[0], 0, selector
);
5120 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5121 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5122 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5123 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5124 w2
[1] = __byte_perm (w0
[0], 0, selector
);
5137 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5138 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5139 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5140 w2
[2] = __byte_perm (w0
[0], 0, selector
);
5154 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5155 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5156 w2
[3] = __byte_perm (w0
[0], 0, selector
);
5171 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5172 w3
[0] = __byte_perm (w0
[0], 0, selector
);
5188 w3
[1] = __byte_perm (w0
[0], 0, selector
);
5207 /* not needed anymore?
5209 // before: append_0x80_2_be
5210 static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset)
5215 w0[0] |= 0x80000000;
5231 w0[1] |= 0x80000000;
5247 w0[2] |= 0x80000000;
5263 w0[3] |= 0x80000000;
5279 w1[0] |= 0x80000000;
5295 w1[1] |= 0x80000000;
5311 w1[2] |= 0x80000000;
5327 w1[3] |= 0x80000000;
5344 // before: append_0x80_8
5345 static void append_0x80_1x32 (u32 w[32], const u32 offset)
5354 w[ 0] = w[ 0] | 0x8000;
5358 w[ 0] = w[ 0] | 0x800000;
5362 w[ 0] = w[ 0] | 0x80000000;
5370 w[ 1] = w[ 1] | 0x8000;
5374 w[ 1] = w[ 1] | 0x800000;
5378 w[ 1] = w[ 1] | 0x80000000;
5386 w[ 2] = w[ 2] | 0x8000;
5390 w[ 2] = w[ 2] | 0x800000;
5394 w[ 2] = w[ 2] | 0x80000000;
5402 w[ 3] = w[ 3] | 0x8000;
5406 w[ 3] = w[ 3] | 0x800000;
5410 w[ 3] = w[ 3] | 0x80000000;
5418 w[ 4] = w[ 4] | 0x8000;
5422 w[ 4] = w[ 4] | 0x800000;
5426 w[ 4] = w[ 4] | 0x80000000;
5434 w[ 5] = w[ 5] | 0x8000;
5438 w[ 5] = w[ 5] | 0x800000;
5442 w[ 5] = w[ 5] | 0x80000000;
5450 w[ 6] = w[ 6] | 0x8000;
5454 w[ 6] = w[ 6] | 0x800000;
5458 w[ 6] = w[ 6] | 0x80000000;
5466 w[ 7] = w[ 7] | 0x8000;
5470 w[ 7] = w[ 7] | 0x800000;
5474 w[ 7] = w[ 7] | 0x80000000;
5482 w[ 8] = w[ 8] | 0x8000;
5486 w[ 8] = w[ 8] | 0x800000;
5490 w[ 8] = w[ 8] | 0x80000000;
5498 w[ 9] = w[ 9] | 0x8000;
5502 w[ 9] = w[ 9] | 0x800000;
5506 w[ 9] = w[ 9] | 0x80000000;
5514 w[10] = w[10] | 0x8000;
5518 w[10] = w[10] | 0x800000;
5522 w[10] = w[10] | 0x80000000;
5530 w[11] = w[11] | 0x8000;
5534 w[11] = w[11] | 0x800000;
5538 w[11] = w[11] | 0x80000000;
5546 w[12] = w[12] | 0x8000;
5550 w[12] = w[12] | 0x800000;
5554 w[12] = w[12] | 0x80000000;
5562 w[13] = w[13] | 0x8000;
5566 w[13] = w[13] | 0x800000;
5570 w[13] = w[13] | 0x80000000;
5578 w[14] = w[14] | 0x8000;
5582 w[14] = w[14] | 0x800000;
5586 w[14] = w[14] | 0x80000000;
5594 w[15] = w[15] | 0x8000;
5598 w[15] = w[15] | 0x800000;
5602 w[15] = w[15] | 0x80000000;
5610 w[16] = w[16] | 0x8000;
5614 w[16] = w[16] | 0x800000;
5618 w[16] = w[16] | 0x80000000;
5626 w[17] = w[17] | 0x8000;
5630 w[17] = w[17] | 0x800000;
5634 w[17] = w[17] | 0x80000000;
5642 w[18] = w[18] | 0x8000;
5646 w[18] = w[18] | 0x800000;
5650 w[18] = w[18] | 0x80000000;
5658 w[19] = w[19] | 0x8000;
5662 w[19] = w[19] | 0x800000;
5666 w[19] = w[19] | 0x80000000;
5674 w[20] = w[20] | 0x8000;
5678 w[20] = w[20] | 0x800000;
5682 w[20] = w[20] | 0x80000000;
5690 w[21] = w[21] | 0x8000;
5694 w[21] = w[21] | 0x800000;
5698 w[21] = w[21] | 0x80000000;
5706 w[22] = w[22] | 0x8000;
5710 w[22] = w[22] | 0x800000;
5714 w[22] = w[22] | 0x80000000;
5722 w[23] = w[23] | 0x8000;
5726 w[23] = w[23] | 0x800000;
5730 w[23] = w[23] | 0x80000000;
5738 w[24] = w[24] | 0x8000;
5742 w[24] = w[24] | 0x800000;
5746 w[24] = w[24] | 0x80000000;
5754 w[25] = w[25] | 0x8000;
5758 w[25] = w[25] | 0x800000;
5762 w[25] = w[25] | 0x80000000;
5770 w[26] = w[26] | 0x8000;
5774 w[26] = w[26] | 0x800000;
5778 w[26] = w[26] | 0x80000000;
5786 w[27] = w[27] | 0x8000;
5790 w[27] = w[27] | 0x800000;
5794 w[27] = w[27] | 0x80000000;
5802 w[28] = w[28] | 0x8000;
5806 w[28] = w[28] | 0x800000;
5810 w[28] = w[28] | 0x80000000;
5818 w[29] = w[29] | 0x8000;
5822 w[29] = w[29] | 0x800000;
5826 w[29] = w[29] | 0x80000000;
5834 w[30] = w[30] | 0x8000;
5838 w[30] = w[30] | 0x800000;
5842 w[30] = w[30] | 0x80000000;
5850 w[31] = w[31] | 0x8000;
5854 w[31] = w[31] | 0x800000;
5858 w[31] = w[31] | 0x80000000;
5863 // before: device_memcat2L
5864 static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2])
5869 dst0[0] = src_l0[0] | src_r0[0] << 8;
5870 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
5874 dst0[0] = src_l0[0] | src_r0[0] << 16;
5875 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
5879 dst0[0] = src_l0[0] | src_r0[0] << 24;
5880 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
5884 dst0[1] = src_r0[0];
5888 dst0[1] = src_l0[1] | src_r0[0] << 8;
5892 dst0[1] = src_l0[1] | src_r0[0] << 16;
5896 dst0[1] = src_l0[1] | src_r0[0] << 24;
5901 // before: device_memcat4L
5902 static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4])
5907 dst0[0] = src_l0[0] | src_r0[0] << 8;
5908 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
5909 dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
5910 dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
5914 dst0[0] = src_l0[0] | src_r0[0] << 16;
5915 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
5916 dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
5917 dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
5921 dst0[0] = src_l0[0] | src_r0[0] << 24;
5922 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
5923 dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
5924 dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
5928 dst0[1] = src_r0[0];
5929 dst0[2] = src_r0[1];
5930 dst0[3] = src_r0[2];
5934 dst0[1] = src_l0[1] | src_r0[0] << 8;
5935 dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
5936 dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
5940 dst0[1] = src_l0[1] | src_r0[0] << 16;
5941 dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
5942 dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
5946 dst0[1] = src_l0[1] | src_r0[0] << 24;
5947 dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
5948 dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
5952 dst0[2] = src_r0[0];
5953 dst0[3] = src_r0[1];
5957 dst0[2] = src_l0[2] | src_r0[0] << 8;
5958 dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
5962 dst0[2] = src_l0[2] | src_r0[0] << 16;
5963 dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
5967 dst0[2] = src_l0[2] | src_r0[0] << 24;
5968 dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
5972 dst0[3] = src_r0[0];
5976 dst0[3] = src_l0[3] | src_r0[0] << 8;
5980 dst0[3] = src_l0[3] | src_r0[0] << 16;
5984 dst0[3] = src_l0[3] | src_r0[0] << 24;
5989 // before: device_memcat8L
5990 static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4])
5995 dst0[0] = src_l0[0] | src_r0[0] << 8;
5996 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
5997 dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
5998 dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
5999 dst1[0] = src_r0[3] >> 24;
6003 dst0[0] = src_l0[0] | src_r0[0] << 16;
6004 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
6005 dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
6006 dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
6007 dst1[0] = src_r0[3] >> 16;
6011 dst0[0] = src_l0[0] | src_r0[0] << 24;
6012 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
6013 dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
6014 dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
6015 dst1[0] = src_r0[3] >> 8;
6019 dst0[1] = src_r0[0];
6020 dst0[2] = src_r0[1];
6021 dst0[3] = src_r0[2];
6022 dst1[0] = src_r0[3];
6026 dst0[1] = src_l0[1] | src_r0[0] << 8;
6027 dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
6028 dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
6029 dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
6030 dst1[1] = src_r0[3] >> 24;
6034 dst0[1] = src_l0[1] | src_r0[0] << 16;
6035 dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
6036 dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
6037 dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
6038 dst1[1] = src_r0[3] >> 16;
6042 dst0[1] = src_l0[1] | src_r0[0] << 24;
6043 dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
6044 dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
6045 dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
6046 dst1[1] = src_r0[3] >> 8;
6050 dst0[2] = src_r0[0];
6051 dst0[3] = src_r0[1];
6052 dst1[0] = src_r0[2];
6053 dst1[1] = src_r0[3];
6057 dst0[2] = src_l0[2] | src_r0[0] << 8;
6058 dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
6059 dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
6060 dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
6061 dst1[2] = src_r0[3] >> 24;
6065 dst0[2] = src_l0[2] | src_r0[0] << 16;
6066 dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
6067 dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
6068 dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
6069 dst1[2] = src_r0[3] >> 16;
6073 dst0[2] = src_l0[2] | src_r0[0] << 24;
6074 dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
6075 dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
6076 dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
6077 dst1[2] = src_r0[3] >> 8;
6081 dst0[3] = src_r0[0];
6082 dst1[0] = src_r0[1];
6083 dst1[1] = src_r0[2];
6084 dst1[2] = src_r0[3];
6088 dst0[3] = src_l0[3] | src_r0[0] << 8;
6089 dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
6090 dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
6091 dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
6092 dst1[3] = src_r0[3] >> 24;
6096 dst0[3] = src_l0[3] | src_r0[0] << 16;
6097 dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
6098 dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
6099 dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
6100 dst1[3] = src_r0[3] >> 16;
6104 dst0[3] = src_l0[3] | src_r0[0] << 24;
6105 dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
6106 dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
6107 dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
6108 dst1[3] = src_r0[3] >> 8;
6112 dst1[0] = src_r0[0];
6113 dst1[1] = src_r0[1];
6114 dst1[2] = src_r0[2];
6115 dst1[3] = src_r0[3];
6119 dst1[0] = src_l1[0] | src_r0[0] << 8;
6120 dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
6121 dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
6122 dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
6126 dst1[0] = src_l1[0] | src_r0[0] << 16;
6127 dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
6128 dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
6129 dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
6133 dst1[0] = src_l1[0] | src_r0[0] << 24;
6134 dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
6135 dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
6136 dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
6140 dst1[1] = src_r0[0];
6141 dst1[2] = src_r0[1];
6142 dst1[3] = src_r0[2];
6146 dst1[1] = src_l1[1] | src_r0[0] << 8;
6147 dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
6148 dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
6152 dst1[1] = src_l1[1] | src_r0[0] << 16;
6153 dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
6154 dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
6158 dst1[1] = src_l1[1] | src_r0[0] << 24;
6159 dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
6160 dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
6164 dst1[2] = src_r0[0];
6165 dst1[3] = src_r0[1];
6169 dst1[2] = src_l1[2] | src_r0[0] << 8;
6170 dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
6174 dst1[2] = src_l1[2] | src_r0[0] << 16;
6175 dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
6179 dst1[2] = src_l1[2] | src_r0[0] << 24;
6180 dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
6184 dst1[3] = src_r0[0];
6188 dst1[3] = src_l1[3] | src_r0[0] << 8;
6192 dst1[3] = src_l1[3] | src_r0[0] << 16;
6196 dst1[3] = src_l1[3] | src_r0[0] << 24;
6201 // before: device_memcat12L
// Merges the four 32-bit words of src_r0 into the twelve destination words
// dst0[0..3], dst1[0..3], dst2[0..3].
//
// The statements below form groups; each group places src_r0 at a different
// starting destination word and with a different bit shift:
//   - unshifted groups are plain word copies and do not read src_l*;
//   - groups shifted by 8, 16 or 24 bits shift every source word left and OR
//     the bits shifted out (>> 24, >> 16, >> 8) into the next destination
//     word. Only the first destination word of such a group is OR-ed with
//     the matching src_l* word; the other words are overwritten.
// A group never writes destination words outside the ones it lists.
//
// NOTE(review): the lines that select a group (presumably a switch on
// `offset`, one case per byte position) are not present in this listing;
// confirm the offset-to-group mapping against the complete file.
6202 static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4])
// Start word dst0[0], shifts 8 / 16 / 24. The fifth line of each group
// stores the bits shifted out of src_r0[3].
6207 dst0[0] = src_l0[0] | src_r0[0] << 8;
6208 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
6209 dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
6210 dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
6211 dst1[0] = src_r0[3] >> 24;
6215 dst0[0] = src_l0[0] | src_r0[0] << 16;
6216 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
6217 dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
6218 dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
6219 dst1[0] = src_r0[3] >> 16;
6223 dst0[0] = src_l0[0] | src_r0[0] << 24;
6224 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
6225 dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
6226 dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
6227 dst1[0] = src_r0[3] >> 8;
// Start word dst0[1]: unshifted copy, then shifts 8 / 16 / 24.
6231 dst0[1] = src_r0[0];
6232 dst0[2] = src_r0[1];
6233 dst0[3] = src_r0[2];
6234 dst1[0] = src_r0[3];
6238 dst0[1] = src_l0[1] | src_r0[0] << 8;
6239 dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
6240 dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
6241 dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
6242 dst1[1] = src_r0[3] >> 24;
6246 dst0[1] = src_l0[1] | src_r0[0] << 16;
6247 dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
6248 dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
6249 dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
6250 dst1[1] = src_r0[3] >> 16;
6254 dst0[1] = src_l0[1] | src_r0[0] << 24;
6255 dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
6256 dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
6257 dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
6258 dst1[1] = src_r0[3] >> 8;
// Start word dst0[2].
6262 dst0[2] = src_r0[0];
6263 dst0[3] = src_r0[1];
6264 dst1[0] = src_r0[2];
6265 dst1[1] = src_r0[3];
6269 dst0[2] = src_l0[2] | src_r0[0] << 8;
6270 dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
6271 dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
6272 dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
6273 dst1[2] = src_r0[3] >> 24;
6277 dst0[2] = src_l0[2] | src_r0[0] << 16;
6278 dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
6279 dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
6280 dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
6281 dst1[2] = src_r0[3] >> 16;
6285 dst0[2] = src_l0[2] | src_r0[0] << 24;
6286 dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
6287 dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
6288 dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
6289 dst1[2] = src_r0[3] >> 8;
// Start word dst0[3].
6293 dst0[3] = src_r0[0];
6294 dst1[0] = src_r0[1];
6295 dst1[1] = src_r0[2];
6296 dst1[2] = src_r0[3];
6300 dst0[3] = src_l0[3] | src_r0[0] << 8;
6301 dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
6302 dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
6303 dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
6304 dst1[3] = src_r0[3] >> 24;
6308 dst0[3] = src_l0[3] | src_r0[0] << 16;
6309 dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
6310 dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
6311 dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
6312 dst1[3] = src_r0[3] >> 16;
6316 dst0[3] = src_l0[3] | src_r0[0] << 24;
6317 dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
6318 dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
6319 dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
6320 dst1[3] = src_r0[3] >> 8;
// Start word dst1[0]; from here the shifted groups spill into dst2.
6324 dst1[0] = src_r0[0];
6325 dst1[1] = src_r0[1];
6326 dst1[2] = src_r0[2];
6327 dst1[3] = src_r0[3];
6331 dst1[0] = src_l1[0] | src_r0[0] << 8;
6332 dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
6333 dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
6334 dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
6335 dst2[0] = src_r0[3] >> 24;
6339 dst1[0] = src_l1[0] | src_r0[0] << 16;
6340 dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
6341 dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
6342 dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
6343 dst2[0] = src_r0[3] >> 16;
6347 dst1[0] = src_l1[0] | src_r0[0] << 24;
6348 dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
6349 dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
6350 dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
6351 dst2[0] = src_r0[3] >> 8;
// Start word dst1[1].
6355 dst1[1] = src_r0[0];
6356 dst1[2] = src_r0[1];
6357 dst1[3] = src_r0[2];
6358 dst2[0] = src_r0[3];
6362 dst1[1] = src_l1[1] | src_r0[0] << 8;
6363 dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
6364 dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
6365 dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8;
6366 dst2[1] = src_r0[3] >> 24;
6370 dst1[1] = src_l1[1] | src_r0[0] << 16;
6371 dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
6372 dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
6373 dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
6374 dst2[1] = src_r0[3] >> 16;
6378 dst1[1] = src_l1[1] | src_r0[0] << 24;
6379 dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
6380 dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
6381 dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24;
6382 dst2[1] = src_r0[3] >> 8;
// Start word dst1[2].
6386 dst1[2] = src_r0[0];
6387 dst1[3] = src_r0[1];
6388 dst2[0] = src_r0[2];
6389 dst2[1] = src_r0[3];
6393 dst1[2] = src_l1[2] | src_r0[0] << 8;
6394 dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
6395 dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8;
6396 dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8;
6397 dst2[2] = src_r0[3] >> 24;
6401 dst1[2] = src_l1[2] | src_r0[0] << 16;
6402 dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
6403 dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
6404 dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
6405 dst2[2] = src_r0[3] >> 16;
6409 dst1[2] = src_l1[2] | src_r0[0] << 24;
6410 dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
6411 dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24;
6412 dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24;
6413 dst2[2] = src_r0[3] >> 8;
// Start word dst1[3].
6417 dst1[3] = src_r0[0];
6418 dst2[0] = src_r0[1];
6419 dst2[1] = src_r0[2];
6420 dst2[2] = src_r0[3];
6424 dst1[3] = src_l1[3] | src_r0[0] << 8;
6425 dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8;
6426 dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8;
6427 dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8;
6428 dst2[3] = src_r0[3] >> 24;
6432 dst1[3] = src_l1[3] | src_r0[0] << 16;
6433 dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
6434 dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
6435 dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
6436 dst2[3] = src_r0[3] >> 16;
6440 dst1[3] = src_l1[3] | src_r0[0] << 24;
6441 dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24;
6442 dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24;
6443 dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24;
6444 dst2[3] = src_r0[3] >> 8;
// Start word dst2[0]. dst2[3] is the last destination word, so in the
// shifted groups the bits shifted out of src_r0[3] are no longer stored.
6448 dst2[0] = src_r0[0];
6449 dst2[1] = src_r0[1];
6450 dst2[2] = src_r0[2];
6451 dst2[3] = src_r0[3];
6455 dst2[0] = src_l2[0] | src_r0[0] << 8;
6456 dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8;
6457 dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8;
6458 dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8;
6462 dst2[0] = src_l2[0] | src_r0[0] << 16;
6463 dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
6464 dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
6465 dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
6469 dst2[0] = src_l2[0] | src_r0[0] << 24;
6470 dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24;
6471 dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24;
6472 dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24;
// Start word dst2[1]: only three destination words remain, src_r0[3] is
// not copied in the unshifted group.
6476 dst2[1] = src_r0[0];
6477 dst2[2] = src_r0[1];
6478 dst2[3] = src_r0[2];
6482 dst2[1] = src_l2[1] | src_r0[0] << 8;
6483 dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8;
6484 dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8;
6488 dst2[1] = src_l2[1] | src_r0[0] << 16;
6489 dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
6490 dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
6494 dst2[1] = src_l2[1] | src_r0[0] << 24;
6495 dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24;
6496 dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24;
// Start word dst2[2]: two destination words remain.
6500 dst2[2] = src_r0[0];
6501 dst2[3] = src_r0[1];
6505 dst2[2] = src_l2[2] | src_r0[0] << 8;
6506 dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8;
6510 dst2[2] = src_l2[2] | src_r0[0] << 16;
6511 dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
6515 dst2[2] = src_l2[2] | src_r0[0] << 24;
6516 dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24;
// Start word dst2[3]: a single destination word remains.
6520 dst2[3] = src_r0[0];
6524 dst2[3] = src_l2[3] | src_r0[0] << 8;
6528 dst2[3] = src_l2[3] | src_r0[0] << 16;
6532 dst2[3] = src_l2[3] | src_r0[0] << 24;
6537 // before: device_memcat12L
6538 static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4])
6543 dst0[0] = src_r0[0];
6544 dst0[1] = src_r0[1];
6545 dst0[2] = src_r0[2];
6546 dst0[3] = src_r0[3];
6547 dst1[0] = src_r1[0];
6548 dst1[1] = src_r1[1];
6549 dst1[2] = src_r1[2];
6550 dst1[3] = src_r1[3];
6554 dst0[0] = src_l0[0] | src_r0[0] << 8;
6555 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
6556 dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
6557 dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
6558 dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8;
6559 dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8;
6560 dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8;
6561 dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8;
6562 dst2[0] = src_r1[3] >> 24;
6566 dst0[0] = src_l0[0] | src_r0[0] << 16;
6567 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
6568 dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
6569 dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
6570 dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16;
6571 dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16;
6572 dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16;
6573 dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16;
6574 dst2[0] = src_r1[3] >> 16;
6578 dst0[0] = src_l0[0] | src_r0[0] << 24;
6579 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
6580 dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
6581 dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
6582 dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24;
6583 dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24;
6584 dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24;
6585 dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24;
6586 dst2[0] = src_r1[3] >> 8;
6590 dst0[1] = src_r0[0];
6591 dst0[2] = src_r0[1];
6592 dst0[3] = src_r0[2];
6593 dst1[0] = src_r0[3];
6594 dst1[1] = src_r1[0];
6595 dst1[2] = src_r1[1];
6596 dst1[3] = src_r1[2];
6597 dst2[0] = src_r1[3];
6601 dst0[1] = src_l0[1] | src_r0[0] << 8;
6602 dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
6603 dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
6604 dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
6605 dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8;
6606 dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8;
6607 dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8;
6608 dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8;
6609 dst2[1] = src_r1[3] >> 24;
6613 dst0[1] = src_l0[1] | src_r0[0] << 16;
6614 dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
6615 dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
6616 dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
6617 dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16;
6618 dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16;
6619 dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16;
6620 dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16;
6621 dst2[1] = src_r1[3] >> 16;
6625 dst0[1] = src_l0[1] | src_r0[0] << 24;
6626 dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
6627 dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
6628 dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
6629 dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24;
6630 dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24;
6631 dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24;
6632 dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24;
6633 dst2[1] = src_r1[3] >> 8;
6637 dst0[2] = src_r0[0];
6638 dst0[3] = src_r0[1];
6639 dst1[0] = src_r0[2];
6640 dst1[1] = src_r0[3];
6641 dst1[2] = src_r1[0];
6642 dst1[3] = src_r1[1];
6643 dst2[0] = src_r1[2];
6644 dst2[1] = src_r1[3];
6648 dst0[2] = src_l0[2] | src_r0[0] << 8;
6649 dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
6650 dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
6651 dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
6652 dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8;
6653 dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8;
6654 dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8;
6655 dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8;
6656 dst2[2] = src_r1[3] >> 24;
6660 dst0[2] = src_l0[2] | src_r0[0] << 16;
6661 dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
6662 dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
6663 dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
6664 dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16;
6665 dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16;
6666 dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16;
6667 dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16;
6668 dst2[2] = src_r1[3] >> 16;
6672 dst0[2] = src_l0[2] | src_r0[0] << 24;
6673 dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
6674 dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
6675 dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
6676 dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24;
6677 dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24;
6678 dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24;
6679 dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24;
6680 dst2[2] = src_r1[3] >> 8;
6684 dst0[3] = src_r0[0];
6685 dst1[0] = src_r0[1];
6686 dst1[1] = src_r0[2];
6687 dst1[2] = src_r0[3];
6688 dst1[3] = src_r1[0];
6689 dst2[0] = src_r1[1];
6690 dst2[1] = src_r1[2];
6691 dst2[2] = src_r1[3];
6695 dst0[3] = src_l0[3] | src_r0[0] << 8;
6696 dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
6697 dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
6698 dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
6699 dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8;
6700 dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8;
6701 dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8;
6702 dst2[2] = src_r1[2] >> 24 | src_r1[3] << 8;
6703 dst2[3] = src_r1[3] >> 24;
6707 dst0[3] = src_l0[3] | src_r0[0] << 16;
6708 dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
6709 dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
6710 dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
6711 dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16;
6712 dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16;
6713 dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16;
6714 dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16;
6715 dst2[3] = src_r1[3] >> 16;
6719 dst0[3] = src_l0[3] | src_r0[0] << 24;
6720 dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
6721 dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
6722 dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
6723 dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24;
6724 dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24;
6725 dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24;
6726 dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24;
6727 dst2[3] = src_r1[3] >> 8;
6731 dst1[0] = src_r0[0];
6732 dst1[1] = src_r0[1];
6733 dst1[2] = src_r0[2];
6734 dst1[3] = src_r0[3];
6735 dst2[0] = src_r1[0];
6736 dst2[1] = src_r1[1];
6737 dst2[2] = src_r1[2];
6738 dst2[3] = src_r1[3];
6742 dst1[0] = src_l1[0] | src_r0[0] << 8;
6743 dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
6744 dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
6745 dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
6746 dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8;
6747 dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8;
6748 dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8;
6749 dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8;
6753 dst1[0] = src_l1[0] | src_r0[0] << 16;
6754 dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
6755 dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
6756 dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
6757 dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16;
6758 dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16;
6759 dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16;
6760 dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16;
6764 dst1[0] = src_l1[0] | src_r0[0] << 24;
6765 dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
6766 dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
6767 dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
6768 dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24;
6769 dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24;
6770 dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24;
6771 dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24;
6775 dst1[1] = src_r1[0];
6776 dst1[2] = src_r0[1];
6777 dst1[3] = src_r0[2];
6778 dst2[0] = src_r0[3];
6779 dst2[1] = src_r1[0];
6780 dst2[2] = src_r1[1];
6781 dst2[3] = src_r1[2];
6785 dst1[1] = src_l1[1] | src_r0[0] << 8;
6786 dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
6787 dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
6788 dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8;
6789 dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8;
6790 dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8;
6791 dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8;
6795 dst1[1] = src_l1[1] | src_r0[0] << 16;
6796 dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
6797 dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
6798 dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
6799 dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16;
6800 dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16;
6801 dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16;
6805 dst1[1] = src_l1[1] | src_r0[0] << 24;
6806 dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
6807 dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
6808 dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24;
6809 dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24;
6810 dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24;
6811 dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24;
6815 dst1[2] = src_r1[0];
6816 dst1[3] = src_r0[1];
6817 dst2[0] = src_r0[2];
6818 dst2[1] = src_r0[3];
6819 dst2[2] = src_r1[0];
6820 dst2[3] = src_r1[1];
6824 dst1[2] = src_l1[2] | src_r0[0] << 8;
6825 dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
6826 dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8;
6827 dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8;
6828 dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8;
6829 dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8;
6833 dst1[2] = src_l1[2] | src_r0[0] << 16;
6834 dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
6835 dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
6836 dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
6837 dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16;
6838 dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16;
6842 dst1[2] = src_l1[2] | src_r0[0] << 24;
6843 dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
6844 dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24;
6845 dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24;
6846 dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24;
6847 dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24;
6851 dst1[3] = src_r1[0];
6852 dst2[0] = src_r0[1];
6853 dst2[1] = src_r0[2];
6854 dst2[2] = src_r0[3];
6855 dst2[3] = src_r1[0];
6859 dst1[3] = src_l1[3] | src_r0[0] << 8;
6860 dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8;
6861 dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8;
6862 dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8;
6863 dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8;
6867 dst1[3] = src_l1[3] | src_r0[0] << 16;
6868 dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
6869 dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
6870 dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
6871 dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16;
6875 dst1[3] = src_l1[3] | src_r0[0] << 24;
6876 dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24;
6877 dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24;
6878 dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24;
6879 dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24;
6883 dst2[0] = src_r0[0];
6884 dst2[1] = src_r0[1];
6885 dst2[2] = src_r0[2];
6886 dst2[3] = src_r0[3];
6890 dst2[0] = src_l2[0] | src_r0[0] << 8;
6891 dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8;
6892 dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8;
6893 dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8;
6897 dst2[0] = src_l2[0] | src_r0[0] << 16;
6898 dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
6899 dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
6900 dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
6904 dst2[0] = src_l2[0] | src_r0[0] << 24;
6905 dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24;
6906 dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24;
6907 dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24;
6911 dst2[1] = src_r0[0];
6912 dst2[2] = src_r0[1];
6913 dst2[3] = src_r0[2];
6917 dst2[1] = src_l2[1] | src_r0[0] << 8;
6918 dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8;
6919 dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8;
6923 dst2[1] = src_l2[1] | src_r0[0] << 16;
6924 dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
6925 dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
6929 dst2[1] = src_l2[1] | src_r0[0] << 24;
6930 dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24;
6931 dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24;
6935 dst2[2] = src_r0[0];
6936 dst2[3] = src_r0[1];
6940 dst2[2] = src_l2[2] | src_r0[0] << 8;
6941 dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8;
6945 dst2[2] = src_l2[2] | src_r0[0] << 16;
6946 dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
6950 dst2[2] = src_l2[2] | src_r0[0] << 24;
6951 dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24;
6955 dst2[3] = src_r0[0];
6959 dst2[3] = src_l2[3] | src_r0[0] << 8;
6963 dst2[3] = src_l2[3] | src_r0[0] << 16;
6967 dst2[3] = src_l2[3] | src_r0[0] << 24;
6972 // before: memcat16_9
// Appends the words append0[0..3], append1[0..3] and append2[0] to the
// buffer w0/w1/w2/w3 in place.
//
// Each visible group of ten assignments handles one starting word and one
// shift (8, 16 or 24 bits): the first word keeps its current content and is
// OR-ed with the shifted append0[0]; each following word is overwritten with
// the bits shifted out of the previous append word OR-ed with the next
// append word shifted left; the last line stores the bits shifted out of
// append2[0].
//
// Of append2 only element [0] is read in the statements shown here.
//
// NOTE(review): the lines that select a group by `offset`, and any unshifted
// (word-aligned) groups, are not present in this listing; confirm them
// against the complete file before relying on this description.
6973 static void memcat_c15_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
// Start word w0[0], shifts 8 / 16 / 24.
6990 w0[0] = w0[0] | append0[0] << 8;
6991 w0[1] = append0[0] >> 24 | append0[1] << 8;
6992 w0[2] = append0[1] >> 24 | append0[2] << 8;
6993 w0[3] = append0[2] >> 24 | append0[3] << 8;
6994 w1[0] = append0[3] >> 24 | append1[0] << 8;
6995 w1[1] = append1[0] >> 24 | append1[1] << 8;
6996 w1[2] = append1[1] >> 24 | append1[2] << 8;
6997 w1[3] = append1[2] >> 24 | append1[3] << 8;
6998 w2[0] = append1[3] >> 24 | append2[0] << 8;
6999 w2[1] = append2[0] >> 24;
7003 w0[0] = w0[0] | append0[0] << 16;
7004 w0[1] = append0[0] >> 16 | append0[1] << 16;
7005 w0[2] = append0[1] >> 16 | append0[2] << 16;
7006 w0[3] = append0[2] >> 16 | append0[3] << 16;
7007 w1[0] = append0[3] >> 16 | append1[0] << 16;
7008 w1[1] = append1[0] >> 16 | append1[1] << 16;
7009 w1[2] = append1[1] >> 16 | append1[2] << 16;
7010 w1[3] = append1[2] >> 16 | append1[3] << 16;
7011 w2[0] = append1[3] >> 16 | append2[0] << 16;
7012 w2[1] = append2[0] >> 16;
7016 w0[0] = w0[0] | append0[0] << 24;
7017 w0[1] = append0[0] >> 8 | append0[1] << 24;
7018 w0[2] = append0[1] >> 8 | append0[2] << 24;
7019 w0[3] = append0[2] >> 8 | append0[3] << 24;
7020 w1[0] = append0[3] >> 8 | append1[0] << 24;
7021 w1[1] = append1[0] >> 8 | append1[1] << 24;
7022 w1[2] = append1[1] >> 8 | append1[2] << 24;
7023 w1[3] = append1[2] >> 8 | append1[3] << 24;
7024 w2[0] = append1[3] >> 8 | append2[0] << 24;
7025 w2[1] = append2[0] >> 8;
// Start word w0[1], shifts 8 / 16 / 24.
7041 w0[1] = w0[1] | append0[0] << 8;
7042 w0[2] = append0[0] >> 24 | append0[1] << 8;
7043 w0[3] = append0[1] >> 24 | append0[2] << 8;
7044 w1[0] = append0[2] >> 24 | append0[3] << 8;
7045 w1[1] = append0[3] >> 24 | append1[0] << 8;
7046 w1[2] = append1[0] >> 24 | append1[1] << 8;
7047 w1[3] = append1[1] >> 24 | append1[2] << 8;
7048 w2[0] = append1[2] >> 24 | append1[3] << 8;
7049 w2[1] = append1[3] >> 24 | append2[0] << 8;
7050 w2[2] = append2[0] >> 24;
7054 w0[1] = w0[1] | append0[0] << 16;
7055 w0[2] = append0[0] >> 16 | append0[1] << 16;
7056 w0[3] = append0[1] >> 16 | append0[2] << 16;
7057 w1[0] = append0[2] >> 16 | append0[3] << 16;
7058 w1[1] = append0[3] >> 16 | append1[0] << 16;
7059 w1[2] = append1[0] >> 16 | append1[1] << 16;
7060 w1[3] = append1[1] >> 16 | append1[2] << 16;
7061 w2[0] = append1[2] >> 16 | append1[3] << 16;
7062 w2[1] = append1[3] >> 16 | append2[0] << 16;
7063 w2[2] = append2[0] >> 16;
7067 w0[1] = w0[1] | append0[0] << 24;
7068 w0[2] = append0[0] >> 8 | append0[1] << 24;
7069 w0[3] = append0[1] >> 8 | append0[2] << 24;
7070 w1[0] = append0[2] >> 8 | append0[3] << 24;
7071 w1[1] = append0[3] >> 8 | append1[0] << 24;
7072 w1[2] = append1[0] >> 8 | append1[1] << 24;
7073 w1[3] = append1[1] >> 8 | append1[2] << 24;
7074 w2[0] = append1[2] >> 8 | append1[3] << 24;
7075 w2[1] = append1[3] >> 8 | append2[0] << 24;
7076 w2[2] = append2[0] >> 8;
// Start word w0[2], shifts 8 / 16 / 24.
7092 w0[2] = w0[2] | append0[0] << 8;
7093 w0[3] = append0[0] >> 24 | append0[1] << 8;
7094 w1[0] = append0[1] >> 24 | append0[2] << 8;
7095 w1[1] = append0[2] >> 24 | append0[3] << 8;
7096 w1[2] = append0[3] >> 24 | append1[0] << 8;
7097 w1[3] = append1[0] >> 24 | append1[1] << 8;
7098 w2[0] = append1[1] >> 24 | append1[2] << 8;
7099 w2[1] = append1[2] >> 24 | append1[3] << 8;
7100 w2[2] = append1[3] >> 24 | append2[0] << 8;
7101 w2[3] = append2[0] >> 24;
7105 w0[2] = w0[2] | append0[0] << 16;
7106 w0[3] = append0[0] >> 16 | append0[1] << 16;
7107 w1[0] = append0[1] >> 16 | append0[2] << 16;
7108 w1[1] = append0[2] >> 16 | append0[3] << 16;
7109 w1[2] = append0[3] >> 16 | append1[0] << 16;
7110 w1[3] = append1[0] >> 16 | append1[1] << 16;
7111 w2[0] = append1[1] >> 16 | append1[2] << 16;
7112 w2[1] = append1[2] >> 16 | append1[3] << 16;
7113 w2[2] = append1[3] >> 16 | append2[0] << 16;
7114 w2[3] = append2[0] >> 16;
7118 w0[2] = w0[2] | append0[0] << 24;
7119 w0[3] = append0[0] >> 8 | append0[1] << 24;
7120 w1[0] = append0[1] >> 8 | append0[2] << 24;
7121 w1[1] = append0[2] >> 8 | append0[3] << 24;
7122 w1[2] = append0[3] >> 8 | append1[0] << 24;
7123 w1[3] = append1[0] >> 8 | append1[1] << 24;
7124 w2[0] = append1[1] >> 8 | append1[2] << 24;
7125 w2[1] = append1[2] >> 8 | append1[3] << 24;
7126 w2[2] = append1[3] >> 8 | append2[0] << 24;
7127 w2[3] = append2[0] >> 8;
// Start word w0[3], shifts 8 / 16 / 24; these are the only visible groups
// that write into w3 (w3[0] only).
7143 w0[3] = w0[3] | append0[0] << 8;
7144 w1[0] = append0[0] >> 24 | append0[1] << 8;
7145 w1[1] = append0[1] >> 24 | append0[2] << 8;
7146 w1[2] = append0[2] >> 24 | append0[3] << 8;
7147 w1[3] = append0[3] >> 24 | append1[0] << 8;
7148 w2[0] = append1[0] >> 24 | append1[1] << 8;
7149 w2[1] = append1[1] >> 24 | append1[2] << 8;
7150 w2[2] = append1[2] >> 24 | append1[3] << 8;
7151 w2[3] = append1[3] >> 24 | append2[0] << 8;
7152 w3[0] = append2[0] >> 24;
7156 w0[3] = w0[3] | append0[0] << 16;
7157 w1[0] = append0[0] >> 16 | append0[1] << 16;
7158 w1[1] = append0[1] >> 16 | append0[2] << 16;
7159 w1[2] = append0[2] >> 16 | append0[3] << 16;
7160 w1[3] = append0[3] >> 16 | append1[0] << 16;
7161 w2[0] = append1[0] >> 16 | append1[1] << 16;
7162 w2[1] = append1[1] >> 16 | append1[2] << 16;
7163 w2[2] = append1[2] >> 16 | append1[3] << 16;
7164 w2[3] = append1[3] >> 16 | append2[0] << 16;
7165 w3[0] = append2[0] >> 16;
7169 w0[3] = w0[3] | append0[0] << 24;
7170 w1[0] = append0[0] >> 8 | append0[1] << 24;
7171 w1[1] = append0[1] >> 8 | append0[2] << 24;
7172 w1[2] = append0[2] >> 8 | append0[3] << 24;
7173 w1[3] = append0[3] >> 8 | append1[0] << 24;
7174 w2[0] = append1[0] >> 8 | append1[1] << 24;
7175 w2[1] = append1[1] >> 8 | append1[2] << 24;
7176 w2[2] = append1[2] >> 8 | append1[3] << 24;
7177 w2[3] = append1[3] >> 8 | append2[0] << 24;
7178 w3[0] = append2[0] >> 8;
7183 // before: memcat32_8
7184 static void memcat_c32_w4x4_a2x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset)
7200 w0[0] = w0[0] | append0[0] << 8;
7201 w0[1] = append0[0] >> 24 | append0[1] << 8;
7202 w0[2] = append0[1] >> 24 | append0[2] << 8;
7203 w0[3] = append0[2] >> 24 | append0[3] << 8;
7204 w1[0] = append0[3] >> 24 | append1[0] << 8;
7205 w1[1] = append1[0] >> 24 | append1[1] << 8;
7206 w1[2] = append1[1] >> 24 | append1[2] << 8;
7207 w1[3] = append1[2] >> 24 | append1[3] << 8;
7208 w2[0] = append1[3] >> 24;
7212 w0[0] = w0[0] | append0[0] << 16;
7213 w0[1] = append0[0] >> 16 | append0[1] << 16;
7214 w0[2] = append0[1] >> 16 | append0[2] << 16;
7215 w0[3] = append0[2] >> 16 | append0[3] << 16;
7216 w1[0] = append0[3] >> 16 | append1[0] << 16;
7217 w1[1] = append1[0] >> 16 | append1[1] << 16;
7218 w1[2] = append1[1] >> 16 | append1[2] << 16;
7219 w1[3] = append1[2] >> 16 | append1[3] << 16;
7220 w2[0] = append1[3] >> 16;
7224 w0[0] = w0[0] | append0[0] << 24;
7225 w0[1] = append0[0] >> 8 | append0[1] << 24;
7226 w0[2] = append0[1] >> 8 | append0[2] << 24;
7227 w0[3] = append0[2] >> 8 | append0[3] << 24;
7228 w1[0] = append0[3] >> 8 | append1[0] << 24;
7229 w1[1] = append1[0] >> 8 | append1[1] << 24;
7230 w1[2] = append1[1] >> 8 | append1[2] << 24;
7231 w1[3] = append1[2] >> 8 | append1[3] << 24;
7232 w2[0] = append1[3] >> 8;
7247 w0[1] = w0[1] | append0[0] << 8;
7248 w0[2] = append0[0] >> 24 | append0[1] << 8;
7249 w0[3] = append0[1] >> 24 | append0[2] << 8;
7250 w1[0] = append0[2] >> 24 | append0[3] << 8;
7251 w1[1] = append0[3] >> 24 | append1[0] << 8;
7252 w1[2] = append1[0] >> 24 | append1[1] << 8;
7253 w1[3] = append1[1] >> 24 | append1[2] << 8;
7254 w2[0] = append1[2] >> 24 | append1[3] << 8;
7255 w2[1] = append1[3] >> 24;
7259 w0[1] = w0[1] | append0[0] << 16;
7260 w0[2] = append0[0] >> 16 | append0[1] << 16;
7261 w0[3] = append0[1] >> 16 | append0[2] << 16;
7262 w1[0] = append0[2] >> 16 | append0[3] << 16;
7263 w1[1] = append0[3] >> 16 | append1[0] << 16;
7264 w1[2] = append1[0] >> 16 | append1[1] << 16;
7265 w1[3] = append1[1] >> 16 | append1[2] << 16;
7266 w2[0] = append1[2] >> 16 | append1[3] << 16;
7267 w2[1] = append1[3] >> 16;
7271 w0[1] = w0[1] | append0[0] << 24;
7272 w0[2] = append0[0] >> 8 | append0[1] << 24;
7273 w0[3] = append0[1] >> 8 | append0[2] << 24;
7274 w1[0] = append0[2] >> 8 | append0[3] << 24;
7275 w1[1] = append0[3] >> 8 | append1[0] << 24;
7276 w1[2] = append1[0] >> 8 | append1[1] << 24;
7277 w1[3] = append1[1] >> 8 | append1[2] << 24;
7278 w2[0] = append1[2] >> 8 | append1[3] << 24;
7279 w2[1] = append1[3] >> 8;
7294 w0[2] = w0[2] | append0[0] << 8;
7295 w0[3] = append0[0] >> 24 | append0[1] << 8;
7296 w1[0] = append0[1] >> 24 | append0[2] << 8;
7297 w1[1] = append0[2] >> 24 | append0[3] << 8;
7298 w1[2] = append0[3] >> 24 | append1[0] << 8;
7299 w1[3] = append1[0] >> 24 | append1[1] << 8;
7300 w2[0] = append1[1] >> 24 | append1[2] << 8;
7301 w2[1] = append1[2] >> 24 | append1[3] << 8;
7302 w2[2] = append1[3] >> 24;
7306 w0[2] = w0[2] | append0[0] << 16;
7307 w0[3] = append0[0] >> 16 | append0[1] << 16;
7308 w1[0] = append0[1] >> 16 | append0[2] << 16;
7309 w1[1] = append0[2] >> 16 | append0[3] << 16;
7310 w1[2] = append0[3] >> 16 | append1[0] << 16;
7311 w1[3] = append1[0] >> 16 | append1[1] << 16;
7312 w2[0] = append1[1] >> 16 | append1[2] << 16;
7313 w2[1] = append1[2] >> 16 | append1[3] << 16;
7314 w2[2] = append1[3] >> 16;
7318 w0[2] = w0[2] | append0[0] << 24;
7319 w0[3] = append0[0] >> 8 | append0[1] << 24;
7320 w1[0] = append0[1] >> 8 | append0[2] << 24;
7321 w1[1] = append0[2] >> 8 | append0[3] << 24;
7322 w1[2] = append0[3] >> 8 | append1[0] << 24;
7323 w1[3] = append1[0] >> 8 | append1[1] << 24;
7324 w2[0] = append1[1] >> 8 | append1[2] << 24;
7325 w2[1] = append1[2] >> 8 | append1[3] << 24;
7326 w2[2] = append1[3] >> 8;
7341 w0[3] = w0[3] | append0[0] << 8;
7342 w1[0] = append0[0] >> 24 | append0[1] << 8;
7343 w1[1] = append0[1] >> 24 | append0[2] << 8;
7344 w1[2] = append0[2] >> 24 | append0[3] << 8;
7345 w1[3] = append0[3] >> 24 | append1[0] << 8;
7346 w2[0] = append1[0] >> 24 | append1[1] << 8;
7347 w2[1] = append1[1] >> 24 | append1[2] << 8;
7348 w2[2] = append1[2] >> 24 | append1[3] << 8;
7349 w2[3] = append1[3] >> 24;
7353 w0[3] = w0[3] | append0[0] << 16;
7354 w1[0] = append0[0] >> 16 | append0[1] << 16;
7355 w1[1] = append0[1] >> 16 | append0[2] << 16;
7356 w1[2] = append0[2] >> 16 | append0[3] << 16;
7357 w1[3] = append0[3] >> 16 | append1[0] << 16;
7358 w2[0] = append1[0] >> 16 | append1[1] << 16;
7359 w2[1] = append1[1] >> 16 | append1[2] << 16;
7360 w2[2] = append1[2] >> 16 | append1[3] << 16;
7361 w2[3] = append1[3] >> 16;
7365 w0[3] = w0[3] | append0[0] << 24;
7366 w1[0] = append0[0] >> 8 | append0[1] << 24;
7367 w1[1] = append0[1] >> 8 | append0[2] << 24;
7368 w1[2] = append0[2] >> 8 | append0[3] << 24;
7369 w1[3] = append0[3] >> 8 | append1[0] << 24;
7370 w2[0] = append1[0] >> 8 | append1[1] << 24;
7371 w2[1] = append1[1] >> 8 | append1[2] << 24;
7372 w2[2] = append1[2] >> 8 | append1[3] << 24;
7373 w2[3] = append1[3] >> 8;
7388 w1[0] = w1[0] | append0[0] << 8;
7389 w1[1] = append0[0] >> 24 | append0[1] << 8;
7390 w1[2] = append0[1] >> 24 | append0[2] << 8;
7391 w1[3] = append0[2] >> 24 | append0[3] << 8;
7392 w2[0] = append0[3] >> 24 | append1[0] << 8;
7393 w2[1] = append1[0] >> 24 | append1[1] << 8;
7394 w2[2] = append1[1] >> 24 | append1[2] << 8;
7395 w2[3] = append1[2] >> 24 | append1[3] << 8;
7396 w3[0] = append1[3] >> 24;
7400 w1[0] = w1[0] | append0[0] << 16;
7401 w1[1] = append0[0] >> 16 | append0[1] << 16;
7402 w1[2] = append0[1] >> 16 | append0[2] << 16;
7403 w1[3] = append0[2] >> 16 | append0[3] << 16;
7404 w2[0] = append0[3] >> 16 | append1[0] << 16;
7405 w2[1] = append1[0] >> 16 | append1[1] << 16;
7406 w2[2] = append1[1] >> 16 | append1[2] << 16;
7407 w2[3] = append1[2] >> 16 | append1[3] << 16;
7408 w3[0] = append1[3] >> 16;
7412 w1[0] = w1[0] | append0[0] << 24;
7413 w1[1] = append0[0] >> 8 | append0[1] << 24;
7414 w1[2] = append0[1] >> 8 | append0[2] << 24;
7415 w1[3] = append0[2] >> 8 | append0[3] << 24;
7416 w2[0] = append0[3] >> 8 | append1[0] << 24;
7417 w2[1] = append1[0] >> 8 | append1[1] << 24;
7418 w2[2] = append1[1] >> 8 | append1[2] << 24;
7419 w2[3] = append1[2] >> 8 | append1[3] << 24;
7420 w3[0] = append1[3] >> 8;
7435 w1[1] = w1[1] | append0[0] << 8;
7436 w1[2] = append0[0] >> 24 | append0[1] << 8;
7437 w1[3] = append0[1] >> 24 | append0[2] << 8;
7438 w2[0] = append0[2] >> 24 | append0[3] << 8;
7439 w2[1] = append0[3] >> 24 | append1[0] << 8;
7440 w2[2] = append1[0] >> 24 | append1[1] << 8;
7441 w2[3] = append1[1] >> 24 | append1[2] << 8;
7442 w3[0] = append1[2] >> 24 | append1[3] << 8;
7443 w3[1] = append1[3] >> 24;
7447 w1[1] = w1[1] | append0[0] << 16;
7448 w1[2] = append0[0] >> 16 | append0[1] << 16;
7449 w1[3] = append0[1] >> 16 | append0[2] << 16;
7450 w2[0] = append0[2] >> 16 | append0[3] << 16;
7451 w2[1] = append0[3] >> 16 | append1[0] << 16;
7452 w2[2] = append1[0] >> 16 | append1[1] << 16;
7453 w2[3] = append1[1] >> 16 | append1[2] << 16;
7454 w3[0] = append1[2] >> 16 | append1[3] << 16;
7455 w3[1] = append1[3] >> 16;
7459 w1[1] = w1[1] | append0[0] << 24;
7460 w1[2] = append0[0] >> 8 | append0[1] << 24;
7461 w1[3] = append0[1] >> 8 | append0[2] << 24;
7462 w2[0] = append0[2] >> 8 | append0[3] << 24;
7463 w2[1] = append0[3] >> 8 | append1[0] << 24;
7464 w2[2] = append1[0] >> 8 | append1[1] << 24;
7465 w2[3] = append1[1] >> 8 | append1[2] << 24;
7466 w3[0] = append1[2] >> 8 | append1[3] << 24;
7467 w3[1] = append1[3] >> 8;
7482 w1[2] = w1[2] | append0[0] << 8;
7483 w1[3] = append0[0] >> 24 | append0[1] << 8;
7484 w2[0] = append0[1] >> 24 | append0[2] << 8;
7485 w2[1] = append0[2] >> 24 | append0[3] << 8;
7486 w2[2] = append0[3] >> 24 | append1[0] << 8;
7487 w2[3] = append1[0] >> 24 | append1[1] << 8;
7488 w3[0] = append1[1] >> 24 | append1[2] << 8;
7489 w3[1] = append1[2] >> 24 | append1[3] << 8;
7493 w1[2] = w1[2] | append0[0] << 16;
7494 w1[3] = append0[0] >> 16 | append0[1] << 16;
7495 w2[0] = append0[1] >> 16 | append0[2] << 16;
7496 w2[1] = append0[2] >> 16 | append0[3] << 16;
7497 w2[2] = append0[3] >> 16 | append1[0] << 16;
7498 w2[3] = append1[0] >> 16 | append1[1] << 16;
7499 w3[0] = append1[1] >> 16 | append1[2] << 16;
7500 w3[1] = append1[2] >> 16 | append1[3] << 16;
7504 w1[2] = w1[2] | append0[0] << 24;
7505 w1[3] = append0[0] >> 8 | append0[1] << 24;
7506 w2[0] = append0[1] >> 8 | append0[2] << 24;
7507 w2[1] = append0[2] >> 8 | append0[3] << 24;
7508 w2[2] = append0[3] >> 8 | append1[0] << 24;
7509 w2[3] = append1[0] >> 8 | append1[1] << 24;
7510 w3[0] = append1[1] >> 8 | append1[2] << 24;
7511 w3[1] = append1[2] >> 8 | append1[3] << 24;
7525 w1[3] = w1[3] | append0[0] << 8;
7526 w2[0] = append0[0] >> 24 | append0[1] << 8;
7527 w2[1] = append0[1] >> 24 | append0[2] << 8;
7528 w2[2] = append0[2] >> 24 | append0[3] << 8;
7529 w2[3] = append0[3] >> 24 | append1[0] << 8;
7530 w3[0] = append1[0] >> 24 | append1[1] << 8;
7531 w3[1] = append1[1] >> 24 | append1[2] << 8;
7535 w1[3] = w1[3] | append0[0] << 16;
7536 w2[0] = append0[0] >> 16 | append0[1] << 16;
7537 w2[1] = append0[1] >> 16 | append0[2] << 16;
7538 w2[2] = append0[2] >> 16 | append0[3] << 16;
7539 w2[3] = append0[3] >> 16 | append1[0] << 16;
7540 w3[0] = append1[0] >> 16 | append1[1] << 16;
7541 w3[1] = append1[1] >> 16 | append1[2] << 16;
7545 w1[3] = w1[3] | append0[0] << 24;
7546 w2[0] = append0[0] >> 8 | append0[1] << 24;
7547 w2[1] = append0[1] >> 8 | append0[2] << 24;
7548 w2[2] = append0[2] >> 8 | append0[3] << 24;
7549 w2[3] = append0[3] >> 8 | append1[0] << 24;
7550 w3[0] = append1[0] >> 8 | append1[1] << 24;
7551 w3[1] = append1[1] >> 8 | append1[2] << 24;
7565 // before: memcat32_9
// Splices the words append0[0..3], append1[0..3] and append2[0] into the
// word buffers w0..w3 at a position that is not word-aligned.
//
// Every visible group below follows one pattern, for a left shift s of
// 8, 16 or 24 bits:
//   - the first (partially filled) destination word keeps its current
//     value and is OR-ed with append0[0] << s;
//   - each following destination word is built from the top s bits of the
//     previous append word (>> (32 - s)) and the remaining bits of the next
//     append word (<< s);
//   - where present, a final line stores only the spilled top bits of the
//     last append word.
// In the visible lines w3[2] and w3[3] are never written, and append2[1..3]
// are never read.
//
// NOTE(review): the statements that select a group from `offset` are not
// visible in this excerpt (the embedded line numbers have gaps between the
// groups). Presumably each group is one case of a dispatch on `offset`, and
// the word-aligned positions are handled in the missing lines -- confirm
// against the full file.
7566 static void memcat_c32_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
// --- first destination word w0[0]; chain ends in w2[1] ---
// shift 8
7583 w0[0] = w0[0] | append0[0] << 8;
7584 w0[1] = append0[0] >> 24 | append0[1] << 8;
7585 w0[2] = append0[1] >> 24 | append0[2] << 8;
7586 w0[3] = append0[2] >> 24 | append0[3] << 8;
7587 w1[0] = append0[3] >> 24 | append1[0] << 8;
7588 w1[1] = append1[0] >> 24 | append1[1] << 8;
7589 w1[2] = append1[1] >> 24 | append1[2] << 8;
7590 w1[3] = append1[2] >> 24 | append1[3] << 8;
7591 w2[0] = append1[3] >> 24 | append2[0] << 8;
7592 w2[1] = append2[0] >> 24;
// shift 16
7596 w0[0] = w0[0] | append0[0] << 16;
7597 w0[1] = append0[0] >> 16 | append0[1] << 16;
7598 w0[2] = append0[1] >> 16 | append0[2] << 16;
7599 w0[3] = append0[2] >> 16 | append0[3] << 16;
7600 w1[0] = append0[3] >> 16 | append1[0] << 16;
7601 w1[1] = append1[0] >> 16 | append1[1] << 16;
7602 w1[2] = append1[1] >> 16 | append1[2] << 16;
7603 w1[3] = append1[2] >> 16 | append1[3] << 16;
7604 w2[0] = append1[3] >> 16 | append2[0] << 16;
7605 w2[1] = append2[0] >> 16;
// shift 24
7609 w0[0] = w0[0] | append0[0] << 24;
7610 w0[1] = append0[0] >> 8 | append0[1] << 24;
7611 w0[2] = append0[1] >> 8 | append0[2] << 24;
7612 w0[3] = append0[2] >> 8 | append0[3] << 24;
7613 w1[0] = append0[3] >> 8 | append1[0] << 24;
7614 w1[1] = append1[0] >> 8 | append1[1] << 24;
7615 w1[2] = append1[1] >> 8 | append1[2] << 24;
7616 w1[3] = append1[2] >> 8 | append1[3] << 24;
7617 w2[0] = append1[3] >> 8 | append2[0] << 24;
7618 w2[1] = append2[0] >> 8;
// --- first destination word w0[1]; chain ends in w2[2] ---
// shift 8
7634 w0[1] = w0[1] | append0[0] << 8;
7635 w0[2] = append0[0] >> 24 | append0[1] << 8;
7636 w0[3] = append0[1] >> 24 | append0[2] << 8;
7637 w1[0] = append0[2] >> 24 | append0[3] << 8;
7638 w1[1] = append0[3] >> 24 | append1[0] << 8;
7639 w1[2] = append1[0] >> 24 | append1[1] << 8;
7640 w1[3] = append1[1] >> 24 | append1[2] << 8;
7641 w2[0] = append1[2] >> 24 | append1[3] << 8;
7642 w2[1] = append1[3] >> 24 | append2[0] << 8;
7643 w2[2] = append2[0] >> 24;
// shift 16
7647 w0[1] = w0[1] | append0[0] << 16;
7648 w0[2] = append0[0] >> 16 | append0[1] << 16;
7649 w0[3] = append0[1] >> 16 | append0[2] << 16;
7650 w1[0] = append0[2] >> 16 | append0[3] << 16;
7651 w1[1] = append0[3] >> 16 | append1[0] << 16;
7652 w1[2] = append1[0] >> 16 | append1[1] << 16;
7653 w1[3] = append1[1] >> 16 | append1[2] << 16;
7654 w2[0] = append1[2] >> 16 | append1[3] << 16;
7655 w2[1] = append1[3] >> 16 | append2[0] << 16;
7656 w2[2] = append2[0] >> 16;
// shift 24
7660 w0[1] = w0[1] | append0[0] << 24;
7661 w0[2] = append0[0] >> 8 | append0[1] << 24;
7662 w0[3] = append0[1] >> 8 | append0[2] << 24;
7663 w1[0] = append0[2] >> 8 | append0[3] << 24;
7664 w1[1] = append0[3] >> 8 | append1[0] << 24;
7665 w1[2] = append1[0] >> 8 | append1[1] << 24;
7666 w1[3] = append1[1] >> 8 | append1[2] << 24;
7667 w2[0] = append1[2] >> 8 | append1[3] << 24;
7668 w2[1] = append1[3] >> 8 | append2[0] << 24;
7669 w2[2] = append2[0] >> 8;
// --- first destination word w0[2]; chain ends in w2[3] ---
// shift 8
7685 w0[2] = w0[2] | append0[0] << 8;
7686 w0[3] = append0[0] >> 24 | append0[1] << 8;
7687 w1[0] = append0[1] >> 24 | append0[2] << 8;
7688 w1[1] = append0[2] >> 24 | append0[3] << 8;
7689 w1[2] = append0[3] >> 24 | append1[0] << 8;
7690 w1[3] = append1[0] >> 24 | append1[1] << 8;
7691 w2[0] = append1[1] >> 24 | append1[2] << 8;
7692 w2[1] = append1[2] >> 24 | append1[3] << 8;
7693 w2[2] = append1[3] >> 24 | append2[0] << 8;
7694 w2[3] = append2[0] >> 24;
// shift 16
7698 w0[2] = w0[2] | append0[0] << 16;
7699 w0[3] = append0[0] >> 16 | append0[1] << 16;
7700 w1[0] = append0[1] >> 16 | append0[2] << 16;
7701 w1[1] = append0[2] >> 16 | append0[3] << 16;
7702 w1[2] = append0[3] >> 16 | append1[0] << 16;
7703 w1[3] = append1[0] >> 16 | append1[1] << 16;
7704 w2[0] = append1[1] >> 16 | append1[2] << 16;
7705 w2[1] = append1[2] >> 16 | append1[3] << 16;
7706 w2[2] = append1[3] >> 16 | append2[0] << 16;
7707 w2[3] = append2[0] >> 16;
// shift 24
7711 w0[2] = w0[2] | append0[0] << 24;
7712 w0[3] = append0[0] >> 8 | append0[1] << 24;
7713 w1[0] = append0[1] >> 8 | append0[2] << 24;
7714 w1[1] = append0[2] >> 8 | append0[3] << 24;
7715 w1[2] = append0[3] >> 8 | append1[0] << 24;
7716 w1[3] = append1[0] >> 8 | append1[1] << 24;
7717 w2[0] = append1[1] >> 8 | append1[2] << 24;
7718 w2[1] = append1[2] >> 8 | append1[3] << 24;
7719 w2[2] = append1[3] >> 8 | append2[0] << 24;
7720 w2[3] = append2[0] >> 8;
// --- first destination word w0[3]; chain ends in w3[0] ---
// shift 8
7736 w0[3] = w0[3] | append0[0] << 8;
7737 w1[0] = append0[0] >> 24 | append0[1] << 8;
7738 w1[1] = append0[1] >> 24 | append0[2] << 8;
7739 w1[2] = append0[2] >> 24 | append0[3] << 8;
7740 w1[3] = append0[3] >> 24 | append1[0] << 8;
7741 w2[0] = append1[0] >> 24 | append1[1] << 8;
7742 w2[1] = append1[1] >> 24 | append1[2] << 8;
7743 w2[2] = append1[2] >> 24 | append1[3] << 8;
7744 w2[3] = append1[3] >> 24 | append2[0] << 8;
7745 w3[0] = append2[0] >> 24;
// shift 16
7749 w0[3] = w0[3] | append0[0] << 16;
7750 w1[0] = append0[0] >> 16 | append0[1] << 16;
7751 w1[1] = append0[1] >> 16 | append0[2] << 16;
7752 w1[2] = append0[2] >> 16 | append0[3] << 16;
7753 w1[3] = append0[3] >> 16 | append1[0] << 16;
7754 w2[0] = append1[0] >> 16 | append1[1] << 16;
7755 w2[1] = append1[1] >> 16 | append1[2] << 16;
7756 w2[2] = append1[2] >> 16 | append1[3] << 16;
7757 w2[3] = append1[3] >> 16 | append2[0] << 16;
7758 w3[0] = append2[0] >> 16;
// shift 24
7762 w0[3] = w0[3] | append0[0] << 24;
7763 w1[0] = append0[0] >> 8 | append0[1] << 24;
7764 w1[1] = append0[1] >> 8 | append0[2] << 24;
7765 w1[2] = append0[2] >> 8 | append0[3] << 24;
7766 w1[3] = append0[3] >> 8 | append1[0] << 24;
7767 w2[0] = append1[0] >> 8 | append1[1] << 24;
7768 w2[1] = append1[1] >> 8 | append1[2] << 24;
7769 w2[2] = append1[2] >> 8 | append1[3] << 24;
7770 w2[3] = append1[3] >> 8 | append2[0] << 24;
7771 w3[0] = append2[0] >> 8;
// --- first destination word w1[0]; chain ends in w3[1] ---
// shift 8
7787 w1[0] = w1[0] | append0[0] << 8;
7788 w1[1] = append0[0] >> 24 | append0[1] << 8;
7789 w1[2] = append0[1] >> 24 | append0[2] << 8;
7790 w1[3] = append0[2] >> 24 | append0[3] << 8;
7791 w2[0] = append0[3] >> 24 | append1[0] << 8;
7792 w2[1] = append1[0] >> 24 | append1[1] << 8;
7793 w2[2] = append1[1] >> 24 | append1[2] << 8;
7794 w2[3] = append1[2] >> 24 | append1[3] << 8;
7795 w3[0] = append1[3] >> 24 | append2[0] << 8;
7796 w3[1] = append2[0] >> 24;
// shift 16
7800 w1[0] = w1[0] | append0[0] << 16;
7801 w1[1] = append0[0] >> 16 | append0[1] << 16;
7802 w1[2] = append0[1] >> 16 | append0[2] << 16;
7803 w1[3] = append0[2] >> 16 | append0[3] << 16;
7804 w2[0] = append0[3] >> 16 | append1[0] << 16;
7805 w2[1] = append1[0] >> 16 | append1[1] << 16;
7806 w2[2] = append1[1] >> 16 | append1[2] << 16;
7807 w2[3] = append1[2] >> 16 | append1[3] << 16;
7808 w3[0] = append1[3] >> 16 | append2[0] << 16;
7809 w3[1] = append2[0] >> 16;
// shift 24
7813 w1[0] = w1[0] | append0[0] << 24;
7814 w1[1] = append0[0] >> 8 | append0[1] << 24;
7815 w1[2] = append0[1] >> 8 | append0[2] << 24;
7816 w1[3] = append0[2] >> 8 | append0[3] << 24;
7817 w2[0] = append0[3] >> 8 | append1[0] << 24;
7818 w2[1] = append1[0] >> 8 | append1[1] << 24;
7819 w2[2] = append1[1] >> 8 | append1[2] << 24;
7820 w2[3] = append1[2] >> 8 | append1[3] << 24;
7821 w3[0] = append1[3] >> 8 | append2[0] << 24;
7822 w3[1] = append2[0] >> 8;
// --- first destination word w1[1]; chain stops at w3[1] ---
// From here on the groups stop at w3[1]: the spilled top bits of the last
// append word used are not stored anywhere.
// shift 8
7838 w1[1] = w1[1] | append0[0] << 8;
7839 w1[2] = append0[0] >> 24 | append0[1] << 8;
7840 w1[3] = append0[1] >> 24 | append0[2] << 8;
7841 w2[0] = append0[2] >> 24 | append0[3] << 8;
7842 w2[1] = append0[3] >> 24 | append1[0] << 8;
7843 w2[2] = append1[0] >> 24 | append1[1] << 8;
7844 w2[3] = append1[1] >> 24 | append1[2] << 8;
7845 w3[0] = append1[2] >> 24 | append1[3] << 8;
7846 w3[1] = append1[3] >> 24 | append2[0] << 8;
// shift 16
7850 w1[1] = w1[1] | append0[0] << 16;
7851 w1[2] = append0[0] >> 16 | append0[1] << 16;
7852 w1[3] = append0[1] >> 16 | append0[2] << 16;
7853 w2[0] = append0[2] >> 16 | append0[3] << 16;
7854 w2[1] = append0[3] >> 16 | append1[0] << 16;
7855 w2[2] = append1[0] >> 16 | append1[1] << 16;
7856 w2[3] = append1[1] >> 16 | append1[2] << 16;
7857 w3[0] = append1[2] >> 16 | append1[3] << 16;
7858 w3[1] = append1[3] >> 16 | append2[0] << 16;
// shift 24
7862 w1[1] = w1[1] | append0[0] << 24;
7863 w1[2] = append0[0] >> 8 | append0[1] << 24;
7864 w1[3] = append0[1] >> 8 | append0[2] << 24;
7865 w2[0] = append0[2] >> 8 | append0[3] << 24;
7866 w2[1] = append0[3] >> 8 | append1[0] << 24;
7867 w2[2] = append1[0] >> 8 | append1[1] << 24;
7868 w2[3] = append1[1] >> 8 | append1[2] << 24;
7869 w3[0] = append1[2] >> 8 | append1[3] << 24;
7870 w3[1] = append1[3] >> 8 | append2[0] << 24;
// --- first destination word w1[2]; chain stops at w3[1] ---
// append2 is not referenced in these groups.
// shift 8
7885 w1[2] = w1[2] | append0[0] << 8;
7886 w1[3] = append0[0] >> 24 | append0[1] << 8;
7887 w2[0] = append0[1] >> 24 | append0[2] << 8;
7888 w2[1] = append0[2] >> 24 | append0[3] << 8;
7889 w2[2] = append0[3] >> 24 | append1[0] << 8;
7890 w2[3] = append1[0] >> 24 | append1[1] << 8;
7891 w3[0] = append1[1] >> 24 | append1[2] << 8;
7892 w3[1] = append1[2] >> 24 | append1[3] << 8;
// shift 16
7896 w1[2] = w1[2] | append0[0] << 16;
7897 w1[3] = append0[0] >> 16 | append0[1] << 16;
7898 w2[0] = append0[1] >> 16 | append0[2] << 16;
7899 w2[1] = append0[2] >> 16 | append0[3] << 16;
7900 w2[2] = append0[3] >> 16 | append1[0] << 16;
7901 w2[3] = append1[0] >> 16 | append1[1] << 16;
7902 w3[0] = append1[1] >> 16 | append1[2] << 16;
7903 w3[1] = append1[2] >> 16 | append1[3] << 16;
// shift 24
7907 w1[2] = w1[2] | append0[0] << 24;
7908 w1[3] = append0[0] >> 8 | append0[1] << 24;
7909 w2[0] = append0[1] >> 8 | append0[2] << 24;
7910 w2[1] = append0[2] >> 8 | append0[3] << 24;
7911 w2[2] = append0[3] >> 8 | append1[0] << 24;
7912 w2[3] = append1[0] >> 8 | append1[1] << 24;
7913 w3[0] = append1[1] >> 8 | append1[2] << 24;
7914 w3[1] = append1[2] >> 8 | append1[3] << 24;
// --- first destination word w1[3]; chain stops at w3[1] ---
// append1[3] and append2 are not referenced in these groups.
// shift 8
7928 w1[3] = w1[3] | append0[0] << 8;
7929 w2[0] = append0[0] >> 24 | append0[1] << 8;
7930 w2[1] = append0[1] >> 24 | append0[2] << 8;
7931 w2[2] = append0[2] >> 24 | append0[3] << 8;
7932 w2[3] = append0[3] >> 24 | append1[0] << 8;
7933 w3[0] = append1[0] >> 24 | append1[1] << 8;
7934 w3[1] = append1[1] >> 24 | append1[2] << 8;
// shift 16
7938 w1[3] = w1[3] | append0[0] << 16;
7939 w2[0] = append0[0] >> 16 | append0[1] << 16;
7940 w2[1] = append0[1] >> 16 | append0[2] << 16;
7941 w2[2] = append0[2] >> 16 | append0[3] << 16;
7942 w2[3] = append0[3] >> 16 | append1[0] << 16;
7943 w3[0] = append1[0] >> 16 | append1[1] << 16;
7944 w3[1] = append1[1] >> 16 | append1[2] << 16;
// shift 24
7948 w1[3] = w1[3] | append0[0] << 24;
7949 w2[0] = append0[0] >> 8 | append0[1] << 24;
7950 w2[1] = append0[1] >> 8 | append0[2] << 24;
7951 w2[2] = append0[2] >> 8 | append0[3] << 24;
7952 w2[3] = append0[3] >> 8 | append1[0] << 24;
7953 w3[0] = append1[0] >> 8 | append1[1] << 24;
7954 w3[1] = append1[1] >> 8 | append1[2] << 24;