2 * Author......: Jens Steube <jens.steube@gmail.com>
6 static int hash_comp (const u32 d1
[4], __global u32
*d2
)
8 if (d1
[3] > d2
[DGST_R3
]) return ( 1);
9 if (d1
[3] < d2
[DGST_R3
]) return (-1);
10 if (d1
[2] > d2
[DGST_R2
]) return ( 1);
11 if (d1
[2] < d2
[DGST_R2
]) return (-1);
12 if (d1
[1] > d2
[DGST_R1
]) return ( 1);
13 if (d1
[1] < d2
[DGST_R1
]) return (-1);
14 if (d1
[0] > d2
[DGST_R0
]) return ( 1);
15 if (d1
[0] < d2
[DGST_R0
]) return (-1);
20 static int find_hash (const u32 digest
[4], const u32 digests_cnt
, __global digest_t
*digests_buf
)
22 for (u32 l
= 0, r
= digests_cnt
; r
; r
>>= 1)
28 const int cmp
= hash_comp (digest
, digests_buf
[c
].digest_buf
);
37 if (cmp
== 0) return (c
);
43 static u32
check_bitmap (__global u32
*bitmap
, const u32 bitmap_mask
, const u32 bitmap_shift
, const u32 digest
)
45 return (bitmap
[(digest
>> bitmap_shift
) & bitmap_mask
] & (1 << (digest
& 0x1f)));
48 static u32
check (const u32 digest
[2], __global u32
*bitmap_s1_a
, __global u32
*bitmap_s1_b
, __global u32
*bitmap_s1_c
, __global u32
*bitmap_s1_d
, __global u32
*bitmap_s2_a
, __global u32
*bitmap_s2_b
, __global u32
*bitmap_s2_c
, __global u32
*bitmap_s2_d
, const u32 bitmap_mask
, const u32 bitmap_shift1
, const u32 bitmap_shift2
)
50 if (check_bitmap (bitmap_s1_a
, bitmap_mask
, bitmap_shift1
, digest
[0]) == 0) return (0);
51 if (check_bitmap (bitmap_s1_b
, bitmap_mask
, bitmap_shift1
, digest
[1]) == 0) return (0);
52 if (check_bitmap (bitmap_s1_c
, bitmap_mask
, bitmap_shift1
, digest
[2]) == 0) return (0);
53 if (check_bitmap (bitmap_s1_d
, bitmap_mask
, bitmap_shift1
, digest
[3]) == 0) return (0);
55 if (check_bitmap (bitmap_s2_a
, bitmap_mask
, bitmap_shift2
, digest
[0]) == 0) return (0);
56 if (check_bitmap (bitmap_s2_b
, bitmap_mask
, bitmap_shift2
, digest
[1]) == 0) return (0);
57 if (check_bitmap (bitmap_s2_c
, bitmap_mask
, bitmap_shift2
, digest
[2]) == 0) return (0);
58 if (check_bitmap (bitmap_s2_d
, bitmap_mask
, bitmap_shift2
, digest
[3]) == 0) return (0);
63 static void mark_hash (__global plain_t
*plains_buf
, __global u32
*hashes_shown
, const int hash_pos
, const u32 gid
, const u32 il_pos
)
65 hashes_shown
[hash_pos
] = 1;
67 plains_buf
[hash_pos
].gidvid
= (gid
* 1) + 0;
68 plains_buf
[hash_pos
].il_pos
= il_pos
;
71 static void truncate_block (u32 w
[4], const u32 len
)
80 case 1: w
[0] &= 0x000000FF;
85 case 2: w
[0] &= 0x0000FFFF;
90 case 3: w
[0] &= 0x00FFFFFF;
99 case 5: w
[1] &= 0x000000FF;
103 case 6: w
[1] &= 0x0000FFFF;
107 case 7: w
[1] &= 0x00FFFFFF;
114 case 9: w
[2] &= 0x000000FF;
117 case 10: w
[2] &= 0x0000FFFF;
120 case 11: w
[2] &= 0x00FFFFFF;
125 case 13: w
[3] &= 0x000000FF;
127 case 14: w
[3] &= 0x0000FFFF;
129 case 15: w
[3] &= 0x00FFFFFF;
134 static void make_unicode_S (const u32 in
[4], u32 out1
[4], u32 out2
[4])
137 out2
[3] = __byte_perm_S (in
[3], 0, 0x7372);
138 out2
[2] = __byte_perm_S (in
[3], 0, 0x7170);
139 out2
[1] = __byte_perm_S (in
[2], 0, 0x7372);
140 out2
[0] = __byte_perm_S (in
[2], 0, 0x7170);
141 out1
[3] = __byte_perm_S (in
[1], 0, 0x7372);
142 out1
[2] = __byte_perm_S (in
[1], 0, 0x7170);
143 out1
[1] = __byte_perm_S (in
[0], 0, 0x7372);
144 out1
[0] = __byte_perm_S (in
[0], 0, 0x7170);
147 #if defined IS_AMD || defined IS_GENERIC
148 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
149 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
150 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
151 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
152 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
153 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
154 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
155 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
159 static void make_unicode (const u32x in
[4], u32x out1
[4], u32x out2
[4])
162 out2
[3] = __byte_perm (in
[3], 0, 0x7372);
163 out2
[2] = __byte_perm (in
[3], 0, 0x7170);
164 out2
[1] = __byte_perm (in
[2], 0, 0x7372);
165 out2
[0] = __byte_perm (in
[2], 0, 0x7170);
166 out1
[3] = __byte_perm (in
[1], 0, 0x7372);
167 out1
[2] = __byte_perm (in
[1], 0, 0x7170);
168 out1
[1] = __byte_perm (in
[0], 0, 0x7372);
169 out1
[0] = __byte_perm (in
[0], 0, 0x7170);
172 #if defined IS_AMD || defined IS_GENERIC
173 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
174 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
175 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
176 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
177 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
178 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
179 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
180 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
184 static void undo_unicode_S (const u32 in1
[4], const u32 in2
[4], u32 out
[4])
187 out
[0] = __byte_perm_S (in1
[0], in1
[1], 0x6420);
188 out
[1] = __byte_perm_S (in1
[2], in1
[3], 0x6420);
189 out
[2] = __byte_perm_S (in2
[0], in2
[1], 0x6420);
190 out
[3] = __byte_perm_S (in2
[2], in2
[3], 0x6420);
193 #if defined IS_AMD || defined IS_GENERIC
194 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
195 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
196 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
197 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
198 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
199 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
200 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
201 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
205 static void undo_unicode (const u32x in1
[4], const u32x in2
[4], u32x out
[4])
208 out
[0] = __byte_perm (in1
[0], in1
[1], 0x6420);
209 out
[1] = __byte_perm (in1
[2], in1
[3], 0x6420);
210 out
[2] = __byte_perm (in2
[0], in2
[1], 0x6420);
211 out
[3] = __byte_perm (in2
[2], in2
[3], 0x6420);
214 #if defined IS_AMD || defined IS_GENERIC
215 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
216 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
217 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
218 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
219 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
220 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
221 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
222 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
226 static void append_0x01_1x4 (u32 w0
[4], const u32 offset
)
235 w0
[0] = w0
[0] | 0x0100;
239 w0
[0] = w0
[0] | 0x010000;
243 w0
[0] = w0
[0] | 0x01000000;
251 w0
[1] = w0
[1] | 0x0100;
255 w0
[1] = w0
[1] | 0x010000;
259 w0
[1] = w0
[1] | 0x01000000;
267 w0
[2] = w0
[2] | 0x0100;
271 w0
[2] = w0
[2] | 0x010000;
275 w0
[2] = w0
[2] | 0x01000000;
283 w0
[3] = w0
[3] | 0x0100;
287 w0
[3] = w0
[3] | 0x010000;
291 w0
[3] = w0
[3] | 0x01000000;
296 static void append_0x01_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
305 w0
[0] = w0
[0] | 0x0100;
309 w0
[0] = w0
[0] | 0x010000;
313 w0
[0] = w0
[0] | 0x01000000;
321 w0
[1] = w0
[1] | 0x0100;
325 w0
[1] = w0
[1] | 0x010000;
329 w0
[1] = w0
[1] | 0x01000000;
337 w0
[2] = w0
[2] | 0x0100;
341 w0
[2] = w0
[2] | 0x010000;
345 w0
[2] = w0
[2] | 0x01000000;
353 w0
[3] = w0
[3] | 0x0100;
357 w0
[3] = w0
[3] | 0x010000;
361 w0
[3] = w0
[3] | 0x01000000;
369 w1
[0] = w1
[0] | 0x0100;
373 w1
[0] = w1
[0] | 0x010000;
377 w1
[0] = w1
[0] | 0x01000000;
385 w1
[1] = w1
[1] | 0x0100;
389 w1
[1] = w1
[1] | 0x010000;
393 w1
[1] = w1
[1] | 0x01000000;
401 w1
[2] = w1
[2] | 0x0100;
405 w1
[2] = w1
[2] | 0x010000;
409 w1
[2] = w1
[2] | 0x01000000;
417 w1
[3] = w1
[3] | 0x0100;
421 w1
[3] = w1
[3] | 0x010000;
425 w1
[3] = w1
[3] | 0x01000000;
430 static void append_0x01_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
439 w0
[0] = w0
[0] | 0x0100;
443 w0
[0] = w0
[0] | 0x010000;
447 w0
[0] = w0
[0] | 0x01000000;
455 w0
[1] = w0
[1] | 0x0100;
459 w0
[1] = w0
[1] | 0x010000;
463 w0
[1] = w0
[1] | 0x01000000;
471 w0
[2] = w0
[2] | 0x0100;
475 w0
[2] = w0
[2] | 0x010000;
479 w0
[2] = w0
[2] | 0x01000000;
487 w0
[3] = w0
[3] | 0x0100;
491 w0
[3] = w0
[3] | 0x010000;
495 w0
[3] = w0
[3] | 0x01000000;
503 w1
[0] = w1
[0] | 0x0100;
507 w1
[0] = w1
[0] | 0x010000;
511 w1
[0] = w1
[0] | 0x01000000;
519 w1
[1] = w1
[1] | 0x0100;
523 w1
[1] = w1
[1] | 0x010000;
527 w1
[1] = w1
[1] | 0x01000000;
535 w1
[2] = w1
[2] | 0x0100;
539 w1
[2] = w1
[2] | 0x010000;
543 w1
[2] = w1
[2] | 0x01000000;
551 w1
[3] = w1
[3] | 0x0100;
555 w1
[3] = w1
[3] | 0x010000;
559 w1
[3] = w1
[3] | 0x01000000;
567 w2
[0] = w2
[0] | 0x0100;
571 w2
[0] = w2
[0] | 0x010000;
575 w2
[0] = w2
[0] | 0x01000000;
583 w2
[1] = w2
[1] | 0x0100;
587 w2
[1] = w2
[1] | 0x010000;
591 w2
[1] = w2
[1] | 0x01000000;
599 w2
[2] = w2
[2] | 0x0100;
603 w2
[2] = w2
[2] | 0x010000;
607 w2
[2] = w2
[2] | 0x01000000;
615 w2
[3] = w2
[3] | 0x0100;
619 w2
[3] = w2
[3] | 0x010000;
623 w2
[3] = w2
[3] | 0x01000000;
628 static void append_0x01_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
637 w0
[0] = w0
[0] | 0x0100;
641 w0
[0] = w0
[0] | 0x010000;
645 w0
[0] = w0
[0] | 0x01000000;
653 w0
[1] = w0
[1] | 0x0100;
657 w0
[1] = w0
[1] | 0x010000;
661 w0
[1] = w0
[1] | 0x01000000;
669 w0
[2] = w0
[2] | 0x0100;
673 w0
[2] = w0
[2] | 0x010000;
677 w0
[2] = w0
[2] | 0x01000000;
685 w0
[3] = w0
[3] | 0x0100;
689 w0
[3] = w0
[3] | 0x010000;
693 w0
[3] = w0
[3] | 0x01000000;
701 w1
[0] = w1
[0] | 0x0100;
705 w1
[0] = w1
[0] | 0x010000;
709 w1
[0] = w1
[0] | 0x01000000;
717 w1
[1] = w1
[1] | 0x0100;
721 w1
[1] = w1
[1] | 0x010000;
725 w1
[1] = w1
[1] | 0x01000000;
733 w1
[2] = w1
[2] | 0x0100;
737 w1
[2] = w1
[2] | 0x010000;
741 w1
[2] = w1
[2] | 0x01000000;
749 w1
[3] = w1
[3] | 0x0100;
753 w1
[3] = w1
[3] | 0x010000;
757 w1
[3] = w1
[3] | 0x01000000;
765 w2
[0] = w2
[0] | 0x0100;
769 w2
[0] = w2
[0] | 0x010000;
773 w2
[0] = w2
[0] | 0x01000000;
781 w2
[1] = w2
[1] | 0x0100;
785 w2
[1] = w2
[1] | 0x010000;
789 w2
[1] = w2
[1] | 0x01000000;
797 w2
[2] = w2
[2] | 0x0100;
801 w2
[2] = w2
[2] | 0x010000;
805 w2
[2] = w2
[2] | 0x01000000;
813 w2
[3] = w2
[3] | 0x0100;
817 w2
[3] = w2
[3] | 0x010000;
821 w2
[3] = w2
[3] | 0x01000000;
829 w3
[0] = w3
[0] | 0x0100;
833 w3
[0] = w3
[0] | 0x010000;
837 w3
[0] = w3
[0] | 0x01000000;
845 w3
[1] = w3
[1] | 0x0100;
849 w3
[1] = w3
[1] | 0x010000;
853 w3
[1] = w3
[1] | 0x01000000;
861 w3
[2] = w3
[2] | 0x0100;
865 w3
[2] = w3
[2] | 0x010000;
869 w3
[2] = w3
[2] | 0x01000000;
877 w3
[3] = w3
[3] | 0x0100;
881 w3
[3] = w3
[3] | 0x010000;
885 w3
[3] = w3
[3] | 0x01000000;
890 static void append_0x01_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
899 w0
[0] = w0
[0] | 0x0100;
903 w0
[0] = w0
[0] | 0x010000;
907 w0
[0] = w0
[0] | 0x01000000;
915 w0
[1] = w0
[1] | 0x0100;
919 w0
[1] = w0
[1] | 0x010000;
923 w0
[1] = w0
[1] | 0x01000000;
931 w0
[2] = w0
[2] | 0x0100;
935 w0
[2] = w0
[2] | 0x010000;
939 w0
[2] = w0
[2] | 0x01000000;
947 w0
[3] = w0
[3] | 0x0100;
951 w0
[3] = w0
[3] | 0x010000;
955 w0
[3] = w0
[3] | 0x01000000;
963 w1
[0] = w1
[0] | 0x0100;
967 w1
[0] = w1
[0] | 0x010000;
971 w1
[0] = w1
[0] | 0x01000000;
979 w1
[1] = w1
[1] | 0x0100;
983 w1
[1] = w1
[1] | 0x010000;
987 w1
[1] = w1
[1] | 0x01000000;
995 w1
[2] = w1
[2] | 0x0100;
999 w1
[2] = w1
[2] | 0x010000;
1003 w1
[2] = w1
[2] | 0x01000000;
1011 w1
[3] = w1
[3] | 0x0100;
1015 w1
[3] = w1
[3] | 0x010000;
1019 w1
[3] = w1
[3] | 0x01000000;
1027 w2
[0] = w2
[0] | 0x0100;
1031 w2
[0] = w2
[0] | 0x010000;
1035 w2
[0] = w2
[0] | 0x01000000;
1043 w2
[1] = w2
[1] | 0x0100;
1047 w2
[1] = w2
[1] | 0x010000;
1051 w2
[1] = w2
[1] | 0x01000000;
1059 w2
[2] = w2
[2] | 0x0100;
1063 w2
[2] = w2
[2] | 0x010000;
1067 w2
[2] = w2
[2] | 0x01000000;
1075 w2
[3] = w2
[3] | 0x0100;
1079 w2
[3] = w2
[3] | 0x010000;
1083 w2
[3] = w2
[3] | 0x01000000;
1091 w3
[0] = w3
[0] | 0x0100;
1095 w3
[0] = w3
[0] | 0x010000;
1099 w3
[0] = w3
[0] | 0x01000000;
1107 w3
[1] = w3
[1] | 0x0100;
1111 w3
[1] = w3
[1] | 0x010000;
1115 w3
[1] = w3
[1] | 0x01000000;
1123 w3
[2] = w3
[2] | 0x0100;
1127 w3
[2] = w3
[2] | 0x010000;
1131 w3
[2] = w3
[2] | 0x01000000;
1139 w3
[3] = w3
[3] | 0x0100;
1143 w3
[3] = w3
[3] | 0x010000;
1147 w3
[3] = w3
[3] | 0x01000000;
1155 w4
[0] = w4
[0] | 0x0100;
1159 w4
[0] = w4
[0] | 0x010000;
1163 w4
[0] = w4
[0] | 0x01000000;
1171 w4
[1] = w4
[1] | 0x0100;
1175 w4
[1] = w4
[1] | 0x010000;
1179 w4
[1] = w4
[1] | 0x01000000;
1187 w4
[2] = w4
[2] | 0x0100;
1191 w4
[2] = w4
[2] | 0x010000;
1195 w4
[2] = w4
[2] | 0x01000000;
1203 w4
[3] = w4
[3] | 0x0100;
1207 w4
[3] = w4
[3] | 0x010000;
1211 w4
[3] = w4
[3] | 0x01000000;
1219 w5
[0] = w5
[0] | 0x0100;
1223 w5
[0] = w5
[0] | 0x010000;
1227 w5
[0] = w5
[0] | 0x01000000;
1235 w5
[1] = w5
[1] | 0x0100;
1239 w5
[1] = w5
[1] | 0x010000;
1243 w5
[1] = w5
[1] | 0x01000000;
1251 w5
[2] = w5
[2] | 0x0100;
1255 w5
[2] = w5
[2] | 0x010000;
1259 w5
[2] = w5
[2] | 0x01000000;
1267 w5
[3] = w5
[3] | 0x0100;
1271 w5
[3] = w5
[3] | 0x010000;
1275 w5
[3] = w5
[3] | 0x01000000;
1283 w6
[0] = w6
[0] | 0x0100;
1287 w6
[0] = w6
[0] | 0x010000;
1291 w6
[0] = w6
[0] | 0x01000000;
1299 w6
[1] = w6
[1] | 0x0100;
1303 w6
[1] = w6
[1] | 0x010000;
1307 w6
[1] = w6
[1] | 0x01000000;
1315 w6
[2] = w6
[2] | 0x0100;
1319 w6
[2] = w6
[2] | 0x010000;
1323 w6
[2] = w6
[2] | 0x01000000;
1331 w6
[3] = w6
[3] | 0x0100;
1335 w6
[3] = w6
[3] | 0x010000;
1339 w6
[3] = w6
[3] | 0x01000000;
1347 w7
[0] = w7
[0] | 0x0100;
1351 w7
[0] = w7
[0] | 0x010000;
1355 w7
[0] = w7
[0] | 0x01000000;
1363 w7
[1] = w7
[1] | 0x0100;
1367 w7
[1] = w7
[1] | 0x010000;
1371 w7
[1] = w7
[1] | 0x01000000;
1379 w7
[2] = w7
[2] | 0x0100;
1383 w7
[2] = w7
[2] | 0x010000;
1387 w7
[2] = w7
[2] | 0x01000000;
1395 w7
[3] = w7
[3] | 0x0100;
1399 w7
[3] = w7
[3] | 0x010000;
1403 w7
[3] = w7
[3] | 0x01000000;
1408 static void append_0x02_1x4 (u32 w0
[4], const u32 offset
)
1417 w0
[0] = w0
[0] | 0x0200;
1421 w0
[0] = w0
[0] | 0x020000;
1425 w0
[0] = w0
[0] | 0x02000000;
1433 w0
[1] = w0
[1] | 0x0200;
1437 w0
[1] = w0
[1] | 0x020000;
1441 w0
[1] = w0
[1] | 0x02000000;
1449 w0
[2] = w0
[2] | 0x0200;
1453 w0
[2] = w0
[2] | 0x020000;
1457 w0
[2] = w0
[2] | 0x02000000;
1465 w0
[3] = w0
[3] | 0x0200;
1469 w0
[3] = w0
[3] | 0x020000;
1473 w0
[3] = w0
[3] | 0x02000000;
1478 static void append_0x02_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
1487 w0
[0] = w0
[0] | 0x0200;
1491 w0
[0] = w0
[0] | 0x020000;
1495 w0
[0] = w0
[0] | 0x02000000;
1503 w0
[1] = w0
[1] | 0x0200;
1507 w0
[1] = w0
[1] | 0x020000;
1511 w0
[1] = w0
[1] | 0x02000000;
1519 w0
[2] = w0
[2] | 0x0200;
1523 w0
[2] = w0
[2] | 0x020000;
1527 w0
[2] = w0
[2] | 0x02000000;
1535 w0
[3] = w0
[3] | 0x0200;
1539 w0
[3] = w0
[3] | 0x020000;
1543 w0
[3] = w0
[3] | 0x02000000;
1551 w1
[0] = w1
[0] | 0x0200;
1555 w1
[0] = w1
[0] | 0x020000;
1559 w1
[0] = w1
[0] | 0x02000000;
1567 w1
[1] = w1
[1] | 0x0200;
1571 w1
[1] = w1
[1] | 0x020000;
1575 w1
[1] = w1
[1] | 0x02000000;
1583 w1
[2] = w1
[2] | 0x0200;
1587 w1
[2] = w1
[2] | 0x020000;
1591 w1
[2] = w1
[2] | 0x02000000;
1599 w1
[3] = w1
[3] | 0x0200;
1603 w1
[3] = w1
[3] | 0x020000;
1607 w1
[3] = w1
[3] | 0x02000000;
1612 static void append_0x02_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
1621 w0
[0] = w0
[0] | 0x0200;
1625 w0
[0] = w0
[0] | 0x020000;
1629 w0
[0] = w0
[0] | 0x02000000;
1637 w0
[1] = w0
[1] | 0x0200;
1641 w0
[1] = w0
[1] | 0x020000;
1645 w0
[1] = w0
[1] | 0x02000000;
1653 w0
[2] = w0
[2] | 0x0200;
1657 w0
[2] = w0
[2] | 0x020000;
1661 w0
[2] = w0
[2] | 0x02000000;
1669 w0
[3] = w0
[3] | 0x0200;
1673 w0
[3] = w0
[3] | 0x020000;
1677 w0
[3] = w0
[3] | 0x02000000;
1685 w1
[0] = w1
[0] | 0x0200;
1689 w1
[0] = w1
[0] | 0x020000;
1693 w1
[0] = w1
[0] | 0x02000000;
1701 w1
[1] = w1
[1] | 0x0200;
1705 w1
[1] = w1
[1] | 0x020000;
1709 w1
[1] = w1
[1] | 0x02000000;
1717 w1
[2] = w1
[2] | 0x0200;
1721 w1
[2] = w1
[2] | 0x020000;
1725 w1
[2] = w1
[2] | 0x02000000;
1733 w1
[3] = w1
[3] | 0x0200;
1737 w1
[3] = w1
[3] | 0x020000;
1741 w1
[3] = w1
[3] | 0x02000000;
1749 w2
[0] = w2
[0] | 0x0200;
1753 w2
[0] = w2
[0] | 0x020000;
1757 w2
[0] = w2
[0] | 0x02000000;
1765 w2
[1] = w2
[1] | 0x0200;
1769 w2
[1] = w2
[1] | 0x020000;
1773 w2
[1] = w2
[1] | 0x02000000;
1781 w2
[2] = w2
[2] | 0x0200;
1785 w2
[2] = w2
[2] | 0x020000;
1789 w2
[2] = w2
[2] | 0x02000000;
1797 w2
[3] = w2
[3] | 0x0200;
1801 w2
[3] = w2
[3] | 0x020000;
1805 w2
[3] = w2
[3] | 0x02000000;
1810 static void append_0x02_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
1819 w0
[0] = w0
[0] | 0x0200;
1823 w0
[0] = w0
[0] | 0x020000;
1827 w0
[0] = w0
[0] | 0x02000000;
1835 w0
[1] = w0
[1] | 0x0200;
1839 w0
[1] = w0
[1] | 0x020000;
1843 w0
[1] = w0
[1] | 0x02000000;
1851 w0
[2] = w0
[2] | 0x0200;
1855 w0
[2] = w0
[2] | 0x020000;
1859 w0
[2] = w0
[2] | 0x02000000;
1867 w0
[3] = w0
[3] | 0x0200;
1871 w0
[3] = w0
[3] | 0x020000;
1875 w0
[3] = w0
[3] | 0x02000000;
1883 w1
[0] = w1
[0] | 0x0200;
1887 w1
[0] = w1
[0] | 0x020000;
1891 w1
[0] = w1
[0] | 0x02000000;
1899 w1
[1] = w1
[1] | 0x0200;
1903 w1
[1] = w1
[1] | 0x020000;
1907 w1
[1] = w1
[1] | 0x02000000;
1915 w1
[2] = w1
[2] | 0x0200;
1919 w1
[2] = w1
[2] | 0x020000;
1923 w1
[2] = w1
[2] | 0x02000000;
1931 w1
[3] = w1
[3] | 0x0200;
1935 w1
[3] = w1
[3] | 0x020000;
1939 w1
[3] = w1
[3] | 0x02000000;
1947 w2
[0] = w2
[0] | 0x0200;
1951 w2
[0] = w2
[0] | 0x020000;
1955 w2
[0] = w2
[0] | 0x02000000;
1963 w2
[1] = w2
[1] | 0x0200;
1967 w2
[1] = w2
[1] | 0x020000;
1971 w2
[1] = w2
[1] | 0x02000000;
1979 w2
[2] = w2
[2] | 0x0200;
1983 w2
[2] = w2
[2] | 0x020000;
1987 w2
[2] = w2
[2] | 0x02000000;
1995 w2
[3] = w2
[3] | 0x0200;
1999 w2
[3] = w2
[3] | 0x020000;
2003 w2
[3] = w2
[3] | 0x02000000;
2011 w3
[0] = w3
[0] | 0x0200;
2015 w3
[0] = w3
[0] | 0x020000;
2019 w3
[0] = w3
[0] | 0x02000000;
2027 w3
[1] = w3
[1] | 0x0200;
2031 w3
[1] = w3
[1] | 0x020000;
2035 w3
[1] = w3
[1] | 0x02000000;
2043 w3
[2] = w3
[2] | 0x0200;
2047 w3
[2] = w3
[2] | 0x020000;
2051 w3
[2] = w3
[2] | 0x02000000;
2059 w3
[3] = w3
[3] | 0x0200;
2063 w3
[3] = w3
[3] | 0x020000;
2067 w3
[3] = w3
[3] | 0x02000000;
2072 static void append_0x02_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
2081 w0
[0] = w0
[0] | 0x0200;
2085 w0
[0] = w0
[0] | 0x020000;
2089 w0
[0] = w0
[0] | 0x02000000;
2097 w0
[1] = w0
[1] | 0x0200;
2101 w0
[1] = w0
[1] | 0x020000;
2105 w0
[1] = w0
[1] | 0x02000000;
2113 w0
[2] = w0
[2] | 0x0200;
2117 w0
[2] = w0
[2] | 0x020000;
2121 w0
[2] = w0
[2] | 0x02000000;
2129 w0
[3] = w0
[3] | 0x0200;
2133 w0
[3] = w0
[3] | 0x020000;
2137 w0
[3] = w0
[3] | 0x02000000;
2145 w1
[0] = w1
[0] | 0x0200;
2149 w1
[0] = w1
[0] | 0x020000;
2153 w1
[0] = w1
[0] | 0x02000000;
2161 w1
[1] = w1
[1] | 0x0200;
2165 w1
[1] = w1
[1] | 0x020000;
2169 w1
[1] = w1
[1] | 0x02000000;
2177 w1
[2] = w1
[2] | 0x0200;
2181 w1
[2] = w1
[2] | 0x020000;
2185 w1
[2] = w1
[2] | 0x02000000;
2193 w1
[3] = w1
[3] | 0x0200;
2197 w1
[3] = w1
[3] | 0x020000;
2201 w1
[3] = w1
[3] | 0x02000000;
2209 w2
[0] = w2
[0] | 0x0200;
2213 w2
[0] = w2
[0] | 0x020000;
2217 w2
[0] = w2
[0] | 0x02000000;
2225 w2
[1] = w2
[1] | 0x0200;
2229 w2
[1] = w2
[1] | 0x020000;
2233 w2
[1] = w2
[1] | 0x02000000;
2241 w2
[2] = w2
[2] | 0x0200;
2245 w2
[2] = w2
[2] | 0x020000;
2249 w2
[2] = w2
[2] | 0x02000000;
2257 w2
[3] = w2
[3] | 0x0200;
2261 w2
[3] = w2
[3] | 0x020000;
2265 w2
[3] = w2
[3] | 0x02000000;
2273 w3
[0] = w3
[0] | 0x0200;
2277 w3
[0] = w3
[0] | 0x020000;
2281 w3
[0] = w3
[0] | 0x02000000;
2289 w3
[1] = w3
[1] | 0x0200;
2293 w3
[1] = w3
[1] | 0x020000;
2297 w3
[1] = w3
[1] | 0x02000000;
2305 w3
[2] = w3
[2] | 0x0200;
2309 w3
[2] = w3
[2] | 0x020000;
2313 w3
[2] = w3
[2] | 0x02000000;
2321 w3
[3] = w3
[3] | 0x0200;
2325 w3
[3] = w3
[3] | 0x020000;
2329 w3
[3] = w3
[3] | 0x02000000;
2337 w4
[0] = w4
[0] | 0x0200;
2341 w4
[0] = w4
[0] | 0x020000;
2345 w4
[0] = w4
[0] | 0x02000000;
2353 w4
[1] = w4
[1] | 0x0200;
2357 w4
[1] = w4
[1] | 0x020000;
2361 w4
[1] = w4
[1] | 0x02000000;
2369 w4
[2] = w4
[2] | 0x0200;
2373 w4
[2] = w4
[2] | 0x020000;
2377 w4
[2] = w4
[2] | 0x02000000;
2385 w4
[3] = w4
[3] | 0x0200;
2389 w4
[3] = w4
[3] | 0x020000;
2393 w4
[3] = w4
[3] | 0x02000000;
2401 w5
[0] = w5
[0] | 0x0200;
2405 w5
[0] = w5
[0] | 0x020000;
2409 w5
[0] = w5
[0] | 0x02000000;
2417 w5
[1] = w5
[1] | 0x0200;
2421 w5
[1] = w5
[1] | 0x020000;
2425 w5
[1] = w5
[1] | 0x02000000;
2433 w5
[2] = w5
[2] | 0x0200;
2437 w5
[2] = w5
[2] | 0x020000;
2441 w5
[2] = w5
[2] | 0x02000000;
2449 w5
[3] = w5
[3] | 0x0200;
2453 w5
[3] = w5
[3] | 0x020000;
2457 w5
[3] = w5
[3] | 0x02000000;
2465 w6
[0] = w6
[0] | 0x0200;
2469 w6
[0] = w6
[0] | 0x020000;
2473 w6
[0] = w6
[0] | 0x02000000;
2481 w6
[1] = w6
[1] | 0x0200;
2485 w6
[1] = w6
[1] | 0x020000;
2489 w6
[1] = w6
[1] | 0x02000000;
2497 w6
[2] = w6
[2] | 0x0200;
2501 w6
[2] = w6
[2] | 0x020000;
2505 w6
[2] = w6
[2] | 0x02000000;
2513 w6
[3] = w6
[3] | 0x0200;
2517 w6
[3] = w6
[3] | 0x020000;
2521 w6
[3] = w6
[3] | 0x02000000;
2529 w7
[0] = w7
[0] | 0x0200;
2533 w7
[0] = w7
[0] | 0x020000;
2537 w7
[0] = w7
[0] | 0x02000000;
2545 w7
[1] = w7
[1] | 0x0200;
2549 w7
[1] = w7
[1] | 0x020000;
2553 w7
[1] = w7
[1] | 0x02000000;
2561 w7
[2] = w7
[2] | 0x0200;
2565 w7
[2] = w7
[2] | 0x020000;
2569 w7
[2] = w7
[2] | 0x02000000;
2577 w7
[3] = w7
[3] | 0x0200;
2581 w7
[3] = w7
[3] | 0x020000;
2585 w7
[3] = w7
[3] | 0x02000000;
2590 static void append_0x80_1x4 (u32 w0
[4], const u32 offset
)
2599 w0
[0] = w0
[0] | 0x8000;
2603 w0
[0] = w0
[0] | 0x800000;
2607 w0
[0] = w0
[0] | 0x80000000;
2615 w0
[1] = w0
[1] | 0x8000;
2619 w0
[1] = w0
[1] | 0x800000;
2623 w0
[1] = w0
[1] | 0x80000000;
2631 w0
[2] = w0
[2] | 0x8000;
2635 w0
[2] = w0
[2] | 0x800000;
2639 w0
[2] = w0
[2] | 0x80000000;
2647 w0
[3] = w0
[3] | 0x8000;
2651 w0
[3] = w0
[3] | 0x800000;
2655 w0
[3] = w0
[3] | 0x80000000;
2660 static void append_0x80_2x4 (u32 w0
[4], u32 w1
[4], const u32 offset
)
2669 w0
[0] = w0
[0] | 0x8000;
2673 w0
[0] = w0
[0] | 0x800000;
2677 w0
[0] = w0
[0] | 0x80000000;
2685 w0
[1] = w0
[1] | 0x8000;
2689 w0
[1] = w0
[1] | 0x800000;
2693 w0
[1] = w0
[1] | 0x80000000;
2701 w0
[2] = w0
[2] | 0x8000;
2705 w0
[2] = w0
[2] | 0x800000;
2709 w0
[2] = w0
[2] | 0x80000000;
2717 w0
[3] = w0
[3] | 0x8000;
2721 w0
[3] = w0
[3] | 0x800000;
2725 w0
[3] = w0
[3] | 0x80000000;
2733 w1
[0] = w1
[0] | 0x8000;
2737 w1
[0] = w1
[0] | 0x800000;
2741 w1
[0] = w1
[0] | 0x80000000;
2749 w1
[1] = w1
[1] | 0x8000;
2753 w1
[1] = w1
[1] | 0x800000;
2757 w1
[1] = w1
[1] | 0x80000000;
2765 w1
[2] = w1
[2] | 0x8000;
2769 w1
[2] = w1
[2] | 0x800000;
2773 w1
[2] = w1
[2] | 0x80000000;
2781 w1
[3] = w1
[3] | 0x8000;
2785 w1
[3] = w1
[3] | 0x800000;
2789 w1
[3] = w1
[3] | 0x80000000;
2794 static void append_0x80_3x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
2803 w0
[0] = w0
[0] | 0x8000;
2807 w0
[0] = w0
[0] | 0x800000;
2811 w0
[0] = w0
[0] | 0x80000000;
2819 w0
[1] = w0
[1] | 0x8000;
2823 w0
[1] = w0
[1] | 0x800000;
2827 w0
[1] = w0
[1] | 0x80000000;
2835 w0
[2] = w0
[2] | 0x8000;
2839 w0
[2] = w0
[2] | 0x800000;
2843 w0
[2] = w0
[2] | 0x80000000;
2851 w0
[3] = w0
[3] | 0x8000;
2855 w0
[3] = w0
[3] | 0x800000;
2859 w0
[3] = w0
[3] | 0x80000000;
2867 w1
[0] = w1
[0] | 0x8000;
2871 w1
[0] = w1
[0] | 0x800000;
2875 w1
[0] = w1
[0] | 0x80000000;
2883 w1
[1] = w1
[1] | 0x8000;
2887 w1
[1] = w1
[1] | 0x800000;
2891 w1
[1] = w1
[1] | 0x80000000;
2899 w1
[2] = w1
[2] | 0x8000;
2903 w1
[2] = w1
[2] | 0x800000;
2907 w1
[2] = w1
[2] | 0x80000000;
2915 w1
[3] = w1
[3] | 0x8000;
2919 w1
[3] = w1
[3] | 0x800000;
2923 w1
[3] = w1
[3] | 0x80000000;
2931 w2
[0] = w2
[0] | 0x8000;
2935 w2
[0] = w2
[0] | 0x800000;
2939 w2
[0] = w2
[0] | 0x80000000;
2947 w2
[1] = w2
[1] | 0x8000;
2951 w2
[1] = w2
[1] | 0x800000;
2955 w2
[1] = w2
[1] | 0x80000000;
2963 w2
[2] = w2
[2] | 0x8000;
2967 w2
[2] = w2
[2] | 0x800000;
2971 w2
[2] = w2
[2] | 0x80000000;
2979 w2
[3] = w2
[3] | 0x8000;
2983 w2
[3] = w2
[3] | 0x800000;
2987 w2
[3] = w2
[3] | 0x80000000;
2992 static void append_0x80_4x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
3001 w0
[0] = w0
[0] | 0x8000;
3005 w0
[0] = w0
[0] | 0x800000;
3009 w0
[0] = w0
[0] | 0x80000000;
3017 w0
[1] = w0
[1] | 0x8000;
3021 w0
[1] = w0
[1] | 0x800000;
3025 w0
[1] = w0
[1] | 0x80000000;
3033 w0
[2] = w0
[2] | 0x8000;
3037 w0
[2] = w0
[2] | 0x800000;
3041 w0
[2] = w0
[2] | 0x80000000;
3049 w0
[3] = w0
[3] | 0x8000;
3053 w0
[3] = w0
[3] | 0x800000;
3057 w0
[3] = w0
[3] | 0x80000000;
3065 w1
[0] = w1
[0] | 0x8000;
3069 w1
[0] = w1
[0] | 0x800000;
3073 w1
[0] = w1
[0] | 0x80000000;
3081 w1
[1] = w1
[1] | 0x8000;
3085 w1
[1] = w1
[1] | 0x800000;
3089 w1
[1] = w1
[1] | 0x80000000;
3097 w1
[2] = w1
[2] | 0x8000;
3101 w1
[2] = w1
[2] | 0x800000;
3105 w1
[2] = w1
[2] | 0x80000000;
3113 w1
[3] = w1
[3] | 0x8000;
3117 w1
[3] = w1
[3] | 0x800000;
3121 w1
[3] = w1
[3] | 0x80000000;
3129 w2
[0] = w2
[0] | 0x8000;
3133 w2
[0] = w2
[0] | 0x800000;
3137 w2
[0] = w2
[0] | 0x80000000;
3145 w2
[1] = w2
[1] | 0x8000;
3149 w2
[1] = w2
[1] | 0x800000;
3153 w2
[1] = w2
[1] | 0x80000000;
3161 w2
[2] = w2
[2] | 0x8000;
3165 w2
[2] = w2
[2] | 0x800000;
3169 w2
[2] = w2
[2] | 0x80000000;
3177 w2
[3] = w2
[3] | 0x8000;
3181 w2
[3] = w2
[3] | 0x800000;
3185 w2
[3] = w2
[3] | 0x80000000;
3193 w3
[0] = w3
[0] | 0x8000;
3197 w3
[0] = w3
[0] | 0x800000;
3201 w3
[0] = w3
[0] | 0x80000000;
3209 w3
[1] = w3
[1] | 0x8000;
3213 w3
[1] = w3
[1] | 0x800000;
3217 w3
[1] = w3
[1] | 0x80000000;
3225 w3
[2] = w3
[2] | 0x8000;
3229 w3
[2] = w3
[2] | 0x800000;
3233 w3
[2] = w3
[2] | 0x80000000;
3241 w3
[3] = w3
[3] | 0x8000;
3245 w3
[3] = w3
[3] | 0x800000;
3249 w3
[3] = w3
[3] | 0x80000000;
3254 static void append_0x80_8x4 (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], u32 w4
[4], u32 w5
[4], u32 w6
[4], u32 w7
[4], const u32 offset
)
3263 w0
[0] = w0
[0] | 0x8000;
3267 w0
[0] = w0
[0] | 0x800000;
3271 w0
[0] = w0
[0] | 0x80000000;
3279 w0
[1] = w0
[1] | 0x8000;
3283 w0
[1] = w0
[1] | 0x800000;
3287 w0
[1] = w0
[1] | 0x80000000;
3295 w0
[2] = w0
[2] | 0x8000;
3299 w0
[2] = w0
[2] | 0x800000;
3303 w0
[2] = w0
[2] | 0x80000000;
3311 w0
[3] = w0
[3] | 0x8000;
3315 w0
[3] = w0
[3] | 0x800000;
3319 w0
[3] = w0
[3] | 0x80000000;
3327 w1
[0] = w1
[0] | 0x8000;
3331 w1
[0] = w1
[0] | 0x800000;
3335 w1
[0] = w1
[0] | 0x80000000;
3343 w1
[1] = w1
[1] | 0x8000;
3347 w1
[1] = w1
[1] | 0x800000;
3351 w1
[1] = w1
[1] | 0x80000000;
3359 w1
[2] = w1
[2] | 0x8000;
3363 w1
[2] = w1
[2] | 0x800000;
3367 w1
[2] = w1
[2] | 0x80000000;
3375 w1
[3] = w1
[3] | 0x8000;
3379 w1
[3] = w1
[3] | 0x800000;
3383 w1
[3] = w1
[3] | 0x80000000;
3391 w2
[0] = w2
[0] | 0x8000;
3395 w2
[0] = w2
[0] | 0x800000;
3399 w2
[0] = w2
[0] | 0x80000000;
3407 w2
[1] = w2
[1] | 0x8000;
3411 w2
[1] = w2
[1] | 0x800000;
3415 w2
[1] = w2
[1] | 0x80000000;
3423 w2
[2] = w2
[2] | 0x8000;
3427 w2
[2] = w2
[2] | 0x800000;
3431 w2
[2] = w2
[2] | 0x80000000;
3439 w2
[3] = w2
[3] | 0x8000;
3443 w2
[3] = w2
[3] | 0x800000;
3447 w2
[3] = w2
[3] | 0x80000000;
3455 w3
[0] = w3
[0] | 0x8000;
3459 w3
[0] = w3
[0] | 0x800000;
3463 w3
[0] = w3
[0] | 0x80000000;
3471 w3
[1] = w3
[1] | 0x8000;
3475 w3
[1] = w3
[1] | 0x800000;
3479 w3
[1] = w3
[1] | 0x80000000;
3487 w3
[2] = w3
[2] | 0x8000;
3491 w3
[2] = w3
[2] | 0x800000;
3495 w3
[2] = w3
[2] | 0x80000000;
3503 w3
[3] = w3
[3] | 0x8000;
3507 w3
[3] = w3
[3] | 0x800000;
3511 w3
[3] = w3
[3] | 0x80000000;
3519 w4
[0] = w4
[0] | 0x8000;
3523 w4
[0] = w4
[0] | 0x800000;
3527 w4
[0] = w4
[0] | 0x80000000;
3535 w4
[1] = w4
[1] | 0x8000;
3539 w4
[1] = w4
[1] | 0x800000;
3543 w4
[1] = w4
[1] | 0x80000000;
3551 w4
[2] = w4
[2] | 0x8000;
3555 w4
[2] = w4
[2] | 0x800000;
3559 w4
[2] = w4
[2] | 0x80000000;
3567 w4
[3] = w4
[3] | 0x8000;
3571 w4
[3] = w4
[3] | 0x800000;
3575 w4
[3] = w4
[3] | 0x80000000;
3583 w5
[0] = w5
[0] | 0x8000;
3587 w5
[0] = w5
[0] | 0x800000;
3591 w5
[0] = w5
[0] | 0x80000000;
3599 w5
[1] = w5
[1] | 0x8000;
3603 w5
[1] = w5
[1] | 0x800000;
3607 w5
[1] = w5
[1] | 0x80000000;
3615 w5
[2] = w5
[2] | 0x8000;
3619 w5
[2] = w5
[2] | 0x800000;
3623 w5
[2] = w5
[2] | 0x80000000;
3631 w5
[3] = w5
[3] | 0x8000;
3635 w5
[3] = w5
[3] | 0x800000;
3639 w5
[3] = w5
[3] | 0x80000000;
3647 w6
[0] = w6
[0] | 0x8000;
3651 w6
[0] = w6
[0] | 0x800000;
3655 w6
[0] = w6
[0] | 0x80000000;
3663 w6
[1] = w6
[1] | 0x8000;
3667 w6
[1] = w6
[1] | 0x800000;
3671 w6
[1] = w6
[1] | 0x80000000;
3679 w6
[2] = w6
[2] | 0x8000;
3683 w6
[2] = w6
[2] | 0x800000;
3687 w6
[2] = w6
[2] | 0x80000000;
3695 w6
[3] = w6
[3] | 0x8000;
3699 w6
[3] = w6
[3] | 0x800000;
3703 w6
[3] = w6
[3] | 0x80000000;
3711 w7
[0] = w7
[0] | 0x8000;
3715 w7
[0] = w7
[0] | 0x800000;
3719 w7
[0] = w7
[0] | 0x80000000;
3727 w7
[1] = w7
[1] | 0x8000;
3731 w7
[1] = w7
[1] | 0x800000;
3735 w7
[1] = w7
[1] | 0x80000000;
3743 w7
[2] = w7
[2] | 0x8000;
3747 w7
[2] = w7
[2] | 0x800000;
3751 w7
[2] = w7
[2] | 0x80000000;
3759 w7
[3] = w7
[3] | 0x8000;
3763 w7
[3] = w7
[3] | 0x800000;
3767 w7
[3] = w7
[3] | 0x80000000;
3772 static void append_0x80_1x16 (u32 w
[16], const u32 offset
)
3781 w
[ 0] = w
[ 0] | 0x8000;
3785 w
[ 0] = w
[ 0] | 0x800000;
3789 w
[ 0] = w
[ 0] | 0x80000000;
3797 w
[ 1] = w
[ 1] | 0x8000;
3801 w
[ 1] = w
[ 1] | 0x800000;
3805 w
[ 1] = w
[ 1] | 0x80000000;
3813 w
[ 2] = w
[ 2] | 0x8000;
3817 w
[ 2] = w
[ 2] | 0x800000;
3821 w
[ 2] = w
[ 2] | 0x80000000;
3829 w
[ 3] = w
[ 3] | 0x8000;
3833 w
[ 3] = w
[ 3] | 0x800000;
3837 w
[ 3] = w
[ 3] | 0x80000000;
3845 w
[ 4] = w
[ 4] | 0x8000;
3849 w
[ 4] = w
[ 4] | 0x800000;
3853 w
[ 4] = w
[ 4] | 0x80000000;
3861 w
[ 5] = w
[ 5] | 0x8000;
3865 w
[ 5] = w
[ 5] | 0x800000;
3869 w
[ 5] = w
[ 5] | 0x80000000;
3877 w
[ 6] = w
[ 6] | 0x8000;
3881 w
[ 6] = w
[ 6] | 0x800000;
3885 w
[ 6] = w
[ 6] | 0x80000000;
3893 w
[ 7] = w
[ 7] | 0x8000;
3897 w
[ 7] = w
[ 7] | 0x800000;
3901 w
[ 7] = w
[ 7] | 0x80000000;
3909 w
[ 8] = w
[ 8] | 0x8000;
3913 w
[ 8] = w
[ 8] | 0x800000;
3917 w
[ 8] = w
[ 8] | 0x80000000;
3925 w
[ 9] = w
[ 9] | 0x8000;
3929 w
[ 9] = w
[ 9] | 0x800000;
3933 w
[ 9] = w
[ 9] | 0x80000000;
3941 w
[10] = w
[10] | 0x8000;
3945 w
[10] = w
[10] | 0x800000;
3949 w
[10] = w
[10] | 0x80000000;
3957 w
[11] = w
[11] | 0x8000;
3961 w
[11] = w
[11] | 0x800000;
3965 w
[11] = w
[11] | 0x80000000;
3973 w
[12] = w
[12] | 0x8000;
3977 w
[12] = w
[12] | 0x800000;
3981 w
[12] = w
[12] | 0x80000000;
3989 w
[13] = w
[13] | 0x8000;
3993 w
[13] = w
[13] | 0x800000;
3997 w
[13] = w
[13] | 0x80000000;
4005 w
[14] = w
[14] | 0x8000;
4009 w
[14] = w
[14] | 0x800000;
4013 w
[14] = w
[14] | 0x80000000;
4021 w
[15] = w
[15] | 0x8000;
4025 w
[15] = w
[15] | 0x800000;
4029 w
[15] = w
[15] | 0x80000000;
/**
 * In-place byte realignment of the sixteen u32 words passed as w0..w3.
 *
 * Parameters:
 *   w0, w1, w2, w3  four arrays of four u32 each; both read and overwritten
 *   offset          drives the alignment; only values derived from it
 *                   (offset & 3, 4 - offset, 4 - (offset % 4)) are used below
 *
 * Every assignment stores into one destination word the result of combining
 * two adjacent source words (or one source word and the constant 0). The
 * assignments come in groups; each later group reads its sources one word
 * further down than the previous group. Within a group the destinations are
 * written from the highest word downwards, so every source word is read
 * before its own slot is overwritten.
 *
 * NOTE(review): presumably one group corresponds to one whole-word part of
 * offset (offset / 4) - confirm against the statement that selects the group.
 */
4034 static void switch_buffer_by_offset_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
// Variant built on amd_bytealign_S; each group here starts by writing w3[2]
// and is followed by a check of offset_mod_4 == 0.
4036 #if defined IS_AMD || defined IS_GENERIC
4037 const int offset_mod_4
= offset
& 3;
// NOTE(review): assuming u32 is an unsigned 32-bit type, 4 - offset wraps for
// offset > 4 before it is converted to int; presumably amd_bytealign_S only
// looks at the low bits of its third argument - confirm.
4039 const int offset_minus_4
= 4 - offset
;
4044 w3
[2] = amd_bytealign_S ( 0, w3
[1], offset_minus_4
);
4045 w3
[1] = amd_bytealign_S (w3
[1], w3
[0], offset_minus_4
);
4046 w3
[0] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
4047 w2
[3] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
4048 w2
[2] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
4049 w2
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
4050 w2
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
4051 w1
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
4052 w1
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
4053 w1
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4054 w1
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4055 w0
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4056 w0
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4057 w0
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4058 w0
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4060 if (offset_mod_4
== 0)
4082 w3
[2] = amd_bytealign_S ( 0, w3
[0], offset_minus_4
);
4083 w3
[1] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
4084 w3
[0] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
4085 w2
[3] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
4086 w2
[2] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
4087 w2
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
4088 w2
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
4089 w1
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
4090 w1
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4091 w1
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4092 w1
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4093 w0
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4094 w0
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4095 w0
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4098 if (offset_mod_4
== 0)
4119 w3
[2] = amd_bytealign_S ( 0, w2
[3], offset_minus_4
);
4120 w3
[1] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
4121 w3
[0] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
4122 w2
[3] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
4123 w2
[2] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
4124 w2
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
4125 w2
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
4126 w1
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4127 w1
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4128 w1
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4129 w1
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4130 w0
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4131 w0
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4135 if (offset_mod_4
== 0)
4155 w3
[2] = amd_bytealign_S ( 0, w2
[2], offset_minus_4
);
4156 w3
[1] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
4157 w3
[0] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
4158 w2
[3] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
4159 w2
[2] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
4160 w2
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
4161 w2
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4162 w1
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4163 w1
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4164 w1
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4165 w1
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4166 w0
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4171 if (offset_mod_4
== 0)
4190 w3
[2] = amd_bytealign_S ( 0, w2
[1], offset_minus_4
);
4191 w3
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
4192 w3
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
4193 w2
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
4194 w2
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
4195 w2
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4196 w2
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4197 w1
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4198 w1
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4199 w1
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4200 w1
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4206 if (offset_mod_4
== 0)
4224 w3
[2] = amd_bytealign_S ( 0, w2
[0], offset_minus_4
);
4225 w3
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
4226 w3
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
4227 w2
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
4228 w2
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4229 w2
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4230 w2
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4231 w1
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4232 w1
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4233 w1
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4240 if (offset_mod_4
== 0)
4257 w3
[2] = amd_bytealign_S ( 0, w1
[3], offset_minus_4
);
4258 w3
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
4259 w3
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
4260 w2
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4261 w2
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4262 w2
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4263 w2
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4264 w1
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4265 w1
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4273 if (offset_mod_4
== 0)
4289 w3
[2] = amd_bytealign_S ( 0, w1
[2], offset_minus_4
);
4290 w3
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
4291 w3
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4292 w2
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4293 w2
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4294 w2
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4295 w2
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4296 w1
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4305 if (offset_mod_4
== 0)
4320 w3
[2] = amd_bytealign_S ( 0, w1
[1], offset_minus_4
);
4321 w3
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
4322 w3
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4323 w2
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4324 w2
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4325 w2
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4326 w2
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4336 if (offset_mod_4
== 0)
4350 w3
[2] = amd_bytealign_S ( 0, w1
[0], offset_minus_4
);
4351 w3
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
4352 w3
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4353 w2
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4354 w2
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4355 w2
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4366 if (offset_mod_4
== 0)
4379 w3
[2] = amd_bytealign_S ( 0, w0
[3], offset_minus_4
);
4380 w3
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
4381 w3
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4382 w2
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4383 w2
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4395 if (offset_mod_4
== 0)
4407 w3
[2] = amd_bytealign_S ( 0, w0
[2], offset_minus_4
);
4408 w3
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
4409 w3
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4410 w2
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4423 if (offset_mod_4
== 0)
4434 w3
[2] = amd_bytealign_S ( 0, w0
[1], offset_minus_4
);
4435 w3
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
4436 w3
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4450 if (offset_mod_4
== 0)
4460 w3
[2] = amd_bytealign_S ( 0, w0
[0], offset_minus_4
);
4461 w3
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
4476 if (offset_mod_4
== 0)
// Second variant, built on __byte_perm_S: offset_minus_4 is redefined from
// offset % 4 and turned into a 16-bit selector taken out of 0x76543210.
// Each group in this variant starts by writing w3[1].
// NOTE(review): presumably this variant sits under a different preprocessor
// branch than the amd_bytealign_S code - confirm the guarding condition.
4487 const int offset_minus_4
= 4 - (offset
% 4);
4489 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
4494 w3
[1] = __byte_perm_S (w3
[0], w3
[1], selector
);
4495 w3
[0] = __byte_perm_S (w2
[3], w3
[0], selector
);
4496 w2
[3] = __byte_perm_S (w2
[2], w2
[3], selector
);
4497 w2
[2] = __byte_perm_S (w2
[1], w2
[2], selector
);
4498 w2
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
4499 w2
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
4500 w1
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
4501 w1
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
4502 w1
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
4503 w1
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
4504 w0
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
4505 w0
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
4506 w0
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
4507 w0
[0] = __byte_perm_S ( 0, w0
[0], selector
);
4512 w3
[1] = __byte_perm_S (w2
[3], w3
[0], selector
);
4513 w3
[0] = __byte_perm_S (w2
[2], w2
[3], selector
);
4514 w2
[3] = __byte_perm_S (w2
[1], w2
[2], selector
);
4515 w2
[2] = __byte_perm_S (w2
[0], w2
[1], selector
);
4516 w2
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
4517 w2
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
4518 w1
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
4519 w1
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
4520 w1
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
4521 w1
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
4522 w0
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
4523 w0
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
4524 w0
[1] = __byte_perm_S ( 0, w0
[0], selector
);
4530 w3
[1] = __byte_perm_S (w2
[2], w2
[3], selector
);
4531 w3
[0] = __byte_perm_S (w2
[1], w2
[2], selector
);
4532 w2
[3] = __byte_perm_S (w2
[0], w2
[1], selector
);
4533 w2
[2] = __byte_perm_S (w1
[3], w2
[0], selector
);
4534 w2
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
4535 w2
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
4536 w1
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
4537 w1
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
4538 w1
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
4539 w1
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
4540 w0
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
4541 w0
[2] = __byte_perm_S ( 0, w0
[0], selector
);
4548 w3
[1] = __byte_perm_S (w2
[1], w2
[2], selector
);
4549 w3
[0] = __byte_perm_S (w2
[0], w2
[1], selector
);
4550 w2
[3] = __byte_perm_S (w1
[3], w2
[0], selector
);
4551 w2
[2] = __byte_perm_S (w1
[2], w1
[3], selector
);
4552 w2
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
4553 w2
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
4554 w1
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
4555 w1
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
4556 w1
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
4557 w1
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
4558 w0
[3] = __byte_perm_S ( 0, w0
[0], selector
);
4566 w3
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
4567 w3
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
4568 w2
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
4569 w2
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
4570 w2
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
4571 w2
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
4572 w1
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
4573 w1
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
4574 w1
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
4575 w1
[0] = __byte_perm_S ( 0, w0
[0], selector
);
4584 w3
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
4585 w3
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
4586 w2
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
4587 w2
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
4588 w2
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
4589 w2
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
4590 w1
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
4591 w1
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
4592 w1
[1] = __byte_perm_S ( 0, w0
[0], selector
);
4602 w3
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
4603 w3
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
4604 w2
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
4605 w2
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
4606 w2
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
4607 w2
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
4608 w1
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
4609 w1
[2] = __byte_perm_S ( 0, w0
[0], selector
);
4620 w3
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
4621 w3
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
4622 w2
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
4623 w2
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
4624 w2
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
4625 w2
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
4626 w1
[3] = __byte_perm_S ( 0, w0
[0], selector
);
4638 w3
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
4639 w3
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
4640 w2
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
4641 w2
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
4642 w2
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
4643 w2
[0] = __byte_perm_S ( 0, w0
[0], selector
);
4656 w3
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
4657 w3
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
4658 w2
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
4659 w2
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
4660 w2
[1] = __byte_perm_S ( 0, w0
[0], selector
);
4674 w3
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
4675 w3
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
4676 w2
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
4677 w2
[2] = __byte_perm_S ( 0, w0
[0], selector
);
4692 w3
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
4693 w3
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
4694 w2
[3] = __byte_perm_S ( 0, w0
[0], selector
);
4710 w3
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
4711 w3
[0] = __byte_perm_S ( 0, w0
[0], selector
);
4728 w3
[1] = __byte_perm_S ( 0, w0
[0], selector
);
/**
 * In-place byte realignment of the sixteen u32 words passed as w0..w3, using
 * the same grouped assignment layout as switch_buffer_by_offset_S but with
 * the two source words of every pair passed in the opposite order.
 * NOTE(review): the "_be" suffix presumably means big-endian word layout -
 * confirm with the callers.
 *
 * Parameters:
 *   w0, w1, w2, w3  four arrays of four u32 each; both read and overwritten
 *   offset          passed unchanged as the third argument of
 *                   amd_bytealign_S; the __byte_perm_S code uses offset & 3
 *
 * Each assignment combines two adjacent source words (or one word and the
 * constant 0) into one destination word. Each later group reads its sources
 * one word further down than the previous group. Destinations are written
 * from the highest word downwards, so every source word is read before its
 * own slot is overwritten.
 */
4748 static void switch_buffer_by_offset_be_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
// Variant built on amd_bytealign_S; each group here starts by writing w3[2].
4750 #if defined IS_AMD || defined IS_GENERIC
4754 w3
[2] = amd_bytealign_S (w3
[1], 0, offset
);
4755 w3
[1] = amd_bytealign_S (w3
[0], w3
[1], offset
);
4756 w3
[0] = amd_bytealign_S (w2
[3], w3
[0], offset
);
4757 w2
[3] = amd_bytealign_S (w2
[2], w2
[3], offset
);
4758 w2
[2] = amd_bytealign_S (w2
[1], w2
[2], offset
);
4759 w2
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
4760 w2
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
4761 w1
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
4762 w1
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
4763 w1
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4764 w1
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4765 w0
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4766 w0
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4767 w0
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4768 w0
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
4772 w3
[2] = amd_bytealign_S (w3
[0], 0, offset
);
4773 w3
[1] = amd_bytealign_S (w2
[3], w3
[0], offset
);
4774 w3
[0] = amd_bytealign_S (w2
[2], w2
[3], offset
);
4775 w2
[3] = amd_bytealign_S (w2
[1], w2
[2], offset
);
4776 w2
[2] = amd_bytealign_S (w2
[0], w2
[1], offset
);
4777 w2
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
4778 w2
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
4779 w1
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
4780 w1
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4781 w1
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4782 w1
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4783 w0
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4784 w0
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4785 w0
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
4790 w3
[2] = amd_bytealign_S (w2
[3], 0, offset
);
4791 w3
[1] = amd_bytealign_S (w2
[2], w2
[3], offset
);
4792 w3
[0] = amd_bytealign_S (w2
[1], w2
[2], offset
);
4793 w2
[3] = amd_bytealign_S (w2
[0], w2
[1], offset
);
4794 w2
[2] = amd_bytealign_S (w1
[3], w2
[0], offset
);
4795 w2
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
4796 w2
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
4797 w1
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4798 w1
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4799 w1
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4800 w1
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4801 w0
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4802 w0
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
4808 w3
[2] = amd_bytealign_S (w2
[2], 0, offset
);
4809 w3
[1] = amd_bytealign_S (w2
[1], w2
[2], offset
);
4810 w3
[0] = amd_bytealign_S (w2
[0], w2
[1], offset
);
4811 w2
[3] = amd_bytealign_S (w1
[3], w2
[0], offset
);
4812 w2
[2] = amd_bytealign_S (w1
[2], w1
[3], offset
);
4813 w2
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
4814 w2
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4815 w1
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4816 w1
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4817 w1
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4818 w1
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4819 w0
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
4826 w3
[2] = amd_bytealign_S (w2
[1], 0, offset
);
4827 w3
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
4828 w3
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
4829 w2
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
4830 w2
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
4831 w2
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4832 w2
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4833 w1
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4834 w1
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4835 w1
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4836 w1
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
4844 w3
[2] = amd_bytealign_S (w2
[0], 0, offset
);
4845 w3
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
4846 w3
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
4847 w2
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
4848 w2
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4849 w2
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4850 w2
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4851 w1
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4852 w1
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4853 w1
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
4862 w3
[2] = amd_bytealign_S (w1
[3], 0, offset
);
4863 w3
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
4864 w3
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
4865 w2
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4866 w2
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4867 w2
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4868 w2
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4869 w1
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4870 w1
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
4880 w3
[2] = amd_bytealign_S (w1
[2], 0, offset
);
4881 w3
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
4882 w3
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4883 w2
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4884 w2
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4885 w2
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4886 w2
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4887 w1
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
4898 w3
[2] = amd_bytealign_S (w1
[1], 0, offset
);
4899 w3
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
4900 w3
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4901 w2
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4902 w2
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4903 w2
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4904 w2
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
4916 w3
[2] = amd_bytealign_S (w1
[0], 0, offset
);
4917 w3
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
4918 w3
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4919 w2
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4920 w2
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4921 w2
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
4934 w3
[2] = amd_bytealign_S (w0
[3], 0, offset
);
4935 w3
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
4936 w3
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4937 w2
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4938 w2
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
4952 w3
[2] = amd_bytealign_S (w0
[2], 0, offset
);
4953 w3
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
4954 w3
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4955 w2
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
4970 w3
[2] = amd_bytealign_S (w0
[1], 0, offset
);
4971 w3
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
4972 w3
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
4988 w3
[2] = amd_bytealign_S (w0
[0], 0, offset
);
4989 w3
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
// Second variant, built on __byte_perm_S: the 16-bit selector is taken out of
// 0x76543210 after shifting by four bits per unit of (offset & 3).
// Each group in this variant starts by writing w3[1].
// NOTE(review): presumably this variant sits under a different preprocessor
// branch than the amd_bytealign_S code - confirm the guarding condition.
5008 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
5013 w3
[1] = __byte_perm_S (w3
[1], w3
[0], selector
);
5014 w3
[0] = __byte_perm_S (w3
[0], w2
[3], selector
);
5015 w2
[3] = __byte_perm_S (w2
[3], w2
[2], selector
);
5016 w2
[2] = __byte_perm_S (w2
[2], w2
[1], selector
);
5017 w2
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
5018 w2
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
5019 w1
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
5020 w1
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
5021 w1
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
5022 w1
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
5023 w0
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
5024 w0
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
5025 w0
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
5026 w0
[0] = __byte_perm_S (w0
[0], 0, selector
);
5030 w3
[1] = __byte_perm_S (w3
[0], w2
[3], selector
);
5031 w3
[0] = __byte_perm_S (w2
[3], w2
[2], selector
);
5032 w2
[3] = __byte_perm_S (w2
[2], w2
[1], selector
);
5033 w2
[2] = __byte_perm_S (w2
[1], w2
[0], selector
);
5034 w2
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
5035 w2
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
5036 w1
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
5037 w1
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
5038 w1
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
5039 w1
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
5040 w0
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
5041 w0
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
5042 w0
[1] = __byte_perm_S (w0
[0], 0, selector
);
5047 w3
[1] = __byte_perm_S (w2
[3], w2
[2], selector
);
5048 w3
[0] = __byte_perm_S (w2
[2], w2
[1], selector
);
5049 w2
[3] = __byte_perm_S (w2
[1], w2
[0], selector
);
5050 w2
[2] = __byte_perm_S (w2
[0], w1
[3], selector
);
5051 w2
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
5052 w2
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
5053 w1
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
5054 w1
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
5055 w1
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
5056 w1
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
5057 w0
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
5058 w0
[2] = __byte_perm_S (w0
[0], 0, selector
);
5064 w3
[1] = __byte_perm_S (w2
[2], w2
[1], selector
);
5065 w3
[0] = __byte_perm_S (w2
[1], w2
[0], selector
);
5066 w2
[3] = __byte_perm_S (w2
[0], w1
[3], selector
);
5067 w2
[2] = __byte_perm_S (w1
[3], w1
[2], selector
);
5068 w2
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
5069 w2
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
5070 w1
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
5071 w1
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
5072 w1
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
5073 w1
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
5074 w0
[3] = __byte_perm_S (w0
[0], 0, selector
);
5081 w3
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
5082 w3
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
5083 w2
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
5084 w2
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
5085 w2
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
5086 w2
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
5087 w1
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
5088 w1
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
5089 w1
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
5090 w1
[0] = __byte_perm_S (w0
[0], 0, selector
);
5098 w3
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
5099 w3
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
5100 w2
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
5101 w2
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
5102 w2
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
5103 w2
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
5104 w1
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
5105 w1
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
5106 w1
[1] = __byte_perm_S (w0
[0], 0, selector
);
5115 w3
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
5116 w3
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
5117 w2
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
5118 w2
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
5119 w2
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
5120 w2
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
5121 w1
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
5122 w1
[2] = __byte_perm_S (w0
[0], 0, selector
);
5132 w3
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
5133 w3
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
5134 w2
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
5135 w2
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
5136 w2
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
5137 w2
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
5138 w1
[3] = __byte_perm_S (w0
[0], 0, selector
);
5149 w3
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
5150 w3
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
5151 w2
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
5152 w2
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
5153 w2
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
5154 w2
[0] = __byte_perm_S (w0
[0], 0, selector
);
5166 w3
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
5167 w3
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
5168 w2
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
5169 w2
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
5170 w2
[1] = __byte_perm_S (w0
[0], 0, selector
);
5183 w3
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
5184 w3
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
5185 w2
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
5186 w2
[2] = __byte_perm_S (w0
[0], 0, selector
);
5200 w3
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
5201 w3
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
5202 w2
[3] = __byte_perm_S (w0
[0], 0, selector
);
5217 w3
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
5218 w3
[0] = __byte_perm_S (w0
[0], 0, selector
);
5234 w3
[1] = __byte_perm_S (w0
[0], 0, selector
);
5253 static void switch_buffer_by_offset (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
5255 #if defined IS_AMD || defined IS_GENERIC
5256 const int offset_mod_4
= offset
& 3;
5258 const int offset_minus_4
= 4 - offset
;
5263 w3
[2] = amd_bytealign ( 0, w3
[1], offset_minus_4
);
5264 w3
[1] = amd_bytealign (w3
[1], w3
[0], offset_minus_4
);
5265 w3
[0] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
5266 w2
[3] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
5267 w2
[2] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
5268 w2
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5269 w2
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5270 w1
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5271 w1
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5272 w1
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5273 w1
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5274 w0
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5275 w0
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5276 w0
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5277 w0
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5279 if (offset_mod_4
== 0)
5301 w3
[2] = amd_bytealign ( 0, w3
[0], offset_minus_4
);
5302 w3
[1] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
5303 w3
[0] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
5304 w2
[3] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
5305 w2
[2] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5306 w2
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5307 w2
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5308 w1
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5309 w1
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5310 w1
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5311 w1
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5312 w0
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5313 w0
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5314 w0
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5317 if (offset_mod_4
== 0)
5338 w3
[2] = amd_bytealign ( 0, w2
[3], offset_minus_4
);
5339 w3
[1] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
5340 w3
[0] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
5341 w2
[3] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5342 w2
[2] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5343 w2
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5344 w2
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5345 w1
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5346 w1
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5347 w1
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5348 w1
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5349 w0
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5350 w0
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5354 if (offset_mod_4
== 0)
5374 w3
[2] = amd_bytealign ( 0, w2
[2], offset_minus_4
);
5375 w3
[1] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
5376 w3
[0] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5377 w2
[3] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5378 w2
[2] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5379 w2
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5380 w2
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5381 w1
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5382 w1
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5383 w1
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5384 w1
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5385 w0
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5390 if (offset_mod_4
== 0)
5409 w3
[2] = amd_bytealign ( 0, w2
[1], offset_minus_4
);
5410 w3
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
5411 w3
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5412 w2
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5413 w2
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5414 w2
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5415 w2
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5416 w1
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5417 w1
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5418 w1
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5419 w1
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5425 if (offset_mod_4
== 0)
5443 w3
[2] = amd_bytealign ( 0, w2
[0], offset_minus_4
);
5444 w3
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
5445 w3
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5446 w2
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5447 w2
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5448 w2
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5449 w2
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5450 w1
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5451 w1
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5452 w1
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5459 if (offset_mod_4
== 0)
5476 w3
[2] = amd_bytealign ( 0, w1
[3], offset_minus_4
);
5477 w3
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
5478 w3
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5479 w2
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5480 w2
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5481 w2
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5482 w2
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5483 w1
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5484 w1
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5492 if (offset_mod_4
== 0)
5508 w3
[2] = amd_bytealign ( 0, w1
[2], offset_minus_4
);
5509 w3
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
5510 w3
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5511 w2
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5512 w2
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5513 w2
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5514 w2
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5515 w1
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5524 if (offset_mod_4
== 0)
5539 w3
[2] = amd_bytealign ( 0, w1
[1], offset_minus_4
);
5540 w3
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
5541 w3
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5542 w2
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5543 w2
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5544 w2
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5545 w2
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5555 if (offset_mod_4
== 0)
5569 w3
[2] = amd_bytealign ( 0, w1
[0], offset_minus_4
);
5570 w3
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
5571 w3
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5572 w2
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5573 w2
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5574 w2
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5585 if (offset_mod_4
== 0)
5598 w3
[2] = amd_bytealign ( 0, w0
[3], offset_minus_4
);
5599 w3
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
5600 w3
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5601 w2
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5602 w2
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5614 if (offset_mod_4
== 0)
5626 w3
[2] = amd_bytealign ( 0, w0
[2], offset_minus_4
);
5627 w3
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
5628 w3
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5629 w2
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5642 if (offset_mod_4
== 0)
5653 w3
[2] = amd_bytealign ( 0, w0
[1], offset_minus_4
);
5654 w3
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
5655 w3
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5669 if (offset_mod_4
== 0)
5679 w3
[2] = amd_bytealign ( 0, w0
[0], offset_minus_4
);
5680 w3
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
5695 if (offset_mod_4
== 0)
5706 const int offset_minus_4
= 4 - (offset
% 4);
5708 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
5713 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
5714 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
5715 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
5716 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
5717 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
5718 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
5719 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
5720 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
5721 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
5722 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
5723 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
5724 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
5725 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
5726 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
5731 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
5732 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
5733 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
5734 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
5735 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
5736 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
5737 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
5738 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
5739 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
5740 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
5741 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
5742 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
5743 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
5749 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
5750 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
5751 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
5752 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
5753 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
5754 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
5755 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
5756 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
5757 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
5758 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
5759 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
5760 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
5767 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
5768 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
5769 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
5770 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
5771 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
5772 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
5773 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
5774 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
5775 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
5776 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
5777 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
5785 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
5786 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
5787 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
5788 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
5789 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
5790 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
5791 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
5792 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
5793 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
5794 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
5803 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
5804 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
5805 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
5806 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
5807 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
5808 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
5809 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
5810 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
5811 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
5821 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
5822 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
5823 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
5824 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
5825 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
5826 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
5827 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
5828 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
5839 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
5840 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
5841 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
5842 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
5843 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
5844 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
5845 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
5857 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
5858 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
5859 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
5860 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
5861 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
5862 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
5875 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
5876 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
5877 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
5878 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
5879 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
5893 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
5894 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
5895 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
5896 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
5911 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
5912 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
5913 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
5929 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
5930 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
5947 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
/*
 * switch_buffer_by_offset_be
 *
 * Rewrites the four 4-word buffers w0, w1, w2, w3 in place. Every destination
 * word is computed from two source words of the same buffers (or from one
 * source word and the constant 0), with `offset` controlling how the pair is
 * combined.
 *
 * Two code paths are visible:
 *   - under `#if defined IS_AMD || defined IS_GENERIC` each destination is
 *     amd_bytealign (a, b, offset);
 *   - in the second path each destination is __byte_perm (b, a, selector),
 *     i.e. the same word pair with the argument order swapped, and with a
 *     selector derived from (offset & 3).
 *
 * Each path contains 14 variants. Variant k moves data k whole words towards
 * higher indices (w0[0] feeds w0[0] for k = 0, up to w3[1] for k = 13).
 * Inside a variant the assignments run from the highest destination word down
 * to the lowest and every source index is <= its destination index, so each
 * source word is read before it is overwritten.
 *
 * In the lines visible here w3[3] is never written, and w3[2] is written only
 * in the amd_bytealign path.
 *
 * NOTE(review): this excerpt does not show the braces, the control flow that
 * selects one variant (presumably a switch on offset / 4 -- confirm), the
 * directive that opens the second path, or any statements between the
 * variants. The exact byte semantics of amd_bytealign and __byte_perm are
 * defined elsewhere; verify against their definitions.
 */
5967 static void switch_buffer_by_offset_be (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
// First path: destinations built with amd_bytealign (a, b, offset).
5969 #if defined IS_AMD || defined IS_GENERIC
// Word shift 0: destinations w3[2] down to w0[0].
5973 w3
[2] = amd_bytealign (w3
[1], 0, offset
);
5974 w3
[1] = amd_bytealign (w3
[0], w3
[1], offset
);
5975 w3
[0] = amd_bytealign (w2
[3], w3
[0], offset
);
5976 w2
[3] = amd_bytealign (w2
[2], w2
[3], offset
);
5977 w2
[2] = amd_bytealign (w2
[1], w2
[2], offset
);
5978 w2
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
5979 w2
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
5980 w1
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
5981 w1
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
5982 w1
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
5983 w1
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
5984 w0
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
5985 w0
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
5986 w0
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
5987 w0
[0] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 1: destinations w3[2] down to w0[1].
5991 w3
[2] = amd_bytealign (w3
[0], 0, offset
);
5992 w3
[1] = amd_bytealign (w2
[3], w3
[0], offset
);
5993 w3
[0] = amd_bytealign (w2
[2], w2
[3], offset
);
5994 w2
[3] = amd_bytealign (w2
[1], w2
[2], offset
);
5995 w2
[2] = amd_bytealign (w2
[0], w2
[1], offset
);
5996 w2
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
5997 w2
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
5998 w1
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
5999 w1
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
6000 w1
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
6001 w1
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
6002 w0
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
6003 w0
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
6004 w0
[1] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 2: destinations w3[2] down to w0[2].
6009 w3
[2] = amd_bytealign (w2
[3], 0, offset
);
6010 w3
[1] = amd_bytealign (w2
[2], w2
[3], offset
);
6011 w3
[0] = amd_bytealign (w2
[1], w2
[2], offset
);
6012 w2
[3] = amd_bytealign (w2
[0], w2
[1], offset
);
6013 w2
[2] = amd_bytealign (w1
[3], w2
[0], offset
);
6014 w2
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
6015 w2
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
6016 w1
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
6017 w1
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
6018 w1
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
6019 w1
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
6020 w0
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
6021 w0
[2] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 3: destinations w3[2] down to w0[3].
6027 w3
[2] = amd_bytealign (w2
[2], 0, offset
);
6028 w3
[1] = amd_bytealign (w2
[1], w2
[2], offset
);
6029 w3
[0] = amd_bytealign (w2
[0], w2
[1], offset
);
6030 w2
[3] = amd_bytealign (w1
[3], w2
[0], offset
);
6031 w2
[2] = amd_bytealign (w1
[2], w1
[3], offset
);
6032 w2
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
6033 w2
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
6034 w1
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
6035 w1
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
6036 w1
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
6037 w1
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
6038 w0
[3] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 4: destinations w3[2] down to w1[0].
6045 w3
[2] = amd_bytealign (w2
[1], 0, offset
);
6046 w3
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
6047 w3
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
6048 w2
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
6049 w2
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
6050 w2
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
6051 w2
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
6052 w1
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
6053 w1
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
6054 w1
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
6055 w1
[0] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 5: destinations w3[2] down to w1[1].
6063 w3
[2] = amd_bytealign (w2
[0], 0, offset
);
6064 w3
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
6065 w3
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
6066 w2
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
6067 w2
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
6068 w2
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
6069 w2
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
6070 w1
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
6071 w1
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
6072 w1
[1] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 6: destinations w3[2] down to w1[2].
6081 w3
[2] = amd_bytealign (w1
[3], 0, offset
);
6082 w3
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
6083 w3
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
6084 w2
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
6085 w2
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
6086 w2
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
6087 w2
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
6088 w1
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
6089 w1
[2] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 7: destinations w3[2] down to w1[3].
6099 w3
[2] = amd_bytealign (w1
[2], 0, offset
);
6100 w3
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
6101 w3
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
6102 w2
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
6103 w2
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
6104 w2
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
6105 w2
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
6106 w1
[3] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 8: destinations w3[2] down to w2[0].
6117 w3
[2] = amd_bytealign (w1
[1], 0, offset
);
6118 w3
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
6119 w3
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
6120 w2
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
6121 w2
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
6122 w2
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
6123 w2
[0] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 9: destinations w3[2] down to w2[1].
6135 w3
[2] = amd_bytealign (w1
[0], 0, offset
);
6136 w3
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
6137 w3
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
6138 w2
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
6139 w2
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
6140 w2
[1] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 10: destinations w3[2] down to w2[2].
6153 w3
[2] = amd_bytealign (w0
[3], 0, offset
);
6154 w3
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
6155 w3
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
6156 w2
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
6157 w2
[2] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 11: destinations w3[2] down to w2[3].
6171 w3
[2] = amd_bytealign (w0
[2], 0, offset
);
6172 w3
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
6173 w3
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
6174 w2
[3] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 12: destinations w3[2] down to w3[0].
6189 w3
[2] = amd_bytealign (w0
[1], 0, offset
);
6190 w3
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
6191 w3
[0] = amd_bytealign ( 0, w0
[0], offset
);
// Word shift 13: destinations w3[2] and w3[1] only.
6207 w3
[2] = amd_bytealign (w0
[0], 0, offset
);
6208 w3
[1] = amd_bytealign ( 0, w0
[0], offset
);
// Second path: destinations built with __byte_perm (a, b, selector).
// The selector evaluates to 0x3210, 0x4321, 0x5432 or 0x6543 for
// (offset & 3) == 0, 1, 2, 3 respectively.
// NOTE(review): the preprocessor directive that opens this path is not
// visible in this excerpt.
6227 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
// Word shift 0: destinations w3[1] down to w0[0].
6232 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
6233 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
6234 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
6235 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
6236 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
6237 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
6238 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
6239 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
6240 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
6241 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
6242 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
6243 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
6244 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
6245 w0
[0] = __byte_perm (w0
[0], 0, selector
);
// Word shift 1: destinations w3[1] down to w0[1].
6249 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
6250 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
6251 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
6252 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
6253 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
6254 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
6255 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
6256 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
6257 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
6258 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
6259 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
6260 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
6261 w0
[1] = __byte_perm (w0
[0], 0, selector
);
// Word shift 2: destinations w3[1] down to w0[2].
6266 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
6267 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
6268 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
6269 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
6270 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
6271 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
6272 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
6273 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
6274 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
6275 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
6276 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
6277 w0
[2] = __byte_perm (w0
[0], 0, selector
);
// Word shift 3: destinations w3[1] down to w0[3].
6283 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
6284 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
6285 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
6286 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
6287 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
6288 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
6289 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
6290 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
6291 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
6292 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
6293 w0
[3] = __byte_perm (w0
[0], 0, selector
);
// Word shift 4: destinations w3[1] down to w1[0].
6300 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
6301 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
6302 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
6303 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
6304 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
6305 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
6306 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
6307 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
6308 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
6309 w1
[0] = __byte_perm (w0
[0], 0, selector
);
// Word shift 5: destinations w3[1] down to w1[1].
6317 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
6318 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
6319 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
6320 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
6321 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
6322 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
6323 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
6324 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
6325 w1
[1] = __byte_perm (w0
[0], 0, selector
);
// Word shift 6: destinations w3[1] down to w1[2].
6334 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
6335 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
6336 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
6337 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
6338 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
6339 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
6340 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
6341 w1
[2] = __byte_perm (w0
[0], 0, selector
);
// Word shift 7: destinations w3[1] down to w1[3].
6351 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
6352 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
6353 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
6354 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
6355 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
6356 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
6357 w1
[3] = __byte_perm (w0
[0], 0, selector
);
// Word shift 8: destinations w3[1] down to w2[0].
6368 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
6369 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
6370 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
6371 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
6372 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
6373 w2
[0] = __byte_perm (w0
[0], 0, selector
);
// Word shift 9: destinations w3[1] down to w2[1].
6385 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
6386 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
6387 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
6388 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
6389 w2
[1] = __byte_perm (w0
[0], 0, selector
);
// Word shift 10: destinations w3[1] down to w2[2].
6402 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
6403 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
6404 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
6405 w2
[2] = __byte_perm (w0
[0], 0, selector
);
// Word shift 11: destinations w3[1] down to w2[3].
6419 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
6420 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
6421 w2
[3] = __byte_perm (w0
[0], 0, selector
);
// Word shift 12: destinations w3[1] and w3[0].
6436 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
6437 w3
[0] = __byte_perm (w0
[0], 0, selector
);
// Word shift 13: destination w3[1] only.
6453 w3
[1] = __byte_perm (w0
[0], 0, selector
);
6472 /* not needed anymore?
6474 // before: append_0x80_2_be
6475 static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset)
6480 w0[0] |= 0x80000000;
6496 w0[1] |= 0x80000000;
6512 w0[2] |= 0x80000000;
6528 w0[3] |= 0x80000000;
6544 w1[0] |= 0x80000000;
6560 w1[1] |= 0x80000000;
6576 w1[2] |= 0x80000000;
6592 w1[3] |= 0x80000000;
6609 // before: append_0x80_8
6610 static void append_0x80_1x32 (u32 w[32], const u32 offset)
6619 w[ 0] = w[ 0] | 0x8000;
6623 w[ 0] = w[ 0] | 0x800000;
6627 w[ 0] = w[ 0] | 0x80000000;
6635 w[ 1] = w[ 1] | 0x8000;
6639 w[ 1] = w[ 1] | 0x800000;
6643 w[ 1] = w[ 1] | 0x80000000;
6651 w[ 2] = w[ 2] | 0x8000;
6655 w[ 2] = w[ 2] | 0x800000;
6659 w[ 2] = w[ 2] | 0x80000000;
6667 w[ 3] = w[ 3] | 0x8000;
6671 w[ 3] = w[ 3] | 0x800000;
6675 w[ 3] = w[ 3] | 0x80000000;
6683 w[ 4] = w[ 4] | 0x8000;
6687 w[ 4] = w[ 4] | 0x800000;
6691 w[ 4] = w[ 4] | 0x80000000;
6699 w[ 5] = w[ 5] | 0x8000;
6703 w[ 5] = w[ 5] | 0x800000;
6707 w[ 5] = w[ 5] | 0x80000000;
6715 w[ 6] = w[ 6] | 0x8000;
6719 w[ 6] = w[ 6] | 0x800000;
6723 w[ 6] = w[ 6] | 0x80000000;
6731 w[ 7] = w[ 7] | 0x8000;
6735 w[ 7] = w[ 7] | 0x800000;
6739 w[ 7] = w[ 7] | 0x80000000;
6747 w[ 8] = w[ 8] | 0x8000;
6751 w[ 8] = w[ 8] | 0x800000;
6755 w[ 8] = w[ 8] | 0x80000000;
6763 w[ 9] = w[ 9] | 0x8000;
6767 w[ 9] = w[ 9] | 0x800000;
6771 w[ 9] = w[ 9] | 0x80000000;
6779 w[10] = w[10] | 0x8000;
6783 w[10] = w[10] | 0x800000;
6787 w[10] = w[10] | 0x80000000;
6795 w[11] = w[11] | 0x8000;
6799 w[11] = w[11] | 0x800000;
6803 w[11] = w[11] | 0x80000000;
6811 w[12] = w[12] | 0x8000;
6815 w[12] = w[12] | 0x800000;
6819 w[12] = w[12] | 0x80000000;
6827 w[13] = w[13] | 0x8000;
6831 w[13] = w[13] | 0x800000;
6835 w[13] = w[13] | 0x80000000;
6843 w[14] = w[14] | 0x8000;
6847 w[14] = w[14] | 0x800000;
6851 w[14] = w[14] | 0x80000000;
6859 w[15] = w[15] | 0x8000;
6863 w[15] = w[15] | 0x800000;
6867 w[15] = w[15] | 0x80000000;
6875 w[16] = w[16] | 0x8000;
6879 w[16] = w[16] | 0x800000;
6883 w[16] = w[16] | 0x80000000;
6891 w[17] = w[17] | 0x8000;
6895 w[17] = w[17] | 0x800000;
6899 w[17] = w[17] | 0x80000000;
6907 w[18] = w[18] | 0x8000;
6911 w[18] = w[18] | 0x800000;
6915 w[18] = w[18] | 0x80000000;
6923 w[19] = w[19] | 0x8000;
6927 w[19] = w[19] | 0x800000;
6931 w[19] = w[19] | 0x80000000;
6939 w[20] = w[20] | 0x8000;
6943 w[20] = w[20] | 0x800000;
6947 w[20] = w[20] | 0x80000000;
6955 w[21] = w[21] | 0x8000;
6959 w[21] = w[21] | 0x800000;
6963 w[21] = w[21] | 0x80000000;
6971 w[22] = w[22] | 0x8000;
6975 w[22] = w[22] | 0x800000;
6979 w[22] = w[22] | 0x80000000;
6987 w[23] = w[23] | 0x8000;
6991 w[23] = w[23] | 0x800000;
6995 w[23] = w[23] | 0x80000000;
7003 w[24] = w[24] | 0x8000;
7007 w[24] = w[24] | 0x800000;
7011 w[24] = w[24] | 0x80000000;
7019 w[25] = w[25] | 0x8000;
7023 w[25] = w[25] | 0x800000;
7027 w[25] = w[25] | 0x80000000;
7035 w[26] = w[26] | 0x8000;
7039 w[26] = w[26] | 0x800000;
7043 w[26] = w[26] | 0x80000000;
7051 w[27] = w[27] | 0x8000;
7055 w[27] = w[27] | 0x800000;
7059 w[27] = w[27] | 0x80000000;
7067 w[28] = w[28] | 0x8000;
7071 w[28] = w[28] | 0x800000;
7075 w[28] = w[28] | 0x80000000;
7083 w[29] = w[29] | 0x8000;
7087 w[29] = w[29] | 0x800000;
7091 w[29] = w[29] | 0x80000000;
7099 w[30] = w[30] | 0x8000;
7103 w[30] = w[30] | 0x800000;
7107 w[30] = w[30] | 0x80000000;
7115 w[31] = w[31] | 0x8000;
7119 w[31] = w[31] | 0x800000;
7123 w[31] = w[31] | 0x80000000;
7128 // before: device_memcat2L
7129 static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2])
7134 dst0[0] = src_l0[0] | src_r0[0] << 8;
7135 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
7139 dst0[0] = src_l0[0] | src_r0[0] << 16;
7140 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
7144 dst0[0] = src_l0[0] | src_r0[0] << 24;
7145 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
7149 dst0[1] = src_r0[0];
7153 dst0[1] = src_l0[1] | src_r0[0] << 8;
7157 dst0[1] = src_l0[1] | src_r0[0] << 16;
7161 dst0[1] = src_l0[1] | src_r0[0] << 24;
7166 // before: device_memcat4L
7167 static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4])
7172 dst0[0] = src_l0[0] | src_r0[0] << 8;
7173 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
7174 dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
7175 dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
7179 dst0[0] = src_l0[0] | src_r0[0] << 16;
7180 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
7181 dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
7182 dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
7186 dst0[0] = src_l0[0] | src_r0[0] << 24;
7187 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
7188 dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
7189 dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
7193 dst0[1] = src_r0[0];
7194 dst0[2] = src_r0[1];
7195 dst0[3] = src_r0[2];
7199 dst0[1] = src_l0[1] | src_r0[0] << 8;
7200 dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
7201 dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
7205 dst0[1] = src_l0[1] | src_r0[0] << 16;
7206 dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
7207 dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
7211 dst0[1] = src_l0[1] | src_r0[0] << 24;
7212 dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
7213 dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
7217 dst0[2] = src_r0[0];
7218 dst0[3] = src_r0[1];
7222 dst0[2] = src_l0[2] | src_r0[0] << 8;
7223 dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
7227 dst0[2] = src_l0[2] | src_r0[0] << 16;
7228 dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
7232 dst0[2] = src_l0[2] | src_r0[0] << 24;
7233 dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
7237 dst0[3] = src_r0[0];
7241 dst0[3] = src_l0[3] | src_r0[0] << 8;
7245 dst0[3] = src_l0[3] | src_r0[0] << 16;
7249 dst0[3] = src_l0[3] | src_r0[0] << 24;
7254 // before: device_memcat8L
7255 static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4])
7260 dst0[0] = src_l0[0] | src_r0[0] << 8;
7261 dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
7262 dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
7263 dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
7264 dst1[0] = src_r0[3] >> 24;
7268 dst0[0] = src_l0[0] | src_r0[0] << 16;
7269 dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
7270 dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
7271 dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
7272 dst1[0] = src_r0[3] >> 16;
7276 dst0[0] = src_l0[0] | src_r0[0] << 24;
7277 dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
7278 dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
7279 dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
7280 dst1[0] = src_r0[3] >> 8;
7284 dst0[1] = src_r0[0];
7285 dst0[2] = src_r0[1];
7286 dst0[3] = src_r0[2];
7287 dst1[0] = src_r0[3];
7291 dst0[1] = src_l0[1] | src_r0[0] << 8;
7292 dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
7293 dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
7294 dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
7295 dst1[1] = src_r0[3] >> 24;
7299 dst0[1] = src_l0[1] | src_r0[0] << 16;
7300 dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
7301 dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
7302 dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
7303 dst1[1] = src_r0[3] >> 16;
7307 dst0[1] = src_l0[1] | src_r0[0] << 24;
7308 dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
7309 dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
7310 dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
7311 dst1[1] = src_r0[3] >> 8;
7315 dst0[2] = src_r0[0];
7316 dst0[3] = src_r0[1];
7317 dst1[0] = src_r0[2];
7318 dst1[1] = src_r0[3];
7322 dst0[2] = src_l0[2] | src_r0[0] << 8;
7323 dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
7324 dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
7325 dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
7326 dst1[2] = src_r0[3] >> 24;
7330 dst0[2] = src_l0[2] | src_r0[0] << 16;
7331 dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
7332 dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
7333 dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
7334 dst1[2] = src_r0[3] >> 16;
7338 dst0[2] = src_l0[2] | src_r0[0] << 24;
7339 dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
7340 dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
7341 dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
7342 dst1[2] = src_r0[3] >> 8;
7346 dst0[3] = src_r0[0];
7347 dst1[0] = src_r0[1];
7348 dst1[1] = src_r0[2];
7349 dst1[2] = src_r0[3];
7353 dst0[3] = src_l0[3] | src_r0[0] << 8;
7354 dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
7355 dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
7356 dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
7357 dst1[3] = src_r0[3] >> 24;
7361 dst0[3] = src_l0[3] | src_r0[0] << 16;
7362 dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
7363 dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
7364 dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
7365 dst1[3] = src_r0[3] >> 16;
7369 dst0[3] = src_l0[3] | src_r0[0] << 24;
7370 dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
7371 dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
7372 dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
7373 dst1[3] = src_r0[3] >> 8;
7377 dst1[0] = src_r0[0];
7378 dst1[1] = src_r0[1];
7379 dst1[2] = src_r0[2];
7380 dst1[3] = src_r0[3];
7384 dst1[0] = src_l1[0] | src_r0[0] << 8;
7385 dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
7386 dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
7387 dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
7391 dst1[0] = src_l1[0] | src_r0[0] << 16;
7392 dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
7393 dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
7394 dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
7398 dst1[0] = src_l1[0] | src_r0[0] << 24;
7399 dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
7400 dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
7401 dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
7405 dst1[1] = src_r0[0];
7406 dst1[2] = src_r0[1];
7407 dst1[3] = src_r0[2];
7411 dst1[1] = src_l1[1] | src_r0[0] << 8;
7412 dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
7413 dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
7417 dst1[1] = src_l1[1] | src_r0[0] << 16;
7418 dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
7419 dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
7423 dst1[1] = src_l1[1] | src_r0[0] << 24;
7424 dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
7425 dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
7429 dst1[2] = src_r0[0];
7430 dst1[3] = src_r0[1];
7434 dst1[2] = src_l1[2] | src_r0[0] << 8;
7435 dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
7439 dst1[2] = src_l1[2] | src_r0[0] << 16;
7440 dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
7444 dst1[2] = src_l1[2] | src_r0[0] << 24;
7445 dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
7449 dst1[3] = src_r0[0];
7453 dst1[3] = src_l1[3] | src_r0[0] << 8;
7457 dst1[3] = src_l1[3] | src_r0[0] << 16;
7461 dst1[3] = src_l1[3] | src_r0[0] << 24;
7466 // before: device_memcat12L
/*
 * memcat_c47_d3x4_sl3x4_sr1x4
 *
 * Places the 16-byte right-hand block src_r0 behind `offset` bytes of
 * left-hand data inside the 48-byte destination dst0|dst1|dst2.
 *
 * Byte k of the destination lives in bits 8*(k%4) .. 8*(k%4)+7 of word k/4.
 *
 *   offset        byte position (0..47) at which src_r0 starts
 *   dst0..dst2    destination words; words below offset/4 are never written
 *   src_l0..l2    left-hand words; only the word containing `offset` is read,
 *                 and only when `offset` is not a multiple of 4
 *   src_r0        the four words that are appended
 *
 * For an unaligned offset the word holding the junction is built as
 * (left word | right word << shift); this assumes the left word is zero
 * from byte `offset` upwards -- TODO confirm at the call sites.
 * Right-hand bytes that would land past byte 47 are dropped.
 *
 * The switch is fully unrolled on purpose: every array index is a
 * compile-time constant.
 */
static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4])
{
  switch (offset)
  {
    // empty left part: the right block becomes the start of the destination
    // (this case was missing; it mirrors case 0 of the sr2x4 variant)
    case 0:
      dst0[0] = src_r0[0];
      dst0[1] = src_r0[1];
      dst0[2] = src_r0[2];
      dst0[3] = src_r0[3];
      break;

    case 1:
      dst0[0] = src_l0[0] | src_r0[0] << 8;
      dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst1[0] = src_r0[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0] | src_r0[0] << 16;
      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[0] = src_r0[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0] | src_r0[0] << 24;
      dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst1[0] = src_r0[3] >> 8;
      break;

    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      break;

    case 5:
      dst0[1] = src_l0[1] | src_r0[0] << 8;
      dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst1[1] = src_r0[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1] | src_r0[0] << 16;
      dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[1] = src_r0[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1] | src_r0[0] << 24;
      dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst1[1] = src_r0[3] >> 8;
      break;

    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      break;

    case 9:
      dst0[2] = src_l0[2] | src_r0[0] << 8;
      dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst1[2] = src_r0[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2] | src_r0[0] << 16;
      dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[2] = src_r0[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2] | src_r0[0] << 24;
      dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst1[2] = src_r0[3] >> 8;
      break;

    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      break;

    case 13:
      dst0[3] = src_l0[3] | src_r0[0] << 8;
      dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst1[3] = src_r0[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3] | src_r0[0] << 16;
      dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[3] = src_r0[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3] | src_r0[0] << 24;
      dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst1[3] = src_r0[3] >> 8;
      break;

    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      break;

    case 17:
      dst1[0] = src_l1[0] | src_r0[0] << 8;
      dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst2[0] = src_r0[3] >> 24;
      break;

    case 18:
      dst1[0] = src_l1[0] | src_r0[0] << 16;
      dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[0] = src_r0[3] >> 16;
      break;

    case 19:
      dst1[0] = src_l1[0] | src_r0[0] << 24;
      dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst2[0] = src_r0[3] >> 8;
      break;

    case 20:
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      dst2[0] = src_r0[3];
      break;

    case 21:
      dst1[1] = src_l1[1] | src_r0[0] << 8;
      dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst2[1] = src_r0[3] >> 24;
      break;

    case 22:
      dst1[1] = src_l1[1] | src_r0[0] << 16;
      dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[1] = src_r0[3] >> 16;
      break;

    case 23:
      dst1[1] = src_l1[1] | src_r0[0] << 24;
      dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst2[1] = src_r0[3] >> 8;
      break;

    case 24:
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      dst2[0] = src_r0[2];
      dst2[1] = src_r0[3];
      break;

    case 25:
      dst1[2] = src_l1[2] | src_r0[0] << 8;
      dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst2[2] = src_r0[3] >> 24;
      break;

    case 26:
      dst1[2] = src_l1[2] | src_r0[0] << 16;
      dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[2] = src_r0[3] >> 16;
      break;

    case 27:
      dst1[2] = src_l1[2] | src_r0[0] << 24;
      dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst2[2] = src_r0[3] >> 8;
      break;

    case 28:
      dst1[3] = src_r0[0];
      dst2[0] = src_r0[1];
      dst2[1] = src_r0[2];
      dst2[2] = src_r0[3];
      break;

    case 29:
      dst1[3] = src_l1[3] | src_r0[0] << 8;
      dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst2[3] = src_r0[3] >> 24;
      break;

    case 30:
      dst1[3] = src_l1[3] | src_r0[0] << 16;
      dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[3] = src_r0[3] >> 16;
      break;

    case 31:
      dst1[3] = src_l1[3] | src_r0[0] << 24;
      dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst2[3] = src_r0[3] >> 8;
      break;

    // from here on the right block no longer fits completely; the
    // bytes that would fall behind dst2[3] are dropped
    case 32:
      dst2[0] = src_r0[0];
      dst2[1] = src_r0[1];
      dst2[2] = src_r0[2];
      dst2[3] = src_r0[3];
      break;

    case 33:
      dst2[0] = src_l2[0] | src_r0[0] << 8;
      dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8;
      break;

    case 34:
      dst2[0] = src_l2[0] | src_r0[0] << 16;
      dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      break;

    case 35:
      dst2[0] = src_l2[0] | src_r0[0] << 24;
      dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24;
      break;

    case 36:
      dst2[1] = src_r0[0];
      dst2[2] = src_r0[1];
      dst2[3] = src_r0[2];
      break;

    case 37:
      dst2[1] = src_l2[1] | src_r0[0] << 8;
      dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8;
      break;

    case 38:
      dst2[1] = src_l2[1] | src_r0[0] << 16;
      dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      break;

    case 39:
      dst2[1] = src_l2[1] | src_r0[0] << 24;
      dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24;
      break;

    case 40:
      dst2[2] = src_r0[0];
      dst2[3] = src_r0[1];
      break;

    case 41:
      dst2[2] = src_l2[2] | src_r0[0] << 8;
      dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8;
      break;

    case 42:
      dst2[2] = src_l2[2] | src_r0[0] << 16;
      dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      break;

    case 43:
      dst2[2] = src_l2[2] | src_r0[0] << 24;
      dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24;
      break;

    case 44:
      dst2[3] = src_r0[0];
      break;

    case 45:
      dst2[3] = src_l2[3] | src_r0[0] << 8;
      break;

    case 46:
      dst2[3] = src_l2[3] | src_r0[0] << 16;
      break;

    case 47:
      dst2[3] = src_l2[3] | src_r0[0] << 24;
      break;

    // offsets above 47 lie outside the destination: nothing is written
    default:
      break;
  }
}
7802 // before: device_memcat12L
/*
 * memcat_c47_d3x4_sl3x4_sr2x4
 *
 * Places the 32-byte right-hand block src_r0|src_r1 behind `offset` bytes
 * of left-hand data inside the 48-byte destination dst0|dst1|dst2.
 *
 * Byte k of the destination lives in bits 8*(k%4) .. 8*(k%4)+7 of word k/4.
 *
 *   offset        byte position (0..47) at which the right block starts
 *   dst0..dst2    destination words; words below offset/4 are never written
 *   src_l0..l2    left-hand words; only the word containing `offset` is read,
 *                 and only when `offset` is not a multiple of 4
 *   src_r0,src_r1 the eight words that are appended, src_r0[0] first
 *
 * For an unaligned offset the word holding the junction is built as
 * (left word | right word << shift); this assumes the left word is zero
 * from byte `offset` upwards -- TODO confirm at the call sites.
 * Right-hand bytes that would land past byte 47 are dropped.
 *
 * The switch is fully unrolled on purpose: every array index is a
 * compile-time constant.
 */
static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4])
{
  switch (offset)
  {
    case 0:
      dst0[0] = src_r0[0];
      dst0[1] = src_r0[1];
      dst0[2] = src_r0[2];
      dst0[3] = src_r0[3];
      dst1[0] = src_r1[0];
      dst1[1] = src_r1[1];
      dst1[2] = src_r1[2];
      dst1[3] = src_r1[3];
      break;

    case 1:
      dst0[0] = src_l0[0] | src_r0[0] << 8;
      dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8;
      dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8;
      dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8;
      dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8;
      dst2[0] = src_r1[3] >> 24;
      break;

    case 2:
      dst0[0] = src_l0[0] | src_r0[0] << 16;
      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[0] = src_r1[3] >> 16;
      break;

    case 3:
      dst0[0] = src_l0[0] | src_r0[0] << 24;
      dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24;
      dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24;
      dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24;
      dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24;
      dst2[0] = src_r1[3] >> 8;
      break;

    case 4:
      dst0[1] = src_r0[0];
      dst0[2] = src_r0[1];
      dst0[3] = src_r0[2];
      dst1[0] = src_r0[3];
      dst1[1] = src_r1[0];
      dst1[2] = src_r1[1];
      dst1[3] = src_r1[2];
      dst2[0] = src_r1[3];
      break;

    case 5:
      dst0[1] = src_l0[1] | src_r0[0] << 8;
      dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8;
      dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8;
      dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8;
      dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8;
      dst2[1] = src_r1[3] >> 24;
      break;

    case 6:
      dst0[1] = src_l0[1] | src_r0[0] << 16;
      dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[1] = src_r1[3] >> 16;
      break;

    case 7:
      dst0[1] = src_l0[1] | src_r0[0] << 24;
      dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24;
      dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24;
      dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24;
      dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24;
      dst2[1] = src_r1[3] >> 8;
      break;

    case 8:
      dst0[2] = src_r0[0];
      dst0[3] = src_r0[1];
      dst1[0] = src_r0[2];
      dst1[1] = src_r0[3];
      dst1[2] = src_r1[0];
      dst1[3] = src_r1[1];
      dst2[0] = src_r1[2];
      dst2[1] = src_r1[3];
      break;

    case 9:
      dst0[2] = src_l0[2] | src_r0[0] << 8;
      dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8;
      dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8;
      dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8;
      dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8;
      dst2[2] = src_r1[3] >> 24;
      break;

    case 10:
      dst0[2] = src_l0[2] | src_r0[0] << 16;
      dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[2] = src_r1[3] >> 16;
      break;

    case 11:
      dst0[2] = src_l0[2] | src_r0[0] << 24;
      dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24;
      dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24;
      dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24;
      dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24;
      dst2[2] = src_r1[3] >> 8;
      break;

    case 12:
      dst0[3] = src_r0[0];
      dst1[0] = src_r0[1];
      dst1[1] = src_r0[2];
      dst1[2] = src_r0[3];
      dst1[3] = src_r1[0];
      dst2[0] = src_r1[1];
      dst2[1] = src_r1[2];
      dst2[2] = src_r1[3];
      break;

    case 13:
      dst0[3] = src_l0[3] | src_r0[0] << 8;
      dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8;
      dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8;
      dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8;
      dst2[2] = src_r1[2] >> 24 | src_r1[3] << 8;
      dst2[3] = src_r1[3] >> 24;
      break;

    case 14:
      dst0[3] = src_l0[3] | src_r0[0] << 16;
      dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16;
      dst2[3] = src_r1[3] >> 16;
      break;

    case 15:
      dst0[3] = src_l0[3] | src_r0[0] << 24;
      dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24;
      dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24;
      dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24;
      dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24;
      dst2[3] = src_r1[3] >> 8;
      break;

    // from here on the right block no longer fits completely; the
    // bytes that would fall behind dst2[3] are dropped
    case 16:
      dst1[0] = src_r0[0];
      dst1[1] = src_r0[1];
      dst1[2] = src_r0[2];
      dst1[3] = src_r0[3];
      dst2[0] = src_r1[0];
      dst2[1] = src_r1[1];
      dst2[2] = src_r1[2];
      dst2[3] = src_r1[3];
      break;

    case 17:
      dst1[0] = src_l1[0] | src_r0[0] << 8;
      dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8;
      dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8;
      dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8;
      dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8;
      break;

    case 18:
      dst1[0] = src_l1[0] | src_r0[0] << 16;
      dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16;
      dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16;
      break;

    case 19:
      dst1[0] = src_l1[0] | src_r0[0] << 24;
      dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24;
      dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24;
      dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24;
      dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24;
      break;

    case 20:
      // first word comes from src_r0[0] (was src_r1[0], which dropped the
      // first right-hand word and duplicated src_r1[0])
      dst1[1] = src_r0[0];
      dst1[2] = src_r0[1];
      dst1[3] = src_r0[2];
      dst2[0] = src_r0[3];
      dst2[1] = src_r1[0];
      dst2[2] = src_r1[1];
      dst2[3] = src_r1[2];
      break;

    case 21:
      dst1[1] = src_l1[1] | src_r0[0] << 8;
      dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8;
      dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8;
      dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8;
      break;

    case 22:
      dst1[1] = src_l1[1] | src_r0[0] << 16;
      dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16;
      dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16;
      break;

    case 23:
      dst1[1] = src_l1[1] | src_r0[0] << 24;
      dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24;
      dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24;
      dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24;
      break;

    case 24:
      // first word comes from src_r0[0] (was src_r1[0])
      dst1[2] = src_r0[0];
      dst1[3] = src_r0[1];
      dst2[0] = src_r0[2];
      dst2[1] = src_r0[3];
      dst2[2] = src_r1[0];
      dst2[3] = src_r1[1];
      break;

    case 25:
      dst1[2] = src_l1[2] | src_r0[0] << 8;
      dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8;
      dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8;
      break;

    case 26:
      dst1[2] = src_l1[2] | src_r0[0] << 16;
      dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16;
      dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16;
      break;

    case 27:
      dst1[2] = src_l1[2] | src_r0[0] << 24;
      dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24;
      dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24;
      break;

    case 28:
      // first word comes from src_r0[0] (was src_r1[0])
      dst1[3] = src_r0[0];
      dst2[0] = src_r0[1];
      dst2[1] = src_r0[2];
      dst2[2] = src_r0[3];
      dst2[3] = src_r1[0];
      break;

    case 29:
      dst1[3] = src_l1[3] | src_r0[0] << 8;
      dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8;
      dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8;
      break;

    case 30:
      dst1[3] = src_l1[3] | src_r0[0] << 16;
      dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16;
      dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16;
      break;

    case 31:
      dst1[3] = src_l1[3] | src_r0[0] << 24;
      dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24;
      dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24;
      break;

    case 32:
      dst2[0] = src_r0[0];
      dst2[1] = src_r0[1];
      dst2[2] = src_r0[2];
      dst2[3] = src_r0[3];
      break;

    case 33:
      dst2[0] = src_l2[0] | src_r0[0] << 8;
      dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8;
      dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8;
      break;

    case 34:
      dst2[0] = src_l2[0] | src_r0[0] << 16;
      dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16;
      dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16;
      break;

    case 35:
      dst2[0] = src_l2[0] | src_r0[0] << 24;
      dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24;
      dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24;
      break;

    case 36:
      dst2[1] = src_r0[0];
      dst2[2] = src_r0[1];
      dst2[3] = src_r0[2];
      break;

    case 37:
      dst2[1] = src_l2[1] | src_r0[0] << 8;
      dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8;
      dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8;
      break;

    case 38:
      dst2[1] = src_l2[1] | src_r0[0] << 16;
      dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16;
      dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16;
      break;

    case 39:
      dst2[1] = src_l2[1] | src_r0[0] << 24;
      dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24;
      dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24;
      break;

    case 40:
      dst2[2] = src_r0[0];
      dst2[3] = src_r0[1];
      break;

    case 41:
      dst2[2] = src_l2[2] | src_r0[0] << 8;
      dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8;
      break;

    case 42:
      dst2[2] = src_l2[2] | src_r0[0] << 16;
      dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16;
      break;

    case 43:
      dst2[2] = src_l2[2] | src_r0[0] << 24;
      dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24;
      break;

    case 44:
      dst2[3] = src_r0[0];
      break;

    case 45:
      dst2[3] = src_l2[3] | src_r0[0] << 8;
      break;

    case 46:
      dst2[3] = src_l2[3] | src_r0[0] << 16;
      break;

    case 47:
      dst2[3] = src_l2[3] | src_r0[0] << 24;
      break;

    // offsets above 47 lie outside the destination: nothing is written
    default:
      break;
  }
}
8237 // before: memcat16_9
/*
 * memcat_c15_w4x4_a3x4
 *
 * Merges the 32-bit words of append0, append1 and append2 into the word
 * buffers w0..w3.
 *
 * In every statement group visible below:
 *   - the first target word keeps its current value and gets append0[0],
 *     shifted left by 8, 16 or 24 bits, OR-ed into it;
 *   - each following target word is overwritten with the bits shifted out
 *     of the previous append word (>> 24, >> 16 or >> 8) OR-ed with the
 *     next append word shifted left by the same amount as the first;
 *   - the last target word receives only the shifted-out bits of
 *     append2[0].
 * Of append2 only element [0] is read; append0 and append1 are read in full.
 *
 * NOTE(review): the function's braces and the control flow that picks one
 * group are not part of this excerpt. Presumably `offset` (a byte position)
 * selects the group, and further groups exist for positions that need no
 * shift -- confirm against the complete file.
 */
8238 static void memcat_c15_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
// first target word w0[0], left shift 8
8255 w0[0] = w0[0] | append0[0] << 8;
8256 w0[1] = append0[0] >> 24 | append0[1] << 8;
8257 w0[2] = append0[1] >> 24 | append0[2] << 8;
8258 w0[3] = append0[2] >> 24 | append0[3] << 8;
8259 w1[0] = append0[3] >> 24 | append1[0] << 8;
8260 w1[1] = append1[0] >> 24 | append1[1] << 8;
8261 w1[2] = append1[1] >> 24 | append1[2] << 8;
8262 w1[3] = append1[2] >> 24 | append1[3] << 8;
8263 w2[0] = append1[3] >> 24 | append2[0] << 8;
8264 w2[1] = append2[0] >> 24;
// first target word w0[0], left shift 16
8268 w0[0] = w0[0] | append0[0] << 16;
8269 w0[1] = append0[0] >> 16 | append0[1] << 16;
8270 w0[2] = append0[1] >> 16 | append0[2] << 16;
8271 w0[3] = append0[2] >> 16 | append0[3] << 16;
8272 w1[0] = append0[3] >> 16 | append1[0] << 16;
8273 w1[1] = append1[0] >> 16 | append1[1] << 16;
8274 w1[2] = append1[1] >> 16 | append1[2] << 16;
8275 w1[3] = append1[2] >> 16 | append1[3] << 16;
8276 w2[0] = append1[3] >> 16 | append2[0] << 16;
8277 w2[1] = append2[0] >> 16;
// first target word w0[0], left shift 24
8281 w0[0] = w0[0] | append0[0] << 24;
8282 w0[1] = append0[0] >> 8 | append0[1] << 24;
8283 w0[2] = append0[1] >> 8 | append0[2] << 24;
8284 w0[3] = append0[2] >> 8 | append0[3] << 24;
8285 w1[0] = append0[3] >> 8 | append1[0] << 24;
8286 w1[1] = append1[0] >> 8 | append1[1] << 24;
8287 w1[2] = append1[1] >> 8 | append1[2] << 24;
8288 w1[3] = append1[2] >> 8 | append1[3] << 24;
8289 w2[0] = append1[3] >> 8 | append2[0] << 24;
8290 w2[1] = append2[0] >> 8;
// first target word w0[1], left shift 8
8306 w0[1] = w0[1] | append0[0] << 8;
8307 w0[2] = append0[0] >> 24 | append0[1] << 8;
8308 w0[3] = append0[1] >> 24 | append0[2] << 8;
8309 w1[0] = append0[2] >> 24 | append0[3] << 8;
8310 w1[1] = append0[3] >> 24 | append1[0] << 8;
8311 w1[2] = append1[0] >> 24 | append1[1] << 8;
8312 w1[3] = append1[1] >> 24 | append1[2] << 8;
8313 w2[0] = append1[2] >> 24 | append1[3] << 8;
8314 w2[1] = append1[3] >> 24 | append2[0] << 8;
8315 w2[2] = append2[0] >> 24;
// first target word w0[1], left shift 16
8319 w0[1] = w0[1] | append0[0] << 16;
8320 w0[2] = append0[0] >> 16 | append0[1] << 16;
8321 w0[3] = append0[1] >> 16 | append0[2] << 16;
8322 w1[0] = append0[2] >> 16 | append0[3] << 16;
8323 w1[1] = append0[3] >> 16 | append1[0] << 16;
8324 w1[2] = append1[0] >> 16 | append1[1] << 16;
8325 w1[3] = append1[1] >> 16 | append1[2] << 16;
8326 w2[0] = append1[2] >> 16 | append1[3] << 16;
8327 w2[1] = append1[3] >> 16 | append2[0] << 16;
8328 w2[2] = append2[0] >> 16;
// first target word w0[1], left shift 24
8332 w0[1] = w0[1] | append0[0] << 24;
8333 w0[2] = append0[0] >> 8 | append0[1] << 24;
8334 w0[3] = append0[1] >> 8 | append0[2] << 24;
8335 w1[0] = append0[2] >> 8 | append0[3] << 24;
8336 w1[1] = append0[3] >> 8 | append1[0] << 24;
8337 w1[2] = append1[0] >> 8 | append1[1] << 24;
8338 w1[3] = append1[1] >> 8 | append1[2] << 24;
8339 w2[0] = append1[2] >> 8 | append1[3] << 24;
8340 w2[1] = append1[3] >> 8 | append2[0] << 24;
8341 w2[2] = append2[0] >> 8;
// first target word w0[2], left shift 8
8357 w0[2] = w0[2] | append0[0] << 8;
8358 w0[3] = append0[0] >> 24 | append0[1] << 8;
8359 w1[0] = append0[1] >> 24 | append0[2] << 8;
8360 w1[1] = append0[2] >> 24 | append0[3] << 8;
8361 w1[2] = append0[3] >> 24 | append1[0] << 8;
8362 w1[3] = append1[0] >> 24 | append1[1] << 8;
8363 w2[0] = append1[1] >> 24 | append1[2] << 8;
8364 w2[1] = append1[2] >> 24 | append1[3] << 8;
8365 w2[2] = append1[3] >> 24 | append2[0] << 8;
8366 w2[3] = append2[0] >> 24;
// first target word w0[2], left shift 16
8370 w0[2] = w0[2] | append0[0] << 16;
8371 w0[3] = append0[0] >> 16 | append0[1] << 16;
8372 w1[0] = append0[1] >> 16 | append0[2] << 16;
8373 w1[1] = append0[2] >> 16 | append0[3] << 16;
8374 w1[2] = append0[3] >> 16 | append1[0] << 16;
8375 w1[3] = append1[0] >> 16 | append1[1] << 16;
8376 w2[0] = append1[1] >> 16 | append1[2] << 16;
8377 w2[1] = append1[2] >> 16 | append1[3] << 16;
8378 w2[2] = append1[3] >> 16 | append2[0] << 16;
8379 w2[3] = append2[0] >> 16;
// first target word w0[2], left shift 24
8383 w0[2] = w0[2] | append0[0] << 24;
8384 w0[3] = append0[0] >> 8 | append0[1] << 24;
8385 w1[0] = append0[1] >> 8 | append0[2] << 24;
8386 w1[1] = append0[2] >> 8 | append0[3] << 24;
8387 w1[2] = append0[3] >> 8 | append1[0] << 24;
8388 w1[3] = append1[0] >> 8 | append1[1] << 24;
8389 w2[0] = append1[1] >> 8 | append1[2] << 24;
8390 w2[1] = append1[2] >> 8 | append1[3] << 24;
8391 w2[2] = append1[3] >> 8 | append2[0] << 24;
8392 w2[3] = append2[0] >> 8;
// first target word w0[3], left shift 8; the tail spills into w3[0]
8408 w0[3] = w0[3] | append0[0] << 8;
8409 w1[0] = append0[0] >> 24 | append0[1] << 8;
8410 w1[1] = append0[1] >> 24 | append0[2] << 8;
8411 w1[2] = append0[2] >> 24 | append0[3] << 8;
8412 w1[3] = append0[3] >> 24 | append1[0] << 8;
8413 w2[0] = append1[0] >> 24 | append1[1] << 8;
8414 w2[1] = append1[1] >> 24 | append1[2] << 8;
8415 w2[2] = append1[2] >> 24 | append1[3] << 8;
8416 w2[3] = append1[3] >> 24 | append2[0] << 8;
8417 w3[0] = append2[0] >> 24;
// first target word w0[3], left shift 16; the tail spills into w3[0]
8421 w0[3] = w0[3] | append0[0] << 16;
8422 w1[0] = append0[0] >> 16 | append0[1] << 16;
8423 w1[1] = append0[1] >> 16 | append0[2] << 16;
8424 w1[2] = append0[2] >> 16 | append0[3] << 16;
8425 w1[3] = append0[3] >> 16 | append1[0] << 16;
8426 w2[0] = append1[0] >> 16 | append1[1] << 16;
8427 w2[1] = append1[1] >> 16 | append1[2] << 16;
8428 w2[2] = append1[2] >> 16 | append1[3] << 16;
8429 w2[3] = append1[3] >> 16 | append2[0] << 16;
8430 w3[0] = append2[0] >> 16;
// first target word w0[3], left shift 24; the tail spills into w3[0]
8434 w0[3] = w0[3] | append0[0] << 24;
8435 w1[0] = append0[0] >> 8 | append0[1] << 24;
8436 w1[1] = append0[1] >> 8 | append0[2] << 24;
8437 w1[2] = append0[2] >> 8 | append0[3] << 24;
8438 w1[3] = append0[3] >> 8 | append1[0] << 24;
8439 w2[0] = append1[0] >> 8 | append1[1] << 24;
8440 w2[1] = append1[1] >> 8 | append1[2] << 24;
8441 w2[2] = append1[2] >> 8 | append1[3] << 24;
8442 w2[3] = append1[3] >> 8 | append2[0] << 24;
8443 w3[0] = append2[0] >> 8;
8448 // before: memcat32_8
8449 static void memcat_c32_w4x4_a2x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset)
8465 w0[0] = w0[0] | append0[0] << 8;
8466 w0[1] = append0[0] >> 24 | append0[1] << 8;
8467 w0[2] = append0[1] >> 24 | append0[2] << 8;
8468 w0[3] = append0[2] >> 24 | append0[3] << 8;
8469 w1[0] = append0[3] >> 24 | append1[0] << 8;
8470 w1[1] = append1[0] >> 24 | append1[1] << 8;
8471 w1[2] = append1[1] >> 24 | append1[2] << 8;
8472 w1[3] = append1[2] >> 24 | append1[3] << 8;
8473 w2[0] = append1[3] >> 24;
8477 w0[0] = w0[0] | append0[0] << 16;
8478 w0[1] = append0[0] >> 16 | append0[1] << 16;
8479 w0[2] = append0[1] >> 16 | append0[2] << 16;
8480 w0[3] = append0[2] >> 16 | append0[3] << 16;
8481 w1[0] = append0[3] >> 16 | append1[0] << 16;
8482 w1[1] = append1[0] >> 16 | append1[1] << 16;
8483 w1[2] = append1[1] >> 16 | append1[2] << 16;
8484 w1[3] = append1[2] >> 16 | append1[3] << 16;
8485 w2[0] = append1[3] >> 16;
8489 w0[0] = w0[0] | append0[0] << 24;
8490 w0[1] = append0[0] >> 8 | append0[1] << 24;
8491 w0[2] = append0[1] >> 8 | append0[2] << 24;
8492 w0[3] = append0[2] >> 8 | append0[3] << 24;
8493 w1[0] = append0[3] >> 8 | append1[0] << 24;
8494 w1[1] = append1[0] >> 8 | append1[1] << 24;
8495 w1[2] = append1[1] >> 8 | append1[2] << 24;
8496 w1[3] = append1[2] >> 8 | append1[3] << 24;
8497 w2[0] = append1[3] >> 8;
8512 w0[1] = w0[1] | append0[0] << 8;
8513 w0[2] = append0[0] >> 24 | append0[1] << 8;
8514 w0[3] = append0[1] >> 24 | append0[2] << 8;
8515 w1[0] = append0[2] >> 24 | append0[3] << 8;
8516 w1[1] = append0[3] >> 24 | append1[0] << 8;
8517 w1[2] = append1[0] >> 24 | append1[1] << 8;
8518 w1[3] = append1[1] >> 24 | append1[2] << 8;
8519 w2[0] = append1[2] >> 24 | append1[3] << 8;
8520 w2[1] = append1[3] >> 24;
8524 w0[1] = w0[1] | append0[0] << 16;
8525 w0[2] = append0[0] >> 16 | append0[1] << 16;
8526 w0[3] = append0[1] >> 16 | append0[2] << 16;
8527 w1[0] = append0[2] >> 16 | append0[3] << 16;
8528 w1[1] = append0[3] >> 16 | append1[0] << 16;
8529 w1[2] = append1[0] >> 16 | append1[1] << 16;
8530 w1[3] = append1[1] >> 16 | append1[2] << 16;
8531 w2[0] = append1[2] >> 16 | append1[3] << 16;
8532 w2[1] = append1[3] >> 16;
8536 w0[1] = w0[1] | append0[0] << 24;
8537 w0[2] = append0[0] >> 8 | append0[1] << 24;
8538 w0[3] = append0[1] >> 8 | append0[2] << 24;
8539 w1[0] = append0[2] >> 8 | append0[3] << 24;
8540 w1[1] = append0[3] >> 8 | append1[0] << 24;
8541 w1[2] = append1[0] >> 8 | append1[1] << 24;
8542 w1[3] = append1[1] >> 8 | append1[2] << 24;
8543 w2[0] = append1[2] >> 8 | append1[3] << 24;
8544 w2[1] = append1[3] >> 8;
8559 w0[2] = w0[2] | append0[0] << 8;
8560 w0[3] = append0[0] >> 24 | append0[1] << 8;
8561 w1[0] = append0[1] >> 24 | append0[2] << 8;
8562 w1[1] = append0[2] >> 24 | append0[3] << 8;
8563 w1[2] = append0[3] >> 24 | append1[0] << 8;
8564 w1[3] = append1[0] >> 24 | append1[1] << 8;
8565 w2[0] = append1[1] >> 24 | append1[2] << 8;
8566 w2[1] = append1[2] >> 24 | append1[3] << 8;
8567 w2[2] = append1[3] >> 24;
8571 w0[2] = w0[2] | append0[0] << 16;
8572 w0[3] = append0[0] >> 16 | append0[1] << 16;
8573 w1[0] = append0[1] >> 16 | append0[2] << 16;
8574 w1[1] = append0[2] >> 16 | append0[3] << 16;
8575 w1[2] = append0[3] >> 16 | append1[0] << 16;
8576 w1[3] = append1[0] >> 16 | append1[1] << 16;
8577 w2[0] = append1[1] >> 16 | append1[2] << 16;
8578 w2[1] = append1[2] >> 16 | append1[3] << 16;
8579 w2[2] = append1[3] >> 16;
8583 w0[2] = w0[2] | append0[0] << 24;
8584 w0[3] = append0[0] >> 8 | append0[1] << 24;
8585 w1[0] = append0[1] >> 8 | append0[2] << 24;
8586 w1[1] = append0[2] >> 8 | append0[3] << 24;
8587 w1[2] = append0[3] >> 8 | append1[0] << 24;
8588 w1[3] = append1[0] >> 8 | append1[1] << 24;
8589 w2[0] = append1[1] >> 8 | append1[2] << 24;
8590 w2[1] = append1[2] >> 8 | append1[3] << 24;
8591 w2[2] = append1[3] >> 8;
8606 w0[3] = w0[3] | append0[0] << 8;
8607 w1[0] = append0[0] >> 24 | append0[1] << 8;
8608 w1[1] = append0[1] >> 24 | append0[2] << 8;
8609 w1[2] = append0[2] >> 24 | append0[3] << 8;
8610 w1[3] = append0[3] >> 24 | append1[0] << 8;
8611 w2[0] = append1[0] >> 24 | append1[1] << 8;
8612 w2[1] = append1[1] >> 24 | append1[2] << 8;
8613 w2[2] = append1[2] >> 24 | append1[3] << 8;
8614 w2[3] = append1[3] >> 24;
8618 w0[3] = w0[3] | append0[0] << 16;
8619 w1[0] = append0[0] >> 16 | append0[1] << 16;
8620 w1[1] = append0[1] >> 16 | append0[2] << 16;
8621 w1[2] = append0[2] >> 16 | append0[3] << 16;
8622 w1[3] = append0[3] >> 16 | append1[0] << 16;
8623 w2[0] = append1[0] >> 16 | append1[1] << 16;
8624 w2[1] = append1[1] >> 16 | append1[2] << 16;
8625 w2[2] = append1[2] >> 16 | append1[3] << 16;
8626 w2[3] = append1[3] >> 16;
8630 w0[3] = w0[3] | append0[0] << 24;
8631 w1[0] = append0[0] >> 8 | append0[1] << 24;
8632 w1[1] = append0[1] >> 8 | append0[2] << 24;
8633 w1[2] = append0[2] >> 8 | append0[3] << 24;
8634 w1[3] = append0[3] >> 8 | append1[0] << 24;
8635 w2[0] = append1[0] >> 8 | append1[1] << 24;
8636 w2[1] = append1[1] >> 8 | append1[2] << 24;
8637 w2[2] = append1[2] >> 8 | append1[3] << 24;
8638 w2[3] = append1[3] >> 8;
8653 w1[0] = w1[0] | append0[0] << 8;
8654 w1[1] = append0[0] >> 24 | append0[1] << 8;
8655 w1[2] = append0[1] >> 24 | append0[2] << 8;
8656 w1[3] = append0[2] >> 24 | append0[3] << 8;
8657 w2[0] = append0[3] >> 24 | append1[0] << 8;
8658 w2[1] = append1[0] >> 24 | append1[1] << 8;
8659 w2[2] = append1[1] >> 24 | append1[2] << 8;
8660 w2[3] = append1[2] >> 24 | append1[3] << 8;
8661 w3[0] = append1[3] >> 24;
8665 w1[0] = w1[0] | append0[0] << 16;
8666 w1[1] = append0[0] >> 16 | append0[1] << 16;
8667 w1[2] = append0[1] >> 16 | append0[2] << 16;
8668 w1[3] = append0[2] >> 16 | append0[3] << 16;
8669 w2[0] = append0[3] >> 16 | append1[0] << 16;
8670 w2[1] = append1[0] >> 16 | append1[1] << 16;
8671 w2[2] = append1[1] >> 16 | append1[2] << 16;
8672 w2[3] = append1[2] >> 16 | append1[3] << 16;
8673 w3[0] = append1[3] >> 16;
8677 w1[0] = w1[0] | append0[0] << 24;
8678 w1[1] = append0[0] >> 8 | append0[1] << 24;
8679 w1[2] = append0[1] >> 8 | append0[2] << 24;
8680 w1[3] = append0[2] >> 8 | append0[3] << 24;
8681 w2[0] = append0[3] >> 8 | append1[0] << 24;
8682 w2[1] = append1[0] >> 8 | append1[1] << 24;
8683 w2[2] = append1[1] >> 8 | append1[2] << 24;
8684 w2[3] = append1[2] >> 8 | append1[3] << 24;
8685 w3[0] = append1[3] >> 8;
8700 w1[1] = w1[1] | append0[0] << 8;
8701 w1[2] = append0[0] >> 24 | append0[1] << 8;
8702 w1[3] = append0[1] >> 24 | append0[2] << 8;
8703 w2[0] = append0[2] >> 24 | append0[3] << 8;
8704 w2[1] = append0[3] >> 24 | append1[0] << 8;
8705 w2[2] = append1[0] >> 24 | append1[1] << 8;
8706 w2[3] = append1[1] >> 24 | append1[2] << 8;
8707 w3[0] = append1[2] >> 24 | append1[3] << 8;
8708 w3[1] = append1[3] >> 24;
8712 w1[1] = w1[1] | append0[0] << 16;
8713 w1[2] = append0[0] >> 16 | append0[1] << 16;
8714 w1[3] = append0[1] >> 16 | append0[2] << 16;
8715 w2[0] = append0[2] >> 16 | append0[3] << 16;
8716 w2[1] = append0[3] >> 16 | append1[0] << 16;
8717 w2[2] = append1[0] >> 16 | append1[1] << 16;
8718 w2[3] = append1[1] >> 16 | append1[2] << 16;
8719 w3[0] = append1[2] >> 16 | append1[3] << 16;
8720 w3[1] = append1[3] >> 16;
8724 w1[1] = w1[1] | append0[0] << 24;
8725 w1[2] = append0[0] >> 8 | append0[1] << 24;
8726 w1[3] = append0[1] >> 8 | append0[2] << 24;
8727 w2[0] = append0[2] >> 8 | append0[3] << 24;
8728 w2[1] = append0[3] >> 8 | append1[0] << 24;
8729 w2[2] = append1[0] >> 8 | append1[1] << 24;
8730 w2[3] = append1[1] >> 8 | append1[2] << 24;
8731 w3[0] = append1[2] >> 8 | append1[3] << 24;
8732 w3[1] = append1[3] >> 8;
8747 w1[2] = w1[2] | append0[0] << 8;
8748 w1[3] = append0[0] >> 24 | append0[1] << 8;
8749 w2[0] = append0[1] >> 24 | append0[2] << 8;
8750 w2[1] = append0[2] >> 24 | append0[3] << 8;
8751 w2[2] = append0[3] >> 24 | append1[0] << 8;
8752 w2[3] = append1[0] >> 24 | append1[1] << 8;
8753 w3[0] = append1[1] >> 24 | append1[2] << 8;
8754 w3[1] = append1[2] >> 24 | append1[3] << 8;
8758 w1[2] = w1[2] | append0[0] << 16;
8759 w1[3] = append0[0] >> 16 | append0[1] << 16;
8760 w2[0] = append0[1] >> 16 | append0[2] << 16;
8761 w2[1] = append0[2] >> 16 | append0[3] << 16;
8762 w2[2] = append0[3] >> 16 | append1[0] << 16;
8763 w2[3] = append1[0] >> 16 | append1[1] << 16;
8764 w3[0] = append1[1] >> 16 | append1[2] << 16;
8765 w3[1] = append1[2] >> 16 | append1[3] << 16;
8769 w1[2] = w1[2] | append0[0] << 24;
8770 w1[3] = append0[0] >> 8 | append0[1] << 24;
8771 w2[0] = append0[1] >> 8 | append0[2] << 24;
8772 w2[1] = append0[2] >> 8 | append0[3] << 24;
8773 w2[2] = append0[3] >> 8 | append1[0] << 24;
8774 w2[3] = append1[0] >> 8 | append1[1] << 24;
8775 w3[0] = append1[1] >> 8 | append1[2] << 24;
8776 w3[1] = append1[2] >> 8 | append1[3] << 24;
8790 w1[3] = w1[3] | append0[0] << 8;
8791 w2[0] = append0[0] >> 24 | append0[1] << 8;
8792 w2[1] = append0[1] >> 24 | append0[2] << 8;
8793 w2[2] = append0[2] >> 24 | append0[3] << 8;
8794 w2[3] = append0[3] >> 24 | append1[0] << 8;
8795 w3[0] = append1[0] >> 24 | append1[1] << 8;
8796 w3[1] = append1[1] >> 24 | append1[2] << 8;
8800 w1[3] = w1[3] | append0[0] << 16;
8801 w2[0] = append0[0] >> 16 | append0[1] << 16;
8802 w2[1] = append0[1] >> 16 | append0[2] << 16;
8803 w2[2] = append0[2] >> 16 | append0[3] << 16;
8804 w2[3] = append0[3] >> 16 | append1[0] << 16;
8805 w3[0] = append1[0] >> 16 | append1[1] << 16;
8806 w3[1] = append1[1] >> 16 | append1[2] << 16;
8810 w1[3] = w1[3] | append0[0] << 24;
8811 w2[0] = append0[0] >> 8 | append0[1] << 24;
8812 w2[1] = append0[1] >> 8 | append0[2] << 24;
8813 w2[2] = append0[2] >> 8 | append0[3] << 24;
8814 w2[3] = append0[3] >> 8 | append1[0] << 24;
8815 w3[0] = append1[0] >> 8 | append1[1] << 24;
8816 w3[1] = append1[1] >> 8 | append1[2] << 24;
8830 // formerly named: memcat32_9
8831 static void memcat_c32_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset)
8848 w0[0] = w0[0] | append0[0] << 8;
8849 w0[1] = append0[0] >> 24 | append0[1] << 8;
8850 w0[2] = append0[1] >> 24 | append0[2] << 8;
8851 w0[3] = append0[2] >> 24 | append0[3] << 8;
8852 w1[0] = append0[3] >> 24 | append1[0] << 8;
8853 w1[1] = append1[0] >> 24 | append1[1] << 8;
8854 w1[2] = append1[1] >> 24 | append1[2] << 8;
8855 w1[3] = append1[2] >> 24 | append1[3] << 8;
8856 w2[0] = append1[3] >> 24 | append2[0] << 8;
8857 w2[1] = append2[0] >> 24;
8861 w0[0] = w0[0] | append0[0] << 16;
8862 w0[1] = append0[0] >> 16 | append0[1] << 16;
8863 w0[2] = append0[1] >> 16 | append0[2] << 16;
8864 w0[3] = append0[2] >> 16 | append0[3] << 16;
8865 w1[0] = append0[3] >> 16 | append1[0] << 16;
8866 w1[1] = append1[0] >> 16 | append1[1] << 16;
8867 w1[2] = append1[1] >> 16 | append1[2] << 16;
8868 w1[3] = append1[2] >> 16 | append1[3] << 16;
8869 w2[0] = append1[3] >> 16 | append2[0] << 16;
8870 w2[1] = append2[0] >> 16;
8874 w0[0] = w0[0] | append0[0] << 24;
8875 w0[1] = append0[0] >> 8 | append0[1] << 24;
8876 w0[2] = append0[1] >> 8 | append0[2] << 24;
8877 w0[3] = append0[2] >> 8 | append0[3] << 24;
8878 w1[0] = append0[3] >> 8 | append1[0] << 24;
8879 w1[1] = append1[0] >> 8 | append1[1] << 24;
8880 w1[2] = append1[1] >> 8 | append1[2] << 24;
8881 w1[3] = append1[2] >> 8 | append1[3] << 24;
8882 w2[0] = append1[3] >> 8 | append2[0] << 24;
8883 w2[1] = append2[0] >> 8;
8899 w0[1] = w0[1] | append0[0] << 8;
8900 w0[2] = append0[0] >> 24 | append0[1] << 8;
8901 w0[3] = append0[1] >> 24 | append0[2] << 8;
8902 w1[0] = append0[2] >> 24 | append0[3] << 8;
8903 w1[1] = append0[3] >> 24 | append1[0] << 8;
8904 w1[2] = append1[0] >> 24 | append1[1] << 8;
8905 w1[3] = append1[1] >> 24 | append1[2] << 8;
8906 w2[0] = append1[2] >> 24 | append1[3] << 8;
8907 w2[1] = append1[3] >> 24 | append2[0] << 8;
8908 w2[2] = append2[0] >> 24;
8912 w0[1] = w0[1] | append0[0] << 16;
8913 w0[2] = append0[0] >> 16 | append0[1] << 16;
8914 w0[3] = append0[1] >> 16 | append0[2] << 16;
8915 w1[0] = append0[2] >> 16 | append0[3] << 16;
8916 w1[1] = append0[3] >> 16 | append1[0] << 16;
8917 w1[2] = append1[0] >> 16 | append1[1] << 16;
8918 w1[3] = append1[1] >> 16 | append1[2] << 16;
8919 w2[0] = append1[2] >> 16 | append1[3] << 16;
8920 w2[1] = append1[3] >> 16 | append2[0] << 16;
8921 w2[2] = append2[0] >> 16;
8925 w0[1] = w0[1] | append0[0] << 24;
8926 w0[2] = append0[0] >> 8 | append0[1] << 24;
8927 w0[3] = append0[1] >> 8 | append0[2] << 24;
8928 w1[0] = append0[2] >> 8 | append0[3] << 24;
8929 w1[1] = append0[3] >> 8 | append1[0] << 24;
8930 w1[2] = append1[0] >> 8 | append1[1] << 24;
8931 w1[3] = append1[1] >> 8 | append1[2] << 24;
8932 w2[0] = append1[2] >> 8 | append1[3] << 24;
8933 w2[1] = append1[3] >> 8 | append2[0] << 24;
8934 w2[2] = append2[0] >> 8;
8950 w0[2] = w0[2] | append0[0] << 8;
8951 w0[3] = append0[0] >> 24 | append0[1] << 8;
8952 w1[0] = append0[1] >> 24 | append0[2] << 8;
8953 w1[1] = append0[2] >> 24 | append0[3] << 8;
8954 w1[2] = append0[3] >> 24 | append1[0] << 8;
8955 w1[3] = append1[0] >> 24 | append1[1] << 8;
8956 w2[0] = append1[1] >> 24 | append1[2] << 8;
8957 w2[1] = append1[2] >> 24 | append1[3] << 8;
8958 w2[2] = append1[3] >> 24 | append2[0] << 8;
8959 w2[3] = append2[0] >> 24;
8963 w0[2] = w0[2] | append0[0] << 16;
8964 w0[3] = append0[0] >> 16 | append0[1] << 16;
8965 w1[0] = append0[1] >> 16 | append0[2] << 16;
8966 w1[1] = append0[2] >> 16 | append0[3] << 16;
8967 w1[2] = append0[3] >> 16 | append1[0] << 16;
8968 w1[3] = append1[0] >> 16 | append1[1] << 16;
8969 w2[0] = append1[1] >> 16 | append1[2] << 16;
8970 w2[1] = append1[2] >> 16 | append1[3] << 16;
8971 w2[2] = append1[3] >> 16 | append2[0] << 16;
8972 w2[3] = append2[0] >> 16;
8976 w0[2] = w0[2] | append0[0] << 24;
8977 w0[3] = append0[0] >> 8 | append0[1] << 24;
8978 w1[0] = append0[1] >> 8 | append0[2] << 24;
8979 w1[1] = append0[2] >> 8 | append0[3] << 24;
8980 w1[2] = append0[3] >> 8 | append1[0] << 24;
8981 w1[3] = append1[0] >> 8 | append1[1] << 24;
8982 w2[0] = append1[1] >> 8 | append1[2] << 24;
8983 w2[1] = append1[2] >> 8 | append1[3] << 24;
8984 w2[2] = append1[3] >> 8 | append2[0] << 24;
8985 w2[3] = append2[0] >> 8;
9001 w0[3] = w0[3] | append0[0] << 8;
9002 w1[0] = append0[0] >> 24 | append0[1] << 8;
9003 w1[1] = append0[1] >> 24 | append0[2] << 8;
9004 w1[2] = append0[2] >> 24 | append0[3] << 8;
9005 w1[3] = append0[3] >> 24 | append1[0] << 8;
9006 w2[0] = append1[0] >> 24 | append1[1] << 8;
9007 w2[1] = append1[1] >> 24 | append1[2] << 8;
9008 w2[2] = append1[2] >> 24 | append1[3] << 8;
9009 w2[3] = append1[3] >> 24 | append2[0] << 8;
9010 w3[0] = append2[0] >> 24;
9014 w0[3] = w0[3] | append0[0] << 16;
9015 w1[0] = append0[0] >> 16 | append0[1] << 16;
9016 w1[1] = append0[1] >> 16 | append0[2] << 16;
9017 w1[2] = append0[2] >> 16 | append0[3] << 16;
9018 w1[3] = append0[3] >> 16 | append1[0] << 16;
9019 w2[0] = append1[0] >> 16 | append1[1] << 16;
9020 w2[1] = append1[1] >> 16 | append1[2] << 16;
9021 w2[2] = append1[2] >> 16 | append1[3] << 16;
9022 w2[3] = append1[3] >> 16 | append2[0] << 16;
9023 w3[0] = append2[0] >> 16;
9027 w0[3] = w0[3] | append0[0] << 24;
9028 w1[0] = append0[0] >> 8 | append0[1] << 24;
9029 w1[1] = append0[1] >> 8 | append0[2] << 24;
9030 w1[2] = append0[2] >> 8 | append0[3] << 24;
9031 w1[3] = append0[3] >> 8 | append1[0] << 24;
9032 w2[0] = append1[0] >> 8 | append1[1] << 24;
9033 w2[1] = append1[1] >> 8 | append1[2] << 24;
9034 w2[2] = append1[2] >> 8 | append1[3] << 24;
9035 w2[3] = append1[3] >> 8 | append2[0] << 24;
9036 w3[0] = append2[0] >> 8;
9052 w1[0] = w1[0] | append0[0] << 8;
9053 w1[1] = append0[0] >> 24 | append0[1] << 8;
9054 w1[2] = append0[1] >> 24 | append0[2] << 8;
9055 w1[3] = append0[2] >> 24 | append0[3] << 8;
9056 w2[0] = append0[3] >> 24 | append1[0] << 8;
9057 w2[1] = append1[0] >> 24 | append1[1] << 8;
9058 w2[2] = append1[1] >> 24 | append1[2] << 8;
9059 w2[3] = append1[2] >> 24 | append1[3] << 8;
9060 w3[0] = append1[3] >> 24 | append2[0] << 8;
9061 w3[1] = append2[0] >> 24;
9065 w1[0] = w1[0] | append0[0] << 16;
9066 w1[1] = append0[0] >> 16 | append0[1] << 16;
9067 w1[2] = append0[1] >> 16 | append0[2] << 16;
9068 w1[3] = append0[2] >> 16 | append0[3] << 16;
9069 w2[0] = append0[3] >> 16 | append1[0] << 16;
9070 w2[1] = append1[0] >> 16 | append1[1] << 16;
9071 w2[2] = append1[1] >> 16 | append1[2] << 16;
9072 w2[3] = append1[2] >> 16 | append1[3] << 16;
9073 w3[0] = append1[3] >> 16 | append2[0] << 16;
9074 w3[1] = append2[0] >> 16;
9078 w1[0] = w1[0] | append0[0] << 24;
9079 w1[1] = append0[0] >> 8 | append0[1] << 24;
9080 w1[2] = append0[1] >> 8 | append0[2] << 24;
9081 w1[3] = append0[2] >> 8 | append0[3] << 24;
9082 w2[0] = append0[3] >> 8 | append1[0] << 24;
9083 w2[1] = append1[0] >> 8 | append1[1] << 24;
9084 w2[2] = append1[1] >> 8 | append1[2] << 24;
9085 w2[3] = append1[2] >> 8 | append1[3] << 24;
9086 w3[0] = append1[3] >> 8 | append2[0] << 24;
9087 w3[1] = append2[0] >> 8;
9103 w1[1] = w1[1] | append0[0] << 8;
9104 w1[2] = append0[0] >> 24 | append0[1] << 8;
9105 w1[3] = append0[1] >> 24 | append0[2] << 8;
9106 w2[0] = append0[2] >> 24 | append0[3] << 8;
9107 w2[1] = append0[3] >> 24 | append1[0] << 8;
9108 w2[2] = append1[0] >> 24 | append1[1] << 8;
9109 w2[3] = append1[1] >> 24 | append1[2] << 8;
9110 w3[0] = append1[2] >> 24 | append1[3] << 8;
9111 w3[1] = append1[3] >> 24 | append2[0] << 8;
9115 w1[1] = w1[1] | append0[0] << 16;
9116 w1[2] = append0[0] >> 16 | append0[1] << 16;
9117 w1[3] = append0[1] >> 16 | append0[2] << 16;
9118 w2[0] = append0[2] >> 16 | append0[3] << 16;
9119 w2[1] = append0[3] >> 16 | append1[0] << 16;
9120 w2[2] = append1[0] >> 16 | append1[1] << 16;
9121 w2[3] = append1[1] >> 16 | append1[2] << 16;
9122 w3[0] = append1[2] >> 16 | append1[3] << 16;
9123 w3[1] = append1[3] >> 16 | append2[0] << 16;
9127 w1[1] = w1[1] | append0[0] << 24;
9128 w1[2] = append0[0] >> 8 | append0[1] << 24;
9129 w1[3] = append0[1] >> 8 | append0[2] << 24;
9130 w2[0] = append0[2] >> 8 | append0[3] << 24;
9131 w2[1] = append0[3] >> 8 | append1[0] << 24;
9132 w2[2] = append1[0] >> 8 | append1[1] << 24;
9133 w2[3] = append1[1] >> 8 | append1[2] << 24;
9134 w3[0] = append1[2] >> 8 | append1[3] << 24;
9135 w3[1] = append1[3] >> 8 | append2[0] << 24;
9150 w1[2] = w1[2] | append0[0] << 8;
9151 w1[3] = append0[0] >> 24 | append0[1] << 8;
9152 w2[0] = append0[1] >> 24 | append0[2] << 8;
9153 w2[1] = append0[2] >> 24 | append0[3] << 8;
9154 w2[2] = append0[3] >> 24 | append1[0] << 8;
9155 w2[3] = append1[0] >> 24 | append1[1] << 8;
9156 w3[0] = append1[1] >> 24 | append1[2] << 8;
9157 w3[1] = append1[2] >> 24 | append1[3] << 8;
9161 w1[2] = w1[2] | append0[0] << 16;
9162 w1[3] = append0[0] >> 16 | append0[1] << 16;
9163 w2[0] = append0[1] >> 16 | append0[2] << 16;
9164 w2[1] = append0[2] >> 16 | append0[3] << 16;
9165 w2[2] = append0[3] >> 16 | append1[0] << 16;
9166 w2[3] = append1[0] >> 16 | append1[1] << 16;
9167 w3[0] = append1[1] >> 16 | append1[2] << 16;
9168 w3[1] = append1[2] >> 16 | append1[3] << 16;
9172 w1[2] = w1[2] | append0[0] << 24;
9173 w1[3] = append0[0] >> 8 | append0[1] << 24;
9174 w2[0] = append0[1] >> 8 | append0[2] << 24;
9175 w2[1] = append0[2] >> 8 | append0[3] << 24;
9176 w2[2] = append0[3] >> 8 | append1[0] << 24;
9177 w2[3] = append1[0] >> 8 | append1[1] << 24;
9178 w3[0] = append1[1] >> 8 | append1[2] << 24;
9179 w3[1] = append1[2] >> 8 | append1[3] << 24;
9193 w1[3] = w1[3] | append0[0] << 8;
9194 w2[0] = append0[0] >> 24 | append0[1] << 8;
9195 w2[1] = append0[1] >> 24 | append0[2] << 8;
9196 w2[2] = append0[2] >> 24 | append0[3] << 8;
9197 w2[3] = append0[3] >> 24 | append1[0] << 8;
9198 w3[0] = append1[0] >> 24 | append1[1] << 8;
9199 w3[1] = append1[1] >> 24 | append1[2] << 8;
9203 w1[3] = w1[3] | append0[0] << 16;
9204 w2[0] = append0[0] >> 16 | append0[1] << 16;
9205 w2[1] = append0[1] >> 16 | append0[2] << 16;
9206 w2[2] = append0[2] >> 16 | append0[3] << 16;
9207 w2[3] = append0[3] >> 16 | append1[0] << 16;
9208 w3[0] = append1[0] >> 16 | append1[1] << 16;
9209 w3[1] = append1[1] >> 16 | append1[2] << 16;
9213 w1[3] = w1[3] | append0[0] << 24;
9214 w2[0] = append0[0] >> 8 | append0[1] << 24;
9215 w2[1] = append0[1] >> 8 | append0[2] << 24;
9216 w2[2] = append0[2] >> 8 | append0[3] << 24;
9217 w2[3] = append0[3] >> 8 | append1[0] << 24;
9218 w3[0] = append1[0] >> 8 | append1[1] << 24;
9219 w3[1] = append1[1] >> 8 | append1[2] << 24;