2 * Author......: Jens Steube <jens.steube@gmail.com>
7 * pure scalar functions
10 static int hash_comp (const u32 d1
[4], __global u32
*d2
)
12 if (d1
[3] > d2
[DGST_R3
]) return ( 1);
13 if (d1
[3] < d2
[DGST_R3
]) return (-1);
14 if (d1
[2] > d2
[DGST_R2
]) return ( 1);
15 if (d1
[2] < d2
[DGST_R2
]) return (-1);
16 if (d1
[1] > d2
[DGST_R1
]) return ( 1);
17 if (d1
[1] < d2
[DGST_R1
]) return (-1);
18 if (d1
[0] > d2
[DGST_R0
]) return ( 1);
19 if (d1
[0] < d2
[DGST_R0
]) return (-1);
24 static int find_hash (const u32 digest
[4], const u32 digests_cnt
, __global digest_t
*digests_buf
)
26 for (u32 l
= 0, r
= digests_cnt
; r
; r
>>= 1)
32 const int cmp
= hash_comp (digest
, digests_buf
[c
].digest_buf
);
41 if (cmp
== 0) return (c
);
47 static u32
check_bitmap (__global u32
*bitmap
, const u32 bitmap_mask
, const u32 bitmap_shift
, const u32 digest
)
49 return (bitmap
[(digest
>> bitmap_shift
) & bitmap_mask
] & (1 << (digest
& 0x1f)));
52 static u32
check (const u32 digest
[2], __global u32
*bitmap_s1_a
, __global u32
*bitmap_s1_b
, __global u32
*bitmap_s1_c
, __global u32
*bitmap_s1_d
, __global u32
*bitmap_s2_a
, __global u32
*bitmap_s2_b
, __global u32
*bitmap_s2_c
, __global u32
*bitmap_s2_d
, const u32 bitmap_mask
, const u32 bitmap_shift1
, const u32 bitmap_shift2
)
54 if (check_bitmap (bitmap_s1_a
, bitmap_mask
, bitmap_shift1
, digest
[0]) == 0) return (0);
55 if (check_bitmap (bitmap_s1_b
, bitmap_mask
, bitmap_shift1
, digest
[1]) == 0) return (0);
56 if (check_bitmap (bitmap_s1_c
, bitmap_mask
, bitmap_shift1
, digest
[2]) == 0) return (0);
57 if (check_bitmap (bitmap_s1_d
, bitmap_mask
, bitmap_shift1
, digest
[3]) == 0) return (0);
59 if (check_bitmap (bitmap_s2_a
, bitmap_mask
, bitmap_shift2
, digest
[0]) == 0) return (0);
60 if (check_bitmap (bitmap_s2_b
, bitmap_mask
, bitmap_shift2
, digest
[1]) == 0) return (0);
61 if (check_bitmap (bitmap_s2_c
, bitmap_mask
, bitmap_shift2
, digest
[2]) == 0) return (0);
62 if (check_bitmap (bitmap_s2_d
, bitmap_mask
, bitmap_shift2
, digest
[3]) == 0) return (0);
67 static void mark_hash (__global plain_t
*plains_buf
, __global u32
*hashes_shown
, const int hash_pos
, const u32 gid
, const u32 il_pos
)
69 hashes_shown
[hash_pos
] = 1;
71 plains_buf
[hash_pos
].gidvid
= (gid
* 1) + 0;
72 plains_buf
[hash_pos
].il_pos
= il_pos
;
79 static void truncate_block (u32x w
[4], const u32 len
)
88 case 1: w
[0] &= 0x000000FF;
93 case 2: w
[0] &= 0x0000FFFF;
98 case 3: w
[0] &= 0x00FFFFFF;
107 case 5: w
[1] &= 0x000000FF;
111 case 6: w
[1] &= 0x0000FFFF;
115 case 7: w
[1] &= 0x00FFFFFF;
122 case 9: w
[2] &= 0x000000FF;
125 case 10: w
[2] &= 0x0000FFFF;
128 case 11: w
[2] &= 0x00FFFFFF;
133 case 13: w
[3] &= 0x000000FF;
135 case 14: w
[3] &= 0x0000FFFF;
137 case 15: w
[3] &= 0x00FFFFFF;
142 static void make_unicode (const u32x in
[4], u32x out1
[4], u32x out2
[4])
145 out2
[3] = __byte_perm (in
[3], 0, 0x7372);
146 out2
[2] = __byte_perm (in
[3], 0, 0x7170);
147 out2
[1] = __byte_perm (in
[2], 0, 0x7372);
148 out2
[0] = __byte_perm (in
[2], 0, 0x7170);
149 out1
[3] = __byte_perm (in
[1], 0, 0x7372);
150 out1
[2] = __byte_perm (in
[1], 0, 0x7170);
151 out1
[1] = __byte_perm (in
[0], 0, 0x7372);
152 out1
[0] = __byte_perm (in
[0], 0, 0x7170);
155 #if defined IS_AMD || defined IS_GENERIC
156 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
157 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
158 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
159 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
160 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
161 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
162 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
163 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
167 static void undo_unicode (const u32x in1
[4], const u32x in2
[4], u32x out
[4])
170 out
[0] = __byte_perm (in1
[0], in1
[1], 0x6420);
171 out
[1] = __byte_perm (in1
[2], in1
[3], 0x6420);
172 out
[2] = __byte_perm (in2
[0], in2
[1], 0x6420);
173 out
[3] = __byte_perm (in2
[2], in2
[3], 0x6420);
176 #if defined IS_AMD || defined IS_GENERIC
177 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
178 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
179 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
180 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
181 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
182 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
183 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
184 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
188 static void append_0x01_1x4 (u32x w0
[4], const u32 offset
)
197 w0
[0] = w0
[0] | 0x0100;
201 w0
[0] = w0
[0] | 0x010000;
205 w0
[0] = w0
[0] | 0x01000000;
213 w0
[1] = w0
[1] | 0x0100;
217 w0
[1] = w0
[1] | 0x010000;
221 w0
[1] = w0
[1] | 0x01000000;
229 w0
[2] = w0
[2] | 0x0100;
233 w0
[2] = w0
[2] | 0x010000;
237 w0
[2] = w0
[2] | 0x01000000;
245 w0
[3] = w0
[3] | 0x0100;
249 w0
[3] = w0
[3] | 0x010000;
253 w0
[3] = w0
[3] | 0x01000000;
258 static void append_0x01_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
267 w0
[0] = w0
[0] | 0x0100;
271 w0
[0] = w0
[0] | 0x010000;
275 w0
[0] = w0
[0] | 0x01000000;
283 w0
[1] = w0
[1] | 0x0100;
287 w0
[1] = w0
[1] | 0x010000;
291 w0
[1] = w0
[1] | 0x01000000;
299 w0
[2] = w0
[2] | 0x0100;
303 w0
[2] = w0
[2] | 0x010000;
307 w0
[2] = w0
[2] | 0x01000000;
315 w0
[3] = w0
[3] | 0x0100;
319 w0
[3] = w0
[3] | 0x010000;
323 w0
[3] = w0
[3] | 0x01000000;
331 w1
[0] = w1
[0] | 0x0100;
335 w1
[0] = w1
[0] | 0x010000;
339 w1
[0] = w1
[0] | 0x01000000;
347 w1
[1] = w1
[1] | 0x0100;
351 w1
[1] = w1
[1] | 0x010000;
355 w1
[1] = w1
[1] | 0x01000000;
363 w1
[2] = w1
[2] | 0x0100;
367 w1
[2] = w1
[2] | 0x010000;
371 w1
[2] = w1
[2] | 0x01000000;
379 w1
[3] = w1
[3] | 0x0100;
383 w1
[3] = w1
[3] | 0x010000;
387 w1
[3] = w1
[3] | 0x01000000;
392 static void append_0x01_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
401 w0
[0] = w0
[0] | 0x0100;
405 w0
[0] = w0
[0] | 0x010000;
409 w0
[0] = w0
[0] | 0x01000000;
417 w0
[1] = w0
[1] | 0x0100;
421 w0
[1] = w0
[1] | 0x010000;
425 w0
[1] = w0
[1] | 0x01000000;
433 w0
[2] = w0
[2] | 0x0100;
437 w0
[2] = w0
[2] | 0x010000;
441 w0
[2] = w0
[2] | 0x01000000;
449 w0
[3] = w0
[3] | 0x0100;
453 w0
[3] = w0
[3] | 0x010000;
457 w0
[3] = w0
[3] | 0x01000000;
465 w1
[0] = w1
[0] | 0x0100;
469 w1
[0] = w1
[0] | 0x010000;
473 w1
[0] = w1
[0] | 0x01000000;
481 w1
[1] = w1
[1] | 0x0100;
485 w1
[1] = w1
[1] | 0x010000;
489 w1
[1] = w1
[1] | 0x01000000;
497 w1
[2] = w1
[2] | 0x0100;
501 w1
[2] = w1
[2] | 0x010000;
505 w1
[2] = w1
[2] | 0x01000000;
513 w1
[3] = w1
[3] | 0x0100;
517 w1
[3] = w1
[3] | 0x010000;
521 w1
[3] = w1
[3] | 0x01000000;
529 w2
[0] = w2
[0] | 0x0100;
533 w2
[0] = w2
[0] | 0x010000;
537 w2
[0] = w2
[0] | 0x01000000;
545 w2
[1] = w2
[1] | 0x0100;
549 w2
[1] = w2
[1] | 0x010000;
553 w2
[1] = w2
[1] | 0x01000000;
561 w2
[2] = w2
[2] | 0x0100;
565 w2
[2] = w2
[2] | 0x010000;
569 w2
[2] = w2
[2] | 0x01000000;
577 w2
[3] = w2
[3] | 0x0100;
581 w2
[3] = w2
[3] | 0x010000;
585 w2
[3] = w2
[3] | 0x01000000;
590 static void append_0x01_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
599 w0
[0] = w0
[0] | 0x0100;
603 w0
[0] = w0
[0] | 0x010000;
607 w0
[0] = w0
[0] | 0x01000000;
615 w0
[1] = w0
[1] | 0x0100;
619 w0
[1] = w0
[1] | 0x010000;
623 w0
[1] = w0
[1] | 0x01000000;
631 w0
[2] = w0
[2] | 0x0100;
635 w0
[2] = w0
[2] | 0x010000;
639 w0
[2] = w0
[2] | 0x01000000;
647 w0
[3] = w0
[3] | 0x0100;
651 w0
[3] = w0
[3] | 0x010000;
655 w0
[3] = w0
[3] | 0x01000000;
663 w1
[0] = w1
[0] | 0x0100;
667 w1
[0] = w1
[0] | 0x010000;
671 w1
[0] = w1
[0] | 0x01000000;
679 w1
[1] = w1
[1] | 0x0100;
683 w1
[1] = w1
[1] | 0x010000;
687 w1
[1] = w1
[1] | 0x01000000;
695 w1
[2] = w1
[2] | 0x0100;
699 w1
[2] = w1
[2] | 0x010000;
703 w1
[2] = w1
[2] | 0x01000000;
711 w1
[3] = w1
[3] | 0x0100;
715 w1
[3] = w1
[3] | 0x010000;
719 w1
[3] = w1
[3] | 0x01000000;
727 w2
[0] = w2
[0] | 0x0100;
731 w2
[0] = w2
[0] | 0x010000;
735 w2
[0] = w2
[0] | 0x01000000;
743 w2
[1] = w2
[1] | 0x0100;
747 w2
[1] = w2
[1] | 0x010000;
751 w2
[1] = w2
[1] | 0x01000000;
759 w2
[2] = w2
[2] | 0x0100;
763 w2
[2] = w2
[2] | 0x010000;
767 w2
[2] = w2
[2] | 0x01000000;
775 w2
[3] = w2
[3] | 0x0100;
779 w2
[3] = w2
[3] | 0x010000;
783 w2
[3] = w2
[3] | 0x01000000;
791 w3
[0] = w3
[0] | 0x0100;
795 w3
[0] = w3
[0] | 0x010000;
799 w3
[0] = w3
[0] | 0x01000000;
807 w3
[1] = w3
[1] | 0x0100;
811 w3
[1] = w3
[1] | 0x010000;
815 w3
[1] = w3
[1] | 0x01000000;
823 w3
[2] = w3
[2] | 0x0100;
827 w3
[2] = w3
[2] | 0x010000;
831 w3
[2] = w3
[2] | 0x01000000;
839 w3
[3] = w3
[3] | 0x0100;
843 w3
[3] = w3
[3] | 0x010000;
847 w3
[3] = w3
[3] | 0x01000000;
852 static void append_0x01_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
861 w0
[0] = w0
[0] | 0x0100;
865 w0
[0] = w0
[0] | 0x010000;
869 w0
[0] = w0
[0] | 0x01000000;
877 w0
[1] = w0
[1] | 0x0100;
881 w0
[1] = w0
[1] | 0x010000;
885 w0
[1] = w0
[1] | 0x01000000;
893 w0
[2] = w0
[2] | 0x0100;
897 w0
[2] = w0
[2] | 0x010000;
901 w0
[2] = w0
[2] | 0x01000000;
909 w0
[3] = w0
[3] | 0x0100;
913 w0
[3] = w0
[3] | 0x010000;
917 w0
[3] = w0
[3] | 0x01000000;
925 w1
[0] = w1
[0] | 0x0100;
929 w1
[0] = w1
[0] | 0x010000;
933 w1
[0] = w1
[0] | 0x01000000;
941 w1
[1] = w1
[1] | 0x0100;
945 w1
[1] = w1
[1] | 0x010000;
949 w1
[1] = w1
[1] | 0x01000000;
957 w1
[2] = w1
[2] | 0x0100;
961 w1
[2] = w1
[2] | 0x010000;
965 w1
[2] = w1
[2] | 0x01000000;
973 w1
[3] = w1
[3] | 0x0100;
977 w1
[3] = w1
[3] | 0x010000;
981 w1
[3] = w1
[3] | 0x01000000;
989 w2
[0] = w2
[0] | 0x0100;
993 w2
[0] = w2
[0] | 0x010000;
997 w2
[0] = w2
[0] | 0x01000000;
1005 w2
[1] = w2
[1] | 0x0100;
1009 w2
[1] = w2
[1] | 0x010000;
1013 w2
[1] = w2
[1] | 0x01000000;
1021 w2
[2] = w2
[2] | 0x0100;
1025 w2
[2] = w2
[2] | 0x010000;
1029 w2
[2] = w2
[2] | 0x01000000;
1037 w2
[3] = w2
[3] | 0x0100;
1041 w2
[3] = w2
[3] | 0x010000;
1045 w2
[3] = w2
[3] | 0x01000000;
1053 w3
[0] = w3
[0] | 0x0100;
1057 w3
[0] = w3
[0] | 0x010000;
1061 w3
[0] = w3
[0] | 0x01000000;
1069 w3
[1] = w3
[1] | 0x0100;
1073 w3
[1] = w3
[1] | 0x010000;
1077 w3
[1] = w3
[1] | 0x01000000;
1085 w3
[2] = w3
[2] | 0x0100;
1089 w3
[2] = w3
[2] | 0x010000;
1093 w3
[2] = w3
[2] | 0x01000000;
1101 w3
[3] = w3
[3] | 0x0100;
1105 w3
[3] = w3
[3] | 0x010000;
1109 w3
[3] = w3
[3] | 0x01000000;
1117 w4
[0] = w4
[0] | 0x0100;
1121 w4
[0] = w4
[0] | 0x010000;
1125 w4
[0] = w4
[0] | 0x01000000;
1133 w4
[1] = w4
[1] | 0x0100;
1137 w4
[1] = w4
[1] | 0x010000;
1141 w4
[1] = w4
[1] | 0x01000000;
1149 w4
[2] = w4
[2] | 0x0100;
1153 w4
[2] = w4
[2] | 0x010000;
1157 w4
[2] = w4
[2] | 0x01000000;
1165 w4
[3] = w4
[3] | 0x0100;
1169 w4
[3] = w4
[3] | 0x010000;
1173 w4
[3] = w4
[3] | 0x01000000;
1181 w5
[0] = w5
[0] | 0x0100;
1185 w5
[0] = w5
[0] | 0x010000;
1189 w5
[0] = w5
[0] | 0x01000000;
1197 w5
[1] = w5
[1] | 0x0100;
1201 w5
[1] = w5
[1] | 0x010000;
1205 w5
[1] = w5
[1] | 0x01000000;
1213 w5
[2] = w5
[2] | 0x0100;
1217 w5
[2] = w5
[2] | 0x010000;
1221 w5
[2] = w5
[2] | 0x01000000;
1229 w5
[3] = w5
[3] | 0x0100;
1233 w5
[3] = w5
[3] | 0x010000;
1237 w5
[3] = w5
[3] | 0x01000000;
1245 w6
[0] = w6
[0] | 0x0100;
1249 w6
[0] = w6
[0] | 0x010000;
1253 w6
[0] = w6
[0] | 0x01000000;
1261 w6
[1] = w6
[1] | 0x0100;
1265 w6
[1] = w6
[1] | 0x010000;
1269 w6
[1] = w6
[1] | 0x01000000;
1277 w6
[2] = w6
[2] | 0x0100;
1281 w6
[2] = w6
[2] | 0x010000;
1285 w6
[2] = w6
[2] | 0x01000000;
1293 w6
[3] = w6
[3] | 0x0100;
1297 w6
[3] = w6
[3] | 0x010000;
1301 w6
[3] = w6
[3] | 0x01000000;
1309 w7
[0] = w7
[0] | 0x0100;
1313 w7
[0] = w7
[0] | 0x010000;
1317 w7
[0] = w7
[0] | 0x01000000;
1325 w7
[1] = w7
[1] | 0x0100;
1329 w7
[1] = w7
[1] | 0x010000;
1333 w7
[1] = w7
[1] | 0x01000000;
1341 w7
[2] = w7
[2] | 0x0100;
1345 w7
[2] = w7
[2] | 0x010000;
1349 w7
[2] = w7
[2] | 0x01000000;
1357 w7
[3] = w7
[3] | 0x0100;
1361 w7
[3] = w7
[3] | 0x010000;
1365 w7
[3] = w7
[3] | 0x01000000;
1370 static void append_0x02_1x4 (u32x w0
[4], const u32 offset
)
1379 w0
[0] = w0
[0] | 0x0200;
1383 w0
[0] = w0
[0] | 0x020000;
1387 w0
[0] = w0
[0] | 0x02000000;
1395 w0
[1] = w0
[1] | 0x0200;
1399 w0
[1] = w0
[1] | 0x020000;
1403 w0
[1] = w0
[1] | 0x02000000;
1411 w0
[2] = w0
[2] | 0x0200;
1415 w0
[2] = w0
[2] | 0x020000;
1419 w0
[2] = w0
[2] | 0x02000000;
1427 w0
[3] = w0
[3] | 0x0200;
1431 w0
[3] = w0
[3] | 0x020000;
1435 w0
[3] = w0
[3] | 0x02000000;
1440 static void append_0x02_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
1449 w0
[0] = w0
[0] | 0x0200;
1453 w0
[0] = w0
[0] | 0x020000;
1457 w0
[0] = w0
[0] | 0x02000000;
1465 w0
[1] = w0
[1] | 0x0200;
1469 w0
[1] = w0
[1] | 0x020000;
1473 w0
[1] = w0
[1] | 0x02000000;
1481 w0
[2] = w0
[2] | 0x0200;
1485 w0
[2] = w0
[2] | 0x020000;
1489 w0
[2] = w0
[2] | 0x02000000;
1497 w0
[3] = w0
[3] | 0x0200;
1501 w0
[3] = w0
[3] | 0x020000;
1505 w0
[3] = w0
[3] | 0x02000000;
1513 w1
[0] = w1
[0] | 0x0200;
1517 w1
[0] = w1
[0] | 0x020000;
1521 w1
[0] = w1
[0] | 0x02000000;
1529 w1
[1] = w1
[1] | 0x0200;
1533 w1
[1] = w1
[1] | 0x020000;
1537 w1
[1] = w1
[1] | 0x02000000;
1545 w1
[2] = w1
[2] | 0x0200;
1549 w1
[2] = w1
[2] | 0x020000;
1553 w1
[2] = w1
[2] | 0x02000000;
1561 w1
[3] = w1
[3] | 0x0200;
1565 w1
[3] = w1
[3] | 0x020000;
1569 w1
[3] = w1
[3] | 0x02000000;
1574 static void append_0x02_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
1583 w0
[0] = w0
[0] | 0x0200;
1587 w0
[0] = w0
[0] | 0x020000;
1591 w0
[0] = w0
[0] | 0x02000000;
1599 w0
[1] = w0
[1] | 0x0200;
1603 w0
[1] = w0
[1] | 0x020000;
1607 w0
[1] = w0
[1] | 0x02000000;
1615 w0
[2] = w0
[2] | 0x0200;
1619 w0
[2] = w0
[2] | 0x020000;
1623 w0
[2] = w0
[2] | 0x02000000;
1631 w0
[3] = w0
[3] | 0x0200;
1635 w0
[3] = w0
[3] | 0x020000;
1639 w0
[3] = w0
[3] | 0x02000000;
1647 w1
[0] = w1
[0] | 0x0200;
1651 w1
[0] = w1
[0] | 0x020000;
1655 w1
[0] = w1
[0] | 0x02000000;
1663 w1
[1] = w1
[1] | 0x0200;
1667 w1
[1] = w1
[1] | 0x020000;
1671 w1
[1] = w1
[1] | 0x02000000;
1679 w1
[2] = w1
[2] | 0x0200;
1683 w1
[2] = w1
[2] | 0x020000;
1687 w1
[2] = w1
[2] | 0x02000000;
1695 w1
[3] = w1
[3] | 0x0200;
1699 w1
[3] = w1
[3] | 0x020000;
1703 w1
[3] = w1
[3] | 0x02000000;
1711 w2
[0] = w2
[0] | 0x0200;
1715 w2
[0] = w2
[0] | 0x020000;
1719 w2
[0] = w2
[0] | 0x02000000;
1727 w2
[1] = w2
[1] | 0x0200;
1731 w2
[1] = w2
[1] | 0x020000;
1735 w2
[1] = w2
[1] | 0x02000000;
1743 w2
[2] = w2
[2] | 0x0200;
1747 w2
[2] = w2
[2] | 0x020000;
1751 w2
[2] = w2
[2] | 0x02000000;
1759 w2
[3] = w2
[3] | 0x0200;
1763 w2
[3] = w2
[3] | 0x020000;
1767 w2
[3] = w2
[3] | 0x02000000;
1772 static void append_0x02_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
1781 w0
[0] = w0
[0] | 0x0200;
1785 w0
[0] = w0
[0] | 0x020000;
1789 w0
[0] = w0
[0] | 0x02000000;
1797 w0
[1] = w0
[1] | 0x0200;
1801 w0
[1] = w0
[1] | 0x020000;
1805 w0
[1] = w0
[1] | 0x02000000;
1813 w0
[2] = w0
[2] | 0x0200;
1817 w0
[2] = w0
[2] | 0x020000;
1821 w0
[2] = w0
[2] | 0x02000000;
1829 w0
[3] = w0
[3] | 0x0200;
1833 w0
[3] = w0
[3] | 0x020000;
1837 w0
[3] = w0
[3] | 0x02000000;
1845 w1
[0] = w1
[0] | 0x0200;
1849 w1
[0] = w1
[0] | 0x020000;
1853 w1
[0] = w1
[0] | 0x02000000;
1861 w1
[1] = w1
[1] | 0x0200;
1865 w1
[1] = w1
[1] | 0x020000;
1869 w1
[1] = w1
[1] | 0x02000000;
1877 w1
[2] = w1
[2] | 0x0200;
1881 w1
[2] = w1
[2] | 0x020000;
1885 w1
[2] = w1
[2] | 0x02000000;
1893 w1
[3] = w1
[3] | 0x0200;
1897 w1
[3] = w1
[3] | 0x020000;
1901 w1
[3] = w1
[3] | 0x02000000;
1909 w2
[0] = w2
[0] | 0x0200;
1913 w2
[0] = w2
[0] | 0x020000;
1917 w2
[0] = w2
[0] | 0x02000000;
1925 w2
[1] = w2
[1] | 0x0200;
1929 w2
[1] = w2
[1] | 0x020000;
1933 w2
[1] = w2
[1] | 0x02000000;
1941 w2
[2] = w2
[2] | 0x0200;
1945 w2
[2] = w2
[2] | 0x020000;
1949 w2
[2] = w2
[2] | 0x02000000;
1957 w2
[3] = w2
[3] | 0x0200;
1961 w2
[3] = w2
[3] | 0x020000;
1965 w2
[3] = w2
[3] | 0x02000000;
1973 w3
[0] = w3
[0] | 0x0200;
1977 w3
[0] = w3
[0] | 0x020000;
1981 w3
[0] = w3
[0] | 0x02000000;
1989 w3
[1] = w3
[1] | 0x0200;
1993 w3
[1] = w3
[1] | 0x020000;
1997 w3
[1] = w3
[1] | 0x02000000;
2005 w3
[2] = w3
[2] | 0x0200;
2009 w3
[2] = w3
[2] | 0x020000;
2013 w3
[2] = w3
[2] | 0x02000000;
2021 w3
[3] = w3
[3] | 0x0200;
2025 w3
[3] = w3
[3] | 0x020000;
2029 w3
[3] = w3
[3] | 0x02000000;
2034 static void append_0x02_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
2043 w0
[0] = w0
[0] | 0x0200;
2047 w0
[0] = w0
[0] | 0x020000;
2051 w0
[0] = w0
[0] | 0x02000000;
2059 w0
[1] = w0
[1] | 0x0200;
2063 w0
[1] = w0
[1] | 0x020000;
2067 w0
[1] = w0
[1] | 0x02000000;
2075 w0
[2] = w0
[2] | 0x0200;
2079 w0
[2] = w0
[2] | 0x020000;
2083 w0
[2] = w0
[2] | 0x02000000;
2091 w0
[3] = w0
[3] | 0x0200;
2095 w0
[3] = w0
[3] | 0x020000;
2099 w0
[3] = w0
[3] | 0x02000000;
2107 w1
[0] = w1
[0] | 0x0200;
2111 w1
[0] = w1
[0] | 0x020000;
2115 w1
[0] = w1
[0] | 0x02000000;
2123 w1
[1] = w1
[1] | 0x0200;
2127 w1
[1] = w1
[1] | 0x020000;
2131 w1
[1] = w1
[1] | 0x02000000;
2139 w1
[2] = w1
[2] | 0x0200;
2143 w1
[2] = w1
[2] | 0x020000;
2147 w1
[2] = w1
[2] | 0x02000000;
2155 w1
[3] = w1
[3] | 0x0200;
2159 w1
[3] = w1
[3] | 0x020000;
2163 w1
[3] = w1
[3] | 0x02000000;
2171 w2
[0] = w2
[0] | 0x0200;
2175 w2
[0] = w2
[0] | 0x020000;
2179 w2
[0] = w2
[0] | 0x02000000;
2187 w2
[1] = w2
[1] | 0x0200;
2191 w2
[1] = w2
[1] | 0x020000;
2195 w2
[1] = w2
[1] | 0x02000000;
2203 w2
[2] = w2
[2] | 0x0200;
2207 w2
[2] = w2
[2] | 0x020000;
2211 w2
[2] = w2
[2] | 0x02000000;
2219 w2
[3] = w2
[3] | 0x0200;
2223 w2
[3] = w2
[3] | 0x020000;
2227 w2
[3] = w2
[3] | 0x02000000;
2235 w3
[0] = w3
[0] | 0x0200;
2239 w3
[0] = w3
[0] | 0x020000;
2243 w3
[0] = w3
[0] | 0x02000000;
2251 w3
[1] = w3
[1] | 0x0200;
2255 w3
[1] = w3
[1] | 0x020000;
2259 w3
[1] = w3
[1] | 0x02000000;
2267 w3
[2] = w3
[2] | 0x0200;
2271 w3
[2] = w3
[2] | 0x020000;
2275 w3
[2] = w3
[2] | 0x02000000;
2283 w3
[3] = w3
[3] | 0x0200;
2287 w3
[3] = w3
[3] | 0x020000;
2291 w3
[3] = w3
[3] | 0x02000000;
2299 w4
[0] = w4
[0] | 0x0200;
2303 w4
[0] = w4
[0] | 0x020000;
2307 w4
[0] = w4
[0] | 0x02000000;
2315 w4
[1] = w4
[1] | 0x0200;
2319 w4
[1] = w4
[1] | 0x020000;
2323 w4
[1] = w4
[1] | 0x02000000;
2331 w4
[2] = w4
[2] | 0x0200;
2335 w4
[2] = w4
[2] | 0x020000;
2339 w4
[2] = w4
[2] | 0x02000000;
2347 w4
[3] = w4
[3] | 0x0200;
2351 w4
[3] = w4
[3] | 0x020000;
2355 w4
[3] = w4
[3] | 0x02000000;
2363 w5
[0] = w5
[0] | 0x0200;
2367 w5
[0] = w5
[0] | 0x020000;
2371 w5
[0] = w5
[0] | 0x02000000;
2379 w5
[1] = w5
[1] | 0x0200;
2383 w5
[1] = w5
[1] | 0x020000;
2387 w5
[1] = w5
[1] | 0x02000000;
2395 w5
[2] = w5
[2] | 0x0200;
2399 w5
[2] = w5
[2] | 0x020000;
2403 w5
[2] = w5
[2] | 0x02000000;
2411 w5
[3] = w5
[3] | 0x0200;
2415 w5
[3] = w5
[3] | 0x020000;
2419 w5
[3] = w5
[3] | 0x02000000;
2427 w6
[0] = w6
[0] | 0x0200;
2431 w6
[0] = w6
[0] | 0x020000;
2435 w6
[0] = w6
[0] | 0x02000000;
2443 w6
[1] = w6
[1] | 0x0200;
2447 w6
[1] = w6
[1] | 0x020000;
2451 w6
[1] = w6
[1] | 0x02000000;
2459 w6
[2] = w6
[2] | 0x0200;
2463 w6
[2] = w6
[2] | 0x020000;
2467 w6
[2] = w6
[2] | 0x02000000;
2475 w6
[3] = w6
[3] | 0x0200;
2479 w6
[3] = w6
[3] | 0x020000;
2483 w6
[3] = w6
[3] | 0x02000000;
2491 w7
[0] = w7
[0] | 0x0200;
2495 w7
[0] = w7
[0] | 0x020000;
2499 w7
[0] = w7
[0] | 0x02000000;
2507 w7
[1] = w7
[1] | 0x0200;
2511 w7
[1] = w7
[1] | 0x020000;
2515 w7
[1] = w7
[1] | 0x02000000;
2523 w7
[2] = w7
[2] | 0x0200;
2527 w7
[2] = w7
[2] | 0x020000;
2531 w7
[2] = w7
[2] | 0x02000000;
2539 w7
[3] = w7
[3] | 0x0200;
2543 w7
[3] = w7
[3] | 0x020000;
2547 w7
[3] = w7
[3] | 0x02000000;
2552 static void append_0x80_1x4 (u32x w0
[4], const u32 offset
)
2561 w0
[0] = w0
[0] | 0x8000;
2565 w0
[0] = w0
[0] | 0x800000;
2569 w0
[0] = w0
[0] | 0x80000000;
2577 w0
[1] = w0
[1] | 0x8000;
2581 w0
[1] = w0
[1] | 0x800000;
2585 w0
[1] = w0
[1] | 0x80000000;
2593 w0
[2] = w0
[2] | 0x8000;
2597 w0
[2] = w0
[2] | 0x800000;
2601 w0
[2] = w0
[2] | 0x80000000;
2609 w0
[3] = w0
[3] | 0x8000;
2613 w0
[3] = w0
[3] | 0x800000;
2617 w0
[3] = w0
[3] | 0x80000000;
2622 static void append_0x80_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
2631 w0
[0] = w0
[0] | 0x8000;
2635 w0
[0] = w0
[0] | 0x800000;
2639 w0
[0] = w0
[0] | 0x80000000;
2647 w0
[1] = w0
[1] | 0x8000;
2651 w0
[1] = w0
[1] | 0x800000;
2655 w0
[1] = w0
[1] | 0x80000000;
2663 w0
[2] = w0
[2] | 0x8000;
2667 w0
[2] = w0
[2] | 0x800000;
2671 w0
[2] = w0
[2] | 0x80000000;
2679 w0
[3] = w0
[3] | 0x8000;
2683 w0
[3] = w0
[3] | 0x800000;
2687 w0
[3] = w0
[3] | 0x80000000;
2695 w1
[0] = w1
[0] | 0x8000;
2699 w1
[0] = w1
[0] | 0x800000;
2703 w1
[0] = w1
[0] | 0x80000000;
2711 w1
[1] = w1
[1] | 0x8000;
2715 w1
[1] = w1
[1] | 0x800000;
2719 w1
[1] = w1
[1] | 0x80000000;
2727 w1
[2] = w1
[2] | 0x8000;
2731 w1
[2] = w1
[2] | 0x800000;
2735 w1
[2] = w1
[2] | 0x80000000;
2743 w1
[3] = w1
[3] | 0x8000;
2747 w1
[3] = w1
[3] | 0x800000;
2751 w1
[3] = w1
[3] | 0x80000000;
2756 static void append_0x80_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
2765 w0
[0] = w0
[0] | 0x8000;
2769 w0
[0] = w0
[0] | 0x800000;
2773 w0
[0] = w0
[0] | 0x80000000;
2781 w0
[1] = w0
[1] | 0x8000;
2785 w0
[1] = w0
[1] | 0x800000;
2789 w0
[1] = w0
[1] | 0x80000000;
2797 w0
[2] = w0
[2] | 0x8000;
2801 w0
[2] = w0
[2] | 0x800000;
2805 w0
[2] = w0
[2] | 0x80000000;
2813 w0
[3] = w0
[3] | 0x8000;
2817 w0
[3] = w0
[3] | 0x800000;
2821 w0
[3] = w0
[3] | 0x80000000;
2829 w1
[0] = w1
[0] | 0x8000;
2833 w1
[0] = w1
[0] | 0x800000;
2837 w1
[0] = w1
[0] | 0x80000000;
2845 w1
[1] = w1
[1] | 0x8000;
2849 w1
[1] = w1
[1] | 0x800000;
2853 w1
[1] = w1
[1] | 0x80000000;
2861 w1
[2] = w1
[2] | 0x8000;
2865 w1
[2] = w1
[2] | 0x800000;
2869 w1
[2] = w1
[2] | 0x80000000;
2877 w1
[3] = w1
[3] | 0x8000;
2881 w1
[3] = w1
[3] | 0x800000;
2885 w1
[3] = w1
[3] | 0x80000000;
2893 w2
[0] = w2
[0] | 0x8000;
2897 w2
[0] = w2
[0] | 0x800000;
2901 w2
[0] = w2
[0] | 0x80000000;
2909 w2
[1] = w2
[1] | 0x8000;
2913 w2
[1] = w2
[1] | 0x800000;
2917 w2
[1] = w2
[1] | 0x80000000;
2925 w2
[2] = w2
[2] | 0x8000;
2929 w2
[2] = w2
[2] | 0x800000;
2933 w2
[2] = w2
[2] | 0x80000000;
2941 w2
[3] = w2
[3] | 0x8000;
2945 w2
[3] = w2
[3] | 0x800000;
2949 w2
[3] = w2
[3] | 0x80000000;
2954 static void append_0x80_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
2963 w0
[0] = w0
[0] | 0x8000;
2967 w0
[0] = w0
[0] | 0x800000;
2971 w0
[0] = w0
[0] | 0x80000000;
2979 w0
[1] = w0
[1] | 0x8000;
2983 w0
[1] = w0
[1] | 0x800000;
2987 w0
[1] = w0
[1] | 0x80000000;
2995 w0
[2] = w0
[2] | 0x8000;
2999 w0
[2] = w0
[2] | 0x800000;
3003 w0
[2] = w0
[2] | 0x80000000;
3011 w0
[3] = w0
[3] | 0x8000;
3015 w0
[3] = w0
[3] | 0x800000;
3019 w0
[3] = w0
[3] | 0x80000000;
3027 w1
[0] = w1
[0] | 0x8000;
3031 w1
[0] = w1
[0] | 0x800000;
3035 w1
[0] = w1
[0] | 0x80000000;
3043 w1
[1] = w1
[1] | 0x8000;
3047 w1
[1] = w1
[1] | 0x800000;
3051 w1
[1] = w1
[1] | 0x80000000;
3059 w1
[2] = w1
[2] | 0x8000;
3063 w1
[2] = w1
[2] | 0x800000;
3067 w1
[2] = w1
[2] | 0x80000000;
3075 w1
[3] = w1
[3] | 0x8000;
3079 w1
[3] = w1
[3] | 0x800000;
3083 w1
[3] = w1
[3] | 0x80000000;
3091 w2
[0] = w2
[0] | 0x8000;
3095 w2
[0] = w2
[0] | 0x800000;
3099 w2
[0] = w2
[0] | 0x80000000;
3107 w2
[1] = w2
[1] | 0x8000;
3111 w2
[1] = w2
[1] | 0x800000;
3115 w2
[1] = w2
[1] | 0x80000000;
3123 w2
[2] = w2
[2] | 0x8000;
3127 w2
[2] = w2
[2] | 0x800000;
3131 w2
[2] = w2
[2] | 0x80000000;
3139 w2
[3] = w2
[3] | 0x8000;
3143 w2
[3] = w2
[3] | 0x800000;
3147 w2
[3] = w2
[3] | 0x80000000;
3155 w3
[0] = w3
[0] | 0x8000;
3159 w3
[0] = w3
[0] | 0x800000;
3163 w3
[0] = w3
[0] | 0x80000000;
3171 w3
[1] = w3
[1] | 0x8000;
3175 w3
[1] = w3
[1] | 0x800000;
3179 w3
[1] = w3
[1] | 0x80000000;
3187 w3
[2] = w3
[2] | 0x8000;
3191 w3
[2] = w3
[2] | 0x800000;
3195 w3
[2] = w3
[2] | 0x80000000;
3203 w3
[3] = w3
[3] | 0x8000;
3207 w3
[3] = w3
[3] | 0x800000;
3211 w3
[3] = w3
[3] | 0x80000000;
3216 static void append_0x80_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
3225 w0
[0] = w0
[0] | 0x8000;
3229 w0
[0] = w0
[0] | 0x800000;
3233 w0
[0] = w0
[0] | 0x80000000;
3241 w0
[1] = w0
[1] | 0x8000;
3245 w0
[1] = w0
[1] | 0x800000;
3249 w0
[1] = w0
[1] | 0x80000000;
3257 w0
[2] = w0
[2] | 0x8000;
3261 w0
[2] = w0
[2] | 0x800000;
3265 w0
[2] = w0
[2] | 0x80000000;
3273 w0
[3] = w0
[3] | 0x8000;
3277 w0
[3] = w0
[3] | 0x800000;
3281 w0
[3] = w0
[3] | 0x80000000;
3289 w1
[0] = w1
[0] | 0x8000;
3293 w1
[0] = w1
[0] | 0x800000;
3297 w1
[0] = w1
[0] | 0x80000000;
3305 w1
[1] = w1
[1] | 0x8000;
3309 w1
[1] = w1
[1] | 0x800000;
3313 w1
[1] = w1
[1] | 0x80000000;
3321 w1
[2] = w1
[2] | 0x8000;
3325 w1
[2] = w1
[2] | 0x800000;
3329 w1
[2] = w1
[2] | 0x80000000;
3337 w1
[3] = w1
[3] | 0x8000;
3341 w1
[3] = w1
[3] | 0x800000;
3345 w1
[3] = w1
[3] | 0x80000000;
3353 w2
[0] = w2
[0] | 0x8000;
3357 w2
[0] = w2
[0] | 0x800000;
3361 w2
[0] = w2
[0] | 0x80000000;
3369 w2
[1] = w2
[1] | 0x8000;
3373 w2
[1] = w2
[1] | 0x800000;
3377 w2
[1] = w2
[1] | 0x80000000;
3385 w2
[2] = w2
[2] | 0x8000;
3389 w2
[2] = w2
[2] | 0x800000;
3393 w2
[2] = w2
[2] | 0x80000000;
3401 w2
[3] = w2
[3] | 0x8000;
3405 w2
[3] = w2
[3] | 0x800000;
3409 w2
[3] = w2
[3] | 0x80000000;
3417 w3
[0] = w3
[0] | 0x8000;
3421 w3
[0] = w3
[0] | 0x800000;
3425 w3
[0] = w3
[0] | 0x80000000;
3433 w3
[1] = w3
[1] | 0x8000;
3437 w3
[1] = w3
[1] | 0x800000;
3441 w3
[1] = w3
[1] | 0x80000000;
3449 w3
[2] = w3
[2] | 0x8000;
3453 w3
[2] = w3
[2] | 0x800000;
3457 w3
[2] = w3
[2] | 0x80000000;
3465 w3
[3] = w3
[3] | 0x8000;
3469 w3
[3] = w3
[3] | 0x800000;
3473 w3
[3] = w3
[3] | 0x80000000;
3481 w4
[0] = w4
[0] | 0x8000;
3485 w4
[0] = w4
[0] | 0x800000;
3489 w4
[0] = w4
[0] | 0x80000000;
3497 w4
[1] = w4
[1] | 0x8000;
3501 w4
[1] = w4
[1] | 0x800000;
3505 w4
[1] = w4
[1] | 0x80000000;
3513 w4
[2] = w4
[2] | 0x8000;
3517 w4
[2] = w4
[2] | 0x800000;
3521 w4
[2] = w4
[2] | 0x80000000;
3529 w4
[3] = w4
[3] | 0x8000;
3533 w4
[3] = w4
[3] | 0x800000;
3537 w4
[3] = w4
[3] | 0x80000000;
3545 w5
[0] = w5
[0] | 0x8000;
3549 w5
[0] = w5
[0] | 0x800000;
3553 w5
[0] = w5
[0] | 0x80000000;
3561 w5
[1] = w5
[1] | 0x8000;
3565 w5
[1] = w5
[1] | 0x800000;
3569 w5
[1] = w5
[1] | 0x80000000;
3577 w5
[2] = w5
[2] | 0x8000;
3581 w5
[2] = w5
[2] | 0x800000;
3585 w5
[2] = w5
[2] | 0x80000000;
3593 w5
[3] = w5
[3] | 0x8000;
3597 w5
[3] = w5
[3] | 0x800000;
3601 w5
[3] = w5
[3] | 0x80000000;
3609 w6
[0] = w6
[0] | 0x8000;
3613 w6
[0] = w6
[0] | 0x800000;
3617 w6
[0] = w6
[0] | 0x80000000;
3625 w6
[1] = w6
[1] | 0x8000;
3629 w6
[1] = w6
[1] | 0x800000;
3633 w6
[1] = w6
[1] | 0x80000000;
3641 w6
[2] = w6
[2] | 0x8000;
3645 w6
[2] = w6
[2] | 0x800000;
3649 w6
[2] = w6
[2] | 0x80000000;
3657 w6
[3] = w6
[3] | 0x8000;
3661 w6
[3] = w6
[3] | 0x800000;
3665 w6
[3] = w6
[3] | 0x80000000;
3673 w7
[0] = w7
[0] | 0x8000;
3677 w7
[0] = w7
[0] | 0x800000;
3681 w7
[0] = w7
[0] | 0x80000000;
3689 w7
[1] = w7
[1] | 0x8000;
3693 w7
[1] = w7
[1] | 0x800000;
3697 w7
[1] = w7
[1] | 0x80000000;
3705 w7
[2] = w7
[2] | 0x8000;
3709 w7
[2] = w7
[2] | 0x800000;
3713 w7
[2] = w7
[2] | 0x80000000;
3721 w7
[3] = w7
[3] | 0x8000;
3725 w7
[3] = w7
[3] | 0x800000;
3729 w7
[3] = w7
[3] | 0x80000000;
3734 static void append_0x80_1x16 (u32x w
[16], const u32 offset
)
3743 w
[ 0] = w
[ 0] | 0x8000;
3747 w
[ 0] = w
[ 0] | 0x800000;
3751 w
[ 0] = w
[ 0] | 0x80000000;
3759 w
[ 1] = w
[ 1] | 0x8000;
3763 w
[ 1] = w
[ 1] | 0x800000;
3767 w
[ 1] = w
[ 1] | 0x80000000;
3775 w
[ 2] = w
[ 2] | 0x8000;
3779 w
[ 2] = w
[ 2] | 0x800000;
3783 w
[ 2] = w
[ 2] | 0x80000000;
3791 w
[ 3] = w
[ 3] | 0x8000;
3795 w
[ 3] = w
[ 3] | 0x800000;
3799 w
[ 3] = w
[ 3] | 0x80000000;
3807 w
[ 4] = w
[ 4] | 0x8000;
3811 w
[ 4] = w
[ 4] | 0x800000;
3815 w
[ 4] = w
[ 4] | 0x80000000;
3823 w
[ 5] = w
[ 5] | 0x8000;
3827 w
[ 5] = w
[ 5] | 0x800000;
3831 w
[ 5] = w
[ 5] | 0x80000000;
3839 w
[ 6] = w
[ 6] | 0x8000;
3843 w
[ 6] = w
[ 6] | 0x800000;
3847 w
[ 6] = w
[ 6] | 0x80000000;
3855 w
[ 7] = w
[ 7] | 0x8000;
3859 w
[ 7] = w
[ 7] | 0x800000;
3863 w
[ 7] = w
[ 7] | 0x80000000;
3871 w
[ 8] = w
[ 8] | 0x8000;
3875 w
[ 8] = w
[ 8] | 0x800000;
3879 w
[ 8] = w
[ 8] | 0x80000000;
3887 w
[ 9] = w
[ 9] | 0x8000;
3891 w
[ 9] = w
[ 9] | 0x800000;
3895 w
[ 9] = w
[ 9] | 0x80000000;
3903 w
[10] = w
[10] | 0x8000;
3907 w
[10] = w
[10] | 0x800000;
3911 w
[10] = w
[10] | 0x80000000;
3919 w
[11] = w
[11] | 0x8000;
3923 w
[11] = w
[11] | 0x800000;
3927 w
[11] = w
[11] | 0x80000000;
3935 w
[12] = w
[12] | 0x8000;
3939 w
[12] = w
[12] | 0x800000;
3943 w
[12] = w
[12] | 0x80000000;
3951 w
[13] = w
[13] | 0x8000;
3955 w
[13] = w
[13] | 0x800000;
3959 w
[13] = w
[13] | 0x80000000;
3967 w
[14] = w
[14] | 0x8000;
3971 w
[14] = w
[14] | 0x800000;
3975 w
[14] = w
[14] | 0x80000000;
3983 w
[15] = w
[15] | 0x8000;
3987 w
[15] = w
[15] | 0x800000;
3991 w
[15] = w
[15] | 0x80000000;
3996 static void switch_buffer_by_offset_le (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
3998 #if defined IS_AMD || defined IS_GENERIC
3999 const int offset_mod_4
= offset
& 3;
4001 const int offset_minus_4
= 4 - offset
;
4006 w3
[2] = amd_bytealign ( 0, w3
[1], offset_minus_4
);
4007 w3
[1] = amd_bytealign (w3
[1], w3
[0], offset_minus_4
);
4008 w3
[0] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4009 w2
[3] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4010 w2
[2] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4011 w2
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4012 w2
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4013 w1
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4014 w1
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4015 w1
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4016 w1
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4017 w0
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4018 w0
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4019 w0
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4020 w0
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4022 if (offset_mod_4
== 0)
4044 w3
[2] = amd_bytealign ( 0, w3
[0], offset_minus_4
);
4045 w3
[1] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4046 w3
[0] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4047 w2
[3] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4048 w2
[2] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4049 w2
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4050 w2
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4051 w1
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4052 w1
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4053 w1
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4054 w1
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4055 w0
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4056 w0
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4057 w0
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4060 if (offset_mod_4
== 0)
4081 w3
[2] = amd_bytealign ( 0, w2
[3], offset_minus_4
);
4082 w3
[1] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4083 w3
[0] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4084 w2
[3] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4085 w2
[2] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4086 w2
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4087 w2
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4088 w1
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4089 w1
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4090 w1
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4091 w1
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4092 w0
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4093 w0
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4097 if (offset_mod_4
== 0)
4117 w3
[2] = amd_bytealign ( 0, w2
[2], offset_minus_4
);
4118 w3
[1] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4119 w3
[0] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4120 w2
[3] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4121 w2
[2] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4122 w2
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4123 w2
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4124 w1
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4125 w1
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4126 w1
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4127 w1
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4128 w0
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4133 if (offset_mod_4
== 0)
4152 w3
[2] = amd_bytealign ( 0, w2
[1], offset_minus_4
);
4153 w3
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4154 w3
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4155 w2
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4156 w2
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4157 w2
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4158 w2
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4159 w1
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4160 w1
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4161 w1
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4162 w1
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4168 if (offset_mod_4
== 0)
4186 w3
[2] = amd_bytealign ( 0, w2
[0], offset_minus_4
);
4187 w3
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4188 w3
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4189 w2
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4190 w2
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4191 w2
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4192 w2
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4193 w1
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4194 w1
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4195 w1
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4202 if (offset_mod_4
== 0)
4219 w3
[2] = amd_bytealign ( 0, w1
[3], offset_minus_4
);
4220 w3
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4221 w3
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4222 w2
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4223 w2
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4224 w2
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4225 w2
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4226 w1
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4227 w1
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4235 if (offset_mod_4
== 0)
4251 w3
[2] = amd_bytealign ( 0, w1
[2], offset_minus_4
);
4252 w3
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4253 w3
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4254 w2
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4255 w2
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4256 w2
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4257 w2
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4258 w1
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4267 if (offset_mod_4
== 0)
4282 w3
[2] = amd_bytealign ( 0, w1
[1], offset_minus_4
);
4283 w3
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4284 w3
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4285 w2
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4286 w2
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4287 w2
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4288 w2
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4298 if (offset_mod_4
== 0)
4312 w3
[2] = amd_bytealign ( 0, w1
[0], offset_minus_4
);
4313 w3
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4314 w3
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4315 w2
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4316 w2
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4317 w2
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4328 if (offset_mod_4
== 0)
4341 w3
[2] = amd_bytealign ( 0, w0
[3], offset_minus_4
);
4342 w3
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4343 w3
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4344 w2
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4345 w2
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4357 if (offset_mod_4
== 0)
4369 w3
[2] = amd_bytealign ( 0, w0
[2], offset_minus_4
);
4370 w3
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4371 w3
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4372 w2
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4385 if (offset_mod_4
== 0)
4396 w3
[2] = amd_bytealign ( 0, w0
[1], offset_minus_4
);
4397 w3
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4398 w3
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4412 if (offset_mod_4
== 0)
4422 w3
[2] = amd_bytealign ( 0, w0
[0], offset_minus_4
);
4423 w3
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4438 if (offset_mod_4
== 0)
4449 const int offset_minus_4
= 4 - (offset
% 4);
4451 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
4456 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
4457 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
4458 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
4459 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
4460 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4461 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4462 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4463 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4464 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4465 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4466 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4467 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4468 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4469 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
4474 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
4475 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
4476 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
4477 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
4478 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4479 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4480 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4481 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4482 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4483 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4484 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4485 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4486 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
4492 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
4493 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
4494 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
4495 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
4496 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4497 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4498 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4499 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4500 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4501 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4502 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4503 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
4510 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
4511 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
4512 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
4513 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
4514 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4515 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4516 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4517 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4518 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4519 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4520 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
4528 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4529 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4530 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4531 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4532 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4533 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4534 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4535 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4536 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4537 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
4546 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4547 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4548 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4549 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4550 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4551 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4552 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4553 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4554 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
4564 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4565 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4566 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4567 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4568 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4569 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4570 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4571 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
4582 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4583 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4584 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4585 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4586 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4587 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4588 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
4600 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4601 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4602 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4603 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4604 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4605 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
4618 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4619 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4620 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4621 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4622 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
4636 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4637 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4638 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4639 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
4654 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4655 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4656 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
4672 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4673 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
4690 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
4710 static void switch_buffer_by_offset_be (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
4712 #if defined IS_AMD || defined IS_GENERIC
4716 w3
[2] = amd_bytealign (w3
[1], 0, offset
);
4717 w3
[1] = amd_bytealign (w3
[0], w3
[1], offset
);
4718 w3
[0] = amd_bytealign (w2
[3], w3
[0], offset
);
4719 w2
[3] = amd_bytealign (w2
[2], w2
[3], offset
);
4720 w2
[2] = amd_bytealign (w2
[1], w2
[2], offset
);
4721 w2
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4722 w2
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4723 w1
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4724 w1
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4725 w1
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4726 w1
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4727 w0
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4728 w0
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4729 w0
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4730 w0
[0] = amd_bytealign ( 0, w0
[0], offset
);
4734 w3
[2] = amd_bytealign (w3
[0], 0, offset
);
4735 w3
[1] = amd_bytealign (w2
[3], w3
[0], offset
);
4736 w3
[0] = amd_bytealign (w2
[2], w2
[3], offset
);
4737 w2
[3] = amd_bytealign (w2
[1], w2
[2], offset
);
4738 w2
[2] = amd_bytealign (w2
[0], w2
[1], offset
);
4739 w2
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4740 w2
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4741 w1
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4742 w1
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4743 w1
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4744 w1
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4745 w0
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4746 w0
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4747 w0
[1] = amd_bytealign ( 0, w0
[0], offset
);
4752 w3
[2] = amd_bytealign (w2
[3], 0, offset
);
4753 w3
[1] = amd_bytealign (w2
[2], w2
[3], offset
);
4754 w3
[0] = amd_bytealign (w2
[1], w2
[2], offset
);
4755 w2
[3] = amd_bytealign (w2
[0], w2
[1], offset
);
4756 w2
[2] = amd_bytealign (w1
[3], w2
[0], offset
);
4757 w2
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4758 w2
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4759 w1
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4760 w1
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4761 w1
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4762 w1
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4763 w0
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4764 w0
[2] = amd_bytealign ( 0, w0
[0], offset
);
4770 w3
[2] = amd_bytealign (w2
[2], 0, offset
);
4771 w3
[1] = amd_bytealign (w2
[1], w2
[2], offset
);
4772 w3
[0] = amd_bytealign (w2
[0], w2
[1], offset
);
4773 w2
[3] = amd_bytealign (w1
[3], w2
[0], offset
);
4774 w2
[2] = amd_bytealign (w1
[2], w1
[3], offset
);
4775 w2
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4776 w2
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4777 w1
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4778 w1
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4779 w1
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4780 w1
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4781 w0
[3] = amd_bytealign ( 0, w0
[0], offset
);
4788 w3
[2] = amd_bytealign (w2
[1], 0, offset
);
4789 w3
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4790 w3
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4791 w2
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4792 w2
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4793 w2
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4794 w2
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4795 w1
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4796 w1
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4797 w1
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4798 w1
[0] = amd_bytealign ( 0, w0
[0], offset
);
4806 w3
[2] = amd_bytealign (w2
[0], 0, offset
);
4807 w3
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4808 w3
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4809 w2
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4810 w2
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4811 w2
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4812 w2
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4813 w1
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4814 w1
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4815 w1
[1] = amd_bytealign ( 0, w0
[0], offset
);
4824 w3
[2] = amd_bytealign (w1
[3], 0, offset
);
4825 w3
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4826 w3
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4827 w2
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4828 w2
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4829 w2
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4830 w2
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4831 w1
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4832 w1
[2] = amd_bytealign ( 0, w0
[0], offset
);
4842 w3
[2] = amd_bytealign (w1
[2], 0, offset
);
4843 w3
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4844 w3
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4845 w2
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4846 w2
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4847 w2
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4848 w2
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4849 w1
[3] = amd_bytealign ( 0, w0
[0], offset
);
4860 w3
[2] = amd_bytealign (w1
[1], 0, offset
);
4861 w3
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4862 w3
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4863 w2
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4864 w2
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4865 w2
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4866 w2
[0] = amd_bytealign ( 0, w0
[0], offset
);
4878 w3
[2] = amd_bytealign (w1
[0], 0, offset
);
4879 w3
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4880 w3
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4881 w2
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4882 w2
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4883 w2
[1] = amd_bytealign ( 0, w0
[0], offset
);
4896 w3
[2] = amd_bytealign (w0
[3], 0, offset
);
4897 w3
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4898 w3
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4899 w2
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4900 w2
[2] = amd_bytealign ( 0, w0
[0], offset
);
4914 w3
[2] = amd_bytealign (w0
[2], 0, offset
);
4915 w3
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4916 w3
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4917 w2
[3] = amd_bytealign ( 0, w0
[0], offset
);
4932 w3
[2] = amd_bytealign (w0
[1], 0, offset
);
4933 w3
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4934 w3
[0] = amd_bytealign ( 0, w0
[0], offset
);
4950 w3
[2] = amd_bytealign (w0
[0], 0, offset
);
4951 w3
[1] = amd_bytealign ( 0, w0
[0], offset
);
4970 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
4975 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
4976 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
4977 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
4978 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
4979 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
4980 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
4981 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
4982 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
4983 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
4984 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
4985 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
4986 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
4987 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
4988 w0
[0] = __byte_perm (w0
[0], 0, selector
);
4992 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
4993 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
4994 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
4995 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
4996 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
4997 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
4998 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
4999 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5000 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5001 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5002 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5003 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5004 w0
[1] = __byte_perm (w0
[0], 0, selector
);
5009 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
5010 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
5011 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
5012 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
5013 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5014 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5015 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5016 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5017 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5018 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5019 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5020 w0
[2] = __byte_perm (w0
[0], 0, selector
);
5026 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
5027 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
5028 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
5029 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
5030 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5031 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5032 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5033 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5034 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5035 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5036 w0
[3] = __byte_perm (w0
[0], 0, selector
);
5043 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
5044 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
5045 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
5046 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
5047 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5048 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5049 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5050 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5051 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5052 w1
[0] = __byte_perm (w0
[0], 0, selector
);
5060 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
5061 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
5062 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
5063 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5064 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5065 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5066 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5067 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5068 w1
[1] = __byte_perm (w0
[0], 0, selector
);
5077 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5078 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5079 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5080 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5081 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5082 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5083 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5084 w1
[2] = __byte_perm (w0
[0], 0, selector
);
5094 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5095 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5096 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5097 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5098 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5099 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5100 w1
[3] = __byte_perm (w0
[0], 0, selector
);
5111 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5112 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5113 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5114 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5115 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5116 w2
[0] = __byte_perm (w0
[0], 0, selector
);
5128 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5129 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5130 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5131 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5132 w2
[1] = __byte_perm (w0
[0], 0, selector
);
5145 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5146 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5147 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5148 w2
[2] = __byte_perm (w0
[0], 0, selector
);
5162 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5163 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5164 w2
[3] = __byte_perm (w0
[0], 0, selector
);
5179 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5180 w3
[0] = __byte_perm (w0
[0], 0, selector
);
5196 w3
[1] = __byte_perm (w0
[0], 0, selector
);
5215 static void overwrite_at_le (u32x sw
[16], const u32x w0
, const u32 salt_len
)
5217 #if defined cl_amd_media_ops
5222 case 1: sw
[0] = amd_bytealign (w0
, sw
[0] << 24, 3);
5223 sw
[1] = amd_bytealign (sw
[1] >> 8, w0
, 3);
5225 case 2: sw
[0] = amd_bytealign (w0
, sw
[0] << 16, 2);
5226 sw
[1] = amd_bytealign (sw
[1] >> 16, w0
, 2);
5228 case 3: sw
[0] = amd_bytealign (w0
, sw
[0] << 8, 1);
5229 sw
[1] = amd_bytealign (sw
[1] >> 24, w0
, 1);
5233 case 5: sw
[1] = amd_bytealign (w0
, sw
[1] << 24, 3);
5234 sw
[2] = amd_bytealign (sw
[2] >> 8, w0
, 3);
5236 case 6: sw
[1] = amd_bytealign (w0
, sw
[1] << 16, 2);
5237 sw
[2] = amd_bytealign (sw
[2] >> 16, w0
, 2);
5239 case 7: sw
[1] = amd_bytealign (w0
, sw
[1] << 8, 1);
5240 sw
[2] = amd_bytealign (sw
[2] >> 24, w0
, 1);
5244 case 9: sw
[2] = amd_bytealign (w0
, sw
[2] << 24, 3);
5245 sw
[3] = amd_bytealign (sw
[3] >> 8, w0
, 3);
5247 case 10: sw
[2] = amd_bytealign (w0
, sw
[2] << 16, 2);
5248 sw
[3] = amd_bytealign (sw
[3] >> 16, w0
, 2);
5250 case 11: sw
[2] = amd_bytealign (w0
, sw
[2] << 8, 1);
5251 sw
[3] = amd_bytealign (sw
[3] >> 24, w0
, 1);
5253 case 12: sw
[3] = w0
;
5255 case 13: sw
[3] = amd_bytealign (w0
, sw
[3] << 24, 3);
5256 sw
[4] = amd_bytealign (sw
[4] >> 8, w0
, 3);
5258 case 14: sw
[3] = amd_bytealign (w0
, sw
[3] << 16, 2);
5259 sw
[4] = amd_bytealign (sw
[4] >> 16, w0
, 2);
5261 case 15: sw
[3] = amd_bytealign (w0
, sw
[3] << 8, 1);
5262 sw
[4] = amd_bytealign (sw
[4] >> 24, w0
, 1);
5264 case 16: sw
[4] = w0
;
5266 case 17: sw
[4] = amd_bytealign (w0
, sw
[4] << 24, 3);
5267 sw
[5] = amd_bytealign (sw
[5] >> 8, w0
, 3);
5269 case 18: sw
[4] = amd_bytealign (w0
, sw
[4] << 16, 2);
5270 sw
[5] = amd_bytealign (sw
[5] >> 16, w0
, 2);
5272 case 19: sw
[4] = amd_bytealign (w0
, sw
[4] << 8, 1);
5273 sw
[5] = amd_bytealign (sw
[5] >> 24, w0
, 1);
5275 case 20: sw
[5] = w0
;
5277 case 21: sw
[5] = amd_bytealign (w0
, sw
[5] << 24, 3);
5278 sw
[6] = amd_bytealign (sw
[6] >> 8, w0
, 3);
5280 case 22: sw
[5] = amd_bytealign (w0
, sw
[5] << 16, 2);
5281 sw
[6] = amd_bytealign (sw
[6] >> 16, w0
, 2);
5283 case 23: sw
[5] = amd_bytealign (w0
, sw
[5] << 8, 1);
5284 sw
[6] = amd_bytealign (sw
[6] >> 24, w0
, 1);
5286 case 24: sw
[6] = w0
;
5288 case 25: sw
[6] = amd_bytealign (w0
, sw
[6] << 24, 3);
5289 sw
[7] = amd_bytealign (sw
[7] >> 8, w0
, 3);
5291 case 26: sw
[6] = amd_bytealign (w0
, sw
[6] << 16, 2);
5292 sw
[7] = amd_bytealign (sw
[7] >> 16, w0
, 2);
5294 case 27: sw
[6] = amd_bytealign (w0
, sw
[6] << 8, 1);
5295 sw
[7] = amd_bytealign (sw
[7] >> 24, w0
, 1);
5297 case 28: sw
[7] = w0
;
5299 case 29: sw
[7] = amd_bytealign (w0
, sw
[7] << 24, 3);
5300 sw
[8] = amd_bytealign (sw
[8] >> 8, w0
, 3);
5302 case 30: sw
[7] = amd_bytealign (w0
, sw
[7] << 16, 2);
5303 sw
[8] = amd_bytealign (sw
[8] >> 16, w0
, 2);
5305 case 31: sw
[7] = amd_bytealign (w0
, sw
[7] << 8, 1);
5306 sw
[8] = amd_bytealign (sw
[8] >> 24, w0
, 1);
5314 case 1: sw
[0] = (sw
[0] & 0x000000ff) | (w0
<< 8);
5315 sw
[1] = (sw
[1] & 0xffffff00) | (w0
>> 24);
5317 case 2: sw
[0] = (sw
[0] & 0x0000ffff) | (w0
<< 16);
5318 sw
[1] = (sw
[1] & 0xffff0000) | (w0
>> 16);
5320 case 3: sw
[0] = (sw
[0] & 0x00ffffff) | (w0
<< 24);
5321 sw
[1] = (sw
[1] & 0xff000000) | (w0
>> 8);
5325 case 5: sw
[1] = (sw
[1] & 0x000000ff) | (w0
<< 8);
5326 sw
[2] = (sw
[2] & 0xffffff00) | (w0
>> 24);
5328 case 6: sw
[1] = (sw
[1] & 0x0000ffff) | (w0
<< 16);
5329 sw
[2] = (sw
[2] & 0xffff0000) | (w0
>> 16);
5331 case 7: sw
[1] = (sw
[1] & 0x00ffffff) | (w0
<< 24);
5332 sw
[2] = (sw
[2] & 0xff000000) | (w0
>> 8);
5336 case 9: sw
[2] = (sw
[2] & 0x000000ff) | (w0
<< 8);
5337 sw
[3] = (sw
[3] & 0xffffff00) | (w0
>> 24);
5339 case 10: sw
[2] = (sw
[2] & 0x0000ffff) | (w0
<< 16);
5340 sw
[3] = (sw
[3] & 0xffff0000) | (w0
>> 16);
5342 case 11: sw
[2] = (sw
[2] & 0x00ffffff) | (w0
<< 24);
5343 sw
[3] = (sw
[3] & 0xff000000) | (w0
>> 8);
5345 case 12: sw
[3] = w0
;
5347 case 13: sw
[3] = (sw
[3] & 0x000000ff) | (w0
<< 8);
5348 sw
[4] = (sw
[4] & 0xffffff00) | (w0
>> 24);
5350 case 14: sw
[3] = (sw
[3] & 0x0000ffff) | (w0
<< 16);
5351 sw
[4] = (sw
[4] & 0xffff0000) | (w0
>> 16);
5353 case 15: sw
[3] = (sw
[3] & 0x00ffffff) | (w0
<< 24);
5354 sw
[4] = (sw
[4] & 0xff000000) | (w0
>> 8);
5356 case 16: sw
[4] = w0
;
5358 case 17: sw
[4] = (sw
[4] & 0x000000ff) | (w0
<< 8);
5359 sw
[5] = (sw
[5] & 0xffffff00) | (w0
>> 24);
5361 case 18: sw
[4] = (sw
[4] & 0x0000ffff) | (w0
<< 16);
5362 sw
[5] = (sw
[5] & 0xffff0000) | (w0
>> 16);
5364 case 19: sw
[4] = (sw
[4] & 0x00ffffff) | (w0
<< 24);
5365 sw
[5] = (sw
[5] & 0xff000000) | (w0
>> 8);
5367 case 20: sw
[5] = w0
;
5369 case 21: sw
[5] = (sw
[5] & 0x000000ff) | (w0
<< 8);
5370 sw
[6] = (sw
[6] & 0xffffff00) | (w0
>> 24);
5372 case 22: sw
[5] = (sw
[5] & 0x0000ffff) | (w0
<< 16);
5373 sw
[6] = (sw
[6] & 0xffff0000) | (w0
>> 16);
5375 case 23: sw
[5] = (sw
[5] & 0x00ffffff) | (w0
<< 24);
5376 sw
[6] = (sw
[6] & 0xff000000) | (w0
>> 8);
5378 case 24: sw
[6] = w0
;
5380 case 25: sw
[6] = (sw
[6] & 0x000000ff) | (w0
<< 8);
5381 sw
[7] = (sw
[7] & 0xffffff00) | (w0
>> 24);
5383 case 26: sw
[6] = (sw
[6] & 0x0000ffff) | (w0
<< 16);
5384 sw
[7] = (sw
[7] & 0xffff0000) | (w0
>> 16);
5386 case 27: sw
[6] = (sw
[6] & 0x00ffffff) | (w0
<< 24);
5387 sw
[7] = (sw
[7] & 0xff000000) | (w0
>> 8);
5389 case 28: sw
[7] = w0
;
5391 case 29: sw
[7] = (sw
[7] & 0x000000ff) | (w0
<< 8);
5392 sw
[8] = (sw
[8] & 0xffffff00) | (w0
>> 24);
5394 case 30: sw
[7] = (sw
[7] & 0x0000ffff) | (w0
<< 16);
5395 sw
[8] = (sw
[8] & 0xffff0000) | (w0
>> 16);
5397 case 31: sw
[7] = (sw
[7] & 0x00ffffff) | (w0
<< 24);
5398 sw
[8] = (sw
[8] & 0xff000000) | (w0
>> 8);
5404 static void overwrite_at_be (u32x sw
[16], const u32x w0
, const u32 salt_len
)
5406 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5412 case 1: sw
[0] = (sw
[0] & 0xff000000) | (w0
>> 8);
5413 sw
[1] = (sw
[1] & 0x00ffffff) | (w0
<< 24);
5415 case 2: sw
[0] = (sw
[0] & 0xffff0000) | (w0
>> 16);
5416 sw
[1] = (sw
[1] & 0x0000ffff) | (w0
<< 16);
5418 case 3: sw
[0] = (sw
[0] & 0xffffff00) | (w0
>> 24);
5419 sw
[1] = (sw
[1] & 0x000000ff) | (w0
<< 8);
5423 case 5: sw
[1] = (sw
[1] & 0xff000000) | (w0
>> 8);
5424 sw
[2] = (sw
[2] & 0x00ffffff) | (w0
<< 24);
5426 case 6: sw
[1] = (sw
[1] & 0xffff0000) | (w0
>> 16);
5427 sw
[2] = (sw
[2] & 0x0000ffff) | (w0
<< 16);
5429 case 7: sw
[1] = (sw
[1] & 0xffffff00) | (w0
>> 24);
5430 sw
[2] = (sw
[2] & 0x000000ff) | (w0
<< 8);
5434 case 9: sw
[2] = (sw
[2] & 0xff000000) | (w0
>> 8);
5435 sw
[3] = (sw
[3] & 0x00ffffff) | (w0
<< 24);
5437 case 10: sw
[2] = (sw
[2] & 0xffff0000) | (w0
>> 16);
5438 sw
[3] = (sw
[3] & 0x0000ffff) | (w0
<< 16);
5440 case 11: sw
[2] = (sw
[2] & 0xffffff00) | (w0
>> 24);
5441 sw
[3] = (sw
[3] & 0x000000ff) | (w0
<< 8);
5443 case 12: sw
[3] = w0
;
5445 case 13: sw
[3] = (sw
[3] & 0xff000000) | (w0
>> 8);
5446 sw
[4] = (sw
[4] & 0x00ffffff) | (w0
<< 24);
5448 case 14: sw
[3] = (sw
[3] & 0xffff0000) | (w0
>> 16);
5449 sw
[4] = (sw
[4] & 0x0000ffff) | (w0
<< 16);
5451 case 15: sw
[3] = (sw
[3] & 0xffffff00) | (w0
>> 24);
5452 sw
[4] = (sw
[4] & 0x000000ff) | (w0
<< 8);
5454 case 16: sw
[4] = w0
;
5456 case 17: sw
[4] = (sw
[4] & 0xff000000) | (w0
>> 8);
5457 sw
[5] = (sw
[5] & 0x00ffffff) | (w0
<< 24);
5459 case 18: sw
[4] = (sw
[4] & 0xffff0000) | (w0
>> 16);
5460 sw
[5] = (sw
[5] & 0x0000ffff) | (w0
<< 16);
5462 case 19: sw
[4] = (sw
[4] & 0xffffff00) | (w0
>> 24);
5463 sw
[5] = (sw
[5] & 0x000000ff) | (w0
<< 8);
5465 case 20: sw
[5] = w0
;
5467 case 21: sw
[5] = (sw
[5] & 0xff000000) | (w0
>> 8);
5468 sw
[6] = (sw
[6] & 0x00ffffff) | (w0
<< 24);
5470 case 22: sw
[5] = (sw
[5] & 0xffff0000) | (w0
>> 16);
5471 sw
[6] = (sw
[6] & 0x0000ffff) | (w0
<< 16);
5473 case 23: sw
[5] = (sw
[5] & 0xffffff00) | (w0
>> 24);
5474 sw
[6] = (sw
[6] & 0x000000ff) | (w0
<< 8);
5476 case 24: sw
[6] = w0
;
5478 case 25: sw
[6] = (sw
[6] & 0xff000000) | (w0
>> 8);
5479 sw
[7] = (sw
[7] & 0x00ffffff) | (w0
<< 24);
5481 case 26: sw
[6] = (sw
[6] & 0xffff0000) | (w0
>> 16);
5482 sw
[7] = (sw
[7] & 0x0000ffff) | (w0
<< 16);
5484 case 27: sw
[6] = (sw
[6] & 0xffffff00) | (w0
>> 24);
5485 sw
[7] = (sw
[7] & 0x000000ff) | (w0
<< 8);
5487 case 28: sw
[7] = w0
;
5489 case 29: sw
[7] = (sw
[7] & 0xff000000) | (w0
>> 8);
5490 sw
[8] = (sw
[8] & 0x00ffffff) | (w0
<< 24);
5492 case 30: sw
[7] = (sw
[7] & 0xffff0000) | (w0
>> 16);
5493 sw
[8] = (sw
[8] & 0x0000ffff) | (w0
<< 16);
5495 case 31: sw
[7] = (sw
[7] & 0xffffff00) | (w0
>> 24);
5496 sw
[8] = (sw
[8] & 0x000000ff) | (w0
<< 8);
5502 * vector functions as scalar (for outer loop usage)
/*
 * append_0x80_1x4_S: scalar helper that ORs a single 0x80 byte into one of
 * the four words of w0.
 *
 * Every statement visible here targets one word (w0[0]..w0[3]) and places
 * the 0x80 byte at bit position 8, 16 or 24 of that word (0x8000, 0x800000,
 * 0x80000000).
 *
 * NOTE(review): the lines that choose which statement runs, and the
 * statement for bit position 0 of each word, are not visible in this
 * extract; presumably a switch on `offset` -- confirm against the full file.
 */
5505 static void append_0x80_1x4_S (u32 w0
[4], const u32 offset
)
/* word w0[0]: 0x80 at byte positions 1, 2, 3 */
5514 w0
[0] = w0
[0] | 0x8000;
5518 w0
[0] = w0
[0] | 0x800000;
5522 w0
[0] = w0
[0] | 0x80000000;
/* word w0[1] */
5530 w0
[1] = w0
[1] | 0x8000;
5534 w0
[1] = w0
[1] | 0x800000;
5538 w0
[1] = w0
[1] | 0x80000000;
/* word w0[2] */
5546 w0
[2] = w0
[2] | 0x8000;
5550 w0
[2] = w0
[2] | 0x800000;
5554 w0
[2] = w0
[2] | 0x80000000;
/* word w0[3] */
5562 w0
[3] = w0
[3] | 0x8000;
5566 w0
[3] = w0
[3] | 0x800000;
5570 w0
[3] = w0
[3] | 0x80000000;
/*
 * append_0x80_2x4_S: scalar helper that ORs a single 0x80 byte into one of
 * the eight words held in w0 and w1.
 *
 * Every statement visible here targets one word and places the 0x80 byte at
 * bit position 8, 16 or 24 of that word (0x8000, 0x800000, 0x80000000).
 * The statements walk w0[0]..w0[3] first, then w1[0]..w1[3].
 *
 * NOTE(review): the lines that choose which statement runs, and the
 * statement for bit position 0 of each word, are not visible in this
 * extract; presumably a switch on `offset` -- confirm against the full file.
 */
5575 static void append_0x80_2x4_S (u32 w0
[4], u32 w1
[4], const u32 offset
)
/* words of w0 */
5584 w0
[0] = w0
[0] | 0x8000;
5588 w0
[0] = w0
[0] | 0x800000;
5592 w0
[0] = w0
[0] | 0x80000000;
5600 w0
[1] = w0
[1] | 0x8000;
5604 w0
[1] = w0
[1] | 0x800000;
5608 w0
[1] = w0
[1] | 0x80000000;
5616 w0
[2] = w0
[2] | 0x8000;
5620 w0
[2] = w0
[2] | 0x800000;
5624 w0
[2] = w0
[2] | 0x80000000;
5632 w0
[3] = w0
[3] | 0x8000;
5636 w0
[3] = w0
[3] | 0x800000;
5640 w0
[3] = w0
[3] | 0x80000000;
/* words of w1 */
5648 w1
[0] = w1
[0] | 0x8000;
5652 w1
[0] = w1
[0] | 0x800000;
5656 w1
[0] = w1
[0] | 0x80000000;
5664 w1
[1] = w1
[1] | 0x8000;
5668 w1
[1] = w1
[1] | 0x800000;
5672 w1
[1] = w1
[1] | 0x80000000;
5680 w1
[2] = w1
[2] | 0x8000;
5684 w1
[2] = w1
[2] | 0x800000;
5688 w1
[2] = w1
[2] | 0x80000000;
5696 w1
[3] = w1
[3] | 0x8000;
5700 w1
[3] = w1
[3] | 0x800000;
5704 w1
[3] = w1
[3] | 0x80000000;
/*
 * append_0x80_3x4_S: scalar helper that ORs a single 0x80 byte into one of
 * the twelve words held in w0, w1 and w2.
 *
 * Every statement visible here targets one word and places the 0x80 byte at
 * bit position 8, 16 or 24 of that word (0x8000, 0x800000, 0x80000000).
 * The statements walk w0[0]..w0[3], then w1[0]..w1[3], then w2[0]..w2[3].
 *
 * NOTE(review): the lines that choose which statement runs, and the
 * statement for bit position 0 of each word, are not visible in this
 * extract; presumably a switch on `offset` -- confirm against the full file.
 */
5709 static void append_0x80_3x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
/* words of w0 */
5718 w0
[0] = w0
[0] | 0x8000;
5722 w0
[0] = w0
[0] | 0x800000;
5726 w0
[0] = w0
[0] | 0x80000000;
5734 w0
[1] = w0
[1] | 0x8000;
5738 w0
[1] = w0
[1] | 0x800000;
5742 w0
[1] = w0
[1] | 0x80000000;
5750 w0
[2] = w0
[2] | 0x8000;
5754 w0
[2] = w0
[2] | 0x800000;
5758 w0
[2] = w0
[2] | 0x80000000;
5766 w0
[3] = w0
[3] | 0x8000;
5770 w0
[3] = w0
[3] | 0x800000;
5774 w0
[3] = w0
[3] | 0x80000000;
/* words of w1 */
5782 w1
[0] = w1
[0] | 0x8000;
5786 w1
[0] = w1
[0] | 0x800000;
5790 w1
[0] = w1
[0] | 0x80000000;
5798 w1
[1] = w1
[1] | 0x8000;
5802 w1
[1] = w1
[1] | 0x800000;
5806 w1
[1] = w1
[1] | 0x80000000;
5814 w1
[2] = w1
[2] | 0x8000;
5818 w1
[2] = w1
[2] | 0x800000;
5822 w1
[2] = w1
[2] | 0x80000000;
5830 w1
[3] = w1
[3] | 0x8000;
5834 w1
[3] = w1
[3] | 0x800000;
5838 w1
[3] = w1
[3] | 0x80000000;
/* words of w2 */
5846 w2
[0] = w2
[0] | 0x8000;
5850 w2
[0] = w2
[0] | 0x800000;
5854 w2
[0] = w2
[0] | 0x80000000;
5862 w2
[1] = w2
[1] | 0x8000;
5866 w2
[1] = w2
[1] | 0x800000;
5870 w2
[1] = w2
[1] | 0x80000000;
5878 w2
[2] = w2
[2] | 0x8000;
5882 w2
[2] = w2
[2] | 0x800000;
5886 w2
[2] = w2
[2] | 0x80000000;
5894 w2
[3] = w2
[3] | 0x8000;
5898 w2
[3] = w2
[3] | 0x800000;
5902 w2
[3] = w2
[3] | 0x80000000;
/*
 * append_0x80_4x4_S: scalar helper that ORs a single 0x80 byte into one of
 * the sixteen words held in w0, w1, w2 and w3.
 *
 * Every statement visible here targets one word and places the 0x80 byte at
 * bit position 8, 16 or 24 of that word (0x8000, 0x800000, 0x80000000).
 * The statements walk w0[0]..w0[3], then w1, then w2, then w3.
 *
 * NOTE(review): the lines that choose which statement runs, and the
 * statement for bit position 0 of each word, are not visible in this
 * extract; presumably a switch on `offset` -- confirm against the full file.
 */
5907 static void append_0x80_4x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
/* words of w0 */
5916 w0
[0] = w0
[0] | 0x8000;
5920 w0
[0] = w0
[0] | 0x800000;
5924 w0
[0] = w0
[0] | 0x80000000;
5932 w0
[1] = w0
[1] | 0x8000;
5936 w0
[1] = w0
[1] | 0x800000;
5940 w0
[1] = w0
[1] | 0x80000000;
5948 w0
[2] = w0
[2] | 0x8000;
5952 w0
[2] = w0
[2] | 0x800000;
5956 w0
[2] = w0
[2] | 0x80000000;
5964 w0
[3] = w0
[3] | 0x8000;
5968 w0
[3] = w0
[3] | 0x800000;
5972 w0
[3] = w0
[3] | 0x80000000;
/* words of w1 */
5980 w1
[0] = w1
[0] | 0x8000;
5984 w1
[0] = w1
[0] | 0x800000;
5988 w1
[0] = w1
[0] | 0x80000000;
5996 w1
[1] = w1
[1] | 0x8000;
6000 w1
[1] = w1
[1] | 0x800000;
6004 w1
[1] = w1
[1] | 0x80000000;
6012 w1
[2] = w1
[2] | 0x8000;
6016 w1
[2] = w1
[2] | 0x800000;
6020 w1
[2] = w1
[2] | 0x80000000;
6028 w1
[3] = w1
[3] | 0x8000;
6032 w1
[3] = w1
[3] | 0x800000;
6036 w1
[3] = w1
[3] | 0x80000000;
/* words of w2 */
6044 w2
[0] = w2
[0] | 0x8000;
6048 w2
[0] = w2
[0] | 0x800000;
6052 w2
[0] = w2
[0] | 0x80000000;
6060 w2
[1] = w2
[1] | 0x8000;
6064 w2
[1] = w2
[1] | 0x800000;
6068 w2
[1] = w2
[1] | 0x80000000;
6076 w2
[2] = w2
[2] | 0x8000;
6080 w2
[2] = w2
[2] | 0x800000;
6084 w2
[2] = w2
[2] | 0x80000000;
6092 w2
[3] = w2
[3] | 0x8000;
6096 w2
[3] = w2
[3] | 0x800000;
6100 w2
[3] = w2
[3] | 0x80000000;
/* words of w3 */
6108 w3
[0] = w3
[0] | 0x8000;
6112 w3
[0] = w3
[0] | 0x800000;
6116 w3
[0] = w3
[0] | 0x80000000;
6124 w3
[1] = w3
[1] | 0x8000;
6128 w3
[1] = w3
[1] | 0x800000;
6132 w3
[1] = w3
[1] | 0x80000000;
6140 w3
[2] = w3
[2] | 0x8000;
6144 w3
[2] = w3
[2] | 0x800000;
6148 w3
[2] = w3
[2] | 0x80000000;
6156 w3
[3] = w3
[3] | 0x8000;
6160 w3
[3] = w3
[3] | 0x800000;
6164 w3
[3] = w3
[3] | 0x80000000;
/*
 * truncate_block_S: scalar helper that masks words of the four-word block w
 * in place.
 *
 * In the visible cases, case N clears the upper bytes of word w[N / 4] and
 * keeps its lowest (N % 4) bytes: 0x000000FF keeps one byte, 0x0000FFFF two,
 * 0x00FFFFFF three.
 *
 * NOTE(review): the switch header, the cases for 0, 4, 8 and 12, and any
 * statements that clear the following words are not visible in this extract;
 * presumably the switch is on `len` -- confirm against the full file.
 */
6169 static void truncate_block_S (u32 w
[4], const u32 len
)
/* cut point inside w[0] */
6178 case 1: w
[0] &= 0x000000FF;
6183 case 2: w
[0] &= 0x0000FFFF;
6188 case 3: w
[0] &= 0x00FFFFFF;
/* cut point inside w[1] */
6197 case 5: w
[1] &= 0x000000FF;
6201 case 6: w
[1] &= 0x0000FFFF;
6205 case 7: w
[1] &= 0x00FFFFFF;
/* cut point inside w[2] */
6212 case 9: w
[2] &= 0x000000FF;
6215 case 10: w
[2] &= 0x0000FFFF;
6218 case 11: w
[2] &= 0x00FFFFFF;
/* cut point inside w[3] */
6223 case 13: w
[3] &= 0x000000FF;
6225 case 14: w
[3] &= 0x0000FFFF;
6227 case 15: w
[3] &= 0x00FFFFFF;
/*
 * make_unicode_S: scalar helper that spreads the 16 bytes of in[0..3] over
 * the eight words out1[0..3] and out2[0..3], two input bytes per output
 * word. in[0] and in[1] feed out1; in[2] and in[3] feed out2.
 *
 * In the shift/mask variant each output word holds one input byte in bits
 * 0-7 and the next input byte in bits 16-23; bits 8-15 and 24-31 are zero.
 * Even-indexed outputs take bytes 0 and 1 of the source word, odd-indexed
 * outputs take bytes 2 and 3.
 *
 * NOTE(review): the preprocessor guard of the first (__byte_perm_S) variant
 * and the closing #endif lines are not visible in this extract. The
 * __byte_perm_S selectors 0x7170 / 0x7372 presumably give the same layout as
 * the shift/mask variant -- confirm against the helper's definition.
 */
6232 static void make_unicode_S (const u32 in
[4], u32 out1
[4], u32 out2
[4])
/* variant built on __byte_perm_S */
6235 out2
[3] = __byte_perm_S (in
[3], 0, 0x7372);
6236 out2
[2] = __byte_perm_S (in
[3], 0, 0x7170);
6237 out2
[1] = __byte_perm_S (in
[2], 0, 0x7372);
6238 out2
[0] = __byte_perm_S (in
[2], 0, 0x7170);
6239 out1
[3] = __byte_perm_S (in
[1], 0, 0x7372);
6240 out1
[2] = __byte_perm_S (in
[1], 0, 0x7170);
6241 out1
[1] = __byte_perm_S (in
[0], 0, 0x7372);
6242 out1
[0] = __byte_perm_S (in
[0], 0, 0x7170);
/* variant built on plain shifts and masks */
6245 #if defined IS_AMD || defined IS_GENERIC
6246 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
6247 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
6248 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
6249 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
6250 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
6251 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
6252 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
6253 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
/*
 * undo_unicode_S: scalar helper that packs the eight words in1[0..3] and
 * in2[0..3] back into the four words out[0..3].
 *
 * In the shift/mask variant each output word is assembled from two input
 * words: bits 0-7 and bits 16-23 of the first input word become output
 * bytes 0 and 1, and the same two bytes of the second input word become
 * output bytes 2 and 3. Bits 8-15 and 24-31 of the inputs are discarded.
 * in1 feeds out[0] and out[1]; in2 feeds out[2] and out[3].
 *
 * NOTE(review): the preprocessor guard of the first (__byte_perm_S) variant
 * and the closing #endif lines are not visible in this extract. Selector
 * 0x6420 presumably picks the same bytes as the shift/mask variant --
 * confirm against the helper's definition.
 */
6257 static void undo_unicode_S (const u32 in1
[4], const u32 in2
[4], u32 out
[4])
/* variant built on __byte_perm_S */
6260 out
[0] = __byte_perm_S (in1
[0], in1
[1], 0x6420);
6261 out
[1] = __byte_perm_S (in1
[2], in1
[3], 0x6420);
6262 out
[2] = __byte_perm_S (in2
[0], in2
[1], 0x6420);
6263 out
[3] = __byte_perm_S (in2
[2], in2
[3], 0x6420);
/* variant built on plain shifts and masks */
6266 #if defined IS_AMD || defined IS_GENERIC
6267 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
6268 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
6269 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
6270 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
6271 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
6272 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
6273 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
6274 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
/*
 * switch_buffer_by_offset_le_S: scalar helper that rewrites the sixteen
 * words held in w0..w3 in place, building each destination word from a pair
 * of neighbouring source words.
 *
 * Two implementations are visible:
 *   - one under IS_AMD / IS_GENERIC that uses amd_bytealign_S with
 *     offset_minus_4 as the alignment argument;
 *   - one that uses __byte_perm_S with a selector derived from offset.
 *
 * Each implementation is a sequence of 14 assignment groups. Within a group
 * the destinations are written from the highest word downwards, so a source
 * word is always read before it is overwritten. Each successive group reads
 * its sources one word lower than the previous one, i.e. the data lands one
 * whole word further up (0..13 words).
 *
 * NOTE(review): the construct that selects a group, the statements guarded
 * by "offset_mod_4 == 0", and any zeroing of the vacated low words are not
 * visible in this extract; presumably the group is chosen from offset / 4 --
 * confirm against the full file.
 * NOTE(review): the visible amd_bytealign_S groups write up to w3[2] while
 * the visible __byte_perm_S groups write only up to w3[1] -- confirm that
 * this difference is intentional.
 */
6278 static void switch_buffer_by_offset_le_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
/* ---- amd_bytealign_S implementation ---- */
6280 #if defined IS_AMD || defined IS_GENERIC
/* byte part of the offset (0..3) */
6281 const int offset_mod_4
= offset
& 3;
/* NOTE(review): uses the full offset, not offset % 4; presumably
   amd_bytealign_S only looks at the low bits of this argument -- confirm */
6283 const int offset_minus_4
= 4 - offset
;
/* group: data moves up by 0 whole words */
6288 w3
[2] = amd_bytealign_S ( 0, w3
[1], offset_minus_4
);
6289 w3
[1] = amd_bytealign_S (w3
[1], w3
[0], offset_minus_4
);
6290 w3
[0] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
6291 w2
[3] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
6292 w2
[2] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6293 w2
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6294 w2
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6295 w1
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6296 w1
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6297 w1
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6298 w1
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6299 w0
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6300 w0
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6301 w0
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6302 w0
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6304 if (offset_mod_4
== 0)
/* group: data moves up by 1 whole word */
6326 w3
[2] = amd_bytealign_S ( 0, w3
[0], offset_minus_4
);
6327 w3
[1] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
6328 w3
[0] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
6329 w2
[3] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6330 w2
[2] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6331 w2
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6332 w2
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6333 w1
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6334 w1
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6335 w1
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6336 w1
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6337 w0
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6338 w0
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6339 w0
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6342 if (offset_mod_4
== 0)
/* group: data moves up by 2 whole words */
6363 w3
[2] = amd_bytealign_S ( 0, w2
[3], offset_minus_4
);
6364 w3
[1] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
6365 w3
[0] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6366 w2
[3] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6367 w2
[2] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6368 w2
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6369 w2
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6370 w1
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6371 w1
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6372 w1
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6373 w1
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6374 w0
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6375 w0
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6379 if (offset_mod_4
== 0)
/* group: data moves up by 3 whole words */
6399 w3
[2] = amd_bytealign_S ( 0, w2
[2], offset_minus_4
);
6400 w3
[1] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6401 w3
[0] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6402 w2
[3] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6403 w2
[2] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6404 w2
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6405 w2
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6406 w1
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6407 w1
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6408 w1
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6409 w1
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6410 w0
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6415 if (offset_mod_4
== 0)
/* group: data moves up by 4 whole words */
6434 w3
[2] = amd_bytealign_S ( 0, w2
[1], offset_minus_4
);
6435 w3
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6436 w3
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6437 w2
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6438 w2
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6439 w2
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6440 w2
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6441 w1
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6442 w1
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6443 w1
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6444 w1
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6450 if (offset_mod_4
== 0)
/* group: data moves up by 5 whole words */
6468 w3
[2] = amd_bytealign_S ( 0, w2
[0], offset_minus_4
);
6469 w3
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6470 w3
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6471 w2
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6472 w2
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6473 w2
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6474 w2
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6475 w1
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6476 w1
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6477 w1
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6484 if (offset_mod_4
== 0)
/* group: data moves up by 6 whole words */
6501 w3
[2] = amd_bytealign_S ( 0, w1
[3], offset_minus_4
);
6502 w3
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6503 w3
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6504 w2
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6505 w2
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6506 w2
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6507 w2
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6508 w1
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6509 w1
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6517 if (offset_mod_4
== 0)
/* group: data moves up by 7 whole words */
6533 w3
[2] = amd_bytealign_S ( 0, w1
[2], offset_minus_4
);
6534 w3
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6535 w3
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6536 w2
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6537 w2
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6538 w2
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6539 w2
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6540 w1
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6549 if (offset_mod_4
== 0)
/* group: data moves up by 8 whole words */
6564 w3
[2] = amd_bytealign_S ( 0, w1
[1], offset_minus_4
);
6565 w3
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6566 w3
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6567 w2
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6568 w2
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6569 w2
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6570 w2
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6580 if (offset_mod_4
== 0)
/* group: data moves up by 9 whole words */
6594 w3
[2] = amd_bytealign_S ( 0, w1
[0], offset_minus_4
);
6595 w3
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6596 w3
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6597 w2
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6598 w2
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6599 w2
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6610 if (offset_mod_4
== 0)
/* group: data moves up by 10 whole words */
6623 w3
[2] = amd_bytealign_S ( 0, w0
[3], offset_minus_4
);
6624 w3
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6625 w3
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6626 w2
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6627 w2
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6639 if (offset_mod_4
== 0)
/* group: data moves up by 11 whole words */
6651 w3
[2] = amd_bytealign_S ( 0, w0
[2], offset_minus_4
);
6652 w3
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6653 w3
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6654 w2
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6667 if (offset_mod_4
== 0)
/* group: data moves up by 12 whole words */
6678 w3
[2] = amd_bytealign_S ( 0, w0
[1], offset_minus_4
);
6679 w3
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6680 w3
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6694 if (offset_mod_4
== 0)
/* group: data moves up by 13 whole words */
6704 w3
[2] = amd_bytealign_S ( 0, w0
[0], offset_minus_4
);
6705 w3
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6720 if (offset_mod_4
== 0)
/* ---- __byte_perm_S implementation (its preprocessor guard is not visible
   in this extract) ---- */
/* here offset_minus_4 is 4 minus the byte part of the offset, i.e. 1..4 */
6731 const int offset_minus_4
= 4 - (offset
% 4);
/* selector = four consecutive hex digits of 0x76543210, starting
   offset_minus_4 digits up from the lowest one */
6733 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
/* group: data moves up by 0 whole words */
6738 w3
[1] = __byte_perm_S (w3
[0], w3
[1], selector
);
6739 w3
[0] = __byte_perm_S (w2
[3], w3
[0], selector
);
6740 w2
[3] = __byte_perm_S (w2
[2], w2
[3], selector
);
6741 w2
[2] = __byte_perm_S (w2
[1], w2
[2], selector
);
6742 w2
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
6743 w2
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
6744 w1
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
6745 w1
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
6746 w1
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
6747 w1
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
6748 w0
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
6749 w0
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
6750 w0
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
6751 w0
[0] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 1 whole word */
6756 w3
[1] = __byte_perm_S (w2
[3], w3
[0], selector
);
6757 w3
[0] = __byte_perm_S (w2
[2], w2
[3], selector
);
6758 w2
[3] = __byte_perm_S (w2
[1], w2
[2], selector
);
6759 w2
[2] = __byte_perm_S (w2
[0], w2
[1], selector
);
6760 w2
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
6761 w2
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
6762 w1
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
6763 w1
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
6764 w1
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
6765 w1
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
6766 w0
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
6767 w0
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
6768 w0
[1] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 2 whole words */
6774 w3
[1] = __byte_perm_S (w2
[2], w2
[3], selector
);
6775 w3
[0] = __byte_perm_S (w2
[1], w2
[2], selector
);
6776 w2
[3] = __byte_perm_S (w2
[0], w2
[1], selector
);
6777 w2
[2] = __byte_perm_S (w1
[3], w2
[0], selector
);
6778 w2
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
6779 w2
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
6780 w1
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
6781 w1
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
6782 w1
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
6783 w1
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
6784 w0
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
6785 w0
[2] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 3 whole words */
6792 w3
[1] = __byte_perm_S (w2
[1], w2
[2], selector
);
6793 w3
[0] = __byte_perm_S (w2
[0], w2
[1], selector
);
6794 w2
[3] = __byte_perm_S (w1
[3], w2
[0], selector
);
6795 w2
[2] = __byte_perm_S (w1
[2], w1
[3], selector
);
6796 w2
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
6797 w2
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
6798 w1
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
6799 w1
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
6800 w1
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
6801 w1
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
6802 w0
[3] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 4 whole words */
6810 w3
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
6811 w3
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
6812 w2
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
6813 w2
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
6814 w2
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
6815 w2
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
6816 w1
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
6817 w1
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
6818 w1
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
6819 w1
[0] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 5 whole words */
6828 w3
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
6829 w3
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
6830 w2
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
6831 w2
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
6832 w2
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
6833 w2
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
6834 w1
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
6835 w1
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
6836 w1
[1] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 6 whole words */
6846 w3
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
6847 w3
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
6848 w2
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
6849 w2
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
6850 w2
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
6851 w2
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
6852 w1
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
6853 w1
[2] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 7 whole words */
6864 w3
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
6865 w3
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
6866 w2
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
6867 w2
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
6868 w2
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
6869 w2
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
6870 w1
[3] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 8 whole words */
6882 w3
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
6883 w3
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
6884 w2
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
6885 w2
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
6886 w2
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
6887 w2
[0] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 9 whole words */
6900 w3
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
6901 w3
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
6902 w2
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
6903 w2
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
6904 w2
[1] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 10 whole words */
6918 w3
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
6919 w3
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
6920 w2
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
6921 w2
[2] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 11 whole words */
6936 w3
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
6937 w3
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
6938 w2
[3] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 12 whole words */
6954 w3
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
6955 w3
[0] = __byte_perm_S ( 0, w0
[0], selector
);
/* group: data moves up by 13 whole words */
6972 w3
[1] = __byte_perm_S ( 0, w0
[0], selector
);
6992 static void switch_buffer_by_offset_be_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6994 #if defined IS_AMD || defined IS_GENERIC
6998 w3
[2] = amd_bytealign_S (w3
[1], 0, offset
);
6999 w3
[1] = amd_bytealign_S (w3
[0], w3
[1], offset
);
7000 w3
[0] = amd_bytealign_S (w2
[3], w3
[0], offset
);
7001 w2
[3] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7002 w2
[2] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7003 w2
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7004 w2
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7005 w1
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7006 w1
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7007 w1
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7008 w1
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7009 w0
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7010 w0
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7011 w0
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7012 w0
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7016 w3
[2] = amd_bytealign_S (w3
[0], 0, offset
);
7017 w3
[1] = amd_bytealign_S (w2
[3], w3
[0], offset
);
7018 w3
[0] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7019 w2
[3] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7020 w2
[2] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7021 w2
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7022 w2
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7023 w1
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7024 w1
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7025 w1
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7026 w1
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7027 w0
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7028 w0
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7029 w0
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7034 w3
[2] = amd_bytealign_S (w2
[3], 0, offset
);
7035 w3
[1] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7036 w3
[0] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7037 w2
[3] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7038 w2
[2] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7039 w2
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7040 w2
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7041 w1
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7042 w1
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7043 w1
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7044 w1
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7045 w0
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7046 w0
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7052 w3
[2] = amd_bytealign_S (w2
[2], 0, offset
);
7053 w3
[1] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7054 w3
[0] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7055 w2
[3] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7056 w2
[2] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7057 w2
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7058 w2
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7059 w1
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7060 w1
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7061 w1
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7062 w1
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7063 w0
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7070 w3
[2] = amd_bytealign_S (w2
[1], 0, offset
);
7071 w3
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7072 w3
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7073 w2
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7074 w2
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7075 w2
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7076 w2
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7077 w1
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7078 w1
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7079 w1
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7080 w1
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7088 w3
[2] = amd_bytealign_S (w2
[0], 0, offset
);
7089 w3
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7090 w3
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7091 w2
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7092 w2
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7093 w2
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7094 w2
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7095 w1
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7096 w1
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7097 w1
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7106 w3
[2] = amd_bytealign_S (w1
[3], 0, offset
);
7107 w3
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7108 w3
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7109 w2
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7110 w2
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7111 w2
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7112 w2
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7113 w1
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7114 w1
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7124 w3
[2] = amd_bytealign_S (w1
[2], 0, offset
);
7125 w3
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7126 w3
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7127 w2
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7128 w2
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7129 w2
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7130 w2
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7131 w1
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7142 w3
[2] = amd_bytealign_S (w1
[1], 0, offset
);
7143 w3
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7144 w3
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7145 w2
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7146 w2
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7147 w2
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7148 w2
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7160 w3
[2] = amd_bytealign_S (w1
[0], 0, offset
);
7161 w3
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7162 w3
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7163 w2
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7164 w2
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7165 w2
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7178 w3
[2] = amd_bytealign_S (w0
[3], 0, offset
);
7179 w3
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7180 w3
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7181 w2
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7182 w2
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7196 w3
[2] = amd_bytealign_S (w0
[2], 0, offset
);
7197 w3
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7198 w3
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7199 w2
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7214 w3
[2] = amd_bytealign_S (w0
[1], 0, offset
);
7215 w3
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7216 w3
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7232 w3
[2] = amd_bytealign_S (w0
[0], 0, offset
);
7233 w3
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7252 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
7257 w3
[1] = __byte_perm_S (w3
[1], w3
[0], selector
);
7258 w3
[0] = __byte_perm_S (w3
[0], w2
[3], selector
);
7259 w2
[3] = __byte_perm_S (w2
[3], w2
[2], selector
);
7260 w2
[2] = __byte_perm_S (w2
[2], w2
[1], selector
);
7261 w2
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
7262 w2
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
7263 w1
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
7264 w1
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
7265 w1
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
7266 w1
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
7267 w0
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
7268 w0
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
7269 w0
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
7270 w0
[0] = __byte_perm_S (w0
[0], 0, selector
);
7274 w3
[1] = __byte_perm_S (w3
[0], w2
[3], selector
);
7275 w3
[0] = __byte_perm_S (w2
[3], w2
[2], selector
);
7276 w2
[3] = __byte_perm_S (w2
[2], w2
[1], selector
);
7277 w2
[2] = __byte_perm_S (w2
[1], w2
[0], selector
);
7278 w2
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
7279 w2
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
7280 w1
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
7281 w1
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
7282 w1
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
7283 w1
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
7284 w0
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
7285 w0
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
7286 w0
[1] = __byte_perm_S (w0
[0], 0, selector
);
7291 w3
[1] = __byte_perm_S (w2
[3], w2
[2], selector
);
7292 w3
[0] = __byte_perm_S (w2
[2], w2
[1], selector
);
7293 w2
[3] = __byte_perm_S (w2
[1], w2
[0], selector
);
7294 w2
[2] = __byte_perm_S (w2
[0], w1
[3], selector
);
7295 w2
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
7296 w2
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
7297 w1
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
7298 w1
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
7299 w1
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
7300 w1
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
7301 w0
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
7302 w0
[2] = __byte_perm_S (w0
[0], 0, selector
);
7308 w3
[1] = __byte_perm_S (w2
[2], w2
[1], selector
);
7309 w3
[0] = __byte_perm_S (w2
[1], w2
[0], selector
);
7310 w2
[3] = __byte_perm_S (w2
[0], w1
[3], selector
);
7311 w2
[2] = __byte_perm_S (w1
[3], w1
[2], selector
);
7312 w2
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
7313 w2
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
7314 w1
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
7315 w1
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
7316 w1
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
7317 w1
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
7318 w0
[3] = __byte_perm_S (w0
[0], 0, selector
);
7325 w3
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
7326 w3
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
7327 w2
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
7328 w2
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
7329 w2
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
7330 w2
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
7331 w1
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
7332 w1
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
7333 w1
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
7334 w1
[0] = __byte_perm_S (w0
[0], 0, selector
);
7342 w3
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
7343 w3
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
7344 w2
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
7345 w2
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
7346 w2
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
7347 w2
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
7348 w1
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
7349 w1
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
7350 w1
[1] = __byte_perm_S (w0
[0], 0, selector
);
7359 w3
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
7360 w3
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
7361 w2
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
7362 w2
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
7363 w2
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
7364 w2
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
7365 w1
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
7366 w1
[2] = __byte_perm_S (w0
[0], 0, selector
);
7376 w3
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
7377 w3
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
7378 w2
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
7379 w2
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
7380 w2
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
7381 w2
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
7382 w1
[3] = __byte_perm_S (w0
[0], 0, selector
);
7393 w3
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
7394 w3
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
7395 w2
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
7396 w2
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
7397 w2
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
7398 w2
[0] = __byte_perm_S (w0
[0], 0, selector
);
7410 w3
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
7411 w3
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
7412 w2
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
7413 w2
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
7414 w2
[1] = __byte_perm_S (w0
[0], 0, selector
);
7427 w3
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
7428 w3
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
7429 w2
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
7430 w2
[2] = __byte_perm_S (w0
[0], 0, selector
);
7444 w3
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
7445 w3
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
7446 w2
[3] = __byte_perm_S (w0
[0], 0, selector
);
7461 w3
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
7462 w3
[0] = __byte_perm_S (w0
[0], 0, selector
);
7478 w3
[1] = __byte_perm_S (w0
[0], 0, selector
);