2 * Author......: Jens Steube <jens.steube@gmail.com>
7 * pure scalar functions
10 static int hash_comp (const u32 d1
[4], __global u32
*d2
)
12 if (d1
[3] > d2
[DGST_R3
]) return ( 1);
13 if (d1
[3] < d2
[DGST_R3
]) return (-1);
14 if (d1
[2] > d2
[DGST_R2
]) return ( 1);
15 if (d1
[2] < d2
[DGST_R2
]) return (-1);
16 if (d1
[1] > d2
[DGST_R1
]) return ( 1);
17 if (d1
[1] < d2
[DGST_R1
]) return (-1);
18 if (d1
[0] > d2
[DGST_R0
]) return ( 1);
19 if (d1
[0] < d2
[DGST_R0
]) return (-1);
24 static int find_hash (const u32 digest
[4], const u32 digests_cnt
, __global digest_t
*digests_buf
)
26 for (u32 l
= 0, r
= digests_cnt
; r
; r
>>= 1)
32 const int cmp
= hash_comp (digest
, digests_buf
[c
].digest_buf
);
41 if (cmp
== 0) return (c
);
47 static u32
check_bitmap (__global u32
*bitmap
, const u32 bitmap_mask
, const u32 bitmap_shift
, const u32 digest
)
49 return (bitmap
[(digest
>> bitmap_shift
) & bitmap_mask
] & (1 << (digest
& 0x1f)));
52 static u32
check (const u32 digest
[2], __global u32
*bitmap_s1_a
, __global u32
*bitmap_s1_b
, __global u32
*bitmap_s1_c
, __global u32
*bitmap_s1_d
, __global u32
*bitmap_s2_a
, __global u32
*bitmap_s2_b
, __global u32
*bitmap_s2_c
, __global u32
*bitmap_s2_d
, const u32 bitmap_mask
, const u32 bitmap_shift1
, const u32 bitmap_shift2
)
54 if (check_bitmap (bitmap_s1_a
, bitmap_mask
, bitmap_shift1
, digest
[0]) == 0) return (0);
55 if (check_bitmap (bitmap_s1_b
, bitmap_mask
, bitmap_shift1
, digest
[1]) == 0) return (0);
56 if (check_bitmap (bitmap_s1_c
, bitmap_mask
, bitmap_shift1
, digest
[2]) == 0) return (0);
57 if (check_bitmap (bitmap_s1_d
, bitmap_mask
, bitmap_shift1
, digest
[3]) == 0) return (0);
59 if (check_bitmap (bitmap_s2_a
, bitmap_mask
, bitmap_shift2
, digest
[0]) == 0) return (0);
60 if (check_bitmap (bitmap_s2_b
, bitmap_mask
, bitmap_shift2
, digest
[1]) == 0) return (0);
61 if (check_bitmap (bitmap_s2_c
, bitmap_mask
, bitmap_shift2
, digest
[2]) == 0) return (0);
62 if (check_bitmap (bitmap_s2_d
, bitmap_mask
, bitmap_shift2
, digest
[3]) == 0) return (0);
67 static void mark_hash (__global plain_t
*plains_buf
, __global u32
*hashes_shown
, const int hash_pos
, const u32 gid
, const u32 il_pos
)
69 hashes_shown
[hash_pos
] = 1;
71 plains_buf
[hash_pos
].gidvid
= (gid
* 1) + 0;
72 plains_buf
[hash_pos
].il_pos
= il_pos
;
79 static void truncate_block (u32x w
[4], const u32 len
)
88 case 1: w
[0] &= 0x000000FF;
93 case 2: w
[0] &= 0x0000FFFF;
98 case 3: w
[0] &= 0x00FFFFFF;
107 case 5: w
[1] &= 0x000000FF;
111 case 6: w
[1] &= 0x0000FFFF;
115 case 7: w
[1] &= 0x00FFFFFF;
122 case 9: w
[2] &= 0x000000FF;
125 case 10: w
[2] &= 0x0000FFFF;
128 case 11: w
[2] &= 0x00FFFFFF;
133 case 13: w
[3] &= 0x000000FF;
135 case 14: w
[3] &= 0x0000FFFF;
137 case 15: w
[3] &= 0x00FFFFFF;
142 static void make_unicode (const u32x in
[4], u32x out1
[4], u32x out2
[4])
145 out2
[3] = __byte_perm (in
[3], 0, 0x7372);
146 out2
[2] = __byte_perm (in
[3], 0, 0x7170);
147 out2
[1] = __byte_perm (in
[2], 0, 0x7372);
148 out2
[0] = __byte_perm (in
[2], 0, 0x7170);
149 out1
[3] = __byte_perm (in
[1], 0, 0x7372);
150 out1
[2] = __byte_perm (in
[1], 0, 0x7170);
151 out1
[1] = __byte_perm (in
[0], 0, 0x7372);
152 out1
[0] = __byte_perm (in
[0], 0, 0x7170);
155 #if defined IS_AMD || defined IS_GENERIC
156 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
157 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
158 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
159 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
160 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
161 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
162 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
163 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
167 static void undo_unicode (const u32x in1
[4], const u32x in2
[4], u32x out
[4])
170 out
[0] = __byte_perm (in1
[0], in1
[1], 0x6420);
171 out
[1] = __byte_perm (in1
[2], in1
[3], 0x6420);
172 out
[2] = __byte_perm (in2
[0], in2
[1], 0x6420);
173 out
[3] = __byte_perm (in2
[2], in2
[3], 0x6420);
176 #if defined IS_AMD || defined IS_GENERIC
177 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
178 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
179 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
180 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
181 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
182 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
183 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
184 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
188 static void append_0x01_1x4 (u32x w0
[4], const u32 offset
)
197 w0
[0] = w0
[0] | 0x0100;
201 w0
[0] = w0
[0] | 0x010000;
205 w0
[0] = w0
[0] | 0x01000000;
213 w0
[1] = w0
[1] | 0x0100;
217 w0
[1] = w0
[1] | 0x010000;
221 w0
[1] = w0
[1] | 0x01000000;
229 w0
[2] = w0
[2] | 0x0100;
233 w0
[2] = w0
[2] | 0x010000;
237 w0
[2] = w0
[2] | 0x01000000;
245 w0
[3] = w0
[3] | 0x0100;
249 w0
[3] = w0
[3] | 0x010000;
253 w0
[3] = w0
[3] | 0x01000000;
258 static void append_0x01_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
267 w0
[0] = w0
[0] | 0x0100;
271 w0
[0] = w0
[0] | 0x010000;
275 w0
[0] = w0
[0] | 0x01000000;
283 w0
[1] = w0
[1] | 0x0100;
287 w0
[1] = w0
[1] | 0x010000;
291 w0
[1] = w0
[1] | 0x01000000;
299 w0
[2] = w0
[2] | 0x0100;
303 w0
[2] = w0
[2] | 0x010000;
307 w0
[2] = w0
[2] | 0x01000000;
315 w0
[3] = w0
[3] | 0x0100;
319 w0
[3] = w0
[3] | 0x010000;
323 w0
[3] = w0
[3] | 0x01000000;
331 w1
[0] = w1
[0] | 0x0100;
335 w1
[0] = w1
[0] | 0x010000;
339 w1
[0] = w1
[0] | 0x01000000;
347 w1
[1] = w1
[1] | 0x0100;
351 w1
[1] = w1
[1] | 0x010000;
355 w1
[1] = w1
[1] | 0x01000000;
363 w1
[2] = w1
[2] | 0x0100;
367 w1
[2] = w1
[2] | 0x010000;
371 w1
[2] = w1
[2] | 0x01000000;
379 w1
[3] = w1
[3] | 0x0100;
383 w1
[3] = w1
[3] | 0x010000;
387 w1
[3] = w1
[3] | 0x01000000;
392 static void append_0x01_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
401 w0
[0] = w0
[0] | 0x0100;
405 w0
[0] = w0
[0] | 0x010000;
409 w0
[0] = w0
[0] | 0x01000000;
417 w0
[1] = w0
[1] | 0x0100;
421 w0
[1] = w0
[1] | 0x010000;
425 w0
[1] = w0
[1] | 0x01000000;
433 w0
[2] = w0
[2] | 0x0100;
437 w0
[2] = w0
[2] | 0x010000;
441 w0
[2] = w0
[2] | 0x01000000;
449 w0
[3] = w0
[3] | 0x0100;
453 w0
[3] = w0
[3] | 0x010000;
457 w0
[3] = w0
[3] | 0x01000000;
465 w1
[0] = w1
[0] | 0x0100;
469 w1
[0] = w1
[0] | 0x010000;
473 w1
[0] = w1
[0] | 0x01000000;
481 w1
[1] = w1
[1] | 0x0100;
485 w1
[1] = w1
[1] | 0x010000;
489 w1
[1] = w1
[1] | 0x01000000;
497 w1
[2] = w1
[2] | 0x0100;
501 w1
[2] = w1
[2] | 0x010000;
505 w1
[2] = w1
[2] | 0x01000000;
513 w1
[3] = w1
[3] | 0x0100;
517 w1
[3] = w1
[3] | 0x010000;
521 w1
[3] = w1
[3] | 0x01000000;
529 w2
[0] = w2
[0] | 0x0100;
533 w2
[0] = w2
[0] | 0x010000;
537 w2
[0] = w2
[0] | 0x01000000;
545 w2
[1] = w2
[1] | 0x0100;
549 w2
[1] = w2
[1] | 0x010000;
553 w2
[1] = w2
[1] | 0x01000000;
561 w2
[2] = w2
[2] | 0x0100;
565 w2
[2] = w2
[2] | 0x010000;
569 w2
[2] = w2
[2] | 0x01000000;
577 w2
[3] = w2
[3] | 0x0100;
581 w2
[3] = w2
[3] | 0x010000;
585 w2
[3] = w2
[3] | 0x01000000;
590 static void append_0x01_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
599 w0
[0] = w0
[0] | 0x0100;
603 w0
[0] = w0
[0] | 0x010000;
607 w0
[0] = w0
[0] | 0x01000000;
615 w0
[1] = w0
[1] | 0x0100;
619 w0
[1] = w0
[1] | 0x010000;
623 w0
[1] = w0
[1] | 0x01000000;
631 w0
[2] = w0
[2] | 0x0100;
635 w0
[2] = w0
[2] | 0x010000;
639 w0
[2] = w0
[2] | 0x01000000;
647 w0
[3] = w0
[3] | 0x0100;
651 w0
[3] = w0
[3] | 0x010000;
655 w0
[3] = w0
[3] | 0x01000000;
663 w1
[0] = w1
[0] | 0x0100;
667 w1
[0] = w1
[0] | 0x010000;
671 w1
[0] = w1
[0] | 0x01000000;
679 w1
[1] = w1
[1] | 0x0100;
683 w1
[1] = w1
[1] | 0x010000;
687 w1
[1] = w1
[1] | 0x01000000;
695 w1
[2] = w1
[2] | 0x0100;
699 w1
[2] = w1
[2] | 0x010000;
703 w1
[2] = w1
[2] | 0x01000000;
711 w1
[3] = w1
[3] | 0x0100;
715 w1
[3] = w1
[3] | 0x010000;
719 w1
[3] = w1
[3] | 0x01000000;
727 w2
[0] = w2
[0] | 0x0100;
731 w2
[0] = w2
[0] | 0x010000;
735 w2
[0] = w2
[0] | 0x01000000;
743 w2
[1] = w2
[1] | 0x0100;
747 w2
[1] = w2
[1] | 0x010000;
751 w2
[1] = w2
[1] | 0x01000000;
759 w2
[2] = w2
[2] | 0x0100;
763 w2
[2] = w2
[2] | 0x010000;
767 w2
[2] = w2
[2] | 0x01000000;
775 w2
[3] = w2
[3] | 0x0100;
779 w2
[3] = w2
[3] | 0x010000;
783 w2
[3] = w2
[3] | 0x01000000;
791 w3
[0] = w3
[0] | 0x0100;
795 w3
[0] = w3
[0] | 0x010000;
799 w3
[0] = w3
[0] | 0x01000000;
807 w3
[1] = w3
[1] | 0x0100;
811 w3
[1] = w3
[1] | 0x010000;
815 w3
[1] = w3
[1] | 0x01000000;
823 w3
[2] = w3
[2] | 0x0100;
827 w3
[2] = w3
[2] | 0x010000;
831 w3
[2] = w3
[2] | 0x01000000;
839 w3
[3] = w3
[3] | 0x0100;
843 w3
[3] = w3
[3] | 0x010000;
847 w3
[3] = w3
[3] | 0x01000000;
852 static void append_0x01_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
861 w0
[0] = w0
[0] | 0x0100;
865 w0
[0] = w0
[0] | 0x010000;
869 w0
[0] = w0
[0] | 0x01000000;
877 w0
[1] = w0
[1] | 0x0100;
881 w0
[1] = w0
[1] | 0x010000;
885 w0
[1] = w0
[1] | 0x01000000;
893 w0
[2] = w0
[2] | 0x0100;
897 w0
[2] = w0
[2] | 0x010000;
901 w0
[2] = w0
[2] | 0x01000000;
909 w0
[3] = w0
[3] | 0x0100;
913 w0
[3] = w0
[3] | 0x010000;
917 w0
[3] = w0
[3] | 0x01000000;
925 w1
[0] = w1
[0] | 0x0100;
929 w1
[0] = w1
[0] | 0x010000;
933 w1
[0] = w1
[0] | 0x01000000;
941 w1
[1] = w1
[1] | 0x0100;
945 w1
[1] = w1
[1] | 0x010000;
949 w1
[1] = w1
[1] | 0x01000000;
957 w1
[2] = w1
[2] | 0x0100;
961 w1
[2] = w1
[2] | 0x010000;
965 w1
[2] = w1
[2] | 0x01000000;
973 w1
[3] = w1
[3] | 0x0100;
977 w1
[3] = w1
[3] | 0x010000;
981 w1
[3] = w1
[3] | 0x01000000;
989 w2
[0] = w2
[0] | 0x0100;
993 w2
[0] = w2
[0] | 0x010000;
997 w2
[0] = w2
[0] | 0x01000000;
1005 w2
[1] = w2
[1] | 0x0100;
1009 w2
[1] = w2
[1] | 0x010000;
1013 w2
[1] = w2
[1] | 0x01000000;
1021 w2
[2] = w2
[2] | 0x0100;
1025 w2
[2] = w2
[2] | 0x010000;
1029 w2
[2] = w2
[2] | 0x01000000;
1037 w2
[3] = w2
[3] | 0x0100;
1041 w2
[3] = w2
[3] | 0x010000;
1045 w2
[3] = w2
[3] | 0x01000000;
1053 w3
[0] = w3
[0] | 0x0100;
1057 w3
[0] = w3
[0] | 0x010000;
1061 w3
[0] = w3
[0] | 0x01000000;
1069 w3
[1] = w3
[1] | 0x0100;
1073 w3
[1] = w3
[1] | 0x010000;
1077 w3
[1] = w3
[1] | 0x01000000;
1085 w3
[2] = w3
[2] | 0x0100;
1089 w3
[2] = w3
[2] | 0x010000;
1093 w3
[2] = w3
[2] | 0x01000000;
1101 w3
[3] = w3
[3] | 0x0100;
1105 w3
[3] = w3
[3] | 0x010000;
1109 w3
[3] = w3
[3] | 0x01000000;
1117 w4
[0] = w4
[0] | 0x0100;
1121 w4
[0] = w4
[0] | 0x010000;
1125 w4
[0] = w4
[0] | 0x01000000;
1133 w4
[1] = w4
[1] | 0x0100;
1137 w4
[1] = w4
[1] | 0x010000;
1141 w4
[1] = w4
[1] | 0x01000000;
1149 w4
[2] = w4
[2] | 0x0100;
1153 w4
[2] = w4
[2] | 0x010000;
1157 w4
[2] = w4
[2] | 0x01000000;
1165 w4
[3] = w4
[3] | 0x0100;
1169 w4
[3] = w4
[3] | 0x010000;
1173 w4
[3] = w4
[3] | 0x01000000;
1181 w5
[0] = w5
[0] | 0x0100;
1185 w5
[0] = w5
[0] | 0x010000;
1189 w5
[0] = w5
[0] | 0x01000000;
1197 w5
[1] = w5
[1] | 0x0100;
1201 w5
[1] = w5
[1] | 0x010000;
1205 w5
[1] = w5
[1] | 0x01000000;
1213 w5
[2] = w5
[2] | 0x0100;
1217 w5
[2] = w5
[2] | 0x010000;
1221 w5
[2] = w5
[2] | 0x01000000;
1229 w5
[3] = w5
[3] | 0x0100;
1233 w5
[3] = w5
[3] | 0x010000;
1237 w5
[3] = w5
[3] | 0x01000000;
1245 w6
[0] = w6
[0] | 0x0100;
1249 w6
[0] = w6
[0] | 0x010000;
1253 w6
[0] = w6
[0] | 0x01000000;
1261 w6
[1] = w6
[1] | 0x0100;
1265 w6
[1] = w6
[1] | 0x010000;
1269 w6
[1] = w6
[1] | 0x01000000;
1277 w6
[2] = w6
[2] | 0x0100;
1281 w6
[2] = w6
[2] | 0x010000;
1285 w6
[2] = w6
[2] | 0x01000000;
1293 w6
[3] = w6
[3] | 0x0100;
1297 w6
[3] = w6
[3] | 0x010000;
1301 w6
[3] = w6
[3] | 0x01000000;
1309 w7
[0] = w7
[0] | 0x0100;
1313 w7
[0] = w7
[0] | 0x010000;
1317 w7
[0] = w7
[0] | 0x01000000;
1325 w7
[1] = w7
[1] | 0x0100;
1329 w7
[1] = w7
[1] | 0x010000;
1333 w7
[1] = w7
[1] | 0x01000000;
1341 w7
[2] = w7
[2] | 0x0100;
1345 w7
[2] = w7
[2] | 0x010000;
1349 w7
[2] = w7
[2] | 0x01000000;
1357 w7
[3] = w7
[3] | 0x0100;
1361 w7
[3] = w7
[3] | 0x010000;
1365 w7
[3] = w7
[3] | 0x01000000;
1370 static void append_0x02_1x4 (u32x w0
[4], const u32 offset
)
1379 w0
[0] = w0
[0] | 0x0200;
1383 w0
[0] = w0
[0] | 0x020000;
1387 w0
[0] = w0
[0] | 0x02000000;
1395 w0
[1] = w0
[1] | 0x0200;
1399 w0
[1] = w0
[1] | 0x020000;
1403 w0
[1] = w0
[1] | 0x02000000;
1411 w0
[2] = w0
[2] | 0x0200;
1415 w0
[2] = w0
[2] | 0x020000;
1419 w0
[2] = w0
[2] | 0x02000000;
1427 w0
[3] = w0
[3] | 0x0200;
1431 w0
[3] = w0
[3] | 0x020000;
1435 w0
[3] = w0
[3] | 0x02000000;
1440 static void append_0x02_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
1449 w0
[0] = w0
[0] | 0x0200;
1453 w0
[0] = w0
[0] | 0x020000;
1457 w0
[0] = w0
[0] | 0x02000000;
1465 w0
[1] = w0
[1] | 0x0200;
1469 w0
[1] = w0
[1] | 0x020000;
1473 w0
[1] = w0
[1] | 0x02000000;
1481 w0
[2] = w0
[2] | 0x0200;
1485 w0
[2] = w0
[2] | 0x020000;
1489 w0
[2] = w0
[2] | 0x02000000;
1497 w0
[3] = w0
[3] | 0x0200;
1501 w0
[3] = w0
[3] | 0x020000;
1505 w0
[3] = w0
[3] | 0x02000000;
1513 w1
[0] = w1
[0] | 0x0200;
1517 w1
[0] = w1
[0] | 0x020000;
1521 w1
[0] = w1
[0] | 0x02000000;
1529 w1
[1] = w1
[1] | 0x0200;
1533 w1
[1] = w1
[1] | 0x020000;
1537 w1
[1] = w1
[1] | 0x02000000;
1545 w1
[2] = w1
[2] | 0x0200;
1549 w1
[2] = w1
[2] | 0x020000;
1553 w1
[2] = w1
[2] | 0x02000000;
1561 w1
[3] = w1
[3] | 0x0200;
1565 w1
[3] = w1
[3] | 0x020000;
1569 w1
[3] = w1
[3] | 0x02000000;
1574 static void append_0x02_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
1583 w0
[0] = w0
[0] | 0x0200;
1587 w0
[0] = w0
[0] | 0x020000;
1591 w0
[0] = w0
[0] | 0x02000000;
1599 w0
[1] = w0
[1] | 0x0200;
1603 w0
[1] = w0
[1] | 0x020000;
1607 w0
[1] = w0
[1] | 0x02000000;
1615 w0
[2] = w0
[2] | 0x0200;
1619 w0
[2] = w0
[2] | 0x020000;
1623 w0
[2] = w0
[2] | 0x02000000;
1631 w0
[3] = w0
[3] | 0x0200;
1635 w0
[3] = w0
[3] | 0x020000;
1639 w0
[3] = w0
[3] | 0x02000000;
1647 w1
[0] = w1
[0] | 0x0200;
1651 w1
[0] = w1
[0] | 0x020000;
1655 w1
[0] = w1
[0] | 0x02000000;
1663 w1
[1] = w1
[1] | 0x0200;
1667 w1
[1] = w1
[1] | 0x020000;
1671 w1
[1] = w1
[1] | 0x02000000;
1679 w1
[2] = w1
[2] | 0x0200;
1683 w1
[2] = w1
[2] | 0x020000;
1687 w1
[2] = w1
[2] | 0x02000000;
1695 w1
[3] = w1
[3] | 0x0200;
1699 w1
[3] = w1
[3] | 0x020000;
1703 w1
[3] = w1
[3] | 0x02000000;
1711 w2
[0] = w2
[0] | 0x0200;
1715 w2
[0] = w2
[0] | 0x020000;
1719 w2
[0] = w2
[0] | 0x02000000;
1727 w2
[1] = w2
[1] | 0x0200;
1731 w2
[1] = w2
[1] | 0x020000;
1735 w2
[1] = w2
[1] | 0x02000000;
1743 w2
[2] = w2
[2] | 0x0200;
1747 w2
[2] = w2
[2] | 0x020000;
1751 w2
[2] = w2
[2] | 0x02000000;
1759 w2
[3] = w2
[3] | 0x0200;
1763 w2
[3] = w2
[3] | 0x020000;
1767 w2
[3] = w2
[3] | 0x02000000;
1772 static void append_0x02_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
1781 w0
[0] = w0
[0] | 0x0200;
1785 w0
[0] = w0
[0] | 0x020000;
1789 w0
[0] = w0
[0] | 0x02000000;
1797 w0
[1] = w0
[1] | 0x0200;
1801 w0
[1] = w0
[1] | 0x020000;
1805 w0
[1] = w0
[1] | 0x02000000;
1813 w0
[2] = w0
[2] | 0x0200;
1817 w0
[2] = w0
[2] | 0x020000;
1821 w0
[2] = w0
[2] | 0x02000000;
1829 w0
[3] = w0
[3] | 0x0200;
1833 w0
[3] = w0
[3] | 0x020000;
1837 w0
[3] = w0
[3] | 0x02000000;
1845 w1
[0] = w1
[0] | 0x0200;
1849 w1
[0] = w1
[0] | 0x020000;
1853 w1
[0] = w1
[0] | 0x02000000;
1861 w1
[1] = w1
[1] | 0x0200;
1865 w1
[1] = w1
[1] | 0x020000;
1869 w1
[1] = w1
[1] | 0x02000000;
1877 w1
[2] = w1
[2] | 0x0200;
1881 w1
[2] = w1
[2] | 0x020000;
1885 w1
[2] = w1
[2] | 0x02000000;
1893 w1
[3] = w1
[3] | 0x0200;
1897 w1
[3] = w1
[3] | 0x020000;
1901 w1
[3] = w1
[3] | 0x02000000;
1909 w2
[0] = w2
[0] | 0x0200;
1913 w2
[0] = w2
[0] | 0x020000;
1917 w2
[0] = w2
[0] | 0x02000000;
1925 w2
[1] = w2
[1] | 0x0200;
1929 w2
[1] = w2
[1] | 0x020000;
1933 w2
[1] = w2
[1] | 0x02000000;
1941 w2
[2] = w2
[2] | 0x0200;
1945 w2
[2] = w2
[2] | 0x020000;
1949 w2
[2] = w2
[2] | 0x02000000;
1957 w2
[3] = w2
[3] | 0x0200;
1961 w2
[3] = w2
[3] | 0x020000;
1965 w2
[3] = w2
[3] | 0x02000000;
1973 w3
[0] = w3
[0] | 0x0200;
1977 w3
[0] = w3
[0] | 0x020000;
1981 w3
[0] = w3
[0] | 0x02000000;
1989 w3
[1] = w3
[1] | 0x0200;
1993 w3
[1] = w3
[1] | 0x020000;
1997 w3
[1] = w3
[1] | 0x02000000;
2005 w3
[2] = w3
[2] | 0x0200;
2009 w3
[2] = w3
[2] | 0x020000;
2013 w3
[2] = w3
[2] | 0x02000000;
2021 w3
[3] = w3
[3] | 0x0200;
2025 w3
[3] = w3
[3] | 0x020000;
2029 w3
[3] = w3
[3] | 0x02000000;
2034 static void append_0x02_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
2043 w0
[0] = w0
[0] | 0x0200;
2047 w0
[0] = w0
[0] | 0x020000;
2051 w0
[0] = w0
[0] | 0x02000000;
2059 w0
[1] = w0
[1] | 0x0200;
2063 w0
[1] = w0
[1] | 0x020000;
2067 w0
[1] = w0
[1] | 0x02000000;
2075 w0
[2] = w0
[2] | 0x0200;
2079 w0
[2] = w0
[2] | 0x020000;
2083 w0
[2] = w0
[2] | 0x02000000;
2091 w0
[3] = w0
[3] | 0x0200;
2095 w0
[3] = w0
[3] | 0x020000;
2099 w0
[3] = w0
[3] | 0x02000000;
2107 w1
[0] = w1
[0] | 0x0200;
2111 w1
[0] = w1
[0] | 0x020000;
2115 w1
[0] = w1
[0] | 0x02000000;
2123 w1
[1] = w1
[1] | 0x0200;
2127 w1
[1] = w1
[1] | 0x020000;
2131 w1
[1] = w1
[1] | 0x02000000;
2139 w1
[2] = w1
[2] | 0x0200;
2143 w1
[2] = w1
[2] | 0x020000;
2147 w1
[2] = w1
[2] | 0x02000000;
2155 w1
[3] = w1
[3] | 0x0200;
2159 w1
[3] = w1
[3] | 0x020000;
2163 w1
[3] = w1
[3] | 0x02000000;
2171 w2
[0] = w2
[0] | 0x0200;
2175 w2
[0] = w2
[0] | 0x020000;
2179 w2
[0] = w2
[0] | 0x02000000;
2187 w2
[1] = w2
[1] | 0x0200;
2191 w2
[1] = w2
[1] | 0x020000;
2195 w2
[1] = w2
[1] | 0x02000000;
2203 w2
[2] = w2
[2] | 0x0200;
2207 w2
[2] = w2
[2] | 0x020000;
2211 w2
[2] = w2
[2] | 0x02000000;
2219 w2
[3] = w2
[3] | 0x0200;
2223 w2
[3] = w2
[3] | 0x020000;
2227 w2
[3] = w2
[3] | 0x02000000;
2235 w3
[0] = w3
[0] | 0x0200;
2239 w3
[0] = w3
[0] | 0x020000;
2243 w3
[0] = w3
[0] | 0x02000000;
2251 w3
[1] = w3
[1] | 0x0200;
2255 w3
[1] = w3
[1] | 0x020000;
2259 w3
[1] = w3
[1] | 0x02000000;
2267 w3
[2] = w3
[2] | 0x0200;
2271 w3
[2] = w3
[2] | 0x020000;
2275 w3
[2] = w3
[2] | 0x02000000;
2283 w3
[3] = w3
[3] | 0x0200;
2287 w3
[3] = w3
[3] | 0x020000;
2291 w3
[3] = w3
[3] | 0x02000000;
2299 w4
[0] = w4
[0] | 0x0200;
2303 w4
[0] = w4
[0] | 0x020000;
2307 w4
[0] = w4
[0] | 0x02000000;
2315 w4
[1] = w4
[1] | 0x0200;
2319 w4
[1] = w4
[1] | 0x020000;
2323 w4
[1] = w4
[1] | 0x02000000;
2331 w4
[2] = w4
[2] | 0x0200;
2335 w4
[2] = w4
[2] | 0x020000;
2339 w4
[2] = w4
[2] | 0x02000000;
2347 w4
[3] = w4
[3] | 0x0200;
2351 w4
[3] = w4
[3] | 0x020000;
2355 w4
[3] = w4
[3] | 0x02000000;
2363 w5
[0] = w5
[0] | 0x0200;
2367 w5
[0] = w5
[0] | 0x020000;
2371 w5
[0] = w5
[0] | 0x02000000;
2379 w5
[1] = w5
[1] | 0x0200;
2383 w5
[1] = w5
[1] | 0x020000;
2387 w5
[1] = w5
[1] | 0x02000000;
2395 w5
[2] = w5
[2] | 0x0200;
2399 w5
[2] = w5
[2] | 0x020000;
2403 w5
[2] = w5
[2] | 0x02000000;
2411 w5
[3] = w5
[3] | 0x0200;
2415 w5
[3] = w5
[3] | 0x020000;
2419 w5
[3] = w5
[3] | 0x02000000;
2427 w6
[0] = w6
[0] | 0x0200;
2431 w6
[0] = w6
[0] | 0x020000;
2435 w6
[0] = w6
[0] | 0x02000000;
2443 w6
[1] = w6
[1] | 0x0200;
2447 w6
[1] = w6
[1] | 0x020000;
2451 w6
[1] = w6
[1] | 0x02000000;
2459 w6
[2] = w6
[2] | 0x0200;
2463 w6
[2] = w6
[2] | 0x020000;
2467 w6
[2] = w6
[2] | 0x02000000;
2475 w6
[3] = w6
[3] | 0x0200;
2479 w6
[3] = w6
[3] | 0x020000;
2483 w6
[3] = w6
[3] | 0x02000000;
2491 w7
[0] = w7
[0] | 0x0200;
2495 w7
[0] = w7
[0] | 0x020000;
2499 w7
[0] = w7
[0] | 0x02000000;
2507 w7
[1] = w7
[1] | 0x0200;
2511 w7
[1] = w7
[1] | 0x020000;
2515 w7
[1] = w7
[1] | 0x02000000;
2523 w7
[2] = w7
[2] | 0x0200;
2527 w7
[2] = w7
[2] | 0x020000;
2531 w7
[2] = w7
[2] | 0x02000000;
2539 w7
[3] = w7
[3] | 0x0200;
2543 w7
[3] = w7
[3] | 0x020000;
2547 w7
[3] = w7
[3] | 0x02000000;
2552 static void append_0x80_1x4 (u32x w0
[4], const u32 offset
)
2561 w0
[0] = w0
[0] | 0x8000;
2565 w0
[0] = w0
[0] | 0x800000;
2569 w0
[0] = w0
[0] | 0x80000000;
2577 w0
[1] = w0
[1] | 0x8000;
2581 w0
[1] = w0
[1] | 0x800000;
2585 w0
[1] = w0
[1] | 0x80000000;
2593 w0
[2] = w0
[2] | 0x8000;
2597 w0
[2] = w0
[2] | 0x800000;
2601 w0
[2] = w0
[2] | 0x80000000;
2609 w0
[3] = w0
[3] | 0x8000;
2613 w0
[3] = w0
[3] | 0x800000;
2617 w0
[3] = w0
[3] | 0x80000000;
2622 static void append_0x80_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
2631 w0
[0] = w0
[0] | 0x8000;
2635 w0
[0] = w0
[0] | 0x800000;
2639 w0
[0] = w0
[0] | 0x80000000;
2647 w0
[1] = w0
[1] | 0x8000;
2651 w0
[1] = w0
[1] | 0x800000;
2655 w0
[1] = w0
[1] | 0x80000000;
2663 w0
[2] = w0
[2] | 0x8000;
2667 w0
[2] = w0
[2] | 0x800000;
2671 w0
[2] = w0
[2] | 0x80000000;
2679 w0
[3] = w0
[3] | 0x8000;
2683 w0
[3] = w0
[3] | 0x800000;
2687 w0
[3] = w0
[3] | 0x80000000;
2695 w1
[0] = w1
[0] | 0x8000;
2699 w1
[0] = w1
[0] | 0x800000;
2703 w1
[0] = w1
[0] | 0x80000000;
2711 w1
[1] = w1
[1] | 0x8000;
2715 w1
[1] = w1
[1] | 0x800000;
2719 w1
[1] = w1
[1] | 0x80000000;
2727 w1
[2] = w1
[2] | 0x8000;
2731 w1
[2] = w1
[2] | 0x800000;
2735 w1
[2] = w1
[2] | 0x80000000;
2743 w1
[3] = w1
[3] | 0x8000;
2747 w1
[3] = w1
[3] | 0x800000;
2751 w1
[3] = w1
[3] | 0x80000000;
2756 static void append_0x80_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
2765 w0
[0] = w0
[0] | 0x8000;
2769 w0
[0] = w0
[0] | 0x800000;
2773 w0
[0] = w0
[0] | 0x80000000;
2781 w0
[1] = w0
[1] | 0x8000;
2785 w0
[1] = w0
[1] | 0x800000;
2789 w0
[1] = w0
[1] | 0x80000000;
2797 w0
[2] = w0
[2] | 0x8000;
2801 w0
[2] = w0
[2] | 0x800000;
2805 w0
[2] = w0
[2] | 0x80000000;
2813 w0
[3] = w0
[3] | 0x8000;
2817 w0
[3] = w0
[3] | 0x800000;
2821 w0
[3] = w0
[3] | 0x80000000;
2829 w1
[0] = w1
[0] | 0x8000;
2833 w1
[0] = w1
[0] | 0x800000;
2837 w1
[0] = w1
[0] | 0x80000000;
2845 w1
[1] = w1
[1] | 0x8000;
2849 w1
[1] = w1
[1] | 0x800000;
2853 w1
[1] = w1
[1] | 0x80000000;
2861 w1
[2] = w1
[2] | 0x8000;
2865 w1
[2] = w1
[2] | 0x800000;
2869 w1
[2] = w1
[2] | 0x80000000;
2877 w1
[3] = w1
[3] | 0x8000;
2881 w1
[3] = w1
[3] | 0x800000;
2885 w1
[3] = w1
[3] | 0x80000000;
2893 w2
[0] = w2
[0] | 0x8000;
2897 w2
[0] = w2
[0] | 0x800000;
2901 w2
[0] = w2
[0] | 0x80000000;
2909 w2
[1] = w2
[1] | 0x8000;
2913 w2
[1] = w2
[1] | 0x800000;
2917 w2
[1] = w2
[1] | 0x80000000;
2925 w2
[2] = w2
[2] | 0x8000;
2929 w2
[2] = w2
[2] | 0x800000;
2933 w2
[2] = w2
[2] | 0x80000000;
2941 w2
[3] = w2
[3] | 0x8000;
2945 w2
[3] = w2
[3] | 0x800000;
2949 w2
[3] = w2
[3] | 0x80000000;
2954 static void append_0x80_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
2963 w0
[0] = w0
[0] | 0x8000;
2967 w0
[0] = w0
[0] | 0x800000;
2971 w0
[0] = w0
[0] | 0x80000000;
2979 w0
[1] = w0
[1] | 0x8000;
2983 w0
[1] = w0
[1] | 0x800000;
2987 w0
[1] = w0
[1] | 0x80000000;
2995 w0
[2] = w0
[2] | 0x8000;
2999 w0
[2] = w0
[2] | 0x800000;
3003 w0
[2] = w0
[2] | 0x80000000;
3011 w0
[3] = w0
[3] | 0x8000;
3015 w0
[3] = w0
[3] | 0x800000;
3019 w0
[3] = w0
[3] | 0x80000000;
3027 w1
[0] = w1
[0] | 0x8000;
3031 w1
[0] = w1
[0] | 0x800000;
3035 w1
[0] = w1
[0] | 0x80000000;
3043 w1
[1] = w1
[1] | 0x8000;
3047 w1
[1] = w1
[1] | 0x800000;
3051 w1
[1] = w1
[1] | 0x80000000;
3059 w1
[2] = w1
[2] | 0x8000;
3063 w1
[2] = w1
[2] | 0x800000;
3067 w1
[2] = w1
[2] | 0x80000000;
3075 w1
[3] = w1
[3] | 0x8000;
3079 w1
[3] = w1
[3] | 0x800000;
3083 w1
[3] = w1
[3] | 0x80000000;
3091 w2
[0] = w2
[0] | 0x8000;
3095 w2
[0] = w2
[0] | 0x800000;
3099 w2
[0] = w2
[0] | 0x80000000;
3107 w2
[1] = w2
[1] | 0x8000;
3111 w2
[1] = w2
[1] | 0x800000;
3115 w2
[1] = w2
[1] | 0x80000000;
3123 w2
[2] = w2
[2] | 0x8000;
3127 w2
[2] = w2
[2] | 0x800000;
3131 w2
[2] = w2
[2] | 0x80000000;
3139 w2
[3] = w2
[3] | 0x8000;
3143 w2
[3] = w2
[3] | 0x800000;
3147 w2
[3] = w2
[3] | 0x80000000;
3155 w3
[0] = w3
[0] | 0x8000;
3159 w3
[0] = w3
[0] | 0x800000;
3163 w3
[0] = w3
[0] | 0x80000000;
3171 w3
[1] = w3
[1] | 0x8000;
3175 w3
[1] = w3
[1] | 0x800000;
3179 w3
[1] = w3
[1] | 0x80000000;
3187 w3
[2] = w3
[2] | 0x8000;
3191 w3
[2] = w3
[2] | 0x800000;
3195 w3
[2] = w3
[2] | 0x80000000;
3203 w3
[3] = w3
[3] | 0x8000;
3207 w3
[3] = w3
[3] | 0x800000;
3211 w3
[3] = w3
[3] | 0x80000000;
3216 static void append_0x80_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
3225 w0
[0] = w0
[0] | 0x8000;
3229 w0
[0] = w0
[0] | 0x800000;
3233 w0
[0] = w0
[0] | 0x80000000;
3241 w0
[1] = w0
[1] | 0x8000;
3245 w0
[1] = w0
[1] | 0x800000;
3249 w0
[1] = w0
[1] | 0x80000000;
3257 w0
[2] = w0
[2] | 0x8000;
3261 w0
[2] = w0
[2] | 0x800000;
3265 w0
[2] = w0
[2] | 0x80000000;
3273 w0
[3] = w0
[3] | 0x8000;
3277 w0
[3] = w0
[3] | 0x800000;
3281 w0
[3] = w0
[3] | 0x80000000;
3289 w1
[0] = w1
[0] | 0x8000;
3293 w1
[0] = w1
[0] | 0x800000;
3297 w1
[0] = w1
[0] | 0x80000000;
3305 w1
[1] = w1
[1] | 0x8000;
3309 w1
[1] = w1
[1] | 0x800000;
3313 w1
[1] = w1
[1] | 0x80000000;
3321 w1
[2] = w1
[2] | 0x8000;
3325 w1
[2] = w1
[2] | 0x800000;
3329 w1
[2] = w1
[2] | 0x80000000;
3337 w1
[3] = w1
[3] | 0x8000;
3341 w1
[3] = w1
[3] | 0x800000;
3345 w1
[3] = w1
[3] | 0x80000000;
3353 w2
[0] = w2
[0] | 0x8000;
3357 w2
[0] = w2
[0] | 0x800000;
3361 w2
[0] = w2
[0] | 0x80000000;
3369 w2
[1] = w2
[1] | 0x8000;
3373 w2
[1] = w2
[1] | 0x800000;
3377 w2
[1] = w2
[1] | 0x80000000;
3385 w2
[2] = w2
[2] | 0x8000;
3389 w2
[2] = w2
[2] | 0x800000;
3393 w2
[2] = w2
[2] | 0x80000000;
3401 w2
[3] = w2
[3] | 0x8000;
3405 w2
[3] = w2
[3] | 0x800000;
3409 w2
[3] = w2
[3] | 0x80000000;
3417 w3
[0] = w3
[0] | 0x8000;
3421 w3
[0] = w3
[0] | 0x800000;
3425 w3
[0] = w3
[0] | 0x80000000;
3433 w3
[1] = w3
[1] | 0x8000;
3437 w3
[1] = w3
[1] | 0x800000;
3441 w3
[1] = w3
[1] | 0x80000000;
3449 w3
[2] = w3
[2] | 0x8000;
3453 w3
[2] = w3
[2] | 0x800000;
3457 w3
[2] = w3
[2] | 0x80000000;
3465 w3
[3] = w3
[3] | 0x8000;
3469 w3
[3] = w3
[3] | 0x800000;
3473 w3
[3] = w3
[3] | 0x80000000;
3481 w4
[0] = w4
[0] | 0x8000;
3485 w4
[0] = w4
[0] | 0x800000;
3489 w4
[0] = w4
[0] | 0x80000000;
3497 w4
[1] = w4
[1] | 0x8000;
3501 w4
[1] = w4
[1] | 0x800000;
3505 w4
[1] = w4
[1] | 0x80000000;
3513 w4
[2] = w4
[2] | 0x8000;
3517 w4
[2] = w4
[2] | 0x800000;
3521 w4
[2] = w4
[2] | 0x80000000;
3529 w4
[3] = w4
[3] | 0x8000;
3533 w4
[3] = w4
[3] | 0x800000;
3537 w4
[3] = w4
[3] | 0x80000000;
3545 w5
[0] = w5
[0] | 0x8000;
3549 w5
[0] = w5
[0] | 0x800000;
3553 w5
[0] = w5
[0] | 0x80000000;
3561 w5
[1] = w5
[1] | 0x8000;
3565 w5
[1] = w5
[1] | 0x800000;
3569 w5
[1] = w5
[1] | 0x80000000;
3577 w5
[2] = w5
[2] | 0x8000;
3581 w5
[2] = w5
[2] | 0x800000;
3585 w5
[2] = w5
[2] | 0x80000000;
3593 w5
[3] = w5
[3] | 0x8000;
3597 w5
[3] = w5
[3] | 0x800000;
3601 w5
[3] = w5
[3] | 0x80000000;
3609 w6
[0] = w6
[0] | 0x8000;
3613 w6
[0] = w6
[0] | 0x800000;
3617 w6
[0] = w6
[0] | 0x80000000;
3625 w6
[1] = w6
[1] | 0x8000;
3629 w6
[1] = w6
[1] | 0x800000;
3633 w6
[1] = w6
[1] | 0x80000000;
3641 w6
[2] = w6
[2] | 0x8000;
3645 w6
[2] = w6
[2] | 0x800000;
3649 w6
[2] = w6
[2] | 0x80000000;
3657 w6
[3] = w6
[3] | 0x8000;
3661 w6
[3] = w6
[3] | 0x800000;
3665 w6
[3] = w6
[3] | 0x80000000;
3673 w7
[0] = w7
[0] | 0x8000;
3677 w7
[0] = w7
[0] | 0x800000;
3681 w7
[0] = w7
[0] | 0x80000000;
3689 w7
[1] = w7
[1] | 0x8000;
3693 w7
[1] = w7
[1] | 0x800000;
3697 w7
[1] = w7
[1] | 0x80000000;
3705 w7
[2] = w7
[2] | 0x8000;
3709 w7
[2] = w7
[2] | 0x800000;
3713 w7
[2] = w7
[2] | 0x80000000;
3721 w7
[3] = w7
[3] | 0x8000;
3725 w7
[3] = w7
[3] | 0x800000;
3729 w7
[3] = w7
[3] | 0x80000000;
3734 static void append_0x80_1x16 (u32x w
[16], const u32 offset
)
3743 w
[ 0] = w
[ 0] | 0x8000;
3747 w
[ 0] = w
[ 0] | 0x800000;
3751 w
[ 0] = w
[ 0] | 0x80000000;
3759 w
[ 1] = w
[ 1] | 0x8000;
3763 w
[ 1] = w
[ 1] | 0x800000;
3767 w
[ 1] = w
[ 1] | 0x80000000;
3775 w
[ 2] = w
[ 2] | 0x8000;
3779 w
[ 2] = w
[ 2] | 0x800000;
3783 w
[ 2] = w
[ 2] | 0x80000000;
3791 w
[ 3] = w
[ 3] | 0x8000;
3795 w
[ 3] = w
[ 3] | 0x800000;
3799 w
[ 3] = w
[ 3] | 0x80000000;
3807 w
[ 4] = w
[ 4] | 0x8000;
3811 w
[ 4] = w
[ 4] | 0x800000;
3815 w
[ 4] = w
[ 4] | 0x80000000;
3823 w
[ 5] = w
[ 5] | 0x8000;
3827 w
[ 5] = w
[ 5] | 0x800000;
3831 w
[ 5] = w
[ 5] | 0x80000000;
3839 w
[ 6] = w
[ 6] | 0x8000;
3843 w
[ 6] = w
[ 6] | 0x800000;
3847 w
[ 6] = w
[ 6] | 0x80000000;
3855 w
[ 7] = w
[ 7] | 0x8000;
3859 w
[ 7] = w
[ 7] | 0x800000;
3863 w
[ 7] = w
[ 7] | 0x80000000;
3871 w
[ 8] = w
[ 8] | 0x8000;
3875 w
[ 8] = w
[ 8] | 0x800000;
3879 w
[ 8] = w
[ 8] | 0x80000000;
3887 w
[ 9] = w
[ 9] | 0x8000;
3891 w
[ 9] = w
[ 9] | 0x800000;
3895 w
[ 9] = w
[ 9] | 0x80000000;
3903 w
[10] = w
[10] | 0x8000;
3907 w
[10] = w
[10] | 0x800000;
3911 w
[10] = w
[10] | 0x80000000;
3919 w
[11] = w
[11] | 0x8000;
3923 w
[11] = w
[11] | 0x800000;
3927 w
[11] = w
[11] | 0x80000000;
3935 w
[12] = w
[12] | 0x8000;
3939 w
[12] = w
[12] | 0x800000;
3943 w
[12] = w
[12] | 0x80000000;
3951 w
[13] = w
[13] | 0x8000;
3955 w
[13] = w
[13] | 0x800000;
3959 w
[13] = w
[13] | 0x80000000;
3967 w
[14] = w
[14] | 0x8000;
3971 w
[14] = w
[14] | 0x800000;
3975 w
[14] = w
[14] | 0x80000000;
3983 w
[15] = w
[15] | 0x8000;
3987 w
[15] = w
[15] | 0x800000;
3991 w
[15] = w
[15] | 0x80000000;
3996 static void switch_buffer_by_offset_le (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
3998 #if defined IS_AMD || defined IS_GENERIC
3999 const int offset_mod_4
= offset
& 3;
4001 const int offset_minus_4
= 4 - offset
;
4006 w3
[2] = amd_bytealign ( 0, w3
[1], offset_minus_4
);
4007 w3
[1] = amd_bytealign (w3
[1], w3
[0], offset_minus_4
);
4008 w3
[0] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4009 w2
[3] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4010 w2
[2] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4011 w2
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4012 w2
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4013 w1
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4014 w1
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4015 w1
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4016 w1
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4017 w0
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4018 w0
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4019 w0
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4020 w0
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4022 if (offset_mod_4
== 0)
4044 w3
[2] = amd_bytealign ( 0, w3
[0], offset_minus_4
);
4045 w3
[1] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4046 w3
[0] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4047 w2
[3] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4048 w2
[2] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4049 w2
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4050 w2
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4051 w1
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4052 w1
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4053 w1
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4054 w1
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4055 w0
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4056 w0
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4057 w0
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4060 if (offset_mod_4
== 0)
4081 w3
[2] = amd_bytealign ( 0, w2
[3], offset_minus_4
);
4082 w3
[1] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4083 w3
[0] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4084 w2
[3] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4085 w2
[2] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4086 w2
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4087 w2
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4088 w1
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4089 w1
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4090 w1
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4091 w1
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4092 w0
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4093 w0
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4097 if (offset_mod_4
== 0)
4117 w3
[2] = amd_bytealign ( 0, w2
[2], offset_minus_4
);
4118 w3
[1] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4119 w3
[0] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4120 w2
[3] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4121 w2
[2] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4122 w2
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4123 w2
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4124 w1
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4125 w1
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4126 w1
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4127 w1
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4128 w0
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4133 if (offset_mod_4
== 0)
4152 w3
[2] = amd_bytealign ( 0, w2
[1], offset_minus_4
);
4153 w3
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4154 w3
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4155 w2
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4156 w2
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4157 w2
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4158 w2
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4159 w1
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4160 w1
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4161 w1
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4162 w1
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4168 if (offset_mod_4
== 0)
4186 w3
[2] = amd_bytealign ( 0, w2
[0], offset_minus_4
);
4187 w3
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4188 w3
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4189 w2
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4190 w2
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4191 w2
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4192 w2
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4193 w1
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4194 w1
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4195 w1
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4202 if (offset_mod_4
== 0)
4219 w3
[2] = amd_bytealign ( 0, w1
[3], offset_minus_4
);
4220 w3
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4221 w3
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4222 w2
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4223 w2
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4224 w2
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4225 w2
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4226 w1
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4227 w1
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4235 if (offset_mod_4
== 0)
4251 w3
[2] = amd_bytealign ( 0, w1
[2], offset_minus_4
);
4252 w3
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4253 w3
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4254 w2
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4255 w2
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4256 w2
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4257 w2
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4258 w1
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4267 if (offset_mod_4
== 0)
4282 w3
[2] = amd_bytealign ( 0, w1
[1], offset_minus_4
);
4283 w3
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4284 w3
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4285 w2
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4286 w2
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4287 w2
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4288 w2
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4298 if (offset_mod_4
== 0)
4312 w3
[2] = amd_bytealign ( 0, w1
[0], offset_minus_4
);
4313 w3
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4314 w3
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4315 w2
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4316 w2
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4317 w2
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4328 if (offset_mod_4
== 0)
4341 w3
[2] = amd_bytealign ( 0, w0
[3], offset_minus_4
);
4342 w3
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4343 w3
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4344 w2
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4345 w2
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4357 if (offset_mod_4
== 0)
4369 w3
[2] = amd_bytealign ( 0, w0
[2], offset_minus_4
);
4370 w3
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4371 w3
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4372 w2
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4385 if (offset_mod_4
== 0)
4396 w3
[2] = amd_bytealign ( 0, w0
[1], offset_minus_4
);
4397 w3
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4398 w3
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4412 if (offset_mod_4
== 0)
4422 w3
[2] = amd_bytealign ( 0, w0
[0], offset_minus_4
);
4423 w3
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4438 if (offset_mod_4
== 0)
4449 const int offset_minus_4
= 4 - (offset
% 4);
4451 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
4456 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
4457 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
4458 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
4459 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
4460 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4461 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4462 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4463 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4464 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4465 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4466 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4467 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4468 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4469 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
4474 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
4475 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
4476 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
4477 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
4478 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4479 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4480 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4481 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4482 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4483 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4484 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4485 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4486 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
4492 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
4493 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
4494 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
4495 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
4496 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4497 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4498 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4499 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4500 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4501 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4502 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4503 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
4510 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
4511 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
4512 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
4513 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
4514 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4515 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4516 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4517 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4518 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4519 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4520 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
4528 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4529 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4530 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4531 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4532 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4533 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4534 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4535 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4536 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4537 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
4546 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4547 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4548 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4549 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4550 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4551 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4552 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4553 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4554 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
4564 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4565 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4566 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4567 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4568 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4569 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4570 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4571 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
4582 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4583 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4584 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4585 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4586 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4587 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4588 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
4600 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4601 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4602 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4603 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4604 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4605 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
4618 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4619 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4620 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4621 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4622 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
4636 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4637 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4638 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4639 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
4654 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4655 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4656 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
4672 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4673 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
4690 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
// switch_buffer_by_offset_be
//
// Shifts the contents of the 16-word buffer w0[0..3], w1[0..3], w2[0..3],
// w3[0..3] towards higher word indices, in place. Every visible assignment
// rebuilds one destination word from two neighbouring source words. The
// groups of assignments below differ only in how many whole words lie between
// source and destination: 0 in the first group of each variant, 13 in the
// last one.
//
// Two variants are visible:
//   - after `#if defined IS_AMD || defined IS_GENERIC`: words are combined
//     with amd_bytealign (a, b, offset); in the visible lines the highest
//     word written is w3[2] and w3[3] is never assigned.
//   - after the `selector` definition: words are combined with
//     __byte_perm (a, b, selector); in the visible lines the highest word
//     written is w3[1].
//
// Parameters:
//   w0, w1, w2, w3  buffer words, read and overwritten in place
//   offset          shift distance; presumably a byte count -- TODO confirm
//                   against callers
//
// NOTE(review): the statements that choose between the groups (presumably a
// switch on offset / 4), any zero-fill of the vacated low words, and the
// preprocessor lines that close the first variant and open the second one are
// not visible in this listing -- confirm against the full file.
4710 static void switch_buffer_by_offset_be (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
// Variant 1: amd_bytealign based.
4712 #if defined IS_AMD || defined IS_GENERIC
4716 w3
[2] = amd_bytealign (w3
[1], 0, offset
);
4717 w3
[1] = amd_bytealign (w3
[0], w3
[1], offset
);
4718 w3
[0] = amd_bytealign (w2
[3], w3
[0], offset
);
4719 w2
[3] = amd_bytealign (w2
[2], w2
[3], offset
);
4720 w2
[2] = amd_bytealign (w2
[1], w2
[2], offset
);
4721 w2
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4722 w2
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4723 w1
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4724 w1
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4725 w1
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4726 w1
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4727 w0
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4728 w0
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4729 w0
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4730 w0
[0] = amd_bytealign ( 0, w0
[0], offset
);
4734 w3
[2] = amd_bytealign (w3
[0], 0, offset
);
4735 w3
[1] = amd_bytealign (w2
[3], w3
[0], offset
);
4736 w3
[0] = amd_bytealign (w2
[2], w2
[3], offset
);
4737 w2
[3] = amd_bytealign (w2
[1], w2
[2], offset
);
4738 w2
[2] = amd_bytealign (w2
[0], w2
[1], offset
);
4739 w2
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4740 w2
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4741 w1
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4742 w1
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4743 w1
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4744 w1
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4745 w0
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4746 w0
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4747 w0
[1] = amd_bytealign ( 0, w0
[0], offset
);
4752 w3
[2] = amd_bytealign (w2
[3], 0, offset
);
4753 w3
[1] = amd_bytealign (w2
[2], w2
[3], offset
);
4754 w3
[0] = amd_bytealign (w2
[1], w2
[2], offset
);
4755 w2
[3] = amd_bytealign (w2
[0], w2
[1], offset
);
4756 w2
[2] = amd_bytealign (w1
[3], w2
[0], offset
);
4757 w2
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4758 w2
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4759 w1
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4760 w1
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4761 w1
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4762 w1
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4763 w0
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4764 w0
[2] = amd_bytealign ( 0, w0
[0], offset
);
4770 w3
[2] = amd_bytealign (w2
[2], 0, offset
);
4771 w3
[1] = amd_bytealign (w2
[1], w2
[2], offset
);
4772 w3
[0] = amd_bytealign (w2
[0], w2
[1], offset
);
4773 w2
[3] = amd_bytealign (w1
[3], w2
[0], offset
);
4774 w2
[2] = amd_bytealign (w1
[2], w1
[3], offset
);
4775 w2
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4776 w2
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4777 w1
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4778 w1
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4779 w1
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4780 w1
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4781 w0
[3] = amd_bytealign ( 0, w0
[0], offset
);
4788 w3
[2] = amd_bytealign (w2
[1], 0, offset
);
4789 w3
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4790 w3
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4791 w2
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4792 w2
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4793 w2
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4794 w2
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4795 w1
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4796 w1
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4797 w1
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4798 w1
[0] = amd_bytealign ( 0, w0
[0], offset
);
4806 w3
[2] = amd_bytealign (w2
[0], 0, offset
);
4807 w3
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4808 w3
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4809 w2
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4810 w2
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4811 w2
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4812 w2
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4813 w1
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4814 w1
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4815 w1
[1] = amd_bytealign ( 0, w0
[0], offset
);
4824 w3
[2] = amd_bytealign (w1
[3], 0, offset
);
4825 w3
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4826 w3
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4827 w2
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4828 w2
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4829 w2
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4830 w2
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4831 w1
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4832 w1
[2] = amd_bytealign ( 0, w0
[0], offset
);
4842 w3
[2] = amd_bytealign (w1
[2], 0, offset
);
4843 w3
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4844 w3
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4845 w2
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4846 w2
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4847 w2
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4848 w2
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4849 w1
[3] = amd_bytealign ( 0, w0
[0], offset
);
4860 w3
[2] = amd_bytealign (w1
[1], 0, offset
);
4861 w3
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4862 w3
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4863 w2
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4864 w2
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4865 w2
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4866 w2
[0] = amd_bytealign ( 0, w0
[0], offset
);
4878 w3
[2] = amd_bytealign (w1
[0], 0, offset
);
4879 w3
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4880 w3
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4881 w2
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4882 w2
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4883 w2
[1] = amd_bytealign ( 0, w0
[0], offset
);
4896 w3
[2] = amd_bytealign (w0
[3], 0, offset
);
4897 w3
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4898 w3
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4899 w2
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4900 w2
[2] = amd_bytealign ( 0, w0
[0], offset
);
4914 w3
[2] = amd_bytealign (w0
[2], 0, offset
);
4915 w3
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4916 w3
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4917 w2
[3] = amd_bytealign ( 0, w0
[0], offset
);
4932 w3
[2] = amd_bytealign (w0
[1], 0, offset
);
4933 w3
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4934 w3
[0] = amd_bytealign ( 0, w0
[0], offset
);
4950 w3
[2] = amd_bytealign (w0
[0], 0, offset
);
4951 w3
[1] = amd_bytealign ( 0, w0
[0], offset
);
// Variant 2: __byte_perm based. The selector depends only on offset & 3 and
// evaluates to 0x3210, 0x4321, 0x5432 or 0x6543 for remainders 0, 1, 2, 3.
4970 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
4975 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
4976 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
4977 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
4978 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
4979 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
4980 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
4981 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
4982 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
4983 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
4984 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
4985 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
4986 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
4987 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
4988 w0
[0] = __byte_perm (w0
[0], 0, selector
);
4992 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
4993 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
4994 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
4995 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
4996 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
4997 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
4998 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
4999 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5000 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5001 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5002 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5003 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5004 w0
[1] = __byte_perm (w0
[0], 0, selector
);
5009 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
5010 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
5011 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
5012 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
5013 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5014 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5015 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5016 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5017 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5018 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5019 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5020 w0
[2] = __byte_perm (w0
[0], 0, selector
);
5026 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
5027 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
5028 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
5029 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
5030 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5031 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5032 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5033 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5034 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5035 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5036 w0
[3] = __byte_perm (w0
[0], 0, selector
);
5043 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
5044 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
5045 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
5046 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
5047 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5048 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5049 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5050 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5051 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5052 w1
[0] = __byte_perm (w0
[0], 0, selector
);
5060 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
5061 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
5062 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
5063 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5064 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5065 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5066 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5067 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5068 w1
[1] = __byte_perm (w0
[0], 0, selector
);
5077 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5078 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5079 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5080 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5081 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5082 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5083 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5084 w1
[2] = __byte_perm (w0
[0], 0, selector
);
5094 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5095 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5096 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5097 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5098 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5099 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5100 w1
[3] = __byte_perm (w0
[0], 0, selector
);
5111 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5112 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5113 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5114 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5115 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5116 w2
[0] = __byte_perm (w0
[0], 0, selector
);
5128 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5129 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5130 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5131 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5132 w2
[1] = __byte_perm (w0
[0], 0, selector
);
5145 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5146 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5147 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5148 w2
[2] = __byte_perm (w0
[0], 0, selector
);
5162 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5163 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5164 w2
[3] = __byte_perm (w0
[0], 0, selector
);
5179 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5180 w3
[0] = __byte_perm (w0
[0], 0, selector
);
5196 w3
[1] = __byte_perm (w0
[0], 0, selector
);
// overwrite_at_le
//
// Writes the word w0 into the word array sw at a position selected by
// salt_len, keeping the other bytes of the touched words. The mask/shift
// variant further down shows the layout: for salt_len 1 the low byte of sw[0]
// is kept and the rest comes from w0 << 8, while the low byte of sw[1] is
// replaced by w0 >> 24. So byte 0 of a word is its least significant byte.
// For the visible multiples of four (12, 16, 20, 24, 28) the word
// sw[salt_len / 4] is simply replaced by w0.
//
// Two variants are visible:
//   - after `#if defined cl_amd_media_ops`: built on amd_bytealign;
//     presumably yields the same words as the mask/shift variant -- verify
//     against the amd_bytealign definition.
//   - the plain mask/shift variant.
//
// Parameters:
//   sw        word array, modified in place; the visible cases touch sw[0..8]
//   w0        the word to insert
//   salt_len  position selector; visible case labels cover 1..31
//
// NOTE(review): the switch statement itself, the break statements, the
// labels for 0, 4 and 8, and the #else/#endif lines are not visible in this
// listing -- confirm against the full file.
5215 static void overwrite_at_le (u32x sw
[16], const u32x w0
, const u32 salt_len
)
// Variant 1: amd_bytealign based.
5217 #if defined cl_amd_media_ops
5222 case 1: sw
[0] = amd_bytealign (w0
, sw
[0] << 24, 3);
5223 sw
[1] = amd_bytealign (sw
[1] >> 8, w0
, 3);
5225 case 2: sw
[0] = amd_bytealign (w0
, sw
[0] << 16, 2);
5226 sw
[1] = amd_bytealign (sw
[1] >> 16, w0
, 2);
5228 case 3: sw
[0] = amd_bytealign (w0
, sw
[0] << 8, 1);
5229 sw
[1] = amd_bytealign (sw
[1] >> 24, w0
, 1);
5233 case 5: sw
[1] = amd_bytealign (w0
, sw
[1] << 24, 3);
5234 sw
[2] = amd_bytealign (sw
[2] >> 8, w0
, 3);
5236 case 6: sw
[1] = amd_bytealign (w0
, sw
[1] << 16, 2);
5237 sw
[2] = amd_bytealign (sw
[2] >> 16, w0
, 2);
5239 case 7: sw
[1] = amd_bytealign (w0
, sw
[1] << 8, 1);
5240 sw
[2] = amd_bytealign (sw
[2] >> 24, w0
, 1);
5244 case 9: sw
[2] = amd_bytealign (w0
, sw
[2] << 24, 3);
5245 sw
[3] = amd_bytealign (sw
[3] >> 8, w0
, 3);
5247 case 10: sw
[2] = amd_bytealign (w0
, sw
[2] << 16, 2);
5248 sw
[3] = amd_bytealign (sw
[3] >> 16, w0
, 2);
5250 case 11: sw
[2] = amd_bytealign (w0
, sw
[2] << 8, 1);
5251 sw
[3] = amd_bytealign (sw
[3] >> 24, w0
, 1);
5253 case 12: sw
[3] = w0
;
5255 case 13: sw
[3] = amd_bytealign (w0
, sw
[3] << 24, 3);
5256 sw
[4] = amd_bytealign (sw
[4] >> 8, w0
, 3);
5258 case 14: sw
[3] = amd_bytealign (w0
, sw
[3] << 16, 2);
5259 sw
[4] = amd_bytealign (sw
[4] >> 16, w0
, 2);
5261 case 15: sw
[3] = amd_bytealign (w0
, sw
[3] << 8, 1);
5262 sw
[4] = amd_bytealign (sw
[4] >> 24, w0
, 1);
5264 case 16: sw
[4] = w0
;
5266 case 17: sw
[4] = amd_bytealign (w0
, sw
[4] << 24, 3);
5267 sw
[5] = amd_bytealign (sw
[5] >> 8, w0
, 3);
5269 case 18: sw
[4] = amd_bytealign (w0
, sw
[4] << 16, 2);
5270 sw
[5] = amd_bytealign (sw
[5] >> 16, w0
, 2);
5272 case 19: sw
[4] = amd_bytealign (w0
, sw
[4] << 8, 1);
5273 sw
[5] = amd_bytealign (sw
[5] >> 24, w0
, 1);
5275 case 20: sw
[5] = w0
;
5277 case 21: sw
[5] = amd_bytealign (w0
, sw
[5] << 24, 3);
5278 sw
[6] = amd_bytealign (sw
[6] >> 8, w0
, 3);
5280 case 22: sw
[5] = amd_bytealign (w0
, sw
[5] << 16, 2);
5281 sw
[6] = amd_bytealign (sw
[6] >> 16, w0
, 2);
5283 case 23: sw
[5] = amd_bytealign (w0
, sw
[5] << 8, 1);
5284 sw
[6] = amd_bytealign (sw
[6] >> 24, w0
, 1);
5286 case 24: sw
[6] = w0
;
5288 case 25: sw
[6] = amd_bytealign (w0
, sw
[6] << 24, 3);
5289 sw
[7] = amd_bytealign (sw
[7] >> 8, w0
, 3);
5291 case 26: sw
[6] = amd_bytealign (w0
, sw
[6] << 16, 2);
5292 sw
[7] = amd_bytealign (sw
[7] >> 16, w0
, 2);
5294 case 27: sw
[6] = amd_bytealign (w0
, sw
[6] << 8, 1);
5295 sw
[7] = amd_bytealign (sw
[7] >> 24, w0
, 1);
5297 case 28: sw
[7] = w0
;
5299 case 29: sw
[7] = amd_bytealign (w0
, sw
[7] << 24, 3);
5300 sw
[8] = amd_bytealign (sw
[8] >> 8, w0
, 3);
5302 case 30: sw
[7] = amd_bytealign (w0
, sw
[7] << 16, 2);
5303 sw
[8] = amd_bytealign (sw
[8] >> 16, w0
, 2);
5305 case 31: sw
[7] = amd_bytealign (w0
, sw
[7] << 8, 1);
5306 sw
[8] = amd_bytealign (sw
[8] >> 24, w0
, 1);
// Variant 2: plain mask/shift. The mask keeps the bytes of sw that lie outside
// the four-byte window; the shifted w0 fills the window.
5314 case 1: sw
[0] = (sw
[0] & 0x000000ff) | (w0
<< 8);
5315 sw
[1] = (sw
[1] & 0xffffff00) | (w0
>> 24);
5317 case 2: sw
[0] = (sw
[0] & 0x0000ffff) | (w0
<< 16);
5318 sw
[1] = (sw
[1] & 0xffff0000) | (w0
>> 16);
5320 case 3: sw
[0] = (sw
[0] & 0x00ffffff) | (w0
<< 24);
5321 sw
[1] = (sw
[1] & 0xff000000) | (w0
>> 8);
5325 case 5: sw
[1] = (sw
[1] & 0x000000ff) | (w0
<< 8);
5326 sw
[2] = (sw
[2] & 0xffffff00) | (w0
>> 24);
5328 case 6: sw
[1] = (sw
[1] & 0x0000ffff) | (w0
<< 16);
5329 sw
[2] = (sw
[2] & 0xffff0000) | (w0
>> 16);
5331 case 7: sw
[1] = (sw
[1] & 0x00ffffff) | (w0
<< 24);
5332 sw
[2] = (sw
[2] & 0xff000000) | (w0
>> 8);
5336 case 9: sw
[2] = (sw
[2] & 0x000000ff) | (w0
<< 8);
5337 sw
[3] = (sw
[3] & 0xffffff00) | (w0
>> 24);
5339 case 10: sw
[2] = (sw
[2] & 0x0000ffff) | (w0
<< 16);
5340 sw
[3] = (sw
[3] & 0xffff0000) | (w0
>> 16);
5342 case 11: sw
[2] = (sw
[2] & 0x00ffffff) | (w0
<< 24);
5343 sw
[3] = (sw
[3] & 0xff000000) | (w0
>> 8);
5345 case 12: sw
[3] = w0
;
5347 case 13: sw
[3] = (sw
[3] & 0x000000ff) | (w0
<< 8);
5348 sw
[4] = (sw
[4] & 0xffffff00) | (w0
>> 24);
5350 case 14: sw
[3] = (sw
[3] & 0x0000ffff) | (w0
<< 16);
5351 sw
[4] = (sw
[4] & 0xffff0000) | (w0
>> 16);
5353 case 15: sw
[3] = (sw
[3] & 0x00ffffff) | (w0
<< 24);
5354 sw
[4] = (sw
[4] & 0xff000000) | (w0
>> 8);
5356 case 16: sw
[4] = w0
;
5358 case 17: sw
[4] = (sw
[4] & 0x000000ff) | (w0
<< 8);
5359 sw
[5] = (sw
[5] & 0xffffff00) | (w0
>> 24);
5361 case 18: sw
[4] = (sw
[4] & 0x0000ffff) | (w0
<< 16);
5362 sw
[5] = (sw
[5] & 0xffff0000) | (w0
>> 16);
5364 case 19: sw
[4] = (sw
[4] & 0x00ffffff) | (w0
<< 24);
5365 sw
[5] = (sw
[5] & 0xff000000) | (w0
>> 8);
5367 case 20: sw
[5] = w0
;
5369 case 21: sw
[5] = (sw
[5] & 0x000000ff) | (w0
<< 8);
5370 sw
[6] = (sw
[6] & 0xffffff00) | (w0
>> 24);
5372 case 22: sw
[5] = (sw
[5] & 0x0000ffff) | (w0
<< 16);
5373 sw
[6] = (sw
[6] & 0xffff0000) | (w0
>> 16);
5375 case 23: sw
[5] = (sw
[5] & 0x00ffffff) | (w0
<< 24);
5376 sw
[6] = (sw
[6] & 0xff000000) | (w0
>> 8);
5378 case 24: sw
[6] = w0
;
5380 case 25: sw
[6] = (sw
[6] & 0x000000ff) | (w0
<< 8);
5381 sw
[7] = (sw
[7] & 0xffffff00) | (w0
>> 24);
5383 case 26: sw
[6] = (sw
[6] & 0x0000ffff) | (w0
<< 16);
5384 sw
[7] = (sw
[7] & 0xffff0000) | (w0
>> 16);
5386 case 27: sw
[6] = (sw
[6] & 0x00ffffff) | (w0
<< 24);
5387 sw
[7] = (sw
[7] & 0xff000000) | (w0
>> 8);
5389 case 28: sw
[7] = w0
;
5391 case 29: sw
[7] = (sw
[7] & 0x000000ff) | (w0
<< 8);
5392 sw
[8] = (sw
[8] & 0xffffff00) | (w0
>> 24);
5394 case 30: sw
[7] = (sw
[7] & 0x0000ffff) | (w0
<< 16);
5395 sw
[8] = (sw
[8] & 0xffff0000) | (w0
>> 16);
5397 case 31: sw
[7] = (sw
[7] & 0x00ffffff) | (w0
<< 24);
5398 sw
[8] = (sw
[8] & 0xff000000) | (w0
>> 8);
// overwrite_at_be
//
// Writes the word w0 into the word array sw at a position selected by
// salt_len, keeping the other bytes of the touched words. The layout is the
// mirror image of overwrite_at_le: for salt_len 1 the most significant byte
// of sw[0] is kept and the rest comes from w0 >> 8, while the most
// significant byte of sw[1] is replaced by w0 << 24. So byte 0 of a word is
// its most significant byte. For the visible multiples of four (12, 16, 20,
// 24, 28) the word sw[salt_len / 4] is simply replaced by w0.
//
// Only a mask/shift implementation is visible here.
//
// Parameters:
//   sw        word array, modified in place; the visible cases touch sw[0..8]
//   w0        the word to insert
//   salt_len  position selector; visible case labels cover 1..31
//
// NOTE(review): the switch statement itself, the break statements and the
// labels for 0, 4 and 8 are not visible in this listing -- confirm against the
// full file.
5404 static void overwrite_at_be (u32x sw
[16], const u32x w0
, const u32 salt_len
)
5406 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5412 case 1: sw
[0] = (sw
[0] & 0xff000000) | (w0
>> 8);
5413 sw
[1] = (sw
[1] & 0x00ffffff) | (w0
<< 24);
5415 case 2: sw
[0] = (sw
[0] & 0xffff0000) | (w0
>> 16);
5416 sw
[1] = (sw
[1] & 0x0000ffff) | (w0
<< 16);
5418 case 3: sw
[0] = (sw
[0] & 0xffffff00) | (w0
>> 24);
5419 sw
[1] = (sw
[1] & 0x000000ff) | (w0
<< 8);
5423 case 5: sw
[1] = (sw
[1] & 0xff000000) | (w0
>> 8);
5424 sw
[2] = (sw
[2] & 0x00ffffff) | (w0
<< 24);
5426 case 6: sw
[1] = (sw
[1] & 0xffff0000) | (w0
>> 16);
5427 sw
[2] = (sw
[2] & 0x0000ffff) | (w0
<< 16);
5429 case 7: sw
[1] = (sw
[1] & 0xffffff00) | (w0
>> 24);
5430 sw
[2] = (sw
[2] & 0x000000ff) | (w0
<< 8);
5434 case 9: sw
[2] = (sw
[2] & 0xff000000) | (w0
>> 8);
5435 sw
[3] = (sw
[3] & 0x00ffffff) | (w0
<< 24);
5437 case 10: sw
[2] = (sw
[2] & 0xffff0000) | (w0
>> 16);
5438 sw
[3] = (sw
[3] & 0x0000ffff) | (w0
<< 16);
5440 case 11: sw
[2] = (sw
[2] & 0xffffff00) | (w0
>> 24);
5441 sw
[3] = (sw
[3] & 0x000000ff) | (w0
<< 8);
5443 case 12: sw
[3] = w0
;
5445 case 13: sw
[3] = (sw
[3] & 0xff000000) | (w0
>> 8);
5446 sw
[4] = (sw
[4] & 0x00ffffff) | (w0
<< 24);
5448 case 14: sw
[3] = (sw
[3] & 0xffff0000) | (w0
>> 16);
5449 sw
[4] = (sw
[4] & 0x0000ffff) | (w0
<< 16);
5451 case 15: sw
[3] = (sw
[3] & 0xffffff00) | (w0
>> 24);
5452 sw
[4] = (sw
[4] & 0x000000ff) | (w0
<< 8);
5454 case 16: sw
[4] = w0
;
5456 case 17: sw
[4] = (sw
[4] & 0xff000000) | (w0
>> 8);
5457 sw
[5] = (sw
[5] & 0x00ffffff) | (w0
<< 24);
5459 case 18: sw
[4] = (sw
[4] & 0xffff0000) | (w0
>> 16);
5460 sw
[5] = (sw
[5] & 0x0000ffff) | (w0
<< 16);
5462 case 19: sw
[4] = (sw
[4] & 0xffffff00) | (w0
>> 24);
5463 sw
[5] = (sw
[5] & 0x000000ff) | (w0
<< 8);
5465 case 20: sw
[5] = w0
;
5467 case 21: sw
[5] = (sw
[5] & 0xff000000) | (w0
>> 8);
5468 sw
[6] = (sw
[6] & 0x00ffffff) | (w0
<< 24);
5470 case 22: sw
[5] = (sw
[5] & 0xffff0000) | (w0
>> 16);
5471 sw
[6] = (sw
[6] & 0x0000ffff) | (w0
<< 16);
5473 case 23: sw
[5] = (sw
[5] & 0xffffff00) | (w0
>> 24);
5474 sw
[6] = (sw
[6] & 0x000000ff) | (w0
<< 8);
5476 case 24: sw
[6] = w0
;
5478 case 25: sw
[6] = (sw
[6] & 0xff000000) | (w0
>> 8);
5479 sw
[7] = (sw
[7] & 0x00ffffff) | (w0
<< 24);
5481 case 26: sw
[6] = (sw
[6] & 0xffff0000) | (w0
>> 16);
5482 sw
[7] = (sw
[7] & 0x0000ffff) | (w0
<< 16);
5484 case 27: sw
[6] = (sw
[6] & 0xffffff00) | (w0
>> 24);
5485 sw
[7] = (sw
[7] & 0x000000ff) | (w0
<< 8);
5487 case 28: sw
[7] = w0
;
5489 case 29: sw
[7] = (sw
[7] & 0xff000000) | (w0
>> 8);
5490 sw
[8] = (sw
[8] & 0x00ffffff) | (w0
<< 24);
5492 case 30: sw
[7] = (sw
[7] & 0xffff0000) | (w0
>> 16);
5493 sw
[8] = (sw
[8] & 0x0000ffff) | (w0
<< 16);
5495 case 31: sw
[7] = (sw
[7] & 0xffffff00) | (w0
>> 24);
5496 sw
[8] = (sw
[8] & 0x000000ff) | (w0
<< 8);
5501 static void overwrite_at_le_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x wx
, const u32 salt_len
)
5503 #if defined cl_amd_media_ops
5508 case 1: w0
[0] = amd_bytealign (wx
, w0
[0] << 24, 3);
5509 w0
[1] = amd_bytealign (w0
[1] >> 8, wx
, 3);
5511 case 2: w0
[0] = amd_bytealign (wx
, w0
[0] << 16, 2);
5512 w0
[1] = amd_bytealign (w0
[1] >> 16, wx
, 2);
5514 case 3: w0
[0] = amd_bytealign (wx
, w0
[0] << 8, 1);
5515 w0
[1] = amd_bytealign (w0
[1] >> 24, wx
, 1);
5519 case 5: w0
[1] = amd_bytealign (wx
, w0
[1] << 24, 3);
5520 w0
[2] = amd_bytealign (w0
[2] >> 8, wx
, 3);
5522 case 6: w0
[1] = amd_bytealign (wx
, w0
[1] << 16, 2);
5523 w0
[2] = amd_bytealign (w0
[2] >> 16, wx
, 2);
5525 case 7: w0
[1] = amd_bytealign (wx
, w0
[1] << 8, 1);
5526 w0
[2] = amd_bytealign (w0
[2] >> 24, wx
, 1);
5530 case 9: w0
[2] = amd_bytealign (wx
, w0
[2] << 24, 3);
5531 w0
[3] = amd_bytealign (w0
[3] >> 8, wx
, 3);
5533 case 10: w0
[2] = amd_bytealign (wx
, w0
[2] << 16, 2);
5534 w0
[3] = amd_bytealign (w0
[3] >> 16, wx
, 2);
5536 case 11: w0
[2] = amd_bytealign (wx
, w0
[2] << 8, 1);
5537 w0
[3] = amd_bytealign (w0
[3] >> 24, wx
, 1);
5539 case 12: w0
[3] = wx
;
5541 case 13: w0
[3] = amd_bytealign (wx
, w0
[3] << 24, 3);
5542 w1
[0] = amd_bytealign (w1
[0] >> 8, wx
, 3);
5544 case 14: w0
[3] = amd_bytealign (wx
, w0
[3] << 16, 2);
5545 w1
[0] = amd_bytealign (w1
[0] >> 16, wx
, 2);
5547 case 15: w0
[3] = amd_bytealign (wx
, w0
[3] << 8, 1);
5548 w1
[0] = amd_bytealign (w1
[0] >> 24, wx
, 1);
5550 case 16: w1
[0] = wx
;
5552 case 17: w1
[0] = amd_bytealign (wx
, w1
[0] << 24, 3);
5553 w1
[1] = amd_bytealign (w1
[1] >> 8, wx
, 3);
5555 case 18: w1
[0] = amd_bytealign (wx
, w1
[0] << 16, 2);
5556 w1
[1] = amd_bytealign (w1
[1] >> 16, wx
, 2);
5558 case 19: w1
[0] = amd_bytealign (wx
, w1
[0] << 8, 1);
5559 w1
[1] = amd_bytealign (w1
[1] >> 24, wx
, 1);
5561 case 20: w1
[1] = wx
;
5563 case 21: w1
[1] = amd_bytealign (wx
, w1
[1] << 24, 3);
5564 w1
[2] = amd_bytealign (w1
[2] >> 8, wx
, 3);
5566 case 22: w1
[1] = amd_bytealign (wx
, w1
[1] << 16, 2);
5567 w1
[2] = amd_bytealign (w1
[2] >> 16, wx
, 2);
5569 case 23: w1
[1] = amd_bytealign (wx
, w1
[1] << 8, 1);
5570 w1
[2] = amd_bytealign (w1
[2] >> 24, wx
, 1);
5572 case 24: w1
[2] = wx
;
5574 case 25: w1
[2] = amd_bytealign (wx
, w1
[2] << 24, 3);
5575 w1
[3] = amd_bytealign (w1
[3] >> 8, wx
, 3);
5577 case 26: w1
[2] = amd_bytealign (wx
, w1
[2] << 16, 2);
5578 w1
[3] = amd_bytealign (w1
[3] >> 16, wx
, 2);
5580 case 27: w1
[2] = amd_bytealign (wx
, w1
[2] << 8, 1);
5581 w1
[3] = amd_bytealign (w1
[3] >> 24, wx
, 1);
5583 case 28: w1
[3] = wx
;
5585 case 29: w1
[3] = amd_bytealign (wx
, w1
[3] << 24, 3);
5586 w2
[0] = amd_bytealign (w2
[0] >> 8, wx
, 3);
5588 case 30: w1
[3] = amd_bytealign (wx
, w1
[3] << 16, 2);
5589 w2
[0] = amd_bytealign (w2
[0] >> 16, wx
, 2);
5591 case 31: w1
[3] = amd_bytealign (wx
, w1
[3] << 8, 1);
5592 w2
[0] = amd_bytealign (w2
[0] >> 24, wx
, 1);
5594 case 32: w0
[0] = wx
;
5596 case 33: w2
[0] = amd_bytealign (wx
, w2
[0] << 24, 3);
5597 w2
[1] = amd_bytealign (w2
[1] >> 8, wx
, 3);
5599 case 34: w2
[0] = amd_bytealign (wx
, w2
[0] << 16, 2);
5600 w2
[1] = amd_bytealign (w2
[1] >> 16, wx
, 2);
5602 case 35: w2
[0] = amd_bytealign (wx
, w2
[0] << 8, 1);
5603 w2
[1] = amd_bytealign (w2
[1] >> 24, wx
, 1);
5605 case 36: w2
[1] = wx
;
5607 case 37: w2
[1] = amd_bytealign (wx
, w2
[1] << 24, 3);
5608 w2
[2] = amd_bytealign (w2
[2] >> 8, wx
, 3);
5610 case 38: w2
[1] = amd_bytealign (wx
, w2
[1] << 16, 2);
5611 w2
[2] = amd_bytealign (w2
[2] >> 16, wx
, 2);
5613 case 39: w2
[1] = amd_bytealign (wx
, w2
[1] << 8, 1);
5614 w2
[2] = amd_bytealign (w2
[2] >> 24, wx
, 1);
5616 case 40: w2
[2] = wx
;
5618 case 41: w2
[2] = amd_bytealign (wx
, w2
[2] << 24, 3);
5619 w2
[3] = amd_bytealign (w2
[3] >> 8, wx
, 3);
5621 case 42: w2
[2] = amd_bytealign (wx
, w2
[2] << 16, 2);
5622 w2
[3] = amd_bytealign (w2
[3] >> 16, wx
, 2);
5624 case 43: w2
[2] = amd_bytealign (wx
, w2
[2] << 8, 1);
5625 w2
[3] = amd_bytealign (w2
[3] >> 24, wx
, 1);
5627 case 44: w2
[3] = wx
;
5629 case 45: w2
[3] = amd_bytealign (wx
, w2
[3] << 24, 3);
5630 w3
[0] = amd_bytealign (w3
[0] >> 8, wx
, 3);
5632 case 46: w2
[3] = amd_bytealign (wx
, w2
[3] << 16, 2);
5633 w3
[0] = amd_bytealign (w3
[0] >> 16, wx
, 2);
5635 case 47: w2
[3] = amd_bytealign (wx
, w2
[3] << 8, 1);
5636 w3
[0] = amd_bytealign (w3
[0] >> 24, wx
, 1);
5638 case 48: w3
[0] = wx
;
5640 case 49: w3
[0] = amd_bytealign (wx
, w3
[0] << 24, 3);
5641 w3
[1] = amd_bytealign (w3
[1] >> 8, wx
, 3);
5643 case 50: w3
[0] = amd_bytealign (wx
, w3
[0] << 16, 2);
5644 w3
[1] = amd_bytealign (w3
[1] >> 16, wx
, 2);
5646 case 51: w3
[0] = amd_bytealign (wx
, w3
[0] << 8, 1);
5647 w3
[1] = amd_bytealign (w3
[1] >> 24, wx
, 1);
5649 case 52: w3
[1] = wx
;
5651 case 53: w3
[1] = amd_bytealign (wx
, w3
[1] << 24, 3);
5652 w3
[2] = amd_bytealign (w3
[2] >> 8, wx
, 3);
5654 case 54: w3
[1] = amd_bytealign (wx
, w3
[1] << 16, 2);
5655 w3
[2] = amd_bytealign (w3
[2] >> 16, wx
, 2);
5657 case 55: w3
[1] = amd_bytealign (wx
, w3
[1] << 8, 1);
5658 w3
[2] = amd_bytealign (w3
[2] >> 24, wx
, 1);
5660 case 56: w3
[2] = wx
;
5662 case 57: w3
[2] = amd_bytealign (wx
, w3
[2] << 24, 3);
5663 w3
[3] = amd_bytealign (w3
[3] >> 8, wx
, 3);
5665 case 58: w3
[2] = amd_bytealign (wx
, w3
[2] << 16, 2);
5666 w3
[3] = amd_bytealign (w3
[3] >> 16, wx
, 2);
5668 case 59: w3
[2] = amd_bytealign (wx
, w3
[2] << 8, 1);
5669 w3
[3] = amd_bytealign (w3
[3] >> 24, wx
, 1);
5671 case 60: w3
[3] = wx
;
5673 case 61: w3
[3] = amd_bytealign (wx
, w3
[3] << 24, 3);
5674 //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
5676 case 62: w3
[3] = amd_bytealign (wx
, w3
[3] << 16, 2);
5677 //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
5679 case 63: w3
[3] = amd_bytealign (wx
, w3
[3] << 8, 1);
5680 //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
5688 case 1: w0
[0] = (w0
[0] & 0x000000ff) | (wx
<< 8);
5689 w0
[1] = (w0
[1] & 0xffffff00) | (wx
>> 24);
5691 case 2: w0
[0] = (w0
[0] & 0x0000ffff) | (wx
<< 16);
5692 w0
[1] = (w0
[1] & 0xffff0000) | (wx
>> 16);
5694 case 3: w0
[0] = (w0
[0] & 0x00ffffff) | (wx
<< 24);
5695 w0
[1] = (w0
[1] & 0xff000000) | (wx
>> 8);
5699 case 5: w0
[1] = (w0
[1] & 0x000000ff) | (wx
<< 8);
5700 w0
[2] = (w0
[2] & 0xffffff00) | (wx
>> 24);
5702 case 6: w0
[1] = (w0
[1] & 0x0000ffff) | (wx
<< 16);
5703 w0
[2] = (w0
[2] & 0xffff0000) | (wx
>> 16);
5705 case 7: w0
[1] = (w0
[1] & 0x00ffffff) | (wx
<< 24);
5706 w0
[2] = (w0
[2] & 0xff000000) | (wx
>> 8);
5710 case 9: w0
[2] = (w0
[2] & 0x000000ff) | (wx
<< 8);
5711 w0
[3] = (w0
[3] & 0xffffff00) | (wx
>> 24);
5713 case 10: w0
[2] = (w0
[2] & 0x0000ffff) | (wx
<< 16);
5714 w0
[3] = (w0
[3] & 0xffff0000) | (wx
>> 16);
5716 case 11: w0
[2] = (w0
[2] & 0x00ffffff) | (wx
<< 24);
5717 w0
[3] = (w0
[3] & 0xff000000) | (wx
>> 8);
5719 case 12: w0
[3] = wx
;
5721 case 13: w0
[3] = (w0
[3] & 0x000000ff) | (wx
<< 8);
5722 w1
[0] = (w1
[0] & 0xffffff00) | (wx
>> 24);
5724 case 14: w0
[3] = (w0
[3] & 0x0000ffff) | (wx
<< 16);
5725 w1
[0] = (w1
[0] & 0xffff0000) | (wx
>> 16);
5727 case 15: w0
[3] = (w0
[3] & 0x00ffffff) | (wx
<< 24);
5728 w1
[0] = (w1
[0] & 0xff000000) | (wx
>> 8);
5730 case 16: w1
[0] = wx
;
5732 case 17: w1
[0] = (w1
[0] & 0x000000ff) | (wx
<< 8);
5733 w1
[1] = (w1
[1] & 0xffffff00) | (wx
>> 24);
5735 case 18: w1
[0] = (w1
[0] & 0x0000ffff) | (wx
<< 16);
5736 w1
[1] = (w1
[1] & 0xffff0000) | (wx
>> 16);
5738 case 19: w1
[0] = (w1
[0] & 0x00ffffff) | (wx
<< 24);
5739 w1
[1] = (w1
[1] & 0xff000000) | (wx
>> 8);
5741 case 20: w1
[1] = wx
;
5743 case 21: w1
[1] = (w1
[1] & 0x000000ff) | (wx
<< 8);
5744 w1
[2] = (w1
[2] & 0xffffff00) | (wx
>> 24);
5746 case 22: w1
[1] = (w1
[1] & 0x0000ffff) | (wx
<< 16);
5747 w1
[2] = (w1
[2] & 0xffff0000) | (wx
>> 16);
5749 case 23: w1
[1] = (w1
[1] & 0x00ffffff) | (wx
<< 24);
5750 w1
[2] = (w1
[2] & 0xff000000) | (wx
>> 8);
5752 case 24: w1
[2] = wx
;
5754 case 25: w1
[2] = (w1
[2] & 0x000000ff) | (wx
<< 8);
5755 w1
[3] = (w1
[3] & 0xffffff00) | (wx
>> 24);
5757 case 26: w1
[2] = (w1
[2] & 0x0000ffff) | (wx
<< 16);
5758 w1
[3] = (w1
[3] & 0xffff0000) | (wx
>> 16);
5760 case 27: w1
[2] = (w1
[2] & 0x00ffffff) | (wx
<< 24);
5761 w1
[3] = (w1
[3] & 0xff000000) | (wx
>> 8);
5763 case 28: w1
[3] = wx
;
5765 case 29: w1
[3] = (w1
[3] & 0x000000ff) | (wx
<< 8);
5766 w2
[0] = (w2
[0] & 0xffffff00) | (wx
>> 24);
5768 case 30: w1
[3] = (w1
[3] & 0x0000ffff) | (wx
<< 16);
5769 w2
[0] = (w2
[0] & 0xffff0000) | (wx
>> 16);
5771 case 31: w1
[3] = (w1
[3] & 0x00ffffff) | (wx
<< 24);
5772 w2
[0] = (w2
[0] & 0xff000000) | (wx
>> 8);
5774 case 32: w2
[0] = wx
;
5776 case 33: w2
[0] = (w2
[0] & 0x000000ff) | (wx
<< 8);
5777 w2
[1] = (w2
[1] & 0xffffff00) | (wx
>> 24);
5779 case 34: w2
[0] = (w2
[0] & 0x0000ffff) | (wx
<< 16);
5780 w2
[1] = (w2
[1] & 0xffff0000) | (wx
>> 16);
5782 case 35: w2
[0] = (w2
[0] & 0x00ffffff) | (wx
<< 24);
5783 w2
[1] = (w2
[1] & 0xff000000) | (wx
>> 8);
5785 case 36: w2
[1] = wx
;
5787 case 37: w2
[1] = (w2
[1] & 0x000000ff) | (wx
<< 8);
5788 w2
[2] = (w2
[2] & 0xffffff00) | (wx
>> 24);
5790 case 38: w2
[1] = (w2
[1] & 0x0000ffff) | (wx
<< 16);
5791 w2
[2] = (w2
[2] & 0xffff0000) | (wx
>> 16);
5793 case 39: w2
[1] = (w2
[1] & 0x00ffffff) | (wx
<< 24);
5794 w2
[2] = (w2
[2] & 0xff000000) | (wx
>> 8);
5796 case 40: w2
[2] = wx
;
5798 case 41: w2
[2] = (w2
[2] & 0x000000ff) | (wx
<< 8);
5799 w2
[3] = (w2
[3] & 0xffffff00) | (wx
>> 24);
5801 case 42: w2
[2] = (w2
[2] & 0x0000ffff) | (wx
<< 16);
5802 w2
[3] = (w2
[3] & 0xffff0000) | (wx
>> 16);
5804 case 43: w2
[2] = (w2
[2] & 0x00ffffff) | (wx
<< 24);
5805 w2
[3] = (w2
[3] & 0xff000000) | (wx
>> 8);
5807 case 44: w2
[3] = wx
;
5809 case 45: w2
[3] = (w2
[3] & 0x000000ff) | (wx
<< 8);
5810 w3
[0] = (w3
[0] & 0xffffff00) | (wx
>> 24);
5812 case 46: w2
[3] = (w2
[3] & 0x0000ffff) | (wx
<< 16);
5813 w3
[0] = (w3
[0] & 0xffff0000) | (wx
>> 16);
5815 case 47: w2
[3] = (w2
[3] & 0x00ffffff) | (wx
<< 24);
5816 w3
[0] = (w3
[0] & 0xff000000) | (wx
>> 8);
5818 case 48: w3
[0] = wx
;
5820 case 49: w3
[0] = (w3
[0] & 0x000000ff) | (wx
<< 8);
5821 w3
[1] = (w3
[1] & 0xffffff00) | (wx
>> 24);
5823 case 50: w3
[0] = (w3
[0] & 0x0000ffff) | (wx
<< 16);
5824 w3
[1] = (w3
[1] & 0xffff0000) | (wx
>> 16);
5826 case 51: w3
[0] = (w3
[0] & 0x00ffffff) | (wx
<< 24);
5827 w3
[1] = (w3
[1] & 0xff000000) | (wx
>> 8);
5829 case 52: w3
[1] = wx
;
5831 case 53: w3
[1] = (w3
[1] & 0x000000ff) | (wx
<< 8);
5832 w3
[2] = (w3
[2] & 0xffffff00) | (wx
>> 24);
5834 case 54: w3
[1] = (w3
[1] & 0x0000ffff) | (wx
<< 16);
5835 w3
[2] = (w3
[2] & 0xffff0000) | (wx
>> 16);
5837 case 55: w3
[1] = (w3
[1] & 0x00ffffff) | (wx
<< 24);
5838 w3
[2] = (w3
[2] & 0xff000000) | (wx
>> 8);
5840 case 56: w3
[2] = wx
;
5842 case 57: w3
[2] = (w3
[2] & 0x000000ff) | (wx
<< 8);
5843 w3
[3] = (w3
[3] & 0xffffff00) | (wx
>> 24);
5845 case 58: w3
[2] = (w3
[2] & 0x0000ffff) | (wx
<< 16);
5846 w3
[3] = (w3
[3] & 0xffff0000) | (wx
>> 16);
5848 case 59: w3
[2] = (w3
[2] & 0x00ffffff) | (wx
<< 24);
5849 w3
[3] = (w3
[3] & 0xff000000) | (wx
>> 8);
5851 case 60: w3
[3] = wx
;
5853 case 61: w3
[3] = (w3
[3] & 0x000000ff) | (wx
<< 8);
5854 //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
5856 case 62: w3
[3] = (w3
[3] & 0x0000ffff) | (wx
<< 16);
5857 //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
5859 case 63: w3
[3] = (w3
[3] & 0x00ffffff) | (wx
<< 24);
5860 //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
// overwrite_at_be_4x4: writes the 32-bit value wx into the 16-word block
// w0[0..3], w1[0..3], w2[0..3], w3[0..3] at the byte offset N given by the
// case label (presumably `switch (salt_len)`; the switch line itself is not
// visible in this excerpt).
//
// Bytes are counted from the most-significant end of each word.
//  - N % 4 == 0 (visible for N = 12, 16, ..., 60): word N / 4 is replaced by wx.
//  - otherwise: word N / 4 keeps its top N % 4 bytes and takes
//    wx >> (8 * (N % 4)) below them; word N / 4 + 1 takes the remaining low
//    bytes of wx in its top bytes and keeps its lower bytes.
//  - N = 61..63: only w3[3] is written; the part that would spill into a
//    w4[0] is commented out.
5866 static void overwrite_at_be_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x wx
, const u32 salt_len
)
5868 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5874 case 1: w0
[0] = (w0
[0] & 0xff000000) | (wx
>> 8);
5875 w0
[1] = (w0
[1] & 0x00ffffff) | (wx
<< 24);
5877 case 2: w0
[0] = (w0
[0] & 0xffff0000) | (wx
>> 16);
5878 w0
[1] = (w0
[1] & 0x0000ffff) | (wx
<< 16);
5880 case 3: w0
[0] = (w0
[0] & 0xffffff00) | (wx
>> 24);
5881 w0
[1] = (w0
[1] & 0x000000ff) | (wx
<< 8);
5885 case 5: w0
[1] = (w0
[1] & 0xff000000) | (wx
>> 8);
5886 w0
[2] = (w0
[2] & 0x00ffffff) | (wx
<< 24);
5888 case 6: w0
[1] = (w0
[1] & 0xffff0000) | (wx
>> 16);
5889 w0
[2] = (w0
[2] & 0x0000ffff) | (wx
<< 16);
5891 case 7: w0
[1] = (w0
[1] & 0xffffff00) | (wx
>> 24);
5892 w0
[2] = (w0
[2] & 0x000000ff) | (wx
<< 8);
5896 case 9: w0
[2] = (w0
[2] & 0xff000000) | (wx
>> 8);
5897 w0
[3] = (w0
[3] & 0x00ffffff) | (wx
<< 24);
5899 case 10: w0
[2] = (w0
[2] & 0xffff0000) | (wx
>> 16);
5900 w0
[3] = (w0
[3] & 0x0000ffff) | (wx
<< 16);
5902 case 11: w0
[2] = (w0
[2] & 0xffffff00) | (wx
>> 24);
5903 w0
[3] = (w0
[3] & 0x000000ff) | (wx
<< 8);
5905 case 12: w0
[3] = wx
;
5907 case 13: w0
[3] = (w0
[3] & 0xff000000) | (wx
>> 8);
5908 w1
[0] = (w1
[0] & 0x00ffffff) | (wx
<< 24);
5910 case 14: w0
[3] = (w0
[3] & 0xffff0000) | (wx
>> 16);
5911 w1
[0] = (w1
[0] & 0x0000ffff) | (wx
<< 16);
5913 case 15: w0
[3] = (w0
[3] & 0xffffff00) | (wx
>> 24);
5914 w1
[0] = (w1
[0] & 0x000000ff) | (wx
<< 8);
5916 case 16: w1
[0] = wx
;
5918 case 17: w1
[0] = (w1
[0] & 0xff000000) | (wx
>> 8);
5919 w1
[1] = (w1
[1] & 0x00ffffff) | (wx
<< 24);
5921 case 18: w1
[0] = (w1
[0] & 0xffff0000) | (wx
>> 16);
5922 w1
[1] = (w1
[1] & 0x0000ffff) | (wx
<< 16);
5924 case 19: w1
[0] = (w1
[0] & 0xffffff00) | (wx
>> 24);
5925 w1
[1] = (w1
[1] & 0x000000ff) | (wx
<< 8);
5927 case 20: w1
[1] = wx
;
5929 case 21: w1
[1] = (w1
[1] & 0xff000000) | (wx
>> 8);
5930 w1
[2] = (w1
[2] & 0x00ffffff) | (wx
<< 24);
5932 case 22: w1
[1] = (w1
[1] & 0xffff0000) | (wx
>> 16);
5933 w1
[2] = (w1
[2] & 0x0000ffff) | (wx
<< 16);
5935 case 23: w1
[1] = (w1
[1] & 0xffffff00) | (wx
>> 24);
5936 w1
[2] = (w1
[2] & 0x000000ff) | (wx
<< 8);
5938 case 24: w1
[2] = wx
;
5940 case 25: w1
[2] = (w1
[2] & 0xff000000) | (wx
>> 8);
5941 w1
[3] = (w1
[3] & 0x00ffffff) | (wx
<< 24);
5943 case 26: w1
[2] = (w1
[2] & 0xffff0000) | (wx
>> 16);
5944 w1
[3] = (w1
[3] & 0x0000ffff) | (wx
<< 16);
5946 case 27: w1
[2] = (w1
[2] & 0xffffff00) | (wx
>> 24);
5947 w1
[3] = (w1
[3] & 0x000000ff) | (wx
<< 8);
5949 case 28: w1
[3] = wx
;
5951 case 29: w1
[3] = (w1
[3] & 0xff000000) | (wx
>> 8);
5952 w2
[0] = (w2
[0] & 0x00ffffff) | (wx
<< 24);
5954 case 30: w1
[3] = (w1
[3] & 0xffff0000) | (wx
>> 16);
5955 w2
[0] = (w2
[0] & 0x0000ffff) | (wx
<< 16);
5957 case 31: w1
[3] = (w1
[3] & 0xffffff00) | (wx
>> 24);
5958 w2
[0] = (w2
[0] & 0x000000ff) | (wx
<< 8);
5960 case 32: w2
[0] = wx
;
5962 case 33: w2
[0] = (w2
[0] & 0xff000000) | (wx
>> 8);
5963 w2
[1] = (w2
[1] & 0x00ffffff) | (wx
<< 24);
5965 case 34: w2
[0] = (w2
[0] & 0xffff0000) | (wx
>> 16);
5966 w2
[1] = (w2
[1] & 0x0000ffff) | (wx
<< 16);
5968 case 35: w2
[0] = (w2
[0] & 0xffffff00) | (wx
>> 24);
5969 w2
[1] = (w2
[1] & 0x000000ff) | (wx
<< 8);
5971 case 36: w2
[1] = wx
;
5973 case 37: w2
[1] = (w2
[1] & 0xff000000) | (wx
>> 8);
5974 w2
[2] = (w2
[2] & 0x00ffffff) | (wx
<< 24);
5976 case 38: w2
[1] = (w2
[1] & 0xffff0000) | (wx
>> 16);
5977 w2
[2] = (w2
[2] & 0x0000ffff) | (wx
<< 16);
5979 case 39: w2
[1] = (w2
[1] & 0xffffff00) | (wx
>> 24);
5980 w2
[2] = (w2
[2] & 0x000000ff) | (wx
<< 8);
5982 case 40: w2
[2] = wx
;
5984 case 41: w2
[2] = (w2
[2] & 0xff000000) | (wx
>> 8);
5985 w2
[3] = (w2
[3] & 0x00ffffff) | (wx
<< 24);
5987 case 42: w2
[2] = (w2
[2] & 0xffff0000) | (wx
>> 16);
5988 w2
[3] = (w2
[3] & 0x0000ffff) | (wx
<< 16);
5990 case 43: w2
[2] = (w2
[2] & 0xffffff00) | (wx
>> 24);
5991 w2
[3] = (w2
[3] & 0x000000ff) | (wx
<< 8);
5993 case 44: w2
[3] = wx
;
5995 case 45: w2
[3] = (w2
[3] & 0xff000000) | (wx
>> 8);
5996 w3
[0] = (w3
[0] & 0x00ffffff) | (wx
<< 24);
5998 case 46: w2
[3] = (w2
[3] & 0xffff0000) | (wx
>> 16);
5999 w3
[0] = (w3
[0] & 0x0000ffff) | (wx
<< 16);
6001 case 47: w2
[3] = (w2
[3] & 0xffffff00) | (wx
>> 24);
6002 w3
[0] = (w3
[0] & 0x000000ff) | (wx
<< 8);
6004 case 48: w3
[0] = wx
;
6006 case 49: w3
[0] = (w3
[0] & 0xff000000) | (wx
>> 8);
6007 w3
[1] = (w3
[1] & 0x00ffffff) | (wx
<< 24);
6009 case 50: w3
[0] = (w3
[0] & 0xffff0000) | (wx
>> 16);
6010 w3
[1] = (w3
[1] & 0x0000ffff) | (wx
<< 16);
6012 case 51: w3
[0] = (w3
[0] & 0xffffff00) | (wx
>> 24);
6013 w3
[1] = (w3
[1] & 0x000000ff) | (wx
<< 8);
6015 case 52: w3
[1] = wx
;
6017 case 53: w3
[1] = (w3
[1] & 0xff000000) | (wx
>> 8);
6018 w3
[2] = (w3
[2] & 0x00ffffff) | (wx
<< 24);
6020 case 54: w3
[1] = (w3
[1] & 0xffff0000) | (wx
>> 16);
6021 w3
[2] = (w3
[2] & 0x0000ffff) | (wx
<< 16);
6023 case 55: w3
[1] = (w3
[1] & 0xffffff00) | (wx
>> 24);
6024 w3
[2] = (w3
[2] & 0x000000ff) | (wx
<< 8);
6026 case 56: w3
[2] = wx
;
6028 case 57: w3
[2] = (w3
[2] & 0xff000000) | (wx
>> 8);
6029 w3
[3] = (w3
[3] & 0x00ffffff) | (wx
<< 24);
6031 case 58: w3
[2] = (w3
[2] & 0xffff0000) | (wx
>> 16);
6032 w3
[3] = (w3
[3] & 0x0000ffff) | (wx
<< 16);
6034 case 59: w3
[2] = (w3
[2] & 0xffffff00) | (wx
>> 24);
6035 w3
[3] = (w3
[3] & 0x000000ff) | (wx
<< 8);
6037 case 60: w3
[3] = wx
;
// Offsets 61..63: the low bytes of wx would land in a 17th word, which this
// block does not have, so only w3[3] is updated.
6039 case 61: w3
[3] = (w3
[3] & 0xff000000) | (wx
>> 8);
6040 //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
6042 case 62: w3
[3] = (w3
[3] & 0xffff0000) | (wx
>> 16);
6043 //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
6045 case 63: w3
[3] = (w3
[3] & 0xffffff00) | (wx
>> 24);
6046 //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
6052 * vector functions as scalar (for outer loop usage)
// append_0x80_1x4_S: scalar (u32) helper that sets a 0x80 marker byte in the
// four-word buffer w0.
// Only the statements for byte positions 1..3 of each word are visible in
// this excerpt: they OR 0x80 into bits 8..15 (0x8000), bits 16..23
// (0x800000) or bits 24..31 (0x80000000) of w0[0]..w0[3].
// NOTE(review): the selecting case labels (presumably `switch (offset)`) and
// the byte-0 statements are not visible here — confirm against the full file.
6055 static void append_0x80_1x4_S (u32 w0
[4], const u32 offset
)
6064 w0
[0] = w0
[0] | 0x8000;
6068 w0
[0] = w0
[0] | 0x800000;
6072 w0
[0] = w0
[0] | 0x80000000;
6080 w0
[1] = w0
[1] | 0x8000;
6084 w0
[1] = w0
[1] | 0x800000;
6088 w0
[1] = w0
[1] | 0x80000000;
6096 w0
[2] = w0
[2] | 0x8000;
6100 w0
[2] = w0
[2] | 0x800000;
6104 w0
[2] = w0
[2] | 0x80000000;
6112 w0
[3] = w0
[3] | 0x8000;
6116 w0
[3] = w0
[3] | 0x800000;
6120 w0
[3] = w0
[3] | 0x80000000;
// append_0x80_2x4_S: scalar (u32) helper that sets a 0x80 marker byte in the
// eight-word buffer w0[0..3], w1[0..3].
// Only the statements for byte positions 1..3 of each word are visible in
// this excerpt: they OR 0x80 into bits 8..15 (0x8000), bits 16..23
// (0x800000) or bits 24..31 (0x80000000) of the selected word.
// NOTE(review): the selecting case labels (presumably `switch (offset)`) and
// the byte-0 statements are not visible here — confirm against the full file.
6125 static void append_0x80_2x4_S (u32 w0
[4], u32 w1
[4], const u32 offset
)
6134 w0
[0] = w0
[0] | 0x8000;
6138 w0
[0] = w0
[0] | 0x800000;
6142 w0
[0] = w0
[0] | 0x80000000;
6150 w0
[1] = w0
[1] | 0x8000;
6154 w0
[1] = w0
[1] | 0x800000;
6158 w0
[1] = w0
[1] | 0x80000000;
6166 w0
[2] = w0
[2] | 0x8000;
6170 w0
[2] = w0
[2] | 0x800000;
6174 w0
[2] = w0
[2] | 0x80000000;
6182 w0
[3] = w0
[3] | 0x8000;
6186 w0
[3] = w0
[3] | 0x800000;
6190 w0
[3] = w0
[3] | 0x80000000;
6198 w1
[0] = w1
[0] | 0x8000;
6202 w1
[0] = w1
[0] | 0x800000;
6206 w1
[0] = w1
[0] | 0x80000000;
6214 w1
[1] = w1
[1] | 0x8000;
6218 w1
[1] = w1
[1] | 0x800000;
6222 w1
[1] = w1
[1] | 0x80000000;
6230 w1
[2] = w1
[2] | 0x8000;
6234 w1
[2] = w1
[2] | 0x800000;
6238 w1
[2] = w1
[2] | 0x80000000;
6246 w1
[3] = w1
[3] | 0x8000;
6250 w1
[3] = w1
[3] | 0x800000;
6254 w1
[3] = w1
[3] | 0x80000000;
// append_0x80_3x4_S: scalar (u32) helper that sets a 0x80 marker byte in the
// twelve-word buffer w0[0..3], w1[0..3], w2[0..3].
// Only the statements for byte positions 1..3 of each word are visible in
// this excerpt: they OR 0x80 into bits 8..15 (0x8000), bits 16..23
// (0x800000) or bits 24..31 (0x80000000) of the selected word.
// NOTE(review): the selecting case labels (presumably `switch (offset)`) and
// the byte-0 statements are not visible here — confirm against the full file.
6259 static void append_0x80_3x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
6268 w0
[0] = w0
[0] | 0x8000;
6272 w0
[0] = w0
[0] | 0x800000;
6276 w0
[0] = w0
[0] | 0x80000000;
6284 w0
[1] = w0
[1] | 0x8000;
6288 w0
[1] = w0
[1] | 0x800000;
6292 w0
[1] = w0
[1] | 0x80000000;
6300 w0
[2] = w0
[2] | 0x8000;
6304 w0
[2] = w0
[2] | 0x800000;
6308 w0
[2] = w0
[2] | 0x80000000;
6316 w0
[3] = w0
[3] | 0x8000;
6320 w0
[3] = w0
[3] | 0x800000;
6324 w0
[3] = w0
[3] | 0x80000000;
6332 w1
[0] = w1
[0] | 0x8000;
6336 w1
[0] = w1
[0] | 0x800000;
6340 w1
[0] = w1
[0] | 0x80000000;
6348 w1
[1] = w1
[1] | 0x8000;
6352 w1
[1] = w1
[1] | 0x800000;
6356 w1
[1] = w1
[1] | 0x80000000;
6364 w1
[2] = w1
[2] | 0x8000;
6368 w1
[2] = w1
[2] | 0x800000;
6372 w1
[2] = w1
[2] | 0x80000000;
6380 w1
[3] = w1
[3] | 0x8000;
6384 w1
[3] = w1
[3] | 0x800000;
6388 w1
[3] = w1
[3] | 0x80000000;
6396 w2
[0] = w2
[0] | 0x8000;
6400 w2
[0] = w2
[0] | 0x800000;
6404 w2
[0] = w2
[0] | 0x80000000;
6412 w2
[1] = w2
[1] | 0x8000;
6416 w2
[1] = w2
[1] | 0x800000;
6420 w2
[1] = w2
[1] | 0x80000000;
6428 w2
[2] = w2
[2] | 0x8000;
6432 w2
[2] = w2
[2] | 0x800000;
6436 w2
[2] = w2
[2] | 0x80000000;
6444 w2
[3] = w2
[3] | 0x8000;
6448 w2
[3] = w2
[3] | 0x800000;
6452 w2
[3] = w2
[3] | 0x80000000;
// append_0x80_4x4_S: scalar (u32) helper that sets a 0x80 marker byte in the
// sixteen-word buffer w0[0..3], w1[0..3], w2[0..3], w3[0..3].
// Only the statements for byte positions 1..3 of each word are visible in
// this excerpt: they OR 0x80 into bits 8..15 (0x8000), bits 16..23
// (0x800000) or bits 24..31 (0x80000000) of the selected word.
// NOTE(review): the selecting case labels (presumably `switch (offset)`) and
// the byte-0 statements are not visible here — confirm against the full file.
6457 static void append_0x80_4x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6466 w0
[0] = w0
[0] | 0x8000;
6470 w0
[0] = w0
[0] | 0x800000;
6474 w0
[0] = w0
[0] | 0x80000000;
6482 w0
[1] = w0
[1] | 0x8000;
6486 w0
[1] = w0
[1] | 0x800000;
6490 w0
[1] = w0
[1] | 0x80000000;
6498 w0
[2] = w0
[2] | 0x8000;
6502 w0
[2] = w0
[2] | 0x800000;
6506 w0
[2] = w0
[2] | 0x80000000;
6514 w0
[3] = w0
[3] | 0x8000;
6518 w0
[3] = w0
[3] | 0x800000;
6522 w0
[3] = w0
[3] | 0x80000000;
6530 w1
[0] = w1
[0] | 0x8000;
6534 w1
[0] = w1
[0] | 0x800000;
6538 w1
[0] = w1
[0] | 0x80000000;
6546 w1
[1] = w1
[1] | 0x8000;
6550 w1
[1] = w1
[1] | 0x800000;
6554 w1
[1] = w1
[1] | 0x80000000;
6562 w1
[2] = w1
[2] | 0x8000;
6566 w1
[2] = w1
[2] | 0x800000;
6570 w1
[2] = w1
[2] | 0x80000000;
6578 w1
[3] = w1
[3] | 0x8000;
6582 w1
[3] = w1
[3] | 0x800000;
6586 w1
[3] = w1
[3] | 0x80000000;
6594 w2
[0] = w2
[0] | 0x8000;
6598 w2
[0] = w2
[0] | 0x800000;
6602 w2
[0] = w2
[0] | 0x80000000;
6610 w2
[1] = w2
[1] | 0x8000;
6614 w2
[1] = w2
[1] | 0x800000;
6618 w2
[1] = w2
[1] | 0x80000000;
6626 w2
[2] = w2
[2] | 0x8000;
6630 w2
[2] = w2
[2] | 0x800000;
6634 w2
[2] = w2
[2] | 0x80000000;
6642 w2
[3] = w2
[3] | 0x8000;
6646 w2
[3] = w2
[3] | 0x800000;
6650 w2
[3] = w2
[3] | 0x80000000;
6658 w3
[0] = w3
[0] | 0x8000;
6662 w3
[0] = w3
[0] | 0x800000;
6666 w3
[0] = w3
[0] | 0x80000000;
6674 w3
[1] = w3
[1] | 0x8000;
6678 w3
[1] = w3
[1] | 0x800000;
6682 w3
[1] = w3
[1] | 0x80000000;
6690 w3
[2] = w3
[2] | 0x8000;
6694 w3
[2] = w3
[2] | 0x800000;
6698 w3
[2] = w3
[2] | 0x80000000;
6706 w3
[3] = w3
[3] | 0x8000;
6710 w3
[3] = w3
[3] | 0x800000;
6714 w3
[3] = w3
[3] | 0x80000000;
// truncate_block_S: masks the four-word buffer w in place according to the
// case label (presumably `switch (len)`; the switch line is not visible in
// this excerpt).
// Visible cases: for a label 4 * i + k with k = 1..3, word w[i] keeps only
// its low k bytes (mask 0xFF, 0xFFFF or 0xFFFFFF).
// NOTE(review): the statements that follow each mask and the cases for
// labels divisible by 4 are not visible here — presumably the words after
// w[i] are cleared; confirm against the full file.
6719 static void truncate_block_S (u32 w
[4], const u32 len
)
6728 case 1: w
[0] &= 0x000000FF;
6733 case 2: w
[0] &= 0x0000FFFF;
6738 case 3: w
[0] &= 0x00FFFFFF;
6747 case 5: w
[1] &= 0x000000FF;
6751 case 6: w
[1] &= 0x0000FFFF;
6755 case 7: w
[1] &= 0x00FFFFFF;
6762 case 9: w
[2] &= 0x000000FF;
6765 case 10: w
[2] &= 0x0000FFFF;
6768 case 11: w
[2] &= 0x00FFFFFF;
6773 case 13: w
[3] &= 0x000000FF;
6775 case 14: w
[3] &= 0x0000FFFF;
6777 case 15: w
[3] &= 0x00FFFFFF;
// make_unicode_S: widens the 16 bytes held in in[0..3] into 16-bit units with
// a zero high byte, producing eight words in out1[0..3] and out2[0..3].
//   in[0] -> out1[0], out1[1]      in[1] -> out1[2], out1[3]
//   in[2] -> out2[0], out2[1]      in[3] -> out2[2], out2[3]
// Each output word holds two input bytes: the lower-numbered byte in bits
// 0..7 and the following byte in bits 16..23 (see the mask/shift variant).
6782 static void make_unicode_S (const u32 in
[4], u32 out1
[4], u32 out2
[4])
// Variant using __byte_perm_S (its preprocessor guard is not visible in this
// excerpt). NOTE(review): assuming CUDA __byte_perm selector semantics,
// 0x7170 picks source bytes 0 and 1 and 0x7372 picks bytes 2 and 3, each
// interleaved with a zero byte from the second operand — confirm against the
// helper's definition.
6785 out2
[3] = __byte_perm_S (in
[3], 0, 0x7372);
6786 out2
[2] = __byte_perm_S (in
[3], 0, 0x7170);
6787 out2
[1] = __byte_perm_S (in
[2], 0, 0x7372);
6788 out2
[0] = __byte_perm_S (in
[2], 0, 0x7170);
6789 out1
[3] = __byte_perm_S (in
[1], 0, 0x7372);
6790 out1
[2] = __byte_perm_S (in
[1], 0, 0x7170);
6791 out1
[1] = __byte_perm_S (in
[0], 0, 0x7372);
6792 out1
[0] = __byte_perm_S (in
[0], 0, 0x7170);
// Variant compiled when IS_AMD or IS_GENERIC is defined: the same widening
// written with plain shifts and masks.
6795 #if defined IS_AMD || defined IS_GENERIC
6796 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
6797 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
6798 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
6799 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
6800 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
6801 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
6802 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
6803 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
// undo_unicode_S: packs the eight words in1[0..3], in2[0..3] into out[0..3].
// From every input word only bits 0..7 and bits 16..23 are used; two input
// words supply the four bytes of one output word, in order:
//   out[0] <- in1[0], in1[1]      out[1] <- in1[2], in1[3]
//   out[2] <- in2[0], in2[1]      out[3] <- in2[2], in2[3]
// Bits 8..15 and 24..31 of the inputs are discarded (see the mask/shift
// variant).
6807 static void undo_unicode_S (const u32 in1
[4], const u32 in2
[4], u32 out
[4])
// Variant using __byte_perm_S (its preprocessor guard is not visible in this
// excerpt). NOTE(review): assuming CUDA __byte_perm selector semantics,
// 0x6420 picks bytes 0 and 2 of the first operand followed by bytes 0 and 2
// of the second — confirm against the helper's definition.
6810 out
[0] = __byte_perm_S (in1
[0], in1
[1], 0x6420);
6811 out
[1] = __byte_perm_S (in1
[2], in1
[3], 0x6420);
6812 out
[2] = __byte_perm_S (in2
[0], in2
[1], 0x6420);
6813 out
[3] = __byte_perm_S (in2
[2], in2
[3], 0x6420);
// Variant compiled when IS_AMD or IS_GENERIC is defined: the same packing
// written with plain masks and shifts.
6816 #if defined IS_AMD || defined IS_GENERIC
6817 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
6818 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
6819 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
6820 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
6821 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
6822 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
6823 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
6824 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
6828 static void switch_buffer_by_offset_le_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6830 #if defined IS_AMD || defined IS_GENERIC
6831 const int offset_mod_4
= offset
& 3;
6833 const int offset_minus_4
= 4 - offset
;
6838 w3
[2] = amd_bytealign_S ( 0, w3
[1], offset_minus_4
);
6839 w3
[1] = amd_bytealign_S (w3
[1], w3
[0], offset_minus_4
);
6840 w3
[0] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
6841 w2
[3] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
6842 w2
[2] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6843 w2
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6844 w2
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6845 w1
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6846 w1
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6847 w1
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6848 w1
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6849 w0
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6850 w0
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6851 w0
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6852 w0
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6854 if (offset_mod_4
== 0)
6876 w3
[2] = amd_bytealign_S ( 0, w3
[0], offset_minus_4
);
6877 w3
[1] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
6878 w3
[0] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
6879 w2
[3] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6880 w2
[2] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6881 w2
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6882 w2
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6883 w1
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6884 w1
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6885 w1
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6886 w1
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6887 w0
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6888 w0
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6889 w0
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6892 if (offset_mod_4
== 0)
6913 w3
[2] = amd_bytealign_S ( 0, w2
[3], offset_minus_4
);
6914 w3
[1] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
6915 w3
[0] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6916 w2
[3] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6917 w2
[2] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6918 w2
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6919 w2
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6920 w1
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6921 w1
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6922 w1
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6923 w1
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6924 w0
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6925 w0
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6929 if (offset_mod_4
== 0)
6949 w3
[2] = amd_bytealign_S ( 0, w2
[2], offset_minus_4
);
6950 w3
[1] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6951 w3
[0] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6952 w2
[3] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6953 w2
[2] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6954 w2
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6955 w2
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6956 w1
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6957 w1
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6958 w1
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6959 w1
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6960 w0
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6965 if (offset_mod_4
== 0)
6984 w3
[2] = amd_bytealign_S ( 0, w2
[1], offset_minus_4
);
6985 w3
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6986 w3
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6987 w2
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6988 w2
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6989 w2
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6990 w2
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6991 w1
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6992 w1
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6993 w1
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6994 w1
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7000 if (offset_mod_4
== 0)
7018 w3
[2] = amd_bytealign_S ( 0, w2
[0], offset_minus_4
);
7019 w3
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7020 w3
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7021 w2
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7022 w2
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7023 w2
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7024 w2
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7025 w1
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7026 w1
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7027 w1
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7034 if (offset_mod_4
== 0)
7051 w3
[2] = amd_bytealign_S ( 0, w1
[3], offset_minus_4
);
7052 w3
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7053 w3
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7054 w2
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7055 w2
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7056 w2
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7057 w2
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7058 w1
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7059 w1
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7067 if (offset_mod_4
== 0)
7083 w3
[2] = amd_bytealign_S ( 0, w1
[2], offset_minus_4
);
7084 w3
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7085 w3
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7086 w2
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7087 w2
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7088 w2
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7089 w2
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7090 w1
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7099 if (offset_mod_4
== 0)
7114 w3
[2] = amd_bytealign_S ( 0, w1
[1], offset_minus_4
);
7115 w3
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7116 w3
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7117 w2
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7118 w2
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7119 w2
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7120 w2
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7130 if (offset_mod_4
== 0)
7144 w3
[2] = amd_bytealign_S ( 0, w1
[0], offset_minus_4
);
7145 w3
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7146 w3
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7147 w2
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7148 w2
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7149 w2
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7160 if (offset_mod_4
== 0)
7173 w3
[2] = amd_bytealign_S ( 0, w0
[3], offset_minus_4
);
7174 w3
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7175 w3
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7176 w2
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7177 w2
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7189 if (offset_mod_4
== 0)
7201 w3
[2] = amd_bytealign_S ( 0, w0
[2], offset_minus_4
);
7202 w3
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7203 w3
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7204 w2
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7217 if (offset_mod_4
== 0)
7228 w3
[2] = amd_bytealign_S ( 0, w0
[1], offset_minus_4
);
7229 w3
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7230 w3
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7244 if (offset_mod_4
== 0)
7254 w3
[2] = amd_bytealign_S ( 0, w0
[0], offset_minus_4
);
7255 w3
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7270 if (offset_mod_4
== 0)
7281 const int offset_minus_4
= 4 - (offset
% 4);
7283 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
7288 w3
[1] = __byte_perm_S (w3
[0], w3
[1], selector
);
7289 w3
[0] = __byte_perm_S (w2
[3], w3
[0], selector
);
7290 w2
[3] = __byte_perm_S (w2
[2], w2
[3], selector
);
7291 w2
[2] = __byte_perm_S (w2
[1], w2
[2], selector
);
7292 w2
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
7293 w2
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
7294 w1
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
7295 w1
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
7296 w1
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
7297 w1
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
7298 w0
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
7299 w0
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
7300 w0
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
7301 w0
[0] = __byte_perm_S ( 0, w0
[0], selector
);
7306 w3
[1] = __byte_perm_S (w2
[3], w3
[0], selector
);
7307 w3
[0] = __byte_perm_S (w2
[2], w2
[3], selector
);
7308 w2
[3] = __byte_perm_S (w2
[1], w2
[2], selector
);
7309 w2
[2] = __byte_perm_S (w2
[0], w2
[1], selector
);
7310 w2
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
7311 w2
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
7312 w1
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
7313 w1
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
7314 w1
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
7315 w1
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
7316 w0
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
7317 w0
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
7318 w0
[1] = __byte_perm_S ( 0, w0
[0], selector
);
7324 w3
[1] = __byte_perm_S (w2
[2], w2
[3], selector
);
7325 w3
[0] = __byte_perm_S (w2
[1], w2
[2], selector
);
7326 w2
[3] = __byte_perm_S (w2
[0], w2
[1], selector
);
7327 w2
[2] = __byte_perm_S (w1
[3], w2
[0], selector
);
7328 w2
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
7329 w2
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
7330 w1
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
7331 w1
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
7332 w1
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
7333 w1
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
7334 w0
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
7335 w0
[2] = __byte_perm_S ( 0, w0
[0], selector
);
7342 w3
[1] = __byte_perm_S (w2
[1], w2
[2], selector
);
7343 w3
[0] = __byte_perm_S (w2
[0], w2
[1], selector
);
7344 w2
[3] = __byte_perm_S (w1
[3], w2
[0], selector
);
7345 w2
[2] = __byte_perm_S (w1
[2], w1
[3], selector
);
7346 w2
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
7347 w2
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
7348 w1
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
7349 w1
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
7350 w1
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
7351 w1
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
7352 w0
[3] = __byte_perm_S ( 0, w0
[0], selector
);
7360 w3
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
7361 w3
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
7362 w2
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
7363 w2
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
7364 w2
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
7365 w2
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
7366 w1
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
7367 w1
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
7368 w1
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
7369 w1
[0] = __byte_perm_S ( 0, w0
[0], selector
);
7378 w3
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
7379 w3
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
7380 w2
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
7381 w2
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
7382 w2
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
7383 w2
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
7384 w1
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
7385 w1
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
7386 w1
[1] = __byte_perm_S ( 0, w0
[0], selector
);
7396 w3
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
7397 w3
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
7398 w2
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
7399 w2
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
7400 w2
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
7401 w2
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
7402 w1
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
7403 w1
[2] = __byte_perm_S ( 0, w0
[0], selector
);
7414 w3
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
7415 w3
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
7416 w2
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
7417 w2
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
7418 w2
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
7419 w2
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
7420 w1
[3] = __byte_perm_S ( 0, w0
[0], selector
);
7432 w3
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
7433 w3
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
7434 w2
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
7435 w2
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
7436 w2
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
7437 w2
[0] = __byte_perm_S ( 0, w0
[0], selector
);
7450 w3
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
7451 w3
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
7452 w2
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
7453 w2
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
7454 w2
[1] = __byte_perm_S ( 0, w0
[0], selector
);
7468 w3
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
7469 w3
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
7470 w2
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
7471 w2
[2] = __byte_perm_S ( 0, w0
[0], selector
);
7486 w3
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
7487 w3
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
7488 w2
[3] = __byte_perm_S ( 0, w0
[0], selector
);
7504 w3
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
7505 w3
[0] = __byte_perm_S ( 0, w0
[0], selector
);
7522 w3
[1] = __byte_perm_S ( 0, w0
[0], selector
);
/*
 * switch_buffer_by_offset_be_S
 *
 * Rewrites the sixteen 32-bit words held in w0[4]..w3[4] in place, driven by
 * `offset`. Every visible statement builds one destination word from a pair
 * of neighbouring source words (or a source word and a literal 0).
 *
 * Within each group the assignments run from the highest destination word
 * down to the lowest, so each source word is read before the statement that
 * overwrites it.
 *
 * Two implementations are visible:
 *   - a section guarded by IS_AMD || IS_GENERIC that uses amd_bytealign_S
 *     with `offset` passed straight through;
 *   - a section that uses __byte_perm_S with a selector computed from
 *     (offset & 3).
 *
 * NOTE(review): presumably this moves the buffer contents `offset` bytes
 * towards the higher words in big-endian word layout (going by the `_be`
 * suffix) - confirm against the callers.
 * NOTE(review): this excerpt shows no switch/case labels and no plain
 * zero assignments; the statement groups below are presumably selected
 * by a word-granular function of `offset` - verify in the complete file.
 */
7542 static void switch_buffer_by_offset_be_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
7544 #if defined IS_AMD || defined IS_GENERIC
/*
 * First visible group: every destination word is combined from itself and
 * the word directly below it; w0[0] is combined with a literal 0, and
 * w3[2] is built from w3[1] and a literal 0.
 */
7548 w3
[2] = amd_bytealign_S (w3
[1], 0, offset
);
7549 w3
[1] = amd_bytealign_S (w3
[0], w3
[1], offset
);
7550 w3
[0] = amd_bytealign_S (w2
[3], w3
[0], offset
);
7551 w2
[3] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7552 w2
[2] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7553 w2
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7554 w2
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7555 w1
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7556 w1
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7557 w1
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7558 w1
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7559 w0
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7560 w0
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7561 w0
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7562 w0
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
/*
 * Second visible group: same pattern, but every destination takes its
 * sources from one word lower than in the group above. Each following
 * group lowers the sources by one more word, down to the last group,
 * which only derives w3[2] and w3[1] from w0[0].
 */
7566 w3
[2] = amd_bytealign_S (w3
[0], 0, offset
);
7567 w3
[1] = amd_bytealign_S (w2
[3], w3
[0], offset
);
7568 w3
[0] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7569 w2
[3] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7570 w2
[2] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7571 w2
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7572 w2
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7573 w1
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7574 w1
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7575 w1
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7576 w1
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7577 w0
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7578 w0
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7579 w0
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7584 w3
[2] = amd_bytealign_S (w2
[3], 0, offset
);
7585 w3
[1] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7586 w3
[0] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7587 w2
[3] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7588 w2
[2] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7589 w2
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7590 w2
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7591 w1
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7592 w1
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7593 w1
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7594 w1
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7595 w0
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7596 w0
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7602 w3
[2] = amd_bytealign_S (w2
[2], 0, offset
);
7603 w3
[1] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7604 w3
[0] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7605 w2
[3] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7606 w2
[2] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7607 w2
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7608 w2
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7609 w1
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7610 w1
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7611 w1
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7612 w1
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7613 w0
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7620 w3
[2] = amd_bytealign_S (w2
[1], 0, offset
);
7621 w3
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7622 w3
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7623 w2
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7624 w2
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7625 w2
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7626 w2
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7627 w1
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7628 w1
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7629 w1
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7630 w1
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7638 w3
[2] = amd_bytealign_S (w2
[0], 0, offset
);
7639 w3
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7640 w3
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7641 w2
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7642 w2
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7643 w2
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7644 w2
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7645 w1
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7646 w1
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7647 w1
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7656 w3
[2] = amd_bytealign_S (w1
[3], 0, offset
);
7657 w3
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7658 w3
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7659 w2
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7660 w2
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7661 w2
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7662 w2
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7663 w1
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7664 w1
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7674 w3
[2] = amd_bytealign_S (w1
[2], 0, offset
);
7675 w3
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7676 w3
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7677 w2
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7678 w2
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7679 w2
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7680 w2
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7681 w1
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7692 w3
[2] = amd_bytealign_S (w1
[1], 0, offset
);
7693 w3
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7694 w3
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7695 w2
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7696 w2
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7697 w2
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7698 w2
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7710 w3
[2] = amd_bytealign_S (w1
[0], 0, offset
);
7711 w3
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7712 w3
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7713 w2
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7714 w2
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7715 w2
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7728 w3
[2] = amd_bytealign_S (w0
[3], 0, offset
);
7729 w3
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7730 w3
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7731 w2
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7732 w2
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7746 w3
[2] = amd_bytealign_S (w0
[2], 0, offset
);
7747 w3
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7748 w3
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7749 w2
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7764 w3
[2] = amd_bytealign_S (w0
[1], 0, offset
);
7765 w3
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7766 w3
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7782 w3
[2] = amd_bytealign_S (w0
[0], 0, offset
);
7783 w3
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
/*
 * __byte_perm_S implementation.
 *
 * The selector keeps four consecutive hex digits of 0x76543210, starting at
 * digit (offset & 3): 0x3210, 0x4321, 0x5432 or 0x6543.
 *
 * Compared with the amd_bytealign_S section above, the two source words of
 * every pair are passed in the opposite order, and the highest destination
 * written here is w3[1] rather than w3[2]. The group-to-group progression
 * (sources move one word lower per group) is the same.
 *
 * NOTE(review): the preprocessor guard that opens this section is not
 * visible in this excerpt; __byte_perm_S suggests the NVIDIA path - confirm.
 */
7802 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
7807 w3
[1] = __byte_perm_S (w3
[1], w3
[0], selector
);
7808 w3
[0] = __byte_perm_S (w3
[0], w2
[3], selector
);
7809 w2
[3] = __byte_perm_S (w2
[3], w2
[2], selector
);
7810 w2
[2] = __byte_perm_S (w2
[2], w2
[1], selector
);
7811 w2
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
7812 w2
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
7813 w1
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
7814 w1
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
7815 w1
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
7816 w1
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
7817 w0
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
7818 w0
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
7819 w0
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
7820 w0
[0] = __byte_perm_S (w0
[0], 0, selector
);
7824 w3
[1] = __byte_perm_S (w3
[0], w2
[3], selector
);
7825 w3
[0] = __byte_perm_S (w2
[3], w2
[2], selector
);
7826 w2
[3] = __byte_perm_S (w2
[2], w2
[1], selector
);
7827 w2
[2] = __byte_perm_S (w2
[1], w2
[0], selector
);
7828 w2
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
7829 w2
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
7830 w1
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
7831 w1
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
7832 w1
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
7833 w1
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
7834 w0
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
7835 w0
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
7836 w0
[1] = __byte_perm_S (w0
[0], 0, selector
);
7841 w3
[1] = __byte_perm_S (w2
[3], w2
[2], selector
);
7842 w3
[0] = __byte_perm_S (w2
[2], w2
[1], selector
);
7843 w2
[3] = __byte_perm_S (w2
[1], w2
[0], selector
);
7844 w2
[2] = __byte_perm_S (w2
[0], w1
[3], selector
);
7845 w2
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
7846 w2
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
7847 w1
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
7848 w1
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
7849 w1
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
7850 w1
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
7851 w0
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
7852 w0
[2] = __byte_perm_S (w0
[0], 0, selector
);
7858 w3
[1] = __byte_perm_S (w2
[2], w2
[1], selector
);
7859 w3
[0] = __byte_perm_S (w2
[1], w2
[0], selector
);
7860 w2
[3] = __byte_perm_S (w2
[0], w1
[3], selector
);
7861 w2
[2] = __byte_perm_S (w1
[3], w1
[2], selector
);
7862 w2
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
7863 w2
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
7864 w1
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
7865 w1
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
7866 w1
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
7867 w1
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
7868 w0
[3] = __byte_perm_S (w0
[0], 0, selector
);
7875 w3
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
7876 w3
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
7877 w2
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
7878 w2
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
7879 w2
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
7880 w2
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
7881 w1
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
7882 w1
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
7883 w1
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
7884 w1
[0] = __byte_perm_S (w0
[0], 0, selector
);
7892 w3
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
7893 w3
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
7894 w2
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
7895 w2
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
7896 w2
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
7897 w2
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
7898 w1
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
7899 w1
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
7900 w1
[1] = __byte_perm_S (w0
[0], 0, selector
);
7909 w3
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
7910 w3
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
7911 w2
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
7912 w2
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
7913 w2
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
7914 w2
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
7915 w1
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
7916 w1
[2] = __byte_perm_S (w0
[0], 0, selector
);
7926 w3
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
7927 w3
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
7928 w2
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
7929 w2
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
7930 w2
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
7931 w2
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
7932 w1
[3] = __byte_perm_S (w0
[0], 0, selector
);
7943 w3
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
7944 w3
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
7945 w2
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
7946 w2
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
7947 w2
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
7948 w2
[0] = __byte_perm_S (w0
[0], 0, selector
);
7960 w3
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
7961 w3
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
7962 w2
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
7963 w2
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
7964 w2
[1] = __byte_perm_S (w0
[0], 0, selector
);
7977 w3
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
7978 w3
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
7979 w2
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
7980 w2
[2] = __byte_perm_S (w0
[0], 0, selector
);
7994 w3
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
7995 w3
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
7996 w2
[3] = __byte_perm_S (w0
[0], 0, selector
);
8011 w3
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
8012 w3
[0] = __byte_perm_S (w0
[0], 0, selector
);
8028 w3
[1] = __byte_perm_S (w0
[0], 0, selector
);
8048 * vector functions on scalar types (for inner loop usage)
/*
 * Lane <-> scalar shuttle macros.
 *
 *   PACKVS4  (sn, vn, e)  copies component `e` of the four vector words
 *                         vn[0..3] into the scalar words sn[0..3].
 *   PACKSV4  (sn, vn, e)  is the inverse: it stores sn[0..3] back into
 *                         component `e` of vn[0..3].
 *   PACKVS44 / PACKSV44   apply the above to four (scalar, vector) array
 *                         pairs at once.
 *
 * `e` is pasted onto `.s`, so it must be a bare component suffix
 * (0..9, a..f), not an expression.
 *
 * All four elements [0]..[3] are transferred; the arrays handed in by the
 * callers are 4 words long. Array arguments are parenthesised so that an
 * expression argument still indexes correctly.
 *
 * Each macro expands to a plain sequence of statements that already carry
 * their own terminating semicolons, so existing call sites compile with or
 * without a trailing `;`. Do not use them as the unbraced body of an
 * if/else or loop.
 */
#define PACKVS4(sn,vn,e)  \
  (sn)[0] = (vn)[0].s##e; \
  (sn)[1] = (vn)[1].s##e; \
  (sn)[2] = (vn)[2].s##e; \
  (sn)[3] = (vn)[3].s##e;

#define PACKSV4(sn,vn,e)  \
  (vn)[0].s##e = (sn)[0]; \
  (vn)[1].s##e = (sn)[1]; \
  (vn)[2].s##e = (sn)[2]; \
  (vn)[3].s##e = (sn)[3];

#define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  PACKVS4 (s0, v0, e);                      \
  PACKVS4 (s1, v1, e);                      \
  PACKVS4 (s2, v2, e);                      \
  PACKVS4 (s3, v3, e);

#define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  PACKSV4 (s0, v0, e);                      \
  PACKSV4 (s1, v1, e);                      \
  PACKSV4 (s2, v2, e);                      \
  PACKSV4 (s3, v3, e);
8075 static void switch_buffer_by_offset_le_VV (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x offset
)
8079 switch_buffer_by_offset_le_S (w0
, w1
, w2
, w3
, offset
);
8092 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8093 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8095 #elif VECT_SIZE == 4
8097 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8098 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8099 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8100 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8102 #elif VECT_SIZE == 8
8104 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8105 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8106 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8107 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8108 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
8109 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
8110 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
8111 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
8113 #elif VECT_SIZE == 16
8115 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8116 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8117 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8118 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8119 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
8120 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
8121 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
8122 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
8123 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s8
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8);
8124 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s9
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9);
8125 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sa
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
);
8126 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sb
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
);
8127 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sc
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
);
8128 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sd
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
);
8129 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.se
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
);
8130 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sf
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
);
8135 static void append_0x80_4x4_VV (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x offset
)
8139 append_0x80_4x4_S (w0
, w1
, w2
, w3
, offset
);
8152 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8153 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8155 #elif VECT_SIZE == 4
8157 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8158 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8159 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8160 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8162 #elif VECT_SIZE == 8
8164 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8165 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8166 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8167 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8168 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
8169 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
8170 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
8171 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
8173 #elif VECT_SIZE == 16
8175 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8176 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8177 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8178 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8179 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
8180 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
8181 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
8182 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
8183 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s8
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8);
8184 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s9
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9);
8185 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sa
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
);
8186 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sb
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
);
8187 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sc
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
);
8188 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sd
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
);
8189 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.se
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
);
8190 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sf
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
);