2 * Author......: Jens Steube <jens.steube@gmail.com>
7 * pure scalar functions
10 inline int hash_comp (const u32 d1
[4], __global u32
*d2
)
12 if (d1
[3] > d2
[DGST_R3
]) return ( 1);
13 if (d1
[3] < d2
[DGST_R3
]) return (-1);
14 if (d1
[2] > d2
[DGST_R2
]) return ( 1);
15 if (d1
[2] < d2
[DGST_R2
]) return (-1);
16 if (d1
[1] > d2
[DGST_R1
]) return ( 1);
17 if (d1
[1] < d2
[DGST_R1
]) return (-1);
18 if (d1
[0] > d2
[DGST_R0
]) return ( 1);
19 if (d1
[0] < d2
[DGST_R0
]) return (-1);
24 inline int find_hash (const u32 digest
[4], const u32 digests_cnt
, __global digest_t
*digests_buf
)
26 for (u32 l
= 0, r
= digests_cnt
; r
; r
>>= 1)
32 const int cmp
= hash_comp (digest
, digests_buf
[c
].digest_buf
);
41 if (cmp
== 0) return (c
);
47 inline u32
check_bitmap (__global u32
*bitmap
, const u32 bitmap_mask
, const u32 bitmap_shift
, const u32 digest
)
49 return (bitmap
[(digest
>> bitmap_shift
) & bitmap_mask
] & (1 << (digest
& 0x1f)));
52 inline u32
check (const u32 digest
[2], __global u32
*bitmap_s1_a
, __global u32
*bitmap_s1_b
, __global u32
*bitmap_s1_c
, __global u32
*bitmap_s1_d
, __global u32
*bitmap_s2_a
, __global u32
*bitmap_s2_b
, __global u32
*bitmap_s2_c
, __global u32
*bitmap_s2_d
, const u32 bitmap_mask
, const u32 bitmap_shift1
, const u32 bitmap_shift2
)
54 if (check_bitmap (bitmap_s1_a
, bitmap_mask
, bitmap_shift1
, digest
[0]) == 0) return (0);
55 if (check_bitmap (bitmap_s1_b
, bitmap_mask
, bitmap_shift1
, digest
[1]) == 0) return (0);
56 if (check_bitmap (bitmap_s1_c
, bitmap_mask
, bitmap_shift1
, digest
[2]) == 0) return (0);
57 if (check_bitmap (bitmap_s1_d
, bitmap_mask
, bitmap_shift1
, digest
[3]) == 0) return (0);
59 if (check_bitmap (bitmap_s2_a
, bitmap_mask
, bitmap_shift2
, digest
[0]) == 0) return (0);
60 if (check_bitmap (bitmap_s2_b
, bitmap_mask
, bitmap_shift2
, digest
[1]) == 0) return (0);
61 if (check_bitmap (bitmap_s2_c
, bitmap_mask
, bitmap_shift2
, digest
[2]) == 0) return (0);
62 if (check_bitmap (bitmap_s2_d
, bitmap_mask
, bitmap_shift2
, digest
[3]) == 0) return (0);
67 inline void mark_hash (__global plain_t
*plains_buf
, __global u32
*hashes_shown
, const int hash_pos
, const u32 gid
, const u32 il_pos
)
69 hashes_shown
[hash_pos
] = 1;
71 plains_buf
[hash_pos
].gidvid
= (gid
* 1) + 0;
72 plains_buf
[hash_pos
].il_pos
= il_pos
;
79 inline void truncate_block (u32x w
[4], const u32 len
)
88 case 1: w
[0] &= 0x000000FF;
93 case 2: w
[0] &= 0x0000FFFF;
98 case 3: w
[0] &= 0x00FFFFFF;
107 case 5: w
[1] &= 0x000000FF;
111 case 6: w
[1] &= 0x0000FFFF;
115 case 7: w
[1] &= 0x00FFFFFF;
122 case 9: w
[2] &= 0x000000FF;
125 case 10: w
[2] &= 0x0000FFFF;
128 case 11: w
[2] &= 0x00FFFFFF;
133 case 13: w
[3] &= 0x000000FF;
135 case 14: w
[3] &= 0x0000FFFF;
137 case 15: w
[3] &= 0x00FFFFFF;
142 inline void make_unicode (const u32x in
[4], u32x out1
[4], u32x out2
[4])
145 out2
[3] = __byte_perm (in
[3], 0, 0x7372);
146 out2
[2] = __byte_perm (in
[3], 0, 0x7170);
147 out2
[1] = __byte_perm (in
[2], 0, 0x7372);
148 out2
[0] = __byte_perm (in
[2], 0, 0x7170);
149 out1
[3] = __byte_perm (in
[1], 0, 0x7372);
150 out1
[2] = __byte_perm (in
[1], 0, 0x7170);
151 out1
[1] = __byte_perm (in
[0], 0, 0x7372);
152 out1
[0] = __byte_perm (in
[0], 0, 0x7170);
155 #if defined IS_AMD || defined IS_GENERIC
156 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
157 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
158 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
159 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
160 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
161 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
162 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
163 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
167 inline void undo_unicode (const u32x in1
[4], const u32x in2
[4], u32x out
[4])
170 out
[0] = __byte_perm (in1
[0], in1
[1], 0x6420);
171 out
[1] = __byte_perm (in1
[2], in1
[3], 0x6420);
172 out
[2] = __byte_perm (in2
[0], in2
[1], 0x6420);
173 out
[3] = __byte_perm (in2
[2], in2
[3], 0x6420);
176 #if defined IS_AMD || defined IS_GENERIC
177 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
178 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
179 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
180 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
181 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
182 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
183 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
184 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
188 inline void append_0x01_1x4 (u32x w0
[4], const u32 offset
)
197 w0
[0] = w0
[0] | 0x0100;
201 w0
[0] = w0
[0] | 0x010000;
205 w0
[0] = w0
[0] | 0x01000000;
213 w0
[1] = w0
[1] | 0x0100;
217 w0
[1] = w0
[1] | 0x010000;
221 w0
[1] = w0
[1] | 0x01000000;
229 w0
[2] = w0
[2] | 0x0100;
233 w0
[2] = w0
[2] | 0x010000;
237 w0
[2] = w0
[2] | 0x01000000;
245 w0
[3] = w0
[3] | 0x0100;
249 w0
[3] = w0
[3] | 0x010000;
253 w0
[3] = w0
[3] | 0x01000000;
258 inline void append_0x01_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
267 w0
[0] = w0
[0] | 0x0100;
271 w0
[0] = w0
[0] | 0x010000;
275 w0
[0] = w0
[0] | 0x01000000;
283 w0
[1] = w0
[1] | 0x0100;
287 w0
[1] = w0
[1] | 0x010000;
291 w0
[1] = w0
[1] | 0x01000000;
299 w0
[2] = w0
[2] | 0x0100;
303 w0
[2] = w0
[2] | 0x010000;
307 w0
[2] = w0
[2] | 0x01000000;
315 w0
[3] = w0
[3] | 0x0100;
319 w0
[3] = w0
[3] | 0x010000;
323 w0
[3] = w0
[3] | 0x01000000;
331 w1
[0] = w1
[0] | 0x0100;
335 w1
[0] = w1
[0] | 0x010000;
339 w1
[0] = w1
[0] | 0x01000000;
347 w1
[1] = w1
[1] | 0x0100;
351 w1
[1] = w1
[1] | 0x010000;
355 w1
[1] = w1
[1] | 0x01000000;
363 w1
[2] = w1
[2] | 0x0100;
367 w1
[2] = w1
[2] | 0x010000;
371 w1
[2] = w1
[2] | 0x01000000;
379 w1
[3] = w1
[3] | 0x0100;
383 w1
[3] = w1
[3] | 0x010000;
387 w1
[3] = w1
[3] | 0x01000000;
392 inline void append_0x01_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
401 w0
[0] = w0
[0] | 0x0100;
405 w0
[0] = w0
[0] | 0x010000;
409 w0
[0] = w0
[0] | 0x01000000;
417 w0
[1] = w0
[1] | 0x0100;
421 w0
[1] = w0
[1] | 0x010000;
425 w0
[1] = w0
[1] | 0x01000000;
433 w0
[2] = w0
[2] | 0x0100;
437 w0
[2] = w0
[2] | 0x010000;
441 w0
[2] = w0
[2] | 0x01000000;
449 w0
[3] = w0
[3] | 0x0100;
453 w0
[3] = w0
[3] | 0x010000;
457 w0
[3] = w0
[3] | 0x01000000;
465 w1
[0] = w1
[0] | 0x0100;
469 w1
[0] = w1
[0] | 0x010000;
473 w1
[0] = w1
[0] | 0x01000000;
481 w1
[1] = w1
[1] | 0x0100;
485 w1
[1] = w1
[1] | 0x010000;
489 w1
[1] = w1
[1] | 0x01000000;
497 w1
[2] = w1
[2] | 0x0100;
501 w1
[2] = w1
[2] | 0x010000;
505 w1
[2] = w1
[2] | 0x01000000;
513 w1
[3] = w1
[3] | 0x0100;
517 w1
[3] = w1
[3] | 0x010000;
521 w1
[3] = w1
[3] | 0x01000000;
529 w2
[0] = w2
[0] | 0x0100;
533 w2
[0] = w2
[0] | 0x010000;
537 w2
[0] = w2
[0] | 0x01000000;
545 w2
[1] = w2
[1] | 0x0100;
549 w2
[1] = w2
[1] | 0x010000;
553 w2
[1] = w2
[1] | 0x01000000;
561 w2
[2] = w2
[2] | 0x0100;
565 w2
[2] = w2
[2] | 0x010000;
569 w2
[2] = w2
[2] | 0x01000000;
577 w2
[3] = w2
[3] | 0x0100;
581 w2
[3] = w2
[3] | 0x010000;
585 w2
[3] = w2
[3] | 0x01000000;
590 inline void append_0x01_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
599 w0
[0] = w0
[0] | 0x0100;
603 w0
[0] = w0
[0] | 0x010000;
607 w0
[0] = w0
[0] | 0x01000000;
615 w0
[1] = w0
[1] | 0x0100;
619 w0
[1] = w0
[1] | 0x010000;
623 w0
[1] = w0
[1] | 0x01000000;
631 w0
[2] = w0
[2] | 0x0100;
635 w0
[2] = w0
[2] | 0x010000;
639 w0
[2] = w0
[2] | 0x01000000;
647 w0
[3] = w0
[3] | 0x0100;
651 w0
[3] = w0
[3] | 0x010000;
655 w0
[3] = w0
[3] | 0x01000000;
663 w1
[0] = w1
[0] | 0x0100;
667 w1
[0] = w1
[0] | 0x010000;
671 w1
[0] = w1
[0] | 0x01000000;
679 w1
[1] = w1
[1] | 0x0100;
683 w1
[1] = w1
[1] | 0x010000;
687 w1
[1] = w1
[1] | 0x01000000;
695 w1
[2] = w1
[2] | 0x0100;
699 w1
[2] = w1
[2] | 0x010000;
703 w1
[2] = w1
[2] | 0x01000000;
711 w1
[3] = w1
[3] | 0x0100;
715 w1
[3] = w1
[3] | 0x010000;
719 w1
[3] = w1
[3] | 0x01000000;
727 w2
[0] = w2
[0] | 0x0100;
731 w2
[0] = w2
[0] | 0x010000;
735 w2
[0] = w2
[0] | 0x01000000;
743 w2
[1] = w2
[1] | 0x0100;
747 w2
[1] = w2
[1] | 0x010000;
751 w2
[1] = w2
[1] | 0x01000000;
759 w2
[2] = w2
[2] | 0x0100;
763 w2
[2] = w2
[2] | 0x010000;
767 w2
[2] = w2
[2] | 0x01000000;
775 w2
[3] = w2
[3] | 0x0100;
779 w2
[3] = w2
[3] | 0x010000;
783 w2
[3] = w2
[3] | 0x01000000;
791 w3
[0] = w3
[0] | 0x0100;
795 w3
[0] = w3
[0] | 0x010000;
799 w3
[0] = w3
[0] | 0x01000000;
807 w3
[1] = w3
[1] | 0x0100;
811 w3
[1] = w3
[1] | 0x010000;
815 w3
[1] = w3
[1] | 0x01000000;
823 w3
[2] = w3
[2] | 0x0100;
827 w3
[2] = w3
[2] | 0x010000;
831 w3
[2] = w3
[2] | 0x01000000;
839 w3
[3] = w3
[3] | 0x0100;
843 w3
[3] = w3
[3] | 0x010000;
847 w3
[3] = w3
[3] | 0x01000000;
852 inline void append_0x01_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
861 w0
[0] = w0
[0] | 0x0100;
865 w0
[0] = w0
[0] | 0x010000;
869 w0
[0] = w0
[0] | 0x01000000;
877 w0
[1] = w0
[1] | 0x0100;
881 w0
[1] = w0
[1] | 0x010000;
885 w0
[1] = w0
[1] | 0x01000000;
893 w0
[2] = w0
[2] | 0x0100;
897 w0
[2] = w0
[2] | 0x010000;
901 w0
[2] = w0
[2] | 0x01000000;
909 w0
[3] = w0
[3] | 0x0100;
913 w0
[3] = w0
[3] | 0x010000;
917 w0
[3] = w0
[3] | 0x01000000;
925 w1
[0] = w1
[0] | 0x0100;
929 w1
[0] = w1
[0] | 0x010000;
933 w1
[0] = w1
[0] | 0x01000000;
941 w1
[1] = w1
[1] | 0x0100;
945 w1
[1] = w1
[1] | 0x010000;
949 w1
[1] = w1
[1] | 0x01000000;
957 w1
[2] = w1
[2] | 0x0100;
961 w1
[2] = w1
[2] | 0x010000;
965 w1
[2] = w1
[2] | 0x01000000;
973 w1
[3] = w1
[3] | 0x0100;
977 w1
[3] = w1
[3] | 0x010000;
981 w1
[3] = w1
[3] | 0x01000000;
989 w2
[0] = w2
[0] | 0x0100;
993 w2
[0] = w2
[0] | 0x010000;
997 w2
[0] = w2
[0] | 0x01000000;
1005 w2
[1] = w2
[1] | 0x0100;
1009 w2
[1] = w2
[1] | 0x010000;
1013 w2
[1] = w2
[1] | 0x01000000;
1021 w2
[2] = w2
[2] | 0x0100;
1025 w2
[2] = w2
[2] | 0x010000;
1029 w2
[2] = w2
[2] | 0x01000000;
1037 w2
[3] = w2
[3] | 0x0100;
1041 w2
[3] = w2
[3] | 0x010000;
1045 w2
[3] = w2
[3] | 0x01000000;
1053 w3
[0] = w3
[0] | 0x0100;
1057 w3
[0] = w3
[0] | 0x010000;
1061 w3
[0] = w3
[0] | 0x01000000;
1069 w3
[1] = w3
[1] | 0x0100;
1073 w3
[1] = w3
[1] | 0x010000;
1077 w3
[1] = w3
[1] | 0x01000000;
1085 w3
[2] = w3
[2] | 0x0100;
1089 w3
[2] = w3
[2] | 0x010000;
1093 w3
[2] = w3
[2] | 0x01000000;
1101 w3
[3] = w3
[3] | 0x0100;
1105 w3
[3] = w3
[3] | 0x010000;
1109 w3
[3] = w3
[3] | 0x01000000;
1117 w4
[0] = w4
[0] | 0x0100;
1121 w4
[0] = w4
[0] | 0x010000;
1125 w4
[0] = w4
[0] | 0x01000000;
1133 w4
[1] = w4
[1] | 0x0100;
1137 w4
[1] = w4
[1] | 0x010000;
1141 w4
[1] = w4
[1] | 0x01000000;
1149 w4
[2] = w4
[2] | 0x0100;
1153 w4
[2] = w4
[2] | 0x010000;
1157 w4
[2] = w4
[2] | 0x01000000;
1165 w4
[3] = w4
[3] | 0x0100;
1169 w4
[3] = w4
[3] | 0x010000;
1173 w4
[3] = w4
[3] | 0x01000000;
1181 w5
[0] = w5
[0] | 0x0100;
1185 w5
[0] = w5
[0] | 0x010000;
1189 w5
[0] = w5
[0] | 0x01000000;
1197 w5
[1] = w5
[1] | 0x0100;
1201 w5
[1] = w5
[1] | 0x010000;
1205 w5
[1] = w5
[1] | 0x01000000;
1213 w5
[2] = w5
[2] | 0x0100;
1217 w5
[2] = w5
[2] | 0x010000;
1221 w5
[2] = w5
[2] | 0x01000000;
1229 w5
[3] = w5
[3] | 0x0100;
1233 w5
[3] = w5
[3] | 0x010000;
1237 w5
[3] = w5
[3] | 0x01000000;
1245 w6
[0] = w6
[0] | 0x0100;
1249 w6
[0] = w6
[0] | 0x010000;
1253 w6
[0] = w6
[0] | 0x01000000;
1261 w6
[1] = w6
[1] | 0x0100;
1265 w6
[1] = w6
[1] | 0x010000;
1269 w6
[1] = w6
[1] | 0x01000000;
1277 w6
[2] = w6
[2] | 0x0100;
1281 w6
[2] = w6
[2] | 0x010000;
1285 w6
[2] = w6
[2] | 0x01000000;
1293 w6
[3] = w6
[3] | 0x0100;
1297 w6
[3] = w6
[3] | 0x010000;
1301 w6
[3] = w6
[3] | 0x01000000;
1309 w7
[0] = w7
[0] | 0x0100;
1313 w7
[0] = w7
[0] | 0x010000;
1317 w7
[0] = w7
[0] | 0x01000000;
1325 w7
[1] = w7
[1] | 0x0100;
1329 w7
[1] = w7
[1] | 0x010000;
1333 w7
[1] = w7
[1] | 0x01000000;
1341 w7
[2] = w7
[2] | 0x0100;
1345 w7
[2] = w7
[2] | 0x010000;
1349 w7
[2] = w7
[2] | 0x01000000;
1357 w7
[3] = w7
[3] | 0x0100;
1361 w7
[3] = w7
[3] | 0x010000;
1365 w7
[3] = w7
[3] | 0x01000000;
1370 inline void append_0x02_1x4 (u32x w0
[4], const u32 offset
)
1379 w0
[0] = w0
[0] | 0x0200;
1383 w0
[0] = w0
[0] | 0x020000;
1387 w0
[0] = w0
[0] | 0x02000000;
1395 w0
[1] = w0
[1] | 0x0200;
1399 w0
[1] = w0
[1] | 0x020000;
1403 w0
[1] = w0
[1] | 0x02000000;
1411 w0
[2] = w0
[2] | 0x0200;
1415 w0
[2] = w0
[2] | 0x020000;
1419 w0
[2] = w0
[2] | 0x02000000;
1427 w0
[3] = w0
[3] | 0x0200;
1431 w0
[3] = w0
[3] | 0x020000;
1435 w0
[3] = w0
[3] | 0x02000000;
1440 inline void append_0x02_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
1449 w0
[0] = w0
[0] | 0x0200;
1453 w0
[0] = w0
[0] | 0x020000;
1457 w0
[0] = w0
[0] | 0x02000000;
1465 w0
[1] = w0
[1] | 0x0200;
1469 w0
[1] = w0
[1] | 0x020000;
1473 w0
[1] = w0
[1] | 0x02000000;
1481 w0
[2] = w0
[2] | 0x0200;
1485 w0
[2] = w0
[2] | 0x020000;
1489 w0
[2] = w0
[2] | 0x02000000;
1497 w0
[3] = w0
[3] | 0x0200;
1501 w0
[3] = w0
[3] | 0x020000;
1505 w0
[3] = w0
[3] | 0x02000000;
1513 w1
[0] = w1
[0] | 0x0200;
1517 w1
[0] = w1
[0] | 0x020000;
1521 w1
[0] = w1
[0] | 0x02000000;
1529 w1
[1] = w1
[1] | 0x0200;
1533 w1
[1] = w1
[1] | 0x020000;
1537 w1
[1] = w1
[1] | 0x02000000;
1545 w1
[2] = w1
[2] | 0x0200;
1549 w1
[2] = w1
[2] | 0x020000;
1553 w1
[2] = w1
[2] | 0x02000000;
1561 w1
[3] = w1
[3] | 0x0200;
1565 w1
[3] = w1
[3] | 0x020000;
1569 w1
[3] = w1
[3] | 0x02000000;
1574 inline void append_0x02_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
1583 w0
[0] = w0
[0] | 0x0200;
1587 w0
[0] = w0
[0] | 0x020000;
1591 w0
[0] = w0
[0] | 0x02000000;
1599 w0
[1] = w0
[1] | 0x0200;
1603 w0
[1] = w0
[1] | 0x020000;
1607 w0
[1] = w0
[1] | 0x02000000;
1615 w0
[2] = w0
[2] | 0x0200;
1619 w0
[2] = w0
[2] | 0x020000;
1623 w0
[2] = w0
[2] | 0x02000000;
1631 w0
[3] = w0
[3] | 0x0200;
1635 w0
[3] = w0
[3] | 0x020000;
1639 w0
[3] = w0
[3] | 0x02000000;
1647 w1
[0] = w1
[0] | 0x0200;
1651 w1
[0] = w1
[0] | 0x020000;
1655 w1
[0] = w1
[0] | 0x02000000;
1663 w1
[1] = w1
[1] | 0x0200;
1667 w1
[1] = w1
[1] | 0x020000;
1671 w1
[1] = w1
[1] | 0x02000000;
1679 w1
[2] = w1
[2] | 0x0200;
1683 w1
[2] = w1
[2] | 0x020000;
1687 w1
[2] = w1
[2] | 0x02000000;
1695 w1
[3] = w1
[3] | 0x0200;
1699 w1
[3] = w1
[3] | 0x020000;
1703 w1
[3] = w1
[3] | 0x02000000;
1711 w2
[0] = w2
[0] | 0x0200;
1715 w2
[0] = w2
[0] | 0x020000;
1719 w2
[0] = w2
[0] | 0x02000000;
1727 w2
[1] = w2
[1] | 0x0200;
1731 w2
[1] = w2
[1] | 0x020000;
1735 w2
[1] = w2
[1] | 0x02000000;
1743 w2
[2] = w2
[2] | 0x0200;
1747 w2
[2] = w2
[2] | 0x020000;
1751 w2
[2] = w2
[2] | 0x02000000;
1759 w2
[3] = w2
[3] | 0x0200;
1763 w2
[3] = w2
[3] | 0x020000;
1767 w2
[3] = w2
[3] | 0x02000000;
1772 inline void append_0x02_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
1781 w0
[0] = w0
[0] | 0x0200;
1785 w0
[0] = w0
[0] | 0x020000;
1789 w0
[0] = w0
[0] | 0x02000000;
1797 w0
[1] = w0
[1] | 0x0200;
1801 w0
[1] = w0
[1] | 0x020000;
1805 w0
[1] = w0
[1] | 0x02000000;
1813 w0
[2] = w0
[2] | 0x0200;
1817 w0
[2] = w0
[2] | 0x020000;
1821 w0
[2] = w0
[2] | 0x02000000;
1829 w0
[3] = w0
[3] | 0x0200;
1833 w0
[3] = w0
[3] | 0x020000;
1837 w0
[3] = w0
[3] | 0x02000000;
1845 w1
[0] = w1
[0] | 0x0200;
1849 w1
[0] = w1
[0] | 0x020000;
1853 w1
[0] = w1
[0] | 0x02000000;
1861 w1
[1] = w1
[1] | 0x0200;
1865 w1
[1] = w1
[1] | 0x020000;
1869 w1
[1] = w1
[1] | 0x02000000;
1877 w1
[2] = w1
[2] | 0x0200;
1881 w1
[2] = w1
[2] | 0x020000;
1885 w1
[2] = w1
[2] | 0x02000000;
1893 w1
[3] = w1
[3] | 0x0200;
1897 w1
[3] = w1
[3] | 0x020000;
1901 w1
[3] = w1
[3] | 0x02000000;
1909 w2
[0] = w2
[0] | 0x0200;
1913 w2
[0] = w2
[0] | 0x020000;
1917 w2
[0] = w2
[0] | 0x02000000;
1925 w2
[1] = w2
[1] | 0x0200;
1929 w2
[1] = w2
[1] | 0x020000;
1933 w2
[1] = w2
[1] | 0x02000000;
1941 w2
[2] = w2
[2] | 0x0200;
1945 w2
[2] = w2
[2] | 0x020000;
1949 w2
[2] = w2
[2] | 0x02000000;
1957 w2
[3] = w2
[3] | 0x0200;
1961 w2
[3] = w2
[3] | 0x020000;
1965 w2
[3] = w2
[3] | 0x02000000;
1973 w3
[0] = w3
[0] | 0x0200;
1977 w3
[0] = w3
[0] | 0x020000;
1981 w3
[0] = w3
[0] | 0x02000000;
1989 w3
[1] = w3
[1] | 0x0200;
1993 w3
[1] = w3
[1] | 0x020000;
1997 w3
[1] = w3
[1] | 0x02000000;
2005 w3
[2] = w3
[2] | 0x0200;
2009 w3
[2] = w3
[2] | 0x020000;
2013 w3
[2] = w3
[2] | 0x02000000;
2021 w3
[3] = w3
[3] | 0x0200;
2025 w3
[3] = w3
[3] | 0x020000;
2029 w3
[3] = w3
[3] | 0x02000000;
2034 inline void append_0x02_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
2043 w0
[0] = w0
[0] | 0x0200;
2047 w0
[0] = w0
[0] | 0x020000;
2051 w0
[0] = w0
[0] | 0x02000000;
2059 w0
[1] = w0
[1] | 0x0200;
2063 w0
[1] = w0
[1] | 0x020000;
2067 w0
[1] = w0
[1] | 0x02000000;
2075 w0
[2] = w0
[2] | 0x0200;
2079 w0
[2] = w0
[2] | 0x020000;
2083 w0
[2] = w0
[2] | 0x02000000;
2091 w0
[3] = w0
[3] | 0x0200;
2095 w0
[3] = w0
[3] | 0x020000;
2099 w0
[3] = w0
[3] | 0x02000000;
2107 w1
[0] = w1
[0] | 0x0200;
2111 w1
[0] = w1
[0] | 0x020000;
2115 w1
[0] = w1
[0] | 0x02000000;
2123 w1
[1] = w1
[1] | 0x0200;
2127 w1
[1] = w1
[1] | 0x020000;
2131 w1
[1] = w1
[1] | 0x02000000;
2139 w1
[2] = w1
[2] | 0x0200;
2143 w1
[2] = w1
[2] | 0x020000;
2147 w1
[2] = w1
[2] | 0x02000000;
2155 w1
[3] = w1
[3] | 0x0200;
2159 w1
[3] = w1
[3] | 0x020000;
2163 w1
[3] = w1
[3] | 0x02000000;
2171 w2
[0] = w2
[0] | 0x0200;
2175 w2
[0] = w2
[0] | 0x020000;
2179 w2
[0] = w2
[0] | 0x02000000;
2187 w2
[1] = w2
[1] | 0x0200;
2191 w2
[1] = w2
[1] | 0x020000;
2195 w2
[1] = w2
[1] | 0x02000000;
2203 w2
[2] = w2
[2] | 0x0200;
2207 w2
[2] = w2
[2] | 0x020000;
2211 w2
[2] = w2
[2] | 0x02000000;
2219 w2
[3] = w2
[3] | 0x0200;
2223 w2
[3] = w2
[3] | 0x020000;
2227 w2
[3] = w2
[3] | 0x02000000;
2235 w3
[0] = w3
[0] | 0x0200;
2239 w3
[0] = w3
[0] | 0x020000;
2243 w3
[0] = w3
[0] | 0x02000000;
2251 w3
[1] = w3
[1] | 0x0200;
2255 w3
[1] = w3
[1] | 0x020000;
2259 w3
[1] = w3
[1] | 0x02000000;
2267 w3
[2] = w3
[2] | 0x0200;
2271 w3
[2] = w3
[2] | 0x020000;
2275 w3
[2] = w3
[2] | 0x02000000;
2283 w3
[3] = w3
[3] | 0x0200;
2287 w3
[3] = w3
[3] | 0x020000;
2291 w3
[3] = w3
[3] | 0x02000000;
2299 w4
[0] = w4
[0] | 0x0200;
2303 w4
[0] = w4
[0] | 0x020000;
2307 w4
[0] = w4
[0] | 0x02000000;
2315 w4
[1] = w4
[1] | 0x0200;
2319 w4
[1] = w4
[1] | 0x020000;
2323 w4
[1] = w4
[1] | 0x02000000;
2331 w4
[2] = w4
[2] | 0x0200;
2335 w4
[2] = w4
[2] | 0x020000;
2339 w4
[2] = w4
[2] | 0x02000000;
2347 w4
[3] = w4
[3] | 0x0200;
2351 w4
[3] = w4
[3] | 0x020000;
2355 w4
[3] = w4
[3] | 0x02000000;
2363 w5
[0] = w5
[0] | 0x0200;
2367 w5
[0] = w5
[0] | 0x020000;
2371 w5
[0] = w5
[0] | 0x02000000;
2379 w5
[1] = w5
[1] | 0x0200;
2383 w5
[1] = w5
[1] | 0x020000;
2387 w5
[1] = w5
[1] | 0x02000000;
2395 w5
[2] = w5
[2] | 0x0200;
2399 w5
[2] = w5
[2] | 0x020000;
2403 w5
[2] = w5
[2] | 0x02000000;
2411 w5
[3] = w5
[3] | 0x0200;
2415 w5
[3] = w5
[3] | 0x020000;
2419 w5
[3] = w5
[3] | 0x02000000;
2427 w6
[0] = w6
[0] | 0x0200;
2431 w6
[0] = w6
[0] | 0x020000;
2435 w6
[0] = w6
[0] | 0x02000000;
2443 w6
[1] = w6
[1] | 0x0200;
2447 w6
[1] = w6
[1] | 0x020000;
2451 w6
[1] = w6
[1] | 0x02000000;
2459 w6
[2] = w6
[2] | 0x0200;
2463 w6
[2] = w6
[2] | 0x020000;
2467 w6
[2] = w6
[2] | 0x02000000;
2475 w6
[3] = w6
[3] | 0x0200;
2479 w6
[3] = w6
[3] | 0x020000;
2483 w6
[3] = w6
[3] | 0x02000000;
2491 w7
[0] = w7
[0] | 0x0200;
2495 w7
[0] = w7
[0] | 0x020000;
2499 w7
[0] = w7
[0] | 0x02000000;
2507 w7
[1] = w7
[1] | 0x0200;
2511 w7
[1] = w7
[1] | 0x020000;
2515 w7
[1] = w7
[1] | 0x02000000;
2523 w7
[2] = w7
[2] | 0x0200;
2527 w7
[2] = w7
[2] | 0x020000;
2531 w7
[2] = w7
[2] | 0x02000000;
2539 w7
[3] = w7
[3] | 0x0200;
2543 w7
[3] = w7
[3] | 0x020000;
2547 w7
[3] = w7
[3] | 0x02000000;
2552 inline void append_0x80_1x4 (u32x w0
[4], const u32 offset
)
2561 w0
[0] = w0
[0] | 0x8000;
2565 w0
[0] = w0
[0] | 0x800000;
2569 w0
[0] = w0
[0] | 0x80000000;
2577 w0
[1] = w0
[1] | 0x8000;
2581 w0
[1] = w0
[1] | 0x800000;
2585 w0
[1] = w0
[1] | 0x80000000;
2593 w0
[2] = w0
[2] | 0x8000;
2597 w0
[2] = w0
[2] | 0x800000;
2601 w0
[2] = w0
[2] | 0x80000000;
2609 w0
[3] = w0
[3] | 0x8000;
2613 w0
[3] = w0
[3] | 0x800000;
2617 w0
[3] = w0
[3] | 0x80000000;
2622 inline void append_0x80_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
2631 w0
[0] = w0
[0] | 0x8000;
2635 w0
[0] = w0
[0] | 0x800000;
2639 w0
[0] = w0
[0] | 0x80000000;
2647 w0
[1] = w0
[1] | 0x8000;
2651 w0
[1] = w0
[1] | 0x800000;
2655 w0
[1] = w0
[1] | 0x80000000;
2663 w0
[2] = w0
[2] | 0x8000;
2667 w0
[2] = w0
[2] | 0x800000;
2671 w0
[2] = w0
[2] | 0x80000000;
2679 w0
[3] = w0
[3] | 0x8000;
2683 w0
[3] = w0
[3] | 0x800000;
2687 w0
[3] = w0
[3] | 0x80000000;
2695 w1
[0] = w1
[0] | 0x8000;
2699 w1
[0] = w1
[0] | 0x800000;
2703 w1
[0] = w1
[0] | 0x80000000;
2711 w1
[1] = w1
[1] | 0x8000;
2715 w1
[1] = w1
[1] | 0x800000;
2719 w1
[1] = w1
[1] | 0x80000000;
2727 w1
[2] = w1
[2] | 0x8000;
2731 w1
[2] = w1
[2] | 0x800000;
2735 w1
[2] = w1
[2] | 0x80000000;
2743 w1
[3] = w1
[3] | 0x8000;
2747 w1
[3] = w1
[3] | 0x800000;
2751 w1
[3] = w1
[3] | 0x80000000;
2756 inline void append_0x80_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
2765 w0
[0] = w0
[0] | 0x8000;
2769 w0
[0] = w0
[0] | 0x800000;
2773 w0
[0] = w0
[0] | 0x80000000;
2781 w0
[1] = w0
[1] | 0x8000;
2785 w0
[1] = w0
[1] | 0x800000;
2789 w0
[1] = w0
[1] | 0x80000000;
2797 w0
[2] = w0
[2] | 0x8000;
2801 w0
[2] = w0
[2] | 0x800000;
2805 w0
[2] = w0
[2] | 0x80000000;
2813 w0
[3] = w0
[3] | 0x8000;
2817 w0
[3] = w0
[3] | 0x800000;
2821 w0
[3] = w0
[3] | 0x80000000;
2829 w1
[0] = w1
[0] | 0x8000;
2833 w1
[0] = w1
[0] | 0x800000;
2837 w1
[0] = w1
[0] | 0x80000000;
2845 w1
[1] = w1
[1] | 0x8000;
2849 w1
[1] = w1
[1] | 0x800000;
2853 w1
[1] = w1
[1] | 0x80000000;
2861 w1
[2] = w1
[2] | 0x8000;
2865 w1
[2] = w1
[2] | 0x800000;
2869 w1
[2] = w1
[2] | 0x80000000;
2877 w1
[3] = w1
[3] | 0x8000;
2881 w1
[3] = w1
[3] | 0x800000;
2885 w1
[3] = w1
[3] | 0x80000000;
2893 w2
[0] = w2
[0] | 0x8000;
2897 w2
[0] = w2
[0] | 0x800000;
2901 w2
[0] = w2
[0] | 0x80000000;
2909 w2
[1] = w2
[1] | 0x8000;
2913 w2
[1] = w2
[1] | 0x800000;
2917 w2
[1] = w2
[1] | 0x80000000;
2925 w2
[2] = w2
[2] | 0x8000;
2929 w2
[2] = w2
[2] | 0x800000;
2933 w2
[2] = w2
[2] | 0x80000000;
2941 w2
[3] = w2
[3] | 0x8000;
2945 w2
[3] = w2
[3] | 0x800000;
2949 w2
[3] = w2
[3] | 0x80000000;
2954 inline void append_0x80_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
2963 w0
[0] = w0
[0] | 0x8000;
2967 w0
[0] = w0
[0] | 0x800000;
2971 w0
[0] = w0
[0] | 0x80000000;
2979 w0
[1] = w0
[1] | 0x8000;
2983 w0
[1] = w0
[1] | 0x800000;
2987 w0
[1] = w0
[1] | 0x80000000;
2995 w0
[2] = w0
[2] | 0x8000;
2999 w0
[2] = w0
[2] | 0x800000;
3003 w0
[2] = w0
[2] | 0x80000000;
3011 w0
[3] = w0
[3] | 0x8000;
3015 w0
[3] = w0
[3] | 0x800000;
3019 w0
[3] = w0
[3] | 0x80000000;
3027 w1
[0] = w1
[0] | 0x8000;
3031 w1
[0] = w1
[0] | 0x800000;
3035 w1
[0] = w1
[0] | 0x80000000;
3043 w1
[1] = w1
[1] | 0x8000;
3047 w1
[1] = w1
[1] | 0x800000;
3051 w1
[1] = w1
[1] | 0x80000000;
3059 w1
[2] = w1
[2] | 0x8000;
3063 w1
[2] = w1
[2] | 0x800000;
3067 w1
[2] = w1
[2] | 0x80000000;
3075 w1
[3] = w1
[3] | 0x8000;
3079 w1
[3] = w1
[3] | 0x800000;
3083 w1
[3] = w1
[3] | 0x80000000;
3091 w2
[0] = w2
[0] | 0x8000;
3095 w2
[0] = w2
[0] | 0x800000;
3099 w2
[0] = w2
[0] | 0x80000000;
3107 w2
[1] = w2
[1] | 0x8000;
3111 w2
[1] = w2
[1] | 0x800000;
3115 w2
[1] = w2
[1] | 0x80000000;
3123 w2
[2] = w2
[2] | 0x8000;
3127 w2
[2] = w2
[2] | 0x800000;
3131 w2
[2] = w2
[2] | 0x80000000;
3139 w2
[3] = w2
[3] | 0x8000;
3143 w2
[3] = w2
[3] | 0x800000;
3147 w2
[3] = w2
[3] | 0x80000000;
3155 w3
[0] = w3
[0] | 0x8000;
3159 w3
[0] = w3
[0] | 0x800000;
3163 w3
[0] = w3
[0] | 0x80000000;
3171 w3
[1] = w3
[1] | 0x8000;
3175 w3
[1] = w3
[1] | 0x800000;
3179 w3
[1] = w3
[1] | 0x80000000;
3187 w3
[2] = w3
[2] | 0x8000;
3191 w3
[2] = w3
[2] | 0x800000;
3195 w3
[2] = w3
[2] | 0x80000000;
3203 w3
[3] = w3
[3] | 0x8000;
3207 w3
[3] = w3
[3] | 0x800000;
3211 w3
[3] = w3
[3] | 0x80000000;
3216 inline void append_0x80_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
3225 w0
[0] = w0
[0] | 0x8000;
3229 w0
[0] = w0
[0] | 0x800000;
3233 w0
[0] = w0
[0] | 0x80000000;
3241 w0
[1] = w0
[1] | 0x8000;
3245 w0
[1] = w0
[1] | 0x800000;
3249 w0
[1] = w0
[1] | 0x80000000;
3257 w0
[2] = w0
[2] | 0x8000;
3261 w0
[2] = w0
[2] | 0x800000;
3265 w0
[2] = w0
[2] | 0x80000000;
3273 w0
[3] = w0
[3] | 0x8000;
3277 w0
[3] = w0
[3] | 0x800000;
3281 w0
[3] = w0
[3] | 0x80000000;
3289 w1
[0] = w1
[0] | 0x8000;
3293 w1
[0] = w1
[0] | 0x800000;
3297 w1
[0] = w1
[0] | 0x80000000;
3305 w1
[1] = w1
[1] | 0x8000;
3309 w1
[1] = w1
[1] | 0x800000;
3313 w1
[1] = w1
[1] | 0x80000000;
3321 w1
[2] = w1
[2] | 0x8000;
3325 w1
[2] = w1
[2] | 0x800000;
3329 w1
[2] = w1
[2] | 0x80000000;
3337 w1
[3] = w1
[3] | 0x8000;
3341 w1
[3] = w1
[3] | 0x800000;
3345 w1
[3] = w1
[3] | 0x80000000;
3353 w2
[0] = w2
[0] | 0x8000;
3357 w2
[0] = w2
[0] | 0x800000;
3361 w2
[0] = w2
[0] | 0x80000000;
3369 w2
[1] = w2
[1] | 0x8000;
3373 w2
[1] = w2
[1] | 0x800000;
3377 w2
[1] = w2
[1] | 0x80000000;
3385 w2
[2] = w2
[2] | 0x8000;
3389 w2
[2] = w2
[2] | 0x800000;
3393 w2
[2] = w2
[2] | 0x80000000;
3401 w2
[3] = w2
[3] | 0x8000;
3405 w2
[3] = w2
[3] | 0x800000;
3409 w2
[3] = w2
[3] | 0x80000000;
3417 w3
[0] = w3
[0] | 0x8000;
3421 w3
[0] = w3
[0] | 0x800000;
3425 w3
[0] = w3
[0] | 0x80000000;
3433 w3
[1] = w3
[1] | 0x8000;
3437 w3
[1] = w3
[1] | 0x800000;
3441 w3
[1] = w3
[1] | 0x80000000;
3449 w3
[2] = w3
[2] | 0x8000;
3453 w3
[2] = w3
[2] | 0x800000;
3457 w3
[2] = w3
[2] | 0x80000000;
3465 w3
[3] = w3
[3] | 0x8000;
3469 w3
[3] = w3
[3] | 0x800000;
3473 w3
[3] = w3
[3] | 0x80000000;
3481 w4
[0] = w4
[0] | 0x8000;
3485 w4
[0] = w4
[0] | 0x800000;
3489 w4
[0] = w4
[0] | 0x80000000;
3497 w4
[1] = w4
[1] | 0x8000;
3501 w4
[1] = w4
[1] | 0x800000;
3505 w4
[1] = w4
[1] | 0x80000000;
3513 w4
[2] = w4
[2] | 0x8000;
3517 w4
[2] = w4
[2] | 0x800000;
3521 w4
[2] = w4
[2] | 0x80000000;
3529 w4
[3] = w4
[3] | 0x8000;
3533 w4
[3] = w4
[3] | 0x800000;
3537 w4
[3] = w4
[3] | 0x80000000;
3545 w5
[0] = w5
[0] | 0x8000;
3549 w5
[0] = w5
[0] | 0x800000;
3553 w5
[0] = w5
[0] | 0x80000000;
3561 w5
[1] = w5
[1] | 0x8000;
3565 w5
[1] = w5
[1] | 0x800000;
3569 w5
[1] = w5
[1] | 0x80000000;
3577 w5
[2] = w5
[2] | 0x8000;
3581 w5
[2] = w5
[2] | 0x800000;
3585 w5
[2] = w5
[2] | 0x80000000;
3593 w5
[3] = w5
[3] | 0x8000;
3597 w5
[3] = w5
[3] | 0x800000;
3601 w5
[3] = w5
[3] | 0x80000000;
3609 w6
[0] = w6
[0] | 0x8000;
3613 w6
[0] = w6
[0] | 0x800000;
3617 w6
[0] = w6
[0] | 0x80000000;
3625 w6
[1] = w6
[1] | 0x8000;
3629 w6
[1] = w6
[1] | 0x800000;
3633 w6
[1] = w6
[1] | 0x80000000;
3641 w6
[2] = w6
[2] | 0x8000;
3645 w6
[2] = w6
[2] | 0x800000;
3649 w6
[2] = w6
[2] | 0x80000000;
3657 w6
[3] = w6
[3] | 0x8000;
3661 w6
[3] = w6
[3] | 0x800000;
3665 w6
[3] = w6
[3] | 0x80000000;
3673 w7
[0] = w7
[0] | 0x8000;
3677 w7
[0] = w7
[0] | 0x800000;
3681 w7
[0] = w7
[0] | 0x80000000;
3689 w7
[1] = w7
[1] | 0x8000;
3693 w7
[1] = w7
[1] | 0x800000;
3697 w7
[1] = w7
[1] | 0x80000000;
3705 w7
[2] = w7
[2] | 0x8000;
3709 w7
[2] = w7
[2] | 0x800000;
3713 w7
[2] = w7
[2] | 0x80000000;
3721 w7
[3] = w7
[3] | 0x8000;
3725 w7
[3] = w7
[3] | 0x800000;
3729 w7
[3] = w7
[3] | 0x80000000;
3734 inline void append_0x80_1x16 (u32x w
[16], const u32 offset
)
3743 w
[ 0] = w
[ 0] | 0x8000;
3747 w
[ 0] = w
[ 0] | 0x800000;
3751 w
[ 0] = w
[ 0] | 0x80000000;
3759 w
[ 1] = w
[ 1] | 0x8000;
3763 w
[ 1] = w
[ 1] | 0x800000;
3767 w
[ 1] = w
[ 1] | 0x80000000;
3775 w
[ 2] = w
[ 2] | 0x8000;
3779 w
[ 2] = w
[ 2] | 0x800000;
3783 w
[ 2] = w
[ 2] | 0x80000000;
3791 w
[ 3] = w
[ 3] | 0x8000;
3795 w
[ 3] = w
[ 3] | 0x800000;
3799 w
[ 3] = w
[ 3] | 0x80000000;
3807 w
[ 4] = w
[ 4] | 0x8000;
3811 w
[ 4] = w
[ 4] | 0x800000;
3815 w
[ 4] = w
[ 4] | 0x80000000;
3823 w
[ 5] = w
[ 5] | 0x8000;
3827 w
[ 5] = w
[ 5] | 0x800000;
3831 w
[ 5] = w
[ 5] | 0x80000000;
3839 w
[ 6] = w
[ 6] | 0x8000;
3843 w
[ 6] = w
[ 6] | 0x800000;
3847 w
[ 6] = w
[ 6] | 0x80000000;
3855 w
[ 7] = w
[ 7] | 0x8000;
3859 w
[ 7] = w
[ 7] | 0x800000;
3863 w
[ 7] = w
[ 7] | 0x80000000;
3871 w
[ 8] = w
[ 8] | 0x8000;
3875 w
[ 8] = w
[ 8] | 0x800000;
3879 w
[ 8] = w
[ 8] | 0x80000000;
3887 w
[ 9] = w
[ 9] | 0x8000;
3891 w
[ 9] = w
[ 9] | 0x800000;
3895 w
[ 9] = w
[ 9] | 0x80000000;
3903 w
[10] = w
[10] | 0x8000;
3907 w
[10] = w
[10] | 0x800000;
3911 w
[10] = w
[10] | 0x80000000;
3919 w
[11] = w
[11] | 0x8000;
3923 w
[11] = w
[11] | 0x800000;
3927 w
[11] = w
[11] | 0x80000000;
3935 w
[12] = w
[12] | 0x8000;
3939 w
[12] = w
[12] | 0x800000;
3943 w
[12] = w
[12] | 0x80000000;
3951 w
[13] = w
[13] | 0x8000;
3955 w
[13] = w
[13] | 0x800000;
3959 w
[13] = w
[13] | 0x80000000;
3967 w
[14] = w
[14] | 0x8000;
3971 w
[14] = w
[14] | 0x800000;
3975 w
[14] = w
[14] | 0x80000000;
3983 w
[15] = w
[15] | 0x8000;
3987 w
[15] = w
[15] | 0x800000;
3991 w
[15] = w
[15] | 0x80000000;
3996 inline void switch_buffer_by_offset_le (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
3998 #if defined IS_AMD || defined IS_GENERIC
3999 const int offset_mod_4
= offset
& 3;
4001 const int offset_minus_4
= 4 - offset
;
4006 w3
[2] = amd_bytealign ( 0, w3
[1], offset_minus_4
);
4007 w3
[1] = amd_bytealign (w3
[1], w3
[0], offset_minus_4
);
4008 w3
[0] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4009 w2
[3] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4010 w2
[2] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4011 w2
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4012 w2
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4013 w1
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4014 w1
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4015 w1
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4016 w1
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4017 w0
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4018 w0
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4019 w0
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4020 w0
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4022 if (offset_mod_4
== 0)
4044 w3
[2] = amd_bytealign ( 0, w3
[0], offset_minus_4
);
4045 w3
[1] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4046 w3
[0] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4047 w2
[3] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4048 w2
[2] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4049 w2
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4050 w2
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4051 w1
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4052 w1
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4053 w1
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4054 w1
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4055 w0
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4056 w0
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4057 w0
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4060 if (offset_mod_4
== 0)
4081 w3
[2] = amd_bytealign ( 0, w2
[3], offset_minus_4
);
4082 w3
[1] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4083 w3
[0] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4084 w2
[3] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4085 w2
[2] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4086 w2
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4087 w2
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4088 w1
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4089 w1
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4090 w1
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4091 w1
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4092 w0
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4093 w0
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4097 if (offset_mod_4
== 0)
4117 w3
[2] = amd_bytealign ( 0, w2
[2], offset_minus_4
);
4118 w3
[1] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4119 w3
[0] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4120 w2
[3] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4121 w2
[2] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4122 w2
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4123 w2
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4124 w1
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4125 w1
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4126 w1
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4127 w1
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4128 w0
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4133 if (offset_mod_4
== 0)
4152 w3
[2] = amd_bytealign ( 0, w2
[1], offset_minus_4
);
4153 w3
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4154 w3
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4155 w2
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4156 w2
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4157 w2
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4158 w2
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4159 w1
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4160 w1
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4161 w1
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4162 w1
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4168 if (offset_mod_4
== 0)
4186 w3
[2] = amd_bytealign ( 0, w2
[0], offset_minus_4
);
4187 w3
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4188 w3
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4189 w2
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4190 w2
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4191 w2
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4192 w2
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4193 w1
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4194 w1
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4195 w1
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4202 if (offset_mod_4
== 0)
4219 w3
[2] = amd_bytealign ( 0, w1
[3], offset_minus_4
);
4220 w3
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4221 w3
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4222 w2
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4223 w2
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4224 w2
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4225 w2
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4226 w1
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4227 w1
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4235 if (offset_mod_4
== 0)
4251 w3
[2] = amd_bytealign ( 0, w1
[2], offset_minus_4
);
4252 w3
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4253 w3
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4254 w2
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4255 w2
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4256 w2
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4257 w2
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4258 w1
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4267 if (offset_mod_4
== 0)
4282 w3
[2] = amd_bytealign ( 0, w1
[1], offset_minus_4
);
4283 w3
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4284 w3
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4285 w2
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4286 w2
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4287 w2
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4288 w2
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4298 if (offset_mod_4
== 0)
4312 w3
[2] = amd_bytealign ( 0, w1
[0], offset_minus_4
);
4313 w3
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4314 w3
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4315 w2
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4316 w2
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4317 w2
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4328 if (offset_mod_4
== 0)
4341 w3
[2] = amd_bytealign ( 0, w0
[3], offset_minus_4
);
4342 w3
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4343 w3
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4344 w2
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4345 w2
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4357 if (offset_mod_4
== 0)
4369 w3
[2] = amd_bytealign ( 0, w0
[2], offset_minus_4
);
4370 w3
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4371 w3
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4372 w2
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4385 if (offset_mod_4
== 0)
4396 w3
[2] = amd_bytealign ( 0, w0
[1], offset_minus_4
);
4397 w3
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4398 w3
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4412 if (offset_mod_4
== 0)
4422 w3
[2] = amd_bytealign ( 0, w0
[0], offset_minus_4
);
4423 w3
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4438 if (offset_mod_4
== 0)
4449 const int offset_minus_4
= 4 - (offset
% 4);
4451 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
4456 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
4457 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
4458 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
4459 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
4460 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4461 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4462 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4463 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4464 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4465 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4466 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4467 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4468 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4469 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
4474 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
4475 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
4476 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
4477 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
4478 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4479 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4480 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4481 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4482 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4483 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4484 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4485 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4486 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
4492 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
4493 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
4494 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
4495 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
4496 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4497 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4498 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4499 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4500 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4501 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4502 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4503 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
4510 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
4511 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
4512 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
4513 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
4514 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4515 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4516 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4517 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4518 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4519 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4520 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
4528 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4529 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4530 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4531 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4532 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4533 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4534 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4535 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4536 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4537 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
4546 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4547 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4548 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4549 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4550 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4551 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4552 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4553 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4554 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
4564 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4565 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4566 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4567 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4568 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4569 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4570 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4571 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
4582 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4583 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4584 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4585 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4586 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4587 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4588 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
4600 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4601 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4602 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4603 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4604 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4605 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
4618 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4619 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4620 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4621 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4622 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
4636 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4637 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4638 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4639 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
4654 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4655 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4656 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
4672 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4673 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
4690 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
/**
 * switch_buffer_by_offset_be: rewrites, in place, the sixteen 32-bit words
 * passed as w0[0..3], w1[0..3], w2[0..3] and w3[0..3], based on `offset`.
 *
 * Every visible assignment stores into one word the result of combining two
 * adjacent source words (or one word and the constant 0). Two variants are
 * present: one calls amd_bytealign with `offset` and is introduced by
 * `#if defined IS_AMD || defined IS_GENERIC`; the other calls __byte_perm
 * with a selector computed from `offset & 3`.
 *
 * Within a variant the assignments come in groups. Each successive group
 * reads its sources one word lower than the previous group and therefore has
 * one assignment fewer. Inside a group the targets are written from w3 down
 * to w0 and no source lies above its target, so each source is read before it
 * is overwritten.
 *
 * NOTE(review): the statement that chooses between the groups is not part of
 * this excerpt - presumably a switch on offset / 4; confirm against the full
 * file. The `_be` suffix presumably means big-endian word layout; the exact
 * byte movement depends on amd_bytealign / __byte_perm, which are defined
 * elsewhere.
 */
4710 inline void switch_buffer_by_offset_be (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
4712 #if defined IS_AMD || defined IS_GENERIC
// amd_bytealign variant: `offset` is passed through unchanged as the third argument.
4716 w3
[2] = amd_bytealign (w3
[1], 0, offset
);
4717 w3
[1] = amd_bytealign (w3
[0], w3
[1], offset
);
4718 w3
[0] = amd_bytealign (w2
[3], w3
[0], offset
);
4719 w2
[3] = amd_bytealign (w2
[2], w2
[3], offset
);
4720 w2
[2] = amd_bytealign (w2
[1], w2
[2], offset
);
4721 w2
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4722 w2
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4723 w1
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4724 w1
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4725 w1
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4726 w1
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4727 w0
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4728 w0
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4729 w0
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4730 w0
[0] = amd_bytealign ( 0, w0
[0], offset
);
4734 w3
[2] = amd_bytealign (w3
[0], 0, offset
);
4735 w3
[1] = amd_bytealign (w2
[3], w3
[0], offset
);
4736 w3
[0] = amd_bytealign (w2
[2], w2
[3], offset
);
4737 w2
[3] = amd_bytealign (w2
[1], w2
[2], offset
);
4738 w2
[2] = amd_bytealign (w2
[0], w2
[1], offset
);
4739 w2
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4740 w2
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4741 w1
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4742 w1
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4743 w1
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4744 w1
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4745 w0
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4746 w0
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4747 w0
[1] = amd_bytealign ( 0, w0
[0], offset
);
4752 w3
[2] = amd_bytealign (w2
[3], 0, offset
);
4753 w3
[1] = amd_bytealign (w2
[2], w2
[3], offset
);
4754 w3
[0] = amd_bytealign (w2
[1], w2
[2], offset
);
4755 w2
[3] = amd_bytealign (w2
[0], w2
[1], offset
);
4756 w2
[2] = amd_bytealign (w1
[3], w2
[0], offset
);
4757 w2
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4758 w2
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4759 w1
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4760 w1
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4761 w1
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4762 w1
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4763 w0
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4764 w0
[2] = amd_bytealign ( 0, w0
[0], offset
);
4770 w3
[2] = amd_bytealign (w2
[2], 0, offset
);
4771 w3
[1] = amd_bytealign (w2
[1], w2
[2], offset
);
4772 w3
[0] = amd_bytealign (w2
[0], w2
[1], offset
);
4773 w2
[3] = amd_bytealign (w1
[3], w2
[0], offset
);
4774 w2
[2] = amd_bytealign (w1
[2], w1
[3], offset
);
4775 w2
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4776 w2
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4777 w1
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4778 w1
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4779 w1
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4780 w1
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4781 w0
[3] = amd_bytealign ( 0, w0
[0], offset
);
4788 w3
[2] = amd_bytealign (w2
[1], 0, offset
);
4789 w3
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4790 w3
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4791 w2
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4792 w2
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4793 w2
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4794 w2
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4795 w1
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4796 w1
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4797 w1
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4798 w1
[0] = amd_bytealign ( 0, w0
[0], offset
);
4806 w3
[2] = amd_bytealign (w2
[0], 0, offset
);
4807 w3
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4808 w3
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4809 w2
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4810 w2
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4811 w2
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4812 w2
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4813 w1
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4814 w1
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4815 w1
[1] = amd_bytealign ( 0, w0
[0], offset
);
4824 w3
[2] = amd_bytealign (w1
[3], 0, offset
);
4825 w3
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4826 w3
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4827 w2
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4828 w2
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4829 w2
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4830 w2
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4831 w1
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4832 w1
[2] = amd_bytealign ( 0, w0
[0], offset
);
4842 w3
[2] = amd_bytealign (w1
[2], 0, offset
);
4843 w3
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4844 w3
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4845 w2
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4846 w2
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4847 w2
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4848 w2
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4849 w1
[3] = amd_bytealign ( 0, w0
[0], offset
);
4860 w3
[2] = amd_bytealign (w1
[1], 0, offset
);
4861 w3
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4862 w3
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4863 w2
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4864 w2
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4865 w2
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4866 w2
[0] = amd_bytealign ( 0, w0
[0], offset
);
4878 w3
[2] = amd_bytealign (w1
[0], 0, offset
);
4879 w3
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4880 w3
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4881 w2
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4882 w2
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4883 w2
[1] = amd_bytealign ( 0, w0
[0], offset
);
4896 w3
[2] = amd_bytealign (w0
[3], 0, offset
);
4897 w3
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4898 w3
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4899 w2
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4900 w2
[2] = amd_bytealign ( 0, w0
[0], offset
);
4914 w3
[2] = amd_bytealign (w0
[2], 0, offset
);
4915 w3
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4916 w3
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4917 w2
[3] = amd_bytealign ( 0, w0
[0], offset
);
4932 w3
[2] = amd_bytealign (w0
[1], 0, offset
);
4933 w3
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4934 w3
[0] = amd_bytealign ( 0, w0
[0], offset
);
4950 w3
[2] = amd_bytealign (w0
[0], 0, offset
);
4951 w3
[1] = amd_bytealign ( 0, w0
[0], offset
);
// __byte_perm variant. The selector is four consecutive hex digits of
// 0x76543210, starting (offset & 3) digits above the lowest one, i.e. one of
// 0x3210, 0x4321, 0x5432 or 0x6543.
4970 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
4975 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
4976 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
4977 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
4978 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
4979 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
4980 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
4981 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
4982 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
4983 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
4984 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
4985 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
4986 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
4987 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
4988 w0
[0] = __byte_perm (w0
[0], 0, selector
);
4992 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
4993 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
4994 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
4995 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
4996 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
4997 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
4998 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
4999 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5000 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5001 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5002 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5003 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5004 w0
[1] = __byte_perm (w0
[0], 0, selector
);
5009 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
5010 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
5011 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
5012 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
5013 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5014 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5015 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5016 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5017 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5018 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5019 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5020 w0
[2] = __byte_perm (w0
[0], 0, selector
);
5026 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
5027 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
5028 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
5029 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
5030 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5031 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5032 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5033 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5034 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5035 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5036 w0
[3] = __byte_perm (w0
[0], 0, selector
);
5043 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
5044 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
5045 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
5046 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
5047 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5048 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5049 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5050 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5051 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5052 w1
[0] = __byte_perm (w0
[0], 0, selector
);
5060 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
5061 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
5062 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
5063 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5064 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5065 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5066 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5067 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5068 w1
[1] = __byte_perm (w0
[0], 0, selector
);
5077 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5078 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5079 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5080 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5081 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5082 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5083 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5084 w1
[2] = __byte_perm (w0
[0], 0, selector
);
5094 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5095 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5096 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5097 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5098 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5099 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5100 w1
[3] = __byte_perm (w0
[0], 0, selector
);
5111 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5112 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5113 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5114 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5115 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5116 w2
[0] = __byte_perm (w0
[0], 0, selector
);
5128 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5129 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5130 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5131 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5132 w2
[1] = __byte_perm (w0
[0], 0, selector
);
5145 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5146 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5147 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5148 w2
[2] = __byte_perm (w0
[0], 0, selector
);
5162 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5163 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5164 w2
[3] = __byte_perm (w0
[0], 0, selector
);
5179 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5180 w3
[0] = __byte_perm (w0
[0], 0, selector
);
5196 w3
[1] = __byte_perm (w0
[0], 0, selector
);
/**
 * overwrite_at_le: merges the 32-bit value w0 into the word array sw; the
 * `case` label decides which word(s) of sw are written.
 *
 * Two sets of cases with the same labels are present. The first, introduced
 * by `#if defined cl_amd_media_ops`, builds the result with amd_bytealign;
 * the second uses plain masks and shifts.
 *
 * Behaviour readable from the mask-and-shift set, for a label N:
 *  - N a multiple of 4 (visible: 12, 16, 20, 24, 28): sw[N / 4] = w0.
 *  - otherwise, with k = N % 4: sw[N / 4] keeps its k least significant bytes
 *    and receives the low (4 - k) bytes of w0 above them; sw[N / 4 + 1] keeps
 *    its (4 - k) most significant bytes and receives the top k bytes of w0 in
 *    its low bytes.
 * The visible labels run up to 31 and touch sw[0] through sw[8].
 *
 * NOTE(review): the switch header is not part of this excerpt - presumably
 * `switch (salt_len)`; confirm against the full file. Labels 0, 4 and 8 are
 * likewise not visible here.
 */
5215 inline void overwrite_at_le (u32x sw
[16], const u32x w0
, const u32 salt_len
)
5217 #if defined cl_amd_media_ops
5222 case 1: sw
[0] = amd_bytealign (w0
, sw
[0] << 24, 3);
5223 sw
[1] = amd_bytealign (sw
[1] >> 8, w0
, 3);
5225 case 2: sw
[0] = amd_bytealign (w0
, sw
[0] << 16, 2);
5226 sw
[1] = amd_bytealign (sw
[1] >> 16, w0
, 2);
5228 case 3: sw
[0] = amd_bytealign (w0
, sw
[0] << 8, 1);
5229 sw
[1] = amd_bytealign (sw
[1] >> 24, w0
, 1);
5233 case 5: sw
[1] = amd_bytealign (w0
, sw
[1] << 24, 3);
5234 sw
[2] = amd_bytealign (sw
[2] >> 8, w0
, 3);
5236 case 6: sw
[1] = amd_bytealign (w0
, sw
[1] << 16, 2);
5237 sw
[2] = amd_bytealign (sw
[2] >> 16, w0
, 2);
5239 case 7: sw
[1] = amd_bytealign (w0
, sw
[1] << 8, 1);
5240 sw
[2] = amd_bytealign (sw
[2] >> 24, w0
, 1);
5244 case 9: sw
[2] = amd_bytealign (w0
, sw
[2] << 24, 3);
5245 sw
[3] = amd_bytealign (sw
[3] >> 8, w0
, 3);
5247 case 10: sw
[2] = amd_bytealign (w0
, sw
[2] << 16, 2);
5248 sw
[3] = amd_bytealign (sw
[3] >> 16, w0
, 2);
5250 case 11: sw
[2] = amd_bytealign (w0
, sw
[2] << 8, 1);
5251 sw
[3] = amd_bytealign (sw
[3] >> 24, w0
, 1);
5253 case 12: sw
[3] = w0
;
5255 case 13: sw
[3] = amd_bytealign (w0
, sw
[3] << 24, 3);
5256 sw
[4] = amd_bytealign (sw
[4] >> 8, w0
, 3);
5258 case 14: sw
[3] = amd_bytealign (w0
, sw
[3] << 16, 2);
5259 sw
[4] = amd_bytealign (sw
[4] >> 16, w0
, 2);
5261 case 15: sw
[3] = amd_bytealign (w0
, sw
[3] << 8, 1);
5262 sw
[4] = amd_bytealign (sw
[4] >> 24, w0
, 1);
5264 case 16: sw
[4] = w0
;
5266 case 17: sw
[4] = amd_bytealign (w0
, sw
[4] << 24, 3);
5267 sw
[5] = amd_bytealign (sw
[5] >> 8, w0
, 3);
5269 case 18: sw
[4] = amd_bytealign (w0
, sw
[4] << 16, 2);
5270 sw
[5] = amd_bytealign (sw
[5] >> 16, w0
, 2);
5272 case 19: sw
[4] = amd_bytealign (w0
, sw
[4] << 8, 1);
5273 sw
[5] = amd_bytealign (sw
[5] >> 24, w0
, 1);
5275 case 20: sw
[5] = w0
;
5277 case 21: sw
[5] = amd_bytealign (w0
, sw
[5] << 24, 3);
5278 sw
[6] = amd_bytealign (sw
[6] >> 8, w0
, 3);
5280 case 22: sw
[5] = amd_bytealign (w0
, sw
[5] << 16, 2);
5281 sw
[6] = amd_bytealign (sw
[6] >> 16, w0
, 2);
5283 case 23: sw
[5] = amd_bytealign (w0
, sw
[5] << 8, 1);
5284 sw
[6] = amd_bytealign (sw
[6] >> 24, w0
, 1);
5286 case 24: sw
[6] = w0
;
5288 case 25: sw
[6] = amd_bytealign (w0
, sw
[6] << 24, 3);
5289 sw
[7] = amd_bytealign (sw
[7] >> 8, w0
, 3);
5291 case 26: sw
[6] = amd_bytealign (w0
, sw
[6] << 16, 2);
5292 sw
[7] = amd_bytealign (sw
[7] >> 16, w0
, 2);
5294 case 27: sw
[6] = amd_bytealign (w0
, sw
[6] << 8, 1);
5295 sw
[7] = amd_bytealign (sw
[7] >> 24, w0
, 1);
5297 case 28: sw
[7] = w0
;
5299 case 29: sw
[7] = amd_bytealign (w0
, sw
[7] << 24, 3);
5300 sw
[8] = amd_bytealign (sw
[8] >> 8, w0
, 3);
5302 case 30: sw
[7] = amd_bytealign (w0
, sw
[7] << 16, 2);
5303 sw
[8] = amd_bytealign (sw
[8] >> 16, w0
, 2);
5305 case 31: sw
[7] = amd_bytealign (w0
, sw
[7] << 8, 1);
5306 sw
[8] = amd_bytealign (sw
[8] >> 24, w0
, 1);
// Second set of cases: same labels, written with masks and shifts instead of amd_bytealign.
5314 case 1: sw
[0] = (sw
[0] & 0x000000ff) | (w0
<< 8);
5315 sw
[1] = (sw
[1] & 0xffffff00) | (w0
>> 24);
5317 case 2: sw
[0] = (sw
[0] & 0x0000ffff) | (w0
<< 16);
5318 sw
[1] = (sw
[1] & 0xffff0000) | (w0
>> 16);
5320 case 3: sw
[0] = (sw
[0] & 0x00ffffff) | (w0
<< 24);
5321 sw
[1] = (sw
[1] & 0xff000000) | (w0
>> 8);
5325 case 5: sw
[1] = (sw
[1] & 0x000000ff) | (w0
<< 8);
5326 sw
[2] = (sw
[2] & 0xffffff00) | (w0
>> 24);
5328 case 6: sw
[1] = (sw
[1] & 0x0000ffff) | (w0
<< 16);
5329 sw
[2] = (sw
[2] & 0xffff0000) | (w0
>> 16);
5331 case 7: sw
[1] = (sw
[1] & 0x00ffffff) | (w0
<< 24);
5332 sw
[2] = (sw
[2] & 0xff000000) | (w0
>> 8);
5336 case 9: sw
[2] = (sw
[2] & 0x000000ff) | (w0
<< 8);
5337 sw
[3] = (sw
[3] & 0xffffff00) | (w0
>> 24);
5339 case 10: sw
[2] = (sw
[2] & 0x0000ffff) | (w0
<< 16);
5340 sw
[3] = (sw
[3] & 0xffff0000) | (w0
>> 16);
5342 case 11: sw
[2] = (sw
[2] & 0x00ffffff) | (w0
<< 24);
5343 sw
[3] = (sw
[3] & 0xff000000) | (w0
>> 8);
5345 case 12: sw
[3] = w0
;
5347 case 13: sw
[3] = (sw
[3] & 0x000000ff) | (w0
<< 8);
5348 sw
[4] = (sw
[4] & 0xffffff00) | (w0
>> 24);
5350 case 14: sw
[3] = (sw
[3] & 0x0000ffff) | (w0
<< 16);
5351 sw
[4] = (sw
[4] & 0xffff0000) | (w0
>> 16);
5353 case 15: sw
[3] = (sw
[3] & 0x00ffffff) | (w0
<< 24);
5354 sw
[4] = (sw
[4] & 0xff000000) | (w0
>> 8);
5356 case 16: sw
[4] = w0
;
5358 case 17: sw
[4] = (sw
[4] & 0x000000ff) | (w0
<< 8);
5359 sw
[5] = (sw
[5] & 0xffffff00) | (w0
>> 24);
5361 case 18: sw
[4] = (sw
[4] & 0x0000ffff) | (w0
<< 16);
5362 sw
[5] = (sw
[5] & 0xffff0000) | (w0
>> 16);
5364 case 19: sw
[4] = (sw
[4] & 0x00ffffff) | (w0
<< 24);
5365 sw
[5] = (sw
[5] & 0xff000000) | (w0
>> 8);
5367 case 20: sw
[5] = w0
;
5369 case 21: sw
[5] = (sw
[5] & 0x000000ff) | (w0
<< 8);
5370 sw
[6] = (sw
[6] & 0xffffff00) | (w0
>> 24);
5372 case 22: sw
[5] = (sw
[5] & 0x0000ffff) | (w0
<< 16);
5373 sw
[6] = (sw
[6] & 0xffff0000) | (w0
>> 16);
5375 case 23: sw
[5] = (sw
[5] & 0x00ffffff) | (w0
<< 24);
5376 sw
[6] = (sw
[6] & 0xff000000) | (w0
>> 8);
5378 case 24: sw
[6] = w0
;
5380 case 25: sw
[6] = (sw
[6] & 0x000000ff) | (w0
<< 8);
5381 sw
[7] = (sw
[7] & 0xffffff00) | (w0
>> 24);
5383 case 26: sw
[6] = (sw
[6] & 0x0000ffff) | (w0
<< 16);
5384 sw
[7] = (sw
[7] & 0xffff0000) | (w0
>> 16);
5386 case 27: sw
[6] = (sw
[6] & 0x00ffffff) | (w0
<< 24);
5387 sw
[7] = (sw
[7] & 0xff000000) | (w0
>> 8);
5389 case 28: sw
[7] = w0
;
5391 case 29: sw
[7] = (sw
[7] & 0x000000ff) | (w0
<< 8);
5392 sw
[8] = (sw
[8] & 0xffffff00) | (w0
>> 24);
5394 case 30: sw
[7] = (sw
[7] & 0x0000ffff) | (w0
<< 16);
5395 sw
[8] = (sw
[8] & 0xffff0000) | (w0
>> 16);
5397 case 31: sw
[7] = (sw
[7] & 0x00ffffff) | (w0
<< 24);
5398 sw
[8] = (sw
[8] & 0xff000000) | (w0
>> 8);
/**
 * overwrite_at_be: merges the 32-bit value w0 into the word array sw using
 * masks and shifts; the `case` label decides which word(s) of sw are written.
 *
 * For a label N, as readable from the visible cases:
 *  - N a multiple of 4 (visible: 12, 16, 20, 24, 28): sw[N / 4] = w0.
 *  - otherwise, with k = N % 4: sw[N / 4] keeps its k most significant bytes
 *    and receives the top (4 - k) bytes of w0 below them; sw[N / 4 + 1] keeps
 *    its (4 - k) least significant bytes and receives the low k bytes of w0
 *    in its high bytes.
 * This mirrors the masks of overwrite_at_le with the byte order inside each
 * word reversed. The visible labels run up to 31 and touch sw[0] through
 * sw[8].
 *
 * NOTE(review): the switch header is not part of this excerpt - presumably
 * `switch (salt_len)`; confirm against the full file. Labels 0, 4 and 8 are
 * likewise not visible here.
 */
5404 inline void overwrite_at_be (u32x sw
[16], const u32x w0
, const u32 salt_len
)
5406 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5412 case 1: sw
[0] = (sw
[0] & 0xff000000) | (w0
>> 8);
5413 sw
[1] = (sw
[1] & 0x00ffffff) | (w0
<< 24);
5415 case 2: sw
[0] = (sw
[0] & 0xffff0000) | (w0
>> 16);
5416 sw
[1] = (sw
[1] & 0x0000ffff) | (w0
<< 16);
5418 case 3: sw
[0] = (sw
[0] & 0xffffff00) | (w0
>> 24);
5419 sw
[1] = (sw
[1] & 0x000000ff) | (w0
<< 8);
5423 case 5: sw
[1] = (sw
[1] & 0xff000000) | (w0
>> 8);
5424 sw
[2] = (sw
[2] & 0x00ffffff) | (w0
<< 24);
5426 case 6: sw
[1] = (sw
[1] & 0xffff0000) | (w0
>> 16);
5427 sw
[2] = (sw
[2] & 0x0000ffff) | (w0
<< 16);
5429 case 7: sw
[1] = (sw
[1] & 0xffffff00) | (w0
>> 24);
5430 sw
[2] = (sw
[2] & 0x000000ff) | (w0
<< 8);
5434 case 9: sw
[2] = (sw
[2] & 0xff000000) | (w0
>> 8);
5435 sw
[3] = (sw
[3] & 0x00ffffff) | (w0
<< 24);
5437 case 10: sw
[2] = (sw
[2] & 0xffff0000) | (w0
>> 16);
5438 sw
[3] = (sw
[3] & 0x0000ffff) | (w0
<< 16);
5440 case 11: sw
[2] = (sw
[2] & 0xffffff00) | (w0
>> 24);
5441 sw
[3] = (sw
[3] & 0x000000ff) | (w0
<< 8);
5443 case 12: sw
[3] = w0
;
5445 case 13: sw
[3] = (sw
[3] & 0xff000000) | (w0
>> 8);
5446 sw
[4] = (sw
[4] & 0x00ffffff) | (w0
<< 24);
5448 case 14: sw
[3] = (sw
[3] & 0xffff0000) | (w0
>> 16);
5449 sw
[4] = (sw
[4] & 0x0000ffff) | (w0
<< 16);
5451 case 15: sw
[3] = (sw
[3] & 0xffffff00) | (w0
>> 24);
5452 sw
[4] = (sw
[4] & 0x000000ff) | (w0
<< 8);
5454 case 16: sw
[4] = w0
;
5456 case 17: sw
[4] = (sw
[4] & 0xff000000) | (w0
>> 8);
5457 sw
[5] = (sw
[5] & 0x00ffffff) | (w0
<< 24);
5459 case 18: sw
[4] = (sw
[4] & 0xffff0000) | (w0
>> 16);
5460 sw
[5] = (sw
[5] & 0x0000ffff) | (w0
<< 16);
5462 case 19: sw
[4] = (sw
[4] & 0xffffff00) | (w0
>> 24);
5463 sw
[5] = (sw
[5] & 0x000000ff) | (w0
<< 8);
5465 case 20: sw
[5] = w0
;
5467 case 21: sw
[5] = (sw
[5] & 0xff000000) | (w0
>> 8);
5468 sw
[6] = (sw
[6] & 0x00ffffff) | (w0
<< 24);
5470 case 22: sw
[5] = (sw
[5] & 0xffff0000) | (w0
>> 16);
5471 sw
[6] = (sw
[6] & 0x0000ffff) | (w0
<< 16);
5473 case 23: sw
[5] = (sw
[5] & 0xffffff00) | (w0
>> 24);
5474 sw
[6] = (sw
[6] & 0x000000ff) | (w0
<< 8);
5476 case 24: sw
[6] = w0
;
5478 case 25: sw
[6] = (sw
[6] & 0xff000000) | (w0
>> 8);
5479 sw
[7] = (sw
[7] & 0x00ffffff) | (w0
<< 24);
5481 case 26: sw
[6] = (sw
[6] & 0xffff0000) | (w0
>> 16);
5482 sw
[7] = (sw
[7] & 0x0000ffff) | (w0
<< 16);
5484 case 27: sw
[6] = (sw
[6] & 0xffffff00) | (w0
>> 24);
5485 sw
[7] = (sw
[7] & 0x000000ff) | (w0
<< 8);
5487 case 28: sw
[7] = w0
;
5489 case 29: sw
[7] = (sw
[7] & 0xff000000) | (w0
>> 8);
5490 sw
[8] = (sw
[8] & 0x00ffffff) | (w0
<< 24);
5492 case 30: sw
[7] = (sw
[7] & 0xffff0000) | (w0
>> 16);
5493 sw
[8] = (sw
[8] & 0x0000ffff) | (w0
<< 16);
5495 case 31: sw
[7] = (sw
[7] & 0xffffff00) | (w0
>> 24);
5496 sw
[8] = (sw
[8] & 0x000000ff) | (w0
<< 8);
5501 inline void overwrite_at_le_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x wx
, const u32 salt_len
)
5503 #if defined cl_amd_media_ops
5508 case 1: w0
[0] = amd_bytealign (wx
, w0
[0] << 24, 3);
5509 w0
[1] = amd_bytealign (w0
[1] >> 8, wx
, 3);
5511 case 2: w0
[0] = amd_bytealign (wx
, w0
[0] << 16, 2);
5512 w0
[1] = amd_bytealign (w0
[1] >> 16, wx
, 2);
5514 case 3: w0
[0] = amd_bytealign (wx
, w0
[0] << 8, 1);
5515 w0
[1] = amd_bytealign (w0
[1] >> 24, wx
, 1);
5519 case 5: w0
[1] = amd_bytealign (wx
, w0
[1] << 24, 3);
5520 w0
[2] = amd_bytealign (w0
[2] >> 8, wx
, 3);
5522 case 6: w0
[1] = amd_bytealign (wx
, w0
[1] << 16, 2);
5523 w0
[2] = amd_bytealign (w0
[2] >> 16, wx
, 2);
5525 case 7: w0
[1] = amd_bytealign (wx
, w0
[1] << 8, 1);
5526 w0
[2] = amd_bytealign (w0
[2] >> 24, wx
, 1);
5530 case 9: w0
[2] = amd_bytealign (wx
, w0
[2] << 24, 3);
5531 w0
[3] = amd_bytealign (w0
[3] >> 8, wx
, 3);
5533 case 10: w0
[2] = amd_bytealign (wx
, w0
[2] << 16, 2);
5534 w0
[3] = amd_bytealign (w0
[3] >> 16, wx
, 2);
5536 case 11: w0
[2] = amd_bytealign (wx
, w0
[2] << 8, 1);
5537 w0
[3] = amd_bytealign (w0
[3] >> 24, wx
, 1);
5539 case 12: w0
[3] = wx
;
5541 case 13: w0
[3] = amd_bytealign (wx
, w0
[3] << 24, 3);
5542 w1
[0] = amd_bytealign (w1
[0] >> 8, wx
, 3);
5544 case 14: w0
[3] = amd_bytealign (wx
, w0
[3] << 16, 2);
5545 w1
[0] = amd_bytealign (w1
[0] >> 16, wx
, 2);
5547 case 15: w0
[3] = amd_bytealign (wx
, w0
[3] << 8, 1);
5548 w1
[0] = amd_bytealign (w1
[0] >> 24, wx
, 1);
5550 case 16: w1
[0] = wx
;
5552 case 17: w1
[0] = amd_bytealign (wx
, w1
[0] << 24, 3);
5553 w1
[1] = amd_bytealign (w1
[1] >> 8, wx
, 3);
5555 case 18: w1
[0] = amd_bytealign (wx
, w1
[0] << 16, 2);
5556 w1
[1] = amd_bytealign (w1
[1] >> 16, wx
, 2);
5558 case 19: w1
[0] = amd_bytealign (wx
, w1
[0] << 8, 1);
5559 w1
[1] = amd_bytealign (w1
[1] >> 24, wx
, 1);
5561 case 20: w1
[1] = wx
;
5563 case 21: w1
[1] = amd_bytealign (wx
, w1
[1] << 24, 3);
5564 w1
[2] = amd_bytealign (w1
[2] >> 8, wx
, 3);
5566 case 22: w1
[1] = amd_bytealign (wx
, w1
[1] << 16, 2);
5567 w1
[2] = amd_bytealign (w1
[2] >> 16, wx
, 2);
5569 case 23: w1
[1] = amd_bytealign (wx
, w1
[1] << 8, 1);
5570 w1
[2] = amd_bytealign (w1
[2] >> 24, wx
, 1);
5572 case 24: w1
[2] = wx
;
5574 case 25: w1
[2] = amd_bytealign (wx
, w1
[2] << 24, 3);
5575 w1
[3] = amd_bytealign (w1
[3] >> 8, wx
, 3);
5577 case 26: w1
[2] = amd_bytealign (wx
, w1
[2] << 16, 2);
5578 w1
[3] = amd_bytealign (w1
[3] >> 16, wx
, 2);
5580 case 27: w1
[2] = amd_bytealign (wx
, w1
[2] << 8, 1);
5581 w1
[3] = amd_bytealign (w1
[3] >> 24, wx
, 1);
5583 case 28: w1
[3] = wx
;
5585 case 29: w1
[3] = amd_bytealign (wx
, w1
[3] << 24, 3);
5586 w2
[0] = amd_bytealign (w2
[0] >> 8, wx
, 3);
5588 case 30: w1
[3] = amd_bytealign (wx
, w1
[3] << 16, 2);
5589 w2
[0] = amd_bytealign (w2
[0] >> 16, wx
, 2);
5591 case 31: w1
[3] = amd_bytealign (wx
, w1
[3] << 8, 1);
5592 w2
[0] = amd_bytealign (w2
[0] >> 24, wx
, 1);
5594 case 32: w2
[0] = wx
;
5596 case 33: w2
[0] = amd_bytealign (wx
, w2
[0] << 24, 3);
5597 w2
[1] = amd_bytealign (w2
[1] >> 8, wx
, 3);
5599 case 34: w2
[0] = amd_bytealign (wx
, w2
[0] << 16, 2);
5600 w2
[1] = amd_bytealign (w2
[1] >> 16, wx
, 2);
5602 case 35: w2
[0] = amd_bytealign (wx
, w2
[0] << 8, 1);
5603 w2
[1] = amd_bytealign (w2
[1] >> 24, wx
, 1);
5605 case 36: w2
[1] = wx
;
5607 case 37: w2
[1] = amd_bytealign (wx
, w2
[1] << 24, 3);
5608 w2
[2] = amd_bytealign (w2
[2] >> 8, wx
, 3);
5610 case 38: w2
[1] = amd_bytealign (wx
, w2
[1] << 16, 2);
5611 w2
[2] = amd_bytealign (w2
[2] >> 16, wx
, 2);
5613 case 39: w2
[1] = amd_bytealign (wx
, w2
[1] << 8, 1);
5614 w2
[2] = amd_bytealign (w2
[2] >> 24, wx
, 1);
5616 case 40: w2
[2] = wx
;
5618 case 41: w2
[2] = amd_bytealign (wx
, w2
[2] << 24, 3);
5619 w2
[3] = amd_bytealign (w2
[3] >> 8, wx
, 3);
5621 case 42: w2
[2] = amd_bytealign (wx
, w2
[2] << 16, 2);
5622 w2
[3] = amd_bytealign (w2
[3] >> 16, wx
, 2);
5624 case 43: w2
[2] = amd_bytealign (wx
, w2
[2] << 8, 1);
5625 w2
[3] = amd_bytealign (w2
[3] >> 24, wx
, 1);
5627 case 44: w2
[3] = wx
;
5629 case 45: w2
[3] = amd_bytealign (wx
, w2
[3] << 24, 3);
5630 w3
[0] = amd_bytealign (w3
[0] >> 8, wx
, 3);
5632 case 46: w2
[3] = amd_bytealign (wx
, w2
[3] << 16, 2);
5633 w3
[0] = amd_bytealign (w3
[0] >> 16, wx
, 2);
5635 case 47: w2
[3] = amd_bytealign (wx
, w2
[3] << 8, 1);
5636 w3
[0] = amd_bytealign (w3
[0] >> 24, wx
, 1);
5638 case 48: w3
[0] = wx
;
5640 case 49: w3
[0] = amd_bytealign (wx
, w3
[0] << 24, 3);
5641 w3
[1] = amd_bytealign (w3
[1] >> 8, wx
, 3);
5643 case 50: w3
[0] = amd_bytealign (wx
, w3
[0] << 16, 2);
5644 w3
[1] = amd_bytealign (w3
[1] >> 16, wx
, 2);
5646 case 51: w3
[0] = amd_bytealign (wx
, w3
[0] << 8, 1);
5647 w3
[1] = amd_bytealign (w3
[1] >> 24, wx
, 1);
5649 case 52: w3
[1] = wx
;
5651 case 53: w3
[1] = amd_bytealign (wx
, w3
[1] << 24, 3);
5652 w3
[2] = amd_bytealign (w3
[2] >> 8, wx
, 3);
5654 case 54: w3
[1] = amd_bytealign (wx
, w3
[1] << 16, 2);
5655 w3
[2] = amd_bytealign (w3
[2] >> 16, wx
, 2);
5657 case 55: w3
[1] = amd_bytealign (wx
, w3
[1] << 8, 1);
5658 w3
[2] = amd_bytealign (w3
[2] >> 24, wx
, 1);
5660 case 56: w3
[2] = wx
;
5662 case 57: w3
[2] = amd_bytealign (wx
, w3
[2] << 24, 3);
5663 w3
[3] = amd_bytealign (w3
[3] >> 8, wx
, 3);
5665 case 58: w3
[2] = amd_bytealign (wx
, w3
[2] << 16, 2);
5666 w3
[3] = amd_bytealign (w3
[3] >> 16, wx
, 2);
5668 case 59: w3
[2] = amd_bytealign (wx
, w3
[2] << 8, 1);
5669 w3
[3] = amd_bytealign (w3
[3] >> 24, wx
, 1);
5671 case 60: w3
[3] = wx
;
5673 case 61: w3
[3] = amd_bytealign (wx
, w3
[3] << 24, 3);
5674 //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
5676 case 62: w3
[3] = amd_bytealign (wx
, w3
[3] << 16, 2);
5677 //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
5679 case 63: w3
[3] = amd_bytealign (wx
, w3
[3] << 8, 1);
5680 //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
5688 case 1: w0
[0] = (w0
[0] & 0x000000ff) | (wx
<< 8);
5689 w0
[1] = (w0
[1] & 0xffffff00) | (wx
>> 24);
5691 case 2: w0
[0] = (w0
[0] & 0x0000ffff) | (wx
<< 16);
5692 w0
[1] = (w0
[1] & 0xffff0000) | (wx
>> 16);
5694 case 3: w0
[0] = (w0
[0] & 0x00ffffff) | (wx
<< 24);
5695 w0
[1] = (w0
[1] & 0xff000000) | (wx
>> 8);
5699 case 5: w0
[1] = (w0
[1] & 0x000000ff) | (wx
<< 8);
5700 w0
[2] = (w0
[2] & 0xffffff00) | (wx
>> 24);
5702 case 6: w0
[1] = (w0
[1] & 0x0000ffff) | (wx
<< 16);
5703 w0
[2] = (w0
[2] & 0xffff0000) | (wx
>> 16);
5705 case 7: w0
[1] = (w0
[1] & 0x00ffffff) | (wx
<< 24);
5706 w0
[2] = (w0
[2] & 0xff000000) | (wx
>> 8);
5710 case 9: w0
[2] = (w0
[2] & 0x000000ff) | (wx
<< 8);
5711 w0
[3] = (w0
[3] & 0xffffff00) | (wx
>> 24);
5713 case 10: w0
[2] = (w0
[2] & 0x0000ffff) | (wx
<< 16);
5714 w0
[3] = (w0
[3] & 0xffff0000) | (wx
>> 16);
5716 case 11: w0
[2] = (w0
[2] & 0x00ffffff) | (wx
<< 24);
5717 w0
[3] = (w0
[3] & 0xff000000) | (wx
>> 8);
5719 case 12: w0
[3] = wx
;
5721 case 13: w0
[3] = (w0
[3] & 0x000000ff) | (wx
<< 8);
5722 w1
[0] = (w1
[0] & 0xffffff00) | (wx
>> 24);
5724 case 14: w0
[3] = (w0
[3] & 0x0000ffff) | (wx
<< 16);
5725 w1
[0] = (w1
[0] & 0xffff0000) | (wx
>> 16);
5727 case 15: w0
[3] = (w0
[3] & 0x00ffffff) | (wx
<< 24);
5728 w1
[0] = (w1
[0] & 0xff000000) | (wx
>> 8);
5730 case 16: w1
[0] = wx
;
5732 case 17: w1
[0] = (w1
[0] & 0x000000ff) | (wx
<< 8);
5733 w1
[1] = (w1
[1] & 0xffffff00) | (wx
>> 24);
5735 case 18: w1
[0] = (w1
[0] & 0x0000ffff) | (wx
<< 16);
5736 w1
[1] = (w1
[1] & 0xffff0000) | (wx
>> 16);
5738 case 19: w1
[0] = (w1
[0] & 0x00ffffff) | (wx
<< 24);
5739 w1
[1] = (w1
[1] & 0xff000000) | (wx
>> 8);
5741 case 20: w1
[1] = wx
;
5743 case 21: w1
[1] = (w1
[1] & 0x000000ff) | (wx
<< 8);
5744 w1
[2] = (w1
[2] & 0xffffff00) | (wx
>> 24);
5746 case 22: w1
[1] = (w1
[1] & 0x0000ffff) | (wx
<< 16);
5747 w1
[2] = (w1
[2] & 0xffff0000) | (wx
>> 16);
5749 case 23: w1
[1] = (w1
[1] & 0x00ffffff) | (wx
<< 24);
5750 w1
[2] = (w1
[2] & 0xff000000) | (wx
>> 8);
5752 case 24: w1
[2] = wx
;
5754 case 25: w1
[2] = (w1
[2] & 0x000000ff) | (wx
<< 8);
5755 w1
[3] = (w1
[3] & 0xffffff00) | (wx
>> 24);
5757 case 26: w1
[2] = (w1
[2] & 0x0000ffff) | (wx
<< 16);
5758 w1
[3] = (w1
[3] & 0xffff0000) | (wx
>> 16);
5760 case 27: w1
[2] = (w1
[2] & 0x00ffffff) | (wx
<< 24);
5761 w1
[3] = (w1
[3] & 0xff000000) | (wx
>> 8);
5763 case 28: w1
[3] = wx
;
5765 case 29: w1
[3] = (w1
[3] & 0x000000ff) | (wx
<< 8);
5766 w2
[0] = (w2
[0] & 0xffffff00) | (wx
>> 24);
5768 case 30: w1
[3] = (w1
[3] & 0x0000ffff) | (wx
<< 16);
5769 w2
[0] = (w2
[0] & 0xffff0000) | (wx
>> 16);
5771 case 31: w1
[3] = (w1
[3] & 0x00ffffff) | (wx
<< 24);
5772 w2
[0] = (w2
[0] & 0xff000000) | (wx
>> 8);
5774 case 32: w2
[0] = wx
;
5776 case 33: w2
[0] = (w2
[0] & 0x000000ff) | (wx
<< 8);
5777 w2
[1] = (w2
[1] & 0xffffff00) | (wx
>> 24);
5779 case 34: w2
[0] = (w2
[0] & 0x0000ffff) | (wx
<< 16);
5780 w2
[1] = (w2
[1] & 0xffff0000) | (wx
>> 16);
5782 case 35: w2
[0] = (w2
[0] & 0x00ffffff) | (wx
<< 24);
5783 w2
[1] = (w2
[1] & 0xff000000) | (wx
>> 8);
5785 case 36: w2
[1] = wx
;
5787 case 37: w2
[1] = (w2
[1] & 0x000000ff) | (wx
<< 8);
5788 w2
[2] = (w2
[2] & 0xffffff00) | (wx
>> 24);
5790 case 38: w2
[1] = (w2
[1] & 0x0000ffff) | (wx
<< 16);
5791 w2
[2] = (w2
[2] & 0xffff0000) | (wx
>> 16);
5793 case 39: w2
[1] = (w2
[1] & 0x00ffffff) | (wx
<< 24);
5794 w2
[2] = (w2
[2] & 0xff000000) | (wx
>> 8);
5796 case 40: w2
[2] = wx
;
5798 case 41: w2
[2] = (w2
[2] & 0x000000ff) | (wx
<< 8);
5799 w2
[3] = (w2
[3] & 0xffffff00) | (wx
>> 24);
5801 case 42: w2
[2] = (w2
[2] & 0x0000ffff) | (wx
<< 16);
5802 w2
[3] = (w2
[3] & 0xffff0000) | (wx
>> 16);
5804 case 43: w2
[2] = (w2
[2] & 0x00ffffff) | (wx
<< 24);
5805 w2
[3] = (w2
[3] & 0xff000000) | (wx
>> 8);
5807 case 44: w2
[3] = wx
;
5809 case 45: w2
[3] = (w2
[3] & 0x000000ff) | (wx
<< 8);
5810 w3
[0] = (w3
[0] & 0xffffff00) | (wx
>> 24);
5812 case 46: w2
[3] = (w2
[3] & 0x0000ffff) | (wx
<< 16);
5813 w3
[0] = (w3
[0] & 0xffff0000) | (wx
>> 16);
5815 case 47: w2
[3] = (w2
[3] & 0x00ffffff) | (wx
<< 24);
5816 w3
[0] = (w3
[0] & 0xff000000) | (wx
>> 8);
5818 case 48: w3
[0] = wx
;
5820 case 49: w3
[0] = (w3
[0] & 0x000000ff) | (wx
<< 8);
5821 w3
[1] = (w3
[1] & 0xffffff00) | (wx
>> 24);
5823 case 50: w3
[0] = (w3
[0] & 0x0000ffff) | (wx
<< 16);
5824 w3
[1] = (w3
[1] & 0xffff0000) | (wx
>> 16);
5826 case 51: w3
[0] = (w3
[0] & 0x00ffffff) | (wx
<< 24);
5827 w3
[1] = (w3
[1] & 0xff000000) | (wx
>> 8);
5829 case 52: w3
[1] = wx
;
5831 case 53: w3
[1] = (w3
[1] & 0x000000ff) | (wx
<< 8);
5832 w3
[2] = (w3
[2] & 0xffffff00) | (wx
>> 24);
5834 case 54: w3
[1] = (w3
[1] & 0x0000ffff) | (wx
<< 16);
5835 w3
[2] = (w3
[2] & 0xffff0000) | (wx
>> 16);
5837 case 55: w3
[1] = (w3
[1] & 0x00ffffff) | (wx
<< 24);
5838 w3
[2] = (w3
[2] & 0xff000000) | (wx
>> 8);
5840 case 56: w3
[2] = wx
;
5842 case 57: w3
[2] = (w3
[2] & 0x000000ff) | (wx
<< 8);
5843 w3
[3] = (w3
[3] & 0xffffff00) | (wx
>> 24);
5845 case 58: w3
[2] = (w3
[2] & 0x0000ffff) | (wx
<< 16);
5846 w3
[3] = (w3
[3] & 0xffff0000) | (wx
>> 16);
5848 case 59: w3
[2] = (w3
[2] & 0x00ffffff) | (wx
<< 24);
5849 w3
[3] = (w3
[3] & 0xff000000) | (wx
>> 8);
5851 case 60: w3
[3] = wx
;
5853 case 61: w3
[3] = (w3
[3] & 0x000000ff) | (wx
<< 8);
5854 //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
5856 case 62: w3
[3] = (w3
[3] & 0x0000ffff) | (wx
<< 16);
5857 //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
5859 case 63: w3
[3] = (w3
[3] & 0x00ffffff) | (wx
<< 24);
5860 //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
5866 inline void overwrite_at_be_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x wx
, const u32 salt_len
)
5868 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5874 case 1: w0
[0] = (w0
[0] & 0xff000000) | (wx
>> 8);
5875 w0
[1] = (w0
[1] & 0x00ffffff) | (wx
<< 24);
5877 case 2: w0
[0] = (w0
[0] & 0xffff0000) | (wx
>> 16);
5878 w0
[1] = (w0
[1] & 0x0000ffff) | (wx
<< 16);
5880 case 3: w0
[0] = (w0
[0] & 0xffffff00) | (wx
>> 24);
5881 w0
[1] = (w0
[1] & 0x000000ff) | (wx
<< 8);
5885 case 5: w0
[1] = (w0
[1] & 0xff000000) | (wx
>> 8);
5886 w0
[2] = (w0
[2] & 0x00ffffff) | (wx
<< 24);
5888 case 6: w0
[1] = (w0
[1] & 0xffff0000) | (wx
>> 16);
5889 w0
[2] = (w0
[2] & 0x0000ffff) | (wx
<< 16);
5891 case 7: w0
[1] = (w0
[1] & 0xffffff00) | (wx
>> 24);
5892 w0
[2] = (w0
[2] & 0x000000ff) | (wx
<< 8);
5896 case 9: w0
[2] = (w0
[2] & 0xff000000) | (wx
>> 8);
5897 w0
[3] = (w0
[3] & 0x00ffffff) | (wx
<< 24);
5899 case 10: w0
[2] = (w0
[2] & 0xffff0000) | (wx
>> 16);
5900 w0
[3] = (w0
[3] & 0x0000ffff) | (wx
<< 16);
5902 case 11: w0
[2] = (w0
[2] & 0xffffff00) | (wx
>> 24);
5903 w0
[3] = (w0
[3] & 0x000000ff) | (wx
<< 8);
5905 case 12: w0
[3] = wx
;
5907 case 13: w0
[3] = (w0
[3] & 0xff000000) | (wx
>> 8);
5908 w1
[0] = (w1
[0] & 0x00ffffff) | (wx
<< 24);
5910 case 14: w0
[3] = (w0
[3] & 0xffff0000) | (wx
>> 16);
5911 w1
[0] = (w1
[0] & 0x0000ffff) | (wx
<< 16);
5913 case 15: w0
[3] = (w0
[3] & 0xffffff00) | (wx
>> 24);
5914 w1
[0] = (w1
[0] & 0x000000ff) | (wx
<< 8);
5916 case 16: w1
[0] = wx
;
5918 case 17: w1
[0] = (w1
[0] & 0xff000000) | (wx
>> 8);
5919 w1
[1] = (w1
[1] & 0x00ffffff) | (wx
<< 24);
5921 case 18: w1
[0] = (w1
[0] & 0xffff0000) | (wx
>> 16);
5922 w1
[1] = (w1
[1] & 0x0000ffff) | (wx
<< 16);
5924 case 19: w1
[0] = (w1
[0] & 0xffffff00) | (wx
>> 24);
5925 w1
[1] = (w1
[1] & 0x000000ff) | (wx
<< 8);
5927 case 20: w1
[1] = wx
;
5929 case 21: w1
[1] = (w1
[1] & 0xff000000) | (wx
>> 8);
5930 w1
[2] = (w1
[2] & 0x00ffffff) | (wx
<< 24);
5932 case 22: w1
[1] = (w1
[1] & 0xffff0000) | (wx
>> 16);
5933 w1
[2] = (w1
[2] & 0x0000ffff) | (wx
<< 16);
5935 case 23: w1
[1] = (w1
[1] & 0xffffff00) | (wx
>> 24);
5936 w1
[2] = (w1
[2] & 0x000000ff) | (wx
<< 8);
5938 case 24: w1
[2] = wx
;
5940 case 25: w1
[2] = (w1
[2] & 0xff000000) | (wx
>> 8);
5941 w1
[3] = (w1
[3] & 0x00ffffff) | (wx
<< 24);
5943 case 26: w1
[2] = (w1
[2] & 0xffff0000) | (wx
>> 16);
5944 w1
[3] = (w1
[3] & 0x0000ffff) | (wx
<< 16);
5946 case 27: w1
[2] = (w1
[2] & 0xffffff00) | (wx
>> 24);
5947 w1
[3] = (w1
[3] & 0x000000ff) | (wx
<< 8);
5949 case 28: w1
[3] = wx
;
5951 case 29: w1
[3] = (w1
[3] & 0xff000000) | (wx
>> 8);
5952 w2
[0] = (w2
[0] & 0x00ffffff) | (wx
<< 24);
5954 case 30: w1
[3] = (w1
[3] & 0xffff0000) | (wx
>> 16);
5955 w2
[0] = (w2
[0] & 0x0000ffff) | (wx
<< 16);
5957 case 31: w1
[3] = (w1
[3] & 0xffffff00) | (wx
>> 24);
5958 w2
[0] = (w2
[0] & 0x000000ff) | (wx
<< 8);
5960 case 32: w2
[0] = wx
;
5962 case 33: w2
[0] = (w2
[0] & 0xff000000) | (wx
>> 8);
5963 w2
[1] = (w2
[1] & 0x00ffffff) | (wx
<< 24);
5965 case 34: w2
[0] = (w2
[0] & 0xffff0000) | (wx
>> 16);
5966 w2
[1] = (w2
[1] & 0x0000ffff) | (wx
<< 16);
5968 case 35: w2
[0] = (w2
[0] & 0xffffff00) | (wx
>> 24);
5969 w2
[1] = (w2
[1] & 0x000000ff) | (wx
<< 8);
5971 case 36: w2
[1] = wx
;
5973 case 37: w2
[1] = (w2
[1] & 0xff000000) | (wx
>> 8);
5974 w2
[2] = (w2
[2] & 0x00ffffff) | (wx
<< 24);
5976 case 38: w2
[1] = (w2
[1] & 0xffff0000) | (wx
>> 16);
5977 w2
[2] = (w2
[2] & 0x0000ffff) | (wx
<< 16);
5979 case 39: w2
[1] = (w2
[1] & 0xffffff00) | (wx
>> 24);
5980 w2
[2] = (w2
[2] & 0x000000ff) | (wx
<< 8);
5982 case 40: w2
[2] = wx
;
5984 case 41: w2
[2] = (w2
[2] & 0xff000000) | (wx
>> 8);
5985 w2
[3] = (w2
[3] & 0x00ffffff) | (wx
<< 24);
5987 case 42: w2
[2] = (w2
[2] & 0xffff0000) | (wx
>> 16);
5988 w2
[3] = (w2
[3] & 0x0000ffff) | (wx
<< 16);
5990 case 43: w2
[2] = (w2
[2] & 0xffffff00) | (wx
>> 24);
5991 w2
[3] = (w2
[3] & 0x000000ff) | (wx
<< 8);
5993 case 44: w2
[3] = wx
;
5995 case 45: w2
[3] = (w2
[3] & 0xff000000) | (wx
>> 8);
5996 w3
[0] = (w3
[0] & 0x00ffffff) | (wx
<< 24);
5998 case 46: w2
[3] = (w2
[3] & 0xffff0000) | (wx
>> 16);
5999 w3
[0] = (w3
[0] & 0x0000ffff) | (wx
<< 16);
6001 case 47: w2
[3] = (w2
[3] & 0xffffff00) | (wx
>> 24);
6002 w3
[0] = (w3
[0] & 0x000000ff) | (wx
<< 8);
6004 case 48: w3
[0] = wx
;
6006 case 49: w3
[0] = (w3
[0] & 0xff000000) | (wx
>> 8);
6007 w3
[1] = (w3
[1] & 0x00ffffff) | (wx
<< 24);
6009 case 50: w3
[0] = (w3
[0] & 0xffff0000) | (wx
>> 16);
6010 w3
[1] = (w3
[1] & 0x0000ffff) | (wx
<< 16);
6012 case 51: w3
[0] = (w3
[0] & 0xffffff00) | (wx
>> 24);
6013 w3
[1] = (w3
[1] & 0x000000ff) | (wx
<< 8);
6015 case 52: w3
[1] = wx
;
6017 case 53: w3
[1] = (w3
[1] & 0xff000000) | (wx
>> 8);
6018 w3
[2] = (w3
[2] & 0x00ffffff) | (wx
<< 24);
6020 case 54: w3
[1] = (w3
[1] & 0xffff0000) | (wx
>> 16);
6021 w3
[2] = (w3
[2] & 0x0000ffff) | (wx
<< 16);
6023 case 55: w3
[1] = (w3
[1] & 0xffffff00) | (wx
>> 24);
6024 w3
[2] = (w3
[2] & 0x000000ff) | (wx
<< 8);
6026 case 56: w3
[2] = wx
;
6028 case 57: w3
[2] = (w3
[2] & 0xff000000) | (wx
>> 8);
6029 w3
[3] = (w3
[3] & 0x00ffffff) | (wx
<< 24);
6031 case 58: w3
[2] = (w3
[2] & 0xffff0000) | (wx
>> 16);
6032 w3
[3] = (w3
[3] & 0x0000ffff) | (wx
<< 16);
6034 case 59: w3
[2] = (w3
[2] & 0xffffff00) | (wx
>> 24);
6035 w3
[3] = (w3
[3] & 0x000000ff) | (wx
<< 8);
6037 case 60: w3
[3] = wx
;
6039 case 61: w3
[3] = (w3
[3] & 0xff000000) | (wx
>> 8);
6040 //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
6042 case 62: w3
[3] = (w3
[3] & 0xffff0000) | (wx
>> 16);
6043 //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
6045 case 63: w3
[3] = (w3
[3] & 0xffffff00) | (wx
>> 24);
6046 //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
/**
 * vector functions as scalar (for outer loop usage)
 */
6055 inline void append_0x01_1x4_S (u32 w0
[4], const u32 offset
)
6064 w0
[0] = w0
[0] | 0x0100;
6068 w0
[0] = w0
[0] | 0x010000;
6072 w0
[0] = w0
[0] | 0x01000000;
6080 w0
[1] = w0
[1] | 0x0100;
6084 w0
[1] = w0
[1] | 0x010000;
6088 w0
[1] = w0
[1] | 0x01000000;
6096 w0
[2] = w0
[2] | 0x0100;
6100 w0
[2] = w0
[2] | 0x010000;
6104 w0
[2] = w0
[2] | 0x01000000;
6112 w0
[3] = w0
[3] | 0x0100;
6116 w0
[3] = w0
[3] | 0x010000;
6120 w0
[3] = w0
[3] | 0x01000000;
6125 inline void append_0x01_2x4_S (u32 w0
[4], u32 w1
[4], const u32 offset
)
6134 w0
[0] = w0
[0] | 0x0100;
6138 w0
[0] = w0
[0] | 0x010000;
6142 w0
[0] = w0
[0] | 0x01000000;
6150 w0
[1] = w0
[1] | 0x0100;
6154 w0
[1] = w0
[1] | 0x010000;
6158 w0
[1] = w0
[1] | 0x01000000;
6166 w0
[2] = w0
[2] | 0x0100;
6170 w0
[2] = w0
[2] | 0x010000;
6174 w0
[2] = w0
[2] | 0x01000000;
6182 w0
[3] = w0
[3] | 0x0100;
6186 w0
[3] = w0
[3] | 0x010000;
6190 w0
[3] = w0
[3] | 0x01000000;
6198 w1
[0] = w1
[0] | 0x0100;
6202 w1
[0] = w1
[0] | 0x010000;
6206 w1
[0] = w1
[0] | 0x01000000;
6214 w1
[1] = w1
[1] | 0x0100;
6218 w1
[1] = w1
[1] | 0x010000;
6222 w1
[1] = w1
[1] | 0x01000000;
6230 w1
[2] = w1
[2] | 0x0100;
6234 w1
[2] = w1
[2] | 0x010000;
6238 w1
[2] = w1
[2] | 0x01000000;
6246 w1
[3] = w1
[3] | 0x0100;
6250 w1
[3] = w1
[3] | 0x010000;
6254 w1
[3] = w1
[3] | 0x01000000;
6259 inline void append_0x01_3x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
6268 w0
[0] = w0
[0] | 0x0100;
6272 w0
[0] = w0
[0] | 0x010000;
6276 w0
[0] = w0
[0] | 0x01000000;
6284 w0
[1] = w0
[1] | 0x0100;
6288 w0
[1] = w0
[1] | 0x010000;
6292 w0
[1] = w0
[1] | 0x01000000;
6300 w0
[2] = w0
[2] | 0x0100;
6304 w0
[2] = w0
[2] | 0x010000;
6308 w0
[2] = w0
[2] | 0x01000000;
6316 w0
[3] = w0
[3] | 0x0100;
6320 w0
[3] = w0
[3] | 0x010000;
6324 w0
[3] = w0
[3] | 0x01000000;
6332 w1
[0] = w1
[0] | 0x0100;
6336 w1
[0] = w1
[0] | 0x010000;
6340 w1
[0] = w1
[0] | 0x01000000;
6348 w1
[1] = w1
[1] | 0x0100;
6352 w1
[1] = w1
[1] | 0x010000;
6356 w1
[1] = w1
[1] | 0x01000000;
6364 w1
[2] = w1
[2] | 0x0100;
6368 w1
[2] = w1
[2] | 0x010000;
6372 w1
[2] = w1
[2] | 0x01000000;
6380 w1
[3] = w1
[3] | 0x0100;
6384 w1
[3] = w1
[3] | 0x010000;
6388 w1
[3] = w1
[3] | 0x01000000;
6396 w2
[0] = w2
[0] | 0x0100;
6400 w2
[0] = w2
[0] | 0x010000;
6404 w2
[0] = w2
[0] | 0x01000000;
6412 w2
[1] = w2
[1] | 0x0100;
6416 w2
[1] = w2
[1] | 0x010000;
6420 w2
[1] = w2
[1] | 0x01000000;
6428 w2
[2] = w2
[2] | 0x0100;
6432 w2
[2] = w2
[2] | 0x010000;
6436 w2
[2] = w2
[2] | 0x01000000;
6444 w2
[3] = w2
[3] | 0x0100;
6448 w2
[3] = w2
[3] | 0x010000;
6452 w2
[3] = w2
[3] | 0x01000000;
6457 inline void append_0x01_4x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6466 w0
[0] = w0
[0] | 0x0100;
6470 w0
[0] = w0
[0] | 0x010000;
6474 w0
[0] = w0
[0] | 0x01000000;
6482 w0
[1] = w0
[1] | 0x0100;
6486 w0
[1] = w0
[1] | 0x010000;
6490 w0
[1] = w0
[1] | 0x01000000;
6498 w0
[2] = w0
[2] | 0x0100;
6502 w0
[2] = w0
[2] | 0x010000;
6506 w0
[2] = w0
[2] | 0x01000000;
6514 w0
[3] = w0
[3] | 0x0100;
6518 w0
[3] = w0
[3] | 0x010000;
6522 w0
[3] = w0
[3] | 0x01000000;
6530 w1
[0] = w1
[0] | 0x0100;
6534 w1
[0] = w1
[0] | 0x010000;
6538 w1
[0] = w1
[0] | 0x01000000;
6546 w1
[1] = w1
[1] | 0x0100;
6550 w1
[1] = w1
[1] | 0x010000;
6554 w1
[1] = w1
[1] | 0x01000000;
6562 w1
[2] = w1
[2] | 0x0100;
6566 w1
[2] = w1
[2] | 0x010000;
6570 w1
[2] = w1
[2] | 0x01000000;
6578 w1
[3] = w1
[3] | 0x0100;
6582 w1
[3] = w1
[3] | 0x010000;
6586 w1
[3] = w1
[3] | 0x01000000;
6594 w2
[0] = w2
[0] | 0x0100;
6598 w2
[0] = w2
[0] | 0x010000;
6602 w2
[0] = w2
[0] | 0x01000000;
6610 w2
[1] = w2
[1] | 0x0100;
6614 w2
[1] = w2
[1] | 0x010000;
6618 w2
[1] = w2
[1] | 0x01000000;
6626 w2
[2] = w2
[2] | 0x0100;
6630 w2
[2] = w2
[2] | 0x010000;
6634 w2
[2] = w2
[2] | 0x01000000;
6642 w2
[3] = w2
[3] | 0x0100;
6646 w2
[3] = w2
[3] | 0x010000;
6650 w2
[3] = w2
[3] | 0x01000000;
6658 w3
[0] = w3
[0] | 0x0100;
6662 w3
[0] = w3
[0] | 0x010000;
6666 w3
[0] = w3
[0] | 0x01000000;
6674 w3
[1] = w3
[1] | 0x0100;
6678 w3
[1] = w3
[1] | 0x010000;
6682 w3
[1] = w3
[1] | 0x01000000;
6690 w3
[2] = w3
[2] | 0x0100;
6694 w3
[2] = w3
[2] | 0x010000;
6698 w3
[2] = w3
[2] | 0x01000000;
6706 w3
[3] = w3
[3] | 0x0100;
6710 w3
[3] = w3
[3] | 0x010000;
6714 w3
[3] = w3
[3] | 0x01000000;
6719 inline void append_0x02_2x4_S (u32 w0
[4], u32 w1
[4], const u32 offset
)
6728 w0
[0] = w0
[0] | 0x0200;
6732 w0
[0] = w0
[0] | 0x020000;
6736 w0
[0] = w0
[0] | 0x02000000;
6744 w0
[1] = w0
[1] | 0x0200;
6748 w0
[1] = w0
[1] | 0x020000;
6752 w0
[1] = w0
[1] | 0x02000000;
6760 w0
[2] = w0
[2] | 0x0200;
6764 w0
[2] = w0
[2] | 0x020000;
6768 w0
[2] = w0
[2] | 0x02000000;
6776 w0
[3] = w0
[3] | 0x0200;
6780 w0
[3] = w0
[3] | 0x020000;
6784 w0
[3] = w0
[3] | 0x02000000;
6792 w1
[0] = w1
[0] | 0x0200;
6796 w1
[0] = w1
[0] | 0x020000;
6800 w1
[0] = w1
[0] | 0x02000000;
6808 w1
[1] = w1
[1] | 0x0200;
6812 w1
[1] = w1
[1] | 0x020000;
6816 w1
[1] = w1
[1] | 0x02000000;
6824 w1
[2] = w1
[2] | 0x0200;
6828 w1
[2] = w1
[2] | 0x020000;
6832 w1
[2] = w1
[2] | 0x02000000;
6840 w1
[3] = w1
[3] | 0x0200;
6844 w1
[3] = w1
[3] | 0x020000;
6848 w1
[3] = w1
[3] | 0x02000000;
6853 inline void append_0x02_3x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
6862 w0
[0] = w0
[0] | 0x0200;
6866 w0
[0] = w0
[0] | 0x020000;
6870 w0
[0] = w0
[0] | 0x02000000;
6878 w0
[1] = w0
[1] | 0x0200;
6882 w0
[1] = w0
[1] | 0x020000;
6886 w0
[1] = w0
[1] | 0x02000000;
6894 w0
[2] = w0
[2] | 0x0200;
6898 w0
[2] = w0
[2] | 0x020000;
6902 w0
[2] = w0
[2] | 0x02000000;
6910 w0
[3] = w0
[3] | 0x0200;
6914 w0
[3] = w0
[3] | 0x020000;
6918 w0
[3] = w0
[3] | 0x02000000;
6926 w1
[0] = w1
[0] | 0x0200;
6930 w1
[0] = w1
[0] | 0x020000;
6934 w1
[0] = w1
[0] | 0x02000000;
6942 w1
[1] = w1
[1] | 0x0200;
6946 w1
[1] = w1
[1] | 0x020000;
6950 w1
[1] = w1
[1] | 0x02000000;
6958 w1
[2] = w1
[2] | 0x0200;
6962 w1
[2] = w1
[2] | 0x020000;
6966 w1
[2] = w1
[2] | 0x02000000;
6974 w1
[3] = w1
[3] | 0x0200;
6978 w1
[3] = w1
[3] | 0x020000;
6982 w1
[3] = w1
[3] | 0x02000000;
6990 w2
[0] = w2
[0] | 0x0200;
6994 w2
[0] = w2
[0] | 0x020000;
6998 w2
[0] = w2
[0] | 0x02000000;
7006 w2
[1] = w2
[1] | 0x0200;
7010 w2
[1] = w2
[1] | 0x020000;
7014 w2
[1] = w2
[1] | 0x02000000;
7022 w2
[2] = w2
[2] | 0x0200;
7026 w2
[2] = w2
[2] | 0x020000;
7030 w2
[2] = w2
[2] | 0x02000000;
7038 w2
[3] = w2
[3] | 0x0200;
7042 w2
[3] = w2
[3] | 0x020000;
7046 w2
[3] = w2
[3] | 0x02000000;
7051 inline void append_0x80_1x4_S (u32 w0
[4], const u32 offset
)
7060 w0
[0] = w0
[0] | 0x8000;
7064 w0
[0] = w0
[0] | 0x800000;
7068 w0
[0] = w0
[0] | 0x80000000;
7076 w0
[1] = w0
[1] | 0x8000;
7080 w0
[1] = w0
[1] | 0x800000;
7084 w0
[1] = w0
[1] | 0x80000000;
7092 w0
[2] = w0
[2] | 0x8000;
7096 w0
[2] = w0
[2] | 0x800000;
7100 w0
[2] = w0
[2] | 0x80000000;
7108 w0
[3] = w0
[3] | 0x8000;
7112 w0
[3] = w0
[3] | 0x800000;
7116 w0
[3] = w0
[3] | 0x80000000;
7121 inline void append_0x80_2x4_S (u32 w0
[4], u32 w1
[4], const u32 offset
)
7130 w0
[0] = w0
[0] | 0x8000;
7134 w0
[0] = w0
[0] | 0x800000;
7138 w0
[0] = w0
[0] | 0x80000000;
7146 w0
[1] = w0
[1] | 0x8000;
7150 w0
[1] = w0
[1] | 0x800000;
7154 w0
[1] = w0
[1] | 0x80000000;
7162 w0
[2] = w0
[2] | 0x8000;
7166 w0
[2] = w0
[2] | 0x800000;
7170 w0
[2] = w0
[2] | 0x80000000;
7178 w0
[3] = w0
[3] | 0x8000;
7182 w0
[3] = w0
[3] | 0x800000;
7186 w0
[3] = w0
[3] | 0x80000000;
7194 w1
[0] = w1
[0] | 0x8000;
7198 w1
[0] = w1
[0] | 0x800000;
7202 w1
[0] = w1
[0] | 0x80000000;
7210 w1
[1] = w1
[1] | 0x8000;
7214 w1
[1] = w1
[1] | 0x800000;
7218 w1
[1] = w1
[1] | 0x80000000;
7226 w1
[2] = w1
[2] | 0x8000;
7230 w1
[2] = w1
[2] | 0x800000;
7234 w1
[2] = w1
[2] | 0x80000000;
7242 w1
[3] = w1
[3] | 0x8000;
7246 w1
[3] = w1
[3] | 0x800000;
7250 w1
[3] = w1
[3] | 0x80000000;
7255 inline void append_0x80_3x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
7264 w0
[0] = w0
[0] | 0x8000;
7268 w0
[0] = w0
[0] | 0x800000;
7272 w0
[0] = w0
[0] | 0x80000000;
7280 w0
[1] = w0
[1] | 0x8000;
7284 w0
[1] = w0
[1] | 0x800000;
7288 w0
[1] = w0
[1] | 0x80000000;
7296 w0
[2] = w0
[2] | 0x8000;
7300 w0
[2] = w0
[2] | 0x800000;
7304 w0
[2] = w0
[2] | 0x80000000;
7312 w0
[3] = w0
[3] | 0x8000;
7316 w0
[3] = w0
[3] | 0x800000;
7320 w0
[3] = w0
[3] | 0x80000000;
7328 w1
[0] = w1
[0] | 0x8000;
7332 w1
[0] = w1
[0] | 0x800000;
7336 w1
[0] = w1
[0] | 0x80000000;
7344 w1
[1] = w1
[1] | 0x8000;
7348 w1
[1] = w1
[1] | 0x800000;
7352 w1
[1] = w1
[1] | 0x80000000;
7360 w1
[2] = w1
[2] | 0x8000;
7364 w1
[2] = w1
[2] | 0x800000;
7368 w1
[2] = w1
[2] | 0x80000000;
7376 w1
[3] = w1
[3] | 0x8000;
7380 w1
[3] = w1
[3] | 0x800000;
7384 w1
[3] = w1
[3] | 0x80000000;
7392 w2
[0] = w2
[0] | 0x8000;
7396 w2
[0] = w2
[0] | 0x800000;
7400 w2
[0] = w2
[0] | 0x80000000;
7408 w2
[1] = w2
[1] | 0x8000;
7412 w2
[1] = w2
[1] | 0x800000;
7416 w2
[1] = w2
[1] | 0x80000000;
7424 w2
[2] = w2
[2] | 0x8000;
7428 w2
[2] = w2
[2] | 0x800000;
7432 w2
[2] = w2
[2] | 0x80000000;
7440 w2
[3] = w2
[3] | 0x8000;
7444 w2
[3] = w2
[3] | 0x800000;
7448 w2
[3] = w2
[3] | 0x80000000;
7453 inline void append_0x80_4x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
7462 w0
[0] = w0
[0] | 0x8000;
7466 w0
[0] = w0
[0] | 0x800000;
7470 w0
[0] = w0
[0] | 0x80000000;
7478 w0
[1] = w0
[1] | 0x8000;
7482 w0
[1] = w0
[1] | 0x800000;
7486 w0
[1] = w0
[1] | 0x80000000;
7494 w0
[2] = w0
[2] | 0x8000;
7498 w0
[2] = w0
[2] | 0x800000;
7502 w0
[2] = w0
[2] | 0x80000000;
7510 w0
[3] = w0
[3] | 0x8000;
7514 w0
[3] = w0
[3] | 0x800000;
7518 w0
[3] = w0
[3] | 0x80000000;
7526 w1
[0] = w1
[0] | 0x8000;
7530 w1
[0] = w1
[0] | 0x800000;
7534 w1
[0] = w1
[0] | 0x80000000;
7542 w1
[1] = w1
[1] | 0x8000;
7546 w1
[1] = w1
[1] | 0x800000;
7550 w1
[1] = w1
[1] | 0x80000000;
7558 w1
[2] = w1
[2] | 0x8000;
7562 w1
[2] = w1
[2] | 0x800000;
7566 w1
[2] = w1
[2] | 0x80000000;
7574 w1
[3] = w1
[3] | 0x8000;
7578 w1
[3] = w1
[3] | 0x800000;
7582 w1
[3] = w1
[3] | 0x80000000;
7590 w2
[0] = w2
[0] | 0x8000;
7594 w2
[0] = w2
[0] | 0x800000;
7598 w2
[0] = w2
[0] | 0x80000000;
7606 w2
[1] = w2
[1] | 0x8000;
7610 w2
[1] = w2
[1] | 0x800000;
7614 w2
[1] = w2
[1] | 0x80000000;
7622 w2
[2] = w2
[2] | 0x8000;
7626 w2
[2] = w2
[2] | 0x800000;
7630 w2
[2] = w2
[2] | 0x80000000;
7638 w2
[3] = w2
[3] | 0x8000;
7642 w2
[3] = w2
[3] | 0x800000;
7646 w2
[3] = w2
[3] | 0x80000000;
7654 w3
[0] = w3
[0] | 0x8000;
7658 w3
[0] = w3
[0] | 0x800000;
7662 w3
[0] = w3
[0] | 0x80000000;
7670 w3
[1] = w3
[1] | 0x8000;
7674 w3
[1] = w3
[1] | 0x800000;
7678 w3
[1] = w3
[1] | 0x80000000;
7686 w3
[2] = w3
[2] | 0x8000;
7690 w3
[2] = w3
[2] | 0x800000;
7694 w3
[2] = w3
[2] | 0x80000000;
7702 w3
[3] = w3
[3] | 0x8000;
7706 w3
[3] = w3
[3] | 0x800000;
7710 w3
[3] = w3
[3] | 0x80000000;
7715 inline void truncate_block_S (u32 w
[4], const u32 len
)
7724 case 1: w
[0] &= 0x000000FF;
7729 case 2: w
[0] &= 0x0000FFFF;
7734 case 3: w
[0] &= 0x00FFFFFF;
7743 case 5: w
[1] &= 0x000000FF;
7747 case 6: w
[1] &= 0x0000FFFF;
7751 case 7: w
[1] &= 0x00FFFFFF;
7758 case 9: w
[2] &= 0x000000FF;
7761 case 10: w
[2] &= 0x0000FFFF;
7764 case 11: w
[2] &= 0x00FFFFFF;
7769 case 13: w
[3] &= 0x000000FF;
7771 case 14: w
[3] &= 0x0000FFFF;
7773 case 15: w
[3] &= 0x00FFFFFF;
7778 inline void make_unicode_S (const u32 in
[4], u32 out1
[4], u32 out2
[4])
7781 out2
[3] = __byte_perm_S (in
[3], 0, 0x7372);
7782 out2
[2] = __byte_perm_S (in
[3], 0, 0x7170);
7783 out2
[1] = __byte_perm_S (in
[2], 0, 0x7372);
7784 out2
[0] = __byte_perm_S (in
[2], 0, 0x7170);
7785 out1
[3] = __byte_perm_S (in
[1], 0, 0x7372);
7786 out1
[2] = __byte_perm_S (in
[1], 0, 0x7170);
7787 out1
[1] = __byte_perm_S (in
[0], 0, 0x7372);
7788 out1
[0] = __byte_perm_S (in
[0], 0, 0x7170);
7791 #if defined IS_AMD || defined IS_GENERIC
7792 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
7793 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
7794 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
7795 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
7796 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
7797 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
7798 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
7799 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
7803 inline void undo_unicode_S (const u32 in1
[4], const u32 in2
[4], u32 out
[4])
7806 out
[0] = __byte_perm_S (in1
[0], in1
[1], 0x6420);
7807 out
[1] = __byte_perm_S (in1
[2], in1
[3], 0x6420);
7808 out
[2] = __byte_perm_S (in2
[0], in2
[1], 0x6420);
7809 out
[3] = __byte_perm_S (in2
[2], in2
[3], 0x6420);
7812 #if defined IS_AMD || defined IS_GENERIC
7813 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
7814 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
7815 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
7816 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
7817 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
7818 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
7819 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
7820 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
7824 inline void switch_buffer_by_offset_le_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
7826 #if defined IS_AMD || defined IS_GENERIC
7827 const int offset_mod_4
= offset
& 3;
7829 const int offset_minus_4
= 4 - offset
;
7834 w3
[2] = amd_bytealign_S ( 0, w3
[1], offset_minus_4
);
7835 w3
[1] = amd_bytealign_S (w3
[1], w3
[0], offset_minus_4
);
7836 w3
[0] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
7837 w2
[3] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
7838 w2
[2] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
7839 w2
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7840 w2
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7841 w1
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7842 w1
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7843 w1
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7844 w1
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7845 w0
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7846 w0
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7847 w0
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7848 w0
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7850 if (offset_mod_4
== 0)
7872 w3
[2] = amd_bytealign_S ( 0, w3
[0], offset_minus_4
);
7873 w3
[1] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
7874 w3
[0] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
7875 w2
[3] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
7876 w2
[2] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7877 w2
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7878 w2
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7879 w1
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7880 w1
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7881 w1
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7882 w1
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7883 w0
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7884 w0
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7885 w0
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7888 if (offset_mod_4
== 0)
7909 w3
[2] = amd_bytealign_S ( 0, w2
[3], offset_minus_4
);
7910 w3
[1] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
7911 w3
[0] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
7912 w2
[3] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7913 w2
[2] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7914 w2
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7915 w2
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7916 w1
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7917 w1
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7918 w1
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7919 w1
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7920 w0
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7921 w0
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7925 if (offset_mod_4
== 0)
7945 w3
[2] = amd_bytealign_S ( 0, w2
[2], offset_minus_4
);
7946 w3
[1] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
7947 w3
[0] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7948 w2
[3] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7949 w2
[2] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7950 w2
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7951 w2
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7952 w1
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7953 w1
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7954 w1
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7955 w1
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7956 w0
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7961 if (offset_mod_4
== 0)
7980 w3
[2] = amd_bytealign_S ( 0, w2
[1], offset_minus_4
);
7981 w3
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7982 w3
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7983 w2
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7984 w2
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7985 w2
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7986 w2
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7987 w1
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7988 w1
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7989 w1
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7990 w1
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7996 if (offset_mod_4
== 0)
8014 w3
[2] = amd_bytealign_S ( 0, w2
[0], offset_minus_4
);
8015 w3
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
8016 w3
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
8017 w2
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
8018 w2
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
8019 w2
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
8020 w2
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
8021 w1
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
8022 w1
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
8023 w1
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8030 if (offset_mod_4
== 0)
8047 w3
[2] = amd_bytealign_S ( 0, w1
[3], offset_minus_4
);
8048 w3
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
8049 w3
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
8050 w2
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
8051 w2
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
8052 w2
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
8053 w2
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
8054 w1
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
8055 w1
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8063 if (offset_mod_4
== 0)
8079 w3
[2] = amd_bytealign_S ( 0, w1
[2], offset_minus_4
);
8080 w3
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
8081 w3
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
8082 w2
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
8083 w2
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
8084 w2
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
8085 w2
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
8086 w1
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8095 if (offset_mod_4
== 0)
8110 w3
[2] = amd_bytealign_S ( 0, w1
[1], offset_minus_4
);
8111 w3
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
8112 w3
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
8113 w2
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
8114 w2
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
8115 w2
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
8116 w2
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8126 if (offset_mod_4
== 0)
8140 w3
[2] = amd_bytealign_S ( 0, w1
[0], offset_minus_4
);
8141 w3
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
8142 w3
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
8143 w2
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
8144 w2
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
8145 w2
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8156 if (offset_mod_4
== 0)
8169 w3
[2] = amd_bytealign_S ( 0, w0
[3], offset_minus_4
);
8170 w3
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
8171 w3
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
8172 w2
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
8173 w2
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8185 if (offset_mod_4
== 0)
8197 w3
[2] = amd_bytealign_S ( 0, w0
[2], offset_minus_4
);
8198 w3
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
8199 w3
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
8200 w2
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8213 if (offset_mod_4
== 0)
8224 w3
[2] = amd_bytealign_S ( 0, w0
[1], offset_minus_4
);
8225 w3
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
8226 w3
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8240 if (offset_mod_4
== 0)
8250 w3
[2] = amd_bytealign_S ( 0, w0
[0], offset_minus_4
);
8251 w3
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
8266 if (offset_mod_4
== 0)
8277 const int offset_minus_4
= 4 - (offset
% 4);
8279 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
8284 w3
[1] = __byte_perm_S (w3
[0], w3
[1], selector
);
8285 w3
[0] = __byte_perm_S (w2
[3], w3
[0], selector
);
8286 w2
[3] = __byte_perm_S (w2
[2], w2
[3], selector
);
8287 w2
[2] = __byte_perm_S (w2
[1], w2
[2], selector
);
8288 w2
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
8289 w2
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
8290 w1
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
8291 w1
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
8292 w1
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
8293 w1
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
8294 w0
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
8295 w0
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
8296 w0
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
8297 w0
[0] = __byte_perm_S ( 0, w0
[0], selector
);
8302 w3
[1] = __byte_perm_S (w2
[3], w3
[0], selector
);
8303 w3
[0] = __byte_perm_S (w2
[2], w2
[3], selector
);
8304 w2
[3] = __byte_perm_S (w2
[1], w2
[2], selector
);
8305 w2
[2] = __byte_perm_S (w2
[0], w2
[1], selector
);
8306 w2
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
8307 w2
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
8308 w1
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
8309 w1
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
8310 w1
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
8311 w1
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
8312 w0
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
8313 w0
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
8314 w0
[1] = __byte_perm_S ( 0, w0
[0], selector
);
8320 w3
[1] = __byte_perm_S (w2
[2], w2
[3], selector
);
8321 w3
[0] = __byte_perm_S (w2
[1], w2
[2], selector
);
8322 w2
[3] = __byte_perm_S (w2
[0], w2
[1], selector
);
8323 w2
[2] = __byte_perm_S (w1
[3], w2
[0], selector
);
8324 w2
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
8325 w2
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
8326 w1
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
8327 w1
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
8328 w1
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
8329 w1
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
8330 w0
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
8331 w0
[2] = __byte_perm_S ( 0, w0
[0], selector
);
8338 w3
[1] = __byte_perm_S (w2
[1], w2
[2], selector
);
8339 w3
[0] = __byte_perm_S (w2
[0], w2
[1], selector
);
8340 w2
[3] = __byte_perm_S (w1
[3], w2
[0], selector
);
8341 w2
[2] = __byte_perm_S (w1
[2], w1
[3], selector
);
8342 w2
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
8343 w2
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
8344 w1
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
8345 w1
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
8346 w1
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
8347 w1
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
8348 w0
[3] = __byte_perm_S ( 0, w0
[0], selector
);
8356 w3
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
8357 w3
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
8358 w2
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
8359 w2
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
8360 w2
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
8361 w2
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
8362 w1
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
8363 w1
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
8364 w1
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
8365 w1
[0] = __byte_perm_S ( 0, w0
[0], selector
);
8374 w3
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
8375 w3
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
8376 w2
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
8377 w2
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
8378 w2
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
8379 w2
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
8380 w1
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
8381 w1
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
8382 w1
[1] = __byte_perm_S ( 0, w0
[0], selector
);
8392 w3
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
8393 w3
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
8394 w2
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
8395 w2
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
8396 w2
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
8397 w2
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
8398 w1
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
8399 w1
[2] = __byte_perm_S ( 0, w0
[0], selector
);
8410 w3
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
8411 w3
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
8412 w2
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
8413 w2
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
8414 w2
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
8415 w2
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
8416 w1
[3] = __byte_perm_S ( 0, w0
[0], selector
);
8428 w3
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
8429 w3
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
8430 w2
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
8431 w2
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
8432 w2
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
8433 w2
[0] = __byte_perm_S ( 0, w0
[0], selector
);
8446 w3
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
8447 w3
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
8448 w2
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
8449 w2
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
8450 w2
[1] = __byte_perm_S ( 0, w0
[0], selector
);
8464 w3
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
8465 w3
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
8466 w2
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
8467 w2
[2] = __byte_perm_S ( 0, w0
[0], selector
);
8482 w3
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
8483 w3
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
8484 w2
[3] = __byte_perm_S ( 0, w0
[0], selector
);
8500 w3
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
8501 w3
[0] = __byte_perm_S ( 0, w0
[0], selector
);
8518 w3
[1] = __byte_perm_S ( 0, w0
[0], selector
);
8538 inline void switch_buffer_by_offset_be_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
8540 #if defined IS_AMD || defined IS_GENERIC
8544 w3
[2] = amd_bytealign_S (w3
[1], 0, offset
);
8545 w3
[1] = amd_bytealign_S (w3
[0], w3
[1], offset
);
8546 w3
[0] = amd_bytealign_S (w2
[3], w3
[0], offset
);
8547 w2
[3] = amd_bytealign_S (w2
[2], w2
[3], offset
);
8548 w2
[2] = amd_bytealign_S (w2
[1], w2
[2], offset
);
8549 w2
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
8550 w2
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
8551 w1
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
8552 w1
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
8553 w1
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8554 w1
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8555 w0
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8556 w0
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8557 w0
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8558 w0
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
8562 w3
[2] = amd_bytealign_S (w3
[0], 0, offset
);
8563 w3
[1] = amd_bytealign_S (w2
[3], w3
[0], offset
);
8564 w3
[0] = amd_bytealign_S (w2
[2], w2
[3], offset
);
8565 w2
[3] = amd_bytealign_S (w2
[1], w2
[2], offset
);
8566 w2
[2] = amd_bytealign_S (w2
[0], w2
[1], offset
);
8567 w2
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
8568 w2
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
8569 w1
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
8570 w1
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8571 w1
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8572 w1
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8573 w0
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8574 w0
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8575 w0
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
8580 w3
[2] = amd_bytealign_S (w2
[3], 0, offset
);
8581 w3
[1] = amd_bytealign_S (w2
[2], w2
[3], offset
);
8582 w3
[0] = amd_bytealign_S (w2
[1], w2
[2], offset
);
8583 w2
[3] = amd_bytealign_S (w2
[0], w2
[1], offset
);
8584 w2
[2] = amd_bytealign_S (w1
[3], w2
[0], offset
);
8585 w2
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
8586 w2
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
8587 w1
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8588 w1
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8589 w1
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8590 w1
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8591 w0
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8592 w0
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
8598 w3
[2] = amd_bytealign_S (w2
[2], 0, offset
);
8599 w3
[1] = amd_bytealign_S (w2
[1], w2
[2], offset
);
8600 w3
[0] = amd_bytealign_S (w2
[0], w2
[1], offset
);
8601 w2
[3] = amd_bytealign_S (w1
[3], w2
[0], offset
);
8602 w2
[2] = amd_bytealign_S (w1
[2], w1
[3], offset
);
8603 w2
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
8604 w2
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8605 w1
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8606 w1
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8607 w1
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8608 w1
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8609 w0
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
8616 w3
[2] = amd_bytealign_S (w2
[1], 0, offset
);
8617 w3
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
8618 w3
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
8619 w2
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
8620 w2
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
8621 w2
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8622 w2
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8623 w1
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8624 w1
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8625 w1
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8626 w1
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
8634 w3
[2] = amd_bytealign_S (w2
[0], 0, offset
);
8635 w3
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
8636 w3
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
8637 w2
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
8638 w2
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8639 w2
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8640 w2
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8641 w1
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8642 w1
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8643 w1
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
8652 w3
[2] = amd_bytealign_S (w1
[3], 0, offset
);
8653 w3
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
8654 w3
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
8655 w2
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8656 w2
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8657 w2
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8658 w2
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8659 w1
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8660 w1
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
8670 w3
[2] = amd_bytealign_S (w1
[2], 0, offset
);
8671 w3
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
8672 w3
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8673 w2
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8674 w2
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8675 w2
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8676 w2
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8677 w1
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
8688 w3
[2] = amd_bytealign_S (w1
[1], 0, offset
);
8689 w3
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
8690 w3
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8691 w2
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8692 w2
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8693 w2
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8694 w2
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
8706 w3
[2] = amd_bytealign_S (w1
[0], 0, offset
);
8707 w3
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
8708 w3
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8709 w2
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8710 w2
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8711 w2
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
8724 w3
[2] = amd_bytealign_S (w0
[3], 0, offset
);
8725 w3
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
8726 w3
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8727 w2
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8728 w2
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
8742 w3
[2] = amd_bytealign_S (w0
[2], 0, offset
);
8743 w3
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
8744 w3
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8745 w2
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
8760 w3
[2] = amd_bytealign_S (w0
[1], 0, offset
);
8761 w3
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
8762 w3
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
8778 w3
[2] = amd_bytealign_S (w0
[0], 0, offset
);
8779 w3
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
8798 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
8803 w3
[1] = __byte_perm_S (w3
[1], w3
[0], selector
);
8804 w3
[0] = __byte_perm_S (w3
[0], w2
[3], selector
);
8805 w2
[3] = __byte_perm_S (w2
[3], w2
[2], selector
);
8806 w2
[2] = __byte_perm_S (w2
[2], w2
[1], selector
);
8807 w2
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
8808 w2
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
8809 w1
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
8810 w1
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
8811 w1
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
8812 w1
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
8813 w0
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
8814 w0
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
8815 w0
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
8816 w0
[0] = __byte_perm_S (w0
[0], 0, selector
);
8820 w3
[1] = __byte_perm_S (w3
[0], w2
[3], selector
);
8821 w3
[0] = __byte_perm_S (w2
[3], w2
[2], selector
);
8822 w2
[3] = __byte_perm_S (w2
[2], w2
[1], selector
);
8823 w2
[2] = __byte_perm_S (w2
[1], w2
[0], selector
);
8824 w2
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
8825 w2
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
8826 w1
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
8827 w1
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
8828 w1
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
8829 w1
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
8830 w0
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
8831 w0
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
8832 w0
[1] = __byte_perm_S (w0
[0], 0, selector
);
8837 w3
[1] = __byte_perm_S (w2
[3], w2
[2], selector
);
8838 w3
[0] = __byte_perm_S (w2
[2], w2
[1], selector
);
8839 w2
[3] = __byte_perm_S (w2
[1], w2
[0], selector
);
8840 w2
[2] = __byte_perm_S (w2
[0], w1
[3], selector
);
8841 w2
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
8842 w2
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
8843 w1
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
8844 w1
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
8845 w1
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
8846 w1
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
8847 w0
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
8848 w0
[2] = __byte_perm_S (w0
[0], 0, selector
);
8854 w3
[1] = __byte_perm_S (w2
[2], w2
[1], selector
);
8855 w3
[0] = __byte_perm_S (w2
[1], w2
[0], selector
);
8856 w2
[3] = __byte_perm_S (w2
[0], w1
[3], selector
);
8857 w2
[2] = __byte_perm_S (w1
[3], w1
[2], selector
);
8858 w2
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
8859 w2
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
8860 w1
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
8861 w1
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
8862 w1
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
8863 w1
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
8864 w0
[3] = __byte_perm_S (w0
[0], 0, selector
);
8871 w3
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
8872 w3
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
8873 w2
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
8874 w2
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
8875 w2
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
8876 w2
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
8877 w1
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
8878 w1
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
8879 w1
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
8880 w1
[0] = __byte_perm_S (w0
[0], 0, selector
);
8888 w3
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
8889 w3
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
8890 w2
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
8891 w2
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
8892 w2
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
8893 w2
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
8894 w1
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
8895 w1
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
8896 w1
[1] = __byte_perm_S (w0
[0], 0, selector
);
8905 w3
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
8906 w3
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
8907 w2
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
8908 w2
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
8909 w2
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
8910 w2
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
8911 w1
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
8912 w1
[2] = __byte_perm_S (w0
[0], 0, selector
);
8922 w3
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
8923 w3
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
8924 w2
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
8925 w2
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
8926 w2
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
8927 w2
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
8928 w1
[3] = __byte_perm_S (w0
[0], 0, selector
);
8939 w3
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
8940 w3
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
8941 w2
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
8942 w2
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
8943 w2
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
8944 w2
[0] = __byte_perm_S (w0
[0], 0, selector
);
8956 w3
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
8957 w3
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
8958 w2
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
8959 w2
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
8960 w2
[1] = __byte_perm_S (w0
[0], 0, selector
);
8973 w3
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
8974 w3
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
8975 w2
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
8976 w2
[2] = __byte_perm_S (w0
[0], 0, selector
);
8990 w3
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
8991 w3
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
8992 w2
[3] = __byte_perm_S (w0
[0], 0, selector
);
9007 w3
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
9008 w3
[0] = __byte_perm_S (w0
[0], 0, selector
);
9024 w3
[1] = __byte_perm_S (w0
[0], 0, selector
);
9044 * vector functions on scalar types (for inner loop usage)
9047 #define PACKVS2(sn,vn,e) \
9048 sn[0] = vn[0].s##e; \
9051 #define PACKSV2(sn,vn,e) \
9052 vn[0].s##e = sn[0]; \
9055 #define PACKVS24(s0,s1,v0,v1,e) \
9056 PACKVS4 (s0, v0, e); \
9057 PACKVS4 (s1, v1, e);
9059 #define PACKSV24(s0,s1,v0,v1,e) \
9060 PACKSV4 (s0, v0, e); \
9061 PACKSV4 (s1, v1, e);
9063 #define PACKVS4(sn,vn,e) \
9064 sn[0] = vn[0].s##e; \
9065 sn[1] = vn[1].s##e; \
9066 sn[2] = vn[2].s##e; \
9069 #define PACKSV4(sn,vn,e) \
9070 vn[0].s##e = sn[0]; \
9071 vn[1].s##e = sn[1]; \
9072 vn[2].s##e = sn[2]; \
9075 #define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
9076 PACKVS4 (s0, v0, e); \
9077 PACKVS4 (s1, v1, e); \
9078 PACKVS4 (s2, v2, e); \
9079 PACKVS4 (s3, v3, e);
9081 #define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
9082 PACKSV4 (s0, v0, e); \
9083 PACKSV4 (s1, v1, e); \
9084 PACKSV4 (s2, v2, e); \
9085 PACKSV4 (s3, v3, e);
9087 inline void switch_buffer_by_offset_le_VV (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x offset
)
9091 switch_buffer_by_offset_le_S (w0
, w1
, w2
, w3
, offset
);
9104 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
9105 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
9107 #elif VECT_SIZE == 4
9109 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
9110 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
9111 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
9112 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
9114 #elif VECT_SIZE == 8
9116 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
9117 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
9118 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
9119 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
9120 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
9121 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
9122 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
9123 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
9125 #elif VECT_SIZE == 16
9127 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
9128 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
9129 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
9130 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
9131 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
9132 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
9133 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
9134 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
9135 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s8
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8);
9136 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s9
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9);
9137 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sa
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
);
9138 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sb
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
);
9139 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sc
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
);
9140 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sd
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
);
9141 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.se
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
);
9142 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sf
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
);
9147 inline void append_0x01_2x4_VV (u32x w0
[4], u32x w1
[4], const u32x offset
)
9151 append_0x01_2x4_S (w0
, w1
, offset
);
9162 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x01_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
9163 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x01_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
9165 #elif VECT_SIZE == 4
9167 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x01_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
9168 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x01_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
9169 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x01_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
9170 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x01_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
9172 #elif VECT_SIZE == 8
9174 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x01_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
9175 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x01_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
9176 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x01_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
9177 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x01_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
9178 PACKVS24 (t0
, t1
, w0
, w1
, 4); append_0x01_2x4_S (t0
, t1
, offset
.s4
); PACKSV24 (t0
, t1
, w0
, w1
, 4);
9179 PACKVS24 (t0
, t1
, w0
, w1
, 5); append_0x01_2x4_S (t0
, t1
, offset
.s5
); PACKSV24 (t0
, t1
, w0
, w1
, 5);
9180 PACKVS24 (t0
, t1
, w0
, w1
, 6); append_0x01_2x4_S (t0
, t1
, offset
.s6
); PACKSV24 (t0
, t1
, w0
, w1
, 6);
9181 PACKVS24 (t0
, t1
, w0
, w1
, 7); append_0x01_2x4_S (t0
, t1
, offset
.s7
); PACKSV24 (t0
, t1
, w0
, w1
, 7);
9183 #elif VECT_SIZE == 16
9185 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x01_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
9186 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x01_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
9187 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x01_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
9188 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x01_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
9189 PACKVS24 (t0
, t1
, w0
, w1
, 4); append_0x01_2x4_S (t0
, t1
, offset
.s4
); PACKSV24 (t0
, t1
, w0
, w1
, 4);
9190 PACKVS24 (t0
, t1
, w0
, w1
, 5); append_0x01_2x4_S (t0
, t1
, offset
.s5
); PACKSV24 (t0
, t1
, w0
, w1
, 5);
9191 PACKVS24 (t0
, t1
, w0
, w1
, 6); append_0x01_2x4_S (t0
, t1
, offset
.s6
); PACKSV24 (t0
, t1
, w0
, w1
, 6);
9192 PACKVS24 (t0
, t1
, w0
, w1
, 7); append_0x01_2x4_S (t0
, t1
, offset
.s7
); PACKSV24 (t0
, t1
, w0
, w1
, 7);
9193 PACKVS24 (t0
, t1
, w0
, w1
, 8); append_0x01_2x4_S (t0
, t1
, offset
.s8
); PACKSV24 (t0
, t1
, w0
, w1
, 8);
9194 PACKVS24 (t0
, t1
, w0
, w1
, 9); append_0x01_2x4_S (t0
, t1
, offset
.s9
); PACKSV24 (t0
, t1
, w0
, w1
, 9);
9195 PACKVS24 (t0
, t1
, w0
, w1
, a
); append_0x01_2x4_S (t0
, t1
, offset
.sa
); PACKSV24 (t0
, t1
, w0
, w1
, a
);
9196 PACKVS24 (t0
, t1
, w0
, w1
, b
); append_0x01_2x4_S (t0
, t1
, offset
.sb
); PACKSV24 (t0
, t1
, w0
, w1
, b
);
9197 PACKVS24 (t0
, t1
, w0
, w1
, c
); append_0x01_2x4_S (t0
, t1
, offset
.sc
); PACKSV24 (t0
, t1
, w0
, w1
, c
);
9198 PACKVS24 (t0
, t1
, w0
, w1
, d
); append_0x01_2x4_S (t0
, t1
, offset
.sd
); PACKSV24 (t0
, t1
, w0
, w1
, d
);
9199 PACKVS24 (t0
, t1
, w0
, w1
, e
); append_0x01_2x4_S (t0
, t1
, offset
.se
); PACKSV24 (t0
, t1
, w0
, w1
, e
);
9200 PACKVS24 (t0
, t1
, w0
, w1
, f
); append_0x01_2x4_S (t0
, t1
, offset
.sf
); PACKSV24 (t0
, t1
, w0
, w1
, f
);
9205 inline void append_0x80_2x4_VV (u32x w0
[4], u32x w1
[4], const u32x offset
)
9209 append_0x80_2x4_S (w0
, w1
, offset
);
9220 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x80_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
9221 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x80_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
9223 #elif VECT_SIZE == 4
9225 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x80_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
9226 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x80_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
9227 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x80_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
9228 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x80_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
9230 #elif VECT_SIZE == 8
9232 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x80_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
9233 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x80_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
9234 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x80_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
9235 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x80_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
9236 PACKVS24 (t0
, t1
, w0
, w1
, 4); append_0x80_2x4_S (t0
, t1
, offset
.s4
); PACKSV24 (t0
, t1
, w0
, w1
, 4);
9237 PACKVS24 (t0
, t1
, w0
, w1
, 5); append_0x80_2x4_S (t0
, t1
, offset
.s5
); PACKSV24 (t0
, t1
, w0
, w1
, 5);
9238 PACKVS24 (t0
, t1
, w0
, w1
, 6); append_0x80_2x4_S (t0
, t1
, offset
.s6
); PACKSV24 (t0
, t1
, w0
, w1
, 6);
9239 PACKVS24 (t0
, t1
, w0
, w1
, 7); append_0x80_2x4_S (t0
, t1
, offset
.s7
); PACKSV24 (t0
, t1
, w0
, w1
, 7);
9241 #elif VECT_SIZE == 16
9243 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x80_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
9244 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x80_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
9245 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x80_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
9246 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x80_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
9247 PACKVS24 (t0
, t1
, w0
, w1
, 4); append_0x80_2x4_S (t0
, t1
, offset
.s4
); PACKSV24 (t0
, t1
, w0
, w1
, 4);
9248 PACKVS24 (t0
, t1
, w0
, w1
, 5); append_0x80_2x4_S (t0
, t1
, offset
.s5
); PACKSV24 (t0
, t1
, w0
, w1
, 5);
9249 PACKVS24 (t0
, t1
, w0
, w1
, 6); append_0x80_2x4_S (t0
, t1
, offset
.s6
); PACKSV24 (t0
, t1
, w0
, w1
, 6);
9250 PACKVS24 (t0
, t1
, w0
, w1
, 7); append_0x80_2x4_S (t0
, t1
, offset
.s7
); PACKSV24 (t0
, t1
, w0
, w1
, 7);
9251 PACKVS24 (t0
, t1
, w0
, w1
, 8); append_0x80_2x4_S (t0
, t1
, offset
.s8
); PACKSV24 (t0
, t1
, w0
, w1
, 8);
9252 PACKVS24 (t0
, t1
, w0
, w1
, 9); append_0x80_2x4_S (t0
, t1
, offset
.s9
); PACKSV24 (t0
, t1
, w0
, w1
, 9);
9253 PACKVS24 (t0
, t1
, w0
, w1
, a
); append_0x80_2x4_S (t0
, t1
, offset
.sa
); PACKSV24 (t0
, t1
, w0
, w1
, a
);
9254 PACKVS24 (t0
, t1
, w0
, w1
, b
); append_0x80_2x4_S (t0
, t1
, offset
.sb
); PACKSV24 (t0
, t1
, w0
, w1
, b
);
9255 PACKVS24 (t0
, t1
, w0
, w1
, c
); append_0x80_2x4_S (t0
, t1
, offset
.sc
); PACKSV24 (t0
, t1
, w0
, w1
, c
);
9256 PACKVS24 (t0
, t1
, w0
, w1
, d
); append_0x80_2x4_S (t0
, t1
, offset
.sd
); PACKSV24 (t0
, t1
, w0
, w1
, d
);
9257 PACKVS24 (t0
, t1
, w0
, w1
, e
); append_0x80_2x4_S (t0
, t1
, offset
.se
); PACKSV24 (t0
, t1
, w0
, w1
, e
);
9258 PACKVS24 (t0
, t1
, w0
, w1
, f
); append_0x80_2x4_S (t0
, t1
, offset
.sf
); PACKSV24 (t0
, t1
, w0
, w1
, f
);
9263 inline void append_0x80_4x4_VV (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x offset
)
9267 append_0x80_4x4_S (w0
, w1
, w2
, w3
, offset
);
9280 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
9281 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
9283 #elif VECT_SIZE == 4
9285 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
9286 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
9287 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
9288 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
9290 #elif VECT_SIZE == 8
9292 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
9293 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
9294 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
9295 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
9296 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
9297 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
9298 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
9299 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
9301 #elif VECT_SIZE == 16
9303 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
9304 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
9305 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
9306 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
9307 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
9308 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
9309 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
9310 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
9311 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s8
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8);
9312 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.s9
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9);
9313 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sa
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
);
9314 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sb
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
);
9315 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sc
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
);
9316 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sd
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
);
9317 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.se
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
);
9318 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
); append_0x80_4x4_S (t0
, t1
, t2
, t3
, offset
.sf
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
);