/**
 * Author......: Jens Steube <jens.steube@gmail.com>
 */

/**
 * pure scalar functions
 */
/**
 * Three-way comparison of a computed 4-word digest against a stored digest.
 *
 * Word pairs are compared in the order (d1[3], d2[DGST_R3]),
 * (d1[2], d2[DGST_R2]), (d1[1], d2[DGST_R1]), (d1[0], d2[DGST_R0]);
 * the first pair that differs decides the result.
 *
 * @param d1 computed digest words
 * @param d2 stored digest, indexed through the DGST_R0..DGST_R3 macros
 * @return 1 if d1 sorts after d2, -1 if it sorts before, 0 if all four
 *         compared words are equal
 */
inline int hash_comp (const u32 d1[4], __global u32 *d2)
{
  if (d1[3] > d2[DGST_R3]) return ( 1);
  if (d1[3] < d2[DGST_R3]) return (-1);
  if (d1[2] > d2[DGST_R2]) return ( 1);
  if (d1[2] < d2[DGST_R2]) return (-1);
  if (d1[1] > d2[DGST_R1]) return ( 1);
  if (d1[1] < d2[DGST_R1]) return (-1);
  if (d1[0] > d2[DGST_R0]) return ( 1);
  if (d1[0] < d2[DGST_R0]) return (-1);

  // NOTE(review): the closing return was not present in the damaged text.
  // 0 is used for "equal" because find_hash tests the result with `cmp == 0`.
  return (0);
}
24 inline int find_hash (const u32 digest
[4], const u32 digests_cnt
, __global digest_t
*digests_buf
)
26 for (u32 l
= 0, r
= digests_cnt
; r
; r
>>= 1)
32 const int cmp
= hash_comp (digest
, digests_buf
[c
].digest_buf
);
41 if (cmp
== 0) return (c
);
/**
 * Test a single bit of a bitmap, addressed by a digest word.
 *
 * The bitmap word is selected by (digest >> bitmap_shift) & bitmap_mask and
 * the bit inside that word by the low five bits of digest.
 *
 * @param bitmap       bitmap words
 * @param bitmap_mask  mask applied to the shifted digest to form the word index
 * @param bitmap_shift right shift applied to digest before masking
 * @param digest       digest word used as the lookup key
 * @return the isolated bit (non-zero) if it is set, 0 otherwise
 */
inline u32 check_bitmap (__global u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
{
  const u32 word = bitmap[(digest >> bitmap_shift) & bitmap_mask];

  // Unsigned literal: the selected bit can be bit 31, which must not be
  // shifted into the sign bit of a signed int.
  const u32 bit = 1u << (digest & 0x1f);

  return (word & bit);
}
/**
 * Pre-filter a 4-word digest against two sets of four bitmaps.
 *
 * digest[0..3] are looked up in bitmap_s1_a..bitmap_s1_d using bitmap_shift1
 * and then in bitmap_s2_a..bitmap_s2_d using bitmap_shift2. The function
 * returns 0 as soon as any lookup misses.
 *
 * The digest parameter was previously declared with bound [2] although
 * indices 2 and 3 are read; the bound is now [4]. Array parameters decay to
 * pointers, so callers are unaffected.
 *
 * @param digest        four digest words used as lookup keys
 * @param bitmap_mask   word-index mask shared by all lookups
 * @param bitmap_shift1 shift used for the s1 bitmaps
 * @param bitmap_shift2 shift used for the s2 bitmaps
 * @return 0 if any of the eight lookups finds its bit clear
 */
inline u32 check (const u32 digest[4], __global u32 *bitmap_s1_a, __global u32 *bitmap_s1_b, __global u32 *bitmap_s1_c, __global u32 *bitmap_s1_d, __global u32 *bitmap_s2_a, __global u32 *bitmap_s2_b, __global u32 *bitmap_s2_c, __global u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
{
  if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
  if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
  if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0);
  if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0);

  if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0);
  if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0);
  if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0);
  if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0);

  // NOTE(review): the closing return was not present in the damaged text.
  // A non-zero "all lookups hit" value of 1 is assumed - confirm.
  return (1);
}
/**
 * Record a match for the hash at position hash_pos.
 *
 * Sets hashes_shown[hash_pos] to 1 and stores gid and il_pos in the gidvid
 * and il_pos fields of plains_buf[hash_pos].
 *
 * @param plains_buf   per-hash result records
 * @param hashes_shown per-hash flags, set to 1 for the marked hash
 * @param hash_pos     index of the hash in both buffers
 * @param gid          value stored in the record's gidvid field
 * @param il_pos       value stored in the record's il_pos field
 */
inline void mark_hash (__global plain_t *plains_buf, __global u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  // The former expression `(gid * 1) + 0` evaluates to gid itself.
  plains_buf[hash_pos].gidvid = gid;
  plains_buf[hash_pos].il_pos = il_pos;
}
79 inline void truncate_block (u32x w
[4], const u32 len
)
88 case 1: w
[0] &= 0x000000FF;
93 case 2: w
[0] &= 0x0000FFFF;
98 case 3: w
[0] &= 0x00FFFFFF;
107 case 5: w
[1] &= 0x000000FF;
111 case 6: w
[1] &= 0x0000FFFF;
115 case 7: w
[1] &= 0x00FFFFFF;
122 case 9: w
[2] &= 0x000000FF;
125 case 10: w
[2] &= 0x0000FFFF;
128 case 11: w
[2] &= 0x00FFFFFF;
133 case 13: w
[3] &= 0x000000FF;
135 case 14: w
[3] &= 0x0000FFFF;
137 case 15: w
[3] &= 0x00FFFFFF;
/**
 * Spread every byte of in[0..3] into its own 16-bit lane with a zero high
 * byte (presumably UTF-16LE widening, going by the function name).
 *
 * Each input word produces two output words: bytes 0 and 1 land in bits 0-7
 * and 16-23 of the first, bytes 2 and 3 in the same positions of the second.
 * in[0] and in[1] fill out1[0..3]; in[2] and in[3] fill out2[0..3].
 *
 * @param in   four packed input words
 * @param out1 receives the widened form of in[0] and in[1]
 * @param out2 receives the widened form of in[2] and in[3]
 */
inline void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
{
  // NOTE(review): the guard opening this branch was not present in the
  // damaged text. IS_NV is assumed because __byte_perm is an NVIDIA
  // intrinsic and the other branch is guarded by IS_AMD || IS_GENERIC.
  #ifdef IS_NV
  // Selector 0x7170 picks source bytes 0 and 1, 0x7372 picks bytes 2 and 3;
  // selector nibble 7 reads from the zero second operand.
  out2[3] = __byte_perm (in[3], 0, 0x7372);
  out2[2] = __byte_perm (in[3], 0, 0x7170);
  out2[1] = __byte_perm (in[2], 0, 0x7372);
  out2[0] = __byte_perm (in[2], 0, 0x7170);
  out1[3] = __byte_perm (in[1], 0, 0x7372);
  out1[2] = __byte_perm (in[1], 0, 0x7170);
  out1[1] = __byte_perm (in[0], 0, 0x7372);
  out1[0] = __byte_perm (in[0], 0, 0x7170);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  // Shift-and-mask form of the same byte selection.
  out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
  out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >>  0) & 0x000000FF);
  out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
  out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >>  0) & 0x000000FF);
  out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
  out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >>  0) & 0x000000FF);
  out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
  out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >>  0) & 0x000000FF);
  #endif
}
/**
 * Pack the low byte of every 16-bit lane of in1[0..3] and in2[0..3] into
 * out[0..3]; bytes 1 and 3 of each input word are discarded.
 *
 * Each output word is built from two consecutive input words: bytes 0 and 2
 * of the first become output bytes 0 and 1, bytes 0 and 2 of the second
 * become output bytes 2 and 3. in1 fills out[0..1], in2 fills out[2..3].
 *
 * @param in1 first four widened words
 * @param in2 second four widened words
 * @param out receives the four packed words
 */
inline void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
{
  // NOTE(review): the guard opening this branch was not present in the
  // damaged text. IS_NV is assumed because __byte_perm is an NVIDIA
  // intrinsic and the other branch is guarded by IS_AMD || IS_GENERIC.
  #ifdef IS_NV
  // Selector 0x6420 takes bytes 0 and 2 of each of the two operands.
  out[0] = __byte_perm (in1[0], in1[1], 0x6420);
  out[1] = __byte_perm (in1[2], in1[3], 0x6420);
  out[2] = __byte_perm (in2[0], in2[1], 0x6420);
  out[3] = __byte_perm (in2[2], in2[3], 0x6420);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  // Shift-and-mask form of the same byte selection.
  out[0] = ((in1[0] & 0x000000ff) >>  0) | ((in1[0] & 0x00ff0000) >> 8)
         | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
  out[1] = ((in1[2] & 0x000000ff) >>  0) | ((in1[2] & 0x00ff0000) >> 8)
         | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
  out[2] = ((in2[0] & 0x000000ff) >>  0) | ((in2[0] & 0x00ff0000) >> 8)
         | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
  out[3] = ((in2[2] & 0x000000ff) >>  0) | ((in2[2] & 0x00ff0000) >> 8)
         | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
  #endif
}
188 inline void append_0x01_1x4 (u32x w0
[4], const u32 offset
)
197 w0
[0] = w0
[0] | 0x0100;
201 w0
[0] = w0
[0] | 0x010000;
205 w0
[0] = w0
[0] | 0x01000000;
213 w0
[1] = w0
[1] | 0x0100;
217 w0
[1] = w0
[1] | 0x010000;
221 w0
[1] = w0
[1] | 0x01000000;
229 w0
[2] = w0
[2] | 0x0100;
233 w0
[2] = w0
[2] | 0x010000;
237 w0
[2] = w0
[2] | 0x01000000;
245 w0
[3] = w0
[3] | 0x0100;
249 w0
[3] = w0
[3] | 0x010000;
253 w0
[3] = w0
[3] | 0x01000000;
258 inline void append_0x01_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
267 w0
[0] = w0
[0] | 0x0100;
271 w0
[0] = w0
[0] | 0x010000;
275 w0
[0] = w0
[0] | 0x01000000;
283 w0
[1] = w0
[1] | 0x0100;
287 w0
[1] = w0
[1] | 0x010000;
291 w0
[1] = w0
[1] | 0x01000000;
299 w0
[2] = w0
[2] | 0x0100;
303 w0
[2] = w0
[2] | 0x010000;
307 w0
[2] = w0
[2] | 0x01000000;
315 w0
[3] = w0
[3] | 0x0100;
319 w0
[3] = w0
[3] | 0x010000;
323 w0
[3] = w0
[3] | 0x01000000;
331 w1
[0] = w1
[0] | 0x0100;
335 w1
[0] = w1
[0] | 0x010000;
339 w1
[0] = w1
[0] | 0x01000000;
347 w1
[1] = w1
[1] | 0x0100;
351 w1
[1] = w1
[1] | 0x010000;
355 w1
[1] = w1
[1] | 0x01000000;
363 w1
[2] = w1
[2] | 0x0100;
367 w1
[2] = w1
[2] | 0x010000;
371 w1
[2] = w1
[2] | 0x01000000;
379 w1
[3] = w1
[3] | 0x0100;
383 w1
[3] = w1
[3] | 0x010000;
387 w1
[3] = w1
[3] | 0x01000000;
392 inline void append_0x01_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
401 w0
[0] = w0
[0] | 0x0100;
405 w0
[0] = w0
[0] | 0x010000;
409 w0
[0] = w0
[0] | 0x01000000;
417 w0
[1] = w0
[1] | 0x0100;
421 w0
[1] = w0
[1] | 0x010000;
425 w0
[1] = w0
[1] | 0x01000000;
433 w0
[2] = w0
[2] | 0x0100;
437 w0
[2] = w0
[2] | 0x010000;
441 w0
[2] = w0
[2] | 0x01000000;
449 w0
[3] = w0
[3] | 0x0100;
453 w0
[3] = w0
[3] | 0x010000;
457 w0
[3] = w0
[3] | 0x01000000;
465 w1
[0] = w1
[0] | 0x0100;
469 w1
[0] = w1
[0] | 0x010000;
473 w1
[0] = w1
[0] | 0x01000000;
481 w1
[1] = w1
[1] | 0x0100;
485 w1
[1] = w1
[1] | 0x010000;
489 w1
[1] = w1
[1] | 0x01000000;
497 w1
[2] = w1
[2] | 0x0100;
501 w1
[2] = w1
[2] | 0x010000;
505 w1
[2] = w1
[2] | 0x01000000;
513 w1
[3] = w1
[3] | 0x0100;
517 w1
[3] = w1
[3] | 0x010000;
521 w1
[3] = w1
[3] | 0x01000000;
529 w2
[0] = w2
[0] | 0x0100;
533 w2
[0] = w2
[0] | 0x010000;
537 w2
[0] = w2
[0] | 0x01000000;
545 w2
[1] = w2
[1] | 0x0100;
549 w2
[1] = w2
[1] | 0x010000;
553 w2
[1] = w2
[1] | 0x01000000;
561 w2
[2] = w2
[2] | 0x0100;
565 w2
[2] = w2
[2] | 0x010000;
569 w2
[2] = w2
[2] | 0x01000000;
577 w2
[3] = w2
[3] | 0x0100;
581 w2
[3] = w2
[3] | 0x010000;
585 w2
[3] = w2
[3] | 0x01000000;
590 inline void append_0x01_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
599 w0
[0] = w0
[0] | 0x0100;
603 w0
[0] = w0
[0] | 0x010000;
607 w0
[0] = w0
[0] | 0x01000000;
615 w0
[1] = w0
[1] | 0x0100;
619 w0
[1] = w0
[1] | 0x010000;
623 w0
[1] = w0
[1] | 0x01000000;
631 w0
[2] = w0
[2] | 0x0100;
635 w0
[2] = w0
[2] | 0x010000;
639 w0
[2] = w0
[2] | 0x01000000;
647 w0
[3] = w0
[3] | 0x0100;
651 w0
[3] = w0
[3] | 0x010000;
655 w0
[3] = w0
[3] | 0x01000000;
663 w1
[0] = w1
[0] | 0x0100;
667 w1
[0] = w1
[0] | 0x010000;
671 w1
[0] = w1
[0] | 0x01000000;
679 w1
[1] = w1
[1] | 0x0100;
683 w1
[1] = w1
[1] | 0x010000;
687 w1
[1] = w1
[1] | 0x01000000;
695 w1
[2] = w1
[2] | 0x0100;
699 w1
[2] = w1
[2] | 0x010000;
703 w1
[2] = w1
[2] | 0x01000000;
711 w1
[3] = w1
[3] | 0x0100;
715 w1
[3] = w1
[3] | 0x010000;
719 w1
[3] = w1
[3] | 0x01000000;
727 w2
[0] = w2
[0] | 0x0100;
731 w2
[0] = w2
[0] | 0x010000;
735 w2
[0] = w2
[0] | 0x01000000;
743 w2
[1] = w2
[1] | 0x0100;
747 w2
[1] = w2
[1] | 0x010000;
751 w2
[1] = w2
[1] | 0x01000000;
759 w2
[2] = w2
[2] | 0x0100;
763 w2
[2] = w2
[2] | 0x010000;
767 w2
[2] = w2
[2] | 0x01000000;
775 w2
[3] = w2
[3] | 0x0100;
779 w2
[3] = w2
[3] | 0x010000;
783 w2
[3] = w2
[3] | 0x01000000;
791 w3
[0] = w3
[0] | 0x0100;
795 w3
[0] = w3
[0] | 0x010000;
799 w3
[0] = w3
[0] | 0x01000000;
807 w3
[1] = w3
[1] | 0x0100;
811 w3
[1] = w3
[1] | 0x010000;
815 w3
[1] = w3
[1] | 0x01000000;
823 w3
[2] = w3
[2] | 0x0100;
827 w3
[2] = w3
[2] | 0x010000;
831 w3
[2] = w3
[2] | 0x01000000;
839 w3
[3] = w3
[3] | 0x0100;
843 w3
[3] = w3
[3] | 0x010000;
847 w3
[3] = w3
[3] | 0x01000000;
852 inline void append_0x01_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
861 w0
[0] = w0
[0] | 0x0100;
865 w0
[0] = w0
[0] | 0x010000;
869 w0
[0] = w0
[0] | 0x01000000;
877 w0
[1] = w0
[1] | 0x0100;
881 w0
[1] = w0
[1] | 0x010000;
885 w0
[1] = w0
[1] | 0x01000000;
893 w0
[2] = w0
[2] | 0x0100;
897 w0
[2] = w0
[2] | 0x010000;
901 w0
[2] = w0
[2] | 0x01000000;
909 w0
[3] = w0
[3] | 0x0100;
913 w0
[3] = w0
[3] | 0x010000;
917 w0
[3] = w0
[3] | 0x01000000;
925 w1
[0] = w1
[0] | 0x0100;
929 w1
[0] = w1
[0] | 0x010000;
933 w1
[0] = w1
[0] | 0x01000000;
941 w1
[1] = w1
[1] | 0x0100;
945 w1
[1] = w1
[1] | 0x010000;
949 w1
[1] = w1
[1] | 0x01000000;
957 w1
[2] = w1
[2] | 0x0100;
961 w1
[2] = w1
[2] | 0x010000;
965 w1
[2] = w1
[2] | 0x01000000;
973 w1
[3] = w1
[3] | 0x0100;
977 w1
[3] = w1
[3] | 0x010000;
981 w1
[3] = w1
[3] | 0x01000000;
989 w2
[0] = w2
[0] | 0x0100;
993 w2
[0] = w2
[0] | 0x010000;
997 w2
[0] = w2
[0] | 0x01000000;
1005 w2
[1] = w2
[1] | 0x0100;
1009 w2
[1] = w2
[1] | 0x010000;
1013 w2
[1] = w2
[1] | 0x01000000;
1021 w2
[2] = w2
[2] | 0x0100;
1025 w2
[2] = w2
[2] | 0x010000;
1029 w2
[2] = w2
[2] | 0x01000000;
1037 w2
[3] = w2
[3] | 0x0100;
1041 w2
[3] = w2
[3] | 0x010000;
1045 w2
[3] = w2
[3] | 0x01000000;
1053 w3
[0] = w3
[0] | 0x0100;
1057 w3
[0] = w3
[0] | 0x010000;
1061 w3
[0] = w3
[0] | 0x01000000;
1069 w3
[1] = w3
[1] | 0x0100;
1073 w3
[1] = w3
[1] | 0x010000;
1077 w3
[1] = w3
[1] | 0x01000000;
1085 w3
[2] = w3
[2] | 0x0100;
1089 w3
[2] = w3
[2] | 0x010000;
1093 w3
[2] = w3
[2] | 0x01000000;
1101 w3
[3] = w3
[3] | 0x0100;
1105 w3
[3] = w3
[3] | 0x010000;
1109 w3
[3] = w3
[3] | 0x01000000;
1117 w4
[0] = w4
[0] | 0x0100;
1121 w4
[0] = w4
[0] | 0x010000;
1125 w4
[0] = w4
[0] | 0x01000000;
1133 w4
[1] = w4
[1] | 0x0100;
1137 w4
[1] = w4
[1] | 0x010000;
1141 w4
[1] = w4
[1] | 0x01000000;
1149 w4
[2] = w4
[2] | 0x0100;
1153 w4
[2] = w4
[2] | 0x010000;
1157 w4
[2] = w4
[2] | 0x01000000;
1165 w4
[3] = w4
[3] | 0x0100;
1169 w4
[3] = w4
[3] | 0x010000;
1173 w4
[3] = w4
[3] | 0x01000000;
1181 w5
[0] = w5
[0] | 0x0100;
1185 w5
[0] = w5
[0] | 0x010000;
1189 w5
[0] = w5
[0] | 0x01000000;
1197 w5
[1] = w5
[1] | 0x0100;
1201 w5
[1] = w5
[1] | 0x010000;
1205 w5
[1] = w5
[1] | 0x01000000;
1213 w5
[2] = w5
[2] | 0x0100;
1217 w5
[2] = w5
[2] | 0x010000;
1221 w5
[2] = w5
[2] | 0x01000000;
1229 w5
[3] = w5
[3] | 0x0100;
1233 w5
[3] = w5
[3] | 0x010000;
1237 w5
[3] = w5
[3] | 0x01000000;
1245 w6
[0] = w6
[0] | 0x0100;
1249 w6
[0] = w6
[0] | 0x010000;
1253 w6
[0] = w6
[0] | 0x01000000;
1261 w6
[1] = w6
[1] | 0x0100;
1265 w6
[1] = w6
[1] | 0x010000;
1269 w6
[1] = w6
[1] | 0x01000000;
1277 w6
[2] = w6
[2] | 0x0100;
1281 w6
[2] = w6
[2] | 0x010000;
1285 w6
[2] = w6
[2] | 0x01000000;
1293 w6
[3] = w6
[3] | 0x0100;
1297 w6
[3] = w6
[3] | 0x010000;
1301 w6
[3] = w6
[3] | 0x01000000;
1309 w7
[0] = w7
[0] | 0x0100;
1313 w7
[0] = w7
[0] | 0x010000;
1317 w7
[0] = w7
[0] | 0x01000000;
1325 w7
[1] = w7
[1] | 0x0100;
1329 w7
[1] = w7
[1] | 0x010000;
1333 w7
[1] = w7
[1] | 0x01000000;
1341 w7
[2] = w7
[2] | 0x0100;
1345 w7
[2] = w7
[2] | 0x010000;
1349 w7
[2] = w7
[2] | 0x01000000;
1357 w7
[3] = w7
[3] | 0x0100;
1361 w7
[3] = w7
[3] | 0x010000;
1365 w7
[3] = w7
[3] | 0x01000000;
1370 inline void append_0x02_1x4 (u32x w0
[4], const u32 offset
)
1379 w0
[0] = w0
[0] | 0x0200;
1383 w0
[0] = w0
[0] | 0x020000;
1387 w0
[0] = w0
[0] | 0x02000000;
1395 w0
[1] = w0
[1] | 0x0200;
1399 w0
[1] = w0
[1] | 0x020000;
1403 w0
[1] = w0
[1] | 0x02000000;
1411 w0
[2] = w0
[2] | 0x0200;
1415 w0
[2] = w0
[2] | 0x020000;
1419 w0
[2] = w0
[2] | 0x02000000;
1427 w0
[3] = w0
[3] | 0x0200;
1431 w0
[3] = w0
[3] | 0x020000;
1435 w0
[3] = w0
[3] | 0x02000000;
1440 inline void append_0x02_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
1449 w0
[0] = w0
[0] | 0x0200;
1453 w0
[0] = w0
[0] | 0x020000;
1457 w0
[0] = w0
[0] | 0x02000000;
1465 w0
[1] = w0
[1] | 0x0200;
1469 w0
[1] = w0
[1] | 0x020000;
1473 w0
[1] = w0
[1] | 0x02000000;
1481 w0
[2] = w0
[2] | 0x0200;
1485 w0
[2] = w0
[2] | 0x020000;
1489 w0
[2] = w0
[2] | 0x02000000;
1497 w0
[3] = w0
[3] | 0x0200;
1501 w0
[3] = w0
[3] | 0x020000;
1505 w0
[3] = w0
[3] | 0x02000000;
1513 w1
[0] = w1
[0] | 0x0200;
1517 w1
[0] = w1
[0] | 0x020000;
1521 w1
[0] = w1
[0] | 0x02000000;
1529 w1
[1] = w1
[1] | 0x0200;
1533 w1
[1] = w1
[1] | 0x020000;
1537 w1
[1] = w1
[1] | 0x02000000;
1545 w1
[2] = w1
[2] | 0x0200;
1549 w1
[2] = w1
[2] | 0x020000;
1553 w1
[2] = w1
[2] | 0x02000000;
1561 w1
[3] = w1
[3] | 0x0200;
1565 w1
[3] = w1
[3] | 0x020000;
1569 w1
[3] = w1
[3] | 0x02000000;
1574 inline void append_0x02_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
1583 w0
[0] = w0
[0] | 0x0200;
1587 w0
[0] = w0
[0] | 0x020000;
1591 w0
[0] = w0
[0] | 0x02000000;
1599 w0
[1] = w0
[1] | 0x0200;
1603 w0
[1] = w0
[1] | 0x020000;
1607 w0
[1] = w0
[1] | 0x02000000;
1615 w0
[2] = w0
[2] | 0x0200;
1619 w0
[2] = w0
[2] | 0x020000;
1623 w0
[2] = w0
[2] | 0x02000000;
1631 w0
[3] = w0
[3] | 0x0200;
1635 w0
[3] = w0
[3] | 0x020000;
1639 w0
[3] = w0
[3] | 0x02000000;
1647 w1
[0] = w1
[0] | 0x0200;
1651 w1
[0] = w1
[0] | 0x020000;
1655 w1
[0] = w1
[0] | 0x02000000;
1663 w1
[1] = w1
[1] | 0x0200;
1667 w1
[1] = w1
[1] | 0x020000;
1671 w1
[1] = w1
[1] | 0x02000000;
1679 w1
[2] = w1
[2] | 0x0200;
1683 w1
[2] = w1
[2] | 0x020000;
1687 w1
[2] = w1
[2] | 0x02000000;
1695 w1
[3] = w1
[3] | 0x0200;
1699 w1
[3] = w1
[3] | 0x020000;
1703 w1
[3] = w1
[3] | 0x02000000;
1711 w2
[0] = w2
[0] | 0x0200;
1715 w2
[0] = w2
[0] | 0x020000;
1719 w2
[0] = w2
[0] | 0x02000000;
1727 w2
[1] = w2
[1] | 0x0200;
1731 w2
[1] = w2
[1] | 0x020000;
1735 w2
[1] = w2
[1] | 0x02000000;
1743 w2
[2] = w2
[2] | 0x0200;
1747 w2
[2] = w2
[2] | 0x020000;
1751 w2
[2] = w2
[2] | 0x02000000;
1759 w2
[3] = w2
[3] | 0x0200;
1763 w2
[3] = w2
[3] | 0x020000;
1767 w2
[3] = w2
[3] | 0x02000000;
1772 inline void append_0x02_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
1781 w0
[0] = w0
[0] | 0x0200;
1785 w0
[0] = w0
[0] | 0x020000;
1789 w0
[0] = w0
[0] | 0x02000000;
1797 w0
[1] = w0
[1] | 0x0200;
1801 w0
[1] = w0
[1] | 0x020000;
1805 w0
[1] = w0
[1] | 0x02000000;
1813 w0
[2] = w0
[2] | 0x0200;
1817 w0
[2] = w0
[2] | 0x020000;
1821 w0
[2] = w0
[2] | 0x02000000;
1829 w0
[3] = w0
[3] | 0x0200;
1833 w0
[3] = w0
[3] | 0x020000;
1837 w0
[3] = w0
[3] | 0x02000000;
1845 w1
[0] = w1
[0] | 0x0200;
1849 w1
[0] = w1
[0] | 0x020000;
1853 w1
[0] = w1
[0] | 0x02000000;
1861 w1
[1] = w1
[1] | 0x0200;
1865 w1
[1] = w1
[1] | 0x020000;
1869 w1
[1] = w1
[1] | 0x02000000;
1877 w1
[2] = w1
[2] | 0x0200;
1881 w1
[2] = w1
[2] | 0x020000;
1885 w1
[2] = w1
[2] | 0x02000000;
1893 w1
[3] = w1
[3] | 0x0200;
1897 w1
[3] = w1
[3] | 0x020000;
1901 w1
[3] = w1
[3] | 0x02000000;
1909 w2
[0] = w2
[0] | 0x0200;
1913 w2
[0] = w2
[0] | 0x020000;
1917 w2
[0] = w2
[0] | 0x02000000;
1925 w2
[1] = w2
[1] | 0x0200;
1929 w2
[1] = w2
[1] | 0x020000;
1933 w2
[1] = w2
[1] | 0x02000000;
1941 w2
[2] = w2
[2] | 0x0200;
1945 w2
[2] = w2
[2] | 0x020000;
1949 w2
[2] = w2
[2] | 0x02000000;
1957 w2
[3] = w2
[3] | 0x0200;
1961 w2
[3] = w2
[3] | 0x020000;
1965 w2
[3] = w2
[3] | 0x02000000;
1973 w3
[0] = w3
[0] | 0x0200;
1977 w3
[0] = w3
[0] | 0x020000;
1981 w3
[0] = w3
[0] | 0x02000000;
1989 w3
[1] = w3
[1] | 0x0200;
1993 w3
[1] = w3
[1] | 0x020000;
1997 w3
[1] = w3
[1] | 0x02000000;
2005 w3
[2] = w3
[2] | 0x0200;
2009 w3
[2] = w3
[2] | 0x020000;
2013 w3
[2] = w3
[2] | 0x02000000;
2021 w3
[3] = w3
[3] | 0x0200;
2025 w3
[3] = w3
[3] | 0x020000;
2029 w3
[3] = w3
[3] | 0x02000000;
2034 inline void append_0x02_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
2043 w0
[0] = w0
[0] | 0x0200;
2047 w0
[0] = w0
[0] | 0x020000;
2051 w0
[0] = w0
[0] | 0x02000000;
2059 w0
[1] = w0
[1] | 0x0200;
2063 w0
[1] = w0
[1] | 0x020000;
2067 w0
[1] = w0
[1] | 0x02000000;
2075 w0
[2] = w0
[2] | 0x0200;
2079 w0
[2] = w0
[2] | 0x020000;
2083 w0
[2] = w0
[2] | 0x02000000;
2091 w0
[3] = w0
[3] | 0x0200;
2095 w0
[3] = w0
[3] | 0x020000;
2099 w0
[3] = w0
[3] | 0x02000000;
2107 w1
[0] = w1
[0] | 0x0200;
2111 w1
[0] = w1
[0] | 0x020000;
2115 w1
[0] = w1
[0] | 0x02000000;
2123 w1
[1] = w1
[1] | 0x0200;
2127 w1
[1] = w1
[1] | 0x020000;
2131 w1
[1] = w1
[1] | 0x02000000;
2139 w1
[2] = w1
[2] | 0x0200;
2143 w1
[2] = w1
[2] | 0x020000;
2147 w1
[2] = w1
[2] | 0x02000000;
2155 w1
[3] = w1
[3] | 0x0200;
2159 w1
[3] = w1
[3] | 0x020000;
2163 w1
[3] = w1
[3] | 0x02000000;
2171 w2
[0] = w2
[0] | 0x0200;
2175 w2
[0] = w2
[0] | 0x020000;
2179 w2
[0] = w2
[0] | 0x02000000;
2187 w2
[1] = w2
[1] | 0x0200;
2191 w2
[1] = w2
[1] | 0x020000;
2195 w2
[1] = w2
[1] | 0x02000000;
2203 w2
[2] = w2
[2] | 0x0200;
2207 w2
[2] = w2
[2] | 0x020000;
2211 w2
[2] = w2
[2] | 0x02000000;
2219 w2
[3] = w2
[3] | 0x0200;
2223 w2
[3] = w2
[3] | 0x020000;
2227 w2
[3] = w2
[3] | 0x02000000;
2235 w3
[0] = w3
[0] | 0x0200;
2239 w3
[0] = w3
[0] | 0x020000;
2243 w3
[0] = w3
[0] | 0x02000000;
2251 w3
[1] = w3
[1] | 0x0200;
2255 w3
[1] = w3
[1] | 0x020000;
2259 w3
[1] = w3
[1] | 0x02000000;
2267 w3
[2] = w3
[2] | 0x0200;
2271 w3
[2] = w3
[2] | 0x020000;
2275 w3
[2] = w3
[2] | 0x02000000;
2283 w3
[3] = w3
[3] | 0x0200;
2287 w3
[3] = w3
[3] | 0x020000;
2291 w3
[3] = w3
[3] | 0x02000000;
2299 w4
[0] = w4
[0] | 0x0200;
2303 w4
[0] = w4
[0] | 0x020000;
2307 w4
[0] = w4
[0] | 0x02000000;
2315 w4
[1] = w4
[1] | 0x0200;
2319 w4
[1] = w4
[1] | 0x020000;
2323 w4
[1] = w4
[1] | 0x02000000;
2331 w4
[2] = w4
[2] | 0x0200;
2335 w4
[2] = w4
[2] | 0x020000;
2339 w4
[2] = w4
[2] | 0x02000000;
2347 w4
[3] = w4
[3] | 0x0200;
2351 w4
[3] = w4
[3] | 0x020000;
2355 w4
[3] = w4
[3] | 0x02000000;
2363 w5
[0] = w5
[0] | 0x0200;
2367 w5
[0] = w5
[0] | 0x020000;
2371 w5
[0] = w5
[0] | 0x02000000;
2379 w5
[1] = w5
[1] | 0x0200;
2383 w5
[1] = w5
[1] | 0x020000;
2387 w5
[1] = w5
[1] | 0x02000000;
2395 w5
[2] = w5
[2] | 0x0200;
2399 w5
[2] = w5
[2] | 0x020000;
2403 w5
[2] = w5
[2] | 0x02000000;
2411 w5
[3] = w5
[3] | 0x0200;
2415 w5
[3] = w5
[3] | 0x020000;
2419 w5
[3] = w5
[3] | 0x02000000;
2427 w6
[0] = w6
[0] | 0x0200;
2431 w6
[0] = w6
[0] | 0x020000;
2435 w6
[0] = w6
[0] | 0x02000000;
2443 w6
[1] = w6
[1] | 0x0200;
2447 w6
[1] = w6
[1] | 0x020000;
2451 w6
[1] = w6
[1] | 0x02000000;
2459 w6
[2] = w6
[2] | 0x0200;
2463 w6
[2] = w6
[2] | 0x020000;
2467 w6
[2] = w6
[2] | 0x02000000;
2475 w6
[3] = w6
[3] | 0x0200;
2479 w6
[3] = w6
[3] | 0x020000;
2483 w6
[3] = w6
[3] | 0x02000000;
2491 w7
[0] = w7
[0] | 0x0200;
2495 w7
[0] = w7
[0] | 0x020000;
2499 w7
[0] = w7
[0] | 0x02000000;
2507 w7
[1] = w7
[1] | 0x0200;
2511 w7
[1] = w7
[1] | 0x020000;
2515 w7
[1] = w7
[1] | 0x02000000;
2523 w7
[2] = w7
[2] | 0x0200;
2527 w7
[2] = w7
[2] | 0x020000;
2531 w7
[2] = w7
[2] | 0x02000000;
2539 w7
[3] = w7
[3] | 0x0200;
2543 w7
[3] = w7
[3] | 0x020000;
2547 w7
[3] = w7
[3] | 0x02000000;
2552 inline void append_0x80_1x4 (u32x w0
[4], const u32 offset
)
2561 w0
[0] = w0
[0] | 0x8000;
2565 w0
[0] = w0
[0] | 0x800000;
2569 w0
[0] = w0
[0] | 0x80000000;
2577 w0
[1] = w0
[1] | 0x8000;
2581 w0
[1] = w0
[1] | 0x800000;
2585 w0
[1] = w0
[1] | 0x80000000;
2593 w0
[2] = w0
[2] | 0x8000;
2597 w0
[2] = w0
[2] | 0x800000;
2601 w0
[2] = w0
[2] | 0x80000000;
2609 w0
[3] = w0
[3] | 0x8000;
2613 w0
[3] = w0
[3] | 0x800000;
2617 w0
[3] = w0
[3] | 0x80000000;
2622 inline void append_0x80_2x4 (u32x w0
[4], u32x w1
[4], const u32 offset
)
2631 w0
[0] = w0
[0] | 0x8000;
2635 w0
[0] = w0
[0] | 0x800000;
2639 w0
[0] = w0
[0] | 0x80000000;
2647 w0
[1] = w0
[1] | 0x8000;
2651 w0
[1] = w0
[1] | 0x800000;
2655 w0
[1] = w0
[1] | 0x80000000;
2663 w0
[2] = w0
[2] | 0x8000;
2667 w0
[2] = w0
[2] | 0x800000;
2671 w0
[2] = w0
[2] | 0x80000000;
2679 w0
[3] = w0
[3] | 0x8000;
2683 w0
[3] = w0
[3] | 0x800000;
2687 w0
[3] = w0
[3] | 0x80000000;
2695 w1
[0] = w1
[0] | 0x8000;
2699 w1
[0] = w1
[0] | 0x800000;
2703 w1
[0] = w1
[0] | 0x80000000;
2711 w1
[1] = w1
[1] | 0x8000;
2715 w1
[1] = w1
[1] | 0x800000;
2719 w1
[1] = w1
[1] | 0x80000000;
2727 w1
[2] = w1
[2] | 0x8000;
2731 w1
[2] = w1
[2] | 0x800000;
2735 w1
[2] = w1
[2] | 0x80000000;
2743 w1
[3] = w1
[3] | 0x8000;
2747 w1
[3] = w1
[3] | 0x800000;
2751 w1
[3] = w1
[3] | 0x80000000;
2756 inline void append_0x80_3x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], const u32 offset
)
2765 w0
[0] = w0
[0] | 0x8000;
2769 w0
[0] = w0
[0] | 0x800000;
2773 w0
[0] = w0
[0] | 0x80000000;
2781 w0
[1] = w0
[1] | 0x8000;
2785 w0
[1] = w0
[1] | 0x800000;
2789 w0
[1] = w0
[1] | 0x80000000;
2797 w0
[2] = w0
[2] | 0x8000;
2801 w0
[2] = w0
[2] | 0x800000;
2805 w0
[2] = w0
[2] | 0x80000000;
2813 w0
[3] = w0
[3] | 0x8000;
2817 w0
[3] = w0
[3] | 0x800000;
2821 w0
[3] = w0
[3] | 0x80000000;
2829 w1
[0] = w1
[0] | 0x8000;
2833 w1
[0] = w1
[0] | 0x800000;
2837 w1
[0] = w1
[0] | 0x80000000;
2845 w1
[1] = w1
[1] | 0x8000;
2849 w1
[1] = w1
[1] | 0x800000;
2853 w1
[1] = w1
[1] | 0x80000000;
2861 w1
[2] = w1
[2] | 0x8000;
2865 w1
[2] = w1
[2] | 0x800000;
2869 w1
[2] = w1
[2] | 0x80000000;
2877 w1
[3] = w1
[3] | 0x8000;
2881 w1
[3] = w1
[3] | 0x800000;
2885 w1
[3] = w1
[3] | 0x80000000;
2893 w2
[0] = w2
[0] | 0x8000;
2897 w2
[0] = w2
[0] | 0x800000;
2901 w2
[0] = w2
[0] | 0x80000000;
2909 w2
[1] = w2
[1] | 0x8000;
2913 w2
[1] = w2
[1] | 0x800000;
2917 w2
[1] = w2
[1] | 0x80000000;
2925 w2
[2] = w2
[2] | 0x8000;
2929 w2
[2] = w2
[2] | 0x800000;
2933 w2
[2] = w2
[2] | 0x80000000;
2941 w2
[3] = w2
[3] | 0x8000;
2945 w2
[3] = w2
[3] | 0x800000;
2949 w2
[3] = w2
[3] | 0x80000000;
2954 inline void append_0x80_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
2963 w0
[0] = w0
[0] | 0x8000;
2967 w0
[0] = w0
[0] | 0x800000;
2971 w0
[0] = w0
[0] | 0x80000000;
2979 w0
[1] = w0
[1] | 0x8000;
2983 w0
[1] = w0
[1] | 0x800000;
2987 w0
[1] = w0
[1] | 0x80000000;
2995 w0
[2] = w0
[2] | 0x8000;
2999 w0
[2] = w0
[2] | 0x800000;
3003 w0
[2] = w0
[2] | 0x80000000;
3011 w0
[3] = w0
[3] | 0x8000;
3015 w0
[3] = w0
[3] | 0x800000;
3019 w0
[3] = w0
[3] | 0x80000000;
3027 w1
[0] = w1
[0] | 0x8000;
3031 w1
[0] = w1
[0] | 0x800000;
3035 w1
[0] = w1
[0] | 0x80000000;
3043 w1
[1] = w1
[1] | 0x8000;
3047 w1
[1] = w1
[1] | 0x800000;
3051 w1
[1] = w1
[1] | 0x80000000;
3059 w1
[2] = w1
[2] | 0x8000;
3063 w1
[2] = w1
[2] | 0x800000;
3067 w1
[2] = w1
[2] | 0x80000000;
3075 w1
[3] = w1
[3] | 0x8000;
3079 w1
[3] = w1
[3] | 0x800000;
3083 w1
[3] = w1
[3] | 0x80000000;
3091 w2
[0] = w2
[0] | 0x8000;
3095 w2
[0] = w2
[0] | 0x800000;
3099 w2
[0] = w2
[0] | 0x80000000;
3107 w2
[1] = w2
[1] | 0x8000;
3111 w2
[1] = w2
[1] | 0x800000;
3115 w2
[1] = w2
[1] | 0x80000000;
3123 w2
[2] = w2
[2] | 0x8000;
3127 w2
[2] = w2
[2] | 0x800000;
3131 w2
[2] = w2
[2] | 0x80000000;
3139 w2
[3] = w2
[3] | 0x8000;
3143 w2
[3] = w2
[3] | 0x800000;
3147 w2
[3] = w2
[3] | 0x80000000;
3155 w3
[0] = w3
[0] | 0x8000;
3159 w3
[0] = w3
[0] | 0x800000;
3163 w3
[0] = w3
[0] | 0x80000000;
3171 w3
[1] = w3
[1] | 0x8000;
3175 w3
[1] = w3
[1] | 0x800000;
3179 w3
[1] = w3
[1] | 0x80000000;
3187 w3
[2] = w3
[2] | 0x8000;
3191 w3
[2] = w3
[2] | 0x800000;
3195 w3
[2] = w3
[2] | 0x80000000;
3203 w3
[3] = w3
[3] | 0x8000;
3207 w3
[3] = w3
[3] | 0x800000;
3211 w3
[3] = w3
[3] | 0x80000000;
3216 inline void append_0x80_8x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], u32x w4
[4], u32x w5
[4], u32x w6
[4], u32x w7
[4], const u32 offset
)
3225 w0
[0] = w0
[0] | 0x8000;
3229 w0
[0] = w0
[0] | 0x800000;
3233 w0
[0] = w0
[0] | 0x80000000;
3241 w0
[1] = w0
[1] | 0x8000;
3245 w0
[1] = w0
[1] | 0x800000;
3249 w0
[1] = w0
[1] | 0x80000000;
3257 w0
[2] = w0
[2] | 0x8000;
3261 w0
[2] = w0
[2] | 0x800000;
3265 w0
[2] = w0
[2] | 0x80000000;
3273 w0
[3] = w0
[3] | 0x8000;
3277 w0
[3] = w0
[3] | 0x800000;
3281 w0
[3] = w0
[3] | 0x80000000;
3289 w1
[0] = w1
[0] | 0x8000;
3293 w1
[0] = w1
[0] | 0x800000;
3297 w1
[0] = w1
[0] | 0x80000000;
3305 w1
[1] = w1
[1] | 0x8000;
3309 w1
[1] = w1
[1] | 0x800000;
3313 w1
[1] = w1
[1] | 0x80000000;
3321 w1
[2] = w1
[2] | 0x8000;
3325 w1
[2] = w1
[2] | 0x800000;
3329 w1
[2] = w1
[2] | 0x80000000;
3337 w1
[3] = w1
[3] | 0x8000;
3341 w1
[3] = w1
[3] | 0x800000;
3345 w1
[3] = w1
[3] | 0x80000000;
3353 w2
[0] = w2
[0] | 0x8000;
3357 w2
[0] = w2
[0] | 0x800000;
3361 w2
[0] = w2
[0] | 0x80000000;
3369 w2
[1] = w2
[1] | 0x8000;
3373 w2
[1] = w2
[1] | 0x800000;
3377 w2
[1] = w2
[1] | 0x80000000;
3385 w2
[2] = w2
[2] | 0x8000;
3389 w2
[2] = w2
[2] | 0x800000;
3393 w2
[2] = w2
[2] | 0x80000000;
3401 w2
[3] = w2
[3] | 0x8000;
3405 w2
[3] = w2
[3] | 0x800000;
3409 w2
[3] = w2
[3] | 0x80000000;
3417 w3
[0] = w3
[0] | 0x8000;
3421 w3
[0] = w3
[0] | 0x800000;
3425 w3
[0] = w3
[0] | 0x80000000;
3433 w3
[1] = w3
[1] | 0x8000;
3437 w3
[1] = w3
[1] | 0x800000;
3441 w3
[1] = w3
[1] | 0x80000000;
3449 w3
[2] = w3
[2] | 0x8000;
3453 w3
[2] = w3
[2] | 0x800000;
3457 w3
[2] = w3
[2] | 0x80000000;
3465 w3
[3] = w3
[3] | 0x8000;
3469 w3
[3] = w3
[3] | 0x800000;
3473 w3
[3] = w3
[3] | 0x80000000;
3481 w4
[0] = w4
[0] | 0x8000;
3485 w4
[0] = w4
[0] | 0x800000;
3489 w4
[0] = w4
[0] | 0x80000000;
3497 w4
[1] = w4
[1] | 0x8000;
3501 w4
[1] = w4
[1] | 0x800000;
3505 w4
[1] = w4
[1] | 0x80000000;
3513 w4
[2] = w4
[2] | 0x8000;
3517 w4
[2] = w4
[2] | 0x800000;
3521 w4
[2] = w4
[2] | 0x80000000;
3529 w4
[3] = w4
[3] | 0x8000;
3533 w4
[3] = w4
[3] | 0x800000;
3537 w4
[3] = w4
[3] | 0x80000000;
3545 w5
[0] = w5
[0] | 0x8000;
3549 w5
[0] = w5
[0] | 0x800000;
3553 w5
[0] = w5
[0] | 0x80000000;
3561 w5
[1] = w5
[1] | 0x8000;
3565 w5
[1] = w5
[1] | 0x800000;
3569 w5
[1] = w5
[1] | 0x80000000;
3577 w5
[2] = w5
[2] | 0x8000;
3581 w5
[2] = w5
[2] | 0x800000;
3585 w5
[2] = w5
[2] | 0x80000000;
3593 w5
[3] = w5
[3] | 0x8000;
3597 w5
[3] = w5
[3] | 0x800000;
3601 w5
[3] = w5
[3] | 0x80000000;
3609 w6
[0] = w6
[0] | 0x8000;
3613 w6
[0] = w6
[0] | 0x800000;
3617 w6
[0] = w6
[0] | 0x80000000;
3625 w6
[1] = w6
[1] | 0x8000;
3629 w6
[1] = w6
[1] | 0x800000;
3633 w6
[1] = w6
[1] | 0x80000000;
3641 w6
[2] = w6
[2] | 0x8000;
3645 w6
[2] = w6
[2] | 0x800000;
3649 w6
[2] = w6
[2] | 0x80000000;
3657 w6
[3] = w6
[3] | 0x8000;
3661 w6
[3] = w6
[3] | 0x800000;
3665 w6
[3] = w6
[3] | 0x80000000;
3673 w7
[0] = w7
[0] | 0x8000;
3677 w7
[0] = w7
[0] | 0x800000;
3681 w7
[0] = w7
[0] | 0x80000000;
3689 w7
[1] = w7
[1] | 0x8000;
3693 w7
[1] = w7
[1] | 0x800000;
3697 w7
[1] = w7
[1] | 0x80000000;
3705 w7
[2] = w7
[2] | 0x8000;
3709 w7
[2] = w7
[2] | 0x800000;
3713 w7
[2] = w7
[2] | 0x80000000;
3721 w7
[3] = w7
[3] | 0x8000;
3725 w7
[3] = w7
[3] | 0x800000;
3729 w7
[3] = w7
[3] | 0x80000000;
3734 inline void append_0x80_1x16 (u32x w
[16], const u32 offset
)
3743 w
[ 0] = w
[ 0] | 0x8000;
3747 w
[ 0] = w
[ 0] | 0x800000;
3751 w
[ 0] = w
[ 0] | 0x80000000;
3759 w
[ 1] = w
[ 1] | 0x8000;
3763 w
[ 1] = w
[ 1] | 0x800000;
3767 w
[ 1] = w
[ 1] | 0x80000000;
3775 w
[ 2] = w
[ 2] | 0x8000;
3779 w
[ 2] = w
[ 2] | 0x800000;
3783 w
[ 2] = w
[ 2] | 0x80000000;
3791 w
[ 3] = w
[ 3] | 0x8000;
3795 w
[ 3] = w
[ 3] | 0x800000;
3799 w
[ 3] = w
[ 3] | 0x80000000;
3807 w
[ 4] = w
[ 4] | 0x8000;
3811 w
[ 4] = w
[ 4] | 0x800000;
3815 w
[ 4] = w
[ 4] | 0x80000000;
3823 w
[ 5] = w
[ 5] | 0x8000;
3827 w
[ 5] = w
[ 5] | 0x800000;
3831 w
[ 5] = w
[ 5] | 0x80000000;
3839 w
[ 6] = w
[ 6] | 0x8000;
3843 w
[ 6] = w
[ 6] | 0x800000;
3847 w
[ 6] = w
[ 6] | 0x80000000;
3855 w
[ 7] = w
[ 7] | 0x8000;
3859 w
[ 7] = w
[ 7] | 0x800000;
3863 w
[ 7] = w
[ 7] | 0x80000000;
3871 w
[ 8] = w
[ 8] | 0x8000;
3875 w
[ 8] = w
[ 8] | 0x800000;
3879 w
[ 8] = w
[ 8] | 0x80000000;
3887 w
[ 9] = w
[ 9] | 0x8000;
3891 w
[ 9] = w
[ 9] | 0x800000;
3895 w
[ 9] = w
[ 9] | 0x80000000;
3903 w
[10] = w
[10] | 0x8000;
3907 w
[10] = w
[10] | 0x800000;
3911 w
[10] = w
[10] | 0x80000000;
3919 w
[11] = w
[11] | 0x8000;
3923 w
[11] = w
[11] | 0x800000;
3927 w
[11] = w
[11] | 0x80000000;
3935 w
[12] = w
[12] | 0x8000;
3939 w
[12] = w
[12] | 0x800000;
3943 w
[12] = w
[12] | 0x80000000;
3951 w
[13] = w
[13] | 0x8000;
3955 w
[13] = w
[13] | 0x800000;
3959 w
[13] = w
[13] | 0x80000000;
3967 w
[14] = w
[14] | 0x8000;
3971 w
[14] = w
[14] | 0x800000;
3975 w
[14] = w
[14] | 0x80000000;
3983 w
[15] = w
[15] | 0x8000;
3987 w
[15] = w
[15] | 0x800000;
3991 w
[15] = w
[15] | 0x80000000;
3996 inline void switch_buffer_by_offset_le (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
3998 #if defined IS_AMD || defined IS_GENERIC
3999 const int offset_mod_4
= offset
& 3;
4001 const int offset_minus_4
= 4 - offset
;
4006 w3
[2] = amd_bytealign ( 0, w3
[1], offset_minus_4
);
4007 w3
[1] = amd_bytealign (w3
[1], w3
[0], offset_minus_4
);
4008 w3
[0] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4009 w2
[3] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4010 w2
[2] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4011 w2
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4012 w2
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4013 w1
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4014 w1
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4015 w1
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4016 w1
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4017 w0
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4018 w0
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4019 w0
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4020 w0
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4022 if (offset_mod_4
== 0)
4044 w3
[2] = amd_bytealign ( 0, w3
[0], offset_minus_4
);
4045 w3
[1] = amd_bytealign (w3
[0], w2
[3], offset_minus_4
);
4046 w3
[0] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4047 w2
[3] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4048 w2
[2] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4049 w2
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4050 w2
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4051 w1
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4052 w1
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4053 w1
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4054 w1
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4055 w0
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4056 w0
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4057 w0
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4060 if (offset_mod_4
== 0)
4081 w3
[2] = amd_bytealign ( 0, w2
[3], offset_minus_4
);
4082 w3
[1] = amd_bytealign (w2
[3], w2
[2], offset_minus_4
);
4083 w3
[0] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4084 w2
[3] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4085 w2
[2] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4086 w2
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4087 w2
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4088 w1
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4089 w1
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4090 w1
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4091 w1
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4092 w0
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4093 w0
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4097 if (offset_mod_4
== 0)
4117 w3
[2] = amd_bytealign ( 0, w2
[2], offset_minus_4
);
4118 w3
[1] = amd_bytealign (w2
[2], w2
[1], offset_minus_4
);
4119 w3
[0] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4120 w2
[3] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4121 w2
[2] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4122 w2
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4123 w2
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4124 w1
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4125 w1
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4126 w1
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4127 w1
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4128 w0
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4133 if (offset_mod_4
== 0)
4152 w3
[2] = amd_bytealign ( 0, w2
[1], offset_minus_4
);
4153 w3
[1] = amd_bytealign (w2
[1], w2
[0], offset_minus_4
);
4154 w3
[0] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4155 w2
[3] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4156 w2
[2] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4157 w2
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4158 w2
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4159 w1
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4160 w1
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4161 w1
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4162 w1
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4168 if (offset_mod_4
== 0)
4186 w3
[2] = amd_bytealign ( 0, w2
[0], offset_minus_4
);
4187 w3
[1] = amd_bytealign (w2
[0], w1
[3], offset_minus_4
);
4188 w3
[0] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4189 w2
[3] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4190 w2
[2] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4191 w2
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4192 w2
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4193 w1
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4194 w1
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4195 w1
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4202 if (offset_mod_4
== 0)
4219 w3
[2] = amd_bytealign ( 0, w1
[3], offset_minus_4
);
4220 w3
[1] = amd_bytealign (w1
[3], w1
[2], offset_minus_4
);
4221 w3
[0] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4222 w2
[3] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4223 w2
[2] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4224 w2
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4225 w2
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4226 w1
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4227 w1
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4235 if (offset_mod_4
== 0)
4251 w3
[2] = amd_bytealign ( 0, w1
[2], offset_minus_4
);
4252 w3
[1] = amd_bytealign (w1
[2], w1
[1], offset_minus_4
);
4253 w3
[0] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4254 w2
[3] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4255 w2
[2] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4256 w2
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4257 w2
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4258 w1
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4267 if (offset_mod_4
== 0)
4282 w3
[2] = amd_bytealign ( 0, w1
[1], offset_minus_4
);
4283 w3
[1] = amd_bytealign (w1
[1], w1
[0], offset_minus_4
);
4284 w3
[0] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4285 w2
[3] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4286 w2
[2] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4287 w2
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4288 w2
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4298 if (offset_mod_4
== 0)
4312 w3
[2] = amd_bytealign ( 0, w1
[0], offset_minus_4
);
4313 w3
[1] = amd_bytealign (w1
[0], w0
[3], offset_minus_4
);
4314 w3
[0] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4315 w2
[3] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4316 w2
[2] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4317 w2
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4328 if (offset_mod_4
== 0)
4341 w3
[2] = amd_bytealign ( 0, w0
[3], offset_minus_4
);
4342 w3
[1] = amd_bytealign (w0
[3], w0
[2], offset_minus_4
);
4343 w3
[0] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4344 w2
[3] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4345 w2
[2] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4357 if (offset_mod_4
== 0)
4369 w3
[2] = amd_bytealign ( 0, w0
[2], offset_minus_4
);
4370 w3
[1] = amd_bytealign (w0
[2], w0
[1], offset_minus_4
);
4371 w3
[0] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4372 w2
[3] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4385 if (offset_mod_4
== 0)
4396 w3
[2] = amd_bytealign ( 0, w0
[1], offset_minus_4
);
4397 w3
[1] = amd_bytealign (w0
[1], w0
[0], offset_minus_4
);
4398 w3
[0] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4412 if (offset_mod_4
== 0)
4422 w3
[2] = amd_bytealign ( 0, w0
[0], offset_minus_4
);
4423 w3
[1] = amd_bytealign (w0
[0], 0, offset_minus_4
);
4438 if (offset_mod_4
== 0)
4449 const int offset_minus_4
= 4 - (offset
% 4);
4451 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
4456 w3
[1] = __byte_perm (w3
[0], w3
[1], selector
);
4457 w3
[0] = __byte_perm (w2
[3], w3
[0], selector
);
4458 w2
[3] = __byte_perm (w2
[2], w2
[3], selector
);
4459 w2
[2] = __byte_perm (w2
[1], w2
[2], selector
);
4460 w2
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4461 w2
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4462 w1
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4463 w1
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4464 w1
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4465 w1
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4466 w0
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4467 w0
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4468 w0
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4469 w0
[0] = __byte_perm ( 0, w0
[0], selector
);
4474 w3
[1] = __byte_perm (w2
[3], w3
[0], selector
);
4475 w3
[0] = __byte_perm (w2
[2], w2
[3], selector
);
4476 w2
[3] = __byte_perm (w2
[1], w2
[2], selector
);
4477 w2
[2] = __byte_perm (w2
[0], w2
[1], selector
);
4478 w2
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4479 w2
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4480 w1
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4481 w1
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4482 w1
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4483 w1
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4484 w0
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4485 w0
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4486 w0
[1] = __byte_perm ( 0, w0
[0], selector
);
4492 w3
[1] = __byte_perm (w2
[2], w2
[3], selector
);
4493 w3
[0] = __byte_perm (w2
[1], w2
[2], selector
);
4494 w2
[3] = __byte_perm (w2
[0], w2
[1], selector
);
4495 w2
[2] = __byte_perm (w1
[3], w2
[0], selector
);
4496 w2
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4497 w2
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4498 w1
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4499 w1
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4500 w1
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4501 w1
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4502 w0
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4503 w0
[2] = __byte_perm ( 0, w0
[0], selector
);
4510 w3
[1] = __byte_perm (w2
[1], w2
[2], selector
);
4511 w3
[0] = __byte_perm (w2
[0], w2
[1], selector
);
4512 w2
[3] = __byte_perm (w1
[3], w2
[0], selector
);
4513 w2
[2] = __byte_perm (w1
[2], w1
[3], selector
);
4514 w2
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4515 w2
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4516 w1
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4517 w1
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4518 w1
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4519 w1
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4520 w0
[3] = __byte_perm ( 0, w0
[0], selector
);
4528 w3
[1] = __byte_perm (w2
[0], w2
[1], selector
);
4529 w3
[0] = __byte_perm (w1
[3], w2
[0], selector
);
4530 w2
[3] = __byte_perm (w1
[2], w1
[3], selector
);
4531 w2
[2] = __byte_perm (w1
[1], w1
[2], selector
);
4532 w2
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4533 w2
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4534 w1
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4535 w1
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4536 w1
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4537 w1
[0] = __byte_perm ( 0, w0
[0], selector
);
4546 w3
[1] = __byte_perm (w1
[3], w2
[0], selector
);
4547 w3
[0] = __byte_perm (w1
[2], w1
[3], selector
);
4548 w2
[3] = __byte_perm (w1
[1], w1
[2], selector
);
4549 w2
[2] = __byte_perm (w1
[0], w1
[1], selector
);
4550 w2
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4551 w2
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4552 w1
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4553 w1
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4554 w1
[1] = __byte_perm ( 0, w0
[0], selector
);
4564 w3
[1] = __byte_perm (w1
[2], w1
[3], selector
);
4565 w3
[0] = __byte_perm (w1
[1], w1
[2], selector
);
4566 w2
[3] = __byte_perm (w1
[0], w1
[1], selector
);
4567 w2
[2] = __byte_perm (w0
[3], w1
[0], selector
);
4568 w2
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4569 w2
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4570 w1
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4571 w1
[2] = __byte_perm ( 0, w0
[0], selector
);
4582 w3
[1] = __byte_perm (w1
[1], w1
[2], selector
);
4583 w3
[0] = __byte_perm (w1
[0], w1
[1], selector
);
4584 w2
[3] = __byte_perm (w0
[3], w1
[0], selector
);
4585 w2
[2] = __byte_perm (w0
[2], w0
[3], selector
);
4586 w2
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4587 w2
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4588 w1
[3] = __byte_perm ( 0, w0
[0], selector
);
4600 w3
[1] = __byte_perm (w1
[0], w1
[1], selector
);
4601 w3
[0] = __byte_perm (w0
[3], w1
[0], selector
);
4602 w2
[3] = __byte_perm (w0
[2], w0
[3], selector
);
4603 w2
[2] = __byte_perm (w0
[1], w0
[2], selector
);
4604 w2
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4605 w2
[0] = __byte_perm ( 0, w0
[0], selector
);
4618 w3
[1] = __byte_perm (w0
[3], w1
[0], selector
);
4619 w3
[0] = __byte_perm (w0
[2], w0
[3], selector
);
4620 w2
[3] = __byte_perm (w0
[1], w0
[2], selector
);
4621 w2
[2] = __byte_perm (w0
[0], w0
[1], selector
);
4622 w2
[1] = __byte_perm ( 0, w0
[0], selector
);
4636 w3
[1] = __byte_perm (w0
[2], w0
[3], selector
);
4637 w3
[0] = __byte_perm (w0
[1], w0
[2], selector
);
4638 w2
[3] = __byte_perm (w0
[0], w0
[1], selector
);
4639 w2
[2] = __byte_perm ( 0, w0
[0], selector
);
4654 w3
[1] = __byte_perm (w0
[1], w0
[2], selector
);
4655 w3
[0] = __byte_perm (w0
[0], w0
[1], selector
);
4656 w2
[3] = __byte_perm ( 0, w0
[0], selector
);
4672 w3
[1] = __byte_perm (w0
[0], w0
[1], selector
);
4673 w3
[0] = __byte_perm ( 0, w0
[0], selector
);
4690 w3
[1] = __byte_perm ( 0, w0
[0], selector
);
/**
 * switch_buffer_by_offset_be
 *
 * Rewrites, in place, the 32-bit words held in w0[0..3], w1[0..3], w2[0..3]
 * and w3[0..3]. Data flows from lower-indexed words towards higher-indexed
 * words: every destination word is built from a pair of words that sit at or
 * below it (or from one word and the constant 0 at the low end).
 *
 * Two variants are visible:
 *   - under `IS_AMD || IS_GENERIC`, words are combined with amd_bytealign and
 *     `offset` is passed through unchanged as the third argument;
 *   - a second variant combines words with __byte_perm and a `selector` that
 *     is derived from the low two bits of `offset`.
 *
 * Each variant consists of a series of statement groups. From one group to
 * the next the source operands move one word lower, so later groups carry
 * the data across a larger word distance and write fewer destination words.
 *
 * Within a group the assignments run from the highest destination word down
 * to the lowest, so every source word is read before it is overwritten.
 *
 * In the visible statements the amd_bytealign groups write up to w3[2], the
 * __byte_perm groups write up to w3[1], and w3[3] is never written.
 *
 * NOTE(review): the statements that choose which group runs (presumably a
 * switch on the word part of `offset`), the guard of the __byte_perm variant
 * and any zero-fill of the vacated low words are not visible here - confirm
 * against the complete file.
 *
 * w0, w1, w2, w3  the four 4-word buffers, modified in place
 * offset          shift amount; only `offset` and `offset & 3` are used in
 *                 the visible statements
 */
4710 inline void switch_buffer_by_offset_be (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32 offset
)
4712 #if defined IS_AMD || defined IS_GENERIC
// amd_bytealign variant, first group: each word is combined with its
// next-lower neighbour; w3[2] is built from w3[1] and the constant 0.
4716 w3
[2] = amd_bytealign (w3
[1], 0, offset
);
4717 w3
[1] = amd_bytealign (w3
[0], w3
[1], offset
);
4718 w3
[0] = amd_bytealign (w2
[3], w3
[0], offset
);
4719 w2
[3] = amd_bytealign (w2
[2], w2
[3], offset
);
4720 w2
[2] = amd_bytealign (w2
[1], w2
[2], offset
);
4721 w2
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4722 w2
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4723 w1
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4724 w1
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4725 w1
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4726 w1
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4727 w0
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4728 w0
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4729 w0
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4730 w0
[0] = amd_bytealign ( 0, w0
[0], offset
);
// Next group: same pattern with the source operands one word lower, so the
// lowest word written is w0[1]. The following groups continue this way.
4734 w3
[2] = amd_bytealign (w3
[0], 0, offset
);
4735 w3
[1] = amd_bytealign (w2
[3], w3
[0], offset
);
4736 w3
[0] = amd_bytealign (w2
[2], w2
[3], offset
);
4737 w2
[3] = amd_bytealign (w2
[1], w2
[2], offset
);
4738 w2
[2] = amd_bytealign (w2
[0], w2
[1], offset
);
4739 w2
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4740 w2
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4741 w1
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4742 w1
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4743 w1
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4744 w1
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4745 w0
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4746 w0
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4747 w0
[1] = amd_bytealign ( 0, w0
[0], offset
);
4752 w3
[2] = amd_bytealign (w2
[3], 0, offset
);
4753 w3
[1] = amd_bytealign (w2
[2], w2
[3], offset
);
4754 w3
[0] = amd_bytealign (w2
[1], w2
[2], offset
);
4755 w2
[3] = amd_bytealign (w2
[0], w2
[1], offset
);
4756 w2
[2] = amd_bytealign (w1
[3], w2
[0], offset
);
4757 w2
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4758 w2
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4759 w1
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4760 w1
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4761 w1
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4762 w1
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4763 w0
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4764 w0
[2] = amd_bytealign ( 0, w0
[0], offset
);
4770 w3
[2] = amd_bytealign (w2
[2], 0, offset
);
4771 w3
[1] = amd_bytealign (w2
[1], w2
[2], offset
);
4772 w3
[0] = amd_bytealign (w2
[0], w2
[1], offset
);
4773 w2
[3] = amd_bytealign (w1
[3], w2
[0], offset
);
4774 w2
[2] = amd_bytealign (w1
[2], w1
[3], offset
);
4775 w2
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4776 w2
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4777 w1
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4778 w1
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4779 w1
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4780 w1
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4781 w0
[3] = amd_bytealign ( 0, w0
[0], offset
);
4788 w3
[2] = amd_bytealign (w2
[1], 0, offset
);
4789 w3
[1] = amd_bytealign (w2
[0], w2
[1], offset
);
4790 w3
[0] = amd_bytealign (w1
[3], w2
[0], offset
);
4791 w2
[3] = amd_bytealign (w1
[2], w1
[3], offset
);
4792 w2
[2] = amd_bytealign (w1
[1], w1
[2], offset
);
4793 w2
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4794 w2
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4795 w1
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4796 w1
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4797 w1
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4798 w1
[0] = amd_bytealign ( 0, w0
[0], offset
);
4806 w3
[2] = amd_bytealign (w2
[0], 0, offset
);
4807 w3
[1] = amd_bytealign (w1
[3], w2
[0], offset
);
4808 w3
[0] = amd_bytealign (w1
[2], w1
[3], offset
);
4809 w2
[3] = amd_bytealign (w1
[1], w1
[2], offset
);
4810 w2
[2] = amd_bytealign (w1
[0], w1
[1], offset
);
4811 w2
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4812 w2
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4813 w1
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4814 w1
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4815 w1
[1] = amd_bytealign ( 0, w0
[0], offset
);
4824 w3
[2] = amd_bytealign (w1
[3], 0, offset
);
4825 w3
[1] = amd_bytealign (w1
[2], w1
[3], offset
);
4826 w3
[0] = amd_bytealign (w1
[1], w1
[2], offset
);
4827 w2
[3] = amd_bytealign (w1
[0], w1
[1], offset
);
4828 w2
[2] = amd_bytealign (w0
[3], w1
[0], offset
);
4829 w2
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4830 w2
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4831 w1
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4832 w1
[2] = amd_bytealign ( 0, w0
[0], offset
);
4842 w3
[2] = amd_bytealign (w1
[2], 0, offset
);
4843 w3
[1] = amd_bytealign (w1
[1], w1
[2], offset
);
4844 w3
[0] = amd_bytealign (w1
[0], w1
[1], offset
);
4845 w2
[3] = amd_bytealign (w0
[3], w1
[0], offset
);
4846 w2
[2] = amd_bytealign (w0
[2], w0
[3], offset
);
4847 w2
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4848 w2
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4849 w1
[3] = amd_bytealign ( 0, w0
[0], offset
);
4860 w3
[2] = amd_bytealign (w1
[1], 0, offset
);
4861 w3
[1] = amd_bytealign (w1
[0], w1
[1], offset
);
4862 w3
[0] = amd_bytealign (w0
[3], w1
[0], offset
);
4863 w2
[3] = amd_bytealign (w0
[2], w0
[3], offset
);
4864 w2
[2] = amd_bytealign (w0
[1], w0
[2], offset
);
4865 w2
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4866 w2
[0] = amd_bytealign ( 0, w0
[0], offset
);
4878 w3
[2] = amd_bytealign (w1
[0], 0, offset
);
4879 w3
[1] = amd_bytealign (w0
[3], w1
[0], offset
);
4880 w3
[0] = amd_bytealign (w0
[2], w0
[3], offset
);
4881 w2
[3] = amd_bytealign (w0
[1], w0
[2], offset
);
4882 w2
[2] = amd_bytealign (w0
[0], w0
[1], offset
);
4883 w2
[1] = amd_bytealign ( 0, w0
[0], offset
);
4896 w3
[2] = amd_bytealign (w0
[3], 0, offset
);
4897 w3
[1] = amd_bytealign (w0
[2], w0
[3], offset
);
4898 w3
[0] = amd_bytealign (w0
[1], w0
[2], offset
);
4899 w2
[3] = amd_bytealign (w0
[0], w0
[1], offset
);
4900 w2
[2] = amd_bytealign ( 0, w0
[0], offset
);
4914 w3
[2] = amd_bytealign (w0
[2], 0, offset
);
4915 w3
[1] = amd_bytealign (w0
[1], w0
[2], offset
);
4916 w3
[0] = amd_bytealign (w0
[0], w0
[1], offset
);
4917 w2
[3] = amd_bytealign ( 0, w0
[0], offset
);
4932 w3
[2] = amd_bytealign (w0
[1], 0, offset
);
4933 w3
[1] = amd_bytealign (w0
[0], w0
[1], offset
);
4934 w3
[0] = amd_bytealign ( 0, w0
[0], offset
);
4950 w3
[2] = amd_bytealign (w0
[0], 0, offset
);
4951 w3
[1] = amd_bytealign ( 0, w0
[0], offset
);
// __byte_perm variant: the selector is taken from the constant 0x76543210,
// shifted right by one hex digit per unit of (offset & 3) and truncated to
// the low 16 bits.
4970 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
// First __byte_perm group. Compared with the amd_bytealign groups the two
// word operands appear in the opposite order (higher-indexed word first).
4975 w3
[1] = __byte_perm (w3
[1], w3
[0], selector
);
4976 w3
[0] = __byte_perm (w3
[0], w2
[3], selector
);
4977 w2
[3] = __byte_perm (w2
[3], w2
[2], selector
);
4978 w2
[2] = __byte_perm (w2
[2], w2
[1], selector
);
4979 w2
[1] = __byte_perm (w2
[1], w2
[0], selector
);
4980 w2
[0] = __byte_perm (w2
[0], w1
[3], selector
);
4981 w1
[3] = __byte_perm (w1
[3], w1
[2], selector
);
4982 w1
[2] = __byte_perm (w1
[2], w1
[1], selector
);
4983 w1
[1] = __byte_perm (w1
[1], w1
[0], selector
);
4984 w1
[0] = __byte_perm (w1
[0], w0
[3], selector
);
4985 w0
[3] = __byte_perm (w0
[3], w0
[2], selector
);
4986 w0
[2] = __byte_perm (w0
[2], w0
[1], selector
);
4987 w0
[1] = __byte_perm (w0
[1], w0
[0], selector
);
4988 w0
[0] = __byte_perm (w0
[0], 0, selector
);
4992 w3
[1] = __byte_perm (w3
[0], w2
[3], selector
);
4993 w3
[0] = __byte_perm (w2
[3], w2
[2], selector
);
4994 w2
[3] = __byte_perm (w2
[2], w2
[1], selector
);
4995 w2
[2] = __byte_perm (w2
[1], w2
[0], selector
);
4996 w2
[1] = __byte_perm (w2
[0], w1
[3], selector
);
4997 w2
[0] = __byte_perm (w1
[3], w1
[2], selector
);
4998 w1
[3] = __byte_perm (w1
[2], w1
[1], selector
);
4999 w1
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5000 w1
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5001 w1
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5002 w0
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5003 w0
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5004 w0
[1] = __byte_perm (w0
[0], 0, selector
);
5009 w3
[1] = __byte_perm (w2
[3], w2
[2], selector
);
5010 w3
[0] = __byte_perm (w2
[2], w2
[1], selector
);
5011 w2
[3] = __byte_perm (w2
[1], w2
[0], selector
);
5012 w2
[2] = __byte_perm (w2
[0], w1
[3], selector
);
5013 w2
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5014 w2
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5015 w1
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5016 w1
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5017 w1
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5018 w1
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5019 w0
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5020 w0
[2] = __byte_perm (w0
[0], 0, selector
);
5026 w3
[1] = __byte_perm (w2
[2], w2
[1], selector
);
5027 w3
[0] = __byte_perm (w2
[1], w2
[0], selector
);
5028 w2
[3] = __byte_perm (w2
[0], w1
[3], selector
);
5029 w2
[2] = __byte_perm (w1
[3], w1
[2], selector
);
5030 w2
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5031 w2
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5032 w1
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5033 w1
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5034 w1
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5035 w1
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5036 w0
[3] = __byte_perm (w0
[0], 0, selector
);
5043 w3
[1] = __byte_perm (w2
[1], w2
[0], selector
);
5044 w3
[0] = __byte_perm (w2
[0], w1
[3], selector
);
5045 w2
[3] = __byte_perm (w1
[3], w1
[2], selector
);
5046 w2
[2] = __byte_perm (w1
[2], w1
[1], selector
);
5047 w2
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5048 w2
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5049 w1
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5050 w1
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5051 w1
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5052 w1
[0] = __byte_perm (w0
[0], 0, selector
);
5060 w3
[1] = __byte_perm (w2
[0], w1
[3], selector
);
5061 w3
[0] = __byte_perm (w1
[3], w1
[2], selector
);
5062 w2
[3] = __byte_perm (w1
[2], w1
[1], selector
);
5063 w2
[2] = __byte_perm (w1
[1], w1
[0], selector
);
5064 w2
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5065 w2
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5066 w1
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5067 w1
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5068 w1
[1] = __byte_perm (w0
[0], 0, selector
);
5077 w3
[1] = __byte_perm (w1
[3], w1
[2], selector
);
5078 w3
[0] = __byte_perm (w1
[2], w1
[1], selector
);
5079 w2
[3] = __byte_perm (w1
[1], w1
[0], selector
);
5080 w2
[2] = __byte_perm (w1
[0], w0
[3], selector
);
5081 w2
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5082 w2
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5083 w1
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5084 w1
[2] = __byte_perm (w0
[0], 0, selector
);
5094 w3
[1] = __byte_perm (w1
[2], w1
[1], selector
);
5095 w3
[0] = __byte_perm (w1
[1], w1
[0], selector
);
5096 w2
[3] = __byte_perm (w1
[0], w0
[3], selector
);
5097 w2
[2] = __byte_perm (w0
[3], w0
[2], selector
);
5098 w2
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5099 w2
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5100 w1
[3] = __byte_perm (w0
[0], 0, selector
);
5111 w3
[1] = __byte_perm (w1
[1], w1
[0], selector
);
5112 w3
[0] = __byte_perm (w1
[0], w0
[3], selector
);
5113 w2
[3] = __byte_perm (w0
[3], w0
[2], selector
);
5114 w2
[2] = __byte_perm (w0
[2], w0
[1], selector
);
5115 w2
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5116 w2
[0] = __byte_perm (w0
[0], 0, selector
);
5128 w3
[1] = __byte_perm (w1
[0], w0
[3], selector
);
5129 w3
[0] = __byte_perm (w0
[3], w0
[2], selector
);
5130 w2
[3] = __byte_perm (w0
[2], w0
[1], selector
);
5131 w2
[2] = __byte_perm (w0
[1], w0
[0], selector
);
5132 w2
[1] = __byte_perm (w0
[0], 0, selector
);
5145 w3
[1] = __byte_perm (w0
[3], w0
[2], selector
);
5146 w3
[0] = __byte_perm (w0
[2], w0
[1], selector
);
5147 w2
[3] = __byte_perm (w0
[1], w0
[0], selector
);
5148 w2
[2] = __byte_perm (w0
[0], 0, selector
);
5162 w3
[1] = __byte_perm (w0
[2], w0
[1], selector
);
5163 w3
[0] = __byte_perm (w0
[1], w0
[0], selector
);
5164 w2
[3] = __byte_perm (w0
[0], 0, selector
);
5179 w3
[1] = __byte_perm (w0
[1], w0
[0], selector
);
5180 w3
[0] = __byte_perm (w0
[0], 0, selector
);
5196 w3
[1] = __byte_perm (w0
[0], 0, selector
);
/**
 * overwrite_at_le
 *
 * Splices the 32-bit value w0 into the word array sw at the byte position
 * given by salt_len, counting bytes from the least-significant byte of
 * sw[0]. Bytes of sw below that position are kept; the four bytes starting
 * at it are replaced by w0.
 *
 * When salt_len is a multiple of 4 (visible: 12, 16, 20, 24, 28) a single
 * word is assigned. Otherwise two adjacent words are patched: the low bytes
 * of the first word and the high bytes of the second word are preserved.
 *
 * Two variants are visible: one under `cl_amd_media_ops` written with
 * amd_bytealign, and one written with plain mask/shift expressions. Both
 * list the same case labels and touch the same words.
 *
 * NOTE(review): the enclosing switch, the separator between the two
 * variants and the cases for salt_len 0, 4 and 8 are not visible here;
 * confirm against the complete file. No case above 31 is visible.
 *
 * sw        word array, modified in place (visible cases touch sw[0]..sw[8])
 * w0        value to write
 * salt_len  byte position at which w0 is written
 */
5215 inline void overwrite_at_le (u32x sw
[16], const u32x w0
, const u32 salt_len
)
5217 #if defined cl_amd_media_ops
// amd_bytealign variant. The first word is pre-shifted left and the second
// pre-shifted right by complementary amounts before being combined with w0.
5222 case 1: sw
[0] = amd_bytealign (w0
, sw
[0] << 24, 3);
5223 sw
[1] = amd_bytealign (sw
[1] >> 8, w0
, 3);
5225 case 2: sw
[0] = amd_bytealign (w0
, sw
[0] << 16, 2);
5226 sw
[1] = amd_bytealign (sw
[1] >> 16, w0
, 2);
5228 case 3: sw
[0] = amd_bytealign (w0
, sw
[0] << 8, 1);
5229 sw
[1] = amd_bytealign (sw
[1] >> 24, w0
, 1);
5233 case 5: sw
[1] = amd_bytealign (w0
, sw
[1] << 24, 3);
5234 sw
[2] = amd_bytealign (sw
[2] >> 8, w0
, 3);
5236 case 6: sw
[1] = amd_bytealign (w0
, sw
[1] << 16, 2);
5237 sw
[2] = amd_bytealign (sw
[2] >> 16, w0
, 2);
5239 case 7: sw
[1] = amd_bytealign (w0
, sw
[1] << 8, 1);
5240 sw
[2] = amd_bytealign (sw
[2] >> 24, w0
, 1);
5244 case 9: sw
[2] = amd_bytealign (w0
, sw
[2] << 24, 3);
5245 sw
[3] = amd_bytealign (sw
[3] >> 8, w0
, 3);
5247 case 10: sw
[2] = amd_bytealign (w0
, sw
[2] << 16, 2);
5248 sw
[3] = amd_bytealign (sw
[3] >> 16, w0
, 2);
5250 case 11: sw
[2] = amd_bytealign (w0
, sw
[2] << 8, 1);
5251 sw
[3] = amd_bytealign (sw
[3] >> 24, w0
, 1);
5253 case 12: sw
[3] = w0
;
5255 case 13: sw
[3] = amd_bytealign (w0
, sw
[3] << 24, 3);
5256 sw
[4] = amd_bytealign (sw
[4] >> 8, w0
, 3);
5258 case 14: sw
[3] = amd_bytealign (w0
, sw
[3] << 16, 2);
5259 sw
[4] = amd_bytealign (sw
[4] >> 16, w0
, 2);
5261 case 15: sw
[3] = amd_bytealign (w0
, sw
[3] << 8, 1);
5262 sw
[4] = amd_bytealign (sw
[4] >> 24, w0
, 1);
5264 case 16: sw
[4] = w0
;
5266 case 17: sw
[4] = amd_bytealign (w0
, sw
[4] << 24, 3);
5267 sw
[5] = amd_bytealign (sw
[5] >> 8, w0
, 3);
5269 case 18: sw
[4] = amd_bytealign (w0
, sw
[4] << 16, 2);
5270 sw
[5] = amd_bytealign (sw
[5] >> 16, w0
, 2);
5272 case 19: sw
[4] = amd_bytealign (w0
, sw
[4] << 8, 1);
5273 sw
[5] = amd_bytealign (sw
[5] >> 24, w0
, 1);
5275 case 20: sw
[5] = w0
;
5277 case 21: sw
[5] = amd_bytealign (w0
, sw
[5] << 24, 3);
5278 sw
[6] = amd_bytealign (sw
[6] >> 8, w0
, 3);
5280 case 22: sw
[5] = amd_bytealign (w0
, sw
[5] << 16, 2);
5281 sw
[6] = amd_bytealign (sw
[6] >> 16, w0
, 2);
5283 case 23: sw
[5] = amd_bytealign (w0
, sw
[5] << 8, 1);
5284 sw
[6] = amd_bytealign (sw
[6] >> 24, w0
, 1);
5286 case 24: sw
[6] = w0
;
5288 case 25: sw
[6] = amd_bytealign (w0
, sw
[6] << 24, 3);
5289 sw
[7] = amd_bytealign (sw
[7] >> 8, w0
, 3);
5291 case 26: sw
[6] = amd_bytealign (w0
, sw
[6] << 16, 2);
5292 sw
[7] = amd_bytealign (sw
[7] >> 16, w0
, 2);
5294 case 27: sw
[6] = amd_bytealign (w0
, sw
[6] << 8, 1);
5295 sw
[7] = amd_bytealign (sw
[7] >> 24, w0
, 1);
5297 case 28: sw
[7] = w0
;
5299 case 29: sw
[7] = amd_bytealign (w0
, sw
[7] << 24, 3);
5300 sw
[8] = amd_bytealign (sw
[8] >> 8, w0
, 3);
5302 case 30: sw
[7] = amd_bytealign (w0
, sw
[7] << 16, 2);
5303 sw
[8] = amd_bytealign (sw
[8] >> 16, w0
, 2);
5305 case 31: sw
[7] = amd_bytealign (w0
, sw
[7] << 8, 1);
5306 sw
[8] = amd_bytealign (sw
[8] >> 24, w0
, 1);
// Mask/shift variant. With k = salt_len % 4, the first word keeps its low k
// bytes and takes w0 shifted left by 8*k bits; the second word keeps its
// high (4 - k) bytes and takes w0 shifted right by 32 - 8*k bits.
5314 case 1: sw
[0] = (sw
[0] & 0x000000ff) | (w0
<< 8);
5315 sw
[1] = (sw
[1] & 0xffffff00) | (w0
>> 24);
5317 case 2: sw
[0] = (sw
[0] & 0x0000ffff) | (w0
<< 16);
5318 sw
[1] = (sw
[1] & 0xffff0000) | (w0
>> 16);
5320 case 3: sw
[0] = (sw
[0] & 0x00ffffff) | (w0
<< 24);
5321 sw
[1] = (sw
[1] & 0xff000000) | (w0
>> 8);
5325 case 5: sw
[1] = (sw
[1] & 0x000000ff) | (w0
<< 8);
5326 sw
[2] = (sw
[2] & 0xffffff00) | (w0
>> 24);
5328 case 6: sw
[1] = (sw
[1] & 0x0000ffff) | (w0
<< 16);
5329 sw
[2] = (sw
[2] & 0xffff0000) | (w0
>> 16);
5331 case 7: sw
[1] = (sw
[1] & 0x00ffffff) | (w0
<< 24);
5332 sw
[2] = (sw
[2] & 0xff000000) | (w0
>> 8);
5336 case 9: sw
[2] = (sw
[2] & 0x000000ff) | (w0
<< 8);
5337 sw
[3] = (sw
[3] & 0xffffff00) | (w0
>> 24);
5339 case 10: sw
[2] = (sw
[2] & 0x0000ffff) | (w0
<< 16);
5340 sw
[3] = (sw
[3] & 0xffff0000) | (w0
>> 16);
5342 case 11: sw
[2] = (sw
[2] & 0x00ffffff) | (w0
<< 24);
5343 sw
[3] = (sw
[3] & 0xff000000) | (w0
>> 8);
5345 case 12: sw
[3] = w0
;
5347 case 13: sw
[3] = (sw
[3] & 0x000000ff) | (w0
<< 8);
5348 sw
[4] = (sw
[4] & 0xffffff00) | (w0
>> 24);
5350 case 14: sw
[3] = (sw
[3] & 0x0000ffff) | (w0
<< 16);
5351 sw
[4] = (sw
[4] & 0xffff0000) | (w0
>> 16);
5353 case 15: sw
[3] = (sw
[3] & 0x00ffffff) | (w0
<< 24);
5354 sw
[4] = (sw
[4] & 0xff000000) | (w0
>> 8);
5356 case 16: sw
[4] = w0
;
5358 case 17: sw
[4] = (sw
[4] & 0x000000ff) | (w0
<< 8);
5359 sw
[5] = (sw
[5] & 0xffffff00) | (w0
>> 24);
5361 case 18: sw
[4] = (sw
[4] & 0x0000ffff) | (w0
<< 16);
5362 sw
[5] = (sw
[5] & 0xffff0000) | (w0
>> 16);
5364 case 19: sw
[4] = (sw
[4] & 0x00ffffff) | (w0
<< 24);
5365 sw
[5] = (sw
[5] & 0xff000000) | (w0
>> 8);
5367 case 20: sw
[5] = w0
;
5369 case 21: sw
[5] = (sw
[5] & 0x000000ff) | (w0
<< 8);
5370 sw
[6] = (sw
[6] & 0xffffff00) | (w0
>> 24);
5372 case 22: sw
[5] = (sw
[5] & 0x0000ffff) | (w0
<< 16);
5373 sw
[6] = (sw
[6] & 0xffff0000) | (w0
>> 16);
5375 case 23: sw
[5] = (sw
[5] & 0x00ffffff) | (w0
<< 24);
5376 sw
[6] = (sw
[6] & 0xff000000) | (w0
>> 8);
5378 case 24: sw
[6] = w0
;
5380 case 25: sw
[6] = (sw
[6] & 0x000000ff) | (w0
<< 8);
5381 sw
[7] = (sw
[7] & 0xffffff00) | (w0
>> 24);
5383 case 26: sw
[6] = (sw
[6] & 0x0000ffff) | (w0
<< 16);
5384 sw
[7] = (sw
[7] & 0xffff0000) | (w0
>> 16);
5386 case 27: sw
[6] = (sw
[6] & 0x00ffffff) | (w0
<< 24);
5387 sw
[7] = (sw
[7] & 0xff000000) | (w0
>> 8);
5389 case 28: sw
[7] = w0
;
5391 case 29: sw
[7] = (sw
[7] & 0x000000ff) | (w0
<< 8);
5392 sw
[8] = (sw
[8] & 0xffffff00) | (w0
>> 24);
5394 case 30: sw
[7] = (sw
[7] & 0x0000ffff) | (w0
<< 16);
5395 sw
[8] = (sw
[8] & 0xffff0000) | (w0
>> 16);
5397 case 31: sw
[7] = (sw
[7] & 0x00ffffff) | (w0
<< 24);
5398 sw
[8] = (sw
[8] & 0xff000000) | (w0
>> 8);
/**
 * overwrite_at_be
 *
 * Splices the 32-bit value w0 into the word array sw at the byte position
 * given by salt_len, counting bytes from the most-significant byte of
 * sw[0]. Bytes of sw before that position are kept; the four bytes starting
 * at it are replaced by w0.
 *
 * When salt_len is a multiple of 4 (visible: 12, 16, 20, 24, 28) a single
 * word is assigned. Otherwise, with k = salt_len % 4, the first word keeps
 * its high k bytes and takes w0 shifted right by 8*k bits, and the second
 * word keeps its low (4 - k) bytes and takes w0 shifted left by 32 - 8*k
 * bits.
 *
 * NOTE(review): the enclosing switch and the cases for salt_len 0, 4 and 8
 * are not visible here; confirm against the complete file. No case above 31
 * is visible.
 *
 * sw        word array, modified in place (visible cases touch sw[0]..sw[8])
 * w0        value to write
 * salt_len  byte position at which w0 is written
 */
5404 inline void overwrite_at_be (u32x sw
[16], const u32x w0
, const u32 salt_len
)
5406 // would be nice to have optimization based on amd_bytealign as with _le counterpart
// Masks are the mirror image of the _le variant: the kept bytes sit at the
// high end of the first word and at the low end of the second word.
5412 case 1: sw
[0] = (sw
[0] & 0xff000000) | (w0
>> 8);
5413 sw
[1] = (sw
[1] & 0x00ffffff) | (w0
<< 24);
5415 case 2: sw
[0] = (sw
[0] & 0xffff0000) | (w0
>> 16);
5416 sw
[1] = (sw
[1] & 0x0000ffff) | (w0
<< 16);
5418 case 3: sw
[0] = (sw
[0] & 0xffffff00) | (w0
>> 24);
5419 sw
[1] = (sw
[1] & 0x000000ff) | (w0
<< 8);
5423 case 5: sw
[1] = (sw
[1] & 0xff000000) | (w0
>> 8);
5424 sw
[2] = (sw
[2] & 0x00ffffff) | (w0
<< 24);
5426 case 6: sw
[1] = (sw
[1] & 0xffff0000) | (w0
>> 16);
5427 sw
[2] = (sw
[2] & 0x0000ffff) | (w0
<< 16);
5429 case 7: sw
[1] = (sw
[1] & 0xffffff00) | (w0
>> 24);
5430 sw
[2] = (sw
[2] & 0x000000ff) | (w0
<< 8);
5434 case 9: sw
[2] = (sw
[2] & 0xff000000) | (w0
>> 8);
5435 sw
[3] = (sw
[3] & 0x00ffffff) | (w0
<< 24);
5437 case 10: sw
[2] = (sw
[2] & 0xffff0000) | (w0
>> 16);
5438 sw
[3] = (sw
[3] & 0x0000ffff) | (w0
<< 16);
5440 case 11: sw
[2] = (sw
[2] & 0xffffff00) | (w0
>> 24);
5441 sw
[3] = (sw
[3] & 0x000000ff) | (w0
<< 8);
5443 case 12: sw
[3] = w0
;
5445 case 13: sw
[3] = (sw
[3] & 0xff000000) | (w0
>> 8);
5446 sw
[4] = (sw
[4] & 0x00ffffff) | (w0
<< 24);
5448 case 14: sw
[3] = (sw
[3] & 0xffff0000) | (w0
>> 16);
5449 sw
[4] = (sw
[4] & 0x0000ffff) | (w0
<< 16);
5451 case 15: sw
[3] = (sw
[3] & 0xffffff00) | (w0
>> 24);
5452 sw
[4] = (sw
[4] & 0x000000ff) | (w0
<< 8);
5454 case 16: sw
[4] = w0
;
5456 case 17: sw
[4] = (sw
[4] & 0xff000000) | (w0
>> 8);
5457 sw
[5] = (sw
[5] & 0x00ffffff) | (w0
<< 24);
5459 case 18: sw
[4] = (sw
[4] & 0xffff0000) | (w0
>> 16);
5460 sw
[5] = (sw
[5] & 0x0000ffff) | (w0
<< 16);
5462 case 19: sw
[4] = (sw
[4] & 0xffffff00) | (w0
>> 24);
5463 sw
[5] = (sw
[5] & 0x000000ff) | (w0
<< 8);
5465 case 20: sw
[5] = w0
;
5467 case 21: sw
[5] = (sw
[5] & 0xff000000) | (w0
>> 8);
5468 sw
[6] = (sw
[6] & 0x00ffffff) | (w0
<< 24);
5470 case 22: sw
[5] = (sw
[5] & 0xffff0000) | (w0
>> 16);
5471 sw
[6] = (sw
[6] & 0x0000ffff) | (w0
<< 16);
5473 case 23: sw
[5] = (sw
[5] & 0xffffff00) | (w0
>> 24);
5474 sw
[6] = (sw
[6] & 0x000000ff) | (w0
<< 8);
5476 case 24: sw
[6] = w0
;
5478 case 25: sw
[6] = (sw
[6] & 0xff000000) | (w0
>> 8);
5479 sw
[7] = (sw
[7] & 0x00ffffff) | (w0
<< 24);
5481 case 26: sw
[6] = (sw
[6] & 0xffff0000) | (w0
>> 16);
5482 sw
[7] = (sw
[7] & 0x0000ffff) | (w0
<< 16);
5484 case 27: sw
[6] = (sw
[6] & 0xffffff00) | (w0
>> 24);
5485 sw
[7] = (sw
[7] & 0x000000ff) | (w0
<< 8);
5487 case 28: sw
[7] = w0
;
5489 case 29: sw
[7] = (sw
[7] & 0xff000000) | (w0
>> 8);
5490 sw
[8] = (sw
[8] & 0x00ffffff) | (w0
<< 24);
5492 case 30: sw
[7] = (sw
[7] & 0xffff0000) | (w0
>> 16);
5493 sw
[8] = (sw
[8] & 0x0000ffff) | (w0
<< 16);
5495 case 31: sw
[7] = (sw
[7] & 0xffffff00) | (w0
>> 24);
5496 sw
[8] = (sw
[8] & 0x000000ff) | (w0
<< 8);
/*
 * overwrite_at_le_4x4: store the 32-bit value wx into the 64-byte buffer
 * formed by w0[0..3], w1[0..3], w2[0..3], w3[0..3], treating every word as
 * little-endian (byte 0 is the least significant byte of a word).
 *
 * For a case label N the 4 bytes starting at byte offset N are replaced by
 * wx. The bytes in front of N in the first touched word and the bytes behind
 * N + 3 in the second touched word are kept. When N is a multiple of 4 the
 * word is simply assigned wx.
 * NOTE(review): N is presumably salt_len - confirm against the enclosing
 * switch.
 *
 * Parameters:
 *   w0..w3   - the four 4-word blocks that are modified in place
 *   wx       - the value to store
 *   salt_len - presumably the byte offset of the store (see note above)
 *
 * Two variants of the same stores follow: one built on amd_bytealign when
 * cl_amd_media_ops is defined, and one built on plain masks and shifts.
 */
5501 inline void overwrite_at_le_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x wx
, const u32 salt_len
)
5503 #if defined cl_amd_media_ops
// amd_bytealign variant. Assuming the cl_amd_media_ops definition
// (concatenate both operands, shift right by the given number of bytes),
// each pair of calls yields the same words as the mask/shift variant.
5508 case 1: w0
[0] = amd_bytealign (wx
, w0
[0] << 24, 3);
5509 w0
[1] = amd_bytealign (w0
[1] >> 8, wx
, 3);
5511 case 2: w0
[0] = amd_bytealign (wx
, w0
[0] << 16, 2);
5512 w0
[1] = amd_bytealign (w0
[1] >> 16, wx
, 2);
5514 case 3: w0
[0] = amd_bytealign (wx
, w0
[0] << 8, 1);
5515 w0
[1] = amd_bytealign (w0
[1] >> 24, wx
, 1);
5519 case 5: w0
[1] = amd_bytealign (wx
, w0
[1] << 24, 3);
5520 w0
[2] = amd_bytealign (w0
[2] >> 8, wx
, 3);
5522 case 6: w0
[1] = amd_bytealign (wx
, w0
[1] << 16, 2);
5523 w0
[2] = amd_bytealign (w0
[2] >> 16, wx
, 2);
5525 case 7: w0
[1] = amd_bytealign (wx
, w0
[1] << 8, 1);
5526 w0
[2] = amd_bytealign (w0
[2] >> 24, wx
, 1);
5530 case 9: w0
[2] = amd_bytealign (wx
, w0
[2] << 24, 3);
5531 w0
[3] = amd_bytealign (w0
[3] >> 8, wx
, 3);
5533 case 10: w0
[2] = amd_bytealign (wx
, w0
[2] << 16, 2);
5534 w0
[3] = amd_bytealign (w0
[3] >> 16, wx
, 2);
5536 case 11: w0
[2] = amd_bytealign (wx
, w0
[2] << 8, 1);
5537 w0
[3] = amd_bytealign (w0
[3] >> 24, wx
, 1);
5539 case 12: w0
[3] = wx
;
5541 case 13: w0
[3] = amd_bytealign (wx
, w0
[3] << 24, 3);
5542 w1
[0] = amd_bytealign (w1
[0] >> 8, wx
, 3);
5544 case 14: w0
[3] = amd_bytealign (wx
, w0
[3] << 16, 2);
5545 w1
[0] = amd_bytealign (w1
[0] >> 16, wx
, 2);
5547 case 15: w0
[3] = amd_bytealign (wx
, w0
[3] << 8, 1);
5548 w1
[0] = amd_bytealign (w1
[0] >> 24, wx
, 1);
5550 case 16: w1
[0] = wx
;
5552 case 17: w1
[0] = amd_bytealign (wx
, w1
[0] << 24, 3);
5553 w1
[1] = amd_bytealign (w1
[1] >> 8, wx
, 3);
5555 case 18: w1
[0] = amd_bytealign (wx
, w1
[0] << 16, 2);
5556 w1
[1] = amd_bytealign (w1
[1] >> 16, wx
, 2);
5558 case 19: w1
[0] = amd_bytealign (wx
, w1
[0] << 8, 1);
5559 w1
[1] = amd_bytealign (w1
[1] >> 24, wx
, 1);
5561 case 20: w1
[1] = wx
;
5563 case 21: w1
[1] = amd_bytealign (wx
, w1
[1] << 24, 3);
5564 w1
[2] = amd_bytealign (w1
[2] >> 8, wx
, 3);
5566 case 22: w1
[1] = amd_bytealign (wx
, w1
[1] << 16, 2);
5567 w1
[2] = amd_bytealign (w1
[2] >> 16, wx
, 2);
5569 case 23: w1
[1] = amd_bytealign (wx
, w1
[1] << 8, 1);
5570 w1
[2] = amd_bytealign (w1
[2] >> 24, wx
, 1);
5572 case 24: w1
[2] = wx
;
5574 case 25: w1
[2] = amd_bytealign (wx
, w1
[2] << 24, 3);
5575 w1
[3] = amd_bytealign (w1
[3] >> 8, wx
, 3);
5577 case 26: w1
[2] = amd_bytealign (wx
, w1
[2] << 16, 2);
5578 w1
[3] = amd_bytealign (w1
[3] >> 16, wx
, 2);
5580 case 27: w1
[2] = amd_bytealign (wx
, w1
[2] << 8, 1);
5581 w1
[3] = amd_bytealign (w1
[3] >> 24, wx
, 1);
5583 case 28: w1
[3] = wx
;
5585 case 29: w1
[3] = amd_bytealign (wx
, w1
[3] << 24, 3);
5586 w2
[0] = amd_bytealign (w2
[0] >> 8, wx
, 3);
5588 case 30: w1
[3] = amd_bytealign (wx
, w1
[3] << 16, 2);
5589 w2
[0] = amd_bytealign (w2
[0] >> 16, wx
, 2);
5591 case 31: w1
[3] = amd_bytealign (wx
, w1
[3] << 8, 1);
5592 w2
[0] = amd_bytealign (w2
[0] >> 24, wx
, 1);
5594 case 32: w2
[0] = wx
;
5596 case 33: w2
[0] = amd_bytealign (wx
, w2
[0] << 24, 3);
5597 w2
[1] = amd_bytealign (w2
[1] >> 8, wx
, 3);
5599 case 34: w2
[0] = amd_bytealign (wx
, w2
[0] << 16, 2);
5600 w2
[1] = amd_bytealign (w2
[1] >> 16, wx
, 2);
5602 case 35: w2
[0] = amd_bytealign (wx
, w2
[0] << 8, 1);
5603 w2
[1] = amd_bytealign (w2
[1] >> 24, wx
, 1);
5605 case 36: w2
[1] = wx
;
5607 case 37: w2
[1] = amd_bytealign (wx
, w2
[1] << 24, 3);
5608 w2
[2] = amd_bytealign (w2
[2] >> 8, wx
, 3);
5610 case 38: w2
[1] = amd_bytealign (wx
, w2
[1] << 16, 2);
5611 w2
[2] = amd_bytealign (w2
[2] >> 16, wx
, 2);
5613 case 39: w2
[1] = amd_bytealign (wx
, w2
[1] << 8, 1);
5614 w2
[2] = amd_bytealign (w2
[2] >> 24, wx
, 1);
5616 case 40: w2
[2] = wx
;
5618 case 41: w2
[2] = amd_bytealign (wx
, w2
[2] << 24, 3);
5619 w2
[3] = amd_bytealign (w2
[3] >> 8, wx
, 3);
5621 case 42: w2
[2] = amd_bytealign (wx
, w2
[2] << 16, 2);
5622 w2
[3] = amd_bytealign (w2
[3] >> 16, wx
, 2);
5624 case 43: w2
[2] = amd_bytealign (wx
, w2
[2] << 8, 1);
5625 w2
[3] = amd_bytealign (w2
[3] >> 24, wx
, 1);
5627 case 44: w2
[3] = wx
;
5629 case 45: w2
[3] = amd_bytealign (wx
, w2
[3] << 24, 3);
5630 w3
[0] = amd_bytealign (w3
[0] >> 8, wx
, 3);
5632 case 46: w2
[3] = amd_bytealign (wx
, w2
[3] << 16, 2);
5633 w3
[0] = amd_bytealign (w3
[0] >> 16, wx
, 2);
5635 case 47: w2
[3] = amd_bytealign (wx
, w2
[3] << 8, 1);
5636 w3
[0] = amd_bytealign (w3
[0] >> 24, wx
, 1);
5638 case 48: w3
[0] = wx
;
5640 case 49: w3
[0] = amd_bytealign (wx
, w3
[0] << 24, 3);
5641 w3
[1] = amd_bytealign (w3
[1] >> 8, wx
, 3);
5643 case 50: w3
[0] = amd_bytealign (wx
, w3
[0] << 16, 2);
5644 w3
[1] = amd_bytealign (w3
[1] >> 16, wx
, 2);
5646 case 51: w3
[0] = amd_bytealign (wx
, w3
[0] << 8, 1);
5647 w3
[1] = amd_bytealign (w3
[1] >> 24, wx
, 1);
5649 case 52: w3
[1] = wx
;
5651 case 53: w3
[1] = amd_bytealign (wx
, w3
[1] << 24, 3);
5652 w3
[2] = amd_bytealign (w3
[2] >> 8, wx
, 3);
5654 case 54: w3
[1] = amd_bytealign (wx
, w3
[1] << 16, 2);
5655 w3
[2] = amd_bytealign (w3
[2] >> 16, wx
, 2);
5657 case 55: w3
[1] = amd_bytealign (wx
, w3
[1] << 8, 1);
5658 w3
[2] = amd_bytealign (w3
[2] >> 24, wx
, 1);
5660 case 56: w3
[2] = wx
;
5662 case 57: w3
[2] = amd_bytealign (wx
, w3
[2] << 24, 3);
5663 w3
[3] = amd_bytealign (w3
[3] >> 8, wx
, 3);
5665 case 58: w3
[2] = amd_bytealign (wx
, w3
[2] << 16, 2);
5666 w3
[3] = amd_bytealign (w3
[3] >> 16, wx
, 2);
5668 case 59: w3
[2] = amd_bytealign (wx
, w3
[2] << 8, 1);
5669 w3
[3] = amd_bytealign (w3
[3] >> 24, wx
, 1);
5671 case 60: w3
[3] = wx
;
// Offsets 61..63: only w3[3] is updated. The part of wx that would land in a
// following word (w4) is not stored; this function has no w4 argument, so the
// second store of each pair is left commented out.
5673 case 61: w3
[3] = amd_bytealign (wx
, w3
[3] << 24, 3);
5674 //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
5676 case 62: w3
[3] = amd_bytealign (wx
, w3
[3] << 16, 2);
5677 //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
5679 case 63: w3
[3] = amd_bytealign (wx
, w3
[3] << 8, 1);
5680 //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
// Mask/shift variant of the same stores.
// NOTE(review): presumably the fallback side of the cl_amd_media_ops check
// above - confirm.
// First word: keep the low (N mod 4) bytes, fill the rest from the low bytes
// of wx. Second word: take the remaining high bytes of wx into its low bytes
// and keep the rest.
5688 case 1: w0
[0] = (w0
[0] & 0x000000ff) | (wx
<< 8);
5689 w0
[1] = (w0
[1] & 0xffffff00) | (wx
>> 24);
5691 case 2: w0
[0] = (w0
[0] & 0x0000ffff) | (wx
<< 16);
5692 w0
[1] = (w0
[1] & 0xffff0000) | (wx
>> 16);
5694 case 3: w0
[0] = (w0
[0] & 0x00ffffff) | (wx
<< 24);
5695 w0
[1] = (w0
[1] & 0xff000000) | (wx
>> 8);
5699 case 5: w0
[1] = (w0
[1] & 0x000000ff) | (wx
<< 8);
5700 w0
[2] = (w0
[2] & 0xffffff00) | (wx
>> 24);
5702 case 6: w0
[1] = (w0
[1] & 0x0000ffff) | (wx
<< 16);
5703 w0
[2] = (w0
[2] & 0xffff0000) | (wx
>> 16);
5705 case 7: w0
[1] = (w0
[1] & 0x00ffffff) | (wx
<< 24);
5706 w0
[2] = (w0
[2] & 0xff000000) | (wx
>> 8);
5710 case 9: w0
[2] = (w0
[2] & 0x000000ff) | (wx
<< 8);
5711 w0
[3] = (w0
[3] & 0xffffff00) | (wx
>> 24);
5713 case 10: w0
[2] = (w0
[2] & 0x0000ffff) | (wx
<< 16);
5714 w0
[3] = (w0
[3] & 0xffff0000) | (wx
>> 16);
5716 case 11: w0
[2] = (w0
[2] & 0x00ffffff) | (wx
<< 24);
5717 w0
[3] = (w0
[3] & 0xff000000) | (wx
>> 8);
5719 case 12: w0
[3] = wx
;
5721 case 13: w0
[3] = (w0
[3] & 0x000000ff) | (wx
<< 8);
5722 w1
[0] = (w1
[0] & 0xffffff00) | (wx
>> 24);
5724 case 14: w0
[3] = (w0
[3] & 0x0000ffff) | (wx
<< 16);
5725 w1
[0] = (w1
[0] & 0xffff0000) | (wx
>> 16);
5727 case 15: w0
[3] = (w0
[3] & 0x00ffffff) | (wx
<< 24);
5728 w1
[0] = (w1
[0] & 0xff000000) | (wx
>> 8);
5730 case 16: w1
[0] = wx
;
5732 case 17: w1
[0] = (w1
[0] & 0x000000ff) | (wx
<< 8);
5733 w1
[1] = (w1
[1] & 0xffffff00) | (wx
>> 24);
5735 case 18: w1
[0] = (w1
[0] & 0x0000ffff) | (wx
<< 16);
5736 w1
[1] = (w1
[1] & 0xffff0000) | (wx
>> 16);
5738 case 19: w1
[0] = (w1
[0] & 0x00ffffff) | (wx
<< 24);
5739 w1
[1] = (w1
[1] & 0xff000000) | (wx
>> 8);
5741 case 20: w1
[1] = wx
;
5743 case 21: w1
[1] = (w1
[1] & 0x000000ff) | (wx
<< 8);
5744 w1
[2] = (w1
[2] & 0xffffff00) | (wx
>> 24);
5746 case 22: w1
[1] = (w1
[1] & 0x0000ffff) | (wx
<< 16);
5747 w1
[2] = (w1
[2] & 0xffff0000) | (wx
>> 16);
5749 case 23: w1
[1] = (w1
[1] & 0x00ffffff) | (wx
<< 24);
5750 w1
[2] = (w1
[2] & 0xff000000) | (wx
>> 8);
5752 case 24: w1
[2] = wx
;
5754 case 25: w1
[2] = (w1
[2] & 0x000000ff) | (wx
<< 8);
5755 w1
[3] = (w1
[3] & 0xffffff00) | (wx
>> 24);
5757 case 26: w1
[2] = (w1
[2] & 0x0000ffff) | (wx
<< 16);
5758 w1
[3] = (w1
[3] & 0xffff0000) | (wx
>> 16);
5760 case 27: w1
[2] = (w1
[2] & 0x00ffffff) | (wx
<< 24);
5761 w1
[3] = (w1
[3] & 0xff000000) | (wx
>> 8);
5763 case 28: w1
[3] = wx
;
5765 case 29: w1
[3] = (w1
[3] & 0x000000ff) | (wx
<< 8);
5766 w2
[0] = (w2
[0] & 0xffffff00) | (wx
>> 24);
5768 case 30: w1
[3] = (w1
[3] & 0x0000ffff) | (wx
<< 16);
5769 w2
[0] = (w2
[0] & 0xffff0000) | (wx
>> 16);
5771 case 31: w1
[3] = (w1
[3] & 0x00ffffff) | (wx
<< 24);
5772 w2
[0] = (w2
[0] & 0xff000000) | (wx
>> 8);
5774 case 32: w2
[0] = wx
;
5776 case 33: w2
[0] = (w2
[0] & 0x000000ff) | (wx
<< 8);
5777 w2
[1] = (w2
[1] & 0xffffff00) | (wx
>> 24);
5779 case 34: w2
[0] = (w2
[0] & 0x0000ffff) | (wx
<< 16);
5780 w2
[1] = (w2
[1] & 0xffff0000) | (wx
>> 16);
5782 case 35: w2
[0] = (w2
[0] & 0x00ffffff) | (wx
<< 24);
5783 w2
[1] = (w2
[1] & 0xff000000) | (wx
>> 8);
5785 case 36: w2
[1] = wx
;
5787 case 37: w2
[1] = (w2
[1] & 0x000000ff) | (wx
<< 8);
5788 w2
[2] = (w2
[2] & 0xffffff00) | (wx
>> 24);
5790 case 38: w2
[1] = (w2
[1] & 0x0000ffff) | (wx
<< 16);
5791 w2
[2] = (w2
[2] & 0xffff0000) | (wx
>> 16);
5793 case 39: w2
[1] = (w2
[1] & 0x00ffffff) | (wx
<< 24);
5794 w2
[2] = (w2
[2] & 0xff000000) | (wx
>> 8);
5796 case 40: w2
[2] = wx
;
5798 case 41: w2
[2] = (w2
[2] & 0x000000ff) | (wx
<< 8);
5799 w2
[3] = (w2
[3] & 0xffffff00) | (wx
>> 24);
5801 case 42: w2
[2] = (w2
[2] & 0x0000ffff) | (wx
<< 16);
5802 w2
[3] = (w2
[3] & 0xffff0000) | (wx
>> 16);
5804 case 43: w2
[2] = (w2
[2] & 0x00ffffff) | (wx
<< 24);
5805 w2
[3] = (w2
[3] & 0xff000000) | (wx
>> 8);
5807 case 44: w2
[3] = wx
;
5809 case 45: w2
[3] = (w2
[3] & 0x000000ff) | (wx
<< 8);
5810 w3
[0] = (w3
[0] & 0xffffff00) | (wx
>> 24);
5812 case 46: w2
[3] = (w2
[3] & 0x0000ffff) | (wx
<< 16);
5813 w3
[0] = (w3
[0] & 0xffff0000) | (wx
>> 16);
5815 case 47: w2
[3] = (w2
[3] & 0x00ffffff) | (wx
<< 24);
5816 w3
[0] = (w3
[0] & 0xff000000) | (wx
>> 8);
5818 case 48: w3
[0] = wx
;
5820 case 49: w3
[0] = (w3
[0] & 0x000000ff) | (wx
<< 8);
5821 w3
[1] = (w3
[1] & 0xffffff00) | (wx
>> 24);
5823 case 50: w3
[0] = (w3
[0] & 0x0000ffff) | (wx
<< 16);
5824 w3
[1] = (w3
[1] & 0xffff0000) | (wx
>> 16);
5826 case 51: w3
[0] = (w3
[0] & 0x00ffffff) | (wx
<< 24);
5827 w3
[1] = (w3
[1] & 0xff000000) | (wx
>> 8);
5829 case 52: w3
[1] = wx
;
5831 case 53: w3
[1] = (w3
[1] & 0x000000ff) | (wx
<< 8);
5832 w3
[2] = (w3
[2] & 0xffffff00) | (wx
>> 24);
5834 case 54: w3
[1] = (w3
[1] & 0x0000ffff) | (wx
<< 16);
5835 w3
[2] = (w3
[2] & 0xffff0000) | (wx
>> 16);
5837 case 55: w3
[1] = (w3
[1] & 0x00ffffff) | (wx
<< 24);
5838 w3
[2] = (w3
[2] & 0xff000000) | (wx
>> 8);
5840 case 56: w3
[2] = wx
;
5842 case 57: w3
[2] = (w3
[2] & 0x000000ff) | (wx
<< 8);
5843 w3
[3] = (w3
[3] & 0xffffff00) | (wx
>> 24);
5845 case 58: w3
[2] = (w3
[2] & 0x0000ffff) | (wx
<< 16);
5846 w3
[3] = (w3
[3] & 0xffff0000) | (wx
>> 16);
5848 case 59: w3
[2] = (w3
[2] & 0x00ffffff) | (wx
<< 24);
5849 w3
[3] = (w3
[3] & 0xff000000) | (wx
>> 8);
5851 case 60: w3
[3] = wx
;
// Offsets 61..63: as in the amd_bytealign variant, only w3[3] is updated and
// the spill into a following word (w4) stays commented out.
5853 case 61: w3
[3] = (w3
[3] & 0x000000ff) | (wx
<< 8);
5854 //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
5856 case 62: w3
[3] = (w3
[3] & 0x0000ffff) | (wx
<< 16);
5857 //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
5859 case 63: w3
[3] = (w3
[3] & 0x00ffffff) | (wx
<< 24);
5860 //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
/*
 * overwrite_at_be_4x4: store the 32-bit value wx into the 64-byte buffer
 * formed by w0[0..3], w1[0..3], w2[0..3], w3[0..3], treating every word as
 * big-endian (byte 0 is the most significant byte of a word).
 *
 * For a case label N the 4 bytes starting at byte offset N are replaced by
 * wx. The first touched word keeps its top (N mod 4) bytes and receives the
 * high bytes of wx below them. The second touched word receives the
 * remaining low bytes of wx in its top bytes and keeps the rest. When N is a
 * multiple of 4 the word is simply assigned wx.
 * NOTE(review): N is presumably salt_len - confirm against the enclosing
 * switch.
 *
 * Parameters:
 *   w0..w3   - the four 4-word blocks that are modified in place
 *   wx       - the value to store
 *   salt_len - presumably the byte offset of the store (see note above)
 */
5866 inline void overwrite_at_be_4x4 (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x wx
, const u32 salt_len
)
5868 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5874 case 1: w0
[0] = (w0
[0] & 0xff000000) | (wx
>> 8);
5875 w0
[1] = (w0
[1] & 0x00ffffff) | (wx
<< 24);
5877 case 2: w0
[0] = (w0
[0] & 0xffff0000) | (wx
>> 16);
5878 w0
[1] = (w0
[1] & 0x0000ffff) | (wx
<< 16);
5880 case 3: w0
[0] = (w0
[0] & 0xffffff00) | (wx
>> 24);
5881 w0
[1] = (w0
[1] & 0x000000ff) | (wx
<< 8);
5885 case 5: w0
[1] = (w0
[1] & 0xff000000) | (wx
>> 8);
5886 w0
[2] = (w0
[2] & 0x00ffffff) | (wx
<< 24);
5888 case 6: w0
[1] = (w0
[1] & 0xffff0000) | (wx
>> 16);
5889 w0
[2] = (w0
[2] & 0x0000ffff) | (wx
<< 16);
5891 case 7: w0
[1] = (w0
[1] & 0xffffff00) | (wx
>> 24);
5892 w0
[2] = (w0
[2] & 0x000000ff) | (wx
<< 8);
5896 case 9: w0
[2] = (w0
[2] & 0xff000000) | (wx
>> 8);
5897 w0
[3] = (w0
[3] & 0x00ffffff) | (wx
<< 24);
5899 case 10: w0
[2] = (w0
[2] & 0xffff0000) | (wx
>> 16);
5900 w0
[3] = (w0
[3] & 0x0000ffff) | (wx
<< 16);
5902 case 11: w0
[2] = (w0
[2] & 0xffffff00) | (wx
>> 24);
5903 w0
[3] = (w0
[3] & 0x000000ff) | (wx
<< 8);
5905 case 12: w0
[3] = wx
;
5907 case 13: w0
[3] = (w0
[3] & 0xff000000) | (wx
>> 8);
5908 w1
[0] = (w1
[0] & 0x00ffffff) | (wx
<< 24);
5910 case 14: w0
[3] = (w0
[3] & 0xffff0000) | (wx
>> 16);
5911 w1
[0] = (w1
[0] & 0x0000ffff) | (wx
<< 16);
5913 case 15: w0
[3] = (w0
[3] & 0xffffff00) | (wx
>> 24);
5914 w1
[0] = (w1
[0] & 0x000000ff) | (wx
<< 8);
5916 case 16: w1
[0] = wx
;
5918 case 17: w1
[0] = (w1
[0] & 0xff000000) | (wx
>> 8);
5919 w1
[1] = (w1
[1] & 0x00ffffff) | (wx
<< 24);
5921 case 18: w1
[0] = (w1
[0] & 0xffff0000) | (wx
>> 16);
5922 w1
[1] = (w1
[1] & 0x0000ffff) | (wx
<< 16);
5924 case 19: w1
[0] = (w1
[0] & 0xffffff00) | (wx
>> 24);
5925 w1
[1] = (w1
[1] & 0x000000ff) | (wx
<< 8);
5927 case 20: w1
[1] = wx
;
5929 case 21: w1
[1] = (w1
[1] & 0xff000000) | (wx
>> 8);
5930 w1
[2] = (w1
[2] & 0x00ffffff) | (wx
<< 24);
5932 case 22: w1
[1] = (w1
[1] & 0xffff0000) | (wx
>> 16);
5933 w1
[2] = (w1
[2] & 0x0000ffff) | (wx
<< 16);
5935 case 23: w1
[1] = (w1
[1] & 0xffffff00) | (wx
>> 24);
5936 w1
[2] = (w1
[2] & 0x000000ff) | (wx
<< 8);
5938 case 24: w1
[2] = wx
;
5940 case 25: w1
[2] = (w1
[2] & 0xff000000) | (wx
>> 8);
5941 w1
[3] = (w1
[3] & 0x00ffffff) | (wx
<< 24);
5943 case 26: w1
[2] = (w1
[2] & 0xffff0000) | (wx
>> 16);
5944 w1
[3] = (w1
[3] & 0x0000ffff) | (wx
<< 16);
5946 case 27: w1
[2] = (w1
[2] & 0xffffff00) | (wx
>> 24);
5947 w1
[3] = (w1
[3] & 0x000000ff) | (wx
<< 8);
5949 case 28: w1
[3] = wx
;
5951 case 29: w1
[3] = (w1
[3] & 0xff000000) | (wx
>> 8);
5952 w2
[0] = (w2
[0] & 0x00ffffff) | (wx
<< 24);
5954 case 30: w1
[3] = (w1
[3] & 0xffff0000) | (wx
>> 16);
5955 w2
[0] = (w2
[0] & 0x0000ffff) | (wx
<< 16);
5957 case 31: w1
[3] = (w1
[3] & 0xffffff00) | (wx
>> 24);
5958 w2
[0] = (w2
[0] & 0x000000ff) | (wx
<< 8);
5960 case 32: w2
[0] = wx
;
5962 case 33: w2
[0] = (w2
[0] & 0xff000000) | (wx
>> 8);
5963 w2
[1] = (w2
[1] & 0x00ffffff) | (wx
<< 24);
5965 case 34: w2
[0] = (w2
[0] & 0xffff0000) | (wx
>> 16);
5966 w2
[1] = (w2
[1] & 0x0000ffff) | (wx
<< 16);
5968 case 35: w2
[0] = (w2
[0] & 0xffffff00) | (wx
>> 24);
5969 w2
[1] = (w2
[1] & 0x000000ff) | (wx
<< 8);
5971 case 36: w2
[1] = wx
;
5973 case 37: w2
[1] = (w2
[1] & 0xff000000) | (wx
>> 8);
5974 w2
[2] = (w2
[2] & 0x00ffffff) | (wx
<< 24);
5976 case 38: w2
[1] = (w2
[1] & 0xffff0000) | (wx
>> 16);
5977 w2
[2] = (w2
[2] & 0x0000ffff) | (wx
<< 16);
5979 case 39: w2
[1] = (w2
[1] & 0xffffff00) | (wx
>> 24);
5980 w2
[2] = (w2
[2] & 0x000000ff) | (wx
<< 8);
5982 case 40: w2
[2] = wx
;
5984 case 41: w2
[2] = (w2
[2] & 0xff000000) | (wx
>> 8);
5985 w2
[3] = (w2
[3] & 0x00ffffff) | (wx
<< 24);
5987 case 42: w2
[2] = (w2
[2] & 0xffff0000) | (wx
>> 16);
5988 w2
[3] = (w2
[3] & 0x0000ffff) | (wx
<< 16);
5990 case 43: w2
[2] = (w2
[2] & 0xffffff00) | (wx
>> 24);
5991 w2
[3] = (w2
[3] & 0x000000ff) | (wx
<< 8);
5993 case 44: w2
[3] = wx
;
5995 case 45: w2
[3] = (w2
[3] & 0xff000000) | (wx
>> 8);
5996 w3
[0] = (w3
[0] & 0x00ffffff) | (wx
<< 24);
5998 case 46: w2
[3] = (w2
[3] & 0xffff0000) | (wx
>> 16);
5999 w3
[0] = (w3
[0] & 0x0000ffff) | (wx
<< 16);
6001 case 47: w2
[3] = (w2
[3] & 0xffffff00) | (wx
>> 24);
6002 w3
[0] = (w3
[0] & 0x000000ff) | (wx
<< 8);
6004 case 48: w3
[0] = wx
;
6006 case 49: w3
[0] = (w3
[0] & 0xff000000) | (wx
>> 8);
6007 w3
[1] = (w3
[1] & 0x00ffffff) | (wx
<< 24);
6009 case 50: w3
[0] = (w3
[0] & 0xffff0000) | (wx
>> 16);
6010 w3
[1] = (w3
[1] & 0x0000ffff) | (wx
<< 16);
6012 case 51: w3
[0] = (w3
[0] & 0xffffff00) | (wx
>> 24);
6013 w3
[1] = (w3
[1] & 0x000000ff) | (wx
<< 8);
6015 case 52: w3
[1] = wx
;
6017 case 53: w3
[1] = (w3
[1] & 0xff000000) | (wx
>> 8);
6018 w3
[2] = (w3
[2] & 0x00ffffff) | (wx
<< 24);
6020 case 54: w3
[1] = (w3
[1] & 0xffff0000) | (wx
>> 16);
6021 w3
[2] = (w3
[2] & 0x0000ffff) | (wx
<< 16);
6023 case 55: w3
[1] = (w3
[1] & 0xffffff00) | (wx
>> 24);
6024 w3
[2] = (w3
[2] & 0x000000ff) | (wx
<< 8);
6026 case 56: w3
[2] = wx
;
6028 case 57: w3
[2] = (w3
[2] & 0xff000000) | (wx
>> 8);
6029 w3
[3] = (w3
[3] & 0x00ffffff) | (wx
<< 24);
6031 case 58: w3
[2] = (w3
[2] & 0xffff0000) | (wx
>> 16);
6032 w3
[3] = (w3
[3] & 0x0000ffff) | (wx
<< 16);
6034 case 59: w3
[2] = (w3
[2] & 0xffffff00) | (wx
>> 24);
6035 w3
[3] = (w3
[3] & 0x000000ff) | (wx
<< 8);
6037 case 60: w3
[3] = wx
;
// Offsets 61..63: only w3[3] is updated. The part of wx that would land in a
// following word (w4) is not stored; this function has no w4 argument, so the
// second store of each pair is left commented out.
6039 case 61: w3
[3] = (w3
[3] & 0xff000000) | (wx
>> 8);
6040 //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
6042 case 62: w3
[3] = (w3
[3] & 0xffff0000) | (wx
>> 16);
6043 //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
6045 case 63: w3
[3] = (w3
[3] & 0xffffff00) | (wx
>> 24);
6046 //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
6052 * vector functions as scalar (for outer loop usage)
/*
 * append_0x01_2x4_S: OR a single 0x01 byte into the 32-byte buffer formed by
 * w0[0..3] followed by w1[0..3].
 *
 * The statements below cover byte positions 1, 2 and 3 of each word
 * (0x0100, 0x010000, 0x01000000), counting from the least significant byte.
 * NOTE(review): which statement runs is presumably chosen by offset, the
 * byte index into the buffer - confirm against the enclosing switch.
 * The marker is ORed in, so existing bits of the target word are preserved.
 *
 * NOTE(review): the parameters are typed u32x although the function carries
 * the _S suffix and the sibling append_0x80_*_S functions take u32 - confirm
 * this is intended.
 */
6055 inline void append_0x01_2x4_S (u32x w0
[4], u32x w1
[4], const u32 offset
)
6064 w0
[0] = w0
[0] | 0x0100;
6068 w0
[0] = w0
[0] | 0x010000;
6072 w0
[0] = w0
[0] | 0x01000000;
6080 w0
[1] = w0
[1] | 0x0100;
6084 w0
[1] = w0
[1] | 0x010000;
6088 w0
[1] = w0
[1] | 0x01000000;
6096 w0
[2] = w0
[2] | 0x0100;
6100 w0
[2] = w0
[2] | 0x010000;
6104 w0
[2] = w0
[2] | 0x01000000;
6112 w0
[3] = w0
[3] | 0x0100;
6116 w0
[3] = w0
[3] | 0x010000;
6120 w0
[3] = w0
[3] | 0x01000000;
6128 w1
[0] = w1
[0] | 0x0100;
6132 w1
[0] = w1
[0] | 0x010000;
6136 w1
[0] = w1
[0] | 0x01000000;
6144 w1
[1] = w1
[1] | 0x0100;
6148 w1
[1] = w1
[1] | 0x010000;
6152 w1
[1] = w1
[1] | 0x01000000;
6160 w1
[2] = w1
[2] | 0x0100;
6164 w1
[2] = w1
[2] | 0x010000;
6168 w1
[2] = w1
[2] | 0x01000000;
6176 w1
[3] = w1
[3] | 0x0100;
6180 w1
[3] = w1
[3] | 0x010000;
6184 w1
[3] = w1
[3] | 0x01000000;
/*
 * append_0x80_1x4_S: OR a single 0x80 byte into the 16-byte buffer w0[0..3].
 *
 * The statements below cover byte positions 1, 2 and 3 of each word
 * (0x8000, 0x800000, 0x80000000), counting from the least significant byte.
 * NOTE(review): which statement runs is presumably chosen by offset, the
 * byte index into the buffer - confirm against the enclosing switch.
 * The marker is ORed in, so existing bits of the target word are preserved.
 */
6189 inline void append_0x80_1x4_S (u32 w0
[4], const u32 offset
)
6198 w0
[0] = w0
[0] | 0x8000;
6202 w0
[0] = w0
[0] | 0x800000;
6206 w0
[0] = w0
[0] | 0x80000000;
6214 w0
[1] = w0
[1] | 0x8000;
6218 w0
[1] = w0
[1] | 0x800000;
6222 w0
[1] = w0
[1] | 0x80000000;
6230 w0
[2] = w0
[2] | 0x8000;
6234 w0
[2] = w0
[2] | 0x800000;
6238 w0
[2] = w0
[2] | 0x80000000;
6246 w0
[3] = w0
[3] | 0x8000;
6250 w0
[3] = w0
[3] | 0x800000;
6254 w0
[3] = w0
[3] | 0x80000000;
/*
 * append_0x80_2x4_S: OR a single 0x80 byte into the 32-byte buffer formed by
 * w0[0..3] followed by w1[0..3].
 *
 * The statements below cover byte positions 1, 2 and 3 of each word
 * (0x8000, 0x800000, 0x80000000), counting from the least significant byte.
 * NOTE(review): which statement runs is presumably chosen by offset, the
 * byte index into the buffer - confirm against the enclosing switch.
 * The marker is ORed in, so existing bits of the target word are preserved.
 */
6259 inline void append_0x80_2x4_S (u32 w0
[4], u32 w1
[4], const u32 offset
)
6268 w0
[0] = w0
[0] | 0x8000;
6272 w0
[0] = w0
[0] | 0x800000;
6276 w0
[0] = w0
[0] | 0x80000000;
6284 w0
[1] = w0
[1] | 0x8000;
6288 w0
[1] = w0
[1] | 0x800000;
6292 w0
[1] = w0
[1] | 0x80000000;
6300 w0
[2] = w0
[2] | 0x8000;
6304 w0
[2] = w0
[2] | 0x800000;
6308 w0
[2] = w0
[2] | 0x80000000;
6316 w0
[3] = w0
[3] | 0x8000;
6320 w0
[3] = w0
[3] | 0x800000;
6324 w0
[3] = w0
[3] | 0x80000000;
6332 w1
[0] = w1
[0] | 0x8000;
6336 w1
[0] = w1
[0] | 0x800000;
6340 w1
[0] = w1
[0] | 0x80000000;
6348 w1
[1] = w1
[1] | 0x8000;
6352 w1
[1] = w1
[1] | 0x800000;
6356 w1
[1] = w1
[1] | 0x80000000;
6364 w1
[2] = w1
[2] | 0x8000;
6368 w1
[2] = w1
[2] | 0x800000;
6372 w1
[2] = w1
[2] | 0x80000000;
6380 w1
[3] = w1
[3] | 0x8000;
6384 w1
[3] = w1
[3] | 0x800000;
6388 w1
[3] = w1
[3] | 0x80000000;
/*
 * append_0x80_3x4_S: OR a single 0x80 byte into the 48-byte buffer formed by
 * w0[0..3], w1[0..3] and w2[0..3], in that order.
 *
 * The statements below cover byte positions 1, 2 and 3 of each word
 * (0x8000, 0x800000, 0x80000000), counting from the least significant byte.
 * NOTE(review): which statement runs is presumably chosen by offset, the
 * byte index into the buffer - confirm against the enclosing switch.
 * The marker is ORed in, so existing bits of the target word are preserved.
 */
6393 inline void append_0x80_3x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], const u32 offset
)
6402 w0
[0] = w0
[0] | 0x8000;
6406 w0
[0] = w0
[0] | 0x800000;
6410 w0
[0] = w0
[0] | 0x80000000;
6418 w0
[1] = w0
[1] | 0x8000;
6422 w0
[1] = w0
[1] | 0x800000;
6426 w0
[1] = w0
[1] | 0x80000000;
6434 w0
[2] = w0
[2] | 0x8000;
6438 w0
[2] = w0
[2] | 0x800000;
6442 w0
[2] = w0
[2] | 0x80000000;
6450 w0
[3] = w0
[3] | 0x8000;
6454 w0
[3] = w0
[3] | 0x800000;
6458 w0
[3] = w0
[3] | 0x80000000;
6466 w1
[0] = w1
[0] | 0x8000;
6470 w1
[0] = w1
[0] | 0x800000;
6474 w1
[0] = w1
[0] | 0x80000000;
6482 w1
[1] = w1
[1] | 0x8000;
6486 w1
[1] = w1
[1] | 0x800000;
6490 w1
[1] = w1
[1] | 0x80000000;
6498 w1
[2] = w1
[2] | 0x8000;
6502 w1
[2] = w1
[2] | 0x800000;
6506 w1
[2] = w1
[2] | 0x80000000;
6514 w1
[3] = w1
[3] | 0x8000;
6518 w1
[3] = w1
[3] | 0x800000;
6522 w1
[3] = w1
[3] | 0x80000000;
6530 w2
[0] = w2
[0] | 0x8000;
6534 w2
[0] = w2
[0] | 0x800000;
6538 w2
[0] = w2
[0] | 0x80000000;
6546 w2
[1] = w2
[1] | 0x8000;
6550 w2
[1] = w2
[1] | 0x800000;
6554 w2
[1] = w2
[1] | 0x80000000;
6562 w2
[2] = w2
[2] | 0x8000;
6566 w2
[2] = w2
[2] | 0x800000;
6570 w2
[2] = w2
[2] | 0x80000000;
6578 w2
[3] = w2
[3] | 0x8000;
6582 w2
[3] = w2
[3] | 0x800000;
6586 w2
[3] = w2
[3] | 0x80000000;
/*
 * append_0x80_4x4_S: OR a single 0x80 byte into the 64-byte buffer formed by
 * w0[0..3], w1[0..3], w2[0..3] and w3[0..3], in that order.
 *
 * The statements below cover byte positions 1, 2 and 3 of each word
 * (0x8000, 0x800000, 0x80000000), counting from the least significant byte.
 * NOTE(review): which statement runs is presumably chosen by offset, the
 * byte index into the buffer - confirm against the enclosing switch.
 * The marker is ORed in, so existing bits of the target word are preserved.
 */
6591 inline void append_0x80_4x4_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6600 w0
[0] = w0
[0] | 0x8000;
6604 w0
[0] = w0
[0] | 0x800000;
6608 w0
[0] = w0
[0] | 0x80000000;
6616 w0
[1] = w0
[1] | 0x8000;
6620 w0
[1] = w0
[1] | 0x800000;
6624 w0
[1] = w0
[1] | 0x80000000;
6632 w0
[2] = w0
[2] | 0x8000;
6636 w0
[2] = w0
[2] | 0x800000;
6640 w0
[2] = w0
[2] | 0x80000000;
6648 w0
[3] = w0
[3] | 0x8000;
6652 w0
[3] = w0
[3] | 0x800000;
6656 w0
[3] = w0
[3] | 0x80000000;
6664 w1
[0] = w1
[0] | 0x8000;
6668 w1
[0] = w1
[0] | 0x800000;
6672 w1
[0] = w1
[0] | 0x80000000;
6680 w1
[1] = w1
[1] | 0x8000;
6684 w1
[1] = w1
[1] | 0x800000;
6688 w1
[1] = w1
[1] | 0x80000000;
6696 w1
[2] = w1
[2] | 0x8000;
6700 w1
[2] = w1
[2] | 0x800000;
6704 w1
[2] = w1
[2] | 0x80000000;
6712 w1
[3] = w1
[3] | 0x8000;
6716 w1
[3] = w1
[3] | 0x800000;
6720 w1
[3] = w1
[3] | 0x80000000;
6728 w2
[0] = w2
[0] | 0x8000;
6732 w2
[0] = w2
[0] | 0x800000;
6736 w2
[0] = w2
[0] | 0x80000000;
6744 w2
[1] = w2
[1] | 0x8000;
6748 w2
[1] = w2
[1] | 0x800000;
6752 w2
[1] = w2
[1] | 0x80000000;
6760 w2
[2] = w2
[2] | 0x8000;
6764 w2
[2] = w2
[2] | 0x800000;
6768 w2
[2] = w2
[2] | 0x80000000;
6776 w2
[3] = w2
[3] | 0x8000;
6780 w2
[3] = w2
[3] | 0x800000;
6784 w2
[3] = w2
[3] | 0x80000000;
6792 w3
[0] = w3
[0] | 0x8000;
6796 w3
[0] = w3
[0] | 0x800000;
6800 w3
[0] = w3
[0] | 0x80000000;
6808 w3
[1] = w3
[1] | 0x8000;
6812 w3
[1] = w3
[1] | 0x800000;
6816 w3
[1] = w3
[1] | 0x80000000;
6824 w3
[2] = w3
[2] | 0x8000;
6828 w3
[2] = w3
[2] | 0x800000;
6832 w3
[2] = w3
[2] | 0x80000000;
6840 w3
[3] = w3
[3] | 0x8000;
6844 w3
[3] = w3
[3] | 0x800000;
6848 w3
[3] = w3
[3] | 0x80000000;
/*
 * truncate_block_S: mask the 16-byte block w[0..3] in place.
 *
 * For each case label N shown here (N not a multiple of 4), the word that
 * holds byte N is ANDed with a mask that keeps only its low (N mod 4) bytes,
 * counting from the least significant byte.
 * NOTE(review): N is presumably len, the number of leading bytes to keep -
 * confirm against the enclosing switch.
 */
6853 inline void truncate_block_S (u32 w
[4], const u32 len
)
6862 case 1: w
[0] &= 0x000000FF;
6867 case 2: w
[0] &= 0x0000FFFF;
6872 case 3: w
[0] &= 0x00FFFFFF;
6881 case 5: w
[1] &= 0x000000FF;
6885 case 6: w
[1] &= 0x0000FFFF;
6889 case 7: w
[1] &= 0x00FFFFFF;
6896 case 9: w
[2] &= 0x000000FF;
6899 case 10: w
[2] &= 0x0000FFFF;
6902 case 11: w
[2] &= 0x00FFFFFF;
6907 case 13: w
[3] &= 0x000000FF;
6909 case 14: w
[3] &= 0x0000FFFF;
6911 case 15: w
[3] &= 0x00FFFFFF;
/*
 * make_unicode_S: widen the 16 bytes held in in[0..3] to 16-bit units whose
 * high byte is zero, producing 32 bytes in out1[0..3] followed by out2[0..3].
 *
 * Every input word yields two output words: its bytes 0 and 1 go to the
 * even-indexed output word and its bytes 2 and 3 to the odd-indexed one. In
 * both cases they land in byte positions 0 and 2 (bytes counted from the
 * least significant end) and positions 1 and 3 are zero.
 * in[0] and in[1] fill out1, in[2] and in[3] fill out2.
 *
 * Two variants are present: one using __byte_perm_S and one using shifts and
 * masks for IS_AMD / IS_GENERIC.
 */
6916 inline void make_unicode_S (const u32 in
[4], u32 out1
[4], u32 out2
[4])
// __byte_perm_S variant. Assuming CUDA __byte_perm selector semantics,
// 0x7170 picks input bytes 0 and 1 and 0x7372 picks bytes 2 and 3, with a
// zero byte (taken from the second operand) after each.
// NOTE(review): presumably guarded by an NVIDIA-specific #if - confirm.
6919 out2
[3] = __byte_perm_S (in
[3], 0, 0x7372);
6920 out2
[2] = __byte_perm_S (in
[3], 0, 0x7170);
6921 out2
[1] = __byte_perm_S (in
[2], 0, 0x7372);
6922 out2
[0] = __byte_perm_S (in
[2], 0, 0x7170);
6923 out1
[3] = __byte_perm_S (in
[1], 0, 0x7372);
6924 out1
[2] = __byte_perm_S (in
[1], 0, 0x7170);
6925 out1
[1] = __byte_perm_S (in
[0], 0, 0x7372);
6926 out1
[0] = __byte_perm_S (in
[0], 0, 0x7170);
// Shift/mask variant: the first term moves one source byte to byte position
// 2, the second term moves the other source byte to byte position 0.
6929 #if defined IS_AMD || defined IS_GENERIC
6930 out2
[3] = ((in
[3] >> 8) & 0x00FF0000) | ((in
[3] >> 16) & 0x000000FF);
6931 out2
[2] = ((in
[3] << 8) & 0x00FF0000) | ((in
[3] >> 0) & 0x000000FF);
6932 out2
[1] = ((in
[2] >> 8) & 0x00FF0000) | ((in
[2] >> 16) & 0x000000FF);
6933 out2
[0] = ((in
[2] << 8) & 0x00FF0000) | ((in
[2] >> 0) & 0x000000FF);
6934 out1
[3] = ((in
[1] >> 8) & 0x00FF0000) | ((in
[1] >> 16) & 0x000000FF);
6935 out1
[2] = ((in
[1] << 8) & 0x00FF0000) | ((in
[1] >> 0) & 0x000000FF);
6936 out1
[1] = ((in
[0] >> 8) & 0x00FF0000) | ((in
[0] >> 16) & 0x000000FF);
6937 out1
[0] = ((in
[0] << 8) & 0x00FF0000) | ((in
[0] >> 0) & 0x000000FF);
/*
 * undo_unicode_S: pack the 32 bytes held in in1[0..3] followed by in2[0..3]
 * into the 16 bytes of out[0..3] by keeping only byte positions 0 and 2 of
 * every input word (bytes counted from the least significant end). Byte
 * positions 1 and 3 of the inputs are discarded.
 *
 * Each output word is built from two consecutive input words: the first
 * contributes output bytes 0 and 1, the second output bytes 2 and 3.
 * in1 fills out[0] and out[1], in2 fills out[2] and out[3].
 *
 * Two variants are present: one using __byte_perm_S and one using shifts and
 * masks for IS_AMD / IS_GENERIC.
 */
6941 inline void undo_unicode_S (const u32 in1
[4], const u32 in2
[4], u32 out
[4])
// __byte_perm_S variant. Assuming CUDA __byte_perm selector semantics,
// 0x6420 picks bytes 0 and 2 of the first operand followed by bytes 0 and 2
// of the second operand.
// NOTE(review): presumably guarded by an NVIDIA-specific #if - confirm.
6944 out
[0] = __byte_perm_S (in1
[0], in1
[1], 0x6420);
6945 out
[1] = __byte_perm_S (in1
[2], in1
[3], 0x6420);
6946 out
[2] = __byte_perm_S (in2
[0], in2
[1], 0x6420);
6947 out
[3] = __byte_perm_S (in2
[2], in2
[3], 0x6420);
// Shift/mask variant: four terms per output word, one per kept input byte.
6950 #if defined IS_AMD || defined IS_GENERIC
6951 out
[0] = ((in1
[0] & 0x000000ff) >> 0) | ((in1
[0] & 0x00ff0000) >> 8)
6952 | ((in1
[1] & 0x000000ff) << 16) | ((in1
[1] & 0x00ff0000) << 8);
6953 out
[1] = ((in1
[2] & 0x000000ff) >> 0) | ((in1
[2] & 0x00ff0000) >> 8)
6954 | ((in1
[3] & 0x000000ff) << 16) | ((in1
[3] & 0x00ff0000) << 8);
6955 out
[2] = ((in2
[0] & 0x000000ff) >> 0) | ((in2
[0] & 0x00ff0000) >> 8)
6956 | ((in2
[1] & 0x000000ff) << 16) | ((in2
[1] & 0x00ff0000) << 8);
6957 out
[3] = ((in2
[2] & 0x000000ff) >> 0) | ((in2
[2] & 0x00ff0000) >> 8)
6958 | ((in2
[3] & 0x000000ff) << 16) | ((in2
[3] & 0x00ff0000) << 8);
6962 inline void switch_buffer_by_offset_le_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
6964 #if defined IS_AMD || defined IS_GENERIC
6965 const int offset_mod_4
= offset
& 3;
6967 const int offset_minus_4
= 4 - offset
;
6972 w3
[2] = amd_bytealign_S ( 0, w3
[1], offset_minus_4
);
6973 w3
[1] = amd_bytealign_S (w3
[1], w3
[0], offset_minus_4
);
6974 w3
[0] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
6975 w2
[3] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
6976 w2
[2] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
6977 w2
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
6978 w2
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
6979 w1
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
6980 w1
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
6981 w1
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
6982 w1
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
6983 w0
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
6984 w0
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
6985 w0
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
6986 w0
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
6988 if (offset_mod_4
== 0)
7010 w3
[2] = amd_bytealign_S ( 0, w3
[0], offset_minus_4
);
7011 w3
[1] = amd_bytealign_S (w3
[0], w2
[3], offset_minus_4
);
7012 w3
[0] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
7013 w2
[3] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
7014 w2
[2] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7015 w2
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7016 w2
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7017 w1
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7018 w1
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7019 w1
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7020 w1
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7021 w0
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7022 w0
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7023 w0
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7026 if (offset_mod_4
== 0)
7047 w3
[2] = amd_bytealign_S ( 0, w2
[3], offset_minus_4
);
7048 w3
[1] = amd_bytealign_S (w2
[3], w2
[2], offset_minus_4
);
7049 w3
[0] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
7050 w2
[3] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7051 w2
[2] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7052 w2
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7053 w2
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7054 w1
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7055 w1
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7056 w1
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7057 w1
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7058 w0
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7059 w0
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7063 if (offset_mod_4
== 0)
7083 w3
[2] = amd_bytealign_S ( 0, w2
[2], offset_minus_4
);
7084 w3
[1] = amd_bytealign_S (w2
[2], w2
[1], offset_minus_4
);
7085 w3
[0] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7086 w2
[3] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7087 w2
[2] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7088 w2
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7089 w2
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7090 w1
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7091 w1
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7092 w1
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7093 w1
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7094 w0
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7099 if (offset_mod_4
== 0)
7118 w3
[2] = amd_bytealign_S ( 0, w2
[1], offset_minus_4
);
7119 w3
[1] = amd_bytealign_S (w2
[1], w2
[0], offset_minus_4
);
7120 w3
[0] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7121 w2
[3] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7122 w2
[2] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7123 w2
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7124 w2
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7125 w1
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7126 w1
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7127 w1
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7128 w1
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7134 if (offset_mod_4
== 0)
7152 w3
[2] = amd_bytealign_S ( 0, w2
[0], offset_minus_4
);
7153 w3
[1] = amd_bytealign_S (w2
[0], w1
[3], offset_minus_4
);
7154 w3
[0] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7155 w2
[3] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7156 w2
[2] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7157 w2
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7158 w2
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7159 w1
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7160 w1
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7161 w1
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7168 if (offset_mod_4
== 0)
7185 w3
[2] = amd_bytealign_S ( 0, w1
[3], offset_minus_4
);
7186 w3
[1] = amd_bytealign_S (w1
[3], w1
[2], offset_minus_4
);
7187 w3
[0] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7188 w2
[3] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7189 w2
[2] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7190 w2
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7191 w2
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7192 w1
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7193 w1
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7201 if (offset_mod_4
== 0)
7217 w3
[2] = amd_bytealign_S ( 0, w1
[2], offset_minus_4
);
7218 w3
[1] = amd_bytealign_S (w1
[2], w1
[1], offset_minus_4
);
7219 w3
[0] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7220 w2
[3] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7221 w2
[2] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7222 w2
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7223 w2
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7224 w1
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7233 if (offset_mod_4
== 0)
7248 w3
[2] = amd_bytealign_S ( 0, w1
[1], offset_minus_4
);
7249 w3
[1] = amd_bytealign_S (w1
[1], w1
[0], offset_minus_4
);
7250 w3
[0] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7251 w2
[3] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7252 w2
[2] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7253 w2
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7254 w2
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7264 if (offset_mod_4
== 0)
7278 w3
[2] = amd_bytealign_S ( 0, w1
[0], offset_minus_4
);
7279 w3
[1] = amd_bytealign_S (w1
[0], w0
[3], offset_minus_4
);
7280 w3
[0] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7281 w2
[3] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7282 w2
[2] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7283 w2
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7294 if (offset_mod_4
== 0)
7307 w3
[2] = amd_bytealign_S ( 0, w0
[3], offset_minus_4
);
7308 w3
[1] = amd_bytealign_S (w0
[3], w0
[2], offset_minus_4
);
7309 w3
[0] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7310 w2
[3] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7311 w2
[2] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7323 if (offset_mod_4
== 0)
7335 w3
[2] = amd_bytealign_S ( 0, w0
[2], offset_minus_4
);
7336 w3
[1] = amd_bytealign_S (w0
[2], w0
[1], offset_minus_4
);
7337 w3
[0] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7338 w2
[3] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7351 if (offset_mod_4
== 0)
7362 w3
[2] = amd_bytealign_S ( 0, w0
[1], offset_minus_4
);
7363 w3
[1] = amd_bytealign_S (w0
[1], w0
[0], offset_minus_4
);
7364 w3
[0] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7378 if (offset_mod_4
== 0)
7388 w3
[2] = amd_bytealign_S ( 0, w0
[0], offset_minus_4
);
7389 w3
[1] = amd_bytealign_S (w0
[0], 0, offset_minus_4
);
7404 if (offset_mod_4
== 0)
7415 const int offset_minus_4
= 4 - (offset
% 4);
7417 const int selector
= (0x76543210 >> (offset_minus_4
* 4)) & 0xffff;
7422 w3
[1] = __byte_perm_S (w3
[0], w3
[1], selector
);
7423 w3
[0] = __byte_perm_S (w2
[3], w3
[0], selector
);
7424 w2
[3] = __byte_perm_S (w2
[2], w2
[3], selector
);
7425 w2
[2] = __byte_perm_S (w2
[1], w2
[2], selector
);
7426 w2
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
7427 w2
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
7428 w1
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
7429 w1
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
7430 w1
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
7431 w1
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
7432 w0
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
7433 w0
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
7434 w0
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
7435 w0
[0] = __byte_perm_S ( 0, w0
[0], selector
);
7440 w3
[1] = __byte_perm_S (w2
[3], w3
[0], selector
);
7441 w3
[0] = __byte_perm_S (w2
[2], w2
[3], selector
);
7442 w2
[3] = __byte_perm_S (w2
[1], w2
[2], selector
);
7443 w2
[2] = __byte_perm_S (w2
[0], w2
[1], selector
);
7444 w2
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
7445 w2
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
7446 w1
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
7447 w1
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
7448 w1
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
7449 w1
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
7450 w0
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
7451 w0
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
7452 w0
[1] = __byte_perm_S ( 0, w0
[0], selector
);
7458 w3
[1] = __byte_perm_S (w2
[2], w2
[3], selector
);
7459 w3
[0] = __byte_perm_S (w2
[1], w2
[2], selector
);
7460 w2
[3] = __byte_perm_S (w2
[0], w2
[1], selector
);
7461 w2
[2] = __byte_perm_S (w1
[3], w2
[0], selector
);
7462 w2
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
7463 w2
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
7464 w1
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
7465 w1
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
7466 w1
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
7467 w1
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
7468 w0
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
7469 w0
[2] = __byte_perm_S ( 0, w0
[0], selector
);
7476 w3
[1] = __byte_perm_S (w2
[1], w2
[2], selector
);
7477 w3
[0] = __byte_perm_S (w2
[0], w2
[1], selector
);
7478 w2
[3] = __byte_perm_S (w1
[3], w2
[0], selector
);
7479 w2
[2] = __byte_perm_S (w1
[2], w1
[3], selector
);
7480 w2
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
7481 w2
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
7482 w1
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
7483 w1
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
7484 w1
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
7485 w1
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
7486 w0
[3] = __byte_perm_S ( 0, w0
[0], selector
);
7494 w3
[1] = __byte_perm_S (w2
[0], w2
[1], selector
);
7495 w3
[0] = __byte_perm_S (w1
[3], w2
[0], selector
);
7496 w2
[3] = __byte_perm_S (w1
[2], w1
[3], selector
);
7497 w2
[2] = __byte_perm_S (w1
[1], w1
[2], selector
);
7498 w2
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
7499 w2
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
7500 w1
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
7501 w1
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
7502 w1
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
7503 w1
[0] = __byte_perm_S ( 0, w0
[0], selector
);
7512 w3
[1] = __byte_perm_S (w1
[3], w2
[0], selector
);
7513 w3
[0] = __byte_perm_S (w1
[2], w1
[3], selector
);
7514 w2
[3] = __byte_perm_S (w1
[1], w1
[2], selector
);
7515 w2
[2] = __byte_perm_S (w1
[0], w1
[1], selector
);
7516 w2
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
7517 w2
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
7518 w1
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
7519 w1
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
7520 w1
[1] = __byte_perm_S ( 0, w0
[0], selector
);
7530 w3
[1] = __byte_perm_S (w1
[2], w1
[3], selector
);
7531 w3
[0] = __byte_perm_S (w1
[1], w1
[2], selector
);
7532 w2
[3] = __byte_perm_S (w1
[0], w1
[1], selector
);
7533 w2
[2] = __byte_perm_S (w0
[3], w1
[0], selector
);
7534 w2
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
7535 w2
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
7536 w1
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
7537 w1
[2] = __byte_perm_S ( 0, w0
[0], selector
);
7548 w3
[1] = __byte_perm_S (w1
[1], w1
[2], selector
);
7549 w3
[0] = __byte_perm_S (w1
[0], w1
[1], selector
);
7550 w2
[3] = __byte_perm_S (w0
[3], w1
[0], selector
);
7551 w2
[2] = __byte_perm_S (w0
[2], w0
[3], selector
);
7552 w2
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
7553 w2
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
7554 w1
[3] = __byte_perm_S ( 0, w0
[0], selector
);
7566 w3
[1] = __byte_perm_S (w1
[0], w1
[1], selector
);
7567 w3
[0] = __byte_perm_S (w0
[3], w1
[0], selector
);
7568 w2
[3] = __byte_perm_S (w0
[2], w0
[3], selector
);
7569 w2
[2] = __byte_perm_S (w0
[1], w0
[2], selector
);
7570 w2
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
7571 w2
[0] = __byte_perm_S ( 0, w0
[0], selector
);
7584 w3
[1] = __byte_perm_S (w0
[3], w1
[0], selector
);
7585 w3
[0] = __byte_perm_S (w0
[2], w0
[3], selector
);
7586 w2
[3] = __byte_perm_S (w0
[1], w0
[2], selector
);
7587 w2
[2] = __byte_perm_S (w0
[0], w0
[1], selector
);
7588 w2
[1] = __byte_perm_S ( 0, w0
[0], selector
);
7602 w3
[1] = __byte_perm_S (w0
[2], w0
[3], selector
);
7603 w3
[0] = __byte_perm_S (w0
[1], w0
[2], selector
);
7604 w2
[3] = __byte_perm_S (w0
[0], w0
[1], selector
);
7605 w2
[2] = __byte_perm_S ( 0, w0
[0], selector
);
7620 w3
[1] = __byte_perm_S (w0
[1], w0
[2], selector
);
7621 w3
[0] = __byte_perm_S (w0
[0], w0
[1], selector
);
7622 w2
[3] = __byte_perm_S ( 0, w0
[0], selector
);
7638 w3
[1] = __byte_perm_S (w0
[0], w0
[1], selector
);
7639 w3
[0] = __byte_perm_S ( 0, w0
[0], selector
);
7656 w3
[1] = __byte_perm_S ( 0, w0
[0], selector
);
/*
 * switch_buffer_by_offset_be_S: scalar routine that overwrites words of the
 * four 4-word buffers w0..w3 in place. Every visible assignment builds one
 * destination word from two neighbouring source words (or from one source word
 * and the constant 0), combined using `offset` or a selector derived from it.
 *
 * Two implementations are visible: one written with amd_bytealign_S, opened by
 * the IS_AMD / IS_GENERIC preprocessor test, and one written with
 * __byte_perm_S.
 *
 * NOTE(review): the control flow that picks one group of assignments, and the
 * directive that separates the two implementations, are not visible in this
 * excerpt. Presumably the group is chosen from offset / 4 -- confirm against
 * the full file.
 */
7676 inline void switch_buffer_by_offset_be_S (u32 w0
[4], u32 w1
[4], u32 w2
[4], u32 w3
[4], const u32 offset
)
/* amd_bytealign_S implementation; in the visible lines the highest word
 * written on this path is w3[2]. */
7678 #if defined IS_AMD || defined IS_GENERIC
7682 w3
[2] = amd_bytealign_S (w3
[1], 0, offset
);
7683 w3
[1] = amd_bytealign_S (w3
[0], w3
[1], offset
);
7684 w3
[0] = amd_bytealign_S (w2
[3], w3
[0], offset
);
7685 w2
[3] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7686 w2
[2] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7687 w2
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7688 w2
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7689 w1
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7690 w1
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7691 w1
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7692 w1
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7693 w0
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7694 w0
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7695 w0
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7696 w0
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7700 w3
[2] = amd_bytealign_S (w3
[0], 0, offset
);
7701 w3
[1] = amd_bytealign_S (w2
[3], w3
[0], offset
);
7702 w3
[0] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7703 w2
[3] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7704 w2
[2] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7705 w2
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7706 w2
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7707 w1
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7708 w1
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7709 w1
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7710 w1
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7711 w0
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7712 w0
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7713 w0
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7718 w3
[2] = amd_bytealign_S (w2
[3], 0, offset
);
7719 w3
[1] = amd_bytealign_S (w2
[2], w2
[3], offset
);
7720 w3
[0] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7721 w2
[3] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7722 w2
[2] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7723 w2
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7724 w2
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7725 w1
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7726 w1
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7727 w1
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7728 w1
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7729 w0
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7730 w0
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7736 w3
[2] = amd_bytealign_S (w2
[2], 0, offset
);
7737 w3
[1] = amd_bytealign_S (w2
[1], w2
[2], offset
);
7738 w3
[0] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7739 w2
[3] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7740 w2
[2] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7741 w2
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7742 w2
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7743 w1
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7744 w1
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7745 w1
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7746 w1
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7747 w0
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7754 w3
[2] = amd_bytealign_S (w2
[1], 0, offset
);
7755 w3
[1] = amd_bytealign_S (w2
[0], w2
[1], offset
);
7756 w3
[0] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7757 w2
[3] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7758 w2
[2] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7759 w2
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7760 w2
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7761 w1
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7762 w1
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7763 w1
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7764 w1
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7772 w3
[2] = amd_bytealign_S (w2
[0], 0, offset
);
7773 w3
[1] = amd_bytealign_S (w1
[3], w2
[0], offset
);
7774 w3
[0] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7775 w2
[3] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7776 w2
[2] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7777 w2
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7778 w2
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7779 w1
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7780 w1
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7781 w1
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7790 w3
[2] = amd_bytealign_S (w1
[3], 0, offset
);
7791 w3
[1] = amd_bytealign_S (w1
[2], w1
[3], offset
);
7792 w3
[0] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7793 w2
[3] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7794 w2
[2] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7795 w2
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7796 w2
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7797 w1
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7798 w1
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7808 w3
[2] = amd_bytealign_S (w1
[2], 0, offset
);
7809 w3
[1] = amd_bytealign_S (w1
[1], w1
[2], offset
);
7810 w3
[0] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7811 w2
[3] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7812 w2
[2] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7813 w2
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7814 w2
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7815 w1
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7826 w3
[2] = amd_bytealign_S (w1
[1], 0, offset
);
7827 w3
[1] = amd_bytealign_S (w1
[0], w1
[1], offset
);
7828 w3
[0] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7829 w2
[3] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7830 w2
[2] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7831 w2
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7832 w2
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7844 w3
[2] = amd_bytealign_S (w1
[0], 0, offset
);
7845 w3
[1] = amd_bytealign_S (w0
[3], w1
[0], offset
);
7846 w3
[0] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7847 w2
[3] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7848 w2
[2] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7849 w2
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
7862 w3
[2] = amd_bytealign_S (w0
[3], 0, offset
);
7863 w3
[1] = amd_bytealign_S (w0
[2], w0
[3], offset
);
7864 w3
[0] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7865 w2
[3] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7866 w2
[2] = amd_bytealign_S ( 0, w0
[0], offset
);
7880 w3
[2] = amd_bytealign_S (w0
[2], 0, offset
);
7881 w3
[1] = amd_bytealign_S (w0
[1], w0
[2], offset
);
7882 w3
[0] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7883 w2
[3] = amd_bytealign_S ( 0, w0
[0], offset
);
7898 w3
[2] = amd_bytealign_S (w0
[1], 0, offset
);
7899 w3
[1] = amd_bytealign_S (w0
[0], w0
[1], offset
);
7900 w3
[0] = amd_bytealign_S ( 0, w0
[0], offset
);
7916 w3
[2] = amd_bytealign_S (w0
[0], 0, offset
);
7917 w3
[1] = amd_bytealign_S ( 0, w0
[0], offset
);
/* __byte_perm_S implementation. The selector is the low 16 bits of 0x76543210
 * shifted right by 4 * (offset & 3) bits, i.e. 0x3210, 0x4321, 0x5432 or
 * 0x6543. In the visible lines the highest word written on this path is
 * w3[1]. */
7936 const int selector
= (0x76543210 >> ((offset
& 3) * 4)) & 0xffff;
7941 w3
[1] = __byte_perm_S (w3
[1], w3
[0], selector
);
7942 w3
[0] = __byte_perm_S (w3
[0], w2
[3], selector
);
7943 w2
[3] = __byte_perm_S (w2
[3], w2
[2], selector
);
7944 w2
[2] = __byte_perm_S (w2
[2], w2
[1], selector
);
7945 w2
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
7946 w2
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
7947 w1
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
7948 w1
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
7949 w1
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
7950 w1
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
7951 w0
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
7952 w0
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
7953 w0
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
7954 w0
[0] = __byte_perm_S (w0
[0], 0, selector
);
7958 w3
[1] = __byte_perm_S (w3
[0], w2
[3], selector
);
7959 w3
[0] = __byte_perm_S (w2
[3], w2
[2], selector
);
7960 w2
[3] = __byte_perm_S (w2
[2], w2
[1], selector
);
7961 w2
[2] = __byte_perm_S (w2
[1], w2
[0], selector
);
7962 w2
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
7963 w2
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
7964 w1
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
7965 w1
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
7966 w1
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
7967 w1
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
7968 w0
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
7969 w0
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
7970 w0
[1] = __byte_perm_S (w0
[0], 0, selector
);
7975 w3
[1] = __byte_perm_S (w2
[3], w2
[2], selector
);
7976 w3
[0] = __byte_perm_S (w2
[2], w2
[1], selector
);
7977 w2
[3] = __byte_perm_S (w2
[1], w2
[0], selector
);
7978 w2
[2] = __byte_perm_S (w2
[0], w1
[3], selector
);
7979 w2
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
7980 w2
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
7981 w1
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
7982 w1
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
7983 w1
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
7984 w1
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
7985 w0
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
7986 w0
[2] = __byte_perm_S (w0
[0], 0, selector
);
7992 w3
[1] = __byte_perm_S (w2
[2], w2
[1], selector
);
7993 w3
[0] = __byte_perm_S (w2
[1], w2
[0], selector
);
7994 w2
[3] = __byte_perm_S (w2
[0], w1
[3], selector
);
7995 w2
[2] = __byte_perm_S (w1
[3], w1
[2], selector
);
7996 w2
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
7997 w2
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
7998 w1
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
7999 w1
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
8000 w1
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
8001 w1
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
8002 w0
[3] = __byte_perm_S (w0
[0], 0, selector
);
8009 w3
[1] = __byte_perm_S (w2
[1], w2
[0], selector
);
8010 w3
[0] = __byte_perm_S (w2
[0], w1
[3], selector
);
8011 w2
[3] = __byte_perm_S (w1
[3], w1
[2], selector
);
8012 w2
[2] = __byte_perm_S (w1
[2], w1
[1], selector
);
8013 w2
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
8014 w2
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
8015 w1
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
8016 w1
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
8017 w1
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
8018 w1
[0] = __byte_perm_S (w0
[0], 0, selector
);
8026 w3
[1] = __byte_perm_S (w2
[0], w1
[3], selector
);
8027 w3
[0] = __byte_perm_S (w1
[3], w1
[2], selector
);
8028 w2
[3] = __byte_perm_S (w1
[2], w1
[1], selector
);
8029 w2
[2] = __byte_perm_S (w1
[1], w1
[0], selector
);
8030 w2
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
8031 w2
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
8032 w1
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
8033 w1
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
8034 w1
[1] = __byte_perm_S (w0
[0], 0, selector
);
8043 w3
[1] = __byte_perm_S (w1
[3], w1
[2], selector
);
8044 w3
[0] = __byte_perm_S (w1
[2], w1
[1], selector
);
8045 w2
[3] = __byte_perm_S (w1
[1], w1
[0], selector
);
8046 w2
[2] = __byte_perm_S (w1
[0], w0
[3], selector
);
8047 w2
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
8048 w2
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
8049 w1
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
8050 w1
[2] = __byte_perm_S (w0
[0], 0, selector
);
8060 w3
[1] = __byte_perm_S (w1
[2], w1
[1], selector
);
8061 w3
[0] = __byte_perm_S (w1
[1], w1
[0], selector
);
8062 w2
[3] = __byte_perm_S (w1
[0], w0
[3], selector
);
8063 w2
[2] = __byte_perm_S (w0
[3], w0
[2], selector
);
8064 w2
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
8065 w2
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
8066 w1
[3] = __byte_perm_S (w0
[0], 0, selector
);
8077 w3
[1] = __byte_perm_S (w1
[1], w1
[0], selector
);
8078 w3
[0] = __byte_perm_S (w1
[0], w0
[3], selector
);
8079 w2
[3] = __byte_perm_S (w0
[3], w0
[2], selector
);
8080 w2
[2] = __byte_perm_S (w0
[2], w0
[1], selector
);
8081 w2
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
8082 w2
[0] = __byte_perm_S (w0
[0], 0, selector
);
8094 w3
[1] = __byte_perm_S (w1
[0], w0
[3], selector
);
8095 w3
[0] = __byte_perm_S (w0
[3], w0
[2], selector
);
8096 w2
[3] = __byte_perm_S (w0
[2], w0
[1], selector
);
8097 w2
[2] = __byte_perm_S (w0
[1], w0
[0], selector
);
8098 w2
[1] = __byte_perm_S (w0
[0], 0, selector
);
8111 w3
[1] = __byte_perm_S (w0
[3], w0
[2], selector
);
8112 w3
[0] = __byte_perm_S (w0
[2], w0
[1], selector
);
8113 w2
[3] = __byte_perm_S (w0
[1], w0
[0], selector
);
8114 w2
[2] = __byte_perm_S (w0
[0], 0, selector
);
8128 w3
[1] = __byte_perm_S (w0
[2], w0
[1], selector
);
8129 w3
[0] = __byte_perm_S (w0
[1], w0
[0], selector
);
8130 w2
[3] = __byte_perm_S (w0
[0], 0, selector
);
8145 w3
[1] = __byte_perm_S (w0
[1], w0
[0], selector
);
8146 w3
[0] = __byte_perm_S (w0
[0], 0, selector
);
8162 w3
[1] = __byte_perm_S (w0
[0], 0, selector
);
8182 * vector functions on scalar types (for inner loop usage)
/* PACKVS2(sn, vn, e): copies component `e` (spelled .s<e>) of vn[0] into sn[0].
 * NOTE(review): the body continues past the visible line; presumably it does
 * the same for index 1 -- confirm against the full file. */
8185 #define PACKVS2(sn,vn,e) \
8186 sn[0] = vn[0].s##e; \
/* PACKSV2(sn, vn, e): stores sn[0] into component `e` (spelled .s<e>) of vn[0].
 * NOTE(review): the body continues past the visible line; presumably it does
 * the same for index 1 -- confirm against the full file. */
8189 #define PACKSV2(sn,vn,e) \
8190 vn[0].s##e = sn[0]; \
/*
 * PACKVS24(s0, s1, v0, v1, e): for component `e`, runs PACKVS4 on the pairs
 * (s0, v0) and (s1, v1).
 *
 * The expansion is wrapped in do { ... } while (0) so the macro behaves as one
 * statement (safe as the body of an unbraced if/else). The call site supplies
 * the terminating semicolon, as every existing use already does.
 */
#define PACKVS24(s0,s1,v0,v1,e) \
  do                            \
  {                             \
    PACKVS4 (s0, v0, e);        \
    PACKVS4 (s1, v1, e);        \
  } while (0)
/*
 * PACKSV24(s0, s1, v0, v1, e): for component `e`, runs PACKSV4 on the pairs
 * (s0, v0) and (s1, v1).
 *
 * The expansion is wrapped in do { ... } while (0) so the macro behaves as one
 * statement (safe as the body of an unbraced if/else). The call site supplies
 * the terminating semicolon, as every existing use already does.
 */
#define PACKSV24(s0,s1,v0,v1,e) \
  do                            \
  {                             \
    PACKSV4 (s0, v0, e);        \
    PACKSV4 (s1, v1, e);        \
  } while (0)
/* PACKVS4(sn, vn, e): copies component `e` (spelled .s<e>) of vn[0], vn[1] and
 * vn[2] into sn[0], sn[1] and sn[2].
 * NOTE(review): the body continues past the last visible line; presumably
 * index 3 is handled the same way -- confirm against the full file. */
8201 #define PACKVS4(sn,vn,e) \
8202 sn[0] = vn[0].s##e; \
8203 sn[1] = vn[1].s##e; \
8204 sn[2] = vn[2].s##e; \
/* PACKSV4(sn, vn, e): stores sn[0], sn[1] and sn[2] into component `e`
 * (spelled .s<e>) of vn[0], vn[1] and vn[2].
 * NOTE(review): the body continues past the last visible line; presumably
 * index 3 is handled the same way -- confirm against the full file. */
8207 #define PACKSV4(sn,vn,e) \
8208 vn[0].s##e = sn[0]; \
8209 vn[1].s##e = sn[1]; \
8210 vn[2].s##e = sn[2]; \
/*
 * PACKVS44(s0, s1, s2, s3, v0, v1, v2, v3, e): for component `e`, runs PACKVS4
 * on the pairs (s0, v0), (s1, v1), (s2, v2) and (s3, v3).
 *
 * The expansion is wrapped in do { ... } while (0) so the macro behaves as one
 * statement (safe as the body of an unbraced if/else). The call site supplies
 * the terminating semicolon, as every existing use already does.
 */
#define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  do                                        \
  {                                         \
    PACKVS4 (s0, v0, e);                    \
    PACKVS4 (s1, v1, e);                    \
    PACKVS4 (s2, v2, e);                    \
    PACKVS4 (s3, v3, e);                    \
  } while (0)
/*
 * PACKSV44(s0, s1, s2, s3, v0, v1, v2, v3, e): for component `e`, runs PACKSV4
 * on the pairs (s0, v0), (s1, v1), (s2, v2) and (s3, v3).
 *
 * The expansion is wrapped in do { ... } while (0) so the macro behaves as one
 * statement (safe as the body of an unbraced if/else). The call site supplies
 * the terminating semicolon, as every existing use already does.
 */
#define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  do                                        \
  {                                         \
    PACKSV4 (s0, v0, e);                    \
    PACKSV4 (s1, v1, e);                    \
    PACKSV4 (s2, v2, e);                    \
    PACKSV4 (s3, v3, e);                    \
  } while (0)
/*
 * switch_buffer_by_offset_le_VV: vector-typed front end for
 * switch_buffer_by_offset_le_S.
 *
 * The first visible call forwards w0..w3 and offset unchanged to the scalar
 * routine. Every other line handles one vector component N: PACKVS44 copies
 * component N of w0..w3 into the temporaries t0..t3, the scalar routine runs
 * on t0..t3 with offset.sN, and PACKSV44 copies t0..t3 back into component N
 * of w0..w3.
 *
 * NOTE(review): the declarations of t0..t3 and the directives that open the
 * first two sections are not visible in this excerpt. Presumably the direct
 * call is the VECT_SIZE == 1 case and the two-component section the
 * VECT_SIZE == 2 case -- confirm against the full file.
 */
8225 inline void switch_buffer_by_offset_le_VV (u32x w0
[4], u32x w1
[4], u32x w2
[4], u32x w3
[4], const u32x offset
)
8229 switch_buffer_by_offset_le_S (w0
, w1
, w2
, w3
, offset
);
/* components 0..1 */
8242 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8243 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8245 #elif VECT_SIZE == 4
/* components 0..3 */
8247 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8248 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8249 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8250 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8252 #elif VECT_SIZE == 8
/* components 0..7 */
8254 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8255 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8256 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8257 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8258 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
8259 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
8260 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
8261 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
8263 #elif VECT_SIZE == 16
/* components 0..9 and a..f */
8265 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s0
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 0);
8266 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s1
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 1);
8267 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s2
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 2);
8268 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s3
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 3);
8269 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s4
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 4);
8270 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s5
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 5);
8271 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s6
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 6);
8272 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s7
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 7);
8273 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s8
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 8);
8274 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.s9
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, 9);
8275 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sa
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, a
);
8276 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sb
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, b
);
8277 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sc
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, c
);
8278 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sd
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, d
);
8279 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.se
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, e
);
8280 PACKVS44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
); switch_buffer_by_offset_le_S (t0
, t1
, t2
, t3
, offset
.sf
); PACKSV44 (t0
, t1
, t2
, t3
, w0
, w1
, w2
, w3
, f
);
/*
 * append_0x01_2x4_VV: vector-typed front end for append_0x01_2x4_S.
 *
 * The first visible call forwards w0, w1 and offset unchanged to the scalar
 * routine. Every other line handles one vector component N: PACKVS24 copies
 * component N of w0 and w1 into the temporaries t0 and t1, the scalar routine
 * runs on t0/t1 with offset.sN, and PACKSV24 copies t0 and t1 back into
 * component N of w0 and w1.
 *
 * NOTE(review): the declarations of t0/t1 and the directives that open the
 * first two sections are not visible in this excerpt. Presumably the direct
 * call is the VECT_SIZE == 1 case and the two-component section the
 * VECT_SIZE == 2 case -- confirm against the full file.
 */
8285 inline void append_0x01_2x4_VV (u32x w0
[4], u32x w1
[4], const u32x offset
)
8289 append_0x01_2x4_S (w0
, w1
, offset
);
/* components 0..1 */
8300 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x01_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
8301 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x01_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
8303 #elif VECT_SIZE == 4
/* components 0..3 */
8305 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x01_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
8306 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x01_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
8307 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x01_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
8308 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x01_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
8310 #elif VECT_SIZE == 8
/* components 0..7 */
8312 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x01_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
8313 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x01_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
8314 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x01_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
8315 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x01_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
8316 PACKVS24 (t0
, t1
, w0
, w1
, 4); append_0x01_2x4_S (t0
, t1
, offset
.s4
); PACKSV24 (t0
, t1
, w0
, w1
, 4);
8317 PACKVS24 (t0
, t1
, w0
, w1
, 5); append_0x01_2x4_S (t0
, t1
, offset
.s5
); PACKSV24 (t0
, t1
, w0
, w1
, 5);
8318 PACKVS24 (t0
, t1
, w0
, w1
, 6); append_0x01_2x4_S (t0
, t1
, offset
.s6
); PACKSV24 (t0
, t1
, w0
, w1
, 6);
8319 PACKVS24 (t0
, t1
, w0
, w1
, 7); append_0x01_2x4_S (t0
, t1
, offset
.s7
); PACKSV24 (t0
, t1
, w0
, w1
, 7);
8321 #elif VECT_SIZE == 16
/* components 0..9 and a..f */
8323 PACKVS24 (t0
, t1
, w0
, w1
, 0); append_0x01_2x4_S (t0
, t1
, offset
.s0
); PACKSV24 (t0
, t1
, w0
, w1
, 0);
8324 PACKVS24 (t0
, t1
, w0
, w1
, 1); append_0x01_2x4_S (t0
, t1
, offset
.s1
); PACKSV24 (t0
, t1
, w0
, w1
, 1);
8325 PACKVS24 (t0
, t1
, w0
, w1
, 2); append_0x01_2x4_S (t0
, t1
, offset
.s2
); PACKSV24 (t0
, t1
, w0
, w1
, 2);
8326 PACKVS24 (t0
, t1
, w0
, w1
, 3); append_0x01_2x4_S (t0
, t1
, offset
.s3
); PACKSV24 (t0
, t1
, w0
, w1
, 3);
8327 PACKVS24 (t0
, t1
, w0
, w1
, 4); append_0x01_2x4_S (t0
, t1
, offset
.s4
); PACKSV24 (t0
, t1
, w0
, w1
, 4);
8328 PACKVS24 (t0
, t1
, w0
, w1
, 5); append_0x01_2x4_S (t0
, t1
, offset
.s5
); PACKSV24 (t0
, t1
, w0
, w1
, 5);
8329 PACKVS24 (t0
, t1
, w0
, w1
, 6); append_0x01_2x4_S (t0
, t1
, offset
.s6
); PACKSV24 (t0
, t1
, w0
, w1
, 6);
8330 PACKVS24 (t0
, t1
, w0
, w1
, 7); append_0x01_2x4_S (t0
, t1
, offset
.s7
); PACKSV24 (t0
, t1
, w0
, w1
, 7);
8331 PACKVS24 (t0
, t1
, w0
, w1
, 8); append_0x01_2x4_S (t0
, t1
, offset
.s8
); PACKSV24 (t0
, t1
, w0
, w1
, 8);
8332 PACKVS24 (t0
, t1
, w0
, w1
, 9); append_0x01_2x4_S (t0
, t1
, offset
.s9
); PACKSV24 (t0
, t1
, w0
, w1
, 9);
8333 PACKVS24 (t0
, t1
, w0
, w1
, a
); append_0x01_2x4_S (t0
, t1
, offset
.sa
); PACKSV24 (t0
, t1
, w0
, w1
, a
);
8334 PACKVS24 (t0
, t1
, w0
, w1
, b
); append_0x01_2x4_S (t0
, t1
, offset
.sb
); PACKSV24 (t0
, t1
, w0
, w1
, b
);
8335 PACKVS24 (t0
, t1
, w0
, w1
, c
); append_0x01_2x4_S (t0
, t1
, offset
.sc
); PACKSV24 (t0
, t1
, w0
, w1
, c
);
8336 PACKVS24 (t0
, t1
, w0
, w1
, d
); append_0x01_2x4_S (t0
, t1
, offset
.sd
); PACKSV24 (t0
, t1
, w0
, w1
, d
);
8337 PACKVS24 (t0
, t1
, w0
, w1
, e
); append_0x01_2x4_S (t0
, t1
, offset
.se
); PACKSV24 (t0
, t1
, w0
, w1
, e
);
8338 PACKVS24 (t0
, t1
, w0
, w1
, f
); append_0x01_2x4_S (t0
, t1
, offset
.sf
); PACKSV24 (t0
, t1
, w0
, w1
, f
);
/**
 * Vector front-end for append_0x80_2x4_S.
 *
 * w0, w1  : two 4-word vector buffers, modified in place
 * offset  : per-lane offset that is forwarded to append_0x80_2x4_S
 *
 * With VECT_SIZE == 1 the arguments are passed straight through to the
 * scalar routine. For wider vectors every lane is processed on its own:
 * PACKVS24 fills the scalar temporaries t0/t1 for lane N (presumably by
 * copying component .sN of each vector word - confirm against the macro
 * definition), append_0x80_2x4_S is run on them with offset.sN, and
 * PACKSV24 stores the result back into lane N of w0/w1.
 *
 * NOTE(review): the VECT_SIZE == 1 guard, the "#if VECT_SIZE == 2" opener
 * and the t0/t1 declarations are not visible in the damaged text this was
 * restored from; they are implied by the whole-vector scalar call, the
 * "#elif VECT_SIZE == 4" continuation and the use of t0/t1. Confirm against
 * the upstream file.
 */
inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x80_2x4_S (w0, w1, offset);

  #else

  // scalar scratch copies of a single lane
  u32 t0[4];
  u32 t1[4];

  #if   VECT_SIZE == 2

  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);

  #elif VECT_SIZE == 4

  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);

  #elif VECT_SIZE == 8

  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
  PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
  PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
  PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
  PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);

  #elif VECT_SIZE == 16

  PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
  PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
  PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
  PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
  PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
  PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
  PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
  PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
  PACKVS24 (t0, t1, w0, w1, 8); append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
  PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
  PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
  PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
  PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
  PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
  PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
  PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);

  #endif

  #endif
}
/**
 * Vector front-end for append_0x80_4x4_S.
 *
 * w0..w3  : four 4-word vector buffers, modified in place
 * offset  : per-lane offset that is forwarded to append_0x80_4x4_S
 *
 * With VECT_SIZE == 1 the arguments are passed straight through to the
 * scalar routine. For wider vectors every lane is processed on its own:
 * PACKVS44 fills the scalar temporaries t0..t3 for lane N (presumably by
 * copying component .sN of each vector word - confirm against the macro
 * definition), append_0x80_4x4_S is run on them with offset.sN, and
 * PACKSV44 stores the result back into lane N of w0..w3.
 *
 * NOTE(review): the VECT_SIZE == 1 guard, the "#if VECT_SIZE == 2" opener
 * and the t0..t3 declarations are not visible in the damaged text this was
 * restored from; they are implied by the whole-vector scalar call, the
 * "#elif VECT_SIZE == 4" continuation and the use of t0..t3. Confirm
 * against the upstream file.
 */
inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
{
  #if VECT_SIZE == 1

  append_0x80_4x4_S (w0, w1, w2, w3, offset);

  #else

  // scalar scratch copies of a single lane
  u32 t0[4];
  u32 t1[4];
  u32 t2[4];
  u32 t3[4];

  #if   VECT_SIZE == 2

  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);

  #elif VECT_SIZE == 4

  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);

  #elif VECT_SIZE == 8

  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);

  #elif VECT_SIZE == 16

  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x80_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x80_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x80_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x80_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x80_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x80_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x80_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
  PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x80_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);

  #endif

  #endif
}