2 * Author......: Jens Steube <jens.steube@gmail.com>
7 * pure scalar functions
10 inline int hash_comp (const u32 d1[4], __global u32 *d2)
12 if (d1[3] > d2[DGST_R3]) return ( 1);
13 if (d1[3] < d2[DGST_R3]) return (-1);
14 if (d1[2] > d2[DGST_R2]) return ( 1);
15 if (d1[2] < d2[DGST_R2]) return (-1);
16 if (d1[1] > d2[DGST_R1]) return ( 1);
17 if (d1[1] < d2[DGST_R1]) return (-1);
18 if (d1[0] > d2[DGST_R0]) return ( 1);
19 if (d1[0] < d2[DGST_R0]) return (-1);
24 inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global digest_t *digests_buf)
26 for (u32 l = 0, r = digests_cnt; r; r >>= 1)
32 const int cmp = hash_comp (digest, digests_buf[c].digest_buf);
41 if (cmp == 0) return (c);
47 inline u32 check_bitmap (__global u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
49 return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f)));
52 inline u32 check (const u32 digest[2], __global u32 *bitmap_s1_a, __global u32 *bitmap_s1_b, __global u32 *bitmap_s1_c, __global u32 *bitmap_s1_d, __global u32 *bitmap_s2_a, __global u32 *bitmap_s2_b, __global u32 *bitmap_s2_c, __global u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
54 if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
55 if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
56 if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0);
57 if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0);
59 if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0);
60 if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0);
61 if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0);
62 if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0);
67 inline void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const int salt_pos, const int digest_pos, const int hash_pos, const u32 gid, const u32 il_pos)
69 const u32 idx = atomic_inc (d_result);
71 plains_buf[idx].salt_pos = salt_pos;
72 plains_buf[idx].digest_pos = digest_pos; // relative
73 plains_buf[idx].hash_pos = hash_pos; // absolute
74 plains_buf[idx].gidvid = gid;
75 plains_buf[idx].il_pos = il_pos;
82 inline void truncate_block (u32x w[4], const u32 len)
91 case 1: w[0] &= 0x000000FF;
96 case 2: w[0] &= 0x0000FFFF;
101 case 3: w[0] &= 0x00FFFFFF;
110 case 5: w[1] &= 0x000000FF;
114 case 6: w[1] &= 0x0000FFFF;
118 case 7: w[1] &= 0x00FFFFFF;
125 case 9: w[2] &= 0x000000FF;
128 case 10: w[2] &= 0x0000FFFF;
131 case 11: w[2] &= 0x00FFFFFF;
136 case 13: w[3] &= 0x000000FF;
138 case 14: w[3] &= 0x0000FFFF;
140 case 15: w[3] &= 0x00FFFFFF;
145 inline void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
148 out2[3] = __byte_perm (in[3], 0, 0x7372);
149 out2[2] = __byte_perm (in[3], 0, 0x7170);
150 out2[1] = __byte_perm (in[2], 0, 0x7372);
151 out2[0] = __byte_perm (in[2], 0, 0x7170);
152 out1[3] = __byte_perm (in[1], 0, 0x7372);
153 out1[2] = __byte_perm (in[1], 0, 0x7170);
154 out1[1] = __byte_perm (in[0], 0, 0x7372);
155 out1[0] = __byte_perm (in[0], 0, 0x7170);
158 #if defined IS_AMD || defined IS_GENERIC
159 out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
160 out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
161 out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
162 out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
163 out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
164 out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
165 out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
166 out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
170 inline void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
173 out[0] = __byte_perm (in1[0], in1[1], 0x6420);
174 out[1] = __byte_perm (in1[2], in1[3], 0x6420);
175 out[2] = __byte_perm (in2[0], in2[1], 0x6420);
176 out[3] = __byte_perm (in2[2], in2[3], 0x6420);
179 #if defined IS_AMD || defined IS_GENERIC
180 out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
181 | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
182 out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
183 | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
184 out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
185 | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
186 out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
187 | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
191 inline void append_0x01_1x4 (u32x w0[4], const u32 offset)
200 w0[0] = w0[0] | 0x0100;
204 w0[0] = w0[0] | 0x010000;
208 w0[0] = w0[0] | 0x01000000;
216 w0[1] = w0[1] | 0x0100;
220 w0[1] = w0[1] | 0x010000;
224 w0[1] = w0[1] | 0x01000000;
232 w0[2] = w0[2] | 0x0100;
236 w0[2] = w0[2] | 0x010000;
240 w0[2] = w0[2] | 0x01000000;
248 w0[3] = w0[3] | 0x0100;
252 w0[3] = w0[3] | 0x010000;
256 w0[3] = w0[3] | 0x01000000;
261 inline void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
270 w0[0] = w0[0] | 0x0100;
274 w0[0] = w0[0] | 0x010000;
278 w0[0] = w0[0] | 0x01000000;
286 w0[1] = w0[1] | 0x0100;
290 w0[1] = w0[1] | 0x010000;
294 w0[1] = w0[1] | 0x01000000;
302 w0[2] = w0[2] | 0x0100;
306 w0[2] = w0[2] | 0x010000;
310 w0[2] = w0[2] | 0x01000000;
318 w0[3] = w0[3] | 0x0100;
322 w0[3] = w0[3] | 0x010000;
326 w0[3] = w0[3] | 0x01000000;
334 w1[0] = w1[0] | 0x0100;
338 w1[0] = w1[0] | 0x010000;
342 w1[0] = w1[0] | 0x01000000;
350 w1[1] = w1[1] | 0x0100;
354 w1[1] = w1[1] | 0x010000;
358 w1[1] = w1[1] | 0x01000000;
366 w1[2] = w1[2] | 0x0100;
370 w1[2] = w1[2] | 0x010000;
374 w1[2] = w1[2] | 0x01000000;
382 w1[3] = w1[3] | 0x0100;
386 w1[3] = w1[3] | 0x010000;
390 w1[3] = w1[3] | 0x01000000;
395 inline void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
404 w0[0] = w0[0] | 0x0100;
408 w0[0] = w0[0] | 0x010000;
412 w0[0] = w0[0] | 0x01000000;
420 w0[1] = w0[1] | 0x0100;
424 w0[1] = w0[1] | 0x010000;
428 w0[1] = w0[1] | 0x01000000;
436 w0[2] = w0[2] | 0x0100;
440 w0[2] = w0[2] | 0x010000;
444 w0[2] = w0[2] | 0x01000000;
452 w0[3] = w0[3] | 0x0100;
456 w0[3] = w0[3] | 0x010000;
460 w0[3] = w0[3] | 0x01000000;
468 w1[0] = w1[0] | 0x0100;
472 w1[0] = w1[0] | 0x010000;
476 w1[0] = w1[0] | 0x01000000;
484 w1[1] = w1[1] | 0x0100;
488 w1[1] = w1[1] | 0x010000;
492 w1[1] = w1[1] | 0x01000000;
500 w1[2] = w1[2] | 0x0100;
504 w1[2] = w1[2] | 0x010000;
508 w1[2] = w1[2] | 0x01000000;
516 w1[3] = w1[3] | 0x0100;
520 w1[3] = w1[3] | 0x010000;
524 w1[3] = w1[3] | 0x01000000;
532 w2[0] = w2[0] | 0x0100;
536 w2[0] = w2[0] | 0x010000;
540 w2[0] = w2[0] | 0x01000000;
548 w2[1] = w2[1] | 0x0100;
552 w2[1] = w2[1] | 0x010000;
556 w2[1] = w2[1] | 0x01000000;
564 w2[2] = w2[2] | 0x0100;
568 w2[2] = w2[2] | 0x010000;
572 w2[2] = w2[2] | 0x01000000;
580 w2[3] = w2[3] | 0x0100;
584 w2[3] = w2[3] | 0x010000;
588 w2[3] = w2[3] | 0x01000000;
593 inline void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
602 w0[0] = w0[0] | 0x0100;
606 w0[0] = w0[0] | 0x010000;
610 w0[0] = w0[0] | 0x01000000;
618 w0[1] = w0[1] | 0x0100;
622 w0[1] = w0[1] | 0x010000;
626 w0[1] = w0[1] | 0x01000000;
634 w0[2] = w0[2] | 0x0100;
638 w0[2] = w0[2] | 0x010000;
642 w0[2] = w0[2] | 0x01000000;
650 w0[3] = w0[3] | 0x0100;
654 w0[3] = w0[3] | 0x010000;
658 w0[3] = w0[3] | 0x01000000;
666 w1[0] = w1[0] | 0x0100;
670 w1[0] = w1[0] | 0x010000;
674 w1[0] = w1[0] | 0x01000000;
682 w1[1] = w1[1] | 0x0100;
686 w1[1] = w1[1] | 0x010000;
690 w1[1] = w1[1] | 0x01000000;
698 w1[2] = w1[2] | 0x0100;
702 w1[2] = w1[2] | 0x010000;
706 w1[2] = w1[2] | 0x01000000;
714 w1[3] = w1[3] | 0x0100;
718 w1[3] = w1[3] | 0x010000;
722 w1[3] = w1[3] | 0x01000000;
730 w2[0] = w2[0] | 0x0100;
734 w2[0] = w2[0] | 0x010000;
738 w2[0] = w2[0] | 0x01000000;
746 w2[1] = w2[1] | 0x0100;
750 w2[1] = w2[1] | 0x010000;
754 w2[1] = w2[1] | 0x01000000;
762 w2[2] = w2[2] | 0x0100;
766 w2[2] = w2[2] | 0x010000;
770 w2[2] = w2[2] | 0x01000000;
778 w2[3] = w2[3] | 0x0100;
782 w2[3] = w2[3] | 0x010000;
786 w2[3] = w2[3] | 0x01000000;
794 w3[0] = w3[0] | 0x0100;
798 w3[0] = w3[0] | 0x010000;
802 w3[0] = w3[0] | 0x01000000;
810 w3[1] = w3[1] | 0x0100;
814 w3[1] = w3[1] | 0x010000;
818 w3[1] = w3[1] | 0x01000000;
826 w3[2] = w3[2] | 0x0100;
830 w3[2] = w3[2] | 0x010000;
834 w3[2] = w3[2] | 0x01000000;
842 w3[3] = w3[3] | 0x0100;
846 w3[3] = w3[3] | 0x010000;
850 w3[3] = w3[3] | 0x01000000;
855 inline void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
864 w0[0] = w0[0] | 0x0100;
868 w0[0] = w0[0] | 0x010000;
872 w0[0] = w0[0] | 0x01000000;
880 w0[1] = w0[1] | 0x0100;
884 w0[1] = w0[1] | 0x010000;
888 w0[1] = w0[1] | 0x01000000;
896 w0[2] = w0[2] | 0x0100;
900 w0[2] = w0[2] | 0x010000;
904 w0[2] = w0[2] | 0x01000000;
912 w0[3] = w0[3] | 0x0100;
916 w0[3] = w0[3] | 0x010000;
920 w0[3] = w0[3] | 0x01000000;
928 w1[0] = w1[0] | 0x0100;
932 w1[0] = w1[0] | 0x010000;
936 w1[0] = w1[0] | 0x01000000;
944 w1[1] = w1[1] | 0x0100;
948 w1[1] = w1[1] | 0x010000;
952 w1[1] = w1[1] | 0x01000000;
960 w1[2] = w1[2] | 0x0100;
964 w1[2] = w1[2] | 0x010000;
968 w1[2] = w1[2] | 0x01000000;
976 w1[3] = w1[3] | 0x0100;
980 w1[3] = w1[3] | 0x010000;
984 w1[3] = w1[3] | 0x01000000;
992 w2[0] = w2[0] | 0x0100;
996 w2[0] = w2[0] | 0x010000;
1000 w2[0] = w2[0] | 0x01000000;
1008 w2[1] = w2[1] | 0x0100;
1012 w2[1] = w2[1] | 0x010000;
1016 w2[1] = w2[1] | 0x01000000;
1024 w2[2] = w2[2] | 0x0100;
1028 w2[2] = w2[2] | 0x010000;
1032 w2[2] = w2[2] | 0x01000000;
1040 w2[3] = w2[3] | 0x0100;
1044 w2[3] = w2[3] | 0x010000;
1048 w2[3] = w2[3] | 0x01000000;
1056 w3[0] = w3[0] | 0x0100;
1060 w3[0] = w3[0] | 0x010000;
1064 w3[0] = w3[0] | 0x01000000;
1072 w3[1] = w3[1] | 0x0100;
1076 w3[1] = w3[1] | 0x010000;
1080 w3[1] = w3[1] | 0x01000000;
1088 w3[2] = w3[2] | 0x0100;
1092 w3[2] = w3[2] | 0x010000;
1096 w3[2] = w3[2] | 0x01000000;
1104 w3[3] = w3[3] | 0x0100;
1108 w3[3] = w3[3] | 0x010000;
1112 w3[3] = w3[3] | 0x01000000;
1120 w4[0] = w4[0] | 0x0100;
1124 w4[0] = w4[0] | 0x010000;
1128 w4[0] = w4[0] | 0x01000000;
1136 w4[1] = w4[1] | 0x0100;
1140 w4[1] = w4[1] | 0x010000;
1144 w4[1] = w4[1] | 0x01000000;
1152 w4[2] = w4[2] | 0x0100;
1156 w4[2] = w4[2] | 0x010000;
1160 w4[2] = w4[2] | 0x01000000;
1168 w4[3] = w4[3] | 0x0100;
1172 w4[3] = w4[3] | 0x010000;
1176 w4[3] = w4[3] | 0x01000000;
1184 w5[0] = w5[0] | 0x0100;
1188 w5[0] = w5[0] | 0x010000;
1192 w5[0] = w5[0] | 0x01000000;
1200 w5[1] = w5[1] | 0x0100;
1204 w5[1] = w5[1] | 0x010000;
1208 w5[1] = w5[1] | 0x01000000;
1216 w5[2] = w5[2] | 0x0100;
1220 w5[2] = w5[2] | 0x010000;
1224 w5[2] = w5[2] | 0x01000000;
1232 w5[3] = w5[3] | 0x0100;
1236 w5[3] = w5[3] | 0x010000;
1240 w5[3] = w5[3] | 0x01000000;
1248 w6[0] = w6[0] | 0x0100;
1252 w6[0] = w6[0] | 0x010000;
1256 w6[0] = w6[0] | 0x01000000;
1264 w6[1] = w6[1] | 0x0100;
1268 w6[1] = w6[1] | 0x010000;
1272 w6[1] = w6[1] | 0x01000000;
1280 w6[2] = w6[2] | 0x0100;
1284 w6[2] = w6[2] | 0x010000;
1288 w6[2] = w6[2] | 0x01000000;
1296 w6[3] = w6[3] | 0x0100;
1300 w6[3] = w6[3] | 0x010000;
1304 w6[3] = w6[3] | 0x01000000;
1312 w7[0] = w7[0] | 0x0100;
1316 w7[0] = w7[0] | 0x010000;
1320 w7[0] = w7[0] | 0x01000000;
1328 w7[1] = w7[1] | 0x0100;
1332 w7[1] = w7[1] | 0x010000;
1336 w7[1] = w7[1] | 0x01000000;
1344 w7[2] = w7[2] | 0x0100;
1348 w7[2] = w7[2] | 0x010000;
1352 w7[2] = w7[2] | 0x01000000;
1360 w7[3] = w7[3] | 0x0100;
1364 w7[3] = w7[3] | 0x010000;
1368 w7[3] = w7[3] | 0x01000000;
1373 inline void append_0x02_1x4 (u32x w0[4], const u32 offset)
1382 w0[0] = w0[0] | 0x0200;
1386 w0[0] = w0[0] | 0x020000;
1390 w0[0] = w0[0] | 0x02000000;
1398 w0[1] = w0[1] | 0x0200;
1402 w0[1] = w0[1] | 0x020000;
1406 w0[1] = w0[1] | 0x02000000;
1414 w0[2] = w0[2] | 0x0200;
1418 w0[2] = w0[2] | 0x020000;
1422 w0[2] = w0[2] | 0x02000000;
1430 w0[3] = w0[3] | 0x0200;
1434 w0[3] = w0[3] | 0x020000;
1438 w0[3] = w0[3] | 0x02000000;
1443 inline void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
1452 w0[0] = w0[0] | 0x0200;
1456 w0[0] = w0[0] | 0x020000;
1460 w0[0] = w0[0] | 0x02000000;
1468 w0[1] = w0[1] | 0x0200;
1472 w0[1] = w0[1] | 0x020000;
1476 w0[1] = w0[1] | 0x02000000;
1484 w0[2] = w0[2] | 0x0200;
1488 w0[2] = w0[2] | 0x020000;
1492 w0[2] = w0[2] | 0x02000000;
1500 w0[3] = w0[3] | 0x0200;
1504 w0[3] = w0[3] | 0x020000;
1508 w0[3] = w0[3] | 0x02000000;
1516 w1[0] = w1[0] | 0x0200;
1520 w1[0] = w1[0] | 0x020000;
1524 w1[0] = w1[0] | 0x02000000;
1532 w1[1] = w1[1] | 0x0200;
1536 w1[1] = w1[1] | 0x020000;
1540 w1[1] = w1[1] | 0x02000000;
1548 w1[2] = w1[2] | 0x0200;
1552 w1[2] = w1[2] | 0x020000;
1556 w1[2] = w1[2] | 0x02000000;
1564 w1[3] = w1[3] | 0x0200;
1568 w1[3] = w1[3] | 0x020000;
1572 w1[3] = w1[3] | 0x02000000;
1577 inline void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
1586 w0[0] = w0[0] | 0x0200;
1590 w0[0] = w0[0] | 0x020000;
1594 w0[0] = w0[0] | 0x02000000;
1602 w0[1] = w0[1] | 0x0200;
1606 w0[1] = w0[1] | 0x020000;
1610 w0[1] = w0[1] | 0x02000000;
1618 w0[2] = w0[2] | 0x0200;
1622 w0[2] = w0[2] | 0x020000;
1626 w0[2] = w0[2] | 0x02000000;
1634 w0[3] = w0[3] | 0x0200;
1638 w0[3] = w0[3] | 0x020000;
1642 w0[3] = w0[3] | 0x02000000;
1650 w1[0] = w1[0] | 0x0200;
1654 w1[0] = w1[0] | 0x020000;
1658 w1[0] = w1[0] | 0x02000000;
1666 w1[1] = w1[1] | 0x0200;
1670 w1[1] = w1[1] | 0x020000;
1674 w1[1] = w1[1] | 0x02000000;
1682 w1[2] = w1[2] | 0x0200;
1686 w1[2] = w1[2] | 0x020000;
1690 w1[2] = w1[2] | 0x02000000;
1698 w1[3] = w1[3] | 0x0200;
1702 w1[3] = w1[3] | 0x020000;
1706 w1[3] = w1[3] | 0x02000000;
1714 w2[0] = w2[0] | 0x0200;
1718 w2[0] = w2[0] | 0x020000;
1722 w2[0] = w2[0] | 0x02000000;
1730 w2[1] = w2[1] | 0x0200;
1734 w2[1] = w2[1] | 0x020000;
1738 w2[1] = w2[1] | 0x02000000;
1746 w2[2] = w2[2] | 0x0200;
1750 w2[2] = w2[2] | 0x020000;
1754 w2[2] = w2[2] | 0x02000000;
1762 w2[3] = w2[3] | 0x0200;
1766 w2[3] = w2[3] | 0x020000;
1770 w2[3] = w2[3] | 0x02000000;
1775 inline void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
1784 w0[0] = w0[0] | 0x0200;
1788 w0[0] = w0[0] | 0x020000;
1792 w0[0] = w0[0] | 0x02000000;
1800 w0[1] = w0[1] | 0x0200;
1804 w0[1] = w0[1] | 0x020000;
1808 w0[1] = w0[1] | 0x02000000;
1816 w0[2] = w0[2] | 0x0200;
1820 w0[2] = w0[2] | 0x020000;
1824 w0[2] = w0[2] | 0x02000000;
1832 w0[3] = w0[3] | 0x0200;
1836 w0[3] = w0[3] | 0x020000;
1840 w0[3] = w0[3] | 0x02000000;
1848 w1[0] = w1[0] | 0x0200;
1852 w1[0] = w1[0] | 0x020000;
1856 w1[0] = w1[0] | 0x02000000;
1864 w1[1] = w1[1] | 0x0200;
1868 w1[1] = w1[1] | 0x020000;
1872 w1[1] = w1[1] | 0x02000000;
1880 w1[2] = w1[2] | 0x0200;
1884 w1[2] = w1[2] | 0x020000;
1888 w1[2] = w1[2] | 0x02000000;
1896 w1[3] = w1[3] | 0x0200;
1900 w1[3] = w1[3] | 0x020000;
1904 w1[3] = w1[3] | 0x02000000;
1912 w2[0] = w2[0] | 0x0200;
1916 w2[0] = w2[0] | 0x020000;
1920 w2[0] = w2[0] | 0x02000000;
1928 w2[1] = w2[1] | 0x0200;
1932 w2[1] = w2[1] | 0x020000;
1936 w2[1] = w2[1] | 0x02000000;
1944 w2[2] = w2[2] | 0x0200;
1948 w2[2] = w2[2] | 0x020000;
1952 w2[2] = w2[2] | 0x02000000;
1960 w2[3] = w2[3] | 0x0200;
1964 w2[3] = w2[3] | 0x020000;
1968 w2[3] = w2[3] | 0x02000000;
1976 w3[0] = w3[0] | 0x0200;
1980 w3[0] = w3[0] | 0x020000;
1984 w3[0] = w3[0] | 0x02000000;
1992 w3[1] = w3[1] | 0x0200;
1996 w3[1] = w3[1] | 0x020000;
2000 w3[1] = w3[1] | 0x02000000;
2008 w3[2] = w3[2] | 0x0200;
2012 w3[2] = w3[2] | 0x020000;
2016 w3[2] = w3[2] | 0x02000000;
2024 w3[3] = w3[3] | 0x0200;
2028 w3[3] = w3[3] | 0x020000;
2032 w3[3] = w3[3] | 0x02000000;
2037 inline void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
2046 w0[0] = w0[0] | 0x0200;
2050 w0[0] = w0[0] | 0x020000;
2054 w0[0] = w0[0] | 0x02000000;
2062 w0[1] = w0[1] | 0x0200;
2066 w0[1] = w0[1] | 0x020000;
2070 w0[1] = w0[1] | 0x02000000;
2078 w0[2] = w0[2] | 0x0200;
2082 w0[2] = w0[2] | 0x020000;
2086 w0[2] = w0[2] | 0x02000000;
2094 w0[3] = w0[3] | 0x0200;
2098 w0[3] = w0[3] | 0x020000;
2102 w0[3] = w0[3] | 0x02000000;
2110 w1[0] = w1[0] | 0x0200;
2114 w1[0] = w1[0] | 0x020000;
2118 w1[0] = w1[0] | 0x02000000;
2126 w1[1] = w1[1] | 0x0200;
2130 w1[1] = w1[1] | 0x020000;
2134 w1[1] = w1[1] | 0x02000000;
2142 w1[2] = w1[2] | 0x0200;
2146 w1[2] = w1[2] | 0x020000;
2150 w1[2] = w1[2] | 0x02000000;
2158 w1[3] = w1[3] | 0x0200;
2162 w1[3] = w1[3] | 0x020000;
2166 w1[3] = w1[3] | 0x02000000;
2174 w2[0] = w2[0] | 0x0200;
2178 w2[0] = w2[0] | 0x020000;
2182 w2[0] = w2[0] | 0x02000000;
2190 w2[1] = w2[1] | 0x0200;
2194 w2[1] = w2[1] | 0x020000;
2198 w2[1] = w2[1] | 0x02000000;
2206 w2[2] = w2[2] | 0x0200;
2210 w2[2] = w2[2] | 0x020000;
2214 w2[2] = w2[2] | 0x02000000;
2222 w2[3] = w2[3] | 0x0200;
2226 w2[3] = w2[3] | 0x020000;
2230 w2[3] = w2[3] | 0x02000000;
2238 w3[0] = w3[0] | 0x0200;
2242 w3[0] = w3[0] | 0x020000;
2246 w3[0] = w3[0] | 0x02000000;
2254 w3[1] = w3[1] | 0x0200;
2258 w3[1] = w3[1] | 0x020000;
2262 w3[1] = w3[1] | 0x02000000;
2270 w3[2] = w3[2] | 0x0200;
2274 w3[2] = w3[2] | 0x020000;
2278 w3[2] = w3[2] | 0x02000000;
2286 w3[3] = w3[3] | 0x0200;
2290 w3[3] = w3[3] | 0x020000;
2294 w3[3] = w3[3] | 0x02000000;
2302 w4[0] = w4[0] | 0x0200;
2306 w4[0] = w4[0] | 0x020000;
2310 w4[0] = w4[0] | 0x02000000;
2318 w4[1] = w4[1] | 0x0200;
2322 w4[1] = w4[1] | 0x020000;
2326 w4[1] = w4[1] | 0x02000000;
2334 w4[2] = w4[2] | 0x0200;
2338 w4[2] = w4[2] | 0x020000;
2342 w4[2] = w4[2] | 0x02000000;
2350 w4[3] = w4[3] | 0x0200;
2354 w4[3] = w4[3] | 0x020000;
2358 w4[3] = w4[3] | 0x02000000;
2366 w5[0] = w5[0] | 0x0200;
2370 w5[0] = w5[0] | 0x020000;
2374 w5[0] = w5[0] | 0x02000000;
2382 w5[1] = w5[1] | 0x0200;
2386 w5[1] = w5[1] | 0x020000;
2390 w5[1] = w5[1] | 0x02000000;
2398 w5[2] = w5[2] | 0x0200;
2402 w5[2] = w5[2] | 0x020000;
2406 w5[2] = w5[2] | 0x02000000;
2414 w5[3] = w5[3] | 0x0200;
2418 w5[3] = w5[3] | 0x020000;
2422 w5[3] = w5[3] | 0x02000000;
2430 w6[0] = w6[0] | 0x0200;
2434 w6[0] = w6[0] | 0x020000;
2438 w6[0] = w6[0] | 0x02000000;
2446 w6[1] = w6[1] | 0x0200;
2450 w6[1] = w6[1] | 0x020000;
2454 w6[1] = w6[1] | 0x02000000;
2462 w6[2] = w6[2] | 0x0200;
2466 w6[2] = w6[2] | 0x020000;
2470 w6[2] = w6[2] | 0x02000000;
2478 w6[3] = w6[3] | 0x0200;
2482 w6[3] = w6[3] | 0x020000;
2486 w6[3] = w6[3] | 0x02000000;
2494 w7[0] = w7[0] | 0x0200;
2498 w7[0] = w7[0] | 0x020000;
2502 w7[0] = w7[0] | 0x02000000;
2510 w7[1] = w7[1] | 0x0200;
2514 w7[1] = w7[1] | 0x020000;
2518 w7[1] = w7[1] | 0x02000000;
2526 w7[2] = w7[2] | 0x0200;
2530 w7[2] = w7[2] | 0x020000;
2534 w7[2] = w7[2] | 0x02000000;
2542 w7[3] = w7[3] | 0x0200;
2546 w7[3] = w7[3] | 0x020000;
2550 w7[3] = w7[3] | 0x02000000;
2555 inline void append_0x80_1x4 (u32x w0[4], const u32 offset)
2564 w0[0] = w0[0] | 0x8000;
2568 w0[0] = w0[0] | 0x800000;
2572 w0[0] = w0[0] | 0x80000000;
2580 w0[1] = w0[1] | 0x8000;
2584 w0[1] = w0[1] | 0x800000;
2588 w0[1] = w0[1] | 0x80000000;
2596 w0[2] = w0[2] | 0x8000;
2600 w0[2] = w0[2] | 0x800000;
2604 w0[2] = w0[2] | 0x80000000;
2612 w0[3] = w0[3] | 0x8000;
2616 w0[3] = w0[3] | 0x800000;
2620 w0[3] = w0[3] | 0x80000000;
2625 inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
2634 w0[0] = w0[0] | 0x8000;
2638 w0[0] = w0[0] | 0x800000;
2642 w0[0] = w0[0] | 0x80000000;
2650 w0[1] = w0[1] | 0x8000;
2654 w0[1] = w0[1] | 0x800000;
2658 w0[1] = w0[1] | 0x80000000;
2666 w0[2] = w0[2] | 0x8000;
2670 w0[2] = w0[2] | 0x800000;
2674 w0[2] = w0[2] | 0x80000000;
2682 w0[3] = w0[3] | 0x8000;
2686 w0[3] = w0[3] | 0x800000;
2690 w0[3] = w0[3] | 0x80000000;
2698 w1[0] = w1[0] | 0x8000;
2702 w1[0] = w1[0] | 0x800000;
2706 w1[0] = w1[0] | 0x80000000;
2714 w1[1] = w1[1] | 0x8000;
2718 w1[1] = w1[1] | 0x800000;
2722 w1[1] = w1[1] | 0x80000000;
2730 w1[2] = w1[2] | 0x8000;
2734 w1[2] = w1[2] | 0x800000;
2738 w1[2] = w1[2] | 0x80000000;
2746 w1[3] = w1[3] | 0x8000;
2750 w1[3] = w1[3] | 0x800000;
2754 w1[3] = w1[3] | 0x80000000;
2759 inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
2768 w0[0] = w0[0] | 0x8000;
2772 w0[0] = w0[0] | 0x800000;
2776 w0[0] = w0[0] | 0x80000000;
2784 w0[1] = w0[1] | 0x8000;
2788 w0[1] = w0[1] | 0x800000;
2792 w0[1] = w0[1] | 0x80000000;
2800 w0[2] = w0[2] | 0x8000;
2804 w0[2] = w0[2] | 0x800000;
2808 w0[2] = w0[2] | 0x80000000;
2816 w0[3] = w0[3] | 0x8000;
2820 w0[3] = w0[3] | 0x800000;
2824 w0[3] = w0[3] | 0x80000000;
2832 w1[0] = w1[0] | 0x8000;
2836 w1[0] = w1[0] | 0x800000;
2840 w1[0] = w1[0] | 0x80000000;
2848 w1[1] = w1[1] | 0x8000;
2852 w1[1] = w1[1] | 0x800000;
2856 w1[1] = w1[1] | 0x80000000;
2864 w1[2] = w1[2] | 0x8000;
2868 w1[2] = w1[2] | 0x800000;
2872 w1[2] = w1[2] | 0x80000000;
2880 w1[3] = w1[3] | 0x8000;
2884 w1[3] = w1[3] | 0x800000;
2888 w1[3] = w1[3] | 0x80000000;
2896 w2[0] = w2[0] | 0x8000;
2900 w2[0] = w2[0] | 0x800000;
2904 w2[0] = w2[0] | 0x80000000;
2912 w2[1] = w2[1] | 0x8000;
2916 w2[1] = w2[1] | 0x800000;
2920 w2[1] = w2[1] | 0x80000000;
2928 w2[2] = w2[2] | 0x8000;
2932 w2[2] = w2[2] | 0x800000;
2936 w2[2] = w2[2] | 0x80000000;
2944 w2[3] = w2[3] | 0x8000;
2948 w2[3] = w2[3] | 0x800000;
2952 w2[3] = w2[3] | 0x80000000;
2957 inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
2966 w0[0] = w0[0] | 0x8000;
2970 w0[0] = w0[0] | 0x800000;
2974 w0[0] = w0[0] | 0x80000000;
2982 w0[1] = w0[1] | 0x8000;
2986 w0[1] = w0[1] | 0x800000;
2990 w0[1] = w0[1] | 0x80000000;
2998 w0[2] = w0[2] | 0x8000;
3002 w0[2] = w0[2] | 0x800000;
3006 w0[2] = w0[2] | 0x80000000;
3014 w0[3] = w0[3] | 0x8000;
3018 w0[3] = w0[3] | 0x800000;
3022 w0[3] = w0[3] | 0x80000000;
3030 w1[0] = w1[0] | 0x8000;
3034 w1[0] = w1[0] | 0x800000;
3038 w1[0] = w1[0] | 0x80000000;
3046 w1[1] = w1[1] | 0x8000;
3050 w1[1] = w1[1] | 0x800000;
3054 w1[1] = w1[1] | 0x80000000;
3062 w1[2] = w1[2] | 0x8000;
3066 w1[2] = w1[2] | 0x800000;
3070 w1[2] = w1[2] | 0x80000000;
3078 w1[3] = w1[3] | 0x8000;
3082 w1[3] = w1[3] | 0x800000;
3086 w1[3] = w1[3] | 0x80000000;
3094 w2[0] = w2[0] | 0x8000;
3098 w2[0] = w2[0] | 0x800000;
3102 w2[0] = w2[0] | 0x80000000;
3110 w2[1] = w2[1] | 0x8000;
3114 w2[1] = w2[1] | 0x800000;
3118 w2[1] = w2[1] | 0x80000000;
3126 w2[2] = w2[2] | 0x8000;
3130 w2[2] = w2[2] | 0x800000;
3134 w2[2] = w2[2] | 0x80000000;
3142 w2[3] = w2[3] | 0x8000;
3146 w2[3] = w2[3] | 0x800000;
3150 w2[3] = w2[3] | 0x80000000;
3158 w3[0] = w3[0] | 0x8000;
3162 w3[0] = w3[0] | 0x800000;
3166 w3[0] = w3[0] | 0x80000000;
3174 w3[1] = w3[1] | 0x8000;
3178 w3[1] = w3[1] | 0x800000;
3182 w3[1] = w3[1] | 0x80000000;
3190 w3[2] = w3[2] | 0x8000;
3194 w3[2] = w3[2] | 0x800000;
3198 w3[2] = w3[2] | 0x80000000;
3206 w3[3] = w3[3] | 0x8000;
3210 w3[3] = w3[3] | 0x800000;
3214 w3[3] = w3[3] | 0x80000000;
3219 inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
3228 w0[0] = w0[0] | 0x8000;
3232 w0[0] = w0[0] | 0x800000;
3236 w0[0] = w0[0] | 0x80000000;
3244 w0[1] = w0[1] | 0x8000;
3248 w0[1] = w0[1] | 0x800000;
3252 w0[1] = w0[1] | 0x80000000;
3260 w0[2] = w0[2] | 0x8000;
3264 w0[2] = w0[2] | 0x800000;
3268 w0[2] = w0[2] | 0x80000000;
3276 w0[3] = w0[3] | 0x8000;
3280 w0[3] = w0[3] | 0x800000;
3284 w0[3] = w0[3] | 0x80000000;
3292 w1[0] = w1[0] | 0x8000;
3296 w1[0] = w1[0] | 0x800000;
3300 w1[0] = w1[0] | 0x80000000;
3308 w1[1] = w1[1] | 0x8000;
3312 w1[1] = w1[1] | 0x800000;
3316 w1[1] = w1[1] | 0x80000000;
3324 w1[2] = w1[2] | 0x8000;
3328 w1[2] = w1[2] | 0x800000;
3332 w1[2] = w1[2] | 0x80000000;
3340 w1[3] = w1[3] | 0x8000;
3344 w1[3] = w1[3] | 0x800000;
3348 w1[3] = w1[3] | 0x80000000;
3356 w2[0] = w2[0] | 0x8000;
3360 w2[0] = w2[0] | 0x800000;
3364 w2[0] = w2[0] | 0x80000000;
3372 w2[1] = w2[1] | 0x8000;
3376 w2[1] = w2[1] | 0x800000;
3380 w2[1] = w2[1] | 0x80000000;
3388 w2[2] = w2[2] | 0x8000;
3392 w2[2] = w2[2] | 0x800000;
3396 w2[2] = w2[2] | 0x80000000;
3404 w2[3] = w2[3] | 0x8000;
3408 w2[3] = w2[3] | 0x800000;
3412 w2[3] = w2[3] | 0x80000000;
3420 w3[0] = w3[0] | 0x8000;
3424 w3[0] = w3[0] | 0x800000;
3428 w3[0] = w3[0] | 0x80000000;
3436 w3[1] = w3[1] | 0x8000;
3440 w3[1] = w3[1] | 0x800000;
3444 w3[1] = w3[1] | 0x80000000;
3452 w3[2] = w3[2] | 0x8000;
3456 w3[2] = w3[2] | 0x800000;
3460 w3[2] = w3[2] | 0x80000000;
3468 w3[3] = w3[3] | 0x8000;
3472 w3[3] = w3[3] | 0x800000;
3476 w3[3] = w3[3] | 0x80000000;
3484 w4[0] = w4[0] | 0x8000;
3488 w4[0] = w4[0] | 0x800000;
3492 w4[0] = w4[0] | 0x80000000;
3500 w4[1] = w4[1] | 0x8000;
3504 w4[1] = w4[1] | 0x800000;
3508 w4[1] = w4[1] | 0x80000000;
3516 w4[2] = w4[2] | 0x8000;
3520 w4[2] = w4[2] | 0x800000;
3524 w4[2] = w4[2] | 0x80000000;
3532 w4[3] = w4[3] | 0x8000;
3536 w4[3] = w4[3] | 0x800000;
3540 w4[3] = w4[3] | 0x80000000;
3548 w5[0] = w5[0] | 0x8000;
3552 w5[0] = w5[0] | 0x800000;
3556 w5[0] = w5[0] | 0x80000000;
3564 w5[1] = w5[1] | 0x8000;
3568 w5[1] = w5[1] | 0x800000;
3572 w5[1] = w5[1] | 0x80000000;
3580 w5[2] = w5[2] | 0x8000;
3584 w5[2] = w5[2] | 0x800000;
3588 w5[2] = w5[2] | 0x80000000;
3596 w5[3] = w5[3] | 0x8000;
3600 w5[3] = w5[3] | 0x800000;
3604 w5[3] = w5[3] | 0x80000000;
3612 w6[0] = w6[0] | 0x8000;
3616 w6[0] = w6[0] | 0x800000;
3620 w6[0] = w6[0] | 0x80000000;
3628 w6[1] = w6[1] | 0x8000;
3632 w6[1] = w6[1] | 0x800000;
3636 w6[1] = w6[1] | 0x80000000;
3644 w6[2] = w6[2] | 0x8000;
3648 w6[2] = w6[2] | 0x800000;
3652 w6[2] = w6[2] | 0x80000000;
3660 w6[3] = w6[3] | 0x8000;
3664 w6[3] = w6[3] | 0x800000;
3668 w6[3] = w6[3] | 0x80000000;
3676 w7[0] = w7[0] | 0x8000;
3680 w7[0] = w7[0] | 0x800000;
3684 w7[0] = w7[0] | 0x80000000;
3692 w7[1] = w7[1] | 0x8000;
3696 w7[1] = w7[1] | 0x800000;
3700 w7[1] = w7[1] | 0x80000000;
3708 w7[2] = w7[2] | 0x8000;
3712 w7[2] = w7[2] | 0x800000;
3716 w7[2] = w7[2] | 0x80000000;
3724 w7[3] = w7[3] | 0x8000;
3728 w7[3] = w7[3] | 0x800000;
3732 w7[3] = w7[3] | 0x80000000;
3737 inline void append_0x80_1x16 (u32x w[16], const u32 offset)
3746 w[ 0] = w[ 0] | 0x8000;
3750 w[ 0] = w[ 0] | 0x800000;
3754 w[ 0] = w[ 0] | 0x80000000;
3762 w[ 1] = w[ 1] | 0x8000;
3766 w[ 1] = w[ 1] | 0x800000;
3770 w[ 1] = w[ 1] | 0x80000000;
3778 w[ 2] = w[ 2] | 0x8000;
3782 w[ 2] = w[ 2] | 0x800000;
3786 w[ 2] = w[ 2] | 0x80000000;
3794 w[ 3] = w[ 3] | 0x8000;
3798 w[ 3] = w[ 3] | 0x800000;
3802 w[ 3] = w[ 3] | 0x80000000;
3810 w[ 4] = w[ 4] | 0x8000;
3814 w[ 4] = w[ 4] | 0x800000;
3818 w[ 4] = w[ 4] | 0x80000000;
3826 w[ 5] = w[ 5] | 0x8000;
3830 w[ 5] = w[ 5] | 0x800000;
3834 w[ 5] = w[ 5] | 0x80000000;
3842 w[ 6] = w[ 6] | 0x8000;
3846 w[ 6] = w[ 6] | 0x800000;
3850 w[ 6] = w[ 6] | 0x80000000;
3858 w[ 7] = w[ 7] | 0x8000;
3862 w[ 7] = w[ 7] | 0x800000;
3866 w[ 7] = w[ 7] | 0x80000000;
3874 w[ 8] = w[ 8] | 0x8000;
3878 w[ 8] = w[ 8] | 0x800000;
3882 w[ 8] = w[ 8] | 0x80000000;
3890 w[ 9] = w[ 9] | 0x8000;
3894 w[ 9] = w[ 9] | 0x800000;
3898 w[ 9] = w[ 9] | 0x80000000;
3906 w[10] = w[10] | 0x8000;
3910 w[10] = w[10] | 0x800000;
3914 w[10] = w[10] | 0x80000000;
3922 w[11] = w[11] | 0x8000;
3926 w[11] = w[11] | 0x800000;
3930 w[11] = w[11] | 0x80000000;
3938 w[12] = w[12] | 0x8000;
3942 w[12] = w[12] | 0x800000;
3946 w[12] = w[12] | 0x80000000;
3954 w[13] = w[13] | 0x8000;
3958 w[13] = w[13] | 0x800000;
3962 w[13] = w[13] | 0x80000000;
3970 w[14] = w[14] | 0x8000;
3974 w[14] = w[14] | 0x800000;
3978 w[14] = w[14] | 0x80000000;
3986 w[15] = w[15] | 0x8000;
3990 w[15] = w[15] | 0x800000;
3994 w[15] = w[15] | 0x80000000;
3999 inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
4001 #if defined IS_AMD || defined IS_GENERIC
4002 const int offset_mod_4 = offset & 3;
4004 const int offset_minus_4 = 4 - offset;
4009 w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4);
4010 w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4);
4011 w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4012 w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4013 w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4014 w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4015 w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4016 w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4017 w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4018 w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4019 w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4020 w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4021 w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4022 w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4023 w0[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4025 if (offset_mod_4 == 0)
4047 w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4);
4048 w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4049 w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4050 w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4051 w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4052 w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4053 w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4054 w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4055 w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4056 w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4057 w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4058 w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4059 w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4060 w0[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4063 if (offset_mod_4 == 0)
4084 w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4);
4085 w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4086 w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4087 w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4088 w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4089 w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4090 w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4091 w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4092 w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4093 w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4094 w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4095 w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4096 w0[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4100 if (offset_mod_4 == 0)
4120 w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4);
4121 w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4122 w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4123 w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4124 w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4125 w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4126 w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4127 w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4128 w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4129 w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4130 w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4131 w0[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4136 if (offset_mod_4 == 0)
4155 w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4);
4156 w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4157 w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4158 w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4159 w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4160 w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4161 w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4162 w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4163 w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4164 w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4165 w1[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4171 if (offset_mod_4 == 0)
4189 w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4);
4190 w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4191 w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4192 w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4193 w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4194 w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4195 w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4196 w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4197 w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4198 w1[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4205 if (offset_mod_4 == 0)
4222 w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4);
4223 w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4224 w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4225 w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4226 w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4227 w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4228 w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4229 w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4230 w1[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4238 if (offset_mod_4 == 0)
4254 w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4);
4255 w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4256 w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4257 w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4258 w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4259 w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4260 w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4261 w1[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4270 if (offset_mod_4 == 0)
4285 w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4);
4286 w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4287 w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4288 w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4289 w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4290 w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4291 w2[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4301 if (offset_mod_4 == 0)
4315 w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4);
4316 w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4317 w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4318 w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4319 w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4320 w2[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4331 if (offset_mod_4 == 0)
4344 w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4);
4345 w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4346 w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4347 w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4348 w2[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4360 if (offset_mod_4 == 0)
4372 w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4);
4373 w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4374 w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4375 w2[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4388 if (offset_mod_4 == 0)
4399 w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4);
4400 w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4401 w3[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4415 if (offset_mod_4 == 0)
4425 w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4);
4426 w3[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4441 if (offset_mod_4 == 0)
4452 const int offset_minus_4 = 4 - (offset % 4);
4454 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
4459 w3[1] = __byte_perm (w3[0], w3[1], selector);
4460 w3[0] = __byte_perm (w2[3], w3[0], selector);
4461 w2[3] = __byte_perm (w2[2], w2[3], selector);
4462 w2[2] = __byte_perm (w2[1], w2[2], selector);
4463 w2[1] = __byte_perm (w2[0], w2[1], selector);
4464 w2[0] = __byte_perm (w1[3], w2[0], selector);
4465 w1[3] = __byte_perm (w1[2], w1[3], selector);
4466 w1[2] = __byte_perm (w1[1], w1[2], selector);
4467 w1[1] = __byte_perm (w1[0], w1[1], selector);
4468 w1[0] = __byte_perm (w0[3], w1[0], selector);
4469 w0[3] = __byte_perm (w0[2], w0[3], selector);
4470 w0[2] = __byte_perm (w0[1], w0[2], selector);
4471 w0[1] = __byte_perm (w0[0], w0[1], selector);
4472 w0[0] = __byte_perm ( 0, w0[0], selector);
4477 w3[1] = __byte_perm (w2[3], w3[0], selector);
4478 w3[0] = __byte_perm (w2[2], w2[3], selector);
4479 w2[3] = __byte_perm (w2[1], w2[2], selector);
4480 w2[2] = __byte_perm (w2[0], w2[1], selector);
4481 w2[1] = __byte_perm (w1[3], w2[0], selector);
4482 w2[0] = __byte_perm (w1[2], w1[3], selector);
4483 w1[3] = __byte_perm (w1[1], w1[2], selector);
4484 w1[2] = __byte_perm (w1[0], w1[1], selector);
4485 w1[1] = __byte_perm (w0[3], w1[0], selector);
4486 w1[0] = __byte_perm (w0[2], w0[3], selector);
4487 w0[3] = __byte_perm (w0[1], w0[2], selector);
4488 w0[2] = __byte_perm (w0[0], w0[1], selector);
4489 w0[1] = __byte_perm ( 0, w0[0], selector);
4495 w3[1] = __byte_perm (w2[2], w2[3], selector);
4496 w3[0] = __byte_perm (w2[1], w2[2], selector);
4497 w2[3] = __byte_perm (w2[0], w2[1], selector);
4498 w2[2] = __byte_perm (w1[3], w2[0], selector);
4499 w2[1] = __byte_perm (w1[2], w1[3], selector);
4500 w2[0] = __byte_perm (w1[1], w1[2], selector);
4501 w1[3] = __byte_perm (w1[0], w1[1], selector);
4502 w1[2] = __byte_perm (w0[3], w1[0], selector);
4503 w1[1] = __byte_perm (w0[2], w0[3], selector);
4504 w1[0] = __byte_perm (w0[1], w0[2], selector);
4505 w0[3] = __byte_perm (w0[0], w0[1], selector);
4506 w0[2] = __byte_perm ( 0, w0[0], selector);
4513 w3[1] = __byte_perm (w2[1], w2[2], selector);
4514 w3[0] = __byte_perm (w2[0], w2[1], selector);
4515 w2[3] = __byte_perm (w1[3], w2[0], selector);
4516 w2[2] = __byte_perm (w1[2], w1[3], selector);
4517 w2[1] = __byte_perm (w1[1], w1[2], selector);
4518 w2[0] = __byte_perm (w1[0], w1[1], selector);
4519 w1[3] = __byte_perm (w0[3], w1[0], selector);
4520 w1[2] = __byte_perm (w0[2], w0[3], selector);
4521 w1[1] = __byte_perm (w0[1], w0[2], selector);
4522 w1[0] = __byte_perm (w0[0], w0[1], selector);
4523 w0[3] = __byte_perm ( 0, w0[0], selector);
4531 w3[1] = __byte_perm (w2[0], w2[1], selector);
4532 w3[0] = __byte_perm (w1[3], w2[0], selector);
4533 w2[3] = __byte_perm (w1[2], w1[3], selector);
4534 w2[2] = __byte_perm (w1[1], w1[2], selector);
4535 w2[1] = __byte_perm (w1[0], w1[1], selector);
4536 w2[0] = __byte_perm (w0[3], w1[0], selector);
4537 w1[3] = __byte_perm (w0[2], w0[3], selector);
4538 w1[2] = __byte_perm (w0[1], w0[2], selector);
4539 w1[1] = __byte_perm (w0[0], w0[1], selector);
4540 w1[0] = __byte_perm ( 0, w0[0], selector);
4549 w3[1] = __byte_perm (w1[3], w2[0], selector);
4550 w3[0] = __byte_perm (w1[2], w1[3], selector);
4551 w2[3] = __byte_perm (w1[1], w1[2], selector);
4552 w2[2] = __byte_perm (w1[0], w1[1], selector);
4553 w2[1] = __byte_perm (w0[3], w1[0], selector);
4554 w2[0] = __byte_perm (w0[2], w0[3], selector);
4555 w1[3] = __byte_perm (w0[1], w0[2], selector);
4556 w1[2] = __byte_perm (w0[0], w0[1], selector);
4557 w1[1] = __byte_perm ( 0, w0[0], selector);
4567 w3[1] = __byte_perm (w1[2], w1[3], selector);
4568 w3[0] = __byte_perm (w1[1], w1[2], selector);
4569 w2[3] = __byte_perm (w1[0], w1[1], selector);
4570 w2[2] = __byte_perm (w0[3], w1[0], selector);
4571 w2[1] = __byte_perm (w0[2], w0[3], selector);
4572 w2[0] = __byte_perm (w0[1], w0[2], selector);
4573 w1[3] = __byte_perm (w0[0], w0[1], selector);
4574 w1[2] = __byte_perm ( 0, w0[0], selector);
4585 w3[1] = __byte_perm (w1[1], w1[2], selector);
4586 w3[0] = __byte_perm (w1[0], w1[1], selector);
4587 w2[3] = __byte_perm (w0[3], w1[0], selector);
4588 w2[2] = __byte_perm (w0[2], w0[3], selector);
4589 w2[1] = __byte_perm (w0[1], w0[2], selector);
4590 w2[0] = __byte_perm (w0[0], w0[1], selector);
4591 w1[3] = __byte_perm ( 0, w0[0], selector);
4603 w3[1] = __byte_perm (w1[0], w1[1], selector);
4604 w3[0] = __byte_perm (w0[3], w1[0], selector);
4605 w2[3] = __byte_perm (w0[2], w0[3], selector);
4606 w2[2] = __byte_perm (w0[1], w0[2], selector);
4607 w2[1] = __byte_perm (w0[0], w0[1], selector);
4608 w2[0] = __byte_perm ( 0, w0[0], selector);
4621 w3[1] = __byte_perm (w0[3], w1[0], selector);
4622 w3[0] = __byte_perm (w0[2], w0[3], selector);
4623 w2[3] = __byte_perm (w0[1], w0[2], selector);
4624 w2[2] = __byte_perm (w0[0], w0[1], selector);
4625 w2[1] = __byte_perm ( 0, w0[0], selector);
4639 w3[1] = __byte_perm (w0[2], w0[3], selector);
4640 w3[0] = __byte_perm (w0[1], w0[2], selector);
4641 w2[3] = __byte_perm (w0[0], w0[1], selector);
4642 w2[2] = __byte_perm ( 0, w0[0], selector);
4657 w3[1] = __byte_perm (w0[1], w0[2], selector);
4658 w3[0] = __byte_perm (w0[0], w0[1], selector);
4659 w2[3] = __byte_perm ( 0, w0[0], selector);
4675 w3[1] = __byte_perm (w0[0], w0[1], selector);
4676 w3[0] = __byte_perm ( 0, w0[0], selector);
4693 w3[1] = __byte_perm ( 0, w0[0], selector);
4713 inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
4715 #if defined IS_AMD || defined IS_GENERIC
4719 w3[2] = amd_bytealign (w3[1], 0, offset);
4720 w3[1] = amd_bytealign (w3[0], w3[1], offset);
4721 w3[0] = amd_bytealign (w2[3], w3[0], offset);
4722 w2[3] = amd_bytealign (w2[2], w2[3], offset);
4723 w2[2] = amd_bytealign (w2[1], w2[2], offset);
4724 w2[1] = amd_bytealign (w2[0], w2[1], offset);
4725 w2[0] = amd_bytealign (w1[3], w2[0], offset);
4726 w1[3] = amd_bytealign (w1[2], w1[3], offset);
4727 w1[2] = amd_bytealign (w1[1], w1[2], offset);
4728 w1[1] = amd_bytealign (w1[0], w1[1], offset);
4729 w1[0] = amd_bytealign (w0[3], w1[0], offset);
4730 w0[3] = amd_bytealign (w0[2], w0[3], offset);
4731 w0[2] = amd_bytealign (w0[1], w0[2], offset);
4732 w0[1] = amd_bytealign (w0[0], w0[1], offset);
4733 w0[0] = amd_bytealign ( 0, w0[0], offset);
4737 w3[2] = amd_bytealign (w3[0], 0, offset);
4738 w3[1] = amd_bytealign (w2[3], w3[0], offset);
4739 w3[0] = amd_bytealign (w2[2], w2[3], offset);
4740 w2[3] = amd_bytealign (w2[1], w2[2], offset);
4741 w2[2] = amd_bytealign (w2[0], w2[1], offset);
4742 w2[1] = amd_bytealign (w1[3], w2[0], offset);
4743 w2[0] = amd_bytealign (w1[2], w1[3], offset);
4744 w1[3] = amd_bytealign (w1[1], w1[2], offset);
4745 w1[2] = amd_bytealign (w1[0], w1[1], offset);
4746 w1[1] = amd_bytealign (w0[3], w1[0], offset);
4747 w1[0] = amd_bytealign (w0[2], w0[3], offset);
4748 w0[3] = amd_bytealign (w0[1], w0[2], offset);
4749 w0[2] = amd_bytealign (w0[0], w0[1], offset);
4750 w0[1] = amd_bytealign ( 0, w0[0], offset);
4755 w3[2] = amd_bytealign (w2[3], 0, offset);
4756 w3[1] = amd_bytealign (w2[2], w2[3], offset);
4757 w3[0] = amd_bytealign (w2[1], w2[2], offset);
4758 w2[3] = amd_bytealign (w2[0], w2[1], offset);
4759 w2[2] = amd_bytealign (w1[3], w2[0], offset);
4760 w2[1] = amd_bytealign (w1[2], w1[3], offset);
4761 w2[0] = amd_bytealign (w1[1], w1[2], offset);
4762 w1[3] = amd_bytealign (w1[0], w1[1], offset);
4763 w1[2] = amd_bytealign (w0[3], w1[0], offset);
4764 w1[1] = amd_bytealign (w0[2], w0[3], offset);
4765 w1[0] = amd_bytealign (w0[1], w0[2], offset);
4766 w0[3] = amd_bytealign (w0[0], w0[1], offset);
4767 w0[2] = amd_bytealign ( 0, w0[0], offset);
4773 w3[2] = amd_bytealign (w2[2], 0, offset);
4774 w3[1] = amd_bytealign (w2[1], w2[2], offset);
4775 w3[0] = amd_bytealign (w2[0], w2[1], offset);
4776 w2[3] = amd_bytealign (w1[3], w2[0], offset);
4777 w2[2] = amd_bytealign (w1[2], w1[3], offset);
4778 w2[1] = amd_bytealign (w1[1], w1[2], offset);
4779 w2[0] = amd_bytealign (w1[0], w1[1], offset);
4780 w1[3] = amd_bytealign (w0[3], w1[0], offset);
4781 w1[2] = amd_bytealign (w0[2], w0[3], offset);
4782 w1[1] = amd_bytealign (w0[1], w0[2], offset);
4783 w1[0] = amd_bytealign (w0[0], w0[1], offset);
4784 w0[3] = amd_bytealign ( 0, w0[0], offset);
4791 w3[2] = amd_bytealign (w2[1], 0, offset);
4792 w3[1] = amd_bytealign (w2[0], w2[1], offset);
4793 w3[0] = amd_bytealign (w1[3], w2[0], offset);
4794 w2[3] = amd_bytealign (w1[2], w1[3], offset);
4795 w2[2] = amd_bytealign (w1[1], w1[2], offset);
4796 w2[1] = amd_bytealign (w1[0], w1[1], offset);
4797 w2[0] = amd_bytealign (w0[3], w1[0], offset);
4798 w1[3] = amd_bytealign (w0[2], w0[3], offset);
4799 w1[2] = amd_bytealign (w0[1], w0[2], offset);
4800 w1[1] = amd_bytealign (w0[0], w0[1], offset);
4801 w1[0] = amd_bytealign ( 0, w0[0], offset);
4809 w3[2] = amd_bytealign (w2[0], 0, offset);
4810 w3[1] = amd_bytealign (w1[3], w2[0], offset);
4811 w3[0] = amd_bytealign (w1[2], w1[3], offset);
4812 w2[3] = amd_bytealign (w1[1], w1[2], offset);
4813 w2[2] = amd_bytealign (w1[0], w1[1], offset);
4814 w2[1] = amd_bytealign (w0[3], w1[0], offset);
4815 w2[0] = amd_bytealign (w0[2], w0[3], offset);
4816 w1[3] = amd_bytealign (w0[1], w0[2], offset);
4817 w1[2] = amd_bytealign (w0[0], w0[1], offset);
4818 w1[1] = amd_bytealign ( 0, w0[0], offset);
4827 w3[2] = amd_bytealign (w1[3], 0, offset);
4828 w3[1] = amd_bytealign (w1[2], w1[3], offset);
4829 w3[0] = amd_bytealign (w1[1], w1[2], offset);
4830 w2[3] = amd_bytealign (w1[0], w1[1], offset);
4831 w2[2] = amd_bytealign (w0[3], w1[0], offset);
4832 w2[1] = amd_bytealign (w0[2], w0[3], offset);
4833 w2[0] = amd_bytealign (w0[1], w0[2], offset);
4834 w1[3] = amd_bytealign (w0[0], w0[1], offset);
4835 w1[2] = amd_bytealign ( 0, w0[0], offset);
4845 w3[2] = amd_bytealign (w1[2], 0, offset);
4846 w3[1] = amd_bytealign (w1[1], w1[2], offset);
4847 w3[0] = amd_bytealign (w1[0], w1[1], offset);
4848 w2[3] = amd_bytealign (w0[3], w1[0], offset);
4849 w2[2] = amd_bytealign (w0[2], w0[3], offset);
4850 w2[1] = amd_bytealign (w0[1], w0[2], offset);
4851 w2[0] = amd_bytealign (w0[0], w0[1], offset);
4852 w1[3] = amd_bytealign ( 0, w0[0], offset);
4863 w3[2] = amd_bytealign (w1[1], 0, offset);
4864 w3[1] = amd_bytealign (w1[0], w1[1], offset);
4865 w3[0] = amd_bytealign (w0[3], w1[0], offset);
4866 w2[3] = amd_bytealign (w0[2], w0[3], offset);
4867 w2[2] = amd_bytealign (w0[1], w0[2], offset);
4868 w2[1] = amd_bytealign (w0[0], w0[1], offset);
4869 w2[0] = amd_bytealign ( 0, w0[0], offset);
4881 w3[2] = amd_bytealign (w1[0], 0, offset);
4882 w3[1] = amd_bytealign (w0[3], w1[0], offset);
4883 w3[0] = amd_bytealign (w0[2], w0[3], offset);
4884 w2[3] = amd_bytealign (w0[1], w0[2], offset);
4885 w2[2] = amd_bytealign (w0[0], w0[1], offset);
4886 w2[1] = amd_bytealign ( 0, w0[0], offset);
4899 w3[2] = amd_bytealign (w0[3], 0, offset);
4900 w3[1] = amd_bytealign (w0[2], w0[3], offset);
4901 w3[0] = amd_bytealign (w0[1], w0[2], offset);
4902 w2[3] = amd_bytealign (w0[0], w0[1], offset);
4903 w2[2] = amd_bytealign ( 0, w0[0], offset);
4917 w3[2] = amd_bytealign (w0[2], 0, offset);
4918 w3[1] = amd_bytealign (w0[1], w0[2], offset);
4919 w3[0] = amd_bytealign (w0[0], w0[1], offset);
4920 w2[3] = amd_bytealign ( 0, w0[0], offset);
4935 w3[2] = amd_bytealign (w0[1], 0, offset);
4936 w3[1] = amd_bytealign (w0[0], w0[1], offset);
4937 w3[0] = amd_bytealign ( 0, w0[0], offset);
4953 w3[2] = amd_bytealign (w0[0], 0, offset);
4954 w3[1] = amd_bytealign ( 0, w0[0], offset);
4973 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
4978 w3[1] = __byte_perm (w3[1], w3[0], selector);
4979 w3[0] = __byte_perm (w3[0], w2[3], selector);
4980 w2[3] = __byte_perm (w2[3], w2[2], selector);
4981 w2[2] = __byte_perm (w2[2], w2[1], selector);
4982 w2[1] = __byte_perm (w2[1], w2[0], selector);
4983 w2[0] = __byte_perm (w2[0], w1[3], selector);
4984 w1[3] = __byte_perm (w1[3], w1[2], selector);
4985 w1[2] = __byte_perm (w1[2], w1[1], selector);
4986 w1[1] = __byte_perm (w1[1], w1[0], selector);
4987 w1[0] = __byte_perm (w1[0], w0[3], selector);
4988 w0[3] = __byte_perm (w0[3], w0[2], selector);
4989 w0[2] = __byte_perm (w0[2], w0[1], selector);
4990 w0[1] = __byte_perm (w0[1], w0[0], selector);
4991 w0[0] = __byte_perm (w0[0], 0, selector);
4995 w3[1] = __byte_perm (w3[0], w2[3], selector);
4996 w3[0] = __byte_perm (w2[3], w2[2], selector);
4997 w2[3] = __byte_perm (w2[2], w2[1], selector);
4998 w2[2] = __byte_perm (w2[1], w2[0], selector);
4999 w2[1] = __byte_perm (w2[0], w1[3], selector);
5000 w2[0] = __byte_perm (w1[3], w1[2], selector);
5001 w1[3] = __byte_perm (w1[2], w1[1], selector);
5002 w1[2] = __byte_perm (w1[1], w1[0], selector);
5003 w1[1] = __byte_perm (w1[0], w0[3], selector);
5004 w1[0] = __byte_perm (w0[3], w0[2], selector);
5005 w0[3] = __byte_perm (w0[2], w0[1], selector);
5006 w0[2] = __byte_perm (w0[1], w0[0], selector);
5007 w0[1] = __byte_perm (w0[0], 0, selector);
5012 w3[1] = __byte_perm (w2[3], w2[2], selector);
5013 w3[0] = __byte_perm (w2[2], w2[1], selector);
5014 w2[3] = __byte_perm (w2[1], w2[0], selector);
5015 w2[2] = __byte_perm (w2[0], w1[3], selector);
5016 w2[1] = __byte_perm (w1[3], w1[2], selector);
5017 w2[0] = __byte_perm (w1[2], w1[1], selector);
5018 w1[3] = __byte_perm (w1[1], w1[0], selector);
5019 w1[2] = __byte_perm (w1[0], w0[3], selector);
5020 w1[1] = __byte_perm (w0[3], w0[2], selector);
5021 w1[0] = __byte_perm (w0[2], w0[1], selector);
5022 w0[3] = __byte_perm (w0[1], w0[0], selector);
5023 w0[2] = __byte_perm (w0[0], 0, selector);
5029 w3[1] = __byte_perm (w2[2], w2[1], selector);
5030 w3[0] = __byte_perm (w2[1], w2[0], selector);
5031 w2[3] = __byte_perm (w2[0], w1[3], selector);
5032 w2[2] = __byte_perm (w1[3], w1[2], selector);
5033 w2[1] = __byte_perm (w1[2], w1[1], selector);
5034 w2[0] = __byte_perm (w1[1], w1[0], selector);
5035 w1[3] = __byte_perm (w1[0], w0[3], selector);
5036 w1[2] = __byte_perm (w0[3], w0[2], selector);
5037 w1[1] = __byte_perm (w0[2], w0[1], selector);
5038 w1[0] = __byte_perm (w0[1], w0[0], selector);
5039 w0[3] = __byte_perm (w0[0], 0, selector);
5046 w3[1] = __byte_perm (w2[1], w2[0], selector);
5047 w3[0] = __byte_perm (w2[0], w1[3], selector);
5048 w2[3] = __byte_perm (w1[3], w1[2], selector);
5049 w2[2] = __byte_perm (w1[2], w1[1], selector);
5050 w2[1] = __byte_perm (w1[1], w1[0], selector);
5051 w2[0] = __byte_perm (w1[0], w0[3], selector);
5052 w1[3] = __byte_perm (w0[3], w0[2], selector);
5053 w1[2] = __byte_perm (w0[2], w0[1], selector);
5054 w1[1] = __byte_perm (w0[1], w0[0], selector);
5055 w1[0] = __byte_perm (w0[0], 0, selector);
5063 w3[1] = __byte_perm (w2[0], w1[3], selector);
5064 w3[0] = __byte_perm (w1[3], w1[2], selector);
5065 w2[3] = __byte_perm (w1[2], w1[1], selector);
5066 w2[2] = __byte_perm (w1[1], w1[0], selector);
5067 w2[1] = __byte_perm (w1[0], w0[3], selector);
5068 w2[0] = __byte_perm (w0[3], w0[2], selector);
5069 w1[3] = __byte_perm (w0[2], w0[1], selector);
5070 w1[2] = __byte_perm (w0[1], w0[0], selector);
5071 w1[1] = __byte_perm (w0[0], 0, selector);
5080 w3[1] = __byte_perm (w1[3], w1[2], selector);
5081 w3[0] = __byte_perm (w1[2], w1[1], selector);
5082 w2[3] = __byte_perm (w1[1], w1[0], selector);
5083 w2[2] = __byte_perm (w1[0], w0[3], selector);
5084 w2[1] = __byte_perm (w0[3], w0[2], selector);
5085 w2[0] = __byte_perm (w0[2], w0[1], selector);
5086 w1[3] = __byte_perm (w0[1], w0[0], selector);
5087 w1[2] = __byte_perm (w0[0], 0, selector);
5097 w3[1] = __byte_perm (w1[2], w1[1], selector);
5098 w3[0] = __byte_perm (w1[1], w1[0], selector);
5099 w2[3] = __byte_perm (w1[0], w0[3], selector);
5100 w2[2] = __byte_perm (w0[3], w0[2], selector);
5101 w2[1] = __byte_perm (w0[2], w0[1], selector);
5102 w2[0] = __byte_perm (w0[1], w0[0], selector);
5103 w1[3] = __byte_perm (w0[0], 0, selector);
5114 w3[1] = __byte_perm (w1[1], w1[0], selector);
5115 w3[0] = __byte_perm (w1[0], w0[3], selector);
5116 w2[3] = __byte_perm (w0[3], w0[2], selector);
5117 w2[2] = __byte_perm (w0[2], w0[1], selector);
5118 w2[1] = __byte_perm (w0[1], w0[0], selector);
5119 w2[0] = __byte_perm (w0[0], 0, selector);
5131 w3[1] = __byte_perm (w1[0], w0[3], selector);
5132 w3[0] = __byte_perm (w0[3], w0[2], selector);
5133 w2[3] = __byte_perm (w0[2], w0[1], selector);
5134 w2[2] = __byte_perm (w0[1], w0[0], selector);
5135 w2[1] = __byte_perm (w0[0], 0, selector);
5148 w3[1] = __byte_perm (w0[3], w0[2], selector);
5149 w3[0] = __byte_perm (w0[2], w0[1], selector);
5150 w2[3] = __byte_perm (w0[1], w0[0], selector);
5151 w2[2] = __byte_perm (w0[0], 0, selector);
5165 w3[1] = __byte_perm (w0[2], w0[1], selector);
5166 w3[0] = __byte_perm (w0[1], w0[0], selector);
5167 w2[3] = __byte_perm (w0[0], 0, selector);
5182 w3[1] = __byte_perm (w0[1], w0[0], selector);
5183 w3[0] = __byte_perm (w0[0], 0, selector);
5199 w3[1] = __byte_perm (w0[0], 0, selector);
5218 inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len)
5220 #if defined cl_amd_media_ops
5225 case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3);
5226 sw[1] = amd_bytealign (sw[1] >> 8, w0, 3);
5228 case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2);
5229 sw[1] = amd_bytealign (sw[1] >> 16, w0, 2);
5231 case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1);
5232 sw[1] = amd_bytealign (sw[1] >> 24, w0, 1);
5236 case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3);
5237 sw[2] = amd_bytealign (sw[2] >> 8, w0, 3);
5239 case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2);
5240 sw[2] = amd_bytealign (sw[2] >> 16, w0, 2);
5242 case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1);
5243 sw[2] = amd_bytealign (sw[2] >> 24, w0, 1);
5247 case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3);
5248 sw[3] = amd_bytealign (sw[3] >> 8, w0, 3);
5250 case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2);
5251 sw[3] = amd_bytealign (sw[3] >> 16, w0, 2);
5253 case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1);
5254 sw[3] = amd_bytealign (sw[3] >> 24, w0, 1);
5256 case 12: sw[3] = w0;
5258 case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3);
5259 sw[4] = amd_bytealign (sw[4] >> 8, w0, 3);
5261 case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2);
5262 sw[4] = amd_bytealign (sw[4] >> 16, w0, 2);
5264 case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1);
5265 sw[4] = amd_bytealign (sw[4] >> 24, w0, 1);
5267 case 16: sw[4] = w0;
5269 case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3);
5270 sw[5] = amd_bytealign (sw[5] >> 8, w0, 3);
5272 case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2);
5273 sw[5] = amd_bytealign (sw[5] >> 16, w0, 2);
5275 case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1);
5276 sw[5] = amd_bytealign (sw[5] >> 24, w0, 1);
5278 case 20: sw[5] = w0;
5280 case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3);
5281 sw[6] = amd_bytealign (sw[6] >> 8, w0, 3);
5283 case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2);
5284 sw[6] = amd_bytealign (sw[6] >> 16, w0, 2);
5286 case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1);
5287 sw[6] = amd_bytealign (sw[6] >> 24, w0, 1);
5289 case 24: sw[6] = w0;
5291 case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3);
5292 sw[7] = amd_bytealign (sw[7] >> 8, w0, 3);
5294 case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2);
5295 sw[7] = amd_bytealign (sw[7] >> 16, w0, 2);
5297 case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1);
5298 sw[7] = amd_bytealign (sw[7] >> 24, w0, 1);
5300 case 28: sw[7] = w0;
5302 case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3);
5303 sw[8] = amd_bytealign (sw[8] >> 8, w0, 3);
5305 case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2);
5306 sw[8] = amd_bytealign (sw[8] >> 16, w0, 2);
5308 case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1);
5309 sw[8] = amd_bytealign (sw[8] >> 24, w0, 1);
5317 case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8);
5318 sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5320 case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16);
5321 sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5323 case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24);
5324 sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5328 case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5329 sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5331 case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5332 sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5334 case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5335 sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5339 case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5340 sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5342 case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5343 sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5345 case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5346 sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5348 case 12: sw[3] = w0;
5350 case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5351 sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5353 case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5354 sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5356 case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5357 sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5359 case 16: sw[4] = w0;
5361 case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5362 sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5364 case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5365 sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5367 case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5368 sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5370 case 20: sw[5] = w0;
5372 case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5373 sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5375 case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5376 sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5378 case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5379 sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5381 case 24: sw[6] = w0;
5383 case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5384 sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5386 case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5387 sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5389 case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5390 sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5392 case 28: sw[7] = w0;
5394 case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5395 sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24);
5397 case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5398 sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16);
5400 case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5401 sw[8] = (sw[8] & 0xff000000) | (w0 >> 8);
5407 inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len)
5409 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5415 case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8);
5416 sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5418 case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16);
5419 sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5421 case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24);
5422 sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5426 case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5427 sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5429 case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5430 sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5432 case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5433 sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5437 case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5438 sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5440 case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5441 sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5443 case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5444 sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5446 case 12: sw[3] = w0;
5448 case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5449 sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5451 case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5452 sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5454 case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5455 sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5457 case 16: sw[4] = w0;
5459 case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5460 sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5462 case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5463 sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5465 case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5466 sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5468 case 20: sw[5] = w0;
5470 case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5471 sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5473 case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5474 sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5476 case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5477 sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5479 case 24: sw[6] = w0;
5481 case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5482 sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5484 case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5485 sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5487 case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5488 sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5490 case 28: sw[7] = w0;
5492 case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5493 sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24);
5495 case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5496 sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16);
5498 case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5499 sw[8] = (sw[8] & 0x000000ff) | (w0 << 8);
5504 inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5506 #if defined cl_amd_media_ops
5511 case 1: w0[0] = amd_bytealign (wx, w0[0] << 24, 3);
5512 w0[1] = amd_bytealign (w0[1] >> 8, wx, 3);
5514 case 2: w0[0] = amd_bytealign (wx, w0[0] << 16, 2);
5515 w0[1] = amd_bytealign (w0[1] >> 16, wx, 2);
5517 case 3: w0[0] = amd_bytealign (wx, w0[0] << 8, 1);
5518 w0[1] = amd_bytealign (w0[1] >> 24, wx, 1);
5522 case 5: w0[1] = amd_bytealign (wx, w0[1] << 24, 3);
5523 w0[2] = amd_bytealign (w0[2] >> 8, wx, 3);
5525 case 6: w0[1] = amd_bytealign (wx, w0[1] << 16, 2);
5526 w0[2] = amd_bytealign (w0[2] >> 16, wx, 2);
5528 case 7: w0[1] = amd_bytealign (wx, w0[1] << 8, 1);
5529 w0[2] = amd_bytealign (w0[2] >> 24, wx, 1);
5533 case 9: w0[2] = amd_bytealign (wx, w0[2] << 24, 3);
5534 w0[3] = amd_bytealign (w0[3] >> 8, wx, 3);
5536 case 10: w0[2] = amd_bytealign (wx, w0[2] << 16, 2);
5537 w0[3] = amd_bytealign (w0[3] >> 16, wx, 2);
5539 case 11: w0[2] = amd_bytealign (wx, w0[2] << 8, 1);
5540 w0[3] = amd_bytealign (w0[3] >> 24, wx, 1);
5542 case 12: w0[3] = wx;
5544 case 13: w0[3] = amd_bytealign (wx, w0[3] << 24, 3);
5545 w1[0] = amd_bytealign (w1[0] >> 8, wx, 3);
5547 case 14: w0[3] = amd_bytealign (wx, w0[3] << 16, 2);
5548 w1[0] = amd_bytealign (w1[0] >> 16, wx, 2);
5550 case 15: w0[3] = amd_bytealign (wx, w0[3] << 8, 1);
5551 w1[0] = amd_bytealign (w1[0] >> 24, wx, 1);
5553 case 16: w1[0] = wx;
5555 case 17: w1[0] = amd_bytealign (wx, w1[0] << 24, 3);
5556 w1[1] = amd_bytealign (w1[1] >> 8, wx, 3);
5558 case 18: w1[0] = amd_bytealign (wx, w1[0] << 16, 2);
5559 w1[1] = amd_bytealign (w1[1] >> 16, wx, 2);
5561 case 19: w1[0] = amd_bytealign (wx, w1[0] << 8, 1);
5562 w1[1] = amd_bytealign (w1[1] >> 24, wx, 1);
5564 case 20: w1[1] = wx;
5566 case 21: w1[1] = amd_bytealign (wx, w1[1] << 24, 3);
5567 w1[2] = amd_bytealign (w1[2] >> 8, wx, 3);
5569 case 22: w1[1] = amd_bytealign (wx, w1[1] << 16, 2);
5570 w1[2] = amd_bytealign (w1[2] >> 16, wx, 2);
5572 case 23: w1[1] = amd_bytealign (wx, w1[1] << 8, 1);
5573 w1[2] = amd_bytealign (w1[2] >> 24, wx, 1);
5575 case 24: w1[2] = wx;
5577 case 25: w1[2] = amd_bytealign (wx, w1[2] << 24, 3);
5578 w1[3] = amd_bytealign (w1[3] >> 8, wx, 3);
5580 case 26: w1[2] = amd_bytealign (wx, w1[2] << 16, 2);
5581 w1[3] = amd_bytealign (w1[3] >> 16, wx, 2);
5583 case 27: w1[2] = amd_bytealign (wx, w1[2] << 8, 1);
5584 w1[3] = amd_bytealign (w1[3] >> 24, wx, 1);
5586 case 28: w1[3] = wx;
5588 case 29: w1[3] = amd_bytealign (wx, w1[3] << 24, 3);
5589 w2[0] = amd_bytealign (w2[0] >> 8, wx, 3);
5591 case 30: w1[3] = amd_bytealign (wx, w1[3] << 16, 2);
5592 w2[0] = amd_bytealign (w2[0] >> 16, wx, 2);
5594 case 31: w1[3] = amd_bytealign (wx, w1[3] << 8, 1);
5595 w2[0] = amd_bytealign (w2[0] >> 24, wx, 1);
5597 case 32: w2[0] = wx;
5599 case 33: w2[0] = amd_bytealign (wx, w2[0] << 24, 3);
5600 w2[1] = amd_bytealign (w2[1] >> 8, wx, 3);
5602 case 34: w2[0] = amd_bytealign (wx, w2[0] << 16, 2);
5603 w2[1] = amd_bytealign (w2[1] >> 16, wx, 2);
5605 case 35: w2[0] = amd_bytealign (wx, w2[0] << 8, 1);
5606 w2[1] = amd_bytealign (w2[1] >> 24, wx, 1);
5608 case 36: w2[1] = wx;
5610 case 37: w2[1] = amd_bytealign (wx, w2[1] << 24, 3);
5611 w2[2] = amd_bytealign (w2[2] >> 8, wx, 3);
5613 case 38: w2[1] = amd_bytealign (wx, w2[1] << 16, 2);
5614 w2[2] = amd_bytealign (w2[2] >> 16, wx, 2);
5616 case 39: w2[1] = amd_bytealign (wx, w2[1] << 8, 1);
5617 w2[2] = amd_bytealign (w2[2] >> 24, wx, 1);
5619 case 40: w2[2] = wx;
5621 case 41: w2[2] = amd_bytealign (wx, w2[2] << 24, 3);
5622 w2[3] = amd_bytealign (w2[3] >> 8, wx, 3);
5624 case 42: w2[2] = amd_bytealign (wx, w2[2] << 16, 2);
5625 w2[3] = amd_bytealign (w2[3] >> 16, wx, 2);
5627 case 43: w2[2] = amd_bytealign (wx, w2[2] << 8, 1);
5628 w2[3] = amd_bytealign (w2[3] >> 24, wx, 1);
5630 case 44: w2[3] = wx;
5632 case 45: w2[3] = amd_bytealign (wx, w2[3] << 24, 3);
5633 w3[0] = amd_bytealign (w3[0] >> 8, wx, 3);
5635 case 46: w2[3] = amd_bytealign (wx, w2[3] << 16, 2);
5636 w3[0] = amd_bytealign (w3[0] >> 16, wx, 2);
5638 case 47: w2[3] = amd_bytealign (wx, w2[3] << 8, 1);
5639 w3[0] = amd_bytealign (w3[0] >> 24, wx, 1);
5641 case 48: w3[0] = wx;
5643 case 49: w3[0] = amd_bytealign (wx, w3[0] << 24, 3);
5644 w3[1] = amd_bytealign (w3[1] >> 8, wx, 3);
5646 case 50: w3[0] = amd_bytealign (wx, w3[0] << 16, 2);
5647 w3[1] = amd_bytealign (w3[1] >> 16, wx, 2);
5649 case 51: w3[0] = amd_bytealign (wx, w3[0] << 8, 1);
5650 w3[1] = amd_bytealign (w3[1] >> 24, wx, 1);
5652 case 52: w3[1] = wx;
5654 case 53: w3[1] = amd_bytealign (wx, w3[1] << 24, 3);
5655 w3[2] = amd_bytealign (w3[2] >> 8, wx, 3);
5657 case 54: w3[1] = amd_bytealign (wx, w3[1] << 16, 2);
5658 w3[2] = amd_bytealign (w3[2] >> 16, wx, 2);
5660 case 55: w3[1] = amd_bytealign (wx, w3[1] << 8, 1);
5661 w3[2] = amd_bytealign (w3[2] >> 24, wx, 1);
5663 case 56: w3[2] = wx;
5665 case 57: w3[2] = amd_bytealign (wx, w3[2] << 24, 3);
5666 w3[3] = amd_bytealign (w3[3] >> 8, wx, 3);
5668 case 58: w3[2] = amd_bytealign (wx, w3[2] << 16, 2);
5669 w3[3] = amd_bytealign (w3[3] >> 16, wx, 2);
5671 case 59: w3[2] = amd_bytealign (wx, w3[2] << 8, 1);
5672 w3[3] = amd_bytealign (w3[3] >> 24, wx, 1);
5674 case 60: w3[3] = wx;
5676 case 61: w3[3] = amd_bytealign (wx, w3[3] << 24, 3);
5677 //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
5679 case 62: w3[3] = amd_bytealign (wx, w3[3] << 16, 2);
5680 //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
5682 case 63: w3[3] = amd_bytealign (wx, w3[3] << 8, 1);
5683 //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
5691 case 1: w0[0] = (w0[0] & 0x000000ff) | (wx << 8);
5692 w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5694 case 2: w0[0] = (w0[0] & 0x0000ffff) | (wx << 16);
5695 w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5697 case 3: w0[0] = (w0[0] & 0x00ffffff) | (wx << 24);
5698 w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5702 case 5: w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5703 w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5705 case 6: w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5706 w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5708 case 7: w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5709 w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5713 case 9: w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5714 w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5716 case 10: w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5717 w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5719 case 11: w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5720 w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5722 case 12: w0[3] = wx;
5724 case 13: w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5725 w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5727 case 14: w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5728 w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5730 case 15: w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5731 w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5733 case 16: w1[0] = wx;
5735 case 17: w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5736 w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5738 case 18: w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5739 w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5741 case 19: w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5742 w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5744 case 20: w1[1] = wx;
5746 case 21: w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5747 w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5749 case 22: w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5750 w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5752 case 23: w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5753 w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5755 case 24: w1[2] = wx;
5757 case 25: w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5758 w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5760 case 26: w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5761 w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5763 case 27: w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5764 w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5766 case 28: w1[3] = wx;
5768 case 29: w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5769 w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5771 case 30: w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5772 w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5774 case 31: w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5775 w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5777 case 32: w2[0] = wx;
5779 case 33: w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5780 w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5782 case 34: w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5783 w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5785 case 35: w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5786 w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5788 case 36: w2[1] = wx;
5790 case 37: w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5791 w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5793 case 38: w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5794 w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5796 case 39: w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5797 w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5799 case 40: w2[2] = wx;
5801 case 41: w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5802 w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
5804 case 42: w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5805 w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
5807 case 43: w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5808 w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5810 case 44: w2[3] = wx;
5812 case 45: w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5813 w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
5815 case 46: w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5816 w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
5818 case 47: w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5819 w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
5821 case 48: w3[0] = wx;
5823 case 49: w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
5824 w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
5826 case 50: w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
5827 w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
5829 case 51: w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
5830 w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
5832 case 52: w3[1] = wx;
5834 case 53: w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
5835 w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
5837 case 54: w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
5838 w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
5840 case 55: w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
5841 w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
5843 case 56: w3[2] = wx;
5845 case 57: w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
5846 w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
5848 case 58: w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
5849 w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
5851 case 59: w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
5852 w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
5854 case 60: w3[3] = wx;
5856 case 61: w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
5857 //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
5859 case 62: w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
5860 //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
5862 case 63: w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
5863 //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
5869 inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5871 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5877 case 1: w0[0] = (w0[0] & 0xff000000) | (wx >> 8);
5878 w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5880 case 2: w0[0] = (w0[0] & 0xffff0000) | (wx >> 16);
5881 w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5883 case 3: w0[0] = (w0[0] & 0xffffff00) | (wx >> 24);
5884 w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5888 case 5: w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5889 w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5891 case 6: w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5892 w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5894 case 7: w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5895 w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5899 case 9: w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5900 w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5902 case 10: w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5903 w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5905 case 11: w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5906 w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5908 case 12: w0[3] = wx;
5910 case 13: w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5911 w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5913 case 14: w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5914 w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5916 case 15: w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5917 w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5919 case 16: w1[0] = wx;
5921 case 17: w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5922 w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5924 case 18: w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5925 w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5927 case 19: w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5928 w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5930 case 20: w1[1] = wx;
5932 case 21: w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5933 w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5935 case 22: w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5936 w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5938 case 23: w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5939 w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5941 case 24: w1[2] = wx;
5943 case 25: w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5944 w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5946 case 26: w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5947 w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5949 case 27: w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5950 w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5952 case 28: w1[3] = wx;
5954 case 29: w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5955 w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5957 case 30: w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5958 w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5960 case 31: w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5961 w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5963 case 32: w2[0] = wx;
5965 case 33: w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5966 w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5968 case 34: w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5969 w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5971 case 35: w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5972 w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5974 case 36: w2[1] = wx;
5976 case 37: w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5977 w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5979 case 38: w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5980 w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5982 case 39: w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5983 w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5985 case 40: w2[2] = wx;
5987 case 41: w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5988 w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5990 case 42: w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5991 w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5993 case 43: w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5994 w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5996 case 44: w2[3] = wx;
5998 case 45: w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5999 w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
6001 case 46: w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
6002 w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
6004 case 47: w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
6005 w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
6007 case 48: w3[0] = wx;
6009 case 49: w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
6010 w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
6012 case 50: w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
6013 w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
6015 case 51: w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
6016 w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
6018 case 52: w3[1] = wx;
6020 case 53: w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
6021 w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
6023 case 54: w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
6024 w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
6026 case 55: w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
6027 w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
6029 case 56: w3[2] = wx;
6031 case 57: w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
6032 w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
6034 case 58: w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
6035 w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
6037 case 59: w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
6038 w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
6040 case 60: w3[3] = wx;
6042 case 61: w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
6043 //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
6045 case 62: w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
6046 //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
6048 case 63: w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
6049 //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
6055 * vector functions as scalar (for outer loop usage)
6058 inline void append_0x01_1x4_S (u32 w0[4], const u32 offset)
6067 w0[0] = w0[0] | 0x0100;
6071 w0[0] = w0[0] | 0x010000;
6075 w0[0] = w0[0] | 0x01000000;
6083 w0[1] = w0[1] | 0x0100;
6087 w0[1] = w0[1] | 0x010000;
6091 w0[1] = w0[1] | 0x01000000;
6099 w0[2] = w0[2] | 0x0100;
6103 w0[2] = w0[2] | 0x010000;
6107 w0[2] = w0[2] | 0x01000000;
6115 w0[3] = w0[3] | 0x0100;
6119 w0[3] = w0[3] | 0x010000;
6123 w0[3] = w0[3] | 0x01000000;
6128 inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
6137 w0[0] = w0[0] | 0x0100;
6141 w0[0] = w0[0] | 0x010000;
6145 w0[0] = w0[0] | 0x01000000;
6153 w0[1] = w0[1] | 0x0100;
6157 w0[1] = w0[1] | 0x010000;
6161 w0[1] = w0[1] | 0x01000000;
6169 w0[2] = w0[2] | 0x0100;
6173 w0[2] = w0[2] | 0x010000;
6177 w0[2] = w0[2] | 0x01000000;
6185 w0[3] = w0[3] | 0x0100;
6189 w0[3] = w0[3] | 0x010000;
6193 w0[3] = w0[3] | 0x01000000;
6201 w1[0] = w1[0] | 0x0100;
6205 w1[0] = w1[0] | 0x010000;
6209 w1[0] = w1[0] | 0x01000000;
6217 w1[1] = w1[1] | 0x0100;
6221 w1[1] = w1[1] | 0x010000;
6225 w1[1] = w1[1] | 0x01000000;
6233 w1[2] = w1[2] | 0x0100;
6237 w1[2] = w1[2] | 0x010000;
6241 w1[2] = w1[2] | 0x01000000;
6249 w1[3] = w1[3] | 0x0100;
6253 w1[3] = w1[3] | 0x010000;
6257 w1[3] = w1[3] | 0x01000000;
6262 inline void append_0x01_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
6271 w0[0] = w0[0] | 0x0100;
6275 w0[0] = w0[0] | 0x010000;
6279 w0[0] = w0[0] | 0x01000000;
6287 w0[1] = w0[1] | 0x0100;
6291 w0[1] = w0[1] | 0x010000;
6295 w0[1] = w0[1] | 0x01000000;
6303 w0[2] = w0[2] | 0x0100;
6307 w0[2] = w0[2] | 0x010000;
6311 w0[2] = w0[2] | 0x01000000;
6319 w0[3] = w0[3] | 0x0100;
6323 w0[3] = w0[3] | 0x010000;
6327 w0[3] = w0[3] | 0x01000000;
6335 w1[0] = w1[0] | 0x0100;
6339 w1[0] = w1[0] | 0x010000;
6343 w1[0] = w1[0] | 0x01000000;
6351 w1[1] = w1[1] | 0x0100;
6355 w1[1] = w1[1] | 0x010000;
6359 w1[1] = w1[1] | 0x01000000;
6367 w1[2] = w1[2] | 0x0100;
6371 w1[2] = w1[2] | 0x010000;
6375 w1[2] = w1[2] | 0x01000000;
6383 w1[3] = w1[3] | 0x0100;
6387 w1[3] = w1[3] | 0x010000;
6391 w1[3] = w1[3] | 0x01000000;
6399 w2[0] = w2[0] | 0x0100;
6403 w2[0] = w2[0] | 0x010000;
6407 w2[0] = w2[0] | 0x01000000;
6415 w2[1] = w2[1] | 0x0100;
6419 w2[1] = w2[1] | 0x010000;
6423 w2[1] = w2[1] | 0x01000000;
6431 w2[2] = w2[2] | 0x0100;
6435 w2[2] = w2[2] | 0x010000;
6439 w2[2] = w2[2] | 0x01000000;
6447 w2[3] = w2[3] | 0x0100;
6451 w2[3] = w2[3] | 0x010000;
6455 w2[3] = w2[3] | 0x01000000;
6460 inline void append_0x01_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
6469 w0[0] = w0[0] | 0x0100;
6473 w0[0] = w0[0] | 0x010000;
6477 w0[0] = w0[0] | 0x01000000;
6485 w0[1] = w0[1] | 0x0100;
6489 w0[1] = w0[1] | 0x010000;
6493 w0[1] = w0[1] | 0x01000000;
6501 w0[2] = w0[2] | 0x0100;
6505 w0[2] = w0[2] | 0x010000;
6509 w0[2] = w0[2] | 0x01000000;
6517 w0[3] = w0[3] | 0x0100;
6521 w0[3] = w0[3] | 0x010000;
6525 w0[3] = w0[3] | 0x01000000;
6533 w1[0] = w1[0] | 0x0100;
6537 w1[0] = w1[0] | 0x010000;
6541 w1[0] = w1[0] | 0x01000000;
6549 w1[1] = w1[1] | 0x0100;
6553 w1[1] = w1[1] | 0x010000;
6557 w1[1] = w1[1] | 0x01000000;
6565 w1[2] = w1[2] | 0x0100;
6569 w1[2] = w1[2] | 0x010000;
6573 w1[2] = w1[2] | 0x01000000;
6581 w1[3] = w1[3] | 0x0100;
6585 w1[3] = w1[3] | 0x010000;
6589 w1[3] = w1[3] | 0x01000000;
6597 w2[0] = w2[0] | 0x0100;
6601 w2[0] = w2[0] | 0x010000;
6605 w2[0] = w2[0] | 0x01000000;
6613 w2[1] = w2[1] | 0x0100;
6617 w2[1] = w2[1] | 0x010000;
6621 w2[1] = w2[1] | 0x01000000;
6629 w2[2] = w2[2] | 0x0100;
6633 w2[2] = w2[2] | 0x010000;
6637 w2[2] = w2[2] | 0x01000000;
6645 w2[3] = w2[3] | 0x0100;
6649 w2[3] = w2[3] | 0x010000;
6653 w2[3] = w2[3] | 0x01000000;
6661 w3[0] = w3[0] | 0x0100;
6665 w3[0] = w3[0] | 0x010000;
6669 w3[0] = w3[0] | 0x01000000;
6677 w3[1] = w3[1] | 0x0100;
6681 w3[1] = w3[1] | 0x010000;
6685 w3[1] = w3[1] | 0x01000000;
6693 w3[2] = w3[2] | 0x0100;
6697 w3[2] = w3[2] | 0x010000;
6701 w3[2] = w3[2] | 0x01000000;
6709 w3[3] = w3[3] | 0x0100;
6713 w3[3] = w3[3] | 0x010000;
6717 w3[3] = w3[3] | 0x01000000;
6722 inline void append_0x02_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
6731 w0[0] = w0[0] | 0x0200;
6735 w0[0] = w0[0] | 0x020000;
6739 w0[0] = w0[0] | 0x02000000;
6747 w0[1] = w0[1] | 0x0200;
6751 w0[1] = w0[1] | 0x020000;
6755 w0[1] = w0[1] | 0x02000000;
6763 w0[2] = w0[2] | 0x0200;
6767 w0[2] = w0[2] | 0x020000;
6771 w0[2] = w0[2] | 0x02000000;
6779 w0[3] = w0[3] | 0x0200;
6783 w0[3] = w0[3] | 0x020000;
6787 w0[3] = w0[3] | 0x02000000;
6795 w1[0] = w1[0] | 0x0200;
6799 w1[0] = w1[0] | 0x020000;
6803 w1[0] = w1[0] | 0x02000000;
6811 w1[1] = w1[1] | 0x0200;
6815 w1[1] = w1[1] | 0x020000;
6819 w1[1] = w1[1] | 0x02000000;
6827 w1[2] = w1[2] | 0x0200;
6831 w1[2] = w1[2] | 0x020000;
6835 w1[2] = w1[2] | 0x02000000;
6843 w1[3] = w1[3] | 0x0200;
6847 w1[3] = w1[3] | 0x020000;
6851 w1[3] = w1[3] | 0x02000000;
6856 inline void append_0x02_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
6865 w0[0] = w0[0] | 0x0200;
6869 w0[0] = w0[0] | 0x020000;
6873 w0[0] = w0[0] | 0x02000000;
6881 w0[1] = w0[1] | 0x0200;
6885 w0[1] = w0[1] | 0x020000;
6889 w0[1] = w0[1] | 0x02000000;
6897 w0[2] = w0[2] | 0x0200;
6901 w0[2] = w0[2] | 0x020000;
6905 w0[2] = w0[2] | 0x02000000;
6913 w0[3] = w0[3] | 0x0200;
6917 w0[3] = w0[3] | 0x020000;
6921 w0[3] = w0[3] | 0x02000000;
6929 w1[0] = w1[0] | 0x0200;
6933 w1[0] = w1[0] | 0x020000;
6937 w1[0] = w1[0] | 0x02000000;
6945 w1[1] = w1[1] | 0x0200;
6949 w1[1] = w1[1] | 0x020000;
6953 w1[1] = w1[1] | 0x02000000;
6961 w1[2] = w1[2] | 0x0200;
6965 w1[2] = w1[2] | 0x020000;
6969 w1[2] = w1[2] | 0x02000000;
6977 w1[3] = w1[3] | 0x0200;
6981 w1[3] = w1[3] | 0x020000;
6985 w1[3] = w1[3] | 0x02000000;
6993 w2[0] = w2[0] | 0x0200;
6997 w2[0] = w2[0] | 0x020000;
7001 w2[0] = w2[0] | 0x02000000;
7009 w2[1] = w2[1] | 0x0200;
7013 w2[1] = w2[1] | 0x020000;
7017 w2[1] = w2[1] | 0x02000000;
7025 w2[2] = w2[2] | 0x0200;
7029 w2[2] = w2[2] | 0x020000;
7033 w2[2] = w2[2] | 0x02000000;
7041 w2[3] = w2[3] | 0x0200;
7045 w2[3] = w2[3] | 0x020000;
7049 w2[3] = w2[3] | 0x02000000;
7054 inline void append_0x80_1x4_S (u32 w0[4], const u32 offset)
7063 w0[0] = w0[0] | 0x8000;
7067 w0[0] = w0[0] | 0x800000;
7071 w0[0] = w0[0] | 0x80000000;
7079 w0[1] = w0[1] | 0x8000;
7083 w0[1] = w0[1] | 0x800000;
7087 w0[1] = w0[1] | 0x80000000;
7095 w0[2] = w0[2] | 0x8000;
7099 w0[2] = w0[2] | 0x800000;
7103 w0[2] = w0[2] | 0x80000000;
7111 w0[3] = w0[3] | 0x8000;
7115 w0[3] = w0[3] | 0x800000;
7119 w0[3] = w0[3] | 0x80000000;
7124 inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
7133 w0[0] = w0[0] | 0x8000;
7137 w0[0] = w0[0] | 0x800000;
7141 w0[0] = w0[0] | 0x80000000;
7149 w0[1] = w0[1] | 0x8000;
7153 w0[1] = w0[1] | 0x800000;
7157 w0[1] = w0[1] | 0x80000000;
7165 w0[2] = w0[2] | 0x8000;
7169 w0[2] = w0[2] | 0x800000;
7173 w0[2] = w0[2] | 0x80000000;
7181 w0[3] = w0[3] | 0x8000;
7185 w0[3] = w0[3] | 0x800000;
7189 w0[3] = w0[3] | 0x80000000;
7197 w1[0] = w1[0] | 0x8000;
7201 w1[0] = w1[0] | 0x800000;
7205 w1[0] = w1[0] | 0x80000000;
7213 w1[1] = w1[1] | 0x8000;
7217 w1[1] = w1[1] | 0x800000;
7221 w1[1] = w1[1] | 0x80000000;
7229 w1[2] = w1[2] | 0x8000;
7233 w1[2] = w1[2] | 0x800000;
7237 w1[2] = w1[2] | 0x80000000;
7245 w1[3] = w1[3] | 0x8000;
7249 w1[3] = w1[3] | 0x800000;
7253 w1[3] = w1[3] | 0x80000000;
7258 inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
7267 w0[0] = w0[0] | 0x8000;
7271 w0[0] = w0[0] | 0x800000;
7275 w0[0] = w0[0] | 0x80000000;
7283 w0[1] = w0[1] | 0x8000;
7287 w0[1] = w0[1] | 0x800000;
7291 w0[1] = w0[1] | 0x80000000;
7299 w0[2] = w0[2] | 0x8000;
7303 w0[2] = w0[2] | 0x800000;
7307 w0[2] = w0[2] | 0x80000000;
7315 w0[3] = w0[3] | 0x8000;
7319 w0[3] = w0[3] | 0x800000;
7323 w0[3] = w0[3] | 0x80000000;
7331 w1[0] = w1[0] | 0x8000;
7335 w1[0] = w1[0] | 0x800000;
7339 w1[0] = w1[0] | 0x80000000;
7347 w1[1] = w1[1] | 0x8000;
7351 w1[1] = w1[1] | 0x800000;
7355 w1[1] = w1[1] | 0x80000000;
7363 w1[2] = w1[2] | 0x8000;
7367 w1[2] = w1[2] | 0x800000;
7371 w1[2] = w1[2] | 0x80000000;
7379 w1[3] = w1[3] | 0x8000;
7383 w1[3] = w1[3] | 0x800000;
7387 w1[3] = w1[3] | 0x80000000;
7395 w2[0] = w2[0] | 0x8000;
7399 w2[0] = w2[0] | 0x800000;
7403 w2[0] = w2[0] | 0x80000000;
7411 w2[1] = w2[1] | 0x8000;
7415 w2[1] = w2[1] | 0x800000;
7419 w2[1] = w2[1] | 0x80000000;
7427 w2[2] = w2[2] | 0x8000;
7431 w2[2] = w2[2] | 0x800000;
7435 w2[2] = w2[2] | 0x80000000;
7443 w2[3] = w2[3] | 0x8000;
7447 w2[3] = w2[3] | 0x800000;
7451 w2[3] = w2[3] | 0x80000000;
7456 inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
7465 w0[0] = w0[0] | 0x8000;
7469 w0[0] = w0[0] | 0x800000;
7473 w0[0] = w0[0] | 0x80000000;
7481 w0[1] = w0[1] | 0x8000;
7485 w0[1] = w0[1] | 0x800000;
7489 w0[1] = w0[1] | 0x80000000;
7497 w0[2] = w0[2] | 0x8000;
7501 w0[2] = w0[2] | 0x800000;
7505 w0[2] = w0[2] | 0x80000000;
7513 w0[3] = w0[3] | 0x8000;
7517 w0[3] = w0[3] | 0x800000;
7521 w0[3] = w0[3] | 0x80000000;
7529 w1[0] = w1[0] | 0x8000;
7533 w1[0] = w1[0] | 0x800000;
7537 w1[0] = w1[0] | 0x80000000;
7545 w1[1] = w1[1] | 0x8000;
7549 w1[1] = w1[1] | 0x800000;
7553 w1[1] = w1[1] | 0x80000000;
7561 w1[2] = w1[2] | 0x8000;
7565 w1[2] = w1[2] | 0x800000;
7569 w1[2] = w1[2] | 0x80000000;
7577 w1[3] = w1[3] | 0x8000;
7581 w1[3] = w1[3] | 0x800000;
7585 w1[3] = w1[3] | 0x80000000;
7593 w2[0] = w2[0] | 0x8000;
7597 w2[0] = w2[0] | 0x800000;
7601 w2[0] = w2[0] | 0x80000000;
7609 w2[1] = w2[1] | 0x8000;
7613 w2[1] = w2[1] | 0x800000;
7617 w2[1] = w2[1] | 0x80000000;
7625 w2[2] = w2[2] | 0x8000;
7629 w2[2] = w2[2] | 0x800000;
7633 w2[2] = w2[2] | 0x80000000;
7641 w2[3] = w2[3] | 0x8000;
7645 w2[3] = w2[3] | 0x800000;
7649 w2[3] = w2[3] | 0x80000000;
7657 w3[0] = w3[0] | 0x8000;
7661 w3[0] = w3[0] | 0x800000;
7665 w3[0] = w3[0] | 0x80000000;
7673 w3[1] = w3[1] | 0x8000;
7677 w3[1] = w3[1] | 0x800000;
7681 w3[1] = w3[1] | 0x80000000;
7689 w3[2] = w3[2] | 0x8000;
7693 w3[2] = w3[2] | 0x800000;
7697 w3[2] = w3[2] | 0x80000000;
7705 w3[3] = w3[3] | 0x8000;
7709 w3[3] = w3[3] | 0x800000;
7713 w3[3] = w3[3] | 0x80000000;
7718 inline void truncate_block_S (u32 w[4], const u32 len)
7727 case 1: w[0] &= 0x000000FF;
7732 case 2: w[0] &= 0x0000FFFF;
7737 case 3: w[0] &= 0x00FFFFFF;
7746 case 5: w[1] &= 0x000000FF;
7750 case 6: w[1] &= 0x0000FFFF;
7754 case 7: w[1] &= 0x00FFFFFF;
7761 case 9: w[2] &= 0x000000FF;
7764 case 10: w[2] &= 0x0000FFFF;
7767 case 11: w[2] &= 0x00FFFFFF;
7772 case 13: w[3] &= 0x000000FF;
7774 case 14: w[3] &= 0x0000FFFF;
7776 case 15: w[3] &= 0x00FFFFFF;
7781 inline void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
7784 out2[3] = __byte_perm_S (in[3], 0, 0x7372);
7785 out2[2] = __byte_perm_S (in[3], 0, 0x7170);
7786 out2[1] = __byte_perm_S (in[2], 0, 0x7372);
7787 out2[0] = __byte_perm_S (in[2], 0, 0x7170);
7788 out1[3] = __byte_perm_S (in[1], 0, 0x7372);
7789 out1[2] = __byte_perm_S (in[1], 0, 0x7170);
7790 out1[1] = __byte_perm_S (in[0], 0, 0x7372);
7791 out1[0] = __byte_perm_S (in[0], 0, 0x7170);
7794 #if defined IS_AMD || defined IS_GENERIC
7795 out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
7796 out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
7797 out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
7798 out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
7799 out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
7800 out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
7801 out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
7802 out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
7806 inline void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
7809 out[0] = __byte_perm_S (in1[0], in1[1], 0x6420);
7810 out[1] = __byte_perm_S (in1[2], in1[3], 0x6420);
7811 out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
7812 out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
7815 #if defined IS_AMD || defined IS_GENERIC
7816 out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
7817 | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
7818 out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
7819 | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
7820 out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
7821 | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
7822 out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
7823 | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
7827 inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
7829 #if defined IS_AMD || defined IS_GENERIC
7830 const int offset_mod_4 = offset & 3;
7832 const int offset_minus_4 = 4 - offset;
7837 w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4);
7838 w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4);
7839 w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7840 w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7841 w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7842 w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7843 w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7844 w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7845 w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7846 w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7847 w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7848 w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7849 w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7850 w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7851 w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7853 if (offset_mod_4 == 0)
7875 w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4);
7876 w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7877 w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7878 w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7879 w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7880 w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7881 w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7882 w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7883 w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7884 w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7885 w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7886 w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7887 w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7888 w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7891 if (offset_mod_4 == 0)
7912 w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4);
7913 w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7914 w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7915 w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7916 w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7917 w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7918 w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7919 w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7920 w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7921 w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7922 w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7923 w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7924 w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7928 if (offset_mod_4 == 0)
7948 w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4);
7949 w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7950 w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7951 w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7952 w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7953 w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7954 w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7955 w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7956 w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7957 w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7958 w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7959 w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7964 if (offset_mod_4 == 0)
7983 w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4);
7984 w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7985 w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7986 w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7987 w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7988 w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7989 w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7990 w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7991 w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7992 w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7993 w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7999 if (offset_mod_4 == 0)
8017 w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4);
8018 w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
8019 w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
8020 w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8021 w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8022 w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8023 w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8024 w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8025 w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8026 w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8033 if (offset_mod_4 == 0)
8050 w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4);
8051 w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
8052 w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8053 w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8054 w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8055 w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8056 w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8057 w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8058 w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8066 if (offset_mod_4 == 0)
8082 w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4);
8083 w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8084 w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8085 w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8086 w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8087 w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8088 w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8089 w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8098 if (offset_mod_4 == 0)
8113 w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4);
8114 w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8115 w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8116 w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8117 w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8118 w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8119 w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8129 if (offset_mod_4 == 0)
8143 w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4);
8144 w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8145 w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8146 w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8147 w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8148 w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8159 if (offset_mod_4 == 0)
8172 w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4);
8173 w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8174 w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8175 w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8176 w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8188 if (offset_mod_4 == 0)
8200 w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4);
8201 w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8202 w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8203 w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8216 if (offset_mod_4 == 0)
8227 w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4);
8228 w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8229 w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8243 if (offset_mod_4 == 0)
8253 w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4);
8254 w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8269 if (offset_mod_4 == 0)
8280 const int offset_minus_4 = 4 - (offset % 4);
8282 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
8287 w3[1] = __byte_perm_S (w3[0], w3[1], selector);
8288 w3[0] = __byte_perm_S (w2[3], w3[0], selector);
8289 w2[3] = __byte_perm_S (w2[2], w2[3], selector);
8290 w2[2] = __byte_perm_S (w2[1], w2[2], selector);
8291 w2[1] = __byte_perm_S (w2[0], w2[1], selector);
8292 w2[0] = __byte_perm_S (w1[3], w2[0], selector);
8293 w1[3] = __byte_perm_S (w1[2], w1[3], selector);
8294 w1[2] = __byte_perm_S (w1[1], w1[2], selector);
8295 w1[1] = __byte_perm_S (w1[0], w1[1], selector);
8296 w1[0] = __byte_perm_S (w0[3], w1[0], selector);
8297 w0[3] = __byte_perm_S (w0[2], w0[3], selector);
8298 w0[2] = __byte_perm_S (w0[1], w0[2], selector);
8299 w0[1] = __byte_perm_S (w0[0], w0[1], selector);
8300 w0[0] = __byte_perm_S ( 0, w0[0], selector);
8305 w3[1] = __byte_perm_S (w2[3], w3[0], selector);
8306 w3[0] = __byte_perm_S (w2[2], w2[3], selector);
8307 w2[3] = __byte_perm_S (w2[1], w2[2], selector);
8308 w2[2] = __byte_perm_S (w2[0], w2[1], selector);
8309 w2[1] = __byte_perm_S (w1[3], w2[0], selector);
8310 w2[0] = __byte_perm_S (w1[2], w1[3], selector);
8311 w1[3] = __byte_perm_S (w1[1], w1[2], selector);
8312 w1[2] = __byte_perm_S (w1[0], w1[1], selector);
8313 w1[1] = __byte_perm_S (w0[3], w1[0], selector);
8314 w1[0] = __byte_perm_S (w0[2], w0[3], selector);
8315 w0[3] = __byte_perm_S (w0[1], w0[2], selector);
8316 w0[2] = __byte_perm_S (w0[0], w0[1], selector);
8317 w0[1] = __byte_perm_S ( 0, w0[0], selector);
8323 w3[1] = __byte_perm_S (w2[2], w2[3], selector);
8324 w3[0] = __byte_perm_S (w2[1], w2[2], selector);
8325 w2[3] = __byte_perm_S (w2[0], w2[1], selector);
8326 w2[2] = __byte_perm_S (w1[3], w2[0], selector);
8327 w2[1] = __byte_perm_S (w1[2], w1[3], selector);
8328 w2[0] = __byte_perm_S (w1[1], w1[2], selector);
8329 w1[3] = __byte_perm_S (w1[0], w1[1], selector);
8330 w1[2] = __byte_perm_S (w0[3], w1[0], selector);
8331 w1[1] = __byte_perm_S (w0[2], w0[3], selector);
8332 w1[0] = __byte_perm_S (w0[1], w0[2], selector);
8333 w0[3] = __byte_perm_S (w0[0], w0[1], selector);
8334 w0[2] = __byte_perm_S ( 0, w0[0], selector);
8341 w3[1] = __byte_perm_S (w2[1], w2[2], selector);
8342 w3[0] = __byte_perm_S (w2[0], w2[1], selector);
8343 w2[3] = __byte_perm_S (w1[3], w2[0], selector);
8344 w2[2] = __byte_perm_S (w1[2], w1[3], selector);
8345 w2[1] = __byte_perm_S (w1[1], w1[2], selector);
8346 w2[0] = __byte_perm_S (w1[0], w1[1], selector);
8347 w1[3] = __byte_perm_S (w0[3], w1[0], selector);
8348 w1[2] = __byte_perm_S (w0[2], w0[3], selector);
8349 w1[1] = __byte_perm_S (w0[1], w0[2], selector);
8350 w1[0] = __byte_perm_S (w0[0], w0[1], selector);
8351 w0[3] = __byte_perm_S ( 0, w0[0], selector);
8359 w3[1] = __byte_perm_S (w2[0], w2[1], selector);
8360 w3[0] = __byte_perm_S (w1[3], w2[0], selector);
8361 w2[3] = __byte_perm_S (w1[2], w1[3], selector);
8362 w2[2] = __byte_perm_S (w1[1], w1[2], selector);
8363 w2[1] = __byte_perm_S (w1[0], w1[1], selector);
8364 w2[0] = __byte_perm_S (w0[3], w1[0], selector);
8365 w1[3] = __byte_perm_S (w0[2], w0[3], selector);
8366 w1[2] = __byte_perm_S (w0[1], w0[2], selector);
8367 w1[1] = __byte_perm_S (w0[0], w0[1], selector);
8368 w1[0] = __byte_perm_S ( 0, w0[0], selector);
8377 w3[1] = __byte_perm_S (w1[3], w2[0], selector);
8378 w3[0] = __byte_perm_S (w1[2], w1[3], selector);
8379 w2[3] = __byte_perm_S (w1[1], w1[2], selector);
8380 w2[2] = __byte_perm_S (w1[0], w1[1], selector);
8381 w2[1] = __byte_perm_S (w0[3], w1[0], selector);
8382 w2[0] = __byte_perm_S (w0[2], w0[3], selector);
8383 w1[3] = __byte_perm_S (w0[1], w0[2], selector);
8384 w1[2] = __byte_perm_S (w0[0], w0[1], selector);
8385 w1[1] = __byte_perm_S ( 0, w0[0], selector);
8395 w3[1] = __byte_perm_S (w1[2], w1[3], selector);
8396 w3[0] = __byte_perm_S (w1[1], w1[2], selector);
8397 w2[3] = __byte_perm_S (w1[0], w1[1], selector);
8398 w2[2] = __byte_perm_S (w0[3], w1[0], selector);
8399 w2[1] = __byte_perm_S (w0[2], w0[3], selector);
8400 w2[0] = __byte_perm_S (w0[1], w0[2], selector);
8401 w1[3] = __byte_perm_S (w0[0], w0[1], selector);
8402 w1[2] = __byte_perm_S ( 0, w0[0], selector);
8413 w3[1] = __byte_perm_S (w1[1], w1[2], selector);
8414 w3[0] = __byte_perm_S (w1[0], w1[1], selector);
8415 w2[3] = __byte_perm_S (w0[3], w1[0], selector);
8416 w2[2] = __byte_perm_S (w0[2], w0[3], selector);
8417 w2[1] = __byte_perm_S (w0[1], w0[2], selector);
8418 w2[0] = __byte_perm_S (w0[0], w0[1], selector);
8419 w1[3] = __byte_perm_S ( 0, w0[0], selector);
8431 w3[1] = __byte_perm_S (w1[0], w1[1], selector);
8432 w3[0] = __byte_perm_S (w0[3], w1[0], selector);
8433 w2[3] = __byte_perm_S (w0[2], w0[3], selector);
8434 w2[2] = __byte_perm_S (w0[1], w0[2], selector);
8435 w2[1] = __byte_perm_S (w0[0], w0[1], selector);
8436 w2[0] = __byte_perm_S ( 0, w0[0], selector);
8449 w3[1] = __byte_perm_S (w0[3], w1[0], selector);
8450 w3[0] = __byte_perm_S (w0[2], w0[3], selector);
8451 w2[3] = __byte_perm_S (w0[1], w0[2], selector);
8452 w2[2] = __byte_perm_S (w0[0], w0[1], selector);
8453 w2[1] = __byte_perm_S ( 0, w0[0], selector);
8467 w3[1] = __byte_perm_S (w0[2], w0[3], selector);
8468 w3[0] = __byte_perm_S (w0[1], w0[2], selector);
8469 w2[3] = __byte_perm_S (w0[0], w0[1], selector);
8470 w2[2] = __byte_perm_S ( 0, w0[0], selector);
8485 w3[1] = __byte_perm_S (w0[1], w0[2], selector);
8486 w3[0] = __byte_perm_S (w0[0], w0[1], selector);
8487 w2[3] = __byte_perm_S ( 0, w0[0], selector);
8503 w3[1] = __byte_perm_S (w0[0], w0[1], selector);
8504 w3[0] = __byte_perm_S ( 0, w0[0], selector);
8521 w3[1] = __byte_perm_S ( 0, w0[0], selector);
8541 inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
8543 #if defined IS_AMD || defined IS_GENERIC
8547 w3[2] = amd_bytealign_S (w3[1], 0, offset);
8548 w3[1] = amd_bytealign_S (w3[0], w3[1], offset);
8549 w3[0] = amd_bytealign_S (w2[3], w3[0], offset);
8550 w2[3] = amd_bytealign_S (w2[2], w2[3], offset);
8551 w2[2] = amd_bytealign_S (w2[1], w2[2], offset);
8552 w2[1] = amd_bytealign_S (w2[0], w2[1], offset);
8553 w2[0] = amd_bytealign_S (w1[3], w2[0], offset);
8554 w1[3] = amd_bytealign_S (w1[2], w1[3], offset);
8555 w1[2] = amd_bytealign_S (w1[1], w1[2], offset);
8556 w1[1] = amd_bytealign_S (w1[0], w1[1], offset);
8557 w1[0] = amd_bytealign_S (w0[3], w1[0], offset);
8558 w0[3] = amd_bytealign_S (w0[2], w0[3], offset);
8559 w0[2] = amd_bytealign_S (w0[1], w0[2], offset);
8560 w0[1] = amd_bytealign_S (w0[0], w0[1], offset);
8561 w0[0] = amd_bytealign_S ( 0, w0[0], offset);
8565 w3[2] = amd_bytealign_S (w3[0], 0, offset);
8566 w3[1] = amd_bytealign_S (w2[3], w3[0], offset);
8567 w3[0] = amd_bytealign_S (w2[2], w2[3], offset);
8568 w2[3] = amd_bytealign_S (w2[1], w2[2], offset);
8569 w2[2] = amd_bytealign_S (w2[0], w2[1], offset);
8570 w2[1] = amd_bytealign_S (w1[3], w2[0], offset);
8571 w2[0] = amd_bytealign_S (w1[2], w1[3], offset);
8572 w1[3] = amd_bytealign_S (w1[1], w1[2], offset);
8573 w1[2] = amd_bytealign_S (w1[0], w1[1], offset);
8574 w1[1] = amd_bytealign_S (w0[3], w1[0], offset);
8575 w1[0] = amd_bytealign_S (w0[2], w0[3], offset);
8576 w0[3] = amd_bytealign_S (w0[1], w0[2], offset);
8577 w0[2] = amd_bytealign_S (w0[0], w0[1], offset);
8578 w0[1] = amd_bytealign_S ( 0, w0[0], offset);
8583 w3[2] = amd_bytealign_S (w2[3], 0, offset);
8584 w3[1] = amd_bytealign_S (w2[2], w2[3], offset);
8585 w3[0] = amd_bytealign_S (w2[1], w2[2], offset);
8586 w2[3] = amd_bytealign_S (w2[0], w2[1], offset);
8587 w2[2] = amd_bytealign_S (w1[3], w2[0], offset);
8588 w2[1] = amd_bytealign_S (w1[2], w1[3], offset);
8589 w2[0] = amd_bytealign_S (w1[1], w1[2], offset);
8590 w1[3] = amd_bytealign_S (w1[0], w1[1], offset);
8591 w1[2] = amd_bytealign_S (w0[3], w1[0], offset);
8592 w1[1] = amd_bytealign_S (w0[2], w0[3], offset);
8593 w1[0] = amd_bytealign_S (w0[1], w0[2], offset);
8594 w0[3] = amd_bytealign_S (w0[0], w0[1], offset);
8595 w0[2] = amd_bytealign_S ( 0, w0[0], offset);
8601 w3[2] = amd_bytealign_S (w2[2], 0, offset);
8602 w3[1] = amd_bytealign_S (w2[1], w2[2], offset);
8603 w3[0] = amd_bytealign_S (w2[0], w2[1], offset);
8604 w2[3] = amd_bytealign_S (w1[3], w2[0], offset);
8605 w2[2] = amd_bytealign_S (w1[2], w1[3], offset);
8606 w2[1] = amd_bytealign_S (w1[1], w1[2], offset);
8607 w2[0] = amd_bytealign_S (w1[0], w1[1], offset);
8608 w1[3] = amd_bytealign_S (w0[3], w1[0], offset);
8609 w1[2] = amd_bytealign_S (w0[2], w0[3], offset);
8610 w1[1] = amd_bytealign_S (w0[1], w0[2], offset);
8611 w1[0] = amd_bytealign_S (w0[0], w0[1], offset);
8612 w0[3] = amd_bytealign_S ( 0, w0[0], offset);
8619 w3[2] = amd_bytealign_S (w2[1], 0, offset);
8620 w3[1] = amd_bytealign_S (w2[0], w2[1], offset);
8621 w3[0] = amd_bytealign_S (w1[3], w2[0], offset);
8622 w2[3] = amd_bytealign_S (w1[2], w1[3], offset);
8623 w2[2] = amd_bytealign_S (w1[1], w1[2], offset);
8624 w2[1] = amd_bytealign_S (w1[0], w1[1], offset);
8625 w2[0] = amd_bytealign_S (w0[3], w1[0], offset);
8626 w1[3] = amd_bytealign_S (w0[2], w0[3], offset);
8627 w1[2] = amd_bytealign_S (w0[1], w0[2], offset);
8628 w1[1] = amd_bytealign_S (w0[0], w0[1], offset);
8629 w1[0] = amd_bytealign_S ( 0, w0[0], offset);
8637 w3[2] = amd_bytealign_S (w2[0], 0, offset);
8638 w3[1] = amd_bytealign_S (w1[3], w2[0], offset);
8639 w3[0] = amd_bytealign_S (w1[2], w1[3], offset);
8640 w2[3] = amd_bytealign_S (w1[1], w1[2], offset);
8641 w2[2] = amd_bytealign_S (w1[0], w1[1], offset);
8642 w2[1] = amd_bytealign_S (w0[3], w1[0], offset);
8643 w2[0] = amd_bytealign_S (w0[2], w0[3], offset);
8644 w1[3] = amd_bytealign_S (w0[1], w0[2], offset);
8645 w1[2] = amd_bytealign_S (w0[0], w0[1], offset);
8646 w1[1] = amd_bytealign_S ( 0, w0[0], offset);
8655 w3[2] = amd_bytealign_S (w1[3], 0, offset);
8656 w3[1] = amd_bytealign_S (w1[2], w1[3], offset);
8657 w3[0] = amd_bytealign_S (w1[1], w1[2], offset);
8658 w2[3] = amd_bytealign_S (w1[0], w1[1], offset);
8659 w2[2] = amd_bytealign_S (w0[3], w1[0], offset);
8660 w2[1] = amd_bytealign_S (w0[2], w0[3], offset);
8661 w2[0] = amd_bytealign_S (w0[1], w0[2], offset);
8662 w1[3] = amd_bytealign_S (w0[0], w0[1], offset);
8663 w1[2] = amd_bytealign_S ( 0, w0[0], offset);
8673 w3[2] = amd_bytealign_S (w1[2], 0, offset);
8674 w3[1] = amd_bytealign_S (w1[1], w1[2], offset);
8675 w3[0] = amd_bytealign_S (w1[0], w1[1], offset);
8676 w2[3] = amd_bytealign_S (w0[3], w1[0], offset);
8677 w2[2] = amd_bytealign_S (w0[2], w0[3], offset);
8678 w2[1] = amd_bytealign_S (w0[1], w0[2], offset);
8679 w2[0] = amd_bytealign_S (w0[0], w0[1], offset);
8680 w1[3] = amd_bytealign_S ( 0, w0[0], offset);
8691 w3[2] = amd_bytealign_S (w1[1], 0, offset);
8692 w3[1] = amd_bytealign_S (w1[0], w1[1], offset);
8693 w3[0] = amd_bytealign_S (w0[3], w1[0], offset);
8694 w2[3] = amd_bytealign_S (w0[2], w0[3], offset);
8695 w2[2] = amd_bytealign_S (w0[1], w0[2], offset);
8696 w2[1] = amd_bytealign_S (w0[0], w0[1], offset);
8697 w2[0] = amd_bytealign_S ( 0, w0[0], offset);
8709 w3[2] = amd_bytealign_S (w1[0], 0, offset);
8710 w3[1] = amd_bytealign_S (w0[3], w1[0], offset);
8711 w3[0] = amd_bytealign_S (w0[2], w0[3], offset);
8712 w2[3] = amd_bytealign_S (w0[1], w0[2], offset);
8713 w2[2] = amd_bytealign_S (w0[0], w0[1], offset);
8714 w2[1] = amd_bytealign_S ( 0, w0[0], offset);
8727 w3[2] = amd_bytealign_S (w0[3], 0, offset);
8728 w3[1] = amd_bytealign_S (w0[2], w0[3], offset);
8729 w3[0] = amd_bytealign_S (w0[1], w0[2], offset);
8730 w2[3] = amd_bytealign_S (w0[0], w0[1], offset);
8731 w2[2] = amd_bytealign_S ( 0, w0[0], offset);
8745 w3[2] = amd_bytealign_S (w0[2], 0, offset);
8746 w3[1] = amd_bytealign_S (w0[1], w0[2], offset);
8747 w3[0] = amd_bytealign_S (w0[0], w0[1], offset);
8748 w2[3] = amd_bytealign_S ( 0, w0[0], offset);
8763 w3[2] = amd_bytealign_S (w0[1], 0, offset);
8764 w3[1] = amd_bytealign_S (w0[0], w0[1], offset);
8765 w3[0] = amd_bytealign_S ( 0, w0[0], offset);
8781 w3[2] = amd_bytealign_S (w0[0], 0, offset);
8782 w3[1] = amd_bytealign_S ( 0, w0[0], offset);
8801 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
8806 w3[1] = __byte_perm_S (w3[1], w3[0], selector);
8807 w3[0] = __byte_perm_S (w3[0], w2[3], selector);
8808 w2[3] = __byte_perm_S (w2[3], w2[2], selector);
8809 w2[2] = __byte_perm_S (w2[2], w2[1], selector);
8810 w2[1] = __byte_perm_S (w2[1], w2[0], selector);
8811 w2[0] = __byte_perm_S (w2[0], w1[3], selector);
8812 w1[3] = __byte_perm_S (w1[3], w1[2], selector);
8813 w1[2] = __byte_perm_S (w1[2], w1[1], selector);
8814 w1[1] = __byte_perm_S (w1[1], w1[0], selector);
8815 w1[0] = __byte_perm_S (w1[0], w0[3], selector);
8816 w0[3] = __byte_perm_S (w0[3], w0[2], selector);
8817 w0[2] = __byte_perm_S (w0[2], w0[1], selector);
8818 w0[1] = __byte_perm_S (w0[1], w0[0], selector);
8819 w0[0] = __byte_perm_S (w0[0], 0, selector);
8823 w3[1] = __byte_perm_S (w3[0], w2[3], selector);
8824 w3[0] = __byte_perm_S (w2[3], w2[2], selector);
8825 w2[3] = __byte_perm_S (w2[2], w2[1], selector);
8826 w2[2] = __byte_perm_S (w2[1], w2[0], selector);
8827 w2[1] = __byte_perm_S (w2[0], w1[3], selector);
8828 w2[0] = __byte_perm_S (w1[3], w1[2], selector);
8829 w1[3] = __byte_perm_S (w1[2], w1[1], selector);
8830 w1[2] = __byte_perm_S (w1[1], w1[0], selector);
8831 w1[1] = __byte_perm_S (w1[0], w0[3], selector);
8832 w1[0] = __byte_perm_S (w0[3], w0[2], selector);
8833 w0[3] = __byte_perm_S (w0[2], w0[1], selector);
8834 w0[2] = __byte_perm_S (w0[1], w0[0], selector);
8835 w0[1] = __byte_perm_S (w0[0], 0, selector);
8840 w3[1] = __byte_perm_S (w2[3], w2[2], selector);
8841 w3[0] = __byte_perm_S (w2[2], w2[1], selector);
8842 w2[3] = __byte_perm_S (w2[1], w2[0], selector);
8843 w2[2] = __byte_perm_S (w2[0], w1[3], selector);
8844 w2[1] = __byte_perm_S (w1[3], w1[2], selector);
8845 w2[0] = __byte_perm_S (w1[2], w1[1], selector);
8846 w1[3] = __byte_perm_S (w1[1], w1[0], selector);
8847 w1[2] = __byte_perm_S (w1[0], w0[3], selector);
8848 w1[1] = __byte_perm_S (w0[3], w0[2], selector);
8849 w1[0] = __byte_perm_S (w0[2], w0[1], selector);
8850 w0[3] = __byte_perm_S (w0[1], w0[0], selector);
8851 w0[2] = __byte_perm_S (w0[0], 0, selector);
8857 w3[1] = __byte_perm_S (w2[2], w2[1], selector);
8858 w3[0] = __byte_perm_S (w2[1], w2[0], selector);
8859 w2[3] = __byte_perm_S (w2[0], w1[3], selector);
8860 w2[2] = __byte_perm_S (w1[3], w1[2], selector);
8861 w2[1] = __byte_perm_S (w1[2], w1[1], selector);
8862 w2[0] = __byte_perm_S (w1[1], w1[0], selector);
8863 w1[3] = __byte_perm_S (w1[0], w0[3], selector);
8864 w1[2] = __byte_perm_S (w0[3], w0[2], selector);
8865 w1[1] = __byte_perm_S (w0[2], w0[1], selector);
8866 w1[0] = __byte_perm_S (w0[1], w0[0], selector);
8867 w0[3] = __byte_perm_S (w0[0], 0, selector);
8874 w3[1] = __byte_perm_S (w2[1], w2[0], selector);
8875 w3[0] = __byte_perm_S (w2[0], w1[3], selector);
8876 w2[3] = __byte_perm_S (w1[3], w1[2], selector);
8877 w2[2] = __byte_perm_S (w1[2], w1[1], selector);
8878 w2[1] = __byte_perm_S (w1[1], w1[0], selector);
8879 w2[0] = __byte_perm_S (w1[0], w0[3], selector);
8880 w1[3] = __byte_perm_S (w0[3], w0[2], selector);
8881 w1[2] = __byte_perm_S (w0[2], w0[1], selector);
8882 w1[1] = __byte_perm_S (w0[1], w0[0], selector);
8883 w1[0] = __byte_perm_S (w0[0], 0, selector);
8891 w3[1] = __byte_perm_S (w2[0], w1[3], selector);
8892 w3[0] = __byte_perm_S (w1[3], w1[2], selector);
8893 w2[3] = __byte_perm_S (w1[2], w1[1], selector);
8894 w2[2] = __byte_perm_S (w1[1], w1[0], selector);
8895 w2[1] = __byte_perm_S (w1[0], w0[3], selector);
8896 w2[0] = __byte_perm_S (w0[3], w0[2], selector);
8897 w1[3] = __byte_perm_S (w0[2], w0[1], selector);
8898 w1[2] = __byte_perm_S (w0[1], w0[0], selector);
8899 w1[1] = __byte_perm_S (w0[0], 0, selector);
8908 w3[1] = __byte_perm_S (w1[3], w1[2], selector);
8909 w3[0] = __byte_perm_S (w1[2], w1[1], selector);
8910 w2[3] = __byte_perm_S (w1[1], w1[0], selector);
8911 w2[2] = __byte_perm_S (w1[0], w0[3], selector);
8912 w2[1] = __byte_perm_S (w0[3], w0[2], selector);
8913 w2[0] = __byte_perm_S (w0[2], w0[1], selector);
8914 w1[3] = __byte_perm_S (w0[1], w0[0], selector);
8915 w1[2] = __byte_perm_S (w0[0], 0, selector);
8925 w3[1] = __byte_perm_S (w1[2], w1[1], selector);
8926 w3[0] = __byte_perm_S (w1[1], w1[0], selector);
8927 w2[3] = __byte_perm_S (w1[0], w0[3], selector);
8928 w2[2] = __byte_perm_S (w0[3], w0[2], selector);
8929 w2[1] = __byte_perm_S (w0[2], w0[1], selector);
8930 w2[0] = __byte_perm_S (w0[1], w0[0], selector);
8931 w1[3] = __byte_perm_S (w0[0], 0, selector);
8942 w3[1] = __byte_perm_S (w1[1], w1[0], selector);
8943 w3[0] = __byte_perm_S (w1[0], w0[3], selector);
8944 w2[3] = __byte_perm_S (w0[3], w0[2], selector);
8945 w2[2] = __byte_perm_S (w0[2], w0[1], selector);
8946 w2[1] = __byte_perm_S (w0[1], w0[0], selector);
8947 w2[0] = __byte_perm_S (w0[0], 0, selector);
8959 w3[1] = __byte_perm_S (w1[0], w0[3], selector);
8960 w3[0] = __byte_perm_S (w0[3], w0[2], selector);
8961 w2[3] = __byte_perm_S (w0[2], w0[1], selector);
8962 w2[2] = __byte_perm_S (w0[1], w0[0], selector);
8963 w2[1] = __byte_perm_S (w0[0], 0, selector);
8976 w3[1] = __byte_perm_S (w0[3], w0[2], selector);
8977 w3[0] = __byte_perm_S (w0[2], w0[1], selector);
8978 w2[3] = __byte_perm_S (w0[1], w0[0], selector);
8979 w2[2] = __byte_perm_S (w0[0], 0, selector);
8993 w3[1] = __byte_perm_S (w0[2], w0[1], selector);
8994 w3[0] = __byte_perm_S (w0[1], w0[0], selector);
8995 w2[3] = __byte_perm_S (w0[0], 0, selector);
9010 w3[1] = __byte_perm_S (w0[1], w0[0], selector);
9011 w3[0] = __byte_perm_S (w0[0], 0, selector);
9027 w3[1] = __byte_perm_S (w0[0], 0, selector);
9047 * vector functions on scalar types (for inner loop usage)
9050 #define PACKVS2(sn,vn,e) \
9051 sn[0] = vn[0].s##e; \
9054 #define PACKSV2(sn,vn,e) \
9055 vn[0].s##e = sn[0]; \
9058 #define PACKVS24(s0,s1,v0,v1,e) \
9059 PACKVS4 (s0, v0, e); \
9060 PACKVS4 (s1, v1, e);
9062 #define PACKSV24(s0,s1,v0,v1,e) \
9063 PACKSV4 (s0, v0, e); \
9064 PACKSV4 (s1, v1, e);
9066 #define PACKVS4(sn,vn,e) \
9067 sn[0] = vn[0].s##e; \
9068 sn[1] = vn[1].s##e; \
9069 sn[2] = vn[2].s##e; \
9072 #define PACKSV4(sn,vn,e) \
9073 vn[0].s##e = sn[0]; \
9074 vn[1].s##e = sn[1]; \
9075 vn[2].s##e = sn[2]; \
9078 #define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
9079 PACKVS4 (s0, v0, e); \
9080 PACKVS4 (s1, v1, e); \
9081 PACKVS4 (s2, v2, e); \
9082 PACKVS4 (s3, v3, e);
9084 #define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
9085 PACKSV4 (s0, v0, e); \
9086 PACKSV4 (s1, v1, e); \
9087 PACKSV4 (s2, v2, e); \
9088 PACKSV4 (s3, v3, e);
9090 inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
9094 switch_buffer_by_offset_le_S (w0, w1, w2, w3, offset);
9107 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9108 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9110 #elif VECT_SIZE == 4
9112 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9113 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9114 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9115 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9117 #elif VECT_SIZE == 8
9119 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9120 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9121 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9122 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9123 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9124 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9125 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9126 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9128 #elif VECT_SIZE == 16
9130 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9131 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9132 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9133 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9134 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9135 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9136 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9137 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9138 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
9139 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
9140 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
9141 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
9142 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
9143 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
9144 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
9145 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
9150 inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
9154 append_0x01_2x4_S (w0, w1, offset);
9165 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9166 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9168 #elif VECT_SIZE == 4
9170 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9171 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9172 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9173 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9175 #elif VECT_SIZE == 8
9177 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9178 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9179 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9180 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9181 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9182 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9183 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9184 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9186 #elif VECT_SIZE == 16
9188 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9189 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9190 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9191 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9192 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9193 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9194 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9195 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9196 PACKVS24 (t0, t1, w0, w1, 8); append_0x01_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
9197 PACKVS24 (t0, t1, w0, w1, 9); append_0x01_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
9198 PACKVS24 (t0, t1, w0, w1, a); append_0x01_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
9199 PACKVS24 (t0, t1, w0, w1, b); append_0x01_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
9200 PACKVS24 (t0, t1, w0, w1, c); append_0x01_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
9201 PACKVS24 (t0, t1, w0, w1, d); append_0x01_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
9202 PACKVS24 (t0, t1, w0, w1, e); append_0x01_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
9203 PACKVS24 (t0, t1, w0, w1, f); append_0x01_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
9208 inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
9212 append_0x80_2x4_S (w0, w1, offset);
9223 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9224 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9226 #elif VECT_SIZE == 4
9228 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9229 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9230 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9231 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9233 #elif VECT_SIZE == 8
9235 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9236 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9237 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9238 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9239 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9240 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9241 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9242 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9244 #elif VECT_SIZE == 16
9246 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9247 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9248 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9249 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9250 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9251 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9252 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9253 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9254 PACKVS24 (t0, t1, w0, w1, 8); append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
9255 PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
9256 PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
9257 PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
9258 PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
9259 PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
9260 PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
9261 PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
9266 inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
9270 append_0x80_4x4_S (w0, w1, w2, w3, offset);
9283 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9284 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9286 #elif VECT_SIZE == 4
9288 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9289 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9290 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9291 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9293 #elif VECT_SIZE == 8
9295 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9296 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9297 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9298 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9299 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9300 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9301 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9302 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9304 #elif VECT_SIZE == 16
9306 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9307 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9308 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9309 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9310 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9311 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9312 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9313 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9314 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x80_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
9315 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x80_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
9316 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x80_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
9317 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x80_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
9318 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x80_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
9319 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x80_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
9320 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x80_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
9321 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x80_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
9326 __kernel void gpu_memset (__global uint4 *buf, const u32 value, const u32 gid_max)
9328 const u32 gid = get_global_id (0);
9330 if (gid >= gid_max) return;
9332 buf[gid] = (uint4) (value);