Added SIMD code for WPA/WPA2
[hashcat.git] / OpenCL / common.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
6 /**
7 * pure scalar functions
8 */
9
10 inline int hash_comp (const u32 d1[4], __global u32 *d2)
11 {
12 if (d1[3] > d2[DGST_R3]) return ( 1);
13 if (d1[3] < d2[DGST_R3]) return (-1);
14 if (d1[2] > d2[DGST_R2]) return ( 1);
15 if (d1[2] < d2[DGST_R2]) return (-1);
16 if (d1[1] > d2[DGST_R1]) return ( 1);
17 if (d1[1] < d2[DGST_R1]) return (-1);
18 if (d1[0] > d2[DGST_R0]) return ( 1);
19 if (d1[0] < d2[DGST_R0]) return (-1);
20
21 return (0);
22 }
23
24 inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global digest_t *digests_buf)
25 {
26 for (u32 l = 0, r = digests_cnt; r; r >>= 1)
27 {
28 const u32 m = r >> 1;
29
30 const u32 c = l + m;
31
32 const int cmp = hash_comp (digest, digests_buf[c].digest_buf);
33
34 if (cmp > 0)
35 {
36 l += m + 1;
37
38 r--;
39 }
40
41 if (cmp == 0) return (c);
42 }
43
44 return (-1);
45 }
46
47 inline u32 check_bitmap (__global u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
48 {
49 return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f)));
50 }
51
52 inline u32 check (const u32 digest[2], __global u32 *bitmap_s1_a, __global u32 *bitmap_s1_b, __global u32 *bitmap_s1_c, __global u32 *bitmap_s1_d, __global u32 *bitmap_s2_a, __global u32 *bitmap_s2_b, __global u32 *bitmap_s2_c, __global u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
53 {
54 if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
55 if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
56 if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0);
57 if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0);
58
59 if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0);
60 if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0);
61 if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0);
62 if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0);
63
64 return (1);
65 }
66
67 inline void mark_hash (__global plain_t *plains_buf, __global u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
68 {
69 hashes_shown[hash_pos] = 1;
70
71 plains_buf[hash_pos].gidvid = (gid * 1) + 0;
72 plains_buf[hash_pos].il_pos = il_pos;
73 }
74
75 /**
76 * vector functions
77 */
78
79 inline void truncate_block (u32x w[4], const u32 len)
80 {
81 switch (len)
82 {
83 case 0: w[0] &= 0;
84 w[1] &= 0;
85 w[2] &= 0;
86 w[3] &= 0;
87 break;
88 case 1: w[0] &= 0x000000FF;
89 w[1] &= 0;
90 w[2] &= 0;
91 w[3] &= 0;
92 break;
93 case 2: w[0] &= 0x0000FFFF;
94 w[1] &= 0;
95 w[2] &= 0;
96 w[3] &= 0;
97 break;
98 case 3: w[0] &= 0x00FFFFFF;
99 w[1] &= 0;
100 w[2] &= 0;
101 w[3] &= 0;
102 break;
103 case 4: w[1] &= 0;
104 w[2] &= 0;
105 w[3] &= 0;
106 break;
107 case 5: w[1] &= 0x000000FF;
108 w[2] &= 0;
109 w[3] &= 0;
110 break;
111 case 6: w[1] &= 0x0000FFFF;
112 w[2] &= 0;
113 w[3] &= 0;
114 break;
115 case 7: w[1] &= 0x00FFFFFF;
116 w[2] &= 0;
117 w[3] &= 0;
118 break;
119 case 8: w[2] &= 0;
120 w[3] &= 0;
121 break;
122 case 9: w[2] &= 0x000000FF;
123 w[3] &= 0;
124 break;
125 case 10: w[2] &= 0x0000FFFF;
126 w[3] &= 0;
127 break;
128 case 11: w[2] &= 0x00FFFFFF;
129 w[3] &= 0;
130 break;
131 case 12: w[3] &= 0;
132 break;
133 case 13: w[3] &= 0x000000FF;
134 break;
135 case 14: w[3] &= 0x0000FFFF;
136 break;
137 case 15: w[3] &= 0x00FFFFFF;
138 break;
139 }
140 }
141
142 inline void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
143 {
144 #ifdef IS_NV
145 out2[3] = __byte_perm (in[3], 0, 0x7372);
146 out2[2] = __byte_perm (in[3], 0, 0x7170);
147 out2[1] = __byte_perm (in[2], 0, 0x7372);
148 out2[0] = __byte_perm (in[2], 0, 0x7170);
149 out1[3] = __byte_perm (in[1], 0, 0x7372);
150 out1[2] = __byte_perm (in[1], 0, 0x7170);
151 out1[1] = __byte_perm (in[0], 0, 0x7372);
152 out1[0] = __byte_perm (in[0], 0, 0x7170);
153 #endif
154
155 #if defined IS_AMD || defined IS_GENERIC
156 out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
157 out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
158 out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
159 out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
160 out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
161 out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
162 out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
163 out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
164 #endif
165 }
166
167 inline void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
168 {
169 #ifdef IS_NV
170 out[0] = __byte_perm (in1[0], in1[1], 0x6420);
171 out[1] = __byte_perm (in1[2], in1[3], 0x6420);
172 out[2] = __byte_perm (in2[0], in2[1], 0x6420);
173 out[3] = __byte_perm (in2[2], in2[3], 0x6420);
174 #endif
175
176 #if defined IS_AMD || defined IS_GENERIC
177 out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
178 | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
179 out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
180 | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
181 out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
182 | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
183 out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
184 | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
185 #endif
186 }
187
188 inline void append_0x01_1x4 (u32x w0[4], const u32 offset)
189 {
190 switch (offset)
191 {
192 case 0:
193 w0[0] = 0x01;
194 break;
195
196 case 1:
197 w0[0] = w0[0] | 0x0100;
198 break;
199
200 case 2:
201 w0[0] = w0[0] | 0x010000;
202 break;
203
204 case 3:
205 w0[0] = w0[0] | 0x01000000;
206 break;
207
208 case 4:
209 w0[1] = 0x01;
210 break;
211
212 case 5:
213 w0[1] = w0[1] | 0x0100;
214 break;
215
216 case 6:
217 w0[1] = w0[1] | 0x010000;
218 break;
219
220 case 7:
221 w0[1] = w0[1] | 0x01000000;
222 break;
223
224 case 8:
225 w0[2] = 0x01;
226 break;
227
228 case 9:
229 w0[2] = w0[2] | 0x0100;
230 break;
231
232 case 10:
233 w0[2] = w0[2] | 0x010000;
234 break;
235
236 case 11:
237 w0[2] = w0[2] | 0x01000000;
238 break;
239
240 case 12:
241 w0[3] = 0x01;
242 break;
243
244 case 13:
245 w0[3] = w0[3] | 0x0100;
246 break;
247
248 case 14:
249 w0[3] = w0[3] | 0x010000;
250 break;
251
252 case 15:
253 w0[3] = w0[3] | 0x01000000;
254 break;
255 }
256 }
257
258 inline void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
259 {
260 switch (offset)
261 {
262 case 0:
263 w0[0] = 0x01;
264 break;
265
266 case 1:
267 w0[0] = w0[0] | 0x0100;
268 break;
269
270 case 2:
271 w0[0] = w0[0] | 0x010000;
272 break;
273
274 case 3:
275 w0[0] = w0[0] | 0x01000000;
276 break;
277
278 case 4:
279 w0[1] = 0x01;
280 break;
281
282 case 5:
283 w0[1] = w0[1] | 0x0100;
284 break;
285
286 case 6:
287 w0[1] = w0[1] | 0x010000;
288 break;
289
290 case 7:
291 w0[1] = w0[1] | 0x01000000;
292 break;
293
294 case 8:
295 w0[2] = 0x01;
296 break;
297
298 case 9:
299 w0[2] = w0[2] | 0x0100;
300 break;
301
302 case 10:
303 w0[2] = w0[2] | 0x010000;
304 break;
305
306 case 11:
307 w0[2] = w0[2] | 0x01000000;
308 break;
309
310 case 12:
311 w0[3] = 0x01;
312 break;
313
314 case 13:
315 w0[3] = w0[3] | 0x0100;
316 break;
317
318 case 14:
319 w0[3] = w0[3] | 0x010000;
320 break;
321
322 case 15:
323 w0[3] = w0[3] | 0x01000000;
324 break;
325
326 case 16:
327 w1[0] = 0x01;
328 break;
329
330 case 17:
331 w1[0] = w1[0] | 0x0100;
332 break;
333
334 case 18:
335 w1[0] = w1[0] | 0x010000;
336 break;
337
338 case 19:
339 w1[0] = w1[0] | 0x01000000;
340 break;
341
342 case 20:
343 w1[1] = 0x01;
344 break;
345
346 case 21:
347 w1[1] = w1[1] | 0x0100;
348 break;
349
350 case 22:
351 w1[1] = w1[1] | 0x010000;
352 break;
353
354 case 23:
355 w1[1] = w1[1] | 0x01000000;
356 break;
357
358 case 24:
359 w1[2] = 0x01;
360 break;
361
362 case 25:
363 w1[2] = w1[2] | 0x0100;
364 break;
365
366 case 26:
367 w1[2] = w1[2] | 0x010000;
368 break;
369
370 case 27:
371 w1[2] = w1[2] | 0x01000000;
372 break;
373
374 case 28:
375 w1[3] = 0x01;
376 break;
377
378 case 29:
379 w1[3] = w1[3] | 0x0100;
380 break;
381
382 case 30:
383 w1[3] = w1[3] | 0x010000;
384 break;
385
386 case 31:
387 w1[3] = w1[3] | 0x01000000;
388 break;
389 }
390 }
391
392 inline void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
393 {
394 switch (offset)
395 {
396 case 0:
397 w0[0] = 0x01;
398 break;
399
400 case 1:
401 w0[0] = w0[0] | 0x0100;
402 break;
403
404 case 2:
405 w0[0] = w0[0] | 0x010000;
406 break;
407
408 case 3:
409 w0[0] = w0[0] | 0x01000000;
410 break;
411
412 case 4:
413 w0[1] = 0x01;
414 break;
415
416 case 5:
417 w0[1] = w0[1] | 0x0100;
418 break;
419
420 case 6:
421 w0[1] = w0[1] | 0x010000;
422 break;
423
424 case 7:
425 w0[1] = w0[1] | 0x01000000;
426 break;
427
428 case 8:
429 w0[2] = 0x01;
430 break;
431
432 case 9:
433 w0[2] = w0[2] | 0x0100;
434 break;
435
436 case 10:
437 w0[2] = w0[2] | 0x010000;
438 break;
439
440 case 11:
441 w0[2] = w0[2] | 0x01000000;
442 break;
443
444 case 12:
445 w0[3] = 0x01;
446 break;
447
448 case 13:
449 w0[3] = w0[3] | 0x0100;
450 break;
451
452 case 14:
453 w0[3] = w0[3] | 0x010000;
454 break;
455
456 case 15:
457 w0[3] = w0[3] | 0x01000000;
458 break;
459
460 case 16:
461 w1[0] = 0x01;
462 break;
463
464 case 17:
465 w1[0] = w1[0] | 0x0100;
466 break;
467
468 case 18:
469 w1[0] = w1[0] | 0x010000;
470 break;
471
472 case 19:
473 w1[0] = w1[0] | 0x01000000;
474 break;
475
476 case 20:
477 w1[1] = 0x01;
478 break;
479
480 case 21:
481 w1[1] = w1[1] | 0x0100;
482 break;
483
484 case 22:
485 w1[1] = w1[1] | 0x010000;
486 break;
487
488 case 23:
489 w1[1] = w1[1] | 0x01000000;
490 break;
491
492 case 24:
493 w1[2] = 0x01;
494 break;
495
496 case 25:
497 w1[2] = w1[2] | 0x0100;
498 break;
499
500 case 26:
501 w1[2] = w1[2] | 0x010000;
502 break;
503
504 case 27:
505 w1[2] = w1[2] | 0x01000000;
506 break;
507
508 case 28:
509 w1[3] = 0x01;
510 break;
511
512 case 29:
513 w1[3] = w1[3] | 0x0100;
514 break;
515
516 case 30:
517 w1[3] = w1[3] | 0x010000;
518 break;
519
520 case 31:
521 w1[3] = w1[3] | 0x01000000;
522 break;
523
524 case 32:
525 w2[0] = 0x01;
526 break;
527
528 case 33:
529 w2[0] = w2[0] | 0x0100;
530 break;
531
532 case 34:
533 w2[0] = w2[0] | 0x010000;
534 break;
535
536 case 35:
537 w2[0] = w2[0] | 0x01000000;
538 break;
539
540 case 36:
541 w2[1] = 0x01;
542 break;
543
544 case 37:
545 w2[1] = w2[1] | 0x0100;
546 break;
547
548 case 38:
549 w2[1] = w2[1] | 0x010000;
550 break;
551
552 case 39:
553 w2[1] = w2[1] | 0x01000000;
554 break;
555
556 case 40:
557 w2[2] = 0x01;
558 break;
559
560 case 41:
561 w2[2] = w2[2] | 0x0100;
562 break;
563
564 case 42:
565 w2[2] = w2[2] | 0x010000;
566 break;
567
568 case 43:
569 w2[2] = w2[2] | 0x01000000;
570 break;
571
572 case 44:
573 w2[3] = 0x01;
574 break;
575
576 case 45:
577 w2[3] = w2[3] | 0x0100;
578 break;
579
580 case 46:
581 w2[3] = w2[3] | 0x010000;
582 break;
583
584 case 47:
585 w2[3] = w2[3] | 0x01000000;
586 break;
587 }
588 }
589
590 inline void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
591 {
592 switch (offset)
593 {
594 case 0:
595 w0[0] = 0x01;
596 break;
597
598 case 1:
599 w0[0] = w0[0] | 0x0100;
600 break;
601
602 case 2:
603 w0[0] = w0[0] | 0x010000;
604 break;
605
606 case 3:
607 w0[0] = w0[0] | 0x01000000;
608 break;
609
610 case 4:
611 w0[1] = 0x01;
612 break;
613
614 case 5:
615 w0[1] = w0[1] | 0x0100;
616 break;
617
618 case 6:
619 w0[1] = w0[1] | 0x010000;
620 break;
621
622 case 7:
623 w0[1] = w0[1] | 0x01000000;
624 break;
625
626 case 8:
627 w0[2] = 0x01;
628 break;
629
630 case 9:
631 w0[2] = w0[2] | 0x0100;
632 break;
633
634 case 10:
635 w0[2] = w0[2] | 0x010000;
636 break;
637
638 case 11:
639 w0[2] = w0[2] | 0x01000000;
640 break;
641
642 case 12:
643 w0[3] = 0x01;
644 break;
645
646 case 13:
647 w0[3] = w0[3] | 0x0100;
648 break;
649
650 case 14:
651 w0[3] = w0[3] | 0x010000;
652 break;
653
654 case 15:
655 w0[3] = w0[3] | 0x01000000;
656 break;
657
658 case 16:
659 w1[0] = 0x01;
660 break;
661
662 case 17:
663 w1[0] = w1[0] | 0x0100;
664 break;
665
666 case 18:
667 w1[0] = w1[0] | 0x010000;
668 break;
669
670 case 19:
671 w1[0] = w1[0] | 0x01000000;
672 break;
673
674 case 20:
675 w1[1] = 0x01;
676 break;
677
678 case 21:
679 w1[1] = w1[1] | 0x0100;
680 break;
681
682 case 22:
683 w1[1] = w1[1] | 0x010000;
684 break;
685
686 case 23:
687 w1[1] = w1[1] | 0x01000000;
688 break;
689
690 case 24:
691 w1[2] = 0x01;
692 break;
693
694 case 25:
695 w1[2] = w1[2] | 0x0100;
696 break;
697
698 case 26:
699 w1[2] = w1[2] | 0x010000;
700 break;
701
702 case 27:
703 w1[2] = w1[2] | 0x01000000;
704 break;
705
706 case 28:
707 w1[3] = 0x01;
708 break;
709
710 case 29:
711 w1[3] = w1[3] | 0x0100;
712 break;
713
714 case 30:
715 w1[3] = w1[3] | 0x010000;
716 break;
717
718 case 31:
719 w1[3] = w1[3] | 0x01000000;
720 break;
721
722 case 32:
723 w2[0] = 0x01;
724 break;
725
726 case 33:
727 w2[0] = w2[0] | 0x0100;
728 break;
729
730 case 34:
731 w2[0] = w2[0] | 0x010000;
732 break;
733
734 case 35:
735 w2[0] = w2[0] | 0x01000000;
736 break;
737
738 case 36:
739 w2[1] = 0x01;
740 break;
741
742 case 37:
743 w2[1] = w2[1] | 0x0100;
744 break;
745
746 case 38:
747 w2[1] = w2[1] | 0x010000;
748 break;
749
750 case 39:
751 w2[1] = w2[1] | 0x01000000;
752 break;
753
754 case 40:
755 w2[2] = 0x01;
756 break;
757
758 case 41:
759 w2[2] = w2[2] | 0x0100;
760 break;
761
762 case 42:
763 w2[2] = w2[2] | 0x010000;
764 break;
765
766 case 43:
767 w2[2] = w2[2] | 0x01000000;
768 break;
769
770 case 44:
771 w2[3] = 0x01;
772 break;
773
774 case 45:
775 w2[3] = w2[3] | 0x0100;
776 break;
777
778 case 46:
779 w2[3] = w2[3] | 0x010000;
780 break;
781
782 case 47:
783 w2[3] = w2[3] | 0x01000000;
784 break;
785
786 case 48:
787 w3[0] = 0x01;
788 break;
789
790 case 49:
791 w3[0] = w3[0] | 0x0100;
792 break;
793
794 case 50:
795 w3[0] = w3[0] | 0x010000;
796 break;
797
798 case 51:
799 w3[0] = w3[0] | 0x01000000;
800 break;
801
802 case 52:
803 w3[1] = 0x01;
804 break;
805
806 case 53:
807 w3[1] = w3[1] | 0x0100;
808 break;
809
810 case 54:
811 w3[1] = w3[1] | 0x010000;
812 break;
813
814 case 55:
815 w3[1] = w3[1] | 0x01000000;
816 break;
817
818 case 56:
819 w3[2] = 0x01;
820 break;
821
822 case 57:
823 w3[2] = w3[2] | 0x0100;
824 break;
825
826 case 58:
827 w3[2] = w3[2] | 0x010000;
828 break;
829
830 case 59:
831 w3[2] = w3[2] | 0x01000000;
832 break;
833
834 case 60:
835 w3[3] = 0x01;
836 break;
837
838 case 61:
839 w3[3] = w3[3] | 0x0100;
840 break;
841
842 case 62:
843 w3[3] = w3[3] | 0x010000;
844 break;
845
846 case 63:
847 w3[3] = w3[3] | 0x01000000;
848 break;
849 }
850 }
851
852 inline void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
853 {
854 switch (offset)
855 {
856 case 0:
857 w0[0] = 0x01;
858 break;
859
860 case 1:
861 w0[0] = w0[0] | 0x0100;
862 break;
863
864 case 2:
865 w0[0] = w0[0] | 0x010000;
866 break;
867
868 case 3:
869 w0[0] = w0[0] | 0x01000000;
870 break;
871
872 case 4:
873 w0[1] = 0x01;
874 break;
875
876 case 5:
877 w0[1] = w0[1] | 0x0100;
878 break;
879
880 case 6:
881 w0[1] = w0[1] | 0x010000;
882 break;
883
884 case 7:
885 w0[1] = w0[1] | 0x01000000;
886 break;
887
888 case 8:
889 w0[2] = 0x01;
890 break;
891
892 case 9:
893 w0[2] = w0[2] | 0x0100;
894 break;
895
896 case 10:
897 w0[2] = w0[2] | 0x010000;
898 break;
899
900 case 11:
901 w0[2] = w0[2] | 0x01000000;
902 break;
903
904 case 12:
905 w0[3] = 0x01;
906 break;
907
908 case 13:
909 w0[3] = w0[3] | 0x0100;
910 break;
911
912 case 14:
913 w0[3] = w0[3] | 0x010000;
914 break;
915
916 case 15:
917 w0[3] = w0[3] | 0x01000000;
918 break;
919
920 case 16:
921 w1[0] = 0x01;
922 break;
923
924 case 17:
925 w1[0] = w1[0] | 0x0100;
926 break;
927
928 case 18:
929 w1[0] = w1[0] | 0x010000;
930 break;
931
932 case 19:
933 w1[0] = w1[0] | 0x01000000;
934 break;
935
936 case 20:
937 w1[1] = 0x01;
938 break;
939
940 case 21:
941 w1[1] = w1[1] | 0x0100;
942 break;
943
944 case 22:
945 w1[1] = w1[1] | 0x010000;
946 break;
947
948 case 23:
949 w1[1] = w1[1] | 0x01000000;
950 break;
951
952 case 24:
953 w1[2] = 0x01;
954 break;
955
956 case 25:
957 w1[2] = w1[2] | 0x0100;
958 break;
959
960 case 26:
961 w1[2] = w1[2] | 0x010000;
962 break;
963
964 case 27:
965 w1[2] = w1[2] | 0x01000000;
966 break;
967
968 case 28:
969 w1[3] = 0x01;
970 break;
971
972 case 29:
973 w1[3] = w1[3] | 0x0100;
974 break;
975
976 case 30:
977 w1[3] = w1[3] | 0x010000;
978 break;
979
980 case 31:
981 w1[3] = w1[3] | 0x01000000;
982 break;
983
984 case 32:
985 w2[0] = 0x01;
986 break;
987
988 case 33:
989 w2[0] = w2[0] | 0x0100;
990 break;
991
992 case 34:
993 w2[0] = w2[0] | 0x010000;
994 break;
995
996 case 35:
997 w2[0] = w2[0] | 0x01000000;
998 break;
999
1000 case 36:
1001 w2[1] = 0x01;
1002 break;
1003
1004 case 37:
1005 w2[1] = w2[1] | 0x0100;
1006 break;
1007
1008 case 38:
1009 w2[1] = w2[1] | 0x010000;
1010 break;
1011
1012 case 39:
1013 w2[1] = w2[1] | 0x01000000;
1014 break;
1015
1016 case 40:
1017 w2[2] = 0x01;
1018 break;
1019
1020 case 41:
1021 w2[2] = w2[2] | 0x0100;
1022 break;
1023
1024 case 42:
1025 w2[2] = w2[2] | 0x010000;
1026 break;
1027
1028 case 43:
1029 w2[2] = w2[2] | 0x01000000;
1030 break;
1031
1032 case 44:
1033 w2[3] = 0x01;
1034 break;
1035
1036 case 45:
1037 w2[3] = w2[3] | 0x0100;
1038 break;
1039
1040 case 46:
1041 w2[3] = w2[3] | 0x010000;
1042 break;
1043
1044 case 47:
1045 w2[3] = w2[3] | 0x01000000;
1046 break;
1047
1048 case 48:
1049 w3[0] = 0x01;
1050 break;
1051
1052 case 49:
1053 w3[0] = w3[0] | 0x0100;
1054 break;
1055
1056 case 50:
1057 w3[0] = w3[0] | 0x010000;
1058 break;
1059
1060 case 51:
1061 w3[0] = w3[0] | 0x01000000;
1062 break;
1063
1064 case 52:
1065 w3[1] = 0x01;
1066 break;
1067
1068 case 53:
1069 w3[1] = w3[1] | 0x0100;
1070 break;
1071
1072 case 54:
1073 w3[1] = w3[1] | 0x010000;
1074 break;
1075
1076 case 55:
1077 w3[1] = w3[1] | 0x01000000;
1078 break;
1079
1080 case 56:
1081 w3[2] = 0x01;
1082 break;
1083
1084 case 57:
1085 w3[2] = w3[2] | 0x0100;
1086 break;
1087
1088 case 58:
1089 w3[2] = w3[2] | 0x010000;
1090 break;
1091
1092 case 59:
1093 w3[2] = w3[2] | 0x01000000;
1094 break;
1095
1096 case 60:
1097 w3[3] = 0x01;
1098 break;
1099
1100 case 61:
1101 w3[3] = w3[3] | 0x0100;
1102 break;
1103
1104 case 62:
1105 w3[3] = w3[3] | 0x010000;
1106 break;
1107
1108 case 63:
1109 w3[3] = w3[3] | 0x01000000;
1110 break;
1111
1112 case 64:
1113 w4[0] = 0x01;
1114 break;
1115
1116 case 65:
1117 w4[0] = w4[0] | 0x0100;
1118 break;
1119
1120 case 66:
1121 w4[0] = w4[0] | 0x010000;
1122 break;
1123
1124 case 67:
1125 w4[0] = w4[0] | 0x01000000;
1126 break;
1127
1128 case 68:
1129 w4[1] = 0x01;
1130 break;
1131
1132 case 69:
1133 w4[1] = w4[1] | 0x0100;
1134 break;
1135
1136 case 70:
1137 w4[1] = w4[1] | 0x010000;
1138 break;
1139
1140 case 71:
1141 w4[1] = w4[1] | 0x01000000;
1142 break;
1143
1144 case 72:
1145 w4[2] = 0x01;
1146 break;
1147
1148 case 73:
1149 w4[2] = w4[2] | 0x0100;
1150 break;
1151
1152 case 74:
1153 w4[2] = w4[2] | 0x010000;
1154 break;
1155
1156 case 75:
1157 w4[2] = w4[2] | 0x01000000;
1158 break;
1159
1160 case 76:
1161 w4[3] = 0x01;
1162 break;
1163
1164 case 77:
1165 w4[3] = w4[3] | 0x0100;
1166 break;
1167
1168 case 78:
1169 w4[3] = w4[3] | 0x010000;
1170 break;
1171
1172 case 79:
1173 w4[3] = w4[3] | 0x01000000;
1174 break;
1175
1176 case 80:
1177 w5[0] = 0x01;
1178 break;
1179
1180 case 81:
1181 w5[0] = w5[0] | 0x0100;
1182 break;
1183
1184 case 82:
1185 w5[0] = w5[0] | 0x010000;
1186 break;
1187
1188 case 83:
1189 w5[0] = w5[0] | 0x01000000;
1190 break;
1191
1192 case 84:
1193 w5[1] = 0x01;
1194 break;
1195
1196 case 85:
1197 w5[1] = w5[1] | 0x0100;
1198 break;
1199
1200 case 86:
1201 w5[1] = w5[1] | 0x010000;
1202 break;
1203
1204 case 87:
1205 w5[1] = w5[1] | 0x01000000;
1206 break;
1207
1208 case 88:
1209 w5[2] = 0x01;
1210 break;
1211
1212 case 89:
1213 w5[2] = w5[2] | 0x0100;
1214 break;
1215
1216 case 90:
1217 w5[2] = w5[2] | 0x010000;
1218 break;
1219
1220 case 91:
1221 w5[2] = w5[2] | 0x01000000;
1222 break;
1223
1224 case 92:
1225 w5[3] = 0x01;
1226 break;
1227
1228 case 93:
1229 w5[3] = w5[3] | 0x0100;
1230 break;
1231
1232 case 94:
1233 w5[3] = w5[3] | 0x010000;
1234 break;
1235
1236 case 95:
1237 w5[3] = w5[3] | 0x01000000;
1238 break;
1239
1240 case 96:
1241 w6[0] = 0x01;
1242 break;
1243
1244 case 97:
1245 w6[0] = w6[0] | 0x0100;
1246 break;
1247
1248 case 98:
1249 w6[0] = w6[0] | 0x010000;
1250 break;
1251
1252 case 99:
1253 w6[0] = w6[0] | 0x01000000;
1254 break;
1255
1256 case 100:
1257 w6[1] = 0x01;
1258 break;
1259
1260 case 101:
1261 w6[1] = w6[1] | 0x0100;
1262 break;
1263
1264 case 102:
1265 w6[1] = w6[1] | 0x010000;
1266 break;
1267
1268 case 103:
1269 w6[1] = w6[1] | 0x01000000;
1270 break;
1271
1272 case 104:
1273 w6[2] = 0x01;
1274 break;
1275
1276 case 105:
1277 w6[2] = w6[2] | 0x0100;
1278 break;
1279
1280 case 106:
1281 w6[2] = w6[2] | 0x010000;
1282 break;
1283
1284 case 107:
1285 w6[2] = w6[2] | 0x01000000;
1286 break;
1287
1288 case 108:
1289 w6[3] = 0x01;
1290 break;
1291
1292 case 109:
1293 w6[3] = w6[3] | 0x0100;
1294 break;
1295
1296 case 110:
1297 w6[3] = w6[3] | 0x010000;
1298 break;
1299
1300 case 111:
1301 w6[3] = w6[3] | 0x01000000;
1302 break;
1303
1304 case 112:
1305 w7[0] = 0x01;
1306 break;
1307
1308 case 113:
1309 w7[0] = w7[0] | 0x0100;
1310 break;
1311
1312 case 114:
1313 w7[0] = w7[0] | 0x010000;
1314 break;
1315
1316 case 115:
1317 w7[0] = w7[0] | 0x01000000;
1318 break;
1319
1320 case 116:
1321 w7[1] = 0x01;
1322 break;
1323
1324 case 117:
1325 w7[1] = w7[1] | 0x0100;
1326 break;
1327
1328 case 118:
1329 w7[1] = w7[1] | 0x010000;
1330 break;
1331
1332 case 119:
1333 w7[1] = w7[1] | 0x01000000;
1334 break;
1335
1336 case 120:
1337 w7[2] = 0x01;
1338 break;
1339
1340 case 121:
1341 w7[2] = w7[2] | 0x0100;
1342 break;
1343
1344 case 122:
1345 w7[2] = w7[2] | 0x010000;
1346 break;
1347
1348 case 123:
1349 w7[2] = w7[2] | 0x01000000;
1350 break;
1351
1352 case 124:
1353 w7[3] = 0x01;
1354 break;
1355
1356 case 125:
1357 w7[3] = w7[3] | 0x0100;
1358 break;
1359
1360 case 126:
1361 w7[3] = w7[3] | 0x010000;
1362 break;
1363
1364 case 127:
1365 w7[3] = w7[3] | 0x01000000;
1366 break;
1367 }
1368 }
1369
1370 inline void append_0x02_1x4 (u32x w0[4], const u32 offset)
1371 {
1372 switch (offset)
1373 {
1374 case 0:
1375 w0[0] = 0x02;
1376 break;
1377
1378 case 1:
1379 w0[0] = w0[0] | 0x0200;
1380 break;
1381
1382 case 2:
1383 w0[0] = w0[0] | 0x020000;
1384 break;
1385
1386 case 3:
1387 w0[0] = w0[0] | 0x02000000;
1388 break;
1389
1390 case 4:
1391 w0[1] = 0x02;
1392 break;
1393
1394 case 5:
1395 w0[1] = w0[1] | 0x0200;
1396 break;
1397
1398 case 6:
1399 w0[1] = w0[1] | 0x020000;
1400 break;
1401
1402 case 7:
1403 w0[1] = w0[1] | 0x02000000;
1404 break;
1405
1406 case 8:
1407 w0[2] = 0x02;
1408 break;
1409
1410 case 9:
1411 w0[2] = w0[2] | 0x0200;
1412 break;
1413
1414 case 10:
1415 w0[2] = w0[2] | 0x020000;
1416 break;
1417
1418 case 11:
1419 w0[2] = w0[2] | 0x02000000;
1420 break;
1421
1422 case 12:
1423 w0[3] = 0x02;
1424 break;
1425
1426 case 13:
1427 w0[3] = w0[3] | 0x0200;
1428 break;
1429
1430 case 14:
1431 w0[3] = w0[3] | 0x020000;
1432 break;
1433
1434 case 15:
1435 w0[3] = w0[3] | 0x02000000;
1436 break;
1437 }
1438 }
1439
1440 inline void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
1441 {
1442 switch (offset)
1443 {
1444 case 0:
1445 w0[0] = 0x02;
1446 break;
1447
1448 case 1:
1449 w0[0] = w0[0] | 0x0200;
1450 break;
1451
1452 case 2:
1453 w0[0] = w0[0] | 0x020000;
1454 break;
1455
1456 case 3:
1457 w0[0] = w0[0] | 0x02000000;
1458 break;
1459
1460 case 4:
1461 w0[1] = 0x02;
1462 break;
1463
1464 case 5:
1465 w0[1] = w0[1] | 0x0200;
1466 break;
1467
1468 case 6:
1469 w0[1] = w0[1] | 0x020000;
1470 break;
1471
1472 case 7:
1473 w0[1] = w0[1] | 0x02000000;
1474 break;
1475
1476 case 8:
1477 w0[2] = 0x02;
1478 break;
1479
1480 case 9:
1481 w0[2] = w0[2] | 0x0200;
1482 break;
1483
1484 case 10:
1485 w0[2] = w0[2] | 0x020000;
1486 break;
1487
1488 case 11:
1489 w0[2] = w0[2] | 0x02000000;
1490 break;
1491
1492 case 12:
1493 w0[3] = 0x02;
1494 break;
1495
1496 case 13:
1497 w0[3] = w0[3] | 0x0200;
1498 break;
1499
1500 case 14:
1501 w0[3] = w0[3] | 0x020000;
1502 break;
1503
1504 case 15:
1505 w0[3] = w0[3] | 0x02000000;
1506 break;
1507
1508 case 16:
1509 w1[0] = 0x02;
1510 break;
1511
1512 case 17:
1513 w1[0] = w1[0] | 0x0200;
1514 break;
1515
1516 case 18:
1517 w1[0] = w1[0] | 0x020000;
1518 break;
1519
1520 case 19:
1521 w1[0] = w1[0] | 0x02000000;
1522 break;
1523
1524 case 20:
1525 w1[1] = 0x02;
1526 break;
1527
1528 case 21:
1529 w1[1] = w1[1] | 0x0200;
1530 break;
1531
1532 case 22:
1533 w1[1] = w1[1] | 0x020000;
1534 break;
1535
1536 case 23:
1537 w1[1] = w1[1] | 0x02000000;
1538 break;
1539
1540 case 24:
1541 w1[2] = 0x02;
1542 break;
1543
1544 case 25:
1545 w1[2] = w1[2] | 0x0200;
1546 break;
1547
1548 case 26:
1549 w1[2] = w1[2] | 0x020000;
1550 break;
1551
1552 case 27:
1553 w1[2] = w1[2] | 0x02000000;
1554 break;
1555
1556 case 28:
1557 w1[3] = 0x02;
1558 break;
1559
1560 case 29:
1561 w1[3] = w1[3] | 0x0200;
1562 break;
1563
1564 case 30:
1565 w1[3] = w1[3] | 0x020000;
1566 break;
1567
1568 case 31:
1569 w1[3] = w1[3] | 0x02000000;
1570 break;
1571 }
1572 }
1573
1574 inline void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
1575 {
1576 switch (offset)
1577 {
1578 case 0:
1579 w0[0] = 0x02;
1580 break;
1581
1582 case 1:
1583 w0[0] = w0[0] | 0x0200;
1584 break;
1585
1586 case 2:
1587 w0[0] = w0[0] | 0x020000;
1588 break;
1589
1590 case 3:
1591 w0[0] = w0[0] | 0x02000000;
1592 break;
1593
1594 case 4:
1595 w0[1] = 0x02;
1596 break;
1597
1598 case 5:
1599 w0[1] = w0[1] | 0x0200;
1600 break;
1601
1602 case 6:
1603 w0[1] = w0[1] | 0x020000;
1604 break;
1605
1606 case 7:
1607 w0[1] = w0[1] | 0x02000000;
1608 break;
1609
1610 case 8:
1611 w0[2] = 0x02;
1612 break;
1613
1614 case 9:
1615 w0[2] = w0[2] | 0x0200;
1616 break;
1617
1618 case 10:
1619 w0[2] = w0[2] | 0x020000;
1620 break;
1621
1622 case 11:
1623 w0[2] = w0[2] | 0x02000000;
1624 break;
1625
1626 case 12:
1627 w0[3] = 0x02;
1628 break;
1629
1630 case 13:
1631 w0[3] = w0[3] | 0x0200;
1632 break;
1633
1634 case 14:
1635 w0[3] = w0[3] | 0x020000;
1636 break;
1637
1638 case 15:
1639 w0[3] = w0[3] | 0x02000000;
1640 break;
1641
1642 case 16:
1643 w1[0] = 0x02;
1644 break;
1645
1646 case 17:
1647 w1[0] = w1[0] | 0x0200;
1648 break;
1649
1650 case 18:
1651 w1[0] = w1[0] | 0x020000;
1652 break;
1653
1654 case 19:
1655 w1[0] = w1[0] | 0x02000000;
1656 break;
1657
1658 case 20:
1659 w1[1] = 0x02;
1660 break;
1661
1662 case 21:
1663 w1[1] = w1[1] | 0x0200;
1664 break;
1665
1666 case 22:
1667 w1[1] = w1[1] | 0x020000;
1668 break;
1669
1670 case 23:
1671 w1[1] = w1[1] | 0x02000000;
1672 break;
1673
1674 case 24:
1675 w1[2] = 0x02;
1676 break;
1677
1678 case 25:
1679 w1[2] = w1[2] | 0x0200;
1680 break;
1681
1682 case 26:
1683 w1[2] = w1[2] | 0x020000;
1684 break;
1685
1686 case 27:
1687 w1[2] = w1[2] | 0x02000000;
1688 break;
1689
1690 case 28:
1691 w1[3] = 0x02;
1692 break;
1693
1694 case 29:
1695 w1[3] = w1[3] | 0x0200;
1696 break;
1697
1698 case 30:
1699 w1[3] = w1[3] | 0x020000;
1700 break;
1701
1702 case 31:
1703 w1[3] = w1[3] | 0x02000000;
1704 break;
1705
1706 case 32:
1707 w2[0] = 0x02;
1708 break;
1709
1710 case 33:
1711 w2[0] = w2[0] | 0x0200;
1712 break;
1713
1714 case 34:
1715 w2[0] = w2[0] | 0x020000;
1716 break;
1717
1718 case 35:
1719 w2[0] = w2[0] | 0x02000000;
1720 break;
1721
1722 case 36:
1723 w2[1] = 0x02;
1724 break;
1725
1726 case 37:
1727 w2[1] = w2[1] | 0x0200;
1728 break;
1729
1730 case 38:
1731 w2[1] = w2[1] | 0x020000;
1732 break;
1733
1734 case 39:
1735 w2[1] = w2[1] | 0x02000000;
1736 break;
1737
1738 case 40:
1739 w2[2] = 0x02;
1740 break;
1741
1742 case 41:
1743 w2[2] = w2[2] | 0x0200;
1744 break;
1745
1746 case 42:
1747 w2[2] = w2[2] | 0x020000;
1748 break;
1749
1750 case 43:
1751 w2[2] = w2[2] | 0x02000000;
1752 break;
1753
1754 case 44:
1755 w2[3] = 0x02;
1756 break;
1757
1758 case 45:
1759 w2[3] = w2[3] | 0x0200;
1760 break;
1761
1762 case 46:
1763 w2[3] = w2[3] | 0x020000;
1764 break;
1765
1766 case 47:
1767 w2[3] = w2[3] | 0x02000000;
1768 break;
1769 }
1770 }
1771
1772 inline void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
1773 {
1774 switch (offset)
1775 {
1776 case 0:
1777 w0[0] = 0x02;
1778 break;
1779
1780 case 1:
1781 w0[0] = w0[0] | 0x0200;
1782 break;
1783
1784 case 2:
1785 w0[0] = w0[0] | 0x020000;
1786 break;
1787
1788 case 3:
1789 w0[0] = w0[0] | 0x02000000;
1790 break;
1791
1792 case 4:
1793 w0[1] = 0x02;
1794 break;
1795
1796 case 5:
1797 w0[1] = w0[1] | 0x0200;
1798 break;
1799
1800 case 6:
1801 w0[1] = w0[1] | 0x020000;
1802 break;
1803
1804 case 7:
1805 w0[1] = w0[1] | 0x02000000;
1806 break;
1807
1808 case 8:
1809 w0[2] = 0x02;
1810 break;
1811
1812 case 9:
1813 w0[2] = w0[2] | 0x0200;
1814 break;
1815
1816 case 10:
1817 w0[2] = w0[2] | 0x020000;
1818 break;
1819
1820 case 11:
1821 w0[2] = w0[2] | 0x02000000;
1822 break;
1823
1824 case 12:
1825 w0[3] = 0x02;
1826 break;
1827
1828 case 13:
1829 w0[3] = w0[3] | 0x0200;
1830 break;
1831
1832 case 14:
1833 w0[3] = w0[3] | 0x020000;
1834 break;
1835
1836 case 15:
1837 w0[3] = w0[3] | 0x02000000;
1838 break;
1839
1840 case 16:
1841 w1[0] = 0x02;
1842 break;
1843
1844 case 17:
1845 w1[0] = w1[0] | 0x0200;
1846 break;
1847
1848 case 18:
1849 w1[0] = w1[0] | 0x020000;
1850 break;
1851
1852 case 19:
1853 w1[0] = w1[0] | 0x02000000;
1854 break;
1855
1856 case 20:
1857 w1[1] = 0x02;
1858 break;
1859
1860 case 21:
1861 w1[1] = w1[1] | 0x0200;
1862 break;
1863
1864 case 22:
1865 w1[1] = w1[1] | 0x020000;
1866 break;
1867
1868 case 23:
1869 w1[1] = w1[1] | 0x02000000;
1870 break;
1871
1872 case 24:
1873 w1[2] = 0x02;
1874 break;
1875
1876 case 25:
1877 w1[2] = w1[2] | 0x0200;
1878 break;
1879
1880 case 26:
1881 w1[2] = w1[2] | 0x020000;
1882 break;
1883
1884 case 27:
1885 w1[2] = w1[2] | 0x02000000;
1886 break;
1887
1888 case 28:
1889 w1[3] = 0x02;
1890 break;
1891
1892 case 29:
1893 w1[3] = w1[3] | 0x0200;
1894 break;
1895
1896 case 30:
1897 w1[3] = w1[3] | 0x020000;
1898 break;
1899
1900 case 31:
1901 w1[3] = w1[3] | 0x02000000;
1902 break;
1903
1904 case 32:
1905 w2[0] = 0x02;
1906 break;
1907
1908 case 33:
1909 w2[0] = w2[0] | 0x0200;
1910 break;
1911
1912 case 34:
1913 w2[0] = w2[0] | 0x020000;
1914 break;
1915
1916 case 35:
1917 w2[0] = w2[0] | 0x02000000;
1918 break;
1919
1920 case 36:
1921 w2[1] = 0x02;
1922 break;
1923
1924 case 37:
1925 w2[1] = w2[1] | 0x0200;
1926 break;
1927
1928 case 38:
1929 w2[1] = w2[1] | 0x020000;
1930 break;
1931
1932 case 39:
1933 w2[1] = w2[1] | 0x02000000;
1934 break;
1935
1936 case 40:
1937 w2[2] = 0x02;
1938 break;
1939
1940 case 41:
1941 w2[2] = w2[2] | 0x0200;
1942 break;
1943
1944 case 42:
1945 w2[2] = w2[2] | 0x020000;
1946 break;
1947
1948 case 43:
1949 w2[2] = w2[2] | 0x02000000;
1950 break;
1951
1952 case 44:
1953 w2[3] = 0x02;
1954 break;
1955
1956 case 45:
1957 w2[3] = w2[3] | 0x0200;
1958 break;
1959
1960 case 46:
1961 w2[3] = w2[3] | 0x020000;
1962 break;
1963
1964 case 47:
1965 w2[3] = w2[3] | 0x02000000;
1966 break;
1967
1968 case 48:
1969 w3[0] = 0x02;
1970 break;
1971
1972 case 49:
1973 w3[0] = w3[0] | 0x0200;
1974 break;
1975
1976 case 50:
1977 w3[0] = w3[0] | 0x020000;
1978 break;
1979
1980 case 51:
1981 w3[0] = w3[0] | 0x02000000;
1982 break;
1983
1984 case 52:
1985 w3[1] = 0x02;
1986 break;
1987
1988 case 53:
1989 w3[1] = w3[1] | 0x0200;
1990 break;
1991
1992 case 54:
1993 w3[1] = w3[1] | 0x020000;
1994 break;
1995
1996 case 55:
1997 w3[1] = w3[1] | 0x02000000;
1998 break;
1999
2000 case 56:
2001 w3[2] = 0x02;
2002 break;
2003
2004 case 57:
2005 w3[2] = w3[2] | 0x0200;
2006 break;
2007
2008 case 58:
2009 w3[2] = w3[2] | 0x020000;
2010 break;
2011
2012 case 59:
2013 w3[2] = w3[2] | 0x02000000;
2014 break;
2015
2016 case 60:
2017 w3[3] = 0x02;
2018 break;
2019
2020 case 61:
2021 w3[3] = w3[3] | 0x0200;
2022 break;
2023
2024 case 62:
2025 w3[3] = w3[3] | 0x020000;
2026 break;
2027
2028 case 63:
2029 w3[3] = w3[3] | 0x02000000;
2030 break;
2031 }
2032 }
2033
2034 inline void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
2035 {
2036 switch (offset)
2037 {
2038 case 0:
2039 w0[0] = 0x02;
2040 break;
2041
2042 case 1:
2043 w0[0] = w0[0] | 0x0200;
2044 break;
2045
2046 case 2:
2047 w0[0] = w0[0] | 0x020000;
2048 break;
2049
2050 case 3:
2051 w0[0] = w0[0] | 0x02000000;
2052 break;
2053
2054 case 4:
2055 w0[1] = 0x02;
2056 break;
2057
2058 case 5:
2059 w0[1] = w0[1] | 0x0200;
2060 break;
2061
2062 case 6:
2063 w0[1] = w0[1] | 0x020000;
2064 break;
2065
2066 case 7:
2067 w0[1] = w0[1] | 0x02000000;
2068 break;
2069
2070 case 8:
2071 w0[2] = 0x02;
2072 break;
2073
2074 case 9:
2075 w0[2] = w0[2] | 0x0200;
2076 break;
2077
2078 case 10:
2079 w0[2] = w0[2] | 0x020000;
2080 break;
2081
2082 case 11:
2083 w0[2] = w0[2] | 0x02000000;
2084 break;
2085
2086 case 12:
2087 w0[3] = 0x02;
2088 break;
2089
2090 case 13:
2091 w0[3] = w0[3] | 0x0200;
2092 break;
2093
2094 case 14:
2095 w0[3] = w0[3] | 0x020000;
2096 break;
2097
2098 case 15:
2099 w0[3] = w0[3] | 0x02000000;
2100 break;
2101
2102 case 16:
2103 w1[0] = 0x02;
2104 break;
2105
2106 case 17:
2107 w1[0] = w1[0] | 0x0200;
2108 break;
2109
2110 case 18:
2111 w1[0] = w1[0] | 0x020000;
2112 break;
2113
2114 case 19:
2115 w1[0] = w1[0] | 0x02000000;
2116 break;
2117
2118 case 20:
2119 w1[1] = 0x02;
2120 break;
2121
2122 case 21:
2123 w1[1] = w1[1] | 0x0200;
2124 break;
2125
2126 case 22:
2127 w1[1] = w1[1] | 0x020000;
2128 break;
2129
2130 case 23:
2131 w1[1] = w1[1] | 0x02000000;
2132 break;
2133
2134 case 24:
2135 w1[2] = 0x02;
2136 break;
2137
2138 case 25:
2139 w1[2] = w1[2] | 0x0200;
2140 break;
2141
2142 case 26:
2143 w1[2] = w1[2] | 0x020000;
2144 break;
2145
2146 case 27:
2147 w1[2] = w1[2] | 0x02000000;
2148 break;
2149
2150 case 28:
2151 w1[3] = 0x02;
2152 break;
2153
2154 case 29:
2155 w1[3] = w1[3] | 0x0200;
2156 break;
2157
2158 case 30:
2159 w1[3] = w1[3] | 0x020000;
2160 break;
2161
2162 case 31:
2163 w1[3] = w1[3] | 0x02000000;
2164 break;
2165
2166 case 32:
2167 w2[0] = 0x02;
2168 break;
2169
2170 case 33:
2171 w2[0] = w2[0] | 0x0200;
2172 break;
2173
2174 case 34:
2175 w2[0] = w2[0] | 0x020000;
2176 break;
2177
2178 case 35:
2179 w2[0] = w2[0] | 0x02000000;
2180 break;
2181
2182 case 36:
2183 w2[1] = 0x02;
2184 break;
2185
2186 case 37:
2187 w2[1] = w2[1] | 0x0200;
2188 break;
2189
2190 case 38:
2191 w2[1] = w2[1] | 0x020000;
2192 break;
2193
2194 case 39:
2195 w2[1] = w2[1] | 0x02000000;
2196 break;
2197
2198 case 40:
2199 w2[2] = 0x02;
2200 break;
2201
2202 case 41:
2203 w2[2] = w2[2] | 0x0200;
2204 break;
2205
2206 case 42:
2207 w2[2] = w2[2] | 0x020000;
2208 break;
2209
2210 case 43:
2211 w2[2] = w2[2] | 0x02000000;
2212 break;
2213
2214 case 44:
2215 w2[3] = 0x02;
2216 break;
2217
2218 case 45:
2219 w2[3] = w2[3] | 0x0200;
2220 break;
2221
2222 case 46:
2223 w2[3] = w2[3] | 0x020000;
2224 break;
2225
2226 case 47:
2227 w2[3] = w2[3] | 0x02000000;
2228 break;
2229
2230 case 48:
2231 w3[0] = 0x02;
2232 break;
2233
2234 case 49:
2235 w3[0] = w3[0] | 0x0200;
2236 break;
2237
2238 case 50:
2239 w3[0] = w3[0] | 0x020000;
2240 break;
2241
2242 case 51:
2243 w3[0] = w3[0] | 0x02000000;
2244 break;
2245
2246 case 52:
2247 w3[1] = 0x02;
2248 break;
2249
2250 case 53:
2251 w3[1] = w3[1] | 0x0200;
2252 break;
2253
2254 case 54:
2255 w3[1] = w3[1] | 0x020000;
2256 break;
2257
2258 case 55:
2259 w3[1] = w3[1] | 0x02000000;
2260 break;
2261
2262 case 56:
2263 w3[2] = 0x02;
2264 break;
2265
2266 case 57:
2267 w3[2] = w3[2] | 0x0200;
2268 break;
2269
2270 case 58:
2271 w3[2] = w3[2] | 0x020000;
2272 break;
2273
2274 case 59:
2275 w3[2] = w3[2] | 0x02000000;
2276 break;
2277
2278 case 60:
2279 w3[3] = 0x02;
2280 break;
2281
2282 case 61:
2283 w3[3] = w3[3] | 0x0200;
2284 break;
2285
2286 case 62:
2287 w3[3] = w3[3] | 0x020000;
2288 break;
2289
2290 case 63:
2291 w3[3] = w3[3] | 0x02000000;
2292 break;
2293
2294 case 64:
2295 w4[0] = 0x02;
2296 break;
2297
2298 case 65:
2299 w4[0] = w4[0] | 0x0200;
2300 break;
2301
2302 case 66:
2303 w4[0] = w4[0] | 0x020000;
2304 break;
2305
2306 case 67:
2307 w4[0] = w4[0] | 0x02000000;
2308 break;
2309
2310 case 68:
2311 w4[1] = 0x02;
2312 break;
2313
2314 case 69:
2315 w4[1] = w4[1] | 0x0200;
2316 break;
2317
2318 case 70:
2319 w4[1] = w4[1] | 0x020000;
2320 break;
2321
2322 case 71:
2323 w4[1] = w4[1] | 0x02000000;
2324 break;
2325
2326 case 72:
2327 w4[2] = 0x02;
2328 break;
2329
2330 case 73:
2331 w4[2] = w4[2] | 0x0200;
2332 break;
2333
2334 case 74:
2335 w4[2] = w4[2] | 0x020000;
2336 break;
2337
2338 case 75:
2339 w4[2] = w4[2] | 0x02000000;
2340 break;
2341
2342 case 76:
2343 w4[3] = 0x02;
2344 break;
2345
2346 case 77:
2347 w4[3] = w4[3] | 0x0200;
2348 break;
2349
2350 case 78:
2351 w4[3] = w4[3] | 0x020000;
2352 break;
2353
2354 case 79:
2355 w4[3] = w4[3] | 0x02000000;
2356 break;
2357
2358 case 80:
2359 w5[0] = 0x02;
2360 break;
2361
2362 case 81:
2363 w5[0] = w5[0] | 0x0200;
2364 break;
2365
2366 case 82:
2367 w5[0] = w5[0] | 0x020000;
2368 break;
2369
2370 case 83:
2371 w5[0] = w5[0] | 0x02000000;
2372 break;
2373
2374 case 84:
2375 w5[1] = 0x02;
2376 break;
2377
2378 case 85:
2379 w5[1] = w5[1] | 0x0200;
2380 break;
2381
2382 case 86:
2383 w5[1] = w5[1] | 0x020000;
2384 break;
2385
2386 case 87:
2387 w5[1] = w5[1] | 0x02000000;
2388 break;
2389
2390 case 88:
2391 w5[2] = 0x02;
2392 break;
2393
2394 case 89:
2395 w5[2] = w5[2] | 0x0200;
2396 break;
2397
2398 case 90:
2399 w5[2] = w5[2] | 0x020000;
2400 break;
2401
2402 case 91:
2403 w5[2] = w5[2] | 0x02000000;
2404 break;
2405
2406 case 92:
2407 w5[3] = 0x02;
2408 break;
2409
2410 case 93:
2411 w5[3] = w5[3] | 0x0200;
2412 break;
2413
2414 case 94:
2415 w5[3] = w5[3] | 0x020000;
2416 break;
2417
2418 case 95:
2419 w5[3] = w5[3] | 0x02000000;
2420 break;
2421
2422 case 96:
2423 w6[0] = 0x02;
2424 break;
2425
2426 case 97:
2427 w6[0] = w6[0] | 0x0200;
2428 break;
2429
2430 case 98:
2431 w6[0] = w6[0] | 0x020000;
2432 break;
2433
2434 case 99:
2435 w6[0] = w6[0] | 0x02000000;
2436 break;
2437
2438 case 100:
2439 w6[1] = 0x02;
2440 break;
2441
2442 case 101:
2443 w6[1] = w6[1] | 0x0200;
2444 break;
2445
2446 case 102:
2447 w6[1] = w6[1] | 0x020000;
2448 break;
2449
2450 case 103:
2451 w6[1] = w6[1] | 0x02000000;
2452 break;
2453
2454 case 104:
2455 w6[2] = 0x02;
2456 break;
2457
2458 case 105:
2459 w6[2] = w6[2] | 0x0200;
2460 break;
2461
2462 case 106:
2463 w6[2] = w6[2] | 0x020000;
2464 break;
2465
2466 case 107:
2467 w6[2] = w6[2] | 0x02000000;
2468 break;
2469
2470 case 108:
2471 w6[3] = 0x02;
2472 break;
2473
2474 case 109:
2475 w6[3] = w6[3] | 0x0200;
2476 break;
2477
2478 case 110:
2479 w6[3] = w6[3] | 0x020000;
2480 break;
2481
2482 case 111:
2483 w6[3] = w6[3] | 0x02000000;
2484 break;
2485
2486 case 112:
2487 w7[0] = 0x02;
2488 break;
2489
2490 case 113:
2491 w7[0] = w7[0] | 0x0200;
2492 break;
2493
2494 case 114:
2495 w7[0] = w7[0] | 0x020000;
2496 break;
2497
2498 case 115:
2499 w7[0] = w7[0] | 0x02000000;
2500 break;
2501
2502 case 116:
2503 w7[1] = 0x02;
2504 break;
2505
2506 case 117:
2507 w7[1] = w7[1] | 0x0200;
2508 break;
2509
2510 case 118:
2511 w7[1] = w7[1] | 0x020000;
2512 break;
2513
2514 case 119:
2515 w7[1] = w7[1] | 0x02000000;
2516 break;
2517
2518 case 120:
2519 w7[2] = 0x02;
2520 break;
2521
2522 case 121:
2523 w7[2] = w7[2] | 0x0200;
2524 break;
2525
2526 case 122:
2527 w7[2] = w7[2] | 0x020000;
2528 break;
2529
2530 case 123:
2531 w7[2] = w7[2] | 0x02000000;
2532 break;
2533
2534 case 124:
2535 w7[3] = 0x02;
2536 break;
2537
2538 case 125:
2539 w7[3] = w7[3] | 0x0200;
2540 break;
2541
2542 case 126:
2543 w7[3] = w7[3] | 0x020000;
2544 break;
2545
2546 case 127:
2547 w7[3] = w7[3] | 0x02000000;
2548 break;
2549 }
2550 }
2551
2552 inline void append_0x80_1x4 (u32x w0[4], const u32 offset)
2553 {
2554 switch (offset)
2555 {
2556 case 0:
2557 w0[0] = 0x80;
2558 break;
2559
2560 case 1:
2561 w0[0] = w0[0] | 0x8000;
2562 break;
2563
2564 case 2:
2565 w0[0] = w0[0] | 0x800000;
2566 break;
2567
2568 case 3:
2569 w0[0] = w0[0] | 0x80000000;
2570 break;
2571
2572 case 4:
2573 w0[1] = 0x80;
2574 break;
2575
2576 case 5:
2577 w0[1] = w0[1] | 0x8000;
2578 break;
2579
2580 case 6:
2581 w0[1] = w0[1] | 0x800000;
2582 break;
2583
2584 case 7:
2585 w0[1] = w0[1] | 0x80000000;
2586 break;
2587
2588 case 8:
2589 w0[2] = 0x80;
2590 break;
2591
2592 case 9:
2593 w0[2] = w0[2] | 0x8000;
2594 break;
2595
2596 case 10:
2597 w0[2] = w0[2] | 0x800000;
2598 break;
2599
2600 case 11:
2601 w0[2] = w0[2] | 0x80000000;
2602 break;
2603
2604 case 12:
2605 w0[3] = 0x80;
2606 break;
2607
2608 case 13:
2609 w0[3] = w0[3] | 0x8000;
2610 break;
2611
2612 case 14:
2613 w0[3] = w0[3] | 0x800000;
2614 break;
2615
2616 case 15:
2617 w0[3] = w0[3] | 0x80000000;
2618 break;
2619 }
2620 }
2621
2622 inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
2623 {
2624 switch (offset)
2625 {
2626 case 0:
2627 w0[0] = 0x80;
2628 break;
2629
2630 case 1:
2631 w0[0] = w0[0] | 0x8000;
2632 break;
2633
2634 case 2:
2635 w0[0] = w0[0] | 0x800000;
2636 break;
2637
2638 case 3:
2639 w0[0] = w0[0] | 0x80000000;
2640 break;
2641
2642 case 4:
2643 w0[1] = 0x80;
2644 break;
2645
2646 case 5:
2647 w0[1] = w0[1] | 0x8000;
2648 break;
2649
2650 case 6:
2651 w0[1] = w0[1] | 0x800000;
2652 break;
2653
2654 case 7:
2655 w0[1] = w0[1] | 0x80000000;
2656 break;
2657
2658 case 8:
2659 w0[2] = 0x80;
2660 break;
2661
2662 case 9:
2663 w0[2] = w0[2] | 0x8000;
2664 break;
2665
2666 case 10:
2667 w0[2] = w0[2] | 0x800000;
2668 break;
2669
2670 case 11:
2671 w0[2] = w0[2] | 0x80000000;
2672 break;
2673
2674 case 12:
2675 w0[3] = 0x80;
2676 break;
2677
2678 case 13:
2679 w0[3] = w0[3] | 0x8000;
2680 break;
2681
2682 case 14:
2683 w0[3] = w0[3] | 0x800000;
2684 break;
2685
2686 case 15:
2687 w0[3] = w0[3] | 0x80000000;
2688 break;
2689
2690 case 16:
2691 w1[0] = 0x80;
2692 break;
2693
2694 case 17:
2695 w1[0] = w1[0] | 0x8000;
2696 break;
2697
2698 case 18:
2699 w1[0] = w1[0] | 0x800000;
2700 break;
2701
2702 case 19:
2703 w1[0] = w1[0] | 0x80000000;
2704 break;
2705
2706 case 20:
2707 w1[1] = 0x80;
2708 break;
2709
2710 case 21:
2711 w1[1] = w1[1] | 0x8000;
2712 break;
2713
2714 case 22:
2715 w1[1] = w1[1] | 0x800000;
2716 break;
2717
2718 case 23:
2719 w1[1] = w1[1] | 0x80000000;
2720 break;
2721
2722 case 24:
2723 w1[2] = 0x80;
2724 break;
2725
2726 case 25:
2727 w1[2] = w1[2] | 0x8000;
2728 break;
2729
2730 case 26:
2731 w1[2] = w1[2] | 0x800000;
2732 break;
2733
2734 case 27:
2735 w1[2] = w1[2] | 0x80000000;
2736 break;
2737
2738 case 28:
2739 w1[3] = 0x80;
2740 break;
2741
2742 case 29:
2743 w1[3] = w1[3] | 0x8000;
2744 break;
2745
2746 case 30:
2747 w1[3] = w1[3] | 0x800000;
2748 break;
2749
2750 case 31:
2751 w1[3] = w1[3] | 0x80000000;
2752 break;
2753 }
2754 }
2755
2756 inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
2757 {
2758 switch (offset)
2759 {
2760 case 0:
2761 w0[0] = 0x80;
2762 break;
2763
2764 case 1:
2765 w0[0] = w0[0] | 0x8000;
2766 break;
2767
2768 case 2:
2769 w0[0] = w0[0] | 0x800000;
2770 break;
2771
2772 case 3:
2773 w0[0] = w0[0] | 0x80000000;
2774 break;
2775
2776 case 4:
2777 w0[1] = 0x80;
2778 break;
2779
2780 case 5:
2781 w0[1] = w0[1] | 0x8000;
2782 break;
2783
2784 case 6:
2785 w0[1] = w0[1] | 0x800000;
2786 break;
2787
2788 case 7:
2789 w0[1] = w0[1] | 0x80000000;
2790 break;
2791
2792 case 8:
2793 w0[2] = 0x80;
2794 break;
2795
2796 case 9:
2797 w0[2] = w0[2] | 0x8000;
2798 break;
2799
2800 case 10:
2801 w0[2] = w0[2] | 0x800000;
2802 break;
2803
2804 case 11:
2805 w0[2] = w0[2] | 0x80000000;
2806 break;
2807
2808 case 12:
2809 w0[3] = 0x80;
2810 break;
2811
2812 case 13:
2813 w0[3] = w0[3] | 0x8000;
2814 break;
2815
2816 case 14:
2817 w0[3] = w0[3] | 0x800000;
2818 break;
2819
2820 case 15:
2821 w0[3] = w0[3] | 0x80000000;
2822 break;
2823
2824 case 16:
2825 w1[0] = 0x80;
2826 break;
2827
2828 case 17:
2829 w1[0] = w1[0] | 0x8000;
2830 break;
2831
2832 case 18:
2833 w1[0] = w1[0] | 0x800000;
2834 break;
2835
2836 case 19:
2837 w1[0] = w1[0] | 0x80000000;
2838 break;
2839
2840 case 20:
2841 w1[1] = 0x80;
2842 break;
2843
2844 case 21:
2845 w1[1] = w1[1] | 0x8000;
2846 break;
2847
2848 case 22:
2849 w1[1] = w1[1] | 0x800000;
2850 break;
2851
2852 case 23:
2853 w1[1] = w1[1] | 0x80000000;
2854 break;
2855
2856 case 24:
2857 w1[2] = 0x80;
2858 break;
2859
2860 case 25:
2861 w1[2] = w1[2] | 0x8000;
2862 break;
2863
2864 case 26:
2865 w1[2] = w1[2] | 0x800000;
2866 break;
2867
2868 case 27:
2869 w1[2] = w1[2] | 0x80000000;
2870 break;
2871
2872 case 28:
2873 w1[3] = 0x80;
2874 break;
2875
2876 case 29:
2877 w1[3] = w1[3] | 0x8000;
2878 break;
2879
2880 case 30:
2881 w1[3] = w1[3] | 0x800000;
2882 break;
2883
2884 case 31:
2885 w1[3] = w1[3] | 0x80000000;
2886 break;
2887
2888 case 32:
2889 w2[0] = 0x80;
2890 break;
2891
2892 case 33:
2893 w2[0] = w2[0] | 0x8000;
2894 break;
2895
2896 case 34:
2897 w2[0] = w2[0] | 0x800000;
2898 break;
2899
2900 case 35:
2901 w2[0] = w2[0] | 0x80000000;
2902 break;
2903
2904 case 36:
2905 w2[1] = 0x80;
2906 break;
2907
2908 case 37:
2909 w2[1] = w2[1] | 0x8000;
2910 break;
2911
2912 case 38:
2913 w2[1] = w2[1] | 0x800000;
2914 break;
2915
2916 case 39:
2917 w2[1] = w2[1] | 0x80000000;
2918 break;
2919
2920 case 40:
2921 w2[2] = 0x80;
2922 break;
2923
2924 case 41:
2925 w2[2] = w2[2] | 0x8000;
2926 break;
2927
2928 case 42:
2929 w2[2] = w2[2] | 0x800000;
2930 break;
2931
2932 case 43:
2933 w2[2] = w2[2] | 0x80000000;
2934 break;
2935
2936 case 44:
2937 w2[3] = 0x80;
2938 break;
2939
2940 case 45:
2941 w2[3] = w2[3] | 0x8000;
2942 break;
2943
2944 case 46:
2945 w2[3] = w2[3] | 0x800000;
2946 break;
2947
2948 case 47:
2949 w2[3] = w2[3] | 0x80000000;
2950 break;
2951 }
2952 }
2953
2954 inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
2955 {
2956 switch (offset)
2957 {
2958 case 0:
2959 w0[0] = 0x80;
2960 break;
2961
2962 case 1:
2963 w0[0] = w0[0] | 0x8000;
2964 break;
2965
2966 case 2:
2967 w0[0] = w0[0] | 0x800000;
2968 break;
2969
2970 case 3:
2971 w0[0] = w0[0] | 0x80000000;
2972 break;
2973
2974 case 4:
2975 w0[1] = 0x80;
2976 break;
2977
2978 case 5:
2979 w0[1] = w0[1] | 0x8000;
2980 break;
2981
2982 case 6:
2983 w0[1] = w0[1] | 0x800000;
2984 break;
2985
2986 case 7:
2987 w0[1] = w0[1] | 0x80000000;
2988 break;
2989
2990 case 8:
2991 w0[2] = 0x80;
2992 break;
2993
2994 case 9:
2995 w0[2] = w0[2] | 0x8000;
2996 break;
2997
2998 case 10:
2999 w0[2] = w0[2] | 0x800000;
3000 break;
3001
3002 case 11:
3003 w0[2] = w0[2] | 0x80000000;
3004 break;
3005
3006 case 12:
3007 w0[3] = 0x80;
3008 break;
3009
3010 case 13:
3011 w0[3] = w0[3] | 0x8000;
3012 break;
3013
3014 case 14:
3015 w0[3] = w0[3] | 0x800000;
3016 break;
3017
3018 case 15:
3019 w0[3] = w0[3] | 0x80000000;
3020 break;
3021
3022 case 16:
3023 w1[0] = 0x80;
3024 break;
3025
3026 case 17:
3027 w1[0] = w1[0] | 0x8000;
3028 break;
3029
3030 case 18:
3031 w1[0] = w1[0] | 0x800000;
3032 break;
3033
3034 case 19:
3035 w1[0] = w1[0] | 0x80000000;
3036 break;
3037
3038 case 20:
3039 w1[1] = 0x80;
3040 break;
3041
3042 case 21:
3043 w1[1] = w1[1] | 0x8000;
3044 break;
3045
3046 case 22:
3047 w1[1] = w1[1] | 0x800000;
3048 break;
3049
3050 case 23:
3051 w1[1] = w1[1] | 0x80000000;
3052 break;
3053
3054 case 24:
3055 w1[2] = 0x80;
3056 break;
3057
3058 case 25:
3059 w1[2] = w1[2] | 0x8000;
3060 break;
3061
3062 case 26:
3063 w1[2] = w1[2] | 0x800000;
3064 break;
3065
3066 case 27:
3067 w1[2] = w1[2] | 0x80000000;
3068 break;
3069
3070 case 28:
3071 w1[3] = 0x80;
3072 break;
3073
3074 case 29:
3075 w1[3] = w1[3] | 0x8000;
3076 break;
3077
3078 case 30:
3079 w1[3] = w1[3] | 0x800000;
3080 break;
3081
3082 case 31:
3083 w1[3] = w1[3] | 0x80000000;
3084 break;
3085
3086 case 32:
3087 w2[0] = 0x80;
3088 break;
3089
3090 case 33:
3091 w2[0] = w2[0] | 0x8000;
3092 break;
3093
3094 case 34:
3095 w2[0] = w2[0] | 0x800000;
3096 break;
3097
3098 case 35:
3099 w2[0] = w2[0] | 0x80000000;
3100 break;
3101
3102 case 36:
3103 w2[1] = 0x80;
3104 break;
3105
3106 case 37:
3107 w2[1] = w2[1] | 0x8000;
3108 break;
3109
3110 case 38:
3111 w2[1] = w2[1] | 0x800000;
3112 break;
3113
3114 case 39:
3115 w2[1] = w2[1] | 0x80000000;
3116 break;
3117
3118 case 40:
3119 w2[2] = 0x80;
3120 break;
3121
3122 case 41:
3123 w2[2] = w2[2] | 0x8000;
3124 break;
3125
3126 case 42:
3127 w2[2] = w2[2] | 0x800000;
3128 break;
3129
3130 case 43:
3131 w2[2] = w2[2] | 0x80000000;
3132 break;
3133
3134 case 44:
3135 w2[3] = 0x80;
3136 break;
3137
3138 case 45:
3139 w2[3] = w2[3] | 0x8000;
3140 break;
3141
3142 case 46:
3143 w2[3] = w2[3] | 0x800000;
3144 break;
3145
3146 case 47:
3147 w2[3] = w2[3] | 0x80000000;
3148 break;
3149
3150 case 48:
3151 w3[0] = 0x80;
3152 break;
3153
3154 case 49:
3155 w3[0] = w3[0] | 0x8000;
3156 break;
3157
3158 case 50:
3159 w3[0] = w3[0] | 0x800000;
3160 break;
3161
3162 case 51:
3163 w3[0] = w3[0] | 0x80000000;
3164 break;
3165
3166 case 52:
3167 w3[1] = 0x80;
3168 break;
3169
3170 case 53:
3171 w3[1] = w3[1] | 0x8000;
3172 break;
3173
3174 case 54:
3175 w3[1] = w3[1] | 0x800000;
3176 break;
3177
3178 case 55:
3179 w3[1] = w3[1] | 0x80000000;
3180 break;
3181
3182 case 56:
3183 w3[2] = 0x80;
3184 break;
3185
3186 case 57:
3187 w3[2] = w3[2] | 0x8000;
3188 break;
3189
3190 case 58:
3191 w3[2] = w3[2] | 0x800000;
3192 break;
3193
3194 case 59:
3195 w3[2] = w3[2] | 0x80000000;
3196 break;
3197
3198 case 60:
3199 w3[3] = 0x80;
3200 break;
3201
3202 case 61:
3203 w3[3] = w3[3] | 0x8000;
3204 break;
3205
3206 case 62:
3207 w3[3] = w3[3] | 0x800000;
3208 break;
3209
3210 case 63:
3211 w3[3] = w3[3] | 0x80000000;
3212 break;
3213 }
3214 }
3215
3216 inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
3217 {
3218 switch (offset)
3219 {
3220 case 0:
3221 w0[0] = 0x80;
3222 break;
3223
3224 case 1:
3225 w0[0] = w0[0] | 0x8000;
3226 break;
3227
3228 case 2:
3229 w0[0] = w0[0] | 0x800000;
3230 break;
3231
3232 case 3:
3233 w0[0] = w0[0] | 0x80000000;
3234 break;
3235
3236 case 4:
3237 w0[1] = 0x80;
3238 break;
3239
3240 case 5:
3241 w0[1] = w0[1] | 0x8000;
3242 break;
3243
3244 case 6:
3245 w0[1] = w0[1] | 0x800000;
3246 break;
3247
3248 case 7:
3249 w0[1] = w0[1] | 0x80000000;
3250 break;
3251
3252 case 8:
3253 w0[2] = 0x80;
3254 break;
3255
3256 case 9:
3257 w0[2] = w0[2] | 0x8000;
3258 break;
3259
3260 case 10:
3261 w0[2] = w0[2] | 0x800000;
3262 break;
3263
3264 case 11:
3265 w0[2] = w0[2] | 0x80000000;
3266 break;
3267
3268 case 12:
3269 w0[3] = 0x80;
3270 break;
3271
3272 case 13:
3273 w0[3] = w0[3] | 0x8000;
3274 break;
3275
3276 case 14:
3277 w0[3] = w0[3] | 0x800000;
3278 break;
3279
3280 case 15:
3281 w0[3] = w0[3] | 0x80000000;
3282 break;
3283
3284 case 16:
3285 w1[0] = 0x80;
3286 break;
3287
3288 case 17:
3289 w1[0] = w1[0] | 0x8000;
3290 break;
3291
3292 case 18:
3293 w1[0] = w1[0] | 0x800000;
3294 break;
3295
3296 case 19:
3297 w1[0] = w1[0] | 0x80000000;
3298 break;
3299
3300 case 20:
3301 w1[1] = 0x80;
3302 break;
3303
3304 case 21:
3305 w1[1] = w1[1] | 0x8000;
3306 break;
3307
3308 case 22:
3309 w1[1] = w1[1] | 0x800000;
3310 break;
3311
3312 case 23:
3313 w1[1] = w1[1] | 0x80000000;
3314 break;
3315
3316 case 24:
3317 w1[2] = 0x80;
3318 break;
3319
3320 case 25:
3321 w1[2] = w1[2] | 0x8000;
3322 break;
3323
3324 case 26:
3325 w1[2] = w1[2] | 0x800000;
3326 break;
3327
3328 case 27:
3329 w1[2] = w1[2] | 0x80000000;
3330 break;
3331
3332 case 28:
3333 w1[3] = 0x80;
3334 break;
3335
3336 case 29:
3337 w1[3] = w1[3] | 0x8000;
3338 break;
3339
3340 case 30:
3341 w1[3] = w1[3] | 0x800000;
3342 break;
3343
3344 case 31:
3345 w1[3] = w1[3] | 0x80000000;
3346 break;
3347
3348 case 32:
3349 w2[0] = 0x80;
3350 break;
3351
3352 case 33:
3353 w2[0] = w2[0] | 0x8000;
3354 break;
3355
3356 case 34:
3357 w2[0] = w2[0] | 0x800000;
3358 break;
3359
3360 case 35:
3361 w2[0] = w2[0] | 0x80000000;
3362 break;
3363
3364 case 36:
3365 w2[1] = 0x80;
3366 break;
3367
3368 case 37:
3369 w2[1] = w2[1] | 0x8000;
3370 break;
3371
3372 case 38:
3373 w2[1] = w2[1] | 0x800000;
3374 break;
3375
3376 case 39:
3377 w2[1] = w2[1] | 0x80000000;
3378 break;
3379
3380 case 40:
3381 w2[2] = 0x80;
3382 break;
3383
3384 case 41:
3385 w2[2] = w2[2] | 0x8000;
3386 break;
3387
3388 case 42:
3389 w2[2] = w2[2] | 0x800000;
3390 break;
3391
3392 case 43:
3393 w2[2] = w2[2] | 0x80000000;
3394 break;
3395
3396 case 44:
3397 w2[3] = 0x80;
3398 break;
3399
3400 case 45:
3401 w2[3] = w2[3] | 0x8000;
3402 break;
3403
3404 case 46:
3405 w2[3] = w2[3] | 0x800000;
3406 break;
3407
3408 case 47:
3409 w2[3] = w2[3] | 0x80000000;
3410 break;
3411
3412 case 48:
3413 w3[0] = 0x80;
3414 break;
3415
3416 case 49:
3417 w3[0] = w3[0] | 0x8000;
3418 break;
3419
3420 case 50:
3421 w3[0] = w3[0] | 0x800000;
3422 break;
3423
3424 case 51:
3425 w3[0] = w3[0] | 0x80000000;
3426 break;
3427
3428 case 52:
3429 w3[1] = 0x80;
3430 break;
3431
3432 case 53:
3433 w3[1] = w3[1] | 0x8000;
3434 break;
3435
3436 case 54:
3437 w3[1] = w3[1] | 0x800000;
3438 break;
3439
3440 case 55:
3441 w3[1] = w3[1] | 0x80000000;
3442 break;
3443
3444 case 56:
3445 w3[2] = 0x80;
3446 break;
3447
3448 case 57:
3449 w3[2] = w3[2] | 0x8000;
3450 break;
3451
3452 case 58:
3453 w3[2] = w3[2] | 0x800000;
3454 break;
3455
3456 case 59:
3457 w3[2] = w3[2] | 0x80000000;
3458 break;
3459
3460 case 60:
3461 w3[3] = 0x80;
3462 break;
3463
3464 case 61:
3465 w3[3] = w3[3] | 0x8000;
3466 break;
3467
3468 case 62:
3469 w3[3] = w3[3] | 0x800000;
3470 break;
3471
3472 case 63:
3473 w3[3] = w3[3] | 0x80000000;
3474 break;
3475
3476 case 64:
3477 w4[0] = 0x80;
3478 break;
3479
3480 case 65:
3481 w4[0] = w4[0] | 0x8000;
3482 break;
3483
3484 case 66:
3485 w4[0] = w4[0] | 0x800000;
3486 break;
3487
3488 case 67:
3489 w4[0] = w4[0] | 0x80000000;
3490 break;
3491
3492 case 68:
3493 w4[1] = 0x80;
3494 break;
3495
3496 case 69:
3497 w4[1] = w4[1] | 0x8000;
3498 break;
3499
3500 case 70:
3501 w4[1] = w4[1] | 0x800000;
3502 break;
3503
3504 case 71:
3505 w4[1] = w4[1] | 0x80000000;
3506 break;
3507
3508 case 72:
3509 w4[2] = 0x80;
3510 break;
3511
3512 case 73:
3513 w4[2] = w4[2] | 0x8000;
3514 break;
3515
3516 case 74:
3517 w4[2] = w4[2] | 0x800000;
3518 break;
3519
3520 case 75:
3521 w4[2] = w4[2] | 0x80000000;
3522 break;
3523
3524 case 76:
3525 w4[3] = 0x80;
3526 break;
3527
3528 case 77:
3529 w4[3] = w4[3] | 0x8000;
3530 break;
3531
3532 case 78:
3533 w4[3] = w4[3] | 0x800000;
3534 break;
3535
3536 case 79:
3537 w4[3] = w4[3] | 0x80000000;
3538 break;
3539
3540 case 80:
3541 w5[0] = 0x80;
3542 break;
3543
3544 case 81:
3545 w5[0] = w5[0] | 0x8000;
3546 break;
3547
3548 case 82:
3549 w5[0] = w5[0] | 0x800000;
3550 break;
3551
3552 case 83:
3553 w5[0] = w5[0] | 0x80000000;
3554 break;
3555
3556 case 84:
3557 w5[1] = 0x80;
3558 break;
3559
3560 case 85:
3561 w5[1] = w5[1] | 0x8000;
3562 break;
3563
3564 case 86:
3565 w5[1] = w5[1] | 0x800000;
3566 break;
3567
3568 case 87:
3569 w5[1] = w5[1] | 0x80000000;
3570 break;
3571
3572 case 88:
3573 w5[2] = 0x80;
3574 break;
3575
3576 case 89:
3577 w5[2] = w5[2] | 0x8000;
3578 break;
3579
3580 case 90:
3581 w5[2] = w5[2] | 0x800000;
3582 break;
3583
3584 case 91:
3585 w5[2] = w5[2] | 0x80000000;
3586 break;
3587
3588 case 92:
3589 w5[3] = 0x80;
3590 break;
3591
3592 case 93:
3593 w5[3] = w5[3] | 0x8000;
3594 break;
3595
3596 case 94:
3597 w5[3] = w5[3] | 0x800000;
3598 break;
3599
3600 case 95:
3601 w5[3] = w5[3] | 0x80000000;
3602 break;
3603
3604 case 96:
3605 w6[0] = 0x80;
3606 break;
3607
3608 case 97:
3609 w6[0] = w6[0] | 0x8000;
3610 break;
3611
3612 case 98:
3613 w6[0] = w6[0] | 0x800000;
3614 break;
3615
3616 case 99:
3617 w6[0] = w6[0] | 0x80000000;
3618 break;
3619
3620 case 100:
3621 w6[1] = 0x80;
3622 break;
3623
3624 case 101:
3625 w6[1] = w6[1] | 0x8000;
3626 break;
3627
3628 case 102:
3629 w6[1] = w6[1] | 0x800000;
3630 break;
3631
3632 case 103:
3633 w6[1] = w6[1] | 0x80000000;
3634 break;
3635
3636 case 104:
3637 w6[2] = 0x80;
3638 break;
3639
3640 case 105:
3641 w6[2] = w6[2] | 0x8000;
3642 break;
3643
3644 case 106:
3645 w6[2] = w6[2] | 0x800000;
3646 break;
3647
3648 case 107:
3649 w6[2] = w6[2] | 0x80000000;
3650 break;
3651
3652 case 108:
3653 w6[3] = 0x80;
3654 break;
3655
3656 case 109:
3657 w6[3] = w6[3] | 0x8000;
3658 break;
3659
3660 case 110:
3661 w6[3] = w6[3] | 0x800000;
3662 break;
3663
3664 case 111:
3665 w6[3] = w6[3] | 0x80000000;
3666 break;
3667
3668 case 112:
3669 w7[0] = 0x80;
3670 break;
3671
3672 case 113:
3673 w7[0] = w7[0] | 0x8000;
3674 break;
3675
3676 case 114:
3677 w7[0] = w7[0] | 0x800000;
3678 break;
3679
3680 case 115:
3681 w7[0] = w7[0] | 0x80000000;
3682 break;
3683
3684 case 116:
3685 w7[1] = 0x80;
3686 break;
3687
3688 case 117:
3689 w7[1] = w7[1] | 0x8000;
3690 break;
3691
3692 case 118:
3693 w7[1] = w7[1] | 0x800000;
3694 break;
3695
3696 case 119:
3697 w7[1] = w7[1] | 0x80000000;
3698 break;
3699
3700 case 120:
3701 w7[2] = 0x80;
3702 break;
3703
3704 case 121:
3705 w7[2] = w7[2] | 0x8000;
3706 break;
3707
3708 case 122:
3709 w7[2] = w7[2] | 0x800000;
3710 break;
3711
3712 case 123:
3713 w7[2] = w7[2] | 0x80000000;
3714 break;
3715
3716 case 124:
3717 w7[3] = 0x80;
3718 break;
3719
3720 case 125:
3721 w7[3] = w7[3] | 0x8000;
3722 break;
3723
3724 case 126:
3725 w7[3] = w7[3] | 0x800000;
3726 break;
3727
3728 case 127:
3729 w7[3] = w7[3] | 0x80000000;
3730 break;
3731 }
3732 }
3733
3734 inline void append_0x80_1x16 (u32x w[16], const u32 offset)
3735 {
3736 switch (offset)
3737 {
3738 case 0:
3739 w[ 0] = 0x80;
3740 break;
3741
3742 case 1:
3743 w[ 0] = w[ 0] | 0x8000;
3744 break;
3745
3746 case 2:
3747 w[ 0] = w[ 0] | 0x800000;
3748 break;
3749
3750 case 3:
3751 w[ 0] = w[ 0] | 0x80000000;
3752 break;
3753
3754 case 4:
3755 w[ 1] = 0x80;
3756 break;
3757
3758 case 5:
3759 w[ 1] = w[ 1] | 0x8000;
3760 break;
3761
3762 case 6:
3763 w[ 1] = w[ 1] | 0x800000;
3764 break;
3765
3766 case 7:
3767 w[ 1] = w[ 1] | 0x80000000;
3768 break;
3769
3770 case 8:
3771 w[ 2] = 0x80;
3772 break;
3773
3774 case 9:
3775 w[ 2] = w[ 2] | 0x8000;
3776 break;
3777
3778 case 10:
3779 w[ 2] = w[ 2] | 0x800000;
3780 break;
3781
3782 case 11:
3783 w[ 2] = w[ 2] | 0x80000000;
3784 break;
3785
3786 case 12:
3787 w[ 3] = 0x80;
3788 break;
3789
3790 case 13:
3791 w[ 3] = w[ 3] | 0x8000;
3792 break;
3793
3794 case 14:
3795 w[ 3] = w[ 3] | 0x800000;
3796 break;
3797
3798 case 15:
3799 w[ 3] = w[ 3] | 0x80000000;
3800 break;
3801
3802 case 16:
3803 w[ 4] = 0x80;
3804 break;
3805
3806 case 17:
3807 w[ 4] = w[ 4] | 0x8000;
3808 break;
3809
3810 case 18:
3811 w[ 4] = w[ 4] | 0x800000;
3812 break;
3813
3814 case 19:
3815 w[ 4] = w[ 4] | 0x80000000;
3816 break;
3817
3818 case 20:
3819 w[ 5] = 0x80;
3820 break;
3821
3822 case 21:
3823 w[ 5] = w[ 5] | 0x8000;
3824 break;
3825
3826 case 22:
3827 w[ 5] = w[ 5] | 0x800000;
3828 break;
3829
3830 case 23:
3831 w[ 5] = w[ 5] | 0x80000000;
3832 break;
3833
3834 case 24:
3835 w[ 6] = 0x80;
3836 break;
3837
3838 case 25:
3839 w[ 6] = w[ 6] | 0x8000;
3840 break;
3841
3842 case 26:
3843 w[ 6] = w[ 6] | 0x800000;
3844 break;
3845
3846 case 27:
3847 w[ 6] = w[ 6] | 0x80000000;
3848 break;
3849
3850 case 28:
3851 w[ 7] = 0x80;
3852 break;
3853
3854 case 29:
3855 w[ 7] = w[ 7] | 0x8000;
3856 break;
3857
3858 case 30:
3859 w[ 7] = w[ 7] | 0x800000;
3860 break;
3861
3862 case 31:
3863 w[ 7] = w[ 7] | 0x80000000;
3864 break;
3865
3866 case 32:
3867 w[ 8] = 0x80;
3868 break;
3869
3870 case 33:
3871 w[ 8] = w[ 8] | 0x8000;
3872 break;
3873
3874 case 34:
3875 w[ 8] = w[ 8] | 0x800000;
3876 break;
3877
3878 case 35:
3879 w[ 8] = w[ 8] | 0x80000000;
3880 break;
3881
3882 case 36:
3883 w[ 9] = 0x80;
3884 break;
3885
3886 case 37:
3887 w[ 9] = w[ 9] | 0x8000;
3888 break;
3889
3890 case 38:
3891 w[ 9] = w[ 9] | 0x800000;
3892 break;
3893
3894 case 39:
3895 w[ 9] = w[ 9] | 0x80000000;
3896 break;
3897
3898 case 40:
3899 w[10] = 0x80;
3900 break;
3901
3902 case 41:
3903 w[10] = w[10] | 0x8000;
3904 break;
3905
3906 case 42:
3907 w[10] = w[10] | 0x800000;
3908 break;
3909
3910 case 43:
3911 w[10] = w[10] | 0x80000000;
3912 break;
3913
3914 case 44:
3915 w[11] = 0x80;
3916 break;
3917
3918 case 45:
3919 w[11] = w[11] | 0x8000;
3920 break;
3921
3922 case 46:
3923 w[11] = w[11] | 0x800000;
3924 break;
3925
3926 case 47:
3927 w[11] = w[11] | 0x80000000;
3928 break;
3929
3930 case 48:
3931 w[12] = 0x80;
3932 break;
3933
3934 case 49:
3935 w[12] = w[12] | 0x8000;
3936 break;
3937
3938 case 50:
3939 w[12] = w[12] | 0x800000;
3940 break;
3941
3942 case 51:
3943 w[12] = w[12] | 0x80000000;
3944 break;
3945
3946 case 52:
3947 w[13] = 0x80;
3948 break;
3949
3950 case 53:
3951 w[13] = w[13] | 0x8000;
3952 break;
3953
3954 case 54:
3955 w[13] = w[13] | 0x800000;
3956 break;
3957
3958 case 55:
3959 w[13] = w[13] | 0x80000000;
3960 break;
3961
3962 case 56:
3963 w[14] = 0x80;
3964 break;
3965
3966 case 57:
3967 w[14] = w[14] | 0x8000;
3968 break;
3969
3970 case 58:
3971 w[14] = w[14] | 0x800000;
3972 break;
3973
3974 case 59:
3975 w[14] = w[14] | 0x80000000;
3976 break;
3977
3978 case 60:
3979 w[15] = 0x80;
3980 break;
3981
3982 case 61:
3983 w[15] = w[15] | 0x8000;
3984 break;
3985
3986 case 62:
3987 w[15] = w[15] | 0x800000;
3988 break;
3989
3990 case 63:
3991 w[15] = w[15] | 0x80000000;
3992 break;
3993 }
3994 }
3995
/**
 * switch_buffer_by_offset_le: in-place shift of the word buffer w0..w3 by
 * `offset` bytes towards higher word indices.
 *
 * The four arrays are treated as one sequence
 * w0[0..3], w1[0..3], w2[0..3], w3[0..3].
 * `offset / 4` selects the whole-word displacement (one switch case per
 * value, 0..13). The remaining sub-word displacement is applied by combining
 * two neighbouring source words with amd_bytealign (AMD / generic build) or
 * __byte_perm (NV build). Destination words below the displacement are set
 * to 0.
 *
 * Within every case the assignments run from the highest destination word
 * down to the lowest, so each source word is read before it is overwritten.
 * The statement order must not be changed.
 *
 * Behaviour visible from the code:
 *  - There is no case for offset / 4 >= 14 and no default label, so for
 *    offset >= 56 the buffer is left untouched.
 *  - The AMD / generic branch writes destinations up to w3[2]; the NV branch
 *    stops at w3[1] and leaves w3[2] as it was. w3[3] is never written.
 *
 * NOTE(review): the "_le" suffix presumably means the words hold
 * little-endian packed bytes; the exact byte semantics depend on
 * amd_bytealign / __byte_perm, which are defined outside this section --
 * confirm there.
 */
3996 inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
3997 {
3998 #if defined IS_AMD || defined IS_GENERIC
/* Sub-word part of the offset; only used to detect the word-aligned case. */
3999 const int offset_mod_4 = offset & 3;
4000
/*
 * Shift argument handed to amd_bytealign. It is not reduced modulo 4, so it
 * leaves the 0..4 range for offset > 4.
 * NOTE(review): presumably amd_bytealign only uses the low bits of its third
 * argument -- confirm against its definition.
 */
4001 const int offset_minus_4 = 4 - offset;
4002
/* One case per whole-word displacement. */
4003 switch (offset / 4)
4004 {
4005 case 0:
4006 w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4);
4007 w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4);
4008 w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4009 w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4010 w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4011 w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4012 w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4013 w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4014 w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4015 w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4016 w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4017 w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4018 w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4019 w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4020 w0[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4021
/*
 * Word-aligned offset: move every word written above down by one position
 * and clear w3[2]. The same fix-up follows the bytealign step in every case
 * of this branch.
 * NOTE(review): presumably needed because the bytealign step, given a shift
 * that is a multiple of 4, leaves the data one word too high -- confirm
 * against the amd_bytealign definition.
 */
4022 if (offset_mod_4 == 0)
4023 {
4024 w0[0] = w0[1];
4025 w0[1] = w0[2];
4026 w0[2] = w0[3];
4027 w0[3] = w1[0];
4028 w1[0] = w1[1];
4029 w1[1] = w1[2];
4030 w1[2] = w1[3];
4031 w1[3] = w2[0];
4032 w2[0] = w2[1];
4033 w2[1] = w2[2];
4034 w2[2] = w2[3];
4035 w2[3] = w3[0];
4036 w3[0] = w3[1];
4037 w3[1] = w3[2];
4038 w3[2] = 0;
4039 }
4040
4041 break;
4042
4043 case 1:
4044 w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4);
4045 w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4046 w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4047 w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4048 w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4049 w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4050 w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4051 w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4052 w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4053 w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4054 w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4055 w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4056 w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4057 w0[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4058 w0[0] = 0;
4059
4060 if (offset_mod_4 == 0)
4061 {
4062 w0[1] = w0[2];
4063 w0[2] = w0[3];
4064 w0[3] = w1[0];
4065 w1[0] = w1[1];
4066 w1[1] = w1[2];
4067 w1[2] = w1[3];
4068 w1[3] = w2[0];
4069 w2[0] = w2[1];
4070 w2[1] = w2[2];
4071 w2[2] = w2[3];
4072 w2[3] = w3[0];
4073 w3[0] = w3[1];
4074 w3[1] = w3[2];
4075 w3[2] = 0;
4076 }
4077
4078 break;
4079
4080 case 2:
4081 w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4);
4082 w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4083 w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4084 w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4085 w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4086 w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4087 w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4088 w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4089 w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4090 w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4091 w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4092 w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4093 w0[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4094 w0[1] = 0;
4095 w0[0] = 0;
4096
4097 if (offset_mod_4 == 0)
4098 {
4099 w0[2] = w0[3];
4100 w0[3] = w1[0];
4101 w1[0] = w1[1];
4102 w1[1] = w1[2];
4103 w1[2] = w1[3];
4104 w1[3] = w2[0];
4105 w2[0] = w2[1];
4106 w2[1] = w2[2];
4107 w2[2] = w2[3];
4108 w2[3] = w3[0];
4109 w3[0] = w3[1];
4110 w3[1] = w3[2];
4111 w3[2] = 0;
4112 }
4113
4114 break;
4115
4116 case 3:
4117 w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4);
4118 w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4119 w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4120 w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4121 w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4122 w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4123 w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4124 w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4125 w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4126 w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4127 w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4128 w0[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4129 w0[2] = 0;
4130 w0[1] = 0;
4131 w0[0] = 0;
4132
4133 if (offset_mod_4 == 0)
4134 {
4135 w0[3] = w1[0];
4136 w1[0] = w1[1];
4137 w1[1] = w1[2];
4138 w1[2] = w1[3];
4139 w1[3] = w2[0];
4140 w2[0] = w2[1];
4141 w2[1] = w2[2];
4142 w2[2] = w2[3];
4143 w2[3] = w3[0];
4144 w3[0] = w3[1];
4145 w3[1] = w3[2];
4146 w3[2] = 0;
4147 }
4148
4149 break;
4150
4151 case 4:
4152 w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4);
4153 w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4154 w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4155 w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4156 w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4157 w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4158 w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4159 w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4160 w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4161 w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4162 w1[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4163 w0[3] = 0;
4164 w0[2] = 0;
4165 w0[1] = 0;
4166 w0[0] = 0;
4167
4168 if (offset_mod_4 == 0)
4169 {
4170 w1[0] = w1[1];
4171 w1[1] = w1[2];
4172 w1[2] = w1[3];
4173 w1[3] = w2[0];
4174 w2[0] = w2[1];
4175 w2[1] = w2[2];
4176 w2[2] = w2[3];
4177 w2[3] = w3[0];
4178 w3[0] = w3[1];
4179 w3[1] = w3[2];
4180 w3[2] = 0;
4181 }
4182
4183 break;
4184
4185 case 5:
4186 w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4);
4187 w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4188 w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4189 w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4190 w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4191 w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4192 w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4193 w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4194 w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4195 w1[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4196 w1[0] = 0;
4197 w0[3] = 0;
4198 w0[2] = 0;
4199 w0[1] = 0;
4200 w0[0] = 0;
4201
4202 if (offset_mod_4 == 0)
4203 {
4204 w1[1] = w1[2];
4205 w1[2] = w1[3];
4206 w1[3] = w2[0];
4207 w2[0] = w2[1];
4208 w2[1] = w2[2];
4209 w2[2] = w2[3];
4210 w2[3] = w3[0];
4211 w3[0] = w3[1];
4212 w3[1] = w3[2];
4213 w3[2] = 0;
4214 }
4215
4216 break;
4217
4218 case 6:
4219 w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4);
4220 w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4221 w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4222 w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4223 w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4224 w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4225 w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4226 w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4227 w1[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4228 w1[1] = 0;
4229 w1[0] = 0;
4230 w0[3] = 0;
4231 w0[2] = 0;
4232 w0[1] = 0;
4233 w0[0] = 0;
4234
4235 if (offset_mod_4 == 0)
4236 {
4237 w1[2] = w1[3];
4238 w1[3] = w2[0];
4239 w2[0] = w2[1];
4240 w2[1] = w2[2];
4241 w2[2] = w2[3];
4242 w2[3] = w3[0];
4243 w3[0] = w3[1];
4244 w3[1] = w3[2];
4245 w3[2] = 0;
4246 }
4247
4248 break;
4249
4250 case 7:
4251 w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4);
4252 w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4253 w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4254 w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4255 w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4256 w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4257 w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4258 w1[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4259 w1[2] = 0;
4260 w1[1] = 0;
4261 w1[0] = 0;
4262 w0[3] = 0;
4263 w0[2] = 0;
4264 w0[1] = 0;
4265 w0[0] = 0;
4266
4267 if (offset_mod_4 == 0)
4268 {
4269 w1[3] = w2[0];
4270 w2[0] = w2[1];
4271 w2[1] = w2[2];
4272 w2[2] = w2[3];
4273 w2[3] = w3[0];
4274 w3[0] = w3[1];
4275 w3[1] = w3[2];
4276 w3[2] = 0;
4277 }
4278
4279 break;
4280
4281 case 8:
4282 w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4);
4283 w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4284 w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4285 w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4286 w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4287 w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4288 w2[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4289 w1[3] = 0;
4290 w1[2] = 0;
4291 w1[1] = 0;
4292 w1[0] = 0;
4293 w0[3] = 0;
4294 w0[2] = 0;
4295 w0[1] = 0;
4296 w0[0] = 0;
4297
4298 if (offset_mod_4 == 0)
4299 {
4300 w2[0] = w2[1];
4301 w2[1] = w2[2];
4302 w2[2] = w2[3];
4303 w2[3] = w3[0];
4304 w3[0] = w3[1];
4305 w3[1] = w3[2];
4306 w3[2] = 0;
4307 }
4308
4309 break;
4310
4311 case 9:
4312 w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4);
4313 w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4314 w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4315 w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4316 w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4317 w2[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4318 w2[0] = 0;
4319 w1[3] = 0;
4320 w1[2] = 0;
4321 w1[1] = 0;
4322 w1[0] = 0;
4323 w0[3] = 0;
4324 w0[2] = 0;
4325 w0[1] = 0;
4326 w0[0] = 0;
4327
4328 if (offset_mod_4 == 0)
4329 {
4330 w2[1] = w2[2];
4331 w2[2] = w2[3];
4332 w2[3] = w3[0];
4333 w3[0] = w3[1];
4334 w3[1] = w3[2];
4335 w3[2] = 0;
4336 }
4337
4338 break;
4339
4340 case 10:
4341 w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4);
4342 w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4343 w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4344 w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4345 w2[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4346 w2[1] = 0;
4347 w2[0] = 0;
4348 w1[3] = 0;
4349 w1[2] = 0;
4350 w1[1] = 0;
4351 w1[0] = 0;
4352 w0[3] = 0;
4353 w0[2] = 0;
4354 w0[1] = 0;
4355 w0[0] = 0;
4356
4357 if (offset_mod_4 == 0)
4358 {
4359 w2[2] = w2[3];
4360 w2[3] = w3[0];
4361 w3[0] = w3[1];
4362 w3[1] = w3[2];
4363 w3[2] = 0;
4364 }
4365
4366 break;
4367
4368 case 11:
4369 w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4);
4370 w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4371 w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4372 w2[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4373 w2[2] = 0;
4374 w2[1] = 0;
4375 w2[0] = 0;
4376 w1[3] = 0;
4377 w1[2] = 0;
4378 w1[1] = 0;
4379 w1[0] = 0;
4380 w0[3] = 0;
4381 w0[2] = 0;
4382 w0[1] = 0;
4383 w0[0] = 0;
4384
4385 if (offset_mod_4 == 0)
4386 {
4387 w2[3] = w3[0];
4388 w3[0] = w3[1];
4389 w3[1] = w3[2];
4390 w3[2] = 0;
4391 }
4392
4393 break;
4394
4395 case 12:
4396 w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4);
4397 w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4398 w3[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4399 w2[3] = 0;
4400 w2[2] = 0;
4401 w2[1] = 0;
4402 w2[0] = 0;
4403 w1[3] = 0;
4404 w1[2] = 0;
4405 w1[1] = 0;
4406 w1[0] = 0;
4407 w0[3] = 0;
4408 w0[2] = 0;
4409 w0[1] = 0;
4410 w0[0] = 0;
4411
4412 if (offset_mod_4 == 0)
4413 {
4414 w3[0] = w3[1];
4415 w3[1] = w3[2];
4416 w3[2] = 0;
4417 }
4418
4419 break;
4420
4421 case 13:
4422 w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4);
4423 w3[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4424 w3[0] = 0;
4425 w2[3] = 0;
4426 w2[2] = 0;
4427 w2[1] = 0;
4428 w2[0] = 0;
4429 w1[3] = 0;
4430 w1[2] = 0;
4431 w1[1] = 0;
4432 w1[0] = 0;
4433 w0[3] = 0;
4434 w0[2] = 0;
4435 w0[1] = 0;
4436 w0[0] = 0;
4437
4438 if (offset_mod_4 == 0)
4439 {
4440 w3[1] = w3[2];
4441 w3[2] = 0;
4442 }
4443
4444 break;
4445 }
4446 #endif
4447
4448 #ifdef IS_NV
/* Always in 1..4 here because offset is reduced modulo 4 first. */
4449 const int offset_minus_4 = 4 - (offset % 4);
4450
/*
 * Four-nibble window of 0x76543210 used as the __byte_perm selector:
 * 0x7654, 0x6543, 0x5432, 0x4321 for offset % 4 == 0, 1, 2, 3.
 */
4451 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
4452
/*
 * Same case layout as the AMD / generic branch, but the highest destination
 * written is w3[1] and there is no separate word-aligned fix-up.
 */
4453 switch (offset / 4)
4454 {
4455 case 0:
4456 w3[1] = __byte_perm (w3[0], w3[1], selector);
4457 w3[0] = __byte_perm (w2[3], w3[0], selector);
4458 w2[3] = __byte_perm (w2[2], w2[3], selector);
4459 w2[2] = __byte_perm (w2[1], w2[2], selector);
4460 w2[1] = __byte_perm (w2[0], w2[1], selector);
4461 w2[0] = __byte_perm (w1[3], w2[0], selector);
4462 w1[3] = __byte_perm (w1[2], w1[3], selector);
4463 w1[2] = __byte_perm (w1[1], w1[2], selector);
4464 w1[1] = __byte_perm (w1[0], w1[1], selector);
4465 w1[0] = __byte_perm (w0[3], w1[0], selector);
4466 w0[3] = __byte_perm (w0[2], w0[3], selector);
4467 w0[2] = __byte_perm (w0[1], w0[2], selector);
4468 w0[1] = __byte_perm (w0[0], w0[1], selector);
4469 w0[0] = __byte_perm ( 0, w0[0], selector);
4470
4471 break;
4472
4473 case 1:
4474 w3[1] = __byte_perm (w2[3], w3[0], selector);
4475 w3[0] = __byte_perm (w2[2], w2[3], selector);
4476 w2[3] = __byte_perm (w2[1], w2[2], selector);
4477 w2[2] = __byte_perm (w2[0], w2[1], selector);
4478 w2[1] = __byte_perm (w1[3], w2[0], selector);
4479 w2[0] = __byte_perm (w1[2], w1[3], selector);
4480 w1[3] = __byte_perm (w1[1], w1[2], selector);
4481 w1[2] = __byte_perm (w1[0], w1[1], selector);
4482 w1[1] = __byte_perm (w0[3], w1[0], selector);
4483 w1[0] = __byte_perm (w0[2], w0[3], selector);
4484 w0[3] = __byte_perm (w0[1], w0[2], selector);
4485 w0[2] = __byte_perm (w0[0], w0[1], selector);
4486 w0[1] = __byte_perm ( 0, w0[0], selector);
4487 w0[0] = 0;
4488
4489 break;
4490
4491 case 2:
4492 w3[1] = __byte_perm (w2[2], w2[3], selector);
4493 w3[0] = __byte_perm (w2[1], w2[2], selector);
4494 w2[3] = __byte_perm (w2[0], w2[1], selector);
4495 w2[2] = __byte_perm (w1[3], w2[0], selector);
4496 w2[1] = __byte_perm (w1[2], w1[3], selector);
4497 w2[0] = __byte_perm (w1[1], w1[2], selector);
4498 w1[3] = __byte_perm (w1[0], w1[1], selector);
4499 w1[2] = __byte_perm (w0[3], w1[0], selector);
4500 w1[1] = __byte_perm (w0[2], w0[3], selector);
4501 w1[0] = __byte_perm (w0[1], w0[2], selector);
4502 w0[3] = __byte_perm (w0[0], w0[1], selector);
4503 w0[2] = __byte_perm ( 0, w0[0], selector);
4504 w0[1] = 0;
4505 w0[0] = 0;
4506
4507 break;
4508
4509 case 3:
4510 w3[1] = __byte_perm (w2[1], w2[2], selector);
4511 w3[0] = __byte_perm (w2[0], w2[1], selector);
4512 w2[3] = __byte_perm (w1[3], w2[0], selector);
4513 w2[2] = __byte_perm (w1[2], w1[3], selector);
4514 w2[1] = __byte_perm (w1[1], w1[2], selector);
4515 w2[0] = __byte_perm (w1[0], w1[1], selector);
4516 w1[3] = __byte_perm (w0[3], w1[0], selector);
4517 w1[2] = __byte_perm (w0[2], w0[3], selector);
4518 w1[1] = __byte_perm (w0[1], w0[2], selector);
4519 w1[0] = __byte_perm (w0[0], w0[1], selector);
4520 w0[3] = __byte_perm ( 0, w0[0], selector);
4521 w0[2] = 0;
4522 w0[1] = 0;
4523 w0[0] = 0;
4524
4525 break;
4526
4527 case 4:
4528 w3[1] = __byte_perm (w2[0], w2[1], selector);
4529 w3[0] = __byte_perm (w1[3], w2[0], selector);
4530 w2[3] = __byte_perm (w1[2], w1[3], selector);
4531 w2[2] = __byte_perm (w1[1], w1[2], selector);
4532 w2[1] = __byte_perm (w1[0], w1[1], selector);
4533 w2[0] = __byte_perm (w0[3], w1[0], selector);
4534 w1[3] = __byte_perm (w0[2], w0[3], selector);
4535 w1[2] = __byte_perm (w0[1], w0[2], selector);
4536 w1[1] = __byte_perm (w0[0], w0[1], selector);
4537 w1[0] = __byte_perm ( 0, w0[0], selector);
4538 w0[3] = 0;
4539 w0[2] = 0;
4540 w0[1] = 0;
4541 w0[0] = 0;
4542
4543 break;
4544
4545 case 5:
4546 w3[1] = __byte_perm (w1[3], w2[0], selector);
4547 w3[0] = __byte_perm (w1[2], w1[3], selector);
4548 w2[3] = __byte_perm (w1[1], w1[2], selector);
4549 w2[2] = __byte_perm (w1[0], w1[1], selector);
4550 w2[1] = __byte_perm (w0[3], w1[0], selector);
4551 w2[0] = __byte_perm (w0[2], w0[3], selector);
4552 w1[3] = __byte_perm (w0[1], w0[2], selector);
4553 w1[2] = __byte_perm (w0[0], w0[1], selector);
4554 w1[1] = __byte_perm ( 0, w0[0], selector);
4555 w1[0] = 0;
4556 w0[3] = 0;
4557 w0[2] = 0;
4558 w0[1] = 0;
4559 w0[0] = 0;
4560
4561 break;
4562
4563 case 6:
4564 w3[1] = __byte_perm (w1[2], w1[3], selector);
4565 w3[0] = __byte_perm (w1[1], w1[2], selector);
4566 w2[3] = __byte_perm (w1[0], w1[1], selector);
4567 w2[2] = __byte_perm (w0[3], w1[0], selector);
4568 w2[1] = __byte_perm (w0[2], w0[3], selector);
4569 w2[0] = __byte_perm (w0[1], w0[2], selector);
4570 w1[3] = __byte_perm (w0[0], w0[1], selector);
4571 w1[2] = __byte_perm ( 0, w0[0], selector);
4572 w1[1] = 0;
4573 w1[0] = 0;
4574 w0[3] = 0;
4575 w0[2] = 0;
4576 w0[1] = 0;
4577 w0[0] = 0;
4578
4579 break;
4580
4581 case 7:
4582 w3[1] = __byte_perm (w1[1], w1[2], selector);
4583 w3[0] = __byte_perm (w1[0], w1[1], selector);
4584 w2[3] = __byte_perm (w0[3], w1[0], selector);
4585 w2[2] = __byte_perm (w0[2], w0[3], selector);
4586 w2[1] = __byte_perm (w0[1], w0[2], selector);
4587 w2[0] = __byte_perm (w0[0], w0[1], selector);
4588 w1[3] = __byte_perm ( 0, w0[0], selector);
4589 w1[2] = 0;
4590 w1[1] = 0;
4591 w1[0] = 0;
4592 w0[3] = 0;
4593 w0[2] = 0;
4594 w0[1] = 0;
4595 w0[0] = 0;
4596
4597 break;
4598
4599 case 8:
4600 w3[1] = __byte_perm (w1[0], w1[1], selector);
4601 w3[0] = __byte_perm (w0[3], w1[0], selector);
4602 w2[3] = __byte_perm (w0[2], w0[3], selector);
4603 w2[2] = __byte_perm (w0[1], w0[2], selector);
4604 w2[1] = __byte_perm (w0[0], w0[1], selector);
4605 w2[0] = __byte_perm ( 0, w0[0], selector);
4606 w1[3] = 0;
4607 w1[2] = 0;
4608 w1[1] = 0;
4609 w1[0] = 0;
4610 w0[3] = 0;
4611 w0[2] = 0;
4612 w0[1] = 0;
4613 w0[0] = 0;
4614
4615 break;
4616
4617 case 9:
4618 w3[1] = __byte_perm (w0[3], w1[0], selector);
4619 w3[0] = __byte_perm (w0[2], w0[3], selector);
4620 w2[3] = __byte_perm (w0[1], w0[2], selector);
4621 w2[2] = __byte_perm (w0[0], w0[1], selector);
4622 w2[1] = __byte_perm ( 0, w0[0], selector);
4623 w2[0] = 0;
4624 w1[3] = 0;
4625 w1[2] = 0;
4626 w1[1] = 0;
4627 w1[0] = 0;
4628 w0[3] = 0;
4629 w0[2] = 0;
4630 w0[1] = 0;
4631 w0[0] = 0;
4632
4633 break;
4634
4635 case 10:
4636 w3[1] = __byte_perm (w0[2], w0[3], selector);
4637 w3[0] = __byte_perm (w0[1], w0[2], selector);
4638 w2[3] = __byte_perm (w0[0], w0[1], selector);
4639 w2[2] = __byte_perm ( 0, w0[0], selector);
4640 w2[1] = 0;
4641 w2[0] = 0;
4642 w1[3] = 0;
4643 w1[2] = 0;
4644 w1[1] = 0;
4645 w1[0] = 0;
4646 w0[3] = 0;
4647 w0[2] = 0;
4648 w0[1] = 0;
4649 w0[0] = 0;
4650
4651 break;
4652
4653 case 11:
4654 w3[1] = __byte_perm (w0[1], w0[2], selector);
4655 w3[0] = __byte_perm (w0[0], w0[1], selector);
4656 w2[3] = __byte_perm ( 0, w0[0], selector);
4657 w2[2] = 0;
4658 w2[1] = 0;
4659 w2[0] = 0;
4660 w1[3] = 0;
4661 w1[2] = 0;
4662 w1[1] = 0;
4663 w1[0] = 0;
4664 w0[3] = 0;
4665 w0[2] = 0;
4666 w0[1] = 0;
4667 w0[0] = 0;
4668
4669 break;
4670
4671 case 12:
4672 w3[1] = __byte_perm (w0[0], w0[1], selector);
4673 w3[0] = __byte_perm ( 0, w0[0], selector);
4674 w2[3] = 0;
4675 w2[2] = 0;
4676 w2[1] = 0;
4677 w2[0] = 0;
4678 w1[3] = 0;
4679 w1[2] = 0;
4680 w1[1] = 0;
4681 w1[0] = 0;
4682 w0[3] = 0;
4683 w0[2] = 0;
4684 w0[1] = 0;
4685 w0[0] = 0;
4686
4687 break;
4688
4689 case 13:
4690 w3[1] = __byte_perm ( 0, w0[0], selector);
4691 w3[0] = 0;
4692 w2[3] = 0;
4693 w2[2] = 0;
4694 w2[1] = 0;
4695 w2[0] = 0;
4696 w1[3] = 0;
4697 w1[2] = 0;
4698 w1[1] = 0;
4699 w1[0] = 0;
4700 w0[3] = 0;
4701 w0[2] = 0;
4702 w0[1] = 0;
4703 w0[0] = 0;
4704
4705 break;
4706 }
4707 #endif
4708 }
4709
/**
 * switch_buffer_by_offset_be: in-place shift of the word buffer w0..w3 by
 * `offset` bytes towards higher word indices, counterpart of the "_le"
 * variant with the operand order of the byte-combining intrinsic mirrored.
 *
 * The four arrays are treated as one sequence
 * w0[0..3], w1[0..3], w2[0..3], w3[0..3].
 * `offset / 4` selects the whole-word displacement (one switch case per
 * value, 0..13). The sub-word displacement is applied by combining two
 * neighbouring source words with amd_bytealign (AMD / generic build) or
 * __byte_perm (NV build). Destination words below the displacement are set
 * to 0.
 *
 * Within every case the assignments run from the highest destination word
 * down to the lowest, so each source word is read before it is overwritten.
 * The statement order must not be changed.
 *
 * Behaviour visible from the code:
 *  - There is no case for offset / 4 >= 14 and no default label, so for
 *    offset >= 56 the buffer is left untouched.
 *  - The AMD / generic branch writes destinations up to w3[2]; the NV branch
 *    stops at w3[1] and leaves w3[2] as it was. w3[3] is never written.
 *
 * NOTE(review): the "_be" suffix presumably means the words hold big-endian
 * packed bytes; the exact byte semantics depend on amd_bytealign /
 * __byte_perm, which are defined outside this section -- confirm there.
 */
4710 inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
4711 {
4712 #if defined IS_AMD || defined IS_GENERIC
/*
 * The full, unreduced offset is passed as the amd_bytealign shift, and the
 * lower-index source word is the first operand. Unlike the "_le" variant
 * there is no extra fix-up for word-aligned offsets.
 * NOTE(review): presumably amd_bytealign only uses the low bits of its third
 * argument -- confirm against its definition.
 */
4713 switch (offset / 4)
4714 {
4715 case 0:
4716 w3[2] = amd_bytealign (w3[1], 0, offset);
4717 w3[1] = amd_bytealign (w3[0], w3[1], offset);
4718 w3[0] = amd_bytealign (w2[3], w3[0], offset);
4719 w2[3] = amd_bytealign (w2[2], w2[3], offset);
4720 w2[2] = amd_bytealign (w2[1], w2[2], offset);
4721 w2[1] = amd_bytealign (w2[0], w2[1], offset);
4722 w2[0] = amd_bytealign (w1[3], w2[0], offset);
4723 w1[3] = amd_bytealign (w1[2], w1[3], offset);
4724 w1[2] = amd_bytealign (w1[1], w1[2], offset);
4725 w1[1] = amd_bytealign (w1[0], w1[1], offset);
4726 w1[0] = amd_bytealign (w0[3], w1[0], offset);
4727 w0[3] = amd_bytealign (w0[2], w0[3], offset);
4728 w0[2] = amd_bytealign (w0[1], w0[2], offset);
4729 w0[1] = amd_bytealign (w0[0], w0[1], offset);
4730 w0[0] = amd_bytealign ( 0, w0[0], offset);
4731 break;
4732
4733 case 1:
4734 w3[2] = amd_bytealign (w3[0], 0, offset);
4735 w3[1] = amd_bytealign (w2[3], w3[0], offset);
4736 w3[0] = amd_bytealign (w2[2], w2[3], offset);
4737 w2[3] = amd_bytealign (w2[1], w2[2], offset);
4738 w2[2] = amd_bytealign (w2[0], w2[1], offset);
4739 w2[1] = amd_bytealign (w1[3], w2[0], offset);
4740 w2[0] = amd_bytealign (w1[2], w1[3], offset);
4741 w1[3] = amd_bytealign (w1[1], w1[2], offset);
4742 w1[2] = amd_bytealign (w1[0], w1[1], offset);
4743 w1[1] = amd_bytealign (w0[3], w1[0], offset);
4744 w1[0] = amd_bytealign (w0[2], w0[3], offset);
4745 w0[3] = amd_bytealign (w0[1], w0[2], offset);
4746 w0[2] = amd_bytealign (w0[0], w0[1], offset);
4747 w0[1] = amd_bytealign ( 0, w0[0], offset);
4748 w0[0] = 0;
4749 break;
4750
4751 case 2:
4752 w3[2] = amd_bytealign (w2[3], 0, offset);
4753 w3[1] = amd_bytealign (w2[2], w2[3], offset);
4754 w3[0] = amd_bytealign (w2[1], w2[2], offset);
4755 w2[3] = amd_bytealign (w2[0], w2[1], offset);
4756 w2[2] = amd_bytealign (w1[3], w2[0], offset);
4757 w2[1] = amd_bytealign (w1[2], w1[3], offset);
4758 w2[0] = amd_bytealign (w1[1], w1[2], offset);
4759 w1[3] = amd_bytealign (w1[0], w1[1], offset);
4760 w1[2] = amd_bytealign (w0[3], w1[0], offset);
4761 w1[1] = amd_bytealign (w0[2], w0[3], offset);
4762 w1[0] = amd_bytealign (w0[1], w0[2], offset);
4763 w0[3] = amd_bytealign (w0[0], w0[1], offset);
4764 w0[2] = amd_bytealign ( 0, w0[0], offset);
4765 w0[1] = 0;
4766 w0[0] = 0;
4767 break;
4768
4769 case 3:
4770 w3[2] = amd_bytealign (w2[2], 0, offset);
4771 w3[1] = amd_bytealign (w2[1], w2[2], offset);
4772 w3[0] = amd_bytealign (w2[0], w2[1], offset);
4773 w2[3] = amd_bytealign (w1[3], w2[0], offset);
4774 w2[2] = amd_bytealign (w1[2], w1[3], offset);
4775 w2[1] = amd_bytealign (w1[1], w1[2], offset);
4776 w2[0] = amd_bytealign (w1[0], w1[1], offset);
4777 w1[3] = amd_bytealign (w0[3], w1[0], offset);
4778 w1[2] = amd_bytealign (w0[2], w0[3], offset);
4779 w1[1] = amd_bytealign (w0[1], w0[2], offset);
4780 w1[0] = amd_bytealign (w0[0], w0[1], offset);
4781 w0[3] = amd_bytealign ( 0, w0[0], offset);
4782 w0[2] = 0;
4783 w0[1] = 0;
4784 w0[0] = 0;
4785 break;
4786
4787 case 4:
4788 w3[2] = amd_bytealign (w2[1], 0, offset);
4789 w3[1] = amd_bytealign (w2[0], w2[1], offset);
4790 w3[0] = amd_bytealign (w1[3], w2[0], offset);
4791 w2[3] = amd_bytealign (w1[2], w1[3], offset);
4792 w2[2] = amd_bytealign (w1[1], w1[2], offset);
4793 w2[1] = amd_bytealign (w1[0], w1[1], offset);
4794 w2[0] = amd_bytealign (w0[3], w1[0], offset);
4795 w1[3] = amd_bytealign (w0[2], w0[3], offset);
4796 w1[2] = amd_bytealign (w0[1], w0[2], offset);
4797 w1[1] = amd_bytealign (w0[0], w0[1], offset);
4798 w1[0] = amd_bytealign ( 0, w0[0], offset);
4799 w0[3] = 0;
4800 w0[2] = 0;
4801 w0[1] = 0;
4802 w0[0] = 0;
4803 break;
4804
4805 case 5:
4806 w3[2] = amd_bytealign (w2[0], 0, offset);
4807 w3[1] = amd_bytealign (w1[3], w2[0], offset);
4808 w3[0] = amd_bytealign (w1[2], w1[3], offset);
4809 w2[3] = amd_bytealign (w1[1], w1[2], offset);
4810 w2[2] = amd_bytealign (w1[0], w1[1], offset);
4811 w2[1] = amd_bytealign (w0[3], w1[0], offset);
4812 w2[0] = amd_bytealign (w0[2], w0[3], offset);
4813 w1[3] = amd_bytealign (w0[1], w0[2], offset);
4814 w1[2] = amd_bytealign (w0[0], w0[1], offset);
4815 w1[1] = amd_bytealign ( 0, w0[0], offset);
4816 w1[0] = 0;
4817 w0[3] = 0;
4818 w0[2] = 0;
4819 w0[1] = 0;
4820 w0[0] = 0;
4821 break;
4822
4823 case 6:
4824 w3[2] = amd_bytealign (w1[3], 0, offset);
4825 w3[1] = amd_bytealign (w1[2], w1[3], offset);
4826 w3[0] = amd_bytealign (w1[1], w1[2], offset);
4827 w2[3] = amd_bytealign (w1[0], w1[1], offset);
4828 w2[2] = amd_bytealign (w0[3], w1[0], offset);
4829 w2[1] = amd_bytealign (w0[2], w0[3], offset);
4830 w2[0] = amd_bytealign (w0[1], w0[2], offset);
4831 w1[3] = amd_bytealign (w0[0], w0[1], offset);
4832 w1[2] = amd_bytealign ( 0, w0[0], offset);
4833 w1[1] = 0;
4834 w1[0] = 0;
4835 w0[3] = 0;
4836 w0[2] = 0;
4837 w0[1] = 0;
4838 w0[0] = 0;
4839 break;
4840
4841 case 7:
4842 w3[2] = amd_bytealign (w1[2], 0, offset);
4843 w3[1] = amd_bytealign (w1[1], w1[2], offset);
4844 w3[0] = amd_bytealign (w1[0], w1[1], offset);
4845 w2[3] = amd_bytealign (w0[3], w1[0], offset);
4846 w2[2] = amd_bytealign (w0[2], w0[3], offset);
4847 w2[1] = amd_bytealign (w0[1], w0[2], offset);
4848 w2[0] = amd_bytealign (w0[0], w0[1], offset);
4849 w1[3] = amd_bytealign ( 0, w0[0], offset);
4850 w1[2] = 0;
4851 w1[1] = 0;
4852 w1[0] = 0;
4853 w0[3] = 0;
4854 w0[2] = 0;
4855 w0[1] = 0;
4856 w0[0] = 0;
4857 break;
4858
4859 case 8:
4860 w3[2] = amd_bytealign (w1[1], 0, offset);
4861 w3[1] = amd_bytealign (w1[0], w1[1], offset);
4862 w3[0] = amd_bytealign (w0[3], w1[0], offset);
4863 w2[3] = amd_bytealign (w0[2], w0[3], offset);
4864 w2[2] = amd_bytealign (w0[1], w0[2], offset);
4865 w2[1] = amd_bytealign (w0[0], w0[1], offset);
4866 w2[0] = amd_bytealign ( 0, w0[0], offset);
4867 w1[3] = 0;
4868 w1[2] = 0;
4869 w1[1] = 0;
4870 w1[0] = 0;
4871 w0[3] = 0;
4872 w0[2] = 0;
4873 w0[1] = 0;
4874 w0[0] = 0;
4875 break;
4876
4877 case 9:
4878 w3[2] = amd_bytealign (w1[0], 0, offset);
4879 w3[1] = amd_bytealign (w0[3], w1[0], offset);
4880 w3[0] = amd_bytealign (w0[2], w0[3], offset);
4881 w2[3] = amd_bytealign (w0[1], w0[2], offset);
4882 w2[2] = amd_bytealign (w0[0], w0[1], offset);
4883 w2[1] = amd_bytealign ( 0, w0[0], offset);
4884 w2[0] = 0;
4885 w1[3] = 0;
4886 w1[2] = 0;
4887 w1[1] = 0;
4888 w1[0] = 0;
4889 w0[3] = 0;
4890 w0[2] = 0;
4891 w0[1] = 0;
4892 w0[0] = 0;
4893 break;
4894
4895 case 10:
4896 w3[2] = amd_bytealign (w0[3], 0, offset);
4897 w3[1] = amd_bytealign (w0[2], w0[3], offset);
4898 w3[0] = amd_bytealign (w0[1], w0[2], offset);
4899 w2[3] = amd_bytealign (w0[0], w0[1], offset);
4900 w2[2] = amd_bytealign ( 0, w0[0], offset);
4901 w2[1] = 0;
4902 w2[0] = 0;
4903 w1[3] = 0;
4904 w1[2] = 0;
4905 w1[1] = 0;
4906 w1[0] = 0;
4907 w0[3] = 0;
4908 w0[2] = 0;
4909 w0[1] = 0;
4910 w0[0] = 0;
4911 break;
4912
4913 case 11:
4914 w3[2] = amd_bytealign (w0[2], 0, offset);
4915 w3[1] = amd_bytealign (w0[1], w0[2], offset);
4916 w3[0] = amd_bytealign (w0[0], w0[1], offset);
4917 w2[3] = amd_bytealign ( 0, w0[0], offset);
4918 w2[2] = 0;
4919 w2[1] = 0;
4920 w2[0] = 0;
4921 w1[3] = 0;
4922 w1[2] = 0;
4923 w1[1] = 0;
4924 w1[0] = 0;
4925 w0[3] = 0;
4926 w0[2] = 0;
4927 w0[1] = 0;
4928 w0[0] = 0;
4929 break;
4930
4931 case 12:
4932 w3[2] = amd_bytealign (w0[1], 0, offset);
4933 w3[1] = amd_bytealign (w0[0], w0[1], offset);
4934 w3[0] = amd_bytealign ( 0, w0[0], offset);
4935 w2[3] = 0;
4936 w2[2] = 0;
4937 w2[1] = 0;
4938 w2[0] = 0;
4939 w1[3] = 0;
4940 w1[2] = 0;
4941 w1[1] = 0;
4942 w1[0] = 0;
4943 w0[3] = 0;
4944 w0[2] = 0;
4945 w0[1] = 0;
4946 w0[0] = 0;
4947 break;
4948
4949 case 13:
4950 w3[2] = amd_bytealign (w0[0], 0, offset);
4951 w3[1] = amd_bytealign ( 0, w0[0], offset);
4952 w3[0] = 0;
4953 w2[3] = 0;
4954 w2[2] = 0;
4955 w2[1] = 0;
4956 w2[0] = 0;
4957 w1[3] = 0;
4958 w1[2] = 0;
4959 w1[1] = 0;
4960 w1[0] = 0;
4961 w0[3] = 0;
4962 w0[2] = 0;
4963 w0[1] = 0;
4964 w0[0] = 0;
4965 break;
4966 }
4967 #endif
4968
4969 #ifdef IS_NV
/*
 * Four-nibble window of 0x76543210 used as the __byte_perm selector:
 * 0x3210, 0x4321, 0x5432, 0x6543 for offset & 3 == 0, 1, 2, 3.
 */
4970 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
4971
/*
 * Same case layout as the AMD / generic branch, with the higher-index source
 * word as the first __byte_perm operand; the highest destination written is
 * w3[1].
 */
4972 switch (offset / 4)
4973 {
4974 case 0:
4975 w3[1] = __byte_perm (w3[1], w3[0], selector);
4976 w3[0] = __byte_perm (w3[0], w2[3], selector);
4977 w2[3] = __byte_perm (w2[3], w2[2], selector);
4978 w2[2] = __byte_perm (w2[2], w2[1], selector);
4979 w2[1] = __byte_perm (w2[1], w2[0], selector);
4980 w2[0] = __byte_perm (w2[0], w1[3], selector);
4981 w1[3] = __byte_perm (w1[3], w1[2], selector);
4982 w1[2] = __byte_perm (w1[2], w1[1], selector);
4983 w1[1] = __byte_perm (w1[1], w1[0], selector);
4984 w1[0] = __byte_perm (w1[0], w0[3], selector);
4985 w0[3] = __byte_perm (w0[3], w0[2], selector);
4986 w0[2] = __byte_perm (w0[2], w0[1], selector);
4987 w0[1] = __byte_perm (w0[1], w0[0], selector);
4988 w0[0] = __byte_perm (w0[0], 0, selector);
4989 break;
4990
4991 case 1:
4992 w3[1] = __byte_perm (w3[0], w2[3], selector);
4993 w3[0] = __byte_perm (w2[3], w2[2], selector);
4994 w2[3] = __byte_perm (w2[2], w2[1], selector);
4995 w2[2] = __byte_perm (w2[1], w2[0], selector);
4996 w2[1] = __byte_perm (w2[0], w1[3], selector);
4997 w2[0] = __byte_perm (w1[3], w1[2], selector);
4998 w1[3] = __byte_perm (w1[2], w1[1], selector);
4999 w1[2] = __byte_perm (w1[1], w1[0], selector);
5000 w1[1] = __byte_perm (w1[0], w0[3], selector);
5001 w1[0] = __byte_perm (w0[3], w0[2], selector);
5002 w0[3] = __byte_perm (w0[2], w0[1], selector);
5003 w0[2] = __byte_perm (w0[1], w0[0], selector);
5004 w0[1] = __byte_perm (w0[0], 0, selector);
5005 w0[0] = 0;
5006 break;
5007
5008 case 2:
5009 w3[1] = __byte_perm (w2[3], w2[2], selector);
5010 w3[0] = __byte_perm (w2[2], w2[1], selector);
5011 w2[3] = __byte_perm (w2[1], w2[0], selector);
5012 w2[2] = __byte_perm (w2[0], w1[3], selector);
5013 w2[1] = __byte_perm (w1[3], w1[2], selector);
5014 w2[0] = __byte_perm (w1[2], w1[1], selector);
5015 w1[3] = __byte_perm (w1[1], w1[0], selector);
5016 w1[2] = __byte_perm (w1[0], w0[3], selector);
5017 w1[1] = __byte_perm (w0[3], w0[2], selector);
5018 w1[0] = __byte_perm (w0[2], w0[1], selector);
5019 w0[3] = __byte_perm (w0[1], w0[0], selector);
5020 w0[2] = __byte_perm (w0[0], 0, selector);
5021 w0[1] = 0;
5022 w0[0] = 0;
5023 break;
5024
5025 case 3:
5026 w3[1] = __byte_perm (w2[2], w2[1], selector);
5027 w3[0] = __byte_perm (w2[1], w2[0], selector);
5028 w2[3] = __byte_perm (w2[0], w1[3], selector);
5029 w2[2] = __byte_perm (w1[3], w1[2], selector);
5030 w2[1] = __byte_perm (w1[2], w1[1], selector);
5031 w2[0] = __byte_perm (w1[1], w1[0], selector);
5032 w1[3] = __byte_perm (w1[0], w0[3], selector);
5033 w1[2] = __byte_perm (w0[3], w0[2], selector);
5034 w1[1] = __byte_perm (w0[2], w0[1], selector);
5035 w1[0] = __byte_perm (w0[1], w0[0], selector);
5036 w0[3] = __byte_perm (w0[0], 0, selector);
5037 w0[2] = 0;
5038 w0[1] = 0;
5039 w0[0] = 0;
5040 break;
5041
5042 case 4:
5043 w3[1] = __byte_perm (w2[1], w2[0], selector);
5044 w3[0] = __byte_perm (w2[0], w1[3], selector);
5045 w2[3] = __byte_perm (w1[3], w1[2], selector);
5046 w2[2] = __byte_perm (w1[2], w1[1], selector);
5047 w2[1] = __byte_perm (w1[1], w1[0], selector);
5048 w2[0] = __byte_perm (w1[0], w0[3], selector);
5049 w1[3] = __byte_perm (w0[3], w0[2], selector);
5050 w1[2] = __byte_perm (w0[2], w0[1], selector);
5051 w1[1] = __byte_perm (w0[1], w0[0], selector);
5052 w1[0] = __byte_perm (w0[0], 0, selector);
5053 w0[3] = 0;
5054 w0[2] = 0;
5055 w0[1] = 0;
5056 w0[0] = 0;
5057 break;
5058
5059 case 5:
5060 w3[1] = __byte_perm (w2[0], w1[3], selector);
5061 w3[0] = __byte_perm (w1[3], w1[2], selector);
5062 w2[3] = __byte_perm (w1[2], w1[1], selector);
5063 w2[2] = __byte_perm (w1[1], w1[0], selector);
5064 w2[1] = __byte_perm (w1[0], w0[3], selector);
5065 w2[0] = __byte_perm (w0[3], w0[2], selector);
5066 w1[3] = __byte_perm (w0[2], w0[1], selector);
5067 w1[2] = __byte_perm (w0[1], w0[0], selector);
5068 w1[1] = __byte_perm (w0[0], 0, selector);
5069 w1[0] = 0;
5070 w0[3] = 0;
5071 w0[2] = 0;
5072 w0[1] = 0;
5073 w0[0] = 0;
5074 break;
5075
5076 case 6:
5077 w3[1] = __byte_perm (w1[3], w1[2], selector);
5078 w3[0] = __byte_perm (w1[2], w1[1], selector);
5079 w2[3] = __byte_perm (w1[1], w1[0], selector);
5080 w2[2] = __byte_perm (w1[0], w0[3], selector);
5081 w2[1] = __byte_perm (w0[3], w0[2], selector);
5082 w2[0] = __byte_perm (w0[2], w0[1], selector);
5083 w1[3] = __byte_perm (w0[1], w0[0], selector);
5084 w1[2] = __byte_perm (w0[0], 0, selector);
5085 w1[1] = 0;
5086 w1[0] = 0;
5087 w0[3] = 0;
5088 w0[2] = 0;
5089 w0[1] = 0;
5090 w0[0] = 0;
5091 break;
5092
5093 case 7:
5094 w3[1] = __byte_perm (w1[2], w1[1], selector);
5095 w3[0] = __byte_perm (w1[1], w1[0], selector);
5096 w2[3] = __byte_perm (w1[0], w0[3], selector);
5097 w2[2] = __byte_perm (w0[3], w0[2], selector);
5098 w2[1] = __byte_perm (w0[2], w0[1], selector);
5099 w2[0] = __byte_perm (w0[1], w0[0], selector);
5100 w1[3] = __byte_perm (w0[0], 0, selector);
5101 w1[2] = 0;
5102 w1[1] = 0;
5103 w1[0] = 0;
5104 w0[3] = 0;
5105 w0[2] = 0;
5106 w0[1] = 0;
5107 w0[0] = 0;
5108 break;
5109
5110 case 8:
5111 w3[1] = __byte_perm (w1[1], w1[0], selector);
5112 w3[0] = __byte_perm (w1[0], w0[3], selector);
5113 w2[3] = __byte_perm (w0[3], w0[2], selector);
5114 w2[2] = __byte_perm (w0[2], w0[1], selector);
5115 w2[1] = __byte_perm (w0[1], w0[0], selector);
5116 w2[0] = __byte_perm (w0[0], 0, selector);
5117 w1[3] = 0;
5118 w1[2] = 0;
5119 w1[1] = 0;
5120 w1[0] = 0;
5121 w0[3] = 0;
5122 w0[2] = 0;
5123 w0[1] = 0;
5124 w0[0] = 0;
5125 break;
5126
5127 case 9:
5128 w3[1] = __byte_perm (w1[0], w0[3], selector);
5129 w3[0] = __byte_perm (w0[3], w0[2], selector);
5130 w2[3] = __byte_perm (w0[2], w0[1], selector);
5131 w2[2] = __byte_perm (w0[1], w0[0], selector);
5132 w2[1] = __byte_perm (w0[0], 0, selector);
5133 w2[0] = 0;
5134 w1[3] = 0;
5135 w1[2] = 0;
5136 w1[1] = 0;
5137 w1[0] = 0;
5138 w0[3] = 0;
5139 w0[2] = 0;
5140 w0[1] = 0;
5141 w0[0] = 0;
5142 break;
5143
5144 case 10:
5145 w3[1] = __byte_perm (w0[3], w0[2], selector);
5146 w3[0] = __byte_perm (w0[2], w0[1], selector);
5147 w2[3] = __byte_perm (w0[1], w0[0], selector);
5148 w2[2] = __byte_perm (w0[0], 0, selector);
5149 w2[1] = 0;
5150 w2[0] = 0;
5151 w1[3] = 0;
5152 w1[2] = 0;
5153 w1[1] = 0;
5154 w1[0] = 0;
5155 w0[3] = 0;
5156 w0[2] = 0;
5157 w0[1] = 0;
5158 w0[0] = 0;
5159 break;
5160
5161 case 11:
5162 w3[1] = __byte_perm (w0[2], w0[1], selector);
5163 w3[0] = __byte_perm (w0[1], w0[0], selector);
5164 w2[3] = __byte_perm (w0[0], 0, selector);
5165 w2[2] = 0;
5166 w2[1] = 0;
5167 w2[0] = 0;
5168 w1[3] = 0;
5169 w1[2] = 0;
5170 w1[1] = 0;
5171 w1[0] = 0;
5172 w0[3] = 0;
5173 w0[2] = 0;
5174 w0[1] = 0;
5175 w0[0] = 0;
5176 break;
5177
5178 case 12:
5179 w3[1] = __byte_perm (w0[1], w0[0], selector);
5180 w3[0] = __byte_perm (w0[0], 0, selector);
5181 w2[3] = 0;
5182 w2[2] = 0;
5183 w2[1] = 0;
5184 w2[0] = 0;
5185 w1[3] = 0;
5186 w1[2] = 0;
5187 w1[1] = 0;
5188 w1[0] = 0;
5189 w0[3] = 0;
5190 w0[2] = 0;
5191 w0[1] = 0;
5192 w0[0] = 0;
5193 break;
5194
5195 case 13:
5196 w3[1] = __byte_perm (w0[0], 0, selector);
5197 w3[0] = 0;
5198 w2[3] = 0;
5199 w2[2] = 0;
5200 w2[1] = 0;
5201 w2[0] = 0;
5202 w1[3] = 0;
5203 w1[2] = 0;
5204 w1[1] = 0;
5205 w1[0] = 0;
5206 w0[3] = 0;
5207 w0[2] = 0;
5208 w0[1] = 0;
5209 w0[0] = 0;
5210 break;
5211 }
5212 #endif
5213 }
5214
5215 inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len)
5216 {
5217 #if defined cl_amd_media_ops
5218 switch (salt_len)
5219 {
5220 case 0: sw[0] = w0;
5221 break;
5222 case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3);
5223 sw[1] = amd_bytealign (sw[1] >> 8, w0, 3);
5224 break;
5225 case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2);
5226 sw[1] = amd_bytealign (sw[1] >> 16, w0, 2);
5227 break;
5228 case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1);
5229 sw[1] = amd_bytealign (sw[1] >> 24, w0, 1);
5230 break;
5231 case 4: sw[1] = w0;
5232 break;
5233 case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3);
5234 sw[2] = amd_bytealign (sw[2] >> 8, w0, 3);
5235 break;
5236 case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2);
5237 sw[2] = amd_bytealign (sw[2] >> 16, w0, 2);
5238 break;
5239 case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1);
5240 sw[2] = amd_bytealign (sw[2] >> 24, w0, 1);
5241 break;
5242 case 8: sw[2] = w0;
5243 break;
5244 case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3);
5245 sw[3] = amd_bytealign (sw[3] >> 8, w0, 3);
5246 break;
5247 case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2);
5248 sw[3] = amd_bytealign (sw[3] >> 16, w0, 2);
5249 break;
5250 case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1);
5251 sw[3] = amd_bytealign (sw[3] >> 24, w0, 1);
5252 break;
5253 case 12: sw[3] = w0;
5254 break;
5255 case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3);
5256 sw[4] = amd_bytealign (sw[4] >> 8, w0, 3);
5257 break;
5258 case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2);
5259 sw[4] = amd_bytealign (sw[4] >> 16, w0, 2);
5260 break;
5261 case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1);
5262 sw[4] = amd_bytealign (sw[4] >> 24, w0, 1);
5263 break;
5264 case 16: sw[4] = w0;
5265 break;
5266 case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3);
5267 sw[5] = amd_bytealign (sw[5] >> 8, w0, 3);
5268 break;
5269 case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2);
5270 sw[5] = amd_bytealign (sw[5] >> 16, w0, 2);
5271 break;
5272 case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1);
5273 sw[5] = amd_bytealign (sw[5] >> 24, w0, 1);
5274 break;
5275 case 20: sw[5] = w0;
5276 break;
5277 case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3);
5278 sw[6] = amd_bytealign (sw[6] >> 8, w0, 3);
5279 break;
5280 case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2);
5281 sw[6] = amd_bytealign (sw[6] >> 16, w0, 2);
5282 break;
5283 case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1);
5284 sw[6] = amd_bytealign (sw[6] >> 24, w0, 1);
5285 break;
5286 case 24: sw[6] = w0;
5287 break;
5288 case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3);
5289 sw[7] = amd_bytealign (sw[7] >> 8, w0, 3);
5290 break;
5291 case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2);
5292 sw[7] = amd_bytealign (sw[7] >> 16, w0, 2);
5293 break;
5294 case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1);
5295 sw[7] = amd_bytealign (sw[7] >> 24, w0, 1);
5296 break;
5297 case 28: sw[7] = w0;
5298 break;
5299 case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3);
5300 sw[8] = amd_bytealign (sw[8] >> 8, w0, 3);
5301 break;
5302 case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2);
5303 sw[8] = amd_bytealign (sw[8] >> 16, w0, 2);
5304 break;
5305 case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1);
5306 sw[8] = amd_bytealign (sw[8] >> 24, w0, 1);
5307 break;
5308 }
5309 #else
5310 switch (salt_len)
5311 {
5312 case 0: sw[0] = w0;
5313 break;
5314 case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8);
5315 sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5316 break;
5317 case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16);
5318 sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5319 break;
5320 case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24);
5321 sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5322 break;
5323 case 4: sw[1] = w0;
5324 break;
5325 case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5326 sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5327 break;
5328 case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5329 sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5330 break;
5331 case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5332 sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5333 break;
5334 case 8: sw[2] = w0;
5335 break;
5336 case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5337 sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5338 break;
5339 case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5340 sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5341 break;
5342 case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5343 sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5344 break;
5345 case 12: sw[3] = w0;
5346 break;
5347 case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5348 sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5349 break;
5350 case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5351 sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5352 break;
5353 case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5354 sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5355 break;
5356 case 16: sw[4] = w0;
5357 break;
5358 case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5359 sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5360 break;
5361 case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5362 sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5363 break;
5364 case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5365 sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5366 break;
5367 case 20: sw[5] = w0;
5368 break;
5369 case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5370 sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5371 break;
5372 case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5373 sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5374 break;
5375 case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5376 sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5377 break;
5378 case 24: sw[6] = w0;
5379 break;
5380 case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5381 sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5382 break;
5383 case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5384 sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5385 break;
5386 case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5387 sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5388 break;
5389 case 28: sw[7] = w0;
5390 break;
5391 case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5392 sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24);
5393 break;
5394 case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5395 sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16);
5396 break;
5397 case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5398 sw[8] = (sw[8] & 0xff000000) | (w0 >> 8);
5399 break;
5400 }
5401 #endif
5402 }
5403
5404 inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len)
5405 {
5406 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5407
5408 switch (salt_len)
5409 {
5410 case 0: sw[0] = w0;
5411 break;
5412 case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8);
5413 sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5414 break;
5415 case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16);
5416 sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5417 break;
5418 case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24);
5419 sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5420 break;
5421 case 4: sw[1] = w0;
5422 break;
5423 case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5424 sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5425 break;
5426 case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5427 sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5428 break;
5429 case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5430 sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5431 break;
5432 case 8: sw[2] = w0;
5433 break;
5434 case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5435 sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5436 break;
5437 case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5438 sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5439 break;
5440 case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5441 sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5442 break;
5443 case 12: sw[3] = w0;
5444 break;
5445 case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5446 sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5447 break;
5448 case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5449 sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5450 break;
5451 case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5452 sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5453 break;
5454 case 16: sw[4] = w0;
5455 break;
5456 case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5457 sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5458 break;
5459 case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5460 sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5461 break;
5462 case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5463 sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5464 break;
5465 case 20: sw[5] = w0;
5466 break;
5467 case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5468 sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5469 break;
5470 case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5471 sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5472 break;
5473 case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5474 sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5475 break;
5476 case 24: sw[6] = w0;
5477 break;
5478 case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5479 sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5480 break;
5481 case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5482 sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5483 break;
5484 case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5485 sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5486 break;
5487 case 28: sw[7] = w0;
5488 break;
5489 case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5490 sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24);
5491 break;
5492 case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5493 sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16);
5494 break;
5495 case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5496 sw[8] = (sw[8] & 0x000000ff) | (w0 << 8);
5497 break;
5498 }
5499 }
5500
5501 inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5502 {
5503 #if defined cl_amd_media_ops
5504 switch (salt_len)
5505 {
5506 case 0: w0[0] = wx;
5507 break;
5508 case 1: w0[0] = amd_bytealign (wx, w0[0] << 24, 3);
5509 w0[1] = amd_bytealign (w0[1] >> 8, wx, 3);
5510 break;
5511 case 2: w0[0] = amd_bytealign (wx, w0[0] << 16, 2);
5512 w0[1] = amd_bytealign (w0[1] >> 16, wx, 2);
5513 break;
5514 case 3: w0[0] = amd_bytealign (wx, w0[0] << 8, 1);
5515 w0[1] = amd_bytealign (w0[1] >> 24, wx, 1);
5516 break;
5517 case 4: w0[1] = wx;
5518 break;
5519 case 5: w0[1] = amd_bytealign (wx, w0[1] << 24, 3);
5520 w0[2] = amd_bytealign (w0[2] >> 8, wx, 3);
5521 break;
5522 case 6: w0[1] = amd_bytealign (wx, w0[1] << 16, 2);
5523 w0[2] = amd_bytealign (w0[2] >> 16, wx, 2);
5524 break;
5525 case 7: w0[1] = amd_bytealign (wx, w0[1] << 8, 1);
5526 w0[2] = amd_bytealign (w0[2] >> 24, wx, 1);
5527 break;
5528 case 8: w0[2] = wx;
5529 break;
5530 case 9: w0[2] = amd_bytealign (wx, w0[2] << 24, 3);
5531 w0[3] = amd_bytealign (w0[3] >> 8, wx, 3);
5532 break;
5533 case 10: w0[2] = amd_bytealign (wx, w0[2] << 16, 2);
5534 w0[3] = amd_bytealign (w0[3] >> 16, wx, 2);
5535 break;
5536 case 11: w0[2] = amd_bytealign (wx, w0[2] << 8, 1);
5537 w0[3] = amd_bytealign (w0[3] >> 24, wx, 1);
5538 break;
5539 case 12: w0[3] = wx;
5540 break;
5541 case 13: w0[3] = amd_bytealign (wx, w0[3] << 24, 3);
5542 w1[0] = amd_bytealign (w1[0] >> 8, wx, 3);
5543 break;
5544 case 14: w0[3] = amd_bytealign (wx, w0[3] << 16, 2);
5545 w1[0] = amd_bytealign (w1[0] >> 16, wx, 2);
5546 break;
5547 case 15: w0[3] = amd_bytealign (wx, w0[3] << 8, 1);
5548 w1[0] = amd_bytealign (w1[0] >> 24, wx, 1);
5549 break;
5550 case 16: w1[0] = wx;
5551 break;
5552 case 17: w1[0] = amd_bytealign (wx, w1[0] << 24, 3);
5553 w1[1] = amd_bytealign (w1[1] >> 8, wx, 3);
5554 break;
5555 case 18: w1[0] = amd_bytealign (wx, w1[0] << 16, 2);
5556 w1[1] = amd_bytealign (w1[1] >> 16, wx, 2);
5557 break;
5558 case 19: w1[0] = amd_bytealign (wx, w1[0] << 8, 1);
5559 w1[1] = amd_bytealign (w1[1] >> 24, wx, 1);
5560 break;
5561 case 20: w1[1] = wx;
5562 break;
5563 case 21: w1[1] = amd_bytealign (wx, w1[1] << 24, 3);
5564 w1[2] = amd_bytealign (w1[2] >> 8, wx, 3);
5565 break;
5566 case 22: w1[1] = amd_bytealign (wx, w1[1] << 16, 2);
5567 w1[2] = amd_bytealign (w1[2] >> 16, wx, 2);
5568 break;
5569 case 23: w1[1] = amd_bytealign (wx, w1[1] << 8, 1);
5570 w1[2] = amd_bytealign (w1[2] >> 24, wx, 1);
5571 break;
5572 case 24: w1[2] = wx;
5573 break;
5574 case 25: w1[2] = amd_bytealign (wx, w1[2] << 24, 3);
5575 w1[3] = amd_bytealign (w1[3] >> 8, wx, 3);
5576 break;
5577 case 26: w1[2] = amd_bytealign (wx, w1[2] << 16, 2);
5578 w1[3] = amd_bytealign (w1[3] >> 16, wx, 2);
5579 break;
5580 case 27: w1[2] = amd_bytealign (wx, w1[2] << 8, 1);
5581 w1[3] = amd_bytealign (w1[3] >> 24, wx, 1);
5582 break;
5583 case 28: w1[3] = wx;
5584 break;
5585 case 29: w1[3] = amd_bytealign (wx, w1[3] << 24, 3);
5586 w2[0] = amd_bytealign (w2[0] >> 8, wx, 3);
5587 break;
5588 case 30: w1[3] = amd_bytealign (wx, w1[3] << 16, 2);
5589 w2[0] = amd_bytealign (w2[0] >> 16, wx, 2);
5590 break;
5591 case 31: w1[3] = amd_bytealign (wx, w1[3] << 8, 1);
5592 w2[0] = amd_bytealign (w2[0] >> 24, wx, 1);
5593 break;
5594 case 32: w2[0] = wx;
5595 break;
5596 case 33: w2[0] = amd_bytealign (wx, w2[0] << 24, 3);
5597 w2[1] = amd_bytealign (w2[1] >> 8, wx, 3);
5598 break;
5599 case 34: w2[0] = amd_bytealign (wx, w2[0] << 16, 2);
5600 w2[1] = amd_bytealign (w2[1] >> 16, wx, 2);
5601 break;
5602 case 35: w2[0] = amd_bytealign (wx, w2[0] << 8, 1);
5603 w2[1] = amd_bytealign (w2[1] >> 24, wx, 1);
5604 break;
5605 case 36: w2[1] = wx;
5606 break;
5607 case 37: w2[1] = amd_bytealign (wx, w2[1] << 24, 3);
5608 w2[2] = amd_bytealign (w2[2] >> 8, wx, 3);
5609 break;
5610 case 38: w2[1] = amd_bytealign (wx, w2[1] << 16, 2);
5611 w2[2] = amd_bytealign (w2[2] >> 16, wx, 2);
5612 break;
5613 case 39: w2[1] = amd_bytealign (wx, w2[1] << 8, 1);
5614 w2[2] = amd_bytealign (w2[2] >> 24, wx, 1);
5615 break;
5616 case 40: w2[2] = wx;
5617 break;
5618 case 41: w2[2] = amd_bytealign (wx, w2[2] << 24, 3);
5619 w2[3] = amd_bytealign (w2[3] >> 8, wx, 3);
5620 break;
5621 case 42: w2[2] = amd_bytealign (wx, w2[2] << 16, 2);
5622 w2[3] = amd_bytealign (w2[3] >> 16, wx, 2);
5623 break;
5624 case 43: w2[2] = amd_bytealign (wx, w2[2] << 8, 1);
5625 w2[3] = amd_bytealign (w2[3] >> 24, wx, 1);
5626 break;
5627 case 44: w2[3] = wx;
5628 break;
5629 case 45: w2[3] = amd_bytealign (wx, w2[3] << 24, 3);
5630 w3[0] = amd_bytealign (w3[0] >> 8, wx, 3);
5631 break;
5632 case 46: w2[3] = amd_bytealign (wx, w2[3] << 16, 2);
5633 w3[0] = amd_bytealign (w3[0] >> 16, wx, 2);
5634 break;
5635 case 47: w2[3] = amd_bytealign (wx, w2[3] << 8, 1);
5636 w3[0] = amd_bytealign (w3[0] >> 24, wx, 1);
5637 break;
5638 case 48: w3[0] = wx;
5639 break;
5640 case 49: w3[0] = amd_bytealign (wx, w3[0] << 24, 3);
5641 w3[1] = amd_bytealign (w3[1] >> 8, wx, 3);
5642 break;
5643 case 50: w3[0] = amd_bytealign (wx, w3[0] << 16, 2);
5644 w3[1] = amd_bytealign (w3[1] >> 16, wx, 2);
5645 break;
5646 case 51: w3[0] = amd_bytealign (wx, w3[0] << 8, 1);
5647 w3[1] = amd_bytealign (w3[1] >> 24, wx, 1);
5648 break;
5649 case 52: w3[1] = wx;
5650 break;
5651 case 53: w3[1] = amd_bytealign (wx, w3[1] << 24, 3);
5652 w3[2] = amd_bytealign (w3[2] >> 8, wx, 3);
5653 break;
5654 case 54: w3[1] = amd_bytealign (wx, w3[1] << 16, 2);
5655 w3[2] = amd_bytealign (w3[2] >> 16, wx, 2);
5656 break;
5657 case 55: w3[1] = amd_bytealign (wx, w3[1] << 8, 1);
5658 w3[2] = amd_bytealign (w3[2] >> 24, wx, 1);
5659 break;
5660 case 56: w3[2] = wx;
5661 break;
5662 case 57: w3[2] = amd_bytealign (wx, w3[2] << 24, 3);
5663 w3[3] = amd_bytealign (w3[3] >> 8, wx, 3);
5664 break;
5665 case 58: w3[2] = amd_bytealign (wx, w3[2] << 16, 2);
5666 w3[3] = amd_bytealign (w3[3] >> 16, wx, 2);
5667 break;
5668 case 59: w3[2] = amd_bytealign (wx, w3[2] << 8, 1);
5669 w3[3] = amd_bytealign (w3[3] >> 24, wx, 1);
5670 break;
5671 case 60: w3[3] = wx;
5672 break;
5673 case 61: w3[3] = amd_bytealign (wx, w3[3] << 24, 3);
5674 //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
5675 break;
5676 case 62: w3[3] = amd_bytealign (wx, w3[3] << 16, 2);
5677 //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
5678 break;
5679 case 63: w3[3] = amd_bytealign (wx, w3[3] << 8, 1);
5680 //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
5681 break;
5682 }
5683 #else
5684 switch (salt_len)
5685 {
5686 case 0: w0[0] = wx;
5687 break;
5688 case 1: w0[0] = (w0[0] & 0x000000ff) | (wx << 8);
5689 w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5690 break;
5691 case 2: w0[0] = (w0[0] & 0x0000ffff) | (wx << 16);
5692 w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5693 break;
5694 case 3: w0[0] = (w0[0] & 0x00ffffff) | (wx << 24);
5695 w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5696 break;
5697 case 4: w0[1] = wx;
5698 break;
5699 case 5: w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5700 w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5701 break;
5702 case 6: w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5703 w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5704 break;
5705 case 7: w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5706 w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5707 break;
5708 case 8: w0[2] = wx;
5709 break;
5710 case 9: w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5711 w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5712 break;
5713 case 10: w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5714 w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5715 break;
5716 case 11: w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5717 w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5718 break;
5719 case 12: w0[3] = wx;
5720 break;
5721 case 13: w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5722 w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5723 break;
5724 case 14: w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5725 w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5726 break;
5727 case 15: w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5728 w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5729 break;
5730 case 16: w1[0] = wx;
5731 break;
5732 case 17: w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5733 w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5734 break;
5735 case 18: w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5736 w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5737 break;
5738 case 19: w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5739 w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5740 break;
5741 case 20: w1[1] = wx;
5742 break;
5743 case 21: w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5744 w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5745 break;
5746 case 22: w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5747 w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5748 break;
5749 case 23: w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5750 w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5751 break;
5752 case 24: w1[2] = wx;
5753 break;
5754 case 25: w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5755 w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5756 break;
5757 case 26: w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5758 w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5759 break;
5760 case 27: w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5761 w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5762 break;
5763 case 28: w1[3] = wx;
5764 break;
5765 case 29: w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5766 w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5767 break;
5768 case 30: w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5769 w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5770 break;
5771 case 31: w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5772 w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5773 break;
5774 case 32: w2[0] = wx;
5775 break;
5776 case 33: w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5777 w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5778 break;
5779 case 34: w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5780 w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5781 break;
5782 case 35: w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5783 w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5784 break;
5785 case 36: w2[1] = wx;
5786 break;
5787 case 37: w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5788 w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5789 break;
5790 case 38: w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5791 w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5792 break;
5793 case 39: w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5794 w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5795 break;
5796 case 40: w2[2] = wx;
5797 break;
5798 case 41: w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5799 w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
5800 break;
5801 case 42: w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5802 w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
5803 break;
5804 case 43: w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5805 w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5806 break;
5807 case 44: w2[3] = wx;
5808 break;
5809 case 45: w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5810 w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
5811 break;
5812 case 46: w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5813 w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
5814 break;
5815 case 47: w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5816 w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
5817 break;
5818 case 48: w3[0] = wx;
5819 break;
5820 case 49: w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
5821 w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
5822 break;
5823 case 50: w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
5824 w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
5825 break;
5826 case 51: w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
5827 w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
5828 break;
5829 case 52: w3[1] = wx;
5830 break;
5831 case 53: w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
5832 w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
5833 break;
5834 case 54: w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
5835 w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
5836 break;
5837 case 55: w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
5838 w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
5839 break;
5840 case 56: w3[2] = wx;
5841 break;
5842 case 57: w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
5843 w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
5844 break;
5845 case 58: w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
5846 w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
5847 break;
5848 case 59: w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
5849 w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
5850 break;
5851 case 60: w3[3] = wx;
5852 break;
5853 case 61: w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
5854 //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
5855 break;
5856 case 62: w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
5857 //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
5858 break;
5859 case 63: w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
5860 //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
5861 break;
5862 }
5863 #endif
5864 }
5865
5866 inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5867 {
5868 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5869
5870 switch (salt_len)
5871 {
5872 case 0: w0[0] = wx;
5873 break;
5874 case 1: w0[0] = (w0[0] & 0xff000000) | (wx >> 8);
5875 w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5876 break;
5877 case 2: w0[0] = (w0[0] & 0xffff0000) | (wx >> 16);
5878 w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5879 break;
5880 case 3: w0[0] = (w0[0] & 0xffffff00) | (wx >> 24);
5881 w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5882 break;
5883 case 4: w0[1] = wx;
5884 break;
5885 case 5: w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5886 w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5887 break;
5888 case 6: w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5889 w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5890 break;
5891 case 7: w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5892 w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5893 break;
5894 case 8: w0[2] = wx;
5895 break;
5896 case 9: w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5897 w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5898 break;
5899 case 10: w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5900 w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5901 break;
5902 case 11: w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5903 w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5904 break;
5905 case 12: w0[3] = wx;
5906 break;
5907 case 13: w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5908 w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5909 break;
5910 case 14: w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5911 w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5912 break;
5913 case 15: w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5914 w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5915 break;
5916 case 16: w1[0] = wx;
5917 break;
5918 case 17: w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5919 w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5920 break;
5921 case 18: w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5922 w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5923 break;
5924 case 19: w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5925 w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5926 break;
5927 case 20: w1[1] = wx;
5928 break;
5929 case 21: w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5930 w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5931 break;
5932 case 22: w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5933 w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5934 break;
5935 case 23: w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5936 w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5937 break;
5938 case 24: w1[2] = wx;
5939 break;
5940 case 25: w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5941 w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5942 break;
5943 case 26: w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5944 w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5945 break;
5946 case 27: w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5947 w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5948 break;
5949 case 28: w1[3] = wx;
5950 break;
5951 case 29: w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5952 w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5953 break;
5954 case 30: w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5955 w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5956 break;
5957 case 31: w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5958 w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5959 break;
5960 case 32: w2[0] = wx;
5961 break;
5962 case 33: w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5963 w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5964 break;
5965 case 34: w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5966 w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5967 break;
5968 case 35: w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5969 w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5970 break;
5971 case 36: w2[1] = wx;
5972 break;
5973 case 37: w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5974 w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5975 break;
5976 case 38: w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5977 w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5978 break;
5979 case 39: w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5980 w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5981 break;
5982 case 40: w2[2] = wx;
5983 break;
5984 case 41: w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5985 w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5986 break;
5987 case 42: w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5988 w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5989 break;
5990 case 43: w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5991 w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5992 break;
5993 case 44: w2[3] = wx;
5994 break;
5995 case 45: w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5996 w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
5997 break;
5998 case 46: w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
5999 w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
6000 break;
6001 case 47: w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
6002 w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
6003 break;
6004 case 48: w3[0] = wx;
6005 break;
6006 case 49: w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
6007 w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
6008 break;
6009 case 50: w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
6010 w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
6011 break;
6012 case 51: w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
6013 w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
6014 break;
6015 case 52: w3[1] = wx;
6016 break;
6017 case 53: w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
6018 w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
6019 break;
6020 case 54: w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
6021 w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
6022 break;
6023 case 55: w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
6024 w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
6025 break;
6026 case 56: w3[2] = wx;
6027 break;
6028 case 57: w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
6029 w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
6030 break;
6031 case 58: w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
6032 w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
6033 break;
6034 case 59: w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
6035 w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
6036 break;
6037 case 60: w3[3] = wx;
6038 break;
6039 case 61: w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
6040 //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
6041 break;
6042 case 62: w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
6043 //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
6044 break;
6045 case 63: w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
6046 //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
6047 break;
6048 }
6049 }
6050
6051 /**
6052 * vector functions as scalar (for outer loop usage)
6053 */
6054
6055 inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
6056 {
6057 switch (offset)
6058 {
6059 case 0:
6060 w0[0] = 0x01;
6061 break;
6062
6063 case 1:
6064 w0[0] = w0[0] | 0x0100;
6065 break;
6066
6067 case 2:
6068 w0[0] = w0[0] | 0x010000;
6069 break;
6070
6071 case 3:
6072 w0[0] = w0[0] | 0x01000000;
6073 break;
6074
6075 case 4:
6076 w0[1] = 0x01;
6077 break;
6078
6079 case 5:
6080 w0[1] = w0[1] | 0x0100;
6081 break;
6082
6083 case 6:
6084 w0[1] = w0[1] | 0x010000;
6085 break;
6086
6087 case 7:
6088 w0[1] = w0[1] | 0x01000000;
6089 break;
6090
6091 case 8:
6092 w0[2] = 0x01;
6093 break;
6094
6095 case 9:
6096 w0[2] = w0[2] | 0x0100;
6097 break;
6098
6099 case 10:
6100 w0[2] = w0[2] | 0x010000;
6101 break;
6102
6103 case 11:
6104 w0[2] = w0[2] | 0x01000000;
6105 break;
6106
6107 case 12:
6108 w0[3] = 0x01;
6109 break;
6110
6111 case 13:
6112 w0[3] = w0[3] | 0x0100;
6113 break;
6114
6115 case 14:
6116 w0[3] = w0[3] | 0x010000;
6117 break;
6118
6119 case 15:
6120 w0[3] = w0[3] | 0x01000000;
6121 break;
6122
6123 case 16:
6124 w1[0] = 0x01;
6125 break;
6126
6127 case 17:
6128 w1[0] = w1[0] | 0x0100;
6129 break;
6130
6131 case 18:
6132 w1[0] = w1[0] | 0x010000;
6133 break;
6134
6135 case 19:
6136 w1[0] = w1[0] | 0x01000000;
6137 break;
6138
6139 case 20:
6140 w1[1] = 0x01;
6141 break;
6142
6143 case 21:
6144 w1[1] = w1[1] | 0x0100;
6145 break;
6146
6147 case 22:
6148 w1[1] = w1[1] | 0x010000;
6149 break;
6150
6151 case 23:
6152 w1[1] = w1[1] | 0x01000000;
6153 break;
6154
6155 case 24:
6156 w1[2] = 0x01;
6157 break;
6158
6159 case 25:
6160 w1[2] = w1[2] | 0x0100;
6161 break;
6162
6163 case 26:
6164 w1[2] = w1[2] | 0x010000;
6165 break;
6166
6167 case 27:
6168 w1[2] = w1[2] | 0x01000000;
6169 break;
6170
6171 case 28:
6172 w1[3] = 0x01;
6173 break;
6174
6175 case 29:
6176 w1[3] = w1[3] | 0x0100;
6177 break;
6178
6179 case 30:
6180 w1[3] = w1[3] | 0x010000;
6181 break;
6182
6183 case 31:
6184 w1[3] = w1[3] | 0x01000000;
6185 break;
6186 }
6187 }
6188
6189 inline void append_0x01_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
6190 {
6191 switch (offset)
6192 {
6193 case 0:
6194 w0[0] = 0x01;
6195 break;
6196
6197 case 1:
6198 w0[0] = w0[0] | 0x0100;
6199 break;
6200
6201 case 2:
6202 w0[0] = w0[0] | 0x010000;
6203 break;
6204
6205 case 3:
6206 w0[0] = w0[0] | 0x01000000;
6207 break;
6208
6209 case 4:
6210 w0[1] = 0x01;
6211 break;
6212
6213 case 5:
6214 w0[1] = w0[1] | 0x0100;
6215 break;
6216
6217 case 6:
6218 w0[1] = w0[1] | 0x010000;
6219 break;
6220
6221 case 7:
6222 w0[1] = w0[1] | 0x01000000;
6223 break;
6224
6225 case 8:
6226 w0[2] = 0x01;
6227 break;
6228
6229 case 9:
6230 w0[2] = w0[2] | 0x0100;
6231 break;
6232
6233 case 10:
6234 w0[2] = w0[2] | 0x010000;
6235 break;
6236
6237 case 11:
6238 w0[2] = w0[2] | 0x01000000;
6239 break;
6240
6241 case 12:
6242 w0[3] = 0x01;
6243 break;
6244
6245 case 13:
6246 w0[3] = w0[3] | 0x0100;
6247 break;
6248
6249 case 14:
6250 w0[3] = w0[3] | 0x010000;
6251 break;
6252
6253 case 15:
6254 w0[3] = w0[3] | 0x01000000;
6255 break;
6256
6257 case 16:
6258 w1[0] = 0x01;
6259 break;
6260
6261 case 17:
6262 w1[0] = w1[0] | 0x0100;
6263 break;
6264
6265 case 18:
6266 w1[0] = w1[0] | 0x010000;
6267 break;
6268
6269 case 19:
6270 w1[0] = w1[0] | 0x01000000;
6271 break;
6272
6273 case 20:
6274 w1[1] = 0x01;
6275 break;
6276
6277 case 21:
6278 w1[1] = w1[1] | 0x0100;
6279 break;
6280
6281 case 22:
6282 w1[1] = w1[1] | 0x010000;
6283 break;
6284
6285 case 23:
6286 w1[1] = w1[1] | 0x01000000;
6287 break;
6288
6289 case 24:
6290 w1[2] = 0x01;
6291 break;
6292
6293 case 25:
6294 w1[2] = w1[2] | 0x0100;
6295 break;
6296
6297 case 26:
6298 w1[2] = w1[2] | 0x010000;
6299 break;
6300
6301 case 27:
6302 w1[2] = w1[2] | 0x01000000;
6303 break;
6304
6305 case 28:
6306 w1[3] = 0x01;
6307 break;
6308
6309 case 29:
6310 w1[3] = w1[3] | 0x0100;
6311 break;
6312
6313 case 30:
6314 w1[3] = w1[3] | 0x010000;
6315 break;
6316
6317 case 31:
6318 w1[3] = w1[3] | 0x01000000;
6319 break;
6320
6321 case 32:
6322 w2[0] = 0x01;
6323 break;
6324
6325 case 33:
6326 w2[0] = w2[0] | 0x0100;
6327 break;
6328
6329 case 34:
6330 w2[0] = w2[0] | 0x010000;
6331 break;
6332
6333 case 35:
6334 w2[0] = w2[0] | 0x01000000;
6335 break;
6336
6337 case 36:
6338 w2[1] = 0x01;
6339 break;
6340
6341 case 37:
6342 w2[1] = w2[1] | 0x0100;
6343 break;
6344
6345 case 38:
6346 w2[1] = w2[1] | 0x010000;
6347 break;
6348
6349 case 39:
6350 w2[1] = w2[1] | 0x01000000;
6351 break;
6352
6353 case 40:
6354 w2[2] = 0x01;
6355 break;
6356
6357 case 41:
6358 w2[2] = w2[2] | 0x0100;
6359 break;
6360
6361 case 42:
6362 w2[2] = w2[2] | 0x010000;
6363 break;
6364
6365 case 43:
6366 w2[2] = w2[2] | 0x01000000;
6367 break;
6368
6369 case 44:
6370 w2[3] = 0x01;
6371 break;
6372
6373 case 45:
6374 w2[3] = w2[3] | 0x0100;
6375 break;
6376
6377 case 46:
6378 w2[3] = w2[3] | 0x010000;
6379 break;
6380
6381 case 47:
6382 w2[3] = w2[3] | 0x01000000;
6383 break;
6384 }
6385 }
6386
6387 inline void append_0x02_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
6388 {
6389 switch (offset)
6390 {
6391 case 0:
6392 w0[0] = 0x02;
6393 break;
6394
6395 case 1:
6396 w0[0] = w0[0] | 0x0200;
6397 break;
6398
6399 case 2:
6400 w0[0] = w0[0] | 0x020000;
6401 break;
6402
6403 case 3:
6404 w0[0] = w0[0] | 0x02000000;
6405 break;
6406
6407 case 4:
6408 w0[1] = 0x02;
6409 break;
6410
6411 case 5:
6412 w0[1] = w0[1] | 0x0200;
6413 break;
6414
6415 case 6:
6416 w0[1] = w0[1] | 0x020000;
6417 break;
6418
6419 case 7:
6420 w0[1] = w0[1] | 0x02000000;
6421 break;
6422
6423 case 8:
6424 w0[2] = 0x02;
6425 break;
6426
6427 case 9:
6428 w0[2] = w0[2] | 0x0200;
6429 break;
6430
6431 case 10:
6432 w0[2] = w0[2] | 0x020000;
6433 break;
6434
6435 case 11:
6436 w0[2] = w0[2] | 0x02000000;
6437 break;
6438
6439 case 12:
6440 w0[3] = 0x02;
6441 break;
6442
6443 case 13:
6444 w0[3] = w0[3] | 0x0200;
6445 break;
6446
6447 case 14:
6448 w0[3] = w0[3] | 0x020000;
6449 break;
6450
6451 case 15:
6452 w0[3] = w0[3] | 0x02000000;
6453 break;
6454
6455 case 16:
6456 w1[0] = 0x02;
6457 break;
6458
6459 case 17:
6460 w1[0] = w1[0] | 0x0200;
6461 break;
6462
6463 case 18:
6464 w1[0] = w1[0] | 0x020000;
6465 break;
6466
6467 case 19:
6468 w1[0] = w1[0] | 0x02000000;
6469 break;
6470
6471 case 20:
6472 w1[1] = 0x02;
6473 break;
6474
6475 case 21:
6476 w1[1] = w1[1] | 0x0200;
6477 break;
6478
6479 case 22:
6480 w1[1] = w1[1] | 0x020000;
6481 break;
6482
6483 case 23:
6484 w1[1] = w1[1] | 0x02000000;
6485 break;
6486
6487 case 24:
6488 w1[2] = 0x02;
6489 break;
6490
6491 case 25:
6492 w1[2] = w1[2] | 0x0200;
6493 break;
6494
6495 case 26:
6496 w1[2] = w1[2] | 0x020000;
6497 break;
6498
6499 case 27:
6500 w1[2] = w1[2] | 0x02000000;
6501 break;
6502
6503 case 28:
6504 w1[3] = 0x02;
6505 break;
6506
6507 case 29:
6508 w1[3] = w1[3] | 0x0200;
6509 break;
6510
6511 case 30:
6512 w1[3] = w1[3] | 0x020000;
6513 break;
6514
6515 case 31:
6516 w1[3] = w1[3] | 0x02000000;
6517 break;
6518 }
6519 }
6520
6521 inline void append_0x02_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
6522 {
6523 switch (offset)
6524 {
6525 case 0:
6526 w0[0] = 0x02;
6527 break;
6528
6529 case 1:
6530 w0[0] = w0[0] | 0x0200;
6531 break;
6532
6533 case 2:
6534 w0[0] = w0[0] | 0x020000;
6535 break;
6536
6537 case 3:
6538 w0[0] = w0[0] | 0x02000000;
6539 break;
6540
6541 case 4:
6542 w0[1] = 0x02;
6543 break;
6544
6545 case 5:
6546 w0[1] = w0[1] | 0x0200;
6547 break;
6548
6549 case 6:
6550 w0[1] = w0[1] | 0x020000;
6551 break;
6552
6553 case 7:
6554 w0[1] = w0[1] | 0x02000000;
6555 break;
6556
6557 case 8:
6558 w0[2] = 0x02;
6559 break;
6560
6561 case 9:
6562 w0[2] = w0[2] | 0x0200;
6563 break;
6564
6565 case 10:
6566 w0[2] = w0[2] | 0x020000;
6567 break;
6568
6569 case 11:
6570 w0[2] = w0[2] | 0x02000000;
6571 break;
6572
6573 case 12:
6574 w0[3] = 0x02;
6575 break;
6576
6577 case 13:
6578 w0[3] = w0[3] | 0x0200;
6579 break;
6580
6581 case 14:
6582 w0[3] = w0[3] | 0x020000;
6583 break;
6584
6585 case 15:
6586 w0[3] = w0[3] | 0x02000000;
6587 break;
6588
6589 case 16:
6590 w1[0] = 0x02;
6591 break;
6592
6593 case 17:
6594 w1[0] = w1[0] | 0x0200;
6595 break;
6596
6597 case 18:
6598 w1[0] = w1[0] | 0x020000;
6599 break;
6600
6601 case 19:
6602 w1[0] = w1[0] | 0x02000000;
6603 break;
6604
6605 case 20:
6606 w1[1] = 0x02;
6607 break;
6608
6609 case 21:
6610 w1[1] = w1[1] | 0x0200;
6611 break;
6612
6613 case 22:
6614 w1[1] = w1[1] | 0x020000;
6615 break;
6616
6617 case 23:
6618 w1[1] = w1[1] | 0x02000000;
6619 break;
6620
6621 case 24:
6622 w1[2] = 0x02;
6623 break;
6624
6625 case 25:
6626 w1[2] = w1[2] | 0x0200;
6627 break;
6628
6629 case 26:
6630 w1[2] = w1[2] | 0x020000;
6631 break;
6632
6633 case 27:
6634 w1[2] = w1[2] | 0x02000000;
6635 break;
6636
6637 case 28:
6638 w1[3] = 0x02;
6639 break;
6640
6641 case 29:
6642 w1[3] = w1[3] | 0x0200;
6643 break;
6644
6645 case 30:
6646 w1[3] = w1[3] | 0x020000;
6647 break;
6648
6649 case 31:
6650 w1[3] = w1[3] | 0x02000000;
6651 break;
6652
6653 case 32:
6654 w2[0] = 0x02;
6655 break;
6656
6657 case 33:
6658 w2[0] = w2[0] | 0x0200;
6659 break;
6660
6661 case 34:
6662 w2[0] = w2[0] | 0x020000;
6663 break;
6664
6665 case 35:
6666 w2[0] = w2[0] | 0x02000000;
6667 break;
6668
6669 case 36:
6670 w2[1] = 0x02;
6671 break;
6672
6673 case 37:
6674 w2[1] = w2[1] | 0x0200;
6675 break;
6676
6677 case 38:
6678 w2[1] = w2[1] | 0x020000;
6679 break;
6680
6681 case 39:
6682 w2[1] = w2[1] | 0x02000000;
6683 break;
6684
6685 case 40:
6686 w2[2] = 0x02;
6687 break;
6688
6689 case 41:
6690 w2[2] = w2[2] | 0x0200;
6691 break;
6692
6693 case 42:
6694 w2[2] = w2[2] | 0x020000;
6695 break;
6696
6697 case 43:
6698 w2[2] = w2[2] | 0x02000000;
6699 break;
6700
6701 case 44:
6702 w2[3] = 0x02;
6703 break;
6704
6705 case 45:
6706 w2[3] = w2[3] | 0x0200;
6707 break;
6708
6709 case 46:
6710 w2[3] = w2[3] | 0x020000;
6711 break;
6712
6713 case 47:
6714 w2[3] = w2[3] | 0x02000000;
6715 break;
6716 }
6717 }
6718
6719 inline void append_0x80_1x4_S (u32 w0[4], const u32 offset)
6720 {
6721 switch (offset)
6722 {
6723 case 0:
6724 w0[0] = 0x80;
6725 break;
6726
6727 case 1:
6728 w0[0] = w0[0] | 0x8000;
6729 break;
6730
6731 case 2:
6732 w0[0] = w0[0] | 0x800000;
6733 break;
6734
6735 case 3:
6736 w0[0] = w0[0] | 0x80000000;
6737 break;
6738
6739 case 4:
6740 w0[1] = 0x80;
6741 break;
6742
6743 case 5:
6744 w0[1] = w0[1] | 0x8000;
6745 break;
6746
6747 case 6:
6748 w0[1] = w0[1] | 0x800000;
6749 break;
6750
6751 case 7:
6752 w0[1] = w0[1] | 0x80000000;
6753 break;
6754
6755 case 8:
6756 w0[2] = 0x80;
6757 break;
6758
6759 case 9:
6760 w0[2] = w0[2] | 0x8000;
6761 break;
6762
6763 case 10:
6764 w0[2] = w0[2] | 0x800000;
6765 break;
6766
6767 case 11:
6768 w0[2] = w0[2] | 0x80000000;
6769 break;
6770
6771 case 12:
6772 w0[3] = 0x80;
6773 break;
6774
6775 case 13:
6776 w0[3] = w0[3] | 0x8000;
6777 break;
6778
6779 case 14:
6780 w0[3] = w0[3] | 0x800000;
6781 break;
6782
6783 case 15:
6784 w0[3] = w0[3] | 0x80000000;
6785 break;
6786 }
6787 }
6788
6789 inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
6790 {
6791 switch (offset)
6792 {
6793 case 0:
6794 w0[0] = 0x80;
6795 break;
6796
6797 case 1:
6798 w0[0] = w0[0] | 0x8000;
6799 break;
6800
6801 case 2:
6802 w0[0] = w0[0] | 0x800000;
6803 break;
6804
6805 case 3:
6806 w0[0] = w0[0] | 0x80000000;
6807 break;
6808
6809 case 4:
6810 w0[1] = 0x80;
6811 break;
6812
6813 case 5:
6814 w0[1] = w0[1] | 0x8000;
6815 break;
6816
6817 case 6:
6818 w0[1] = w0[1] | 0x800000;
6819 break;
6820
6821 case 7:
6822 w0[1] = w0[1] | 0x80000000;
6823 break;
6824
6825 case 8:
6826 w0[2] = 0x80;
6827 break;
6828
6829 case 9:
6830 w0[2] = w0[2] | 0x8000;
6831 break;
6832
6833 case 10:
6834 w0[2] = w0[2] | 0x800000;
6835 break;
6836
6837 case 11:
6838 w0[2] = w0[2] | 0x80000000;
6839 break;
6840
6841 case 12:
6842 w0[3] = 0x80;
6843 break;
6844
6845 case 13:
6846 w0[3] = w0[3] | 0x8000;
6847 break;
6848
6849 case 14:
6850 w0[3] = w0[3] | 0x800000;
6851 break;
6852
6853 case 15:
6854 w0[3] = w0[3] | 0x80000000;
6855 break;
6856
6857 case 16:
6858 w1[0] = 0x80;
6859 break;
6860
6861 case 17:
6862 w1[0] = w1[0] | 0x8000;
6863 break;
6864
6865 case 18:
6866 w1[0] = w1[0] | 0x800000;
6867 break;
6868
6869 case 19:
6870 w1[0] = w1[0] | 0x80000000;
6871 break;
6872
6873 case 20:
6874 w1[1] = 0x80;
6875 break;
6876
6877 case 21:
6878 w1[1] = w1[1] | 0x8000;
6879 break;
6880
6881 case 22:
6882 w1[1] = w1[1] | 0x800000;
6883 break;
6884
6885 case 23:
6886 w1[1] = w1[1] | 0x80000000;
6887 break;
6888
6889 case 24:
6890 w1[2] = 0x80;
6891 break;
6892
6893 case 25:
6894 w1[2] = w1[2] | 0x8000;
6895 break;
6896
6897 case 26:
6898 w1[2] = w1[2] | 0x800000;
6899 break;
6900
6901 case 27:
6902 w1[2] = w1[2] | 0x80000000;
6903 break;
6904
6905 case 28:
6906 w1[3] = 0x80;
6907 break;
6908
6909 case 29:
6910 w1[3] = w1[3] | 0x8000;
6911 break;
6912
6913 case 30:
6914 w1[3] = w1[3] | 0x800000;
6915 break;
6916
6917 case 31:
6918 w1[3] = w1[3] | 0x80000000;
6919 break;
6920 }
6921 }
6922
6923 inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
6924 {
6925 switch (offset)
6926 {
6927 case 0:
6928 w0[0] = 0x80;
6929 break;
6930
6931 case 1:
6932 w0[0] = w0[0] | 0x8000;
6933 break;
6934
6935 case 2:
6936 w0[0] = w0[0] | 0x800000;
6937 break;
6938
6939 case 3:
6940 w0[0] = w0[0] | 0x80000000;
6941 break;
6942
6943 case 4:
6944 w0[1] = 0x80;
6945 break;
6946
6947 case 5:
6948 w0[1] = w0[1] | 0x8000;
6949 break;
6950
6951 case 6:
6952 w0[1] = w0[1] | 0x800000;
6953 break;
6954
6955 case 7:
6956 w0[1] = w0[1] | 0x80000000;
6957 break;
6958
6959 case 8:
6960 w0[2] = 0x80;
6961 break;
6962
6963 case 9:
6964 w0[2] = w0[2] | 0x8000;
6965 break;
6966
6967 case 10:
6968 w0[2] = w0[2] | 0x800000;
6969 break;
6970
6971 case 11:
6972 w0[2] = w0[2] | 0x80000000;
6973 break;
6974
6975 case 12:
6976 w0[3] = 0x80;
6977 break;
6978
6979 case 13:
6980 w0[3] = w0[3] | 0x8000;
6981 break;
6982
6983 case 14:
6984 w0[3] = w0[3] | 0x800000;
6985 break;
6986
6987 case 15:
6988 w0[3] = w0[3] | 0x80000000;
6989 break;
6990
6991 case 16:
6992 w1[0] = 0x80;
6993 break;
6994
6995 case 17:
6996 w1[0] = w1[0] | 0x8000;
6997 break;
6998
6999 case 18:
7000 w1[0] = w1[0] | 0x800000;
7001 break;
7002
7003 case 19:
7004 w1[0] = w1[0] | 0x80000000;
7005 break;
7006
7007 case 20:
7008 w1[1] = 0x80;
7009 break;
7010
7011 case 21:
7012 w1[1] = w1[1] | 0x8000;
7013 break;
7014
7015 case 22:
7016 w1[1] = w1[1] | 0x800000;
7017 break;
7018
7019 case 23:
7020 w1[1] = w1[1] | 0x80000000;
7021 break;
7022
7023 case 24:
7024 w1[2] = 0x80;
7025 break;
7026
7027 case 25:
7028 w1[2] = w1[2] | 0x8000;
7029 break;
7030
7031 case 26:
7032 w1[2] = w1[2] | 0x800000;
7033 break;
7034
7035 case 27:
7036 w1[2] = w1[2] | 0x80000000;
7037 break;
7038
7039 case 28:
7040 w1[3] = 0x80;
7041 break;
7042
7043 case 29:
7044 w1[3] = w1[3] | 0x8000;
7045 break;
7046
7047 case 30:
7048 w1[3] = w1[3] | 0x800000;
7049 break;
7050
7051 case 31:
7052 w1[3] = w1[3] | 0x80000000;
7053 break;
7054
7055 case 32:
7056 w2[0] = 0x80;
7057 break;
7058
7059 case 33:
7060 w2[0] = w2[0] | 0x8000;
7061 break;
7062
7063 case 34:
7064 w2[0] = w2[0] | 0x800000;
7065 break;
7066
7067 case 35:
7068 w2[0] = w2[0] | 0x80000000;
7069 break;
7070
7071 case 36:
7072 w2[1] = 0x80;
7073 break;
7074
7075 case 37:
7076 w2[1] = w2[1] | 0x8000;
7077 break;
7078
7079 case 38:
7080 w2[1] = w2[1] | 0x800000;
7081 break;
7082
7083 case 39:
7084 w2[1] = w2[1] | 0x80000000;
7085 break;
7086
7087 case 40:
7088 w2[2] = 0x80;
7089 break;
7090
7091 case 41:
7092 w2[2] = w2[2] | 0x8000;
7093 break;
7094
7095 case 42:
7096 w2[2] = w2[2] | 0x800000;
7097 break;
7098
7099 case 43:
7100 w2[2] = w2[2] | 0x80000000;
7101 break;
7102
7103 case 44:
7104 w2[3] = 0x80;
7105 break;
7106
7107 case 45:
7108 w2[3] = w2[3] | 0x8000;
7109 break;
7110
7111 case 46:
7112 w2[3] = w2[3] | 0x800000;
7113 break;
7114
7115 case 47:
7116 w2[3] = w2[3] | 0x80000000;
7117 break;
7118 }
7119 }
7120
7121 inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
7122 {
7123 switch (offset)
7124 {
7125 case 0:
7126 w0[0] = 0x80;
7127 break;
7128
7129 case 1:
7130 w0[0] = w0[0] | 0x8000;
7131 break;
7132
7133 case 2:
7134 w0[0] = w0[0] | 0x800000;
7135 break;
7136
7137 case 3:
7138 w0[0] = w0[0] | 0x80000000;
7139 break;
7140
7141 case 4:
7142 w0[1] = 0x80;
7143 break;
7144
7145 case 5:
7146 w0[1] = w0[1] | 0x8000;
7147 break;
7148
7149 case 6:
7150 w0[1] = w0[1] | 0x800000;
7151 break;
7152
7153 case 7:
7154 w0[1] = w0[1] | 0x80000000;
7155 break;
7156
7157 case 8:
7158 w0[2] = 0x80;
7159 break;
7160
7161 case 9:
7162 w0[2] = w0[2] | 0x8000;
7163 break;
7164
7165 case 10:
7166 w0[2] = w0[2] | 0x800000;
7167 break;
7168
7169 case 11:
7170 w0[2] = w0[2] | 0x80000000;
7171 break;
7172
7173 case 12:
7174 w0[3] = 0x80;
7175 break;
7176
7177 case 13:
7178 w0[3] = w0[3] | 0x8000;
7179 break;
7180
7181 case 14:
7182 w0[3] = w0[3] | 0x800000;
7183 break;
7184
7185 case 15:
7186 w0[3] = w0[3] | 0x80000000;
7187 break;
7188
7189 case 16:
7190 w1[0] = 0x80;
7191 break;
7192
7193 case 17:
7194 w1[0] = w1[0] | 0x8000;
7195 break;
7196
7197 case 18:
7198 w1[0] = w1[0] | 0x800000;
7199 break;
7200
7201 case 19:
7202 w1[0] = w1[0] | 0x80000000;
7203 break;
7204
7205 case 20:
7206 w1[1] = 0x80;
7207 break;
7208
7209 case 21:
7210 w1[1] = w1[1] | 0x8000;
7211 break;
7212
7213 case 22:
7214 w1[1] = w1[1] | 0x800000;
7215 break;
7216
7217 case 23:
7218 w1[1] = w1[1] | 0x80000000;
7219 break;
7220
7221 case 24:
7222 w1[2] = 0x80;
7223 break;
7224
7225 case 25:
7226 w1[2] = w1[2] | 0x8000;
7227 break;
7228
7229 case 26:
7230 w1[2] = w1[2] | 0x800000;
7231 break;
7232
7233 case 27:
7234 w1[2] = w1[2] | 0x80000000;
7235 break;
7236
7237 case 28:
7238 w1[3] = 0x80;
7239 break;
7240
7241 case 29:
7242 w1[3] = w1[3] | 0x8000;
7243 break;
7244
7245 case 30:
7246 w1[3] = w1[3] | 0x800000;
7247 break;
7248
7249 case 31:
7250 w1[3] = w1[3] | 0x80000000;
7251 break;
7252
7253 case 32:
7254 w2[0] = 0x80;
7255 break;
7256
7257 case 33:
7258 w2[0] = w2[0] | 0x8000;
7259 break;
7260
7261 case 34:
7262 w2[0] = w2[0] | 0x800000;
7263 break;
7264
7265 case 35:
7266 w2[0] = w2[0] | 0x80000000;
7267 break;
7268
7269 case 36:
7270 w2[1] = 0x80;
7271 break;
7272
7273 case 37:
7274 w2[1] = w2[1] | 0x8000;
7275 break;
7276
7277 case 38:
7278 w2[1] = w2[1] | 0x800000;
7279 break;
7280
7281 case 39:
7282 w2[1] = w2[1] | 0x80000000;
7283 break;
7284
7285 case 40:
7286 w2[2] = 0x80;
7287 break;
7288
7289 case 41:
7290 w2[2] = w2[2] | 0x8000;
7291 break;
7292
7293 case 42:
7294 w2[2] = w2[2] | 0x800000;
7295 break;
7296
7297 case 43:
7298 w2[2] = w2[2] | 0x80000000;
7299 break;
7300
7301 case 44:
7302 w2[3] = 0x80;
7303 break;
7304
7305 case 45:
7306 w2[3] = w2[3] | 0x8000;
7307 break;
7308
7309 case 46:
7310 w2[3] = w2[3] | 0x800000;
7311 break;
7312
7313 case 47:
7314 w2[3] = w2[3] | 0x80000000;
7315 break;
7316
7317 case 48:
7318 w3[0] = 0x80;
7319 break;
7320
7321 case 49:
7322 w3[0] = w3[0] | 0x8000;
7323 break;
7324
7325 case 50:
7326 w3[0] = w3[0] | 0x800000;
7327 break;
7328
7329 case 51:
7330 w3[0] = w3[0] | 0x80000000;
7331 break;
7332
7333 case 52:
7334 w3[1] = 0x80;
7335 break;
7336
7337 case 53:
7338 w3[1] = w3[1] | 0x8000;
7339 break;
7340
7341 case 54:
7342 w3[1] = w3[1] | 0x800000;
7343 break;
7344
7345 case 55:
7346 w3[1] = w3[1] | 0x80000000;
7347 break;
7348
7349 case 56:
7350 w3[2] = 0x80;
7351 break;
7352
7353 case 57:
7354 w3[2] = w3[2] | 0x8000;
7355 break;
7356
7357 case 58:
7358 w3[2] = w3[2] | 0x800000;
7359 break;
7360
7361 case 59:
7362 w3[2] = w3[2] | 0x80000000;
7363 break;
7364
7365 case 60:
7366 w3[3] = 0x80;
7367 break;
7368
7369 case 61:
7370 w3[3] = w3[3] | 0x8000;
7371 break;
7372
7373 case 62:
7374 w3[3] = w3[3] | 0x800000;
7375 break;
7376
7377 case 63:
7378 w3[3] = w3[3] | 0x80000000;
7379 break;
7380 }
7381 }
7382
7383 inline void truncate_block_S (u32 w[4], const u32 len)
7384 {
7385 switch (len)
7386 {
7387 case 0: w[0] &= 0;
7388 w[1] &= 0;
7389 w[2] &= 0;
7390 w[3] &= 0;
7391 break;
7392 case 1: w[0] &= 0x000000FF;
7393 w[1] &= 0;
7394 w[2] &= 0;
7395 w[3] &= 0;
7396 break;
7397 case 2: w[0] &= 0x0000FFFF;
7398 w[1] &= 0;
7399 w[2] &= 0;
7400 w[3] &= 0;
7401 break;
7402 case 3: w[0] &= 0x00FFFFFF;
7403 w[1] &= 0;
7404 w[2] &= 0;
7405 w[3] &= 0;
7406 break;
7407 case 4: w[1] &= 0;
7408 w[2] &= 0;
7409 w[3] &= 0;
7410 break;
7411 case 5: w[1] &= 0x000000FF;
7412 w[2] &= 0;
7413 w[3] &= 0;
7414 break;
7415 case 6: w[1] &= 0x0000FFFF;
7416 w[2] &= 0;
7417 w[3] &= 0;
7418 break;
7419 case 7: w[1] &= 0x00FFFFFF;
7420 w[2] &= 0;
7421 w[3] &= 0;
7422 break;
7423 case 8: w[2] &= 0;
7424 w[3] &= 0;
7425 break;
7426 case 9: w[2] &= 0x000000FF;
7427 w[3] &= 0;
7428 break;
7429 case 10: w[2] &= 0x0000FFFF;
7430 w[3] &= 0;
7431 break;
7432 case 11: w[2] &= 0x00FFFFFF;
7433 w[3] &= 0;
7434 break;
7435 case 12: w[3] &= 0;
7436 break;
7437 case 13: w[3] &= 0x000000FF;
7438 break;
7439 case 14: w[3] &= 0x0000FFFF;
7440 break;
7441 case 15: w[3] &= 0x00FFFFFF;
7442 break;
7443 }
7444 }
7445
7446 inline void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
7447 {
7448 #ifdef IS_NV
7449 out2[3] = __byte_perm_S (in[3], 0, 0x7372);
7450 out2[2] = __byte_perm_S (in[3], 0, 0x7170);
7451 out2[1] = __byte_perm_S (in[2], 0, 0x7372);
7452 out2[0] = __byte_perm_S (in[2], 0, 0x7170);
7453 out1[3] = __byte_perm_S (in[1], 0, 0x7372);
7454 out1[2] = __byte_perm_S (in[1], 0, 0x7170);
7455 out1[1] = __byte_perm_S (in[0], 0, 0x7372);
7456 out1[0] = __byte_perm_S (in[0], 0, 0x7170);
7457 #endif
7458
7459 #if defined IS_AMD || defined IS_GENERIC
7460 out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
7461 out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
7462 out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
7463 out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
7464 out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
7465 out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
7466 out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
7467 out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
7468 #endif
7469 }
7470
7471 inline void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
7472 {
7473 #ifdef IS_NV
7474 out[0] = __byte_perm_S (in1[0], in1[1], 0x6420);
7475 out[1] = __byte_perm_S (in1[2], in1[3], 0x6420);
7476 out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
7477 out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
7478 #endif
7479
7480 #if defined IS_AMD || defined IS_GENERIC
7481 out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
7482 | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
7483 out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
7484 | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
7485 out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
7486 | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
7487 out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
7488 | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
7489 #endif
7490 }
7491
7492 inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
7493 {
7494 #if defined IS_AMD || defined IS_GENERIC
7495 const int offset_mod_4 = offset & 3;
7496
7497 const int offset_minus_4 = 4 - offset;
7498
7499 switch (offset / 4)
7500 {
7501 case 0:
7502 w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4);
7503 w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4);
7504 w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7505 w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7506 w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7507 w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7508 w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7509 w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7510 w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7511 w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7512 w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7513 w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7514 w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7515 w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7516 w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7517
7518 if (offset_mod_4 == 0)
7519 {
7520 w0[0] = w0[1];
7521 w0[1] = w0[2];
7522 w0[2] = w0[3];
7523 w0[3] = w1[0];
7524 w1[0] = w1[1];
7525 w1[1] = w1[2];
7526 w1[2] = w1[3];
7527 w1[3] = w2[0];
7528 w2[0] = w2[1];
7529 w2[1] = w2[2];
7530 w2[2] = w2[3];
7531 w2[3] = w3[0];
7532 w3[0] = w3[1];
7533 w3[1] = w3[2];
7534 w3[2] = 0;
7535 }
7536
7537 break;
7538
7539 case 1:
7540 w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4);
7541 w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7542 w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7543 w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7544 w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7545 w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7546 w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7547 w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7548 w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7549 w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7550 w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7551 w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7552 w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7553 w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7554 w0[0] = 0;
7555
7556 if (offset_mod_4 == 0)
7557 {
7558 w0[1] = w0[2];
7559 w0[2] = w0[3];
7560 w0[3] = w1[0];
7561 w1[0] = w1[1];
7562 w1[1] = w1[2];
7563 w1[2] = w1[3];
7564 w1[3] = w2[0];
7565 w2[0] = w2[1];
7566 w2[1] = w2[2];
7567 w2[2] = w2[3];
7568 w2[3] = w3[0];
7569 w3[0] = w3[1];
7570 w3[1] = w3[2];
7571 w3[2] = 0;
7572 }
7573
7574 break;
7575
7576 case 2:
7577 w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4);
7578 w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7579 w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7580 w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7581 w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7582 w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7583 w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7584 w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7585 w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7586 w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7587 w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7588 w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7589 w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7590 w0[1] = 0;
7591 w0[0] = 0;
7592
7593 if (offset_mod_4 == 0)
7594 {
7595 w0[2] = w0[3];
7596 w0[3] = w1[0];
7597 w1[0] = w1[1];
7598 w1[1] = w1[2];
7599 w1[2] = w1[3];
7600 w1[3] = w2[0];
7601 w2[0] = w2[1];
7602 w2[1] = w2[2];
7603 w2[2] = w2[3];
7604 w2[3] = w3[0];
7605 w3[0] = w3[1];
7606 w3[1] = w3[2];
7607 w3[2] = 0;
7608 }
7609
7610 break;
7611
7612 case 3:
7613 w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4);
7614 w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7615 w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7616 w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7617 w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7618 w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7619 w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7620 w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7621 w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7622 w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7623 w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7624 w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7625 w0[2] = 0;
7626 w0[1] = 0;
7627 w0[0] = 0;
7628
7629 if (offset_mod_4 == 0)
7630 {
7631 w0[3] = w1[0];
7632 w1[0] = w1[1];
7633 w1[1] = w1[2];
7634 w1[2] = w1[3];
7635 w1[3] = w2[0];
7636 w2[0] = w2[1];
7637 w2[1] = w2[2];
7638 w2[2] = w2[3];
7639 w2[3] = w3[0];
7640 w3[0] = w3[1];
7641 w3[1] = w3[2];
7642 w3[2] = 0;
7643 }
7644
7645 break;
7646
7647 case 4:
7648 w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4);
7649 w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7650 w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7651 w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7652 w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7653 w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7654 w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7655 w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7656 w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7657 w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7658 w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7659 w0[3] = 0;
7660 w0[2] = 0;
7661 w0[1] = 0;
7662 w0[0] = 0;
7663
7664 if (offset_mod_4 == 0)
7665 {
7666 w1[0] = w1[1];
7667 w1[1] = w1[2];
7668 w1[2] = w1[3];
7669 w1[3] = w2[0];
7670 w2[0] = w2[1];
7671 w2[1] = w2[2];
7672 w2[2] = w2[3];
7673 w2[3] = w3[0];
7674 w3[0] = w3[1];
7675 w3[1] = w3[2];
7676 w3[2] = 0;
7677 }
7678
7679 break;
7680
7681 case 5:
7682 w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4);
7683 w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7684 w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7685 w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7686 w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7687 w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7688 w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7689 w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7690 w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7691 w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7692 w1[0] = 0;
7693 w0[3] = 0;
7694 w0[2] = 0;
7695 w0[1] = 0;
7696 w0[0] = 0;
7697
7698 if (offset_mod_4 == 0)
7699 {
7700 w1[1] = w1[2];
7701 w1[2] = w1[3];
7702 w1[3] = w2[0];
7703 w2[0] = w2[1];
7704 w2[1] = w2[2];
7705 w2[2] = w2[3];
7706 w2[3] = w3[0];
7707 w3[0] = w3[1];
7708 w3[1] = w3[2];
7709 w3[2] = 0;
7710 }
7711
7712 break;
7713
7714 case 6:
7715 w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4);
7716 w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7717 w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7718 w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7719 w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7720 w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7721 w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7722 w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7723 w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7724 w1[1] = 0;
7725 w1[0] = 0;
7726 w0[3] = 0;
7727 w0[2] = 0;
7728 w0[1] = 0;
7729 w0[0] = 0;
7730
7731 if (offset_mod_4 == 0)
7732 {
7733 w1[2] = w1[3];
7734 w1[3] = w2[0];
7735 w2[0] = w2[1];
7736 w2[1] = w2[2];
7737 w2[2] = w2[3];
7738 w2[3] = w3[0];
7739 w3[0] = w3[1];
7740 w3[1] = w3[2];
7741 w3[2] = 0;
7742 }
7743
7744 break;
7745
7746 case 7:
7747 w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4);
7748 w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7749 w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7750 w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7751 w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7752 w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7753 w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7754 w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7755 w1[2] = 0;
7756 w1[1] = 0;
7757 w1[0] = 0;
7758 w0[3] = 0;
7759 w0[2] = 0;
7760 w0[1] = 0;
7761 w0[0] = 0;
7762
7763 if (offset_mod_4 == 0)
7764 {
7765 w1[3] = w2[0];
7766 w2[0] = w2[1];
7767 w2[1] = w2[2];
7768 w2[2] = w2[3];
7769 w2[3] = w3[0];
7770 w3[0] = w3[1];
7771 w3[1] = w3[2];
7772 w3[2] = 0;
7773 }
7774
7775 break;
7776
7777 case 8:
7778 w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4);
7779 w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7780 w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7781 w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7782 w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7783 w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7784 w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7785 w1[3] = 0;
7786 w1[2] = 0;
7787 w1[1] = 0;
7788 w1[0] = 0;
7789 w0[3] = 0;
7790 w0[2] = 0;
7791 w0[1] = 0;
7792 w0[0] = 0;
7793
7794 if (offset_mod_4 == 0)
7795 {
7796 w2[0] = w2[1];
7797 w2[1] = w2[2];
7798 w2[2] = w2[3];
7799 w2[3] = w3[0];
7800 w3[0] = w3[1];
7801 w3[1] = w3[2];
7802 w3[2] = 0;
7803 }
7804
7805 break;
7806
7807 case 9:
7808 w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4);
7809 w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7810 w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7811 w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7812 w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7813 w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7814 w2[0] = 0;
7815 w1[3] = 0;
7816 w1[2] = 0;
7817 w1[1] = 0;
7818 w1[0] = 0;
7819 w0[3] = 0;
7820 w0[2] = 0;
7821 w0[1] = 0;
7822 w0[0] = 0;
7823
7824 if (offset_mod_4 == 0)
7825 {
7826 w2[1] = w2[2];
7827 w2[2] = w2[3];
7828 w2[3] = w3[0];
7829 w3[0] = w3[1];
7830 w3[1] = w3[2];
7831 w3[2] = 0;
7832 }
7833
7834 break;
7835
7836 case 10:
7837 w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4);
7838 w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7839 w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7840 w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7841 w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7842 w2[1] = 0;
7843 w2[0] = 0;
7844 w1[3] = 0;
7845 w1[2] = 0;
7846 w1[1] = 0;
7847 w1[0] = 0;
7848 w0[3] = 0;
7849 w0[2] = 0;
7850 w0[1] = 0;
7851 w0[0] = 0;
7852
7853 if (offset_mod_4 == 0)
7854 {
7855 w2[2] = w2[3];
7856 w2[3] = w3[0];
7857 w3[0] = w3[1];
7858 w3[1] = w3[2];
7859 w3[2] = 0;
7860 }
7861
7862 break;
7863
7864 case 11:
7865 w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4);
7866 w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7867 w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7868 w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7869 w2[2] = 0;
7870 w2[1] = 0;
7871 w2[0] = 0;
7872 w1[3] = 0;
7873 w1[2] = 0;
7874 w1[1] = 0;
7875 w1[0] = 0;
7876 w0[3] = 0;
7877 w0[2] = 0;
7878 w0[1] = 0;
7879 w0[0] = 0;
7880
7881 if (offset_mod_4 == 0)
7882 {
7883 w2[3] = w3[0];
7884 w3[0] = w3[1];
7885 w3[1] = w3[2];
7886 w3[2] = 0;
7887 }
7888
7889 break;
7890
7891 case 12:
7892 w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4);
7893 w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7894 w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7895 w2[3] = 0;
7896 w2[2] = 0;
7897 w2[1] = 0;
7898 w2[0] = 0;
7899 w1[3] = 0;
7900 w1[2] = 0;
7901 w1[1] = 0;
7902 w1[0] = 0;
7903 w0[3] = 0;
7904 w0[2] = 0;
7905 w0[1] = 0;
7906 w0[0] = 0;
7907
7908 if (offset_mod_4 == 0)
7909 {
7910 w3[0] = w3[1];
7911 w3[1] = w3[2];
7912 w3[2] = 0;
7913 }
7914
7915 break;
7916
7917 case 13:
7918 w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4);
7919 w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7920 w3[0] = 0;
7921 w2[3] = 0;
7922 w2[2] = 0;
7923 w2[1] = 0;
7924 w2[0] = 0;
7925 w1[3] = 0;
7926 w1[2] = 0;
7927 w1[1] = 0;
7928 w1[0] = 0;
7929 w0[3] = 0;
7930 w0[2] = 0;
7931 w0[1] = 0;
7932 w0[0] = 0;
7933
7934 if (offset_mod_4 == 0)
7935 {
7936 w3[1] = w3[2];
7937 w3[2] = 0;
7938 }
7939
7940 break;
7941 }
7942 #endif
7943
7944 #ifdef IS_NV
7945 const int offset_minus_4 = 4 - (offset % 4);
7946
7947 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
7948
7949 switch (offset / 4)
7950 {
7951 case 0:
7952 w3[1] = __byte_perm_S (w3[0], w3[1], selector);
7953 w3[0] = __byte_perm_S (w2[3], w3[0], selector);
7954 w2[3] = __byte_perm_S (w2[2], w2[3], selector);
7955 w2[2] = __byte_perm_S (w2[1], w2[2], selector);
7956 w2[1] = __byte_perm_S (w2[0], w2[1], selector);
7957 w2[0] = __byte_perm_S (w1[3], w2[0], selector);
7958 w1[3] = __byte_perm_S (w1[2], w1[3], selector);
7959 w1[2] = __byte_perm_S (w1[1], w1[2], selector);
7960 w1[1] = __byte_perm_S (w1[0], w1[1], selector);
7961 w1[0] = __byte_perm_S (w0[3], w1[0], selector);
7962 w0[3] = __byte_perm_S (w0[2], w0[3], selector);
7963 w0[2] = __byte_perm_S (w0[1], w0[2], selector);
7964 w0[1] = __byte_perm_S (w0[0], w0[1], selector);
7965 w0[0] = __byte_perm_S ( 0, w0[0], selector);
7966
7967 break;
7968
7969 case 1:
7970 w3[1] = __byte_perm_S (w2[3], w3[0], selector);
7971 w3[0] = __byte_perm_S (w2[2], w2[3], selector);
7972 w2[3] = __byte_perm_S (w2[1], w2[2], selector);
7973 w2[2] = __byte_perm_S (w2[0], w2[1], selector);
7974 w2[1] = __byte_perm_S (w1[3], w2[0], selector);
7975 w2[0] = __byte_perm_S (w1[2], w1[3], selector);
7976 w1[3] = __byte_perm_S (w1[1], w1[2], selector);
7977 w1[2] = __byte_perm_S (w1[0], w1[1], selector);
7978 w1[1] = __byte_perm_S (w0[3], w1[0], selector);
7979 w1[0] = __byte_perm_S (w0[2], w0[3], selector);
7980 w0[3] = __byte_perm_S (w0[1], w0[2], selector);
7981 w0[2] = __byte_perm_S (w0[0], w0[1], selector);
7982 w0[1] = __byte_perm_S ( 0, w0[0], selector);
7983 w0[0] = 0;
7984
7985 break;
7986
7987 case 2:
7988 w3[1] = __byte_perm_S (w2[2], w2[3], selector);
7989 w3[0] = __byte_perm_S (w2[1], w2[2], selector);
7990 w2[3] = __byte_perm_S (w2[0], w2[1], selector);
7991 w2[2] = __byte_perm_S (w1[3], w2[0], selector);
7992 w2[1] = __byte_perm_S (w1[2], w1[3], selector);
7993 w2[0] = __byte_perm_S (w1[1], w1[2], selector);
7994 w1[3] = __byte_perm_S (w1[0], w1[1], selector);
7995 w1[2] = __byte_perm_S (w0[3], w1[0], selector);
7996 w1[1] = __byte_perm_S (w0[2], w0[3], selector);
7997 w1[0] = __byte_perm_S (w0[1], w0[2], selector);
7998 w0[3] = __byte_perm_S (w0[0], w0[1], selector);
7999 w0[2] = __byte_perm_S ( 0, w0[0], selector);
8000 w0[1] = 0;
8001 w0[0] = 0;
8002
8003 break;
8004
8005 case 3:
8006 w3[1] = __byte_perm_S (w2[1], w2[2], selector);
8007 w3[0] = __byte_perm_S (w2[0], w2[1], selector);
8008 w2[3] = __byte_perm_S (w1[3], w2[0], selector);
8009 w2[2] = __byte_perm_S (w1[2], w1[3], selector);
8010 w2[1] = __byte_perm_S (w1[1], w1[2], selector);
8011 w2[0] = __byte_perm_S (w1[0], w1[1], selector);
8012 w1[3] = __byte_perm_S (w0[3], w1[0], selector);
8013 w1[2] = __byte_perm_S (w0[2], w0[3], selector);
8014 w1[1] = __byte_perm_S (w0[1], w0[2], selector);
8015 w1[0] = __byte_perm_S (w0[0], w0[1], selector);
8016 w0[3] = __byte_perm_S ( 0, w0[0], selector);
8017 w0[2] = 0;
8018 w0[1] = 0;
8019 w0[0] = 0;
8020
8021 break;
8022
8023 case 4:
8024 w3[1] = __byte_perm_S (w2[0], w2[1], selector);
8025 w3[0] = __byte_perm_S (w1[3], w2[0], selector);
8026 w2[3] = __byte_perm_S (w1[2], w1[3], selector);
8027 w2[2] = __byte_perm_S (w1[1], w1[2], selector);
8028 w2[1] = __byte_perm_S (w1[0], w1[1], selector);
8029 w2[0] = __byte_perm_S (w0[3], w1[0], selector);
8030 w1[3] = __byte_perm_S (w0[2], w0[3], selector);
8031 w1[2] = __byte_perm_S (w0[1], w0[2], selector);
8032 w1[1] = __byte_perm_S (w0[0], w0[1], selector);
8033 w1[0] = __byte_perm_S ( 0, w0[0], selector);
8034 w0[3] = 0;
8035 w0[2] = 0;
8036 w0[1] = 0;
8037 w0[0] = 0;
8038
8039 break;
8040
8041 case 5:
8042 w3[1] = __byte_perm_S (w1[3], w2[0], selector);
8043 w3[0] = __byte_perm_S (w1[2], w1[3], selector);
8044 w2[3] = __byte_perm_S (w1[1], w1[2], selector);
8045 w2[2] = __byte_perm_S (w1[0], w1[1], selector);
8046 w2[1] = __byte_perm_S (w0[3], w1[0], selector);
8047 w2[0] = __byte_perm_S (w0[2], w0[3], selector);
8048 w1[3] = __byte_perm_S (w0[1], w0[2], selector);
8049 w1[2] = __byte_perm_S (w0[0], w0[1], selector);
8050 w1[1] = __byte_perm_S ( 0, w0[0], selector);
8051 w1[0] = 0;
8052 w0[3] = 0;
8053 w0[2] = 0;
8054 w0[1] = 0;
8055 w0[0] = 0;
8056
8057 break;
8058
8059 case 6:
8060 w3[1] = __byte_perm_S (w1[2], w1[3], selector);
8061 w3[0] = __byte_perm_S (w1[1], w1[2], selector);
8062 w2[3] = __byte_perm_S (w1[0], w1[1], selector);
8063 w2[2] = __byte_perm_S (w0[3], w1[0], selector);
8064 w2[1] = __byte_perm_S (w0[2], w0[3], selector);
8065 w2[0] = __byte_perm_S (w0[1], w0[2], selector);
8066 w1[3] = __byte_perm_S (w0[0], w0[1], selector);
8067 w1[2] = __byte_perm_S ( 0, w0[0], selector);
8068 w1[1] = 0;
8069 w1[0] = 0;
8070 w0[3] = 0;
8071 w0[2] = 0;
8072 w0[1] = 0;
8073 w0[0] = 0;
8074
8075 break;
8076
8077 case 7:
8078 w3[1] = __byte_perm_S (w1[1], w1[2], selector);
8079 w3[0] = __byte_perm_S (w1[0], w1[1], selector);
8080 w2[3] = __byte_perm_S (w0[3], w1[0], selector);
8081 w2[2] = __byte_perm_S (w0[2], w0[3], selector);
8082 w2[1] = __byte_perm_S (w0[1], w0[2], selector);
8083 w2[0] = __byte_perm_S (w0[0], w0[1], selector);
8084 w1[3] = __byte_perm_S ( 0, w0[0], selector);
8085 w1[2] = 0;
8086 w1[1] = 0;
8087 w1[0] = 0;
8088 w0[3] = 0;
8089 w0[2] = 0;
8090 w0[1] = 0;
8091 w0[0] = 0;
8092
8093 break;
8094
8095 case 8:
8096 w3[1] = __byte_perm_S (w1[0], w1[1], selector);
8097 w3[0] = __byte_perm_S (w0[3], w1[0], selector);
8098 w2[3] = __byte_perm_S (w0[2], w0[3], selector);
8099 w2[2] = __byte_perm_S (w0[1], w0[2], selector);
8100 w2[1] = __byte_perm_S (w0[0], w0[1], selector);
8101 w2[0] = __byte_perm_S ( 0, w0[0], selector);
8102 w1[3] = 0;
8103 w1[2] = 0;
8104 w1[1] = 0;
8105 w1[0] = 0;
8106 w0[3] = 0;
8107 w0[2] = 0;
8108 w0[1] = 0;
8109 w0[0] = 0;
8110
8111 break;
8112
8113 case 9:
8114 w3[1] = __byte_perm_S (w0[3], w1[0], selector);
8115 w3[0] = __byte_perm_S (w0[2], w0[3], selector);
8116 w2[3] = __byte_perm_S (w0[1], w0[2], selector);
8117 w2[2] = __byte_perm_S (w0[0], w0[1], selector);
8118 w2[1] = __byte_perm_S ( 0, w0[0], selector);
8119 w2[0] = 0;
8120 w1[3] = 0;
8121 w1[2] = 0;
8122 w1[1] = 0;
8123 w1[0] = 0;
8124 w0[3] = 0;
8125 w0[2] = 0;
8126 w0[1] = 0;
8127 w0[0] = 0;
8128
8129 break;
8130
8131 case 10:
8132 w3[1] = __byte_perm_S (w0[2], w0[3], selector);
8133 w3[0] = __byte_perm_S (w0[1], w0[2], selector);
8134 w2[3] = __byte_perm_S (w0[0], w0[1], selector);
8135 w2[2] = __byte_perm_S ( 0, w0[0], selector);
8136 w2[1] = 0;
8137 w2[0] = 0;
8138 w1[3] = 0;
8139 w1[2] = 0;
8140 w1[1] = 0;
8141 w1[0] = 0;
8142 w0[3] = 0;
8143 w0[2] = 0;
8144 w0[1] = 0;
8145 w0[0] = 0;
8146
8147 break;
8148
8149 case 11:
8150 w3[1] = __byte_perm_S (w0[1], w0[2], selector);
8151 w3[0] = __byte_perm_S (w0[0], w0[1], selector);
8152 w2[3] = __byte_perm_S ( 0, w0[0], selector);
8153 w2[2] = 0;
8154 w2[1] = 0;
8155 w2[0] = 0;
8156 w1[3] = 0;
8157 w1[2] = 0;
8158 w1[1] = 0;
8159 w1[0] = 0;
8160 w0[3] = 0;
8161 w0[2] = 0;
8162 w0[1] = 0;
8163 w0[0] = 0;
8164
8165 break;
8166
8167 case 12:
8168 w3[1] = __byte_perm_S (w0[0], w0[1], selector);
8169 w3[0] = __byte_perm_S ( 0, w0[0], selector);
8170 w2[3] = 0;
8171 w2[2] = 0;
8172 w2[1] = 0;
8173 w2[0] = 0;
8174 w1[3] = 0;
8175 w1[2] = 0;
8176 w1[1] = 0;
8177 w1[0] = 0;
8178 w0[3] = 0;
8179 w0[2] = 0;
8180 w0[1] = 0;
8181 w0[0] = 0;
8182
8183 break;
8184
8185 case 13:
8186 w3[1] = __byte_perm_S ( 0, w0[0], selector);
8187 w3[0] = 0;
8188 w2[3] = 0;
8189 w2[2] = 0;
8190 w2[1] = 0;
8191 w2[0] = 0;
8192 w1[3] = 0;
8193 w1[2] = 0;
8194 w1[1] = 0;
8195 w1[0] = 0;
8196 w0[3] = 0;
8197 w0[2] = 0;
8198 w0[1] = 0;
8199 w0[0] = 0;
8200
8201 break;
8202 }
8203 #endif
8204 }
8205
/**
 * Shift the contents of w0..w3 (handled as 16 consecutive u32 words, w0[0]
 * being the lowest word and w3[3] the highest) towards higher word indices by
 * `offset` bytes, in place.
 *
 * offset / 4 selects the whole-word part of the shift through the switch
 * below: every word under that index is set to 0 and each remaining word is
 * rebuilt from a pair of neighbouring source words combined by a platform
 * helper (amd_bytealign_S or __byte_perm_S).
 *
 * Things the code below shows and that callers/maintainers should know:
 *  - Inside each case the assignments run from the highest word downwards, so
 *    every source word is read before it is overwritten. Do not reorder them.
 *  - The AMD/generic path writes up to w3[2]; the NV path writes up to w3[1].
 *    w3[3] is never written by either path.
 *  - Neither switch has a default case: for offset / 4 > 13 the buffer is
 *    left unchanged.
 *
 * NOTE(review): the "_be" suffix presumably stands for a big-endian word
 * layout; the exact byte selection depends on amd_bytealign_S and
 * __byte_perm_S, which are defined elsewhere - confirm against those helpers.
 */
8206 inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
8207 {
/* AMD / generic path: the full `offset` is handed to amd_bytealign_S for      */
/* every word pair (presumably only its low two bits matter there - confirm).  */
8208 #if defined IS_AMD || defined IS_GENERIC
8209 switch (offset / 4)
8210 {
8211 case 0:
/* No whole-word move: each word is combined with its lower neighbour; the     */
/* top source word w3[1] is additionally paired with 0 to produce w3[2].       */
8212 w3[2] = amd_bytealign_S (w3[1], 0, offset);
8213 w3[1] = amd_bytealign_S (w3[0], w3[1], offset);
8214 w3[0] = amd_bytealign_S (w2[3], w3[0], offset);
8215 w2[3] = amd_bytealign_S (w2[2], w2[3], offset);
8216 w2[2] = amd_bytealign_S (w2[1], w2[2], offset);
8217 w2[1] = amd_bytealign_S (w2[0], w2[1], offset);
8218 w2[0] = amd_bytealign_S (w1[3], w2[0], offset);
8219 w1[3] = amd_bytealign_S (w1[2], w1[3], offset);
8220 w1[2] = amd_bytealign_S (w1[1], w1[2], offset);
8221 w1[1] = amd_bytealign_S (w1[0], w1[1], offset);
8222 w1[0] = amd_bytealign_S (w0[3], w1[0], offset);
8223 w0[3] = amd_bytealign_S (w0[2], w0[3], offset);
8224 w0[2] = amd_bytealign_S (w0[1], w0[2], offset);
8225 w0[1] = amd_bytealign_S (w0[0], w0[1], offset);
8226 w0[0] = amd_bytealign_S ( 0, w0[0], offset);
8227 break;
8228
8229 case 1:
/* Cases 1..13 follow the same scheme with the sources moved down by one more  */
/* word each time and one more low word cleared to 0.                          */
8230 w3[2] = amd_bytealign_S (w3[0], 0, offset);
8231 w3[1] = amd_bytealign_S (w2[3], w3[0], offset);
8232 w3[0] = amd_bytealign_S (w2[2], w2[3], offset);
8233 w2[3] = amd_bytealign_S (w2[1], w2[2], offset);
8234 w2[2] = amd_bytealign_S (w2[0], w2[1], offset);
8235 w2[1] = amd_bytealign_S (w1[3], w2[0], offset);
8236 w2[0] = amd_bytealign_S (w1[2], w1[3], offset);
8237 w1[3] = amd_bytealign_S (w1[1], w1[2], offset);
8238 w1[2] = amd_bytealign_S (w1[0], w1[1], offset);
8239 w1[1] = amd_bytealign_S (w0[3], w1[0], offset);
8240 w1[0] = amd_bytealign_S (w0[2], w0[3], offset);
8241 w0[3] = amd_bytealign_S (w0[1], w0[2], offset);
8242 w0[2] = amd_bytealign_S (w0[0], w0[1], offset);
8243 w0[1] = amd_bytealign_S ( 0, w0[0], offset);
8244 w0[0] = 0;
8245 break;
8246
8247 case 2:
8248 w3[2] = amd_bytealign_S (w2[3], 0, offset);
8249 w3[1] = amd_bytealign_S (w2[2], w2[3], offset);
8250 w3[0] = amd_bytealign_S (w2[1], w2[2], offset);
8251 w2[3] = amd_bytealign_S (w2[0], w2[1], offset);
8252 w2[2] = amd_bytealign_S (w1[3], w2[0], offset);
8253 w2[1] = amd_bytealign_S (w1[2], w1[3], offset);
8254 w2[0] = amd_bytealign_S (w1[1], w1[2], offset);
8255 w1[3] = amd_bytealign_S (w1[0], w1[1], offset);
8256 w1[2] = amd_bytealign_S (w0[3], w1[0], offset);
8257 w1[1] = amd_bytealign_S (w0[2], w0[3], offset);
8258 w1[0] = amd_bytealign_S (w0[1], w0[2], offset);
8259 w0[3] = amd_bytealign_S (w0[0], w0[1], offset);
8260 w0[2] = amd_bytealign_S ( 0, w0[0], offset);
8261 w0[1] = 0;
8262 w0[0] = 0;
8263 break;
8264
8265 case 3:
8266 w3[2] = amd_bytealign_S (w2[2], 0, offset);
8267 w3[1] = amd_bytealign_S (w2[1], w2[2], offset);
8268 w3[0] = amd_bytealign_S (w2[0], w2[1], offset);
8269 w2[3] = amd_bytealign_S (w1[3], w2[0], offset);
8270 w2[2] = amd_bytealign_S (w1[2], w1[3], offset);
8271 w2[1] = amd_bytealign_S (w1[1], w1[2], offset);
8272 w2[0] = amd_bytealign_S (w1[0], w1[1], offset);
8273 w1[3] = amd_bytealign_S (w0[3], w1[0], offset);
8274 w1[2] = amd_bytealign_S (w0[2], w0[3], offset);
8275 w1[1] = amd_bytealign_S (w0[1], w0[2], offset);
8276 w1[0] = amd_bytealign_S (w0[0], w0[1], offset);
8277 w0[3] = amd_bytealign_S ( 0, w0[0], offset);
8278 w0[2] = 0;
8279 w0[1] = 0;
8280 w0[0] = 0;
8281 break;
8282
8283 case 4:
8284 w3[2] = amd_bytealign_S (w2[1], 0, offset);
8285 w3[1] = amd_bytealign_S (w2[0], w2[1], offset);
8286 w3[0] = amd_bytealign_S (w1[3], w2[0], offset);
8287 w2[3] = amd_bytealign_S (w1[2], w1[3], offset);
8288 w2[2] = amd_bytealign_S (w1[1], w1[2], offset);
8289 w2[1] = amd_bytealign_S (w1[0], w1[1], offset);
8290 w2[0] = amd_bytealign_S (w0[3], w1[0], offset);
8291 w1[3] = amd_bytealign_S (w0[2], w0[3], offset);
8292 w1[2] = amd_bytealign_S (w0[1], w0[2], offset);
8293 w1[1] = amd_bytealign_S (w0[0], w0[1], offset);
8294 w1[0] = amd_bytealign_S ( 0, w0[0], offset);
8295 w0[3] = 0;
8296 w0[2] = 0;
8297 w0[1] = 0;
8298 w0[0] = 0;
8299 break;
8300
8301 case 5:
8302 w3[2] = amd_bytealign_S (w2[0], 0, offset);
8303 w3[1] = amd_bytealign_S (w1[3], w2[0], offset);
8304 w3[0] = amd_bytealign_S (w1[2], w1[3], offset);
8305 w2[3] = amd_bytealign_S (w1[1], w1[2], offset);
8306 w2[2] = amd_bytealign_S (w1[0], w1[1], offset);
8307 w2[1] = amd_bytealign_S (w0[3], w1[0], offset);
8308 w2[0] = amd_bytealign_S (w0[2], w0[3], offset);
8309 w1[3] = amd_bytealign_S (w0[1], w0[2], offset);
8310 w1[2] = amd_bytealign_S (w0[0], w0[1], offset);
8311 w1[1] = amd_bytealign_S ( 0, w0[0], offset);
8312 w1[0] = 0;
8313 w0[3] = 0;
8314 w0[2] = 0;
8315 w0[1] = 0;
8316 w0[0] = 0;
8317 break;
8318
8319 case 6:
8320 w3[2] = amd_bytealign_S (w1[3], 0, offset);
8321 w3[1] = amd_bytealign_S (w1[2], w1[3], offset);
8322 w3[0] = amd_bytealign_S (w1[1], w1[2], offset);
8323 w2[3] = amd_bytealign_S (w1[0], w1[1], offset);
8324 w2[2] = amd_bytealign_S (w0[3], w1[0], offset);
8325 w2[1] = amd_bytealign_S (w0[2], w0[3], offset);
8326 w2[0] = amd_bytealign_S (w0[1], w0[2], offset);
8327 w1[3] = amd_bytealign_S (w0[0], w0[1], offset);
8328 w1[2] = amd_bytealign_S ( 0, w0[0], offset);
8329 w1[1] = 0;
8330 w1[0] = 0;
8331 w0[3] = 0;
8332 w0[2] = 0;
8333 w0[1] = 0;
8334 w0[0] = 0;
8335 break;
8336
8337 case 7:
8338 w3[2] = amd_bytealign_S (w1[2], 0, offset);
8339 w3[1] = amd_bytealign_S (w1[1], w1[2], offset);
8340 w3[0] = amd_bytealign_S (w1[0], w1[1], offset);
8341 w2[3] = amd_bytealign_S (w0[3], w1[0], offset);
8342 w2[2] = amd_bytealign_S (w0[2], w0[3], offset);
8343 w2[1] = amd_bytealign_S (w0[1], w0[2], offset);
8344 w2[0] = amd_bytealign_S (w0[0], w0[1], offset);
8345 w1[3] = amd_bytealign_S ( 0, w0[0], offset);
8346 w1[2] = 0;
8347 w1[1] = 0;
8348 w1[0] = 0;
8349 w0[3] = 0;
8350 w0[2] = 0;
8351 w0[1] = 0;
8352 w0[0] = 0;
8353 break;
8354
8355 case 8:
8356 w3[2] = amd_bytealign_S (w1[1], 0, offset);
8357 w3[1] = amd_bytealign_S (w1[0], w1[1], offset);
8358 w3[0] = amd_bytealign_S (w0[3], w1[0], offset);
8359 w2[3] = amd_bytealign_S (w0[2], w0[3], offset);
8360 w2[2] = amd_bytealign_S (w0[1], w0[2], offset);
8361 w2[1] = amd_bytealign_S (w0[0], w0[1], offset);
8362 w2[0] = amd_bytealign_S ( 0, w0[0], offset);
8363 w1[3] = 0;
8364 w1[2] = 0;
8365 w1[1] = 0;
8366 w1[0] = 0;
8367 w0[3] = 0;
8368 w0[2] = 0;
8369 w0[1] = 0;
8370 w0[0] = 0;
8371 break;
8372
8373 case 9:
8374 w3[2] = amd_bytealign_S (w1[0], 0, offset);
8375 w3[1] = amd_bytealign_S (w0[3], w1[0], offset);
8376 w3[0] = amd_bytealign_S (w0[2], w0[3], offset);
8377 w2[3] = amd_bytealign_S (w0[1], w0[2], offset);
8378 w2[2] = amd_bytealign_S (w0[0], w0[1], offset);
8379 w2[1] = amd_bytealign_S ( 0, w0[0], offset);
8380 w2[0] = 0;
8381 w1[3] = 0;
8382 w1[2] = 0;
8383 w1[1] = 0;
8384 w1[0] = 0;
8385 w0[3] = 0;
8386 w0[2] = 0;
8387 w0[1] = 0;
8388 w0[0] = 0;
8389 break;
8390
8391 case 10:
8392 w3[2] = amd_bytealign_S (w0[3], 0, offset);
8393 w3[1] = amd_bytealign_S (w0[2], w0[3], offset);
8394 w3[0] = amd_bytealign_S (w0[1], w0[2], offset);
8395 w2[3] = amd_bytealign_S (w0[0], w0[1], offset);
8396 w2[2] = amd_bytealign_S ( 0, w0[0], offset);
8397 w2[1] = 0;
8398 w2[0] = 0;
8399 w1[3] = 0;
8400 w1[2] = 0;
8401 w1[1] = 0;
8402 w1[0] = 0;
8403 w0[3] = 0;
8404 w0[2] = 0;
8405 w0[1] = 0;
8406 w0[0] = 0;
8407 break;
8408
8409 case 11:
8410 w3[2] = amd_bytealign_S (w0[2], 0, offset);
8411 w3[1] = amd_bytealign_S (w0[1], w0[2], offset);
8412 w3[0] = amd_bytealign_S (w0[0], w0[1], offset);
8413 w2[3] = amd_bytealign_S ( 0, w0[0], offset);
8414 w2[2] = 0;
8415 w2[1] = 0;
8416 w2[0] = 0;
8417 w1[3] = 0;
8418 w1[2] = 0;
8419 w1[1] = 0;
8420 w1[0] = 0;
8421 w0[3] = 0;
8422 w0[2] = 0;
8423 w0[1] = 0;
8424 w0[0] = 0;
8425 break;
8426
8427 case 12:
8428 w3[2] = amd_bytealign_S (w0[1], 0, offset);
8429 w3[1] = amd_bytealign_S (w0[0], w0[1], offset);
8430 w3[0] = amd_bytealign_S ( 0, w0[0], offset);
8431 w2[3] = 0;
8432 w2[2] = 0;
8433 w2[1] = 0;
8434 w2[0] = 0;
8435 w1[3] = 0;
8436 w1[2] = 0;
8437 w1[1] = 0;
8438 w1[0] = 0;
8439 w0[3] = 0;
8440 w0[2] = 0;
8441 w0[1] = 0;
8442 w0[0] = 0;
8443 break;
8444
8445 case 13:
8446 w3[2] = amd_bytealign_S (w0[0], 0, offset);
8447 w3[1] = amd_bytealign_S ( 0, w0[0], offset);
8448 w3[0] = 0;
8449 w2[3] = 0;
8450 w2[2] = 0;
8451 w2[1] = 0;
8452 w2[0] = 0;
8453 w1[3] = 0;
8454 w1[2] = 0;
8455 w1[1] = 0;
8456 w1[0] = 0;
8457 w0[3] = 0;
8458 w0[2] = 0;
8459 w0[1] = 0;
8460 w0[0] = 0;
8461 break;
8462 }
8463 #endif
8464
/* NV path: same word-level scheme, but pairs are combined with __byte_perm_S  */
/* and the highest word written is w3[1] (w3[2] is not touched here).          */
8465 #ifdef IS_NV
/* selector: the nibble pattern 0x76543210 shifted right by one nibble per     */
/* byte of (offset & 3), truncated to its low 16 bits.                         */
8466 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
8467
8468 switch (offset / 4)
8469 {
8470 case 0:
8471 w3[1] = __byte_perm_S (w3[1], w3[0], selector);
8472 w3[0] = __byte_perm_S (w3[0], w2[3], selector);
8473 w2[3] = __byte_perm_S (w2[3], w2[2], selector);
8474 w2[2] = __byte_perm_S (w2[2], w2[1], selector);
8475 w2[1] = __byte_perm_S (w2[1], w2[0], selector);
8476 w2[0] = __byte_perm_S (w2[0], w1[3], selector);
8477 w1[3] = __byte_perm_S (w1[3], w1[2], selector);
8478 w1[2] = __byte_perm_S (w1[2], w1[1], selector);
8479 w1[1] = __byte_perm_S (w1[1], w1[0], selector);
8480 w1[0] = __byte_perm_S (w1[0], w0[3], selector);
8481 w0[3] = __byte_perm_S (w0[3], w0[2], selector);
8482 w0[2] = __byte_perm_S (w0[2], w0[1], selector);
8483 w0[1] = __byte_perm_S (w0[1], w0[0], selector);
8484 w0[0] = __byte_perm_S (w0[0], 0, selector);
8485 break;
8486
8487 case 1:
8488 w3[1] = __byte_perm_S (w3[0], w2[3], selector);
8489 w3[0] = __byte_perm_S (w2[3], w2[2], selector);
8490 w2[3] = __byte_perm_S (w2[2], w2[1], selector);
8491 w2[2] = __byte_perm_S (w2[1], w2[0], selector);
8492 w2[1] = __byte_perm_S (w2[0], w1[3], selector);
8493 w2[0] = __byte_perm_S (w1[3], w1[2], selector);
8494 w1[3] = __byte_perm_S (w1[2], w1[1], selector);
8495 w1[2] = __byte_perm_S (w1[1], w1[0], selector);
8496 w1[1] = __byte_perm_S (w1[0], w0[3], selector);
8497 w1[0] = __byte_perm_S (w0[3], w0[2], selector);
8498 w0[3] = __byte_perm_S (w0[2], w0[1], selector);
8499 w0[2] = __byte_perm_S (w0[1], w0[0], selector);
8500 w0[1] = __byte_perm_S (w0[0], 0, selector);
8501 w0[0] = 0;
8502 break;
8503
8504 case 2:
8505 w3[1] = __byte_perm_S (w2[3], w2[2], selector);
8506 w3[0] = __byte_perm_S (w2[2], w2[1], selector);
8507 w2[3] = __byte_perm_S (w2[1], w2[0], selector);
8508 w2[2] = __byte_perm_S (w2[0], w1[3], selector);
8509 w2[1] = __byte_perm_S (w1[3], w1[2], selector);
8510 w2[0] = __byte_perm_S (w1[2], w1[1], selector);
8511 w1[3] = __byte_perm_S (w1[1], w1[0], selector);
8512 w1[2] = __byte_perm_S (w1[0], w0[3], selector);
8513 w1[1] = __byte_perm_S (w0[3], w0[2], selector);
8514 w1[0] = __byte_perm_S (w0[2], w0[1], selector);
8515 w0[3] = __byte_perm_S (w0[1], w0[0], selector);
8516 w0[2] = __byte_perm_S (w0[0], 0, selector);
8517 w0[1] = 0;
8518 w0[0] = 0;
8519 break;
8520
8521 case 3:
8522 w3[1] = __byte_perm_S (w2[2], w2[1], selector);
8523 w3[0] = __byte_perm_S (w2[1], w2[0], selector);
8524 w2[3] = __byte_perm_S (w2[0], w1[3], selector);
8525 w2[2] = __byte_perm_S (w1[3], w1[2], selector);
8526 w2[1] = __byte_perm_S (w1[2], w1[1], selector);
8527 w2[0] = __byte_perm_S (w1[1], w1[0], selector);
8528 w1[3] = __byte_perm_S (w1[0], w0[3], selector);
8529 w1[2] = __byte_perm_S (w0[3], w0[2], selector);
8530 w1[1] = __byte_perm_S (w0[2], w0[1], selector);
8531 w1[0] = __byte_perm_S (w0[1], w0[0], selector);
8532 w0[3] = __byte_perm_S (w0[0], 0, selector);
8533 w0[2] = 0;
8534 w0[1] = 0;
8535 w0[0] = 0;
8536 break;
8537
8538 case 4:
8539 w3[1] = __byte_perm_S (w2[1], w2[0], selector);
8540 w3[0] = __byte_perm_S (w2[0], w1[3], selector);
8541 w2[3] = __byte_perm_S (w1[3], w1[2], selector);
8542 w2[2] = __byte_perm_S (w1[2], w1[1], selector);
8543 w2[1] = __byte_perm_S (w1[1], w1[0], selector);
8544 w2[0] = __byte_perm_S (w1[0], w0[3], selector);
8545 w1[3] = __byte_perm_S (w0[3], w0[2], selector);
8546 w1[2] = __byte_perm_S (w0[2], w0[1], selector);
8547 w1[1] = __byte_perm_S (w0[1], w0[0], selector);
8548 w1[0] = __byte_perm_S (w0[0], 0, selector);
8549 w0[3] = 0;
8550 w0[2] = 0;
8551 w0[1] = 0;
8552 w0[0] = 0;
8553 break;
8554
8555 case 5:
8556 w3[1] = __byte_perm_S (w2[0], w1[3], selector);
8557 w3[0] = __byte_perm_S (w1[3], w1[2], selector);
8558 w2[3] = __byte_perm_S (w1[2], w1[1], selector);
8559 w2[2] = __byte_perm_S (w1[1], w1[0], selector);
8560 w2[1] = __byte_perm_S (w1[0], w0[3], selector);
8561 w2[0] = __byte_perm_S (w0[3], w0[2], selector);
8562 w1[3] = __byte_perm_S (w0[2], w0[1], selector);
8563 w1[2] = __byte_perm_S (w0[1], w0[0], selector);
8564 w1[1] = __byte_perm_S (w0[0], 0, selector);
8565 w1[0] = 0;
8566 w0[3] = 0;
8567 w0[2] = 0;
8568 w0[1] = 0;
8569 w0[0] = 0;
8570 break;
8571
8572 case 6:
8573 w3[1] = __byte_perm_S (w1[3], w1[2], selector);
8574 w3[0] = __byte_perm_S (w1[2], w1[1], selector);
8575 w2[3] = __byte_perm_S (w1[1], w1[0], selector);
8576 w2[2] = __byte_perm_S (w1[0], w0[3], selector);
8577 w2[1] = __byte_perm_S (w0[3], w0[2], selector);
8578 w2[0] = __byte_perm_S (w0[2], w0[1], selector);
8579 w1[3] = __byte_perm_S (w0[1], w0[0], selector);
8580 w1[2] = __byte_perm_S (w0[0], 0, selector);
8581 w1[1] = 0;
8582 w1[0] = 0;
8583 w0[3] = 0;
8584 w0[2] = 0;
8585 w0[1] = 0;
8586 w0[0] = 0;
8587 break;
8588
8589 case 7:
8590 w3[1] = __byte_perm_S (w1[2], w1[1], selector);
8591 w3[0] = __byte_perm_S (w1[1], w1[0], selector);
8592 w2[3] = __byte_perm_S (w1[0], w0[3], selector);
8593 w2[2] = __byte_perm_S (w0[3], w0[2], selector);
8594 w2[1] = __byte_perm_S (w0[2], w0[1], selector);
8595 w2[0] = __byte_perm_S (w0[1], w0[0], selector);
8596 w1[3] = __byte_perm_S (w0[0], 0, selector);
8597 w1[2] = 0;
8598 w1[1] = 0;
8599 w1[0] = 0;
8600 w0[3] = 0;
8601 w0[2] = 0;
8602 w0[1] = 0;
8603 w0[0] = 0;
8604 break;
8605
8606 case 8:
8607 w3[1] = __byte_perm_S (w1[1], w1[0], selector);
8608 w3[0] = __byte_perm_S (w1[0], w0[3], selector);
8609 w2[3] = __byte_perm_S (w0[3], w0[2], selector);
8610 w2[2] = __byte_perm_S (w0[2], w0[1], selector);
8611 w2[1] = __byte_perm_S (w0[1], w0[0], selector);
8612 w2[0] = __byte_perm_S (w0[0], 0, selector);
8613 w1[3] = 0;
8614 w1[2] = 0;
8615 w1[1] = 0;
8616 w1[0] = 0;
8617 w0[3] = 0;
8618 w0[2] = 0;
8619 w0[1] = 0;
8620 w0[0] = 0;
8621 break;
8622
8623 case 9:
8624 w3[1] = __byte_perm_S (w1[0], w0[3], selector);
8625 w3[0] = __byte_perm_S (w0[3], w0[2], selector);
8626 w2[3] = __byte_perm_S (w0[2], w0[1], selector);
8627 w2[2] = __byte_perm_S (w0[1], w0[0], selector);
8628 w2[1] = __byte_perm_S (w0[0], 0, selector);
8629 w2[0] = 0;
8630 w1[3] = 0;
8631 w1[2] = 0;
8632 w1[1] = 0;
8633 w1[0] = 0;
8634 w0[3] = 0;
8635 w0[2] = 0;
8636 w0[1] = 0;
8637 w0[0] = 0;
8638 break;
8639
8640 case 10:
8641 w3[1] = __byte_perm_S (w0[3], w0[2], selector);
8642 w3[0] = __byte_perm_S (w0[2], w0[1], selector);
8643 w2[3] = __byte_perm_S (w0[1], w0[0], selector);
8644 w2[2] = __byte_perm_S (w0[0], 0, selector);
8645 w2[1] = 0;
8646 w2[0] = 0;
8647 w1[3] = 0;
8648 w1[2] = 0;
8649 w1[1] = 0;
8650 w1[0] = 0;
8651 w0[3] = 0;
8652 w0[2] = 0;
8653 w0[1] = 0;
8654 w0[0] = 0;
8655 break;
8656
8657 case 11:
8658 w3[1] = __byte_perm_S (w0[2], w0[1], selector);
8659 w3[0] = __byte_perm_S (w0[1], w0[0], selector);
8660 w2[3] = __byte_perm_S (w0[0], 0, selector);
8661 w2[2] = 0;
8662 w2[1] = 0;
8663 w2[0] = 0;
8664 w1[3] = 0;
8665 w1[2] = 0;
8666 w1[1] = 0;
8667 w1[0] = 0;
8668 w0[3] = 0;
8669 w0[2] = 0;
8670 w0[1] = 0;
8671 w0[0] = 0;
8672 break;
8673
8674 case 12:
8675 w3[1] = __byte_perm_S (w0[1], w0[0], selector);
8676 w3[0] = __byte_perm_S (w0[0], 0, selector);
8677 w2[3] = 0;
8678 w2[2] = 0;
8679 w2[1] = 0;
8680 w2[0] = 0;
8681 w1[3] = 0;
8682 w1[2] = 0;
8683 w1[1] = 0;
8684 w1[0] = 0;
8685 w0[3] = 0;
8686 w0[2] = 0;
8687 w0[1] = 0;
8688 w0[0] = 0;
8689 break;
8690
8691 case 13:
8692 w3[1] = __byte_perm_S (w0[0], 0, selector);
8693 w3[0] = 0;
8694 w2[3] = 0;
8695 w2[2] = 0;
8696 w2[1] = 0;
8697 w2[0] = 0;
8698 w1[3] = 0;
8699 w1[2] = 0;
8700 w1[1] = 0;
8701 w1[0] = 0;
8702 w0[3] = 0;
8703 w0[2] = 0;
8704 w0[1] = 0;
8705 w0[0] = 0;
8706 break;
8707 }
8708 #endif
8709 }
8710
8711 /**
8712 * vector functions on scalar types (for inner loop usage)
8713 */
8714
/**
 * Lane pack / unpack helpers.
 *
 *   PACKVS* ("vector -> scalar"): copy component `e` of every vector in vn[]
 *                                 into the matching slot of the scalar array
 *                                 sn[].
 *   PACKSV* ("scalar -> vector"): copy every slot of the scalar array sn[]
 *                                 back into component `e` of the matching
 *                                 vector in vn[].
 *
 * `e` is the bare component suffix (0..9, a..f); it is pasted onto `.s`, so
 * it must be a literal token, not an expression.
 *
 * The 2 / 4 variants handle one array of 2 / 4 elements; the 24 / 44 variants
 * apply the 4-element form to 2 / 4 array pairs in a row.
 *
 * Every macro expands to a single `do { ... } while (0)` statement, so it is
 * safe under an unbraced if / else and must be terminated with `;` by the
 * caller. Array arguments are parenthesised so that expressions such as
 * `buf + 4` can be passed.
 */

#define PACKVS2(sn,vn,e)        \
  do                            \
  {                             \
    (sn)[0] = (vn)[0].s##e;     \
    (sn)[1] = (vn)[1].s##e;     \
  } while (0)

#define PACKSV2(sn,vn,e)        \
  do                            \
  {                             \
    (vn)[0].s##e = (sn)[0];     \
    (vn)[1].s##e = (sn)[1];     \
  } while (0)

#define PACKVS24(s0,s1,v0,v1,e) \
  do                            \
  {                             \
    PACKVS4 (s0, v0, e);        \
    PACKVS4 (s1, v1, e);        \
  } while (0)

#define PACKSV24(s0,s1,v0,v1,e) \
  do                            \
  {                             \
    PACKSV4 (s0, v0, e);        \
    PACKSV4 (s1, v1, e);        \
  } while (0)

#define PACKVS4(sn,vn,e)        \
  do                            \
  {                             \
    (sn)[0] = (vn)[0].s##e;     \
    (sn)[1] = (vn)[1].s##e;     \
    (sn)[2] = (vn)[2].s##e;     \
    (sn)[3] = (vn)[3].s##e;     \
  } while (0)

#define PACKSV4(sn,vn,e)        \
  do                            \
  {                             \
    (vn)[0].s##e = (sn)[0];     \
    (vn)[1].s##e = (sn)[1];     \
    (vn)[2].s##e = (sn)[2];     \
    (vn)[3].s##e = (sn)[3];     \
  } while (0)

#define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  do                                        \
  {                                         \
    PACKVS4 (s0, v0, e);                    \
    PACKVS4 (s1, v1, e);                    \
    PACKVS4 (s2, v2, e);                    \
    PACKVS4 (s3, v3, e);                    \
  } while (0)

#define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  do                                        \
  {                                         \
    PACKSV4 (s0, v0, e);                    \
    PACKSV4 (s1, v1, e);                    \
    PACKSV4 (s2, v2, e);                    \
    PACKSV4 (s3, v3, e);                    \
  } while (0)
8754
8755 inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
8756 {
8757 #if VECT_SIZE == 1
8758
8759 switch_buffer_by_offset_le_S (w0, w1, w2, w3, offset);
8760
8761 #else
8762
8763 u32 t0[4];
8764 u32 t1[4];
8765 u32 t2[4];
8766 u32 t3[4];
8767
8768 #endif
8769
8770 #if VECT_SIZE == 2
8771
8772 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8773 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8774
8775 #elif VECT_SIZE == 4
8776
8777 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8778 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8779 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8780 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8781
8782 #elif VECT_SIZE == 8
8783
8784 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8785 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8786 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8787 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8788 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
8789 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
8790 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
8791 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
8792
8793 #elif VECT_SIZE == 16
8794
8795 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8796 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8797 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8798 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8799 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
8800 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
8801 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
8802 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
8803 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
8804 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
8805 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
8806 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
8807 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
8808 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
8809 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
8810 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
8811
8812 #endif
8813 }
8814
8815 inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
8816 {
8817 #if VECT_SIZE == 1
8818
8819 append_0x01_2x4_S (w0, w1, offset);
8820
8821 #else
8822
8823 u32 t0[4];
8824 u32 t1[4];
8825
8826 #endif
8827
8828 #if VECT_SIZE == 2
8829
8830 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8831 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8832
8833 #elif VECT_SIZE == 4
8834
8835 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8836 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8837 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8838 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8839
8840 #elif VECT_SIZE == 8
8841
8842 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8843 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8844 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8845 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8846 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
8847 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
8848 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
8849 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
8850
8851 #elif VECT_SIZE == 16
8852
8853 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8854 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8855 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8856 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8857 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
8858 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
8859 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
8860 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
8861 PACKVS24 (t0, t1, w0, w1, 8); append_0x01_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
8862 PACKVS24 (t0, t1, w0, w1, 9); append_0x01_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
8863 PACKVS24 (t0, t1, w0, w1, a); append_0x01_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
8864 PACKVS24 (t0, t1, w0, w1, b); append_0x01_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
8865 PACKVS24 (t0, t1, w0, w1, c); append_0x01_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
8866 PACKVS24 (t0, t1, w0, w1, d); append_0x01_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
8867 PACKVS24 (t0, t1, w0, w1, e); append_0x01_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
8868 PACKVS24 (t0, t1, w0, w1, f); append_0x01_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
8869
8870 #endif
8871 }
8872
8873 inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
8874 {
8875 #if VECT_SIZE == 1
8876
8877 append_0x80_2x4_S (w0, w1, offset);
8878
8879 #else
8880
8881 u32 t0[4];
8882 u32 t1[4];
8883
8884 #endif
8885
8886 #if VECT_SIZE == 2
8887
8888 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8889 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8890
8891 #elif VECT_SIZE == 4
8892
8893 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8894 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8895 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8896 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8897
8898 #elif VECT_SIZE == 8
8899
8900 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8901 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8902 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8903 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8904 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
8905 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
8906 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
8907 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
8908
8909 #elif VECT_SIZE == 16
8910
8911 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8912 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8913 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8914 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8915 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
8916 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
8917 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
8918 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
8919 PACKVS24 (t0, t1, w0, w1, 8); append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
8920 PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
8921 PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
8922 PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
8923 PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
8924 PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
8925 PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
8926 PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
8927
8928 #endif
8929 }
8930
8931 inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
8932 {
8933 #if VECT_SIZE == 1
8934
8935 append_0x80_4x4_S (w0, w1, w2, w3, offset);
8936
8937 #else
8938
8939 u32 t0[4];
8940 u32 t1[4];
8941 u32 t2[4];
8942 u32 t3[4];
8943
8944 #endif
8945
8946 #if VECT_SIZE == 2
8947
8948 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8949 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8950
8951 #elif VECT_SIZE == 4
8952
8953 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8954 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8955 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8956 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8957
8958 #elif VECT_SIZE == 8
8959
8960 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8961 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8962 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8963 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8964 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
8965 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
8966 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
8967 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
8968
8969 #elif VECT_SIZE == 16
8970
8971 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8972 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8973 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8974 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8975 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
8976 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
8977 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
8978 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
8979 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x80_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
8980 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x80_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
8981 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x80_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
8982 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x80_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
8983 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x80_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
8984 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x80_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
8985 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x80_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
8986 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x80_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
8987
8988 #endif
8989 }