Fix compiler warnings
[hashcat.git] / OpenCL / common.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
6 /**
7 * pure scalar functions
8 */
9
10 inline int hash_comp (const u32 d1[4], __global u32 *d2)
11 {
12 if (d1[3] > d2[DGST_R3]) return ( 1);
13 if (d1[3] < d2[DGST_R3]) return (-1);
14 if (d1[2] > d2[DGST_R2]) return ( 1);
15 if (d1[2] < d2[DGST_R2]) return (-1);
16 if (d1[1] > d2[DGST_R1]) return ( 1);
17 if (d1[1] < d2[DGST_R1]) return (-1);
18 if (d1[0] > d2[DGST_R0]) return ( 1);
19 if (d1[0] < d2[DGST_R0]) return (-1);
20
21 return (0);
22 }
23
24 inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global digest_t *digests_buf)
25 {
26 for (u32 l = 0, r = digests_cnt; r; r >>= 1)
27 {
28 const u32 m = r >> 1;
29
30 const u32 c = l + m;
31
32 const int cmp = hash_comp (digest, digests_buf[c].digest_buf);
33
34 if (cmp > 0)
35 {
36 l += m + 1;
37
38 r--;
39 }
40
41 if (cmp == 0) return (c);
42 }
43
44 return (-1);
45 }
46
47 inline u32 check_bitmap (__global u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
48 {
49 return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f)));
50 }
51
52 inline u32 check (const u32 digest[2], __global u32 *bitmap_s1_a, __global u32 *bitmap_s1_b, __global u32 *bitmap_s1_c, __global u32 *bitmap_s1_d, __global u32 *bitmap_s2_a, __global u32 *bitmap_s2_b, __global u32 *bitmap_s2_c, __global u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
53 {
54 if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
55 if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
56 if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0);
57 if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0);
58
59 if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0);
60 if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0);
61 if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0);
62 if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0);
63
64 return (1);
65 }
66
67 inline void mark_hash (__global plain_t *plains_buf, __global u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
68 {
69 hashes_shown[hash_pos] = 1;
70
71 plains_buf[hash_pos].gidvid = (gid * 1) + 0;
72 plains_buf[hash_pos].il_pos = il_pos;
73 }
74
75 /**
76 * vector functions
77 */
78
79 inline void truncate_block (u32x w[4], const u32 len)
80 {
81 switch (len)
82 {
83 case 0: w[0] &= 0;
84 w[1] &= 0;
85 w[2] &= 0;
86 w[3] &= 0;
87 break;
88 case 1: w[0] &= 0x000000FF;
89 w[1] &= 0;
90 w[2] &= 0;
91 w[3] &= 0;
92 break;
93 case 2: w[0] &= 0x0000FFFF;
94 w[1] &= 0;
95 w[2] &= 0;
96 w[3] &= 0;
97 break;
98 case 3: w[0] &= 0x00FFFFFF;
99 w[1] &= 0;
100 w[2] &= 0;
101 w[3] &= 0;
102 break;
103 case 4: w[1] &= 0;
104 w[2] &= 0;
105 w[3] &= 0;
106 break;
107 case 5: w[1] &= 0x000000FF;
108 w[2] &= 0;
109 w[3] &= 0;
110 break;
111 case 6: w[1] &= 0x0000FFFF;
112 w[2] &= 0;
113 w[3] &= 0;
114 break;
115 case 7: w[1] &= 0x00FFFFFF;
116 w[2] &= 0;
117 w[3] &= 0;
118 break;
119 case 8: w[2] &= 0;
120 w[3] &= 0;
121 break;
122 case 9: w[2] &= 0x000000FF;
123 w[3] &= 0;
124 break;
125 case 10: w[2] &= 0x0000FFFF;
126 w[3] &= 0;
127 break;
128 case 11: w[2] &= 0x00FFFFFF;
129 w[3] &= 0;
130 break;
131 case 12: w[3] &= 0;
132 break;
133 case 13: w[3] &= 0x000000FF;
134 break;
135 case 14: w[3] &= 0x0000FFFF;
136 break;
137 case 15: w[3] &= 0x00FFFFFF;
138 break;
139 }
140 }
141
142 inline void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
143 {
144 #ifdef IS_NV
145 out2[3] = __byte_perm (in[3], 0, 0x7372);
146 out2[2] = __byte_perm (in[3], 0, 0x7170);
147 out2[1] = __byte_perm (in[2], 0, 0x7372);
148 out2[0] = __byte_perm (in[2], 0, 0x7170);
149 out1[3] = __byte_perm (in[1], 0, 0x7372);
150 out1[2] = __byte_perm (in[1], 0, 0x7170);
151 out1[1] = __byte_perm (in[0], 0, 0x7372);
152 out1[0] = __byte_perm (in[0], 0, 0x7170);
153 #endif
154
155 #if defined IS_AMD || defined IS_GENERIC
156 out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
157 out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
158 out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
159 out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
160 out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
161 out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
162 out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
163 out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
164 #endif
165 }
166
167 inline void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
168 {
169 #ifdef IS_NV
170 out[0] = __byte_perm (in1[0], in1[1], 0x6420);
171 out[1] = __byte_perm (in1[2], in1[3], 0x6420);
172 out[2] = __byte_perm (in2[0], in2[1], 0x6420);
173 out[3] = __byte_perm (in2[2], in2[3], 0x6420);
174 #endif
175
176 #if defined IS_AMD || defined IS_GENERIC
177 out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
178 | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
179 out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
180 | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
181 out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
182 | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
183 out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
184 | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
185 #endif
186 }
187
188 inline void append_0x01_1x4 (u32x w0[4], const u32 offset)
189 {
190 switch (offset)
191 {
192 case 0:
193 w0[0] = 0x01;
194 break;
195
196 case 1:
197 w0[0] = w0[0] | 0x0100;
198 break;
199
200 case 2:
201 w0[0] = w0[0] | 0x010000;
202 break;
203
204 case 3:
205 w0[0] = w0[0] | 0x01000000;
206 break;
207
208 case 4:
209 w0[1] = 0x01;
210 break;
211
212 case 5:
213 w0[1] = w0[1] | 0x0100;
214 break;
215
216 case 6:
217 w0[1] = w0[1] | 0x010000;
218 break;
219
220 case 7:
221 w0[1] = w0[1] | 0x01000000;
222 break;
223
224 case 8:
225 w0[2] = 0x01;
226 break;
227
228 case 9:
229 w0[2] = w0[2] | 0x0100;
230 break;
231
232 case 10:
233 w0[2] = w0[2] | 0x010000;
234 break;
235
236 case 11:
237 w0[2] = w0[2] | 0x01000000;
238 break;
239
240 case 12:
241 w0[3] = 0x01;
242 break;
243
244 case 13:
245 w0[3] = w0[3] | 0x0100;
246 break;
247
248 case 14:
249 w0[3] = w0[3] | 0x010000;
250 break;
251
252 case 15:
253 w0[3] = w0[3] | 0x01000000;
254 break;
255 }
256 }
257
258 inline void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
259 {
260 switch (offset)
261 {
262 case 0:
263 w0[0] = 0x01;
264 break;
265
266 case 1:
267 w0[0] = w0[0] | 0x0100;
268 break;
269
270 case 2:
271 w0[0] = w0[0] | 0x010000;
272 break;
273
274 case 3:
275 w0[0] = w0[0] | 0x01000000;
276 break;
277
278 case 4:
279 w0[1] = 0x01;
280 break;
281
282 case 5:
283 w0[1] = w0[1] | 0x0100;
284 break;
285
286 case 6:
287 w0[1] = w0[1] | 0x010000;
288 break;
289
290 case 7:
291 w0[1] = w0[1] | 0x01000000;
292 break;
293
294 case 8:
295 w0[2] = 0x01;
296 break;
297
298 case 9:
299 w0[2] = w0[2] | 0x0100;
300 break;
301
302 case 10:
303 w0[2] = w0[2] | 0x010000;
304 break;
305
306 case 11:
307 w0[2] = w0[2] | 0x01000000;
308 break;
309
310 case 12:
311 w0[3] = 0x01;
312 break;
313
314 case 13:
315 w0[3] = w0[3] | 0x0100;
316 break;
317
318 case 14:
319 w0[3] = w0[3] | 0x010000;
320 break;
321
322 case 15:
323 w0[3] = w0[3] | 0x01000000;
324 break;
325
326 case 16:
327 w1[0] = 0x01;
328 break;
329
330 case 17:
331 w1[0] = w1[0] | 0x0100;
332 break;
333
334 case 18:
335 w1[0] = w1[0] | 0x010000;
336 break;
337
338 case 19:
339 w1[0] = w1[0] | 0x01000000;
340 break;
341
342 case 20:
343 w1[1] = 0x01;
344 break;
345
346 case 21:
347 w1[1] = w1[1] | 0x0100;
348 break;
349
350 case 22:
351 w1[1] = w1[1] | 0x010000;
352 break;
353
354 case 23:
355 w1[1] = w1[1] | 0x01000000;
356 break;
357
358 case 24:
359 w1[2] = 0x01;
360 break;
361
362 case 25:
363 w1[2] = w1[2] | 0x0100;
364 break;
365
366 case 26:
367 w1[2] = w1[2] | 0x010000;
368 break;
369
370 case 27:
371 w1[2] = w1[2] | 0x01000000;
372 break;
373
374 case 28:
375 w1[3] = 0x01;
376 break;
377
378 case 29:
379 w1[3] = w1[3] | 0x0100;
380 break;
381
382 case 30:
383 w1[3] = w1[3] | 0x010000;
384 break;
385
386 case 31:
387 w1[3] = w1[3] | 0x01000000;
388 break;
389 }
390 }
391
392 inline void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
393 {
394 switch (offset)
395 {
396 case 0:
397 w0[0] = 0x01;
398 break;
399
400 case 1:
401 w0[0] = w0[0] | 0x0100;
402 break;
403
404 case 2:
405 w0[0] = w0[0] | 0x010000;
406 break;
407
408 case 3:
409 w0[0] = w0[0] | 0x01000000;
410 break;
411
412 case 4:
413 w0[1] = 0x01;
414 break;
415
416 case 5:
417 w0[1] = w0[1] | 0x0100;
418 break;
419
420 case 6:
421 w0[1] = w0[1] | 0x010000;
422 break;
423
424 case 7:
425 w0[1] = w0[1] | 0x01000000;
426 break;
427
428 case 8:
429 w0[2] = 0x01;
430 break;
431
432 case 9:
433 w0[2] = w0[2] | 0x0100;
434 break;
435
436 case 10:
437 w0[2] = w0[2] | 0x010000;
438 break;
439
440 case 11:
441 w0[2] = w0[2] | 0x01000000;
442 break;
443
444 case 12:
445 w0[3] = 0x01;
446 break;
447
448 case 13:
449 w0[3] = w0[3] | 0x0100;
450 break;
451
452 case 14:
453 w0[3] = w0[3] | 0x010000;
454 break;
455
456 case 15:
457 w0[3] = w0[3] | 0x01000000;
458 break;
459
460 case 16:
461 w1[0] = 0x01;
462 break;
463
464 case 17:
465 w1[0] = w1[0] | 0x0100;
466 break;
467
468 case 18:
469 w1[0] = w1[0] | 0x010000;
470 break;
471
472 case 19:
473 w1[0] = w1[0] | 0x01000000;
474 break;
475
476 case 20:
477 w1[1] = 0x01;
478 break;
479
480 case 21:
481 w1[1] = w1[1] | 0x0100;
482 break;
483
484 case 22:
485 w1[1] = w1[1] | 0x010000;
486 break;
487
488 case 23:
489 w1[1] = w1[1] | 0x01000000;
490 break;
491
492 case 24:
493 w1[2] = 0x01;
494 break;
495
496 case 25:
497 w1[2] = w1[2] | 0x0100;
498 break;
499
500 case 26:
501 w1[2] = w1[2] | 0x010000;
502 break;
503
504 case 27:
505 w1[2] = w1[2] | 0x01000000;
506 break;
507
508 case 28:
509 w1[3] = 0x01;
510 break;
511
512 case 29:
513 w1[3] = w1[3] | 0x0100;
514 break;
515
516 case 30:
517 w1[3] = w1[3] | 0x010000;
518 break;
519
520 case 31:
521 w1[3] = w1[3] | 0x01000000;
522 break;
523
524 case 32:
525 w2[0] = 0x01;
526 break;
527
528 case 33:
529 w2[0] = w2[0] | 0x0100;
530 break;
531
532 case 34:
533 w2[0] = w2[0] | 0x010000;
534 break;
535
536 case 35:
537 w2[0] = w2[0] | 0x01000000;
538 break;
539
540 case 36:
541 w2[1] = 0x01;
542 break;
543
544 case 37:
545 w2[1] = w2[1] | 0x0100;
546 break;
547
548 case 38:
549 w2[1] = w2[1] | 0x010000;
550 break;
551
552 case 39:
553 w2[1] = w2[1] | 0x01000000;
554 break;
555
556 case 40:
557 w2[2] = 0x01;
558 break;
559
560 case 41:
561 w2[2] = w2[2] | 0x0100;
562 break;
563
564 case 42:
565 w2[2] = w2[2] | 0x010000;
566 break;
567
568 case 43:
569 w2[2] = w2[2] | 0x01000000;
570 break;
571
572 case 44:
573 w2[3] = 0x01;
574 break;
575
576 case 45:
577 w2[3] = w2[3] | 0x0100;
578 break;
579
580 case 46:
581 w2[3] = w2[3] | 0x010000;
582 break;
583
584 case 47:
585 w2[3] = w2[3] | 0x01000000;
586 break;
587 }
588 }
589
590 inline void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
591 {
592 switch (offset)
593 {
594 case 0:
595 w0[0] = 0x01;
596 break;
597
598 case 1:
599 w0[0] = w0[0] | 0x0100;
600 break;
601
602 case 2:
603 w0[0] = w0[0] | 0x010000;
604 break;
605
606 case 3:
607 w0[0] = w0[0] | 0x01000000;
608 break;
609
610 case 4:
611 w0[1] = 0x01;
612 break;
613
614 case 5:
615 w0[1] = w0[1] | 0x0100;
616 break;
617
618 case 6:
619 w0[1] = w0[1] | 0x010000;
620 break;
621
622 case 7:
623 w0[1] = w0[1] | 0x01000000;
624 break;
625
626 case 8:
627 w0[2] = 0x01;
628 break;
629
630 case 9:
631 w0[2] = w0[2] | 0x0100;
632 break;
633
634 case 10:
635 w0[2] = w0[2] | 0x010000;
636 break;
637
638 case 11:
639 w0[2] = w0[2] | 0x01000000;
640 break;
641
642 case 12:
643 w0[3] = 0x01;
644 break;
645
646 case 13:
647 w0[3] = w0[3] | 0x0100;
648 break;
649
650 case 14:
651 w0[3] = w0[3] | 0x010000;
652 break;
653
654 case 15:
655 w0[3] = w0[3] | 0x01000000;
656 break;
657
658 case 16:
659 w1[0] = 0x01;
660 break;
661
662 case 17:
663 w1[0] = w1[0] | 0x0100;
664 break;
665
666 case 18:
667 w1[0] = w1[0] | 0x010000;
668 break;
669
670 case 19:
671 w1[0] = w1[0] | 0x01000000;
672 break;
673
674 case 20:
675 w1[1] = 0x01;
676 break;
677
678 case 21:
679 w1[1] = w1[1] | 0x0100;
680 break;
681
682 case 22:
683 w1[1] = w1[1] | 0x010000;
684 break;
685
686 case 23:
687 w1[1] = w1[1] | 0x01000000;
688 break;
689
690 case 24:
691 w1[2] = 0x01;
692 break;
693
694 case 25:
695 w1[2] = w1[2] | 0x0100;
696 break;
697
698 case 26:
699 w1[2] = w1[2] | 0x010000;
700 break;
701
702 case 27:
703 w1[2] = w1[2] | 0x01000000;
704 break;
705
706 case 28:
707 w1[3] = 0x01;
708 break;
709
710 case 29:
711 w1[3] = w1[3] | 0x0100;
712 break;
713
714 case 30:
715 w1[3] = w1[3] | 0x010000;
716 break;
717
718 case 31:
719 w1[3] = w1[3] | 0x01000000;
720 break;
721
722 case 32:
723 w2[0] = 0x01;
724 break;
725
726 case 33:
727 w2[0] = w2[0] | 0x0100;
728 break;
729
730 case 34:
731 w2[0] = w2[0] | 0x010000;
732 break;
733
734 case 35:
735 w2[0] = w2[0] | 0x01000000;
736 break;
737
738 case 36:
739 w2[1] = 0x01;
740 break;
741
742 case 37:
743 w2[1] = w2[1] | 0x0100;
744 break;
745
746 case 38:
747 w2[1] = w2[1] | 0x010000;
748 break;
749
750 case 39:
751 w2[1] = w2[1] | 0x01000000;
752 break;
753
754 case 40:
755 w2[2] = 0x01;
756 break;
757
758 case 41:
759 w2[2] = w2[2] | 0x0100;
760 break;
761
762 case 42:
763 w2[2] = w2[2] | 0x010000;
764 break;
765
766 case 43:
767 w2[2] = w2[2] | 0x01000000;
768 break;
769
770 case 44:
771 w2[3] = 0x01;
772 break;
773
774 case 45:
775 w2[3] = w2[3] | 0x0100;
776 break;
777
778 case 46:
779 w2[3] = w2[3] | 0x010000;
780 break;
781
782 case 47:
783 w2[3] = w2[3] | 0x01000000;
784 break;
785
786 case 48:
787 w3[0] = 0x01;
788 break;
789
790 case 49:
791 w3[0] = w3[0] | 0x0100;
792 break;
793
794 case 50:
795 w3[0] = w3[0] | 0x010000;
796 break;
797
798 case 51:
799 w3[0] = w3[0] | 0x01000000;
800 break;
801
802 case 52:
803 w3[1] = 0x01;
804 break;
805
806 case 53:
807 w3[1] = w3[1] | 0x0100;
808 break;
809
810 case 54:
811 w3[1] = w3[1] | 0x010000;
812 break;
813
814 case 55:
815 w3[1] = w3[1] | 0x01000000;
816 break;
817
818 case 56:
819 w3[2] = 0x01;
820 break;
821
822 case 57:
823 w3[2] = w3[2] | 0x0100;
824 break;
825
826 case 58:
827 w3[2] = w3[2] | 0x010000;
828 break;
829
830 case 59:
831 w3[2] = w3[2] | 0x01000000;
832 break;
833
834 case 60:
835 w3[3] = 0x01;
836 break;
837
838 case 61:
839 w3[3] = w3[3] | 0x0100;
840 break;
841
842 case 62:
843 w3[3] = w3[3] | 0x010000;
844 break;
845
846 case 63:
847 w3[3] = w3[3] | 0x01000000;
848 break;
849 }
850 }
851
852 inline void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
853 {
854 switch (offset)
855 {
856 case 0:
857 w0[0] = 0x01;
858 break;
859
860 case 1:
861 w0[0] = w0[0] | 0x0100;
862 break;
863
864 case 2:
865 w0[0] = w0[0] | 0x010000;
866 break;
867
868 case 3:
869 w0[0] = w0[0] | 0x01000000;
870 break;
871
872 case 4:
873 w0[1] = 0x01;
874 break;
875
876 case 5:
877 w0[1] = w0[1] | 0x0100;
878 break;
879
880 case 6:
881 w0[1] = w0[1] | 0x010000;
882 break;
883
884 case 7:
885 w0[1] = w0[1] | 0x01000000;
886 break;
887
888 case 8:
889 w0[2] = 0x01;
890 break;
891
892 case 9:
893 w0[2] = w0[2] | 0x0100;
894 break;
895
896 case 10:
897 w0[2] = w0[2] | 0x010000;
898 break;
899
900 case 11:
901 w0[2] = w0[2] | 0x01000000;
902 break;
903
904 case 12:
905 w0[3] = 0x01;
906 break;
907
908 case 13:
909 w0[3] = w0[3] | 0x0100;
910 break;
911
912 case 14:
913 w0[3] = w0[3] | 0x010000;
914 break;
915
916 case 15:
917 w0[3] = w0[3] | 0x01000000;
918 break;
919
920 case 16:
921 w1[0] = 0x01;
922 break;
923
924 case 17:
925 w1[0] = w1[0] | 0x0100;
926 break;
927
928 case 18:
929 w1[0] = w1[0] | 0x010000;
930 break;
931
932 case 19:
933 w1[0] = w1[0] | 0x01000000;
934 break;
935
936 case 20:
937 w1[1] = 0x01;
938 break;
939
940 case 21:
941 w1[1] = w1[1] | 0x0100;
942 break;
943
944 case 22:
945 w1[1] = w1[1] | 0x010000;
946 break;
947
948 case 23:
949 w1[1] = w1[1] | 0x01000000;
950 break;
951
952 case 24:
953 w1[2] = 0x01;
954 break;
955
956 case 25:
957 w1[2] = w1[2] | 0x0100;
958 break;
959
960 case 26:
961 w1[2] = w1[2] | 0x010000;
962 break;
963
964 case 27:
965 w1[2] = w1[2] | 0x01000000;
966 break;
967
968 case 28:
969 w1[3] = 0x01;
970 break;
971
972 case 29:
973 w1[3] = w1[3] | 0x0100;
974 break;
975
976 case 30:
977 w1[3] = w1[3] | 0x010000;
978 break;
979
980 case 31:
981 w1[3] = w1[3] | 0x01000000;
982 break;
983
984 case 32:
985 w2[0] = 0x01;
986 break;
987
988 case 33:
989 w2[0] = w2[0] | 0x0100;
990 break;
991
992 case 34:
993 w2[0] = w2[0] | 0x010000;
994 break;
995
996 case 35:
997 w2[0] = w2[0] | 0x01000000;
998 break;
999
1000 case 36:
1001 w2[1] = 0x01;
1002 break;
1003
1004 case 37:
1005 w2[1] = w2[1] | 0x0100;
1006 break;
1007
1008 case 38:
1009 w2[1] = w2[1] | 0x010000;
1010 break;
1011
1012 case 39:
1013 w2[1] = w2[1] | 0x01000000;
1014 break;
1015
1016 case 40:
1017 w2[2] = 0x01;
1018 break;
1019
1020 case 41:
1021 w2[2] = w2[2] | 0x0100;
1022 break;
1023
1024 case 42:
1025 w2[2] = w2[2] | 0x010000;
1026 break;
1027
1028 case 43:
1029 w2[2] = w2[2] | 0x01000000;
1030 break;
1031
1032 case 44:
1033 w2[3] = 0x01;
1034 break;
1035
1036 case 45:
1037 w2[3] = w2[3] | 0x0100;
1038 break;
1039
1040 case 46:
1041 w2[3] = w2[3] | 0x010000;
1042 break;
1043
1044 case 47:
1045 w2[3] = w2[3] | 0x01000000;
1046 break;
1047
1048 case 48:
1049 w3[0] = 0x01;
1050 break;
1051
1052 case 49:
1053 w3[0] = w3[0] | 0x0100;
1054 break;
1055
1056 case 50:
1057 w3[0] = w3[0] | 0x010000;
1058 break;
1059
1060 case 51:
1061 w3[0] = w3[0] | 0x01000000;
1062 break;
1063
1064 case 52:
1065 w3[1] = 0x01;
1066 break;
1067
1068 case 53:
1069 w3[1] = w3[1] | 0x0100;
1070 break;
1071
1072 case 54:
1073 w3[1] = w3[1] | 0x010000;
1074 break;
1075
1076 case 55:
1077 w3[1] = w3[1] | 0x01000000;
1078 break;
1079
1080 case 56:
1081 w3[2] = 0x01;
1082 break;
1083
1084 case 57:
1085 w3[2] = w3[2] | 0x0100;
1086 break;
1087
1088 case 58:
1089 w3[2] = w3[2] | 0x010000;
1090 break;
1091
1092 case 59:
1093 w3[2] = w3[2] | 0x01000000;
1094 break;
1095
1096 case 60:
1097 w3[3] = 0x01;
1098 break;
1099
1100 case 61:
1101 w3[3] = w3[3] | 0x0100;
1102 break;
1103
1104 case 62:
1105 w3[3] = w3[3] | 0x010000;
1106 break;
1107
1108 case 63:
1109 w3[3] = w3[3] | 0x01000000;
1110 break;
1111
1112 case 64:
1113 w4[0] = 0x01;
1114 break;
1115
1116 case 65:
1117 w4[0] = w4[0] | 0x0100;
1118 break;
1119
1120 case 66:
1121 w4[0] = w4[0] | 0x010000;
1122 break;
1123
1124 case 67:
1125 w4[0] = w4[0] | 0x01000000;
1126 break;
1127
1128 case 68:
1129 w4[1] = 0x01;
1130 break;
1131
1132 case 69:
1133 w4[1] = w4[1] | 0x0100;
1134 break;
1135
1136 case 70:
1137 w4[1] = w4[1] | 0x010000;
1138 break;
1139
1140 case 71:
1141 w4[1] = w4[1] | 0x01000000;
1142 break;
1143
1144 case 72:
1145 w4[2] = 0x01;
1146 break;
1147
1148 case 73:
1149 w4[2] = w4[2] | 0x0100;
1150 break;
1151
1152 case 74:
1153 w4[2] = w4[2] | 0x010000;
1154 break;
1155
1156 case 75:
1157 w4[2] = w4[2] | 0x01000000;
1158 break;
1159
1160 case 76:
1161 w4[3] = 0x01;
1162 break;
1163
1164 case 77:
1165 w4[3] = w4[3] | 0x0100;
1166 break;
1167
1168 case 78:
1169 w4[3] = w4[3] | 0x010000;
1170 break;
1171
1172 case 79:
1173 w4[3] = w4[3] | 0x01000000;
1174 break;
1175
1176 case 80:
1177 w5[0] = 0x01;
1178 break;
1179
1180 case 81:
1181 w5[0] = w5[0] | 0x0100;
1182 break;
1183
1184 case 82:
1185 w5[0] = w5[0] | 0x010000;
1186 break;
1187
1188 case 83:
1189 w5[0] = w5[0] | 0x01000000;
1190 break;
1191
1192 case 84:
1193 w5[1] = 0x01;
1194 break;
1195
1196 case 85:
1197 w5[1] = w5[1] | 0x0100;
1198 break;
1199
1200 case 86:
1201 w5[1] = w5[1] | 0x010000;
1202 break;
1203
1204 case 87:
1205 w5[1] = w5[1] | 0x01000000;
1206 break;
1207
1208 case 88:
1209 w5[2] = 0x01;
1210 break;
1211
1212 case 89:
1213 w5[2] = w5[2] | 0x0100;
1214 break;
1215
1216 case 90:
1217 w5[2] = w5[2] | 0x010000;
1218 break;
1219
1220 case 91:
1221 w5[2] = w5[2] | 0x01000000;
1222 break;
1223
1224 case 92:
1225 w5[3] = 0x01;
1226 break;
1227
1228 case 93:
1229 w5[3] = w5[3] | 0x0100;
1230 break;
1231
1232 case 94:
1233 w5[3] = w5[3] | 0x010000;
1234 break;
1235
1236 case 95:
1237 w5[3] = w5[3] | 0x01000000;
1238 break;
1239
1240 case 96:
1241 w6[0] = 0x01;
1242 break;
1243
1244 case 97:
1245 w6[0] = w6[0] | 0x0100;
1246 break;
1247
1248 case 98:
1249 w6[0] = w6[0] | 0x010000;
1250 break;
1251
1252 case 99:
1253 w6[0] = w6[0] | 0x01000000;
1254 break;
1255
1256 case 100:
1257 w6[1] = 0x01;
1258 break;
1259
1260 case 101:
1261 w6[1] = w6[1] | 0x0100;
1262 break;
1263
1264 case 102:
1265 w6[1] = w6[1] | 0x010000;
1266 break;
1267
1268 case 103:
1269 w6[1] = w6[1] | 0x01000000;
1270 break;
1271
1272 case 104:
1273 w6[2] = 0x01;
1274 break;
1275
1276 case 105:
1277 w6[2] = w6[2] | 0x0100;
1278 break;
1279
1280 case 106:
1281 w6[2] = w6[2] | 0x010000;
1282 break;
1283
1284 case 107:
1285 w6[2] = w6[2] | 0x01000000;
1286 break;
1287
1288 case 108:
1289 w6[3] = 0x01;
1290 break;
1291
1292 case 109:
1293 w6[3] = w6[3] | 0x0100;
1294 break;
1295
1296 case 110:
1297 w6[3] = w6[3] | 0x010000;
1298 break;
1299
1300 case 111:
1301 w6[3] = w6[3] | 0x01000000;
1302 break;
1303
1304 case 112:
1305 w7[0] = 0x01;
1306 break;
1307
1308 case 113:
1309 w7[0] = w7[0] | 0x0100;
1310 break;
1311
1312 case 114:
1313 w7[0] = w7[0] | 0x010000;
1314 break;
1315
1316 case 115:
1317 w7[0] = w7[0] | 0x01000000;
1318 break;
1319
1320 case 116:
1321 w7[1] = 0x01;
1322 break;
1323
1324 case 117:
1325 w7[1] = w7[1] | 0x0100;
1326 break;
1327
1328 case 118:
1329 w7[1] = w7[1] | 0x010000;
1330 break;
1331
1332 case 119:
1333 w7[1] = w7[1] | 0x01000000;
1334 break;
1335
1336 case 120:
1337 w7[2] = 0x01;
1338 break;
1339
1340 case 121:
1341 w7[2] = w7[2] | 0x0100;
1342 break;
1343
1344 case 122:
1345 w7[2] = w7[2] | 0x010000;
1346 break;
1347
1348 case 123:
1349 w7[2] = w7[2] | 0x01000000;
1350 break;
1351
1352 case 124:
1353 w7[3] = 0x01;
1354 break;
1355
1356 case 125:
1357 w7[3] = w7[3] | 0x0100;
1358 break;
1359
1360 case 126:
1361 w7[3] = w7[3] | 0x010000;
1362 break;
1363
1364 case 127:
1365 w7[3] = w7[3] | 0x01000000;
1366 break;
1367 }
1368 }
1369
1370 inline void append_0x02_1x4 (u32x w0[4], const u32 offset)
1371 {
1372 switch (offset)
1373 {
1374 case 0:
1375 w0[0] = 0x02;
1376 break;
1377
1378 case 1:
1379 w0[0] = w0[0] | 0x0200;
1380 break;
1381
1382 case 2:
1383 w0[0] = w0[0] | 0x020000;
1384 break;
1385
1386 case 3:
1387 w0[0] = w0[0] | 0x02000000;
1388 break;
1389
1390 case 4:
1391 w0[1] = 0x02;
1392 break;
1393
1394 case 5:
1395 w0[1] = w0[1] | 0x0200;
1396 break;
1397
1398 case 6:
1399 w0[1] = w0[1] | 0x020000;
1400 break;
1401
1402 case 7:
1403 w0[1] = w0[1] | 0x02000000;
1404 break;
1405
1406 case 8:
1407 w0[2] = 0x02;
1408 break;
1409
1410 case 9:
1411 w0[2] = w0[2] | 0x0200;
1412 break;
1413
1414 case 10:
1415 w0[2] = w0[2] | 0x020000;
1416 break;
1417
1418 case 11:
1419 w0[2] = w0[2] | 0x02000000;
1420 break;
1421
1422 case 12:
1423 w0[3] = 0x02;
1424 break;
1425
1426 case 13:
1427 w0[3] = w0[3] | 0x0200;
1428 break;
1429
1430 case 14:
1431 w0[3] = w0[3] | 0x020000;
1432 break;
1433
1434 case 15:
1435 w0[3] = w0[3] | 0x02000000;
1436 break;
1437 }
1438 }
1439
1440 inline void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
1441 {
1442 switch (offset)
1443 {
1444 case 0:
1445 w0[0] = 0x02;
1446 break;
1447
1448 case 1:
1449 w0[0] = w0[0] | 0x0200;
1450 break;
1451
1452 case 2:
1453 w0[0] = w0[0] | 0x020000;
1454 break;
1455
1456 case 3:
1457 w0[0] = w0[0] | 0x02000000;
1458 break;
1459
1460 case 4:
1461 w0[1] = 0x02;
1462 break;
1463
1464 case 5:
1465 w0[1] = w0[1] | 0x0200;
1466 break;
1467
1468 case 6:
1469 w0[1] = w0[1] | 0x020000;
1470 break;
1471
1472 case 7:
1473 w0[1] = w0[1] | 0x02000000;
1474 break;
1475
1476 case 8:
1477 w0[2] = 0x02;
1478 break;
1479
1480 case 9:
1481 w0[2] = w0[2] | 0x0200;
1482 break;
1483
1484 case 10:
1485 w0[2] = w0[2] | 0x020000;
1486 break;
1487
1488 case 11:
1489 w0[2] = w0[2] | 0x02000000;
1490 break;
1491
1492 case 12:
1493 w0[3] = 0x02;
1494 break;
1495
1496 case 13:
1497 w0[3] = w0[3] | 0x0200;
1498 break;
1499
1500 case 14:
1501 w0[3] = w0[3] | 0x020000;
1502 break;
1503
1504 case 15:
1505 w0[3] = w0[3] | 0x02000000;
1506 break;
1507
1508 case 16:
1509 w1[0] = 0x02;
1510 break;
1511
1512 case 17:
1513 w1[0] = w1[0] | 0x0200;
1514 break;
1515
1516 case 18:
1517 w1[0] = w1[0] | 0x020000;
1518 break;
1519
1520 case 19:
1521 w1[0] = w1[0] | 0x02000000;
1522 break;
1523
1524 case 20:
1525 w1[1] = 0x02;
1526 break;
1527
1528 case 21:
1529 w1[1] = w1[1] | 0x0200;
1530 break;
1531
1532 case 22:
1533 w1[1] = w1[1] | 0x020000;
1534 break;
1535
1536 case 23:
1537 w1[1] = w1[1] | 0x02000000;
1538 break;
1539
1540 case 24:
1541 w1[2] = 0x02;
1542 break;
1543
1544 case 25:
1545 w1[2] = w1[2] | 0x0200;
1546 break;
1547
1548 case 26:
1549 w1[2] = w1[2] | 0x020000;
1550 break;
1551
1552 case 27:
1553 w1[2] = w1[2] | 0x02000000;
1554 break;
1555
1556 case 28:
1557 w1[3] = 0x02;
1558 break;
1559
1560 case 29:
1561 w1[3] = w1[3] | 0x0200;
1562 break;
1563
1564 case 30:
1565 w1[3] = w1[3] | 0x020000;
1566 break;
1567
1568 case 31:
1569 w1[3] = w1[3] | 0x02000000;
1570 break;
1571 }
1572 }
1573
1574 inline void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
1575 {
1576 switch (offset)
1577 {
1578 case 0:
1579 w0[0] = 0x02;
1580 break;
1581
1582 case 1:
1583 w0[0] = w0[0] | 0x0200;
1584 break;
1585
1586 case 2:
1587 w0[0] = w0[0] | 0x020000;
1588 break;
1589
1590 case 3:
1591 w0[0] = w0[0] | 0x02000000;
1592 break;
1593
1594 case 4:
1595 w0[1] = 0x02;
1596 break;
1597
1598 case 5:
1599 w0[1] = w0[1] | 0x0200;
1600 break;
1601
1602 case 6:
1603 w0[1] = w0[1] | 0x020000;
1604 break;
1605
1606 case 7:
1607 w0[1] = w0[1] | 0x02000000;
1608 break;
1609
1610 case 8:
1611 w0[2] = 0x02;
1612 break;
1613
1614 case 9:
1615 w0[2] = w0[2] | 0x0200;
1616 break;
1617
1618 case 10:
1619 w0[2] = w0[2] | 0x020000;
1620 break;
1621
1622 case 11:
1623 w0[2] = w0[2] | 0x02000000;
1624 break;
1625
1626 case 12:
1627 w0[3] = 0x02;
1628 break;
1629
1630 case 13:
1631 w0[3] = w0[3] | 0x0200;
1632 break;
1633
1634 case 14:
1635 w0[3] = w0[3] | 0x020000;
1636 break;
1637
1638 case 15:
1639 w0[3] = w0[3] | 0x02000000;
1640 break;
1641
1642 case 16:
1643 w1[0] = 0x02;
1644 break;
1645
1646 case 17:
1647 w1[0] = w1[0] | 0x0200;
1648 break;
1649
1650 case 18:
1651 w1[0] = w1[0] | 0x020000;
1652 break;
1653
1654 case 19:
1655 w1[0] = w1[0] | 0x02000000;
1656 break;
1657
1658 case 20:
1659 w1[1] = 0x02;
1660 break;
1661
1662 case 21:
1663 w1[1] = w1[1] | 0x0200;
1664 break;
1665
1666 case 22:
1667 w1[1] = w1[1] | 0x020000;
1668 break;
1669
1670 case 23:
1671 w1[1] = w1[1] | 0x02000000;
1672 break;
1673
1674 case 24:
1675 w1[2] = 0x02;
1676 break;
1677
1678 case 25:
1679 w1[2] = w1[2] | 0x0200;
1680 break;
1681
1682 case 26:
1683 w1[2] = w1[2] | 0x020000;
1684 break;
1685
1686 case 27:
1687 w1[2] = w1[2] | 0x02000000;
1688 break;
1689
1690 case 28:
1691 w1[3] = 0x02;
1692 break;
1693
1694 case 29:
1695 w1[3] = w1[3] | 0x0200;
1696 break;
1697
1698 case 30:
1699 w1[3] = w1[3] | 0x020000;
1700 break;
1701
1702 case 31:
1703 w1[3] = w1[3] | 0x02000000;
1704 break;
1705
1706 case 32:
1707 w2[0] = 0x02;
1708 break;
1709
1710 case 33:
1711 w2[0] = w2[0] | 0x0200;
1712 break;
1713
1714 case 34:
1715 w2[0] = w2[0] | 0x020000;
1716 break;
1717
1718 case 35:
1719 w2[0] = w2[0] | 0x02000000;
1720 break;
1721
1722 case 36:
1723 w2[1] = 0x02;
1724 break;
1725
1726 case 37:
1727 w2[1] = w2[1] | 0x0200;
1728 break;
1729
1730 case 38:
1731 w2[1] = w2[1] | 0x020000;
1732 break;
1733
1734 case 39:
1735 w2[1] = w2[1] | 0x02000000;
1736 break;
1737
1738 case 40:
1739 w2[2] = 0x02;
1740 break;
1741
1742 case 41:
1743 w2[2] = w2[2] | 0x0200;
1744 break;
1745
1746 case 42:
1747 w2[2] = w2[2] | 0x020000;
1748 break;
1749
1750 case 43:
1751 w2[2] = w2[2] | 0x02000000;
1752 break;
1753
1754 case 44:
1755 w2[3] = 0x02;
1756 break;
1757
1758 case 45:
1759 w2[3] = w2[3] | 0x0200;
1760 break;
1761
1762 case 46:
1763 w2[3] = w2[3] | 0x020000;
1764 break;
1765
1766 case 47:
1767 w2[3] = w2[3] | 0x02000000;
1768 break;
1769 }
1770 }
1771
1772 inline void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
1773 {
1774 switch (offset)
1775 {
1776 case 0:
1777 w0[0] = 0x02;
1778 break;
1779
1780 case 1:
1781 w0[0] = w0[0] | 0x0200;
1782 break;
1783
1784 case 2:
1785 w0[0] = w0[0] | 0x020000;
1786 break;
1787
1788 case 3:
1789 w0[0] = w0[0] | 0x02000000;
1790 break;
1791
1792 case 4:
1793 w0[1] = 0x02;
1794 break;
1795
1796 case 5:
1797 w0[1] = w0[1] | 0x0200;
1798 break;
1799
1800 case 6:
1801 w0[1] = w0[1] | 0x020000;
1802 break;
1803
1804 case 7:
1805 w0[1] = w0[1] | 0x02000000;
1806 break;
1807
1808 case 8:
1809 w0[2] = 0x02;
1810 break;
1811
1812 case 9:
1813 w0[2] = w0[2] | 0x0200;
1814 break;
1815
1816 case 10:
1817 w0[2] = w0[2] | 0x020000;
1818 break;
1819
1820 case 11:
1821 w0[2] = w0[2] | 0x02000000;
1822 break;
1823
1824 case 12:
1825 w0[3] = 0x02;
1826 break;
1827
1828 case 13:
1829 w0[3] = w0[3] | 0x0200;
1830 break;
1831
1832 case 14:
1833 w0[3] = w0[3] | 0x020000;
1834 break;
1835
1836 case 15:
1837 w0[3] = w0[3] | 0x02000000;
1838 break;
1839
1840 case 16:
1841 w1[0] = 0x02;
1842 break;
1843
1844 case 17:
1845 w1[0] = w1[0] | 0x0200;
1846 break;
1847
1848 case 18:
1849 w1[0] = w1[0] | 0x020000;
1850 break;
1851
1852 case 19:
1853 w1[0] = w1[0] | 0x02000000;
1854 break;
1855
1856 case 20:
1857 w1[1] = 0x02;
1858 break;
1859
1860 case 21:
1861 w1[1] = w1[1] | 0x0200;
1862 break;
1863
1864 case 22:
1865 w1[1] = w1[1] | 0x020000;
1866 break;
1867
1868 case 23:
1869 w1[1] = w1[1] | 0x02000000;
1870 break;
1871
1872 case 24:
1873 w1[2] = 0x02;
1874 break;
1875
1876 case 25:
1877 w1[2] = w1[2] | 0x0200;
1878 break;
1879
1880 case 26:
1881 w1[2] = w1[2] | 0x020000;
1882 break;
1883
1884 case 27:
1885 w1[2] = w1[2] | 0x02000000;
1886 break;
1887
1888 case 28:
1889 w1[3] = 0x02;
1890 break;
1891
1892 case 29:
1893 w1[3] = w1[3] | 0x0200;
1894 break;
1895
1896 case 30:
1897 w1[3] = w1[3] | 0x020000;
1898 break;
1899
1900 case 31:
1901 w1[3] = w1[3] | 0x02000000;
1902 break;
1903
1904 case 32:
1905 w2[0] = 0x02;
1906 break;
1907
1908 case 33:
1909 w2[0] = w2[0] | 0x0200;
1910 break;
1911
1912 case 34:
1913 w2[0] = w2[0] | 0x020000;
1914 break;
1915
1916 case 35:
1917 w2[0] = w2[0] | 0x02000000;
1918 break;
1919
1920 case 36:
1921 w2[1] = 0x02;
1922 break;
1923
1924 case 37:
1925 w2[1] = w2[1] | 0x0200;
1926 break;
1927
1928 case 38:
1929 w2[1] = w2[1] | 0x020000;
1930 break;
1931
1932 case 39:
1933 w2[1] = w2[1] | 0x02000000;
1934 break;
1935
1936 case 40:
1937 w2[2] = 0x02;
1938 break;
1939
1940 case 41:
1941 w2[2] = w2[2] | 0x0200;
1942 break;
1943
1944 case 42:
1945 w2[2] = w2[2] | 0x020000;
1946 break;
1947
1948 case 43:
1949 w2[2] = w2[2] | 0x02000000;
1950 break;
1951
1952 case 44:
1953 w2[3] = 0x02;
1954 break;
1955
1956 case 45:
1957 w2[3] = w2[3] | 0x0200;
1958 break;
1959
1960 case 46:
1961 w2[3] = w2[3] | 0x020000;
1962 break;
1963
1964 case 47:
1965 w2[3] = w2[3] | 0x02000000;
1966 break;
1967
1968 case 48:
1969 w3[0] = 0x02;
1970 break;
1971
1972 case 49:
1973 w3[0] = w3[0] | 0x0200;
1974 break;
1975
1976 case 50:
1977 w3[0] = w3[0] | 0x020000;
1978 break;
1979
1980 case 51:
1981 w3[0] = w3[0] | 0x02000000;
1982 break;
1983
1984 case 52:
1985 w3[1] = 0x02;
1986 break;
1987
1988 case 53:
1989 w3[1] = w3[1] | 0x0200;
1990 break;
1991
1992 case 54:
1993 w3[1] = w3[1] | 0x020000;
1994 break;
1995
1996 case 55:
1997 w3[1] = w3[1] | 0x02000000;
1998 break;
1999
2000 case 56:
2001 w3[2] = 0x02;
2002 break;
2003
2004 case 57:
2005 w3[2] = w3[2] | 0x0200;
2006 break;
2007
2008 case 58:
2009 w3[2] = w3[2] | 0x020000;
2010 break;
2011
2012 case 59:
2013 w3[2] = w3[2] | 0x02000000;
2014 break;
2015
2016 case 60:
2017 w3[3] = 0x02;
2018 break;
2019
2020 case 61:
2021 w3[3] = w3[3] | 0x0200;
2022 break;
2023
2024 case 62:
2025 w3[3] = w3[3] | 0x020000;
2026 break;
2027
2028 case 63:
2029 w3[3] = w3[3] | 0x02000000;
2030 break;
2031 }
2032 }
2033
2034 inline void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
2035 {
2036 switch (offset)
2037 {
2038 case 0:
2039 w0[0] = 0x02;
2040 break;
2041
2042 case 1:
2043 w0[0] = w0[0] | 0x0200;
2044 break;
2045
2046 case 2:
2047 w0[0] = w0[0] | 0x020000;
2048 break;
2049
2050 case 3:
2051 w0[0] = w0[0] | 0x02000000;
2052 break;
2053
2054 case 4:
2055 w0[1] = 0x02;
2056 break;
2057
2058 case 5:
2059 w0[1] = w0[1] | 0x0200;
2060 break;
2061
2062 case 6:
2063 w0[1] = w0[1] | 0x020000;
2064 break;
2065
2066 case 7:
2067 w0[1] = w0[1] | 0x02000000;
2068 break;
2069
2070 case 8:
2071 w0[2] = 0x02;
2072 break;
2073
2074 case 9:
2075 w0[2] = w0[2] | 0x0200;
2076 break;
2077
2078 case 10:
2079 w0[2] = w0[2] | 0x020000;
2080 break;
2081
2082 case 11:
2083 w0[2] = w0[2] | 0x02000000;
2084 break;
2085
2086 case 12:
2087 w0[3] = 0x02;
2088 break;
2089
2090 case 13:
2091 w0[3] = w0[3] | 0x0200;
2092 break;
2093
2094 case 14:
2095 w0[3] = w0[3] | 0x020000;
2096 break;
2097
2098 case 15:
2099 w0[3] = w0[3] | 0x02000000;
2100 break;
2101
2102 case 16:
2103 w1[0] = 0x02;
2104 break;
2105
2106 case 17:
2107 w1[0] = w1[0] | 0x0200;
2108 break;
2109
2110 case 18:
2111 w1[0] = w1[0] | 0x020000;
2112 break;
2113
2114 case 19:
2115 w1[0] = w1[0] | 0x02000000;
2116 break;
2117
2118 case 20:
2119 w1[1] = 0x02;
2120 break;
2121
2122 case 21:
2123 w1[1] = w1[1] | 0x0200;
2124 break;
2125
2126 case 22:
2127 w1[1] = w1[1] | 0x020000;
2128 break;
2129
2130 case 23:
2131 w1[1] = w1[1] | 0x02000000;
2132 break;
2133
2134 case 24:
2135 w1[2] = 0x02;
2136 break;
2137
2138 case 25:
2139 w1[2] = w1[2] | 0x0200;
2140 break;
2141
2142 case 26:
2143 w1[2] = w1[2] | 0x020000;
2144 break;
2145
2146 case 27:
2147 w1[2] = w1[2] | 0x02000000;
2148 break;
2149
2150 case 28:
2151 w1[3] = 0x02;
2152 break;
2153
2154 case 29:
2155 w1[3] = w1[3] | 0x0200;
2156 break;
2157
2158 case 30:
2159 w1[3] = w1[3] | 0x020000;
2160 break;
2161
2162 case 31:
2163 w1[3] = w1[3] | 0x02000000;
2164 break;
2165
2166 case 32:
2167 w2[0] = 0x02;
2168 break;
2169
2170 case 33:
2171 w2[0] = w2[0] | 0x0200;
2172 break;
2173
2174 case 34:
2175 w2[0] = w2[0] | 0x020000;
2176 break;
2177
2178 case 35:
2179 w2[0] = w2[0] | 0x02000000;
2180 break;
2181
2182 case 36:
2183 w2[1] = 0x02;
2184 break;
2185
2186 case 37:
2187 w2[1] = w2[1] | 0x0200;
2188 break;
2189
2190 case 38:
2191 w2[1] = w2[1] | 0x020000;
2192 break;
2193
2194 case 39:
2195 w2[1] = w2[1] | 0x02000000;
2196 break;
2197
2198 case 40:
2199 w2[2] = 0x02;
2200 break;
2201
2202 case 41:
2203 w2[2] = w2[2] | 0x0200;
2204 break;
2205
2206 case 42:
2207 w2[2] = w2[2] | 0x020000;
2208 break;
2209
2210 case 43:
2211 w2[2] = w2[2] | 0x02000000;
2212 break;
2213
2214 case 44:
2215 w2[3] = 0x02;
2216 break;
2217
2218 case 45:
2219 w2[3] = w2[3] | 0x0200;
2220 break;
2221
2222 case 46:
2223 w2[3] = w2[3] | 0x020000;
2224 break;
2225
2226 case 47:
2227 w2[3] = w2[3] | 0x02000000;
2228 break;
2229
2230 case 48:
2231 w3[0] = 0x02;
2232 break;
2233
2234 case 49:
2235 w3[0] = w3[0] | 0x0200;
2236 break;
2237
2238 case 50:
2239 w3[0] = w3[0] | 0x020000;
2240 break;
2241
2242 case 51:
2243 w3[0] = w3[0] | 0x02000000;
2244 break;
2245
2246 case 52:
2247 w3[1] = 0x02;
2248 break;
2249
2250 case 53:
2251 w3[1] = w3[1] | 0x0200;
2252 break;
2253
2254 case 54:
2255 w3[1] = w3[1] | 0x020000;
2256 break;
2257
2258 case 55:
2259 w3[1] = w3[1] | 0x02000000;
2260 break;
2261
2262 case 56:
2263 w3[2] = 0x02;
2264 break;
2265
2266 case 57:
2267 w3[2] = w3[2] | 0x0200;
2268 break;
2269
2270 case 58:
2271 w3[2] = w3[2] | 0x020000;
2272 break;
2273
2274 case 59:
2275 w3[2] = w3[2] | 0x02000000;
2276 break;
2277
2278 case 60:
2279 w3[3] = 0x02;
2280 break;
2281
2282 case 61:
2283 w3[3] = w3[3] | 0x0200;
2284 break;
2285
2286 case 62:
2287 w3[3] = w3[3] | 0x020000;
2288 break;
2289
2290 case 63:
2291 w3[3] = w3[3] | 0x02000000;
2292 break;
2293
2294 case 64:
2295 w4[0] = 0x02;
2296 break;
2297
2298 case 65:
2299 w4[0] = w4[0] | 0x0200;
2300 break;
2301
2302 case 66:
2303 w4[0] = w4[0] | 0x020000;
2304 break;
2305
2306 case 67:
2307 w4[0] = w4[0] | 0x02000000;
2308 break;
2309
2310 case 68:
2311 w4[1] = 0x02;
2312 break;
2313
2314 case 69:
2315 w4[1] = w4[1] | 0x0200;
2316 break;
2317
2318 case 70:
2319 w4[1] = w4[1] | 0x020000;
2320 break;
2321
2322 case 71:
2323 w4[1] = w4[1] | 0x02000000;
2324 break;
2325
2326 case 72:
2327 w4[2] = 0x02;
2328 break;
2329
2330 case 73:
2331 w4[2] = w4[2] | 0x0200;
2332 break;
2333
2334 case 74:
2335 w4[2] = w4[2] | 0x020000;
2336 break;
2337
2338 case 75:
2339 w4[2] = w4[2] | 0x02000000;
2340 break;
2341
2342 case 76:
2343 w4[3] = 0x02;
2344 break;
2345
2346 case 77:
2347 w4[3] = w4[3] | 0x0200;
2348 break;
2349
2350 case 78:
2351 w4[3] = w4[3] | 0x020000;
2352 break;
2353
2354 case 79:
2355 w4[3] = w4[3] | 0x02000000;
2356 break;
2357
2358 case 80:
2359 w5[0] = 0x02;
2360 break;
2361
2362 case 81:
2363 w5[0] = w5[0] | 0x0200;
2364 break;
2365
2366 case 82:
2367 w5[0] = w5[0] | 0x020000;
2368 break;
2369
2370 case 83:
2371 w5[0] = w5[0] | 0x02000000;
2372 break;
2373
2374 case 84:
2375 w5[1] = 0x02;
2376 break;
2377
2378 case 85:
2379 w5[1] = w5[1] | 0x0200;
2380 break;
2381
2382 case 86:
2383 w5[1] = w5[1] | 0x020000;
2384 break;
2385
2386 case 87:
2387 w5[1] = w5[1] | 0x02000000;
2388 break;
2389
2390 case 88:
2391 w5[2] = 0x02;
2392 break;
2393
2394 case 89:
2395 w5[2] = w5[2] | 0x0200;
2396 break;
2397
2398 case 90:
2399 w5[2] = w5[2] | 0x020000;
2400 break;
2401
2402 case 91:
2403 w5[2] = w5[2] | 0x02000000;
2404 break;
2405
2406 case 92:
2407 w5[3] = 0x02;
2408 break;
2409
2410 case 93:
2411 w5[3] = w5[3] | 0x0200;
2412 break;
2413
2414 case 94:
2415 w5[3] = w5[3] | 0x020000;
2416 break;
2417
2418 case 95:
2419 w5[3] = w5[3] | 0x02000000;
2420 break;
2421
2422 case 96:
2423 w6[0] = 0x02;
2424 break;
2425
2426 case 97:
2427 w6[0] = w6[0] | 0x0200;
2428 break;
2429
2430 case 98:
2431 w6[0] = w6[0] | 0x020000;
2432 break;
2433
2434 case 99:
2435 w6[0] = w6[0] | 0x02000000;
2436 break;
2437
2438 case 100:
2439 w6[1] = 0x02;
2440 break;
2441
2442 case 101:
2443 w6[1] = w6[1] | 0x0200;
2444 break;
2445
2446 case 102:
2447 w6[1] = w6[1] | 0x020000;
2448 break;
2449
2450 case 103:
2451 w6[1] = w6[1] | 0x02000000;
2452 break;
2453
2454 case 104:
2455 w6[2] = 0x02;
2456 break;
2457
2458 case 105:
2459 w6[2] = w6[2] | 0x0200;
2460 break;
2461
2462 case 106:
2463 w6[2] = w6[2] | 0x020000;
2464 break;
2465
2466 case 107:
2467 w6[2] = w6[2] | 0x02000000;
2468 break;
2469
2470 case 108:
2471 w6[3] = 0x02;
2472 break;
2473
2474 case 109:
2475 w6[3] = w6[3] | 0x0200;
2476 break;
2477
2478 case 110:
2479 w6[3] = w6[3] | 0x020000;
2480 break;
2481
2482 case 111:
2483 w6[3] = w6[3] | 0x02000000;
2484 break;
2485
2486 case 112:
2487 w7[0] = 0x02;
2488 break;
2489
2490 case 113:
2491 w7[0] = w7[0] | 0x0200;
2492 break;
2493
2494 case 114:
2495 w7[0] = w7[0] | 0x020000;
2496 break;
2497
2498 case 115:
2499 w7[0] = w7[0] | 0x02000000;
2500 break;
2501
2502 case 116:
2503 w7[1] = 0x02;
2504 break;
2505
2506 case 117:
2507 w7[1] = w7[1] | 0x0200;
2508 break;
2509
2510 case 118:
2511 w7[1] = w7[1] | 0x020000;
2512 break;
2513
2514 case 119:
2515 w7[1] = w7[1] | 0x02000000;
2516 break;
2517
2518 case 120:
2519 w7[2] = 0x02;
2520 break;
2521
2522 case 121:
2523 w7[2] = w7[2] | 0x0200;
2524 break;
2525
2526 case 122:
2527 w7[2] = w7[2] | 0x020000;
2528 break;
2529
2530 case 123:
2531 w7[2] = w7[2] | 0x02000000;
2532 break;
2533
2534 case 124:
2535 w7[3] = 0x02;
2536 break;
2537
2538 case 125:
2539 w7[3] = w7[3] | 0x0200;
2540 break;
2541
2542 case 126:
2543 w7[3] = w7[3] | 0x020000;
2544 break;
2545
2546 case 127:
2547 w7[3] = w7[3] | 0x02000000;
2548 break;
2549 }
2550 }
2551
2552 inline void append_0x80_1x4 (u32x w0[4], const u32 offset)
2553 {
2554 switch (offset)
2555 {
2556 case 0:
2557 w0[0] = 0x80;
2558 break;
2559
2560 case 1:
2561 w0[0] = w0[0] | 0x8000;
2562 break;
2563
2564 case 2:
2565 w0[0] = w0[0] | 0x800000;
2566 break;
2567
2568 case 3:
2569 w0[0] = w0[0] | 0x80000000;
2570 break;
2571
2572 case 4:
2573 w0[1] = 0x80;
2574 break;
2575
2576 case 5:
2577 w0[1] = w0[1] | 0x8000;
2578 break;
2579
2580 case 6:
2581 w0[1] = w0[1] | 0x800000;
2582 break;
2583
2584 case 7:
2585 w0[1] = w0[1] | 0x80000000;
2586 break;
2587
2588 case 8:
2589 w0[2] = 0x80;
2590 break;
2591
2592 case 9:
2593 w0[2] = w0[2] | 0x8000;
2594 break;
2595
2596 case 10:
2597 w0[2] = w0[2] | 0x800000;
2598 break;
2599
2600 case 11:
2601 w0[2] = w0[2] | 0x80000000;
2602 break;
2603
2604 case 12:
2605 w0[3] = 0x80;
2606 break;
2607
2608 case 13:
2609 w0[3] = w0[3] | 0x8000;
2610 break;
2611
2612 case 14:
2613 w0[3] = w0[3] | 0x800000;
2614 break;
2615
2616 case 15:
2617 w0[3] = w0[3] | 0x80000000;
2618 break;
2619 }
2620 }
2621
2622 inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
2623 {
2624 switch (offset)
2625 {
2626 case 0:
2627 w0[0] = 0x80;
2628 break;
2629
2630 case 1:
2631 w0[0] = w0[0] | 0x8000;
2632 break;
2633
2634 case 2:
2635 w0[0] = w0[0] | 0x800000;
2636 break;
2637
2638 case 3:
2639 w0[0] = w0[0] | 0x80000000;
2640 break;
2641
2642 case 4:
2643 w0[1] = 0x80;
2644 break;
2645
2646 case 5:
2647 w0[1] = w0[1] | 0x8000;
2648 break;
2649
2650 case 6:
2651 w0[1] = w0[1] | 0x800000;
2652 break;
2653
2654 case 7:
2655 w0[1] = w0[1] | 0x80000000;
2656 break;
2657
2658 case 8:
2659 w0[2] = 0x80;
2660 break;
2661
2662 case 9:
2663 w0[2] = w0[2] | 0x8000;
2664 break;
2665
2666 case 10:
2667 w0[2] = w0[2] | 0x800000;
2668 break;
2669
2670 case 11:
2671 w0[2] = w0[2] | 0x80000000;
2672 break;
2673
2674 case 12:
2675 w0[3] = 0x80;
2676 break;
2677
2678 case 13:
2679 w0[3] = w0[3] | 0x8000;
2680 break;
2681
2682 case 14:
2683 w0[3] = w0[3] | 0x800000;
2684 break;
2685
2686 case 15:
2687 w0[3] = w0[3] | 0x80000000;
2688 break;
2689
2690 case 16:
2691 w1[0] = 0x80;
2692 break;
2693
2694 case 17:
2695 w1[0] = w1[0] | 0x8000;
2696 break;
2697
2698 case 18:
2699 w1[0] = w1[0] | 0x800000;
2700 break;
2701
2702 case 19:
2703 w1[0] = w1[0] | 0x80000000;
2704 break;
2705
2706 case 20:
2707 w1[1] = 0x80;
2708 break;
2709
2710 case 21:
2711 w1[1] = w1[1] | 0x8000;
2712 break;
2713
2714 case 22:
2715 w1[1] = w1[1] | 0x800000;
2716 break;
2717
2718 case 23:
2719 w1[1] = w1[1] | 0x80000000;
2720 break;
2721
2722 case 24:
2723 w1[2] = 0x80;
2724 break;
2725
2726 case 25:
2727 w1[2] = w1[2] | 0x8000;
2728 break;
2729
2730 case 26:
2731 w1[2] = w1[2] | 0x800000;
2732 break;
2733
2734 case 27:
2735 w1[2] = w1[2] | 0x80000000;
2736 break;
2737
2738 case 28:
2739 w1[3] = 0x80;
2740 break;
2741
2742 case 29:
2743 w1[3] = w1[3] | 0x8000;
2744 break;
2745
2746 case 30:
2747 w1[3] = w1[3] | 0x800000;
2748 break;
2749
2750 case 31:
2751 w1[3] = w1[3] | 0x80000000;
2752 break;
2753 }
2754 }
2755
2756 inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
2757 {
2758 switch (offset)
2759 {
2760 case 0:
2761 w0[0] = 0x80;
2762 break;
2763
2764 case 1:
2765 w0[0] = w0[0] | 0x8000;
2766 break;
2767
2768 case 2:
2769 w0[0] = w0[0] | 0x800000;
2770 break;
2771
2772 case 3:
2773 w0[0] = w0[0] | 0x80000000;
2774 break;
2775
2776 case 4:
2777 w0[1] = 0x80;
2778 break;
2779
2780 case 5:
2781 w0[1] = w0[1] | 0x8000;
2782 break;
2783
2784 case 6:
2785 w0[1] = w0[1] | 0x800000;
2786 break;
2787
2788 case 7:
2789 w0[1] = w0[1] | 0x80000000;
2790 break;
2791
2792 case 8:
2793 w0[2] = 0x80;
2794 break;
2795
2796 case 9:
2797 w0[2] = w0[2] | 0x8000;
2798 break;
2799
2800 case 10:
2801 w0[2] = w0[2] | 0x800000;
2802 break;
2803
2804 case 11:
2805 w0[2] = w0[2] | 0x80000000;
2806 break;
2807
2808 case 12:
2809 w0[3] = 0x80;
2810 break;
2811
2812 case 13:
2813 w0[3] = w0[3] | 0x8000;
2814 break;
2815
2816 case 14:
2817 w0[3] = w0[3] | 0x800000;
2818 break;
2819
2820 case 15:
2821 w0[3] = w0[3] | 0x80000000;
2822 break;
2823
2824 case 16:
2825 w1[0] = 0x80;
2826 break;
2827
2828 case 17:
2829 w1[0] = w1[0] | 0x8000;
2830 break;
2831
2832 case 18:
2833 w1[0] = w1[0] | 0x800000;
2834 break;
2835
2836 case 19:
2837 w1[0] = w1[0] | 0x80000000;
2838 break;
2839
2840 case 20:
2841 w1[1] = 0x80;
2842 break;
2843
2844 case 21:
2845 w1[1] = w1[1] | 0x8000;
2846 break;
2847
2848 case 22:
2849 w1[1] = w1[1] | 0x800000;
2850 break;
2851
2852 case 23:
2853 w1[1] = w1[1] | 0x80000000;
2854 break;
2855
2856 case 24:
2857 w1[2] = 0x80;
2858 break;
2859
2860 case 25:
2861 w1[2] = w1[2] | 0x8000;
2862 break;
2863
2864 case 26:
2865 w1[2] = w1[2] | 0x800000;
2866 break;
2867
2868 case 27:
2869 w1[2] = w1[2] | 0x80000000;
2870 break;
2871
2872 case 28:
2873 w1[3] = 0x80;
2874 break;
2875
2876 case 29:
2877 w1[3] = w1[3] | 0x8000;
2878 break;
2879
2880 case 30:
2881 w1[3] = w1[3] | 0x800000;
2882 break;
2883
2884 case 31:
2885 w1[3] = w1[3] | 0x80000000;
2886 break;
2887
2888 case 32:
2889 w2[0] = 0x80;
2890 break;
2891
2892 case 33:
2893 w2[0] = w2[0] | 0x8000;
2894 break;
2895
2896 case 34:
2897 w2[0] = w2[0] | 0x800000;
2898 break;
2899
2900 case 35:
2901 w2[0] = w2[0] | 0x80000000;
2902 break;
2903
2904 case 36:
2905 w2[1] = 0x80;
2906 break;
2907
2908 case 37:
2909 w2[1] = w2[1] | 0x8000;
2910 break;
2911
2912 case 38:
2913 w2[1] = w2[1] | 0x800000;
2914 break;
2915
2916 case 39:
2917 w2[1] = w2[1] | 0x80000000;
2918 break;
2919
2920 case 40:
2921 w2[2] = 0x80;
2922 break;
2923
2924 case 41:
2925 w2[2] = w2[2] | 0x8000;
2926 break;
2927
2928 case 42:
2929 w2[2] = w2[2] | 0x800000;
2930 break;
2931
2932 case 43:
2933 w2[2] = w2[2] | 0x80000000;
2934 break;
2935
2936 case 44:
2937 w2[3] = 0x80;
2938 break;
2939
2940 case 45:
2941 w2[3] = w2[3] | 0x8000;
2942 break;
2943
2944 case 46:
2945 w2[3] = w2[3] | 0x800000;
2946 break;
2947
2948 case 47:
2949 w2[3] = w2[3] | 0x80000000;
2950 break;
2951 }
2952 }
2953
2954 inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
2955 {
2956 switch (offset)
2957 {
2958 case 0:
2959 w0[0] = 0x80;
2960 break;
2961
2962 case 1:
2963 w0[0] = w0[0] | 0x8000;
2964 break;
2965
2966 case 2:
2967 w0[0] = w0[0] | 0x800000;
2968 break;
2969
2970 case 3:
2971 w0[0] = w0[0] | 0x80000000;
2972 break;
2973
2974 case 4:
2975 w0[1] = 0x80;
2976 break;
2977
2978 case 5:
2979 w0[1] = w0[1] | 0x8000;
2980 break;
2981
2982 case 6:
2983 w0[1] = w0[1] | 0x800000;
2984 break;
2985
2986 case 7:
2987 w0[1] = w0[1] | 0x80000000;
2988 break;
2989
2990 case 8:
2991 w0[2] = 0x80;
2992 break;
2993
2994 case 9:
2995 w0[2] = w0[2] | 0x8000;
2996 break;
2997
2998 case 10:
2999 w0[2] = w0[2] | 0x800000;
3000 break;
3001
3002 case 11:
3003 w0[2] = w0[2] | 0x80000000;
3004 break;
3005
3006 case 12:
3007 w0[3] = 0x80;
3008 break;
3009
3010 case 13:
3011 w0[3] = w0[3] | 0x8000;
3012 break;
3013
3014 case 14:
3015 w0[3] = w0[3] | 0x800000;
3016 break;
3017
3018 case 15:
3019 w0[3] = w0[3] | 0x80000000;
3020 break;
3021
3022 case 16:
3023 w1[0] = 0x80;
3024 break;
3025
3026 case 17:
3027 w1[0] = w1[0] | 0x8000;
3028 break;
3029
3030 case 18:
3031 w1[0] = w1[0] | 0x800000;
3032 break;
3033
3034 case 19:
3035 w1[0] = w1[0] | 0x80000000;
3036 break;
3037
3038 case 20:
3039 w1[1] = 0x80;
3040 break;
3041
3042 case 21:
3043 w1[1] = w1[1] | 0x8000;
3044 break;
3045
3046 case 22:
3047 w1[1] = w1[1] | 0x800000;
3048 break;
3049
3050 case 23:
3051 w1[1] = w1[1] | 0x80000000;
3052 break;
3053
3054 case 24:
3055 w1[2] = 0x80;
3056 break;
3057
3058 case 25:
3059 w1[2] = w1[2] | 0x8000;
3060 break;
3061
3062 case 26:
3063 w1[2] = w1[2] | 0x800000;
3064 break;
3065
3066 case 27:
3067 w1[2] = w1[2] | 0x80000000;
3068 break;
3069
3070 case 28:
3071 w1[3] = 0x80;
3072 break;
3073
3074 case 29:
3075 w1[3] = w1[3] | 0x8000;
3076 break;
3077
3078 case 30:
3079 w1[3] = w1[3] | 0x800000;
3080 break;
3081
3082 case 31:
3083 w1[3] = w1[3] | 0x80000000;
3084 break;
3085
3086 case 32:
3087 w2[0] = 0x80;
3088 break;
3089
3090 case 33:
3091 w2[0] = w2[0] | 0x8000;
3092 break;
3093
3094 case 34:
3095 w2[0] = w2[0] | 0x800000;
3096 break;
3097
3098 case 35:
3099 w2[0] = w2[0] | 0x80000000;
3100 break;
3101
3102 case 36:
3103 w2[1] = 0x80;
3104 break;
3105
3106 case 37:
3107 w2[1] = w2[1] | 0x8000;
3108 break;
3109
3110 case 38:
3111 w2[1] = w2[1] | 0x800000;
3112 break;
3113
3114 case 39:
3115 w2[1] = w2[1] | 0x80000000;
3116 break;
3117
3118 case 40:
3119 w2[2] = 0x80;
3120 break;
3121
3122 case 41:
3123 w2[2] = w2[2] | 0x8000;
3124 break;
3125
3126 case 42:
3127 w2[2] = w2[2] | 0x800000;
3128 break;
3129
3130 case 43:
3131 w2[2] = w2[2] | 0x80000000;
3132 break;
3133
3134 case 44:
3135 w2[3] = 0x80;
3136 break;
3137
3138 case 45:
3139 w2[3] = w2[3] | 0x8000;
3140 break;
3141
3142 case 46:
3143 w2[3] = w2[3] | 0x800000;
3144 break;
3145
3146 case 47:
3147 w2[3] = w2[3] | 0x80000000;
3148 break;
3149
3150 case 48:
3151 w3[0] = 0x80;
3152 break;
3153
3154 case 49:
3155 w3[0] = w3[0] | 0x8000;
3156 break;
3157
3158 case 50:
3159 w3[0] = w3[0] | 0x800000;
3160 break;
3161
3162 case 51:
3163 w3[0] = w3[0] | 0x80000000;
3164 break;
3165
3166 case 52:
3167 w3[1] = 0x80;
3168 break;
3169
3170 case 53:
3171 w3[1] = w3[1] | 0x8000;
3172 break;
3173
3174 case 54:
3175 w3[1] = w3[1] | 0x800000;
3176 break;
3177
3178 case 55:
3179 w3[1] = w3[1] | 0x80000000;
3180 break;
3181
3182 case 56:
3183 w3[2] = 0x80;
3184 break;
3185
3186 case 57:
3187 w3[2] = w3[2] | 0x8000;
3188 break;
3189
3190 case 58:
3191 w3[2] = w3[2] | 0x800000;
3192 break;
3193
3194 case 59:
3195 w3[2] = w3[2] | 0x80000000;
3196 break;
3197
3198 case 60:
3199 w3[3] = 0x80;
3200 break;
3201
3202 case 61:
3203 w3[3] = w3[3] | 0x8000;
3204 break;
3205
3206 case 62:
3207 w3[3] = w3[3] | 0x800000;
3208 break;
3209
3210 case 63:
3211 w3[3] = w3[3] | 0x80000000;
3212 break;
3213 }
3214 }
3215
3216 inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
3217 {
3218 switch (offset)
3219 {
3220 case 0:
3221 w0[0] = 0x80;
3222 break;
3223
3224 case 1:
3225 w0[0] = w0[0] | 0x8000;
3226 break;
3227
3228 case 2:
3229 w0[0] = w0[0] | 0x800000;
3230 break;
3231
3232 case 3:
3233 w0[0] = w0[0] | 0x80000000;
3234 break;
3235
3236 case 4:
3237 w0[1] = 0x80;
3238 break;
3239
3240 case 5:
3241 w0[1] = w0[1] | 0x8000;
3242 break;
3243
3244 case 6:
3245 w0[1] = w0[1] | 0x800000;
3246 break;
3247
3248 case 7:
3249 w0[1] = w0[1] | 0x80000000;
3250 break;
3251
3252 case 8:
3253 w0[2] = 0x80;
3254 break;
3255
3256 case 9:
3257 w0[2] = w0[2] | 0x8000;
3258 break;
3259
3260 case 10:
3261 w0[2] = w0[2] | 0x800000;
3262 break;
3263
3264 case 11:
3265 w0[2] = w0[2] | 0x80000000;
3266 break;
3267
3268 case 12:
3269 w0[3] = 0x80;
3270 break;
3271
3272 case 13:
3273 w0[3] = w0[3] | 0x8000;
3274 break;
3275
3276 case 14:
3277 w0[3] = w0[3] | 0x800000;
3278 break;
3279
3280 case 15:
3281 w0[3] = w0[3] | 0x80000000;
3282 break;
3283
3284 case 16:
3285 w1[0] = 0x80;
3286 break;
3287
3288 case 17:
3289 w1[0] = w1[0] | 0x8000;
3290 break;
3291
3292 case 18:
3293 w1[0] = w1[0] | 0x800000;
3294 break;
3295
3296 case 19:
3297 w1[0] = w1[0] | 0x80000000;
3298 break;
3299
3300 case 20:
3301 w1[1] = 0x80;
3302 break;
3303
3304 case 21:
3305 w1[1] = w1[1] | 0x8000;
3306 break;
3307
3308 case 22:
3309 w1[1] = w1[1] | 0x800000;
3310 break;
3311
3312 case 23:
3313 w1[1] = w1[1] | 0x80000000;
3314 break;
3315
3316 case 24:
3317 w1[2] = 0x80;
3318 break;
3319
3320 case 25:
3321 w1[2] = w1[2] | 0x8000;
3322 break;
3323
3324 case 26:
3325 w1[2] = w1[2] | 0x800000;
3326 break;
3327
3328 case 27:
3329 w1[2] = w1[2] | 0x80000000;
3330 break;
3331
3332 case 28:
3333 w1[3] = 0x80;
3334 break;
3335
3336 case 29:
3337 w1[3] = w1[3] | 0x8000;
3338 break;
3339
3340 case 30:
3341 w1[3] = w1[3] | 0x800000;
3342 break;
3343
3344 case 31:
3345 w1[3] = w1[3] | 0x80000000;
3346 break;
3347
3348 case 32:
3349 w2[0] = 0x80;
3350 break;
3351
3352 case 33:
3353 w2[0] = w2[0] | 0x8000;
3354 break;
3355
3356 case 34:
3357 w2[0] = w2[0] | 0x800000;
3358 break;
3359
3360 case 35:
3361 w2[0] = w2[0] | 0x80000000;
3362 break;
3363
3364 case 36:
3365 w2[1] = 0x80;
3366 break;
3367
3368 case 37:
3369 w2[1] = w2[1] | 0x8000;
3370 break;
3371
3372 case 38:
3373 w2[1] = w2[1] | 0x800000;
3374 break;
3375
3376 case 39:
3377 w2[1] = w2[1] | 0x80000000;
3378 break;
3379
3380 case 40:
3381 w2[2] = 0x80;
3382 break;
3383
3384 case 41:
3385 w2[2] = w2[2] | 0x8000;
3386 break;
3387
3388 case 42:
3389 w2[2] = w2[2] | 0x800000;
3390 break;
3391
3392 case 43:
3393 w2[2] = w2[2] | 0x80000000;
3394 break;
3395
3396 case 44:
3397 w2[3] = 0x80;
3398 break;
3399
3400 case 45:
3401 w2[3] = w2[3] | 0x8000;
3402 break;
3403
3404 case 46:
3405 w2[3] = w2[3] | 0x800000;
3406 break;
3407
3408 case 47:
3409 w2[3] = w2[3] | 0x80000000;
3410 break;
3411
3412 case 48:
3413 w3[0] = 0x80;
3414 break;
3415
3416 case 49:
3417 w3[0] = w3[0] | 0x8000;
3418 break;
3419
3420 case 50:
3421 w3[0] = w3[0] | 0x800000;
3422 break;
3423
3424 case 51:
3425 w3[0] = w3[0] | 0x80000000;
3426 break;
3427
3428 case 52:
3429 w3[1] = 0x80;
3430 break;
3431
3432 case 53:
3433 w3[1] = w3[1] | 0x8000;
3434 break;
3435
3436 case 54:
3437 w3[1] = w3[1] | 0x800000;
3438 break;
3439
3440 case 55:
3441 w3[1] = w3[1] | 0x80000000;
3442 break;
3443
3444 case 56:
3445 w3[2] = 0x80;
3446 break;
3447
3448 case 57:
3449 w3[2] = w3[2] | 0x8000;
3450 break;
3451
3452 case 58:
3453 w3[2] = w3[2] | 0x800000;
3454 break;
3455
3456 case 59:
3457 w3[2] = w3[2] | 0x80000000;
3458 break;
3459
3460 case 60:
3461 w3[3] = 0x80;
3462 break;
3463
3464 case 61:
3465 w3[3] = w3[3] | 0x8000;
3466 break;
3467
3468 case 62:
3469 w3[3] = w3[3] | 0x800000;
3470 break;
3471
3472 case 63:
3473 w3[3] = w3[3] | 0x80000000;
3474 break;
3475
3476 case 64:
3477 w4[0] = 0x80;
3478 break;
3479
3480 case 65:
3481 w4[0] = w4[0] | 0x8000;
3482 break;
3483
3484 case 66:
3485 w4[0] = w4[0] | 0x800000;
3486 break;
3487
3488 case 67:
3489 w4[0] = w4[0] | 0x80000000;
3490 break;
3491
3492 case 68:
3493 w4[1] = 0x80;
3494 break;
3495
3496 case 69:
3497 w4[1] = w4[1] | 0x8000;
3498 break;
3499
3500 case 70:
3501 w4[1] = w4[1] | 0x800000;
3502 break;
3503
3504 case 71:
3505 w4[1] = w4[1] | 0x80000000;
3506 break;
3507
3508 case 72:
3509 w4[2] = 0x80;
3510 break;
3511
3512 case 73:
3513 w4[2] = w4[2] | 0x8000;
3514 break;
3515
3516 case 74:
3517 w4[2] = w4[2] | 0x800000;
3518 break;
3519
3520 case 75:
3521 w4[2] = w4[2] | 0x80000000;
3522 break;
3523
3524 case 76:
3525 w4[3] = 0x80;
3526 break;
3527
3528 case 77:
3529 w4[3] = w4[3] | 0x8000;
3530 break;
3531
3532 case 78:
3533 w4[3] = w4[3] | 0x800000;
3534 break;
3535
3536 case 79:
3537 w4[3] = w4[3] | 0x80000000;
3538 break;
3539
3540 case 80:
3541 w5[0] = 0x80;
3542 break;
3543
3544 case 81:
3545 w5[0] = w5[0] | 0x8000;
3546 break;
3547
3548 case 82:
3549 w5[0] = w5[0] | 0x800000;
3550 break;
3551
3552 case 83:
3553 w5[0] = w5[0] | 0x80000000;
3554 break;
3555
3556 case 84:
3557 w5[1] = 0x80;
3558 break;
3559
3560 case 85:
3561 w5[1] = w5[1] | 0x8000;
3562 break;
3563
3564 case 86:
3565 w5[1] = w5[1] | 0x800000;
3566 break;
3567
3568 case 87:
3569 w5[1] = w5[1] | 0x80000000;
3570 break;
3571
3572 case 88:
3573 w5[2] = 0x80;
3574 break;
3575
3576 case 89:
3577 w5[2] = w5[2] | 0x8000;
3578 break;
3579
3580 case 90:
3581 w5[2] = w5[2] | 0x800000;
3582 break;
3583
3584 case 91:
3585 w5[2] = w5[2] | 0x80000000;
3586 break;
3587
3588 case 92:
3589 w5[3] = 0x80;
3590 break;
3591
3592 case 93:
3593 w5[3] = w5[3] | 0x8000;
3594 break;
3595
3596 case 94:
3597 w5[3] = w5[3] | 0x800000;
3598 break;
3599
3600 case 95:
3601 w5[3] = w5[3] | 0x80000000;
3602 break;
3603
3604 case 96:
3605 w6[0] = 0x80;
3606 break;
3607
3608 case 97:
3609 w6[0] = w6[0] | 0x8000;
3610 break;
3611
3612 case 98:
3613 w6[0] = w6[0] | 0x800000;
3614 break;
3615
3616 case 99:
3617 w6[0] = w6[0] | 0x80000000;
3618 break;
3619
3620 case 100:
3621 w6[1] = 0x80;
3622 break;
3623
3624 case 101:
3625 w6[1] = w6[1] | 0x8000;
3626 break;
3627
3628 case 102:
3629 w6[1] = w6[1] | 0x800000;
3630 break;
3631
3632 case 103:
3633 w6[1] = w6[1] | 0x80000000;
3634 break;
3635
3636 case 104:
3637 w6[2] = 0x80;
3638 break;
3639
3640 case 105:
3641 w6[2] = w6[2] | 0x8000;
3642 break;
3643
3644 case 106:
3645 w6[2] = w6[2] | 0x800000;
3646 break;
3647
3648 case 107:
3649 w6[2] = w6[2] | 0x80000000;
3650 break;
3651
3652 case 108:
3653 w6[3] = 0x80;
3654 break;
3655
3656 case 109:
3657 w6[3] = w6[3] | 0x8000;
3658 break;
3659
3660 case 110:
3661 w6[3] = w6[3] | 0x800000;
3662 break;
3663
3664 case 111:
3665 w6[3] = w6[3] | 0x80000000;
3666 break;
3667
3668 case 112:
3669 w7[0] = 0x80;
3670 break;
3671
3672 case 113:
3673 w7[0] = w7[0] | 0x8000;
3674 break;
3675
3676 case 114:
3677 w7[0] = w7[0] | 0x800000;
3678 break;
3679
3680 case 115:
3681 w7[0] = w7[0] | 0x80000000;
3682 break;
3683
3684 case 116:
3685 w7[1] = 0x80;
3686 break;
3687
3688 case 117:
3689 w7[1] = w7[1] | 0x8000;
3690 break;
3691
3692 case 118:
3693 w7[1] = w7[1] | 0x800000;
3694 break;
3695
3696 case 119:
3697 w7[1] = w7[1] | 0x80000000;
3698 break;
3699
3700 case 120:
3701 w7[2] = 0x80;
3702 break;
3703
3704 case 121:
3705 w7[2] = w7[2] | 0x8000;
3706 break;
3707
3708 case 122:
3709 w7[2] = w7[2] | 0x800000;
3710 break;
3711
3712 case 123:
3713 w7[2] = w7[2] | 0x80000000;
3714 break;
3715
3716 case 124:
3717 w7[3] = 0x80;
3718 break;
3719
3720 case 125:
3721 w7[3] = w7[3] | 0x8000;
3722 break;
3723
3724 case 126:
3725 w7[3] = w7[3] | 0x800000;
3726 break;
3727
3728 case 127:
3729 w7[3] = w7[3] | 0x80000000;
3730 break;
3731 }
3732 }
3733
3734 inline void append_0x80_1x16 (u32x w[16], const u32 offset)
3735 {
3736 switch (offset)
3737 {
3738 case 0:
3739 w[ 0] = 0x80;
3740 break;
3741
3742 case 1:
3743 w[ 0] = w[ 0] | 0x8000;
3744 break;
3745
3746 case 2:
3747 w[ 0] = w[ 0] | 0x800000;
3748 break;
3749
3750 case 3:
3751 w[ 0] = w[ 0] | 0x80000000;
3752 break;
3753
3754 case 4:
3755 w[ 1] = 0x80;
3756 break;
3757
3758 case 5:
3759 w[ 1] = w[ 1] | 0x8000;
3760 break;
3761
3762 case 6:
3763 w[ 1] = w[ 1] | 0x800000;
3764 break;
3765
3766 case 7:
3767 w[ 1] = w[ 1] | 0x80000000;
3768 break;
3769
3770 case 8:
3771 w[ 2] = 0x80;
3772 break;
3773
3774 case 9:
3775 w[ 2] = w[ 2] | 0x8000;
3776 break;
3777
3778 case 10:
3779 w[ 2] = w[ 2] | 0x800000;
3780 break;
3781
3782 case 11:
3783 w[ 2] = w[ 2] | 0x80000000;
3784 break;
3785
3786 case 12:
3787 w[ 3] = 0x80;
3788 break;
3789
3790 case 13:
3791 w[ 3] = w[ 3] | 0x8000;
3792 break;
3793
3794 case 14:
3795 w[ 3] = w[ 3] | 0x800000;
3796 break;
3797
3798 case 15:
3799 w[ 3] = w[ 3] | 0x80000000;
3800 break;
3801
3802 case 16:
3803 w[ 4] = 0x80;
3804 break;
3805
3806 case 17:
3807 w[ 4] = w[ 4] | 0x8000;
3808 break;
3809
3810 case 18:
3811 w[ 4] = w[ 4] | 0x800000;
3812 break;
3813
3814 case 19:
3815 w[ 4] = w[ 4] | 0x80000000;
3816 break;
3817
3818 case 20:
3819 w[ 5] = 0x80;
3820 break;
3821
3822 case 21:
3823 w[ 5] = w[ 5] | 0x8000;
3824 break;
3825
3826 case 22:
3827 w[ 5] = w[ 5] | 0x800000;
3828 break;
3829
3830 case 23:
3831 w[ 5] = w[ 5] | 0x80000000;
3832 break;
3833
3834 case 24:
3835 w[ 6] = 0x80;
3836 break;
3837
3838 case 25:
3839 w[ 6] = w[ 6] | 0x8000;
3840 break;
3841
3842 case 26:
3843 w[ 6] = w[ 6] | 0x800000;
3844 break;
3845
3846 case 27:
3847 w[ 6] = w[ 6] | 0x80000000;
3848 break;
3849
3850 case 28:
3851 w[ 7] = 0x80;
3852 break;
3853
3854 case 29:
3855 w[ 7] = w[ 7] | 0x8000;
3856 break;
3857
3858 case 30:
3859 w[ 7] = w[ 7] | 0x800000;
3860 break;
3861
3862 case 31:
3863 w[ 7] = w[ 7] | 0x80000000;
3864 break;
3865
3866 case 32:
3867 w[ 8] = 0x80;
3868 break;
3869
3870 case 33:
3871 w[ 8] = w[ 8] | 0x8000;
3872 break;
3873
3874 case 34:
3875 w[ 8] = w[ 8] | 0x800000;
3876 break;
3877
3878 case 35:
3879 w[ 8] = w[ 8] | 0x80000000;
3880 break;
3881
3882 case 36:
3883 w[ 9] = 0x80;
3884 break;
3885
3886 case 37:
3887 w[ 9] = w[ 9] | 0x8000;
3888 break;
3889
3890 case 38:
3891 w[ 9] = w[ 9] | 0x800000;
3892 break;
3893
3894 case 39:
3895 w[ 9] = w[ 9] | 0x80000000;
3896 break;
3897
3898 case 40:
3899 w[10] = 0x80;
3900 break;
3901
3902 case 41:
3903 w[10] = w[10] | 0x8000;
3904 break;
3905
3906 case 42:
3907 w[10] = w[10] | 0x800000;
3908 break;
3909
3910 case 43:
3911 w[10] = w[10] | 0x80000000;
3912 break;
3913
3914 case 44:
3915 w[11] = 0x80;
3916 break;
3917
3918 case 45:
3919 w[11] = w[11] | 0x8000;
3920 break;
3921
3922 case 46:
3923 w[11] = w[11] | 0x800000;
3924 break;
3925
3926 case 47:
3927 w[11] = w[11] | 0x80000000;
3928 break;
3929
3930 case 48:
3931 w[12] = 0x80;
3932 break;
3933
3934 case 49:
3935 w[12] = w[12] | 0x8000;
3936 break;
3937
3938 case 50:
3939 w[12] = w[12] | 0x800000;
3940 break;
3941
3942 case 51:
3943 w[12] = w[12] | 0x80000000;
3944 break;
3945
3946 case 52:
3947 w[13] = 0x80;
3948 break;
3949
3950 case 53:
3951 w[13] = w[13] | 0x8000;
3952 break;
3953
3954 case 54:
3955 w[13] = w[13] | 0x800000;
3956 break;
3957
3958 case 55:
3959 w[13] = w[13] | 0x80000000;
3960 break;
3961
3962 case 56:
3963 w[14] = 0x80;
3964 break;
3965
3966 case 57:
3967 w[14] = w[14] | 0x8000;
3968 break;
3969
3970 case 58:
3971 w[14] = w[14] | 0x800000;
3972 break;
3973
3974 case 59:
3975 w[14] = w[14] | 0x80000000;
3976 break;
3977
3978 case 60:
3979 w[15] = 0x80;
3980 break;
3981
3982 case 61:
3983 w[15] = w[15] | 0x8000;
3984 break;
3985
3986 case 62:
3987 w[15] = w[15] | 0x800000;
3988 break;
3989
3990 case 63:
3991 w[15] = w[15] | 0x80000000;
3992 break;
3993 }
3994 }
3995
3996 inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
3997 {
3998 #if defined IS_AMD || defined IS_GENERIC
3999 const int offset_mod_4 = offset & 3;
4000
4001 const int offset_minus_4 = 4 - offset;
4002
4003 switch (offset / 4)
4004 {
4005 case 0:
4006 w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4);
4007 w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4);
4008 w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4009 w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4010 w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4011 w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4012 w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4013 w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4014 w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4015 w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4016 w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4017 w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4018 w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4019 w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4020 w0[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4021
4022 if (offset_mod_4 == 0)
4023 {
4024 w0[0] = w0[1];
4025 w0[1] = w0[2];
4026 w0[2] = w0[3];
4027 w0[3] = w1[0];
4028 w1[0] = w1[1];
4029 w1[1] = w1[2];
4030 w1[2] = w1[3];
4031 w1[3] = w2[0];
4032 w2[0] = w2[1];
4033 w2[1] = w2[2];
4034 w2[2] = w2[3];
4035 w2[3] = w3[0];
4036 w3[0] = w3[1];
4037 w3[1] = w3[2];
4038 w3[2] = 0;
4039 }
4040
4041 break;
4042
4043 case 1:
4044 w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4);
4045 w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4046 w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4047 w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4048 w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4049 w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4050 w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4051 w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4052 w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4053 w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4054 w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4055 w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4056 w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4057 w0[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4058 w0[0] = 0;
4059
4060 if (offset_mod_4 == 0)
4061 {
4062 w0[1] = w0[2];
4063 w0[2] = w0[3];
4064 w0[3] = w1[0];
4065 w1[0] = w1[1];
4066 w1[1] = w1[2];
4067 w1[2] = w1[3];
4068 w1[3] = w2[0];
4069 w2[0] = w2[1];
4070 w2[1] = w2[2];
4071 w2[2] = w2[3];
4072 w2[3] = w3[0];
4073 w3[0] = w3[1];
4074 w3[1] = w3[2];
4075 w3[2] = 0;
4076 }
4077
4078 break;
4079
4080 case 2:
4081 w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4);
4082 w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4083 w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4084 w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4085 w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4086 w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4087 w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4088 w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4089 w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4090 w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4091 w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4092 w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4093 w0[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4094 w0[1] = 0;
4095 w0[0] = 0;
4096
4097 if (offset_mod_4 == 0)
4098 {
4099 w0[2] = w0[3];
4100 w0[3] = w1[0];
4101 w1[0] = w1[1];
4102 w1[1] = w1[2];
4103 w1[2] = w1[3];
4104 w1[3] = w2[0];
4105 w2[0] = w2[1];
4106 w2[1] = w2[2];
4107 w2[2] = w2[3];
4108 w2[3] = w3[0];
4109 w3[0] = w3[1];
4110 w3[1] = w3[2];
4111 w3[2] = 0;
4112 }
4113
4114 break;
4115
4116 case 3:
4117 w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4);
4118 w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4119 w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4120 w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4121 w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4122 w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4123 w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4124 w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4125 w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4126 w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4127 w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4128 w0[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4129 w0[2] = 0;
4130 w0[1] = 0;
4131 w0[0] = 0;
4132
4133 if (offset_mod_4 == 0)
4134 {
4135 w0[3] = w1[0];
4136 w1[0] = w1[1];
4137 w1[1] = w1[2];
4138 w1[2] = w1[3];
4139 w1[3] = w2[0];
4140 w2[0] = w2[1];
4141 w2[1] = w2[2];
4142 w2[2] = w2[3];
4143 w2[3] = w3[0];
4144 w3[0] = w3[1];
4145 w3[1] = w3[2];
4146 w3[2] = 0;
4147 }
4148
4149 break;
4150
4151 case 4:
4152 w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4);
4153 w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4154 w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4155 w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4156 w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4157 w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4158 w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4159 w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4160 w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4161 w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4162 w1[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4163 w0[3] = 0;
4164 w0[2] = 0;
4165 w0[1] = 0;
4166 w0[0] = 0;
4167
4168 if (offset_mod_4 == 0)
4169 {
4170 w1[0] = w1[1];
4171 w1[1] = w1[2];
4172 w1[2] = w1[3];
4173 w1[3] = w2[0];
4174 w2[0] = w2[1];
4175 w2[1] = w2[2];
4176 w2[2] = w2[3];
4177 w2[3] = w3[0];
4178 w3[0] = w3[1];
4179 w3[1] = w3[2];
4180 w3[2] = 0;
4181 }
4182
4183 break;
4184
4185 case 5:
4186 w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4);
4187 w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4188 w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4189 w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4190 w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4191 w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4192 w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4193 w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4194 w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4195 w1[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4196 w1[0] = 0;
4197 w0[3] = 0;
4198 w0[2] = 0;
4199 w0[1] = 0;
4200 w0[0] = 0;
4201
4202 if (offset_mod_4 == 0)
4203 {
4204 w1[1] = w1[2];
4205 w1[2] = w1[3];
4206 w1[3] = w2[0];
4207 w2[0] = w2[1];
4208 w2[1] = w2[2];
4209 w2[2] = w2[3];
4210 w2[3] = w3[0];
4211 w3[0] = w3[1];
4212 w3[1] = w3[2];
4213 w3[2] = 0;
4214 }
4215
4216 break;
4217
4218 case 6:
4219 w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4);
4220 w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4221 w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4222 w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4223 w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4224 w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4225 w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4226 w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4227 w1[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4228 w1[1] = 0;
4229 w1[0] = 0;
4230 w0[3] = 0;
4231 w0[2] = 0;
4232 w0[1] = 0;
4233 w0[0] = 0;
4234
4235 if (offset_mod_4 == 0)
4236 {
4237 w1[2] = w1[3];
4238 w1[3] = w2[0];
4239 w2[0] = w2[1];
4240 w2[1] = w2[2];
4241 w2[2] = w2[3];
4242 w2[3] = w3[0];
4243 w3[0] = w3[1];
4244 w3[1] = w3[2];
4245 w3[2] = 0;
4246 }
4247
4248 break;
4249
4250 case 7:
4251 w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4);
4252 w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4253 w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4254 w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4255 w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4256 w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4257 w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4258 w1[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4259 w1[2] = 0;
4260 w1[1] = 0;
4261 w1[0] = 0;
4262 w0[3] = 0;
4263 w0[2] = 0;
4264 w0[1] = 0;
4265 w0[0] = 0;
4266
4267 if (offset_mod_4 == 0)
4268 {
4269 w1[3] = w2[0];
4270 w2[0] = w2[1];
4271 w2[1] = w2[2];
4272 w2[2] = w2[3];
4273 w2[3] = w3[0];
4274 w3[0] = w3[1];
4275 w3[1] = w3[2];
4276 w3[2] = 0;
4277 }
4278
4279 break;
4280
4281 case 8:
4282 w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4);
4283 w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4284 w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4285 w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4286 w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4287 w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4288 w2[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4289 w1[3] = 0;
4290 w1[2] = 0;
4291 w1[1] = 0;
4292 w1[0] = 0;
4293 w0[3] = 0;
4294 w0[2] = 0;
4295 w0[1] = 0;
4296 w0[0] = 0;
4297
4298 if (offset_mod_4 == 0)
4299 {
4300 w2[0] = w2[1];
4301 w2[1] = w2[2];
4302 w2[2] = w2[3];
4303 w2[3] = w3[0];
4304 w3[0] = w3[1];
4305 w3[1] = w3[2];
4306 w3[2] = 0;
4307 }
4308
4309 break;
4310
4311 case 9:
4312 w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4);
4313 w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4314 w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4315 w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4316 w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4317 w2[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4318 w2[0] = 0;
4319 w1[3] = 0;
4320 w1[2] = 0;
4321 w1[1] = 0;
4322 w1[0] = 0;
4323 w0[3] = 0;
4324 w0[2] = 0;
4325 w0[1] = 0;
4326 w0[0] = 0;
4327
4328 if (offset_mod_4 == 0)
4329 {
4330 w2[1] = w2[2];
4331 w2[2] = w2[3];
4332 w2[3] = w3[0];
4333 w3[0] = w3[1];
4334 w3[1] = w3[2];
4335 w3[2] = 0;
4336 }
4337
4338 break;
4339
4340 case 10:
4341 w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4);
4342 w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4343 w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4344 w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4345 w2[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4346 w2[1] = 0;
4347 w2[0] = 0;
4348 w1[3] = 0;
4349 w1[2] = 0;
4350 w1[1] = 0;
4351 w1[0] = 0;
4352 w0[3] = 0;
4353 w0[2] = 0;
4354 w0[1] = 0;
4355 w0[0] = 0;
4356
4357 if (offset_mod_4 == 0)
4358 {
4359 w2[2] = w2[3];
4360 w2[3] = w3[0];
4361 w3[0] = w3[1];
4362 w3[1] = w3[2];
4363 w3[2] = 0;
4364 }
4365
4366 break;
4367
4368 case 11:
4369 w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4);
4370 w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4371 w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4372 w2[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4373 w2[2] = 0;
4374 w2[1] = 0;
4375 w2[0] = 0;
4376 w1[3] = 0;
4377 w1[2] = 0;
4378 w1[1] = 0;
4379 w1[0] = 0;
4380 w0[3] = 0;
4381 w0[2] = 0;
4382 w0[1] = 0;
4383 w0[0] = 0;
4384
4385 if (offset_mod_4 == 0)
4386 {
4387 w2[3] = w3[0];
4388 w3[0] = w3[1];
4389 w3[1] = w3[2];
4390 w3[2] = 0;
4391 }
4392
4393 break;
4394
4395 case 12:
4396 w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4);
4397 w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4398 w3[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4399 w2[3] = 0;
4400 w2[2] = 0;
4401 w2[1] = 0;
4402 w2[0] = 0;
4403 w1[3] = 0;
4404 w1[2] = 0;
4405 w1[1] = 0;
4406 w1[0] = 0;
4407 w0[3] = 0;
4408 w0[2] = 0;
4409 w0[1] = 0;
4410 w0[0] = 0;
4411
4412 if (offset_mod_4 == 0)
4413 {
4414 w3[0] = w3[1];
4415 w3[1] = w3[2];
4416 w3[2] = 0;
4417 }
4418
4419 break;
4420
4421 case 13:
4422 w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4);
4423 w3[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4424 w3[0] = 0;
4425 w2[3] = 0;
4426 w2[2] = 0;
4427 w2[1] = 0;
4428 w2[0] = 0;
4429 w1[3] = 0;
4430 w1[2] = 0;
4431 w1[1] = 0;
4432 w1[0] = 0;
4433 w0[3] = 0;
4434 w0[2] = 0;
4435 w0[1] = 0;
4436 w0[0] = 0;
4437
4438 if (offset_mod_4 == 0)
4439 {
4440 w3[1] = w3[2];
4441 w3[2] = 0;
4442 }
4443
4444 break;
4445 }
4446 #endif
4447
4448 #ifdef IS_NV
4449 const int offset_minus_4 = 4 - (offset % 4);
4450
4451 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
4452
4453 switch (offset / 4)
4454 {
4455 case 0:
4456 w3[1] = __byte_perm (w3[0], w3[1], selector);
4457 w3[0] = __byte_perm (w2[3], w3[0], selector);
4458 w2[3] = __byte_perm (w2[2], w2[3], selector);
4459 w2[2] = __byte_perm (w2[1], w2[2], selector);
4460 w2[1] = __byte_perm (w2[0], w2[1], selector);
4461 w2[0] = __byte_perm (w1[3], w2[0], selector);
4462 w1[3] = __byte_perm (w1[2], w1[3], selector);
4463 w1[2] = __byte_perm (w1[1], w1[2], selector);
4464 w1[1] = __byte_perm (w1[0], w1[1], selector);
4465 w1[0] = __byte_perm (w0[3], w1[0], selector);
4466 w0[3] = __byte_perm (w0[2], w0[3], selector);
4467 w0[2] = __byte_perm (w0[1], w0[2], selector);
4468 w0[1] = __byte_perm (w0[0], w0[1], selector);
4469 w0[0] = __byte_perm ( 0, w0[0], selector);
4470
4471 break;
4472
4473 case 1:
4474 w3[1] = __byte_perm (w2[3], w3[0], selector);
4475 w3[0] = __byte_perm (w2[2], w2[3], selector);
4476 w2[3] = __byte_perm (w2[1], w2[2], selector);
4477 w2[2] = __byte_perm (w2[0], w2[1], selector);
4478 w2[1] = __byte_perm (w1[3], w2[0], selector);
4479 w2[0] = __byte_perm (w1[2], w1[3], selector);
4480 w1[3] = __byte_perm (w1[1], w1[2], selector);
4481 w1[2] = __byte_perm (w1[0], w1[1], selector);
4482 w1[1] = __byte_perm (w0[3], w1[0], selector);
4483 w1[0] = __byte_perm (w0[2], w0[3], selector);
4484 w0[3] = __byte_perm (w0[1], w0[2], selector);
4485 w0[2] = __byte_perm (w0[0], w0[1], selector);
4486 w0[1] = __byte_perm ( 0, w0[0], selector);
4487 w0[0] = 0;
4488
4489 break;
4490
4491 case 2:
4492 w3[1] = __byte_perm (w2[2], w2[3], selector);
4493 w3[0] = __byte_perm (w2[1], w2[2], selector);
4494 w2[3] = __byte_perm (w2[0], w2[1], selector);
4495 w2[2] = __byte_perm (w1[3], w2[0], selector);
4496 w2[1] = __byte_perm (w1[2], w1[3], selector);
4497 w2[0] = __byte_perm (w1[1], w1[2], selector);
4498 w1[3] = __byte_perm (w1[0], w1[1], selector);
4499 w1[2] = __byte_perm (w0[3], w1[0], selector);
4500 w1[1] = __byte_perm (w0[2], w0[3], selector);
4501 w1[0] = __byte_perm (w0[1], w0[2], selector);
4502 w0[3] = __byte_perm (w0[0], w0[1], selector);
4503 w0[2] = __byte_perm ( 0, w0[0], selector);
4504 w0[1] = 0;
4505 w0[0] = 0;
4506
4507 break;
4508
4509 case 3:
4510 w3[1] = __byte_perm (w2[1], w2[2], selector);
4511 w3[0] = __byte_perm (w2[0], w2[1], selector);
4512 w2[3] = __byte_perm (w1[3], w2[0], selector);
4513 w2[2] = __byte_perm (w1[2], w1[3], selector);
4514 w2[1] = __byte_perm (w1[1], w1[2], selector);
4515 w2[0] = __byte_perm (w1[0], w1[1], selector);
4516 w1[3] = __byte_perm (w0[3], w1[0], selector);
4517 w1[2] = __byte_perm (w0[2], w0[3], selector);
4518 w1[1] = __byte_perm (w0[1], w0[2], selector);
4519 w1[0] = __byte_perm (w0[0], w0[1], selector);
4520 w0[3] = __byte_perm ( 0, w0[0], selector);
4521 w0[2] = 0;
4522 w0[1] = 0;
4523 w0[0] = 0;
4524
4525 break;
4526
4527 case 4:
4528 w3[1] = __byte_perm (w2[0], w2[1], selector);
4529 w3[0] = __byte_perm (w1[3], w2[0], selector);
4530 w2[3] = __byte_perm (w1[2], w1[3], selector);
4531 w2[2] = __byte_perm (w1[1], w1[2], selector);
4532 w2[1] = __byte_perm (w1[0], w1[1], selector);
4533 w2[0] = __byte_perm (w0[3], w1[0], selector);
4534 w1[3] = __byte_perm (w0[2], w0[3], selector);
4535 w1[2] = __byte_perm (w0[1], w0[2], selector);
4536 w1[1] = __byte_perm (w0[0], w0[1], selector);
4537 w1[0] = __byte_perm ( 0, w0[0], selector);
4538 w0[3] = 0;
4539 w0[2] = 0;
4540 w0[1] = 0;
4541 w0[0] = 0;
4542
4543 break;
4544
4545 case 5:
4546 w3[1] = __byte_perm (w1[3], w2[0], selector);
4547 w3[0] = __byte_perm (w1[2], w1[3], selector);
4548 w2[3] = __byte_perm (w1[1], w1[2], selector);
4549 w2[2] = __byte_perm (w1[0], w1[1], selector);
4550 w2[1] = __byte_perm (w0[3], w1[0], selector);
4551 w2[0] = __byte_perm (w0[2], w0[3], selector);
4552 w1[3] = __byte_perm (w0[1], w0[2], selector);
4553 w1[2] = __byte_perm (w0[0], w0[1], selector);
4554 w1[1] = __byte_perm ( 0, w0[0], selector);
4555 w1[0] = 0;
4556 w0[3] = 0;
4557 w0[2] = 0;
4558 w0[1] = 0;
4559 w0[0] = 0;
4560
4561 break;
4562
4563 case 6:
4564 w3[1] = __byte_perm (w1[2], w1[3], selector);
4565 w3[0] = __byte_perm (w1[1], w1[2], selector);
4566 w2[3] = __byte_perm (w1[0], w1[1], selector);
4567 w2[2] = __byte_perm (w0[3], w1[0], selector);
4568 w2[1] = __byte_perm (w0[2], w0[3], selector);
4569 w2[0] = __byte_perm (w0[1], w0[2], selector);
4570 w1[3] = __byte_perm (w0[0], w0[1], selector);
4571 w1[2] = __byte_perm ( 0, w0[0], selector);
4572 w1[1] = 0;
4573 w1[0] = 0;
4574 w0[3] = 0;
4575 w0[2] = 0;
4576 w0[1] = 0;
4577 w0[0] = 0;
4578
4579 break;
4580
4581 case 7:
4582 w3[1] = __byte_perm (w1[1], w1[2], selector);
4583 w3[0] = __byte_perm (w1[0], w1[1], selector);
4584 w2[3] = __byte_perm (w0[3], w1[0], selector);
4585 w2[2] = __byte_perm (w0[2], w0[3], selector);
4586 w2[1] = __byte_perm (w0[1], w0[2], selector);
4587 w2[0] = __byte_perm (w0[0], w0[1], selector);
4588 w1[3] = __byte_perm ( 0, w0[0], selector);
4589 w1[2] = 0;
4590 w1[1] = 0;
4591 w1[0] = 0;
4592 w0[3] = 0;
4593 w0[2] = 0;
4594 w0[1] = 0;
4595 w0[0] = 0;
4596
4597 break;
4598
4599 case 8:
4600 w3[1] = __byte_perm (w1[0], w1[1], selector);
4601 w3[0] = __byte_perm (w0[3], w1[0], selector);
4602 w2[3] = __byte_perm (w0[2], w0[3], selector);
4603 w2[2] = __byte_perm (w0[1], w0[2], selector);
4604 w2[1] = __byte_perm (w0[0], w0[1], selector);
4605 w2[0] = __byte_perm ( 0, w0[0], selector);
4606 w1[3] = 0;
4607 w1[2] = 0;
4608 w1[1] = 0;
4609 w1[0] = 0;
4610 w0[3] = 0;
4611 w0[2] = 0;
4612 w0[1] = 0;
4613 w0[0] = 0;
4614
4615 break;
4616
4617 case 9:
4618 w3[1] = __byte_perm (w0[3], w1[0], selector);
4619 w3[0] = __byte_perm (w0[2], w0[3], selector);
4620 w2[3] = __byte_perm (w0[1], w0[2], selector);
4621 w2[2] = __byte_perm (w0[0], w0[1], selector);
4622 w2[1] = __byte_perm ( 0, w0[0], selector);
4623 w2[0] = 0;
4624 w1[3] = 0;
4625 w1[2] = 0;
4626 w1[1] = 0;
4627 w1[0] = 0;
4628 w0[3] = 0;
4629 w0[2] = 0;
4630 w0[1] = 0;
4631 w0[0] = 0;
4632
4633 break;
4634
4635 case 10:
4636 w3[1] = __byte_perm (w0[2], w0[3], selector);
4637 w3[0] = __byte_perm (w0[1], w0[2], selector);
4638 w2[3] = __byte_perm (w0[0], w0[1], selector);
4639 w2[2] = __byte_perm ( 0, w0[0], selector);
4640 w2[1] = 0;
4641 w2[0] = 0;
4642 w1[3] = 0;
4643 w1[2] = 0;
4644 w1[1] = 0;
4645 w1[0] = 0;
4646 w0[3] = 0;
4647 w0[2] = 0;
4648 w0[1] = 0;
4649 w0[0] = 0;
4650
4651 break;
4652
4653 case 11:
4654 w3[1] = __byte_perm (w0[1], w0[2], selector);
4655 w3[0] = __byte_perm (w0[0], w0[1], selector);
4656 w2[3] = __byte_perm ( 0, w0[0], selector);
4657 w2[2] = 0;
4658 w2[1] = 0;
4659 w2[0] = 0;
4660 w1[3] = 0;
4661 w1[2] = 0;
4662 w1[1] = 0;
4663 w1[0] = 0;
4664 w0[3] = 0;
4665 w0[2] = 0;
4666 w0[1] = 0;
4667 w0[0] = 0;
4668
4669 break;
4670
4671 case 12:
4672 w3[1] = __byte_perm (w0[0], w0[1], selector);
4673 w3[0] = __byte_perm ( 0, w0[0], selector);
4674 w2[3] = 0;
4675 w2[2] = 0;
4676 w2[1] = 0;
4677 w2[0] = 0;
4678 w1[3] = 0;
4679 w1[2] = 0;
4680 w1[1] = 0;
4681 w1[0] = 0;
4682 w0[3] = 0;
4683 w0[2] = 0;
4684 w0[1] = 0;
4685 w0[0] = 0;
4686
4687 break;
4688
4689 case 13:
4690 w3[1] = __byte_perm ( 0, w0[0], selector);
4691 w3[0] = 0;
4692 w2[3] = 0;
4693 w2[2] = 0;
4694 w2[1] = 0;
4695 w2[0] = 0;
4696 w1[3] = 0;
4697 w1[2] = 0;
4698 w1[1] = 0;
4699 w1[0] = 0;
4700 w0[3] = 0;
4701 w0[2] = 0;
4702 w0[1] = 0;
4703 w0[0] = 0;
4704
4705 break;
4706 }
4707 #endif
4708 }
4709
4710 inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
4711 {
4712 #if defined IS_AMD || defined IS_GENERIC
4713 switch (offset / 4)
4714 {
4715 case 0:
4716 w3[2] = amd_bytealign (w3[1], 0, offset);
4717 w3[1] = amd_bytealign (w3[0], w3[1], offset);
4718 w3[0] = amd_bytealign (w2[3], w3[0], offset);
4719 w2[3] = amd_bytealign (w2[2], w2[3], offset);
4720 w2[2] = amd_bytealign (w2[1], w2[2], offset);
4721 w2[1] = amd_bytealign (w2[0], w2[1], offset);
4722 w2[0] = amd_bytealign (w1[3], w2[0], offset);
4723 w1[3] = amd_bytealign (w1[2], w1[3], offset);
4724 w1[2] = amd_bytealign (w1[1], w1[2], offset);
4725 w1[1] = amd_bytealign (w1[0], w1[1], offset);
4726 w1[0] = amd_bytealign (w0[3], w1[0], offset);
4727 w0[3] = amd_bytealign (w0[2], w0[3], offset);
4728 w0[2] = amd_bytealign (w0[1], w0[2], offset);
4729 w0[1] = amd_bytealign (w0[0], w0[1], offset);
4730 w0[0] = amd_bytealign ( 0, w0[0], offset);
4731 break;
4732
4733 case 1:
4734 w3[2] = amd_bytealign (w3[0], 0, offset);
4735 w3[1] = amd_bytealign (w2[3], w3[0], offset);
4736 w3[0] = amd_bytealign (w2[2], w2[3], offset);
4737 w2[3] = amd_bytealign (w2[1], w2[2], offset);
4738 w2[2] = amd_bytealign (w2[0], w2[1], offset);
4739 w2[1] = amd_bytealign (w1[3], w2[0], offset);
4740 w2[0] = amd_bytealign (w1[2], w1[3], offset);
4741 w1[3] = amd_bytealign (w1[1], w1[2], offset);
4742 w1[2] = amd_bytealign (w1[0], w1[1], offset);
4743 w1[1] = amd_bytealign (w0[3], w1[0], offset);
4744 w1[0] = amd_bytealign (w0[2], w0[3], offset);
4745 w0[3] = amd_bytealign (w0[1], w0[2], offset);
4746 w0[2] = amd_bytealign (w0[0], w0[1], offset);
4747 w0[1] = amd_bytealign ( 0, w0[0], offset);
4748 w0[0] = 0;
4749 break;
4750
4751 case 2:
4752 w3[2] = amd_bytealign (w2[3], 0, offset);
4753 w3[1] = amd_bytealign (w2[2], w2[3], offset);
4754 w3[0] = amd_bytealign (w2[1], w2[2], offset);
4755 w2[3] = amd_bytealign (w2[0], w2[1], offset);
4756 w2[2] = amd_bytealign (w1[3], w2[0], offset);
4757 w2[1] = amd_bytealign (w1[2], w1[3], offset);
4758 w2[0] = amd_bytealign (w1[1], w1[2], offset);
4759 w1[3] = amd_bytealign (w1[0], w1[1], offset);
4760 w1[2] = amd_bytealign (w0[3], w1[0], offset);
4761 w1[1] = amd_bytealign (w0[2], w0[3], offset);
4762 w1[0] = amd_bytealign (w0[1], w0[2], offset);
4763 w0[3] = amd_bytealign (w0[0], w0[1], offset);
4764 w0[2] = amd_bytealign ( 0, w0[0], offset);
4765 w0[1] = 0;
4766 w0[0] = 0;
4767 break;
4768
4769 case 3:
4770 w3[2] = amd_bytealign (w2[2], 0, offset);
4771 w3[1] = amd_bytealign (w2[1], w2[2], offset);
4772 w3[0] = amd_bytealign (w2[0], w2[1], offset);
4773 w2[3] = amd_bytealign (w1[3], w2[0], offset);
4774 w2[2] = amd_bytealign (w1[2], w1[3], offset);
4775 w2[1] = amd_bytealign (w1[1], w1[2], offset);
4776 w2[0] = amd_bytealign (w1[0], w1[1], offset);
4777 w1[3] = amd_bytealign (w0[3], w1[0], offset);
4778 w1[2] = amd_bytealign (w0[2], w0[3], offset);
4779 w1[1] = amd_bytealign (w0[1], w0[2], offset);
4780 w1[0] = amd_bytealign (w0[0], w0[1], offset);
4781 w0[3] = amd_bytealign ( 0, w0[0], offset);
4782 w0[2] = 0;
4783 w0[1] = 0;
4784 w0[0] = 0;
4785 break;
4786
4787 case 4:
4788 w3[2] = amd_bytealign (w2[1], 0, offset);
4789 w3[1] = amd_bytealign (w2[0], w2[1], offset);
4790 w3[0] = amd_bytealign (w1[3], w2[0], offset);
4791 w2[3] = amd_bytealign (w1[2], w1[3], offset);
4792 w2[2] = amd_bytealign (w1[1], w1[2], offset);
4793 w2[1] = amd_bytealign (w1[0], w1[1], offset);
4794 w2[0] = amd_bytealign (w0[3], w1[0], offset);
4795 w1[3] = amd_bytealign (w0[2], w0[3], offset);
4796 w1[2] = amd_bytealign (w0[1], w0[2], offset);
4797 w1[1] = amd_bytealign (w0[0], w0[1], offset);
4798 w1[0] = amd_bytealign ( 0, w0[0], offset);
4799 w0[3] = 0;
4800 w0[2] = 0;
4801 w0[1] = 0;
4802 w0[0] = 0;
4803 break;
4804
4805 case 5:
4806 w3[2] = amd_bytealign (w2[0], 0, offset);
4807 w3[1] = amd_bytealign (w1[3], w2[0], offset);
4808 w3[0] = amd_bytealign (w1[2], w1[3], offset);
4809 w2[3] = amd_bytealign (w1[1], w1[2], offset);
4810 w2[2] = amd_bytealign (w1[0], w1[1], offset);
4811 w2[1] = amd_bytealign (w0[3], w1[0], offset);
4812 w2[0] = amd_bytealign (w0[2], w0[3], offset);
4813 w1[3] = amd_bytealign (w0[1], w0[2], offset);
4814 w1[2] = amd_bytealign (w0[0], w0[1], offset);
4815 w1[1] = amd_bytealign ( 0, w0[0], offset);
4816 w1[0] = 0;
4817 w0[3] = 0;
4818 w0[2] = 0;
4819 w0[1] = 0;
4820 w0[0] = 0;
4821 break;
4822
4823 case 6:
4824 w3[2] = amd_bytealign (w1[3], 0, offset);
4825 w3[1] = amd_bytealign (w1[2], w1[3], offset);
4826 w3[0] = amd_bytealign (w1[1], w1[2], offset);
4827 w2[3] = amd_bytealign (w1[0], w1[1], offset);
4828 w2[2] = amd_bytealign (w0[3], w1[0], offset);
4829 w2[1] = amd_bytealign (w0[2], w0[3], offset);
4830 w2[0] = amd_bytealign (w0[1], w0[2], offset);
4831 w1[3] = amd_bytealign (w0[0], w0[1], offset);
4832 w1[2] = amd_bytealign ( 0, w0[0], offset);
4833 w1[1] = 0;
4834 w1[0] = 0;
4835 w0[3] = 0;
4836 w0[2] = 0;
4837 w0[1] = 0;
4838 w0[0] = 0;
4839 break;
4840
4841 case 7:
4842 w3[2] = amd_bytealign (w1[2], 0, offset);
4843 w3[1] = amd_bytealign (w1[1], w1[2], offset);
4844 w3[0] = amd_bytealign (w1[0], w1[1], offset);
4845 w2[3] = amd_bytealign (w0[3], w1[0], offset);
4846 w2[2] = amd_bytealign (w0[2], w0[3], offset);
4847 w2[1] = amd_bytealign (w0[1], w0[2], offset);
4848 w2[0] = amd_bytealign (w0[0], w0[1], offset);
4849 w1[3] = amd_bytealign ( 0, w0[0], offset);
4850 w1[2] = 0;
4851 w1[1] = 0;
4852 w1[0] = 0;
4853 w0[3] = 0;
4854 w0[2] = 0;
4855 w0[1] = 0;
4856 w0[0] = 0;
4857 break;
4858
4859 case 8:
4860 w3[2] = amd_bytealign (w1[1], 0, offset);
4861 w3[1] = amd_bytealign (w1[0], w1[1], offset);
4862 w3[0] = amd_bytealign (w0[3], w1[0], offset);
4863 w2[3] = amd_bytealign (w0[2], w0[3], offset);
4864 w2[2] = amd_bytealign (w0[1], w0[2], offset);
4865 w2[1] = amd_bytealign (w0[0], w0[1], offset);
4866 w2[0] = amd_bytealign ( 0, w0[0], offset);
4867 w1[3] = 0;
4868 w1[2] = 0;
4869 w1[1] = 0;
4870 w1[0] = 0;
4871 w0[3] = 0;
4872 w0[2] = 0;
4873 w0[1] = 0;
4874 w0[0] = 0;
4875 break;
4876
4877 case 9:
4878 w3[2] = amd_bytealign (w1[0], 0, offset);
4879 w3[1] = amd_bytealign (w0[3], w1[0], offset);
4880 w3[0] = amd_bytealign (w0[2], w0[3], offset);
4881 w2[3] = amd_bytealign (w0[1], w0[2], offset);
4882 w2[2] = amd_bytealign (w0[0], w0[1], offset);
4883 w2[1] = amd_bytealign ( 0, w0[0], offset);
4884 w2[0] = 0;
4885 w1[3] = 0;
4886 w1[2] = 0;
4887 w1[1] = 0;
4888 w1[0] = 0;
4889 w0[3] = 0;
4890 w0[2] = 0;
4891 w0[1] = 0;
4892 w0[0] = 0;
4893 break;
4894
4895 case 10:
4896 w3[2] = amd_bytealign (w0[3], 0, offset);
4897 w3[1] = amd_bytealign (w0[2], w0[3], offset);
4898 w3[0] = amd_bytealign (w0[1], w0[2], offset);
4899 w2[3] = amd_bytealign (w0[0], w0[1], offset);
4900 w2[2] = amd_bytealign ( 0, w0[0], offset);
4901 w2[1] = 0;
4902 w2[0] = 0;
4903 w1[3] = 0;
4904 w1[2] = 0;
4905 w1[1] = 0;
4906 w1[0] = 0;
4907 w0[3] = 0;
4908 w0[2] = 0;
4909 w0[1] = 0;
4910 w0[0] = 0;
4911 break;
4912
4913 case 11:
4914 w3[2] = amd_bytealign (w0[2], 0, offset);
4915 w3[1] = amd_bytealign (w0[1], w0[2], offset);
4916 w3[0] = amd_bytealign (w0[0], w0[1], offset);
4917 w2[3] = amd_bytealign ( 0, w0[0], offset);
4918 w2[2] = 0;
4919 w2[1] = 0;
4920 w2[0] = 0;
4921 w1[3] = 0;
4922 w1[2] = 0;
4923 w1[1] = 0;
4924 w1[0] = 0;
4925 w0[3] = 0;
4926 w0[2] = 0;
4927 w0[1] = 0;
4928 w0[0] = 0;
4929 break;
4930
4931 case 12:
4932 w3[2] = amd_bytealign (w0[1], 0, offset);
4933 w3[1] = amd_bytealign (w0[0], w0[1], offset);
4934 w3[0] = amd_bytealign ( 0, w0[0], offset);
4935 w2[3] = 0;
4936 w2[2] = 0;
4937 w2[1] = 0;
4938 w2[0] = 0;
4939 w1[3] = 0;
4940 w1[2] = 0;
4941 w1[1] = 0;
4942 w1[0] = 0;
4943 w0[3] = 0;
4944 w0[2] = 0;
4945 w0[1] = 0;
4946 w0[0] = 0;
4947 break;
4948
4949 case 13:
4950 w3[2] = amd_bytealign (w0[0], 0, offset);
4951 w3[1] = amd_bytealign ( 0, w0[0], offset);
4952 w3[0] = 0;
4953 w2[3] = 0;
4954 w2[2] = 0;
4955 w2[1] = 0;
4956 w2[0] = 0;
4957 w1[3] = 0;
4958 w1[2] = 0;
4959 w1[1] = 0;
4960 w1[0] = 0;
4961 w0[3] = 0;
4962 w0[2] = 0;
4963 w0[1] = 0;
4964 w0[0] = 0;
4965 break;
4966 }
4967 #endif
4968
4969 #ifdef IS_NV
4970 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
4971
4972 switch (offset / 4)
4973 {
4974 case 0:
4975 w3[1] = __byte_perm (w3[1], w3[0], selector);
4976 w3[0] = __byte_perm (w3[0], w2[3], selector);
4977 w2[3] = __byte_perm (w2[3], w2[2], selector);
4978 w2[2] = __byte_perm (w2[2], w2[1], selector);
4979 w2[1] = __byte_perm (w2[1], w2[0], selector);
4980 w2[0] = __byte_perm (w2[0], w1[3], selector);
4981 w1[3] = __byte_perm (w1[3], w1[2], selector);
4982 w1[2] = __byte_perm (w1[2], w1[1], selector);
4983 w1[1] = __byte_perm (w1[1], w1[0], selector);
4984 w1[0] = __byte_perm (w1[0], w0[3], selector);
4985 w0[3] = __byte_perm (w0[3], w0[2], selector);
4986 w0[2] = __byte_perm (w0[2], w0[1], selector);
4987 w0[1] = __byte_perm (w0[1], w0[0], selector);
4988 w0[0] = __byte_perm (w0[0], 0, selector);
4989 break;
4990
4991 case 1:
4992 w3[1] = __byte_perm (w3[0], w2[3], selector);
4993 w3[0] = __byte_perm (w2[3], w2[2], selector);
4994 w2[3] = __byte_perm (w2[2], w2[1], selector);
4995 w2[2] = __byte_perm (w2[1], w2[0], selector);
4996 w2[1] = __byte_perm (w2[0], w1[3], selector);
4997 w2[0] = __byte_perm (w1[3], w1[2], selector);
4998 w1[3] = __byte_perm (w1[2], w1[1], selector);
4999 w1[2] = __byte_perm (w1[1], w1[0], selector);
5000 w1[1] = __byte_perm (w1[0], w0[3], selector);
5001 w1[0] = __byte_perm (w0[3], w0[2], selector);
5002 w0[3] = __byte_perm (w0[2], w0[1], selector);
5003 w0[2] = __byte_perm (w0[1], w0[0], selector);
5004 w0[1] = __byte_perm (w0[0], 0, selector);
5005 w0[0] = 0;
5006 break;
5007
5008 case 2:
5009 w3[1] = __byte_perm (w2[3], w2[2], selector);
5010 w3[0] = __byte_perm (w2[2], w2[1], selector);
5011 w2[3] = __byte_perm (w2[1], w2[0], selector);
5012 w2[2] = __byte_perm (w2[0], w1[3], selector);
5013 w2[1] = __byte_perm (w1[3], w1[2], selector);
5014 w2[0] = __byte_perm (w1[2], w1[1], selector);
5015 w1[3] = __byte_perm (w1[1], w1[0], selector);
5016 w1[2] = __byte_perm (w1[0], w0[3], selector);
5017 w1[1] = __byte_perm (w0[3], w0[2], selector);
5018 w1[0] = __byte_perm (w0[2], w0[1], selector);
5019 w0[3] = __byte_perm (w0[1], w0[0], selector);
5020 w0[2] = __byte_perm (w0[0], 0, selector);
5021 w0[1] = 0;
5022 w0[0] = 0;
5023 break;
5024
5025 case 3:
5026 w3[1] = __byte_perm (w2[2], w2[1], selector);
5027 w3[0] = __byte_perm (w2[1], w2[0], selector);
5028 w2[3] = __byte_perm (w2[0], w1[3], selector);
5029 w2[2] = __byte_perm (w1[3], w1[2], selector);
5030 w2[1] = __byte_perm (w1[2], w1[1], selector);
5031 w2[0] = __byte_perm (w1[1], w1[0], selector);
5032 w1[3] = __byte_perm (w1[0], w0[3], selector);
5033 w1[2] = __byte_perm (w0[3], w0[2], selector);
5034 w1[1] = __byte_perm (w0[2], w0[1], selector);
5035 w1[0] = __byte_perm (w0[1], w0[0], selector);
5036 w0[3] = __byte_perm (w0[0], 0, selector);
5037 w0[2] = 0;
5038 w0[1] = 0;
5039 w0[0] = 0;
5040 break;
5041
5042 case 4:
5043 w3[1] = __byte_perm (w2[1], w2[0], selector);
5044 w3[0] = __byte_perm (w2[0], w1[3], selector);
5045 w2[3] = __byte_perm (w1[3], w1[2], selector);
5046 w2[2] = __byte_perm (w1[2], w1[1], selector);
5047 w2[1] = __byte_perm (w1[1], w1[0], selector);
5048 w2[0] = __byte_perm (w1[0], w0[3], selector);
5049 w1[3] = __byte_perm (w0[3], w0[2], selector);
5050 w1[2] = __byte_perm (w0[2], w0[1], selector);
5051 w1[1] = __byte_perm (w0[1], w0[0], selector);
5052 w1[0] = __byte_perm (w0[0], 0, selector);
5053 w0[3] = 0;
5054 w0[2] = 0;
5055 w0[1] = 0;
5056 w0[0] = 0;
5057 break;
5058
5059 case 5:
5060 w3[1] = __byte_perm (w2[0], w1[3], selector);
5061 w3[0] = __byte_perm (w1[3], w1[2], selector);
5062 w2[3] = __byte_perm (w1[2], w1[1], selector);
5063 w2[2] = __byte_perm (w1[1], w1[0], selector);
5064 w2[1] = __byte_perm (w1[0], w0[3], selector);
5065 w2[0] = __byte_perm (w0[3], w0[2], selector);
5066 w1[3] = __byte_perm (w0[2], w0[1], selector);
5067 w1[2] = __byte_perm (w0[1], w0[0], selector);
5068 w1[1] = __byte_perm (w0[0], 0, selector);
5069 w1[0] = 0;
5070 w0[3] = 0;
5071 w0[2] = 0;
5072 w0[1] = 0;
5073 w0[0] = 0;
5074 break;
5075
5076 case 6:
5077 w3[1] = __byte_perm (w1[3], w1[2], selector);
5078 w3[0] = __byte_perm (w1[2], w1[1], selector);
5079 w2[3] = __byte_perm (w1[1], w1[0], selector);
5080 w2[2] = __byte_perm (w1[0], w0[3], selector);
5081 w2[1] = __byte_perm (w0[3], w0[2], selector);
5082 w2[0] = __byte_perm (w0[2], w0[1], selector);
5083 w1[3] = __byte_perm (w0[1], w0[0], selector);
5084 w1[2] = __byte_perm (w0[0], 0, selector);
5085 w1[1] = 0;
5086 w1[0] = 0;
5087 w0[3] = 0;
5088 w0[2] = 0;
5089 w0[1] = 0;
5090 w0[0] = 0;
5091 break;
5092
5093 case 7:
5094 w3[1] = __byte_perm (w1[2], w1[1], selector);
5095 w3[0] = __byte_perm (w1[1], w1[0], selector);
5096 w2[3] = __byte_perm (w1[0], w0[3], selector);
5097 w2[2] = __byte_perm (w0[3], w0[2], selector);
5098 w2[1] = __byte_perm (w0[2], w0[1], selector);
5099 w2[0] = __byte_perm (w0[1], w0[0], selector);
5100 w1[3] = __byte_perm (w0[0], 0, selector);
5101 w1[2] = 0;
5102 w1[1] = 0;
5103 w1[0] = 0;
5104 w0[3] = 0;
5105 w0[2] = 0;
5106 w0[1] = 0;
5107 w0[0] = 0;
5108 break;
5109
5110 case 8:
5111 w3[1] = __byte_perm (w1[1], w1[0], selector);
5112 w3[0] = __byte_perm (w1[0], w0[3], selector);
5113 w2[3] = __byte_perm (w0[3], w0[2], selector);
5114 w2[2] = __byte_perm (w0[2], w0[1], selector);
5115 w2[1] = __byte_perm (w0[1], w0[0], selector);
5116 w2[0] = __byte_perm (w0[0], 0, selector);
5117 w1[3] = 0;
5118 w1[2] = 0;
5119 w1[1] = 0;
5120 w1[0] = 0;
5121 w0[3] = 0;
5122 w0[2] = 0;
5123 w0[1] = 0;
5124 w0[0] = 0;
5125 break;
5126
5127 case 9:
5128 w3[1] = __byte_perm (w1[0], w0[3], selector);
5129 w3[0] = __byte_perm (w0[3], w0[2], selector);
5130 w2[3] = __byte_perm (w0[2], w0[1], selector);
5131 w2[2] = __byte_perm (w0[1], w0[0], selector);
5132 w2[1] = __byte_perm (w0[0], 0, selector);
5133 w2[0] = 0;
5134 w1[3] = 0;
5135 w1[2] = 0;
5136 w1[1] = 0;
5137 w1[0] = 0;
5138 w0[3] = 0;
5139 w0[2] = 0;
5140 w0[1] = 0;
5141 w0[0] = 0;
5142 break;
5143
5144 case 10:
5145 w3[1] = __byte_perm (w0[3], w0[2], selector);
5146 w3[0] = __byte_perm (w0[2], w0[1], selector);
5147 w2[3] = __byte_perm (w0[1], w0[0], selector);
5148 w2[2] = __byte_perm (w0[0], 0, selector);
5149 w2[1] = 0;
5150 w2[0] = 0;
5151 w1[3] = 0;
5152 w1[2] = 0;
5153 w1[1] = 0;
5154 w1[0] = 0;
5155 w0[3] = 0;
5156 w0[2] = 0;
5157 w0[1] = 0;
5158 w0[0] = 0;
5159 break;
5160
5161 case 11:
5162 w3[1] = __byte_perm (w0[2], w0[1], selector);
5163 w3[0] = __byte_perm (w0[1], w0[0], selector);
5164 w2[3] = __byte_perm (w0[0], 0, selector);
5165 w2[2] = 0;
5166 w2[1] = 0;
5167 w2[0] = 0;
5168 w1[3] = 0;
5169 w1[2] = 0;
5170 w1[1] = 0;
5171 w1[0] = 0;
5172 w0[3] = 0;
5173 w0[2] = 0;
5174 w0[1] = 0;
5175 w0[0] = 0;
5176 break;
5177
5178 case 12:
5179 w3[1] = __byte_perm (w0[1], w0[0], selector);
5180 w3[0] = __byte_perm (w0[0], 0, selector);
5181 w2[3] = 0;
5182 w2[2] = 0;
5183 w2[1] = 0;
5184 w2[0] = 0;
5185 w1[3] = 0;
5186 w1[2] = 0;
5187 w1[1] = 0;
5188 w1[0] = 0;
5189 w0[3] = 0;
5190 w0[2] = 0;
5191 w0[1] = 0;
5192 w0[0] = 0;
5193 break;
5194
5195 case 13:
5196 w3[1] = __byte_perm (w0[0], 0, selector);
5197 w3[0] = 0;
5198 w2[3] = 0;
5199 w2[2] = 0;
5200 w2[1] = 0;
5201 w2[0] = 0;
5202 w1[3] = 0;
5203 w1[2] = 0;
5204 w1[1] = 0;
5205 w1[0] = 0;
5206 w0[3] = 0;
5207 w0[2] = 0;
5208 w0[1] = 0;
5209 w0[0] = 0;
5210 break;
5211 }
5212 #endif
5213 }
5214
5215 inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len)
5216 {
5217 #if defined cl_amd_media_ops
5218 switch (salt_len)
5219 {
5220 case 0: sw[0] = w0;
5221 break;
5222 case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3);
5223 sw[1] = amd_bytealign (sw[1] >> 8, w0, 3);
5224 break;
5225 case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2);
5226 sw[1] = amd_bytealign (sw[1] >> 16, w0, 2);
5227 break;
5228 case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1);
5229 sw[1] = amd_bytealign (sw[1] >> 24, w0, 1);
5230 break;
5231 case 4: sw[1] = w0;
5232 break;
5233 case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3);
5234 sw[2] = amd_bytealign (sw[2] >> 8, w0, 3);
5235 break;
5236 case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2);
5237 sw[2] = amd_bytealign (sw[2] >> 16, w0, 2);
5238 break;
5239 case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1);
5240 sw[2] = amd_bytealign (sw[2] >> 24, w0, 1);
5241 break;
5242 case 8: sw[2] = w0;
5243 break;
5244 case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3);
5245 sw[3] = amd_bytealign (sw[3] >> 8, w0, 3);
5246 break;
5247 case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2);
5248 sw[3] = amd_bytealign (sw[3] >> 16, w0, 2);
5249 break;
5250 case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1);
5251 sw[3] = amd_bytealign (sw[3] >> 24, w0, 1);
5252 break;
5253 case 12: sw[3] = w0;
5254 break;
5255 case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3);
5256 sw[4] = amd_bytealign (sw[4] >> 8, w0, 3);
5257 break;
5258 case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2);
5259 sw[4] = amd_bytealign (sw[4] >> 16, w0, 2);
5260 break;
5261 case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1);
5262 sw[4] = amd_bytealign (sw[4] >> 24, w0, 1);
5263 break;
5264 case 16: sw[4] = w0;
5265 break;
5266 case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3);
5267 sw[5] = amd_bytealign (sw[5] >> 8, w0, 3);
5268 break;
5269 case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2);
5270 sw[5] = amd_bytealign (sw[5] >> 16, w0, 2);
5271 break;
5272 case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1);
5273 sw[5] = amd_bytealign (sw[5] >> 24, w0, 1);
5274 break;
5275 case 20: sw[5] = w0;
5276 break;
5277 case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3);
5278 sw[6] = amd_bytealign (sw[6] >> 8, w0, 3);
5279 break;
5280 case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2);
5281 sw[6] = amd_bytealign (sw[6] >> 16, w0, 2);
5282 break;
5283 case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1);
5284 sw[6] = amd_bytealign (sw[6] >> 24, w0, 1);
5285 break;
5286 case 24: sw[6] = w0;
5287 break;
5288 case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3);
5289 sw[7] = amd_bytealign (sw[7] >> 8, w0, 3);
5290 break;
5291 case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2);
5292 sw[7] = amd_bytealign (sw[7] >> 16, w0, 2);
5293 break;
5294 case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1);
5295 sw[7] = amd_bytealign (sw[7] >> 24, w0, 1);
5296 break;
5297 case 28: sw[7] = w0;
5298 break;
5299 case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3);
5300 sw[8] = amd_bytealign (sw[8] >> 8, w0, 3);
5301 break;
5302 case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2);
5303 sw[8] = amd_bytealign (sw[8] >> 16, w0, 2);
5304 break;
5305 case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1);
5306 sw[8] = amd_bytealign (sw[8] >> 24, w0, 1);
5307 break;
5308 }
5309 #else
5310 switch (salt_len)
5311 {
5312 case 0: sw[0] = w0;
5313 break;
5314 case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8);
5315 sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5316 break;
5317 case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16);
5318 sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5319 break;
5320 case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24);
5321 sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5322 break;
5323 case 4: sw[1] = w0;
5324 break;
5325 case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5326 sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5327 break;
5328 case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5329 sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5330 break;
5331 case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5332 sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5333 break;
5334 case 8: sw[2] = w0;
5335 break;
5336 case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5337 sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5338 break;
5339 case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5340 sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5341 break;
5342 case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5343 sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5344 break;
5345 case 12: sw[3] = w0;
5346 break;
5347 case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5348 sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5349 break;
5350 case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5351 sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5352 break;
5353 case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5354 sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5355 break;
5356 case 16: sw[4] = w0;
5357 break;
5358 case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5359 sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5360 break;
5361 case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5362 sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5363 break;
5364 case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5365 sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5366 break;
5367 case 20: sw[5] = w0;
5368 break;
5369 case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5370 sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5371 break;
5372 case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5373 sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5374 break;
5375 case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5376 sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5377 break;
5378 case 24: sw[6] = w0;
5379 break;
5380 case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5381 sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5382 break;
5383 case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5384 sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5385 break;
5386 case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5387 sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5388 break;
5389 case 28: sw[7] = w0;
5390 break;
5391 case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5392 sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24);
5393 break;
5394 case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5395 sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16);
5396 break;
5397 case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5398 sw[8] = (sw[8] & 0xff000000) | (w0 >> 8);
5399 break;
5400 }
5401 #endif
5402 }
5403
5404 inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len)
5405 {
5406 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5407
5408 switch (salt_len)
5409 {
5410 case 0: sw[0] = w0;
5411 break;
5412 case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8);
5413 sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5414 break;
5415 case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16);
5416 sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5417 break;
5418 case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24);
5419 sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5420 break;
5421 case 4: sw[1] = w0;
5422 break;
5423 case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5424 sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5425 break;
5426 case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5427 sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5428 break;
5429 case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5430 sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5431 break;
5432 case 8: sw[2] = w0;
5433 break;
5434 case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5435 sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5436 break;
5437 case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5438 sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5439 break;
5440 case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5441 sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5442 break;
5443 case 12: sw[3] = w0;
5444 break;
5445 case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5446 sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5447 break;
5448 case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5449 sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5450 break;
5451 case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5452 sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5453 break;
5454 case 16: sw[4] = w0;
5455 break;
5456 case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5457 sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5458 break;
5459 case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5460 sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5461 break;
5462 case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5463 sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5464 break;
5465 case 20: sw[5] = w0;
5466 break;
5467 case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5468 sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5469 break;
5470 case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5471 sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5472 break;
5473 case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5474 sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5475 break;
5476 case 24: sw[6] = w0;
5477 break;
5478 case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5479 sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5480 break;
5481 case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5482 sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5483 break;
5484 case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5485 sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5486 break;
5487 case 28: sw[7] = w0;
5488 break;
5489 case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5490 sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24);
5491 break;
5492 case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5493 sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16);
5494 break;
5495 case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5496 sw[8] = (sw[8] & 0x000000ff) | (w0 << 8);
5497 break;
5498 }
5499 }
5500
5501 inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5502 {
5503 #if defined cl_amd_media_ops
5504 switch (salt_len)
5505 {
5506 case 0: w0[0] = wx;
5507 break;
5508 case 1: w0[0] = amd_bytealign (wx, w0[0] << 24, 3);
5509 w0[1] = amd_bytealign (w0[1] >> 8, wx, 3);
5510 break;
5511 case 2: w0[0] = amd_bytealign (wx, w0[0] << 16, 2);
5512 w0[1] = amd_bytealign (w0[1] >> 16, wx, 2);
5513 break;
5514 case 3: w0[0] = amd_bytealign (wx, w0[0] << 8, 1);
5515 w0[1] = amd_bytealign (w0[1] >> 24, wx, 1);
5516 break;
5517 case 4: w0[1] = wx;
5518 break;
5519 case 5: w0[1] = amd_bytealign (wx, w0[1] << 24, 3);
5520 w0[2] = amd_bytealign (w0[2] >> 8, wx, 3);
5521 break;
5522 case 6: w0[1] = amd_bytealign (wx, w0[1] << 16, 2);
5523 w0[2] = amd_bytealign (w0[2] >> 16, wx, 2);
5524 break;
5525 case 7: w0[1] = amd_bytealign (wx, w0[1] << 8, 1);
5526 w0[2] = amd_bytealign (w0[2] >> 24, wx, 1);
5527 break;
5528 case 8: w0[2] = wx;
5529 break;
5530 case 9: w0[2] = amd_bytealign (wx, w0[2] << 24, 3);
5531 w0[3] = amd_bytealign (w0[3] >> 8, wx, 3);
5532 break;
5533 case 10: w0[2] = amd_bytealign (wx, w0[2] << 16, 2);
5534 w0[3] = amd_bytealign (w0[3] >> 16, wx, 2);
5535 break;
5536 case 11: w0[2] = amd_bytealign (wx, w0[2] << 8, 1);
5537 w0[3] = amd_bytealign (w0[3] >> 24, wx, 1);
5538 break;
5539 case 12: w0[3] = wx;
5540 break;
5541 case 13: w0[3] = amd_bytealign (wx, w0[3] << 24, 3);
5542 w1[0] = amd_bytealign (w1[0] >> 8, wx, 3);
5543 break;
5544 case 14: w0[3] = amd_bytealign (wx, w0[3] << 16, 2);
5545 w1[0] = amd_bytealign (w1[0] >> 16, wx, 2);
5546 break;
5547 case 15: w0[3] = amd_bytealign (wx, w0[3] << 8, 1);
5548 w1[0] = amd_bytealign (w1[0] >> 24, wx, 1);
5549 break;
5550 case 16: w1[0] = wx;
5551 break;
5552 case 17: w1[0] = amd_bytealign (wx, w1[0] << 24, 3);
5553 w1[1] = amd_bytealign (w1[1] >> 8, wx, 3);
5554 break;
5555 case 18: w1[0] = amd_bytealign (wx, w1[0] << 16, 2);
5556 w1[1] = amd_bytealign (w1[1] >> 16, wx, 2);
5557 break;
5558 case 19: w1[0] = amd_bytealign (wx, w1[0] << 8, 1);
5559 w1[1] = amd_bytealign (w1[1] >> 24, wx, 1);
5560 break;
5561 case 20: w1[1] = wx;
5562 break;
5563 case 21: w1[1] = amd_bytealign (wx, w1[1] << 24, 3);
5564 w1[2] = amd_bytealign (w1[2] >> 8, wx, 3);
5565 break;
5566 case 22: w1[1] = amd_bytealign (wx, w1[1] << 16, 2);
5567 w1[2] = amd_bytealign (w1[2] >> 16, wx, 2);
5568 break;
5569 case 23: w1[1] = amd_bytealign (wx, w1[1] << 8, 1);
5570 w1[2] = amd_bytealign (w1[2] >> 24, wx, 1);
5571 break;
5572 case 24: w1[2] = wx;
5573 break;
5574 case 25: w1[2] = amd_bytealign (wx, w1[2] << 24, 3);
5575 w1[3] = amd_bytealign (w1[3] >> 8, wx, 3);
5576 break;
5577 case 26: w1[2] = amd_bytealign (wx, w1[2] << 16, 2);
5578 w1[3] = amd_bytealign (w1[3] >> 16, wx, 2);
5579 break;
5580 case 27: w1[2] = amd_bytealign (wx, w1[2] << 8, 1);
5581 w1[3] = amd_bytealign (w1[3] >> 24, wx, 1);
5582 break;
5583 case 28: w1[3] = wx;
5584 break;
5585 case 29: w1[3] = amd_bytealign (wx, w1[3] << 24, 3);
5586 w2[0] = amd_bytealign (w2[0] >> 8, wx, 3);
5587 break;
5588 case 30: w1[3] = amd_bytealign (wx, w1[3] << 16, 2);
5589 w2[0] = amd_bytealign (w2[0] >> 16, wx, 2);
5590 break;
5591 case 31: w1[3] = amd_bytealign (wx, w1[3] << 8, 1);
5592 w2[0] = amd_bytealign (w2[0] >> 24, wx, 1);
5593 break;
5594 case 32: w2[0] = wx;
5595 break;
5596 case 33: w2[0] = amd_bytealign (wx, w2[0] << 24, 3);
5597 w2[1] = amd_bytealign (w2[1] >> 8, wx, 3);
5598 break;
5599 case 34: w2[0] = amd_bytealign (wx, w2[0] << 16, 2);
5600 w2[1] = amd_bytealign (w2[1] >> 16, wx, 2);
5601 break;
5602 case 35: w2[0] = amd_bytealign (wx, w2[0] << 8, 1);
5603 w2[1] = amd_bytealign (w2[1] >> 24, wx, 1);
5604 break;
5605 case 36: w2[1] = wx;
5606 break;
5607 case 37: w2[1] = amd_bytealign (wx, w2[1] << 24, 3);
5608 w2[2] = amd_bytealign (w2[2] >> 8, wx, 3);
5609 break;
5610 case 38: w2[1] = amd_bytealign (wx, w2[1] << 16, 2);
5611 w2[2] = amd_bytealign (w2[2] >> 16, wx, 2);
5612 break;
5613 case 39: w2[1] = amd_bytealign (wx, w2[1] << 8, 1);
5614 w2[2] = amd_bytealign (w2[2] >> 24, wx, 1);
5615 break;
5616 case 40: w2[2] = wx;
5617 break;
5618 case 41: w2[2] = amd_bytealign (wx, w2[2] << 24, 3);
5619 w2[3] = amd_bytealign (w2[3] >> 8, wx, 3);
5620 break;
5621 case 42: w2[2] = amd_bytealign (wx, w2[2] << 16, 2);
5622 w2[3] = amd_bytealign (w2[3] >> 16, wx, 2);
5623 break;
5624 case 43: w2[2] = amd_bytealign (wx, w2[2] << 8, 1);
5625 w2[3] = amd_bytealign (w2[3] >> 24, wx, 1);
5626 break;
5627 case 44: w2[3] = wx;
5628 break;
5629 case 45: w2[3] = amd_bytealign (wx, w2[3] << 24, 3);
5630 w3[0] = amd_bytealign (w3[0] >> 8, wx, 3);
5631 break;
5632 case 46: w2[3] = amd_bytealign (wx, w2[3] << 16, 2);
5633 w3[0] = amd_bytealign (w3[0] >> 16, wx, 2);
5634 break;
5635 case 47: w2[3] = amd_bytealign (wx, w2[3] << 8, 1);
5636 w3[0] = amd_bytealign (w3[0] >> 24, wx, 1);
5637 break;
5638 case 48: w3[0] = wx;
5639 break;
5640 case 49: w3[0] = amd_bytealign (wx, w3[0] << 24, 3);
5641 w3[1] = amd_bytealign (w3[1] >> 8, wx, 3);
5642 break;
5643 case 50: w3[0] = amd_bytealign (wx, w3[0] << 16, 2);
5644 w3[1] = amd_bytealign (w3[1] >> 16, wx, 2);
5645 break;
5646 case 51: w3[0] = amd_bytealign (wx, w3[0] << 8, 1);
5647 w3[1] = amd_bytealign (w3[1] >> 24, wx, 1);
5648 break;
5649 case 52: w3[1] = wx;
5650 break;
5651 case 53: w3[1] = amd_bytealign (wx, w3[1] << 24, 3);
5652 w3[2] = amd_bytealign (w3[2] >> 8, wx, 3);
5653 break;
5654 case 54: w3[1] = amd_bytealign (wx, w3[1] << 16, 2);
5655 w3[2] = amd_bytealign (w3[2] >> 16, wx, 2);
5656 break;
5657 case 55: w3[1] = amd_bytealign (wx, w3[1] << 8, 1);
5658 w3[2] = amd_bytealign (w3[2] >> 24, wx, 1);
5659 break;
5660 case 56: w3[2] = wx;
5661 break;
5662 case 57: w3[2] = amd_bytealign (wx, w3[2] << 24, 3);
5663 w3[3] = amd_bytealign (w3[3] >> 8, wx, 3);
5664 break;
5665 case 58: w3[2] = amd_bytealign (wx, w3[2] << 16, 2);
5666 w3[3] = amd_bytealign (w3[3] >> 16, wx, 2);
5667 break;
5668 case 59: w3[2] = amd_bytealign (wx, w3[2] << 8, 1);
5669 w3[3] = amd_bytealign (w3[3] >> 24, wx, 1);
5670 break;
5671 case 60: w3[3] = wx;
5672 break;
5673 case 61: w3[3] = amd_bytealign (wx, w3[3] << 24, 3);
5674 //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
5675 break;
5676 case 62: w3[3] = amd_bytealign (wx, w3[3] << 16, 2);
5677 //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
5678 break;
5679 case 63: w3[3] = amd_bytealign (wx, w3[3] << 8, 1);
5680 //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
5681 break;
5682 }
5683 #else
5684 switch (salt_len)
5685 {
5686 case 0: w0[0] = wx;
5687 break;
5688 case 1: w0[0] = (w0[0] & 0x000000ff) | (wx << 8);
5689 w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5690 break;
5691 case 2: w0[0] = (w0[0] & 0x0000ffff) | (wx << 16);
5692 w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5693 break;
5694 case 3: w0[0] = (w0[0] & 0x00ffffff) | (wx << 24);
5695 w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5696 break;
5697 case 4: w0[1] = wx;
5698 break;
5699 case 5: w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5700 w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5701 break;
5702 case 6: w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5703 w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5704 break;
5705 case 7: w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5706 w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5707 break;
5708 case 8: w0[2] = wx;
5709 break;
5710 case 9: w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5711 w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5712 break;
5713 case 10: w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5714 w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5715 break;
5716 case 11: w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5717 w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5718 break;
5719 case 12: w0[3] = wx;
5720 break;
5721 case 13: w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5722 w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5723 break;
5724 case 14: w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5725 w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5726 break;
5727 case 15: w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5728 w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5729 break;
5730 case 16: w1[0] = wx;
5731 break;
5732 case 17: w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5733 w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5734 break;
5735 case 18: w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5736 w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5737 break;
5738 case 19: w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5739 w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5740 break;
5741 case 20: w1[1] = wx;
5742 break;
5743 case 21: w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5744 w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5745 break;
5746 case 22: w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5747 w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5748 break;
5749 case 23: w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5750 w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5751 break;
5752 case 24: w1[2] = wx;
5753 break;
5754 case 25: w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5755 w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5756 break;
5757 case 26: w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5758 w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5759 break;
5760 case 27: w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5761 w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5762 break;
5763 case 28: w1[3] = wx;
5764 break;
5765 case 29: w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5766 w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5767 break;
5768 case 30: w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5769 w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5770 break;
5771 case 31: w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5772 w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5773 break;
5774 case 32: w2[0] = wx;
5775 break;
5776 case 33: w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5777 w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5778 break;
5779 case 34: w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5780 w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5781 break;
5782 case 35: w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5783 w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5784 break;
5785 case 36: w2[1] = wx;
5786 break;
5787 case 37: w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5788 w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5789 break;
5790 case 38: w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5791 w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5792 break;
5793 case 39: w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5794 w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5795 break;
5796 case 40: w2[2] = wx;
5797 break;
5798 case 41: w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5799 w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
5800 break;
5801 case 42: w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5802 w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
5803 break;
5804 case 43: w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5805 w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5806 break;
5807 case 44: w2[3] = wx;
5808 break;
5809 case 45: w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5810 w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
5811 break;
5812 case 46: w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5813 w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
5814 break;
5815 case 47: w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5816 w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
5817 break;
5818 case 48: w3[0] = wx;
5819 break;
5820 case 49: w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
5821 w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
5822 break;
5823 case 50: w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
5824 w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
5825 break;
5826 case 51: w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
5827 w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
5828 break;
5829 case 52: w3[1] = wx;
5830 break;
5831 case 53: w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
5832 w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
5833 break;
5834 case 54: w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
5835 w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
5836 break;
5837 case 55: w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
5838 w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
5839 break;
5840 case 56: w3[2] = wx;
5841 break;
5842 case 57: w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
5843 w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
5844 break;
5845 case 58: w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
5846 w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
5847 break;
5848 case 59: w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
5849 w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
5850 break;
5851 case 60: w3[3] = wx;
5852 break;
5853 case 61: w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
5854 //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
5855 break;
5856 case 62: w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
5857 //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
5858 break;
5859 case 63: w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
5860 //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
5861 break;
5862 }
5863 #endif
5864 }
5865
5866 inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5867 {
5868 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5869
5870 switch (salt_len)
5871 {
5872 case 0: w0[0] = wx;
5873 break;
5874 case 1: w0[0] = (w0[0] & 0xff000000) | (wx >> 8);
5875 w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5876 break;
5877 case 2: w0[0] = (w0[0] & 0xffff0000) | (wx >> 16);
5878 w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5879 break;
5880 case 3: w0[0] = (w0[0] & 0xffffff00) | (wx >> 24);
5881 w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5882 break;
5883 case 4: w0[1] = wx;
5884 break;
5885 case 5: w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5886 w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5887 break;
5888 case 6: w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5889 w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5890 break;
5891 case 7: w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5892 w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5893 break;
5894 case 8: w0[2] = wx;
5895 break;
5896 case 9: w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5897 w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5898 break;
5899 case 10: w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5900 w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5901 break;
5902 case 11: w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5903 w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5904 break;
5905 case 12: w0[3] = wx;
5906 break;
5907 case 13: w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5908 w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5909 break;
5910 case 14: w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5911 w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5912 break;
5913 case 15: w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5914 w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5915 break;
5916 case 16: w1[0] = wx;
5917 break;
5918 case 17: w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5919 w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5920 break;
5921 case 18: w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5922 w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5923 break;
5924 case 19: w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5925 w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5926 break;
5927 case 20: w1[1] = wx;
5928 break;
5929 case 21: w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5930 w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5931 break;
5932 case 22: w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5933 w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5934 break;
5935 case 23: w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5936 w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5937 break;
5938 case 24: w1[2] = wx;
5939 break;
5940 case 25: w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5941 w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5942 break;
5943 case 26: w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5944 w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5945 break;
5946 case 27: w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5947 w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5948 break;
5949 case 28: w1[3] = wx;
5950 break;
5951 case 29: w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5952 w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5953 break;
5954 case 30: w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5955 w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5956 break;
5957 case 31: w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5958 w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5959 break;
5960 case 32: w2[0] = wx;
5961 break;
5962 case 33: w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5963 w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5964 break;
5965 case 34: w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5966 w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5967 break;
5968 case 35: w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5969 w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5970 break;
5971 case 36: w2[1] = wx;
5972 break;
5973 case 37: w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5974 w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5975 break;
5976 case 38: w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5977 w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5978 break;
5979 case 39: w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5980 w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5981 break;
5982 case 40: w2[2] = wx;
5983 break;
5984 case 41: w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5985 w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5986 break;
5987 case 42: w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5988 w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5989 break;
5990 case 43: w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5991 w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5992 break;
5993 case 44: w2[3] = wx;
5994 break;
5995 case 45: w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5996 w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
5997 break;
5998 case 46: w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
5999 w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
6000 break;
6001 case 47: w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
6002 w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
6003 break;
6004 case 48: w3[0] = wx;
6005 break;
6006 case 49: w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
6007 w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
6008 break;
6009 case 50: w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
6010 w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
6011 break;
6012 case 51: w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
6013 w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
6014 break;
6015 case 52: w3[1] = wx;
6016 break;
6017 case 53: w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
6018 w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
6019 break;
6020 case 54: w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
6021 w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
6022 break;
6023 case 55: w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
6024 w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
6025 break;
6026 case 56: w3[2] = wx;
6027 break;
6028 case 57: w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
6029 w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
6030 break;
6031 case 58: w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
6032 w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
6033 break;
6034 case 59: w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
6035 w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
6036 break;
6037 case 60: w3[3] = wx;
6038 break;
6039 case 61: w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
6040 //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
6041 break;
6042 case 62: w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
6043 //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
6044 break;
6045 case 63: w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
6046 //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
6047 break;
6048 }
6049 }
6050
6051 /**
6052 * vector functions as scalar (for outer loop usage)
6053 */
6054
6055 inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
6056 {
6057 switch (offset)
6058 {
6059 case 0:
6060 w0[0] = 0x01;
6061 break;
6062
6063 case 1:
6064 w0[0] = w0[0] | 0x0100;
6065 break;
6066
6067 case 2:
6068 w0[0] = w0[0] | 0x010000;
6069 break;
6070
6071 case 3:
6072 w0[0] = w0[0] | 0x01000000;
6073 break;
6074
6075 case 4:
6076 w0[1] = 0x01;
6077 break;
6078
6079 case 5:
6080 w0[1] = w0[1] | 0x0100;
6081 break;
6082
6083 case 6:
6084 w0[1] = w0[1] | 0x010000;
6085 break;
6086
6087 case 7:
6088 w0[1] = w0[1] | 0x01000000;
6089 break;
6090
6091 case 8:
6092 w0[2] = 0x01;
6093 break;
6094
6095 case 9:
6096 w0[2] = w0[2] | 0x0100;
6097 break;
6098
6099 case 10:
6100 w0[2] = w0[2] | 0x010000;
6101 break;
6102
6103 case 11:
6104 w0[2] = w0[2] | 0x01000000;
6105 break;
6106
6107 case 12:
6108 w0[3] = 0x01;
6109 break;
6110
6111 case 13:
6112 w0[3] = w0[3] | 0x0100;
6113 break;
6114
6115 case 14:
6116 w0[3] = w0[3] | 0x010000;
6117 break;
6118
6119 case 15:
6120 w0[3] = w0[3] | 0x01000000;
6121 break;
6122
6123 case 16:
6124 w1[0] = 0x01;
6125 break;
6126
6127 case 17:
6128 w1[0] = w1[0] | 0x0100;
6129 break;
6130
6131 case 18:
6132 w1[0] = w1[0] | 0x010000;
6133 break;
6134
6135 case 19:
6136 w1[0] = w1[0] | 0x01000000;
6137 break;
6138
6139 case 20:
6140 w1[1] = 0x01;
6141 break;
6142
6143 case 21:
6144 w1[1] = w1[1] | 0x0100;
6145 break;
6146
6147 case 22:
6148 w1[1] = w1[1] | 0x010000;
6149 break;
6150
6151 case 23:
6152 w1[1] = w1[1] | 0x01000000;
6153 break;
6154
6155 case 24:
6156 w1[2] = 0x01;
6157 break;
6158
6159 case 25:
6160 w1[2] = w1[2] | 0x0100;
6161 break;
6162
6163 case 26:
6164 w1[2] = w1[2] | 0x010000;
6165 break;
6166
6167 case 27:
6168 w1[2] = w1[2] | 0x01000000;
6169 break;
6170
6171 case 28:
6172 w1[3] = 0x01;
6173 break;
6174
6175 case 29:
6176 w1[3] = w1[3] | 0x0100;
6177 break;
6178
6179 case 30:
6180 w1[3] = w1[3] | 0x010000;
6181 break;
6182
6183 case 31:
6184 w1[3] = w1[3] | 0x01000000;
6185 break;
6186 }
6187 }
6188
/**
 * Store the byte 0x80 at byte position `offset` of the 16-byte buffer w0
 * (byte k of a word lives in bits 8k .. 8k+7).
 *
 * If the position is the first byte of a word, the whole word is replaced by
 * 0x80; otherwise the byte is OR-ed into the word's current value.
 * Offsets of 16 and above leave the array untouched.
 */
inline void append_0x80_1x4_S (u32 w0[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // 0x80 moved into the byte lane selected by the offset
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per destination word; lane 0 overwrites, lanes 1-3 merge
  switch (offset / 4)
  {
    case 0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case 1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case 2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case 3: w0[3] = lane ? (w0[3] | mark) : mark; break;
  }
}
6258
/**
 * Store the byte 0x80 at byte position `offset` of the 32-byte buffer formed
 * by w0 followed by w1 (byte k of a word lives in bits 8k .. 8k+7).
 *
 * If the position is the first byte of a word, the whole word is replaced by
 * 0x80; otherwise the byte is OR-ed into the word's current value.
 * Offsets of 32 and above leave both arrays untouched.
 */
inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // 0x80 moved into the byte lane selected by the offset
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per destination word; lane 0 overwrites, lanes 1-3 merge
  switch (offset / 4)
  {
    case 0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case 1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case 2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case 3: w0[3] = lane ? (w0[3] | mark) : mark; break;
    case 4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case 5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case 6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case 7: w1[3] = lane ? (w1[3] | mark) : mark; break;
  }
}
6392
/**
 * Store the byte 0x80 at byte position `offset` of the 48-byte buffer formed
 * by w0, w1 and w2 in that order (byte k of a word lives in bits 8k .. 8k+7).
 *
 * If the position is the first byte of a word, the whole word is replaced by
 * 0x80; otherwise the byte is OR-ed into the word's current value.
 * Offsets of 48 and above leave all arrays untouched.
 */
inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // 0x80 moved into the byte lane selected by the offset
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per destination word; lane 0 overwrites, lanes 1-3 merge
  switch (offset / 4)
  {
    case  0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case  1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case  2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case  3: w0[3] = lane ? (w0[3] | mark) : mark; break;
    case  4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case  5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case  6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case  7: w1[3] = lane ? (w1[3] | mark) : mark; break;
    case  8: w2[0] = lane ? (w2[0] | mark) : mark; break;
    case  9: w2[1] = lane ? (w2[1] | mark) : mark; break;
    case 10: w2[2] = lane ? (w2[2] | mark) : mark; break;
    case 11: w2[3] = lane ? (w2[3] | mark) : mark; break;
  }
}
6590
/**
 * Store the byte 0x80 at byte position `offset` of the 64-byte buffer formed
 * by w0, w1, w2 and w3 in that order (byte k of a word lives in bits
 * 8k .. 8k+7).
 *
 * If the position is the first byte of a word, the whole word is replaced by
 * 0x80; otherwise the byte is OR-ed into the word's current value.
 * Offsets of 64 and above leave all arrays untouched.
 */
inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // 0x80 moved into the byte lane selected by the offset
  const u32 mark = (u32) 0x80 << (lane * 8);

  // one case per destination word; lane 0 overwrites, lanes 1-3 merge
  switch (offset / 4)
  {
    case  0: w0[0] = lane ? (w0[0] | mark) : mark; break;
    case  1: w0[1] = lane ? (w0[1] | mark) : mark; break;
    case  2: w0[2] = lane ? (w0[2] | mark) : mark; break;
    case  3: w0[3] = lane ? (w0[3] | mark) : mark; break;
    case  4: w1[0] = lane ? (w1[0] | mark) : mark; break;
    case  5: w1[1] = lane ? (w1[1] | mark) : mark; break;
    case  6: w1[2] = lane ? (w1[2] | mark) : mark; break;
    case  7: w1[3] = lane ? (w1[3] | mark) : mark; break;
    case  8: w2[0] = lane ? (w2[0] | mark) : mark; break;
    case  9: w2[1] = lane ? (w2[1] | mark) : mark; break;
    case 10: w2[2] = lane ? (w2[2] | mark) : mark; break;
    case 11: w2[3] = lane ? (w2[3] | mark) : mark; break;
    case 12: w3[0] = lane ? (w3[0] | mark) : mark; break;
    case 13: w3[1] = lane ? (w3[1] | mark) : mark; break;
    case 14: w3[2] = lane ? (w3[2] | mark) : mark; break;
    case 15: w3[3] = lane ? (w3[3] | mark) : mark; break;
  }
}
6852
/**
 * Keep the first `len` bytes of the 16-byte block w and clear the rest
 * (byte k of a word lives in bits 8k .. 8k+7).
 *
 * The word containing byte `len` is masked down to its low (len % 4) bytes
 * and every following word is set to zero. A `len` of 16 or more leaves the
 * block unchanged.
 */
inline void truncate_block_S (u32 w[4], const u32 len)
{
  // mask covering the (len % 4) bytes that survive in the boundary word
  const u32 keep = ((u32) 1 << ((len & 3) * 8)) - 1;

  // one case per boundary word; all words after it are cleared
  switch (len / 4)
  {
    case 0:
      w[0] &= keep;
      w[1] = 0;
      w[2] = 0;
      w[3] = 0;
      break;

    case 1:
      w[1] &= keep;
      w[2] = 0;
      w[3] = 0;
      break;

    case 2:
      w[2] &= keep;
      w[3] = 0;
      break;

    case 3:
      w[3] &= keep;
      break;
  }
}
6915
/**
 * Widen 16 bytes to 16 two-byte units: every byte of `in` is placed in the
 * low byte of a 16-bit half-word whose high byte is zero. in[0] and in[1]
 * fill out1[0..3]; in[2] and in[3] fill out2[0..3].
 *
 * Outputs are written from out2[3] down to out1[0]; keep that order, since
 * it decides which values are read if `in` overlaps an output array.
 * With none of IS_NV / IS_AMD / IS_GENERIC defined the function does nothing.
 */
inline void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
{
  #if defined IS_NV
  // NOTE(review): __byte_perm_S is defined elsewhere; the selectors are
  // presumably the byte-permute equivalent of the portable path below - confirm
  out2[3] = __byte_perm_S (in[3], 0, 0x7372);
  out2[2] = __byte_perm_S (in[3], 0, 0x7170);
  out2[1] = __byte_perm_S (in[2], 0, 0x7372);
  out2[0] = __byte_perm_S (in[2], 0, 0x7170);
  out1[3] = __byte_perm_S (in[1], 0, 0x7372);
  out1[2] = __byte_perm_S (in[1], 0, 0x7170);
  out1[1] = __byte_perm_S (in[0], 0, 0x7372);
  out1[0] = __byte_perm_S (in[0], 0, 0x7170);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  // upper two bytes of a word go to the odd output, lower two to the even one
  out2[3] = ((in[3] & 0xff000000) >> 8) | ((in[3] & 0x00ff0000) >> 16);
  out2[2] = ((in[3] & 0x0000ff00) << 8) |  (in[3] & 0x000000ff);
  out2[1] = ((in[2] & 0xff000000) >> 8) | ((in[2] & 0x00ff0000) >> 16);
  out2[0] = ((in[2] & 0x0000ff00) << 8) |  (in[2] & 0x000000ff);
  out1[3] = ((in[1] & 0xff000000) >> 8) | ((in[1] & 0x00ff0000) >> 16);
  out1[2] = ((in[1] & 0x0000ff00) << 8) |  (in[1] & 0x000000ff);
  out1[1] = ((in[0] & 0xff000000) >> 8) | ((in[0] & 0x00ff0000) >> 16);
  out1[0] = ((in[0] & 0x0000ff00) << 8) |  (in[0] & 0x000000ff);
  #endif
}
6940
/**
 * Narrow 16 two-byte units back to 16 bytes: the low byte of every 16-bit
 * half-word in in1[0..3] and in2[0..3] is packed into out[0..3]; the high
 * byte of each half-word is discarded.
 *
 * out[0] and out[1] come from in1, out[2] and out[3] from in2. Outputs are
 * written from out[0] up to out[3]; keep that order in case of overlap.
 * With none of IS_NV / IS_AMD / IS_GENERIC defined the function does nothing.
 */
inline void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
{
  #if defined IS_NV
  // NOTE(review): __byte_perm_S is defined elsewhere; selector 0x6420 is
  // presumably the byte-permute equivalent of the portable path below - confirm
  out[0] = __byte_perm_S (in1[0], in1[1], 0x6420);
  out[1] = __byte_perm_S (in1[2], in1[3], 0x6420);
  out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
  out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  // bytes 0 and 2 of the even input fill output bytes 0 and 1,
  // bytes 0 and 2 of the odd input fill output bytes 2 and 3
  out[0] =  (in1[0]        & 0x000000ff) | ((in1[0] >>  8) & 0x0000ff00)
         | ((in1[1] << 16) & 0x00ff0000) | ((in1[1] <<  8) & 0xff000000);
  out[1] =  (in1[2]        & 0x000000ff) | ((in1[2] >>  8) & 0x0000ff00)
         | ((in1[3] << 16) & 0x00ff0000) | ((in1[3] <<  8) & 0xff000000);
  out[2] =  (in2[0]        & 0x000000ff) | ((in2[0] >>  8) & 0x0000ff00)
         | ((in2[1] << 16) & 0x00ff0000) | ((in2[1] <<  8) & 0xff000000);
  out[3] =  (in2[2]        & 0x000000ff) | ((in2[2] >>  8) & 0x0000ff00)
         | ((in2[3] << 16) & 0x00ff0000) | ((in2[3] <<  8) & 0xff000000);
  #endif
}
6961
/**
 * Move the contents of the 4x4-word buffer (w0, w1, w2, w3 in that order)
 * towards higher word positions, in place, by a distance derived from
 * `offset`.
 *
 * The switch selects the whole-word distance (offset / 4, cases 0..13): the
 * original w0[0] ends up in word number offset / 4 and all words below it
 * are set to 0. The sub-word part of the move is delegated to
 * amd_bytealign_S (IS_AMD / IS_GENERIC) or __byte_perm_S (IS_NV); both are
 * defined elsewhere and their exact byte semantics are not visible here.
 *
 * Facts visible in this block:
 * - w3[3] is never written; the IS_NV branch does not write w3[2] either.
 * - If offset / 4 is greater than 13 no case matches and nothing is changed.
 * - If none of IS_AMD, IS_GENERIC, IS_NV is defined the function is a no-op.
 *
 * NOTE(review): the "_le" suffix presumably means little-endian byte order
 * within each word - confirm against the callers.
 */
6962 inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
6963 {
6964 #if defined IS_AMD || defined IS_GENERIC
6965 const int offset_mod_4 = offset & 3;
6966
/* 4 - offset is computed in u32 arithmetic and then converted to int. */
/* NOTE(review): this only works if amd_bytealign_S looks at the low bits of */
/* its third argument alone - confirm against its definition. */
6967 const int offset_minus_4 = 4 - offset;
6968
/* One case per whole-word distance. Inside a case the assignments run from */
/* the highest word down, so each source word is read before it is overwritten. */
6969 switch (offset / 4)
6970 {
6971 case 0:
6972 w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4);
6973 w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4);
6974 w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
6975 w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
6976 w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
6977 w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
6978 w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
6979 w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
6980 w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
6981 w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
6982 w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
6983 w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
6984 w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
6985 w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
6986 w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
6987
/* Word-aligned offset: every word from the first populated one upward is */
/* moved one position down again (low to high) and w3[2] is cleared. The same */
/* fix-up follows in every case of this branch. */
/* NOTE(review): presumably compensates for amd_bytealign_S handing back its */
/* second operand unchanged when the byte shift is a multiple of 4 - confirm. */
6988 if (offset_mod_4 == 0)
6989 {
6990 w0[0] = w0[1];
6991 w0[1] = w0[2];
6992 w0[2] = w0[3];
6993 w0[3] = w1[0];
6994 w1[0] = w1[1];
6995 w1[1] = w1[2];
6996 w1[2] = w1[3];
6997 w1[3] = w2[0];
6998 w2[0] = w2[1];
6999 w2[1] = w2[2];
7000 w2[2] = w2[3];
7001 w2[3] = w3[0];
7002 w3[0] = w3[1];
7003 w3[1] = w3[2];
7004 w3[2] = 0;
7005 }
7006
7007 break;
7008
7009 case 1:
7010 w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4);
7011 w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7012 w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7013 w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7014 w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7015 w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7016 w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7017 w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7018 w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7019 w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7020 w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7021 w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7022 w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7023 w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7024 w0[0] = 0;
7025
7026 if (offset_mod_4 == 0)
7027 {
7028 w0[1] = w0[2];
7029 w0[2] = w0[3];
7030 w0[3] = w1[0];
7031 w1[0] = w1[1];
7032 w1[1] = w1[2];
7033 w1[2] = w1[3];
7034 w1[3] = w2[0];
7035 w2[0] = w2[1];
7036 w2[1] = w2[2];
7037 w2[2] = w2[3];
7038 w2[3] = w3[0];
7039 w3[0] = w3[1];
7040 w3[1] = w3[2];
7041 w3[2] = 0;
7042 }
7043
7044 break;
7045
7046 case 2:
7047 w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4);
7048 w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7049 w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7050 w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7051 w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7052 w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7053 w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7054 w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7055 w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7056 w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7057 w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7058 w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7059 w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7060 w0[1] = 0;
7061 w0[0] = 0;
7062
7063 if (offset_mod_4 == 0)
7064 {
7065 w0[2] = w0[3];
7066 w0[3] = w1[0];
7067 w1[0] = w1[1];
7068 w1[1] = w1[2];
7069 w1[2] = w1[3];
7070 w1[3] = w2[0];
7071 w2[0] = w2[1];
7072 w2[1] = w2[2];
7073 w2[2] = w2[3];
7074 w2[3] = w3[0];
7075 w3[0] = w3[1];
7076 w3[1] = w3[2];
7077 w3[2] = 0;
7078 }
7079
7080 break;
7081
7082 case 3:
7083 w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4);
7084 w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7085 w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7086 w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7087 w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7088 w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7089 w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7090 w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7091 w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7092 w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7093 w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7094 w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7095 w0[2] = 0;
7096 w0[1] = 0;
7097 w0[0] = 0;
7098
7099 if (offset_mod_4 == 0)
7100 {
7101 w0[3] = w1[0];
7102 w1[0] = w1[1];
7103 w1[1] = w1[2];
7104 w1[2] = w1[3];
7105 w1[3] = w2[0];
7106 w2[0] = w2[1];
7107 w2[1] = w2[2];
7108 w2[2] = w2[3];
7109 w2[3] = w3[0];
7110 w3[0] = w3[1];
7111 w3[1] = w3[2];
7112 w3[2] = 0;
7113 }
7114
7115 break;
7116
7117 case 4:
7118 w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4);
7119 w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7120 w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7121 w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7122 w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7123 w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7124 w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7125 w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7126 w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7127 w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7128 w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7129 w0[3] = 0;
7130 w0[2] = 0;
7131 w0[1] = 0;
7132 w0[0] = 0;
7133
7134 if (offset_mod_4 == 0)
7135 {
7136 w1[0] = w1[1];
7137 w1[1] = w1[2];
7138 w1[2] = w1[3];
7139 w1[3] = w2[0];
7140 w2[0] = w2[1];
7141 w2[1] = w2[2];
7142 w2[2] = w2[3];
7143 w2[3] = w3[0];
7144 w3[0] = w3[1];
7145 w3[1] = w3[2];
7146 w3[2] = 0;
7147 }
7148
7149 break;
7150
7151 case 5:
7152 w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4);
7153 w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7154 w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7155 w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7156 w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7157 w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7158 w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7159 w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7160 w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7161 w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7162 w1[0] = 0;
7163 w0[3] = 0;
7164 w0[2] = 0;
7165 w0[1] = 0;
7166 w0[0] = 0;
7167
7168 if (offset_mod_4 == 0)
7169 {
7170 w1[1] = w1[2];
7171 w1[2] = w1[3];
7172 w1[3] = w2[0];
7173 w2[0] = w2[1];
7174 w2[1] = w2[2];
7175 w2[2] = w2[3];
7176 w2[3] = w3[0];
7177 w3[0] = w3[1];
7178 w3[1] = w3[2];
7179 w3[2] = 0;
7180 }
7181
7182 break;
7183
7184 case 6:
7185 w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4);
7186 w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7187 w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7188 w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7189 w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7190 w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7191 w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7192 w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7193 w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7194 w1[1] = 0;
7195 w1[0] = 0;
7196 w0[3] = 0;
7197 w0[2] = 0;
7198 w0[1] = 0;
7199 w0[0] = 0;
7200
7201 if (offset_mod_4 == 0)
7202 {
7203 w1[2] = w1[3];
7204 w1[3] = w2[0];
7205 w2[0] = w2[1];
7206 w2[1] = w2[2];
7207 w2[2] = w2[3];
7208 w2[3] = w3[0];
7209 w3[0] = w3[1];
7210 w3[1] = w3[2];
7211 w3[2] = 0;
7212 }
7213
7214 break;
7215
7216 case 7:
7217 w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4);
7218 w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7219 w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7220 w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7221 w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7222 w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7223 w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7224 w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7225 w1[2] = 0;
7226 w1[1] = 0;
7227 w1[0] = 0;
7228 w0[3] = 0;
7229 w0[2] = 0;
7230 w0[1] = 0;
7231 w0[0] = 0;
7232
7233 if (offset_mod_4 == 0)
7234 {
7235 w1[3] = w2[0];
7236 w2[0] = w2[1];
7237 w2[1] = w2[2];
7238 w2[2] = w2[3];
7239 w2[3] = w3[0];
7240 w3[0] = w3[1];
7241 w3[1] = w3[2];
7242 w3[2] = 0;
7243 }
7244
7245 break;
7246
7247 case 8:
7248 w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4);
7249 w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7250 w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7251 w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7252 w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7253 w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7254 w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7255 w1[3] = 0;
7256 w1[2] = 0;
7257 w1[1] = 0;
7258 w1[0] = 0;
7259 w0[3] = 0;
7260 w0[2] = 0;
7261 w0[1] = 0;
7262 w0[0] = 0;
7263
7264 if (offset_mod_4 == 0)
7265 {
7266 w2[0] = w2[1];
7267 w2[1] = w2[2];
7268 w2[2] = w2[3];
7269 w2[3] = w3[0];
7270 w3[0] = w3[1];
7271 w3[1] = w3[2];
7272 w3[2] = 0;
7273 }
7274
7275 break;
7276
7277 case 9:
7278 w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4);
7279 w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7280 w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7281 w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7282 w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7283 w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7284 w2[0] = 0;
7285 w1[3] = 0;
7286 w1[2] = 0;
7287 w1[1] = 0;
7288 w1[0] = 0;
7289 w0[3] = 0;
7290 w0[2] = 0;
7291 w0[1] = 0;
7292 w0[0] = 0;
7293
7294 if (offset_mod_4 == 0)
7295 {
7296 w2[1] = w2[2];
7297 w2[2] = w2[3];
7298 w2[3] = w3[0];
7299 w3[0] = w3[1];
7300 w3[1] = w3[2];
7301 w3[2] = 0;
7302 }
7303
7304 break;
7305
7306 case 10:
7307 w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4);
7308 w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7309 w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7310 w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7311 w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7312 w2[1] = 0;
7313 w2[0] = 0;
7314 w1[3] = 0;
7315 w1[2] = 0;
7316 w1[1] = 0;
7317 w1[0] = 0;
7318 w0[3] = 0;
7319 w0[2] = 0;
7320 w0[1] = 0;
7321 w0[0] = 0;
7322
7323 if (offset_mod_4 == 0)
7324 {
7325 w2[2] = w2[3];
7326 w2[3] = w3[0];
7327 w3[0] = w3[1];
7328 w3[1] = w3[2];
7329 w3[2] = 0;
7330 }
7331
7332 break;
7333
7334 case 11:
7335 w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4);
7336 w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7337 w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7338 w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7339 w2[2] = 0;
7340 w2[1] = 0;
7341 w2[0] = 0;
7342 w1[3] = 0;
7343 w1[2] = 0;
7344 w1[1] = 0;
7345 w1[0] = 0;
7346 w0[3] = 0;
7347 w0[2] = 0;
7348 w0[1] = 0;
7349 w0[0] = 0;
7350
7351 if (offset_mod_4 == 0)
7352 {
7353 w2[3] = w3[0];
7354 w3[0] = w3[1];
7355 w3[1] = w3[2];
7356 w3[2] = 0;
7357 }
7358
7359 break;
7360
7361 case 12:
7362 w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4);
7363 w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7364 w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7365 w2[3] = 0;
7366 w2[2] = 0;
7367 w2[1] = 0;
7368 w2[0] = 0;
7369 w1[3] = 0;
7370 w1[2] = 0;
7371 w1[1] = 0;
7372 w1[0] = 0;
7373 w0[3] = 0;
7374 w0[2] = 0;
7375 w0[1] = 0;
7376 w0[0] = 0;
7377
7378 if (offset_mod_4 == 0)
7379 {
7380 w3[0] = w3[1];
7381 w3[1] = w3[2];
7382 w3[2] = 0;
7383 }
7384
7385 break;
7386
7387 case 13:
7388 w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4);
7389 w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7390 w3[0] = 0;
7391 w2[3] = 0;
7392 w2[2] = 0;
7393 w2[1] = 0;
7394 w2[0] = 0;
7395 w1[3] = 0;
7396 w1[2] = 0;
7397 w1[1] = 0;
7398 w1[0] = 0;
7399 w0[3] = 0;
7400 w0[2] = 0;
7401 w0[1] = 0;
7402 w0[0] = 0;
7403
7404 if (offset_mod_4 == 0)
7405 {
7406 w3[1] = w3[2];
7407 w3[2] = 0;
7408 }
7409
7410 break;
7411 }
7412 #endif
7413
7414 #ifdef IS_NV
/* Here offset_minus_4 is 4 - (offset % 4), i.e. always in the range 1..4. */
7415 const int offset_minus_4 = 4 - (offset % 4);
7416
/* Control word for __byte_perm_S: the constant 0x76543210 shifted right by */
/* offset_minus_4 nibbles, reduced to its low 16 bits. */
7417 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
7418
/* Same layout as the branch above: one case per whole-word distance, words */
/* assigned from high to low. The highest word written here is w3[1], and */
/* there is no fix-up step for word-aligned offsets. */
7419 switch (offset / 4)
7420 {
7421 case 0:
7422 w3[1] = __byte_perm_S (w3[0], w3[1], selector);
7423 w3[0] = __byte_perm_S (w2[3], w3[0], selector);
7424 w2[3] = __byte_perm_S (w2[2], w2[3], selector);
7425 w2[2] = __byte_perm_S (w2[1], w2[2], selector);
7426 w2[1] = __byte_perm_S (w2[0], w2[1], selector);
7427 w2[0] = __byte_perm_S (w1[3], w2[0], selector);
7428 w1[3] = __byte_perm_S (w1[2], w1[3], selector);
7429 w1[2] = __byte_perm_S (w1[1], w1[2], selector);
7430 w1[1] = __byte_perm_S (w1[0], w1[1], selector);
7431 w1[0] = __byte_perm_S (w0[3], w1[0], selector);
7432 w0[3] = __byte_perm_S (w0[2], w0[3], selector);
7433 w0[2] = __byte_perm_S (w0[1], w0[2], selector);
7434 w0[1] = __byte_perm_S (w0[0], w0[1], selector);
7435 w0[0] = __byte_perm_S ( 0, w0[0], selector);
7436
7437 break;
7438
7439 case 1:
7440 w3[1] = __byte_perm_S (w2[3], w3[0], selector);
7441 w3[0] = __byte_perm_S (w2[2], w2[3], selector);
7442 w2[3] = __byte_perm_S (w2[1], w2[2], selector);
7443 w2[2] = __byte_perm_S (w2[0], w2[1], selector);
7444 w2[1] = __byte_perm_S (w1[3], w2[0], selector);
7445 w2[0] = __byte_perm_S (w1[2], w1[3], selector);
7446 w1[3] = __byte_perm_S (w1[1], w1[2], selector);
7447 w1[2] = __byte_perm_S (w1[0], w1[1], selector);
7448 w1[1] = __byte_perm_S (w0[3], w1[0], selector);
7449 w1[0] = __byte_perm_S (w0[2], w0[3], selector);
7450 w0[3] = __byte_perm_S (w0[1], w0[2], selector);
7451 w0[2] = __byte_perm_S (w0[0], w0[1], selector);
7452 w0[1] = __byte_perm_S ( 0, w0[0], selector);
7453 w0[0] = 0;
7454
7455 break;
7456
7457 case 2:
7458 w3[1] = __byte_perm_S (w2[2], w2[3], selector);
7459 w3[0] = __byte_perm_S (w2[1], w2[2], selector);
7460 w2[3] = __byte_perm_S (w2[0], w2[1], selector);
7461 w2[2] = __byte_perm_S (w1[3], w2[0], selector);
7462 w2[1] = __byte_perm_S (w1[2], w1[3], selector);
7463 w2[0] = __byte_perm_S (w1[1], w1[2], selector);
7464 w1[3] = __byte_perm_S (w1[0], w1[1], selector);
7465 w1[2] = __byte_perm_S (w0[3], w1[0], selector);
7466 w1[1] = __byte_perm_S (w0[2], w0[3], selector);
7467 w1[0] = __byte_perm_S (w0[1], w0[2], selector);
7468 w0[3] = __byte_perm_S (w0[0], w0[1], selector);
7469 w0[2] = __byte_perm_S ( 0, w0[0], selector);
7470 w0[1] = 0;
7471 w0[0] = 0;
7472
7473 break;
7474
7475 case 3:
7476 w3[1] = __byte_perm_S (w2[1], w2[2], selector);
7477 w3[0] = __byte_perm_S (w2[0], w2[1], selector);
7478 w2[3] = __byte_perm_S (w1[3], w2[0], selector);
7479 w2[2] = __byte_perm_S (w1[2], w1[3], selector);
7480 w2[1] = __byte_perm_S (w1[1], w1[2], selector);
7481 w2[0] = __byte_perm_S (w1[0], w1[1], selector);
7482 w1[3] = __byte_perm_S (w0[3], w1[0], selector);
7483 w1[2] = __byte_perm_S (w0[2], w0[3], selector);
7484 w1[1] = __byte_perm_S (w0[1], w0[2], selector);
7485 w1[0] = __byte_perm_S (w0[0], w0[1], selector);
7486 w0[3] = __byte_perm_S ( 0, w0[0], selector);
7487 w0[2] = 0;
7488 w0[1] = 0;
7489 w0[0] = 0;
7490
7491 break;
7492
7493 case 4:
7494 w3[1] = __byte_perm_S (w2[0], w2[1], selector);
7495 w3[0] = __byte_perm_S (w1[3], w2[0], selector);
7496 w2[3] = __byte_perm_S (w1[2], w1[3], selector);
7497 w2[2] = __byte_perm_S (w1[1], w1[2], selector);
7498 w2[1] = __byte_perm_S (w1[0], w1[1], selector);
7499 w2[0] = __byte_perm_S (w0[3], w1[0], selector);
7500 w1[3] = __byte_perm_S (w0[2], w0[3], selector);
7501 w1[2] = __byte_perm_S (w0[1], w0[2], selector);
7502 w1[1] = __byte_perm_S (w0[0], w0[1], selector);
7503 w1[0] = __byte_perm_S ( 0, w0[0], selector);
7504 w0[3] = 0;
7505 w0[2] = 0;
7506 w0[1] = 0;
7507 w0[0] = 0;
7508
7509 break;
7510
7511 case 5:
7512 w3[1] = __byte_perm_S (w1[3], w2[0], selector);
7513 w3[0] = __byte_perm_S (w1[2], w1[3], selector);
7514 w2[3] = __byte_perm_S (w1[1], w1[2], selector);
7515 w2[2] = __byte_perm_S (w1[0], w1[1], selector);
7516 w2[1] = __byte_perm_S (w0[3], w1[0], selector);
7517 w2[0] = __byte_perm_S (w0[2], w0[3], selector);
7518 w1[3] = __byte_perm_S (w0[1], w0[2], selector);
7519 w1[2] = __byte_perm_S (w0[0], w0[1], selector);
7520 w1[1] = __byte_perm_S ( 0, w0[0], selector);
7521 w1[0] = 0;
7522 w0[3] = 0;
7523 w0[2] = 0;
7524 w0[1] = 0;
7525 w0[0] = 0;
7526
7527 break;
7528
7529 case 6:
7530 w3[1] = __byte_perm_S (w1[2], w1[3], selector);
7531 w3[0] = __byte_perm_S (w1[1], w1[2], selector);
7532 w2[3] = __byte_perm_S (w1[0], w1[1], selector);
7533 w2[2] = __byte_perm_S (w0[3], w1[0], selector);
7534 w2[1] = __byte_perm_S (w0[2], w0[3], selector);
7535 w2[0] = __byte_perm_S (w0[1], w0[2], selector);
7536 w1[3] = __byte_perm_S (w0[0], w0[1], selector);
7537 w1[2] = __byte_perm_S ( 0, w0[0], selector);
7538 w1[1] = 0;
7539 w1[0] = 0;
7540 w0[3] = 0;
7541 w0[2] = 0;
7542 w0[1] = 0;
7543 w0[0] = 0;
7544
7545 break;
7546
7547 case 7:
7548 w3[1] = __byte_perm_S (w1[1], w1[2], selector);
7549 w3[0] = __byte_perm_S (w1[0], w1[1], selector);
7550 w2[3] = __byte_perm_S (w0[3], w1[0], selector);
7551 w2[2] = __byte_perm_S (w0[2], w0[3], selector);
7552 w2[1] = __byte_perm_S (w0[1], w0[2], selector);
7553 w2[0] = __byte_perm_S (w0[0], w0[1], selector);
7554 w1[3] = __byte_perm_S ( 0, w0[0], selector);
7555 w1[2] = 0;
7556 w1[1] = 0;
7557 w1[0] = 0;
7558 w0[3] = 0;
7559 w0[2] = 0;
7560 w0[1] = 0;
7561 w0[0] = 0;
7562
7563 break;
7564
7565 case 8:
7566 w3[1] = __byte_perm_S (w1[0], w1[1], selector);
7567 w3[0] = __byte_perm_S (w0[3], w1[0], selector);
7568 w2[3] = __byte_perm_S (w0[2], w0[3], selector);
7569 w2[2] = __byte_perm_S (w0[1], w0[2], selector);
7570 w2[1] = __byte_perm_S (w0[0], w0[1], selector);
7571 w2[0] = __byte_perm_S ( 0, w0[0], selector);
7572 w1[3] = 0;
7573 w1[2] = 0;
7574 w1[1] = 0;
7575 w1[0] = 0;
7576 w0[3] = 0;
7577 w0[2] = 0;
7578 w0[1] = 0;
7579 w0[0] = 0;
7580
7581 break;
7582
7583 case 9:
7584 w3[1] = __byte_perm_S (w0[3], w1[0], selector);
7585 w3[0] = __byte_perm_S (w0[2], w0[3], selector);
7586 w2[3] = __byte_perm_S (w0[1], w0[2], selector);
7587 w2[2] = __byte_perm_S (w0[0], w0[1], selector);
7588 w2[1] = __byte_perm_S ( 0, w0[0], selector);
7589 w2[0] = 0;
7590 w1[3] = 0;
7591 w1[2] = 0;
7592 w1[1] = 0;
7593 w1[0] = 0;
7594 w0[3] = 0;
7595 w0[2] = 0;
7596 w0[1] = 0;
7597 w0[0] = 0;
7598
7599 break;
7600
7601 case 10:
7602 w3[1] = __byte_perm_S (w0[2], w0[3], selector);
7603 w3[0] = __byte_perm_S (w0[1], w0[2], selector);
7604 w2[3] = __byte_perm_S (w0[0], w0[1], selector);
7605 w2[2] = __byte_perm_S ( 0, w0[0], selector);
7606 w2[1] = 0;
7607 w2[0] = 0;
7608 w1[3] = 0;
7609 w1[2] = 0;
7610 w1[1] = 0;
7611 w1[0] = 0;
7612 w0[3] = 0;
7613 w0[2] = 0;
7614 w0[1] = 0;
7615 w0[0] = 0;
7616
7617 break;
7618
7619 case 11:
7620 w3[1] = __byte_perm_S (w0[1], w0[2], selector);
7621 w3[0] = __byte_perm_S (w0[0], w0[1], selector);
7622 w2[3] = __byte_perm_S ( 0, w0[0], selector);
7623 w2[2] = 0;
7624 w2[1] = 0;
7625 w2[0] = 0;
7626 w1[3] = 0;
7627 w1[2] = 0;
7628 w1[1] = 0;
7629 w1[0] = 0;
7630 w0[3] = 0;
7631 w0[2] = 0;
7632 w0[1] = 0;
7633 w0[0] = 0;
7634
7635 break;
7636
7637 case 12:
7638 w3[1] = __byte_perm_S (w0[0], w0[1], selector);
7639 w3[0] = __byte_perm_S ( 0, w0[0], selector);
7640 w2[3] = 0;
7641 w2[2] = 0;
7642 w2[1] = 0;
7643 w2[0] = 0;
7644 w1[3] = 0;
7645 w1[2] = 0;
7646 w1[1] = 0;
7647 w1[0] = 0;
7648 w0[3] = 0;
7649 w0[2] = 0;
7650 w0[1] = 0;
7651 w0[0] = 0;
7652
7653 break;
7654
7655 case 13:
7656 w3[1] = __byte_perm_S ( 0, w0[0], selector);
7657 w3[0] = 0;
7658 w2[3] = 0;
7659 w2[2] = 0;
7660 w2[1] = 0;
7661 w2[0] = 0;
7662 w1[3] = 0;
7663 w1[2] = 0;
7664 w1[1] = 0;
7665 w1[0] = 0;
7666 w0[3] = 0;
7667 w0[2] = 0;
7668 w0[1] = 0;
7669 w0[0] = 0;
7670
7671 break;
7672 }
7673 #endif
7674 }
7675
7676 inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
7677 {
7678 #if defined IS_AMD || defined IS_GENERIC
7679 switch (offset / 4)
7680 {
7681 case 0:
7682 w3[2] = amd_bytealign_S (w3[1], 0, offset);
7683 w3[1] = amd_bytealign_S (w3[0], w3[1], offset);
7684 w3[0] = amd_bytealign_S (w2[3], w3[0], offset);
7685 w2[3] = amd_bytealign_S (w2[2], w2[3], offset);
7686 w2[2] = amd_bytealign_S (w2[1], w2[2], offset);
7687 w2[1] = amd_bytealign_S (w2[0], w2[1], offset);
7688 w2[0] = amd_bytealign_S (w1[3], w2[0], offset);
7689 w1[3] = amd_bytealign_S (w1[2], w1[3], offset);
7690 w1[2] = amd_bytealign_S (w1[1], w1[2], offset);
7691 w1[1] = amd_bytealign_S (w1[0], w1[1], offset);
7692 w1[0] = amd_bytealign_S (w0[3], w1[0], offset);
7693 w0[3] = amd_bytealign_S (w0[2], w0[3], offset);
7694 w0[2] = amd_bytealign_S (w0[1], w0[2], offset);
7695 w0[1] = amd_bytealign_S (w0[0], w0[1], offset);
7696 w0[0] = amd_bytealign_S ( 0, w0[0], offset);
7697 break;
7698
7699 case 1:
7700 w3[2] = amd_bytealign_S (w3[0], 0, offset);
7701 w3[1] = amd_bytealign_S (w2[3], w3[0], offset);
7702 w3[0] = amd_bytealign_S (w2[2], w2[3], offset);
7703 w2[3] = amd_bytealign_S (w2[1], w2[2], offset);
7704 w2[2] = amd_bytealign_S (w2[0], w2[1], offset);
7705 w2[1] = amd_bytealign_S (w1[3], w2[0], offset);
7706 w2[0] = amd_bytealign_S (w1[2], w1[3], offset);
7707 w1[3] = amd_bytealign_S (w1[1], w1[2], offset);
7708 w1[2] = amd_bytealign_S (w1[0], w1[1], offset);
7709 w1[1] = amd_bytealign_S (w0[3], w1[0], offset);
7710 w1[0] = amd_bytealign_S (w0[2], w0[3], offset);
7711 w0[3] = amd_bytealign_S (w0[1], w0[2], offset);
7712 w0[2] = amd_bytealign_S (w0[0], w0[1], offset);
7713 w0[1] = amd_bytealign_S ( 0, w0[0], offset);
7714 w0[0] = 0;
7715 break;
7716
7717 case 2:
7718 w3[2] = amd_bytealign_S (w2[3], 0, offset);
7719 w3[1] = amd_bytealign_S (w2[2], w2[3], offset);
7720 w3[0] = amd_bytealign_S (w2[1], w2[2], offset);
7721 w2[3] = amd_bytealign_S (w2[0], w2[1], offset);
7722 w2[2] = amd_bytealign_S (w1[3], w2[0], offset);
7723 w2[1] = amd_bytealign_S (w1[2], w1[3], offset);
7724 w2[0] = amd_bytealign_S (w1[1], w1[2], offset);
7725 w1[3] = amd_bytealign_S (w1[0], w1[1], offset);
7726 w1[2] = amd_bytealign_S (w0[3], w1[0], offset);
7727 w1[1] = amd_bytealign_S (w0[2], w0[3], offset);
7728 w1[0] = amd_bytealign_S (w0[1], w0[2], offset);
7729 w0[3] = amd_bytealign_S (w0[0], w0[1], offset);
7730 w0[2] = amd_bytealign_S ( 0, w0[0], offset);
7731 w0[1] = 0;
7732 w0[0] = 0;
7733 break;
7734
7735 case 3:
7736 w3[2] = amd_bytealign_S (w2[2], 0, offset);
7737 w3[1] = amd_bytealign_S (w2[1], w2[2], offset);
7738 w3[0] = amd_bytealign_S (w2[0], w2[1], offset);
7739 w2[3] = amd_bytealign_S (w1[3], w2[0], offset);
7740 w2[2] = amd_bytealign_S (w1[2], w1[3], offset);
7741 w2[1] = amd_bytealign_S (w1[1], w1[2], offset);
7742 w2[0] = amd_bytealign_S (w1[0], w1[1], offset);
7743 w1[3] = amd_bytealign_S (w0[3], w1[0], offset);
7744 w1[2] = amd_bytealign_S (w0[2], w0[3], offset);
7745 w1[1] = amd_bytealign_S (w0[1], w0[2], offset);
7746 w1[0] = amd_bytealign_S (w0[0], w0[1], offset);
7747 w0[3] = amd_bytealign_S ( 0, w0[0], offset);
7748 w0[2] = 0;
7749 w0[1] = 0;
7750 w0[0] = 0;
7751 break;
7752
7753 case 4:
7754 w3[2] = amd_bytealign_S (w2[1], 0, offset);
7755 w3[1] = amd_bytealign_S (w2[0], w2[1], offset);
7756 w3[0] = amd_bytealign_S (w1[3], w2[0], offset);
7757 w2[3] = amd_bytealign_S (w1[2], w1[3], offset);
7758 w2[2] = amd_bytealign_S (w1[1], w1[2], offset);
7759 w2[1] = amd_bytealign_S (w1[0], w1[1], offset);
7760 w2[0] = amd_bytealign_S (w0[3], w1[0], offset);
7761 w1[3] = amd_bytealign_S (w0[2], w0[3], offset);
7762 w1[2] = amd_bytealign_S (w0[1], w0[2], offset);
7763 w1[1] = amd_bytealign_S (w0[0], w0[1], offset);
7764 w1[0] = amd_bytealign_S ( 0, w0[0], offset);
7765 w0[3] = 0;
7766 w0[2] = 0;
7767 w0[1] = 0;
7768 w0[0] = 0;
7769 break;
7770
7771 case 5:
7772 w3[2] = amd_bytealign_S (w2[0], 0, offset);
7773 w3[1] = amd_bytealign_S (w1[3], w2[0], offset);
7774 w3[0] = amd_bytealign_S (w1[2], w1[3], offset);
7775 w2[3] = amd_bytealign_S (w1[1], w1[2], offset);
7776 w2[2] = amd_bytealign_S (w1[0], w1[1], offset);
7777 w2[1] = amd_bytealign_S (w0[3], w1[0], offset);
7778 w2[0] = amd_bytealign_S (w0[2], w0[3], offset);
7779 w1[3] = amd_bytealign_S (w0[1], w0[2], offset);
7780 w1[2] = amd_bytealign_S (w0[0], w0[1], offset);
7781 w1[1] = amd_bytealign_S ( 0, w0[0], offset);
7782 w1[0] = 0;
7783 w0[3] = 0;
7784 w0[2] = 0;
7785 w0[1] = 0;
7786 w0[0] = 0;
7787 break;
7788
7789 case 6:
7790 w3[2] = amd_bytealign_S (w1[3], 0, offset);
7791 w3[1] = amd_bytealign_S (w1[2], w1[3], offset);
7792 w3[0] = amd_bytealign_S (w1[1], w1[2], offset);
7793 w2[3] = amd_bytealign_S (w1[0], w1[1], offset);
7794 w2[2] = amd_bytealign_S (w0[3], w1[0], offset);
7795 w2[1] = amd_bytealign_S (w0[2], w0[3], offset);
7796 w2[0] = amd_bytealign_S (w0[1], w0[2], offset);
7797 w1[3] = amd_bytealign_S (w0[0], w0[1], offset);
7798 w1[2] = amd_bytealign_S ( 0, w0[0], offset);
7799 w1[1] = 0;
7800 w1[0] = 0;
7801 w0[3] = 0;
7802 w0[2] = 0;
7803 w0[1] = 0;
7804 w0[0] = 0;
7805 break;
7806
7807 case 7:
7808 w3[2] = amd_bytealign_S (w1[2], 0, offset);
7809 w3[1] = amd_bytealign_S (w1[1], w1[2], offset);
7810 w3[0] = amd_bytealign_S (w1[0], w1[1], offset);
7811 w2[3] = amd_bytealign_S (w0[3], w1[0], offset);
7812 w2[2] = amd_bytealign_S (w0[2], w0[3], offset);
7813 w2[1] = amd_bytealign_S (w0[1], w0[2], offset);
7814 w2[0] = amd_bytealign_S (w0[0], w0[1], offset);
7815 w1[3] = amd_bytealign_S ( 0, w0[0], offset);
7816 w1[2] = 0;
7817 w1[1] = 0;
7818 w1[0] = 0;
7819 w0[3] = 0;
7820 w0[2] = 0;
7821 w0[1] = 0;
7822 w0[0] = 0;
7823 break;
7824
7825 case 8:
7826 w3[2] = amd_bytealign_S (w1[1], 0, offset);
7827 w3[1] = amd_bytealign_S (w1[0], w1[1], offset);
7828 w3[0] = amd_bytealign_S (w0[3], w1[0], offset);
7829 w2[3] = amd_bytealign_S (w0[2], w0[3], offset);
7830 w2[2] = amd_bytealign_S (w0[1], w0[2], offset);
7831 w2[1] = amd_bytealign_S (w0[0], w0[1], offset);
7832 w2[0] = amd_bytealign_S ( 0, w0[0], offset);
7833 w1[3] = 0;
7834 w1[2] = 0;
7835 w1[1] = 0;
7836 w1[0] = 0;
7837 w0[3] = 0;
7838 w0[2] = 0;
7839 w0[1] = 0;
7840 w0[0] = 0;
7841 break;
7842
7843 case 9:
7844 w3[2] = amd_bytealign_S (w1[0], 0, offset);
7845 w3[1] = amd_bytealign_S (w0[3], w1[0], offset);
7846 w3[0] = amd_bytealign_S (w0[2], w0[3], offset);
7847 w2[3] = amd_bytealign_S (w0[1], w0[2], offset);
7848 w2[2] = amd_bytealign_S (w0[0], w0[1], offset);
7849 w2[1] = amd_bytealign_S ( 0, w0[0], offset);
7850 w2[0] = 0;
7851 w1[3] = 0;
7852 w1[2] = 0;
7853 w1[1] = 0;
7854 w1[0] = 0;
7855 w0[3] = 0;
7856 w0[2] = 0;
7857 w0[1] = 0;
7858 w0[0] = 0;
7859 break;
7860
7861 case 10:
7862 w3[2] = amd_bytealign_S (w0[3], 0, offset);
7863 w3[1] = amd_bytealign_S (w0[2], w0[3], offset);
7864 w3[0] = amd_bytealign_S (w0[1], w0[2], offset);
7865 w2[3] = amd_bytealign_S (w0[0], w0[1], offset);
7866 w2[2] = amd_bytealign_S ( 0, w0[0], offset);
7867 w2[1] = 0;
7868 w2[0] = 0;
7869 w1[3] = 0;
7870 w1[2] = 0;
7871 w1[1] = 0;
7872 w1[0] = 0;
7873 w0[3] = 0;
7874 w0[2] = 0;
7875 w0[1] = 0;
7876 w0[0] = 0;
7877 break;
7878
7879 case 11:
7880 w3[2] = amd_bytealign_S (w0[2], 0, offset);
7881 w3[1] = amd_bytealign_S (w0[1], w0[2], offset);
7882 w3[0] = amd_bytealign_S (w0[0], w0[1], offset);
7883 w2[3] = amd_bytealign_S ( 0, w0[0], offset);
7884 w2[2] = 0;
7885 w2[1] = 0;
7886 w2[0] = 0;
7887 w1[3] = 0;
7888 w1[2] = 0;
7889 w1[1] = 0;
7890 w1[0] = 0;
7891 w0[3] = 0;
7892 w0[2] = 0;
7893 w0[1] = 0;
7894 w0[0] = 0;
7895 break;
7896
7897 case 12:
7898 w3[2] = amd_bytealign_S (w0[1], 0, offset);
7899 w3[1] = amd_bytealign_S (w0[0], w0[1], offset);
7900 w3[0] = amd_bytealign_S ( 0, w0[0], offset);
7901 w2[3] = 0;
7902 w2[2] = 0;
7903 w2[1] = 0;
7904 w2[0] = 0;
7905 w1[3] = 0;
7906 w1[2] = 0;
7907 w1[1] = 0;
7908 w1[0] = 0;
7909 w0[3] = 0;
7910 w0[2] = 0;
7911 w0[1] = 0;
7912 w0[0] = 0;
7913 break;
7914
7915 case 13:
7916 w3[2] = amd_bytealign_S (w0[0], 0, offset);
7917 w3[1] = amd_bytealign_S ( 0, w0[0], offset);
7918 w3[0] = 0;
7919 w2[3] = 0;
7920 w2[2] = 0;
7921 w2[1] = 0;
7922 w2[0] = 0;
7923 w1[3] = 0;
7924 w1[2] = 0;
7925 w1[1] = 0;
7926 w1[0] = 0;
7927 w0[3] = 0;
7928 w0[2] = 0;
7929 w0[1] = 0;
7930 w0[0] = 0;
7931 break;
7932 }
7933 #endif
7934
7935 #ifdef IS_NV
7936 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
7937
7938 switch (offset / 4)
7939 {
7940 case 0:
7941 w3[1] = __byte_perm_S (w3[1], w3[0], selector);
7942 w3[0] = __byte_perm_S (w3[0], w2[3], selector);
7943 w2[3] = __byte_perm_S (w2[3], w2[2], selector);
7944 w2[2] = __byte_perm_S (w2[2], w2[1], selector);
7945 w2[1] = __byte_perm_S (w2[1], w2[0], selector);
7946 w2[0] = __byte_perm_S (w2[0], w1[3], selector);
7947 w1[3] = __byte_perm_S (w1[3], w1[2], selector);
7948 w1[2] = __byte_perm_S (w1[2], w1[1], selector);
7949 w1[1] = __byte_perm_S (w1[1], w1[0], selector);
7950 w1[0] = __byte_perm_S (w1[0], w0[3], selector);
7951 w0[3] = __byte_perm_S (w0[3], w0[2], selector);
7952 w0[2] = __byte_perm_S (w0[2], w0[1], selector);
7953 w0[1] = __byte_perm_S (w0[1], w0[0], selector);
7954 w0[0] = __byte_perm_S (w0[0], 0, selector);
7955 break;
7956
7957 case 1:
7958 w3[1] = __byte_perm_S (w3[0], w2[3], selector);
7959 w3[0] = __byte_perm_S (w2[3], w2[2], selector);
7960 w2[3] = __byte_perm_S (w2[2], w2[1], selector);
7961 w2[2] = __byte_perm_S (w2[1], w2[0], selector);
7962 w2[1] = __byte_perm_S (w2[0], w1[3], selector);
7963 w2[0] = __byte_perm_S (w1[3], w1[2], selector);
7964 w1[3] = __byte_perm_S (w1[2], w1[1], selector);
7965 w1[2] = __byte_perm_S (w1[1], w1[0], selector);
7966 w1[1] = __byte_perm_S (w1[0], w0[3], selector);
7967 w1[0] = __byte_perm_S (w0[3], w0[2], selector);
7968 w0[3] = __byte_perm_S (w0[2], w0[1], selector);
7969 w0[2] = __byte_perm_S (w0[1], w0[0], selector);
7970 w0[1] = __byte_perm_S (w0[0], 0, selector);
7971 w0[0] = 0;
7972 break;
7973
7974 case 2:
7975 w3[1] = __byte_perm_S (w2[3], w2[2], selector);
7976 w3[0] = __byte_perm_S (w2[2], w2[1], selector);
7977 w2[3] = __byte_perm_S (w2[1], w2[0], selector);
7978 w2[2] = __byte_perm_S (w2[0], w1[3], selector);
7979 w2[1] = __byte_perm_S (w1[3], w1[2], selector);
7980 w2[0] = __byte_perm_S (w1[2], w1[1], selector);
7981 w1[3] = __byte_perm_S (w1[1], w1[0], selector);
7982 w1[2] = __byte_perm_S (w1[0], w0[3], selector);
7983 w1[1] = __byte_perm_S (w0[3], w0[2], selector);
7984 w1[0] = __byte_perm_S (w0[2], w0[1], selector);
7985 w0[3] = __byte_perm_S (w0[1], w0[0], selector);
7986 w0[2] = __byte_perm_S (w0[0], 0, selector);
7987 w0[1] = 0;
7988 w0[0] = 0;
7989 break;
7990
7991 case 3:
7992 w3[1] = __byte_perm_S (w2[2], w2[1], selector);
7993 w3[0] = __byte_perm_S (w2[1], w2[0], selector);
7994 w2[3] = __byte_perm_S (w2[0], w1[3], selector);
7995 w2[2] = __byte_perm_S (w1[3], w1[2], selector);
7996 w2[1] = __byte_perm_S (w1[2], w1[1], selector);
7997 w2[0] = __byte_perm_S (w1[1], w1[0], selector);
7998 w1[3] = __byte_perm_S (w1[0], w0[3], selector);
7999 w1[2] = __byte_perm_S (w0[3], w0[2], selector);
8000 w1[1] = __byte_perm_S (w0[2], w0[1], selector);
8001 w1[0] = __byte_perm_S (w0[1], w0[0], selector);
8002 w0[3] = __byte_perm_S (w0[0], 0, selector);
8003 w0[2] = 0;
8004 w0[1] = 0;
8005 w0[0] = 0;
8006 break;
8007
8008 case 4:
8009 w3[1] = __byte_perm_S (w2[1], w2[0], selector);
8010 w3[0] = __byte_perm_S (w2[0], w1[3], selector);
8011 w2[3] = __byte_perm_S (w1[3], w1[2], selector);
8012 w2[2] = __byte_perm_S (w1[2], w1[1], selector);
8013 w2[1] = __byte_perm_S (w1[1], w1[0], selector);
8014 w2[0] = __byte_perm_S (w1[0], w0[3], selector);
8015 w1[3] = __byte_perm_S (w0[3], w0[2], selector);
8016 w1[2] = __byte_perm_S (w0[2], w0[1], selector);
8017 w1[1] = __byte_perm_S (w0[1], w0[0], selector);
8018 w1[0] = __byte_perm_S (w0[0], 0, selector);
8019 w0[3] = 0;
8020 w0[2] = 0;
8021 w0[1] = 0;
8022 w0[0] = 0;
8023 break;
8024
8025 case 5:
8026 w3[1] = __byte_perm_S (w2[0], w1[3], selector);
8027 w3[0] = __byte_perm_S (w1[3], w1[2], selector);
8028 w2[3] = __byte_perm_S (w1[2], w1[1], selector);
8029 w2[2] = __byte_perm_S (w1[1], w1[0], selector);
8030 w2[1] = __byte_perm_S (w1[0], w0[3], selector);
8031 w2[0] = __byte_perm_S (w0[3], w0[2], selector);
8032 w1[3] = __byte_perm_S (w0[2], w0[1], selector);
8033 w1[2] = __byte_perm_S (w0[1], w0[0], selector);
8034 w1[1] = __byte_perm_S (w0[0], 0, selector);
8035 w1[0] = 0;
8036 w0[3] = 0;
8037 w0[2] = 0;
8038 w0[1] = 0;
8039 w0[0] = 0;
8040 break;
8041
8042 case 6:
8043 w3[1] = __byte_perm_S (w1[3], w1[2], selector);
8044 w3[0] = __byte_perm_S (w1[2], w1[1], selector);
8045 w2[3] = __byte_perm_S (w1[1], w1[0], selector);
8046 w2[2] = __byte_perm_S (w1[0], w0[3], selector);
8047 w2[1] = __byte_perm_S (w0[3], w0[2], selector);
8048 w2[0] = __byte_perm_S (w0[2], w0[1], selector);
8049 w1[3] = __byte_perm_S (w0[1], w0[0], selector);
8050 w1[2] = __byte_perm_S (w0[0], 0, selector);
8051 w1[1] = 0;
8052 w1[0] = 0;
8053 w0[3] = 0;
8054 w0[2] = 0;
8055 w0[1] = 0;
8056 w0[0] = 0;
8057 break;
8058
8059 case 7:
8060 w3[1] = __byte_perm_S (w1[2], w1[1], selector);
8061 w3[0] = __byte_perm_S (w1[1], w1[0], selector);
8062 w2[3] = __byte_perm_S (w1[0], w0[3], selector);
8063 w2[2] = __byte_perm_S (w0[3], w0[2], selector);
8064 w2[1] = __byte_perm_S (w0[2], w0[1], selector);
8065 w2[0] = __byte_perm_S (w0[1], w0[0], selector);
8066 w1[3] = __byte_perm_S (w0[0], 0, selector);
8067 w1[2] = 0;
8068 w1[1] = 0;
8069 w1[0] = 0;
8070 w0[3] = 0;
8071 w0[2] = 0;
8072 w0[1] = 0;
8073 w0[0] = 0;
8074 break;
8075
8076 case 8:
8077 w3[1] = __byte_perm_S (w1[1], w1[0], selector);
8078 w3[0] = __byte_perm_S (w1[0], w0[3], selector);
8079 w2[3] = __byte_perm_S (w0[3], w0[2], selector);
8080 w2[2] = __byte_perm_S (w0[2], w0[1], selector);
8081 w2[1] = __byte_perm_S (w0[1], w0[0], selector);
8082 w2[0] = __byte_perm_S (w0[0], 0, selector);
8083 w1[3] = 0;
8084 w1[2] = 0;
8085 w1[1] = 0;
8086 w1[0] = 0;
8087 w0[3] = 0;
8088 w0[2] = 0;
8089 w0[1] = 0;
8090 w0[0] = 0;
8091 break;
8092
8093 case 9:
8094 w3[1] = __byte_perm_S (w1[0], w0[3], selector);
8095 w3[0] = __byte_perm_S (w0[3], w0[2], selector);
8096 w2[3] = __byte_perm_S (w0[2], w0[1], selector);
8097 w2[2] = __byte_perm_S (w0[1], w0[0], selector);
8098 w2[1] = __byte_perm_S (w0[0], 0, selector);
8099 w2[0] = 0;
8100 w1[3] = 0;
8101 w1[2] = 0;
8102 w1[1] = 0;
8103 w1[0] = 0;
8104 w0[3] = 0;
8105 w0[2] = 0;
8106 w0[1] = 0;
8107 w0[0] = 0;
8108 break;
8109
8110 case 10:
8111 w3[1] = __byte_perm_S (w0[3], w0[2], selector);
8112 w3[0] = __byte_perm_S (w0[2], w0[1], selector);
8113 w2[3] = __byte_perm_S (w0[1], w0[0], selector);
8114 w2[2] = __byte_perm_S (w0[0], 0, selector);
8115 w2[1] = 0;
8116 w2[0] = 0;
8117 w1[3] = 0;
8118 w1[2] = 0;
8119 w1[1] = 0;
8120 w1[0] = 0;
8121 w0[3] = 0;
8122 w0[2] = 0;
8123 w0[1] = 0;
8124 w0[0] = 0;
8125 break;
8126
8127 case 11:
8128 w3[1] = __byte_perm_S (w0[2], w0[1], selector);
8129 w3[0] = __byte_perm_S (w0[1], w0[0], selector);
8130 w2[3] = __byte_perm_S (w0[0], 0, selector);
8131 w2[2] = 0;
8132 w2[1] = 0;
8133 w2[0] = 0;
8134 w1[3] = 0;
8135 w1[2] = 0;
8136 w1[1] = 0;
8137 w1[0] = 0;
8138 w0[3] = 0;
8139 w0[2] = 0;
8140 w0[1] = 0;
8141 w0[0] = 0;
8142 break;
8143
8144 case 12:
8145 w3[1] = __byte_perm_S (w0[1], w0[0], selector);
8146 w3[0] = __byte_perm_S (w0[0], 0, selector);
8147 w2[3] = 0;
8148 w2[2] = 0;
8149 w2[1] = 0;
8150 w2[0] = 0;
8151 w1[3] = 0;
8152 w1[2] = 0;
8153 w1[1] = 0;
8154 w1[0] = 0;
8155 w0[3] = 0;
8156 w0[2] = 0;
8157 w0[1] = 0;
8158 w0[0] = 0;
8159 break;
8160
8161 case 13:
8162 w3[1] = __byte_perm_S (w0[0], 0, selector);
8163 w3[0] = 0;
8164 w2[3] = 0;
8165 w2[2] = 0;
8166 w2[1] = 0;
8167 w2[0] = 0;
8168 w1[3] = 0;
8169 w1[2] = 0;
8170 w1[1] = 0;
8171 w1[0] = 0;
8172 w0[3] = 0;
8173 w0[2] = 0;
8174 w0[1] = 0;
8175 w0[0] = 0;
8176 break;
8177 }
8178 #endif
8179 }
8180
8181 /**
8182 * vector functions on scalar types (for inner loop usage)
8183 */
8184
/*
 * Lane pack/unpack helpers.
 *
 *   PACKVS*  (vector -> scalar): sn[i] = vn[i].s<e>
 *   PACKSV*  (scalar -> vector): vn[i].s<e> = sn[i]
 *
 * `e` is the component suffix pasted after `.s` (the callers in this file pass
 * 0-9 and a-f). The 2/4 variants move that many array elements; the 24/44
 * variants apply the 4-element form to two / four array pairs.
 *
 * Every macro expands to exactly one statement (do { } while (0)), so it is safe
 * as the body of an unbraced if/else. The invocation must be followed by `;`.
 */

#define PACKVS2(sn,vn,e)                      \
  do                                          \
  {                                           \
    (sn)[0] = (vn)[0].s##e;                   \
    (sn)[1] = (vn)[1].s##e;                   \
  } while (0)

#define PACKSV2(sn,vn,e)                      \
  do                                          \
  {                                           \
    (vn)[0].s##e = (sn)[0];                   \
    (vn)[1].s##e = (sn)[1];                   \
  } while (0)

#define PACKVS4(sn,vn,e)                      \
  do                                          \
  {                                           \
    (sn)[0] = (vn)[0].s##e;                   \
    (sn)[1] = (vn)[1].s##e;                   \
    (sn)[2] = (vn)[2].s##e;                   \
    (sn)[3] = (vn)[3].s##e;                   \
  } while (0)

#define PACKSV4(sn,vn,e)                      \
  do                                          \
  {                                           \
    (vn)[0].s##e = (sn)[0];                   \
    (vn)[1].s##e = (sn)[1];                   \
    (vn)[2].s##e = (sn)[2];                   \
    (vn)[3].s##e = (sn)[3];                   \
  } while (0)

#define PACKVS24(s0,s1,v0,v1,e)               \
  do                                          \
  {                                           \
    PACKVS4 (s0, v0, e);                      \
    PACKVS4 (s1, v1, e);                      \
  } while (0)

#define PACKSV24(s0,s1,v0,v1,e)               \
  do                                          \
  {                                           \
    PACKSV4 (s0, v0, e);                      \
    PACKSV4 (s1, v1, e);                      \
  } while (0)

#define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e)   \
  do                                          \
  {                                           \
    PACKVS4 (s0, v0, e);                      \
    PACKVS4 (s1, v1, e);                      \
    PACKVS4 (s2, v2, e);                      \
    PACKVS4 (s3, v3, e);                      \
  } while (0)

#define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e)   \
  do                                          \
  {                                           \
    PACKSV4 (s0, v0, e);                      \
    PACKSV4 (s1, v1, e);                      \
    PACKSV4 (s2, v2, e);                      \
    PACKSV4 (s3, v3, e);                      \
  } while (0)
8224
8225 inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
8226 {
8227 #if VECT_SIZE == 1
8228
8229 switch_buffer_by_offset_le_S (w0, w1, w2, w3, offset);
8230
8231 #else
8232
8233 u32 t0[4];
8234 u32 t1[4];
8235 u32 t2[4];
8236 u32 t3[4];
8237
8238 #endif
8239
8240 #if VECT_SIZE == 2
8241
8242 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8243 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8244
8245 #elif VECT_SIZE == 4
8246
8247 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8248 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8249 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8250 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8251
8252 #elif VECT_SIZE == 8
8253
8254 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8255 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8256 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8257 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8258 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
8259 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
8260 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
8261 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
8262
8263 #elif VECT_SIZE == 16
8264
8265 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8266 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8267 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8268 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8269 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
8270 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
8271 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
8272 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
8273 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
8274 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
8275 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
8276 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
8277 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
8278 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
8279 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
8280 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
8281
8282 #endif
8283 }
8284
8285 inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
8286 {
8287 #if VECT_SIZE == 1
8288
8289 append_0x01_2x4_S (w0, w1, offset);
8290
8291 #else
8292
8293 u32 t0[4];
8294 u32 t1[4];
8295
8296 #endif
8297
8298 #if VECT_SIZE == 2
8299
8300 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8301 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8302
8303 #elif VECT_SIZE == 4
8304
8305 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8306 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8307 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8308 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8309
8310 #elif VECT_SIZE == 8
8311
8312 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8313 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8314 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8315 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8316 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
8317 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
8318 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
8319 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
8320
8321 #elif VECT_SIZE == 16
8322
8323 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8324 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8325 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8326 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8327 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
8328 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
8329 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
8330 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
8331 PACKVS24 (t0, t1, w0, w1, 8); append_0x01_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
8332 PACKVS24 (t0, t1, w0, w1, 9); append_0x01_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
8333 PACKVS24 (t0, t1, w0, w1, a); append_0x01_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
8334 PACKVS24 (t0, t1, w0, w1, b); append_0x01_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
8335 PACKVS24 (t0, t1, w0, w1, c); append_0x01_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
8336 PACKVS24 (t0, t1, w0, w1, d); append_0x01_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
8337 PACKVS24 (t0, t1, w0, w1, e); append_0x01_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
8338 PACKVS24 (t0, t1, w0, w1, f); append_0x01_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
8339
8340 #endif
8341 }
8342
8343 inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
8344 {
8345 #if VECT_SIZE == 1
8346
8347 append_0x80_2x4_S (w0, w1, offset);
8348
8349 #else
8350
8351 u32 t0[4];
8352 u32 t1[4];
8353
8354 #endif
8355
8356 #if VECT_SIZE == 2
8357
8358 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8359 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8360
8361 #elif VECT_SIZE == 4
8362
8363 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8364 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8365 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8366 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8367
8368 #elif VECT_SIZE == 8
8369
8370 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8371 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8372 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8373 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8374 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
8375 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
8376 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
8377 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
8378
8379 #elif VECT_SIZE == 16
8380
8381 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
8382 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
8383 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
8384 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
8385 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
8386 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
8387 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
8388 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
8389 PACKVS24 (t0, t1, w0, w1, 8); append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
8390 PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
8391 PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
8392 PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
8393 PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
8394 PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
8395 PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
8396 PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
8397
8398 #endif
8399 }
8400
8401 inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
8402 {
8403 #if VECT_SIZE == 1
8404
8405 append_0x80_4x4_S (w0, w1, w2, w3, offset);
8406
8407 #else
8408
8409 u32 t0[4];
8410 u32 t1[4];
8411 u32 t2[4];
8412 u32 t3[4];
8413
8414 #endif
8415
8416 #if VECT_SIZE == 2
8417
8418 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8419 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8420
8421 #elif VECT_SIZE == 4
8422
8423 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8424 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8425 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8426 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8427
8428 #elif VECT_SIZE == 8
8429
8430 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8431 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8432 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8433 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8434 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
8435 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
8436 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
8437 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
8438
8439 #elif VECT_SIZE == 16
8440
8441 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
8442 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
8443 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
8444 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
8445 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
8446 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
8447 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
8448 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
8449 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x80_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
8450 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x80_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
8451 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x80_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
8452 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x80_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
8453 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x80_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
8454 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x80_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
8455 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x80_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
8456 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x80_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
8457
8458 #endif
8459 }