Added SIMD code for all generic PBKDF2-HMAC-* modes
[hashcat.git] / OpenCL / common.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
6 /**
7 * pure scalar functions
8 */
9
/**
 * Three-way comparison of a computed digest against a stored one.
 *
 * The words are examined from d1[3] down to d1[0]; the word of d2 that each
 * one is compared with is selected through the DGST_R3 .. DGST_R0 macros.
 *
 * Returns 1 when the first differing word of d1 is larger, -1 when it is
 * smaller, and 0 when all four compared words are equal.
 */
inline int hash_comp (const u32 d1[4], __global u32 *d2)
{
  if (d1[3] != d2[DGST_R3]) return ((d1[3] > d2[DGST_R3]) ? 1 : -1);
  if (d1[2] != d2[DGST_R2]) return ((d1[2] > d2[DGST_R2]) ? 1 : -1);
  if (d1[1] != d2[DGST_R1]) return ((d1[1] > d2[DGST_R1]) ? 1 : -1);
  if (d1[0] != d2[DGST_R0]) return ((d1[0] > d2[DGST_R0]) ? 1 : -1);

  return (0);
}
23
/**
 * Binary search for `digest` among the first `digests_cnt` entries of
 * `digests_buf`, using hash_comp () as the ordering.
 *
 * NOTE(review): presumably digests_buf is sorted in hash_comp () order by
 * the host; the search is only meaningful under that assumption.
 *
 * Returns the index of the matching entry, or -1 when none compares equal.
 */
inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global digest_t *digests_buf)
{
  u32 base = 0;
  u32 span = digests_cnt;

  while (span != 0)
  {
    const u32 half  = span >> 1;
    const u32 probe = base + half;

    const int order = hash_comp (digest, digests_buf[probe].digest_buf);

    if (order == 0) return (probe);

    if (order > 0)
    {
      // continue in the upper part, excluding the probed element itself
      base = probe + 1;

      span--;
    }

    span >>= 1;
  }

  return (-1);
}
46
/**
 * Test a single bit of a bitmap for the given digest word.
 *
 * The bitmap word is selected by (digest >> bitmap_shift) & bitmap_mask and
 * the bit inside that word by the low five bits of digest.
 *
 * Returns the masked bitmap word: zero when the bit is clear, non-zero
 * (the bit itself) when it is set.
 */
inline u32 check_bitmap (__global u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
{
  const u32 slot = (digest >> bitmap_shift) & bitmap_mask;

  // unsigned literal: bit 31 must not be produced by shifting a signed int
  const u32 bit = 1u << (digest & 0x1f);

  return (bitmap[slot] & bit);
}
51
/**
 * Probe all eight bitmaps with the four digest words.
 *
 * digest[0] .. digest[3] are checked against bitmap_s1_a .. bitmap_s1_d using
 * bitmap_shift1, then against bitmap_s2_a .. bitmap_s2_d using bitmap_shift2.
 * The declared extent of `digest` is 4 because all four words are read.
 *
 * Returns 0 as soon as any bitmap lookup yields zero, 1 when every lookup
 * hits.
 */
inline u32 check (const u32 digest[4], __global u32 *bitmap_s1_a, __global u32 *bitmap_s1_b, __global u32 *bitmap_s1_c, __global u32 *bitmap_s1_d, __global u32 *bitmap_s2_a, __global u32 *bitmap_s2_b, __global u32 *bitmap_s2_c, __global u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
{
  // first bitmap set
  if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
  if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
  if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0);
  if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0);

  // second bitmap set, same words with the other shift
  if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0);
  if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0);
  if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0);
  if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0);

  return (1);
}
66
/**
 * Record a hit for the hash at index hash_pos.
 *
 * Sets hashes_shown[hash_pos] to 1, then stores gid and il_pos in the
 * gidvid and il_pos fields of plains_buf[hash_pos].
 */
inline void mark_hash (__global plain_t *plains_buf, __global u32 *hashes_shown, const int hash_pos, const u32 gid, const u32 il_pos)
{
  hashes_shown[hash_pos] = 1;

  __global plain_t *slot = &plains_buf[hash_pos];

  slot->gidvid = gid;
  slot->il_pos = il_pos;
}
74
75 /**
76 * vector functions
77 */
78
/**
 * Zero every byte of the 16-byte block w[0..3] from byte position len on.
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. The word containing
 * position len (w[len / 4]) keeps only its low (len % 4) bytes, every word
 * after it is cleared and every word before it is left untouched.
 * A len of 16 or more leaves the block unchanged.
 */
inline void truncate_block (u32x w[4], const u32 len)
{
  // 0, 0xFF, 0xFFFF or 0xFFFFFF depending on how many bytes of the word survive
  const u32 keep = (1u << ((len & 3) * 8)) - 1;

  switch (len >> 2)
  {
    case 0:
      w[0] &= keep;
      w[1]  = 0;
      w[2]  = 0;
      w[3]  = 0;
      break;

    case 1:
      w[1] &= keep;
      w[2]  = 0;
      w[3]  = 0;
      break;

    case 2:
      w[2] &= keep;
      w[3]  = 0;
      break;

    case 3:
      w[3] &= keep;
      break;
  }
}
141
/**
 * Spread the 16 bytes of in[0..3] over out1[0..3] and out2[0..3] so that every
 * input byte occupies the low byte of its own 16-bit half-word, with the
 * high byte of each half-word zero.
 *
 * in[0] feeds out1[0..1], in[1] feeds out1[2..3], in[2] feeds out2[0..1] and
 * in[3] feeds out2[2..3]. The outputs are written from out2[3] down to
 * out1[0]; that order is kept on purpose, since callers passing overlapping
 * arrays may depend on it.
 */
inline void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
{
  #ifdef IS_NV
  // selector 0x7372 picks bytes 2 and 3, selector 0x7170 picks bytes 0 and 1
  out2[3] = __byte_perm (in[3], 0, 0x7372);
  out2[2] = __byte_perm (in[3], 0, 0x7170);
  out2[1] = __byte_perm (in[2], 0, 0x7372);
  out2[0] = __byte_perm (in[2], 0, 0x7170);
  out1[3] = __byte_perm (in[1], 0, 0x7372);
  out1[2] = __byte_perm (in[1], 0, 0x7170);
  out1[1] = __byte_perm (in[0], 0, 0x7372);
  out1[0] = __byte_perm (in[0], 0, 0x7170);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  // odd outputs take the upper two bytes, even outputs the lower two bytes
  out2[3] = ((in[3] & 0xff000000) >> 8) | ((in[3] & 0x00ff0000) >> 16);
  out2[2] = ((in[3] & 0x0000ff00) << 8) |  (in[3] & 0x000000ff);
  out2[1] = ((in[2] & 0xff000000) >> 8) | ((in[2] & 0x00ff0000) >> 16);
  out2[0] = ((in[2] & 0x0000ff00) << 8) |  (in[2] & 0x000000ff);
  out1[3] = ((in[1] & 0xff000000) >> 8) | ((in[1] & 0x00ff0000) >> 16);
  out1[2] = ((in[1] & 0x0000ff00) << 8) |  (in[1] & 0x000000ff);
  out1[1] = ((in[0] & 0xff000000) >> 8) | ((in[0] & 0x00ff0000) >> 16);
  out1[0] = ((in[0] & 0x0000ff00) << 8) |  (in[0] & 0x000000ff);
  #endif
}
166
/**
 * Inverse packing of make_unicode (): gather the low byte of every 16-bit
 * half-word of in1[0..3] and in2[0..3] into the 16 bytes of out[0..3].
 *
 * Each output word is built from two consecutive input words; bytes 1 and 3
 * of the inputs are discarded. Outputs are written from out[0] up to out[3].
 */
inline void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
{
  #ifdef IS_NV
  // selector 0x6420 picks bytes 0 and 2 of each of the two operands
  out[0] = __byte_perm (in1[0], in1[1], 0x6420);
  out[1] = __byte_perm (in1[2], in1[3], 0x6420);
  out[2] = __byte_perm (in2[0], in2[1], 0x6420);
  out[3] = __byte_perm (in2[2], in2[3], 0x6420);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  out[0] = ( in1[0]        & 0x000000ff) | ((in1[0] >>  8) & 0x0000ff00)
         | ((in1[1] << 16) & 0x00ff0000) | ((in1[1] <<  8) & 0xff000000);
  out[1] = ( in1[2]        & 0x000000ff) | ((in1[2] >>  8) & 0x0000ff00)
         | ((in1[3] << 16) & 0x00ff0000) | ((in1[3] <<  8) & 0xff000000);
  out[2] = ( in2[0]        & 0x000000ff) | ((in2[0] >>  8) & 0x0000ff00)
         | ((in2[1] << 16) & 0x00ff0000) | ((in2[1] <<  8) & 0xff000000);
  out[3] = ( in2[2]        & 0x000000ff) | ((in2[2] >>  8) & 0x0000ff00)
         | ((in2[3] << 16) & 0x00ff0000) | ((in2[3] <<  8) & 0xff000000);
  #endif
}
187
/**
 * Put the byte 0x01 at byte position offset of the 16-byte block w0[0..3].
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x01; otherwise the byte is
 * OR-ed into the existing word. Offsets of 16 and above do nothing.
 */
inline void append_0x01_1x4 (u32x w0[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x01 << (lane * 8);

  switch (offset >> 2)
  {
    case 0: w0[0] = (w0[0] & keep) | mark; break;
    case 1: w0[1] = (w0[1] & keep) | mark; break;
    case 2: w0[2] = (w0[2] & keep) | mark; break;
    case 3: w0[3] = (w0[3] & keep) | mark; break;
  }
}
257
/**
 * Put the byte 0x01 at byte position offset of the 32-byte buffer formed by
 * w0[0..3] followed by w1[0..3].
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x01; otherwise the byte is
 * OR-ed into the existing word. Offsets of 32 and above do nothing.
 */
inline void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x01 << (lane * 8);

  switch (offset >> 2)
  {
    case 0: w0[0] = (w0[0] & keep) | mark; break;
    case 1: w0[1] = (w0[1] & keep) | mark; break;
    case 2: w0[2] = (w0[2] & keep) | mark; break;
    case 3: w0[3] = (w0[3] & keep) | mark; break;

    case 4: w1[0] = (w1[0] & keep) | mark; break;
    case 5: w1[1] = (w1[1] & keep) | mark; break;
    case 6: w1[2] = (w1[2] & keep) | mark; break;
    case 7: w1[3] = (w1[3] & keep) | mark; break;
  }
}
391
/**
 * Put the byte 0x01 at byte position offset of the 48-byte buffer formed by
 * w0[0..3], w1[0..3] and w2[0..3], in that order.
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x01; otherwise the byte is
 * OR-ed into the existing word. Offsets of 48 and above do nothing.
 */
inline void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x01 << (lane * 8);

  switch (offset >> 2)
  {
    case  0: w0[0] = (w0[0] & keep) | mark; break;
    case  1: w0[1] = (w0[1] & keep) | mark; break;
    case  2: w0[2] = (w0[2] & keep) | mark; break;
    case  3: w0[3] = (w0[3] & keep) | mark; break;

    case  4: w1[0] = (w1[0] & keep) | mark; break;
    case  5: w1[1] = (w1[1] & keep) | mark; break;
    case  6: w1[2] = (w1[2] & keep) | mark; break;
    case  7: w1[3] = (w1[3] & keep) | mark; break;

    case  8: w2[0] = (w2[0] & keep) | mark; break;
    case  9: w2[1] = (w2[1] & keep) | mark; break;
    case 10: w2[2] = (w2[2] & keep) | mark; break;
    case 11: w2[3] = (w2[3] & keep) | mark; break;
  }
}
589
/**
 * Put the byte 0x01 at byte position offset of the 64-byte buffer formed by
 * w0[0..3], w1[0..3], w2[0..3] and w3[0..3], in that order.
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x01; otherwise the byte is
 * OR-ed into the existing word. Offsets of 64 and above do nothing.
 */
inline void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x01 << (lane * 8);

  switch (offset >> 2)
  {
    case  0: w0[0] = (w0[0] & keep) | mark; break;
    case  1: w0[1] = (w0[1] & keep) | mark; break;
    case  2: w0[2] = (w0[2] & keep) | mark; break;
    case  3: w0[3] = (w0[3] & keep) | mark; break;

    case  4: w1[0] = (w1[0] & keep) | mark; break;
    case  5: w1[1] = (w1[1] & keep) | mark; break;
    case  6: w1[2] = (w1[2] & keep) | mark; break;
    case  7: w1[3] = (w1[3] & keep) | mark; break;

    case  8: w2[0] = (w2[0] & keep) | mark; break;
    case  9: w2[1] = (w2[1] & keep) | mark; break;
    case 10: w2[2] = (w2[2] & keep) | mark; break;
    case 11: w2[3] = (w2[3] & keep) | mark; break;

    case 12: w3[0] = (w3[0] & keep) | mark; break;
    case 13: w3[1] = (w3[1] & keep) | mark; break;
    case 14: w3[2] = (w3[2] & keep) | mark; break;
    case 15: w3[3] = (w3[3] & keep) | mark; break;
  }
}
851
/**
 * Put the byte 0x01 at byte position offset of the 128-byte buffer formed by
 * w0[0..3] through w7[0..3], in that order.
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x01; otherwise the byte is
 * OR-ed into the existing word. Offsets of 128 and above do nothing.
 */
inline void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x01 << (lane * 8);

  switch (offset >> 2)
  {
    case  0: w0[0] = (w0[0] & keep) | mark; break;
    case  1: w0[1] = (w0[1] & keep) | mark; break;
    case  2: w0[2] = (w0[2] & keep) | mark; break;
    case  3: w0[3] = (w0[3] & keep) | mark; break;

    case  4: w1[0] = (w1[0] & keep) | mark; break;
    case  5: w1[1] = (w1[1] & keep) | mark; break;
    case  6: w1[2] = (w1[2] & keep) | mark; break;
    case  7: w1[3] = (w1[3] & keep) | mark; break;

    case  8: w2[0] = (w2[0] & keep) | mark; break;
    case  9: w2[1] = (w2[1] & keep) | mark; break;
    case 10: w2[2] = (w2[2] & keep) | mark; break;
    case 11: w2[3] = (w2[3] & keep) | mark; break;

    case 12: w3[0] = (w3[0] & keep) | mark; break;
    case 13: w3[1] = (w3[1] & keep) | mark; break;
    case 14: w3[2] = (w3[2] & keep) | mark; break;
    case 15: w3[3] = (w3[3] & keep) | mark; break;

    case 16: w4[0] = (w4[0] & keep) | mark; break;
    case 17: w4[1] = (w4[1] & keep) | mark; break;
    case 18: w4[2] = (w4[2] & keep) | mark; break;
    case 19: w4[3] = (w4[3] & keep) | mark; break;

    case 20: w5[0] = (w5[0] & keep) | mark; break;
    case 21: w5[1] = (w5[1] & keep) | mark; break;
    case 22: w5[2] = (w5[2] & keep) | mark; break;
    case 23: w5[3] = (w5[3] & keep) | mark; break;

    case 24: w6[0] = (w6[0] & keep) | mark; break;
    case 25: w6[1] = (w6[1] & keep) | mark; break;
    case 26: w6[2] = (w6[2] & keep) | mark; break;
    case 27: w6[3] = (w6[3] & keep) | mark; break;

    case 28: w7[0] = (w7[0] & keep) | mark; break;
    case 29: w7[1] = (w7[1] & keep) | mark; break;
    case 30: w7[2] = (w7[2] & keep) | mark; break;
    case 31: w7[3] = (w7[3] & keep) | mark; break;
  }
}
1369
/**
 * Put the byte 0x02 at byte position offset of the 16-byte block w0[0..3].
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x02; otherwise the byte is
 * OR-ed into the existing word. Offsets of 16 and above do nothing.
 */
inline void append_0x02_1x4 (u32x w0[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x02 << (lane * 8);

  switch (offset >> 2)
  {
    case 0: w0[0] = (w0[0] & keep) | mark; break;
    case 1: w0[1] = (w0[1] & keep) | mark; break;
    case 2: w0[2] = (w0[2] & keep) | mark; break;
    case 3: w0[3] = (w0[3] & keep) | mark; break;
  }
}
1439
/**
 * Put the byte 0x02 at byte position offset of the 32-byte buffer formed by
 * w0[0..3] followed by w1[0..3].
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x02; otherwise the byte is
 * OR-ed into the existing word. Offsets of 32 and above do nothing.
 */
inline void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x02 << (lane * 8);

  switch (offset >> 2)
  {
    case 0: w0[0] = (w0[0] & keep) | mark; break;
    case 1: w0[1] = (w0[1] & keep) | mark; break;
    case 2: w0[2] = (w0[2] & keep) | mark; break;
    case 3: w0[3] = (w0[3] & keep) | mark; break;

    case 4: w1[0] = (w1[0] & keep) | mark; break;
    case 5: w1[1] = (w1[1] & keep) | mark; break;
    case 6: w1[2] = (w1[2] & keep) | mark; break;
    case 7: w1[3] = (w1[3] & keep) | mark; break;
  }
}
1573
/**
 * Put the byte 0x02 at byte position offset of the 48-byte buffer formed by
 * w0[0..3], w1[0..3] and w2[0..3], in that order.
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x02; otherwise the byte is
 * OR-ed into the existing word. Offsets of 48 and above do nothing.
 */
inline void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x02 << (lane * 8);

  switch (offset >> 2)
  {
    case  0: w0[0] = (w0[0] & keep) | mark; break;
    case  1: w0[1] = (w0[1] & keep) | mark; break;
    case  2: w0[2] = (w0[2] & keep) | mark; break;
    case  3: w0[3] = (w0[3] & keep) | mark; break;

    case  4: w1[0] = (w1[0] & keep) | mark; break;
    case  5: w1[1] = (w1[1] & keep) | mark; break;
    case  6: w1[2] = (w1[2] & keep) | mark; break;
    case  7: w1[3] = (w1[3] & keep) | mark; break;

    case  8: w2[0] = (w2[0] & keep) | mark; break;
    case  9: w2[1] = (w2[1] & keep) | mark; break;
    case 10: w2[2] = (w2[2] & keep) | mark; break;
    case 11: w2[3] = (w2[3] & keep) | mark; break;
  }
}
1771
/**
 * Put the byte 0x02 at byte position offset of the 64-byte buffer formed by
 * w0[0..3], w1[0..3], w2[0..3] and w3[0..3], in that order.
 *
 * Byte k of a word is the one at bits 8*k .. 8*k+7. When offset is a multiple
 * of four the target word is overwritten with 0x02; otherwise the byte is
 * OR-ed into the existing word. Offsets of 64 and above do nothing.
 */
inline void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
  const u32 lane = offset & 3;

  // aligned offsets replace the whole word, all others preserve it
  const u32 keep = (lane == 0) ? 0 : 0xffffffff;
  const u32 mark = 0x02 << (lane * 8);

  switch (offset >> 2)
  {
    case  0: w0[0] = (w0[0] & keep) | mark; break;
    case  1: w0[1] = (w0[1] & keep) | mark; break;
    case  2: w0[2] = (w0[2] & keep) | mark; break;
    case  3: w0[3] = (w0[3] & keep) | mark; break;

    case  4: w1[0] = (w1[0] & keep) | mark; break;
    case  5: w1[1] = (w1[1] & keep) | mark; break;
    case  6: w1[2] = (w1[2] & keep) | mark; break;
    case  7: w1[3] = (w1[3] & keep) | mark; break;

    case  8: w2[0] = (w2[0] & keep) | mark; break;
    case  9: w2[1] = (w2[1] & keep) | mark; break;
    case 10: w2[2] = (w2[2] & keep) | mark; break;
    case 11: w2[3] = (w2[3] & keep) | mark; break;

    case 12: w3[0] = (w3[0] & keep) | mark; break;
    case 13: w3[1] = (w3[1] & keep) | mark; break;
    case 14: w3[2] = (w3[2] & keep) | mark; break;
    case 15: w3[3] = (w3[3] & keep) | mark; break;
  }
}
2033
2034 inline void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
2035 {
2036 switch (offset)
2037 {
2038 case 0:
2039 w0[0] = 0x02;
2040 break;
2041
2042 case 1:
2043 w0[0] = w0[0] | 0x0200;
2044 break;
2045
2046 case 2:
2047 w0[0] = w0[0] | 0x020000;
2048 break;
2049
2050 case 3:
2051 w0[0] = w0[0] | 0x02000000;
2052 break;
2053
2054 case 4:
2055 w0[1] = 0x02;
2056 break;
2057
2058 case 5:
2059 w0[1] = w0[1] | 0x0200;
2060 break;
2061
2062 case 6:
2063 w0[1] = w0[1] | 0x020000;
2064 break;
2065
2066 case 7:
2067 w0[1] = w0[1] | 0x02000000;
2068 break;
2069
2070 case 8:
2071 w0[2] = 0x02;
2072 break;
2073
2074 case 9:
2075 w0[2] = w0[2] | 0x0200;
2076 break;
2077
2078 case 10:
2079 w0[2] = w0[2] | 0x020000;
2080 break;
2081
2082 case 11:
2083 w0[2] = w0[2] | 0x02000000;
2084 break;
2085
2086 case 12:
2087 w0[3] = 0x02;
2088 break;
2089
2090 case 13:
2091 w0[3] = w0[3] | 0x0200;
2092 break;
2093
2094 case 14:
2095 w0[3] = w0[3] | 0x020000;
2096 break;
2097
2098 case 15:
2099 w0[3] = w0[3] | 0x02000000;
2100 break;
2101
2102 case 16:
2103 w1[0] = 0x02;
2104 break;
2105
2106 case 17:
2107 w1[0] = w1[0] | 0x0200;
2108 break;
2109
2110 case 18:
2111 w1[0] = w1[0] | 0x020000;
2112 break;
2113
2114 case 19:
2115 w1[0] = w1[0] | 0x02000000;
2116 break;
2117
2118 case 20:
2119 w1[1] = 0x02;
2120 break;
2121
2122 case 21:
2123 w1[1] = w1[1] | 0x0200;
2124 break;
2125
2126 case 22:
2127 w1[1] = w1[1] | 0x020000;
2128 break;
2129
2130 case 23:
2131 w1[1] = w1[1] | 0x02000000;
2132 break;
2133
2134 case 24:
2135 w1[2] = 0x02;
2136 break;
2137
2138 case 25:
2139 w1[2] = w1[2] | 0x0200;
2140 break;
2141
2142 case 26:
2143 w1[2] = w1[2] | 0x020000;
2144 break;
2145
2146 case 27:
2147 w1[2] = w1[2] | 0x02000000;
2148 break;
2149
2150 case 28:
2151 w1[3] = 0x02;
2152 break;
2153
2154 case 29:
2155 w1[3] = w1[3] | 0x0200;
2156 break;
2157
2158 case 30:
2159 w1[3] = w1[3] | 0x020000;
2160 break;
2161
2162 case 31:
2163 w1[3] = w1[3] | 0x02000000;
2164 break;
2165
2166 case 32:
2167 w2[0] = 0x02;
2168 break;
2169
2170 case 33:
2171 w2[0] = w2[0] | 0x0200;
2172 break;
2173
2174 case 34:
2175 w2[0] = w2[0] | 0x020000;
2176 break;
2177
2178 case 35:
2179 w2[0] = w2[0] | 0x02000000;
2180 break;
2181
2182 case 36:
2183 w2[1] = 0x02;
2184 break;
2185
2186 case 37:
2187 w2[1] = w2[1] | 0x0200;
2188 break;
2189
2190 case 38:
2191 w2[1] = w2[1] | 0x020000;
2192 break;
2193
2194 case 39:
2195 w2[1] = w2[1] | 0x02000000;
2196 break;
2197
2198 case 40:
2199 w2[2] = 0x02;
2200 break;
2201
2202 case 41:
2203 w2[2] = w2[2] | 0x0200;
2204 break;
2205
2206 case 42:
2207 w2[2] = w2[2] | 0x020000;
2208 break;
2209
2210 case 43:
2211 w2[2] = w2[2] | 0x02000000;
2212 break;
2213
2214 case 44:
2215 w2[3] = 0x02;
2216 break;
2217
2218 case 45:
2219 w2[3] = w2[3] | 0x0200;
2220 break;
2221
2222 case 46:
2223 w2[3] = w2[3] | 0x020000;
2224 break;
2225
2226 case 47:
2227 w2[3] = w2[3] | 0x02000000;
2228 break;
2229
2230 case 48:
2231 w3[0] = 0x02;
2232 break;
2233
2234 case 49:
2235 w3[0] = w3[0] | 0x0200;
2236 break;
2237
2238 case 50:
2239 w3[0] = w3[0] | 0x020000;
2240 break;
2241
2242 case 51:
2243 w3[0] = w3[0] | 0x02000000;
2244 break;
2245
2246 case 52:
2247 w3[1] = 0x02;
2248 break;
2249
2250 case 53:
2251 w3[1] = w3[1] | 0x0200;
2252 break;
2253
2254 case 54:
2255 w3[1] = w3[1] | 0x020000;
2256 break;
2257
2258 case 55:
2259 w3[1] = w3[1] | 0x02000000;
2260 break;
2261
2262 case 56:
2263 w3[2] = 0x02;
2264 break;
2265
2266 case 57:
2267 w3[2] = w3[2] | 0x0200;
2268 break;
2269
2270 case 58:
2271 w3[2] = w3[2] | 0x020000;
2272 break;
2273
2274 case 59:
2275 w3[2] = w3[2] | 0x02000000;
2276 break;
2277
2278 case 60:
2279 w3[3] = 0x02;
2280 break;
2281
2282 case 61:
2283 w3[3] = w3[3] | 0x0200;
2284 break;
2285
2286 case 62:
2287 w3[3] = w3[3] | 0x020000;
2288 break;
2289
2290 case 63:
2291 w3[3] = w3[3] | 0x02000000;
2292 break;
2293
2294 case 64:
2295 w4[0] = 0x02;
2296 break;
2297
2298 case 65:
2299 w4[0] = w4[0] | 0x0200;
2300 break;
2301
2302 case 66:
2303 w4[0] = w4[0] | 0x020000;
2304 break;
2305
2306 case 67:
2307 w4[0] = w4[0] | 0x02000000;
2308 break;
2309
2310 case 68:
2311 w4[1] = 0x02;
2312 break;
2313
2314 case 69:
2315 w4[1] = w4[1] | 0x0200;
2316 break;
2317
2318 case 70:
2319 w4[1] = w4[1] | 0x020000;
2320 break;
2321
2322 case 71:
2323 w4[1] = w4[1] | 0x02000000;
2324 break;
2325
2326 case 72:
2327 w4[2] = 0x02;
2328 break;
2329
2330 case 73:
2331 w4[2] = w4[2] | 0x0200;
2332 break;
2333
2334 case 74:
2335 w4[2] = w4[2] | 0x020000;
2336 break;
2337
2338 case 75:
2339 w4[2] = w4[2] | 0x02000000;
2340 break;
2341
2342 case 76:
2343 w4[3] = 0x02;
2344 break;
2345
2346 case 77:
2347 w4[3] = w4[3] | 0x0200;
2348 break;
2349
2350 case 78:
2351 w4[3] = w4[3] | 0x020000;
2352 break;
2353
2354 case 79:
2355 w4[3] = w4[3] | 0x02000000;
2356 break;
2357
2358 case 80:
2359 w5[0] = 0x02;
2360 break;
2361
2362 case 81:
2363 w5[0] = w5[0] | 0x0200;
2364 break;
2365
2366 case 82:
2367 w5[0] = w5[0] | 0x020000;
2368 break;
2369
2370 case 83:
2371 w5[0] = w5[0] | 0x02000000;
2372 break;
2373
2374 case 84:
2375 w5[1] = 0x02;
2376 break;
2377
2378 case 85:
2379 w5[1] = w5[1] | 0x0200;
2380 break;
2381
2382 case 86:
2383 w5[1] = w5[1] | 0x020000;
2384 break;
2385
2386 case 87:
2387 w5[1] = w5[1] | 0x02000000;
2388 break;
2389
2390 case 88:
2391 w5[2] = 0x02;
2392 break;
2393
2394 case 89:
2395 w5[2] = w5[2] | 0x0200;
2396 break;
2397
2398 case 90:
2399 w5[2] = w5[2] | 0x020000;
2400 break;
2401
2402 case 91:
2403 w5[2] = w5[2] | 0x02000000;
2404 break;
2405
2406 case 92:
2407 w5[3] = 0x02;
2408 break;
2409
2410 case 93:
2411 w5[3] = w5[3] | 0x0200;
2412 break;
2413
2414 case 94:
2415 w5[3] = w5[3] | 0x020000;
2416 break;
2417
2418 case 95:
2419 w5[3] = w5[3] | 0x02000000;
2420 break;
2421
2422 case 96:
2423 w6[0] = 0x02;
2424 break;
2425
2426 case 97:
2427 w6[0] = w6[0] | 0x0200;
2428 break;
2429
2430 case 98:
2431 w6[0] = w6[0] | 0x020000;
2432 break;
2433
2434 case 99:
2435 w6[0] = w6[0] | 0x02000000;
2436 break;
2437
2438 case 100:
2439 w6[1] = 0x02;
2440 break;
2441
2442 case 101:
2443 w6[1] = w6[1] | 0x0200;
2444 break;
2445
2446 case 102:
2447 w6[1] = w6[1] | 0x020000;
2448 break;
2449
2450 case 103:
2451 w6[1] = w6[1] | 0x02000000;
2452 break;
2453
2454 case 104:
2455 w6[2] = 0x02;
2456 break;
2457
2458 case 105:
2459 w6[2] = w6[2] | 0x0200;
2460 break;
2461
2462 case 106:
2463 w6[2] = w6[2] | 0x020000;
2464 break;
2465
2466 case 107:
2467 w6[2] = w6[2] | 0x02000000;
2468 break;
2469
2470 case 108:
2471 w6[3] = 0x02;
2472 break;
2473
2474 case 109:
2475 w6[3] = w6[3] | 0x0200;
2476 break;
2477
2478 case 110:
2479 w6[3] = w6[3] | 0x020000;
2480 break;
2481
2482 case 111:
2483 w6[3] = w6[3] | 0x02000000;
2484 break;
2485
2486 case 112:
2487 w7[0] = 0x02;
2488 break;
2489
2490 case 113:
2491 w7[0] = w7[0] | 0x0200;
2492 break;
2493
2494 case 114:
2495 w7[0] = w7[0] | 0x020000;
2496 break;
2497
2498 case 115:
2499 w7[0] = w7[0] | 0x02000000;
2500 break;
2501
2502 case 116:
2503 w7[1] = 0x02;
2504 break;
2505
2506 case 117:
2507 w7[1] = w7[1] | 0x0200;
2508 break;
2509
2510 case 118:
2511 w7[1] = w7[1] | 0x020000;
2512 break;
2513
2514 case 119:
2515 w7[1] = w7[1] | 0x02000000;
2516 break;
2517
2518 case 120:
2519 w7[2] = 0x02;
2520 break;
2521
2522 case 121:
2523 w7[2] = w7[2] | 0x0200;
2524 break;
2525
2526 case 122:
2527 w7[2] = w7[2] | 0x020000;
2528 break;
2529
2530 case 123:
2531 w7[2] = w7[2] | 0x02000000;
2532 break;
2533
2534 case 124:
2535 w7[3] = 0x02;
2536 break;
2537
2538 case 125:
2539 w7[3] = w7[3] | 0x0200;
2540 break;
2541
2542 case 126:
2543 w7[3] = w7[3] | 0x020000;
2544 break;
2545
2546 case 127:
2547 w7[3] = w7[3] | 0x02000000;
2548 break;
2549 }
2550 }
2551
2552 inline void append_0x80_1x4 (u32x w0[4], const u32 offset)
2553 {
2554 switch (offset)
2555 {
2556 case 0:
2557 w0[0] = 0x80;
2558 break;
2559
2560 case 1:
2561 w0[0] = w0[0] | 0x8000;
2562 break;
2563
2564 case 2:
2565 w0[0] = w0[0] | 0x800000;
2566 break;
2567
2568 case 3:
2569 w0[0] = w0[0] | 0x80000000;
2570 break;
2571
2572 case 4:
2573 w0[1] = 0x80;
2574 break;
2575
2576 case 5:
2577 w0[1] = w0[1] | 0x8000;
2578 break;
2579
2580 case 6:
2581 w0[1] = w0[1] | 0x800000;
2582 break;
2583
2584 case 7:
2585 w0[1] = w0[1] | 0x80000000;
2586 break;
2587
2588 case 8:
2589 w0[2] = 0x80;
2590 break;
2591
2592 case 9:
2593 w0[2] = w0[2] | 0x8000;
2594 break;
2595
2596 case 10:
2597 w0[2] = w0[2] | 0x800000;
2598 break;
2599
2600 case 11:
2601 w0[2] = w0[2] | 0x80000000;
2602 break;
2603
2604 case 12:
2605 w0[3] = 0x80;
2606 break;
2607
2608 case 13:
2609 w0[3] = w0[3] | 0x8000;
2610 break;
2611
2612 case 14:
2613 w0[3] = w0[3] | 0x800000;
2614 break;
2615
2616 case 15:
2617 w0[3] = w0[3] | 0x80000000;
2618 break;
2619 }
2620 }
2621
2622 inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
2623 {
2624 switch (offset)
2625 {
2626 case 0:
2627 w0[0] = 0x80;
2628 break;
2629
2630 case 1:
2631 w0[0] = w0[0] | 0x8000;
2632 break;
2633
2634 case 2:
2635 w0[0] = w0[0] | 0x800000;
2636 break;
2637
2638 case 3:
2639 w0[0] = w0[0] | 0x80000000;
2640 break;
2641
2642 case 4:
2643 w0[1] = 0x80;
2644 break;
2645
2646 case 5:
2647 w0[1] = w0[1] | 0x8000;
2648 break;
2649
2650 case 6:
2651 w0[1] = w0[1] | 0x800000;
2652 break;
2653
2654 case 7:
2655 w0[1] = w0[1] | 0x80000000;
2656 break;
2657
2658 case 8:
2659 w0[2] = 0x80;
2660 break;
2661
2662 case 9:
2663 w0[2] = w0[2] | 0x8000;
2664 break;
2665
2666 case 10:
2667 w0[2] = w0[2] | 0x800000;
2668 break;
2669
2670 case 11:
2671 w0[2] = w0[2] | 0x80000000;
2672 break;
2673
2674 case 12:
2675 w0[3] = 0x80;
2676 break;
2677
2678 case 13:
2679 w0[3] = w0[3] | 0x8000;
2680 break;
2681
2682 case 14:
2683 w0[3] = w0[3] | 0x800000;
2684 break;
2685
2686 case 15:
2687 w0[3] = w0[3] | 0x80000000;
2688 break;
2689
2690 case 16:
2691 w1[0] = 0x80;
2692 break;
2693
2694 case 17:
2695 w1[0] = w1[0] | 0x8000;
2696 break;
2697
2698 case 18:
2699 w1[0] = w1[0] | 0x800000;
2700 break;
2701
2702 case 19:
2703 w1[0] = w1[0] | 0x80000000;
2704 break;
2705
2706 case 20:
2707 w1[1] = 0x80;
2708 break;
2709
2710 case 21:
2711 w1[1] = w1[1] | 0x8000;
2712 break;
2713
2714 case 22:
2715 w1[1] = w1[1] | 0x800000;
2716 break;
2717
2718 case 23:
2719 w1[1] = w1[1] | 0x80000000;
2720 break;
2721
2722 case 24:
2723 w1[2] = 0x80;
2724 break;
2725
2726 case 25:
2727 w1[2] = w1[2] | 0x8000;
2728 break;
2729
2730 case 26:
2731 w1[2] = w1[2] | 0x800000;
2732 break;
2733
2734 case 27:
2735 w1[2] = w1[2] | 0x80000000;
2736 break;
2737
2738 case 28:
2739 w1[3] = 0x80;
2740 break;
2741
2742 case 29:
2743 w1[3] = w1[3] | 0x8000;
2744 break;
2745
2746 case 30:
2747 w1[3] = w1[3] | 0x800000;
2748 break;
2749
2750 case 31:
2751 w1[3] = w1[3] | 0x80000000;
2752 break;
2753 }
2754 }
2755
2756 inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
2757 {
2758 switch (offset)
2759 {
2760 case 0:
2761 w0[0] = 0x80;
2762 break;
2763
2764 case 1:
2765 w0[0] = w0[0] | 0x8000;
2766 break;
2767
2768 case 2:
2769 w0[0] = w0[0] | 0x800000;
2770 break;
2771
2772 case 3:
2773 w0[0] = w0[0] | 0x80000000;
2774 break;
2775
2776 case 4:
2777 w0[1] = 0x80;
2778 break;
2779
2780 case 5:
2781 w0[1] = w0[1] | 0x8000;
2782 break;
2783
2784 case 6:
2785 w0[1] = w0[1] | 0x800000;
2786 break;
2787
2788 case 7:
2789 w0[1] = w0[1] | 0x80000000;
2790 break;
2791
2792 case 8:
2793 w0[2] = 0x80;
2794 break;
2795
2796 case 9:
2797 w0[2] = w0[2] | 0x8000;
2798 break;
2799
2800 case 10:
2801 w0[2] = w0[2] | 0x800000;
2802 break;
2803
2804 case 11:
2805 w0[2] = w0[2] | 0x80000000;
2806 break;
2807
2808 case 12:
2809 w0[3] = 0x80;
2810 break;
2811
2812 case 13:
2813 w0[3] = w0[3] | 0x8000;
2814 break;
2815
2816 case 14:
2817 w0[3] = w0[3] | 0x800000;
2818 break;
2819
2820 case 15:
2821 w0[3] = w0[3] | 0x80000000;
2822 break;
2823
2824 case 16:
2825 w1[0] = 0x80;
2826 break;
2827
2828 case 17:
2829 w1[0] = w1[0] | 0x8000;
2830 break;
2831
2832 case 18:
2833 w1[0] = w1[0] | 0x800000;
2834 break;
2835
2836 case 19:
2837 w1[0] = w1[0] | 0x80000000;
2838 break;
2839
2840 case 20:
2841 w1[1] = 0x80;
2842 break;
2843
2844 case 21:
2845 w1[1] = w1[1] | 0x8000;
2846 break;
2847
2848 case 22:
2849 w1[1] = w1[1] | 0x800000;
2850 break;
2851
2852 case 23:
2853 w1[1] = w1[1] | 0x80000000;
2854 break;
2855
2856 case 24:
2857 w1[2] = 0x80;
2858 break;
2859
2860 case 25:
2861 w1[2] = w1[2] | 0x8000;
2862 break;
2863
2864 case 26:
2865 w1[2] = w1[2] | 0x800000;
2866 break;
2867
2868 case 27:
2869 w1[2] = w1[2] | 0x80000000;
2870 break;
2871
2872 case 28:
2873 w1[3] = 0x80;
2874 break;
2875
2876 case 29:
2877 w1[3] = w1[3] | 0x8000;
2878 break;
2879
2880 case 30:
2881 w1[3] = w1[3] | 0x800000;
2882 break;
2883
2884 case 31:
2885 w1[3] = w1[3] | 0x80000000;
2886 break;
2887
2888 case 32:
2889 w2[0] = 0x80;
2890 break;
2891
2892 case 33:
2893 w2[0] = w2[0] | 0x8000;
2894 break;
2895
2896 case 34:
2897 w2[0] = w2[0] | 0x800000;
2898 break;
2899
2900 case 35:
2901 w2[0] = w2[0] | 0x80000000;
2902 break;
2903
2904 case 36:
2905 w2[1] = 0x80;
2906 break;
2907
2908 case 37:
2909 w2[1] = w2[1] | 0x8000;
2910 break;
2911
2912 case 38:
2913 w2[1] = w2[1] | 0x800000;
2914 break;
2915
2916 case 39:
2917 w2[1] = w2[1] | 0x80000000;
2918 break;
2919
2920 case 40:
2921 w2[2] = 0x80;
2922 break;
2923
2924 case 41:
2925 w2[2] = w2[2] | 0x8000;
2926 break;
2927
2928 case 42:
2929 w2[2] = w2[2] | 0x800000;
2930 break;
2931
2932 case 43:
2933 w2[2] = w2[2] | 0x80000000;
2934 break;
2935
2936 case 44:
2937 w2[3] = 0x80;
2938 break;
2939
2940 case 45:
2941 w2[3] = w2[3] | 0x8000;
2942 break;
2943
2944 case 46:
2945 w2[3] = w2[3] | 0x800000;
2946 break;
2947
2948 case 47:
2949 w2[3] = w2[3] | 0x80000000;
2950 break;
2951 }
2952 }
2953
2954 inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
2955 {
2956 switch (offset)
2957 {
2958 case 0:
2959 w0[0] = 0x80;
2960 break;
2961
2962 case 1:
2963 w0[0] = w0[0] | 0x8000;
2964 break;
2965
2966 case 2:
2967 w0[0] = w0[0] | 0x800000;
2968 break;
2969
2970 case 3:
2971 w0[0] = w0[0] | 0x80000000;
2972 break;
2973
2974 case 4:
2975 w0[1] = 0x80;
2976 break;
2977
2978 case 5:
2979 w0[1] = w0[1] | 0x8000;
2980 break;
2981
2982 case 6:
2983 w0[1] = w0[1] | 0x800000;
2984 break;
2985
2986 case 7:
2987 w0[1] = w0[1] | 0x80000000;
2988 break;
2989
2990 case 8:
2991 w0[2] = 0x80;
2992 break;
2993
2994 case 9:
2995 w0[2] = w0[2] | 0x8000;
2996 break;
2997
2998 case 10:
2999 w0[2] = w0[2] | 0x800000;
3000 break;
3001
3002 case 11:
3003 w0[2] = w0[2] | 0x80000000;
3004 break;
3005
3006 case 12:
3007 w0[3] = 0x80;
3008 break;
3009
3010 case 13:
3011 w0[3] = w0[3] | 0x8000;
3012 break;
3013
3014 case 14:
3015 w0[3] = w0[3] | 0x800000;
3016 break;
3017
3018 case 15:
3019 w0[3] = w0[3] | 0x80000000;
3020 break;
3021
3022 case 16:
3023 w1[0] = 0x80;
3024 break;
3025
3026 case 17:
3027 w1[0] = w1[0] | 0x8000;
3028 break;
3029
3030 case 18:
3031 w1[0] = w1[0] | 0x800000;
3032 break;
3033
3034 case 19:
3035 w1[0] = w1[0] | 0x80000000;
3036 break;
3037
3038 case 20:
3039 w1[1] = 0x80;
3040 break;
3041
3042 case 21:
3043 w1[1] = w1[1] | 0x8000;
3044 break;
3045
3046 case 22:
3047 w1[1] = w1[1] | 0x800000;
3048 break;
3049
3050 case 23:
3051 w1[1] = w1[1] | 0x80000000;
3052 break;
3053
3054 case 24:
3055 w1[2] = 0x80;
3056 break;
3057
3058 case 25:
3059 w1[2] = w1[2] | 0x8000;
3060 break;
3061
3062 case 26:
3063 w1[2] = w1[2] | 0x800000;
3064 break;
3065
3066 case 27:
3067 w1[2] = w1[2] | 0x80000000;
3068 break;
3069
3070 case 28:
3071 w1[3] = 0x80;
3072 break;
3073
3074 case 29:
3075 w1[3] = w1[3] | 0x8000;
3076 break;
3077
3078 case 30:
3079 w1[3] = w1[3] | 0x800000;
3080 break;
3081
3082 case 31:
3083 w1[3] = w1[3] | 0x80000000;
3084 break;
3085
3086 case 32:
3087 w2[0] = 0x80;
3088 break;
3089
3090 case 33:
3091 w2[0] = w2[0] | 0x8000;
3092 break;
3093
3094 case 34:
3095 w2[0] = w2[0] | 0x800000;
3096 break;
3097
3098 case 35:
3099 w2[0] = w2[0] | 0x80000000;
3100 break;
3101
3102 case 36:
3103 w2[1] = 0x80;
3104 break;
3105
3106 case 37:
3107 w2[1] = w2[1] | 0x8000;
3108 break;
3109
3110 case 38:
3111 w2[1] = w2[1] | 0x800000;
3112 break;
3113
3114 case 39:
3115 w2[1] = w2[1] | 0x80000000;
3116 break;
3117
3118 case 40:
3119 w2[2] = 0x80;
3120 break;
3121
3122 case 41:
3123 w2[2] = w2[2] | 0x8000;
3124 break;
3125
3126 case 42:
3127 w2[2] = w2[2] | 0x800000;
3128 break;
3129
3130 case 43:
3131 w2[2] = w2[2] | 0x80000000;
3132 break;
3133
3134 case 44:
3135 w2[3] = 0x80;
3136 break;
3137
3138 case 45:
3139 w2[3] = w2[3] | 0x8000;
3140 break;
3141
3142 case 46:
3143 w2[3] = w2[3] | 0x800000;
3144 break;
3145
3146 case 47:
3147 w2[3] = w2[3] | 0x80000000;
3148 break;
3149
3150 case 48:
3151 w3[0] = 0x80;
3152 break;
3153
3154 case 49:
3155 w3[0] = w3[0] | 0x8000;
3156 break;
3157
3158 case 50:
3159 w3[0] = w3[0] | 0x800000;
3160 break;
3161
3162 case 51:
3163 w3[0] = w3[0] | 0x80000000;
3164 break;
3165
3166 case 52:
3167 w3[1] = 0x80;
3168 break;
3169
3170 case 53:
3171 w3[1] = w3[1] | 0x8000;
3172 break;
3173
3174 case 54:
3175 w3[1] = w3[1] | 0x800000;
3176 break;
3177
3178 case 55:
3179 w3[1] = w3[1] | 0x80000000;
3180 break;
3181
3182 case 56:
3183 w3[2] = 0x80;
3184 break;
3185
3186 case 57:
3187 w3[2] = w3[2] | 0x8000;
3188 break;
3189
3190 case 58:
3191 w3[2] = w3[2] | 0x800000;
3192 break;
3193
3194 case 59:
3195 w3[2] = w3[2] | 0x80000000;
3196 break;
3197
3198 case 60:
3199 w3[3] = 0x80;
3200 break;
3201
3202 case 61:
3203 w3[3] = w3[3] | 0x8000;
3204 break;
3205
3206 case 62:
3207 w3[3] = w3[3] | 0x800000;
3208 break;
3209
3210 case 63:
3211 w3[3] = w3[3] | 0x80000000;
3212 break;
3213 }
3214 }
3215
3216 inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
3217 {
3218 switch (offset)
3219 {
3220 case 0:
3221 w0[0] = 0x80;
3222 break;
3223
3224 case 1:
3225 w0[0] = w0[0] | 0x8000;
3226 break;
3227
3228 case 2:
3229 w0[0] = w0[0] | 0x800000;
3230 break;
3231
3232 case 3:
3233 w0[0] = w0[0] | 0x80000000;
3234 break;
3235
3236 case 4:
3237 w0[1] = 0x80;
3238 break;
3239
3240 case 5:
3241 w0[1] = w0[1] | 0x8000;
3242 break;
3243
3244 case 6:
3245 w0[1] = w0[1] | 0x800000;
3246 break;
3247
3248 case 7:
3249 w0[1] = w0[1] | 0x80000000;
3250 break;
3251
3252 case 8:
3253 w0[2] = 0x80;
3254 break;
3255
3256 case 9:
3257 w0[2] = w0[2] | 0x8000;
3258 break;
3259
3260 case 10:
3261 w0[2] = w0[2] | 0x800000;
3262 break;
3263
3264 case 11:
3265 w0[2] = w0[2] | 0x80000000;
3266 break;
3267
3268 case 12:
3269 w0[3] = 0x80;
3270 break;
3271
3272 case 13:
3273 w0[3] = w0[3] | 0x8000;
3274 break;
3275
3276 case 14:
3277 w0[3] = w0[3] | 0x800000;
3278 break;
3279
3280 case 15:
3281 w0[3] = w0[3] | 0x80000000;
3282 break;
3283
3284 case 16:
3285 w1[0] = 0x80;
3286 break;
3287
3288 case 17:
3289 w1[0] = w1[0] | 0x8000;
3290 break;
3291
3292 case 18:
3293 w1[0] = w1[0] | 0x800000;
3294 break;
3295
3296 case 19:
3297 w1[0] = w1[0] | 0x80000000;
3298 break;
3299
3300 case 20:
3301 w1[1] = 0x80;
3302 break;
3303
3304 case 21:
3305 w1[1] = w1[1] | 0x8000;
3306 break;
3307
3308 case 22:
3309 w1[1] = w1[1] | 0x800000;
3310 break;
3311
3312 case 23:
3313 w1[1] = w1[1] | 0x80000000;
3314 break;
3315
3316 case 24:
3317 w1[2] = 0x80;
3318 break;
3319
3320 case 25:
3321 w1[2] = w1[2] | 0x8000;
3322 break;
3323
3324 case 26:
3325 w1[2] = w1[2] | 0x800000;
3326 break;
3327
3328 case 27:
3329 w1[2] = w1[2] | 0x80000000;
3330 break;
3331
3332 case 28:
3333 w1[3] = 0x80;
3334 break;
3335
3336 case 29:
3337 w1[3] = w1[3] | 0x8000;
3338 break;
3339
3340 case 30:
3341 w1[3] = w1[3] | 0x800000;
3342 break;
3343
3344 case 31:
3345 w1[3] = w1[3] | 0x80000000;
3346 break;
3347
3348 case 32:
3349 w2[0] = 0x80;
3350 break;
3351
3352 case 33:
3353 w2[0] = w2[0] | 0x8000;
3354 break;
3355
3356 case 34:
3357 w2[0] = w2[0] | 0x800000;
3358 break;
3359
3360 case 35:
3361 w2[0] = w2[0] | 0x80000000;
3362 break;
3363
3364 case 36:
3365 w2[1] = 0x80;
3366 break;
3367
3368 case 37:
3369 w2[1] = w2[1] | 0x8000;
3370 break;
3371
3372 case 38:
3373 w2[1] = w2[1] | 0x800000;
3374 break;
3375
3376 case 39:
3377 w2[1] = w2[1] | 0x80000000;
3378 break;
3379
3380 case 40:
3381 w2[2] = 0x80;
3382 break;
3383
3384 case 41:
3385 w2[2] = w2[2] | 0x8000;
3386 break;
3387
3388 case 42:
3389 w2[2] = w2[2] | 0x800000;
3390 break;
3391
3392 case 43:
3393 w2[2] = w2[2] | 0x80000000;
3394 break;
3395
3396 case 44:
3397 w2[3] = 0x80;
3398 break;
3399
3400 case 45:
3401 w2[3] = w2[3] | 0x8000;
3402 break;
3403
3404 case 46:
3405 w2[3] = w2[3] | 0x800000;
3406 break;
3407
3408 case 47:
3409 w2[3] = w2[3] | 0x80000000;
3410 break;
3411
3412 case 48:
3413 w3[0] = 0x80;
3414 break;
3415
3416 case 49:
3417 w3[0] = w3[0] | 0x8000;
3418 break;
3419
3420 case 50:
3421 w3[0] = w3[0] | 0x800000;
3422 break;
3423
3424 case 51:
3425 w3[0] = w3[0] | 0x80000000;
3426 break;
3427
3428 case 52:
3429 w3[1] = 0x80;
3430 break;
3431
3432 case 53:
3433 w3[1] = w3[1] | 0x8000;
3434 break;
3435
3436 case 54:
3437 w3[1] = w3[1] | 0x800000;
3438 break;
3439
3440 case 55:
3441 w3[1] = w3[1] | 0x80000000;
3442 break;
3443
3444 case 56:
3445 w3[2] = 0x80;
3446 break;
3447
3448 case 57:
3449 w3[2] = w3[2] | 0x8000;
3450 break;
3451
3452 case 58:
3453 w3[2] = w3[2] | 0x800000;
3454 break;
3455
3456 case 59:
3457 w3[2] = w3[2] | 0x80000000;
3458 break;
3459
3460 case 60:
3461 w3[3] = 0x80;
3462 break;
3463
3464 case 61:
3465 w3[3] = w3[3] | 0x8000;
3466 break;
3467
3468 case 62:
3469 w3[3] = w3[3] | 0x800000;
3470 break;
3471
3472 case 63:
3473 w3[3] = w3[3] | 0x80000000;
3474 break;
3475
3476 case 64:
3477 w4[0] = 0x80;
3478 break;
3479
3480 case 65:
3481 w4[0] = w4[0] | 0x8000;
3482 break;
3483
3484 case 66:
3485 w4[0] = w4[0] | 0x800000;
3486 break;
3487
3488 case 67:
3489 w4[0] = w4[0] | 0x80000000;
3490 break;
3491
3492 case 68:
3493 w4[1] = 0x80;
3494 break;
3495
3496 case 69:
3497 w4[1] = w4[1] | 0x8000;
3498 break;
3499
3500 case 70:
3501 w4[1] = w4[1] | 0x800000;
3502 break;
3503
3504 case 71:
3505 w4[1] = w4[1] | 0x80000000;
3506 break;
3507
3508 case 72:
3509 w4[2] = 0x80;
3510 break;
3511
3512 case 73:
3513 w4[2] = w4[2] | 0x8000;
3514 break;
3515
3516 case 74:
3517 w4[2] = w4[2] | 0x800000;
3518 break;
3519
3520 case 75:
3521 w4[2] = w4[2] | 0x80000000;
3522 break;
3523
3524 case 76:
3525 w4[3] = 0x80;
3526 break;
3527
3528 case 77:
3529 w4[3] = w4[3] | 0x8000;
3530 break;
3531
3532 case 78:
3533 w4[3] = w4[3] | 0x800000;
3534 break;
3535
3536 case 79:
3537 w4[3] = w4[3] | 0x80000000;
3538 break;
3539
3540 case 80:
3541 w5[0] = 0x80;
3542 break;
3543
3544 case 81:
3545 w5[0] = w5[0] | 0x8000;
3546 break;
3547
3548 case 82:
3549 w5[0] = w5[0] | 0x800000;
3550 break;
3551
3552 case 83:
3553 w5[0] = w5[0] | 0x80000000;
3554 break;
3555
3556 case 84:
3557 w5[1] = 0x80;
3558 break;
3559
3560 case 85:
3561 w5[1] = w5[1] | 0x8000;
3562 break;
3563
3564 case 86:
3565 w5[1] = w5[1] | 0x800000;
3566 break;
3567
3568 case 87:
3569 w5[1] = w5[1] | 0x80000000;
3570 break;
3571
3572 case 88:
3573 w5[2] = 0x80;
3574 break;
3575
3576 case 89:
3577 w5[2] = w5[2] | 0x8000;
3578 break;
3579
3580 case 90:
3581 w5[2] = w5[2] | 0x800000;
3582 break;
3583
3584 case 91:
3585 w5[2] = w5[2] | 0x80000000;
3586 break;
3587
3588 case 92:
3589 w5[3] = 0x80;
3590 break;
3591
3592 case 93:
3593 w5[3] = w5[3] | 0x8000;
3594 break;
3595
3596 case 94:
3597 w5[3] = w5[3] | 0x800000;
3598 break;
3599
3600 case 95:
3601 w5[3] = w5[3] | 0x80000000;
3602 break;
3603
3604 case 96:
3605 w6[0] = 0x80;
3606 break;
3607
3608 case 97:
3609 w6[0] = w6[0] | 0x8000;
3610 break;
3611
3612 case 98:
3613 w6[0] = w6[0] | 0x800000;
3614 break;
3615
3616 case 99:
3617 w6[0] = w6[0] | 0x80000000;
3618 break;
3619
3620 case 100:
3621 w6[1] = 0x80;
3622 break;
3623
3624 case 101:
3625 w6[1] = w6[1] | 0x8000;
3626 break;
3627
3628 case 102:
3629 w6[1] = w6[1] | 0x800000;
3630 break;
3631
3632 case 103:
3633 w6[1] = w6[1] | 0x80000000;
3634 break;
3635
3636 case 104:
3637 w6[2] = 0x80;
3638 break;
3639
3640 case 105:
3641 w6[2] = w6[2] | 0x8000;
3642 break;
3643
3644 case 106:
3645 w6[2] = w6[2] | 0x800000;
3646 break;
3647
3648 case 107:
3649 w6[2] = w6[2] | 0x80000000;
3650 break;
3651
3652 case 108:
3653 w6[3] = 0x80;
3654 break;
3655
3656 case 109:
3657 w6[3] = w6[3] | 0x8000;
3658 break;
3659
3660 case 110:
3661 w6[3] = w6[3] | 0x800000;
3662 break;
3663
3664 case 111:
3665 w6[3] = w6[3] | 0x80000000;
3666 break;
3667
3668 case 112:
3669 w7[0] = 0x80;
3670 break;
3671
3672 case 113:
3673 w7[0] = w7[0] | 0x8000;
3674 break;
3675
3676 case 114:
3677 w7[0] = w7[0] | 0x800000;
3678 break;
3679
3680 case 115:
3681 w7[0] = w7[0] | 0x80000000;
3682 break;
3683
3684 case 116:
3685 w7[1] = 0x80;
3686 break;
3687
3688 case 117:
3689 w7[1] = w7[1] | 0x8000;
3690 break;
3691
3692 case 118:
3693 w7[1] = w7[1] | 0x800000;
3694 break;
3695
3696 case 119:
3697 w7[1] = w7[1] | 0x80000000;
3698 break;
3699
3700 case 120:
3701 w7[2] = 0x80;
3702 break;
3703
3704 case 121:
3705 w7[2] = w7[2] | 0x8000;
3706 break;
3707
3708 case 122:
3709 w7[2] = w7[2] | 0x800000;
3710 break;
3711
3712 case 123:
3713 w7[2] = w7[2] | 0x80000000;
3714 break;
3715
3716 case 124:
3717 w7[3] = 0x80;
3718 break;
3719
3720 case 125:
3721 w7[3] = w7[3] | 0x8000;
3722 break;
3723
3724 case 126:
3725 w7[3] = w7[3] | 0x800000;
3726 break;
3727
3728 case 127:
3729 w7[3] = w7[3] | 0x80000000;
3730 break;
3731 }
3732 }
3733
3734 inline void append_0x80_1x16 (u32x w[16], const u32 offset)
3735 {
3736 switch (offset)
3737 {
3738 case 0:
3739 w[ 0] = 0x80;
3740 break;
3741
3742 case 1:
3743 w[ 0] = w[ 0] | 0x8000;
3744 break;
3745
3746 case 2:
3747 w[ 0] = w[ 0] | 0x800000;
3748 break;
3749
3750 case 3:
3751 w[ 0] = w[ 0] | 0x80000000;
3752 break;
3753
3754 case 4:
3755 w[ 1] = 0x80;
3756 break;
3757
3758 case 5:
3759 w[ 1] = w[ 1] | 0x8000;
3760 break;
3761
3762 case 6:
3763 w[ 1] = w[ 1] | 0x800000;
3764 break;
3765
3766 case 7:
3767 w[ 1] = w[ 1] | 0x80000000;
3768 break;
3769
3770 case 8:
3771 w[ 2] = 0x80;
3772 break;
3773
3774 case 9:
3775 w[ 2] = w[ 2] | 0x8000;
3776 break;
3777
3778 case 10:
3779 w[ 2] = w[ 2] | 0x800000;
3780 break;
3781
3782 case 11:
3783 w[ 2] = w[ 2] | 0x80000000;
3784 break;
3785
3786 case 12:
3787 w[ 3] = 0x80;
3788 break;
3789
3790 case 13:
3791 w[ 3] = w[ 3] | 0x8000;
3792 break;
3793
3794 case 14:
3795 w[ 3] = w[ 3] | 0x800000;
3796 break;
3797
3798 case 15:
3799 w[ 3] = w[ 3] | 0x80000000;
3800 break;
3801
3802 case 16:
3803 w[ 4] = 0x80;
3804 break;
3805
3806 case 17:
3807 w[ 4] = w[ 4] | 0x8000;
3808 break;
3809
3810 case 18:
3811 w[ 4] = w[ 4] | 0x800000;
3812 break;
3813
3814 case 19:
3815 w[ 4] = w[ 4] | 0x80000000;
3816 break;
3817
3818 case 20:
3819 w[ 5] = 0x80;
3820 break;
3821
3822 case 21:
3823 w[ 5] = w[ 5] | 0x8000;
3824 break;
3825
3826 case 22:
3827 w[ 5] = w[ 5] | 0x800000;
3828 break;
3829
3830 case 23:
3831 w[ 5] = w[ 5] | 0x80000000;
3832 break;
3833
3834 case 24:
3835 w[ 6] = 0x80;
3836 break;
3837
3838 case 25:
3839 w[ 6] = w[ 6] | 0x8000;
3840 break;
3841
3842 case 26:
3843 w[ 6] = w[ 6] | 0x800000;
3844 break;
3845
3846 case 27:
3847 w[ 6] = w[ 6] | 0x80000000;
3848 break;
3849
3850 case 28:
3851 w[ 7] = 0x80;
3852 break;
3853
3854 case 29:
3855 w[ 7] = w[ 7] | 0x8000;
3856 break;
3857
3858 case 30:
3859 w[ 7] = w[ 7] | 0x800000;
3860 break;
3861
3862 case 31:
3863 w[ 7] = w[ 7] | 0x80000000;
3864 break;
3865
3866 case 32:
3867 w[ 8] = 0x80;
3868 break;
3869
3870 case 33:
3871 w[ 8] = w[ 8] | 0x8000;
3872 break;
3873
3874 case 34:
3875 w[ 8] = w[ 8] | 0x800000;
3876 break;
3877
3878 case 35:
3879 w[ 8] = w[ 8] | 0x80000000;
3880 break;
3881
3882 case 36:
3883 w[ 9] = 0x80;
3884 break;
3885
3886 case 37:
3887 w[ 9] = w[ 9] | 0x8000;
3888 break;
3889
3890 case 38:
3891 w[ 9] = w[ 9] | 0x800000;
3892 break;
3893
3894 case 39:
3895 w[ 9] = w[ 9] | 0x80000000;
3896 break;
3897
3898 case 40:
3899 w[10] = 0x80;
3900 break;
3901
3902 case 41:
3903 w[10] = w[10] | 0x8000;
3904 break;
3905
3906 case 42:
3907 w[10] = w[10] | 0x800000;
3908 break;
3909
3910 case 43:
3911 w[10] = w[10] | 0x80000000;
3912 break;
3913
3914 case 44:
3915 w[11] = 0x80;
3916 break;
3917
3918 case 45:
3919 w[11] = w[11] | 0x8000;
3920 break;
3921
3922 case 46:
3923 w[11] = w[11] | 0x800000;
3924 break;
3925
3926 case 47:
3927 w[11] = w[11] | 0x80000000;
3928 break;
3929
3930 case 48:
3931 w[12] = 0x80;
3932 break;
3933
3934 case 49:
3935 w[12] = w[12] | 0x8000;
3936 break;
3937
3938 case 50:
3939 w[12] = w[12] | 0x800000;
3940 break;
3941
3942 case 51:
3943 w[12] = w[12] | 0x80000000;
3944 break;
3945
3946 case 52:
3947 w[13] = 0x80;
3948 break;
3949
3950 case 53:
3951 w[13] = w[13] | 0x8000;
3952 break;
3953
3954 case 54:
3955 w[13] = w[13] | 0x800000;
3956 break;
3957
3958 case 55:
3959 w[13] = w[13] | 0x80000000;
3960 break;
3961
3962 case 56:
3963 w[14] = 0x80;
3964 break;
3965
3966 case 57:
3967 w[14] = w[14] | 0x8000;
3968 break;
3969
3970 case 58:
3971 w[14] = w[14] | 0x800000;
3972 break;
3973
3974 case 59:
3975 w[14] = w[14] | 0x80000000;
3976 break;
3977
3978 case 60:
3979 w[15] = 0x80;
3980 break;
3981
3982 case 61:
3983 w[15] = w[15] | 0x8000;
3984 break;
3985
3986 case 62:
3987 w[15] = w[15] | 0x800000;
3988 break;
3989
3990 case 63:
3991 w[15] = w[15] | 0x80000000;
3992 break;
3993 }
3994 }
3995
/**
 * Moves the contents of the 16-word buffer w0[0..3], w1[0..3], w2[0..3],
 * w3[0..3] toward higher word indices by `offset` bytes, in place.
 *
 * offset / 4 selects how many whole words the data moves (cases 0..13); the
 * remaining sub-word part is handled by combining two neighbouring source
 * words into each destination word. Words below the new start are set to 0.
 * There is no default case, so for offset >= 56 the buffers are left as is.
 *
 * Two alternative bodies are selected by the preprocessor:
 *  - IS_AMD / IS_GENERIC: built on amd_bytealign, writes w0[0] .. w3[2]
 *  - IS_NV: built on __byte_perm, writes w0[0] .. w3[1]
 * w3[3] is never written by either body; data moved past the highest written
 * word is dropped.
 *
 * In every case the assignments run from the highest word down to the lowest
 * so that each source word is read before it is overwritten.
 *
 * NOTE(review): the "_le" suffix presumably means the words hold
 * little-endian packed bytes; the byte direction inside a word depends on
 * amd_bytealign / __byte_perm, which are defined outside this function -
 * confirm there.
 */
3996 inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
3997 {
3998 #if defined IS_AMD || defined IS_GENERIC
/* sub-word part of the offset; 0 means the move is a whole number of words */
3999 const int offset_mod_4 = offset & 3;
4000
/*
 * Shift argument handed to amd_bytealign. It is NOT reduced modulo 4 here.
 * NOTE(review): presumably amd_bytealign only looks at the low two bits of
 * its third argument - confirm against its definition.
 */
4001 const int offset_minus_4 = 4 - offset;
4002
4003 switch (offset / 4)
4004 {
4005 case 0: /* offset 0..3 */
4006 w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4);
4007 w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4);
4008 w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4009 w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4010 w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4011 w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4012 w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4013 w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4014 w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4015 w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4016 w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4017 w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4018 w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4019 w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4020 w0[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4021
/*
 * Fix-up for offsets that are a multiple of 4: every word written above is
 * moved one slot toward lower indices and w3[2] is cleared. The same fix-up
 * follows the amd_bytealign pass in each case of this switch.
 * NOTE(review): presumably needed because amd_bytealign with a shift that is
 * a multiple of 4 returns its second operand unchanged, which leaves the data
 * one word too high - confirm against the amd_bytealign definition.
 */
4022 if (offset_mod_4 == 0)
4023 {
4024 w0[0] = w0[1];
4025 w0[1] = w0[2];
4026 w0[2] = w0[3];
4027 w0[3] = w1[0];
4028 w1[0] = w1[1];
4029 w1[1] = w1[2];
4030 w1[2] = w1[3];
4031 w1[3] = w2[0];
4032 w2[0] = w2[1];
4033 w2[1] = w2[2];
4034 w2[2] = w2[3];
4035 w2[3] = w3[0];
4036 w3[0] = w3[1];
4037 w3[1] = w3[2];
4038 w3[2] = 0;
4039 }
4040
4041 break;
4042
4043 case 1: /* offset 4..7 */
4044 w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4);
4045 w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4046 w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4047 w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4048 w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4049 w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4050 w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4051 w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4052 w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4053 w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4054 w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4055 w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4056 w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4057 w0[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4058 w0[0] = 0;
4059
4060 if (offset_mod_4 == 0)
4061 {
4062 w0[1] = w0[2];
4063 w0[2] = w0[3];
4064 w0[3] = w1[0];
4065 w1[0] = w1[1];
4066 w1[1] = w1[2];
4067 w1[2] = w1[3];
4068 w1[3] = w2[0];
4069 w2[0] = w2[1];
4070 w2[1] = w2[2];
4071 w2[2] = w2[3];
4072 w2[3] = w3[0];
4073 w3[0] = w3[1];
4074 w3[1] = w3[2];
4075 w3[2] = 0;
4076 }
4077
4078 break;
4079
4080 case 2: /* offset 8..11 */
4081 w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4);
4082 w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4083 w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4084 w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4085 w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4086 w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4087 w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4088 w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4089 w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4090 w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4091 w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4092 w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4093 w0[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4094 w0[1] = 0;
4095 w0[0] = 0;
4096
4097 if (offset_mod_4 == 0)
4098 {
4099 w0[2] = w0[3];
4100 w0[3] = w1[0];
4101 w1[0] = w1[1];
4102 w1[1] = w1[2];
4103 w1[2] = w1[3];
4104 w1[3] = w2[0];
4105 w2[0] = w2[1];
4106 w2[1] = w2[2];
4107 w2[2] = w2[3];
4108 w2[3] = w3[0];
4109 w3[0] = w3[1];
4110 w3[1] = w3[2];
4111 w3[2] = 0;
4112 }
4113
4114 break;
4115
4116 case 3: /* offset 12..15 */
4117 w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4);
4118 w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4119 w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4120 w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4121 w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4122 w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4123 w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4124 w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4125 w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4126 w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4127 w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4128 w0[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4129 w0[2] = 0;
4130 w0[1] = 0;
4131 w0[0] = 0;
4132
4133 if (offset_mod_4 == 0)
4134 {
4135 w0[3] = w1[0];
4136 w1[0] = w1[1];
4137 w1[1] = w1[2];
4138 w1[2] = w1[3];
4139 w1[3] = w2[0];
4140 w2[0] = w2[1];
4141 w2[1] = w2[2];
4142 w2[2] = w2[3];
4143 w2[3] = w3[0];
4144 w3[0] = w3[1];
4145 w3[1] = w3[2];
4146 w3[2] = 0;
4147 }
4148
4149 break;
4150
4151 case 4: /* offset 16..19 */
4152 w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4);
4153 w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4154 w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4155 w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4156 w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4157 w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4158 w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4159 w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4160 w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4161 w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4162 w1[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4163 w0[3] = 0;
4164 w0[2] = 0;
4165 w0[1] = 0;
4166 w0[0] = 0;
4167
4168 if (offset_mod_4 == 0)
4169 {
4170 w1[0] = w1[1];
4171 w1[1] = w1[2];
4172 w1[2] = w1[3];
4173 w1[3] = w2[0];
4174 w2[0] = w2[1];
4175 w2[1] = w2[2];
4176 w2[2] = w2[3];
4177 w2[3] = w3[0];
4178 w3[0] = w3[1];
4179 w3[1] = w3[2];
4180 w3[2] = 0;
4181 }
4182
4183 break;
4184
4185 case 5: /* offset 20..23 */
4186 w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4);
4187 w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4188 w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4189 w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4190 w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4191 w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4192 w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4193 w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4194 w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4195 w1[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4196 w1[0] = 0;
4197 w0[3] = 0;
4198 w0[2] = 0;
4199 w0[1] = 0;
4200 w0[0] = 0;
4201
4202 if (offset_mod_4 == 0)
4203 {
4204 w1[1] = w1[2];
4205 w1[2] = w1[3];
4206 w1[3] = w2[0];
4207 w2[0] = w2[1];
4208 w2[1] = w2[2];
4209 w2[2] = w2[3];
4210 w2[3] = w3[0];
4211 w3[0] = w3[1];
4212 w3[1] = w3[2];
4213 w3[2] = 0;
4214 }
4215
4216 break;
4217
4218 case 6: /* offset 24..27 */
4219 w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4);
4220 w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4221 w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4222 w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4223 w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4224 w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4225 w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4226 w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4227 w1[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4228 w1[1] = 0;
4229 w1[0] = 0;
4230 w0[3] = 0;
4231 w0[2] = 0;
4232 w0[1] = 0;
4233 w0[0] = 0;
4234
4235 if (offset_mod_4 == 0)
4236 {
4237 w1[2] = w1[3];
4238 w1[3] = w2[0];
4239 w2[0] = w2[1];
4240 w2[1] = w2[2];
4241 w2[2] = w2[3];
4242 w2[3] = w3[0];
4243 w3[0] = w3[1];
4244 w3[1] = w3[2];
4245 w3[2] = 0;
4246 }
4247
4248 break;
4249
4250 case 7: /* offset 28..31 */
4251 w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4);
4252 w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4253 w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4254 w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4255 w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4256 w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4257 w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4258 w1[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4259 w1[2] = 0;
4260 w1[1] = 0;
4261 w1[0] = 0;
4262 w0[3] = 0;
4263 w0[2] = 0;
4264 w0[1] = 0;
4265 w0[0] = 0;
4266
4267 if (offset_mod_4 == 0)
4268 {
4269 w1[3] = w2[0];
4270 w2[0] = w2[1];
4271 w2[1] = w2[2];
4272 w2[2] = w2[3];
4273 w2[3] = w3[0];
4274 w3[0] = w3[1];
4275 w3[1] = w3[2];
4276 w3[2] = 0;
4277 }
4278
4279 break;
4280
4281 case 8: /* offset 32..35 */
4282 w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4);
4283 w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4284 w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4285 w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4286 w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4287 w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4288 w2[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4289 w1[3] = 0;
4290 w1[2] = 0;
4291 w1[1] = 0;
4292 w1[0] = 0;
4293 w0[3] = 0;
4294 w0[2] = 0;
4295 w0[1] = 0;
4296 w0[0] = 0;
4297
4298 if (offset_mod_4 == 0)
4299 {
4300 w2[0] = w2[1];
4301 w2[1] = w2[2];
4302 w2[2] = w2[3];
4303 w2[3] = w3[0];
4304 w3[0] = w3[1];
4305 w3[1] = w3[2];
4306 w3[2] = 0;
4307 }
4308
4309 break;
4310
4311 case 9: /* offset 36..39 */
4312 w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4);
4313 w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4314 w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4315 w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4316 w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4317 w2[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4318 w2[0] = 0;
4319 w1[3] = 0;
4320 w1[2] = 0;
4321 w1[1] = 0;
4322 w1[0] = 0;
4323 w0[3] = 0;
4324 w0[2] = 0;
4325 w0[1] = 0;
4326 w0[0] = 0;
4327
4328 if (offset_mod_4 == 0)
4329 {
4330 w2[1] = w2[2];
4331 w2[2] = w2[3];
4332 w2[3] = w3[0];
4333 w3[0] = w3[1];
4334 w3[1] = w3[2];
4335 w3[2] = 0;
4336 }
4337
4338 break;
4339
4340 case 10: /* offset 40..43 */
4341 w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4);
4342 w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4343 w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4344 w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4345 w2[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4346 w2[1] = 0;
4347 w2[0] = 0;
4348 w1[3] = 0;
4349 w1[2] = 0;
4350 w1[1] = 0;
4351 w1[0] = 0;
4352 w0[3] = 0;
4353 w0[2] = 0;
4354 w0[1] = 0;
4355 w0[0] = 0;
4356
4357 if (offset_mod_4 == 0)
4358 {
4359 w2[2] = w2[3];
4360 w2[3] = w3[0];
4361 w3[0] = w3[1];
4362 w3[1] = w3[2];
4363 w3[2] = 0;
4364 }
4365
4366 break;
4367
4368 case 11: /* offset 44..47 */
4369 w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4);
4370 w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4371 w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4372 w2[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4373 w2[2] = 0;
4374 w2[1] = 0;
4375 w2[0] = 0;
4376 w1[3] = 0;
4377 w1[2] = 0;
4378 w1[1] = 0;
4379 w1[0] = 0;
4380 w0[3] = 0;
4381 w0[2] = 0;
4382 w0[1] = 0;
4383 w0[0] = 0;
4384
4385 if (offset_mod_4 == 0)
4386 {
4387 w2[3] = w3[0];
4388 w3[0] = w3[1];
4389 w3[1] = w3[2];
4390 w3[2] = 0;
4391 }
4392
4393 break;
4394
4395 case 12: /* offset 48..51 */
4396 w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4);
4397 w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4398 w3[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4399 w2[3] = 0;
4400 w2[2] = 0;
4401 w2[1] = 0;
4402 w2[0] = 0;
4403 w1[3] = 0;
4404 w1[2] = 0;
4405 w1[1] = 0;
4406 w1[0] = 0;
4407 w0[3] = 0;
4408 w0[2] = 0;
4409 w0[1] = 0;
4410 w0[0] = 0;
4411
4412 if (offset_mod_4 == 0)
4413 {
4414 w3[0] = w3[1];
4415 w3[1] = w3[2];
4416 w3[2] = 0;
4417 }
4418
4419 break;
4420
4421 case 13: /* offset 52..55 */
4422 w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4);
4423 w3[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4424 w3[0] = 0;
4425 w2[3] = 0;
4426 w2[2] = 0;
4427 w2[1] = 0;
4428 w2[0] = 0;
4429 w1[3] = 0;
4430 w1[2] = 0;
4431 w1[1] = 0;
4432 w1[0] = 0;
4433 w0[3] = 0;
4434 w0[2] = 0;
4435 w0[1] = 0;
4436 w0[0] = 0;
4437
4438 if (offset_mod_4 == 0)
4439 {
4440 w3[1] = w3[2];
4441 w3[2] = 0;
4442 }
4443
4444 break;
4445 }
4446 #endif
4447
4448 #ifdef IS_NV
/* here the value is in the range 1..4 (4 when offset is a multiple of 4) */
4449 const int offset_minus_4 = 4 - (offset % 4);
4450
/*
 * Four consecutive nibbles taken out of 0x76543210:
 * offset % 4 == 0 -> 0x7654, 1 -> 0x6543, 2 -> 0x5432, 3 -> 0x4321.
 * NOTE(review): presumably each nibble picks one byte out of the two words
 * passed to __byte_perm - confirm against the __byte_perm documentation.
 */
4451 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
4452
/* unlike the amd_bytealign path above, this path never writes w3[2] */
4453 switch (offset / 4)
4454 {
4455 case 0: /* offset 0..3 */
4456 w3[1] = __byte_perm (w3[0], w3[1], selector);
4457 w3[0] = __byte_perm (w2[3], w3[0], selector);
4458 w2[3] = __byte_perm (w2[2], w2[3], selector);
4459 w2[2] = __byte_perm (w2[1], w2[2], selector);
4460 w2[1] = __byte_perm (w2[0], w2[1], selector);
4461 w2[0] = __byte_perm (w1[3], w2[0], selector);
4462 w1[3] = __byte_perm (w1[2], w1[3], selector);
4463 w1[2] = __byte_perm (w1[1], w1[2], selector);
4464 w1[1] = __byte_perm (w1[0], w1[1], selector);
4465 w1[0] = __byte_perm (w0[3], w1[0], selector);
4466 w0[3] = __byte_perm (w0[2], w0[3], selector);
4467 w0[2] = __byte_perm (w0[1], w0[2], selector);
4468 w0[1] = __byte_perm (w0[0], w0[1], selector);
4469 w0[0] = __byte_perm ( 0, w0[0], selector);
4470
4471 break;
4472
4473 case 1: /* offset 4..7 */
4474 w3[1] = __byte_perm (w2[3], w3[0], selector);
4475 w3[0] = __byte_perm (w2[2], w2[3], selector);
4476 w2[3] = __byte_perm (w2[1], w2[2], selector);
4477 w2[2] = __byte_perm (w2[0], w2[1], selector);
4478 w2[1] = __byte_perm (w1[3], w2[0], selector);
4479 w2[0] = __byte_perm (w1[2], w1[3], selector);
4480 w1[3] = __byte_perm (w1[1], w1[2], selector);
4481 w1[2] = __byte_perm (w1[0], w1[1], selector);
4482 w1[1] = __byte_perm (w0[3], w1[0], selector);
4483 w1[0] = __byte_perm (w0[2], w0[3], selector);
4484 w0[3] = __byte_perm (w0[1], w0[2], selector);
4485 w0[2] = __byte_perm (w0[0], w0[1], selector);
4486 w0[1] = __byte_perm ( 0, w0[0], selector);
4487 w0[0] = 0;
4488
4489 break;
4490
4491 case 2: /* offset 8..11 */
4492 w3[1] = __byte_perm (w2[2], w2[3], selector);
4493 w3[0] = __byte_perm (w2[1], w2[2], selector);
4494 w2[3] = __byte_perm (w2[0], w2[1], selector);
4495 w2[2] = __byte_perm (w1[3], w2[0], selector);
4496 w2[1] = __byte_perm (w1[2], w1[3], selector);
4497 w2[0] = __byte_perm (w1[1], w1[2], selector);
4498 w1[3] = __byte_perm (w1[0], w1[1], selector);
4499 w1[2] = __byte_perm (w0[3], w1[0], selector);
4500 w1[1] = __byte_perm (w0[2], w0[3], selector);
4501 w1[0] = __byte_perm (w0[1], w0[2], selector);
4502 w0[3] = __byte_perm (w0[0], w0[1], selector);
4503 w0[2] = __byte_perm ( 0, w0[0], selector);
4504 w0[1] = 0;
4505 w0[0] = 0;
4506
4507 break;
4508
4509 case 3: /* offset 12..15 */
4510 w3[1] = __byte_perm (w2[1], w2[2], selector);
4511 w3[0] = __byte_perm (w2[0], w2[1], selector);
4512 w2[3] = __byte_perm (w1[3], w2[0], selector);
4513 w2[2] = __byte_perm (w1[2], w1[3], selector);
4514 w2[1] = __byte_perm (w1[1], w1[2], selector);
4515 w2[0] = __byte_perm (w1[0], w1[1], selector);
4516 w1[3] = __byte_perm (w0[3], w1[0], selector);
4517 w1[2] = __byte_perm (w0[2], w0[3], selector);
4518 w1[1] = __byte_perm (w0[1], w0[2], selector);
4519 w1[0] = __byte_perm (w0[0], w0[1], selector);
4520 w0[3] = __byte_perm ( 0, w0[0], selector);
4521 w0[2] = 0;
4522 w0[1] = 0;
4523 w0[0] = 0;
4524
4525 break;
4526
4527 case 4: /* offset 16..19 */
4528 w3[1] = __byte_perm (w2[0], w2[1], selector);
4529 w3[0] = __byte_perm (w1[3], w2[0], selector);
4530 w2[3] = __byte_perm (w1[2], w1[3], selector);
4531 w2[2] = __byte_perm (w1[1], w1[2], selector);
4532 w2[1] = __byte_perm (w1[0], w1[1], selector);
4533 w2[0] = __byte_perm (w0[3], w1[0], selector);
4534 w1[3] = __byte_perm (w0[2], w0[3], selector);
4535 w1[2] = __byte_perm (w0[1], w0[2], selector);
4536 w1[1] = __byte_perm (w0[0], w0[1], selector);
4537 w1[0] = __byte_perm ( 0, w0[0], selector);
4538 w0[3] = 0;
4539 w0[2] = 0;
4540 w0[1] = 0;
4541 w0[0] = 0;
4542
4543 break;
4544
4545 case 5: /* offset 20..23 */
4546 w3[1] = __byte_perm (w1[3], w2[0], selector);
4547 w3[0] = __byte_perm (w1[2], w1[3], selector);
4548 w2[3] = __byte_perm (w1[1], w1[2], selector);
4549 w2[2] = __byte_perm (w1[0], w1[1], selector);
4550 w2[1] = __byte_perm (w0[3], w1[0], selector);
4551 w2[0] = __byte_perm (w0[2], w0[3], selector);
4552 w1[3] = __byte_perm (w0[1], w0[2], selector);
4553 w1[2] = __byte_perm (w0[0], w0[1], selector);
4554 w1[1] = __byte_perm ( 0, w0[0], selector);
4555 w1[0] = 0;
4556 w0[3] = 0;
4557 w0[2] = 0;
4558 w0[1] = 0;
4559 w0[0] = 0;
4560
4561 break;
4562
4563 case 6: /* offset 24..27 */
4564 w3[1] = __byte_perm (w1[2], w1[3], selector);
4565 w3[0] = __byte_perm (w1[1], w1[2], selector);
4566 w2[3] = __byte_perm (w1[0], w1[1], selector);
4567 w2[2] = __byte_perm (w0[3], w1[0], selector);
4568 w2[1] = __byte_perm (w0[2], w0[3], selector);
4569 w2[0] = __byte_perm (w0[1], w0[2], selector);
4570 w1[3] = __byte_perm (w0[0], w0[1], selector);
4571 w1[2] = __byte_perm ( 0, w0[0], selector);
4572 w1[1] = 0;
4573 w1[0] = 0;
4574 w0[3] = 0;
4575 w0[2] = 0;
4576 w0[1] = 0;
4577 w0[0] = 0;
4578
4579 break;
4580
4581 case 7: /* offset 28..31 */
4582 w3[1] = __byte_perm (w1[1], w1[2], selector);
4583 w3[0] = __byte_perm (w1[0], w1[1], selector);
4584 w2[3] = __byte_perm (w0[3], w1[0], selector);
4585 w2[2] = __byte_perm (w0[2], w0[3], selector);
4586 w2[1] = __byte_perm (w0[1], w0[2], selector);
4587 w2[0] = __byte_perm (w0[0], w0[1], selector);
4588 w1[3] = __byte_perm ( 0, w0[0], selector);
4589 w1[2] = 0;
4590 w1[1] = 0;
4591 w1[0] = 0;
4592 w0[3] = 0;
4593 w0[2] = 0;
4594 w0[1] = 0;
4595 w0[0] = 0;
4596
4597 break;
4598
4599 case 8: /* offset 32..35 */
4600 w3[1] = __byte_perm (w1[0], w1[1], selector);
4601 w3[0] = __byte_perm (w0[3], w1[0], selector);
4602 w2[3] = __byte_perm (w0[2], w0[3], selector);
4603 w2[2] = __byte_perm (w0[1], w0[2], selector);
4604 w2[1] = __byte_perm (w0[0], w0[1], selector);
4605 w2[0] = __byte_perm ( 0, w0[0], selector);
4606 w1[3] = 0;
4607 w1[2] = 0;
4608 w1[1] = 0;
4609 w1[0] = 0;
4610 w0[3] = 0;
4611 w0[2] = 0;
4612 w0[1] = 0;
4613 w0[0] = 0;
4614
4615 break;
4616
4617 case 9: /* offset 36..39 */
4618 w3[1] = __byte_perm (w0[3], w1[0], selector);
4619 w3[0] = __byte_perm (w0[2], w0[3], selector);
4620 w2[3] = __byte_perm (w0[1], w0[2], selector);
4621 w2[2] = __byte_perm (w0[0], w0[1], selector);
4622 w2[1] = __byte_perm ( 0, w0[0], selector);
4623 w2[0] = 0;
4624 w1[3] = 0;
4625 w1[2] = 0;
4626 w1[1] = 0;
4627 w1[0] = 0;
4628 w0[3] = 0;
4629 w0[2] = 0;
4630 w0[1] = 0;
4631 w0[0] = 0;
4632
4633 break;
4634
4635 case 10: /* offset 40..43 */
4636 w3[1] = __byte_perm (w0[2], w0[3], selector);
4637 w3[0] = __byte_perm (w0[1], w0[2], selector);
4638 w2[3] = __byte_perm (w0[0], w0[1], selector);
4639 w2[2] = __byte_perm ( 0, w0[0], selector);
4640 w2[1] = 0;
4641 w2[0] = 0;
4642 w1[3] = 0;
4643 w1[2] = 0;
4644 w1[1] = 0;
4645 w1[0] = 0;
4646 w0[3] = 0;
4647 w0[2] = 0;
4648 w0[1] = 0;
4649 w0[0] = 0;
4650
4651 break;
4652
4653 case 11: /* offset 44..47 */
4654 w3[1] = __byte_perm (w0[1], w0[2], selector);
4655 w3[0] = __byte_perm (w0[0], w0[1], selector);
4656 w2[3] = __byte_perm ( 0, w0[0], selector);
4657 w2[2] = 0;
4658 w2[1] = 0;
4659 w2[0] = 0;
4660 w1[3] = 0;
4661 w1[2] = 0;
4662 w1[1] = 0;
4663 w1[0] = 0;
4664 w0[3] = 0;
4665 w0[2] = 0;
4666 w0[1] = 0;
4667 w0[0] = 0;
4668
4669 break;
4670
4671 case 12: /* offset 48..51 */
4672 w3[1] = __byte_perm (w0[0], w0[1], selector);
4673 w3[0] = __byte_perm ( 0, w0[0], selector);
4674 w2[3] = 0;
4675 w2[2] = 0;
4676 w2[1] = 0;
4677 w2[0] = 0;
4678 w1[3] = 0;
4679 w1[2] = 0;
4680 w1[1] = 0;
4681 w1[0] = 0;
4682 w0[3] = 0;
4683 w0[2] = 0;
4684 w0[1] = 0;
4685 w0[0] = 0;
4686
4687 break;
4688
4689 case 13: /* offset 52..55 */
4690 w3[1] = __byte_perm ( 0, w0[0], selector);
4691 w3[0] = 0;
4692 w2[3] = 0;
4693 w2[2] = 0;
4694 w2[1] = 0;
4695 w2[0] = 0;
4696 w1[3] = 0;
4697 w1[2] = 0;
4698 w1[1] = 0;
4699 w1[0] = 0;
4700 w0[3] = 0;
4701 w0[2] = 0;
4702 w0[1] = 0;
4703 w0[0] = 0;
4704
4705 break;
4706 }
4707 #endif
4708 }
4709
/**
 * Moves the contents of the 16-word buffer w0[0..3], w1[0..3], w2[0..3],
 * w3[0..3] toward higher word indices by `offset` bytes, in place.
 *
 * offset / 4 selects how many whole words the data moves (cases 0..13); the
 * remaining sub-word part is handled by combining two neighbouring source
 * words into each destination word. Words below the new start are set to 0.
 * There is no default case, so for offset >= 56 the buffers are left as is.
 *
 * Two alternative bodies are selected by the preprocessor:
 *  - IS_AMD / IS_GENERIC: built on amd_bytealign, writes w0[0] .. w3[2]
 *  - IS_NV: built on __byte_perm, writes w0[0] .. w3[1]
 * w3[3] is never written by either body; data moved past the highest written
 * word is dropped.
 *
 * In every case the assignments run from the highest word down to the lowest
 * so that each source word is read before it is overwritten.
 *
 * NOTE(review): the "_be" suffix presumably means the words hold big-endian
 * packed bytes; the byte direction inside a word depends on amd_bytealign /
 * __byte_perm, which are defined outside this function - confirm there.
 */
4710 inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
4711 {
4712 #if defined IS_AMD || defined IS_GENERIC
/*
 * The full, unreduced offset is passed as the third argument of
 * amd_bytealign in every case below.
 * NOTE(review): presumably amd_bytealign only looks at the low two bits of
 * that argument - confirm against its definition.
 */
4713 switch (offset / 4)
4714 {
4715 case 0: /* offset 0..3 */
4716 w3[2] = amd_bytealign (w3[1], 0, offset);
4717 w3[1] = amd_bytealign (w3[0], w3[1], offset);
4718 w3[0] = amd_bytealign (w2[3], w3[0], offset);
4719 w2[3] = amd_bytealign (w2[2], w2[3], offset);
4720 w2[2] = amd_bytealign (w2[1], w2[2], offset);
4721 w2[1] = amd_bytealign (w2[0], w2[1], offset);
4722 w2[0] = amd_bytealign (w1[3], w2[0], offset);
4723 w1[3] = amd_bytealign (w1[2], w1[3], offset);
4724 w1[2] = amd_bytealign (w1[1], w1[2], offset);
4725 w1[1] = amd_bytealign (w1[0], w1[1], offset);
4726 w1[0] = amd_bytealign (w0[3], w1[0], offset);
4727 w0[3] = amd_bytealign (w0[2], w0[3], offset);
4728 w0[2] = amd_bytealign (w0[1], w0[2], offset);
4729 w0[1] = amd_bytealign (w0[0], w0[1], offset);
4730 w0[0] = amd_bytealign ( 0, w0[0], offset);
4731 break;
4732
4733 case 1: /* offset 4..7 */
4734 w3[2] = amd_bytealign (w3[0], 0, offset);
4735 w3[1] = amd_bytealign (w2[3], w3[0], offset);
4736 w3[0] = amd_bytealign (w2[2], w2[3], offset);
4737 w2[3] = amd_bytealign (w2[1], w2[2], offset);
4738 w2[2] = amd_bytealign (w2[0], w2[1], offset);
4739 w2[1] = amd_bytealign (w1[3], w2[0], offset);
4740 w2[0] = amd_bytealign (w1[2], w1[3], offset);
4741 w1[3] = amd_bytealign (w1[1], w1[2], offset);
4742 w1[2] = amd_bytealign (w1[0], w1[1], offset);
4743 w1[1] = amd_bytealign (w0[3], w1[0], offset);
4744 w1[0] = amd_bytealign (w0[2], w0[3], offset);
4745 w0[3] = amd_bytealign (w0[1], w0[2], offset);
4746 w0[2] = amd_bytealign (w0[0], w0[1], offset);
4747 w0[1] = amd_bytealign ( 0, w0[0], offset);
4748 w0[0] = 0;
4749 break;
4750
4751 case 2: /* offset 8..11 */
4752 w3[2] = amd_bytealign (w2[3], 0, offset);
4753 w3[1] = amd_bytealign (w2[2], w2[3], offset);
4754 w3[0] = amd_bytealign (w2[1], w2[2], offset);
4755 w2[3] = amd_bytealign (w2[0], w2[1], offset);
4756 w2[2] = amd_bytealign (w1[3], w2[0], offset);
4757 w2[1] = amd_bytealign (w1[2], w1[3], offset);
4758 w2[0] = amd_bytealign (w1[1], w1[2], offset);
4759 w1[3] = amd_bytealign (w1[0], w1[1], offset);
4760 w1[2] = amd_bytealign (w0[3], w1[0], offset);
4761 w1[1] = amd_bytealign (w0[2], w0[3], offset);
4762 w1[0] = amd_bytealign (w0[1], w0[2], offset);
4763 w0[3] = amd_bytealign (w0[0], w0[1], offset);
4764 w0[2] = amd_bytealign ( 0, w0[0], offset);
4765 w0[1] = 0;
4766 w0[0] = 0;
4767 break;
4768
4769 case 3: /* offset 12..15 */
4770 w3[2] = amd_bytealign (w2[2], 0, offset);
4771 w3[1] = amd_bytealign (w2[1], w2[2], offset);
4772 w3[0] = amd_bytealign (w2[0], w2[1], offset);
4773 w2[3] = amd_bytealign (w1[3], w2[0], offset);
4774 w2[2] = amd_bytealign (w1[2], w1[3], offset);
4775 w2[1] = amd_bytealign (w1[1], w1[2], offset);
4776 w2[0] = amd_bytealign (w1[0], w1[1], offset);
4777 w1[3] = amd_bytealign (w0[3], w1[0], offset);
4778 w1[2] = amd_bytealign (w0[2], w0[3], offset);
4779 w1[1] = amd_bytealign (w0[1], w0[2], offset);
4780 w1[0] = amd_bytealign (w0[0], w0[1], offset);
4781 w0[3] = amd_bytealign ( 0, w0[0], offset);
4782 w0[2] = 0;
4783 w0[1] = 0;
4784 w0[0] = 0;
4785 break;
4786
4787 case 4: /* offset 16..19 */
4788 w3[2] = amd_bytealign (w2[1], 0, offset);
4789 w3[1] = amd_bytealign (w2[0], w2[1], offset);
4790 w3[0] = amd_bytealign (w1[3], w2[0], offset);
4791 w2[3] = amd_bytealign (w1[2], w1[3], offset);
4792 w2[2] = amd_bytealign (w1[1], w1[2], offset);
4793 w2[1] = amd_bytealign (w1[0], w1[1], offset);
4794 w2[0] = amd_bytealign (w0[3], w1[0], offset);
4795 w1[3] = amd_bytealign (w0[2], w0[3], offset);
4796 w1[2] = amd_bytealign (w0[1], w0[2], offset);
4797 w1[1] = amd_bytealign (w0[0], w0[1], offset);
4798 w1[0] = amd_bytealign ( 0, w0[0], offset);
4799 w0[3] = 0;
4800 w0[2] = 0;
4801 w0[1] = 0;
4802 w0[0] = 0;
4803 break;
4804
4805 case 5: /* offset 20..23 */
4806 w3[2] = amd_bytealign (w2[0], 0, offset);
4807 w3[1] = amd_bytealign (w1[3], w2[0], offset);
4808 w3[0] = amd_bytealign (w1[2], w1[3], offset);
4809 w2[3] = amd_bytealign (w1[1], w1[2], offset);
4810 w2[2] = amd_bytealign (w1[0], w1[1], offset);
4811 w2[1] = amd_bytealign (w0[3], w1[0], offset);
4812 w2[0] = amd_bytealign (w0[2], w0[3], offset);
4813 w1[3] = amd_bytealign (w0[1], w0[2], offset);
4814 w1[2] = amd_bytealign (w0[0], w0[1], offset);
4815 w1[1] = amd_bytealign ( 0, w0[0], offset);
4816 w1[0] = 0;
4817 w0[3] = 0;
4818 w0[2] = 0;
4819 w0[1] = 0;
4820 w0[0] = 0;
4821 break;
4822
4823 case 6: /* offset 24..27 */
4824 w3[2] = amd_bytealign (w1[3], 0, offset);
4825 w3[1] = amd_bytealign (w1[2], w1[3], offset);
4826 w3[0] = amd_bytealign (w1[1], w1[2], offset);
4827 w2[3] = amd_bytealign (w1[0], w1[1], offset);
4828 w2[2] = amd_bytealign (w0[3], w1[0], offset);
4829 w2[1] = amd_bytealign (w0[2], w0[3], offset);
4830 w2[0] = amd_bytealign (w0[1], w0[2], offset);
4831 w1[3] = amd_bytealign (w0[0], w0[1], offset);
4832 w1[2] = amd_bytealign ( 0, w0[0], offset);
4833 w1[1] = 0;
4834 w1[0] = 0;
4835 w0[3] = 0;
4836 w0[2] = 0;
4837 w0[1] = 0;
4838 w0[0] = 0;
4839 break;
4840
4841 case 7: /* offset 28..31 */
4842 w3[2] = amd_bytealign (w1[2], 0, offset);
4843 w3[1] = amd_bytealign (w1[1], w1[2], offset);
4844 w3[0] = amd_bytealign (w1[0], w1[1], offset);
4845 w2[3] = amd_bytealign (w0[3], w1[0], offset);
4846 w2[2] = amd_bytealign (w0[2], w0[3], offset);
4847 w2[1] = amd_bytealign (w0[1], w0[2], offset);
4848 w2[0] = amd_bytealign (w0[0], w0[1], offset);
4849 w1[3] = amd_bytealign ( 0, w0[0], offset);
4850 w1[2] = 0;
4851 w1[1] = 0;
4852 w1[0] = 0;
4853 w0[3] = 0;
4854 w0[2] = 0;
4855 w0[1] = 0;
4856 w0[0] = 0;
4857 break;
4858
4859 case 8: /* offset 32..35 */
4860 w3[2] = amd_bytealign (w1[1], 0, offset);
4861 w3[1] = amd_bytealign (w1[0], w1[1], offset);
4862 w3[0] = amd_bytealign (w0[3], w1[0], offset);
4863 w2[3] = amd_bytealign (w0[2], w0[3], offset);
4864 w2[2] = amd_bytealign (w0[1], w0[2], offset);
4865 w2[1] = amd_bytealign (w0[0], w0[1], offset);
4866 w2[0] = amd_bytealign ( 0, w0[0], offset);
4867 w1[3] = 0;
4868 w1[2] = 0;
4869 w1[1] = 0;
4870 w1[0] = 0;
4871 w0[3] = 0;
4872 w0[2] = 0;
4873 w0[1] = 0;
4874 w0[0] = 0;
4875 break;
4876
4877 case 9: /* offset 36..39 */
4878 w3[2] = amd_bytealign (w1[0], 0, offset);
4879 w3[1] = amd_bytealign (w0[3], w1[0], offset);
4880 w3[0] = amd_bytealign (w0[2], w0[3], offset);
4881 w2[3] = amd_bytealign (w0[1], w0[2], offset);
4882 w2[2] = amd_bytealign (w0[0], w0[1], offset);
4883 w2[1] = amd_bytealign ( 0, w0[0], offset);
4884 w2[0] = 0;
4885 w1[3] = 0;
4886 w1[2] = 0;
4887 w1[1] = 0;
4888 w1[0] = 0;
4889 w0[3] = 0;
4890 w0[2] = 0;
4891 w0[1] = 0;
4892 w0[0] = 0;
4893 break;
4894
4895 case 10: /* offset 40..43 */
4896 w3[2] = amd_bytealign (w0[3], 0, offset);
4897 w3[1] = amd_bytealign (w0[2], w0[3], offset);
4898 w3[0] = amd_bytealign (w0[1], w0[2], offset);
4899 w2[3] = amd_bytealign (w0[0], w0[1], offset);
4900 w2[2] = amd_bytealign ( 0, w0[0], offset);
4901 w2[1] = 0;
4902 w2[0] = 0;
4903 w1[3] = 0;
4904 w1[2] = 0;
4905 w1[1] = 0;
4906 w1[0] = 0;
4907 w0[3] = 0;
4908 w0[2] = 0;
4909 w0[1] = 0;
4910 w0[0] = 0;
4911 break;
4912
4913 case 11: /* offset 44..47 */
4914 w3[2] = amd_bytealign (w0[2], 0, offset);
4915 w3[1] = amd_bytealign (w0[1], w0[2], offset);
4916 w3[0] = amd_bytealign (w0[0], w0[1], offset);
4917 w2[3] = amd_bytealign ( 0, w0[0], offset);
4918 w2[2] = 0;
4919 w2[1] = 0;
4920 w2[0] = 0;
4921 w1[3] = 0;
4922 w1[2] = 0;
4923 w1[1] = 0;
4924 w1[0] = 0;
4925 w0[3] = 0;
4926 w0[2] = 0;
4927 w0[1] = 0;
4928 w0[0] = 0;
4929 break;
4930
4931 case 12: /* offset 48..51 */
4932 w3[2] = amd_bytealign (w0[1], 0, offset);
4933 w3[1] = amd_bytealign (w0[0], w0[1], offset);
4934 w3[0] = amd_bytealign ( 0, w0[0], offset);
4935 w2[3] = 0;
4936 w2[2] = 0;
4937 w2[1] = 0;
4938 w2[0] = 0;
4939 w1[3] = 0;
4940 w1[2] = 0;
4941 w1[1] = 0;
4942 w1[0] = 0;
4943 w0[3] = 0;
4944 w0[2] = 0;
4945 w0[1] = 0;
4946 w0[0] = 0;
4947 break;
4948
4949 case 13: /* offset 52..55 */
4950 w3[2] = amd_bytealign (w0[0], 0, offset);
4951 w3[1] = amd_bytealign ( 0, w0[0], offset);
4952 w3[0] = 0;
4953 w2[3] = 0;
4954 w2[2] = 0;
4955 w2[1] = 0;
4956 w2[0] = 0;
4957 w1[3] = 0;
4958 w1[2] = 0;
4959 w1[1] = 0;
4960 w1[0] = 0;
4961 w0[3] = 0;
4962 w0[2] = 0;
4963 w0[1] = 0;
4964 w0[0] = 0;
4965 break;
4966 }
4967 #endif
4968
4969 #ifdef IS_NV
/*
 * Four consecutive nibbles taken out of 0x76543210:
 * offset & 3 == 0 -> 0x3210, 1 -> 0x4321, 2 -> 0x5432, 3 -> 0x6543.
 * NOTE(review): presumably each nibble picks one byte out of the two words
 * passed to __byte_perm - confirm against the __byte_perm documentation.
 */
4970 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
4971
/* unlike the amd_bytealign path above, this path never writes w3[2] */
4972 switch (offset / 4)
4973 {
4974 case 0: /* offset 0..3 */
4975 w3[1] = __byte_perm (w3[1], w3[0], selector);
4976 w3[0] = __byte_perm (w3[0], w2[3], selector);
4977 w2[3] = __byte_perm (w2[3], w2[2], selector);
4978 w2[2] = __byte_perm (w2[2], w2[1], selector);
4979 w2[1] = __byte_perm (w2[1], w2[0], selector);
4980 w2[0] = __byte_perm (w2[0], w1[3], selector);
4981 w1[3] = __byte_perm (w1[3], w1[2], selector);
4982 w1[2] = __byte_perm (w1[2], w1[1], selector);
4983 w1[1] = __byte_perm (w1[1], w1[0], selector);
4984 w1[0] = __byte_perm (w1[0], w0[3], selector);
4985 w0[3] = __byte_perm (w0[3], w0[2], selector);
4986 w0[2] = __byte_perm (w0[2], w0[1], selector);
4987 w0[1] = __byte_perm (w0[1], w0[0], selector);
4988 w0[0] = __byte_perm (w0[0], 0, selector);
4989 break;
4990
4991 case 1: /* offset 4..7 */
4992 w3[1] = __byte_perm (w3[0], w2[3], selector);
4993 w3[0] = __byte_perm (w2[3], w2[2], selector);
4994 w2[3] = __byte_perm (w2[2], w2[1], selector);
4995 w2[2] = __byte_perm (w2[1], w2[0], selector);
4996 w2[1] = __byte_perm (w2[0], w1[3], selector);
4997 w2[0] = __byte_perm (w1[3], w1[2], selector);
4998 w1[3] = __byte_perm (w1[2], w1[1], selector);
4999 w1[2] = __byte_perm (w1[1], w1[0], selector);
5000 w1[1] = __byte_perm (w1[0], w0[3], selector);
5001 w1[0] = __byte_perm (w0[3], w0[2], selector);
5002 w0[3] = __byte_perm (w0[2], w0[1], selector);
5003 w0[2] = __byte_perm (w0[1], w0[0], selector);
5004 w0[1] = __byte_perm (w0[0], 0, selector);
5005 w0[0] = 0;
5006 break;
5007
5008 case 2: /* offset 8..11 */
5009 w3[1] = __byte_perm (w2[3], w2[2], selector);
5010 w3[0] = __byte_perm (w2[2], w2[1], selector);
5011 w2[3] = __byte_perm (w2[1], w2[0], selector);
5012 w2[2] = __byte_perm (w2[0], w1[3], selector);
5013 w2[1] = __byte_perm (w1[3], w1[2], selector);
5014 w2[0] = __byte_perm (w1[2], w1[1], selector);
5015 w1[3] = __byte_perm (w1[1], w1[0], selector);
5016 w1[2] = __byte_perm (w1[0], w0[3], selector);
5017 w1[1] = __byte_perm (w0[3], w0[2], selector);
5018 w1[0] = __byte_perm (w0[2], w0[1], selector);
5019 w0[3] = __byte_perm (w0[1], w0[0], selector);
5020 w0[2] = __byte_perm (w0[0], 0, selector);
5021 w0[1] = 0;
5022 w0[0] = 0;
5023 break;
5024
5025 case 3: /* offset 12..15 */
5026 w3[1] = __byte_perm (w2[2], w2[1], selector);
5027 w3[0] = __byte_perm (w2[1], w2[0], selector);
5028 w2[3] = __byte_perm (w2[0], w1[3], selector);
5029 w2[2] = __byte_perm (w1[3], w1[2], selector);
5030 w2[1] = __byte_perm (w1[2], w1[1], selector);
5031 w2[0] = __byte_perm (w1[1], w1[0], selector);
5032 w1[3] = __byte_perm (w1[0], w0[3], selector);
5033 w1[2] = __byte_perm (w0[3], w0[2], selector);
5034 w1[1] = __byte_perm (w0[2], w0[1], selector);
5035 w1[0] = __byte_perm (w0[1], w0[0], selector);
5036 w0[3] = __byte_perm (w0[0], 0, selector);
5037 w0[2] = 0;
5038 w0[1] = 0;
5039 w0[0] = 0;
5040 break;
5041
5042 case 4: /* offset 16..19 */
5043 w3[1] = __byte_perm (w2[1], w2[0], selector);
5044 w3[0] = __byte_perm (w2[0], w1[3], selector);
5045 w2[3] = __byte_perm (w1[3], w1[2], selector);
5046 w2[2] = __byte_perm (w1[2], w1[1], selector);
5047 w2[1] = __byte_perm (w1[1], w1[0], selector);
5048 w2[0] = __byte_perm (w1[0], w0[3], selector);
5049 w1[3] = __byte_perm (w0[3], w0[2], selector);
5050 w1[2] = __byte_perm (w0[2], w0[1], selector);
5051 w1[1] = __byte_perm (w0[1], w0[0], selector);
5052 w1[0] = __byte_perm (w0[0], 0, selector);
5053 w0[3] = 0;
5054 w0[2] = 0;
5055 w0[1] = 0;
5056 w0[0] = 0;
5057 break;
5058
5059 case 5: /* offset 20..23 */
5060 w3[1] = __byte_perm (w2[0], w1[3], selector);
5061 w3[0] = __byte_perm (w1[3], w1[2], selector);
5062 w2[3] = __byte_perm (w1[2], w1[1], selector);
5063 w2[2] = __byte_perm (w1[1], w1[0], selector);
5064 w2[1] = __byte_perm (w1[0], w0[3], selector);
5065 w2[0] = __byte_perm (w0[3], w0[2], selector);
5066 w1[3] = __byte_perm (w0[2], w0[1], selector);
5067 w1[2] = __byte_perm (w0[1], w0[0], selector);
5068 w1[1] = __byte_perm (w0[0], 0, selector);
5069 w1[0] = 0;
5070 w0[3] = 0;
5071 w0[2] = 0;
5072 w0[1] = 0;
5073 w0[0] = 0;
5074 break;
5075
5076 case 6: /* offset 24..27 */
5077 w3[1] = __byte_perm (w1[3], w1[2], selector);
5078 w3[0] = __byte_perm (w1[2], w1[1], selector);
5079 w2[3] = __byte_perm (w1[1], w1[0], selector);
5080 w2[2] = __byte_perm (w1[0], w0[3], selector);
5081 w2[1] = __byte_perm (w0[3], w0[2], selector);
5082 w2[0] = __byte_perm (w0[2], w0[1], selector);
5083 w1[3] = __byte_perm (w0[1], w0[0], selector);
5084 w1[2] = __byte_perm (w0[0], 0, selector);
5085 w1[1] = 0;
5086 w1[0] = 0;
5087 w0[3] = 0;
5088 w0[2] = 0;
5089 w0[1] = 0;
5090 w0[0] = 0;
5091 break;
5092
5093 case 7: /* offset 28..31 */
5094 w3[1] = __byte_perm (w1[2], w1[1], selector);
5095 w3[0] = __byte_perm (w1[1], w1[0], selector);
5096 w2[3] = __byte_perm (w1[0], w0[3], selector);
5097 w2[2] = __byte_perm (w0[3], w0[2], selector);
5098 w2[1] = __byte_perm (w0[2], w0[1], selector);
5099 w2[0] = __byte_perm (w0[1], w0[0], selector);
5100 w1[3] = __byte_perm (w0[0], 0, selector);
5101 w1[2] = 0;
5102 w1[1] = 0;
5103 w1[0] = 0;
5104 w0[3] = 0;
5105 w0[2] = 0;
5106 w0[1] = 0;
5107 w0[0] = 0;
5108 break;
5109
5110 case 8: /* offset 32..35 */
5111 w3[1] = __byte_perm (w1[1], w1[0], selector);
5112 w3[0] = __byte_perm (w1[0], w0[3], selector);
5113 w2[3] = __byte_perm (w0[3], w0[2], selector);
5114 w2[2] = __byte_perm (w0[2], w0[1], selector);
5115 w2[1] = __byte_perm (w0[1], w0[0], selector);
5116 w2[0] = __byte_perm (w0[0], 0, selector);
5117 w1[3] = 0;
5118 w1[2] = 0;
5119 w1[1] = 0;
5120 w1[0] = 0;
5121 w0[3] = 0;
5122 w0[2] = 0;
5123 w0[1] = 0;
5124 w0[0] = 0;
5125 break;
5126
5127 case 9: /* offset 36..39 */
5128 w3[1] = __byte_perm (w1[0], w0[3], selector);
5129 w3[0] = __byte_perm (w0[3], w0[2], selector);
5130 w2[3] = __byte_perm (w0[2], w0[1], selector);
5131 w2[2] = __byte_perm (w0[1], w0[0], selector);
5132 w2[1] = __byte_perm (w0[0], 0, selector);
5133 w2[0] = 0;
5134 w1[3] = 0;
5135 w1[2] = 0;
5136 w1[1] = 0;
5137 w1[0] = 0;
5138 w0[3] = 0;
5139 w0[2] = 0;
5140 w0[1] = 0;
5141 w0[0] = 0;
5142 break;
5143
5144 case 10: /* offset 40..43 */
5145 w3[1] = __byte_perm (w0[3], w0[2], selector);
5146 w3[0] = __byte_perm (w0[2], w0[1], selector);
5147 w2[3] = __byte_perm (w0[1], w0[0], selector);
5148 w2[2] = __byte_perm (w0[0], 0, selector);
5149 w2[1] = 0;
5150 w2[0] = 0;
5151 w1[3] = 0;
5152 w1[2] = 0;
5153 w1[1] = 0;
5154 w1[0] = 0;
5155 w0[3] = 0;
5156 w0[2] = 0;
5157 w0[1] = 0;
5158 w0[0] = 0;
5159 break;
5160
5161 case 11: /* offset 44..47 */
5162 w3[1] = __byte_perm (w0[2], w0[1], selector);
5163 w3[0] = __byte_perm (w0[1], w0[0], selector);
5164 w2[3] = __byte_perm (w0[0], 0, selector);
5165 w2[2] = 0;
5166 w2[1] = 0;
5167 w2[0] = 0;
5168 w1[3] = 0;
5169 w1[2] = 0;
5170 w1[1] = 0;
5171 w1[0] = 0;
5172 w0[3] = 0;
5173 w0[2] = 0;
5174 w0[1] = 0;
5175 w0[0] = 0;
5176 break;
5177
5178 case 12: /* offset 48..51 */
5179 w3[1] = __byte_perm (w0[1], w0[0], selector);
5180 w3[0] = __byte_perm (w0[0], 0, selector);
5181 w2[3] = 0;
5182 w2[2] = 0;
5183 w2[1] = 0;
5184 w2[0] = 0;
5185 w1[3] = 0;
5186 w1[2] = 0;
5187 w1[1] = 0;
5188 w1[0] = 0;
5189 w0[3] = 0;
5190 w0[2] = 0;
5191 w0[1] = 0;
5192 w0[0] = 0;
5193 break;
5194
5195 case 13: /* offset 52..55 */
5196 w3[1] = __byte_perm (w0[0], 0, selector);
5197 w3[0] = 0;
5198 w2[3] = 0;
5199 w2[2] = 0;
5200 w2[1] = 0;
5201 w2[0] = 0;
5202 w1[3] = 0;
5203 w1[2] = 0;
5204 w1[1] = 0;
5205 w1[0] = 0;
5206 w0[3] = 0;
5207 w0[2] = 0;
5208 w0[1] = 0;
5209 w0[0] = 0;
5210 break;
5211 }
5212 #endif
5213 }
5214
5215 inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len)
5216 {
5217 #if defined cl_amd_media_ops
5218 switch (salt_len)
5219 {
5220 case 0: sw[0] = w0;
5221 break;
5222 case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3);
5223 sw[1] = amd_bytealign (sw[1] >> 8, w0, 3);
5224 break;
5225 case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2);
5226 sw[1] = amd_bytealign (sw[1] >> 16, w0, 2);
5227 break;
5228 case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1);
5229 sw[1] = amd_bytealign (sw[1] >> 24, w0, 1);
5230 break;
5231 case 4: sw[1] = w0;
5232 break;
5233 case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3);
5234 sw[2] = amd_bytealign (sw[2] >> 8, w0, 3);
5235 break;
5236 case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2);
5237 sw[2] = amd_bytealign (sw[2] >> 16, w0, 2);
5238 break;
5239 case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1);
5240 sw[2] = amd_bytealign (sw[2] >> 24, w0, 1);
5241 break;
5242 case 8: sw[2] = w0;
5243 break;
5244 case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3);
5245 sw[3] = amd_bytealign (sw[3] >> 8, w0, 3);
5246 break;
5247 case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2);
5248 sw[3] = amd_bytealign (sw[3] >> 16, w0, 2);
5249 break;
5250 case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1);
5251 sw[3] = amd_bytealign (sw[3] >> 24, w0, 1);
5252 break;
5253 case 12: sw[3] = w0;
5254 break;
5255 case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3);
5256 sw[4] = amd_bytealign (sw[4] >> 8, w0, 3);
5257 break;
5258 case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2);
5259 sw[4] = amd_bytealign (sw[4] >> 16, w0, 2);
5260 break;
5261 case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1);
5262 sw[4] = amd_bytealign (sw[4] >> 24, w0, 1);
5263 break;
5264 case 16: sw[4] = w0;
5265 break;
5266 case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3);
5267 sw[5] = amd_bytealign (sw[5] >> 8, w0, 3);
5268 break;
5269 case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2);
5270 sw[5] = amd_bytealign (sw[5] >> 16, w0, 2);
5271 break;
5272 case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1);
5273 sw[5] = amd_bytealign (sw[5] >> 24, w0, 1);
5274 break;
5275 case 20: sw[5] = w0;
5276 break;
5277 case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3);
5278 sw[6] = amd_bytealign (sw[6] >> 8, w0, 3);
5279 break;
5280 case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2);
5281 sw[6] = amd_bytealign (sw[6] >> 16, w0, 2);
5282 break;
5283 case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1);
5284 sw[6] = amd_bytealign (sw[6] >> 24, w0, 1);
5285 break;
5286 case 24: sw[6] = w0;
5287 break;
5288 case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3);
5289 sw[7] = amd_bytealign (sw[7] >> 8, w0, 3);
5290 break;
5291 case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2);
5292 sw[7] = amd_bytealign (sw[7] >> 16, w0, 2);
5293 break;
5294 case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1);
5295 sw[7] = amd_bytealign (sw[7] >> 24, w0, 1);
5296 break;
5297 case 28: sw[7] = w0;
5298 break;
5299 case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3);
5300 sw[8] = amd_bytealign (sw[8] >> 8, w0, 3);
5301 break;
5302 case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2);
5303 sw[8] = amd_bytealign (sw[8] >> 16, w0, 2);
5304 break;
5305 case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1);
5306 sw[8] = amd_bytealign (sw[8] >> 24, w0, 1);
5307 break;
5308 }
5309 #else
5310 switch (salt_len)
5311 {
5312 case 0: sw[0] = w0;
5313 break;
5314 case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8);
5315 sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5316 break;
5317 case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16);
5318 sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5319 break;
5320 case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24);
5321 sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5322 break;
5323 case 4: sw[1] = w0;
5324 break;
5325 case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5326 sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5327 break;
5328 case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5329 sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5330 break;
5331 case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5332 sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5333 break;
5334 case 8: sw[2] = w0;
5335 break;
5336 case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5337 sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5338 break;
5339 case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5340 sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5341 break;
5342 case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5343 sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5344 break;
5345 case 12: sw[3] = w0;
5346 break;
5347 case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5348 sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5349 break;
5350 case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5351 sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5352 break;
5353 case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5354 sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5355 break;
5356 case 16: sw[4] = w0;
5357 break;
5358 case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5359 sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5360 break;
5361 case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5362 sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5363 break;
5364 case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5365 sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5366 break;
5367 case 20: sw[5] = w0;
5368 break;
5369 case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5370 sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5371 break;
5372 case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5373 sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5374 break;
5375 case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5376 sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5377 break;
5378 case 24: sw[6] = w0;
5379 break;
5380 case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5381 sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5382 break;
5383 case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5384 sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5385 break;
5386 case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5387 sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5388 break;
5389 case 28: sw[7] = w0;
5390 break;
5391 case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5392 sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24);
5393 break;
5394 case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5395 sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16);
5396 break;
5397 case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5398 sw[8] = (sw[8] & 0xff000000) | (w0 >> 8);
5399 break;
5400 }
5401 #endif
5402 }
5403
5404 inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len)
5405 {
5406 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5407
5408 switch (salt_len)
5409 {
5410 case 0: sw[0] = w0;
5411 break;
5412 case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8);
5413 sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5414 break;
5415 case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16);
5416 sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5417 break;
5418 case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24);
5419 sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5420 break;
5421 case 4: sw[1] = w0;
5422 break;
5423 case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5424 sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5425 break;
5426 case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5427 sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5428 break;
5429 case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5430 sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5431 break;
5432 case 8: sw[2] = w0;
5433 break;
5434 case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5435 sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5436 break;
5437 case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5438 sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5439 break;
5440 case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5441 sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5442 break;
5443 case 12: sw[3] = w0;
5444 break;
5445 case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5446 sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5447 break;
5448 case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5449 sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5450 break;
5451 case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5452 sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5453 break;
5454 case 16: sw[4] = w0;
5455 break;
5456 case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5457 sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5458 break;
5459 case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5460 sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5461 break;
5462 case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5463 sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5464 break;
5465 case 20: sw[5] = w0;
5466 break;
5467 case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5468 sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5469 break;
5470 case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5471 sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5472 break;
5473 case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5474 sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5475 break;
5476 case 24: sw[6] = w0;
5477 break;
5478 case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5479 sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5480 break;
5481 case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5482 sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5483 break;
5484 case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5485 sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5486 break;
5487 case 28: sw[7] = w0;
5488 break;
5489 case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5490 sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24);
5491 break;
5492 case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5493 sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16);
5494 break;
5495 case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5496 sw[8] = (sw[8] & 0x000000ff) | (w0 << 8);
5497 break;
5498 }
5499 }
5500
5501 inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5502 {
5503 #if defined cl_amd_media_ops
5504 switch (salt_len)
5505 {
5506 case 0: w0[0] = wx;
5507 break;
5508 case 1: w0[0] = amd_bytealign (wx, w0[0] << 24, 3);
5509 w0[1] = amd_bytealign (w0[1] >> 8, wx, 3);
5510 break;
5511 case 2: w0[0] = amd_bytealign (wx, w0[0] << 16, 2);
5512 w0[1] = amd_bytealign (w0[1] >> 16, wx, 2);
5513 break;
5514 case 3: w0[0] = amd_bytealign (wx, w0[0] << 8, 1);
5515 w0[1] = amd_bytealign (w0[1] >> 24, wx, 1);
5516 break;
5517 case 4: w0[1] = wx;
5518 break;
5519 case 5: w0[1] = amd_bytealign (wx, w0[1] << 24, 3);
5520 w0[2] = amd_bytealign (w0[2] >> 8, wx, 3);
5521 break;
5522 case 6: w0[1] = amd_bytealign (wx, w0[1] << 16, 2);
5523 w0[2] = amd_bytealign (w0[2] >> 16, wx, 2);
5524 break;
5525 case 7: w0[1] = amd_bytealign (wx, w0[1] << 8, 1);
5526 w0[2] = amd_bytealign (w0[2] >> 24, wx, 1);
5527 break;
5528 case 8: w0[2] = wx;
5529 break;
5530 case 9: w0[2] = amd_bytealign (wx, w0[2] << 24, 3);
5531 w0[3] = amd_bytealign (w0[3] >> 8, wx, 3);
5532 break;
5533 case 10: w0[2] = amd_bytealign (wx, w0[2] << 16, 2);
5534 w0[3] = amd_bytealign (w0[3] >> 16, wx, 2);
5535 break;
5536 case 11: w0[2] = amd_bytealign (wx, w0[2] << 8, 1);
5537 w0[3] = amd_bytealign (w0[3] >> 24, wx, 1);
5538 break;
5539 case 12: w0[3] = wx;
5540 break;
5541 case 13: w0[3] = amd_bytealign (wx, w0[3] << 24, 3);
5542 w1[0] = amd_bytealign (w1[0] >> 8, wx, 3);
5543 break;
5544 case 14: w0[3] = amd_bytealign (wx, w0[3] << 16, 2);
5545 w1[0] = amd_bytealign (w1[0] >> 16, wx, 2);
5546 break;
5547 case 15: w0[3] = amd_bytealign (wx, w0[3] << 8, 1);
5548 w1[0] = amd_bytealign (w1[0] >> 24, wx, 1);
5549 break;
5550 case 16: w1[0] = wx;
5551 break;
5552 case 17: w1[0] = amd_bytealign (wx, w1[0] << 24, 3);
5553 w1[1] = amd_bytealign (w1[1] >> 8, wx, 3);
5554 break;
5555 case 18: w1[0] = amd_bytealign (wx, w1[0] << 16, 2);
5556 w1[1] = amd_bytealign (w1[1] >> 16, wx, 2);
5557 break;
5558 case 19: w1[0] = amd_bytealign (wx, w1[0] << 8, 1);
5559 w1[1] = amd_bytealign (w1[1] >> 24, wx, 1);
5560 break;
5561 case 20: w1[1] = wx;
5562 break;
5563 case 21: w1[1] = amd_bytealign (wx, w1[1] << 24, 3);
5564 w1[2] = amd_bytealign (w1[2] >> 8, wx, 3);
5565 break;
5566 case 22: w1[1] = amd_bytealign (wx, w1[1] << 16, 2);
5567 w1[2] = amd_bytealign (w1[2] >> 16, wx, 2);
5568 break;
5569 case 23: w1[1] = amd_bytealign (wx, w1[1] << 8, 1);
5570 w1[2] = amd_bytealign (w1[2] >> 24, wx, 1);
5571 break;
5572 case 24: w1[2] = wx;
5573 break;
5574 case 25: w1[2] = amd_bytealign (wx, w1[2] << 24, 3);
5575 w1[3] = amd_bytealign (w1[3] >> 8, wx, 3);
5576 break;
5577 case 26: w1[2] = amd_bytealign (wx, w1[2] << 16, 2);
5578 w1[3] = amd_bytealign (w1[3] >> 16, wx, 2);
5579 break;
5580 case 27: w1[2] = amd_bytealign (wx, w1[2] << 8, 1);
5581 w1[3] = amd_bytealign (w1[3] >> 24, wx, 1);
5582 break;
5583 case 28: w1[3] = wx;
5584 break;
5585 case 29: w1[3] = amd_bytealign (wx, w1[3] << 24, 3);
5586 w2[0] = amd_bytealign (w2[0] >> 8, wx, 3);
5587 break;
5588 case 30: w1[3] = amd_bytealign (wx, w1[3] << 16, 2);
5589 w2[0] = amd_bytealign (w2[0] >> 16, wx, 2);
5590 break;
5591 case 31: w1[3] = amd_bytealign (wx, w1[3] << 8, 1);
5592 w2[0] = amd_bytealign (w2[0] >> 24, wx, 1);
5593 break;
5594 case 32: w2[0] = wx;
5595 break;
5596 case 33: w2[0] = amd_bytealign (wx, w2[0] << 24, 3);
5597 w2[1] = amd_bytealign (w2[1] >> 8, wx, 3);
5598 break;
5599 case 34: w2[0] = amd_bytealign (wx, w2[0] << 16, 2);
5600 w2[1] = amd_bytealign (w2[1] >> 16, wx, 2);
5601 break;
5602 case 35: w2[0] = amd_bytealign (wx, w2[0] << 8, 1);
5603 w2[1] = amd_bytealign (w2[1] >> 24, wx, 1);
5604 break;
5605 case 36: w2[1] = wx;
5606 break;
5607 case 37: w2[1] = amd_bytealign (wx, w2[1] << 24, 3);
5608 w2[2] = amd_bytealign (w2[2] >> 8, wx, 3);
5609 break;
5610 case 38: w2[1] = amd_bytealign (wx, w2[1] << 16, 2);
5611 w2[2] = amd_bytealign (w2[2] >> 16, wx, 2);
5612 break;
5613 case 39: w2[1] = amd_bytealign (wx, w2[1] << 8, 1);
5614 w2[2] = amd_bytealign (w2[2] >> 24, wx, 1);
5615 break;
5616 case 40: w2[2] = wx;
5617 break;
5618 case 41: w2[2] = amd_bytealign (wx, w2[2] << 24, 3);
5619 w2[3] = amd_bytealign (w2[3] >> 8, wx, 3);
5620 break;
5621 case 42: w2[2] = amd_bytealign (wx, w2[2] << 16, 2);
5622 w2[3] = amd_bytealign (w2[3] >> 16, wx, 2);
5623 break;
5624 case 43: w2[2] = amd_bytealign (wx, w2[2] << 8, 1);
5625 w2[3] = amd_bytealign (w2[3] >> 24, wx, 1);
5626 break;
5627 case 44: w2[3] = wx;
5628 break;
5629 case 45: w2[3] = amd_bytealign (wx, w2[3] << 24, 3);
5630 w3[0] = amd_bytealign (w3[0] >> 8, wx, 3);
5631 break;
5632 case 46: w2[3] = amd_bytealign (wx, w2[3] << 16, 2);
5633 w3[0] = amd_bytealign (w3[0] >> 16, wx, 2);
5634 break;
5635 case 47: w2[3] = amd_bytealign (wx, w2[3] << 8, 1);
5636 w3[0] = amd_bytealign (w3[0] >> 24, wx, 1);
5637 break;
5638 case 48: w3[0] = wx;
5639 break;
5640 case 49: w3[0] = amd_bytealign (wx, w3[0] << 24, 3);
5641 w3[1] = amd_bytealign (w3[1] >> 8, wx, 3);
5642 break;
5643 case 50: w3[0] = amd_bytealign (wx, w3[0] << 16, 2);
5644 w3[1] = amd_bytealign (w3[1] >> 16, wx, 2);
5645 break;
5646 case 51: w3[0] = amd_bytealign (wx, w3[0] << 8, 1);
5647 w3[1] = amd_bytealign (w3[1] >> 24, wx, 1);
5648 break;
5649 case 52: w3[1] = wx;
5650 break;
5651 case 53: w3[1] = amd_bytealign (wx, w3[1] << 24, 3);
5652 w3[2] = amd_bytealign (w3[2] >> 8, wx, 3);
5653 break;
5654 case 54: w3[1] = amd_bytealign (wx, w3[1] << 16, 2);
5655 w3[2] = amd_bytealign (w3[2] >> 16, wx, 2);
5656 break;
5657 case 55: w3[1] = amd_bytealign (wx, w3[1] << 8, 1);
5658 w3[2] = amd_bytealign (w3[2] >> 24, wx, 1);
5659 break;
5660 case 56: w3[2] = wx;
5661 break;
5662 case 57: w3[2] = amd_bytealign (wx, w3[2] << 24, 3);
5663 w3[3] = amd_bytealign (w3[3] >> 8, wx, 3);
5664 break;
5665 case 58: w3[2] = amd_bytealign (wx, w3[2] << 16, 2);
5666 w3[3] = amd_bytealign (w3[3] >> 16, wx, 2);
5667 break;
5668 case 59: w3[2] = amd_bytealign (wx, w3[2] << 8, 1);
5669 w3[3] = amd_bytealign (w3[3] >> 24, wx, 1);
5670 break;
5671 case 60: w3[3] = wx;
5672 break;
5673 case 61: w3[3] = amd_bytealign (wx, w3[3] << 24, 3);
5674 //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
5675 break;
5676 case 62: w3[3] = amd_bytealign (wx, w3[3] << 16, 2);
5677 //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
5678 break;
5679 case 63: w3[3] = amd_bytealign (wx, w3[3] << 8, 1);
5680 //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
5681 break;
5682 }
5683 #else
5684 switch (salt_len)
5685 {
5686 case 0: w0[0] = wx;
5687 break;
5688 case 1: w0[0] = (w0[0] & 0x000000ff) | (wx << 8);
5689 w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5690 break;
5691 case 2: w0[0] = (w0[0] & 0x0000ffff) | (wx << 16);
5692 w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5693 break;
5694 case 3: w0[0] = (w0[0] & 0x00ffffff) | (wx << 24);
5695 w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5696 break;
5697 case 4: w0[1] = wx;
5698 break;
5699 case 5: w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5700 w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5701 break;
5702 case 6: w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5703 w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5704 break;
5705 case 7: w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5706 w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5707 break;
5708 case 8: w0[2] = wx;
5709 break;
5710 case 9: w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5711 w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5712 break;
5713 case 10: w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5714 w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5715 break;
5716 case 11: w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5717 w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5718 break;
5719 case 12: w0[3] = wx;
5720 break;
5721 case 13: w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5722 w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5723 break;
5724 case 14: w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5725 w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5726 break;
5727 case 15: w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5728 w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5729 break;
5730 case 16: w1[0] = wx;
5731 break;
5732 case 17: w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5733 w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5734 break;
5735 case 18: w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5736 w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5737 break;
5738 case 19: w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5739 w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5740 break;
5741 case 20: w1[1] = wx;
5742 break;
5743 case 21: w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5744 w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5745 break;
5746 case 22: w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5747 w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5748 break;
5749 case 23: w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5750 w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5751 break;
5752 case 24: w1[2] = wx;
5753 break;
5754 case 25: w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5755 w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5756 break;
5757 case 26: w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5758 w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5759 break;
5760 case 27: w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5761 w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5762 break;
5763 case 28: w1[3] = wx;
5764 break;
5765 case 29: w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5766 w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5767 break;
5768 case 30: w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5769 w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5770 break;
5771 case 31: w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5772 w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5773 break;
5774 case 32: w2[0] = wx;
5775 break;
5776 case 33: w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5777 w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5778 break;
5779 case 34: w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5780 w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5781 break;
5782 case 35: w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5783 w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5784 break;
5785 case 36: w2[1] = wx;
5786 break;
5787 case 37: w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5788 w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5789 break;
5790 case 38: w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5791 w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5792 break;
5793 case 39: w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5794 w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5795 break;
5796 case 40: w2[2] = wx;
5797 break;
5798 case 41: w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5799 w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
5800 break;
5801 case 42: w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5802 w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
5803 break;
5804 case 43: w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5805 w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5806 break;
5807 case 44: w2[3] = wx;
5808 break;
5809 case 45: w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5810 w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
5811 break;
5812 case 46: w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5813 w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
5814 break;
5815 case 47: w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5816 w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
5817 break;
5818 case 48: w3[0] = wx;
5819 break;
5820 case 49: w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
5821 w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
5822 break;
5823 case 50: w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
5824 w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
5825 break;
5826 case 51: w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
5827 w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
5828 break;
5829 case 52: w3[1] = wx;
5830 break;
5831 case 53: w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
5832 w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
5833 break;
5834 case 54: w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
5835 w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
5836 break;
5837 case 55: w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
5838 w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
5839 break;
5840 case 56: w3[2] = wx;
5841 break;
5842 case 57: w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
5843 w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
5844 break;
5845 case 58: w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
5846 w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
5847 break;
5848 case 59: w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
5849 w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
5850 break;
5851 case 60: w3[3] = wx;
5852 break;
5853 case 61: w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
5854 //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
5855 break;
5856 case 62: w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
5857 //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
5858 break;
5859 case 63: w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
5860 //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
5861 break;
5862 }
5863 #endif
5864 }
5865
5866 inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5867 {
5868 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5869
5870 switch (salt_len)
5871 {
5872 case 0: w0[0] = wx;
5873 break;
5874 case 1: w0[0] = (w0[0] & 0xff000000) | (wx >> 8);
5875 w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5876 break;
5877 case 2: w0[0] = (w0[0] & 0xffff0000) | (wx >> 16);
5878 w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5879 break;
5880 case 3: w0[0] = (w0[0] & 0xffffff00) | (wx >> 24);
5881 w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5882 break;
5883 case 4: w0[1] = wx;
5884 break;
5885 case 5: w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5886 w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5887 break;
5888 case 6: w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5889 w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5890 break;
5891 case 7: w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5892 w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5893 break;
5894 case 8: w0[2] = wx;
5895 break;
5896 case 9: w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5897 w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5898 break;
5899 case 10: w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5900 w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5901 break;
5902 case 11: w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5903 w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5904 break;
5905 case 12: w0[3] = wx;
5906 break;
5907 case 13: w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5908 w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5909 break;
5910 case 14: w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5911 w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5912 break;
5913 case 15: w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5914 w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5915 break;
5916 case 16: w1[0] = wx;
5917 break;
5918 case 17: w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5919 w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5920 break;
5921 case 18: w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5922 w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5923 break;
5924 case 19: w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5925 w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5926 break;
5927 case 20: w1[1] = wx;
5928 break;
5929 case 21: w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5930 w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5931 break;
5932 case 22: w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5933 w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5934 break;
5935 case 23: w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5936 w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5937 break;
5938 case 24: w1[2] = wx;
5939 break;
5940 case 25: w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5941 w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5942 break;
5943 case 26: w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5944 w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5945 break;
5946 case 27: w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5947 w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5948 break;
5949 case 28: w1[3] = wx;
5950 break;
5951 case 29: w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5952 w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5953 break;
5954 case 30: w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5955 w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5956 break;
5957 case 31: w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5958 w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5959 break;
5960 case 32: w2[0] = wx;
5961 break;
5962 case 33: w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5963 w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5964 break;
5965 case 34: w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5966 w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5967 break;
5968 case 35: w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5969 w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5970 break;
5971 case 36: w2[1] = wx;
5972 break;
5973 case 37: w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5974 w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5975 break;
5976 case 38: w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5977 w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5978 break;
5979 case 39: w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5980 w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5981 break;
5982 case 40: w2[2] = wx;
5983 break;
5984 case 41: w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5985 w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5986 break;
5987 case 42: w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5988 w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5989 break;
5990 case 43: w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5991 w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5992 break;
5993 case 44: w2[3] = wx;
5994 break;
5995 case 45: w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5996 w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
5997 break;
5998 case 46: w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
5999 w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
6000 break;
6001 case 47: w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
6002 w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
6003 break;
6004 case 48: w3[0] = wx;
6005 break;
6006 case 49: w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
6007 w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
6008 break;
6009 case 50: w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
6010 w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
6011 break;
6012 case 51: w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
6013 w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
6014 break;
6015 case 52: w3[1] = wx;
6016 break;
6017 case 53: w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
6018 w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
6019 break;
6020 case 54: w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
6021 w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
6022 break;
6023 case 55: w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
6024 w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
6025 break;
6026 case 56: w3[2] = wx;
6027 break;
6028 case 57: w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
6029 w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
6030 break;
6031 case 58: w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
6032 w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
6033 break;
6034 case 59: w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
6035 w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
6036 break;
6037 case 60: w3[3] = wx;
6038 break;
6039 case 61: w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
6040 //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
6041 break;
6042 case 62: w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
6043 //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
6044 break;
6045 case 63: w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
6046 //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
6047 break;
6048 }
6049 }
6050
6051 /**
6052 * vector functions as scalar (for outer loop usage)
6053 */
6054
6055 inline void append_0x01_1x4_S (u32 w0[4], const u32 offset)
6056 {
6057 switch (offset)
6058 {
6059 case 0:
6060 w0[0] = 0x01;
6061 break;
6062
6063 case 1:
6064 w0[0] = w0[0] | 0x0100;
6065 break;
6066
6067 case 2:
6068 w0[0] = w0[0] | 0x010000;
6069 break;
6070
6071 case 3:
6072 w0[0] = w0[0] | 0x01000000;
6073 break;
6074
6075 case 4:
6076 w0[1] = 0x01;
6077 break;
6078
6079 case 5:
6080 w0[1] = w0[1] | 0x0100;
6081 break;
6082
6083 case 6:
6084 w0[1] = w0[1] | 0x010000;
6085 break;
6086
6087 case 7:
6088 w0[1] = w0[1] | 0x01000000;
6089 break;
6090
6091 case 8:
6092 w0[2] = 0x01;
6093 break;
6094
6095 case 9:
6096 w0[2] = w0[2] | 0x0100;
6097 break;
6098
6099 case 10:
6100 w0[2] = w0[2] | 0x010000;
6101 break;
6102
6103 case 11:
6104 w0[2] = w0[2] | 0x01000000;
6105 break;
6106
6107 case 12:
6108 w0[3] = 0x01;
6109 break;
6110
6111 case 13:
6112 w0[3] = w0[3] | 0x0100;
6113 break;
6114
6115 case 14:
6116 w0[3] = w0[3] | 0x010000;
6117 break;
6118
6119 case 15:
6120 w0[3] = w0[3] | 0x01000000;
6121 break;
6122 }
6123 }
6124
/**
 * Place the byte 0x01 at byte position `offset` of the 32-byte buffer formed
 * by w0 (positions 0-15) followed by w1 (positions 16-31). Within a word,
 * position 0 is the least significant byte.
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 32 and above leave both arrays unchanged.
 */
inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x01 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case 0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case 1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case 2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case 3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;

    case 4: w1[0] = (lane == 0) ? mark : (w1[0] | mark); break;
    case 5: w1[1] = (lane == 0) ? mark : (w1[1] | mark); break;
    case 6: w1[2] = (lane == 0) ? mark : (w1[2] | mark); break;
    case 7: w1[3] = (lane == 0) ? mark : (w1[3] | mark); break;
  }
}
6258
/**
 * Place the byte 0x01 at byte position `offset` of the 48-byte buffer formed
 * by w0 (positions 0-15), w1 (16-31) and w2 (32-47). Within a word,
 * position 0 is the least significant byte.
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 48 and above leave all arrays unchanged.
 */
inline void append_0x01_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x01 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case  0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case  1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case  2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case  3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;

    case  4: w1[0] = (lane == 0) ? mark : (w1[0] | mark); break;
    case  5: w1[1] = (lane == 0) ? mark : (w1[1] | mark); break;
    case  6: w1[2] = (lane == 0) ? mark : (w1[2] | mark); break;
    case  7: w1[3] = (lane == 0) ? mark : (w1[3] | mark); break;

    case  8: w2[0] = (lane == 0) ? mark : (w2[0] | mark); break;
    case  9: w2[1] = (lane == 0) ? mark : (w2[1] | mark); break;
    case 10: w2[2] = (lane == 0) ? mark : (w2[2] | mark); break;
    case 11: w2[3] = (lane == 0) ? mark : (w2[3] | mark); break;
  }
}
6456
/**
 * Place the byte 0x01 at byte position `offset` of the 64-byte buffer formed
 * by w0 (positions 0-15), w1 (16-31), w2 (32-47) and w3 (48-63). Within a
 * word, position 0 is the least significant byte.
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 64 and above leave all arrays unchanged.
 */
inline void append_0x01_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x01 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case  0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case  1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case  2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case  3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;

    case  4: w1[0] = (lane == 0) ? mark : (w1[0] | mark); break;
    case  5: w1[1] = (lane == 0) ? mark : (w1[1] | mark); break;
    case  6: w1[2] = (lane == 0) ? mark : (w1[2] | mark); break;
    case  7: w1[3] = (lane == 0) ? mark : (w1[3] | mark); break;

    case  8: w2[0] = (lane == 0) ? mark : (w2[0] | mark); break;
    case  9: w2[1] = (lane == 0) ? mark : (w2[1] | mark); break;
    case 10: w2[2] = (lane == 0) ? mark : (w2[2] | mark); break;
    case 11: w2[3] = (lane == 0) ? mark : (w2[3] | mark); break;

    case 12: w3[0] = (lane == 0) ? mark : (w3[0] | mark); break;
    case 13: w3[1] = (lane == 0) ? mark : (w3[1] | mark); break;
    case 14: w3[2] = (lane == 0) ? mark : (w3[2] | mark); break;
    case 15: w3[3] = (lane == 0) ? mark : (w3[3] | mark); break;
  }
}
6718
/**
 * Place the byte 0x02 at byte position `offset` of the 32-byte buffer formed
 * by w0 (positions 0-15) followed by w1 (positions 16-31). Within a word,
 * position 0 is the least significant byte.
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 32 and above leave both arrays unchanged.
 */
inline void append_0x02_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x02 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case 0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case 1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case 2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case 3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;

    case 4: w1[0] = (lane == 0) ? mark : (w1[0] | mark); break;
    case 5: w1[1] = (lane == 0) ? mark : (w1[1] | mark); break;
    case 6: w1[2] = (lane == 0) ? mark : (w1[2] | mark); break;
    case 7: w1[3] = (lane == 0) ? mark : (w1[3] | mark); break;
  }
}
6852
/**
 * Place the byte 0x02 at byte position `offset` of the 48-byte buffer formed
 * by w0 (positions 0-15), w1 (16-31) and w2 (32-47). Within a word,
 * position 0 is the least significant byte.
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 48 and above leave all arrays unchanged.
 */
inline void append_0x02_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x02 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case  0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case  1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case  2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case  3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;

    case  4: w1[0] = (lane == 0) ? mark : (w1[0] | mark); break;
    case  5: w1[1] = (lane == 0) ? mark : (w1[1] | mark); break;
    case  6: w1[2] = (lane == 0) ? mark : (w1[2] | mark); break;
    case  7: w1[3] = (lane == 0) ? mark : (w1[3] | mark); break;

    case  8: w2[0] = (lane == 0) ? mark : (w2[0] | mark); break;
    case  9: w2[1] = (lane == 0) ? mark : (w2[1] | mark); break;
    case 10: w2[2] = (lane == 0) ? mark : (w2[2] | mark); break;
    case 11: w2[3] = (lane == 0) ? mark : (w2[3] | mark); break;
  }
}
7050
/**
 * Place the byte 0x80 at byte position `offset` of the 16-byte buffer w0,
 * where position 0 is the least significant byte of w0[0].
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 16 and above leave the buffer unchanged.
 */
inline void append_0x80_1x4_S (u32 w0[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x80 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case 0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case 1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case 2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case 3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;
  }
}
7120
/**
 * Place the byte 0x80 at byte position `offset` of the 32-byte buffer formed
 * by w0 (positions 0-15) followed by w1 (positions 16-31). Within a word,
 * position 0 is the least significant byte.
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 32 and above leave both arrays unchanged.
 */
inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x80 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case 0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case 1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case 2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case 3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;

    case 4: w1[0] = (lane == 0) ? mark : (w1[0] | mark); break;
    case 5: w1[1] = (lane == 0) ? mark : (w1[1] | mark); break;
    case 6: w1[2] = (lane == 0) ? mark : (w1[2] | mark); break;
    case 7: w1[3] = (lane == 0) ? mark : (w1[3] | mark); break;
  }
}
7254
/**
 * Place the byte 0x80 at byte position `offset` of the 48-byte buffer formed
 * by w0 (positions 0-15), w1 (16-31) and w2 (32-47). Within a word,
 * position 0 is the least significant byte.
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 48 and above leave all arrays unchanged.
 */
inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x80 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case  0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case  1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case  2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case  3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;

    case  4: w1[0] = (lane == 0) ? mark : (w1[0] | mark); break;
    case  5: w1[1] = (lane == 0) ? mark : (w1[1] | mark); break;
    case  6: w1[2] = (lane == 0) ? mark : (w1[2] | mark); break;
    case  7: w1[3] = (lane == 0) ? mark : (w1[3] | mark); break;

    case  8: w2[0] = (lane == 0) ? mark : (w2[0] | mark); break;
    case  9: w2[1] = (lane == 0) ? mark : (w2[1] | mark); break;
    case 10: w2[2] = (lane == 0) ? mark : (w2[2] | mark); break;
    case 11: w2[3] = (lane == 0) ? mark : (w2[3] | mark); break;
  }
}
7452
/**
 * Place the byte 0x80 at byte position `offset` of the 64-byte buffer formed
 * by w0 (positions 0-15), w1 (16-31), w2 (32-47) and w3 (48-63). Within a
 * word, position 0 is the least significant byte.
 *
 * If `offset` is a multiple of four the target word is replaced by the marker
 * (whatever the word held before is dropped); otherwise the marker is OR-ed
 * into the word and its remaining bytes are preserved.
 * Offsets of 64 and above leave all arrays unchanged.
 */
inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  const u32 lane = offset & 3;                /* byte slot inside the target word */
  const u32 mark = (u32) 0x80 << (lane * 8);  /* marker shifted into that slot    */

  /* dispatch on the word index so every array access uses a constant subscript */
  switch (offset >> 2)
  {
    case  0: w0[0] = (lane == 0) ? mark : (w0[0] | mark); break;
    case  1: w0[1] = (lane == 0) ? mark : (w0[1] | mark); break;
    case  2: w0[2] = (lane == 0) ? mark : (w0[2] | mark); break;
    case  3: w0[3] = (lane == 0) ? mark : (w0[3] | mark); break;

    case  4: w1[0] = (lane == 0) ? mark : (w1[0] | mark); break;
    case  5: w1[1] = (lane == 0) ? mark : (w1[1] | mark); break;
    case  6: w1[2] = (lane == 0) ? mark : (w1[2] | mark); break;
    case  7: w1[3] = (lane == 0) ? mark : (w1[3] | mark); break;

    case  8: w2[0] = (lane == 0) ? mark : (w2[0] | mark); break;
    case  9: w2[1] = (lane == 0) ? mark : (w2[1] | mark); break;
    case 10: w2[2] = (lane == 0) ? mark : (w2[2] | mark); break;
    case 11: w2[3] = (lane == 0) ? mark : (w2[3] | mark); break;

    case 12: w3[0] = (lane == 0) ? mark : (w3[0] | mark); break;
    case 13: w3[1] = (lane == 0) ? mark : (w3[1] | mark); break;
    case 14: w3[2] = (lane == 0) ? mark : (w3[2] | mark); break;
    case 15: w3[3] = (lane == 0) ? mark : (w3[3] | mark); break;
  }
}
7714
/**
 * Keep the first `len` bytes of the 16-byte block w[0..3] and clear the rest,
 * where byte 0 is the least significant byte of w[0].
 *
 * The word containing byte `len` is masked down to its low (len % 4) bytes and
 * every following word is set to zero; words before it are not touched.
 * A `len` of 16 or more leaves the block unchanged.
 */
inline void truncate_block_S (u32 w[4], const u32 len)
{
  const u32 kept = len & 3;                        /* bytes surviving in the boundary word */
  const u32 mask = ((u32) 1 << (kept * 8)) - 1;    /* 0, 0xFF, 0xFFFF or 0xFFFFFF          */

  /* dispatch on the boundary word so every array access uses a constant subscript */
  switch (len >> 2)
  {
    case 0:
      w[0] &= mask;
      w[1]  = 0;
      w[2]  = 0;
      w[3]  = 0;
      break;

    case 1:
      w[1] &= mask;
      w[2]  = 0;
      w[3]  = 0;
      break;

    case 2:
      w[2] &= mask;
      w[3]  = 0;
      break;

    case 3:
      w[3] &= mask;
      break;
  }
}
7777
/**
 * Widen 16 packed bytes to 16-bit units: every byte of in[0..3] becomes the
 * low byte of a 16-bit slot whose high byte is zero. Bytes 0-7 (in[0], in[1])
 * go to out1[0..3] and bytes 8-15 (in[2], in[3]) go to out2[0..3], two input
 * bytes per output word.
 *
 * Outputs are written from out2[3] down to out1[0]. With that order each
 * in[k] is read before the out1 word sharing its index is written, so `in`
 * may be the same array as `out1`.
 *
 * Nothing is written unless IS_NV, IS_AMD or IS_GENERIC is defined.
 */
inline void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
{
  #ifdef IS_NV
  /* selector 0x7372 picks source bytes 2 and 3, 0x7170 picks bytes 0 and 1;
     the 7 nibbles pull zero bytes from the second operand */
  out2[3] = __byte_perm_S (in[3], 0, 0x7372);
  out2[2] = __byte_perm_S (in[3], 0, 0x7170);
  out2[1] = __byte_perm_S (in[2], 0, 0x7372);
  out2[0] = __byte_perm_S (in[2], 0, 0x7170);
  out1[3] = __byte_perm_S (in[1], 0, 0x7372);
  out1[2] = __byte_perm_S (in[1], 0, 0x7170);
  out1[1] = __byte_perm_S (in[0], 0, 0x7372);
  out1[0] = __byte_perm_S (in[0], 0, 0x7170);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  /* upper half of a word: byte 2 -> bits 0-7, byte 3 -> bits 16-23
     lower half of a word: byte 0 -> bits 0-7, byte 1 -> bits 16-23 */
  out2[3] = ((in[3] >> 16) & 0x000000ff) | ((in[3] >> 8) & 0x00ff0000);
  out2[2] = ( in[3]        & 0x000000ff) | ((in[3] & 0x0000ff00) << 8);
  out2[1] = ((in[2] >> 16) & 0x000000ff) | ((in[2] >> 8) & 0x00ff0000);
  out2[0] = ( in[2]        & 0x000000ff) | ((in[2] & 0x0000ff00) << 8);
  out1[3] = ((in[1] >> 16) & 0x000000ff) | ((in[1] >> 8) & 0x00ff0000);
  out1[2] = ( in[1]        & 0x000000ff) | ((in[1] & 0x0000ff00) << 8);
  out1[1] = ((in[0] >> 16) & 0x000000ff) | ((in[0] >> 8) & 0x00ff0000);
  out1[0] = ( in[0]        & 0x000000ff) | ((in[0] & 0x0000ff00) << 8);
  #endif
}
7802
/**
 * Pack 16-bit units back into bytes: from every word of in1[0..3] and
 * in2[0..3] the bytes at bit positions 0-7 and 16-23 are kept and the other
 * two bytes are discarded. Two input words yield one output word, so in1
 * fills out[0..1] and in2 fills out[2..3].
 *
 * Outputs are written in ascending order, each statement reading only the
 * two input words it packs.
 *
 * Nothing is written unless IS_NV, IS_AMD or IS_GENERIC is defined.
 */
inline void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
{
  #ifdef IS_NV
  /* selector 0x6420 gathers bytes 0 and 2 of the first operand, then bytes 0 and 2 of the second */
  out[0] = __byte_perm_S (in1[0], in1[1], 0x6420);
  out[1] = __byte_perm_S (in1[2], in1[3], 0x6420);
  out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
  out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  out[0] = ( in1[0]        & 0x000000ff) | ((in1[0] >> 8) & 0x0000ff00)
         | ((in1[1] << 16) & 0x00ff0000) | ((in1[1] << 8) & 0xff000000);
  out[1] = ( in1[2]        & 0x000000ff) | ((in1[2] >> 8) & 0x0000ff00)
         | ((in1[3] << 16) & 0x00ff0000) | ((in1[3] << 8) & 0xff000000);
  out[2] = ( in2[0]        & 0x000000ff) | ((in2[0] >> 8) & 0x0000ff00)
         | ((in2[1] << 16) & 0x00ff0000) | ((in2[1] << 8) & 0xff000000);
  out[3] = ( in2[2]        & 0x000000ff) | ((in2[2] >> 8) & 0x0000ff00)
         | ((in2[3] << 16) & 0x00ff0000) | ((in2[3] << 8) & 0xff000000);
  #endif
}
7823
7824 inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
7825 {
7826 #if defined IS_AMD || defined IS_GENERIC
7827 const int offset_mod_4 = offset & 3;
7828
7829 const int offset_minus_4 = 4 - offset;
7830
7831 switch (offset / 4)
7832 {
7833 case 0:
7834 w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4);
7835 w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4);
7836 w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7837 w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7838 w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7839 w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7840 w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7841 w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7842 w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7843 w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7844 w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7845 w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7846 w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7847 w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7848 w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7849
7850 if (offset_mod_4 == 0)
7851 {
7852 w0[0] = w0[1];
7853 w0[1] = w0[2];
7854 w0[2] = w0[3];
7855 w0[3] = w1[0];
7856 w1[0] = w1[1];
7857 w1[1] = w1[2];
7858 w1[2] = w1[3];
7859 w1[3] = w2[0];
7860 w2[0] = w2[1];
7861 w2[1] = w2[2];
7862 w2[2] = w2[3];
7863 w2[3] = w3[0];
7864 w3[0] = w3[1];
7865 w3[1] = w3[2];
7866 w3[2] = 0;
7867 }
7868
7869 break;
7870
7871 case 1:
7872 w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4);
7873 w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7874 w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7875 w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7876 w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7877 w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7878 w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7879 w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7880 w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7881 w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7882 w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7883 w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7884 w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7885 w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7886 w0[0] = 0;
7887
7888 if (offset_mod_4 == 0)
7889 {
7890 w0[1] = w0[2];
7891 w0[2] = w0[3];
7892 w0[3] = w1[0];
7893 w1[0] = w1[1];
7894 w1[1] = w1[2];
7895 w1[2] = w1[3];
7896 w1[3] = w2[0];
7897 w2[0] = w2[1];
7898 w2[1] = w2[2];
7899 w2[2] = w2[3];
7900 w2[3] = w3[0];
7901 w3[0] = w3[1];
7902 w3[1] = w3[2];
7903 w3[2] = 0;
7904 }
7905
7906 break;
7907
7908 case 2:
7909 w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4);
7910 w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7911 w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7912 w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7913 w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7914 w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7915 w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7916 w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7917 w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7918 w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7919 w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7920 w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7921 w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7922 w0[1] = 0;
7923 w0[0] = 0;
7924
7925 if (offset_mod_4 == 0)
7926 {
7927 w0[2] = w0[3];
7928 w0[3] = w1[0];
7929 w1[0] = w1[1];
7930 w1[1] = w1[2];
7931 w1[2] = w1[3];
7932 w1[3] = w2[0];
7933 w2[0] = w2[1];
7934 w2[1] = w2[2];
7935 w2[2] = w2[3];
7936 w2[3] = w3[0];
7937 w3[0] = w3[1];
7938 w3[1] = w3[2];
7939 w3[2] = 0;
7940 }
7941
7942 break;
7943
7944 case 3:
7945 w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4);
7946 w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7947 w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7948 w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7949 w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7950 w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7951 w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7952 w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7953 w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7954 w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7955 w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7956 w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7957 w0[2] = 0;
7958 w0[1] = 0;
7959 w0[0] = 0;
7960
7961 if (offset_mod_4 == 0)
7962 {
7963 w0[3] = w1[0];
7964 w1[0] = w1[1];
7965 w1[1] = w1[2];
7966 w1[2] = w1[3];
7967 w1[3] = w2[0];
7968 w2[0] = w2[1];
7969 w2[1] = w2[2];
7970 w2[2] = w2[3];
7971 w2[3] = w3[0];
7972 w3[0] = w3[1];
7973 w3[1] = w3[2];
7974 w3[2] = 0;
7975 }
7976
7977 break;
7978
7979 case 4:
7980 w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4);
7981 w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7982 w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7983 w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7984 w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7985 w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7986 w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7987 w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7988 w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7989 w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7990 w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7991 w0[3] = 0;
7992 w0[2] = 0;
7993 w0[1] = 0;
7994 w0[0] = 0;
7995
7996 if (offset_mod_4 == 0)
7997 {
7998 w1[0] = w1[1];
7999 w1[1] = w1[2];
8000 w1[2] = w1[3];
8001 w1[3] = w2[0];
8002 w2[0] = w2[1];
8003 w2[1] = w2[2];
8004 w2[2] = w2[3];
8005 w2[3] = w3[0];
8006 w3[0] = w3[1];
8007 w3[1] = w3[2];
8008 w3[2] = 0;
8009 }
8010
8011 break;
8012
8013 case 5:
8014 w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4);
8015 w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
8016 w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
8017 w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8018 w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8019 w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8020 w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8021 w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8022 w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8023 w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8024 w1[0] = 0;
8025 w0[3] = 0;
8026 w0[2] = 0;
8027 w0[1] = 0;
8028 w0[0] = 0;
8029
8030 if (offset_mod_4 == 0)
8031 {
8032 w1[1] = w1[2];
8033 w1[2] = w1[3];
8034 w1[3] = w2[0];
8035 w2[0] = w2[1];
8036 w2[1] = w2[2];
8037 w2[2] = w2[3];
8038 w2[3] = w3[0];
8039 w3[0] = w3[1];
8040 w3[1] = w3[2];
8041 w3[2] = 0;
8042 }
8043
8044 break;
8045
8046 case 6:
8047 w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4);
8048 w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
8049 w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8050 w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8051 w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8052 w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8053 w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8054 w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8055 w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8056 w1[1] = 0;
8057 w1[0] = 0;
8058 w0[3] = 0;
8059 w0[2] = 0;
8060 w0[1] = 0;
8061 w0[0] = 0;
8062
8063 if (offset_mod_4 == 0)
8064 {
8065 w1[2] = w1[3];
8066 w1[3] = w2[0];
8067 w2[0] = w2[1];
8068 w2[1] = w2[2];
8069 w2[2] = w2[3];
8070 w2[3] = w3[0];
8071 w3[0] = w3[1];
8072 w3[1] = w3[2];
8073 w3[2] = 0;
8074 }
8075
8076 break;
8077
8078 case 7:
8079 w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4);
8080 w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8081 w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8082 w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8083 w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8084 w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8085 w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8086 w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8087 w1[2] = 0;
8088 w1[1] = 0;
8089 w1[0] = 0;
8090 w0[3] = 0;
8091 w0[2] = 0;
8092 w0[1] = 0;
8093 w0[0] = 0;
8094
8095 if (offset_mod_4 == 0)
8096 {
8097 w1[3] = w2[0];
8098 w2[0] = w2[1];
8099 w2[1] = w2[2];
8100 w2[2] = w2[3];
8101 w2[3] = w3[0];
8102 w3[0] = w3[1];
8103 w3[1] = w3[2];
8104 w3[2] = 0;
8105 }
8106
8107 break;
8108
8109 case 8:
8110 w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4);
8111 w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8112 w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8113 w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8114 w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8115 w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8116 w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8117 w1[3] = 0;
8118 w1[2] = 0;
8119 w1[1] = 0;
8120 w1[0] = 0;
8121 w0[3] = 0;
8122 w0[2] = 0;
8123 w0[1] = 0;
8124 w0[0] = 0;
8125
8126 if (offset_mod_4 == 0)
8127 {
8128 w2[0] = w2[1];
8129 w2[1] = w2[2];
8130 w2[2] = w2[3];
8131 w2[3] = w3[0];
8132 w3[0] = w3[1];
8133 w3[1] = w3[2];
8134 w3[2] = 0;
8135 }
8136
8137 break;
8138
8139 case 9:
8140 w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4);
8141 w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8142 w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8143 w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8144 w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8145 w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8146 w2[0] = 0;
8147 w1[3] = 0;
8148 w1[2] = 0;
8149 w1[1] = 0;
8150 w1[0] = 0;
8151 w0[3] = 0;
8152 w0[2] = 0;
8153 w0[1] = 0;
8154 w0[0] = 0;
8155
8156 if (offset_mod_4 == 0)
8157 {
8158 w2[1] = w2[2];
8159 w2[2] = w2[3];
8160 w2[3] = w3[0];
8161 w3[0] = w3[1];
8162 w3[1] = w3[2];
8163 w3[2] = 0;
8164 }
8165
8166 break;
8167
8168 case 10:
8169 w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4);
8170 w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8171 w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8172 w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8173 w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8174 w2[1] = 0;
8175 w2[0] = 0;
8176 w1[3] = 0;
8177 w1[2] = 0;
8178 w1[1] = 0;
8179 w1[0] = 0;
8180 w0[3] = 0;
8181 w0[2] = 0;
8182 w0[1] = 0;
8183 w0[0] = 0;
8184
8185 if (offset_mod_4 == 0)
8186 {
8187 w2[2] = w2[3];
8188 w2[3] = w3[0];
8189 w3[0] = w3[1];
8190 w3[1] = w3[2];
8191 w3[2] = 0;
8192 }
8193
8194 break;
8195
8196 case 11:
8197 w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4);
8198 w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8199 w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8200 w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8201 w2[2] = 0;
8202 w2[1] = 0;
8203 w2[0] = 0;
8204 w1[3] = 0;
8205 w1[2] = 0;
8206 w1[1] = 0;
8207 w1[0] = 0;
8208 w0[3] = 0;
8209 w0[2] = 0;
8210 w0[1] = 0;
8211 w0[0] = 0;
8212
8213 if (offset_mod_4 == 0)
8214 {
8215 w2[3] = w3[0];
8216 w3[0] = w3[1];
8217 w3[1] = w3[2];
8218 w3[2] = 0;
8219 }
8220
8221 break;
8222
8223 case 12:
8224 w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4);
8225 w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8226 w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8227 w2[3] = 0;
8228 w2[2] = 0;
8229 w2[1] = 0;
8230 w2[0] = 0;
8231 w1[3] = 0;
8232 w1[2] = 0;
8233 w1[1] = 0;
8234 w1[0] = 0;
8235 w0[3] = 0;
8236 w0[2] = 0;
8237 w0[1] = 0;
8238 w0[0] = 0;
8239
8240 if (offset_mod_4 == 0)
8241 {
8242 w3[0] = w3[1];
8243 w3[1] = w3[2];
8244 w3[2] = 0;
8245 }
8246
8247 break;
8248
8249 case 13:
8250 w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4);
8251 w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8252 w3[0] = 0;
8253 w2[3] = 0;
8254 w2[2] = 0;
8255 w2[1] = 0;
8256 w2[0] = 0;
8257 w1[3] = 0;
8258 w1[2] = 0;
8259 w1[1] = 0;
8260 w1[0] = 0;
8261 w0[3] = 0;
8262 w0[2] = 0;
8263 w0[1] = 0;
8264 w0[0] = 0;
8265
8266 if (offset_mod_4 == 0)
8267 {
8268 w3[1] = w3[2];
8269 w3[2] = 0;
8270 }
8271
8272 break;
8273 }
8274 #endif
8275
8276 #ifdef IS_NV
8277 const int offset_minus_4 = 4 - (offset % 4);
8278
8279 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
8280
8281 switch (offset / 4)
8282 {
8283 case 0:
8284 w3[1] = __byte_perm_S (w3[0], w3[1], selector);
8285 w3[0] = __byte_perm_S (w2[3], w3[0], selector);
8286 w2[3] = __byte_perm_S (w2[2], w2[3], selector);
8287 w2[2] = __byte_perm_S (w2[1], w2[2], selector);
8288 w2[1] = __byte_perm_S (w2[0], w2[1], selector);
8289 w2[0] = __byte_perm_S (w1[3], w2[0], selector);
8290 w1[3] = __byte_perm_S (w1[2], w1[3], selector);
8291 w1[2] = __byte_perm_S (w1[1], w1[2], selector);
8292 w1[1] = __byte_perm_S (w1[0], w1[1], selector);
8293 w1[0] = __byte_perm_S (w0[3], w1[0], selector);
8294 w0[3] = __byte_perm_S (w0[2], w0[3], selector);
8295 w0[2] = __byte_perm_S (w0[1], w0[2], selector);
8296 w0[1] = __byte_perm_S (w0[0], w0[1], selector);
8297 w0[0] = __byte_perm_S ( 0, w0[0], selector);
8298
8299 break;
8300
8301 case 1:
8302 w3[1] = __byte_perm_S (w2[3], w3[0], selector);
8303 w3[0] = __byte_perm_S (w2[2], w2[3], selector);
8304 w2[3] = __byte_perm_S (w2[1], w2[2], selector);
8305 w2[2] = __byte_perm_S (w2[0], w2[1], selector);
8306 w2[1] = __byte_perm_S (w1[3], w2[0], selector);
8307 w2[0] = __byte_perm_S (w1[2], w1[3], selector);
8308 w1[3] = __byte_perm_S (w1[1], w1[2], selector);
8309 w1[2] = __byte_perm_S (w1[0], w1[1], selector);
8310 w1[1] = __byte_perm_S (w0[3], w1[0], selector);
8311 w1[0] = __byte_perm_S (w0[2], w0[3], selector);
8312 w0[3] = __byte_perm_S (w0[1], w0[2], selector);
8313 w0[2] = __byte_perm_S (w0[0], w0[1], selector);
8314 w0[1] = __byte_perm_S ( 0, w0[0], selector);
8315 w0[0] = 0;
8316
8317 break;
8318
8319 case 2:
8320 w3[1] = __byte_perm_S (w2[2], w2[3], selector);
8321 w3[0] = __byte_perm_S (w2[1], w2[2], selector);
8322 w2[3] = __byte_perm_S (w2[0], w2[1], selector);
8323 w2[2] = __byte_perm_S (w1[3], w2[0], selector);
8324 w2[1] = __byte_perm_S (w1[2], w1[3], selector);
8325 w2[0] = __byte_perm_S (w1[1], w1[2], selector);
8326 w1[3] = __byte_perm_S (w1[0], w1[1], selector);
8327 w1[2] = __byte_perm_S (w0[3], w1[0], selector);
8328 w1[1] = __byte_perm_S (w0[2], w0[3], selector);
8329 w1[0] = __byte_perm_S (w0[1], w0[2], selector);
8330 w0[3] = __byte_perm_S (w0[0], w0[1], selector);
8331 w0[2] = __byte_perm_S ( 0, w0[0], selector);
8332 w0[1] = 0;
8333 w0[0] = 0;
8334
8335 break;
8336
8337 case 3:
8338 w3[1] = __byte_perm_S (w2[1], w2[2], selector);
8339 w3[0] = __byte_perm_S (w2[0], w2[1], selector);
8340 w2[3] = __byte_perm_S (w1[3], w2[0], selector);
8341 w2[2] = __byte_perm_S (w1[2], w1[3], selector);
8342 w2[1] = __byte_perm_S (w1[1], w1[2], selector);
8343 w2[0] = __byte_perm_S (w1[0], w1[1], selector);
8344 w1[3] = __byte_perm_S (w0[3], w1[0], selector);
8345 w1[2] = __byte_perm_S (w0[2], w0[3], selector);
8346 w1[1] = __byte_perm_S (w0[1], w0[2], selector);
8347 w1[0] = __byte_perm_S (w0[0], w0[1], selector);
8348 w0[3] = __byte_perm_S ( 0, w0[0], selector);
8349 w0[2] = 0;
8350 w0[1] = 0;
8351 w0[0] = 0;
8352
8353 break;
8354
8355 case 4:
8356 w3[1] = __byte_perm_S (w2[0], w2[1], selector);
8357 w3[0] = __byte_perm_S (w1[3], w2[0], selector);
8358 w2[3] = __byte_perm_S (w1[2], w1[3], selector);
8359 w2[2] = __byte_perm_S (w1[1], w1[2], selector);
8360 w2[1] = __byte_perm_S (w1[0], w1[1], selector);
8361 w2[0] = __byte_perm_S (w0[3], w1[0], selector);
8362 w1[3] = __byte_perm_S (w0[2], w0[3], selector);
8363 w1[2] = __byte_perm_S (w0[1], w0[2], selector);
8364 w1[1] = __byte_perm_S (w0[0], w0[1], selector);
8365 w1[0] = __byte_perm_S ( 0, w0[0], selector);
8366 w0[3] = 0;
8367 w0[2] = 0;
8368 w0[1] = 0;
8369 w0[0] = 0;
8370
8371 break;
8372
8373 case 5:
8374 w3[1] = __byte_perm_S (w1[3], w2[0], selector);
8375 w3[0] = __byte_perm_S (w1[2], w1[3], selector);
8376 w2[3] = __byte_perm_S (w1[1], w1[2], selector);
8377 w2[2] = __byte_perm_S (w1[0], w1[1], selector);
8378 w2[1] = __byte_perm_S (w0[3], w1[0], selector);
8379 w2[0] = __byte_perm_S (w0[2], w0[3], selector);
8380 w1[3] = __byte_perm_S (w0[1], w0[2], selector);
8381 w1[2] = __byte_perm_S (w0[0], w0[1], selector);
8382 w1[1] = __byte_perm_S ( 0, w0[0], selector);
8383 w1[0] = 0;
8384 w0[3] = 0;
8385 w0[2] = 0;
8386 w0[1] = 0;
8387 w0[0] = 0;
8388
8389 break;
8390
8391 case 6:
8392 w3[1] = __byte_perm_S (w1[2], w1[3], selector);
8393 w3[0] = __byte_perm_S (w1[1], w1[2], selector);
8394 w2[3] = __byte_perm_S (w1[0], w1[1], selector);
8395 w2[2] = __byte_perm_S (w0[3], w1[0], selector);
8396 w2[1] = __byte_perm_S (w0[2], w0[3], selector);
8397 w2[0] = __byte_perm_S (w0[1], w0[2], selector);
8398 w1[3] = __byte_perm_S (w0[0], w0[1], selector);
8399 w1[2] = __byte_perm_S ( 0, w0[0], selector);
8400 w1[1] = 0;
8401 w1[0] = 0;
8402 w0[3] = 0;
8403 w0[2] = 0;
8404 w0[1] = 0;
8405 w0[0] = 0;
8406
8407 break;
8408
8409 case 7:
8410 w3[1] = __byte_perm_S (w1[1], w1[2], selector);
8411 w3[0] = __byte_perm_S (w1[0], w1[1], selector);
8412 w2[3] = __byte_perm_S (w0[3], w1[0], selector);
8413 w2[2] = __byte_perm_S (w0[2], w0[3], selector);
8414 w2[1] = __byte_perm_S (w0[1], w0[2], selector);
8415 w2[0] = __byte_perm_S (w0[0], w0[1], selector);
8416 w1[3] = __byte_perm_S ( 0, w0[0], selector);
8417 w1[2] = 0;
8418 w1[1] = 0;
8419 w1[0] = 0;
8420 w0[3] = 0;
8421 w0[2] = 0;
8422 w0[1] = 0;
8423 w0[0] = 0;
8424
8425 break;
8426
8427 case 8:
8428 w3[1] = __byte_perm_S (w1[0], w1[1], selector);
8429 w3[0] = __byte_perm_S (w0[3], w1[0], selector);
8430 w2[3] = __byte_perm_S (w0[2], w0[3], selector);
8431 w2[2] = __byte_perm_S (w0[1], w0[2], selector);
8432 w2[1] = __byte_perm_S (w0[0], w0[1], selector);
8433 w2[0] = __byte_perm_S ( 0, w0[0], selector);
8434 w1[3] = 0;
8435 w1[2] = 0;
8436 w1[1] = 0;
8437 w1[0] = 0;
8438 w0[3] = 0;
8439 w0[2] = 0;
8440 w0[1] = 0;
8441 w0[0] = 0;
8442
8443 break;
8444
8445 case 9:
8446 w3[1] = __byte_perm_S (w0[3], w1[0], selector);
8447 w3[0] = __byte_perm_S (w0[2], w0[3], selector);
8448 w2[3] = __byte_perm_S (w0[1], w0[2], selector);
8449 w2[2] = __byte_perm_S (w0[0], w0[1], selector);
8450 w2[1] = __byte_perm_S ( 0, w0[0], selector);
8451 w2[0] = 0;
8452 w1[3] = 0;
8453 w1[2] = 0;
8454 w1[1] = 0;
8455 w1[0] = 0;
8456 w0[3] = 0;
8457 w0[2] = 0;
8458 w0[1] = 0;
8459 w0[0] = 0;
8460
8461 break;
8462
8463 case 10:
8464 w3[1] = __byte_perm_S (w0[2], w0[3], selector);
8465 w3[0] = __byte_perm_S (w0[1], w0[2], selector);
8466 w2[3] = __byte_perm_S (w0[0], w0[1], selector);
8467 w2[2] = __byte_perm_S ( 0, w0[0], selector);
8468 w2[1] = 0;
8469 w2[0] = 0;
8470 w1[3] = 0;
8471 w1[2] = 0;
8472 w1[1] = 0;
8473 w1[0] = 0;
8474 w0[3] = 0;
8475 w0[2] = 0;
8476 w0[1] = 0;
8477 w0[0] = 0;
8478
8479 break;
8480
8481 case 11:
8482 w3[1] = __byte_perm_S (w0[1], w0[2], selector);
8483 w3[0] = __byte_perm_S (w0[0], w0[1], selector);
8484 w2[3] = __byte_perm_S ( 0, w0[0], selector);
8485 w2[2] = 0;
8486 w2[1] = 0;
8487 w2[0] = 0;
8488 w1[3] = 0;
8489 w1[2] = 0;
8490 w1[1] = 0;
8491 w1[0] = 0;
8492 w0[3] = 0;
8493 w0[2] = 0;
8494 w0[1] = 0;
8495 w0[0] = 0;
8496
8497 break;
8498
8499 case 12:
8500 w3[1] = __byte_perm_S (w0[0], w0[1], selector);
8501 w3[0] = __byte_perm_S ( 0, w0[0], selector);
8502 w2[3] = 0;
8503 w2[2] = 0;
8504 w2[1] = 0;
8505 w2[0] = 0;
8506 w1[3] = 0;
8507 w1[2] = 0;
8508 w1[1] = 0;
8509 w1[0] = 0;
8510 w0[3] = 0;
8511 w0[2] = 0;
8512 w0[1] = 0;
8513 w0[0] = 0;
8514
8515 break;
8516
8517 case 13:
8518 w3[1] = __byte_perm_S ( 0, w0[0], selector);
8519 w3[0] = 0;
8520 w2[3] = 0;
8521 w2[2] = 0;
8522 w2[1] = 0;
8523 w2[0] = 0;
8524 w1[3] = 0;
8525 w1[2] = 0;
8526 w1[1] = 0;
8527 w1[0] = 0;
8528 w0[3] = 0;
8529 w0[2] = 0;
8530 w0[1] = 0;
8531 w0[0] = 0;
8532
8533 break;
8534 }
8535 #endif
8536 }
8537
/**
 * Shift the 16-word buffer w0[0..3], w1[0..3], w2[0..3], w3[0..3] (handled as
 * one word sequence in that order) towards higher word indices, in place.
 *
 * offset / 4 selects the case. In case N, word N is built from w0[0] alone,
 * word N + 1 from w0[0] and w0[1], and so on upwards; all words below index N
 * are set to 0. Every destination word is produced by one helper call that
 * merges two neighbouring source words: amd_bytealign_S with the full
 * `offset` in the AMD/generic build, __byte_perm_S with a selector derived
 * from `offset & 3` in the NV build.
 * NOTE(review): presumably this is a byte-granular move of big-endian packed
 * data ("_be") by `offset` bytes -- confirm against the definitions of
 * amd_bytealign_S / __byte_perm_S and against the callers.
 *
 * Inside each case the words are assigned from the highest index down to the
 * lowest, so every source word is read before it gets overwritten. The
 * statement order therefore must not be changed.
 *
 * Only cases 0..13 exist and there is no default label: for offset / 4 >= 14
 * the buffers are left untouched. w3[3] is never written. The AMD/generic
 * build also writes w3[2]; the NV build leaves w3[2] as it was.
 */
8538 inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
8539 {
8540 #if defined IS_AMD || defined IS_GENERIC
/* AMD/generic build: the first helper argument is the lower-indexed source word, the second the higher-indexed one. */
8541 switch (offset / 4)
8542 {
8543 case 0:
8544 w3[2] = amd_bytealign_S (w3[1], 0, offset);
8545 w3[1] = amd_bytealign_S (w3[0], w3[1], offset);
8546 w3[0] = amd_bytealign_S (w2[3], w3[0], offset);
8547 w2[3] = amd_bytealign_S (w2[2], w2[3], offset);
8548 w2[2] = amd_bytealign_S (w2[1], w2[2], offset);
8549 w2[1] = amd_bytealign_S (w2[0], w2[1], offset);
8550 w2[0] = amd_bytealign_S (w1[3], w2[0], offset);
8551 w1[3] = amd_bytealign_S (w1[2], w1[3], offset);
8552 w1[2] = amd_bytealign_S (w1[1], w1[2], offset);
8553 w1[1] = amd_bytealign_S (w1[0], w1[1], offset);
8554 w1[0] = amd_bytealign_S (w0[3], w1[0], offset);
8555 w0[3] = amd_bytealign_S (w0[2], w0[3], offset);
8556 w0[2] = amd_bytealign_S (w0[1], w0[2], offset);
8557 w0[1] = amd_bytealign_S (w0[0], w0[1], offset);
8558 w0[0] = amd_bytealign_S ( 0, w0[0], offset);
8559 break;
8560
8561 case 1:
8562 w3[2] = amd_bytealign_S (w3[0], 0, offset);
8563 w3[1] = amd_bytealign_S (w2[3], w3[0], offset);
8564 w3[0] = amd_bytealign_S (w2[2], w2[3], offset);
8565 w2[3] = amd_bytealign_S (w2[1], w2[2], offset);
8566 w2[2] = amd_bytealign_S (w2[0], w2[1], offset);
8567 w2[1] = amd_bytealign_S (w1[3], w2[0], offset);
8568 w2[0] = amd_bytealign_S (w1[2], w1[3], offset);
8569 w1[3] = amd_bytealign_S (w1[1], w1[2], offset);
8570 w1[2] = amd_bytealign_S (w1[0], w1[1], offset);
8571 w1[1] = amd_bytealign_S (w0[3], w1[0], offset);
8572 w1[0] = amd_bytealign_S (w0[2], w0[3], offset);
8573 w0[3] = amd_bytealign_S (w0[1], w0[2], offset);
8574 w0[2] = amd_bytealign_S (w0[0], w0[1], offset);
8575 w0[1] = amd_bytealign_S ( 0, w0[0], offset);
8576 w0[0] = 0;
8577 break;
8578
8579 case 2:
8580 w3[2] = amd_bytealign_S (w2[3], 0, offset);
8581 w3[1] = amd_bytealign_S (w2[2], w2[3], offset);
8582 w3[0] = amd_bytealign_S (w2[1], w2[2], offset);
8583 w2[3] = amd_bytealign_S (w2[0], w2[1], offset);
8584 w2[2] = amd_bytealign_S (w1[3], w2[0], offset);
8585 w2[1] = amd_bytealign_S (w1[2], w1[3], offset);
8586 w2[0] = amd_bytealign_S (w1[1], w1[2], offset);
8587 w1[3] = amd_bytealign_S (w1[0], w1[1], offset);
8588 w1[2] = amd_bytealign_S (w0[3], w1[0], offset);
8589 w1[1] = amd_bytealign_S (w0[2], w0[3], offset);
8590 w1[0] = amd_bytealign_S (w0[1], w0[2], offset);
8591 w0[3] = amd_bytealign_S (w0[0], w0[1], offset);
8592 w0[2] = amd_bytealign_S ( 0, w0[0], offset);
8593 w0[1] = 0;
8594 w0[0] = 0;
8595 break;
8596
8597 case 3:
8598 w3[2] = amd_bytealign_S (w2[2], 0, offset);
8599 w3[1] = amd_bytealign_S (w2[1], w2[2], offset);
8600 w3[0] = amd_bytealign_S (w2[0], w2[1], offset);
8601 w2[3] = amd_bytealign_S (w1[3], w2[0], offset);
8602 w2[2] = amd_bytealign_S (w1[2], w1[3], offset);
8603 w2[1] = amd_bytealign_S (w1[1], w1[2], offset);
8604 w2[0] = amd_bytealign_S (w1[0], w1[1], offset);
8605 w1[3] = amd_bytealign_S (w0[3], w1[0], offset);
8606 w1[2] = amd_bytealign_S (w0[2], w0[3], offset);
8607 w1[1] = amd_bytealign_S (w0[1], w0[2], offset);
8608 w1[0] = amd_bytealign_S (w0[0], w0[1], offset);
8609 w0[3] = amd_bytealign_S ( 0, w0[0], offset);
8610 w0[2] = 0;
8611 w0[1] = 0;
8612 w0[0] = 0;
8613 break;
8614
8615 case 4:
8616 w3[2] = amd_bytealign_S (w2[1], 0, offset);
8617 w3[1] = amd_bytealign_S (w2[0], w2[1], offset);
8618 w3[0] = amd_bytealign_S (w1[3], w2[0], offset);
8619 w2[3] = amd_bytealign_S (w1[2], w1[3], offset);
8620 w2[2] = amd_bytealign_S (w1[1], w1[2], offset);
8621 w2[1] = amd_bytealign_S (w1[0], w1[1], offset);
8622 w2[0] = amd_bytealign_S (w0[3], w1[0], offset);
8623 w1[3] = amd_bytealign_S (w0[2], w0[3], offset);
8624 w1[2] = amd_bytealign_S (w0[1], w0[2], offset);
8625 w1[1] = amd_bytealign_S (w0[0], w0[1], offset);
8626 w1[0] = amd_bytealign_S ( 0, w0[0], offset);
8627 w0[3] = 0;
8628 w0[2] = 0;
8629 w0[1] = 0;
8630 w0[0] = 0;
8631 break;
8632
8633 case 5:
8634 w3[2] = amd_bytealign_S (w2[0], 0, offset);
8635 w3[1] = amd_bytealign_S (w1[3], w2[0], offset);
8636 w3[0] = amd_bytealign_S (w1[2], w1[3], offset);
8637 w2[3] = amd_bytealign_S (w1[1], w1[2], offset);
8638 w2[2] = amd_bytealign_S (w1[0], w1[1], offset);
8639 w2[1] = amd_bytealign_S (w0[3], w1[0], offset);
8640 w2[0] = amd_bytealign_S (w0[2], w0[3], offset);
8641 w1[3] = amd_bytealign_S (w0[1], w0[2], offset);
8642 w1[2] = amd_bytealign_S (w0[0], w0[1], offset);
8643 w1[1] = amd_bytealign_S ( 0, w0[0], offset);
8644 w1[0] = 0;
8645 w0[3] = 0;
8646 w0[2] = 0;
8647 w0[1] = 0;
8648 w0[0] = 0;
8649 break;
8650
8651 case 6:
8652 w3[2] = amd_bytealign_S (w1[3], 0, offset);
8653 w3[1] = amd_bytealign_S (w1[2], w1[3], offset);
8654 w3[0] = amd_bytealign_S (w1[1], w1[2], offset);
8655 w2[3] = amd_bytealign_S (w1[0], w1[1], offset);
8656 w2[2] = amd_bytealign_S (w0[3], w1[0], offset);
8657 w2[1] = amd_bytealign_S (w0[2], w0[3], offset);
8658 w2[0] = amd_bytealign_S (w0[1], w0[2], offset);
8659 w1[3] = amd_bytealign_S (w0[0], w0[1], offset);
8660 w1[2] = amd_bytealign_S ( 0, w0[0], offset);
8661 w1[1] = 0;
8662 w1[0] = 0;
8663 w0[3] = 0;
8664 w0[2] = 0;
8665 w0[1] = 0;
8666 w0[0] = 0;
8667 break;
8668
8669 case 7:
8670 w3[2] = amd_bytealign_S (w1[2], 0, offset);
8671 w3[1] = amd_bytealign_S (w1[1], w1[2], offset);
8672 w3[0] = amd_bytealign_S (w1[0], w1[1], offset);
8673 w2[3] = amd_bytealign_S (w0[3], w1[0], offset);
8674 w2[2] = amd_bytealign_S (w0[2], w0[3], offset);
8675 w2[1] = amd_bytealign_S (w0[1], w0[2], offset);
8676 w2[0] = amd_bytealign_S (w0[0], w0[1], offset);
8677 w1[3] = amd_bytealign_S ( 0, w0[0], offset);
8678 w1[2] = 0;
8679 w1[1] = 0;
8680 w1[0] = 0;
8681 w0[3] = 0;
8682 w0[2] = 0;
8683 w0[1] = 0;
8684 w0[0] = 0;
8685 break;
8686
8687 case 8:
8688 w3[2] = amd_bytealign_S (w1[1], 0, offset);
8689 w3[1] = amd_bytealign_S (w1[0], w1[1], offset);
8690 w3[0] = amd_bytealign_S (w0[3], w1[0], offset);
8691 w2[3] = amd_bytealign_S (w0[2], w0[3], offset);
8692 w2[2] = amd_bytealign_S (w0[1], w0[2], offset);
8693 w2[1] = amd_bytealign_S (w0[0], w0[1], offset);
8694 w2[0] = amd_bytealign_S ( 0, w0[0], offset);
8695 w1[3] = 0;
8696 w1[2] = 0;
8697 w1[1] = 0;
8698 w1[0] = 0;
8699 w0[3] = 0;
8700 w0[2] = 0;
8701 w0[1] = 0;
8702 w0[0] = 0;
8703 break;
8704
8705 case 9:
8706 w3[2] = amd_bytealign_S (w1[0], 0, offset);
8707 w3[1] = amd_bytealign_S (w0[3], w1[0], offset);
8708 w3[0] = amd_bytealign_S (w0[2], w0[3], offset);
8709 w2[3] = amd_bytealign_S (w0[1], w0[2], offset);
8710 w2[2] = amd_bytealign_S (w0[0], w0[1], offset);
8711 w2[1] = amd_bytealign_S ( 0, w0[0], offset);
8712 w2[0] = 0;
8713 w1[3] = 0;
8714 w1[2] = 0;
8715 w1[1] = 0;
8716 w1[0] = 0;
8717 w0[3] = 0;
8718 w0[2] = 0;
8719 w0[1] = 0;
8720 w0[0] = 0;
8721 break;
8722
8723 case 10:
8724 w3[2] = amd_bytealign_S (w0[3], 0, offset);
8725 w3[1] = amd_bytealign_S (w0[2], w0[3], offset);
8726 w3[0] = amd_bytealign_S (w0[1], w0[2], offset);
8727 w2[3] = amd_bytealign_S (w0[0], w0[1], offset);
8728 w2[2] = amd_bytealign_S ( 0, w0[0], offset);
8729 w2[1] = 0;
8730 w2[0] = 0;
8731 w1[3] = 0;
8732 w1[2] = 0;
8733 w1[1] = 0;
8734 w1[0] = 0;
8735 w0[3] = 0;
8736 w0[2] = 0;
8737 w0[1] = 0;
8738 w0[0] = 0;
8739 break;
8740
8741 case 11:
8742 w3[2] = amd_bytealign_S (w0[2], 0, offset);
8743 w3[1] = amd_bytealign_S (w0[1], w0[2], offset);
8744 w3[0] = amd_bytealign_S (w0[0], w0[1], offset);
8745 w2[3] = amd_bytealign_S ( 0, w0[0], offset);
8746 w2[2] = 0;
8747 w2[1] = 0;
8748 w2[0] = 0;
8749 w1[3] = 0;
8750 w1[2] = 0;
8751 w1[1] = 0;
8752 w1[0] = 0;
8753 w0[3] = 0;
8754 w0[2] = 0;
8755 w0[1] = 0;
8756 w0[0] = 0;
8757 break;
8758
8759 case 12:
8760 w3[2] = amd_bytealign_S (w0[1], 0, offset);
8761 w3[1] = amd_bytealign_S (w0[0], w0[1], offset);
8762 w3[0] = amd_bytealign_S ( 0, w0[0], offset);
8763 w2[3] = 0;
8764 w2[2] = 0;
8765 w2[1] = 0;
8766 w2[0] = 0;
8767 w1[3] = 0;
8768 w1[2] = 0;
8769 w1[1] = 0;
8770 w1[0] = 0;
8771 w0[3] = 0;
8772 w0[2] = 0;
8773 w0[1] = 0;
8774 w0[0] = 0;
8775 break;
8776
8777 case 13:
8778 w3[2] = amd_bytealign_S (w0[0], 0, offset);
8779 w3[1] = amd_bytealign_S ( 0, w0[0], offset);
8780 w3[0] = 0;
8781 w2[3] = 0;
8782 w2[2] = 0;
8783 w2[1] = 0;
8784 w2[0] = 0;
8785 w1[3] = 0;
8786 w1[2] = 0;
8787 w1[1] = 0;
8788 w1[0] = 0;
8789 w0[3] = 0;
8790 w0[2] = 0;
8791 w0[1] = 0;
8792 w0[0] = 0;
8793 break;
8794 }
8795 #endif
8796
8797 #ifdef IS_NV
/*
 * NV build: offset & 3 = 0, 1, 2, 3 yields selector 0x3210, 0x4321, 0x5432,
 * 0x6543. The helper arguments are in the opposite order to the
 * amd_bytealign_S calls above (higher-indexed source word first), and the
 * highest word written is w3[1].
 */
8798 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
8799
8800 switch (offset / 4)
8801 {
8802 case 0:
8803 w3[1] = __byte_perm_S (w3[1], w3[0], selector);
8804 w3[0] = __byte_perm_S (w3[0], w2[3], selector);
8805 w2[3] = __byte_perm_S (w2[3], w2[2], selector);
8806 w2[2] = __byte_perm_S (w2[2], w2[1], selector);
8807 w2[1] = __byte_perm_S (w2[1], w2[0], selector);
8808 w2[0] = __byte_perm_S (w2[0], w1[3], selector);
8809 w1[3] = __byte_perm_S (w1[3], w1[2], selector);
8810 w1[2] = __byte_perm_S (w1[2], w1[1], selector);
8811 w1[1] = __byte_perm_S (w1[1], w1[0], selector);
8812 w1[0] = __byte_perm_S (w1[0], w0[3], selector);
8813 w0[3] = __byte_perm_S (w0[3], w0[2], selector);
8814 w0[2] = __byte_perm_S (w0[2], w0[1], selector);
8815 w0[1] = __byte_perm_S (w0[1], w0[0], selector);
8816 w0[0] = __byte_perm_S (w0[0], 0, selector);
8817 break;
8818
8819 case 1:
8820 w3[1] = __byte_perm_S (w3[0], w2[3], selector);
8821 w3[0] = __byte_perm_S (w2[3], w2[2], selector);
8822 w2[3] = __byte_perm_S (w2[2], w2[1], selector);
8823 w2[2] = __byte_perm_S (w2[1], w2[0], selector);
8824 w2[1] = __byte_perm_S (w2[0], w1[3], selector);
8825 w2[0] = __byte_perm_S (w1[3], w1[2], selector);
8826 w1[3] = __byte_perm_S (w1[2], w1[1], selector);
8827 w1[2] = __byte_perm_S (w1[1], w1[0], selector);
8828 w1[1] = __byte_perm_S (w1[0], w0[3], selector);
8829 w1[0] = __byte_perm_S (w0[3], w0[2], selector);
8830 w0[3] = __byte_perm_S (w0[2], w0[1], selector);
8831 w0[2] = __byte_perm_S (w0[1], w0[0], selector);
8832 w0[1] = __byte_perm_S (w0[0], 0, selector);
8833 w0[0] = 0;
8834 break;
8835
8836 case 2:
8837 w3[1] = __byte_perm_S (w2[3], w2[2], selector);
8838 w3[0] = __byte_perm_S (w2[2], w2[1], selector);
8839 w2[3] = __byte_perm_S (w2[1], w2[0], selector);
8840 w2[2] = __byte_perm_S (w2[0], w1[3], selector);
8841 w2[1] = __byte_perm_S (w1[3], w1[2], selector);
8842 w2[0] = __byte_perm_S (w1[2], w1[1], selector);
8843 w1[3] = __byte_perm_S (w1[1], w1[0], selector);
8844 w1[2] = __byte_perm_S (w1[0], w0[3], selector);
8845 w1[1] = __byte_perm_S (w0[3], w0[2], selector);
8846 w1[0] = __byte_perm_S (w0[2], w0[1], selector);
8847 w0[3] = __byte_perm_S (w0[1], w0[0], selector);
8848 w0[2] = __byte_perm_S (w0[0], 0, selector);
8849 w0[1] = 0;
8850 w0[0] = 0;
8851 break;
8852
8853 case 3:
8854 w3[1] = __byte_perm_S (w2[2], w2[1], selector);
8855 w3[0] = __byte_perm_S (w2[1], w2[0], selector);
8856 w2[3] = __byte_perm_S (w2[0], w1[3], selector);
8857 w2[2] = __byte_perm_S (w1[3], w1[2], selector);
8858 w2[1] = __byte_perm_S (w1[2], w1[1], selector);
8859 w2[0] = __byte_perm_S (w1[1], w1[0], selector);
8860 w1[3] = __byte_perm_S (w1[0], w0[3], selector);
8861 w1[2] = __byte_perm_S (w0[3], w0[2], selector);
8862 w1[1] = __byte_perm_S (w0[2], w0[1], selector);
8863 w1[0] = __byte_perm_S (w0[1], w0[0], selector);
8864 w0[3] = __byte_perm_S (w0[0], 0, selector);
8865 w0[2] = 0;
8866 w0[1] = 0;
8867 w0[0] = 0;
8868 break;
8869
8870 case 4:
8871 w3[1] = __byte_perm_S (w2[1], w2[0], selector);
8872 w3[0] = __byte_perm_S (w2[0], w1[3], selector);
8873 w2[3] = __byte_perm_S (w1[3], w1[2], selector);
8874 w2[2] = __byte_perm_S (w1[2], w1[1], selector);
8875 w2[1] = __byte_perm_S (w1[1], w1[0], selector);
8876 w2[0] = __byte_perm_S (w1[0], w0[3], selector);
8877 w1[3] = __byte_perm_S (w0[3], w0[2], selector);
8878 w1[2] = __byte_perm_S (w0[2], w0[1], selector);
8879 w1[1] = __byte_perm_S (w0[1], w0[0], selector);
8880 w1[0] = __byte_perm_S (w0[0], 0, selector);
8881 w0[3] = 0;
8882 w0[2] = 0;
8883 w0[1] = 0;
8884 w0[0] = 0;
8885 break;
8886
8887 case 5:
8888 w3[1] = __byte_perm_S (w2[0], w1[3], selector);
8889 w3[0] = __byte_perm_S (w1[3], w1[2], selector);
8890 w2[3] = __byte_perm_S (w1[2], w1[1], selector);
8891 w2[2] = __byte_perm_S (w1[1], w1[0], selector);
8892 w2[1] = __byte_perm_S (w1[0], w0[3], selector);
8893 w2[0] = __byte_perm_S (w0[3], w0[2], selector);
8894 w1[3] = __byte_perm_S (w0[2], w0[1], selector);
8895 w1[2] = __byte_perm_S (w0[1], w0[0], selector);
8896 w1[1] = __byte_perm_S (w0[0], 0, selector);
8897 w1[0] = 0;
8898 w0[3] = 0;
8899 w0[2] = 0;
8900 w0[1] = 0;
8901 w0[0] = 0;
8902 break;
8903
8904 case 6:
8905 w3[1] = __byte_perm_S (w1[3], w1[2], selector);
8906 w3[0] = __byte_perm_S (w1[2], w1[1], selector);
8907 w2[3] = __byte_perm_S (w1[1], w1[0], selector);
8908 w2[2] = __byte_perm_S (w1[0], w0[3], selector);
8909 w2[1] = __byte_perm_S (w0[3], w0[2], selector);
8910 w2[0] = __byte_perm_S (w0[2], w0[1], selector);
8911 w1[3] = __byte_perm_S (w0[1], w0[0], selector);
8912 w1[2] = __byte_perm_S (w0[0], 0, selector);
8913 w1[1] = 0;
8914 w1[0] = 0;
8915 w0[3] = 0;
8916 w0[2] = 0;
8917 w0[1] = 0;
8918 w0[0] = 0;
8919 break;
8920
8921 case 7:
8922 w3[1] = __byte_perm_S (w1[2], w1[1], selector);
8923 w3[0] = __byte_perm_S (w1[1], w1[0], selector);
8924 w2[3] = __byte_perm_S (w1[0], w0[3], selector);
8925 w2[2] = __byte_perm_S (w0[3], w0[2], selector);
8926 w2[1] = __byte_perm_S (w0[2], w0[1], selector);
8927 w2[0] = __byte_perm_S (w0[1], w0[0], selector);
8928 w1[3] = __byte_perm_S (w0[0], 0, selector);
8929 w1[2] = 0;
8930 w1[1] = 0;
8931 w1[0] = 0;
8932 w0[3] = 0;
8933 w0[2] = 0;
8934 w0[1] = 0;
8935 w0[0] = 0;
8936 break;
8937
8938 case 8:
8939 w3[1] = __byte_perm_S (w1[1], w1[0], selector);
8940 w3[0] = __byte_perm_S (w1[0], w0[3], selector);
8941 w2[3] = __byte_perm_S (w0[3], w0[2], selector);
8942 w2[2] = __byte_perm_S (w0[2], w0[1], selector);
8943 w2[1] = __byte_perm_S (w0[1], w0[0], selector);
8944 w2[0] = __byte_perm_S (w0[0], 0, selector);
8945 w1[3] = 0;
8946 w1[2] = 0;
8947 w1[1] = 0;
8948 w1[0] = 0;
8949 w0[3] = 0;
8950 w0[2] = 0;
8951 w0[1] = 0;
8952 w0[0] = 0;
8953 break;
8954
8955 case 9:
8956 w3[1] = __byte_perm_S (w1[0], w0[3], selector);
8957 w3[0] = __byte_perm_S (w0[3], w0[2], selector);
8958 w2[3] = __byte_perm_S (w0[2], w0[1], selector);
8959 w2[2] = __byte_perm_S (w0[1], w0[0], selector);
8960 w2[1] = __byte_perm_S (w0[0], 0, selector);
8961 w2[0] = 0;
8962 w1[3] = 0;
8963 w1[2] = 0;
8964 w1[1] = 0;
8965 w1[0] = 0;
8966 w0[3] = 0;
8967 w0[2] = 0;
8968 w0[1] = 0;
8969 w0[0] = 0;
8970 break;
8971
8972 case 10:
8973 w3[1] = __byte_perm_S (w0[3], w0[2], selector);
8974 w3[0] = __byte_perm_S (w0[2], w0[1], selector);
8975 w2[3] = __byte_perm_S (w0[1], w0[0], selector);
8976 w2[2] = __byte_perm_S (w0[0], 0, selector);
8977 w2[1] = 0;
8978 w2[0] = 0;
8979 w1[3] = 0;
8980 w1[2] = 0;
8981 w1[1] = 0;
8982 w1[0] = 0;
8983 w0[3] = 0;
8984 w0[2] = 0;
8985 w0[1] = 0;
8986 w0[0] = 0;
8987 break;
8988
8989 case 11:
8990 w3[1] = __byte_perm_S (w0[2], w0[1], selector);
8991 w3[0] = __byte_perm_S (w0[1], w0[0], selector);
8992 w2[3] = __byte_perm_S (w0[0], 0, selector);
8993 w2[2] = 0;
8994 w2[1] = 0;
8995 w2[0] = 0;
8996 w1[3] = 0;
8997 w1[2] = 0;
8998 w1[1] = 0;
8999 w1[0] = 0;
9000 w0[3] = 0;
9001 w0[2] = 0;
9002 w0[1] = 0;
9003 w0[0] = 0;
9004 break;
9005
9006 case 12:
9007 w3[1] = __byte_perm_S (w0[1], w0[0], selector);
9008 w3[0] = __byte_perm_S (w0[0], 0, selector);
9009 w2[3] = 0;
9010 w2[2] = 0;
9011 w2[1] = 0;
9012 w2[0] = 0;
9013 w1[3] = 0;
9014 w1[2] = 0;
9015 w1[1] = 0;
9016 w1[0] = 0;
9017 w0[3] = 0;
9018 w0[2] = 0;
9019 w0[1] = 0;
9020 w0[0] = 0;
9021 break;
9022
9023 case 13:
9024 w3[1] = __byte_perm_S (w0[0], 0, selector);
9025 w3[0] = 0;
9026 w2[3] = 0;
9027 w2[2] = 0;
9028 w2[1] = 0;
9029 w2[0] = 0;
9030 w1[3] = 0;
9031 w1[2] = 0;
9032 w1[1] = 0;
9033 w1[0] = 0;
9034 w0[3] = 0;
9035 w0[2] = 0;
9036 w0[1] = 0;
9037 w0[0] = 0;
9038 break;
9039 }
9040 #endif
9041 }
9042
9043 /**
9044 * vector functions on scalar types (for inner loop usage)
9045 */
9046
/**
 * Lane pack / unpack helpers.
 *
 * PACKVS<n> (sn, vn, e)  copies component .s<e> of vn[0..n-1] into sn[0..n-1].
 * PACKSV<n> (sn, vn, e)  copies sn[0..n-1] back into component .s<e> of
 *                        vn[0..n-1].
 * PACKVS24 / PACKSV24    apply the 4-element form to two array pairs.
 * PACKVS44 / PACKSV44    apply the 4-element form to four array pairs.
 *
 * `e` is token-pasted onto `.s`, so it has to be a literal component suffix
 * (0, 1, ...), not an expression or a variable.
 *
 * Every macro expands to exactly one statement (do { ... } while (0)) and is
 * invoked with a trailing semicolon. That keeps all assignments together when
 * the macro is the body of an unbraced if / else / loop; as a plain sequence
 * of statements only the first assignment would be guarded.
 */

#define PACKVS2(sn,vn,e)              \
  do                                  \
  {                                   \
    (sn)[0] = (vn)[0].s##e;           \
    (sn)[1] = (vn)[1].s##e;           \
  } while (0)

#define PACKSV2(sn,vn,e)              \
  do                                  \
  {                                   \
    (vn)[0].s##e = (sn)[0];           \
    (vn)[1].s##e = (sn)[1];           \
  } while (0)

#define PACKVS4(sn,vn,e)              \
  do                                  \
  {                                   \
    (sn)[0] = (vn)[0].s##e;           \
    (sn)[1] = (vn)[1].s##e;           \
    (sn)[2] = (vn)[2].s##e;           \
    (sn)[3] = (vn)[3].s##e;           \
  } while (0)

#define PACKSV4(sn,vn,e)              \
  do                                  \
  {                                   \
    (vn)[0].s##e = (sn)[0];           \
    (vn)[1].s##e = (sn)[1];           \
    (vn)[2].s##e = (sn)[2];           \
    (vn)[3].s##e = (sn)[3];           \
  } while (0)

#define PACKVS24(s0,s1,v0,v1,e)       \
  do                                  \
  {                                   \
    PACKVS4 (s0, v0, e);              \
    PACKVS4 (s1, v1, e);              \
  } while (0)

#define PACKSV24(s0,s1,v0,v1,e)       \
  do                                  \
  {                                   \
    PACKSV4 (s0, v0, e);              \
    PACKSV4 (s1, v1, e);              \
  } while (0)

#define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  do                                  \
  {                                   \
    PACKVS4 (s0, v0, e);              \
    PACKVS4 (s1, v1, e);              \
    PACKVS4 (s2, v2, e);              \
    PACKVS4 (s3, v3, e);              \
  } while (0)

#define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
  do                                  \
  {                                   \
    PACKSV4 (s0, v0, e);              \
    PACKSV4 (s1, v1, e);              \
    PACKSV4 (s2, v2, e);              \
    PACKSV4 (s3, v3, e);              \
  } while (0)
9086
9087 inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
9088 {
9089 #if VECT_SIZE == 1
9090
9091 switch_buffer_by_offset_le_S (w0, w1, w2, w3, offset);
9092
9093 #else
9094
9095 u32 t0[4];
9096 u32 t1[4];
9097 u32 t2[4];
9098 u32 t3[4];
9099
9100 #endif
9101
9102 #if VECT_SIZE == 2
9103
9104 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9105 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9106
9107 #elif VECT_SIZE == 4
9108
9109 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9110 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9111 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9112 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9113
9114 #elif VECT_SIZE == 8
9115
9116 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9117 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9118 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9119 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9120 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9121 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9122 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9123 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9124
9125 #elif VECT_SIZE == 16
9126
9127 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9128 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9129 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9130 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9131 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9132 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9133 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9134 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9135 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
9136 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
9137 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
9138 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
9139 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
9140 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
9141 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
9142 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
9143
9144 #endif
9145 }
9146
9147 inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
9148 {
9149 #if VECT_SIZE == 1
9150
9151 append_0x01_2x4_S (w0, w1, offset);
9152
9153 #else
9154
9155 u32 t0[4];
9156 u32 t1[4];
9157
9158 #endif
9159
9160 #if VECT_SIZE == 2
9161
9162 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9163 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9164
9165 #elif VECT_SIZE == 4
9166
9167 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9168 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9169 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9170 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9171
9172 #elif VECT_SIZE == 8
9173
9174 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9175 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9176 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9177 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9178 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9179 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9180 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9181 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9182
9183 #elif VECT_SIZE == 16
9184
9185 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9186 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9187 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9188 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9189 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9190 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9191 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9192 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9193 PACKVS24 (t0, t1, w0, w1, 8); append_0x01_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
9194 PACKVS24 (t0, t1, w0, w1, 9); append_0x01_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
9195 PACKVS24 (t0, t1, w0, w1, a); append_0x01_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
9196 PACKVS24 (t0, t1, w0, w1, b); append_0x01_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
9197 PACKVS24 (t0, t1, w0, w1, c); append_0x01_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
9198 PACKVS24 (t0, t1, w0, w1, d); append_0x01_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
9199 PACKVS24 (t0, t1, w0, w1, e); append_0x01_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
9200 PACKVS24 (t0, t1, w0, w1, f); append_0x01_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
9201
9202 #endif
9203 }
9204
9205 inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
9206 {
9207 #if VECT_SIZE == 1
9208
9209 append_0x80_2x4_S (w0, w1, offset);
9210
9211 #else
9212
9213 u32 t0[4];
9214 u32 t1[4];
9215
9216 #endif
9217
9218 #if VECT_SIZE == 2
9219
9220 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9221 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9222
9223 #elif VECT_SIZE == 4
9224
9225 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9226 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9227 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9228 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9229
9230 #elif VECT_SIZE == 8
9231
9232 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9233 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9234 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9235 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9236 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9237 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9238 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9239 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9240
9241 #elif VECT_SIZE == 16
9242
9243 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9244 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9245 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9246 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9247 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9248 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9249 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9250 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9251 PACKVS24 (t0, t1, w0, w1, 8); append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
9252 PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
9253 PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
9254 PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
9255 PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
9256 PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
9257 PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
9258 PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
9259
9260 #endif
9261 }
9262
9263 inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
9264 {
9265 #if VECT_SIZE == 1
9266
9267 append_0x80_4x4_S (w0, w1, w2, w3, offset);
9268
9269 #else
9270
9271 u32 t0[4];
9272 u32 t1[4];
9273 u32 t2[4];
9274 u32 t3[4];
9275
9276 #endif
9277
9278 #if VECT_SIZE == 2
9279
9280 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9281 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9282
9283 #elif VECT_SIZE == 4
9284
9285 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9286 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9287 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9288 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9289
9290 #elif VECT_SIZE == 8
9291
9292 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9293 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9294 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9295 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9296 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9297 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9298 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9299 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9300
9301 #elif VECT_SIZE == 16
9302
9303 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9304 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9305 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9306 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9307 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9308 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9309 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9310 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9311 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x80_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
9312 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x80_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
9313 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x80_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
9314 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x80_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
9315 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x80_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
9316 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x80_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
9317 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x80_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
9318 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x80_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
9319
9320 #endif
9321 }