Reduce use of mux_display to a minimum
[hashcat.git] / OpenCL / common.c
1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
6 /**
7 * pure scalar functions
8 */
9
10 inline int hash_comp (const u32 d1[4], __global u32 *d2)
11 {
12 if (d1[3] > d2[DGST_R3]) return ( 1);
13 if (d1[3] < d2[DGST_R3]) return (-1);
14 if (d1[2] > d2[DGST_R2]) return ( 1);
15 if (d1[2] < d2[DGST_R2]) return (-1);
16 if (d1[1] > d2[DGST_R1]) return ( 1);
17 if (d1[1] < d2[DGST_R1]) return (-1);
18 if (d1[0] > d2[DGST_R0]) return ( 1);
19 if (d1[0] < d2[DGST_R0]) return (-1);
20
21 return (0);
22 }
23
24 inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global digest_t *digests_buf)
25 {
26 for (u32 l = 0, r = digests_cnt; r; r >>= 1)
27 {
28 const u32 m = r >> 1;
29
30 const u32 c = l + m;
31
32 const int cmp = hash_comp (digest, digests_buf[c].digest_buf);
33
34 if (cmp > 0)
35 {
36 l += m + 1;
37
38 r--;
39 }
40
41 if (cmp == 0) return (c);
42 }
43
44 return (-1);
45 }
46
47 inline u32 check_bitmap (__global u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
48 {
49 return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f)));
50 }
51
52 inline u32 check (const u32 digest[2], __global u32 *bitmap_s1_a, __global u32 *bitmap_s1_b, __global u32 *bitmap_s1_c, __global u32 *bitmap_s1_d, __global u32 *bitmap_s2_a, __global u32 *bitmap_s2_b, __global u32 *bitmap_s2_c, __global u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
53 {
54 if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0);
55 if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0);
56 if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0);
57 if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0);
58
59 if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0);
60 if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0);
61 if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0);
62 if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0);
63
64 return (1);
65 }
66
67 inline void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const int salt_pos, const int digest_pos, const int hash_pos, const u32 gid, const u32 il_pos)
68 {
69 const u32 idx = atomic_inc (d_result);
70
71 plains_buf[idx].salt_pos = salt_pos;
72 plains_buf[idx].digest_pos = digest_pos; // relative
73 plains_buf[idx].hash_pos = hash_pos; // absolute
74 plains_buf[idx].gidvid = gid;
75 plains_buf[idx].il_pos = il_pos;
76 }
77
78 /**
79 * vector functions
80 */
81
82 inline void truncate_block (u32x w[4], const u32 len)
83 {
84 switch (len)
85 {
86 case 0: w[0] &= 0;
87 w[1] &= 0;
88 w[2] &= 0;
89 w[3] &= 0;
90 break;
91 case 1: w[0] &= 0x000000FF;
92 w[1] &= 0;
93 w[2] &= 0;
94 w[3] &= 0;
95 break;
96 case 2: w[0] &= 0x0000FFFF;
97 w[1] &= 0;
98 w[2] &= 0;
99 w[3] &= 0;
100 break;
101 case 3: w[0] &= 0x00FFFFFF;
102 w[1] &= 0;
103 w[2] &= 0;
104 w[3] &= 0;
105 break;
106 case 4: w[1] &= 0;
107 w[2] &= 0;
108 w[3] &= 0;
109 break;
110 case 5: w[1] &= 0x000000FF;
111 w[2] &= 0;
112 w[3] &= 0;
113 break;
114 case 6: w[1] &= 0x0000FFFF;
115 w[2] &= 0;
116 w[3] &= 0;
117 break;
118 case 7: w[1] &= 0x00FFFFFF;
119 w[2] &= 0;
120 w[3] &= 0;
121 break;
122 case 8: w[2] &= 0;
123 w[3] &= 0;
124 break;
125 case 9: w[2] &= 0x000000FF;
126 w[3] &= 0;
127 break;
128 case 10: w[2] &= 0x0000FFFF;
129 w[3] &= 0;
130 break;
131 case 11: w[2] &= 0x00FFFFFF;
132 w[3] &= 0;
133 break;
134 case 12: w[3] &= 0;
135 break;
136 case 13: w[3] &= 0x000000FF;
137 break;
138 case 14: w[3] &= 0x0000FFFF;
139 break;
140 case 15: w[3] &= 0x00FFFFFF;
141 break;
142 }
143 }
144
/**
 * Widen 16 packed bytes to 16-bit units whose upper byte is zero.
 *
 * Every input word produces two output words: the first holds input bytes
 * 0 and 1 (at bits 0-7 and 16-23), the second holds input bytes 2 and 3.
 * in[0] and in[1] fill out1[0..3]; in[2] and in[3] fill out2[0..3].
 *
 * If none of IS_NV, IS_AMD or IS_GENERIC is defined, nothing is written.
 *
 * @param in   four input words
 * @param out1 receives the widened form of in[0] and in[1]
 * @param out2 receives the widened form of in[2] and in[3]
 */
inline void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4])
{
  // Stores run from the highest output word down, so each in[] word is read
  // before out1[] at the same index is written.
  // NOTE(review): this looks intended to tolerate in == out1 - confirm with
  // callers before reordering these statements.

  #ifdef IS_NV
  // NOTE(review): __byte_perm is expected to yield the same byte layout as
  // the shift/mask variant below - verify against the intrinsic's documentation.
  out2[3] = __byte_perm (in[3], 0, 0x7372);
  out2[2] = __byte_perm (in[3], 0, 0x7170);
  out2[1] = __byte_perm (in[2], 0, 0x7372);
  out2[0] = __byte_perm (in[2], 0, 0x7170);
  out1[3] = __byte_perm (in[1], 0, 0x7372);
  out1[2] = __byte_perm (in[1], 0, 0x7170);
  out1[1] = __byte_perm (in[0], 0, 0x7372);
  out1[0] = __byte_perm (in[0], 0, 0x7170);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  // odd output words: input byte 3 -> bits 16-23, input byte 2 -> bits 0-7
  // even output words: input byte 1 -> bits 16-23, input byte 0 -> bits 0-7
  out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
  out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
  out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
  out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
  out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
  out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
  out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
  out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
  #endif
}
169
/**
 * Narrow 16-bit units back to packed bytes.
 *
 * From every input word only bits 0-7 and 16-23 are used; bits 8-15 and
 * 24-31 are discarded. Two consecutive input words are packed into one
 * output word: in1[0..3] fill out[0] and out[1], in2[0..3] fill out[2]
 * and out[3].
 *
 * If none of IS_NV, IS_AMD or IS_GENERIC is defined, nothing is written.
 *
 * @param in1 first four input words
 * @param in2 last four input words
 * @param out receives the four packed words
 */
inline void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4])
{
  // Stores run from out[0] upwards; each out[i] is written only after the
  // input words with index <= i have been read.
  // NOTE(review): this may be relied upon when out aliases in1 - confirm
  // with callers before reordering.

  #ifdef IS_NV
  // NOTE(review): __byte_perm is expected to select bytes 0 and 2 of each
  // operand, matching the shift/mask variant below - verify against the
  // intrinsic's documentation.
  out[0] = __byte_perm (in1[0], in1[1], 0x6420);
  out[1] = __byte_perm (in1[2], in1[3], 0x6420);
  out[2] = __byte_perm (in2[0], in2[1], 0x6420);
  out[3] = __byte_perm (in2[2], in2[3], 0x6420);
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  // output bytes 0,1 come from the first word of the pair, bytes 2,3 from the second
  out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
         | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
  out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
         | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
  out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
         | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
  out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
         | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
  #endif
}
190
191 inline void append_0x01_1x4 (u32x w0[4], const u32 offset)
192 {
193 switch (offset)
194 {
195 case 0:
196 w0[0] = 0x01;
197 break;
198
199 case 1:
200 w0[0] = w0[0] | 0x0100;
201 break;
202
203 case 2:
204 w0[0] = w0[0] | 0x010000;
205 break;
206
207 case 3:
208 w0[0] = w0[0] | 0x01000000;
209 break;
210
211 case 4:
212 w0[1] = 0x01;
213 break;
214
215 case 5:
216 w0[1] = w0[1] | 0x0100;
217 break;
218
219 case 6:
220 w0[1] = w0[1] | 0x010000;
221 break;
222
223 case 7:
224 w0[1] = w0[1] | 0x01000000;
225 break;
226
227 case 8:
228 w0[2] = 0x01;
229 break;
230
231 case 9:
232 w0[2] = w0[2] | 0x0100;
233 break;
234
235 case 10:
236 w0[2] = w0[2] | 0x010000;
237 break;
238
239 case 11:
240 w0[2] = w0[2] | 0x01000000;
241 break;
242
243 case 12:
244 w0[3] = 0x01;
245 break;
246
247 case 13:
248 w0[3] = w0[3] | 0x0100;
249 break;
250
251 case 14:
252 w0[3] = w0[3] | 0x010000;
253 break;
254
255 case 15:
256 w0[3] = w0[3] | 0x01000000;
257 break;
258 }
259 }
260
261 inline void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
262 {
263 switch (offset)
264 {
265 case 0:
266 w0[0] = 0x01;
267 break;
268
269 case 1:
270 w0[0] = w0[0] | 0x0100;
271 break;
272
273 case 2:
274 w0[0] = w0[0] | 0x010000;
275 break;
276
277 case 3:
278 w0[0] = w0[0] | 0x01000000;
279 break;
280
281 case 4:
282 w0[1] = 0x01;
283 break;
284
285 case 5:
286 w0[1] = w0[1] | 0x0100;
287 break;
288
289 case 6:
290 w0[1] = w0[1] | 0x010000;
291 break;
292
293 case 7:
294 w0[1] = w0[1] | 0x01000000;
295 break;
296
297 case 8:
298 w0[2] = 0x01;
299 break;
300
301 case 9:
302 w0[2] = w0[2] | 0x0100;
303 break;
304
305 case 10:
306 w0[2] = w0[2] | 0x010000;
307 break;
308
309 case 11:
310 w0[2] = w0[2] | 0x01000000;
311 break;
312
313 case 12:
314 w0[3] = 0x01;
315 break;
316
317 case 13:
318 w0[3] = w0[3] | 0x0100;
319 break;
320
321 case 14:
322 w0[3] = w0[3] | 0x010000;
323 break;
324
325 case 15:
326 w0[3] = w0[3] | 0x01000000;
327 break;
328
329 case 16:
330 w1[0] = 0x01;
331 break;
332
333 case 17:
334 w1[0] = w1[0] | 0x0100;
335 break;
336
337 case 18:
338 w1[0] = w1[0] | 0x010000;
339 break;
340
341 case 19:
342 w1[0] = w1[0] | 0x01000000;
343 break;
344
345 case 20:
346 w1[1] = 0x01;
347 break;
348
349 case 21:
350 w1[1] = w1[1] | 0x0100;
351 break;
352
353 case 22:
354 w1[1] = w1[1] | 0x010000;
355 break;
356
357 case 23:
358 w1[1] = w1[1] | 0x01000000;
359 break;
360
361 case 24:
362 w1[2] = 0x01;
363 break;
364
365 case 25:
366 w1[2] = w1[2] | 0x0100;
367 break;
368
369 case 26:
370 w1[2] = w1[2] | 0x010000;
371 break;
372
373 case 27:
374 w1[2] = w1[2] | 0x01000000;
375 break;
376
377 case 28:
378 w1[3] = 0x01;
379 break;
380
381 case 29:
382 w1[3] = w1[3] | 0x0100;
383 break;
384
385 case 30:
386 w1[3] = w1[3] | 0x010000;
387 break;
388
389 case 31:
390 w1[3] = w1[3] | 0x01000000;
391 break;
392 }
393 }
394
395 inline void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
396 {
397 switch (offset)
398 {
399 case 0:
400 w0[0] = 0x01;
401 break;
402
403 case 1:
404 w0[0] = w0[0] | 0x0100;
405 break;
406
407 case 2:
408 w0[0] = w0[0] | 0x010000;
409 break;
410
411 case 3:
412 w0[0] = w0[0] | 0x01000000;
413 break;
414
415 case 4:
416 w0[1] = 0x01;
417 break;
418
419 case 5:
420 w0[1] = w0[1] | 0x0100;
421 break;
422
423 case 6:
424 w0[1] = w0[1] | 0x010000;
425 break;
426
427 case 7:
428 w0[1] = w0[1] | 0x01000000;
429 break;
430
431 case 8:
432 w0[2] = 0x01;
433 break;
434
435 case 9:
436 w0[2] = w0[2] | 0x0100;
437 break;
438
439 case 10:
440 w0[2] = w0[2] | 0x010000;
441 break;
442
443 case 11:
444 w0[2] = w0[2] | 0x01000000;
445 break;
446
447 case 12:
448 w0[3] = 0x01;
449 break;
450
451 case 13:
452 w0[3] = w0[3] | 0x0100;
453 break;
454
455 case 14:
456 w0[3] = w0[3] | 0x010000;
457 break;
458
459 case 15:
460 w0[3] = w0[3] | 0x01000000;
461 break;
462
463 case 16:
464 w1[0] = 0x01;
465 break;
466
467 case 17:
468 w1[0] = w1[0] | 0x0100;
469 break;
470
471 case 18:
472 w1[0] = w1[0] | 0x010000;
473 break;
474
475 case 19:
476 w1[0] = w1[0] | 0x01000000;
477 break;
478
479 case 20:
480 w1[1] = 0x01;
481 break;
482
483 case 21:
484 w1[1] = w1[1] | 0x0100;
485 break;
486
487 case 22:
488 w1[1] = w1[1] | 0x010000;
489 break;
490
491 case 23:
492 w1[1] = w1[1] | 0x01000000;
493 break;
494
495 case 24:
496 w1[2] = 0x01;
497 break;
498
499 case 25:
500 w1[2] = w1[2] | 0x0100;
501 break;
502
503 case 26:
504 w1[2] = w1[2] | 0x010000;
505 break;
506
507 case 27:
508 w1[2] = w1[2] | 0x01000000;
509 break;
510
511 case 28:
512 w1[3] = 0x01;
513 break;
514
515 case 29:
516 w1[3] = w1[3] | 0x0100;
517 break;
518
519 case 30:
520 w1[3] = w1[3] | 0x010000;
521 break;
522
523 case 31:
524 w1[3] = w1[3] | 0x01000000;
525 break;
526
527 case 32:
528 w2[0] = 0x01;
529 break;
530
531 case 33:
532 w2[0] = w2[0] | 0x0100;
533 break;
534
535 case 34:
536 w2[0] = w2[0] | 0x010000;
537 break;
538
539 case 35:
540 w2[0] = w2[0] | 0x01000000;
541 break;
542
543 case 36:
544 w2[1] = 0x01;
545 break;
546
547 case 37:
548 w2[1] = w2[1] | 0x0100;
549 break;
550
551 case 38:
552 w2[1] = w2[1] | 0x010000;
553 break;
554
555 case 39:
556 w2[1] = w2[1] | 0x01000000;
557 break;
558
559 case 40:
560 w2[2] = 0x01;
561 break;
562
563 case 41:
564 w2[2] = w2[2] | 0x0100;
565 break;
566
567 case 42:
568 w2[2] = w2[2] | 0x010000;
569 break;
570
571 case 43:
572 w2[2] = w2[2] | 0x01000000;
573 break;
574
575 case 44:
576 w2[3] = 0x01;
577 break;
578
579 case 45:
580 w2[3] = w2[3] | 0x0100;
581 break;
582
583 case 46:
584 w2[3] = w2[3] | 0x010000;
585 break;
586
587 case 47:
588 w2[3] = w2[3] | 0x01000000;
589 break;
590 }
591 }
592
593 inline void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
594 {
595 switch (offset)
596 {
597 case 0:
598 w0[0] = 0x01;
599 break;
600
601 case 1:
602 w0[0] = w0[0] | 0x0100;
603 break;
604
605 case 2:
606 w0[0] = w0[0] | 0x010000;
607 break;
608
609 case 3:
610 w0[0] = w0[0] | 0x01000000;
611 break;
612
613 case 4:
614 w0[1] = 0x01;
615 break;
616
617 case 5:
618 w0[1] = w0[1] | 0x0100;
619 break;
620
621 case 6:
622 w0[1] = w0[1] | 0x010000;
623 break;
624
625 case 7:
626 w0[1] = w0[1] | 0x01000000;
627 break;
628
629 case 8:
630 w0[2] = 0x01;
631 break;
632
633 case 9:
634 w0[2] = w0[2] | 0x0100;
635 break;
636
637 case 10:
638 w0[2] = w0[2] | 0x010000;
639 break;
640
641 case 11:
642 w0[2] = w0[2] | 0x01000000;
643 break;
644
645 case 12:
646 w0[3] = 0x01;
647 break;
648
649 case 13:
650 w0[3] = w0[3] | 0x0100;
651 break;
652
653 case 14:
654 w0[3] = w0[3] | 0x010000;
655 break;
656
657 case 15:
658 w0[3] = w0[3] | 0x01000000;
659 break;
660
661 case 16:
662 w1[0] = 0x01;
663 break;
664
665 case 17:
666 w1[0] = w1[0] | 0x0100;
667 break;
668
669 case 18:
670 w1[0] = w1[0] | 0x010000;
671 break;
672
673 case 19:
674 w1[0] = w1[0] | 0x01000000;
675 break;
676
677 case 20:
678 w1[1] = 0x01;
679 break;
680
681 case 21:
682 w1[1] = w1[1] | 0x0100;
683 break;
684
685 case 22:
686 w1[1] = w1[1] | 0x010000;
687 break;
688
689 case 23:
690 w1[1] = w1[1] | 0x01000000;
691 break;
692
693 case 24:
694 w1[2] = 0x01;
695 break;
696
697 case 25:
698 w1[2] = w1[2] | 0x0100;
699 break;
700
701 case 26:
702 w1[2] = w1[2] | 0x010000;
703 break;
704
705 case 27:
706 w1[2] = w1[2] | 0x01000000;
707 break;
708
709 case 28:
710 w1[3] = 0x01;
711 break;
712
713 case 29:
714 w1[3] = w1[3] | 0x0100;
715 break;
716
717 case 30:
718 w1[3] = w1[3] | 0x010000;
719 break;
720
721 case 31:
722 w1[3] = w1[3] | 0x01000000;
723 break;
724
725 case 32:
726 w2[0] = 0x01;
727 break;
728
729 case 33:
730 w2[0] = w2[0] | 0x0100;
731 break;
732
733 case 34:
734 w2[0] = w2[0] | 0x010000;
735 break;
736
737 case 35:
738 w2[0] = w2[0] | 0x01000000;
739 break;
740
741 case 36:
742 w2[1] = 0x01;
743 break;
744
745 case 37:
746 w2[1] = w2[1] | 0x0100;
747 break;
748
749 case 38:
750 w2[1] = w2[1] | 0x010000;
751 break;
752
753 case 39:
754 w2[1] = w2[1] | 0x01000000;
755 break;
756
757 case 40:
758 w2[2] = 0x01;
759 break;
760
761 case 41:
762 w2[2] = w2[2] | 0x0100;
763 break;
764
765 case 42:
766 w2[2] = w2[2] | 0x010000;
767 break;
768
769 case 43:
770 w2[2] = w2[2] | 0x01000000;
771 break;
772
773 case 44:
774 w2[3] = 0x01;
775 break;
776
777 case 45:
778 w2[3] = w2[3] | 0x0100;
779 break;
780
781 case 46:
782 w2[3] = w2[3] | 0x010000;
783 break;
784
785 case 47:
786 w2[3] = w2[3] | 0x01000000;
787 break;
788
789 case 48:
790 w3[0] = 0x01;
791 break;
792
793 case 49:
794 w3[0] = w3[0] | 0x0100;
795 break;
796
797 case 50:
798 w3[0] = w3[0] | 0x010000;
799 break;
800
801 case 51:
802 w3[0] = w3[0] | 0x01000000;
803 break;
804
805 case 52:
806 w3[1] = 0x01;
807 break;
808
809 case 53:
810 w3[1] = w3[1] | 0x0100;
811 break;
812
813 case 54:
814 w3[1] = w3[1] | 0x010000;
815 break;
816
817 case 55:
818 w3[1] = w3[1] | 0x01000000;
819 break;
820
821 case 56:
822 w3[2] = 0x01;
823 break;
824
825 case 57:
826 w3[2] = w3[2] | 0x0100;
827 break;
828
829 case 58:
830 w3[2] = w3[2] | 0x010000;
831 break;
832
833 case 59:
834 w3[2] = w3[2] | 0x01000000;
835 break;
836
837 case 60:
838 w3[3] = 0x01;
839 break;
840
841 case 61:
842 w3[3] = w3[3] | 0x0100;
843 break;
844
845 case 62:
846 w3[3] = w3[3] | 0x010000;
847 break;
848
849 case 63:
850 w3[3] = w3[3] | 0x01000000;
851 break;
852 }
853 }
854
855 inline void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
856 {
857 switch (offset)
858 {
859 case 0:
860 w0[0] = 0x01;
861 break;
862
863 case 1:
864 w0[0] = w0[0] | 0x0100;
865 break;
866
867 case 2:
868 w0[0] = w0[0] | 0x010000;
869 break;
870
871 case 3:
872 w0[0] = w0[0] | 0x01000000;
873 break;
874
875 case 4:
876 w0[1] = 0x01;
877 break;
878
879 case 5:
880 w0[1] = w0[1] | 0x0100;
881 break;
882
883 case 6:
884 w0[1] = w0[1] | 0x010000;
885 break;
886
887 case 7:
888 w0[1] = w0[1] | 0x01000000;
889 break;
890
891 case 8:
892 w0[2] = 0x01;
893 break;
894
895 case 9:
896 w0[2] = w0[2] | 0x0100;
897 break;
898
899 case 10:
900 w0[2] = w0[2] | 0x010000;
901 break;
902
903 case 11:
904 w0[2] = w0[2] | 0x01000000;
905 break;
906
907 case 12:
908 w0[3] = 0x01;
909 break;
910
911 case 13:
912 w0[3] = w0[3] | 0x0100;
913 break;
914
915 case 14:
916 w0[3] = w0[3] | 0x010000;
917 break;
918
919 case 15:
920 w0[3] = w0[3] | 0x01000000;
921 break;
922
923 case 16:
924 w1[0] = 0x01;
925 break;
926
927 case 17:
928 w1[0] = w1[0] | 0x0100;
929 break;
930
931 case 18:
932 w1[0] = w1[0] | 0x010000;
933 break;
934
935 case 19:
936 w1[0] = w1[0] | 0x01000000;
937 break;
938
939 case 20:
940 w1[1] = 0x01;
941 break;
942
943 case 21:
944 w1[1] = w1[1] | 0x0100;
945 break;
946
947 case 22:
948 w1[1] = w1[1] | 0x010000;
949 break;
950
951 case 23:
952 w1[1] = w1[1] | 0x01000000;
953 break;
954
955 case 24:
956 w1[2] = 0x01;
957 break;
958
959 case 25:
960 w1[2] = w1[2] | 0x0100;
961 break;
962
963 case 26:
964 w1[2] = w1[2] | 0x010000;
965 break;
966
967 case 27:
968 w1[2] = w1[2] | 0x01000000;
969 break;
970
971 case 28:
972 w1[3] = 0x01;
973 break;
974
975 case 29:
976 w1[3] = w1[3] | 0x0100;
977 break;
978
979 case 30:
980 w1[3] = w1[3] | 0x010000;
981 break;
982
983 case 31:
984 w1[3] = w1[3] | 0x01000000;
985 break;
986
987 case 32:
988 w2[0] = 0x01;
989 break;
990
991 case 33:
992 w2[0] = w2[0] | 0x0100;
993 break;
994
995 case 34:
996 w2[0] = w2[0] | 0x010000;
997 break;
998
999 case 35:
1000 w2[0] = w2[0] | 0x01000000;
1001 break;
1002
1003 case 36:
1004 w2[1] = 0x01;
1005 break;
1006
1007 case 37:
1008 w2[1] = w2[1] | 0x0100;
1009 break;
1010
1011 case 38:
1012 w2[1] = w2[1] | 0x010000;
1013 break;
1014
1015 case 39:
1016 w2[1] = w2[1] | 0x01000000;
1017 break;
1018
1019 case 40:
1020 w2[2] = 0x01;
1021 break;
1022
1023 case 41:
1024 w2[2] = w2[2] | 0x0100;
1025 break;
1026
1027 case 42:
1028 w2[2] = w2[2] | 0x010000;
1029 break;
1030
1031 case 43:
1032 w2[2] = w2[2] | 0x01000000;
1033 break;
1034
1035 case 44:
1036 w2[3] = 0x01;
1037 break;
1038
1039 case 45:
1040 w2[3] = w2[3] | 0x0100;
1041 break;
1042
1043 case 46:
1044 w2[3] = w2[3] | 0x010000;
1045 break;
1046
1047 case 47:
1048 w2[3] = w2[3] | 0x01000000;
1049 break;
1050
1051 case 48:
1052 w3[0] = 0x01;
1053 break;
1054
1055 case 49:
1056 w3[0] = w3[0] | 0x0100;
1057 break;
1058
1059 case 50:
1060 w3[0] = w3[0] | 0x010000;
1061 break;
1062
1063 case 51:
1064 w3[0] = w3[0] | 0x01000000;
1065 break;
1066
1067 case 52:
1068 w3[1] = 0x01;
1069 break;
1070
1071 case 53:
1072 w3[1] = w3[1] | 0x0100;
1073 break;
1074
1075 case 54:
1076 w3[1] = w3[1] | 0x010000;
1077 break;
1078
1079 case 55:
1080 w3[1] = w3[1] | 0x01000000;
1081 break;
1082
1083 case 56:
1084 w3[2] = 0x01;
1085 break;
1086
1087 case 57:
1088 w3[2] = w3[2] | 0x0100;
1089 break;
1090
1091 case 58:
1092 w3[2] = w3[2] | 0x010000;
1093 break;
1094
1095 case 59:
1096 w3[2] = w3[2] | 0x01000000;
1097 break;
1098
1099 case 60:
1100 w3[3] = 0x01;
1101 break;
1102
1103 case 61:
1104 w3[3] = w3[3] | 0x0100;
1105 break;
1106
1107 case 62:
1108 w3[3] = w3[3] | 0x010000;
1109 break;
1110
1111 case 63:
1112 w3[3] = w3[3] | 0x01000000;
1113 break;
1114
1115 case 64:
1116 w4[0] = 0x01;
1117 break;
1118
1119 case 65:
1120 w4[0] = w4[0] | 0x0100;
1121 break;
1122
1123 case 66:
1124 w4[0] = w4[0] | 0x010000;
1125 break;
1126
1127 case 67:
1128 w4[0] = w4[0] | 0x01000000;
1129 break;
1130
1131 case 68:
1132 w4[1] = 0x01;
1133 break;
1134
1135 case 69:
1136 w4[1] = w4[1] | 0x0100;
1137 break;
1138
1139 case 70:
1140 w4[1] = w4[1] | 0x010000;
1141 break;
1142
1143 case 71:
1144 w4[1] = w4[1] | 0x01000000;
1145 break;
1146
1147 case 72:
1148 w4[2] = 0x01;
1149 break;
1150
1151 case 73:
1152 w4[2] = w4[2] | 0x0100;
1153 break;
1154
1155 case 74:
1156 w4[2] = w4[2] | 0x010000;
1157 break;
1158
1159 case 75:
1160 w4[2] = w4[2] | 0x01000000;
1161 break;
1162
1163 case 76:
1164 w4[3] = 0x01;
1165 break;
1166
1167 case 77:
1168 w4[3] = w4[3] | 0x0100;
1169 break;
1170
1171 case 78:
1172 w4[3] = w4[3] | 0x010000;
1173 break;
1174
1175 case 79:
1176 w4[3] = w4[3] | 0x01000000;
1177 break;
1178
1179 case 80:
1180 w5[0] = 0x01;
1181 break;
1182
1183 case 81:
1184 w5[0] = w5[0] | 0x0100;
1185 break;
1186
1187 case 82:
1188 w5[0] = w5[0] | 0x010000;
1189 break;
1190
1191 case 83:
1192 w5[0] = w5[0] | 0x01000000;
1193 break;
1194
1195 case 84:
1196 w5[1] = 0x01;
1197 break;
1198
1199 case 85:
1200 w5[1] = w5[1] | 0x0100;
1201 break;
1202
1203 case 86:
1204 w5[1] = w5[1] | 0x010000;
1205 break;
1206
1207 case 87:
1208 w5[1] = w5[1] | 0x01000000;
1209 break;
1210
1211 case 88:
1212 w5[2] = 0x01;
1213 break;
1214
1215 case 89:
1216 w5[2] = w5[2] | 0x0100;
1217 break;
1218
1219 case 90:
1220 w5[2] = w5[2] | 0x010000;
1221 break;
1222
1223 case 91:
1224 w5[2] = w5[2] | 0x01000000;
1225 break;
1226
1227 case 92:
1228 w5[3] = 0x01;
1229 break;
1230
1231 case 93:
1232 w5[3] = w5[3] | 0x0100;
1233 break;
1234
1235 case 94:
1236 w5[3] = w5[3] | 0x010000;
1237 break;
1238
1239 case 95:
1240 w5[3] = w5[3] | 0x01000000;
1241 break;
1242
1243 case 96:
1244 w6[0] = 0x01;
1245 break;
1246
1247 case 97:
1248 w6[0] = w6[0] | 0x0100;
1249 break;
1250
1251 case 98:
1252 w6[0] = w6[0] | 0x010000;
1253 break;
1254
1255 case 99:
1256 w6[0] = w6[0] | 0x01000000;
1257 break;
1258
1259 case 100:
1260 w6[1] = 0x01;
1261 break;
1262
1263 case 101:
1264 w6[1] = w6[1] | 0x0100;
1265 break;
1266
1267 case 102:
1268 w6[1] = w6[1] | 0x010000;
1269 break;
1270
1271 case 103:
1272 w6[1] = w6[1] | 0x01000000;
1273 break;
1274
1275 case 104:
1276 w6[2] = 0x01;
1277 break;
1278
1279 case 105:
1280 w6[2] = w6[2] | 0x0100;
1281 break;
1282
1283 case 106:
1284 w6[2] = w6[2] | 0x010000;
1285 break;
1286
1287 case 107:
1288 w6[2] = w6[2] | 0x01000000;
1289 break;
1290
1291 case 108:
1292 w6[3] = 0x01;
1293 break;
1294
1295 case 109:
1296 w6[3] = w6[3] | 0x0100;
1297 break;
1298
1299 case 110:
1300 w6[3] = w6[3] | 0x010000;
1301 break;
1302
1303 case 111:
1304 w6[3] = w6[3] | 0x01000000;
1305 break;
1306
1307 case 112:
1308 w7[0] = 0x01;
1309 break;
1310
1311 case 113:
1312 w7[0] = w7[0] | 0x0100;
1313 break;
1314
1315 case 114:
1316 w7[0] = w7[0] | 0x010000;
1317 break;
1318
1319 case 115:
1320 w7[0] = w7[0] | 0x01000000;
1321 break;
1322
1323 case 116:
1324 w7[1] = 0x01;
1325 break;
1326
1327 case 117:
1328 w7[1] = w7[1] | 0x0100;
1329 break;
1330
1331 case 118:
1332 w7[1] = w7[1] | 0x010000;
1333 break;
1334
1335 case 119:
1336 w7[1] = w7[1] | 0x01000000;
1337 break;
1338
1339 case 120:
1340 w7[2] = 0x01;
1341 break;
1342
1343 case 121:
1344 w7[2] = w7[2] | 0x0100;
1345 break;
1346
1347 case 122:
1348 w7[2] = w7[2] | 0x010000;
1349 break;
1350
1351 case 123:
1352 w7[2] = w7[2] | 0x01000000;
1353 break;
1354
1355 case 124:
1356 w7[3] = 0x01;
1357 break;
1358
1359 case 125:
1360 w7[3] = w7[3] | 0x0100;
1361 break;
1362
1363 case 126:
1364 w7[3] = w7[3] | 0x010000;
1365 break;
1366
1367 case 127:
1368 w7[3] = w7[3] | 0x01000000;
1369 break;
1370 }
1371 }
1372
1373 inline void append_0x02_1x4 (u32x w0[4], const u32 offset)
1374 {
1375 switch (offset)
1376 {
1377 case 0:
1378 w0[0] = 0x02;
1379 break;
1380
1381 case 1:
1382 w0[0] = w0[0] | 0x0200;
1383 break;
1384
1385 case 2:
1386 w0[0] = w0[0] | 0x020000;
1387 break;
1388
1389 case 3:
1390 w0[0] = w0[0] | 0x02000000;
1391 break;
1392
1393 case 4:
1394 w0[1] = 0x02;
1395 break;
1396
1397 case 5:
1398 w0[1] = w0[1] | 0x0200;
1399 break;
1400
1401 case 6:
1402 w0[1] = w0[1] | 0x020000;
1403 break;
1404
1405 case 7:
1406 w0[1] = w0[1] | 0x02000000;
1407 break;
1408
1409 case 8:
1410 w0[2] = 0x02;
1411 break;
1412
1413 case 9:
1414 w0[2] = w0[2] | 0x0200;
1415 break;
1416
1417 case 10:
1418 w0[2] = w0[2] | 0x020000;
1419 break;
1420
1421 case 11:
1422 w0[2] = w0[2] | 0x02000000;
1423 break;
1424
1425 case 12:
1426 w0[3] = 0x02;
1427 break;
1428
1429 case 13:
1430 w0[3] = w0[3] | 0x0200;
1431 break;
1432
1433 case 14:
1434 w0[3] = w0[3] | 0x020000;
1435 break;
1436
1437 case 15:
1438 w0[3] = w0[3] | 0x02000000;
1439 break;
1440 }
1441 }
1442
1443 inline void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
1444 {
1445 switch (offset)
1446 {
1447 case 0:
1448 w0[0] = 0x02;
1449 break;
1450
1451 case 1:
1452 w0[0] = w0[0] | 0x0200;
1453 break;
1454
1455 case 2:
1456 w0[0] = w0[0] | 0x020000;
1457 break;
1458
1459 case 3:
1460 w0[0] = w0[0] | 0x02000000;
1461 break;
1462
1463 case 4:
1464 w0[1] = 0x02;
1465 break;
1466
1467 case 5:
1468 w0[1] = w0[1] | 0x0200;
1469 break;
1470
1471 case 6:
1472 w0[1] = w0[1] | 0x020000;
1473 break;
1474
1475 case 7:
1476 w0[1] = w0[1] | 0x02000000;
1477 break;
1478
1479 case 8:
1480 w0[2] = 0x02;
1481 break;
1482
1483 case 9:
1484 w0[2] = w0[2] | 0x0200;
1485 break;
1486
1487 case 10:
1488 w0[2] = w0[2] | 0x020000;
1489 break;
1490
1491 case 11:
1492 w0[2] = w0[2] | 0x02000000;
1493 break;
1494
1495 case 12:
1496 w0[3] = 0x02;
1497 break;
1498
1499 case 13:
1500 w0[3] = w0[3] | 0x0200;
1501 break;
1502
1503 case 14:
1504 w0[3] = w0[3] | 0x020000;
1505 break;
1506
1507 case 15:
1508 w0[3] = w0[3] | 0x02000000;
1509 break;
1510
1511 case 16:
1512 w1[0] = 0x02;
1513 break;
1514
1515 case 17:
1516 w1[0] = w1[0] | 0x0200;
1517 break;
1518
1519 case 18:
1520 w1[0] = w1[0] | 0x020000;
1521 break;
1522
1523 case 19:
1524 w1[0] = w1[0] | 0x02000000;
1525 break;
1526
1527 case 20:
1528 w1[1] = 0x02;
1529 break;
1530
1531 case 21:
1532 w1[1] = w1[1] | 0x0200;
1533 break;
1534
1535 case 22:
1536 w1[1] = w1[1] | 0x020000;
1537 break;
1538
1539 case 23:
1540 w1[1] = w1[1] | 0x02000000;
1541 break;
1542
1543 case 24:
1544 w1[2] = 0x02;
1545 break;
1546
1547 case 25:
1548 w1[2] = w1[2] | 0x0200;
1549 break;
1550
1551 case 26:
1552 w1[2] = w1[2] | 0x020000;
1553 break;
1554
1555 case 27:
1556 w1[2] = w1[2] | 0x02000000;
1557 break;
1558
1559 case 28:
1560 w1[3] = 0x02;
1561 break;
1562
1563 case 29:
1564 w1[3] = w1[3] | 0x0200;
1565 break;
1566
1567 case 30:
1568 w1[3] = w1[3] | 0x020000;
1569 break;
1570
1571 case 31:
1572 w1[3] = w1[3] | 0x02000000;
1573 break;
1574 }
1575 }
1576
1577 inline void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
1578 {
1579 switch (offset)
1580 {
1581 case 0:
1582 w0[0] = 0x02;
1583 break;
1584
1585 case 1:
1586 w0[0] = w0[0] | 0x0200;
1587 break;
1588
1589 case 2:
1590 w0[0] = w0[0] | 0x020000;
1591 break;
1592
1593 case 3:
1594 w0[0] = w0[0] | 0x02000000;
1595 break;
1596
1597 case 4:
1598 w0[1] = 0x02;
1599 break;
1600
1601 case 5:
1602 w0[1] = w0[1] | 0x0200;
1603 break;
1604
1605 case 6:
1606 w0[1] = w0[1] | 0x020000;
1607 break;
1608
1609 case 7:
1610 w0[1] = w0[1] | 0x02000000;
1611 break;
1612
1613 case 8:
1614 w0[2] = 0x02;
1615 break;
1616
1617 case 9:
1618 w0[2] = w0[2] | 0x0200;
1619 break;
1620
1621 case 10:
1622 w0[2] = w0[2] | 0x020000;
1623 break;
1624
1625 case 11:
1626 w0[2] = w0[2] | 0x02000000;
1627 break;
1628
1629 case 12:
1630 w0[3] = 0x02;
1631 break;
1632
1633 case 13:
1634 w0[3] = w0[3] | 0x0200;
1635 break;
1636
1637 case 14:
1638 w0[3] = w0[3] | 0x020000;
1639 break;
1640
1641 case 15:
1642 w0[3] = w0[3] | 0x02000000;
1643 break;
1644
1645 case 16:
1646 w1[0] = 0x02;
1647 break;
1648
1649 case 17:
1650 w1[0] = w1[0] | 0x0200;
1651 break;
1652
1653 case 18:
1654 w1[0] = w1[0] | 0x020000;
1655 break;
1656
1657 case 19:
1658 w1[0] = w1[0] | 0x02000000;
1659 break;
1660
1661 case 20:
1662 w1[1] = 0x02;
1663 break;
1664
1665 case 21:
1666 w1[1] = w1[1] | 0x0200;
1667 break;
1668
1669 case 22:
1670 w1[1] = w1[1] | 0x020000;
1671 break;
1672
1673 case 23:
1674 w1[1] = w1[1] | 0x02000000;
1675 break;
1676
1677 case 24:
1678 w1[2] = 0x02;
1679 break;
1680
1681 case 25:
1682 w1[2] = w1[2] | 0x0200;
1683 break;
1684
1685 case 26:
1686 w1[2] = w1[2] | 0x020000;
1687 break;
1688
1689 case 27:
1690 w1[2] = w1[2] | 0x02000000;
1691 break;
1692
1693 case 28:
1694 w1[3] = 0x02;
1695 break;
1696
1697 case 29:
1698 w1[3] = w1[3] | 0x0200;
1699 break;
1700
1701 case 30:
1702 w1[3] = w1[3] | 0x020000;
1703 break;
1704
1705 case 31:
1706 w1[3] = w1[3] | 0x02000000;
1707 break;
1708
1709 case 32:
1710 w2[0] = 0x02;
1711 break;
1712
1713 case 33:
1714 w2[0] = w2[0] | 0x0200;
1715 break;
1716
1717 case 34:
1718 w2[0] = w2[0] | 0x020000;
1719 break;
1720
1721 case 35:
1722 w2[0] = w2[0] | 0x02000000;
1723 break;
1724
1725 case 36:
1726 w2[1] = 0x02;
1727 break;
1728
1729 case 37:
1730 w2[1] = w2[1] | 0x0200;
1731 break;
1732
1733 case 38:
1734 w2[1] = w2[1] | 0x020000;
1735 break;
1736
1737 case 39:
1738 w2[1] = w2[1] | 0x02000000;
1739 break;
1740
1741 case 40:
1742 w2[2] = 0x02;
1743 break;
1744
1745 case 41:
1746 w2[2] = w2[2] | 0x0200;
1747 break;
1748
1749 case 42:
1750 w2[2] = w2[2] | 0x020000;
1751 break;
1752
1753 case 43:
1754 w2[2] = w2[2] | 0x02000000;
1755 break;
1756
1757 case 44:
1758 w2[3] = 0x02;
1759 break;
1760
1761 case 45:
1762 w2[3] = w2[3] | 0x0200;
1763 break;
1764
1765 case 46:
1766 w2[3] = w2[3] | 0x020000;
1767 break;
1768
1769 case 47:
1770 w2[3] = w2[3] | 0x02000000;
1771 break;
1772 }
1773 }
1774
1775 inline void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
1776 {
1777 switch (offset)
1778 {
1779 case 0:
1780 w0[0] = 0x02;
1781 break;
1782
1783 case 1:
1784 w0[0] = w0[0] | 0x0200;
1785 break;
1786
1787 case 2:
1788 w0[0] = w0[0] | 0x020000;
1789 break;
1790
1791 case 3:
1792 w0[0] = w0[0] | 0x02000000;
1793 break;
1794
1795 case 4:
1796 w0[1] = 0x02;
1797 break;
1798
1799 case 5:
1800 w0[1] = w0[1] | 0x0200;
1801 break;
1802
1803 case 6:
1804 w0[1] = w0[1] | 0x020000;
1805 break;
1806
1807 case 7:
1808 w0[1] = w0[1] | 0x02000000;
1809 break;
1810
1811 case 8:
1812 w0[2] = 0x02;
1813 break;
1814
1815 case 9:
1816 w0[2] = w0[2] | 0x0200;
1817 break;
1818
1819 case 10:
1820 w0[2] = w0[2] | 0x020000;
1821 break;
1822
1823 case 11:
1824 w0[2] = w0[2] | 0x02000000;
1825 break;
1826
1827 case 12:
1828 w0[3] = 0x02;
1829 break;
1830
1831 case 13:
1832 w0[3] = w0[3] | 0x0200;
1833 break;
1834
1835 case 14:
1836 w0[3] = w0[3] | 0x020000;
1837 break;
1838
1839 case 15:
1840 w0[3] = w0[3] | 0x02000000;
1841 break;
1842
1843 case 16:
1844 w1[0] = 0x02;
1845 break;
1846
1847 case 17:
1848 w1[0] = w1[0] | 0x0200;
1849 break;
1850
1851 case 18:
1852 w1[0] = w1[0] | 0x020000;
1853 break;
1854
1855 case 19:
1856 w1[0] = w1[0] | 0x02000000;
1857 break;
1858
1859 case 20:
1860 w1[1] = 0x02;
1861 break;
1862
1863 case 21:
1864 w1[1] = w1[1] | 0x0200;
1865 break;
1866
1867 case 22:
1868 w1[1] = w1[1] | 0x020000;
1869 break;
1870
1871 case 23:
1872 w1[1] = w1[1] | 0x02000000;
1873 break;
1874
1875 case 24:
1876 w1[2] = 0x02;
1877 break;
1878
1879 case 25:
1880 w1[2] = w1[2] | 0x0200;
1881 break;
1882
1883 case 26:
1884 w1[2] = w1[2] | 0x020000;
1885 break;
1886
1887 case 27:
1888 w1[2] = w1[2] | 0x02000000;
1889 break;
1890
1891 case 28:
1892 w1[3] = 0x02;
1893 break;
1894
1895 case 29:
1896 w1[3] = w1[3] | 0x0200;
1897 break;
1898
1899 case 30:
1900 w1[3] = w1[3] | 0x020000;
1901 break;
1902
1903 case 31:
1904 w1[3] = w1[3] | 0x02000000;
1905 break;
1906
1907 case 32:
1908 w2[0] = 0x02;
1909 break;
1910
1911 case 33:
1912 w2[0] = w2[0] | 0x0200;
1913 break;
1914
1915 case 34:
1916 w2[0] = w2[0] | 0x020000;
1917 break;
1918
1919 case 35:
1920 w2[0] = w2[0] | 0x02000000;
1921 break;
1922
1923 case 36:
1924 w2[1] = 0x02;
1925 break;
1926
1927 case 37:
1928 w2[1] = w2[1] | 0x0200;
1929 break;
1930
1931 case 38:
1932 w2[1] = w2[1] | 0x020000;
1933 break;
1934
1935 case 39:
1936 w2[1] = w2[1] | 0x02000000;
1937 break;
1938
1939 case 40:
1940 w2[2] = 0x02;
1941 break;
1942
1943 case 41:
1944 w2[2] = w2[2] | 0x0200;
1945 break;
1946
1947 case 42:
1948 w2[2] = w2[2] | 0x020000;
1949 break;
1950
1951 case 43:
1952 w2[2] = w2[2] | 0x02000000;
1953 break;
1954
1955 case 44:
1956 w2[3] = 0x02;
1957 break;
1958
1959 case 45:
1960 w2[3] = w2[3] | 0x0200;
1961 break;
1962
1963 case 46:
1964 w2[3] = w2[3] | 0x020000;
1965 break;
1966
1967 case 47:
1968 w2[3] = w2[3] | 0x02000000;
1969 break;
1970
1971 case 48:
1972 w3[0] = 0x02;
1973 break;
1974
1975 case 49:
1976 w3[0] = w3[0] | 0x0200;
1977 break;
1978
1979 case 50:
1980 w3[0] = w3[0] | 0x020000;
1981 break;
1982
1983 case 51:
1984 w3[0] = w3[0] | 0x02000000;
1985 break;
1986
1987 case 52:
1988 w3[1] = 0x02;
1989 break;
1990
1991 case 53:
1992 w3[1] = w3[1] | 0x0200;
1993 break;
1994
1995 case 54:
1996 w3[1] = w3[1] | 0x020000;
1997 break;
1998
1999 case 55:
2000 w3[1] = w3[1] | 0x02000000;
2001 break;
2002
2003 case 56:
2004 w3[2] = 0x02;
2005 break;
2006
2007 case 57:
2008 w3[2] = w3[2] | 0x0200;
2009 break;
2010
2011 case 58:
2012 w3[2] = w3[2] | 0x020000;
2013 break;
2014
2015 case 59:
2016 w3[2] = w3[2] | 0x02000000;
2017 break;
2018
2019 case 60:
2020 w3[3] = 0x02;
2021 break;
2022
2023 case 61:
2024 w3[3] = w3[3] | 0x0200;
2025 break;
2026
2027 case 62:
2028 w3[3] = w3[3] | 0x020000;
2029 break;
2030
2031 case 63:
2032 w3[3] = w3[3] | 0x02000000;
2033 break;
2034 }
2035 }
2036
2037 inline void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
2038 {
2039 switch (offset)
2040 {
2041 case 0:
2042 w0[0] = 0x02;
2043 break;
2044
2045 case 1:
2046 w0[0] = w0[0] | 0x0200;
2047 break;
2048
2049 case 2:
2050 w0[0] = w0[0] | 0x020000;
2051 break;
2052
2053 case 3:
2054 w0[0] = w0[0] | 0x02000000;
2055 break;
2056
2057 case 4:
2058 w0[1] = 0x02;
2059 break;
2060
2061 case 5:
2062 w0[1] = w0[1] | 0x0200;
2063 break;
2064
2065 case 6:
2066 w0[1] = w0[1] | 0x020000;
2067 break;
2068
2069 case 7:
2070 w0[1] = w0[1] | 0x02000000;
2071 break;
2072
2073 case 8:
2074 w0[2] = 0x02;
2075 break;
2076
2077 case 9:
2078 w0[2] = w0[2] | 0x0200;
2079 break;
2080
2081 case 10:
2082 w0[2] = w0[2] | 0x020000;
2083 break;
2084
2085 case 11:
2086 w0[2] = w0[2] | 0x02000000;
2087 break;
2088
2089 case 12:
2090 w0[3] = 0x02;
2091 break;
2092
2093 case 13:
2094 w0[3] = w0[3] | 0x0200;
2095 break;
2096
2097 case 14:
2098 w0[3] = w0[3] | 0x020000;
2099 break;
2100
2101 case 15:
2102 w0[3] = w0[3] | 0x02000000;
2103 break;
2104
2105 case 16:
2106 w1[0] = 0x02;
2107 break;
2108
2109 case 17:
2110 w1[0] = w1[0] | 0x0200;
2111 break;
2112
2113 case 18:
2114 w1[0] = w1[0] | 0x020000;
2115 break;
2116
2117 case 19:
2118 w1[0] = w1[0] | 0x02000000;
2119 break;
2120
2121 case 20:
2122 w1[1] = 0x02;
2123 break;
2124
2125 case 21:
2126 w1[1] = w1[1] | 0x0200;
2127 break;
2128
2129 case 22:
2130 w1[1] = w1[1] | 0x020000;
2131 break;
2132
2133 case 23:
2134 w1[1] = w1[1] | 0x02000000;
2135 break;
2136
2137 case 24:
2138 w1[2] = 0x02;
2139 break;
2140
2141 case 25:
2142 w1[2] = w1[2] | 0x0200;
2143 break;
2144
2145 case 26:
2146 w1[2] = w1[2] | 0x020000;
2147 break;
2148
2149 case 27:
2150 w1[2] = w1[2] | 0x02000000;
2151 break;
2152
2153 case 28:
2154 w1[3] = 0x02;
2155 break;
2156
2157 case 29:
2158 w1[3] = w1[3] | 0x0200;
2159 break;
2160
2161 case 30:
2162 w1[3] = w1[3] | 0x020000;
2163 break;
2164
2165 case 31:
2166 w1[3] = w1[3] | 0x02000000;
2167 break;
2168
2169 case 32:
2170 w2[0] = 0x02;
2171 break;
2172
2173 case 33:
2174 w2[0] = w2[0] | 0x0200;
2175 break;
2176
2177 case 34:
2178 w2[0] = w2[0] | 0x020000;
2179 break;
2180
2181 case 35:
2182 w2[0] = w2[0] | 0x02000000;
2183 break;
2184
2185 case 36:
2186 w2[1] = 0x02;
2187 break;
2188
2189 case 37:
2190 w2[1] = w2[1] | 0x0200;
2191 break;
2192
2193 case 38:
2194 w2[1] = w2[1] | 0x020000;
2195 break;
2196
2197 case 39:
2198 w2[1] = w2[1] | 0x02000000;
2199 break;
2200
2201 case 40:
2202 w2[2] = 0x02;
2203 break;
2204
2205 case 41:
2206 w2[2] = w2[2] | 0x0200;
2207 break;
2208
2209 case 42:
2210 w2[2] = w2[2] | 0x020000;
2211 break;
2212
2213 case 43:
2214 w2[2] = w2[2] | 0x02000000;
2215 break;
2216
2217 case 44:
2218 w2[3] = 0x02;
2219 break;
2220
2221 case 45:
2222 w2[3] = w2[3] | 0x0200;
2223 break;
2224
2225 case 46:
2226 w2[3] = w2[3] | 0x020000;
2227 break;
2228
2229 case 47:
2230 w2[3] = w2[3] | 0x02000000;
2231 break;
2232
2233 case 48:
2234 w3[0] = 0x02;
2235 break;
2236
2237 case 49:
2238 w3[0] = w3[0] | 0x0200;
2239 break;
2240
2241 case 50:
2242 w3[0] = w3[0] | 0x020000;
2243 break;
2244
2245 case 51:
2246 w3[0] = w3[0] | 0x02000000;
2247 break;
2248
2249 case 52:
2250 w3[1] = 0x02;
2251 break;
2252
2253 case 53:
2254 w3[1] = w3[1] | 0x0200;
2255 break;
2256
2257 case 54:
2258 w3[1] = w3[1] | 0x020000;
2259 break;
2260
2261 case 55:
2262 w3[1] = w3[1] | 0x02000000;
2263 break;
2264
2265 case 56:
2266 w3[2] = 0x02;
2267 break;
2268
2269 case 57:
2270 w3[2] = w3[2] | 0x0200;
2271 break;
2272
2273 case 58:
2274 w3[2] = w3[2] | 0x020000;
2275 break;
2276
2277 case 59:
2278 w3[2] = w3[2] | 0x02000000;
2279 break;
2280
2281 case 60:
2282 w3[3] = 0x02;
2283 break;
2284
2285 case 61:
2286 w3[3] = w3[3] | 0x0200;
2287 break;
2288
2289 case 62:
2290 w3[3] = w3[3] | 0x020000;
2291 break;
2292
2293 case 63:
2294 w3[3] = w3[3] | 0x02000000;
2295 break;
2296
2297 case 64:
2298 w4[0] = 0x02;
2299 break;
2300
2301 case 65:
2302 w4[0] = w4[0] | 0x0200;
2303 break;
2304
2305 case 66:
2306 w4[0] = w4[0] | 0x020000;
2307 break;
2308
2309 case 67:
2310 w4[0] = w4[0] | 0x02000000;
2311 break;
2312
2313 case 68:
2314 w4[1] = 0x02;
2315 break;
2316
2317 case 69:
2318 w4[1] = w4[1] | 0x0200;
2319 break;
2320
2321 case 70:
2322 w4[1] = w4[1] | 0x020000;
2323 break;
2324
2325 case 71:
2326 w4[1] = w4[1] | 0x02000000;
2327 break;
2328
2329 case 72:
2330 w4[2] = 0x02;
2331 break;
2332
2333 case 73:
2334 w4[2] = w4[2] | 0x0200;
2335 break;
2336
2337 case 74:
2338 w4[2] = w4[2] | 0x020000;
2339 break;
2340
2341 case 75:
2342 w4[2] = w4[2] | 0x02000000;
2343 break;
2344
2345 case 76:
2346 w4[3] = 0x02;
2347 break;
2348
2349 case 77:
2350 w4[3] = w4[3] | 0x0200;
2351 break;
2352
2353 case 78:
2354 w4[3] = w4[3] | 0x020000;
2355 break;
2356
2357 case 79:
2358 w4[3] = w4[3] | 0x02000000;
2359 break;
2360
2361 case 80:
2362 w5[0] = 0x02;
2363 break;
2364
2365 case 81:
2366 w5[0] = w5[0] | 0x0200;
2367 break;
2368
2369 case 82:
2370 w5[0] = w5[0] | 0x020000;
2371 break;
2372
2373 case 83:
2374 w5[0] = w5[0] | 0x02000000;
2375 break;
2376
2377 case 84:
2378 w5[1] = 0x02;
2379 break;
2380
2381 case 85:
2382 w5[1] = w5[1] | 0x0200;
2383 break;
2384
2385 case 86:
2386 w5[1] = w5[1] | 0x020000;
2387 break;
2388
2389 case 87:
2390 w5[1] = w5[1] | 0x02000000;
2391 break;
2392
2393 case 88:
2394 w5[2] = 0x02;
2395 break;
2396
2397 case 89:
2398 w5[2] = w5[2] | 0x0200;
2399 break;
2400
2401 case 90:
2402 w5[2] = w5[2] | 0x020000;
2403 break;
2404
2405 case 91:
2406 w5[2] = w5[2] | 0x02000000;
2407 break;
2408
2409 case 92:
2410 w5[3] = 0x02;
2411 break;
2412
2413 case 93:
2414 w5[3] = w5[3] | 0x0200;
2415 break;
2416
2417 case 94:
2418 w5[3] = w5[3] | 0x020000;
2419 break;
2420
2421 case 95:
2422 w5[3] = w5[3] | 0x02000000;
2423 break;
2424
2425 case 96:
2426 w6[0] = 0x02;
2427 break;
2428
2429 case 97:
2430 w6[0] = w6[0] | 0x0200;
2431 break;
2432
2433 case 98:
2434 w6[0] = w6[0] | 0x020000;
2435 break;
2436
2437 case 99:
2438 w6[0] = w6[0] | 0x02000000;
2439 break;
2440
2441 case 100:
2442 w6[1] = 0x02;
2443 break;
2444
2445 case 101:
2446 w6[1] = w6[1] | 0x0200;
2447 break;
2448
2449 case 102:
2450 w6[1] = w6[1] | 0x020000;
2451 break;
2452
2453 case 103:
2454 w6[1] = w6[1] | 0x02000000;
2455 break;
2456
2457 case 104:
2458 w6[2] = 0x02;
2459 break;
2460
2461 case 105:
2462 w6[2] = w6[2] | 0x0200;
2463 break;
2464
2465 case 106:
2466 w6[2] = w6[2] | 0x020000;
2467 break;
2468
2469 case 107:
2470 w6[2] = w6[2] | 0x02000000;
2471 break;
2472
2473 case 108:
2474 w6[3] = 0x02;
2475 break;
2476
2477 case 109:
2478 w6[3] = w6[3] | 0x0200;
2479 break;
2480
2481 case 110:
2482 w6[3] = w6[3] | 0x020000;
2483 break;
2484
2485 case 111:
2486 w6[3] = w6[3] | 0x02000000;
2487 break;
2488
2489 case 112:
2490 w7[0] = 0x02;
2491 break;
2492
2493 case 113:
2494 w7[0] = w7[0] | 0x0200;
2495 break;
2496
2497 case 114:
2498 w7[0] = w7[0] | 0x020000;
2499 break;
2500
2501 case 115:
2502 w7[0] = w7[0] | 0x02000000;
2503 break;
2504
2505 case 116:
2506 w7[1] = 0x02;
2507 break;
2508
2509 case 117:
2510 w7[1] = w7[1] | 0x0200;
2511 break;
2512
2513 case 118:
2514 w7[1] = w7[1] | 0x020000;
2515 break;
2516
2517 case 119:
2518 w7[1] = w7[1] | 0x02000000;
2519 break;
2520
2521 case 120:
2522 w7[2] = 0x02;
2523 break;
2524
2525 case 121:
2526 w7[2] = w7[2] | 0x0200;
2527 break;
2528
2529 case 122:
2530 w7[2] = w7[2] | 0x020000;
2531 break;
2532
2533 case 123:
2534 w7[2] = w7[2] | 0x02000000;
2535 break;
2536
2537 case 124:
2538 w7[3] = 0x02;
2539 break;
2540
2541 case 125:
2542 w7[3] = w7[3] | 0x0200;
2543 break;
2544
2545 case 126:
2546 w7[3] = w7[3] | 0x020000;
2547 break;
2548
2549 case 127:
2550 w7[3] = w7[3] | 0x02000000;
2551 break;
2552 }
2553 }
2554
2555 inline void append_0x80_1x4 (u32x w0[4], const u32 offset)
2556 {
2557 switch (offset)
2558 {
2559 case 0:
2560 w0[0] = 0x80;
2561 break;
2562
2563 case 1:
2564 w0[0] = w0[0] | 0x8000;
2565 break;
2566
2567 case 2:
2568 w0[0] = w0[0] | 0x800000;
2569 break;
2570
2571 case 3:
2572 w0[0] = w0[0] | 0x80000000;
2573 break;
2574
2575 case 4:
2576 w0[1] = 0x80;
2577 break;
2578
2579 case 5:
2580 w0[1] = w0[1] | 0x8000;
2581 break;
2582
2583 case 6:
2584 w0[1] = w0[1] | 0x800000;
2585 break;
2586
2587 case 7:
2588 w0[1] = w0[1] | 0x80000000;
2589 break;
2590
2591 case 8:
2592 w0[2] = 0x80;
2593 break;
2594
2595 case 9:
2596 w0[2] = w0[2] | 0x8000;
2597 break;
2598
2599 case 10:
2600 w0[2] = w0[2] | 0x800000;
2601 break;
2602
2603 case 11:
2604 w0[2] = w0[2] | 0x80000000;
2605 break;
2606
2607 case 12:
2608 w0[3] = 0x80;
2609 break;
2610
2611 case 13:
2612 w0[3] = w0[3] | 0x8000;
2613 break;
2614
2615 case 14:
2616 w0[3] = w0[3] | 0x800000;
2617 break;
2618
2619 case 15:
2620 w0[3] = w0[3] | 0x80000000;
2621 break;
2622 }
2623 }
2624
2625 inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset)
2626 {
2627 switch (offset)
2628 {
2629 case 0:
2630 w0[0] = 0x80;
2631 break;
2632
2633 case 1:
2634 w0[0] = w0[0] | 0x8000;
2635 break;
2636
2637 case 2:
2638 w0[0] = w0[0] | 0x800000;
2639 break;
2640
2641 case 3:
2642 w0[0] = w0[0] | 0x80000000;
2643 break;
2644
2645 case 4:
2646 w0[1] = 0x80;
2647 break;
2648
2649 case 5:
2650 w0[1] = w0[1] | 0x8000;
2651 break;
2652
2653 case 6:
2654 w0[1] = w0[1] | 0x800000;
2655 break;
2656
2657 case 7:
2658 w0[1] = w0[1] | 0x80000000;
2659 break;
2660
2661 case 8:
2662 w0[2] = 0x80;
2663 break;
2664
2665 case 9:
2666 w0[2] = w0[2] | 0x8000;
2667 break;
2668
2669 case 10:
2670 w0[2] = w0[2] | 0x800000;
2671 break;
2672
2673 case 11:
2674 w0[2] = w0[2] | 0x80000000;
2675 break;
2676
2677 case 12:
2678 w0[3] = 0x80;
2679 break;
2680
2681 case 13:
2682 w0[3] = w0[3] | 0x8000;
2683 break;
2684
2685 case 14:
2686 w0[3] = w0[3] | 0x800000;
2687 break;
2688
2689 case 15:
2690 w0[3] = w0[3] | 0x80000000;
2691 break;
2692
2693 case 16:
2694 w1[0] = 0x80;
2695 break;
2696
2697 case 17:
2698 w1[0] = w1[0] | 0x8000;
2699 break;
2700
2701 case 18:
2702 w1[0] = w1[0] | 0x800000;
2703 break;
2704
2705 case 19:
2706 w1[0] = w1[0] | 0x80000000;
2707 break;
2708
2709 case 20:
2710 w1[1] = 0x80;
2711 break;
2712
2713 case 21:
2714 w1[1] = w1[1] | 0x8000;
2715 break;
2716
2717 case 22:
2718 w1[1] = w1[1] | 0x800000;
2719 break;
2720
2721 case 23:
2722 w1[1] = w1[1] | 0x80000000;
2723 break;
2724
2725 case 24:
2726 w1[2] = 0x80;
2727 break;
2728
2729 case 25:
2730 w1[2] = w1[2] | 0x8000;
2731 break;
2732
2733 case 26:
2734 w1[2] = w1[2] | 0x800000;
2735 break;
2736
2737 case 27:
2738 w1[2] = w1[2] | 0x80000000;
2739 break;
2740
2741 case 28:
2742 w1[3] = 0x80;
2743 break;
2744
2745 case 29:
2746 w1[3] = w1[3] | 0x8000;
2747 break;
2748
2749 case 30:
2750 w1[3] = w1[3] | 0x800000;
2751 break;
2752
2753 case 31:
2754 w1[3] = w1[3] | 0x80000000;
2755 break;
2756 }
2757 }
2758
2759 inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset)
2760 {
2761 switch (offset)
2762 {
2763 case 0:
2764 w0[0] = 0x80;
2765 break;
2766
2767 case 1:
2768 w0[0] = w0[0] | 0x8000;
2769 break;
2770
2771 case 2:
2772 w0[0] = w0[0] | 0x800000;
2773 break;
2774
2775 case 3:
2776 w0[0] = w0[0] | 0x80000000;
2777 break;
2778
2779 case 4:
2780 w0[1] = 0x80;
2781 break;
2782
2783 case 5:
2784 w0[1] = w0[1] | 0x8000;
2785 break;
2786
2787 case 6:
2788 w0[1] = w0[1] | 0x800000;
2789 break;
2790
2791 case 7:
2792 w0[1] = w0[1] | 0x80000000;
2793 break;
2794
2795 case 8:
2796 w0[2] = 0x80;
2797 break;
2798
2799 case 9:
2800 w0[2] = w0[2] | 0x8000;
2801 break;
2802
2803 case 10:
2804 w0[2] = w0[2] | 0x800000;
2805 break;
2806
2807 case 11:
2808 w0[2] = w0[2] | 0x80000000;
2809 break;
2810
2811 case 12:
2812 w0[3] = 0x80;
2813 break;
2814
2815 case 13:
2816 w0[3] = w0[3] | 0x8000;
2817 break;
2818
2819 case 14:
2820 w0[3] = w0[3] | 0x800000;
2821 break;
2822
2823 case 15:
2824 w0[3] = w0[3] | 0x80000000;
2825 break;
2826
2827 case 16:
2828 w1[0] = 0x80;
2829 break;
2830
2831 case 17:
2832 w1[0] = w1[0] | 0x8000;
2833 break;
2834
2835 case 18:
2836 w1[0] = w1[0] | 0x800000;
2837 break;
2838
2839 case 19:
2840 w1[0] = w1[0] | 0x80000000;
2841 break;
2842
2843 case 20:
2844 w1[1] = 0x80;
2845 break;
2846
2847 case 21:
2848 w1[1] = w1[1] | 0x8000;
2849 break;
2850
2851 case 22:
2852 w1[1] = w1[1] | 0x800000;
2853 break;
2854
2855 case 23:
2856 w1[1] = w1[1] | 0x80000000;
2857 break;
2858
2859 case 24:
2860 w1[2] = 0x80;
2861 break;
2862
2863 case 25:
2864 w1[2] = w1[2] | 0x8000;
2865 break;
2866
2867 case 26:
2868 w1[2] = w1[2] | 0x800000;
2869 break;
2870
2871 case 27:
2872 w1[2] = w1[2] | 0x80000000;
2873 break;
2874
2875 case 28:
2876 w1[3] = 0x80;
2877 break;
2878
2879 case 29:
2880 w1[3] = w1[3] | 0x8000;
2881 break;
2882
2883 case 30:
2884 w1[3] = w1[3] | 0x800000;
2885 break;
2886
2887 case 31:
2888 w1[3] = w1[3] | 0x80000000;
2889 break;
2890
2891 case 32:
2892 w2[0] = 0x80;
2893 break;
2894
2895 case 33:
2896 w2[0] = w2[0] | 0x8000;
2897 break;
2898
2899 case 34:
2900 w2[0] = w2[0] | 0x800000;
2901 break;
2902
2903 case 35:
2904 w2[0] = w2[0] | 0x80000000;
2905 break;
2906
2907 case 36:
2908 w2[1] = 0x80;
2909 break;
2910
2911 case 37:
2912 w2[1] = w2[1] | 0x8000;
2913 break;
2914
2915 case 38:
2916 w2[1] = w2[1] | 0x800000;
2917 break;
2918
2919 case 39:
2920 w2[1] = w2[1] | 0x80000000;
2921 break;
2922
2923 case 40:
2924 w2[2] = 0x80;
2925 break;
2926
2927 case 41:
2928 w2[2] = w2[2] | 0x8000;
2929 break;
2930
2931 case 42:
2932 w2[2] = w2[2] | 0x800000;
2933 break;
2934
2935 case 43:
2936 w2[2] = w2[2] | 0x80000000;
2937 break;
2938
2939 case 44:
2940 w2[3] = 0x80;
2941 break;
2942
2943 case 45:
2944 w2[3] = w2[3] | 0x8000;
2945 break;
2946
2947 case 46:
2948 w2[3] = w2[3] | 0x800000;
2949 break;
2950
2951 case 47:
2952 w2[3] = w2[3] | 0x80000000;
2953 break;
2954 }
2955 }
2956
2957 inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
2958 {
2959 switch (offset)
2960 {
2961 case 0:
2962 w0[0] = 0x80;
2963 break;
2964
2965 case 1:
2966 w0[0] = w0[0] | 0x8000;
2967 break;
2968
2969 case 2:
2970 w0[0] = w0[0] | 0x800000;
2971 break;
2972
2973 case 3:
2974 w0[0] = w0[0] | 0x80000000;
2975 break;
2976
2977 case 4:
2978 w0[1] = 0x80;
2979 break;
2980
2981 case 5:
2982 w0[1] = w0[1] | 0x8000;
2983 break;
2984
2985 case 6:
2986 w0[1] = w0[1] | 0x800000;
2987 break;
2988
2989 case 7:
2990 w0[1] = w0[1] | 0x80000000;
2991 break;
2992
2993 case 8:
2994 w0[2] = 0x80;
2995 break;
2996
2997 case 9:
2998 w0[2] = w0[2] | 0x8000;
2999 break;
3000
3001 case 10:
3002 w0[2] = w0[2] | 0x800000;
3003 break;
3004
3005 case 11:
3006 w0[2] = w0[2] | 0x80000000;
3007 break;
3008
3009 case 12:
3010 w0[3] = 0x80;
3011 break;
3012
3013 case 13:
3014 w0[3] = w0[3] | 0x8000;
3015 break;
3016
3017 case 14:
3018 w0[3] = w0[3] | 0x800000;
3019 break;
3020
3021 case 15:
3022 w0[3] = w0[3] | 0x80000000;
3023 break;
3024
3025 case 16:
3026 w1[0] = 0x80;
3027 break;
3028
3029 case 17:
3030 w1[0] = w1[0] | 0x8000;
3031 break;
3032
3033 case 18:
3034 w1[0] = w1[0] | 0x800000;
3035 break;
3036
3037 case 19:
3038 w1[0] = w1[0] | 0x80000000;
3039 break;
3040
3041 case 20:
3042 w1[1] = 0x80;
3043 break;
3044
3045 case 21:
3046 w1[1] = w1[1] | 0x8000;
3047 break;
3048
3049 case 22:
3050 w1[1] = w1[1] | 0x800000;
3051 break;
3052
3053 case 23:
3054 w1[1] = w1[1] | 0x80000000;
3055 break;
3056
3057 case 24:
3058 w1[2] = 0x80;
3059 break;
3060
3061 case 25:
3062 w1[2] = w1[2] | 0x8000;
3063 break;
3064
3065 case 26:
3066 w1[2] = w1[2] | 0x800000;
3067 break;
3068
3069 case 27:
3070 w1[2] = w1[2] | 0x80000000;
3071 break;
3072
3073 case 28:
3074 w1[3] = 0x80;
3075 break;
3076
3077 case 29:
3078 w1[3] = w1[3] | 0x8000;
3079 break;
3080
3081 case 30:
3082 w1[3] = w1[3] | 0x800000;
3083 break;
3084
3085 case 31:
3086 w1[3] = w1[3] | 0x80000000;
3087 break;
3088
3089 case 32:
3090 w2[0] = 0x80;
3091 break;
3092
3093 case 33:
3094 w2[0] = w2[0] | 0x8000;
3095 break;
3096
3097 case 34:
3098 w2[0] = w2[0] | 0x800000;
3099 break;
3100
3101 case 35:
3102 w2[0] = w2[0] | 0x80000000;
3103 break;
3104
3105 case 36:
3106 w2[1] = 0x80;
3107 break;
3108
3109 case 37:
3110 w2[1] = w2[1] | 0x8000;
3111 break;
3112
3113 case 38:
3114 w2[1] = w2[1] | 0x800000;
3115 break;
3116
3117 case 39:
3118 w2[1] = w2[1] | 0x80000000;
3119 break;
3120
3121 case 40:
3122 w2[2] = 0x80;
3123 break;
3124
3125 case 41:
3126 w2[2] = w2[2] | 0x8000;
3127 break;
3128
3129 case 42:
3130 w2[2] = w2[2] | 0x800000;
3131 break;
3132
3133 case 43:
3134 w2[2] = w2[2] | 0x80000000;
3135 break;
3136
3137 case 44:
3138 w2[3] = 0x80;
3139 break;
3140
3141 case 45:
3142 w2[3] = w2[3] | 0x8000;
3143 break;
3144
3145 case 46:
3146 w2[3] = w2[3] | 0x800000;
3147 break;
3148
3149 case 47:
3150 w2[3] = w2[3] | 0x80000000;
3151 break;
3152
3153 case 48:
3154 w3[0] = 0x80;
3155 break;
3156
3157 case 49:
3158 w3[0] = w3[0] | 0x8000;
3159 break;
3160
3161 case 50:
3162 w3[0] = w3[0] | 0x800000;
3163 break;
3164
3165 case 51:
3166 w3[0] = w3[0] | 0x80000000;
3167 break;
3168
3169 case 52:
3170 w3[1] = 0x80;
3171 break;
3172
3173 case 53:
3174 w3[1] = w3[1] | 0x8000;
3175 break;
3176
3177 case 54:
3178 w3[1] = w3[1] | 0x800000;
3179 break;
3180
3181 case 55:
3182 w3[1] = w3[1] | 0x80000000;
3183 break;
3184
3185 case 56:
3186 w3[2] = 0x80;
3187 break;
3188
3189 case 57:
3190 w3[2] = w3[2] | 0x8000;
3191 break;
3192
3193 case 58:
3194 w3[2] = w3[2] | 0x800000;
3195 break;
3196
3197 case 59:
3198 w3[2] = w3[2] | 0x80000000;
3199 break;
3200
3201 case 60:
3202 w3[3] = 0x80;
3203 break;
3204
3205 case 61:
3206 w3[3] = w3[3] | 0x8000;
3207 break;
3208
3209 case 62:
3210 w3[3] = w3[3] | 0x800000;
3211 break;
3212
3213 case 63:
3214 w3[3] = w3[3] | 0x80000000;
3215 break;
3216 }
3217 }
3218
3219 inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
3220 {
3221 switch (offset)
3222 {
3223 case 0:
3224 w0[0] = 0x80;
3225 break;
3226
3227 case 1:
3228 w0[0] = w0[0] | 0x8000;
3229 break;
3230
3231 case 2:
3232 w0[0] = w0[0] | 0x800000;
3233 break;
3234
3235 case 3:
3236 w0[0] = w0[0] | 0x80000000;
3237 break;
3238
3239 case 4:
3240 w0[1] = 0x80;
3241 break;
3242
3243 case 5:
3244 w0[1] = w0[1] | 0x8000;
3245 break;
3246
3247 case 6:
3248 w0[1] = w0[1] | 0x800000;
3249 break;
3250
3251 case 7:
3252 w0[1] = w0[1] | 0x80000000;
3253 break;
3254
3255 case 8:
3256 w0[2] = 0x80;
3257 break;
3258
3259 case 9:
3260 w0[2] = w0[2] | 0x8000;
3261 break;
3262
3263 case 10:
3264 w0[2] = w0[2] | 0x800000;
3265 break;
3266
3267 case 11:
3268 w0[2] = w0[2] | 0x80000000;
3269 break;
3270
3271 case 12:
3272 w0[3] = 0x80;
3273 break;
3274
3275 case 13:
3276 w0[3] = w0[3] | 0x8000;
3277 break;
3278
3279 case 14:
3280 w0[3] = w0[3] | 0x800000;
3281 break;
3282
3283 case 15:
3284 w0[3] = w0[3] | 0x80000000;
3285 break;
3286
3287 case 16:
3288 w1[0] = 0x80;
3289 break;
3290
3291 case 17:
3292 w1[0] = w1[0] | 0x8000;
3293 break;
3294
3295 case 18:
3296 w1[0] = w1[0] | 0x800000;
3297 break;
3298
3299 case 19:
3300 w1[0] = w1[0] | 0x80000000;
3301 break;
3302
3303 case 20:
3304 w1[1] = 0x80;
3305 break;
3306
3307 case 21:
3308 w1[1] = w1[1] | 0x8000;
3309 break;
3310
3311 case 22:
3312 w1[1] = w1[1] | 0x800000;
3313 break;
3314
3315 case 23:
3316 w1[1] = w1[1] | 0x80000000;
3317 break;
3318
3319 case 24:
3320 w1[2] = 0x80;
3321 break;
3322
3323 case 25:
3324 w1[2] = w1[2] | 0x8000;
3325 break;
3326
3327 case 26:
3328 w1[2] = w1[2] | 0x800000;
3329 break;
3330
3331 case 27:
3332 w1[2] = w1[2] | 0x80000000;
3333 break;
3334
3335 case 28:
3336 w1[3] = 0x80;
3337 break;
3338
3339 case 29:
3340 w1[3] = w1[3] | 0x8000;
3341 break;
3342
3343 case 30:
3344 w1[3] = w1[3] | 0x800000;
3345 break;
3346
3347 case 31:
3348 w1[3] = w1[3] | 0x80000000;
3349 break;
3350
3351 case 32:
3352 w2[0] = 0x80;
3353 break;
3354
3355 case 33:
3356 w2[0] = w2[0] | 0x8000;
3357 break;
3358
3359 case 34:
3360 w2[0] = w2[0] | 0x800000;
3361 break;
3362
3363 case 35:
3364 w2[0] = w2[0] | 0x80000000;
3365 break;
3366
3367 case 36:
3368 w2[1] = 0x80;
3369 break;
3370
3371 case 37:
3372 w2[1] = w2[1] | 0x8000;
3373 break;
3374
3375 case 38:
3376 w2[1] = w2[1] | 0x800000;
3377 break;
3378
3379 case 39:
3380 w2[1] = w2[1] | 0x80000000;
3381 break;
3382
3383 case 40:
3384 w2[2] = 0x80;
3385 break;
3386
3387 case 41:
3388 w2[2] = w2[2] | 0x8000;
3389 break;
3390
3391 case 42:
3392 w2[2] = w2[2] | 0x800000;
3393 break;
3394
3395 case 43:
3396 w2[2] = w2[2] | 0x80000000;
3397 break;
3398
3399 case 44:
3400 w2[3] = 0x80;
3401 break;
3402
3403 case 45:
3404 w2[3] = w2[3] | 0x8000;
3405 break;
3406
3407 case 46:
3408 w2[3] = w2[3] | 0x800000;
3409 break;
3410
3411 case 47:
3412 w2[3] = w2[3] | 0x80000000;
3413 break;
3414
3415 case 48:
3416 w3[0] = 0x80;
3417 break;
3418
3419 case 49:
3420 w3[0] = w3[0] | 0x8000;
3421 break;
3422
3423 case 50:
3424 w3[0] = w3[0] | 0x800000;
3425 break;
3426
3427 case 51:
3428 w3[0] = w3[0] | 0x80000000;
3429 break;
3430
3431 case 52:
3432 w3[1] = 0x80;
3433 break;
3434
3435 case 53:
3436 w3[1] = w3[1] | 0x8000;
3437 break;
3438
3439 case 54:
3440 w3[1] = w3[1] | 0x800000;
3441 break;
3442
3443 case 55:
3444 w3[1] = w3[1] | 0x80000000;
3445 break;
3446
3447 case 56:
3448 w3[2] = 0x80;
3449 break;
3450
3451 case 57:
3452 w3[2] = w3[2] | 0x8000;
3453 break;
3454
3455 case 58:
3456 w3[2] = w3[2] | 0x800000;
3457 break;
3458
3459 case 59:
3460 w3[2] = w3[2] | 0x80000000;
3461 break;
3462
3463 case 60:
3464 w3[3] = 0x80;
3465 break;
3466
3467 case 61:
3468 w3[3] = w3[3] | 0x8000;
3469 break;
3470
3471 case 62:
3472 w3[3] = w3[3] | 0x800000;
3473 break;
3474
3475 case 63:
3476 w3[3] = w3[3] | 0x80000000;
3477 break;
3478
3479 case 64:
3480 w4[0] = 0x80;
3481 break;
3482
3483 case 65:
3484 w4[0] = w4[0] | 0x8000;
3485 break;
3486
3487 case 66:
3488 w4[0] = w4[0] | 0x800000;
3489 break;
3490
3491 case 67:
3492 w4[0] = w4[0] | 0x80000000;
3493 break;
3494
3495 case 68:
3496 w4[1] = 0x80;
3497 break;
3498
3499 case 69:
3500 w4[1] = w4[1] | 0x8000;
3501 break;
3502
3503 case 70:
3504 w4[1] = w4[1] | 0x800000;
3505 break;
3506
3507 case 71:
3508 w4[1] = w4[1] | 0x80000000;
3509 break;
3510
3511 case 72:
3512 w4[2] = 0x80;
3513 break;
3514
3515 case 73:
3516 w4[2] = w4[2] | 0x8000;
3517 break;
3518
3519 case 74:
3520 w4[2] = w4[2] | 0x800000;
3521 break;
3522
3523 case 75:
3524 w4[2] = w4[2] | 0x80000000;
3525 break;
3526
3527 case 76:
3528 w4[3] = 0x80;
3529 break;
3530
3531 case 77:
3532 w4[3] = w4[3] | 0x8000;
3533 break;
3534
3535 case 78:
3536 w4[3] = w4[3] | 0x800000;
3537 break;
3538
3539 case 79:
3540 w4[3] = w4[3] | 0x80000000;
3541 break;
3542
3543 case 80:
3544 w5[0] = 0x80;
3545 break;
3546
3547 case 81:
3548 w5[0] = w5[0] | 0x8000;
3549 break;
3550
3551 case 82:
3552 w5[0] = w5[0] | 0x800000;
3553 break;
3554
3555 case 83:
3556 w5[0] = w5[0] | 0x80000000;
3557 break;
3558
3559 case 84:
3560 w5[1] = 0x80;
3561 break;
3562
3563 case 85:
3564 w5[1] = w5[1] | 0x8000;
3565 break;
3566
3567 case 86:
3568 w5[1] = w5[1] | 0x800000;
3569 break;
3570
3571 case 87:
3572 w5[1] = w5[1] | 0x80000000;
3573 break;
3574
3575 case 88:
3576 w5[2] = 0x80;
3577 break;
3578
3579 case 89:
3580 w5[2] = w5[2] | 0x8000;
3581 break;
3582
3583 case 90:
3584 w5[2] = w5[2] | 0x800000;
3585 break;
3586
3587 case 91:
3588 w5[2] = w5[2] | 0x80000000;
3589 break;
3590
3591 case 92:
3592 w5[3] = 0x80;
3593 break;
3594
3595 case 93:
3596 w5[3] = w5[3] | 0x8000;
3597 break;
3598
3599 case 94:
3600 w5[3] = w5[3] | 0x800000;
3601 break;
3602
3603 case 95:
3604 w5[3] = w5[3] | 0x80000000;
3605 break;
3606
3607 case 96:
3608 w6[0] = 0x80;
3609 break;
3610
3611 case 97:
3612 w6[0] = w6[0] | 0x8000;
3613 break;
3614
3615 case 98:
3616 w6[0] = w6[0] | 0x800000;
3617 break;
3618
3619 case 99:
3620 w6[0] = w6[0] | 0x80000000;
3621 break;
3622
3623 case 100:
3624 w6[1] = 0x80;
3625 break;
3626
3627 case 101:
3628 w6[1] = w6[1] | 0x8000;
3629 break;
3630
3631 case 102:
3632 w6[1] = w6[1] | 0x800000;
3633 break;
3634
3635 case 103:
3636 w6[1] = w6[1] | 0x80000000;
3637 break;
3638
3639 case 104:
3640 w6[2] = 0x80;
3641 break;
3642
3643 case 105:
3644 w6[2] = w6[2] | 0x8000;
3645 break;
3646
3647 case 106:
3648 w6[2] = w6[2] | 0x800000;
3649 break;
3650
3651 case 107:
3652 w6[2] = w6[2] | 0x80000000;
3653 break;
3654
3655 case 108:
3656 w6[3] = 0x80;
3657 break;
3658
3659 case 109:
3660 w6[3] = w6[3] | 0x8000;
3661 break;
3662
3663 case 110:
3664 w6[3] = w6[3] | 0x800000;
3665 break;
3666
3667 case 111:
3668 w6[3] = w6[3] | 0x80000000;
3669 break;
3670
3671 case 112:
3672 w7[0] = 0x80;
3673 break;
3674
3675 case 113:
3676 w7[0] = w7[0] | 0x8000;
3677 break;
3678
3679 case 114:
3680 w7[0] = w7[0] | 0x800000;
3681 break;
3682
3683 case 115:
3684 w7[0] = w7[0] | 0x80000000;
3685 break;
3686
3687 case 116:
3688 w7[1] = 0x80;
3689 break;
3690
3691 case 117:
3692 w7[1] = w7[1] | 0x8000;
3693 break;
3694
3695 case 118:
3696 w7[1] = w7[1] | 0x800000;
3697 break;
3698
3699 case 119:
3700 w7[1] = w7[1] | 0x80000000;
3701 break;
3702
3703 case 120:
3704 w7[2] = 0x80;
3705 break;
3706
3707 case 121:
3708 w7[2] = w7[2] | 0x8000;
3709 break;
3710
3711 case 122:
3712 w7[2] = w7[2] | 0x800000;
3713 break;
3714
3715 case 123:
3716 w7[2] = w7[2] | 0x80000000;
3717 break;
3718
3719 case 124:
3720 w7[3] = 0x80;
3721 break;
3722
3723 case 125:
3724 w7[3] = w7[3] | 0x8000;
3725 break;
3726
3727 case 126:
3728 w7[3] = w7[3] | 0x800000;
3729 break;
3730
3731 case 127:
3732 w7[3] = w7[3] | 0x80000000;
3733 break;
3734 }
3735 }
3736
3737 inline void append_0x80_1x16 (u32x w[16], const u32 offset)
3738 {
3739 switch (offset)
3740 {
3741 case 0:
3742 w[ 0] = 0x80;
3743 break;
3744
3745 case 1:
3746 w[ 0] = w[ 0] | 0x8000;
3747 break;
3748
3749 case 2:
3750 w[ 0] = w[ 0] | 0x800000;
3751 break;
3752
3753 case 3:
3754 w[ 0] = w[ 0] | 0x80000000;
3755 break;
3756
3757 case 4:
3758 w[ 1] = 0x80;
3759 break;
3760
3761 case 5:
3762 w[ 1] = w[ 1] | 0x8000;
3763 break;
3764
3765 case 6:
3766 w[ 1] = w[ 1] | 0x800000;
3767 break;
3768
3769 case 7:
3770 w[ 1] = w[ 1] | 0x80000000;
3771 break;
3772
3773 case 8:
3774 w[ 2] = 0x80;
3775 break;
3776
3777 case 9:
3778 w[ 2] = w[ 2] | 0x8000;
3779 break;
3780
3781 case 10:
3782 w[ 2] = w[ 2] | 0x800000;
3783 break;
3784
3785 case 11:
3786 w[ 2] = w[ 2] | 0x80000000;
3787 break;
3788
3789 case 12:
3790 w[ 3] = 0x80;
3791 break;
3792
3793 case 13:
3794 w[ 3] = w[ 3] | 0x8000;
3795 break;
3796
3797 case 14:
3798 w[ 3] = w[ 3] | 0x800000;
3799 break;
3800
3801 case 15:
3802 w[ 3] = w[ 3] | 0x80000000;
3803 break;
3804
3805 case 16:
3806 w[ 4] = 0x80;
3807 break;
3808
3809 case 17:
3810 w[ 4] = w[ 4] | 0x8000;
3811 break;
3812
3813 case 18:
3814 w[ 4] = w[ 4] | 0x800000;
3815 break;
3816
3817 case 19:
3818 w[ 4] = w[ 4] | 0x80000000;
3819 break;
3820
3821 case 20:
3822 w[ 5] = 0x80;
3823 break;
3824
3825 case 21:
3826 w[ 5] = w[ 5] | 0x8000;
3827 break;
3828
3829 case 22:
3830 w[ 5] = w[ 5] | 0x800000;
3831 break;
3832
3833 case 23:
3834 w[ 5] = w[ 5] | 0x80000000;
3835 break;
3836
3837 case 24:
3838 w[ 6] = 0x80;
3839 break;
3840
3841 case 25:
3842 w[ 6] = w[ 6] | 0x8000;
3843 break;
3844
3845 case 26:
3846 w[ 6] = w[ 6] | 0x800000;
3847 break;
3848
3849 case 27:
3850 w[ 6] = w[ 6] | 0x80000000;
3851 break;
3852
3853 case 28:
3854 w[ 7] = 0x80;
3855 break;
3856
3857 case 29:
3858 w[ 7] = w[ 7] | 0x8000;
3859 break;
3860
3861 case 30:
3862 w[ 7] = w[ 7] | 0x800000;
3863 break;
3864
3865 case 31:
3866 w[ 7] = w[ 7] | 0x80000000;
3867 break;
3868
3869 case 32:
3870 w[ 8] = 0x80;
3871 break;
3872
3873 case 33:
3874 w[ 8] = w[ 8] | 0x8000;
3875 break;
3876
3877 case 34:
3878 w[ 8] = w[ 8] | 0x800000;
3879 break;
3880
3881 case 35:
3882 w[ 8] = w[ 8] | 0x80000000;
3883 break;
3884
3885 case 36:
3886 w[ 9] = 0x80;
3887 break;
3888
3889 case 37:
3890 w[ 9] = w[ 9] | 0x8000;
3891 break;
3892
3893 case 38:
3894 w[ 9] = w[ 9] | 0x800000;
3895 break;
3896
3897 case 39:
3898 w[ 9] = w[ 9] | 0x80000000;
3899 break;
3900
3901 case 40:
3902 w[10] = 0x80;
3903 break;
3904
3905 case 41:
3906 w[10] = w[10] | 0x8000;
3907 break;
3908
3909 case 42:
3910 w[10] = w[10] | 0x800000;
3911 break;
3912
3913 case 43:
3914 w[10] = w[10] | 0x80000000;
3915 break;
3916
3917 case 44:
3918 w[11] = 0x80;
3919 break;
3920
3921 case 45:
3922 w[11] = w[11] | 0x8000;
3923 break;
3924
3925 case 46:
3926 w[11] = w[11] | 0x800000;
3927 break;
3928
3929 case 47:
3930 w[11] = w[11] | 0x80000000;
3931 break;
3932
3933 case 48:
3934 w[12] = 0x80;
3935 break;
3936
3937 case 49:
3938 w[12] = w[12] | 0x8000;
3939 break;
3940
3941 case 50:
3942 w[12] = w[12] | 0x800000;
3943 break;
3944
3945 case 51:
3946 w[12] = w[12] | 0x80000000;
3947 break;
3948
3949 case 52:
3950 w[13] = 0x80;
3951 break;
3952
3953 case 53:
3954 w[13] = w[13] | 0x8000;
3955 break;
3956
3957 case 54:
3958 w[13] = w[13] | 0x800000;
3959 break;
3960
3961 case 55:
3962 w[13] = w[13] | 0x80000000;
3963 break;
3964
3965 case 56:
3966 w[14] = 0x80;
3967 break;
3968
3969 case 57:
3970 w[14] = w[14] | 0x8000;
3971 break;
3972
3973 case 58:
3974 w[14] = w[14] | 0x800000;
3975 break;
3976
3977 case 59:
3978 w[14] = w[14] | 0x80000000;
3979 break;
3980
3981 case 60:
3982 w[15] = 0x80;
3983 break;
3984
3985 case 61:
3986 w[15] = w[15] | 0x8000;
3987 break;
3988
3989 case 62:
3990 w[15] = w[15] | 0x800000;
3991 break;
3992
3993 case 63:
3994 w[15] = w[15] | 0x80000000;
3995 break;
3996 }
3997 }
3998
/**
 * In-place shift of the word buffer w0[0..3], w1[0..3], w2[0..3], w3[0..3]
 * (treated as one sequence in that order) towards the higher-indexed words.
 *
 * w0..w3 : buffer words, overwritten in place
 * offset : shift distance; offset / 4 selects the switch case, i.e. how many
 *          whole words the data moves (the first offset / 4 words are set to
 *          0), the remainder is handed to amd_bytealign / __byte_perm
 *
 * Only offset / 4 in 0..13 has a case; there is no default, so any larger
 * value leaves the buffer untouched. w3[3] is never written, and w3[2] is
 * written only on the IS_AMD / IS_GENERIC path.
 *
 * NOTE(review): the "_le" suffix suggests the words hold their bytes in
 * little-endian order and that offset is a byte count - confirm with callers.
 */
3999 inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
4000 {
4001 #if defined IS_AMD || defined IS_GENERIC
/* remainder of offset within one word (0..3) */
4002 const int offset_mod_4 = offset & 3;
4003
/*
 * Shift argument for amd_bytealign. Note this is 4 - offset, not
 * 4 - offset_mod_4.
 * NOTE(review): presumably fine because amd_bytealign only looks at the low
 * two bits of its third argument (cl_amd_media_ops) - confirm that the
 * IS_GENERIC emulation behaves the same way.
 */
4004 const int offset_minus_4 = 4 - offset;
4005
4006 switch (offset / 4)
4007 {
4008 case 0:
/*
 * Each destination word is built from a pair of neighbouring source words.
 * Assignments run from the highest word down, so every source word is read
 * before it is overwritten.
 */
4009 w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4);
4010 w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4);
4011 w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4012 w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4013 w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4014 w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4015 w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4016 w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4017 w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4018 w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4019 w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4020 w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4021 w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4022 w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4023 w0[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4024
/*
 * Word-aligned offset: move every word down by one index and clear w3[2].
 * NOTE(review): presumably this undoes amd_bytealign returning its second
 * (lower) operand when the shift is a multiple of 4, which would leave the
 * data one word too high - confirm. The same fix-up follows in every case.
 */
4025 if (offset_mod_4 == 0)
4026 {
4027 w0[0] = w0[1];
4028 w0[1] = w0[2];
4029 w0[2] = w0[3];
4030 w0[3] = w1[0];
4031 w1[0] = w1[1];
4032 w1[1] = w1[2];
4033 w1[2] = w1[3];
4034 w1[3] = w2[0];
4035 w2[0] = w2[1];
4036 w2[1] = w2[2];
4037 w2[2] = w2[3];
4038 w2[3] = w3[0];
4039 w3[0] = w3[1];
4040 w3[1] = w3[2];
4041 w3[2] = 0;
4042 }
4043
4044 break;
4045
4046 case 1:
/*
 * Same pattern with the source words taken one index lower; w0[0] is
 * cleared. Each following case moves the sources one more word down and
 * clears one more leading word.
 */
4047 w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4);
4048 w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4);
4049 w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4050 w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4051 w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4052 w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4053 w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4054 w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4055 w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4056 w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4057 w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4058 w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4059 w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4060 w0[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4061 w0[0] = 0;
4062
4063 if (offset_mod_4 == 0)
4064 {
4065 w0[1] = w0[2];
4066 w0[2] = w0[3];
4067 w0[3] = w1[0];
4068 w1[0] = w1[1];
4069 w1[1] = w1[2];
4070 w1[2] = w1[3];
4071 w1[3] = w2[0];
4072 w2[0] = w2[1];
4073 w2[1] = w2[2];
4074 w2[2] = w2[3];
4075 w2[3] = w3[0];
4076 w3[0] = w3[1];
4077 w3[1] = w3[2];
4078 w3[2] = 0;
4079 }
4080
4081 break;
4082
4083 case 2:
4084 w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4);
4085 w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4);
4086 w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4087 w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4088 w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4089 w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4090 w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4091 w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4092 w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4093 w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4094 w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4095 w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4096 w0[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4097 w0[1] = 0;
4098 w0[0] = 0;
4099
4100 if (offset_mod_4 == 0)
4101 {
4102 w0[2] = w0[3];
4103 w0[3] = w1[0];
4104 w1[0] = w1[1];
4105 w1[1] = w1[2];
4106 w1[2] = w1[3];
4107 w1[3] = w2[0];
4108 w2[0] = w2[1];
4109 w2[1] = w2[2];
4110 w2[2] = w2[3];
4111 w2[3] = w3[0];
4112 w3[0] = w3[1];
4113 w3[1] = w3[2];
4114 w3[2] = 0;
4115 }
4116
4117 break;
4118
4119 case 3:
4120 w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4);
4121 w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4);
4122 w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4123 w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4124 w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4125 w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4126 w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4127 w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4128 w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4129 w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4130 w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4131 w0[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4132 w0[2] = 0;
4133 w0[1] = 0;
4134 w0[0] = 0;
4135
4136 if (offset_mod_4 == 0)
4137 {
4138 w0[3] = w1[0];
4139 w1[0] = w1[1];
4140 w1[1] = w1[2];
4141 w1[2] = w1[3];
4142 w1[3] = w2[0];
4143 w2[0] = w2[1];
4144 w2[1] = w2[2];
4145 w2[2] = w2[3];
4146 w2[3] = w3[0];
4147 w3[0] = w3[1];
4148 w3[1] = w3[2];
4149 w3[2] = 0;
4150 }
4151
4152 break;
4153
4154 case 4:
4155 w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4);
4156 w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4);
4157 w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4158 w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4159 w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4160 w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4161 w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4162 w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4163 w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4164 w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4165 w1[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4166 w0[3] = 0;
4167 w0[2] = 0;
4168 w0[1] = 0;
4169 w0[0] = 0;
4170
4171 if (offset_mod_4 == 0)
4172 {
4173 w1[0] = w1[1];
4174 w1[1] = w1[2];
4175 w1[2] = w1[3];
4176 w1[3] = w2[0];
4177 w2[0] = w2[1];
4178 w2[1] = w2[2];
4179 w2[2] = w2[3];
4180 w2[3] = w3[0];
4181 w3[0] = w3[1];
4182 w3[1] = w3[2];
4183 w3[2] = 0;
4184 }
4185
4186 break;
4187
4188 case 5:
4189 w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4);
4190 w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4);
4191 w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4192 w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4193 w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4194 w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4195 w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4196 w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4197 w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4198 w1[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4199 w1[0] = 0;
4200 w0[3] = 0;
4201 w0[2] = 0;
4202 w0[1] = 0;
4203 w0[0] = 0;
4204
4205 if (offset_mod_4 == 0)
4206 {
4207 w1[1] = w1[2];
4208 w1[2] = w1[3];
4209 w1[3] = w2[0];
4210 w2[0] = w2[1];
4211 w2[1] = w2[2];
4212 w2[2] = w2[3];
4213 w2[3] = w3[0];
4214 w3[0] = w3[1];
4215 w3[1] = w3[2];
4216 w3[2] = 0;
4217 }
4218
4219 break;
4220
4221 case 6:
4222 w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4);
4223 w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4);
4224 w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4225 w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4226 w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4227 w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4228 w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4229 w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4230 w1[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4231 w1[1] = 0;
4232 w1[0] = 0;
4233 w0[3] = 0;
4234 w0[2] = 0;
4235 w0[1] = 0;
4236 w0[0] = 0;
4237
4238 if (offset_mod_4 == 0)
4239 {
4240 w1[2] = w1[3];
4241 w1[3] = w2[0];
4242 w2[0] = w2[1];
4243 w2[1] = w2[2];
4244 w2[2] = w2[3];
4245 w2[3] = w3[0];
4246 w3[0] = w3[1];
4247 w3[1] = w3[2];
4248 w3[2] = 0;
4249 }
4250
4251 break;
4252
4253 case 7:
4254 w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4);
4255 w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4);
4256 w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4257 w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4258 w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4259 w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4260 w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4261 w1[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4262 w1[2] = 0;
4263 w1[1] = 0;
4264 w1[0] = 0;
4265 w0[3] = 0;
4266 w0[2] = 0;
4267 w0[1] = 0;
4268 w0[0] = 0;
4269
4270 if (offset_mod_4 == 0)
4271 {
4272 w1[3] = w2[0];
4273 w2[0] = w2[1];
4274 w2[1] = w2[2];
4275 w2[2] = w2[3];
4276 w2[3] = w3[0];
4277 w3[0] = w3[1];
4278 w3[1] = w3[2];
4279 w3[2] = 0;
4280 }
4281
4282 break;
4283
4284 case 8:
4285 w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4);
4286 w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4);
4287 w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4288 w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4289 w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4290 w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4291 w2[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4292 w1[3] = 0;
4293 w1[2] = 0;
4294 w1[1] = 0;
4295 w1[0] = 0;
4296 w0[3] = 0;
4297 w0[2] = 0;
4298 w0[1] = 0;
4299 w0[0] = 0;
4300
4301 if (offset_mod_4 == 0)
4302 {
4303 w2[0] = w2[1];
4304 w2[1] = w2[2];
4305 w2[2] = w2[3];
4306 w2[3] = w3[0];
4307 w3[0] = w3[1];
4308 w3[1] = w3[2];
4309 w3[2] = 0;
4310 }
4311
4312 break;
4313
4314 case 9:
4315 w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4);
4316 w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4);
4317 w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4318 w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4319 w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4320 w2[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4321 w2[0] = 0;
4322 w1[3] = 0;
4323 w1[2] = 0;
4324 w1[1] = 0;
4325 w1[0] = 0;
4326 w0[3] = 0;
4327 w0[2] = 0;
4328 w0[1] = 0;
4329 w0[0] = 0;
4330
4331 if (offset_mod_4 == 0)
4332 {
4333 w2[1] = w2[2];
4334 w2[2] = w2[3];
4335 w2[3] = w3[0];
4336 w3[0] = w3[1];
4337 w3[1] = w3[2];
4338 w3[2] = 0;
4339 }
4340
4341 break;
4342
4343 case 10:
4344 w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4);
4345 w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4);
4346 w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4347 w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4348 w2[2] = amd_bytealign (w0[0], 0, offset_minus_4);
4349 w2[1] = 0;
4350 w2[0] = 0;
4351 w1[3] = 0;
4352 w1[2] = 0;
4353 w1[1] = 0;
4354 w1[0] = 0;
4355 w0[3] = 0;
4356 w0[2] = 0;
4357 w0[1] = 0;
4358 w0[0] = 0;
4359
4360 if (offset_mod_4 == 0)
4361 {
4362 w2[2] = w2[3];
4363 w2[3] = w3[0];
4364 w3[0] = w3[1];
4365 w3[1] = w3[2];
4366 w3[2] = 0;
4367 }
4368
4369 break;
4370
4371 case 11:
4372 w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4);
4373 w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4);
4374 w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4375 w2[3] = amd_bytealign (w0[0], 0, offset_minus_4);
4376 w2[2] = 0;
4377 w2[1] = 0;
4378 w2[0] = 0;
4379 w1[3] = 0;
4380 w1[2] = 0;
4381 w1[1] = 0;
4382 w1[0] = 0;
4383 w0[3] = 0;
4384 w0[2] = 0;
4385 w0[1] = 0;
4386 w0[0] = 0;
4387
4388 if (offset_mod_4 == 0)
4389 {
4390 w2[3] = w3[0];
4391 w3[0] = w3[1];
4392 w3[1] = w3[2];
4393 w3[2] = 0;
4394 }
4395
4396 break;
4397
4398 case 12:
4399 w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4);
4400 w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4);
4401 w3[0] = amd_bytealign (w0[0], 0, offset_minus_4);
4402 w2[3] = 0;
4403 w2[2] = 0;
4404 w2[1] = 0;
4405 w2[0] = 0;
4406 w1[3] = 0;
4407 w1[2] = 0;
4408 w1[1] = 0;
4409 w1[0] = 0;
4410 w0[3] = 0;
4411 w0[2] = 0;
4412 w0[1] = 0;
4413 w0[0] = 0;
4414
4415 if (offset_mod_4 == 0)
4416 {
4417 w3[0] = w3[1];
4418 w3[1] = w3[2];
4419 w3[2] = 0;
4420 }
4421
4422 break;
4423
4424 case 13:
4425 w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4);
4426 w3[1] = amd_bytealign (w0[0], 0, offset_minus_4);
4427 w3[0] = 0;
4428 w2[3] = 0;
4429 w2[2] = 0;
4430 w2[1] = 0;
4431 w2[0] = 0;
4432 w1[3] = 0;
4433 w1[2] = 0;
4434 w1[1] = 0;
4435 w1[0] = 0;
4436 w0[3] = 0;
4437 w0[2] = 0;
4438 w0[1] = 0;
4439 w0[0] = 0;
4440
4441 if (offset_mod_4 == 0)
4442 {
4443 w3[1] = w3[2];
4444 w3[2] = 0;
4445 }
4446
4447 break;
4448 }
4449 #endif
4450
4451 #ifdef IS_NV
/* here the remainder is reduced first, so this value is always 1..4 */
4452 const int offset_minus_4 = 4 - (offset % 4);
4453
/*
 * Low 16 bits of 0x76543210 shifted right by offset_minus_4 nibbles:
 * 0x6543, 0x5432, 0x4321, 0x7654 for offset % 4 = 1, 2, 3, 0.
 * NOTE(review): with __byte_perm (x, y, s) picking bytes 0..3 from x and
 * 4..7 from y (CUDA documentation), 0x7654 returns y unchanged, which would
 * explain why this path has no word-aligned fix-up - confirm.
 */
4454 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
4455
4456 switch (offset / 4)
4457 {
4458 case 0:
/*
 * Highest word first so sources are read before being overwritten.
 * Unlike the IS_AMD / IS_GENERIC path, w3[2] is not written here.
 */
4459 w3[1] = __byte_perm (w3[0], w3[1], selector);
4460 w3[0] = __byte_perm (w2[3], w3[0], selector);
4461 w2[3] = __byte_perm (w2[2], w2[3], selector);
4462 w2[2] = __byte_perm (w2[1], w2[2], selector);
4463 w2[1] = __byte_perm (w2[0], w2[1], selector);
4464 w2[0] = __byte_perm (w1[3], w2[0], selector);
4465 w1[3] = __byte_perm (w1[2], w1[3], selector);
4466 w1[2] = __byte_perm (w1[1], w1[2], selector);
4467 w1[1] = __byte_perm (w1[0], w1[1], selector);
4468 w1[0] = __byte_perm (w0[3], w1[0], selector);
4469 w0[3] = __byte_perm (w0[2], w0[3], selector);
4470 w0[2] = __byte_perm (w0[1], w0[2], selector);
4471 w0[1] = __byte_perm (w0[0], w0[1], selector);
4472 w0[0] = __byte_perm ( 0, w0[0], selector);
4473
4474 break;
4475
4476 case 1:
4477 w3[1] = __byte_perm (w2[3], w3[0], selector);
4478 w3[0] = __byte_perm (w2[2], w2[3], selector);
4479 w2[3] = __byte_perm (w2[1], w2[2], selector);
4480 w2[2] = __byte_perm (w2[0], w2[1], selector);
4481 w2[1] = __byte_perm (w1[3], w2[0], selector);
4482 w2[0] = __byte_perm (w1[2], w1[3], selector);
4483 w1[3] = __byte_perm (w1[1], w1[2], selector);
4484 w1[2] = __byte_perm (w1[0], w1[1], selector);
4485 w1[1] = __byte_perm (w0[3], w1[0], selector);
4486 w1[0] = __byte_perm (w0[2], w0[3], selector);
4487 w0[3] = __byte_perm (w0[1], w0[2], selector);
4488 w0[2] = __byte_perm (w0[0], w0[1], selector);
4489 w0[1] = __byte_perm ( 0, w0[0], selector);
4490 w0[0] = 0;
4491
4492 break;
4493
4494 case 2:
4495 w3[1] = __byte_perm (w2[2], w2[3], selector);
4496 w3[0] = __byte_perm (w2[1], w2[2], selector);
4497 w2[3] = __byte_perm (w2[0], w2[1], selector);
4498 w2[2] = __byte_perm (w1[3], w2[0], selector);
4499 w2[1] = __byte_perm (w1[2], w1[3], selector);
4500 w2[0] = __byte_perm (w1[1], w1[2], selector);
4501 w1[3] = __byte_perm (w1[0], w1[1], selector);
4502 w1[2] = __byte_perm (w0[3], w1[0], selector);
4503 w1[1] = __byte_perm (w0[2], w0[3], selector);
4504 w1[0] = __byte_perm (w0[1], w0[2], selector);
4505 w0[3] = __byte_perm (w0[0], w0[1], selector);
4506 w0[2] = __byte_perm ( 0, w0[0], selector);
4507 w0[1] = 0;
4508 w0[0] = 0;
4509
4510 break;
4511
4512 case 3:
4513 w3[1] = __byte_perm (w2[1], w2[2], selector);
4514 w3[0] = __byte_perm (w2[0], w2[1], selector);
4515 w2[3] = __byte_perm (w1[3], w2[0], selector);
4516 w2[2] = __byte_perm (w1[2], w1[3], selector);
4517 w2[1] = __byte_perm (w1[1], w1[2], selector);
4518 w2[0] = __byte_perm (w1[0], w1[1], selector);
4519 w1[3] = __byte_perm (w0[3], w1[0], selector);
4520 w1[2] = __byte_perm (w0[2], w0[3], selector);
4521 w1[1] = __byte_perm (w0[1], w0[2], selector);
4522 w1[0] = __byte_perm (w0[0], w0[1], selector);
4523 w0[3] = __byte_perm ( 0, w0[0], selector);
4524 w0[2] = 0;
4525 w0[1] = 0;
4526 w0[0] = 0;
4527
4528 break;
4529
4530 case 4:
4531 w3[1] = __byte_perm (w2[0], w2[1], selector);
4532 w3[0] = __byte_perm (w1[3], w2[0], selector);
4533 w2[3] = __byte_perm (w1[2], w1[3], selector);
4534 w2[2] = __byte_perm (w1[1], w1[2], selector);
4535 w2[1] = __byte_perm (w1[0], w1[1], selector);
4536 w2[0] = __byte_perm (w0[3], w1[0], selector);
4537 w1[3] = __byte_perm (w0[2], w0[3], selector);
4538 w1[2] = __byte_perm (w0[1], w0[2], selector);
4539 w1[1] = __byte_perm (w0[0], w0[1], selector);
4540 w1[0] = __byte_perm ( 0, w0[0], selector);
4541 w0[3] = 0;
4542 w0[2] = 0;
4543 w0[1] = 0;
4544 w0[0] = 0;
4545
4546 break;
4547
4548 case 5:
4549 w3[1] = __byte_perm (w1[3], w2[0], selector);
4550 w3[0] = __byte_perm (w1[2], w1[3], selector);
4551 w2[3] = __byte_perm (w1[1], w1[2], selector);
4552 w2[2] = __byte_perm (w1[0], w1[1], selector);
4553 w2[1] = __byte_perm (w0[3], w1[0], selector);
4554 w2[0] = __byte_perm (w0[2], w0[3], selector);
4555 w1[3] = __byte_perm (w0[1], w0[2], selector);
4556 w1[2] = __byte_perm (w0[0], w0[1], selector);
4557 w1[1] = __byte_perm ( 0, w0[0], selector);
4558 w1[0] = 0;
4559 w0[3] = 0;
4560 w0[2] = 0;
4561 w0[1] = 0;
4562 w0[0] = 0;
4563
4564 break;
4565
4566 case 6:
4567 w3[1] = __byte_perm (w1[2], w1[3], selector);
4568 w3[0] = __byte_perm (w1[1], w1[2], selector);
4569 w2[3] = __byte_perm (w1[0], w1[1], selector);
4570 w2[2] = __byte_perm (w0[3], w1[0], selector);
4571 w2[1] = __byte_perm (w0[2], w0[3], selector);
4572 w2[0] = __byte_perm (w0[1], w0[2], selector);
4573 w1[3] = __byte_perm (w0[0], w0[1], selector);
4574 w1[2] = __byte_perm ( 0, w0[0], selector);
4575 w1[1] = 0;
4576 w1[0] = 0;
4577 w0[3] = 0;
4578 w0[2] = 0;
4579 w0[1] = 0;
4580 w0[0] = 0;
4581
4582 break;
4583
4584 case 7:
4585 w3[1] = __byte_perm (w1[1], w1[2], selector);
4586 w3[0] = __byte_perm (w1[0], w1[1], selector);
4587 w2[3] = __byte_perm (w0[3], w1[0], selector);
4588 w2[2] = __byte_perm (w0[2], w0[3], selector);
4589 w2[1] = __byte_perm (w0[1], w0[2], selector);
4590 w2[0] = __byte_perm (w0[0], w0[1], selector);
4591 w1[3] = __byte_perm ( 0, w0[0], selector);
4592 w1[2] = 0;
4593 w1[1] = 0;
4594 w1[0] = 0;
4595 w0[3] = 0;
4596 w0[2] = 0;
4597 w0[1] = 0;
4598 w0[0] = 0;
4599
4600 break;
4601
4602 case 8:
4603 w3[1] = __byte_perm (w1[0], w1[1], selector);
4604 w3[0] = __byte_perm (w0[3], w1[0], selector);
4605 w2[3] = __byte_perm (w0[2], w0[3], selector);
4606 w2[2] = __byte_perm (w0[1], w0[2], selector);
4607 w2[1] = __byte_perm (w0[0], w0[1], selector);
4608 w2[0] = __byte_perm ( 0, w0[0], selector);
4609 w1[3] = 0;
4610 w1[2] = 0;
4611 w1[1] = 0;
4612 w1[0] = 0;
4613 w0[3] = 0;
4614 w0[2] = 0;
4615 w0[1] = 0;
4616 w0[0] = 0;
4617
4618 break;
4619
4620 case 9:
4621 w3[1] = __byte_perm (w0[3], w1[0], selector);
4622 w3[0] = __byte_perm (w0[2], w0[3], selector);
4623 w2[3] = __byte_perm (w0[1], w0[2], selector);
4624 w2[2] = __byte_perm (w0[0], w0[1], selector);
4625 w2[1] = __byte_perm ( 0, w0[0], selector);
4626 w2[0] = 0;
4627 w1[3] = 0;
4628 w1[2] = 0;
4629 w1[1] = 0;
4630 w1[0] = 0;
4631 w0[3] = 0;
4632 w0[2] = 0;
4633 w0[1] = 0;
4634 w0[0] = 0;
4635
4636 break;
4637
4638 case 10:
4639 w3[1] = __byte_perm (w0[2], w0[3], selector);
4640 w3[0] = __byte_perm (w0[1], w0[2], selector);
4641 w2[3] = __byte_perm (w0[0], w0[1], selector);
4642 w2[2] = __byte_perm ( 0, w0[0], selector);
4643 w2[1] = 0;
4644 w2[0] = 0;
4645 w1[3] = 0;
4646 w1[2] = 0;
4647 w1[1] = 0;
4648 w1[0] = 0;
4649 w0[3] = 0;
4650 w0[2] = 0;
4651 w0[1] = 0;
4652 w0[0] = 0;
4653
4654 break;
4655
4656 case 11:
4657 w3[1] = __byte_perm (w0[1], w0[2], selector);
4658 w3[0] = __byte_perm (w0[0], w0[1], selector);
4659 w2[3] = __byte_perm ( 0, w0[0], selector);
4660 w2[2] = 0;
4661 w2[1] = 0;
4662 w2[0] = 0;
4663 w1[3] = 0;
4664 w1[2] = 0;
4665 w1[1] = 0;
4666 w1[0] = 0;
4667 w0[3] = 0;
4668 w0[2] = 0;
4669 w0[1] = 0;
4670 w0[0] = 0;
4671
4672 break;
4673
4674 case 12:
4675 w3[1] = __byte_perm (w0[0], w0[1], selector);
4676 w3[0] = __byte_perm ( 0, w0[0], selector);
4677 w2[3] = 0;
4678 w2[2] = 0;
4679 w2[1] = 0;
4680 w2[0] = 0;
4681 w1[3] = 0;
4682 w1[2] = 0;
4683 w1[1] = 0;
4684 w1[0] = 0;
4685 w0[3] = 0;
4686 w0[2] = 0;
4687 w0[1] = 0;
4688 w0[0] = 0;
4689
4690 break;
4691
4692 case 13:
4693 w3[1] = __byte_perm ( 0, w0[0], selector);
4694 w3[0] = 0;
4695 w2[3] = 0;
4696 w2[2] = 0;
4697 w2[1] = 0;
4698 w2[0] = 0;
4699 w1[3] = 0;
4700 w1[2] = 0;
4701 w1[1] = 0;
4702 w1[0] = 0;
4703 w0[3] = 0;
4704 w0[2] = 0;
4705 w0[1] = 0;
4706 w0[0] = 0;
4707
4708 break;
4709 }
4710 #endif
4711 }
4712
/**
 * In-place shift of the word buffer w0[0..3], w1[0..3], w2[0..3], w3[0..3]
 * (treated as one sequence in that order) towards the higher-indexed words.
 * Compared with switch_buffer_by_offset_le the operands of amd_bytealign and
 * __byte_perm are passed in the opposite order (lower-indexed word first).
 *
 * w0..w3 : buffer words, overwritten in place
 * offset : shift distance; offset / 4 selects the switch case, i.e. how many
 *          whole words the data moves (the first offset / 4 words are set to
 *          0), the remainder is handed to amd_bytealign / __byte_perm
 *
 * Only offset / 4 in 0..13 has a case; there is no default, so any larger
 * value leaves the buffer untouched. w3[3] is never written, and w3[2] is
 * written only on the IS_AMD / IS_GENERIC path.
 *
 * NOTE(review): the "_be" suffix suggests the words hold their bytes in
 * big-endian order and that offset is a byte count - confirm with callers.
 */
4713 inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
4714 {
4715 #if defined IS_AMD || defined IS_GENERIC
/*
 * offset is passed to amd_bytealign unreduced.
 * NOTE(review): presumably fine because amd_bytealign only looks at the low
 * two bits of its third argument (cl_amd_media_ops) - confirm that the
 * IS_GENERIC emulation behaves the same way. Under that reading a
 * word-aligned offset returns the second operand, so no fix-up pass is
 * needed on this path.
 */
4716 switch (offset / 4)
4717 {
4718 case 0:
/*
 * Assignments run from the highest word down, so every source word is read
 * before it is overwritten. w3[2] receives what is shifted out of w3[1].
 */
4719 w3[2] = amd_bytealign (w3[1], 0, offset);
4720 w3[1] = amd_bytealign (w3[0], w3[1], offset);
4721 w3[0] = amd_bytealign (w2[3], w3[0], offset);
4722 w2[3] = amd_bytealign (w2[2], w2[3], offset);
4723 w2[2] = amd_bytealign (w2[1], w2[2], offset);
4724 w2[1] = amd_bytealign (w2[0], w2[1], offset);
4725 w2[0] = amd_bytealign (w1[3], w2[0], offset);
4726 w1[3] = amd_bytealign (w1[2], w1[3], offset);
4727 w1[2] = amd_bytealign (w1[1], w1[2], offset);
4728 w1[1] = amd_bytealign (w1[0], w1[1], offset);
4729 w1[0] = amd_bytealign (w0[3], w1[0], offset);
4730 w0[3] = amd_bytealign (w0[2], w0[3], offset);
4731 w0[2] = amd_bytealign (w0[1], w0[2], offset);
4732 w0[1] = amd_bytealign (w0[0], w0[1], offset);
4733 w0[0] = amd_bytealign ( 0, w0[0], offset);
4734 break;
4735
4736 case 1:
/*
 * Same pattern with the source words taken one index lower; w0[0] is
 * cleared. Each following case moves the sources one more word down and
 * clears one more leading word.
 */
4737 w3[2] = amd_bytealign (w3[0], 0, offset);
4738 w3[1] = amd_bytealign (w2[3], w3[0], offset);
4739 w3[0] = amd_bytealign (w2[2], w2[3], offset);
4740 w2[3] = amd_bytealign (w2[1], w2[2], offset);
4741 w2[2] = amd_bytealign (w2[0], w2[1], offset);
4742 w2[1] = amd_bytealign (w1[3], w2[0], offset);
4743 w2[0] = amd_bytealign (w1[2], w1[3], offset);
4744 w1[3] = amd_bytealign (w1[1], w1[2], offset);
4745 w1[2] = amd_bytealign (w1[0], w1[1], offset);
4746 w1[1] = amd_bytealign (w0[3], w1[0], offset);
4747 w1[0] = amd_bytealign (w0[2], w0[3], offset);
4748 w0[3] = amd_bytealign (w0[1], w0[2], offset);
4749 w0[2] = amd_bytealign (w0[0], w0[1], offset);
4750 w0[1] = amd_bytealign ( 0, w0[0], offset);
4751 w0[0] = 0;
4752 break;
4753
4754 case 2:
4755 w3[2] = amd_bytealign (w2[3], 0, offset);
4756 w3[1] = amd_bytealign (w2[2], w2[3], offset);
4757 w3[0] = amd_bytealign (w2[1], w2[2], offset);
4758 w2[3] = amd_bytealign (w2[0], w2[1], offset);
4759 w2[2] = amd_bytealign (w1[3], w2[0], offset);
4760 w2[1] = amd_bytealign (w1[2], w1[3], offset);
4761 w2[0] = amd_bytealign (w1[1], w1[2], offset);
4762 w1[3] = amd_bytealign (w1[0], w1[1], offset);
4763 w1[2] = amd_bytealign (w0[3], w1[0], offset);
4764 w1[1] = amd_bytealign (w0[2], w0[3], offset);
4765 w1[0] = amd_bytealign (w0[1], w0[2], offset);
4766 w0[3] = amd_bytealign (w0[0], w0[1], offset);
4767 w0[2] = amd_bytealign ( 0, w0[0], offset);
4768 w0[1] = 0;
4769 w0[0] = 0;
4770 break;
4771
4772 case 3:
4773 w3[2] = amd_bytealign (w2[2], 0, offset);
4774 w3[1] = amd_bytealign (w2[1], w2[2], offset);
4775 w3[0] = amd_bytealign (w2[0], w2[1], offset);
4776 w2[3] = amd_bytealign (w1[3], w2[0], offset);
4777 w2[2] = amd_bytealign (w1[2], w1[3], offset);
4778 w2[1] = amd_bytealign (w1[1], w1[2], offset);
4779 w2[0] = amd_bytealign (w1[0], w1[1], offset);
4780 w1[3] = amd_bytealign (w0[3], w1[0], offset);
4781 w1[2] = amd_bytealign (w0[2], w0[3], offset);
4782 w1[1] = amd_bytealign (w0[1], w0[2], offset);
4783 w1[0] = amd_bytealign (w0[0], w0[1], offset);
4784 w0[3] = amd_bytealign ( 0, w0[0], offset);
4785 w0[2] = 0;
4786 w0[1] = 0;
4787 w0[0] = 0;
4788 break;
4789
4790 case 4:
4791 w3[2] = amd_bytealign (w2[1], 0, offset);
4792 w3[1] = amd_bytealign (w2[0], w2[1], offset);
4793 w3[0] = amd_bytealign (w1[3], w2[0], offset);
4794 w2[3] = amd_bytealign (w1[2], w1[3], offset);
4795 w2[2] = amd_bytealign (w1[1], w1[2], offset);
4796 w2[1] = amd_bytealign (w1[0], w1[1], offset);
4797 w2[0] = amd_bytealign (w0[3], w1[0], offset);
4798 w1[3] = amd_bytealign (w0[2], w0[3], offset);
4799 w1[2] = amd_bytealign (w0[1], w0[2], offset);
4800 w1[1] = amd_bytealign (w0[0], w0[1], offset);
4801 w1[0] = amd_bytealign ( 0, w0[0], offset);
4802 w0[3] = 0;
4803 w0[2] = 0;
4804 w0[1] = 0;
4805 w0[0] = 0;
4806 break;
4807
4808 case 5:
4809 w3[2] = amd_bytealign (w2[0], 0, offset);
4810 w3[1] = amd_bytealign (w1[3], w2[0], offset);
4811 w3[0] = amd_bytealign (w1[2], w1[3], offset);
4812 w2[3] = amd_bytealign (w1[1], w1[2], offset);
4813 w2[2] = amd_bytealign (w1[0], w1[1], offset);
4814 w2[1] = amd_bytealign (w0[3], w1[0], offset);
4815 w2[0] = amd_bytealign (w0[2], w0[3], offset);
4816 w1[3] = amd_bytealign (w0[1], w0[2], offset);
4817 w1[2] = amd_bytealign (w0[0], w0[1], offset);
4818 w1[1] = amd_bytealign ( 0, w0[0], offset);
4819 w1[0] = 0;
4820 w0[3] = 0;
4821 w0[2] = 0;
4822 w0[1] = 0;
4823 w0[0] = 0;
4824 break;
4825
4826 case 6:
4827 w3[2] = amd_bytealign (w1[3], 0, offset);
4828 w3[1] = amd_bytealign (w1[2], w1[3], offset);
4829 w3[0] = amd_bytealign (w1[1], w1[2], offset);
4830 w2[3] = amd_bytealign (w1[0], w1[1], offset);
4831 w2[2] = amd_bytealign (w0[3], w1[0], offset);
4832 w2[1] = amd_bytealign (w0[2], w0[3], offset);
4833 w2[0] = amd_bytealign (w0[1], w0[2], offset);
4834 w1[3] = amd_bytealign (w0[0], w0[1], offset);
4835 w1[2] = amd_bytealign ( 0, w0[0], offset);
4836 w1[1] = 0;
4837 w1[0] = 0;
4838 w0[3] = 0;
4839 w0[2] = 0;
4840 w0[1] = 0;
4841 w0[0] = 0;
4842 break;
4843
4844 case 7:
4845 w3[2] = amd_bytealign (w1[2], 0, offset);
4846 w3[1] = amd_bytealign (w1[1], w1[2], offset);
4847 w3[0] = amd_bytealign (w1[0], w1[1], offset);
4848 w2[3] = amd_bytealign (w0[3], w1[0], offset);
4849 w2[2] = amd_bytealign (w0[2], w0[3], offset);
4850 w2[1] = amd_bytealign (w0[1], w0[2], offset);
4851 w2[0] = amd_bytealign (w0[0], w0[1], offset);
4852 w1[3] = amd_bytealign ( 0, w0[0], offset);
4853 w1[2] = 0;
4854 w1[1] = 0;
4855 w1[0] = 0;
4856 w0[3] = 0;
4857 w0[2] = 0;
4858 w0[1] = 0;
4859 w0[0] = 0;
4860 break;
4861
4862 case 8:
4863 w3[2] = amd_bytealign (w1[1], 0, offset);
4864 w3[1] = amd_bytealign (w1[0], w1[1], offset);
4865 w3[0] = amd_bytealign (w0[3], w1[0], offset);
4866 w2[3] = amd_bytealign (w0[2], w0[3], offset);
4867 w2[2] = amd_bytealign (w0[1], w0[2], offset);
4868 w2[1] = amd_bytealign (w0[0], w0[1], offset);
4869 w2[0] = amd_bytealign ( 0, w0[0], offset);
4870 w1[3] = 0;
4871 w1[2] = 0;
4872 w1[1] = 0;
4873 w1[0] = 0;
4874 w0[3] = 0;
4875 w0[2] = 0;
4876 w0[1] = 0;
4877 w0[0] = 0;
4878 break;
4879
4880 case 9:
4881 w3[2] = amd_bytealign (w1[0], 0, offset);
4882 w3[1] = amd_bytealign (w0[3], w1[0], offset);
4883 w3[0] = amd_bytealign (w0[2], w0[3], offset);
4884 w2[3] = amd_bytealign (w0[1], w0[2], offset);
4885 w2[2] = amd_bytealign (w0[0], w0[1], offset);
4886 w2[1] = amd_bytealign ( 0, w0[0], offset);
4887 w2[0] = 0;
4888 w1[3] = 0;
4889 w1[2] = 0;
4890 w1[1] = 0;
4891 w1[0] = 0;
4892 w0[3] = 0;
4893 w0[2] = 0;
4894 w0[1] = 0;
4895 w0[0] = 0;
4896 break;
4897
4898 case 10:
4899 w3[2] = amd_bytealign (w0[3], 0, offset);
4900 w3[1] = amd_bytealign (w0[2], w0[3], offset);
4901 w3[0] = amd_bytealign (w0[1], w0[2], offset);
4902 w2[3] = amd_bytealign (w0[0], w0[1], offset);
4903 w2[2] = amd_bytealign ( 0, w0[0], offset);
4904 w2[1] = 0;
4905 w2[0] = 0;
4906 w1[3] = 0;
4907 w1[2] = 0;
4908 w1[1] = 0;
4909 w1[0] = 0;
4910 w0[3] = 0;
4911 w0[2] = 0;
4912 w0[1] = 0;
4913 w0[0] = 0;
4914 break;
4915
4916 case 11:
4917 w3[2] = amd_bytealign (w0[2], 0, offset);
4918 w3[1] = amd_bytealign (w0[1], w0[2], offset);
4919 w3[0] = amd_bytealign (w0[0], w0[1], offset);
4920 w2[3] = amd_bytealign ( 0, w0[0], offset);
4921 w2[2] = 0;
4922 w2[1] = 0;
4923 w2[0] = 0;
4924 w1[3] = 0;
4925 w1[2] = 0;
4926 w1[1] = 0;
4927 w1[0] = 0;
4928 w0[3] = 0;
4929 w0[2] = 0;
4930 w0[1] = 0;
4931 w0[0] = 0;
4932 break;
4933
4934 case 12:
4935 w3[2] = amd_bytealign (w0[1], 0, offset);
4936 w3[1] = amd_bytealign (w0[0], w0[1], offset);
4937 w3[0] = amd_bytealign ( 0, w0[0], offset);
4938 w2[3] = 0;
4939 w2[2] = 0;
4940 w2[1] = 0;
4941 w2[0] = 0;
4942 w1[3] = 0;
4943 w1[2] = 0;
4944 w1[1] = 0;
4945 w1[0] = 0;
4946 w0[3] = 0;
4947 w0[2] = 0;
4948 w0[1] = 0;
4949 w0[0] = 0;
4950 break;
4951
4952 case 13:
4953 w3[2] = amd_bytealign (w0[0], 0, offset);
4954 w3[1] = amd_bytealign ( 0, w0[0], offset);
4955 w3[0] = 0;
4956 w2[3] = 0;
4957 w2[2] = 0;
4958 w2[1] = 0;
4959 w2[0] = 0;
4960 w1[3] = 0;
4961 w1[2] = 0;
4962 w1[1] = 0;
4963 w1[0] = 0;
4964 w0[3] = 0;
4965 w0[2] = 0;
4966 w0[1] = 0;
4967 w0[0] = 0;
4968 break;
4969 }
4970 #endif
4971
4972 #ifdef IS_NV
/*
 * Low 16 bits of 0x76543210 shifted right by (offset & 3) nibbles:
 * 0x3210, 0x4321, 0x5432, 0x6543 for offset & 3 = 0, 1, 2, 3.
 * NOTE(review): with __byte_perm (x, y, s) picking bytes 0..3 from x and
 * 4..7 from y (CUDA documentation), 0x3210 returns x unchanged, i.e. a
 * word-aligned offset is a pure word move - confirm.
 */
4973 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
4974
4975 switch (offset / 4)
4976 {
4977 case 0:
/*
 * Highest word first so sources are read before being overwritten.
 * Unlike the IS_AMD / IS_GENERIC path, w3[2] is not written here.
 */
4978 w3[1] = __byte_perm (w3[1], w3[0], selector);
4979 w3[0] = __byte_perm (w3[0], w2[3], selector);
4980 w2[3] = __byte_perm (w2[3], w2[2], selector);
4981 w2[2] = __byte_perm (w2[2], w2[1], selector);
4982 w2[1] = __byte_perm (w2[1], w2[0], selector);
4983 w2[0] = __byte_perm (w2[0], w1[3], selector);
4984 w1[3] = __byte_perm (w1[3], w1[2], selector);
4985 w1[2] = __byte_perm (w1[2], w1[1], selector);
4986 w1[1] = __byte_perm (w1[1], w1[0], selector);
4987 w1[0] = __byte_perm (w1[0], w0[3], selector);
4988 w0[3] = __byte_perm (w0[3], w0[2], selector);
4989 w0[2] = __byte_perm (w0[2], w0[1], selector);
4990 w0[1] = __byte_perm (w0[1], w0[0], selector);
4991 w0[0] = __byte_perm (w0[0], 0, selector);
4992 break;
4993
4994 case 1:
4995 w3[1] = __byte_perm (w3[0], w2[3], selector);
4996 w3[0] = __byte_perm (w2[3], w2[2], selector);
4997 w2[3] = __byte_perm (w2[2], w2[1], selector);
4998 w2[2] = __byte_perm (w2[1], w2[0], selector);
4999 w2[1] = __byte_perm (w2[0], w1[3], selector);
5000 w2[0] = __byte_perm (w1[3], w1[2], selector);
5001 w1[3] = __byte_perm (w1[2], w1[1], selector);
5002 w1[2] = __byte_perm (w1[1], w1[0], selector);
5003 w1[1] = __byte_perm (w1[0], w0[3], selector);
5004 w1[0] = __byte_perm (w0[3], w0[2], selector);
5005 w0[3] = __byte_perm (w0[2], w0[1], selector);
5006 w0[2] = __byte_perm (w0[1], w0[0], selector);
5007 w0[1] = __byte_perm (w0[0], 0, selector);
5008 w0[0] = 0;
5009 break;
5010
5011 case 2:
5012 w3[1] = __byte_perm (w2[3], w2[2], selector);
5013 w3[0] = __byte_perm (w2[2], w2[1], selector);
5014 w2[3] = __byte_perm (w2[1], w2[0], selector);
5015 w2[2] = __byte_perm (w2[0], w1[3], selector);
5016 w2[1] = __byte_perm (w1[3], w1[2], selector);
5017 w2[0] = __byte_perm (w1[2], w1[1], selector);
5018 w1[3] = __byte_perm (w1[1], w1[0], selector);
5019 w1[2] = __byte_perm (w1[0], w0[3], selector);
5020 w1[1] = __byte_perm (w0[3], w0[2], selector);
5021 w1[0] = __byte_perm (w0[2], w0[1], selector);
5022 w0[3] = __byte_perm (w0[1], w0[0], selector);
5023 w0[2] = __byte_perm (w0[0], 0, selector);
5024 w0[1] = 0;
5025 w0[0] = 0;
5026 break;
5027
5028 case 3:
5029 w3[1] = __byte_perm (w2[2], w2[1], selector);
5030 w3[0] = __byte_perm (w2[1], w2[0], selector);
5031 w2[3] = __byte_perm (w2[0], w1[3], selector);
5032 w2[2] = __byte_perm (w1[3], w1[2], selector);
5033 w2[1] = __byte_perm (w1[2], w1[1], selector);
5034 w2[0] = __byte_perm (w1[1], w1[0], selector);
5035 w1[3] = __byte_perm (w1[0], w0[3], selector);
5036 w1[2] = __byte_perm (w0[3], w0[2], selector);
5037 w1[1] = __byte_perm (w0[2], w0[1], selector);
5038 w1[0] = __byte_perm (w0[1], w0[0], selector);
5039 w0[3] = __byte_perm (w0[0], 0, selector);
5040 w0[2] = 0;
5041 w0[1] = 0;
5042 w0[0] = 0;
5043 break;
5044
5045 case 4:
5046 w3[1] = __byte_perm (w2[1], w2[0], selector);
5047 w3[0] = __byte_perm (w2[0], w1[3], selector);
5048 w2[3] = __byte_perm (w1[3], w1[2], selector);
5049 w2[2] = __byte_perm (w1[2], w1[1], selector);
5050 w2[1] = __byte_perm (w1[1], w1[0], selector);
5051 w2[0] = __byte_perm (w1[0], w0[3], selector);
5052 w1[3] = __byte_perm (w0[3], w0[2], selector);
5053 w1[2] = __byte_perm (w0[2], w0[1], selector);
5054 w1[1] = __byte_perm (w0[1], w0[0], selector);
5055 w1[0] = __byte_perm (w0[0], 0, selector);
5056 w0[3] = 0;
5057 w0[2] = 0;
5058 w0[1] = 0;
5059 w0[0] = 0;
5060 break;
5061
5062 case 5:
5063 w3[1] = __byte_perm (w2[0], w1[3], selector);
5064 w3[0] = __byte_perm (w1[3], w1[2], selector);
5065 w2[3] = __byte_perm (w1[2], w1[1], selector);
5066 w2[2] = __byte_perm (w1[1], w1[0], selector);
5067 w2[1] = __byte_perm (w1[0], w0[3], selector);
5068 w2[0] = __byte_perm (w0[3], w0[2], selector);
5069 w1[3] = __byte_perm (w0[2], w0[1], selector);
5070 w1[2] = __byte_perm (w0[1], w0[0], selector);
5071 w1[1] = __byte_perm (w0[0], 0, selector);
5072 w1[0] = 0;
5073 w0[3] = 0;
5074 w0[2] = 0;
5075 w0[1] = 0;
5076 w0[0] = 0;
5077 break;
5078
5079 case 6:
5080 w3[1] = __byte_perm (w1[3], w1[2], selector);
5081 w3[0] = __byte_perm (w1[2], w1[1], selector);
5082 w2[3] = __byte_perm (w1[1], w1[0], selector);
5083 w2[2] = __byte_perm (w1[0], w0[3], selector);
5084 w2[1] = __byte_perm (w0[3], w0[2], selector);
5085 w2[0] = __byte_perm (w0[2], w0[1], selector);
5086 w1[3] = __byte_perm (w0[1], w0[0], selector);
5087 w1[2] = __byte_perm (w0[0], 0, selector);
5088 w1[1] = 0;
5089 w1[0] = 0;
5090 w0[3] = 0;
5091 w0[2] = 0;
5092 w0[1] = 0;
5093 w0[0] = 0;
5094 break;
5095
5096 case 7:
5097 w3[1] = __byte_perm (w1[2], w1[1], selector);
5098 w3[0] = __byte_perm (w1[1], w1[0], selector);
5099 w2[3] = __byte_perm (w1[0], w0[3], selector);
5100 w2[2] = __byte_perm (w0[3], w0[2], selector);
5101 w2[1] = __byte_perm (w0[2], w0[1], selector);
5102 w2[0] = __byte_perm (w0[1], w0[0], selector);
5103 w1[3] = __byte_perm (w0[0], 0, selector);
5104 w1[2] = 0;
5105 w1[1] = 0;
5106 w1[0] = 0;
5107 w0[3] = 0;
5108 w0[2] = 0;
5109 w0[1] = 0;
5110 w0[0] = 0;
5111 break;
5112
5113 case 8:
5114 w3[1] = __byte_perm (w1[1], w1[0], selector);
5115 w3[0] = __byte_perm (w1[0], w0[3], selector);
5116 w2[3] = __byte_perm (w0[3], w0[2], selector);
5117 w2[2] = __byte_perm (w0[2], w0[1], selector);
5118 w2[1] = __byte_perm (w0[1], w0[0], selector);
5119 w2[0] = __byte_perm (w0[0], 0, selector);
5120 w1[3] = 0;
5121 w1[2] = 0;
5122 w1[1] = 0;
5123 w1[0] = 0;
5124 w0[3] = 0;
5125 w0[2] = 0;
5126 w0[1] = 0;
5127 w0[0] = 0;
5128 break;
5129
5130 case 9:
5131 w3[1] = __byte_perm (w1[0], w0[3], selector);
5132 w3[0] = __byte_perm (w0[3], w0[2], selector);
5133 w2[3] = __byte_perm (w0[2], w0[1], selector);
5134 w2[2] = __byte_perm (w0[1], w0[0], selector);
5135 w2[1] = __byte_perm (w0[0], 0, selector);
5136 w2[0] = 0;
5137 w1[3] = 0;
5138 w1[2] = 0;
5139 w1[1] = 0;
5140 w1[0] = 0;
5141 w0[3] = 0;
5142 w0[2] = 0;
5143 w0[1] = 0;
5144 w0[0] = 0;
5145 break;
5146
5147 case 10:
5148 w3[1] = __byte_perm (w0[3], w0[2], selector);
5149 w3[0] = __byte_perm (w0[2], w0[1], selector);
5150 w2[3] = __byte_perm (w0[1], w0[0], selector);
5151 w2[2] = __byte_perm (w0[0], 0, selector);
5152 w2[1] = 0;
5153 w2[0] = 0;
5154 w1[3] = 0;
5155 w1[2] = 0;
5156 w1[1] = 0;
5157 w1[0] = 0;
5158 w0[3] = 0;
5159 w0[2] = 0;
5160 w0[1] = 0;
5161 w0[0] = 0;
5162 break;
5163
5164 case 11:
5165 w3[1] = __byte_perm (w0[2], w0[1], selector);
5166 w3[0] = __byte_perm (w0[1], w0[0], selector);
5167 w2[3] = __byte_perm (w0[0], 0, selector);
5168 w2[2] = 0;
5169 w2[1] = 0;
5170 w2[0] = 0;
5171 w1[3] = 0;
5172 w1[2] = 0;
5173 w1[1] = 0;
5174 w1[0] = 0;
5175 w0[3] = 0;
5176 w0[2] = 0;
5177 w0[1] = 0;
5178 w0[0] = 0;
5179 break;
5180
5181 case 12:
5182 w3[1] = __byte_perm (w0[1], w0[0], selector);
5183 w3[0] = __byte_perm (w0[0], 0, selector);
5184 w2[3] = 0;
5185 w2[2] = 0;
5186 w2[1] = 0;
5187 w2[0] = 0;
5188 w1[3] = 0;
5189 w1[2] = 0;
5190 w1[1] = 0;
5191 w1[0] = 0;
5192 w0[3] = 0;
5193 w0[2] = 0;
5194 w0[1] = 0;
5195 w0[0] = 0;
5196 break;
5197
5198 case 13:
5199 w3[1] = __byte_perm (w0[0], 0, selector);
5200 w3[0] = 0;
5201 w2[3] = 0;
5202 w2[2] = 0;
5203 w2[1] = 0;
5204 w2[0] = 0;
5205 w1[3] = 0;
5206 w1[2] = 0;
5207 w1[1] = 0;
5208 w1[0] = 0;
5209 w0[3] = 0;
5210 w0[2] = 0;
5211 w0[1] = 0;
5212 w0[0] = 0;
5213 break;
5214 }
5215 #endif
5216 }
5217
5218 inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len)
5219 {
5220 #if defined cl_amd_media_ops
5221 switch (salt_len)
5222 {
5223 case 0: sw[0] = w0;
5224 break;
5225 case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3);
5226 sw[1] = amd_bytealign (sw[1] >> 8, w0, 3);
5227 break;
5228 case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2);
5229 sw[1] = amd_bytealign (sw[1] >> 16, w0, 2);
5230 break;
5231 case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1);
5232 sw[1] = amd_bytealign (sw[1] >> 24, w0, 1);
5233 break;
5234 case 4: sw[1] = w0;
5235 break;
5236 case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3);
5237 sw[2] = amd_bytealign (sw[2] >> 8, w0, 3);
5238 break;
5239 case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2);
5240 sw[2] = amd_bytealign (sw[2] >> 16, w0, 2);
5241 break;
5242 case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1);
5243 sw[2] = amd_bytealign (sw[2] >> 24, w0, 1);
5244 break;
5245 case 8: sw[2] = w0;
5246 break;
5247 case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3);
5248 sw[3] = amd_bytealign (sw[3] >> 8, w0, 3);
5249 break;
5250 case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2);
5251 sw[3] = amd_bytealign (sw[3] >> 16, w0, 2);
5252 break;
5253 case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1);
5254 sw[3] = amd_bytealign (sw[3] >> 24, w0, 1);
5255 break;
5256 case 12: sw[3] = w0;
5257 break;
5258 case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3);
5259 sw[4] = amd_bytealign (sw[4] >> 8, w0, 3);
5260 break;
5261 case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2);
5262 sw[4] = amd_bytealign (sw[4] >> 16, w0, 2);
5263 break;
5264 case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1);
5265 sw[4] = amd_bytealign (sw[4] >> 24, w0, 1);
5266 break;
5267 case 16: sw[4] = w0;
5268 break;
5269 case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3);
5270 sw[5] = amd_bytealign (sw[5] >> 8, w0, 3);
5271 break;
5272 case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2);
5273 sw[5] = amd_bytealign (sw[5] >> 16, w0, 2);
5274 break;
5275 case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1);
5276 sw[5] = amd_bytealign (sw[5] >> 24, w0, 1);
5277 break;
5278 case 20: sw[5] = w0;
5279 break;
5280 case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3);
5281 sw[6] = amd_bytealign (sw[6] >> 8, w0, 3);
5282 break;
5283 case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2);
5284 sw[6] = amd_bytealign (sw[6] >> 16, w0, 2);
5285 break;
5286 case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1);
5287 sw[6] = amd_bytealign (sw[6] >> 24, w0, 1);
5288 break;
5289 case 24: sw[6] = w0;
5290 break;
5291 case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3);
5292 sw[7] = amd_bytealign (sw[7] >> 8, w0, 3);
5293 break;
5294 case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2);
5295 sw[7] = amd_bytealign (sw[7] >> 16, w0, 2);
5296 break;
5297 case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1);
5298 sw[7] = amd_bytealign (sw[7] >> 24, w0, 1);
5299 break;
5300 case 28: sw[7] = w0;
5301 break;
5302 case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3);
5303 sw[8] = amd_bytealign (sw[8] >> 8, w0, 3);
5304 break;
5305 case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2);
5306 sw[8] = amd_bytealign (sw[8] >> 16, w0, 2);
5307 break;
5308 case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1);
5309 sw[8] = amd_bytealign (sw[8] >> 24, w0, 1);
5310 break;
5311 }
5312 #else
5313 switch (salt_len)
5314 {
5315 case 0: sw[0] = w0;
5316 break;
5317 case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8);
5318 sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5319 break;
5320 case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16);
5321 sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5322 break;
5323 case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24);
5324 sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5325 break;
5326 case 4: sw[1] = w0;
5327 break;
5328 case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5329 sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5330 break;
5331 case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5332 sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5333 break;
5334 case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5335 sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5336 break;
5337 case 8: sw[2] = w0;
5338 break;
5339 case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5340 sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5341 break;
5342 case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5343 sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5344 break;
5345 case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5346 sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5347 break;
5348 case 12: sw[3] = w0;
5349 break;
5350 case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5351 sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5352 break;
5353 case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5354 sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5355 break;
5356 case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5357 sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5358 break;
5359 case 16: sw[4] = w0;
5360 break;
5361 case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5362 sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5363 break;
5364 case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5365 sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5366 break;
5367 case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5368 sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5369 break;
5370 case 20: sw[5] = w0;
5371 break;
5372 case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5373 sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5374 break;
5375 case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5376 sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5377 break;
5378 case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5379 sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5380 break;
5381 case 24: sw[6] = w0;
5382 break;
5383 case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5384 sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5385 break;
5386 case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5387 sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5388 break;
5389 case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5390 sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5391 break;
5392 case 28: sw[7] = w0;
5393 break;
5394 case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5395 sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24);
5396 break;
5397 case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5398 sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16);
5399 break;
5400 case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5401 sw[8] = (sw[8] & 0xff000000) | (w0 >> 8);
5402 break;
5403 }
5404 #endif
5405 }
5406
5407 inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len)
5408 {
5409 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5410
5411 switch (salt_len)
5412 {
5413 case 0: sw[0] = w0;
5414 break;
5415 case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8);
5416 sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24);
5417 break;
5418 case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16);
5419 sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16);
5420 break;
5421 case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24);
5422 sw[1] = (sw[1] & 0x000000ff) | (w0 << 8);
5423 break;
5424 case 4: sw[1] = w0;
5425 break;
5426 case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8);
5427 sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24);
5428 break;
5429 case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16);
5430 sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16);
5431 break;
5432 case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24);
5433 sw[2] = (sw[2] & 0x000000ff) | (w0 << 8);
5434 break;
5435 case 8: sw[2] = w0;
5436 break;
5437 case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8);
5438 sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24);
5439 break;
5440 case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16);
5441 sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16);
5442 break;
5443 case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24);
5444 sw[3] = (sw[3] & 0x000000ff) | (w0 << 8);
5445 break;
5446 case 12: sw[3] = w0;
5447 break;
5448 case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8);
5449 sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24);
5450 break;
5451 case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16);
5452 sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16);
5453 break;
5454 case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24);
5455 sw[4] = (sw[4] & 0x000000ff) | (w0 << 8);
5456 break;
5457 case 16: sw[4] = w0;
5458 break;
5459 case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8);
5460 sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24);
5461 break;
5462 case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16);
5463 sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16);
5464 break;
5465 case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24);
5466 sw[5] = (sw[5] & 0x000000ff) | (w0 << 8);
5467 break;
5468 case 20: sw[5] = w0;
5469 break;
5470 case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8);
5471 sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24);
5472 break;
5473 case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16);
5474 sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16);
5475 break;
5476 case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24);
5477 sw[6] = (sw[6] & 0x000000ff) | (w0 << 8);
5478 break;
5479 case 24: sw[6] = w0;
5480 break;
5481 case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8);
5482 sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24);
5483 break;
5484 case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16);
5485 sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16);
5486 break;
5487 case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24);
5488 sw[7] = (sw[7] & 0x000000ff) | (w0 << 8);
5489 break;
5490 case 28: sw[7] = w0;
5491 break;
5492 case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8);
5493 sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24);
5494 break;
5495 case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16);
5496 sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16);
5497 break;
5498 case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24);
5499 sw[8] = (sw[8] & 0x000000ff) | (w0 << 8);
5500 break;
5501 }
5502 }
5503
5504 inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5505 {
5506 #if defined cl_amd_media_ops
5507 switch (salt_len)
5508 {
5509 case 0: w0[0] = wx;
5510 break;
5511 case 1: w0[0] = amd_bytealign (wx, w0[0] << 24, 3);
5512 w0[1] = amd_bytealign (w0[1] >> 8, wx, 3);
5513 break;
5514 case 2: w0[0] = amd_bytealign (wx, w0[0] << 16, 2);
5515 w0[1] = amd_bytealign (w0[1] >> 16, wx, 2);
5516 break;
5517 case 3: w0[0] = amd_bytealign (wx, w0[0] << 8, 1);
5518 w0[1] = amd_bytealign (w0[1] >> 24, wx, 1);
5519 break;
5520 case 4: w0[1] = wx;
5521 break;
5522 case 5: w0[1] = amd_bytealign (wx, w0[1] << 24, 3);
5523 w0[2] = amd_bytealign (w0[2] >> 8, wx, 3);
5524 break;
5525 case 6: w0[1] = amd_bytealign (wx, w0[1] << 16, 2);
5526 w0[2] = amd_bytealign (w0[2] >> 16, wx, 2);
5527 break;
5528 case 7: w0[1] = amd_bytealign (wx, w0[1] << 8, 1);
5529 w0[2] = amd_bytealign (w0[2] >> 24, wx, 1);
5530 break;
5531 case 8: w0[2] = wx;
5532 break;
5533 case 9: w0[2] = amd_bytealign (wx, w0[2] << 24, 3);
5534 w0[3] = amd_bytealign (w0[3] >> 8, wx, 3);
5535 break;
5536 case 10: w0[2] = amd_bytealign (wx, w0[2] << 16, 2);
5537 w0[3] = amd_bytealign (w0[3] >> 16, wx, 2);
5538 break;
5539 case 11: w0[2] = amd_bytealign (wx, w0[2] << 8, 1);
5540 w0[3] = amd_bytealign (w0[3] >> 24, wx, 1);
5541 break;
5542 case 12: w0[3] = wx;
5543 break;
5544 case 13: w0[3] = amd_bytealign (wx, w0[3] << 24, 3);
5545 w1[0] = amd_bytealign (w1[0] >> 8, wx, 3);
5546 break;
5547 case 14: w0[3] = amd_bytealign (wx, w0[3] << 16, 2);
5548 w1[0] = amd_bytealign (w1[0] >> 16, wx, 2);
5549 break;
5550 case 15: w0[3] = amd_bytealign (wx, w0[3] << 8, 1);
5551 w1[0] = amd_bytealign (w1[0] >> 24, wx, 1);
5552 break;
5553 case 16: w1[0] = wx;
5554 break;
5555 case 17: w1[0] = amd_bytealign (wx, w1[0] << 24, 3);
5556 w1[1] = amd_bytealign (w1[1] >> 8, wx, 3);
5557 break;
5558 case 18: w1[0] = amd_bytealign (wx, w1[0] << 16, 2);
5559 w1[1] = amd_bytealign (w1[1] >> 16, wx, 2);
5560 break;
5561 case 19: w1[0] = amd_bytealign (wx, w1[0] << 8, 1);
5562 w1[1] = amd_bytealign (w1[1] >> 24, wx, 1);
5563 break;
5564 case 20: w1[1] = wx;
5565 break;
5566 case 21: w1[1] = amd_bytealign (wx, w1[1] << 24, 3);
5567 w1[2] = amd_bytealign (w1[2] >> 8, wx, 3);
5568 break;
5569 case 22: w1[1] = amd_bytealign (wx, w1[1] << 16, 2);
5570 w1[2] = amd_bytealign (w1[2] >> 16, wx, 2);
5571 break;
5572 case 23: w1[1] = amd_bytealign (wx, w1[1] << 8, 1);
5573 w1[2] = amd_bytealign (w1[2] >> 24, wx, 1);
5574 break;
5575 case 24: w1[2] = wx;
5576 break;
5577 case 25: w1[2] = amd_bytealign (wx, w1[2] << 24, 3);
5578 w1[3] = amd_bytealign (w1[3] >> 8, wx, 3);
5579 break;
5580 case 26: w1[2] = amd_bytealign (wx, w1[2] << 16, 2);
5581 w1[3] = amd_bytealign (w1[3] >> 16, wx, 2);
5582 break;
5583 case 27: w1[2] = amd_bytealign (wx, w1[2] << 8, 1);
5584 w1[3] = amd_bytealign (w1[3] >> 24, wx, 1);
5585 break;
5586 case 28: w1[3] = wx;
5587 break;
5588 case 29: w1[3] = amd_bytealign (wx, w1[3] << 24, 3);
5589 w2[0] = amd_bytealign (w2[0] >> 8, wx, 3);
5590 break;
5591 case 30: w1[3] = amd_bytealign (wx, w1[3] << 16, 2);
5592 w2[0] = amd_bytealign (w2[0] >> 16, wx, 2);
5593 break;
5594 case 31: w1[3] = amd_bytealign (wx, w1[3] << 8, 1);
5595 w2[0] = amd_bytealign (w2[0] >> 24, wx, 1);
5596 break;
5597 case 32: w2[0] = wx;
5598 break;
5599 case 33: w2[0] = amd_bytealign (wx, w2[0] << 24, 3);
5600 w2[1] = amd_bytealign (w2[1] >> 8, wx, 3);
5601 break;
5602 case 34: w2[0] = amd_bytealign (wx, w2[0] << 16, 2);
5603 w2[1] = amd_bytealign (w2[1] >> 16, wx, 2);
5604 break;
5605 case 35: w2[0] = amd_bytealign (wx, w2[0] << 8, 1);
5606 w2[1] = amd_bytealign (w2[1] >> 24, wx, 1);
5607 break;
5608 case 36: w2[1] = wx;
5609 break;
5610 case 37: w2[1] = amd_bytealign (wx, w2[1] << 24, 3);
5611 w2[2] = amd_bytealign (w2[2] >> 8, wx, 3);
5612 break;
5613 case 38: w2[1] = amd_bytealign (wx, w2[1] << 16, 2);
5614 w2[2] = amd_bytealign (w2[2] >> 16, wx, 2);
5615 break;
5616 case 39: w2[1] = amd_bytealign (wx, w2[1] << 8, 1);
5617 w2[2] = amd_bytealign (w2[2] >> 24, wx, 1);
5618 break;
5619 case 40: w2[2] = wx;
5620 break;
5621 case 41: w2[2] = amd_bytealign (wx, w2[2] << 24, 3);
5622 w2[3] = amd_bytealign (w2[3] >> 8, wx, 3);
5623 break;
5624 case 42: w2[2] = amd_bytealign (wx, w2[2] << 16, 2);
5625 w2[3] = amd_bytealign (w2[3] >> 16, wx, 2);
5626 break;
5627 case 43: w2[2] = amd_bytealign (wx, w2[2] << 8, 1);
5628 w2[3] = amd_bytealign (w2[3] >> 24, wx, 1);
5629 break;
5630 case 44: w2[3] = wx;
5631 break;
5632 case 45: w2[3] = amd_bytealign (wx, w2[3] << 24, 3);
5633 w3[0] = amd_bytealign (w3[0] >> 8, wx, 3);
5634 break;
5635 case 46: w2[3] = amd_bytealign (wx, w2[3] << 16, 2);
5636 w3[0] = amd_bytealign (w3[0] >> 16, wx, 2);
5637 break;
5638 case 47: w2[3] = amd_bytealign (wx, w2[3] << 8, 1);
5639 w3[0] = amd_bytealign (w3[0] >> 24, wx, 1);
5640 break;
5641 case 48: w3[0] = wx;
5642 break;
5643 case 49: w3[0] = amd_bytealign (wx, w3[0] << 24, 3);
5644 w3[1] = amd_bytealign (w3[1] >> 8, wx, 3);
5645 break;
5646 case 50: w3[0] = amd_bytealign (wx, w3[0] << 16, 2);
5647 w3[1] = amd_bytealign (w3[1] >> 16, wx, 2);
5648 break;
5649 case 51: w3[0] = amd_bytealign (wx, w3[0] << 8, 1);
5650 w3[1] = amd_bytealign (w3[1] >> 24, wx, 1);
5651 break;
5652 case 52: w3[1] = wx;
5653 break;
5654 case 53: w3[1] = amd_bytealign (wx, w3[1] << 24, 3);
5655 w3[2] = amd_bytealign (w3[2] >> 8, wx, 3);
5656 break;
5657 case 54: w3[1] = amd_bytealign (wx, w3[1] << 16, 2);
5658 w3[2] = amd_bytealign (w3[2] >> 16, wx, 2);
5659 break;
5660 case 55: w3[1] = amd_bytealign (wx, w3[1] << 8, 1);
5661 w3[2] = amd_bytealign (w3[2] >> 24, wx, 1);
5662 break;
5663 case 56: w3[2] = wx;
5664 break;
5665 case 57: w3[2] = amd_bytealign (wx, w3[2] << 24, 3);
5666 w3[3] = amd_bytealign (w3[3] >> 8, wx, 3);
5667 break;
5668 case 58: w3[2] = amd_bytealign (wx, w3[2] << 16, 2);
5669 w3[3] = amd_bytealign (w3[3] >> 16, wx, 2);
5670 break;
5671 case 59: w3[2] = amd_bytealign (wx, w3[2] << 8, 1);
5672 w3[3] = amd_bytealign (w3[3] >> 24, wx, 1);
5673 break;
5674 case 60: w3[3] = wx;
5675 break;
5676 case 61: w3[3] = amd_bytealign (wx, w3[3] << 24, 3);
5677 //w4[0] = amd_bytealign (w4[0] >> 8, wx, 3);
5678 break;
5679 case 62: w3[3] = amd_bytealign (wx, w3[3] << 16, 2);
5680 //w4[0] = amd_bytealign (w4[0] >> 16, wx, 2);
5681 break;
5682 case 63: w3[3] = amd_bytealign (wx, w3[3] << 8, 1);
5683 //w4[0] = amd_bytealign (w4[0] >> 24, wx, 1);
5684 break;
5685 }
5686 #else
5687 switch (salt_len)
5688 {
5689 case 0: w0[0] = wx;
5690 break;
5691 case 1: w0[0] = (w0[0] & 0x000000ff) | (wx << 8);
5692 w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5693 break;
5694 case 2: w0[0] = (w0[0] & 0x0000ffff) | (wx << 16);
5695 w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5696 break;
5697 case 3: w0[0] = (w0[0] & 0x00ffffff) | (wx << 24);
5698 w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5699 break;
5700 case 4: w0[1] = wx;
5701 break;
5702 case 5: w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5703 w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5704 break;
5705 case 6: w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5706 w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5707 break;
5708 case 7: w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5709 w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5710 break;
5711 case 8: w0[2] = wx;
5712 break;
5713 case 9: w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5714 w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5715 break;
5716 case 10: w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5717 w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5718 break;
5719 case 11: w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5720 w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5721 break;
5722 case 12: w0[3] = wx;
5723 break;
5724 case 13: w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5725 w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5726 break;
5727 case 14: w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5728 w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5729 break;
5730 case 15: w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5731 w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5732 break;
5733 case 16: w1[0] = wx;
5734 break;
5735 case 17: w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5736 w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5737 break;
5738 case 18: w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5739 w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5740 break;
5741 case 19: w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5742 w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5743 break;
5744 case 20: w1[1] = wx;
5745 break;
5746 case 21: w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5747 w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5748 break;
5749 case 22: w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5750 w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5751 break;
5752 case 23: w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5753 w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5754 break;
5755 case 24: w1[2] = wx;
5756 break;
5757 case 25: w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5758 w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5759 break;
5760 case 26: w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5761 w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5762 break;
5763 case 27: w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5764 w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5765 break;
5766 case 28: w1[3] = wx;
5767 break;
5768 case 29: w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5769 w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5770 break;
5771 case 30: w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5772 w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5773 break;
5774 case 31: w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5775 w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5776 break;
5777 case 32: w2[0] = wx;
5778 break;
5779 case 33: w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5780 w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5781 break;
5782 case 34: w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5783 w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5784 break;
5785 case 35: w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5786 w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5787 break;
5788 case 36: w2[1] = wx;
5789 break;
5790 case 37: w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5791 w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5792 break;
5793 case 38: w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5794 w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5795 break;
5796 case 39: w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5797 w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5798 break;
5799 case 40: w2[2] = wx;
5800 break;
5801 case 41: w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5802 w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
5803 break;
5804 case 42: w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5805 w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
5806 break;
5807 case 43: w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5808 w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5809 break;
5810 case 44: w2[3] = wx;
5811 break;
5812 case 45: w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5813 w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
5814 break;
5815 case 46: w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5816 w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
5817 break;
5818 case 47: w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5819 w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
5820 break;
5821 case 48: w3[0] = wx;
5822 break;
5823 case 49: w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
5824 w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
5825 break;
5826 case 50: w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
5827 w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
5828 break;
5829 case 51: w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
5830 w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
5831 break;
5832 case 52: w3[1] = wx;
5833 break;
5834 case 53: w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
5835 w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
5836 break;
5837 case 54: w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
5838 w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
5839 break;
5840 case 55: w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
5841 w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
5842 break;
5843 case 56: w3[2] = wx;
5844 break;
5845 case 57: w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
5846 w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
5847 break;
5848 case 58: w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
5849 w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
5850 break;
5851 case 59: w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
5852 w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
5853 break;
5854 case 60: w3[3] = wx;
5855 break;
5856 case 61: w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
5857 //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24);
5858 break;
5859 case 62: w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
5860 //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16);
5861 break;
5862 case 63: w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
5863 //w4[0] = (w4[0] & 0xff000000) | (wx >> 8);
5864 break;
5865 }
5866 #endif
5867 }
5868
5869 inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len)
5870 {
5871 // would be nice to have optimization based on amd_bytealign as with _le counterpart
5872
5873 switch (salt_len)
5874 {
5875 case 0: w0[0] = wx;
5876 break;
5877 case 1: w0[0] = (w0[0] & 0xff000000) | (wx >> 8);
5878 w0[1] = (w0[1] & 0x00ffffff) | (wx << 24);
5879 break;
5880 case 2: w0[0] = (w0[0] & 0xffff0000) | (wx >> 16);
5881 w0[1] = (w0[1] & 0x0000ffff) | (wx << 16);
5882 break;
5883 case 3: w0[0] = (w0[0] & 0xffffff00) | (wx >> 24);
5884 w0[1] = (w0[1] & 0x000000ff) | (wx << 8);
5885 break;
5886 case 4: w0[1] = wx;
5887 break;
5888 case 5: w0[1] = (w0[1] & 0xff000000) | (wx >> 8);
5889 w0[2] = (w0[2] & 0x00ffffff) | (wx << 24);
5890 break;
5891 case 6: w0[1] = (w0[1] & 0xffff0000) | (wx >> 16);
5892 w0[2] = (w0[2] & 0x0000ffff) | (wx << 16);
5893 break;
5894 case 7: w0[1] = (w0[1] & 0xffffff00) | (wx >> 24);
5895 w0[2] = (w0[2] & 0x000000ff) | (wx << 8);
5896 break;
5897 case 8: w0[2] = wx;
5898 break;
5899 case 9: w0[2] = (w0[2] & 0xff000000) | (wx >> 8);
5900 w0[3] = (w0[3] & 0x00ffffff) | (wx << 24);
5901 break;
5902 case 10: w0[2] = (w0[2] & 0xffff0000) | (wx >> 16);
5903 w0[3] = (w0[3] & 0x0000ffff) | (wx << 16);
5904 break;
5905 case 11: w0[2] = (w0[2] & 0xffffff00) | (wx >> 24);
5906 w0[3] = (w0[3] & 0x000000ff) | (wx << 8);
5907 break;
5908 case 12: w0[3] = wx;
5909 break;
5910 case 13: w0[3] = (w0[3] & 0xff000000) | (wx >> 8);
5911 w1[0] = (w1[0] & 0x00ffffff) | (wx << 24);
5912 break;
5913 case 14: w0[3] = (w0[3] & 0xffff0000) | (wx >> 16);
5914 w1[0] = (w1[0] & 0x0000ffff) | (wx << 16);
5915 break;
5916 case 15: w0[3] = (w0[3] & 0xffffff00) | (wx >> 24);
5917 w1[0] = (w1[0] & 0x000000ff) | (wx << 8);
5918 break;
5919 case 16: w1[0] = wx;
5920 break;
5921 case 17: w1[0] = (w1[0] & 0xff000000) | (wx >> 8);
5922 w1[1] = (w1[1] & 0x00ffffff) | (wx << 24);
5923 break;
5924 case 18: w1[0] = (w1[0] & 0xffff0000) | (wx >> 16);
5925 w1[1] = (w1[1] & 0x0000ffff) | (wx << 16);
5926 break;
5927 case 19: w1[0] = (w1[0] & 0xffffff00) | (wx >> 24);
5928 w1[1] = (w1[1] & 0x000000ff) | (wx << 8);
5929 break;
5930 case 20: w1[1] = wx;
5931 break;
5932 case 21: w1[1] = (w1[1] & 0xff000000) | (wx >> 8);
5933 w1[2] = (w1[2] & 0x00ffffff) | (wx << 24);
5934 break;
5935 case 22: w1[1] = (w1[1] & 0xffff0000) | (wx >> 16);
5936 w1[2] = (w1[2] & 0x0000ffff) | (wx << 16);
5937 break;
5938 case 23: w1[1] = (w1[1] & 0xffffff00) | (wx >> 24);
5939 w1[2] = (w1[2] & 0x000000ff) | (wx << 8);
5940 break;
5941 case 24: w1[2] = wx;
5942 break;
5943 case 25: w1[2] = (w1[2] & 0xff000000) | (wx >> 8);
5944 w1[3] = (w1[3] & 0x00ffffff) | (wx << 24);
5945 break;
5946 case 26: w1[2] = (w1[2] & 0xffff0000) | (wx >> 16);
5947 w1[3] = (w1[3] & 0x0000ffff) | (wx << 16);
5948 break;
5949 case 27: w1[2] = (w1[2] & 0xffffff00) | (wx >> 24);
5950 w1[3] = (w1[3] & 0x000000ff) | (wx << 8);
5951 break;
5952 case 28: w1[3] = wx;
5953 break;
5954 case 29: w1[3] = (w1[3] & 0xff000000) | (wx >> 8);
5955 w2[0] = (w2[0] & 0x00ffffff) | (wx << 24);
5956 break;
5957 case 30: w1[3] = (w1[3] & 0xffff0000) | (wx >> 16);
5958 w2[0] = (w2[0] & 0x0000ffff) | (wx << 16);
5959 break;
5960 case 31: w1[3] = (w1[3] & 0xffffff00) | (wx >> 24);
5961 w2[0] = (w2[0] & 0x000000ff) | (wx << 8);
5962 break;
5963 case 32: w2[0] = wx;
5964 break;
5965 case 33: w2[0] = (w2[0] & 0xff000000) | (wx >> 8);
5966 w2[1] = (w2[1] & 0x00ffffff) | (wx << 24);
5967 break;
5968 case 34: w2[0] = (w2[0] & 0xffff0000) | (wx >> 16);
5969 w2[1] = (w2[1] & 0x0000ffff) | (wx << 16);
5970 break;
5971 case 35: w2[0] = (w2[0] & 0xffffff00) | (wx >> 24);
5972 w2[1] = (w2[1] & 0x000000ff) | (wx << 8);
5973 break;
5974 case 36: w2[1] = wx;
5975 break;
5976 case 37: w2[1] = (w2[1] & 0xff000000) | (wx >> 8);
5977 w2[2] = (w2[2] & 0x00ffffff) | (wx << 24);
5978 break;
5979 case 38: w2[1] = (w2[1] & 0xffff0000) | (wx >> 16);
5980 w2[2] = (w2[2] & 0x0000ffff) | (wx << 16);
5981 break;
5982 case 39: w2[1] = (w2[1] & 0xffffff00) | (wx >> 24);
5983 w2[2] = (w2[2] & 0x000000ff) | (wx << 8);
5984 break;
5985 case 40: w2[2] = wx;
5986 break;
5987 case 41: w2[2] = (w2[2] & 0xff000000) | (wx >> 8);
5988 w2[3] = (w2[3] & 0x00ffffff) | (wx << 24);
5989 break;
5990 case 42: w2[2] = (w2[2] & 0xffff0000) | (wx >> 16);
5991 w2[3] = (w2[3] & 0x0000ffff) | (wx << 16);
5992 break;
5993 case 43: w2[2] = (w2[2] & 0xffffff00) | (wx >> 24);
5994 w2[3] = (w2[3] & 0x000000ff) | (wx << 8);
5995 break;
5996 case 44: w2[3] = wx;
5997 break;
5998 case 45: w2[3] = (w2[3] & 0xff000000) | (wx >> 8);
5999 w3[0] = (w3[0] & 0x00ffffff) | (wx << 24);
6000 break;
6001 case 46: w2[3] = (w2[3] & 0xffff0000) | (wx >> 16);
6002 w3[0] = (w3[0] & 0x0000ffff) | (wx << 16);
6003 break;
6004 case 47: w2[3] = (w2[3] & 0xffffff00) | (wx >> 24);
6005 w3[0] = (w3[0] & 0x000000ff) | (wx << 8);
6006 break;
6007 case 48: w3[0] = wx;
6008 break;
6009 case 49: w3[0] = (w3[0] & 0xff000000) | (wx >> 8);
6010 w3[1] = (w3[1] & 0x00ffffff) | (wx << 24);
6011 break;
6012 case 50: w3[0] = (w3[0] & 0xffff0000) | (wx >> 16);
6013 w3[1] = (w3[1] & 0x0000ffff) | (wx << 16);
6014 break;
6015 case 51: w3[0] = (w3[0] & 0xffffff00) | (wx >> 24);
6016 w3[1] = (w3[1] & 0x000000ff) | (wx << 8);
6017 break;
6018 case 52: w3[1] = wx;
6019 break;
6020 case 53: w3[1] = (w3[1] & 0xff000000) | (wx >> 8);
6021 w3[2] = (w3[2] & 0x00ffffff) | (wx << 24);
6022 break;
6023 case 54: w3[1] = (w3[1] & 0xffff0000) | (wx >> 16);
6024 w3[2] = (w3[2] & 0x0000ffff) | (wx << 16);
6025 break;
6026 case 55: w3[1] = (w3[1] & 0xffffff00) | (wx >> 24);
6027 w3[2] = (w3[2] & 0x000000ff) | (wx << 8);
6028 break;
6029 case 56: w3[2] = wx;
6030 break;
6031 case 57: w3[2] = (w3[2] & 0xff000000) | (wx >> 8);
6032 w3[3] = (w3[3] & 0x00ffffff) | (wx << 24);
6033 break;
6034 case 58: w3[2] = (w3[2] & 0xffff0000) | (wx >> 16);
6035 w3[3] = (w3[3] & 0x0000ffff) | (wx << 16);
6036 break;
6037 case 59: w3[2] = (w3[2] & 0xffffff00) | (wx >> 24);
6038 w3[3] = (w3[3] & 0x000000ff) | (wx << 8);
6039 break;
6040 case 60: w3[3] = wx;
6041 break;
6042 case 61: w3[3] = (w3[3] & 0xff000000) | (wx >> 8);
6043 //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24);
6044 break;
6045 case 62: w3[3] = (w3[3] & 0xffff0000) | (wx >> 16);
6046 //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16);
6047 break;
6048 case 63: w3[3] = (w3[3] & 0xffffff00) | (wx >> 24);
6049 //w4[0] = (w4[0] & 0x000000ff) | (wx << 8);
6050 break;
6051 }
6052 }
6053
6054 /**
6055 * vector functions as scalar (for outer loop usage)
6056 */
6057
6058 inline void append_0x01_1x4_S (u32 w0[4], const u32 offset)
6059 {
6060 switch (offset)
6061 {
6062 case 0:
6063 w0[0] = 0x01;
6064 break;
6065
6066 case 1:
6067 w0[0] = w0[0] | 0x0100;
6068 break;
6069
6070 case 2:
6071 w0[0] = w0[0] | 0x010000;
6072 break;
6073
6074 case 3:
6075 w0[0] = w0[0] | 0x01000000;
6076 break;
6077
6078 case 4:
6079 w0[1] = 0x01;
6080 break;
6081
6082 case 5:
6083 w0[1] = w0[1] | 0x0100;
6084 break;
6085
6086 case 6:
6087 w0[1] = w0[1] | 0x010000;
6088 break;
6089
6090 case 7:
6091 w0[1] = w0[1] | 0x01000000;
6092 break;
6093
6094 case 8:
6095 w0[2] = 0x01;
6096 break;
6097
6098 case 9:
6099 w0[2] = w0[2] | 0x0100;
6100 break;
6101
6102 case 10:
6103 w0[2] = w0[2] | 0x010000;
6104 break;
6105
6106 case 11:
6107 w0[2] = w0[2] | 0x01000000;
6108 break;
6109
6110 case 12:
6111 w0[3] = 0x01;
6112 break;
6113
6114 case 13:
6115 w0[3] = w0[3] | 0x0100;
6116 break;
6117
6118 case 14:
6119 w0[3] = w0[3] | 0x010000;
6120 break;
6121
6122 case 15:
6123 w0[3] = w0[3] | 0x01000000;
6124 break;
6125 }
6126 }
6127
6128 inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
6129 {
6130 switch (offset)
6131 {
6132 case 0:
6133 w0[0] = 0x01;
6134 break;
6135
6136 case 1:
6137 w0[0] = w0[0] | 0x0100;
6138 break;
6139
6140 case 2:
6141 w0[0] = w0[0] | 0x010000;
6142 break;
6143
6144 case 3:
6145 w0[0] = w0[0] | 0x01000000;
6146 break;
6147
6148 case 4:
6149 w0[1] = 0x01;
6150 break;
6151
6152 case 5:
6153 w0[1] = w0[1] | 0x0100;
6154 break;
6155
6156 case 6:
6157 w0[1] = w0[1] | 0x010000;
6158 break;
6159
6160 case 7:
6161 w0[1] = w0[1] | 0x01000000;
6162 break;
6163
6164 case 8:
6165 w0[2] = 0x01;
6166 break;
6167
6168 case 9:
6169 w0[2] = w0[2] | 0x0100;
6170 break;
6171
6172 case 10:
6173 w0[2] = w0[2] | 0x010000;
6174 break;
6175
6176 case 11:
6177 w0[2] = w0[2] | 0x01000000;
6178 break;
6179
6180 case 12:
6181 w0[3] = 0x01;
6182 break;
6183
6184 case 13:
6185 w0[3] = w0[3] | 0x0100;
6186 break;
6187
6188 case 14:
6189 w0[3] = w0[3] | 0x010000;
6190 break;
6191
6192 case 15:
6193 w0[3] = w0[3] | 0x01000000;
6194 break;
6195
6196 case 16:
6197 w1[0] = 0x01;
6198 break;
6199
6200 case 17:
6201 w1[0] = w1[0] | 0x0100;
6202 break;
6203
6204 case 18:
6205 w1[0] = w1[0] | 0x010000;
6206 break;
6207
6208 case 19:
6209 w1[0] = w1[0] | 0x01000000;
6210 break;
6211
6212 case 20:
6213 w1[1] = 0x01;
6214 break;
6215
6216 case 21:
6217 w1[1] = w1[1] | 0x0100;
6218 break;
6219
6220 case 22:
6221 w1[1] = w1[1] | 0x010000;
6222 break;
6223
6224 case 23:
6225 w1[1] = w1[1] | 0x01000000;
6226 break;
6227
6228 case 24:
6229 w1[2] = 0x01;
6230 break;
6231
6232 case 25:
6233 w1[2] = w1[2] | 0x0100;
6234 break;
6235
6236 case 26:
6237 w1[2] = w1[2] | 0x010000;
6238 break;
6239
6240 case 27:
6241 w1[2] = w1[2] | 0x01000000;
6242 break;
6243
6244 case 28:
6245 w1[3] = 0x01;
6246 break;
6247
6248 case 29:
6249 w1[3] = w1[3] | 0x0100;
6250 break;
6251
6252 case 30:
6253 w1[3] = w1[3] | 0x010000;
6254 break;
6255
6256 case 31:
6257 w1[3] = w1[3] | 0x01000000;
6258 break;
6259 }
6260 }
6261
6262 inline void append_0x01_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
6263 {
6264 switch (offset)
6265 {
6266 case 0:
6267 w0[0] = 0x01;
6268 break;
6269
6270 case 1:
6271 w0[0] = w0[0] | 0x0100;
6272 break;
6273
6274 case 2:
6275 w0[0] = w0[0] | 0x010000;
6276 break;
6277
6278 case 3:
6279 w0[0] = w0[0] | 0x01000000;
6280 break;
6281
6282 case 4:
6283 w0[1] = 0x01;
6284 break;
6285
6286 case 5:
6287 w0[1] = w0[1] | 0x0100;
6288 break;
6289
6290 case 6:
6291 w0[1] = w0[1] | 0x010000;
6292 break;
6293
6294 case 7:
6295 w0[1] = w0[1] | 0x01000000;
6296 break;
6297
6298 case 8:
6299 w0[2] = 0x01;
6300 break;
6301
6302 case 9:
6303 w0[2] = w0[2] | 0x0100;
6304 break;
6305
6306 case 10:
6307 w0[2] = w0[2] | 0x010000;
6308 break;
6309
6310 case 11:
6311 w0[2] = w0[2] | 0x01000000;
6312 break;
6313
6314 case 12:
6315 w0[3] = 0x01;
6316 break;
6317
6318 case 13:
6319 w0[3] = w0[3] | 0x0100;
6320 break;
6321
6322 case 14:
6323 w0[3] = w0[3] | 0x010000;
6324 break;
6325
6326 case 15:
6327 w0[3] = w0[3] | 0x01000000;
6328 break;
6329
6330 case 16:
6331 w1[0] = 0x01;
6332 break;
6333
6334 case 17:
6335 w1[0] = w1[0] | 0x0100;
6336 break;
6337
6338 case 18:
6339 w1[0] = w1[0] | 0x010000;
6340 break;
6341
6342 case 19:
6343 w1[0] = w1[0] | 0x01000000;
6344 break;
6345
6346 case 20:
6347 w1[1] = 0x01;
6348 break;
6349
6350 case 21:
6351 w1[1] = w1[1] | 0x0100;
6352 break;
6353
6354 case 22:
6355 w1[1] = w1[1] | 0x010000;
6356 break;
6357
6358 case 23:
6359 w1[1] = w1[1] | 0x01000000;
6360 break;
6361
6362 case 24:
6363 w1[2] = 0x01;
6364 break;
6365
6366 case 25:
6367 w1[2] = w1[2] | 0x0100;
6368 break;
6369
6370 case 26:
6371 w1[2] = w1[2] | 0x010000;
6372 break;
6373
6374 case 27:
6375 w1[2] = w1[2] | 0x01000000;
6376 break;
6377
6378 case 28:
6379 w1[3] = 0x01;
6380 break;
6381
6382 case 29:
6383 w1[3] = w1[3] | 0x0100;
6384 break;
6385
6386 case 30:
6387 w1[3] = w1[3] | 0x010000;
6388 break;
6389
6390 case 31:
6391 w1[3] = w1[3] | 0x01000000;
6392 break;
6393
6394 case 32:
6395 w2[0] = 0x01;
6396 break;
6397
6398 case 33:
6399 w2[0] = w2[0] | 0x0100;
6400 break;
6401
6402 case 34:
6403 w2[0] = w2[0] | 0x010000;
6404 break;
6405
6406 case 35:
6407 w2[0] = w2[0] | 0x01000000;
6408 break;
6409
6410 case 36:
6411 w2[1] = 0x01;
6412 break;
6413
6414 case 37:
6415 w2[1] = w2[1] | 0x0100;
6416 break;
6417
6418 case 38:
6419 w2[1] = w2[1] | 0x010000;
6420 break;
6421
6422 case 39:
6423 w2[1] = w2[1] | 0x01000000;
6424 break;
6425
6426 case 40:
6427 w2[2] = 0x01;
6428 break;
6429
6430 case 41:
6431 w2[2] = w2[2] | 0x0100;
6432 break;
6433
6434 case 42:
6435 w2[2] = w2[2] | 0x010000;
6436 break;
6437
6438 case 43:
6439 w2[2] = w2[2] | 0x01000000;
6440 break;
6441
6442 case 44:
6443 w2[3] = 0x01;
6444 break;
6445
6446 case 45:
6447 w2[3] = w2[3] | 0x0100;
6448 break;
6449
6450 case 46:
6451 w2[3] = w2[3] | 0x010000;
6452 break;
6453
6454 case 47:
6455 w2[3] = w2[3] | 0x01000000;
6456 break;
6457 }
6458 }
6459
6460 inline void append_0x01_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
6461 {
6462 switch (offset)
6463 {
6464 case 0:
6465 w0[0] = 0x01;
6466 break;
6467
6468 case 1:
6469 w0[0] = w0[0] | 0x0100;
6470 break;
6471
6472 case 2:
6473 w0[0] = w0[0] | 0x010000;
6474 break;
6475
6476 case 3:
6477 w0[0] = w0[0] | 0x01000000;
6478 break;
6479
6480 case 4:
6481 w0[1] = 0x01;
6482 break;
6483
6484 case 5:
6485 w0[1] = w0[1] | 0x0100;
6486 break;
6487
6488 case 6:
6489 w0[1] = w0[1] | 0x010000;
6490 break;
6491
6492 case 7:
6493 w0[1] = w0[1] | 0x01000000;
6494 break;
6495
6496 case 8:
6497 w0[2] = 0x01;
6498 break;
6499
6500 case 9:
6501 w0[2] = w0[2] | 0x0100;
6502 break;
6503
6504 case 10:
6505 w0[2] = w0[2] | 0x010000;
6506 break;
6507
6508 case 11:
6509 w0[2] = w0[2] | 0x01000000;
6510 break;
6511
6512 case 12:
6513 w0[3] = 0x01;
6514 break;
6515
6516 case 13:
6517 w0[3] = w0[3] | 0x0100;
6518 break;
6519
6520 case 14:
6521 w0[3] = w0[3] | 0x010000;
6522 break;
6523
6524 case 15:
6525 w0[3] = w0[3] | 0x01000000;
6526 break;
6527
6528 case 16:
6529 w1[0] = 0x01;
6530 break;
6531
6532 case 17:
6533 w1[0] = w1[0] | 0x0100;
6534 break;
6535
6536 case 18:
6537 w1[0] = w1[0] | 0x010000;
6538 break;
6539
6540 case 19:
6541 w1[0] = w1[0] | 0x01000000;
6542 break;
6543
6544 case 20:
6545 w1[1] = 0x01;
6546 break;
6547
6548 case 21:
6549 w1[1] = w1[1] | 0x0100;
6550 break;
6551
6552 case 22:
6553 w1[1] = w1[1] | 0x010000;
6554 break;
6555
6556 case 23:
6557 w1[1] = w1[1] | 0x01000000;
6558 break;
6559
6560 case 24:
6561 w1[2] = 0x01;
6562 break;
6563
6564 case 25:
6565 w1[2] = w1[2] | 0x0100;
6566 break;
6567
6568 case 26:
6569 w1[2] = w1[2] | 0x010000;
6570 break;
6571
6572 case 27:
6573 w1[2] = w1[2] | 0x01000000;
6574 break;
6575
6576 case 28:
6577 w1[3] = 0x01;
6578 break;
6579
6580 case 29:
6581 w1[3] = w1[3] | 0x0100;
6582 break;
6583
6584 case 30:
6585 w1[3] = w1[3] | 0x010000;
6586 break;
6587
6588 case 31:
6589 w1[3] = w1[3] | 0x01000000;
6590 break;
6591
6592 case 32:
6593 w2[0] = 0x01;
6594 break;
6595
6596 case 33:
6597 w2[0] = w2[0] | 0x0100;
6598 break;
6599
6600 case 34:
6601 w2[0] = w2[0] | 0x010000;
6602 break;
6603
6604 case 35:
6605 w2[0] = w2[0] | 0x01000000;
6606 break;
6607
6608 case 36:
6609 w2[1] = 0x01;
6610 break;
6611
6612 case 37:
6613 w2[1] = w2[1] | 0x0100;
6614 break;
6615
6616 case 38:
6617 w2[1] = w2[1] | 0x010000;
6618 break;
6619
6620 case 39:
6621 w2[1] = w2[1] | 0x01000000;
6622 break;
6623
6624 case 40:
6625 w2[2] = 0x01;
6626 break;
6627
6628 case 41:
6629 w2[2] = w2[2] | 0x0100;
6630 break;
6631
6632 case 42:
6633 w2[2] = w2[2] | 0x010000;
6634 break;
6635
6636 case 43:
6637 w2[2] = w2[2] | 0x01000000;
6638 break;
6639
6640 case 44:
6641 w2[3] = 0x01;
6642 break;
6643
6644 case 45:
6645 w2[3] = w2[3] | 0x0100;
6646 break;
6647
6648 case 46:
6649 w2[3] = w2[3] | 0x010000;
6650 break;
6651
6652 case 47:
6653 w2[3] = w2[3] | 0x01000000;
6654 break;
6655
6656 case 48:
6657 w3[0] = 0x01;
6658 break;
6659
6660 case 49:
6661 w3[0] = w3[0] | 0x0100;
6662 break;
6663
6664 case 50:
6665 w3[0] = w3[0] | 0x010000;
6666 break;
6667
6668 case 51:
6669 w3[0] = w3[0] | 0x01000000;
6670 break;
6671
6672 case 52:
6673 w3[1] = 0x01;
6674 break;
6675
6676 case 53:
6677 w3[1] = w3[1] | 0x0100;
6678 break;
6679
6680 case 54:
6681 w3[1] = w3[1] | 0x010000;
6682 break;
6683
6684 case 55:
6685 w3[1] = w3[1] | 0x01000000;
6686 break;
6687
6688 case 56:
6689 w3[2] = 0x01;
6690 break;
6691
6692 case 57:
6693 w3[2] = w3[2] | 0x0100;
6694 break;
6695
6696 case 58:
6697 w3[2] = w3[2] | 0x010000;
6698 break;
6699
6700 case 59:
6701 w3[2] = w3[2] | 0x01000000;
6702 break;
6703
6704 case 60:
6705 w3[3] = 0x01;
6706 break;
6707
6708 case 61:
6709 w3[3] = w3[3] | 0x0100;
6710 break;
6711
6712 case 62:
6713 w3[3] = w3[3] | 0x010000;
6714 break;
6715
6716 case 63:
6717 w3[3] = w3[3] | 0x01000000;
6718 break;
6719 }
6720 }
6721
6722 inline void append_0x02_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
6723 {
6724 switch (offset)
6725 {
6726 case 0:
6727 w0[0] = 0x02;
6728 break;
6729
6730 case 1:
6731 w0[0] = w0[0] | 0x0200;
6732 break;
6733
6734 case 2:
6735 w0[0] = w0[0] | 0x020000;
6736 break;
6737
6738 case 3:
6739 w0[0] = w0[0] | 0x02000000;
6740 break;
6741
6742 case 4:
6743 w0[1] = 0x02;
6744 break;
6745
6746 case 5:
6747 w0[1] = w0[1] | 0x0200;
6748 break;
6749
6750 case 6:
6751 w0[1] = w0[1] | 0x020000;
6752 break;
6753
6754 case 7:
6755 w0[1] = w0[1] | 0x02000000;
6756 break;
6757
6758 case 8:
6759 w0[2] = 0x02;
6760 break;
6761
6762 case 9:
6763 w0[2] = w0[2] | 0x0200;
6764 break;
6765
6766 case 10:
6767 w0[2] = w0[2] | 0x020000;
6768 break;
6769
6770 case 11:
6771 w0[2] = w0[2] | 0x02000000;
6772 break;
6773
6774 case 12:
6775 w0[3] = 0x02;
6776 break;
6777
6778 case 13:
6779 w0[3] = w0[3] | 0x0200;
6780 break;
6781
6782 case 14:
6783 w0[3] = w0[3] | 0x020000;
6784 break;
6785
6786 case 15:
6787 w0[3] = w0[3] | 0x02000000;
6788 break;
6789
6790 case 16:
6791 w1[0] = 0x02;
6792 break;
6793
6794 case 17:
6795 w1[0] = w1[0] | 0x0200;
6796 break;
6797
6798 case 18:
6799 w1[0] = w1[0] | 0x020000;
6800 break;
6801
6802 case 19:
6803 w1[0] = w1[0] | 0x02000000;
6804 break;
6805
6806 case 20:
6807 w1[1] = 0x02;
6808 break;
6809
6810 case 21:
6811 w1[1] = w1[1] | 0x0200;
6812 break;
6813
6814 case 22:
6815 w1[1] = w1[1] | 0x020000;
6816 break;
6817
6818 case 23:
6819 w1[1] = w1[1] | 0x02000000;
6820 break;
6821
6822 case 24:
6823 w1[2] = 0x02;
6824 break;
6825
6826 case 25:
6827 w1[2] = w1[2] | 0x0200;
6828 break;
6829
6830 case 26:
6831 w1[2] = w1[2] | 0x020000;
6832 break;
6833
6834 case 27:
6835 w1[2] = w1[2] | 0x02000000;
6836 break;
6837
6838 case 28:
6839 w1[3] = 0x02;
6840 break;
6841
6842 case 29:
6843 w1[3] = w1[3] | 0x0200;
6844 break;
6845
6846 case 30:
6847 w1[3] = w1[3] | 0x020000;
6848 break;
6849
6850 case 31:
6851 w1[3] = w1[3] | 0x02000000;
6852 break;
6853 }
6854 }
6855
6856 inline void append_0x02_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
6857 {
6858 switch (offset)
6859 {
6860 case 0:
6861 w0[0] = 0x02;
6862 break;
6863
6864 case 1:
6865 w0[0] = w0[0] | 0x0200;
6866 break;
6867
6868 case 2:
6869 w0[0] = w0[0] | 0x020000;
6870 break;
6871
6872 case 3:
6873 w0[0] = w0[0] | 0x02000000;
6874 break;
6875
6876 case 4:
6877 w0[1] = 0x02;
6878 break;
6879
6880 case 5:
6881 w0[1] = w0[1] | 0x0200;
6882 break;
6883
6884 case 6:
6885 w0[1] = w0[1] | 0x020000;
6886 break;
6887
6888 case 7:
6889 w0[1] = w0[1] | 0x02000000;
6890 break;
6891
6892 case 8:
6893 w0[2] = 0x02;
6894 break;
6895
6896 case 9:
6897 w0[2] = w0[2] | 0x0200;
6898 break;
6899
6900 case 10:
6901 w0[2] = w0[2] | 0x020000;
6902 break;
6903
6904 case 11:
6905 w0[2] = w0[2] | 0x02000000;
6906 break;
6907
6908 case 12:
6909 w0[3] = 0x02;
6910 break;
6911
6912 case 13:
6913 w0[3] = w0[3] | 0x0200;
6914 break;
6915
6916 case 14:
6917 w0[3] = w0[3] | 0x020000;
6918 break;
6919
6920 case 15:
6921 w0[3] = w0[3] | 0x02000000;
6922 break;
6923
6924 case 16:
6925 w1[0] = 0x02;
6926 break;
6927
6928 case 17:
6929 w1[0] = w1[0] | 0x0200;
6930 break;
6931
6932 case 18:
6933 w1[0] = w1[0] | 0x020000;
6934 break;
6935
6936 case 19:
6937 w1[0] = w1[0] | 0x02000000;
6938 break;
6939
6940 case 20:
6941 w1[1] = 0x02;
6942 break;
6943
6944 case 21:
6945 w1[1] = w1[1] | 0x0200;
6946 break;
6947
6948 case 22:
6949 w1[1] = w1[1] | 0x020000;
6950 break;
6951
6952 case 23:
6953 w1[1] = w1[1] | 0x02000000;
6954 break;
6955
6956 case 24:
6957 w1[2] = 0x02;
6958 break;
6959
6960 case 25:
6961 w1[2] = w1[2] | 0x0200;
6962 break;
6963
6964 case 26:
6965 w1[2] = w1[2] | 0x020000;
6966 break;
6967
6968 case 27:
6969 w1[2] = w1[2] | 0x02000000;
6970 break;
6971
6972 case 28:
6973 w1[3] = 0x02;
6974 break;
6975
6976 case 29:
6977 w1[3] = w1[3] | 0x0200;
6978 break;
6979
6980 case 30:
6981 w1[3] = w1[3] | 0x020000;
6982 break;
6983
6984 case 31:
6985 w1[3] = w1[3] | 0x02000000;
6986 break;
6987
6988 case 32:
6989 w2[0] = 0x02;
6990 break;
6991
6992 case 33:
6993 w2[0] = w2[0] | 0x0200;
6994 break;
6995
6996 case 34:
6997 w2[0] = w2[0] | 0x020000;
6998 break;
6999
7000 case 35:
7001 w2[0] = w2[0] | 0x02000000;
7002 break;
7003
7004 case 36:
7005 w2[1] = 0x02;
7006 break;
7007
7008 case 37:
7009 w2[1] = w2[1] | 0x0200;
7010 break;
7011
7012 case 38:
7013 w2[1] = w2[1] | 0x020000;
7014 break;
7015
7016 case 39:
7017 w2[1] = w2[1] | 0x02000000;
7018 break;
7019
7020 case 40:
7021 w2[2] = 0x02;
7022 break;
7023
7024 case 41:
7025 w2[2] = w2[2] | 0x0200;
7026 break;
7027
7028 case 42:
7029 w2[2] = w2[2] | 0x020000;
7030 break;
7031
7032 case 43:
7033 w2[2] = w2[2] | 0x02000000;
7034 break;
7035
7036 case 44:
7037 w2[3] = 0x02;
7038 break;
7039
7040 case 45:
7041 w2[3] = w2[3] | 0x0200;
7042 break;
7043
7044 case 46:
7045 w2[3] = w2[3] | 0x020000;
7046 break;
7047
7048 case 47:
7049 w2[3] = w2[3] | 0x02000000;
7050 break;
7051 }
7052 }
7053
7054 inline void append_0x80_1x4_S (u32 w0[4], const u32 offset)
7055 {
7056 switch (offset)
7057 {
7058 case 0:
7059 w0[0] = 0x80;
7060 break;
7061
7062 case 1:
7063 w0[0] = w0[0] | 0x8000;
7064 break;
7065
7066 case 2:
7067 w0[0] = w0[0] | 0x800000;
7068 break;
7069
7070 case 3:
7071 w0[0] = w0[0] | 0x80000000;
7072 break;
7073
7074 case 4:
7075 w0[1] = 0x80;
7076 break;
7077
7078 case 5:
7079 w0[1] = w0[1] | 0x8000;
7080 break;
7081
7082 case 6:
7083 w0[1] = w0[1] | 0x800000;
7084 break;
7085
7086 case 7:
7087 w0[1] = w0[1] | 0x80000000;
7088 break;
7089
7090 case 8:
7091 w0[2] = 0x80;
7092 break;
7093
7094 case 9:
7095 w0[2] = w0[2] | 0x8000;
7096 break;
7097
7098 case 10:
7099 w0[2] = w0[2] | 0x800000;
7100 break;
7101
7102 case 11:
7103 w0[2] = w0[2] | 0x80000000;
7104 break;
7105
7106 case 12:
7107 w0[3] = 0x80;
7108 break;
7109
7110 case 13:
7111 w0[3] = w0[3] | 0x8000;
7112 break;
7113
7114 case 14:
7115 w0[3] = w0[3] | 0x800000;
7116 break;
7117
7118 case 15:
7119 w0[3] = w0[3] | 0x80000000;
7120 break;
7121 }
7122 }
7123
7124 inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
7125 {
7126 switch (offset)
7127 {
7128 case 0:
7129 w0[0] = 0x80;
7130 break;
7131
7132 case 1:
7133 w0[0] = w0[0] | 0x8000;
7134 break;
7135
7136 case 2:
7137 w0[0] = w0[0] | 0x800000;
7138 break;
7139
7140 case 3:
7141 w0[0] = w0[0] | 0x80000000;
7142 break;
7143
7144 case 4:
7145 w0[1] = 0x80;
7146 break;
7147
7148 case 5:
7149 w0[1] = w0[1] | 0x8000;
7150 break;
7151
7152 case 6:
7153 w0[1] = w0[1] | 0x800000;
7154 break;
7155
7156 case 7:
7157 w0[1] = w0[1] | 0x80000000;
7158 break;
7159
7160 case 8:
7161 w0[2] = 0x80;
7162 break;
7163
7164 case 9:
7165 w0[2] = w0[2] | 0x8000;
7166 break;
7167
7168 case 10:
7169 w0[2] = w0[2] | 0x800000;
7170 break;
7171
7172 case 11:
7173 w0[2] = w0[2] | 0x80000000;
7174 break;
7175
7176 case 12:
7177 w0[3] = 0x80;
7178 break;
7179
7180 case 13:
7181 w0[3] = w0[3] | 0x8000;
7182 break;
7183
7184 case 14:
7185 w0[3] = w0[3] | 0x800000;
7186 break;
7187
7188 case 15:
7189 w0[3] = w0[3] | 0x80000000;
7190 break;
7191
7192 case 16:
7193 w1[0] = 0x80;
7194 break;
7195
7196 case 17:
7197 w1[0] = w1[0] | 0x8000;
7198 break;
7199
7200 case 18:
7201 w1[0] = w1[0] | 0x800000;
7202 break;
7203
7204 case 19:
7205 w1[0] = w1[0] | 0x80000000;
7206 break;
7207
7208 case 20:
7209 w1[1] = 0x80;
7210 break;
7211
7212 case 21:
7213 w1[1] = w1[1] | 0x8000;
7214 break;
7215
7216 case 22:
7217 w1[1] = w1[1] | 0x800000;
7218 break;
7219
7220 case 23:
7221 w1[1] = w1[1] | 0x80000000;
7222 break;
7223
7224 case 24:
7225 w1[2] = 0x80;
7226 break;
7227
7228 case 25:
7229 w1[2] = w1[2] | 0x8000;
7230 break;
7231
7232 case 26:
7233 w1[2] = w1[2] | 0x800000;
7234 break;
7235
7236 case 27:
7237 w1[2] = w1[2] | 0x80000000;
7238 break;
7239
7240 case 28:
7241 w1[3] = 0x80;
7242 break;
7243
7244 case 29:
7245 w1[3] = w1[3] | 0x8000;
7246 break;
7247
7248 case 30:
7249 w1[3] = w1[3] | 0x800000;
7250 break;
7251
7252 case 31:
7253 w1[3] = w1[3] | 0x80000000;
7254 break;
7255 }
7256 }
7257
7258 inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
7259 {
7260 switch (offset)
7261 {
7262 case 0:
7263 w0[0] = 0x80;
7264 break;
7265
7266 case 1:
7267 w0[0] = w0[0] | 0x8000;
7268 break;
7269
7270 case 2:
7271 w0[0] = w0[0] | 0x800000;
7272 break;
7273
7274 case 3:
7275 w0[0] = w0[0] | 0x80000000;
7276 break;
7277
7278 case 4:
7279 w0[1] = 0x80;
7280 break;
7281
7282 case 5:
7283 w0[1] = w0[1] | 0x8000;
7284 break;
7285
7286 case 6:
7287 w0[1] = w0[1] | 0x800000;
7288 break;
7289
7290 case 7:
7291 w0[1] = w0[1] | 0x80000000;
7292 break;
7293
7294 case 8:
7295 w0[2] = 0x80;
7296 break;
7297
7298 case 9:
7299 w0[2] = w0[2] | 0x8000;
7300 break;
7301
7302 case 10:
7303 w0[2] = w0[2] | 0x800000;
7304 break;
7305
7306 case 11:
7307 w0[2] = w0[2] | 0x80000000;
7308 break;
7309
7310 case 12:
7311 w0[3] = 0x80;
7312 break;
7313
7314 case 13:
7315 w0[3] = w0[3] | 0x8000;
7316 break;
7317
7318 case 14:
7319 w0[3] = w0[3] | 0x800000;
7320 break;
7321
7322 case 15:
7323 w0[3] = w0[3] | 0x80000000;
7324 break;
7325
7326 case 16:
7327 w1[0] = 0x80;
7328 break;
7329
7330 case 17:
7331 w1[0] = w1[0] | 0x8000;
7332 break;
7333
7334 case 18:
7335 w1[0] = w1[0] | 0x800000;
7336 break;
7337
7338 case 19:
7339 w1[0] = w1[0] | 0x80000000;
7340 break;
7341
7342 case 20:
7343 w1[1] = 0x80;
7344 break;
7345
7346 case 21:
7347 w1[1] = w1[1] | 0x8000;
7348 break;
7349
7350 case 22:
7351 w1[1] = w1[1] | 0x800000;
7352 break;
7353
7354 case 23:
7355 w1[1] = w1[1] | 0x80000000;
7356 break;
7357
7358 case 24:
7359 w1[2] = 0x80;
7360 break;
7361
7362 case 25:
7363 w1[2] = w1[2] | 0x8000;
7364 break;
7365
7366 case 26:
7367 w1[2] = w1[2] | 0x800000;
7368 break;
7369
7370 case 27:
7371 w1[2] = w1[2] | 0x80000000;
7372 break;
7373
7374 case 28:
7375 w1[3] = 0x80;
7376 break;
7377
7378 case 29:
7379 w1[3] = w1[3] | 0x8000;
7380 break;
7381
7382 case 30:
7383 w1[3] = w1[3] | 0x800000;
7384 break;
7385
7386 case 31:
7387 w1[3] = w1[3] | 0x80000000;
7388 break;
7389
7390 case 32:
7391 w2[0] = 0x80;
7392 break;
7393
7394 case 33:
7395 w2[0] = w2[0] | 0x8000;
7396 break;
7397
7398 case 34:
7399 w2[0] = w2[0] | 0x800000;
7400 break;
7401
7402 case 35:
7403 w2[0] = w2[0] | 0x80000000;
7404 break;
7405
7406 case 36:
7407 w2[1] = 0x80;
7408 break;
7409
7410 case 37:
7411 w2[1] = w2[1] | 0x8000;
7412 break;
7413
7414 case 38:
7415 w2[1] = w2[1] | 0x800000;
7416 break;
7417
7418 case 39:
7419 w2[1] = w2[1] | 0x80000000;
7420 break;
7421
7422 case 40:
7423 w2[2] = 0x80;
7424 break;
7425
7426 case 41:
7427 w2[2] = w2[2] | 0x8000;
7428 break;
7429
7430 case 42:
7431 w2[2] = w2[2] | 0x800000;
7432 break;
7433
7434 case 43:
7435 w2[2] = w2[2] | 0x80000000;
7436 break;
7437
7438 case 44:
7439 w2[3] = 0x80;
7440 break;
7441
7442 case 45:
7443 w2[3] = w2[3] | 0x8000;
7444 break;
7445
7446 case 46:
7447 w2[3] = w2[3] | 0x800000;
7448 break;
7449
7450 case 47:
7451 w2[3] = w2[3] | 0x80000000;
7452 break;
7453 }
7454 }
7455
7456 inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
7457 {
7458 switch (offset)
7459 {
7460 case 0:
7461 w0[0] = 0x80;
7462 break;
7463
7464 case 1:
7465 w0[0] = w0[0] | 0x8000;
7466 break;
7467
7468 case 2:
7469 w0[0] = w0[0] | 0x800000;
7470 break;
7471
7472 case 3:
7473 w0[0] = w0[0] | 0x80000000;
7474 break;
7475
7476 case 4:
7477 w0[1] = 0x80;
7478 break;
7479
7480 case 5:
7481 w0[1] = w0[1] | 0x8000;
7482 break;
7483
7484 case 6:
7485 w0[1] = w0[1] | 0x800000;
7486 break;
7487
7488 case 7:
7489 w0[1] = w0[1] | 0x80000000;
7490 break;
7491
7492 case 8:
7493 w0[2] = 0x80;
7494 break;
7495
7496 case 9:
7497 w0[2] = w0[2] | 0x8000;
7498 break;
7499
7500 case 10:
7501 w0[2] = w0[2] | 0x800000;
7502 break;
7503
7504 case 11:
7505 w0[2] = w0[2] | 0x80000000;
7506 break;
7507
7508 case 12:
7509 w0[3] = 0x80;
7510 break;
7511
7512 case 13:
7513 w0[3] = w0[3] | 0x8000;
7514 break;
7515
7516 case 14:
7517 w0[3] = w0[3] | 0x800000;
7518 break;
7519
7520 case 15:
7521 w0[3] = w0[3] | 0x80000000;
7522 break;
7523
7524 case 16:
7525 w1[0] = 0x80;
7526 break;
7527
7528 case 17:
7529 w1[0] = w1[0] | 0x8000;
7530 break;
7531
7532 case 18:
7533 w1[0] = w1[0] | 0x800000;
7534 break;
7535
7536 case 19:
7537 w1[0] = w1[0] | 0x80000000;
7538 break;
7539
7540 case 20:
7541 w1[1] = 0x80;
7542 break;
7543
7544 case 21:
7545 w1[1] = w1[1] | 0x8000;
7546 break;
7547
7548 case 22:
7549 w1[1] = w1[1] | 0x800000;
7550 break;
7551
7552 case 23:
7553 w1[1] = w1[1] | 0x80000000;
7554 break;
7555
7556 case 24:
7557 w1[2] = 0x80;
7558 break;
7559
7560 case 25:
7561 w1[2] = w1[2] | 0x8000;
7562 break;
7563
7564 case 26:
7565 w1[2] = w1[2] | 0x800000;
7566 break;
7567
7568 case 27:
7569 w1[2] = w1[2] | 0x80000000;
7570 break;
7571
7572 case 28:
7573 w1[3] = 0x80;
7574 break;
7575
7576 case 29:
7577 w1[3] = w1[3] | 0x8000;
7578 break;
7579
7580 case 30:
7581 w1[3] = w1[3] | 0x800000;
7582 break;
7583
7584 case 31:
7585 w1[3] = w1[3] | 0x80000000;
7586 break;
7587
7588 case 32:
7589 w2[0] = 0x80;
7590 break;
7591
7592 case 33:
7593 w2[0] = w2[0] | 0x8000;
7594 break;
7595
7596 case 34:
7597 w2[0] = w2[0] | 0x800000;
7598 break;
7599
7600 case 35:
7601 w2[0] = w2[0] | 0x80000000;
7602 break;
7603
7604 case 36:
7605 w2[1] = 0x80;
7606 break;
7607
7608 case 37:
7609 w2[1] = w2[1] | 0x8000;
7610 break;
7611
7612 case 38:
7613 w2[1] = w2[1] | 0x800000;
7614 break;
7615
7616 case 39:
7617 w2[1] = w2[1] | 0x80000000;
7618 break;
7619
7620 case 40:
7621 w2[2] = 0x80;
7622 break;
7623
7624 case 41:
7625 w2[2] = w2[2] | 0x8000;
7626 break;
7627
7628 case 42:
7629 w2[2] = w2[2] | 0x800000;
7630 break;
7631
7632 case 43:
7633 w2[2] = w2[2] | 0x80000000;
7634 break;
7635
7636 case 44:
7637 w2[3] = 0x80;
7638 break;
7639
7640 case 45:
7641 w2[3] = w2[3] | 0x8000;
7642 break;
7643
7644 case 46:
7645 w2[3] = w2[3] | 0x800000;
7646 break;
7647
7648 case 47:
7649 w2[3] = w2[3] | 0x80000000;
7650 break;
7651
7652 case 48:
7653 w3[0] = 0x80;
7654 break;
7655
7656 case 49:
7657 w3[0] = w3[0] | 0x8000;
7658 break;
7659
7660 case 50:
7661 w3[0] = w3[0] | 0x800000;
7662 break;
7663
7664 case 51:
7665 w3[0] = w3[0] | 0x80000000;
7666 break;
7667
7668 case 52:
7669 w3[1] = 0x80;
7670 break;
7671
7672 case 53:
7673 w3[1] = w3[1] | 0x8000;
7674 break;
7675
7676 case 54:
7677 w3[1] = w3[1] | 0x800000;
7678 break;
7679
7680 case 55:
7681 w3[1] = w3[1] | 0x80000000;
7682 break;
7683
7684 case 56:
7685 w3[2] = 0x80;
7686 break;
7687
7688 case 57:
7689 w3[2] = w3[2] | 0x8000;
7690 break;
7691
7692 case 58:
7693 w3[2] = w3[2] | 0x800000;
7694 break;
7695
7696 case 59:
7697 w3[2] = w3[2] | 0x80000000;
7698 break;
7699
7700 case 60:
7701 w3[3] = 0x80;
7702 break;
7703
7704 case 61:
7705 w3[3] = w3[3] | 0x8000;
7706 break;
7707
7708 case 62:
7709 w3[3] = w3[3] | 0x800000;
7710 break;
7711
7712 case 63:
7713 w3[3] = w3[3] | 0x80000000;
7714 break;
7715 }
7716 }
7717
7718 inline void truncate_block_S (u32 w[4], const u32 len)
7719 {
7720 switch (len)
7721 {
7722 case 0: w[0] &= 0;
7723 w[1] &= 0;
7724 w[2] &= 0;
7725 w[3] &= 0;
7726 break;
7727 case 1: w[0] &= 0x000000FF;
7728 w[1] &= 0;
7729 w[2] &= 0;
7730 w[3] &= 0;
7731 break;
7732 case 2: w[0] &= 0x0000FFFF;
7733 w[1] &= 0;
7734 w[2] &= 0;
7735 w[3] &= 0;
7736 break;
7737 case 3: w[0] &= 0x00FFFFFF;
7738 w[1] &= 0;
7739 w[2] &= 0;
7740 w[3] &= 0;
7741 break;
7742 case 4: w[1] &= 0;
7743 w[2] &= 0;
7744 w[3] &= 0;
7745 break;
7746 case 5: w[1] &= 0x000000FF;
7747 w[2] &= 0;
7748 w[3] &= 0;
7749 break;
7750 case 6: w[1] &= 0x0000FFFF;
7751 w[2] &= 0;
7752 w[3] &= 0;
7753 break;
7754 case 7: w[1] &= 0x00FFFFFF;
7755 w[2] &= 0;
7756 w[3] &= 0;
7757 break;
7758 case 8: w[2] &= 0;
7759 w[3] &= 0;
7760 break;
7761 case 9: w[2] &= 0x000000FF;
7762 w[3] &= 0;
7763 break;
7764 case 10: w[2] &= 0x0000FFFF;
7765 w[3] &= 0;
7766 break;
7767 case 11: w[2] &= 0x00FFFFFF;
7768 w[3] &= 0;
7769 break;
7770 case 12: w[3] &= 0;
7771 break;
7772 case 13: w[3] &= 0x000000FF;
7773 break;
7774 case 14: w[3] &= 0x0000FFFF;
7775 break;
7776 case 15: w[3] &= 0x00FFFFFF;
7777 break;
7778 }
7779 }
7780
7781 inline void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4])
7782 {
7783 #ifdef IS_NV
7784 out2[3] = __byte_perm_S (in[3], 0, 0x7372);
7785 out2[2] = __byte_perm_S (in[3], 0, 0x7170);
7786 out2[1] = __byte_perm_S (in[2], 0, 0x7372);
7787 out2[0] = __byte_perm_S (in[2], 0, 0x7170);
7788 out1[3] = __byte_perm_S (in[1], 0, 0x7372);
7789 out1[2] = __byte_perm_S (in[1], 0, 0x7170);
7790 out1[1] = __byte_perm_S (in[0], 0, 0x7372);
7791 out1[0] = __byte_perm_S (in[0], 0, 0x7170);
7792 #endif
7793
7794 #if defined IS_AMD || defined IS_GENERIC
7795 out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
7796 out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF);
7797 out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
7798 out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF);
7799 out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
7800 out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF);
7801 out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
7802 out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF);
7803 #endif
7804 }
7805
7806 inline void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4])
7807 {
7808 #ifdef IS_NV
7809 out[0] = __byte_perm_S (in1[0], in1[1], 0x6420);
7810 out[1] = __byte_perm_S (in1[2], in1[3], 0x6420);
7811 out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
7812 out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
7813 #endif
7814
7815 #if defined IS_AMD || defined IS_GENERIC
7816 out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8)
7817 | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8);
7818 out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8)
7819 | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8);
7820 out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8)
7821 | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8);
7822 out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8)
7823 | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8);
7824 #endif
7825 }
7826
7827 inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
7828 {
7829 #if defined IS_AMD || defined IS_GENERIC
7830 const int offset_mod_4 = offset & 3;
7831
7832 const int offset_minus_4 = 4 - offset;
7833
7834 switch (offset / 4)
7835 {
7836 case 0:
7837 w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4);
7838 w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4);
7839 w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7840 w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7841 w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7842 w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7843 w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7844 w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7845 w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7846 w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7847 w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7848 w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7849 w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7850 w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7851 w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7852
7853 if (offset_mod_4 == 0)
7854 {
7855 w0[0] = w0[1];
7856 w0[1] = w0[2];
7857 w0[2] = w0[3];
7858 w0[3] = w1[0];
7859 w1[0] = w1[1];
7860 w1[1] = w1[2];
7861 w1[2] = w1[3];
7862 w1[3] = w2[0];
7863 w2[0] = w2[1];
7864 w2[1] = w2[2];
7865 w2[2] = w2[3];
7866 w2[3] = w3[0];
7867 w3[0] = w3[1];
7868 w3[1] = w3[2];
7869 w3[2] = 0;
7870 }
7871
7872 break;
7873
7874 case 1:
7875 w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4);
7876 w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
7877 w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7878 w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7879 w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7880 w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7881 w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7882 w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7883 w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7884 w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7885 w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7886 w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7887 w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7888 w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7889 w0[0] = 0;
7890
7891 if (offset_mod_4 == 0)
7892 {
7893 w0[1] = w0[2];
7894 w0[2] = w0[3];
7895 w0[3] = w1[0];
7896 w1[0] = w1[1];
7897 w1[1] = w1[2];
7898 w1[2] = w1[3];
7899 w1[3] = w2[0];
7900 w2[0] = w2[1];
7901 w2[1] = w2[2];
7902 w2[2] = w2[3];
7903 w2[3] = w3[0];
7904 w3[0] = w3[1];
7905 w3[1] = w3[2];
7906 w3[2] = 0;
7907 }
7908
7909 break;
7910
7911 case 2:
7912 w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4);
7913 w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4);
7914 w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7915 w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7916 w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7917 w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7918 w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7919 w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7920 w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7921 w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7922 w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7923 w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7924 w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7925 w0[1] = 0;
7926 w0[0] = 0;
7927
7928 if (offset_mod_4 == 0)
7929 {
7930 w0[2] = w0[3];
7931 w0[3] = w1[0];
7932 w1[0] = w1[1];
7933 w1[1] = w1[2];
7934 w1[2] = w1[3];
7935 w1[3] = w2[0];
7936 w2[0] = w2[1];
7937 w2[1] = w2[2];
7938 w2[2] = w2[3];
7939 w2[3] = w3[0];
7940 w3[0] = w3[1];
7941 w3[1] = w3[2];
7942 w3[2] = 0;
7943 }
7944
7945 break;
7946
7947 case 3:
7948 w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4);
7949 w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4);
7950 w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7951 w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7952 w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7953 w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7954 w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7955 w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7956 w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7957 w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7958 w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7959 w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7960 w0[2] = 0;
7961 w0[1] = 0;
7962 w0[0] = 0;
7963
7964 if (offset_mod_4 == 0)
7965 {
7966 w0[3] = w1[0];
7967 w1[0] = w1[1];
7968 w1[1] = w1[2];
7969 w1[2] = w1[3];
7970 w1[3] = w2[0];
7971 w2[0] = w2[1];
7972 w2[1] = w2[2];
7973 w2[2] = w2[3];
7974 w2[3] = w3[0];
7975 w3[0] = w3[1];
7976 w3[1] = w3[2];
7977 w3[2] = 0;
7978 }
7979
7980 break;
7981
7982 case 4:
7983 w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4);
7984 w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4);
7985 w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
7986 w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
7987 w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
7988 w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
7989 w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
7990 w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
7991 w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
7992 w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
7993 w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
7994 w0[3] = 0;
7995 w0[2] = 0;
7996 w0[1] = 0;
7997 w0[0] = 0;
7998
7999 if (offset_mod_4 == 0)
8000 {
8001 w1[0] = w1[1];
8002 w1[1] = w1[2];
8003 w1[2] = w1[3];
8004 w1[3] = w2[0];
8005 w2[0] = w2[1];
8006 w2[1] = w2[2];
8007 w2[2] = w2[3];
8008 w2[3] = w3[0];
8009 w3[0] = w3[1];
8010 w3[1] = w3[2];
8011 w3[2] = 0;
8012 }
8013
8014 break;
8015
8016 case 5:
8017 w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4);
8018 w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4);
8019 w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
8020 w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8021 w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8022 w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8023 w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8024 w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8025 w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8026 w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8027 w1[0] = 0;
8028 w0[3] = 0;
8029 w0[2] = 0;
8030 w0[1] = 0;
8031 w0[0] = 0;
8032
8033 if (offset_mod_4 == 0)
8034 {
8035 w1[1] = w1[2];
8036 w1[2] = w1[3];
8037 w1[3] = w2[0];
8038 w2[0] = w2[1];
8039 w2[1] = w2[2];
8040 w2[2] = w2[3];
8041 w2[3] = w3[0];
8042 w3[0] = w3[1];
8043 w3[1] = w3[2];
8044 w3[2] = 0;
8045 }
8046
8047 break;
8048
8049 case 6:
8050 w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4);
8051 w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4);
8052 w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8053 w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8054 w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8055 w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8056 w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8057 w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8058 w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8059 w1[1] = 0;
8060 w1[0] = 0;
8061 w0[3] = 0;
8062 w0[2] = 0;
8063 w0[1] = 0;
8064 w0[0] = 0;
8065
8066 if (offset_mod_4 == 0)
8067 {
8068 w1[2] = w1[3];
8069 w1[3] = w2[0];
8070 w2[0] = w2[1];
8071 w2[1] = w2[2];
8072 w2[2] = w2[3];
8073 w2[3] = w3[0];
8074 w3[0] = w3[1];
8075 w3[1] = w3[2];
8076 w3[2] = 0;
8077 }
8078
8079 break;
8080
8081 case 7:
8082 w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4);
8083 w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4);
8084 w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8085 w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8086 w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8087 w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8088 w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8089 w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8090 w1[2] = 0;
8091 w1[1] = 0;
8092 w1[0] = 0;
8093 w0[3] = 0;
8094 w0[2] = 0;
8095 w0[1] = 0;
8096 w0[0] = 0;
8097
8098 if (offset_mod_4 == 0)
8099 {
8100 w1[3] = w2[0];
8101 w2[0] = w2[1];
8102 w2[1] = w2[2];
8103 w2[2] = w2[3];
8104 w2[3] = w3[0];
8105 w3[0] = w3[1];
8106 w3[1] = w3[2];
8107 w3[2] = 0;
8108 }
8109
8110 break;
8111
8112 case 8:
8113 w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4);
8114 w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4);
8115 w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8116 w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8117 w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8118 w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8119 w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8120 w1[3] = 0;
8121 w1[2] = 0;
8122 w1[1] = 0;
8123 w1[0] = 0;
8124 w0[3] = 0;
8125 w0[2] = 0;
8126 w0[1] = 0;
8127 w0[0] = 0;
8128
8129 if (offset_mod_4 == 0)
8130 {
8131 w2[0] = w2[1];
8132 w2[1] = w2[2];
8133 w2[2] = w2[3];
8134 w2[3] = w3[0];
8135 w3[0] = w3[1];
8136 w3[1] = w3[2];
8137 w3[2] = 0;
8138 }
8139
8140 break;
8141
8142 case 9:
8143 w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4);
8144 w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4);
8145 w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8146 w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8147 w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8148 w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8149 w2[0] = 0;
8150 w1[3] = 0;
8151 w1[2] = 0;
8152 w1[1] = 0;
8153 w1[0] = 0;
8154 w0[3] = 0;
8155 w0[2] = 0;
8156 w0[1] = 0;
8157 w0[0] = 0;
8158
8159 if (offset_mod_4 == 0)
8160 {
8161 w2[1] = w2[2];
8162 w2[2] = w2[3];
8163 w2[3] = w3[0];
8164 w3[0] = w3[1];
8165 w3[1] = w3[2];
8166 w3[2] = 0;
8167 }
8168
8169 break;
8170
8171 case 10:
8172 w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4);
8173 w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4);
8174 w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8175 w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8176 w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8177 w2[1] = 0;
8178 w2[0] = 0;
8179 w1[3] = 0;
8180 w1[2] = 0;
8181 w1[1] = 0;
8182 w1[0] = 0;
8183 w0[3] = 0;
8184 w0[2] = 0;
8185 w0[1] = 0;
8186 w0[0] = 0;
8187
8188 if (offset_mod_4 == 0)
8189 {
8190 w2[2] = w2[3];
8191 w2[3] = w3[0];
8192 w3[0] = w3[1];
8193 w3[1] = w3[2];
8194 w3[2] = 0;
8195 }
8196
8197 break;
8198
8199 case 11:
8200 w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4);
8201 w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4);
8202 w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8203 w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8204 w2[2] = 0;
8205 w2[1] = 0;
8206 w2[0] = 0;
8207 w1[3] = 0;
8208 w1[2] = 0;
8209 w1[1] = 0;
8210 w1[0] = 0;
8211 w0[3] = 0;
8212 w0[2] = 0;
8213 w0[1] = 0;
8214 w0[0] = 0;
8215
8216 if (offset_mod_4 == 0)
8217 {
8218 w2[3] = w3[0];
8219 w3[0] = w3[1];
8220 w3[1] = w3[2];
8221 w3[2] = 0;
8222 }
8223
8224 break;
8225
8226 case 12:
8227 w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4);
8228 w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4);
8229 w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8230 w2[3] = 0;
8231 w2[2] = 0;
8232 w2[1] = 0;
8233 w2[0] = 0;
8234 w1[3] = 0;
8235 w1[2] = 0;
8236 w1[1] = 0;
8237 w1[0] = 0;
8238 w0[3] = 0;
8239 w0[2] = 0;
8240 w0[1] = 0;
8241 w0[0] = 0;
8242
8243 if (offset_mod_4 == 0)
8244 {
8245 w3[0] = w3[1];
8246 w3[1] = w3[2];
8247 w3[2] = 0;
8248 }
8249
8250 break;
8251
8252 case 13:
8253 w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4);
8254 w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4);
8255 w3[0] = 0;
8256 w2[3] = 0;
8257 w2[2] = 0;
8258 w2[1] = 0;
8259 w2[0] = 0;
8260 w1[3] = 0;
8261 w1[2] = 0;
8262 w1[1] = 0;
8263 w1[0] = 0;
8264 w0[3] = 0;
8265 w0[2] = 0;
8266 w0[1] = 0;
8267 w0[0] = 0;
8268
8269 if (offset_mod_4 == 0)
8270 {
8271 w3[1] = w3[2];
8272 w3[2] = 0;
8273 }
8274
8275 break;
8276 }
8277 #endif
8278
8279 #ifdef IS_NV
8280 const int offset_minus_4 = 4 - (offset % 4);
8281
8282 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
8283
8284 switch (offset / 4)
8285 {
8286 case 0:
8287 w3[1] = __byte_perm_S (w3[0], w3[1], selector);
8288 w3[0] = __byte_perm_S (w2[3], w3[0], selector);
8289 w2[3] = __byte_perm_S (w2[2], w2[3], selector);
8290 w2[2] = __byte_perm_S (w2[1], w2[2], selector);
8291 w2[1] = __byte_perm_S (w2[0], w2[1], selector);
8292 w2[0] = __byte_perm_S (w1[3], w2[0], selector);
8293 w1[3] = __byte_perm_S (w1[2], w1[3], selector);
8294 w1[2] = __byte_perm_S (w1[1], w1[2], selector);
8295 w1[1] = __byte_perm_S (w1[0], w1[1], selector);
8296 w1[0] = __byte_perm_S (w0[3], w1[0], selector);
8297 w0[3] = __byte_perm_S (w0[2], w0[3], selector);
8298 w0[2] = __byte_perm_S (w0[1], w0[2], selector);
8299 w0[1] = __byte_perm_S (w0[0], w0[1], selector);
8300 w0[0] = __byte_perm_S ( 0, w0[0], selector);
8301
8302 break;
8303
8304 case 1:
8305 w3[1] = __byte_perm_S (w2[3], w3[0], selector);
8306 w3[0] = __byte_perm_S (w2[2], w2[3], selector);
8307 w2[3] = __byte_perm_S (w2[1], w2[2], selector);
8308 w2[2] = __byte_perm_S (w2[0], w2[1], selector);
8309 w2[1] = __byte_perm_S (w1[3], w2[0], selector);
8310 w2[0] = __byte_perm_S (w1[2], w1[3], selector);
8311 w1[3] = __byte_perm_S (w1[1], w1[2], selector);
8312 w1[2] = __byte_perm_S (w1[0], w1[1], selector);
8313 w1[1] = __byte_perm_S (w0[3], w1[0], selector);
8314 w1[0] = __byte_perm_S (w0[2], w0[3], selector);
8315 w0[3] = __byte_perm_S (w0[1], w0[2], selector);
8316 w0[2] = __byte_perm_S (w0[0], w0[1], selector);
8317 w0[1] = __byte_perm_S ( 0, w0[0], selector);
8318 w0[0] = 0;
8319
8320 break;
8321
8322 case 2:
8323 w3[1] = __byte_perm_S (w2[2], w2[3], selector);
8324 w3[0] = __byte_perm_S (w2[1], w2[2], selector);
8325 w2[3] = __byte_perm_S (w2[0], w2[1], selector);
8326 w2[2] = __byte_perm_S (w1[3], w2[0], selector);
8327 w2[1] = __byte_perm_S (w1[2], w1[3], selector);
8328 w2[0] = __byte_perm_S (w1[1], w1[2], selector);
8329 w1[3] = __byte_perm_S (w1[0], w1[1], selector);
8330 w1[2] = __byte_perm_S (w0[3], w1[0], selector);
8331 w1[1] = __byte_perm_S (w0[2], w0[3], selector);
8332 w1[0] = __byte_perm_S (w0[1], w0[2], selector);
8333 w0[3] = __byte_perm_S (w0[0], w0[1], selector);
8334 w0[2] = __byte_perm_S ( 0, w0[0], selector);
8335 w0[1] = 0;
8336 w0[0] = 0;
8337
8338 break;
8339
8340 case 3:
8341 w3[1] = __byte_perm_S (w2[1], w2[2], selector);
8342 w3[0] = __byte_perm_S (w2[0], w2[1], selector);
8343 w2[3] = __byte_perm_S (w1[3], w2[0], selector);
8344 w2[2] = __byte_perm_S (w1[2], w1[3], selector);
8345 w2[1] = __byte_perm_S (w1[1], w1[2], selector);
8346 w2[0] = __byte_perm_S (w1[0], w1[1], selector);
8347 w1[3] = __byte_perm_S (w0[3], w1[0], selector);
8348 w1[2] = __byte_perm_S (w0[2], w0[3], selector);
8349 w1[1] = __byte_perm_S (w0[1], w0[2], selector);
8350 w1[0] = __byte_perm_S (w0[0], w0[1], selector);
8351 w0[3] = __byte_perm_S ( 0, w0[0], selector);
8352 w0[2] = 0;
8353 w0[1] = 0;
8354 w0[0] = 0;
8355
8356 break;
8357
8358 case 4:
8359 w3[1] = __byte_perm_S (w2[0], w2[1], selector);
8360 w3[0] = __byte_perm_S (w1[3], w2[0], selector);
8361 w2[3] = __byte_perm_S (w1[2], w1[3], selector);
8362 w2[2] = __byte_perm_S (w1[1], w1[2], selector);
8363 w2[1] = __byte_perm_S (w1[0], w1[1], selector);
8364 w2[0] = __byte_perm_S (w0[3], w1[0], selector);
8365 w1[3] = __byte_perm_S (w0[2], w0[3], selector);
8366 w1[2] = __byte_perm_S (w0[1], w0[2], selector);
8367 w1[1] = __byte_perm_S (w0[0], w0[1], selector);
8368 w1[0] = __byte_perm_S ( 0, w0[0], selector);
8369 w0[3] = 0;
8370 w0[2] = 0;
8371 w0[1] = 0;
8372 w0[0] = 0;
8373
8374 break;
8375
8376 case 5:
8377 w3[1] = __byte_perm_S (w1[3], w2[0], selector);
8378 w3[0] = __byte_perm_S (w1[2], w1[3], selector);
8379 w2[3] = __byte_perm_S (w1[1], w1[2], selector);
8380 w2[2] = __byte_perm_S (w1[0], w1[1], selector);
8381 w2[1] = __byte_perm_S (w0[3], w1[0], selector);
8382 w2[0] = __byte_perm_S (w0[2], w0[3], selector);
8383 w1[3] = __byte_perm_S (w0[1], w0[2], selector);
8384 w1[2] = __byte_perm_S (w0[0], w0[1], selector);
8385 w1[1] = __byte_perm_S ( 0, w0[0], selector);
8386 w1[0] = 0;
8387 w0[3] = 0;
8388 w0[2] = 0;
8389 w0[1] = 0;
8390 w0[0] = 0;
8391
8392 break;
8393
8394 case 6:
8395 w3[1] = __byte_perm_S (w1[2], w1[3], selector);
8396 w3[0] = __byte_perm_S (w1[1], w1[2], selector);
8397 w2[3] = __byte_perm_S (w1[0], w1[1], selector);
8398 w2[2] = __byte_perm_S (w0[3], w1[0], selector);
8399 w2[1] = __byte_perm_S (w0[2], w0[3], selector);
8400 w2[0] = __byte_perm_S (w0[1], w0[2], selector);
8401 w1[3] = __byte_perm_S (w0[0], w0[1], selector);
8402 w1[2] = __byte_perm_S ( 0, w0[0], selector);
8403 w1[1] = 0;
8404 w1[0] = 0;
8405 w0[3] = 0;
8406 w0[2] = 0;
8407 w0[1] = 0;
8408 w0[0] = 0;
8409
8410 break;
8411
8412 case 7:
8413 w3[1] = __byte_perm_S (w1[1], w1[2], selector);
8414 w3[0] = __byte_perm_S (w1[0], w1[1], selector);
8415 w2[3] = __byte_perm_S (w0[3], w1[0], selector);
8416 w2[2] = __byte_perm_S (w0[2], w0[3], selector);
8417 w2[1] = __byte_perm_S (w0[1], w0[2], selector);
8418 w2[0] = __byte_perm_S (w0[0], w0[1], selector);
8419 w1[3] = __byte_perm_S ( 0, w0[0], selector);
8420 w1[2] = 0;
8421 w1[1] = 0;
8422 w1[0] = 0;
8423 w0[3] = 0;
8424 w0[2] = 0;
8425 w0[1] = 0;
8426 w0[0] = 0;
8427
8428 break;
8429
8430 case 8:
8431 w3[1] = __byte_perm_S (w1[0], w1[1], selector);
8432 w3[0] = __byte_perm_S (w0[3], w1[0], selector);
8433 w2[3] = __byte_perm_S (w0[2], w0[3], selector);
8434 w2[2] = __byte_perm_S (w0[1], w0[2], selector);
8435 w2[1] = __byte_perm_S (w0[0], w0[1], selector);
8436 w2[0] = __byte_perm_S ( 0, w0[0], selector);
8437 w1[3] = 0;
8438 w1[2] = 0;
8439 w1[1] = 0;
8440 w1[0] = 0;
8441 w0[3] = 0;
8442 w0[2] = 0;
8443 w0[1] = 0;
8444 w0[0] = 0;
8445
8446 break;
8447
8448 case 9:
8449 w3[1] = __byte_perm_S (w0[3], w1[0], selector);
8450 w3[0] = __byte_perm_S (w0[2], w0[3], selector);
8451 w2[3] = __byte_perm_S (w0[1], w0[2], selector);
8452 w2[2] = __byte_perm_S (w0[0], w0[1], selector);
8453 w2[1] = __byte_perm_S ( 0, w0[0], selector);
8454 w2[0] = 0;
8455 w1[3] = 0;
8456 w1[2] = 0;
8457 w1[1] = 0;
8458 w1[0] = 0;
8459 w0[3] = 0;
8460 w0[2] = 0;
8461 w0[1] = 0;
8462 w0[0] = 0;
8463
8464 break;
8465
8466 case 10:
8467 w3[1] = __byte_perm_S (w0[2], w0[3], selector);
8468 w3[0] = __byte_perm_S (w0[1], w0[2], selector);
8469 w2[3] = __byte_perm_S (w0[0], w0[1], selector);
8470 w2[2] = __byte_perm_S ( 0, w0[0], selector);
8471 w2[1] = 0;
8472 w2[0] = 0;
8473 w1[3] = 0;
8474 w1[2] = 0;
8475 w1[1] = 0;
8476 w1[0] = 0;
8477 w0[3] = 0;
8478 w0[2] = 0;
8479 w0[1] = 0;
8480 w0[0] = 0;
8481
8482 break;
8483
8484 case 11:
8485 w3[1] = __byte_perm_S (w0[1], w0[2], selector);
8486 w3[0] = __byte_perm_S (w0[0], w0[1], selector);
8487 w2[3] = __byte_perm_S ( 0, w0[0], selector);
8488 w2[2] = 0;
8489 w2[1] = 0;
8490 w2[0] = 0;
8491 w1[3] = 0;
8492 w1[2] = 0;
8493 w1[1] = 0;
8494 w1[0] = 0;
8495 w0[3] = 0;
8496 w0[2] = 0;
8497 w0[1] = 0;
8498 w0[0] = 0;
8499
8500 break;
8501
8502 case 12:
8503 w3[1] = __byte_perm_S (w0[0], w0[1], selector);
8504 w3[0] = __byte_perm_S ( 0, w0[0], selector);
8505 w2[3] = 0;
8506 w2[2] = 0;
8507 w2[1] = 0;
8508 w2[0] = 0;
8509 w1[3] = 0;
8510 w1[2] = 0;
8511 w1[1] = 0;
8512 w1[0] = 0;
8513 w0[3] = 0;
8514 w0[2] = 0;
8515 w0[1] = 0;
8516 w0[0] = 0;
8517
8518 break;
8519
8520 case 13:
8521 w3[1] = __byte_perm_S ( 0, w0[0], selector);
8522 w3[0] = 0;
8523 w2[3] = 0;
8524 w2[2] = 0;
8525 w2[1] = 0;
8526 w2[0] = 0;
8527 w1[3] = 0;
8528 w1[2] = 0;
8529 w1[1] = 0;
8530 w1[0] = 0;
8531 w0[3] = 0;
8532 w0[2] = 0;
8533 w0[1] = 0;
8534 w0[0] = 0;
8535
8536 break;
8537 }
8538 #endif
8539 }
8540
/**
 * Moves the contents of the word buffer formed by w0[0..3], w1[0..3],
 * w2[0..3], w3[0..3] (taken in that order as word indices 0..15) towards
 * higher word indices, in place.
 *
 * offset / 4 selects one of 14 hand-unrolled cases (0..13). In case k the
 * written words are rebuilt from pairs of neighbouring source words located
 * k positions lower, and the k lowest words are set to 0. The switch has no
 * default label, so offset / 4 >= 14 leaves all four arrays unchanged.
 *
 * Only w0[0] .. w3[1] are ever read. Which words are written depends on the
 * platform branch:
 *   - IS_AMD / IS_GENERIC: w0[0] .. w3[2]
 *   - IS_NV:               w0[0] .. w3[1]
 * w3[3] is never written. If none of these macros is defined the body is
 * empty.
 *
 * NOTE(review): the _be suffix suggests big-endian packed bytes, and the
 * helpers presumably apply the remaining (offset % 4) byte shift; both
 * helpers are defined outside this block - confirm there.
 * NOTE(review): the AMD/generic branch overwrites w3[2] while the NV branch
 * leaves it alone - confirm callers do not depend on either behaviour.
 *
 * @param w0,w1,w2,w3  buffer words, modified in place
 * @param offset       shift amount; offset / 4 picks the case, the low bits
 *                     are handed on to the platform helper
 */
8541 inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
8542 {
// AMD / generic path: with k = offset / 4, word i becomes
// amd_bytealign_S (src[i-k-1], src[i-k], offset). The lowest written word
// pairs 0 with w0[0]; w3[2] pairs src[13-k] with 0.
8543 #if defined IS_AMD || defined IS_GENERIC
8544 switch (offset / 4)
8545 {
8546 case 0:
// Assignments run from the highest word down so every right-hand side still
// reads the caller's original value; all cases below keep this ordering.
8547 w3[2] = amd_bytealign_S (w3[1], 0, offset);
8548 w3[1] = amd_bytealign_S (w3[0], w3[1], offset);
8549 w3[0] = amd_bytealign_S (w2[3], w3[0], offset);
8550 w2[3] = amd_bytealign_S (w2[2], w2[3], offset);
8551 w2[2] = amd_bytealign_S (w2[1], w2[2], offset);
8552 w2[1] = amd_bytealign_S (w2[0], w2[1], offset);
8553 w2[0] = amd_bytealign_S (w1[3], w2[0], offset);
8554 w1[3] = amd_bytealign_S (w1[2], w1[3], offset);
8555 w1[2] = amd_bytealign_S (w1[1], w1[2], offset);
8556 w1[1] = amd_bytealign_S (w1[0], w1[1], offset);
8557 w1[0] = amd_bytealign_S (w0[3], w1[0], offset);
8558 w0[3] = amd_bytealign_S (w0[2], w0[3], offset);
8559 w0[2] = amd_bytealign_S (w0[1], w0[2], offset);
8560 w0[1] = amd_bytealign_S (w0[0], w0[1], offset);
8561 w0[0] = amd_bytealign_S ( 0, w0[0], offset);
8562 break;
8563
8564 case 1:
8565 w3[2] = amd_bytealign_S (w3[0], 0, offset);
8566 w3[1] = amd_bytealign_S (w2[3], w3[0], offset);
8567 w3[0] = amd_bytealign_S (w2[2], w2[3], offset);
8568 w2[3] = amd_bytealign_S (w2[1], w2[2], offset);
8569 w2[2] = amd_bytealign_S (w2[0], w2[1], offset);
8570 w2[1] = amd_bytealign_S (w1[3], w2[0], offset);
8571 w2[0] = amd_bytealign_S (w1[2], w1[3], offset);
8572 w1[3] = amd_bytealign_S (w1[1], w1[2], offset);
8573 w1[2] = amd_bytealign_S (w1[0], w1[1], offset);
8574 w1[1] = amd_bytealign_S (w0[3], w1[0], offset);
8575 w1[0] = amd_bytealign_S (w0[2], w0[3], offset);
8576 w0[3] = amd_bytealign_S (w0[1], w0[2], offset);
8577 w0[2] = amd_bytealign_S (w0[0], w0[1], offset);
8578 w0[1] = amd_bytealign_S ( 0, w0[0], offset);
8579 w0[0] = 0;
8580 break;
8581
8582 case 2:
8583 w3[2] = amd_bytealign_S (w2[3], 0, offset);
8584 w3[1] = amd_bytealign_S (w2[2], w2[3], offset);
8585 w3[0] = amd_bytealign_S (w2[1], w2[2], offset);
8586 w2[3] = amd_bytealign_S (w2[0], w2[1], offset);
8587 w2[2] = amd_bytealign_S (w1[3], w2[0], offset);
8588 w2[1] = amd_bytealign_S (w1[2], w1[3], offset);
8589 w2[0] = amd_bytealign_S (w1[1], w1[2], offset);
8590 w1[3] = amd_bytealign_S (w1[0], w1[1], offset);
8591 w1[2] = amd_bytealign_S (w0[3], w1[0], offset);
8592 w1[1] = amd_bytealign_S (w0[2], w0[3], offset);
8593 w1[0] = amd_bytealign_S (w0[1], w0[2], offset);
8594 w0[3] = amd_bytealign_S (w0[0], w0[1], offset);
8595 w0[2] = amd_bytealign_S ( 0, w0[0], offset);
8596 w0[1] = 0;
8597 w0[0] = 0;
8598 break;
8599
8600 case 3:
8601 w3[2] = amd_bytealign_S (w2[2], 0, offset);
8602 w3[1] = amd_bytealign_S (w2[1], w2[2], offset);
8603 w3[0] = amd_bytealign_S (w2[0], w2[1], offset);
8604 w2[3] = amd_bytealign_S (w1[3], w2[0], offset);
8605 w2[2] = amd_bytealign_S (w1[2], w1[3], offset);
8606 w2[1] = amd_bytealign_S (w1[1], w1[2], offset);
8607 w2[0] = amd_bytealign_S (w1[0], w1[1], offset);
8608 w1[3] = amd_bytealign_S (w0[3], w1[0], offset);
8609 w1[2] = amd_bytealign_S (w0[2], w0[3], offset);
8610 w1[1] = amd_bytealign_S (w0[1], w0[2], offset);
8611 w1[0] = amd_bytealign_S (w0[0], w0[1], offset);
8612 w0[3] = amd_bytealign_S ( 0, w0[0], offset);
8613 w0[2] = 0;
8614 w0[1] = 0;
8615 w0[0] = 0;
8616 break;
8617
8618 case 4:
8619 w3[2] = amd_bytealign_S (w2[1], 0, offset);
8620 w3[1] = amd_bytealign_S (w2[0], w2[1], offset);
8621 w3[0] = amd_bytealign_S (w1[3], w2[0], offset);
8622 w2[3] = amd_bytealign_S (w1[2], w1[3], offset);
8623 w2[2] = amd_bytealign_S (w1[1], w1[2], offset);
8624 w2[1] = amd_bytealign_S (w1[0], w1[1], offset);
8625 w2[0] = amd_bytealign_S (w0[3], w1[0], offset);
8626 w1[3] = amd_bytealign_S (w0[2], w0[3], offset);
8627 w1[2] = amd_bytealign_S (w0[1], w0[2], offset);
8628 w1[1] = amd_bytealign_S (w0[0], w0[1], offset);
8629 w1[0] = amd_bytealign_S ( 0, w0[0], offset);
8630 w0[3] = 0;
8631 w0[2] = 0;
8632 w0[1] = 0;
8633 w0[0] = 0;
8634 break;
8635
8636 case 5:
8637 w3[2] = amd_bytealign_S (w2[0], 0, offset);
8638 w3[1] = amd_bytealign_S (w1[3], w2[0], offset);
8639 w3[0] = amd_bytealign_S (w1[2], w1[3], offset);
8640 w2[3] = amd_bytealign_S (w1[1], w1[2], offset);
8641 w2[2] = amd_bytealign_S (w1[0], w1[1], offset);
8642 w2[1] = amd_bytealign_S (w0[3], w1[0], offset);
8643 w2[0] = amd_bytealign_S (w0[2], w0[3], offset);
8644 w1[3] = amd_bytealign_S (w0[1], w0[2], offset);
8645 w1[2] = amd_bytealign_S (w0[0], w0[1], offset);
8646 w1[1] = amd_bytealign_S ( 0, w0[0], offset);
8647 w1[0] = 0;
8648 w0[3] = 0;
8649 w0[2] = 0;
8650 w0[1] = 0;
8651 w0[0] = 0;
8652 break;
8653
8654 case 6:
8655 w3[2] = amd_bytealign_S (w1[3], 0, offset);
8656 w3[1] = amd_bytealign_S (w1[2], w1[3], offset);
8657 w3[0] = amd_bytealign_S (w1[1], w1[2], offset);
8658 w2[3] = amd_bytealign_S (w1[0], w1[1], offset);
8659 w2[2] = amd_bytealign_S (w0[3], w1[0], offset);
8660 w2[1] = amd_bytealign_S (w0[2], w0[3], offset);
8661 w2[0] = amd_bytealign_S (w0[1], w0[2], offset);
8662 w1[3] = amd_bytealign_S (w0[0], w0[1], offset);
8663 w1[2] = amd_bytealign_S ( 0, w0[0], offset);
8664 w1[1] = 0;
8665 w1[0] = 0;
8666 w0[3] = 0;
8667 w0[2] = 0;
8668 w0[1] = 0;
8669 w0[0] = 0;
8670 break;
8671
8672 case 7:
8673 w3[2] = amd_bytealign_S (w1[2], 0, offset);
8674 w3[1] = amd_bytealign_S (w1[1], w1[2], offset);
8675 w3[0] = amd_bytealign_S (w1[0], w1[1], offset);
8676 w2[3] = amd_bytealign_S (w0[3], w1[0], offset);
8677 w2[2] = amd_bytealign_S (w0[2], w0[3], offset);
8678 w2[1] = amd_bytealign_S (w0[1], w0[2], offset);
8679 w2[0] = amd_bytealign_S (w0[0], w0[1], offset);
8680 w1[3] = amd_bytealign_S ( 0, w0[0], offset);
8681 w1[2] = 0;
8682 w1[1] = 0;
8683 w1[0] = 0;
8684 w0[3] = 0;
8685 w0[2] = 0;
8686 w0[1] = 0;
8687 w0[0] = 0;
8688 break;
8689
8690 case 8:
8691 w3[2] = amd_bytealign_S (w1[1], 0, offset);
8692 w3[1] = amd_bytealign_S (w1[0], w1[1], offset);
8693 w3[0] = amd_bytealign_S (w0[3], w1[0], offset);
8694 w2[3] = amd_bytealign_S (w0[2], w0[3], offset);
8695 w2[2] = amd_bytealign_S (w0[1], w0[2], offset);
8696 w2[1] = amd_bytealign_S (w0[0], w0[1], offset);
8697 w2[0] = amd_bytealign_S ( 0, w0[0], offset);
8698 w1[3] = 0;
8699 w1[2] = 0;
8700 w1[1] = 0;
8701 w1[0] = 0;
8702 w0[3] = 0;
8703 w0[2] = 0;
8704 w0[1] = 0;
8705 w0[0] = 0;
8706 break;
8707
8708 case 9:
8709 w3[2] = amd_bytealign_S (w1[0], 0, offset);
8710 w3[1] = amd_bytealign_S (w0[3], w1[0], offset);
8711 w3[0] = amd_bytealign_S (w0[2], w0[3], offset);
8712 w2[3] = amd_bytealign_S (w0[1], w0[2], offset);
8713 w2[2] = amd_bytealign_S (w0[0], w0[1], offset);
8714 w2[1] = amd_bytealign_S ( 0, w0[0], offset);
8715 w2[0] = 0;
8716 w1[3] = 0;
8717 w1[2] = 0;
8718 w1[1] = 0;
8719 w1[0] = 0;
8720 w0[3] = 0;
8721 w0[2] = 0;
8722 w0[1] = 0;
8723 w0[0] = 0;
8724 break;
8725
8726 case 10:
8727 w3[2] = amd_bytealign_S (w0[3], 0, offset);
8728 w3[1] = amd_bytealign_S (w0[2], w0[3], offset);
8729 w3[0] = amd_bytealign_S (w0[1], w0[2], offset);
8730 w2[3] = amd_bytealign_S (w0[0], w0[1], offset);
8731 w2[2] = amd_bytealign_S ( 0, w0[0], offset);
8732 w2[1] = 0;
8733 w2[0] = 0;
8734 w1[3] = 0;
8735 w1[2] = 0;
8736 w1[1] = 0;
8737 w1[0] = 0;
8738 w0[3] = 0;
8739 w0[2] = 0;
8740 w0[1] = 0;
8741 w0[0] = 0;
8742 break;
8743
8744 case 11:
8745 w3[2] = amd_bytealign_S (w0[2], 0, offset);
8746 w3[1] = amd_bytealign_S (w0[1], w0[2], offset);
8747 w3[0] = amd_bytealign_S (w0[0], w0[1], offset);
8748 w2[3] = amd_bytealign_S ( 0, w0[0], offset);
8749 w2[2] = 0;
8750 w2[1] = 0;
8751 w2[0] = 0;
8752 w1[3] = 0;
8753 w1[2] = 0;
8754 w1[1] = 0;
8755 w1[0] = 0;
8756 w0[3] = 0;
8757 w0[2] = 0;
8758 w0[1] = 0;
8759 w0[0] = 0;
8760 break;
8761
8762 case 12:
8763 w3[2] = amd_bytealign_S (w0[1], 0, offset);
8764 w3[1] = amd_bytealign_S (w0[0], w0[1], offset);
8765 w3[0] = amd_bytealign_S ( 0, w0[0], offset);
8766 w2[3] = 0;
8767 w2[2] = 0;
8768 w2[1] = 0;
8769 w2[0] = 0;
8770 w1[3] = 0;
8771 w1[2] = 0;
8772 w1[1] = 0;
8773 w1[0] = 0;
8774 w0[3] = 0;
8775 w0[2] = 0;
8776 w0[1] = 0;
8777 w0[0] = 0;
8778 break;
8779
8780 case 13:
8781 w3[2] = amd_bytealign_S (w0[0], 0, offset);
8782 w3[1] = amd_bytealign_S ( 0, w0[0], offset);
8783 w3[0] = 0;
8784 w2[3] = 0;
8785 w2[2] = 0;
8786 w2[1] = 0;
8787 w2[0] = 0;
8788 w1[3] = 0;
8789 w1[2] = 0;
8790 w1[1] = 0;
8791 w1[0] = 0;
8792 w0[3] = 0;
8793 w0[2] = 0;
8794 w0[1] = 0;
8795 w0[0] = 0;
8796 break;
8797 }
8798 #endif
8799
// NVIDIA path: same word movement through __byte_perm_S. The operand order is
// (src[i-k], src[i-k-1]), i.e. reversed relative to the AMD path, and the
// highest word written is w3[1] (w3[2] is left as the caller passed it).
8800 #ifdef IS_NV
// selector: the low 16 bits of 0x76543210 after shifting it right by
// 4 * (offset & 3) bits; it is passed unchanged to every __byte_perm_S call.
8801 const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
8802
8803 switch (offset / 4)
8804 {
8805 case 0:
8806 w3[1] = __byte_perm_S (w3[1], w3[0], selector);
8807 w3[0] = __byte_perm_S (w3[0], w2[3], selector);
8808 w2[3] = __byte_perm_S (w2[3], w2[2], selector);
8809 w2[2] = __byte_perm_S (w2[2], w2[1], selector);
8810 w2[1] = __byte_perm_S (w2[1], w2[0], selector);
8811 w2[0] = __byte_perm_S (w2[0], w1[3], selector);
8812 w1[3] = __byte_perm_S (w1[3], w1[2], selector);
8813 w1[2] = __byte_perm_S (w1[2], w1[1], selector);
8814 w1[1] = __byte_perm_S (w1[1], w1[0], selector);
8815 w1[0] = __byte_perm_S (w1[0], w0[3], selector);
8816 w0[3] = __byte_perm_S (w0[3], w0[2], selector);
8817 w0[2] = __byte_perm_S (w0[2], w0[1], selector);
8818 w0[1] = __byte_perm_S (w0[1], w0[0], selector);
8819 w0[0] = __byte_perm_S (w0[0], 0, selector);
8820 break;
8821
8822 case 1:
8823 w3[1] = __byte_perm_S (w3[0], w2[3], selector);
8824 w3[0] = __byte_perm_S (w2[3], w2[2], selector);
8825 w2[3] = __byte_perm_S (w2[2], w2[1], selector);
8826 w2[2] = __byte_perm_S (w2[1], w2[0], selector);
8827 w2[1] = __byte_perm_S (w2[0], w1[3], selector);
8828 w2[0] = __byte_perm_S (w1[3], w1[2], selector);
8829 w1[3] = __byte_perm_S (w1[2], w1[1], selector);
8830 w1[2] = __byte_perm_S (w1[1], w1[0], selector);
8831 w1[1] = __byte_perm_S (w1[0], w0[3], selector);
8832 w1[0] = __byte_perm_S (w0[3], w0[2], selector);
8833 w0[3] = __byte_perm_S (w0[2], w0[1], selector);
8834 w0[2] = __byte_perm_S (w0[1], w0[0], selector);
8835 w0[1] = __byte_perm_S (w0[0], 0, selector);
8836 w0[0] = 0;
8837 break;
8838
8839 case 2:
8840 w3[1] = __byte_perm_S (w2[3], w2[2], selector);
8841 w3[0] = __byte_perm_S (w2[2], w2[1], selector);
8842 w2[3] = __byte_perm_S (w2[1], w2[0], selector);
8843 w2[2] = __byte_perm_S (w2[0], w1[3], selector);
8844 w2[1] = __byte_perm_S (w1[3], w1[2], selector);
8845 w2[0] = __byte_perm_S (w1[2], w1[1], selector);
8846 w1[3] = __byte_perm_S (w1[1], w1[0], selector);
8847 w1[2] = __byte_perm_S (w1[0], w0[3], selector);
8848 w1[1] = __byte_perm_S (w0[3], w0[2], selector);
8849 w1[0] = __byte_perm_S (w0[2], w0[1], selector);
8850 w0[3] = __byte_perm_S (w0[1], w0[0], selector);
8851 w0[2] = __byte_perm_S (w0[0], 0, selector);
8852 w0[1] = 0;
8853 w0[0] = 0;
8854 break;
8855
8856 case 3:
8857 w3[1] = __byte_perm_S (w2[2], w2[1], selector);
8858 w3[0] = __byte_perm_S (w2[1], w2[0], selector);
8859 w2[3] = __byte_perm_S (w2[0], w1[3], selector);
8860 w2[2] = __byte_perm_S (w1[3], w1[2], selector);
8861 w2[1] = __byte_perm_S (w1[2], w1[1], selector);
8862 w2[0] = __byte_perm_S (w1[1], w1[0], selector);
8863 w1[3] = __byte_perm_S (w1[0], w0[3], selector);
8864 w1[2] = __byte_perm_S (w0[3], w0[2], selector);
8865 w1[1] = __byte_perm_S (w0[2], w0[1], selector);
8866 w1[0] = __byte_perm_S (w0[1], w0[0], selector);
8867 w0[3] = __byte_perm_S (w0[0], 0, selector);
8868 w0[2] = 0;
8869 w0[1] = 0;
8870 w0[0] = 0;
8871 break;
8872
8873 case 4:
8874 w3[1] = __byte_perm_S (w2[1], w2[0], selector);
8875 w3[0] = __byte_perm_S (w2[0], w1[3], selector);
8876 w2[3] = __byte_perm_S (w1[3], w1[2], selector);
8877 w2[2] = __byte_perm_S (w1[2], w1[1], selector);
8878 w2[1] = __byte_perm_S (w1[1], w1[0], selector);
8879 w2[0] = __byte_perm_S (w1[0], w0[3], selector);
8880 w1[3] = __byte_perm_S (w0[3], w0[2], selector);
8881 w1[2] = __byte_perm_S (w0[2], w0[1], selector);
8882 w1[1] = __byte_perm_S (w0[1], w0[0], selector);
8883 w1[0] = __byte_perm_S (w0[0], 0, selector);
8884 w0[3] = 0;
8885 w0[2] = 0;
8886 w0[1] = 0;
8887 w0[0] = 0;
8888 break;
8889
8890 case 5:
8891 w3[1] = __byte_perm_S (w2[0], w1[3], selector);
8892 w3[0] = __byte_perm_S (w1[3], w1[2], selector);
8893 w2[3] = __byte_perm_S (w1[2], w1[1], selector);
8894 w2[2] = __byte_perm_S (w1[1], w1[0], selector);
8895 w2[1] = __byte_perm_S (w1[0], w0[3], selector);
8896 w2[0] = __byte_perm_S (w0[3], w0[2], selector);
8897 w1[3] = __byte_perm_S (w0[2], w0[1], selector);
8898 w1[2] = __byte_perm_S (w0[1], w0[0], selector);
8899 w1[1] = __byte_perm_S (w0[0], 0, selector);
8900 w1[0] = 0;
8901 w0[3] = 0;
8902 w0[2] = 0;
8903 w0[1] = 0;
8904 w0[0] = 0;
8905 break;
8906
8907 case 6:
8908 w3[1] = __byte_perm_S (w1[3], w1[2], selector);
8909 w3[0] = __byte_perm_S (w1[2], w1[1], selector);
8910 w2[3] = __byte_perm_S (w1[1], w1[0], selector);
8911 w2[2] = __byte_perm_S (w1[0], w0[3], selector);
8912 w2[1] = __byte_perm_S (w0[3], w0[2], selector);
8913 w2[0] = __byte_perm_S (w0[2], w0[1], selector);
8914 w1[3] = __byte_perm_S (w0[1], w0[0], selector);
8915 w1[2] = __byte_perm_S (w0[0], 0, selector);
8916 w1[1] = 0;
8917 w1[0] = 0;
8918 w0[3] = 0;
8919 w0[2] = 0;
8920 w0[1] = 0;
8921 w0[0] = 0;
8922 break;
8923
8924 case 7:
8925 w3[1] = __byte_perm_S (w1[2], w1[1], selector);
8926 w3[0] = __byte_perm_S (w1[1], w1[0], selector);
8927 w2[3] = __byte_perm_S (w1[0], w0[3], selector);
8928 w2[2] = __byte_perm_S (w0[3], w0[2], selector);
8929 w2[1] = __byte_perm_S (w0[2], w0[1], selector);
8930 w2[0] = __byte_perm_S (w0[1], w0[0], selector);
8931 w1[3] = __byte_perm_S (w0[0], 0, selector);
8932 w1[2] = 0;
8933 w1[1] = 0;
8934 w1[0] = 0;
8935 w0[3] = 0;
8936 w0[2] = 0;
8937 w0[1] = 0;
8938 w0[0] = 0;
8939 break;
8940
8941 case 8:
8942 w3[1] = __byte_perm_S (w1[1], w1[0], selector);
8943 w3[0] = __byte_perm_S (w1[0], w0[3], selector);
8944 w2[3] = __byte_perm_S (w0[3], w0[2], selector);
8945 w2[2] = __byte_perm_S (w0[2], w0[1], selector);
8946 w2[1] = __byte_perm_S (w0[1], w0[0], selector);
8947 w2[0] = __byte_perm_S (w0[0], 0, selector);
8948 w1[3] = 0;
8949 w1[2] = 0;
8950 w1[1] = 0;
8951 w1[0] = 0;
8952 w0[3] = 0;
8953 w0[2] = 0;
8954 w0[1] = 0;
8955 w0[0] = 0;
8956 break;
8957
8958 case 9:
8959 w3[1] = __byte_perm_S (w1[0], w0[3], selector);
8960 w3[0] = __byte_perm_S (w0[3], w0[2], selector);
8961 w2[3] = __byte_perm_S (w0[2], w0[1], selector);
8962 w2[2] = __byte_perm_S (w0[1], w0[0], selector);
8963 w2[1] = __byte_perm_S (w0[0], 0, selector);
8964 w2[0] = 0;
8965 w1[3] = 0;
8966 w1[2] = 0;
8967 w1[1] = 0;
8968 w1[0] = 0;
8969 w0[3] = 0;
8970 w0[2] = 0;
8971 w0[1] = 0;
8972 w0[0] = 0;
8973 break;
8974
8975 case 10:
8976 w3[1] = __byte_perm_S (w0[3], w0[2], selector);
8977 w3[0] = __byte_perm_S (w0[2], w0[1], selector);
8978 w2[3] = __byte_perm_S (w0[1], w0[0], selector);
8979 w2[2] = __byte_perm_S (w0[0], 0, selector);
8980 w2[1] = 0;
8981 w2[0] = 0;
8982 w1[3] = 0;
8983 w1[2] = 0;
8984 w1[1] = 0;
8985 w1[0] = 0;
8986 w0[3] = 0;
8987 w0[2] = 0;
8988 w0[1] = 0;
8989 w0[0] = 0;
8990 break;
8991
8992 case 11:
8993 w3[1] = __byte_perm_S (w0[2], w0[1], selector);
8994 w3[0] = __byte_perm_S (w0[1], w0[0], selector);
8995 w2[3] = __byte_perm_S (w0[0], 0, selector);
8996 w2[2] = 0;
8997 w2[1] = 0;
8998 w2[0] = 0;
8999 w1[3] = 0;
9000 w1[2] = 0;
9001 w1[1] = 0;
9002 w1[0] = 0;
9003 w0[3] = 0;
9004 w0[2] = 0;
9005 w0[1] = 0;
9006 w0[0] = 0;
9007 break;
9008
9009 case 12:
9010 w3[1] = __byte_perm_S (w0[1], w0[0], selector);
9011 w3[0] = __byte_perm_S (w0[0], 0, selector);
9012 w2[3] = 0;
9013 w2[2] = 0;
9014 w2[1] = 0;
9015 w2[0] = 0;
9016 w1[3] = 0;
9017 w1[2] = 0;
9018 w1[1] = 0;
9019 w1[0] = 0;
9020 w0[3] = 0;
9021 w0[2] = 0;
9022 w0[1] = 0;
9023 w0[0] = 0;
9024 break;
9025
9026 case 13:
9027 w3[1] = __byte_perm_S (w0[0], 0, selector);
9028 w3[0] = 0;
9029 w2[3] = 0;
9030 w2[2] = 0;
9031 w2[1] = 0;
9032 w2[0] = 0;
9033 w1[3] = 0;
9034 w1[2] = 0;
9035 w1[1] = 0;
9036 w1[0] = 0;
9037 w0[3] = 0;
9038 w0[2] = 0;
9039 w0[1] = 0;
9040 w0[0] = 0;
9041 break;
9042 }
9043 #endif
9044 }
9045
9046 /**
9047 * vector functions on scalar types (for inner loop usage)
9048 */
9049
// PACKVS2 (sn, vn, e): copy component .s<e> of vn[0] and vn[1] into sn[0] and
// sn[1]. `e` is token-pasted onto `.s`, so it must be a literal component
// suffix. Expands to two bare statements, each already terminated by ';' -
// do not use it as the unbraced body of an if/else/loop.
9050 #define PACKVS2(sn,vn,e) \
9051 sn[0] = vn[0].s##e; \
9052 sn[1] = vn[1].s##e;
9053
// PACKSV2 (sn, vn, e): store sn[0] and sn[1] into component .s<e> of vn[0] and
// vn[1]. Note the argument order stays (sn, vn) even though the data flows
// from sn to vn. Expands to two bare ';'-terminated statements.
9054 #define PACKSV2(sn,vn,e) \
9055 vn[0].s##e = sn[0]; \
9056 vn[1].s##e = sn[1];
9057
// PACKVS24 (s0, s1, v0, v1, e): apply PACKVS4 to the pairs (s0, v0) and
// (s1, v1) for component .s<e>. PACKVS4 is defined further down; that is fine
// because the nested macro is only expanded at the point of use.
9058 #define PACKVS24(s0,s1,v0,v1,e) \
9059 PACKVS4 (s0, v0, e); \
9060 PACKVS4 (s1, v1, e);
9061
// PACKSV24 (s0, s1, v0, v1, e): apply PACKSV4 to the pairs (s0, v0) and
// (s1, v1) for component .s<e>. PACKSV4 is defined further down and resolved
// when this macro is expanded.
9062 #define PACKSV24(s0,s1,v0,v1,e) \
9063 PACKSV4 (s0, v0, e); \
9064 PACKSV4 (s1, v1, e);
9065
// PACKVS4 (sn, vn, e): copy component .s<e> of vn[0..3] into sn[0..3].
// `e` is token-pasted onto `.s`. Expands to four bare ';'-terminated
// statements - do not use it as the unbraced body of an if/else/loop.
9066 #define PACKVS4(sn,vn,e) \
9067 sn[0] = vn[0].s##e; \
9068 sn[1] = vn[1].s##e; \
9069 sn[2] = vn[2].s##e; \
9070 sn[3] = vn[3].s##e;
9071
// PACKSV4 (sn, vn, e): store sn[0..3] into component .s<e> of vn[0..3]
// (the inverse direction of PACKVS4, same argument order). Expands to four
// bare ';'-terminated statements.
9072 #define PACKSV4(sn,vn,e) \
9073 vn[0].s##e = sn[0]; \
9074 vn[1].s##e = sn[1]; \
9075 vn[2].s##e = sn[2]; \
9076 vn[3].s##e = sn[3];
9077
// PACKVS44 (s0..s3, v0..v3, e): apply PACKVS4 to each pair (s0, v0) .. (s3, v3),
// i.e. pull component .s<e> of sixteen vector elements into four 4-element
// arrays. Expands to a sequence of bare statements.
9078 #define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
9079 PACKVS4 (s0, v0, e); \
9080 PACKVS4 (s1, v1, e); \
9081 PACKVS4 (s2, v2, e); \
9082 PACKVS4 (s3, v3, e);
9083
// PACKSV44 (s0..s3, v0..v3, e): apply PACKSV4 to each pair (s0, v0) .. (s3, v3),
// i.e. write four 4-element arrays back into component .s<e> of sixteen
// vector elements. Expands to a sequence of bare statements.
9084 #define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \
9085 PACKSV4 (s0, v0, e); \
9086 PACKSV4 (s1, v1, e); \
9087 PACKSV4 (s2, v2, e); \
9088 PACKSV4 (s3, v3, e);
9089
9090 inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
9091 {
9092 #if VECT_SIZE == 1
9093
9094 switch_buffer_by_offset_le_S (w0, w1, w2, w3, offset);
9095
9096 #else
9097
9098 u32 t0[4];
9099 u32 t1[4];
9100 u32 t2[4];
9101 u32 t3[4];
9102
9103 #endif
9104
9105 #if VECT_SIZE == 2
9106
9107 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9108 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9109
9110 #elif VECT_SIZE == 4
9111
9112 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9113 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9114 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9115 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9116
9117 #elif VECT_SIZE == 8
9118
9119 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9120 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9121 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9122 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9123 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9124 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9125 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9126 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9127
9128 #elif VECT_SIZE == 16
9129
9130 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9131 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9132 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9133 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9134 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9135 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9136 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9137 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9138 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
9139 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
9140 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
9141 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
9142 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
9143 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
9144 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
9145 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
9146
9147 #endif
9148 }
9149
9150 inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
9151 {
9152 #if VECT_SIZE == 1
9153
9154 append_0x01_2x4_S (w0, w1, offset);
9155
9156 #else
9157
9158 u32 t0[4];
9159 u32 t1[4];
9160
9161 #endif
9162
9163 #if VECT_SIZE == 2
9164
9165 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9166 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9167
9168 #elif VECT_SIZE == 4
9169
9170 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9171 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9172 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9173 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9174
9175 #elif VECT_SIZE == 8
9176
9177 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9178 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9179 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9180 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9181 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9182 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9183 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9184 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9185
9186 #elif VECT_SIZE == 16
9187
9188 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9189 PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9190 PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9191 PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9192 PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9193 PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9194 PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9195 PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9196 PACKVS24 (t0, t1, w0, w1, 8); append_0x01_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
9197 PACKVS24 (t0, t1, w0, w1, 9); append_0x01_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
9198 PACKVS24 (t0, t1, w0, w1, a); append_0x01_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
9199 PACKVS24 (t0, t1, w0, w1, b); append_0x01_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
9200 PACKVS24 (t0, t1, w0, w1, c); append_0x01_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
9201 PACKVS24 (t0, t1, w0, w1, d); append_0x01_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
9202 PACKVS24 (t0, t1, w0, w1, e); append_0x01_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
9203 PACKVS24 (t0, t1, w0, w1, f); append_0x01_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
9204
9205 #endif
9206 }
9207
9208 inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset)
9209 {
9210 #if VECT_SIZE == 1
9211
9212 append_0x80_2x4_S (w0, w1, offset);
9213
9214 #else
9215
9216 u32 t0[4];
9217 u32 t1[4];
9218
9219 #endif
9220
9221 #if VECT_SIZE == 2
9222
9223 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9224 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9225
9226 #elif VECT_SIZE == 4
9227
9228 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9229 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9230 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9231 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9232
9233 #elif VECT_SIZE == 8
9234
9235 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9236 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9237 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9238 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9239 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9240 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9241 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9242 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9243
9244 #elif VECT_SIZE == 16
9245
9246 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0);
9247 PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1);
9248 PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2);
9249 PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3);
9250 PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4);
9251 PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5);
9252 PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6);
9253 PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7);
9254 PACKVS24 (t0, t1, w0, w1, 8); append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8);
9255 PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9);
9256 PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a);
9257 PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b);
9258 PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c);
9259 PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d);
9260 PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e);
9261 PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f);
9262
9263 #endif
9264 }
9265
9266 inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset)
9267 {
9268 #if VECT_SIZE == 1
9269
9270 append_0x80_4x4_S (w0, w1, w2, w3, offset);
9271
9272 #else
9273
9274 u32 t0[4];
9275 u32 t1[4];
9276 u32 t2[4];
9277 u32 t3[4];
9278
9279 #endif
9280
9281 #if VECT_SIZE == 2
9282
9283 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9284 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9285
9286 #elif VECT_SIZE == 4
9287
9288 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9289 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9290 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9291 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9292
9293 #elif VECT_SIZE == 8
9294
9295 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9296 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9297 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9298 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9299 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9300 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9301 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9302 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9303
9304 #elif VECT_SIZE == 16
9305
9306 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0);
9307 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1);
9308 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2);
9309 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3);
9310 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4);
9311 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5);
9312 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6);
9313 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7);
9314 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x80_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8);
9315 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x80_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9);
9316 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x80_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a);
9317 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x80_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b);
9318 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x80_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c);
9319 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x80_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d);
9320 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x80_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e);
9321 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x80_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f);
9322
9323 #endif
9324 }