- u32 tmp0;
- u32 tmp1;
- u32 tmp2;
- u32 tmp3;
- u32 tmp4;
- u32 tmp5;
- u32 tmp6;
- u32 tmp7;
- u32 tmp8;
-
- const int offset_minus_4 = 4 - block_len;
-
- tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
- tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
- tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
- tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
- tmp4 = amd_bytealign (append[4], append[3], offset_minus_4);
- tmp5 = amd_bytealign (append[5], append[4], offset_minus_4);
- tmp6 = amd_bytealign (append[6], append[5], offset_minus_4);
- tmp7 = amd_bytealign (append[7], append[6], offset_minus_4);
- tmp8 = amd_bytealign ( 0, append[7], offset_minus_4);
-
- if (mod == 0)
- {
- tmp0 = tmp1;
- tmp1 = tmp2;
- tmp2 = tmp3;
- tmp3 = tmp4;
- tmp4 = tmp5;
- tmp5 = tmp6;
- tmp6 = tmp7;
- tmp7 = tmp8;
- tmp8 = 0;
- }
+ // Byte-align the 16 words of append[] to the sub-word offset
+ // (block_len % 4), producing 17 shifted words so the appended data can
+ // be merged into a buffer whose current length is block_len bytes.
+ // NOTE(review): this widens the old 8-word scheme to 16 words and keys
+ // the shift on block_len directly instead of (4 - block_len), which
+ // also removes the old mod == 0 word-slide fixup -- presumably because
+ // a shift of 0 now passes each word through unchanged; confirm against
+ // the word-level dispatch (block_len / 4) that follows this hunk.
+ u32 tmp00;
+ u32 tmp01;
+ u32 tmp02;
+ u32 tmp03;
+ u32 tmp04;
+ u32 tmp05;
+ u32 tmp06;
+ u32 tmp07;
+ u32 tmp08;
+ u32 tmp09;
+ u32 tmp10;
+ u32 tmp11;
+ u32 tmp12;
+ u32 tmp13;
+ u32 tmp14;
+ u32 tmp15;
+ u32 tmp16;
+
+ #ifdef IS_NV
+ // PRMT selector for the (block_len & 3)-byte realignment of each
+ // adjacent word pair; e.g. block_len & 3 == 0 yields 0x3210 (identity).
+ // NOTE(review): relies on CUDA __byte_perm() byte-selection semantics
+ // (low 4 selector nibbles pick bytes from the {b,a} pair) -- verify
+ // against the PTX prmt documentation.
+ const int selector = (0x76543210 >> ((block_len & 3) * 4)) & 0xffff;
+
+ tmp00 = __byte_perm (append[ 0], 0, selector);
+ tmp01 = __byte_perm (append[ 1], append[ 0], selector);
+ tmp02 = __byte_perm (append[ 2], append[ 1], selector);
+ tmp03 = __byte_perm (append[ 3], append[ 2], selector);
+ tmp04 = __byte_perm (append[ 4], append[ 3], selector);
+ tmp05 = __byte_perm (append[ 5], append[ 4], selector);
+ tmp06 = __byte_perm (append[ 6], append[ 5], selector);
+ tmp07 = __byte_perm (append[ 7], append[ 6], selector);
+ tmp08 = __byte_perm (append[ 8], append[ 7], selector);
+ tmp09 = __byte_perm (append[ 9], append[ 8], selector);
+ tmp10 = __byte_perm (append[10], append[ 9], selector);
+ tmp11 = __byte_perm (append[11], append[10], selector);
+ tmp12 = __byte_perm (append[12], append[11], selector);
+ tmp13 = __byte_perm (append[13], append[12], selector);
+ tmp14 = __byte_perm (append[14], append[13], selector);
+ tmp15 = __byte_perm (append[15], append[14], selector);
+ tmp16 = __byte_perm (        0, append[15], selector);
+ #endif
+
+ #if defined IS_AMD || defined IS_GENERIC
+ // Same realignment via amd_bytealign(); block_len is passed unmasked.
+ // NOTE(review): assumes amd_bytealign() honors only the low 2 bits of
+ // its shift operand (shift by (block_len & 3) bytes), mirroring the
+ // NV selector above -- confirm against the AMD media-ops spec.
+ tmp00 = amd_bytealign (        0, append[ 0], block_len);
+ tmp01 = amd_bytealign (append[ 0], append[ 1], block_len);
+ tmp02 = amd_bytealign (append[ 1], append[ 2], block_len);
+ tmp03 = amd_bytealign (append[ 2], append[ 3], block_len);
+ tmp04 = amd_bytealign (append[ 3], append[ 4], block_len);
+ tmp05 = amd_bytealign (append[ 4], append[ 5], block_len);
+ tmp06 = amd_bytealign (append[ 5], append[ 6], block_len);
+ tmp07 = amd_bytealign (append[ 6], append[ 7], block_len);
+ tmp08 = amd_bytealign (append[ 7], append[ 8], block_len);
+ tmp09 = amd_bytealign (append[ 8], append[ 9], block_len);
+ tmp10 = amd_bytealign (append[ 9], append[10], block_len);
+ tmp11 = amd_bytealign (append[10], append[11], block_len);
+ tmp12 = amd_bytealign (append[11], append[12], block_len);
+ tmp13 = amd_bytealign (append[12], append[13], block_len);
+ tmp14 = amd_bytealign (append[13], append[14], block_len);
+ tmp15 = amd_bytealign (append[14], append[15], block_len);
+ tmp16 = amd_bytealign (append[15], 0, block_len);
+ #endif