Prepare for a more dynamic #pragma unroll use
[hashcat.git] / OpenCL / rp.c
1 /**
2 * Authors.....: Jens Steube <jens.steube@gmail.com>
3 * magnum <john.magnum@hushmail.com>
4 *
5 * License.....: MIT
6 */
7
/* Forward declarations of the rule-application entry points. Their definitions are not part of this excerpt (presumably further down in this file - TODO confirm). */
8 inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len);
9 inline u32 apply_rules (const __global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len);
10 inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x w0[4], u32x w1[4]);
11
12 inline u32 generate_cmask (u32 buf)
13 {
14 const u32 rmask = ((buf & 0x40404040) >> 1)
15 & ~((buf & 0x80808080) >> 2);
16
17 const u32 hmask = (buf & 0x1f1f1f1f) + 0x05050505;
18 const u32 lmask = (buf & 0x1f1f1f1f) + 0x1f1f1f1f;
19
20 return rmask & ~hmask & lmask;
21 }
22
23 inline void truncate_right (u32 w0[4], u32 w1[4], const u32 len)
24 {
25 const u32 tmp = (1 << ((len % 4) * 8)) - 1;
26
27 switch (len / 4)
28 {
29 case 0: w0[0] &= tmp;
30 w0[1] = 0;
31 w0[2] = 0;
32 w0[3] = 0;
33 w1[0] = 0;
34 w1[1] = 0;
35 w1[2] = 0;
36 w1[3] = 0;
37 break;
38 case 1: w0[1] &= tmp;
39 w0[2] = 0;
40 w0[3] = 0;
41 w1[0] = 0;
42 w1[1] = 0;
43 w1[2] = 0;
44 w1[3] = 0;
45 break;
46 case 2: w0[2] &= tmp;
47 w0[3] = 0;
48 w1[0] = 0;
49 w1[1] = 0;
50 w1[2] = 0;
51 w1[3] = 0;
52 break;
53 case 3: w0[3] &= tmp;
54 w1[0] = 0;
55 w1[1] = 0;
56 w1[2] = 0;
57 w1[3] = 0;
58 break;
59 case 4: w1[0] &= tmp;
60 w1[1] = 0;
61 w1[2] = 0;
62 w1[3] = 0;
63 break;
64 case 5: w1[1] &= tmp;
65 w1[2] = 0;
66 w1[3] = 0;
67 break;
68 case 6: w1[2] &= tmp;
69 w1[3] = 0;
70 break;
71 case 7: w1[3] &= tmp;
72 break;
73 }
74 }
75
76 inline void truncate_left (u32 w0[4], u32 w1[4], const u32 len)
77 {
78 const u32 tmp = ~((1 << ((len % 4) * 8)) - 1);
79
80 switch (len / 4)
81 {
82 case 0: w0[0] &= tmp;
83 break;
84 case 1: w0[0] = 0;
85 w0[1] &= tmp;
86 break;
87 case 2: w0[0] = 0;
88 w0[1] = 0;
89 w0[2] &= tmp;
90 break;
91 case 3: w0[0] = 0;
92 w0[1] = 0;
93 w0[2] = 0;
94 w0[3] &= tmp;
95 break;
96 case 4: w0[0] = 0;
97 w0[1] = 0;
98 w0[2] = 0;
99 w0[3] = 0;
100 w1[0] &= tmp;
101 break;
102 case 5: w0[0] = 0;
103 w0[1] = 0;
104 w0[2] = 0;
105 w0[3] = 0;
106 w1[0] = 0;
107 w1[1] &= tmp;
108 break;
109 case 6: w0[0] = 0;
110 w0[1] = 0;
111 w0[2] = 0;
112 w0[3] = 0;
113 w1[0] = 0;
114 w1[1] = 0;
115 w1[2] &= tmp;
116 break;
117 case 7: w0[0] = 0;
118 w0[1] = 0;
119 w0[2] = 0;
120 w0[3] = 0;
121 w1[0] = 0;
122 w1[1] = 0;
123 w1[2] = 0;
124 w1[3] &= tmp;
125 break;
126 }
127 }
128
129 inline void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4])
130 {
131 #ifdef IS_NV
132 out0[0] = __byte_perm_S (in0[0], in0[1], 0x4321);
133 out0[1] = __byte_perm_S (in0[1], in0[2], 0x4321);
134 out0[2] = __byte_perm_S (in0[2], in0[3], 0x4321);
135 out0[3] = __byte_perm_S (in0[3], in1[0], 0x4321);
136 out1[0] = __byte_perm_S (in1[0], in1[1], 0x4321);
137 out1[1] = __byte_perm_S (in1[1], in1[2], 0x4321);
138 out1[2] = __byte_perm_S (in1[2], in1[3], 0x4321);
139 out1[3] = __byte_perm_S (in1[3], 0, 0x4321);
140 #endif
141
142 #if defined IS_AMD || defined IS_GENERIC
143 out0[0] = amd_bytealign_S (in0[1], in0[0], 1);
144 out0[1] = amd_bytealign_S (in0[2], in0[1], 1);
145 out0[2] = amd_bytealign_S (in0[3], in0[2], 1);
146 out0[3] = amd_bytealign_S (in1[0], in0[3], 1);
147 out1[0] = amd_bytealign_S (in1[1], in1[0], 1);
148 out1[1] = amd_bytealign_S (in1[2], in1[1], 1);
149 out1[2] = amd_bytealign_S (in1[3], in1[2], 1);
150 out1[3] = amd_bytealign_S ( 0, in1[3], 1);
151 #endif
152 }
153
154 inline void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4])
155 {
156 #ifdef IS_NV
157 out1[3] = __byte_perm_S (in1[2], in1[3], 0x6543);
158 out1[2] = __byte_perm_S (in1[1], in1[2], 0x6543);
159 out1[1] = __byte_perm_S (in1[0], in1[1], 0x6543);
160 out1[0] = __byte_perm_S (in0[3], in1[0], 0x6543);
161 out0[3] = __byte_perm_S (in0[2], in0[3], 0x6543);
162 out0[2] = __byte_perm_S (in0[1], in0[2], 0x6543);
163 out0[1] = __byte_perm_S (in0[0], in0[1], 0x6543);
164 out0[0] = __byte_perm_S ( 0, in0[0], 0x6543);
165 #endif
166
167 #if defined IS_AMD || defined IS_GENERIC
168 out1[3] = amd_bytealign_S (in1[3], in1[2], 3);
169 out1[2] = amd_bytealign_S (in1[2], in1[1], 3);
170 out1[1] = amd_bytealign_S (in1[1], in1[0], 3);
171 out1[0] = amd_bytealign_S (in1[0], in0[3], 3);
172 out0[3] = amd_bytealign_S (in0[3], in0[2], 3);
173 out0[2] = amd_bytealign_S (in0[2], in0[1], 3);
174 out0[1] = amd_bytealign_S (in0[1], in0[0], 3);
175 out0[0] = amd_bytealign_S (in0[0], 0, 3);
176 #endif
177 }
178
179 inline void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num)
180 {
181 #ifdef IS_NV
182 switch (num)
183 {
184 case 0: out0[0] = in0[0];
185 out0[1] = in0[1];
186 out0[2] = in0[2];
187 out0[3] = in0[3];
188 out1[0] = in1[0];
189 out1[1] = in1[1];
190 out1[2] = in1[2];
191 out1[3] = in1[3];
192 break;
193 case 1: out0[0] = __byte_perm_S (in0[0], in0[1], 0x4321);
194 out0[1] = __byte_perm_S (in0[1], in0[2], 0x4321);
195 out0[2] = __byte_perm_S (in0[2], in0[3], 0x4321);
196 out0[3] = __byte_perm_S (in0[3], in1[0], 0x4321);
197 out1[0] = __byte_perm_S (in1[0], in1[1], 0x4321);
198 out1[1] = __byte_perm_S (in1[1], in1[2], 0x4321);
199 out1[2] = __byte_perm_S (in1[2], in1[3], 0x4321);
200 out1[3] = __byte_perm_S (in1[3], 0, 0x4321);
201 break;
202 case 2: out0[0] = __byte_perm_S (in0[0], in0[1], 0x5432);
203 out0[1] = __byte_perm_S (in0[1], in0[2], 0x5432);
204 out0[2] = __byte_perm_S (in0[2], in0[3], 0x5432);
205 out0[3] = __byte_perm_S (in0[3], in1[0], 0x5432);
206 out1[0] = __byte_perm_S (in1[0], in1[1], 0x5432);
207 out1[1] = __byte_perm_S (in1[1], in1[2], 0x5432);
208 out1[2] = __byte_perm_S (in1[2], in1[3], 0x5432);
209 out1[3] = __byte_perm_S (in1[3], 0, 0x5432);
210 break;
211 case 3: out0[0] = __byte_perm_S (in0[0], in0[1], 0x6543);
212 out0[1] = __byte_perm_S (in0[1], in0[2], 0x6543);
213 out0[2] = __byte_perm_S (in0[2], in0[3], 0x6543);
214 out0[3] = __byte_perm_S (in0[3], in1[0], 0x6543);
215 out1[0] = __byte_perm_S (in1[0], in1[1], 0x6543);
216 out1[1] = __byte_perm_S (in1[1], in1[2], 0x6543);
217 out1[2] = __byte_perm_S (in1[2], in1[3], 0x6543);
218 out1[3] = __byte_perm_S (in1[3], 0, 0x6543);
219 break;
220 case 4: out0[0] = in0[1];
221 out0[1] = in0[2];
222 out0[2] = in0[3];
223 out0[3] = in1[0];
224 out1[0] = in1[1];
225 out1[1] = in1[2];
226 out1[2] = in1[3];
227 out1[3] = 0;
228 break;
229 case 5: out0[0] = __byte_perm_S (in0[1], in0[2], 0x4321);
230 out0[1] = __byte_perm_S (in0[2], in0[3], 0x4321);
231 out0[2] = __byte_perm_S (in0[3], in1[0], 0x4321);
232 out0[3] = __byte_perm_S (in1[0], in1[1], 0x4321);
233 out1[0] = __byte_perm_S (in1[1], in1[2], 0x4321);
234 out1[1] = __byte_perm_S (in1[2], in1[3], 0x4321);
235 out1[2] = __byte_perm_S (in1[3], 0, 0x4321);
236 out1[3] = 0;
237 break;
238 case 6: out0[0] = __byte_perm_S (in0[1], in0[2], 0x5432);
239 out0[1] = __byte_perm_S (in0[2], in0[3], 0x5432);
240 out0[2] = __byte_perm_S (in0[3], in1[0], 0x5432);
241 out0[3] = __byte_perm_S (in1[0], in1[1], 0x5432);
242 out1[0] = __byte_perm_S (in1[1], in1[2], 0x5432);
243 out1[1] = __byte_perm_S (in1[2], in1[3], 0x5432);
244 out1[2] = __byte_perm_S (in1[3], 0, 0x5432);
245 out1[3] = 0;
246 break;
247 case 7: out0[0] = __byte_perm_S (in0[1], in0[2], 0x6543);
248 out0[1] = __byte_perm_S (in0[2], in0[3], 0x6543);
249 out0[2] = __byte_perm_S (in0[3], in1[0], 0x6543);
250 out0[3] = __byte_perm_S (in1[0], in1[1], 0x6543);
251 out1[0] = __byte_perm_S (in1[1], in1[2], 0x6543);
252 out1[1] = __byte_perm_S (in1[2], in1[3], 0x6543);
253 out1[2] = __byte_perm_S (in1[3], 0, 0x6543);
254 out1[3] = 0;
255 break;
256 case 8: out0[0] = in0[2];
257 out0[1] = in0[3];
258 out0[2] = in1[0];
259 out0[3] = in1[1];
260 out1[0] = in1[2];
261 out1[1] = in1[3];
262 out1[2] = 0;
263 out1[3] = 0;
264 break;
265 case 9: out0[0] = __byte_perm_S (in0[2], in0[3], 0x4321);
266 out0[1] = __byte_perm_S (in0[3], in1[0], 0x4321);
267 out0[2] = __byte_perm_S (in1[0], in1[1], 0x4321);
268 out0[3] = __byte_perm_S (in1[1], in1[2], 0x4321);
269 out1[0] = __byte_perm_S (in1[2], in1[3], 0x4321);
270 out1[1] = __byte_perm_S (in1[3], 0, 0x4321);
271 out1[2] = 0;
272 out1[3] = 0;
273 break;
274 case 10: out0[0] = __byte_perm_S (in0[2], in0[3], 0x5432);
275 out0[1] = __byte_perm_S (in0[3], in1[0], 0x5432);
276 out0[2] = __byte_perm_S (in1[0], in1[1], 0x5432);
277 out0[3] = __byte_perm_S (in1[1], in1[2], 0x5432);
278 out1[0] = __byte_perm_S (in1[2], in1[3], 0x5432);
279 out1[1] = __byte_perm_S (in1[3], 0, 0x5432);
280 out1[2] = 0;
281 out1[3] = 0;
282 break;
283 case 11: out0[0] = __byte_perm_S (in0[2], in0[3], 0x6543);
284 out0[1] = __byte_perm_S (in0[3], in1[0], 0x6543);
285 out0[2] = __byte_perm_S (in1[0], in1[1], 0x6543);
286 out0[3] = __byte_perm_S (in1[1], in1[2], 0x6543);
287 out1[0] = __byte_perm_S (in1[2], in1[3], 0x6543);
288 out1[1] = __byte_perm_S (in1[3], 0, 0x6543);
289 out1[2] = 0;
290 out1[3] = 0;
291 break;
292 case 12: out0[0] = in0[3];
293 out0[1] = in1[0];
294 out0[2] = in1[1];
295 out0[3] = in1[2];
296 out1[0] = in1[3];
297 out1[1] = 0;
298 out1[2] = 0;
299 out1[3] = 0;
300 break;
301 case 13:
302 out0[0] = __byte_perm_S (in0[3], in1[0], 0x4321);
303 out0[1] = __byte_perm_S (in1[0], in1[1], 0x4321);
304 out0[2] = __byte_perm_S (in1[1], in1[2], 0x4321);
305 out0[3] = __byte_perm_S (in1[2], in1[3], 0x4321);
306 out1[0] = __byte_perm_S (in1[3], 0, 0x4321);
307 out1[1] = 0;
308 out1[2] = 0;
309 out1[3] = 0;
310 break;
311 case 14: out0[0] = __byte_perm_S (in0[3], in1[0], 0x5432);
312 out0[1] = __byte_perm_S (in1[0], in1[1], 0x5432);
313 out0[2] = __byte_perm_S (in1[1], in1[2], 0x5432);
314 out0[3] = __byte_perm_S (in1[2], in1[3], 0x5432);
315 out1[0] = __byte_perm_S (in1[3], 0, 0x5432);
316 out1[1] = 0;
317 out1[2] = 0;
318 out1[3] = 0;
319 break;
320 case 15: out0[0] = __byte_perm_S (in0[3], in1[0], 0x6543);
321 out0[1] = __byte_perm_S (in1[0], in1[1], 0x6543);
322 out0[2] = __byte_perm_S (in1[1], in1[2], 0x6543);
323 out0[3] = __byte_perm_S (in1[2], in1[3], 0x6543);
324 out1[0] = __byte_perm_S (in1[3], 0, 0x6543);
325 out1[1] = 0;
326 out1[2] = 0;
327 out1[3] = 0;
328 break;
329 case 16: out0[0] = in1[0];
330 out0[1] = in1[1];
331 out0[2] = in1[2];
332 out0[3] = in1[3];
333 out1[0] = 0;
334 out1[1] = 0;
335 out1[2] = 0;
336 out1[3] = 0;
337 break;
338 case 17: out0[0] = __byte_perm_S (in1[0], in1[1], 0x4321);
339 out0[1] = __byte_perm_S (in1[1], in1[2], 0x4321);
340 out0[2] = __byte_perm_S (in1[2], in1[3], 0x4321);
341 out0[3] = __byte_perm_S (in1[3], 0, 0x4321);
342 out1[0] = 0;
343 out1[1] = 0;
344 out1[2] = 0;
345 out1[3] = 0;
346 break;
347 case 18: out0[0] = __byte_perm_S (in1[0], in1[1], 0x5432);
348 out0[1] = __byte_perm_S (in1[1], in1[2], 0x5432);
349 out0[2] = __byte_perm_S (in1[2], in1[3], 0x5432);
350 out0[3] = __byte_perm_S (in1[3], 0, 0x5432);
351 out1[0] = 0;
352 out1[1] = 0;
353 out1[2] = 0;
354 out1[3] = 0;
355 break;
356 case 19: out0[0] = __byte_perm_S (in1[0], in1[1], 0x6543);
357 out0[1] = __byte_perm_S (in1[1], in1[2], 0x6543);
358 out0[2] = __byte_perm_S (in1[2], in1[3], 0x6543);
359 out0[3] = __byte_perm_S (in1[3], 0, 0x6543);
360 out1[0] = 0;
361 out1[1] = 0;
362 out1[2] = 0;
363 out1[3] = 0;
364 break;
365 case 20: out0[0] = in1[1];
366 out0[1] = in1[2];
367 out0[2] = in1[3];
368 out0[3] = 0;
369 out1[0] = 0;
370 out1[1] = 0;
371 out1[2] = 0;
372 out1[3] = 0;
373 break;
374 case 21: out0[0] = __byte_perm_S (in1[1], in1[2], 0x4321);
375 out0[1] = __byte_perm_S (in1[2], in1[3], 0x4321);
376 out0[2] = __byte_perm_S (in1[3], 0, 0x4321);
377 out0[3] = 0;
378 out1[0] = 0;
379 out1[1] = 0;
380 out1[2] = 0;
381 out1[3] = 0;
382 break;
383 case 22: out0[0] = __byte_perm_S (in1[1], in1[2], 0x5432);
384 out0[1] = __byte_perm_S (in1[2], in1[3], 0x5432);
385 out0[2] = __byte_perm_S (in1[3], 0, 0x5432);
386 out0[3] = 0;
387 out1[0] = 0;
388 out1[1] = 0;
389 out1[2] = 0;
390 out1[3] = 0;
391 break;
392 case 23: out0[0] = __byte_perm_S (in1[1], in1[2], 0x6543);
393 out0[1] = __byte_perm_S (in1[2], in1[3], 0x6543);
394 out0[2] = __byte_perm_S (in1[3], 0, 0x6543);
395 out0[3] = 0;
396 out1[0] = 0;
397 out1[1] = 0;
398 out1[2] = 0;
399 out1[3] = 0;
400 break;
401 case 24: out0[0] = in1[2];
402 out0[1] = in1[3];
403 out0[2] = 0;
404 out0[3] = 0;
405 out1[0] = 0;
406 out1[1] = 0;
407 out1[2] = 0;
408 out1[3] = 0;
409 break;
410 case 25: out0[0] = __byte_perm_S (in1[2], in1[3], 0x4321);
411 out0[1] = __byte_perm_S (in1[3], 0, 0x4321);
412 out0[2] = 0;
413 out0[3] = 0;
414 out1[0] = 0;
415 out1[1] = 0;
416 out1[2] = 0;
417 out1[3] = 0;
418 break;
419 case 26: out0[0] = __byte_perm_S (in1[2], in1[3], 0x5432);
420 out0[1] = __byte_perm_S (in1[3], 0, 0x5432);
421 out0[2] = 0;
422 out0[3] = 0;
423 out1[0] = 0;
424 out1[1] = 0;
425 out1[2] = 0;
426 out1[3] = 0;
427 break;
428 case 27: out0[0] = __byte_perm_S (in1[2], in1[3], 0x6543);
429 out0[1] = __byte_perm_S (in1[3], 0, 0x6543);
430 out0[2] = 0;
431 out0[3] = 0;
432 out1[0] = 0;
433 out1[1] = 0;
434 out1[2] = 0;
435 out1[3] = 0;
436 break;
437 case 28: out0[0] = in1[3];
438 out0[1] = 0;
439 out0[2] = 0;
440 out0[3] = 0;
441 out1[0] = 0;
442 out1[1] = 0;
443 out1[2] = 0;
444 out1[3] = 0;
445 break;
446 case 29: out0[0] = __byte_perm_S (in1[3], 0, 0x4321);
447 out0[1] = 0;
448 out0[2] = 0;
449 out0[3] = 0;
450 out1[0] = 0;
451 out1[1] = 0;
452 out1[2] = 0;
453 out1[3] = 0;
454 break;
455 case 30: out0[0] = __byte_perm_S (in1[3], 0, 0x5432);
456 out0[1] = 0;
457 out0[2] = 0;
458 out0[3] = 0;
459 out1[0] = 0;
460 out1[1] = 0;
461 out1[2] = 0;
462 out1[3] = 0;
463 break;
464 case 31: out0[0] = __byte_perm_S (in1[3], 0, 0x6543);
465 out0[1] = 0;
466 out0[2] = 0;
467 out0[3] = 0;
468 out1[0] = 0;
469 out1[1] = 0;
470 out1[2] = 0;
471 out1[3] = 0;
472 break;
473 }
474 #endif
475
476 #if defined IS_AMD || defined IS_GENERIC
477 switch (num)
478 {
479 case 0: out0[0] = in0[0];
480 out0[1] = in0[1];
481 out0[2] = in0[2];
482 out0[3] = in0[3];
483 out1[0] = in1[0];
484 out1[1] = in1[1];
485 out1[2] = in1[2];
486 out1[3] = in1[3];
487 break;
488 case 1: out0[0] = amd_bytealign_S (in0[1], in0[0], 1);
489 out0[1] = amd_bytealign_S (in0[2], in0[1], 1);
490 out0[2] = amd_bytealign_S (in0[3], in0[2], 1);
491 out0[3] = amd_bytealign_S (in1[0], in0[3], 1);
492 out1[0] = amd_bytealign_S (in1[1], in1[0], 1);
493 out1[1] = amd_bytealign_S (in1[2], in1[1], 1);
494 out1[2] = amd_bytealign_S (in1[3], in1[2], 1);
495 out1[3] = amd_bytealign_S ( 0, in1[3], 1);
496 break;
497 case 2: out0[0] = amd_bytealign_S (in0[1], in0[0], 2);
498 out0[1] = amd_bytealign_S (in0[2], in0[1], 2);
499 out0[2] = amd_bytealign_S (in0[3], in0[2], 2);
500 out0[3] = amd_bytealign_S (in1[0], in0[3], 2);
501 out1[0] = amd_bytealign_S (in1[1], in1[0], 2);
502 out1[1] = amd_bytealign_S (in1[2], in1[1], 2);
503 out1[2] = amd_bytealign_S (in1[3], in1[2], 2);
504 out1[3] = amd_bytealign_S ( 0, in1[3], 2);
505 break;
506 case 3: out0[0] = amd_bytealign_S (in0[1], in0[0], 3);
507 out0[1] = amd_bytealign_S (in0[2], in0[1], 3);
508 out0[2] = amd_bytealign_S (in0[3], in0[2], 3);
509 out0[3] = amd_bytealign_S (in1[0], in0[3], 3);
510 out1[0] = amd_bytealign_S (in1[1], in1[0], 3);
511 out1[1] = amd_bytealign_S (in1[2], in1[1], 3);
512 out1[2] = amd_bytealign_S (in1[3], in1[2], 3);
513 out1[3] = amd_bytealign_S ( 0, in1[3], 3);
514 break;
515 case 4: out0[0] = in0[1];
516 out0[1] = in0[2];
517 out0[2] = in0[3];
518 out0[3] = in1[0];
519 out1[0] = in1[1];
520 out1[1] = in1[2];
521 out1[2] = in1[3];
522 out1[3] = 0;
523 break;
524 case 5: out0[0] = amd_bytealign_S (in0[2], in0[1], 1);
525 out0[1] = amd_bytealign_S (in0[3], in0[2], 1);
526 out0[2] = amd_bytealign_S (in1[0], in0[3], 1);
527 out0[3] = amd_bytealign_S (in1[1], in1[0], 1);
528 out1[0] = amd_bytealign_S (in1[2], in1[1], 1);
529 out1[1] = amd_bytealign_S (in1[3], in1[2], 1);
530 out1[2] = amd_bytealign_S ( 0, in1[3], 1);
531 out1[3] = 0;
532 break;
533 case 6: out0[0] = amd_bytealign_S (in0[2], in0[1], 2);
534 out0[1] = amd_bytealign_S (in0[3], in0[2], 2);
535 out0[2] = amd_bytealign_S (in1[0], in0[3], 2);
536 out0[3] = amd_bytealign_S (in1[1], in1[0], 2);
537 out1[0] = amd_bytealign_S (in1[2], in1[1], 2);
538 out1[1] = amd_bytealign_S (in1[3], in1[2], 2);
539 out1[2] = amd_bytealign_S ( 0, in1[3], 2);
540 out1[3] = 0;
541 break;
542 case 7: out0[0] = amd_bytealign_S (in0[2], in0[1], 3);
543 out0[1] = amd_bytealign_S (in0[3], in0[2], 3);
544 out0[2] = amd_bytealign_S (in1[0], in0[3], 3);
545 out0[3] = amd_bytealign_S (in1[1], in1[0], 3);
546 out1[0] = amd_bytealign_S (in1[2], in1[1], 3);
547 out1[1] = amd_bytealign_S (in1[3], in1[2], 3);
548 out1[2] = amd_bytealign_S ( 0, in1[3], 3);
549 out1[3] = 0;
550 break;
551 case 8: out0[0] = in0[2];
552 out0[1] = in0[3];
553 out0[2] = in1[0];
554 out0[3] = in1[1];
555 out1[0] = in1[2];
556 out1[1] = in1[3];
557 out1[2] = 0;
558 out1[3] = 0;
559 break;
560 case 9: out0[0] = amd_bytealign_S (in0[3], in0[2], 1);
561 out0[1] = amd_bytealign_S (in1[0], in0[3], 1);
562 out0[2] = amd_bytealign_S (in1[1], in1[0], 1);
563 out0[3] = amd_bytealign_S (in1[2], in1[1], 1);
564 out1[0] = amd_bytealign_S (in1[3], in1[2], 1);
565 out1[1] = amd_bytealign_S ( 0, in1[3], 1);
566 out1[2] = 0;
567 out1[3] = 0;
568 break;
569 case 10: out0[0] = amd_bytealign_S (in0[3], in0[2], 2);
570 out0[1] = amd_bytealign_S (in1[0], in0[3], 2);
571 out0[2] = amd_bytealign_S (in1[1], in1[0], 2);
572 out0[3] = amd_bytealign_S (in1[2], in1[1], 2);
573 out1[0] = amd_bytealign_S (in1[3], in1[2], 2);
574 out1[1] = amd_bytealign_S ( 0, in1[3], 2);
575 out1[2] = 0;
576 out1[3] = 0;
577 break;
578 case 11: out0[0] = amd_bytealign_S (in0[3], in0[2], 3);
579 out0[1] = amd_bytealign_S (in1[0], in0[3], 3);
580 out0[2] = amd_bytealign_S (in1[1], in1[0], 3);
581 out0[3] = amd_bytealign_S (in1[2], in1[1], 3);
582 out1[0] = amd_bytealign_S (in1[3], in1[2], 3);
583 out1[1] = amd_bytealign_S ( 0, in1[3], 3);
584 out1[2] = 0;
585 out1[3] = 0;
586 break;
587 case 12: out0[0] = in0[3];
588 out0[1] = in1[0];
589 out0[2] = in1[1];
590 out0[3] = in1[2];
591 out1[0] = in1[3];
592 out1[1] = 0;
593 out1[2] = 0;
594 out1[3] = 0;
595 break;
596 case 13: out0[0] = amd_bytealign_S (in1[0], in0[3], 1);
597 out0[1] = amd_bytealign_S (in1[1], in1[0], 1);
598 out0[2] = amd_bytealign_S (in1[2], in1[1], 1);
599 out0[3] = amd_bytealign_S (in1[3], in1[2], 1);
600 out1[0] = amd_bytealign_S ( 0, in1[3], 1);
601 out1[1] = 0;
602 out1[2] = 0;
603 out1[3] = 0;
604 break;
605 case 14: out0[0] = amd_bytealign_S (in1[0], in0[3], 2);
606 out0[1] = amd_bytealign_S (in1[1], in1[0], 2);
607 out0[2] = amd_bytealign_S (in1[2], in1[1], 2);
608 out0[3] = amd_bytealign_S (in1[3], in1[2], 2);
609 out1[0] = amd_bytealign_S ( 0, in1[3], 2);
610 out1[1] = 0;
611 out1[2] = 0;
612 out1[3] = 0;
613 break;
614 case 15: out0[0] = amd_bytealign_S (in1[0], in0[3], 3);
615 out0[1] = amd_bytealign_S (in1[1], in1[0], 3);
616 out0[2] = amd_bytealign_S (in1[2], in1[1], 3);
617 out0[3] = amd_bytealign_S (in1[3], in1[2], 3);
618 out1[0] = amd_bytealign_S ( 0, in1[3], 3);
619 out1[1] = 0;
620 out1[2] = 0;
621 out1[3] = 0;
622 break;
623 case 16: out0[0] = in1[0];
624 out0[1] = in1[1];
625 out0[2] = in1[2];
626 out0[3] = in1[3];
627 out1[0] = 0;
628 out1[1] = 0;
629 out1[2] = 0;
630 out1[3] = 0;
631 break;
632 case 17: out0[0] = amd_bytealign_S (in1[1], in1[0], 1);
633 out0[1] = amd_bytealign_S (in1[2], in1[1], 1);
634 out0[2] = amd_bytealign_S (in1[3], in1[2], 1);
635 out0[3] = amd_bytealign_S ( 0, in1[3], 1);
636 out1[0] = 0;
637 out1[1] = 0;
638 out1[2] = 0;
639 out1[3] = 0;
640 break;
641 case 18: out0[0] = amd_bytealign_S (in1[1], in1[0], 2);
642 out0[1] = amd_bytealign_S (in1[2], in1[1], 2);
643 out0[2] = amd_bytealign_S (in1[3], in1[2], 2);
644 out0[3] = amd_bytealign_S ( 0, in1[3], 2);
645 out1[0] = 0;
646 out1[1] = 0;
647 out1[2] = 0;
648 out1[3] = 0;
649 break;
650 case 19: out0[0] = amd_bytealign_S (in1[1], in1[0], 3);
651 out0[1] = amd_bytealign_S (in1[2], in1[1], 3);
652 out0[2] = amd_bytealign_S (in1[3], in1[2], 3);
653 out0[3] = amd_bytealign_S ( 0, in1[3], 3);
654 out1[0] = 0;
655 out1[1] = 0;
656 out1[2] = 0;
657 out1[3] = 0;
658 break;
659 case 20: out0[0] = in1[1];
660 out0[1] = in1[2];
661 out0[2] = in1[3];
662 out0[3] = 0;
663 out1[0] = 0;
664 out1[1] = 0;
665 out1[2] = 0;
666 out1[3] = 0;
667 break;
668 case 21: out0[0] = amd_bytealign_S (in1[2], in1[1], 1);
669 out0[1] = amd_bytealign_S (in1[3], in1[2], 1);
670 out0[2] = amd_bytealign_S ( 0, in1[3], 1);
671 out0[3] = 0;
672 out1[0] = 0;
673 out1[1] = 0;
674 out1[2] = 0;
675 out1[3] = 0;
676 break;
677 case 22: out0[0] = amd_bytealign_S (in1[2], in1[1], 2);
678 out0[1] = amd_bytealign_S (in1[3], in1[2], 2);
679 out0[2] = amd_bytealign_S ( 0, in1[3], 2);
680 out0[3] = 0;
681 out1[0] = 0;
682 out1[1] = 0;
683 out1[2] = 0;
684 out1[3] = 0;
685 break;
686 case 23: out0[0] = amd_bytealign_S (in1[2], in1[1], 3);
687 out0[1] = amd_bytealign_S (in1[3], in1[2], 3);
688 out0[2] = amd_bytealign_S ( 0, in1[3], 3);
689 out0[3] = 0;
690 out1[0] = 0;
691 out1[1] = 0;
692 out1[2] = 0;
693 out1[3] = 0;
694 break;
695 case 24: out0[0] = in1[2];
696 out0[1] = in1[3];
697 out0[2] = 0;
698 out0[3] = 0;
699 out1[0] = 0;
700 out1[1] = 0;
701 out1[2] = 0;
702 out1[3] = 0;
703 break;
704 case 25: out0[0] = amd_bytealign_S (in1[3], in1[2], 1);
705 out0[1] = amd_bytealign_S ( 0, in1[3], 1);
706 out0[2] = 0;
707 out0[3] = 0;
708 out1[0] = 0;
709 out1[1] = 0;
710 out1[2] = 0;
711 out1[3] = 0;
712 break;
713 case 26: out0[0] = amd_bytealign_S (in1[3], in1[2], 2);
714 out0[1] = amd_bytealign_S ( 0, in1[3], 2);
715 out0[2] = 0;
716 out0[3] = 0;
717 out1[0] = 0;
718 out1[1] = 0;
719 out1[2] = 0;
720 out1[3] = 0;
721 break;
722 case 27: out0[0] = amd_bytealign_S (in1[3], in1[2], 3);
723 out0[1] = amd_bytealign_S ( 0, in1[3], 3);
724 out0[2] = 0;
725 out0[3] = 0;
726 out1[0] = 0;
727 out1[1] = 0;
728 out1[2] = 0;
729 out1[3] = 0;
730 break;
731 case 28: out0[0] = in1[3];
732 out0[1] = 0;
733 out0[2] = 0;
734 out0[3] = 0;
735 out1[0] = 0;
736 out1[1] = 0;
737 out1[2] = 0;
738 out1[3] = 0;
739 break;
740 case 29: out0[0] = amd_bytealign_S ( 0, in1[3], 1);
741 out0[1] = 0;
742 out0[2] = 0;
743 out0[3] = 0;
744 out1[0] = 0;
745 out1[1] = 0;
746 out1[2] = 0;
747 out1[3] = 0;
748 break;
749 case 30: out0[0] = amd_bytealign_S ( 0, in1[3], 2);
750 out0[1] = 0;
751 out0[2] = 0;
752 out0[3] = 0;
753 out1[0] = 0;
754 out1[1] = 0;
755 out1[2] = 0;
756 out1[3] = 0;
757 break;
758 case 31: out0[0] = amd_bytealign_S ( 0, in1[3], 3);
759 out0[1] = 0;
760 out0[2] = 0;
761 out0[3] = 0;
762 out1[0] = 0;
763 out1[1] = 0;
764 out1[2] = 0;
765 out1[3] = 0;
766 break;
767 }
768 #endif
769 }
770
771 inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num)
772 {
773 #ifdef IS_NV
774 switch (num)
775 {
776 case 0: out1[3] = in1[3];
777 out1[2] = in1[2];
778 out1[1] = in1[1];
779 out1[0] = in1[0];
780 out0[3] = in0[3];
781 out0[2] = in0[2];
782 out0[1] = in0[1];
783 out0[0] = in0[0];
784 break;
785 case 1: out1[3] = __byte_perm_S (in1[2], in1[3], 0x6543);
786 out1[2] = __byte_perm_S (in1[1], in1[2], 0x6543);
787 out1[1] = __byte_perm_S (in1[0], in1[1], 0x6543);
788 out1[0] = __byte_perm_S (in0[3], in1[0], 0x6543);
789 out0[3] = __byte_perm_S (in0[2], in0[3], 0x6543);
790 out0[2] = __byte_perm_S (in0[1], in0[2], 0x6543);
791 out0[1] = __byte_perm_S (in0[0], in0[1], 0x6543);
792 out0[0] = __byte_perm_S ( 0, in0[0], 0x6543);
793 break;
794 case 2: out1[3] = __byte_perm_S (in1[2], in1[3], 0x5432);
795 out1[2] = __byte_perm_S (in1[1], in1[2], 0x5432);
796 out1[1] = __byte_perm_S (in1[0], in1[1], 0x5432);
797 out1[0] = __byte_perm_S (in0[3], in1[0], 0x5432);
798 out0[3] = __byte_perm_S (in0[2], in0[3], 0x5432);
799 out0[2] = __byte_perm_S (in0[1], in0[2], 0x5432);
800 out0[1] = __byte_perm_S (in0[0], in0[1], 0x5432);
801 out0[0] = __byte_perm_S ( 0, in0[0], 0x5432);
802 break;
803 case 3: out1[3] = __byte_perm_S (in1[2], in1[3], 0x4321);
804 out1[2] = __byte_perm_S (in1[1], in1[2], 0x4321);
805 out1[1] = __byte_perm_S (in1[0], in1[1], 0x4321);
806 out1[0] = __byte_perm_S (in0[3], in1[0], 0x4321);
807 out0[3] = __byte_perm_S (in0[2], in0[3], 0x4321);
808 out0[2] = __byte_perm_S (in0[1], in0[2], 0x4321);
809 out0[1] = __byte_perm_S (in0[0], in0[1], 0x4321);
810 out0[0] = __byte_perm_S ( 0, in0[0], 0x4321);
811 break;
812 case 4: out1[3] = in1[2];
813 out1[2] = in1[1];
814 out1[1] = in1[0];
815 out1[0] = in0[3];
816 out0[3] = in0[2];
817 out0[2] = in0[1];
818 out0[1] = in0[0];
819 out0[0] = 0;
820 break;
821 case 5: out1[3] = __byte_perm_S (in1[1], in1[2], 0x6543);
822 out1[2] = __byte_perm_S (in1[0], in1[1], 0x6543);
823 out1[1] = __byte_perm_S (in0[3], in1[0], 0x6543);
824 out1[0] = __byte_perm_S (in0[2], in0[3], 0x6543);
825 out0[3] = __byte_perm_S (in0[1], in0[2], 0x6543);
826 out0[2] = __byte_perm_S (in0[0], in0[1], 0x6543);
827 out0[1] = __byte_perm_S ( 0, in0[0], 0x6543);
828 out0[0] = 0;
829 break;
830 case 6: out1[3] = __byte_perm_S (in1[1], in1[2], 0x5432);
831 out1[2] = __byte_perm_S (in1[0], in1[1], 0x5432);
832 out1[1] = __byte_perm_S (in0[3], in1[0], 0x5432);
833 out1[0] = __byte_perm_S (in0[2], in0[3], 0x5432);
834 out0[3] = __byte_perm_S (in0[1], in0[2], 0x5432);
835 out0[2] = __byte_perm_S (in0[0], in0[1], 0x5432);
836 out0[1] = __byte_perm_S ( 0, in0[0], 0x5432);
837 out0[0] = 0;
838 break;
839 case 7: out1[3] = __byte_perm_S (in1[1], in1[2], 0x4321);
840 out1[2] = __byte_perm_S (in1[0], in1[1], 0x4321);
841 out1[1] = __byte_perm_S (in0[3], in1[0], 0x4321);
842 out1[0] = __byte_perm_S (in0[2], in0[3], 0x4321);
843 out0[3] = __byte_perm_S (in0[1], in0[2], 0x4321);
844 out0[2] = __byte_perm_S (in0[0], in0[1], 0x4321);
845 out0[1] = __byte_perm_S ( 0, in0[0], 0x4321);
846 out0[0] = 0;
847 break;
848 case 8: out1[3] = in1[1];
849 out1[2] = in1[0];
850 out1[1] = in0[3];
851 out1[0] = in0[2];
852 out0[3] = in0[1];
853 out0[2] = in0[0];
854 out0[1] = 0;
855 out0[0] = 0;
856 break;
857 case 9: out1[3] = __byte_perm_S (in1[0], in1[1], 0x6543);
858 out1[2] = __byte_perm_S (in0[3], in1[0], 0x6543);
859 out1[1] = __byte_perm_S (in0[2], in0[3], 0x6543);
860 out1[0] = __byte_perm_S (in0[1], in0[2], 0x6543);
861 out0[3] = __byte_perm_S (in0[0], in0[1], 0x6543);
862 out0[2] = __byte_perm_S ( 0, in0[0], 0x6543);
863 out0[1] = 0;
864 out0[0] = 0;
865 break;
866 case 10: out1[3] = __byte_perm_S (in1[0], in1[1], 0x5432);
867 out1[2] = __byte_perm_S (in0[3], in1[0], 0x5432);
868 out1[1] = __byte_perm_S (in0[2], in0[3], 0x5432);
869 out1[0] = __byte_perm_S (in0[1], in0[2], 0x5432);
870 out0[3] = __byte_perm_S (in0[0], in0[1], 0x5432);
871 out0[2] = __byte_perm_S ( 0, in0[0], 0x5432);
872 out0[1] = 0;
873 out0[0] = 0;
874 break;
875 case 11: out1[3] = __byte_perm_S (in1[0], in1[1], 0x4321);
876 out1[2] = __byte_perm_S (in0[3], in1[0], 0x4321);
877 out1[1] = __byte_perm_S (in0[2], in0[3], 0x4321);
878 out1[0] = __byte_perm_S (in0[1], in0[2], 0x4321);
879 out0[3] = __byte_perm_S (in0[0], in0[1], 0x4321);
880 out0[2] = __byte_perm_S ( 0, in0[0], 0x4321);
881 out0[1] = 0;
882 out0[0] = 0;
883 break;
884 case 12: out1[3] = in1[0];
885 out1[2] = in0[3];
886 out1[1] = in0[2];
887 out1[0] = in0[1];
888 out0[3] = in0[0];
889 out0[2] = 0;
890 out0[1] = 0;
891 out0[0] = 0;
892 break;
893 case 13: out1[3] = __byte_perm_S (in0[3], in1[0], 0x6543);
894 out1[2] = __byte_perm_S (in0[2], in0[3], 0x6543);
895 out1[1] = __byte_perm_S (in0[1], in0[2], 0x6543);
896 out1[0] = __byte_perm_S (in0[0], in0[1], 0x6543);
897 out0[3] = __byte_perm_S ( 0, in0[0], 0x6543);
898 out0[2] = 0;
899 out0[1] = 0;
900 out0[0] = 0;
901 break;
902 case 14: out1[3] = __byte_perm_S (in0[3], in1[0], 0x5432);
903 out1[2] = __byte_perm_S (in0[2], in0[3], 0x5432);
904 out1[1] = __byte_perm_S (in0[1], in0[2], 0x5432);
905 out1[0] = __byte_perm_S (in0[0], in0[1], 0x5432);
906 out0[3] = __byte_perm_S ( 0, in0[0], 0x5432);
907 out0[2] = 0;
908 out0[1] = 0;
909 out0[0] = 0;
910 break;
911 case 15: out1[3] = __byte_perm_S (in0[3], in1[0], 0x4321);
912 out1[2] = __byte_perm_S (in0[2], in0[3], 0x4321);
913 out1[1] = __byte_perm_S (in0[1], in0[2], 0x4321);
914 out1[0] = __byte_perm_S (in0[0], in0[1], 0x4321);
915 out0[3] = __byte_perm_S ( 0, in0[0], 0x4321);
916 out0[2] = 0;
917 out0[1] = 0;
918 out0[0] = 0;
919 break;
920 case 16: out1[3] = in0[3];
921 out1[2] = in0[2];
922 out1[1] = in0[1];
923 out1[0] = in0[0];
924 out0[3] = 0;
925 out0[2] = 0;
926 out0[1] = 0;
927 out0[0] = 0;
928 break;
929 case 17: out1[3] = __byte_perm_S (in0[2], in0[3], 0x6543);
930 out1[2] = __byte_perm_S (in0[1], in0[2], 0x6543);
931 out1[1] = __byte_perm_S (in0[0], in0[1], 0x6543);
932 out1[0] = __byte_perm_S ( 0, in0[0], 0x6543);
933 out0[3] = 0;
934 out0[2] = 0;
935 out0[1] = 0;
936 out0[0] = 0;
937 break;
938 case 18: out1[3] = __byte_perm_S (in0[2], in0[3], 0x5432);
939 out1[2] = __byte_perm_S (in0[1], in0[2], 0x5432);
940 out1[1] = __byte_perm_S (in0[0], in0[1], 0x5432);
941 out1[0] = __byte_perm_S ( 0, in0[0], 0x5432);
942 out0[3] = 0;
943 out0[2] = 0;
944 out0[1] = 0;
945 out0[0] = 0;
946 break;
947 case 19: out1[3] = __byte_perm_S (in0[2], in0[3], 0x4321);
948 out1[2] = __byte_perm_S (in0[1], in0[2], 0x4321);
949 out1[1] = __byte_perm_S (in0[0], in0[1], 0x4321);
950 out1[0] = __byte_perm_S ( 0, in0[0], 0x4321);
951 out0[3] = 0;
952 out0[2] = 0;
953 out0[1] = 0;
954 out0[0] = 0;
955 break;
956 case 20: out1[3] = in0[2];
957 out1[2] = in0[1];
958 out1[1] = in0[0];
959 out1[0] = 0;
960 out0[3] = 0;
961 out0[2] = 0;
962 out0[1] = 0;
963 out0[0] = 0;
964 break;
965 case 21: out1[3] = __byte_perm_S (in0[1], in0[2], 0x6543);
966 out1[2] = __byte_perm_S (in0[0], in0[1], 0x6543);
967 out1[1] = __byte_perm_S ( 0, in0[0], 0x6543);
968 out1[0] = 0;
969 out0[3] = 0;
970 out0[2] = 0;
971 out0[1] = 0;
972 out0[0] = 0;
973 break;
974 case 22: out1[3] = __byte_perm_S (in0[1], in0[2], 0x5432);
975 out1[2] = __byte_perm_S (in0[0], in0[1], 0x5432);
976 out1[1] = __byte_perm_S ( 0, in0[0], 0x5432);
977 out1[0] = 0;
978 out0[3] = 0;
979 out0[2] = 0;
980 out0[1] = 0;
981 out0[0] = 0;
982 break;
983 case 23: out1[3] = __byte_perm_S (in0[1], in0[2], 0x4321);
984 out1[2] = __byte_perm_S (in0[0], in0[1], 0x4321);
985 out1[1] = __byte_perm_S ( 0, in0[0], 0x4321);
986 out1[0] = 0;
987 out0[3] = 0;
988 out0[2] = 0;
989 out0[1] = 0;
990 out0[0] = 0;
991 break;
992 case 24: out1[3] = in0[1];
993 out1[2] = in0[0];
994 out1[1] = 0;
995 out1[0] = 0;
996 out0[3] = 0;
997 out0[2] = 0;
998 out0[1] = 0;
999 out0[0] = 0;
1000 break;
1001 case 25: out1[3] = __byte_perm_S (in0[0], in0[1], 0x6543);
1002 out1[2] = __byte_perm_S ( 0, in0[0], 0x6543);
1003 out1[1] = 0;
1004 out1[0] = 0;
1005 out0[3] = 0;
1006 out0[2] = 0;
1007 out0[1] = 0;
1008 out0[0] = 0;
1009 break;
1010 case 26: out1[3] = __byte_perm_S (in0[0], in0[1], 0x5432);
1011 out1[2] = __byte_perm_S ( 0, in0[0], 0x5432);
1012 out1[1] = 0;
1013 out1[0] = 0;
1014 out0[3] = 0;
1015 out0[2] = 0;
1016 out0[1] = 0;
1017 out0[0] = 0;
1018 break;
1019 case 27: out1[3] = __byte_perm_S (in0[0], in0[1], 0x4321);
1020 out1[2] = __byte_perm_S ( 0, in0[0], 0x4321);
1021 out1[1] = 0;
1022 out1[0] = 0;
1023 out0[3] = 0;
1024 out0[2] = 0;
1025 out0[1] = 0;
1026 out0[0] = 0;
1027 break;
1028 case 28: out1[3] = in0[0];
1029 out1[2] = 0;
1030 out1[1] = 0;
1031 out1[0] = 0;
1032 out0[3] = 0;
1033 out0[2] = 0;
1034 out0[1] = 0;
1035 out0[0] = 0;
1036 break;
1037 case 29: out1[3] = __byte_perm_S ( 0, in0[0], 0x6543);
1038 out1[2] = 0;
1039 out1[1] = 0;
1040 out1[0] = 0;
1041 out0[3] = 0;
1042 out0[2] = 0;
1043 out0[1] = 0;
1044 out0[0] = 0;
1045 break;
1046 case 30: out1[3] = __byte_perm_S ( 0, in0[0], 0x5432);
1047 out1[2] = 0;
1048 out1[1] = 0;
1049 out1[0] = 0;
1050 out0[3] = 0;
1051 out0[2] = 0;
1052 out0[1] = 0;
1053 out0[0] = 0;
1054 break;
1055 case 31: out1[3] = __byte_perm_S ( 0, in0[0], 0x4321);
1056 out1[2] = 0;
1057 out1[1] = 0;
1058 out1[0] = 0;
1059 out0[3] = 0;
1060 out0[2] = 0;
1061 out0[1] = 0;
1062 out0[0] = 0;
1063 break;
1064 }
1065 #endif
1066
1067 #if defined IS_AMD || defined IS_GENERIC
1068 switch (num)
1069 {
1070 case 0: out1[3] = in1[3];
1071 out1[2] = in1[2];
1072 out1[1] = in1[1];
1073 out1[0] = in1[0];
1074 out0[3] = in0[3];
1075 out0[2] = in0[2];
1076 out0[1] = in0[1];
1077 out0[0] = in0[0];
1078 break;
1079 case 1: out1[3] = amd_bytealign_S (in1[3], in1[2], 3);
1080 out1[2] = amd_bytealign_S (in1[2], in1[1], 3);
1081 out1[1] = amd_bytealign_S (in1[1], in1[0], 3);
1082 out1[0] = amd_bytealign_S (in1[0], in0[3], 3);
1083 out0[3] = amd_bytealign_S (in0[3], in0[2], 3);
1084 out0[2] = amd_bytealign_S (in0[2], in0[1], 3);
1085 out0[1] = amd_bytealign_S (in0[1], in0[0], 3);
1086 out0[0] = amd_bytealign_S (in0[0], 0, 3);
1087 break;
1088 case 2: out1[3] = amd_bytealign_S (in1[3], in1[2], 2);
1089 out1[2] = amd_bytealign_S (in1[2], in1[1], 2);
1090 out1[1] = amd_bytealign_S (in1[1], in1[0], 2);
1091 out1[0] = amd_bytealign_S (in1[0], in0[3], 2);
1092 out0[3] = amd_bytealign_S (in0[3], in0[2], 2);
1093 out0[2] = amd_bytealign_S (in0[2], in0[1], 2);
1094 out0[1] = amd_bytealign_S (in0[1], in0[0], 2);
1095 out0[0] = amd_bytealign_S (in0[0], 0, 2);
1096 break;
1097 case 3: out1[3] = amd_bytealign_S (in1[3], in1[2], 1);
1098 out1[2] = amd_bytealign_S (in1[2], in1[1], 1);
1099 out1[1] = amd_bytealign_S (in1[1], in1[0], 1);
1100 out1[0] = amd_bytealign_S (in1[0], in0[3], 1);
1101 out0[3] = amd_bytealign_S (in0[3], in0[2], 1);
1102 out0[2] = amd_bytealign_S (in0[2], in0[1], 1);
1103 out0[1] = amd_bytealign_S (in0[1], in0[0], 1);
1104 out0[0] = amd_bytealign_S (in0[0], 0, 1);
1105 break;
1106 case 4: out1[3] = in1[2];
1107 out1[2] = in1[1];
1108 out1[1] = in1[0];
1109 out1[0] = in0[3];
1110 out0[3] = in0[2];
1111 out0[2] = in0[1];
1112 out0[1] = in0[0];
1113 out0[0] = 0;
1114 break;
1115 case 5: out1[3] = amd_bytealign_S (in1[2], in1[1], 3);
1116 out1[2] = amd_bytealign_S (in1[1], in1[0], 3);
1117 out1[1] = amd_bytealign_S (in1[0], in0[3], 3);
1118 out1[0] = amd_bytealign_S (in0[3], in0[2], 3);
1119 out0[3] = amd_bytealign_S (in0[2], in0[1], 3);
1120 out0[2] = amd_bytealign_S (in0[1], in0[0], 3);
1121 out0[1] = amd_bytealign_S (in0[0], 0, 3);
1122 out0[0] = 0;
1123 break;
1124 case 6: out1[3] = amd_bytealign_S (in1[2], in1[1], 2);
1125 out1[2] = amd_bytealign_S (in1[1], in1[0], 2);
1126 out1[1] = amd_bytealign_S (in1[0], in0[3], 2);
1127 out1[0] = amd_bytealign_S (in0[3], in0[2], 2);
1128 out0[3] = amd_bytealign_S (in0[2], in0[1], 2);
1129 out0[2] = amd_bytealign_S (in0[1], in0[0], 2);
1130 out0[1] = amd_bytealign_S (in0[0], 0, 2);
1131 out0[0] = 0;
1132 break;
1133 case 7: out1[3] = amd_bytealign_S (in1[2], in1[1], 1);
1134 out1[2] = amd_bytealign_S (in1[1], in1[0], 1);
1135 out1[1] = amd_bytealign_S (in1[0], in0[3], 1);
1136 out1[0] = amd_bytealign_S (in0[3], in0[2], 1);
1137 out0[3] = amd_bytealign_S (in0[2], in0[1], 1);
1138 out0[2] = amd_bytealign_S (in0[1], in0[0], 1);
1139 out0[1] = amd_bytealign_S (in0[0], 0, 1);
1140 out0[0] = 0;
1141 break;
1142 case 8: out1[3] = in1[1];
1143 out1[2] = in1[0];
1144 out1[1] = in0[3];
1145 out1[0] = in0[2];
1146 out0[3] = in0[1];
1147 out0[2] = in0[0];
1148 out0[1] = 0;
1149 out0[0] = 0;
1150 break;
1151 case 9: out1[3] = amd_bytealign_S (in1[1], in1[0], 3);
1152 out1[2] = amd_bytealign_S (in1[0], in0[3], 3);
1153 out1[1] = amd_bytealign_S (in0[3], in0[2], 3);
1154 out1[0] = amd_bytealign_S (in0[2], in0[1], 3);
1155 out0[3] = amd_bytealign_S (in0[1], in0[0], 3);
1156 out0[2] = amd_bytealign_S (in0[0], 0, 3);
1157 out0[1] = 0;
1158 out0[0] = 0;
1159 break;
1160 case 10: out1[3] = amd_bytealign_S (in1[1], in1[0], 2);
1161 out1[2] = amd_bytealign_S (in1[0], in0[3], 2);
1162 out1[1] = amd_bytealign_S (in0[3], in0[2], 2);
1163 out1[0] = amd_bytealign_S (in0[2], in0[1], 2);
1164 out0[3] = amd_bytealign_S (in0[1], in0[0], 2);
1165 out0[2] = amd_bytealign_S (in0[0], 0, 2);
1166 out0[1] = 0;
1167 out0[0] = 0;
1168 break;
1169 case 11: out1[3] = amd_bytealign_S (in1[1], in1[0], 1);
1170 out1[2] = amd_bytealign_S (in1[0], in0[3], 1);
1171 out1[1] = amd_bytealign_S (in0[3], in0[2], 1);
1172 out1[0] = amd_bytealign_S (in0[2], in0[1], 1);
1173 out0[3] = amd_bytealign_S (in0[1], in0[0], 1);
1174 out0[2] = amd_bytealign_S (in0[0], 0, 1);
1175 out0[1] = 0;
1176 out0[0] = 0;
1177 break;
1178 case 12: out1[3] = in1[0];
1179 out1[2] = in0[3];
1180 out1[1] = in0[2];
1181 out1[0] = in0[1];
1182 out0[3] = in0[0];
1183 out0[2] = 0;
1184 out0[1] = 0;
1185 out0[0] = 0;
1186 break;
1187 case 13: out1[3] = amd_bytealign_S (in1[0], in0[3], 3);
1188 out1[2] = amd_bytealign_S (in0[3], in0[2], 3);
1189 out1[1] = amd_bytealign_S (in0[2], in0[1], 3);
1190 out1[0] = amd_bytealign_S (in0[1], in0[0], 3);
1191 out0[3] = amd_bytealign_S (in0[0], 0, 3);
1192 out0[2] = 0;
1193 out0[1] = 0;
1194 out0[0] = 0;
1195 break;
1196 case 14: out1[3] = amd_bytealign_S (in1[0], in0[3], 2);
1197 out1[2] = amd_bytealign_S (in0[3], in0[2], 2);
1198 out1[1] = amd_bytealign_S (in0[2], in0[1], 2);
1199 out1[0] = amd_bytealign_S (in0[1], in0[0], 2);
1200 out0[3] = amd_bytealign_S (in0[0], 0, 2);
1201 out0[2] = 0;
1202 out0[1] = 0;
1203 out0[0] = 0;
1204 break;
1205 case 15: out1[3] = amd_bytealign_S (in1[0], in0[3], 1);
1206 out1[2] = amd_bytealign_S (in0[3], in0[2], 1);
1207 out1[1] = amd_bytealign_S (in0[2], in0[1], 1);
1208 out1[0] = amd_bytealign_S (in0[1], in0[0], 1);
1209 out0[3] = amd_bytealign_S (in0[0], 0, 1);
1210 out0[2] = 0;
1211 out0[1] = 0;
1212 out0[0] = 0;
1213 break;
1214 case 16: out1[3] = in0[3];
1215 out1[2] = in0[2];
1216 out1[1] = in0[1];
1217 out1[0] = in0[0];
1218 out0[3] = 0;
1219 out0[2] = 0;
1220 out0[1] = 0;
1221 out0[0] = 0;
1222 break;
1223 case 17: out1[3] = amd_bytealign_S (in0[3], in0[2], 3);
1224 out1[2] = amd_bytealign_S (in0[2], in0[1], 3);
1225 out1[1] = amd_bytealign_S (in0[1], in0[0], 3);
1226 out1[0] = amd_bytealign_S (in0[0], 0, 3);
1227 out0[3] = 0;
1228 out0[2] = 0;
1229 out0[1] = 0;
1230 out0[0] = 0;
1231 break;
1232 case 18: out1[3] = amd_bytealign_S (in0[3], in0[2], 2);
1233 out1[2] = amd_bytealign_S (in0[2], in0[1], 2);
1234 out1[1] = amd_bytealign_S (in0[1], in0[0], 2);
1235 out1[0] = amd_bytealign_S (in0[0], 0, 2);
1236 out0[3] = 0;
1237 out0[2] = 0;
1238 out0[1] = 0;
1239 out0[0] = 0;
1240 break;
1241 case 19: out1[3] = amd_bytealign_S (in0[3], in0[2], 1);
1242 out1[2] = amd_bytealign_S (in0[2], in0[1], 1);
1243 out1[1] = amd_bytealign_S (in0[1], in0[0], 1);
1244 out1[0] = amd_bytealign_S (in0[0], 0, 1);
1245 out0[3] = 0;
1246 out0[2] = 0;
1247 out0[1] = 0;
1248 out0[0] = 0;
1249 break;
1250 case 20: out1[3] = in0[2];
1251 out1[2] = in0[1];
1252 out1[1] = in0[0];
1253 out1[0] = 0;
1254 out0[3] = 0;
1255 out0[2] = 0;
1256 out0[1] = 0;
1257 out0[0] = 0;
1258 break;
1259 case 21: out1[3] = amd_bytealign_S (in0[2], in0[1], 3);
1260 out1[2] = amd_bytealign_S (in0[1], in0[0], 3);
1261 out1[1] = amd_bytealign_S (in0[0], 0, 3);
1262 out1[0] = 0;
1263 out0[3] = 0;
1264 out0[2] = 0;
1265 out0[1] = 0;
1266 out0[0] = 0;
1267 break;
1268 case 22: out1[3] = amd_bytealign_S (in0[2], in0[1], 2);
1269 out1[2] = amd_bytealign_S (in0[1], in0[0], 2);
1270 out1[1] = amd_bytealign_S (in0[0], 0, 2);
1271 out1[0] = 0;
1272 out0[3] = 0;
1273 out0[2] = 0;
1274 out0[1] = 0;
1275 out0[0] = 0;
1276 break;
1277 case 23: out1[3] = amd_bytealign_S (in0[2], in0[1], 1);
1278 out1[2] = amd_bytealign_S (in0[1], in0[0], 1);
1279 out1[1] = amd_bytealign_S (in0[0], 0, 1);
1280 out1[0] = 0;
1281 out0[3] = 0;
1282 out0[2] = 0;
1283 out0[1] = 0;
1284 out0[0] = 0;
1285 break;
1286 case 24: out1[3] = in0[1];
1287 out1[2] = in0[0];
1288 out1[1] = 0;
1289 out1[0] = 0;
1290 out0[3] = 0;
1291 out0[2] = 0;
1292 out0[1] = 0;
1293 out0[0] = 0;
1294 break;
1295 case 25: out1[3] = amd_bytealign_S (in0[1], in0[0], 3);
1296 out1[2] = amd_bytealign_S (in0[0], 0, 3);
1297 out1[1] = 0;
1298 out1[0] = 0;
1299 out0[3] = 0;
1300 out0[2] = 0;
1301 out0[1] = 0;
1302 out0[0] = 0;
1303 break;
1304 case 26: out1[3] = amd_bytealign_S (in0[1], in0[0], 2);
1305 out1[2] = amd_bytealign_S (in0[0], 0, 2);
1306 out1[1] = 0;
1307 out1[0] = 0;
1308 out0[3] = 0;
1309 out0[2] = 0;
1310 out0[1] = 0;
1311 out0[0] = 0;
1312 break;
1313 case 27: out1[3] = amd_bytealign_S (in0[1], in0[0], 1);
1314 out1[2] = amd_bytealign_S (in0[0], 0, 1);
1315 out1[1] = 0;
1316 out1[0] = 0;
1317 out0[3] = 0;
1318 out0[2] = 0;
1319 out0[1] = 0;
1320 out0[0] = 0;
1321 break;
1322 case 28: out1[3] = in0[0];
1323 out1[2] = 0;
1324 out1[1] = 0;
1325 out1[0] = 0;
1326 out0[3] = 0;
1327 out0[2] = 0;
1328 out0[1] = 0;
1329 out0[0] = 0;
1330 break;
1331 case 29: out1[3] = amd_bytealign_S (in0[0], 0, 3);
1332 out1[2] = 0;
1333 out1[1] = 0;
1334 out1[0] = 0;
1335 out0[3] = 0;
1336 out0[2] = 0;
1337 out0[1] = 0;
1338 out0[0] = 0;
1339 break;
1340 case 30: out1[3] = amd_bytealign_S (in0[0], 0, 2);
1341 out1[2] = 0;
1342 out1[1] = 0;
1343 out1[0] = 0;
1344 out0[3] = 0;
1345 out0[2] = 0;
1346 out0[1] = 0;
1347 out0[0] = 0;
1348 break;
1349 case 31: out1[3] = amd_bytealign_S (in0[0], 0, 1);
1350 out1[2] = 0;
1351 out1[1] = 0;
1352 out1[0] = 0;
1353 out0[3] = 0;
1354 out0[2] = 0;
1355 out0[1] = 0;
1356 out0[0] = 0;
1357 break;
1358 }
1359 #endif
1360 }
1361
1362 inline void append_block1 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32 src_r0)
1363 {
1364 u32 tmp[2];
1365
1366 switch (offset & 3)
1367 {
1368 case 0: tmp[0] = src_r0;
1369 tmp[1] = 0;
1370 break;
1371 case 1: tmp[0] = src_r0 << 8;
1372 tmp[1] = src_r0 >> 24;
1373 break;
1374 case 2: tmp[0] = src_r0 << 16;
1375 tmp[1] = src_r0 >> 16;
1376 break;
1377 case 3: tmp[0] = src_r0 << 24;
1378 tmp[1] = src_r0 >> 8;
1379 break;
1380 }
1381
1382 switch (offset / 4)
1383 {
1384 case 0: dst0[0] |= tmp[0];
1385 dst0[1] = tmp[1];
1386 break;
1387 case 1: dst0[1] |= tmp[0];
1388 dst0[2] = tmp[1];
1389 break;
1390 case 2: dst0[2] |= tmp[0];
1391 dst0[3] = tmp[1];
1392 break;
1393 case 3: dst0[3] |= tmp[0];
1394 dst1[0] = tmp[1];
1395 break;
1396 case 4: dst1[0] |= tmp[0];
1397 dst1[1] = tmp[1];
1398 break;
1399 case 5: dst1[1] |= tmp[0];
1400 dst1[2] = tmp[1];
1401 break;
1402 case 6: dst1[2] |= tmp[0];
1403 dst1[3] = tmp[1];
1404 break;
1405 case 7: dst1[3] |= tmp[0];
1406 break;
1407 }
1408 }
1409
1410 inline void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4])
1411 {
1412 #ifdef IS_NV
1413 switch (offset)
1414 {
1415 case 0:
1416 dst0[0] = src_r0[0];
1417 dst0[1] = src_r0[1];
1418 dst0[2] = src_r0[2];
1419 dst0[3] = src_r0[3];
1420 dst1[0] = src_r1[0];
1421 dst1[1] = src_r1[1];
1422 dst1[2] = src_r1[2];
1423 dst1[3] = src_r1[3];
1424 break;
1425
1426 case 1:
1427 dst0[0] = __byte_perm_S (src_l0[0], src_r0[0], 0x6540);
1428 dst0[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1429 dst0[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1430 dst0[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1431 dst1[0] = __byte_perm_S (src_r0[3], src_r1[0], 0x6543);
1432 dst1[1] = __byte_perm_S (src_r1[0], src_r1[1], 0x6543);
1433 dst1[2] = __byte_perm_S (src_r1[1], src_r1[2], 0x6543);
1434 dst1[3] = __byte_perm_S (src_r1[2], src_r1[3], 0x6543);
1435 break;
1436
1437 case 2:
1438 dst0[0] = __byte_perm_S (src_l0[0], src_r0[0], 0x5410);
1439 dst0[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1440 dst0[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1441 dst0[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1442 dst1[0] = __byte_perm_S (src_r0[3], src_r1[0], 0x5432);
1443 dst1[1] = __byte_perm_S (src_r1[0], src_r1[1], 0x5432);
1444 dst1[2] = __byte_perm_S (src_r1[1], src_r1[2], 0x5432);
1445 dst1[3] = __byte_perm_S (src_r1[2], src_r1[3], 0x5432);
1446 break;
1447
1448 case 3:
1449 dst0[0] = __byte_perm_S (src_l0[0], src_r0[0], 0x4210);
1450 dst0[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1451 dst0[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1452 dst0[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1453 dst1[0] = __byte_perm_S (src_r0[3], src_r1[0], 0x4321);
1454 dst1[1] = __byte_perm_S (src_r1[0], src_r1[1], 0x4321);
1455 dst1[2] = __byte_perm_S (src_r1[1], src_r1[2], 0x4321);
1456 dst1[3] = __byte_perm_S (src_r1[2], src_r1[3], 0x4321);
1457 break;
1458
1459 case 4:
1460 dst0[1] = src_r0[0];
1461 dst0[2] = src_r0[1];
1462 dst0[3] = src_r0[2];
1463 dst1[0] = src_r0[3];
1464 dst1[1] = src_r1[0];
1465 dst1[2] = src_r1[1];
1466 dst1[3] = src_r1[2];
1467 break;
1468
1469 case 5:
1470 dst0[1] = __byte_perm_S (src_l0[1], src_r0[0], 0x6540);
1471 dst0[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1472 dst0[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1473 dst1[0] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1474 dst1[1] = __byte_perm_S (src_r0[3], src_r1[0], 0x6543);
1475 dst1[2] = __byte_perm_S (src_r1[0], src_r1[1], 0x6543);
1476 dst1[3] = __byte_perm_S (src_r1[1], src_r1[2], 0x6543);
1477 break;
1478
1479 case 6:
1480 dst0[1] = __byte_perm_S (src_l0[1], src_r0[0], 0x5410);
1481 dst0[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1482 dst0[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1483 dst1[0] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1484 dst1[1] = __byte_perm_S (src_r0[3], src_r1[0], 0x5432);
1485 dst1[2] = __byte_perm_S (src_r1[0], src_r1[1], 0x5432);
1486 dst1[3] = __byte_perm_S (src_r1[1], src_r1[2], 0x5432);
1487 break;
1488
1489 case 7:
1490 dst0[1] = __byte_perm_S (src_l0[1], src_r0[0], 0x4210);
1491 dst0[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1492 dst0[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1493 dst1[0] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1494 dst1[1] = __byte_perm_S (src_r0[3], src_r1[0], 0x4321);
1495 dst1[2] = __byte_perm_S (src_r1[0], src_r1[1], 0x4321);
1496 dst1[3] = __byte_perm_S (src_r1[1], src_r1[2], 0x4321);
1497 break;
1498
1499 case 8:
1500 dst0[2] = src_r0[0];
1501 dst0[3] = src_r0[1];
1502 dst1[0] = src_r0[2];
1503 dst1[1] = src_r0[3];
1504 dst1[2] = src_r1[0];
1505 dst1[3] = src_r1[1];
1506 break;
1507
1508 case 9:
1509 dst0[2] = __byte_perm_S (src_l0[2], src_r0[0], 0x6540);
1510 dst0[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1511 dst1[0] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1512 dst1[1] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1513 dst1[2] = __byte_perm_S (src_r0[3], src_r1[0], 0x6543);
1514 dst1[3] = __byte_perm_S (src_r1[0], src_r1[1], 0x6543);
1515 break;
1516
1517 case 10:
1518 dst0[2] = __byte_perm_S (src_l0[2], src_r0[0], 0x5410);
1519 dst0[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1520 dst1[0] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1521 dst1[1] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1522 dst1[2] = __byte_perm_S (src_r0[3], src_r1[0], 0x5432);
1523 dst1[3] = __byte_perm_S (src_r1[0], src_r1[1], 0x5432);
1524 break;
1525
1526 case 11:
1527 dst0[2] = __byte_perm_S (src_l0[2], src_r0[0], 0x4210);
1528 dst0[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1529 dst1[0] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1530 dst1[1] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1531 dst1[2] = __byte_perm_S (src_r0[3], src_r1[0], 0x4321);
1532 dst1[3] = __byte_perm_S (src_r1[0], src_r1[1], 0x4321);
1533 break;
1534
1535 case 12:
1536 dst0[3] = src_r0[0];
1537 dst1[0] = src_r0[1];
1538 dst1[1] = src_r0[2];
1539 dst1[2] = src_r0[3];
1540 dst1[3] = src_r1[0];
1541 break;
1542
1543 case 13:
1544 dst0[3] = __byte_perm_S (src_l0[3], src_r0[0], 0x6540);
1545 dst1[0] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1546 dst1[1] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1547 dst1[2] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1548 dst1[3] = __byte_perm_S (src_r0[3], src_r1[0], 0x6543);
1549 break;
1550
1551 case 14:
1552 dst0[3] = __byte_perm_S (src_l0[3], src_r0[0], 0x5410);
1553 dst1[0] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1554 dst1[1] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1555 dst1[2] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1556 dst1[3] = __byte_perm_S (src_r0[3], src_r1[0], 0x5432);
1557 break;
1558
1559 case 15:
1560 dst0[3] = __byte_perm_S (src_l0[3], src_r0[0], 0x4210);
1561 dst1[0] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1562 dst1[1] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1563 dst1[2] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1564 dst1[3] = __byte_perm_S (src_r0[3], src_r1[0], 0x4321);
1565 break;
1566
1567 case 16:
1568 dst1[0] = src_r0[0];
1569 dst1[1] = src_r0[1];
1570 dst1[2] = src_r0[2];
1571 dst1[3] = src_r0[3];
1572 break;
1573
1574 case 17:
1575 dst1[0] = __byte_perm_S (src_l1[0], src_r0[0], 0x6540);
1576 dst1[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1577 dst1[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1578 dst1[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1579 break;
1580
1581 case 18:
1582 dst1[0] = __byte_perm_S (src_l1[0], src_r0[0], 0x5410);
1583 dst1[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1584 dst1[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1585 dst1[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1586 break;
1587
1588 case 19:
1589 dst1[0] = __byte_perm_S (src_l1[0], src_r0[0], 0x4210);
1590 dst1[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1591 dst1[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1592 dst1[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1593 break;
1594
1595 case 20:
1596 dst1[1] = src_r0[0];
1597 dst1[2] = src_r0[1];
1598 dst1[3] = src_r0[2];
1599 break;
1600
1601 case 21:
1602 dst1[1] = __byte_perm_S (src_l1[1], src_r0[0], 0x6540);
1603 dst1[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1604 dst1[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1605 break;
1606
1607 case 22:
1608 dst1[1] = __byte_perm_S (src_l1[1], src_r0[0], 0x5410);
1609 dst1[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1610 dst1[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1611 break;
1612
1613 case 23:
1614 dst1[1] = __byte_perm_S (src_l1[1], src_r0[0], 0x4210);
1615 dst1[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1616 dst1[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1617 break;
1618
1619 case 24:
1620 dst1[2] = src_r0[0];
1621 dst1[3] = src_r0[1];
1622 break;
1623
1624 case 25:
1625 dst1[2] = __byte_perm_S (src_l1[2], src_r0[0], 0x6540);
1626 dst1[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1627 break;
1628
1629 case 26:
1630 dst1[2] = __byte_perm_S (src_l1[2], src_r0[0], 0x5410);
1631 dst1[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1632 break;
1633
1634 case 27:
1635 dst1[2] = __byte_perm_S (src_l1[2], src_r0[0], 0x4210);
1636 dst1[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1637 break;
1638
1639 case 28:
1640 dst1[3] = src_r0[0];
1641 break;
1642
1643 case 29:
1644 dst1[3] = __byte_perm_S (src_l1[3], src_r0[0], 0x6540);
1645 break;
1646
1647 case 30:
1648 dst1[3] = __byte_perm_S (src_l1[3], src_r0[0], 0x5410);
1649 break;
1650
1651 case 31:
1652 dst1[3] = __byte_perm_S (src_l1[3], src_r0[0], 0x4210);
1653 break;
1654 }
1655 #endif
1656
1657 #if defined IS_AMD || defined IS_GENERIC
1658 switch (offset)
1659 {
1660 case 31:
1661 dst1[3] = src_l1[3] | src_r0[0] << 24;
1662 break;
1663 case 30:
1664 dst1[3] = src_l1[3] | src_r0[0] << 16;
1665 break;
1666 case 29:
1667 dst1[3] = src_l1[3] | src_r0[0] << 8;
1668 break;
1669 case 28:
1670 dst1[3] = src_r0[0];
1671 break;
1672 case 27:
1673 dst1[3] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1674 dst1[2] = src_l1[2] | src_r0[0] << 24;
1675 break;
1676 case 26:
1677 dst1[3] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1678 dst1[2] = src_l1[2] | src_r0[0] << 16;
1679 break;
1680 case 25:
1681 dst1[3] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1682 dst1[2] = src_l1[2] | src_r0[0] << 8;
1683 break;
1684 case 24:
1685 dst1[3] = src_r0[1];
1686 dst1[2] = src_r0[0];
1687 break;
1688 case 23:
1689 dst1[3] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1690 dst1[2] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1691 dst1[1] = src_l1[1] | src_r0[0] << 24;
1692 break;
1693 case 22:
1694 dst1[3] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1695 dst1[2] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1696 dst1[1] = src_l1[1] | src_r0[0] << 16;
1697 break;
1698 case 21:
1699 dst1[3] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1700 dst1[2] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1701 dst1[1] = src_l1[1] | src_r0[0] << 8;
1702 break;
1703 case 20:
1704 dst1[3] = src_r0[2];
1705 dst1[2] = src_r0[1];
1706 dst1[1] = src_r0[0];
1707 break;
1708 case 19:
1709 dst1[3] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1710 dst1[2] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1711 dst1[1] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1712 dst1[0] = src_l1[0] | src_r0[0] << 24;
1713 break;
1714 case 18:
1715 dst1[3] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1716 dst1[2] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1717 dst1[1] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1718 dst1[0] = src_l1[0] | src_r0[0] << 16;
1719 break;
1720 case 17:
1721 dst1[3] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1722 dst1[2] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1723 dst1[1] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1724 dst1[0] = src_l1[0] | src_r0[0] << 8;
1725 break;
1726 case 16:
1727 dst1[3] = src_r0[3];
1728 dst1[2] = src_r0[2];
1729 dst1[1] = src_r0[1];
1730 dst1[0] = src_r0[0];
1731 break;
1732 case 15:
1733 dst1[3] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
1734 dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1735 dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1736 dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1737 dst0[3] = src_l0[3] | src_r0[0] << 24;
1738 break;
1739 case 14:
1740 dst1[3] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
1741 dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1742 dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1743 dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1744 dst0[3] = src_l0[3] | src_r0[0] << 16;
1745 break;
1746 case 13:
1747 dst1[3] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
1748 dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1749 dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1750 dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1751 dst0[3] = src_l0[3] | src_r0[0] << 8;
1752 break;
1753 case 12:
1754 dst1[3] = src_r1[0];
1755 dst1[2] = src_r0[3];
1756 dst1[1] = src_r0[2];
1757 dst1[0] = src_r0[1];
1758 dst0[3] = src_r0[0];
1759 break;
1760 case 11:
1761 dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
1762 dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
1763 dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1764 dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1765 dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1766 dst0[2] = src_l0[2] | src_r0[0] << 24;
1767 break;
1768 case 10:
1769 dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
1770 dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
1771 dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1772 dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1773 dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1774 dst0[2] = src_l0[2] | src_r0[0] << 16;
1775 break;
1776 case 9:
1777 dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
1778 dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
1779 dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1780 dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1781 dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1782 dst0[2] = src_l0[2] | src_r0[0] << 8;
1783 break;
1784 case 8:
1785 dst1[3] = src_r1[1];
1786 dst1[2] = src_r1[0];
1787 dst1[1] = src_r0[3];
1788 dst1[0] = src_r0[2];
1789 dst0[3] = src_r0[1];
1790 dst0[2] = src_r0[0];
1791 break;
1792 case 7:
1793 dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 1);
1794 dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
1795 dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
1796 dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1797 dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1798 dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1799 dst0[1] = src_l0[1] | src_r0[0] << 24;
1800 break;
1801 case 6:
1802 dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 2);
1803 dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
1804 dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
1805 dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1806 dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1807 dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1808 dst0[1] = src_l0[1] | src_r0[0] << 16;
1809 break;
1810 case 5:
1811 dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 3);
1812 dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
1813 dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
1814 dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1815 dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1816 dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1817 dst0[1] = src_l0[1] | src_r0[0] << 8;
1818 break;
1819 case 4:
1820 dst1[3] = src_r1[2];
1821 dst1[2] = src_r1[1];
1822 dst1[1] = src_r1[0];
1823 dst1[0] = src_r0[3];
1824 dst0[3] = src_r0[2];
1825 dst0[2] = src_r0[1];
1826 dst0[1] = src_r0[0];
1827 break;
1828 case 3:
1829 dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 1);
1830 dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 1);
1831 dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
1832 dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
1833 dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1834 dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1835 dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1836 dst0[0] = src_l0[0] | src_r0[0] << 24;
1837 break;
1838 case 2:
1839 dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 2);
1840 dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 2);
1841 dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
1842 dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
1843 dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1844 dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1845 dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1846 dst0[0] = src_l0[0] | src_r0[0] << 16;
1847 break;
1848 case 1:
1849 dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 3);
1850 dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 3);
1851 dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
1852 dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
1853 dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1854 dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1855 dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1856 dst0[0] = src_l0[0] | src_r0[0] << 8;
1857 break;
1858 case 0:
1859 dst1[3] = src_r1[3];
1860 dst1[2] = src_r1[2];
1861 dst1[1] = src_r1[1];
1862 dst1[0] = src_r1[0];
1863 dst0[3] = src_r0[3];
1864 dst0[2] = src_r0[2];
1865 dst0[1] = src_r0[1];
1866 dst0[0] = src_r0[0];
1867 break;
1868 }
1869 #endif
1870 }
1871
1872 inline void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len)
1873 {
1874 rshift_block_N (in0, in1, out0, out1, 32 - len);
1875
1876 u32 tib40[4];
1877 u32 tib41[4];
1878
1879 tib40[0] = out1[3];
1880 tib40[1] = out1[2];
1881 tib40[2] = out1[1];
1882 tib40[3] = out1[0];
1883 tib41[0] = out0[3];
1884 tib41[1] = out0[2];
1885 tib41[2] = out0[1];
1886 tib41[3] = out0[0];
1887
1888 out0[0] = swap32_S (tib40[0]);
1889 out0[1] = swap32_S (tib40[1]);
1890 out0[2] = swap32_S (tib40[2]);
1891 out0[3] = swap32_S (tib40[3]);
1892 out1[0] = swap32_S (tib41[0]);
1893 out1[1] = swap32_S (tib41[1]);
1894 out1[2] = swap32_S (tib41[2]);
1895 out1[3] = swap32_S (tib41[3]);
1896 }
1897
1898 inline u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1899 {
1900 buf0[0] |= (generate_cmask (buf0[0]));
1901 buf0[1] |= (generate_cmask (buf0[1]));
1902 buf0[2] |= (generate_cmask (buf0[2]));
1903 buf0[3] |= (generate_cmask (buf0[3]));
1904 buf1[0] |= (generate_cmask (buf1[0]));
1905 buf1[1] |= (generate_cmask (buf1[1]));
1906 buf1[2] |= (generate_cmask (buf1[2]));
1907 buf1[3] |= (generate_cmask (buf1[3]));
1908
1909 return in_len;
1910 }
1911
1912 inline u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1913 {
1914 buf0[0] &= ~(generate_cmask (buf0[0]));
1915 buf0[1] &= ~(generate_cmask (buf0[1]));
1916 buf0[2] &= ~(generate_cmask (buf0[2]));
1917 buf0[3] &= ~(generate_cmask (buf0[3]));
1918 buf1[0] &= ~(generate_cmask (buf1[0]));
1919 buf1[1] &= ~(generate_cmask (buf1[1]));
1920 buf1[2] &= ~(generate_cmask (buf1[2]));
1921 buf1[3] &= ~(generate_cmask (buf1[3]));
1922
1923 return in_len;
1924 }
1925
1926 inline u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1927 {
1928 rule_op_mangle_lrest (p0, p1, buf0, buf1, in_len);
1929
1930 buf0[0] &= ~(0x00000020 & generate_cmask (buf0[0]));
1931
1932 return in_len;
1933 }
1934
1935 inline u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1936 {
1937 rule_op_mangle_urest (p0, p1, buf0, buf1, in_len);
1938
1939 buf0[0] |= (0x00000020 & generate_cmask (buf0[0]));
1940
1941 return in_len;
1942 }
1943
1944 inline u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1945 {
1946 buf0[0] ^= (generate_cmask (buf0[0]));
1947 buf0[1] ^= (generate_cmask (buf0[1]));
1948 buf0[2] ^= (generate_cmask (buf0[2]));
1949 buf0[3] ^= (generate_cmask (buf0[3]));
1950 buf1[0] ^= (generate_cmask (buf1[0]));
1951 buf1[1] ^= (generate_cmask (buf1[1]));
1952 buf1[2] ^= (generate_cmask (buf1[2]));
1953 buf1[3] ^= (generate_cmask (buf1[3]));
1954
1955 return in_len;
1956 }
1957
1958 inline u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1959 {
1960 if (p0 >= in_len) return (in_len);
1961
1962 const u32 tmp = 0x20u << ((p0 & 3) * 8);
1963
1964 switch (p0 / 4)
1965 {
1966 case 0: buf0[0] ^= (tmp & generate_cmask (buf0[0])); break;
1967 case 1: buf0[1] ^= (tmp & generate_cmask (buf0[1])); break;
1968 case 2: buf0[2] ^= (tmp & generate_cmask (buf0[2])); break;
1969 case 3: buf0[3] ^= (tmp & generate_cmask (buf0[3])); break;
1970 case 4: buf1[0] ^= (tmp & generate_cmask (buf1[0])); break;
1971 case 5: buf1[1] ^= (tmp & generate_cmask (buf1[1])); break;
1972 case 6: buf1[2] ^= (tmp & generate_cmask (buf1[2])); break;
1973 case 7: buf1[3] ^= (tmp & generate_cmask (buf1[3])); break;
1974 }
1975
1976 return in_len;
1977 }
1978
1979 inline u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1980 {
1981 reverse_block (buf0, buf1, buf0, buf1, in_len);
1982
1983 return in_len;
1984 }
1985
1986 inline u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1987 {
1988 if ((in_len + in_len) >= 32) return (in_len);
1989
1990 u32 out_len = in_len;
1991
1992 append_block8 (out_len, buf0, buf1, buf0, buf1, buf0, buf1);
1993
1994 out_len += in_len;
1995
1996 return out_len;
1997 }
1998
1999 inline u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2000 {
2001 if (((in_len * p0) + in_len) >= 32) return (in_len);
2002
2003 u32 out_len = in_len;
2004
2005 u32 tib40[4];
2006 u32 tib41[4];
2007
2008 tib40[0] = buf0[0];
2009 tib40[1] = buf0[1];
2010 tib40[2] = buf0[2];
2011 tib40[3] = buf0[3];
2012 tib41[0] = buf1[0];
2013 tib41[1] = buf1[1];
2014 tib41[2] = buf1[2];
2015 tib41[3] = buf1[3];
2016
2017 for (u32 i = 0; i < p0; i++)
2018 {
2019 append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
2020
2021 out_len += in_len;
2022 }
2023
2024 return out_len;
2025 }
2026
2027 inline u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2028 {
2029 if ((in_len + in_len) >= 32) return (in_len);
2030
2031 u32 out_len = in_len;
2032
2033 u32 tib40[4];
2034 u32 tib41[4];
2035
2036 reverse_block (buf0, buf1, tib40, tib41, out_len);
2037
2038 append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
2039
2040 out_len += in_len;
2041
2042 return out_len;
2043 }
2044
/**
 * Rule op "append": hand p0 to append_block1 () at offset in_len.
 *
 * p1 is not used.
 *
 * Returns in_len untouched when in_len + 1 would reach 32, otherwise
 * in_len + 1.
 */
inline u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  const u32 grown = in_len + 1;

  if (grown >= 32) return (in_len);

  append_block1 (in_len, buf0, buf1, p0);

  return (grown);
}
2057
/**
 * Rule op "prepend": run rshift_block () in place, then OR p0 into buf0[0].
 *
 * p1 is not used.
 * NOTE(review): p0 is ORed in unmasked, so it is presumably a single byte
 * value and rshift_block () presumably leaves the low byte of buf0[0]
 * zero -- confirm against the helper.
 *
 * Returns in_len untouched when in_len + 1 would reach 32, otherwise
 * in_len + 1.
 */
inline u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  const u32 grown = in_len + 1;

  if (grown >= 32) return (in_len);

  rshift_block (buf0, buf1, buf0, buf1);

  buf0[0] |= p0;

  return (grown);
}
2072
/**
 * Rule op "rotate left": the low byte of buf0[0] is saved, lshift_block ()
 * is run in place, and the saved byte is ORed in at byte position
 * in_len - 1 (word (in_len - 1) / 4, bit offset ((in_len - 1) & 3) * 8).
 *
 * p0 and p1 are not used. A zero length is returned as is; otherwise the
 * length is unchanged.
 */
inline u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (in_len == 0) return (in_len);

  const u32 last = in_len - 1;

  // first byte, already moved to the lane of the last position
  const u32 wrapped = (buf0[0] & 0xff) << ((last & 3) * 8);

  lshift_block (buf0, buf1, buf0, buf1);

  const u32 widx = last / 4;

  // static indices only; a word index above 7 touches nothing
  if      (widx == 0) buf0[0] |= wrapped;
  else if (widx == 1) buf0[1] |= wrapped;
  else if (widx == 2) buf0[2] |= wrapped;
  else if (widx == 3) buf0[3] |= wrapped;
  else if (widx == 4) buf1[0] |= wrapped;
  else if (widx == 5) buf1[1] |= wrapped;
  else if (widx == 6) buf1[2] |= wrapped;
  else if (widx == 7) buf1[3] |= wrapped;

  return (in_len);
}
2099
/**
 * Rule op "rotate right": the byte at position in_len - 1 is read,
 * rshift_block () is run in place, the byte is ORed into buf0[0], and the
 * buffer is cut back to in_len with truncate_right ().
 *
 * p0 and p1 are not used. A zero length is returned as is; otherwise the
 * length is unchanged.
 */
inline u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (in_len == 0) return (in_len);

  const u32 last = in_len - 1;

  // pick the word holding the last byte; stays 0 for a word index above 7
  u32 word = 0;

  switch (last / 4)
  {
    case 0: word = buf0[0]; break;
    case 1: word = buf0[1]; break;
    case 2: word = buf0[2]; break;
    case 3: word = buf0[3]; break;
    case 4: word = buf1[0]; break;
    case 5: word = buf1[1]; break;
    case 6: word = buf1[2]; break;
    case 7: word = buf1[3]; break;
  }

  const u32 wrapped = (word >> ((last & 3) * 8)) & 0xff;

  rshift_block (buf0, buf1, buf0, buf1);

  buf0[0] |= wrapped;

  // drop whatever the shift moved to position in_len and beyond
  truncate_right (buf0, buf1, in_len);

  return (in_len);
}
2130
/**
 * Rule op "delete first": run lshift_block () in place.
 *
 * p0 and p1 are not used.
 *
 * Returns 0 for an empty input, otherwise in_len - 1.
 */
inline u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (in_len == 0) return (in_len);

  lshift_block (buf0, buf1, buf0, buf1);

  return (in_len - 1);
}
2141
/**
 * Rule op "delete last": clear byte position in_len - 1 and everything above
 * it inside the word that holds it. Other words are not touched.
 *
 * p0 and p1 are not used.
 *
 * Returns 0 for an empty input, otherwise in_len - 1.
 */
inline u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (in_len == 0) return (in_len);

  const u32 last = in_len - 1;

  // keeps only the bytes below the last position within its word
  const u32 keep = (1 << ((last & 3) * 8)) - 1;

  const u32 widx = last / 4;

  // static indices only; a word index above 7 touches nothing
  if      (widx == 0) buf0[0] &= keep;
  else if (widx == 1) buf0[1] &= keep;
  else if (widx == 2) buf0[2] &= keep;
  else if (widx == 3) buf0[3] &= keep;
  else if (widx == 4) buf1[0] &= keep;
  else if (widx == 5) buf1[1] &= keep;
  else if (widx == 6) buf1[2] &= keep;
  else if (widx == 7) buf1[3] &= keep;

  return (last);
}
2164
/**
 * Rule op "delete at": remove the byte at position p0.
 *
 * A copy of the buffer produced by lshift_block () is spliced in: inside the
 * word holding p0 the bytes below p0 stay, the rest comes from the copy, and
 * every following word is taken whole from the copy.
 *
 * p1 is not used.
 *
 * Returns in_len untouched when p0 >= in_len, otherwise in_len - 1.
 */
inline u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (p0 >= in_len) return (in_len);

  u32 sh0[4];
  u32 sh1[4];

  lshift_block (buf0, buf1, sh0, sh1);

  const u32 keep_lo = (1 << ((p0 & 3) * 8)) - 1; // bytes below p0 within its word
  const u32 take_hi = ~keep_lo;

  const u32 widx = p0 / 4;

  // word k: merge when it holds p0, replace when it lies behind p0,
  // leave alone when it lies in front of p0
  #define DELETE_AT_SPLICE(k,dst,src)                                        \
    do                                                                        \
    {                                                                         \
      if      (widx == (k)) (dst) = ((dst) & keep_lo) | ((src) & take_hi);    \
      else if (widx <  (k)) (dst) = (src);                                    \
    } while (0)

  // word 0 can only be the merge word, never a following one
  if (widx == 0) buf0[0] = (buf0[0] & keep_lo) | (sh0[0] & take_hi);

  DELETE_AT_SPLICE (1, buf0[1], sh0[1]);
  DELETE_AT_SPLICE (2, buf0[2], sh0[2]);
  DELETE_AT_SPLICE (3, buf0[3], sh0[3]);
  DELETE_AT_SPLICE (4, buf1[0], sh1[0]);
  DELETE_AT_SPLICE (5, buf1[1], sh1[1]);
  DELETE_AT_SPLICE (6, buf1[2], sh1[2]);
  DELETE_AT_SPLICE (7, buf1[3], sh1[3]);

  #undef DELETE_AT_SPLICE

  return (in_len - 1);
}
2239
/**
 * Rule op "extract": keep p1 bytes starting at position p0.
 *
 * The buffer is passed through lshift_block_N () with p0 in place and then
 * cut to p1 bytes with truncate_right ().
 *
 * Returns in_len untouched when p0 >= in_len or p0 + p1 > in_len,
 * otherwise p1.
 */
inline u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if ((p0 >= in_len) || ((p0 + p1) > in_len)) return (in_len);

  lshift_block_N (buf0, buf1, buf0, buf1, p0);

  truncate_right (buf0, buf1, p1);

  return (p1);
}
2254
/**
 * Rule op "omit": remove p1 bytes starting at position p0.
 *
 * A copy of the buffer produced by lshift_block_N () with p1 is spliced in:
 * inside the word holding p0 the bytes below p0 stay, the rest comes from the
 * copy, and every following word is taken whole from the copy.
 *
 * Returns in_len untouched when p0 >= in_len or p0 + p1 > in_len,
 * otherwise in_len - p1.
 */
inline u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if ((p0 >= in_len) || ((p0 + p1) > in_len)) return (in_len);

  // zeroed first, as lshift_block_N () is not known here to write every word
  u32 sh0[4] = { 0, 0, 0, 0 };
  u32 sh1[4] = { 0, 0, 0, 0 };

  lshift_block_N (buf0, buf1, sh0, sh1, p1);

  const u32 keep_lo = (1 << ((p0 & 3) * 8)) - 1; // bytes below p0 within its word
  const u32 take_hi = ~keep_lo;

  const u32 widx = p0 / 4;

  // word k: merge when it holds p0, replace when it lies behind p0,
  // leave alone when it lies in front of p0
  #define OMIT_SPLICE(k,dst,src)                                             \
    do                                                                        \
    {                                                                         \
      if      (widx == (k)) (dst) = ((dst) & keep_lo) | ((src) & take_hi);    \
      else if (widx <  (k)) (dst) = (src);                                    \
    } while (0)

  // word 0 can only be the merge word, never a following one
  if (widx == 0) buf0[0] = (buf0[0] & keep_lo) | (sh0[0] & take_hi);

  OMIT_SPLICE (1, buf0[1], sh0[1]);
  OMIT_SPLICE (2, buf0[2], sh0[2]);
  OMIT_SPLICE (3, buf0[3], sh0[3]);
  OMIT_SPLICE (4, buf1[0], sh1[0]);
  OMIT_SPLICE (5, buf1[1], sh1[1]);
  OMIT_SPLICE (6, buf1[2], sh1[2]);
  OMIT_SPLICE (7, buf1[3], sh1[3]);

  #undef OMIT_SPLICE

  return (in_len - p1);
}
2340
/**
 * Rule op "insert": put p1 at byte position p0.
 *
 * A copy of the buffer produced by rshift_block () is spliced in: inside the
 * word holding p0 the bytes below p0 stay, p1 is ORed in at p0's lane, the
 * bytes above come from the copy, and every following word is taken whole
 * from the copy.
 * NOTE(review): p1 is shifted in unmasked, so it is presumably a single byte
 * value -- confirm against the caller.
 *
 * Returns in_len untouched when p0 > in_len or in_len + 1 would reach 32,
 * otherwise in_len + 1.
 */
inline u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (p0 > in_len) return (in_len);

  if ((in_len + 1) >= 32) return (in_len);

  u32 sh0[4];
  u32 sh1[4];

  rshift_block (buf0, buf1, sh0, sh1);

  const u32 lane = (p0 & 3) * 8;

  const u32 ins     = p1 << lane;          // new byte at its lane
  const u32 keep_lo = (1 << lane) - 1;     // bytes below p0 within its word
  const u32 take_hi = 0xffffff00 << lane;  // bytes above p0 within its word

  const u32 widx = p0 / 4;

  // word k: merge when it holds p0, replace when it lies behind p0,
  // leave alone when it lies in front of p0
  #define INSERT_SPLICE(k,dst,src)                                                 \
    do                                                                              \
    {                                                                               \
      if      (widx == (k)) (dst) = ((dst) & keep_lo) | ins | ((src) & take_hi);    \
      else if (widx <  (k)) (dst) = (src);                                          \
    } while (0)

  // word 0 can only be the merge word, never a following one
  if (widx == 0) buf0[0] = (buf0[0] & keep_lo) | ins | (sh0[0] & take_hi);

  INSERT_SPLICE (1, buf0[1], sh0[1]);
  INSERT_SPLICE (2, buf0[2], sh0[2]);
  INSERT_SPLICE (3, buf0[3], sh0[3]);
  INSERT_SPLICE (4, buf1[0], sh1[0]);
  INSERT_SPLICE (5, buf1[1], sh1[1]);
  INSERT_SPLICE (6, buf1[2], sh1[2]);
  INSERT_SPLICE (7, buf1[3], sh1[3]);

  #undef INSERT_SPLICE

  return (in_len + 1);
}
2412
/**
 * Rule op "overstrike": replace the byte at position p0 with p1.
 *
 * The lane of p0 inside its word is cleared and p1, shifted to that lane, is
 * ORed in.
 * NOTE(review): p1 is shifted in unmasked, so it is presumably a single byte
 * value -- confirm against the caller.
 *
 * Returns in_len in every case; nothing is modified when p0 >= in_len.
 */
inline u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (p0 >= in_len) return (in_len);

  const u32 lane = (p0 & 3) * 8;

  const u32 ins   = p1 << lane;
  const u32 clear = ~(0xffu << lane);

  const u32 widx = p0 / 4;

  // static indices only; a word index above 7 touches nothing
  if      (widx == 0) buf0[0] = (buf0[0] & clear) | ins;
  else if (widx == 1) buf0[1] = (buf0[1] & clear) | ins;
  else if (widx == 2) buf0[2] = (buf0[2] & clear) | ins;
  else if (widx == 3) buf0[3] = (buf0[3] & clear) | ins;
  else if (widx == 4) buf1[0] = (buf1[0] & clear) | ins;
  else if (widx == 5) buf1[1] = (buf1[1] & clear) | ins;
  else if (widx == 6) buf1[2] = (buf1[2] & clear) | ins;
  else if (widx == 7) buf1[3] = (buf1[3] & clear) | ins;

  return (in_len);
}
2435
/**
 * Rule op "truncate at": cut the buffer to p0 bytes with truncate_right ().
 *
 * p1 is not used.
 *
 * Returns p0 when p0 < in_len; otherwise nothing is modified and in_len is
 * returned.
 */
inline u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (p0 < in_len)
  {
    truncate_right (buf0, buf1, p0);

    return (p0);
  }

  return (in_len);
}
2444
/**
 * Rule op "replace": substitute p1 for every byte equal to p0.
 *
 * IS_NV build: walks byte positions 0 .. in_len - 1 (positions 32 and up are
 * ignored) and handles one byte per step with __byte_perm_S ().
 * IS_AMD / IS_GENERIC build: handles all eight words as uchar4 vectors with
 * select (), regardless of in_len.
 *
 * Returns in_len unchanged.
 */
inline u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  #ifdef IS_NV
  // Lane (pos & 3) of word w: the first permute isolates that byte for the
  // comparison with p0, the second rebuilds w with a byte of p1 in that lane.
  // NOTE(review): selector meaning assumed to follow CUDA __byte_perm -- confirm.
  #define REPLACE_LANE(w)                                                                                       \
    switch (pos & 3)                                                                                            \
    {                                                                                                           \
      case 0: if ((__byte_perm_S ((w), 0, 0x6540)) == p0) (w) = __byte_perm_S (p1, (w), 0x7650); break;        \
      case 1: if ((__byte_perm_S ((w), 0, 0x6541)) == p0) (w) = __byte_perm_S (p1, (w), 0x7604); break;        \
      case 2: if ((__byte_perm_S ((w), 0, 0x6542)) == p0) (w) = __byte_perm_S (p1, (w), 0x7054); break;        \
      case 3: if ((__byte_perm_S ((w), 0, 0x6543)) == p0) (w) = __byte_perm_S (p1, (w), 0x0654); break;        \
    }

  for (u32 pos = 0; pos < in_len; pos++)
  {
    switch (pos / 4)
    {
      case 0: REPLACE_LANE (buf0[0]); break;
      case 1: REPLACE_LANE (buf0[1]); break;
      case 2: REPLACE_LANE (buf0[2]); break;
      case 3: REPLACE_LANE (buf0[3]); break;
      case 4: REPLACE_LANE (buf1[0]); break;
      case 5: REPLACE_LANE (buf1[1]); break;
      case 6: REPLACE_LANE (buf1[2]); break;
      case 7: REPLACE_LANE (buf1[3]); break;
    }
  }

  #undef REPLACE_LANE
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  const uchar4 needle = (uchar4) (p0);
  const uchar4 subst  = (uchar4) (p1);

  uchar4 lanes;

  // view the word as four bytes, swap in subst wherever a byte equals needle
  #define REPLACE_WORD(w)                               \
    do                                                   \
    {                                                    \
      lanes = as_uchar4 (w);                             \
      lanes = select (lanes, subst, lanes == needle);    \
      (w)   = as_uint (lanes);                           \
    } while (0)

  REPLACE_WORD (buf0[0]);
  REPLACE_WORD (buf0[1]);
  REPLACE_WORD (buf0[2]);
  REPLACE_WORD (buf0[3]);
  REPLACE_WORD (buf1[0]);
  REPLACE_WORD (buf1[1]);
  REPLACE_WORD (buf1[2]);
  REPLACE_WORD (buf1[3]);

  #undef REPLACE_WORD
  #endif

  return (in_len);
}
2538
/**
 * Placeholder for the "purgechar" rule op: not implemented yet.
 *
 * No parameter is read except in_len, the buffers are left untouched and
 * in_len is returned unchanged.
 */
inline u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  // TODO

  return (in_len);
}
2544
/**
 * Placeholder for the "togglecase_rec" rule op: not implemented yet.
 *
 * No parameter is read except in_len, the buffers are left untouched and
 * in_len is returned unchanged.
 */
inline u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  // TODO

  return (in_len);
}
2550
/**
 * Rule op "dupechar first": put p0 copies of the first byte in front.
 *
 * The low byte of buf0[0] is saved, the buffer is passed through
 * rshift_block_N () with p0 in place, and the first p0 byte positions are
 * then ORed with the saved byte: p0 / 4 whole words plus p0 & 3 bytes of the
 * next word. The fill covers p0 in 1 .. 31 only; for any other p0 nothing is
 * ORed in. It is compiled per vendor (IS_NV uses __byte_perm_S (), IS_AMD /
 * IS_GENERIC use shifts).
 *
 * p1 is not used.
 *
 * Returns in_len untouched when in_len is 0 or in_len + p0 would reach 32,
 * otherwise in_len + p0.
 */
inline u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if ( in_len       ==  0) return (in_len);
  if ((in_len + p0) >= 32) return (in_len);

  const u32 first = buf0[0] & 0xFF;

  rshift_block_N (buf0, buf1, buf0, buf1, p0);

  const u32 fill_ok = (p0 >= 1) && (p0 <= 31);

  const u32 whole = p0 / 4; // words filled with four copies
  const u32 rest  = p0 & 3; // copies going into the word after those

  // word k gets four copies when it lies below `whole`, the partial fill when
  // it is word `whole`, and stays as it is (OR with 0) otherwise
  #define DUPECHAR_FIRST_FILL(k,w) (w) |= (whole > (k)) ? quad : ((whole == (k)) ? part : 0)

  #ifdef IS_NV
  if (fill_ok)
  {
    const u32 quad = __byte_perm_S (first, 0, 0x0000);

    u32 part = 0;

    switch (rest)
    {
      case 1: part = first;                            break;
      case 2: part = __byte_perm_S (first, 0, 0x5400); break;
      case 3: part = __byte_perm_S (first, 0, 0x4000); break;
    }

    DUPECHAR_FIRST_FILL (0, buf0[0]);
    DUPECHAR_FIRST_FILL (1, buf0[1]);
    DUPECHAR_FIRST_FILL (2, buf0[2]);
    DUPECHAR_FIRST_FILL (3, buf0[3]);
    DUPECHAR_FIRST_FILL (4, buf1[0]);
    DUPECHAR_FIRST_FILL (5, buf1[1]);
    DUPECHAR_FIRST_FILL (6, buf1[2]);
    DUPECHAR_FIRST_FILL (7, buf1[3]);
  }
  #endif

  #if defined IS_AMD || defined IS_GENERIC
  if (fill_ok)
  {
    const u32 quad = first << 0 | first << 8 | first << 16 | first << 24;

    u32 part = 0;

    switch (rest)
    {
      case 1: part = first << 0;                            break;
      case 2: part = first << 0 | first << 8;               break;
      case 3: part = first << 0 | first << 8 | first << 16; break;
    }

    DUPECHAR_FIRST_FILL (0, buf0[0]);
    DUPECHAR_FIRST_FILL (1, buf0[1]);
    DUPECHAR_FIRST_FILL (2, buf0[2]);
    DUPECHAR_FIRST_FILL (3, buf0[3]);
    DUPECHAR_FIRST_FILL (4, buf1[0]);
    DUPECHAR_FIRST_FILL (5, buf1[1]);
    DUPECHAR_FIRST_FILL (6, buf1[2]);
    DUPECHAR_FIRST_FILL (7, buf1[3]);
  }
  #endif

  #undef DUPECHAR_FIRST_FILL

  return (in_len + p0);
}
2912
/**
 * Rule op "dupechar last": append p0 copies of the last byte.
 *
 * The byte at position in_len - 1 is read once and then handed to
 * append_block1 () p0 times, at offsets in_len, in_len + 1, ...
 *
 * p1 is not used.
 *
 * Returns in_len untouched when in_len is 0 or in_len + p0 would reach 32,
 * otherwise in_len + p0.
 */
inline u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if ( in_len       ==  0) return (in_len);
  if ((in_len + p0) >= 32) return (in_len);

  const u32 last = in_len - 1;

  // pick the word holding the last byte; stays 0 for a word index above 7
  u32 word = 0;

  switch (last / 4)
  {
    case 0: word = buf0[0]; break;
    case 1: word = buf0[1]; break;
    case 2: word = buf0[2]; break;
    case 3: word = buf0[3]; break;
    case 4: word = buf1[0]; break;
    case 5: word = buf1[1]; break;
    case 6: word = buf1[2]; break;
    case 7: word = buf1[3]; break;
  }

  const u32 ch = (word >> ((last & 3) * 8)) & 0xff;

  u32 cur_len = in_len;

  for (u32 left = p0; left != 0; left--)
  {
    append_block1 (cur_len, buf0, buf1, ch);

    cur_len++;
  }

  return (cur_len);
}
2947
2948 inline u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2949 {
2950 if ( in_len == 0) return (in_len);
2951 if ((in_len + in_len) >= 32) return (in_len);
2952
2953 u32 out_len = in_len;
2954
2955 u32 tib40[4];
2956 u32 tib41[4];
2957
2958 #ifdef IS_NV
2959 tib40[0] = __byte_perm_S (buf0[0], 0, 0x1100);
2960 tib40[1] = __byte_perm_S (buf0[0], 0, 0x3322);
2961 tib40[2] = __byte_perm_S (buf0[1], 0, 0x1100);
2962 tib40[3] = __byte_perm_S (buf0[1], 0, 0x3322);
2963 tib41[0] = __byte_perm_S (buf0[2], 0, 0x1100);
2964 tib41[1] = __byte_perm_S (buf0[2], 0, 0x3322);
2965 tib41[2] = __byte_perm_S (buf0[3], 0, 0x1100);
2966 tib41[3] = __byte_perm_S (buf0[3], 0, 0x3322);
2967
2968 buf0[0] = tib40[0];
2969 buf0[1] = tib40[1];
2970 buf0[2] = tib40[2];
2971 buf0[3] = tib40[3];
2972 buf1[0] = tib41[0];
2973 buf1[1] = tib41[1];
2974 buf1[2] = tib41[2];
2975 buf1[3] = tib41[3];
2976 #endif
2977
2978 #if defined IS_AMD || defined IS_GENERIC
2979 tib40[0] = ((buf0[0] & 0x000000FF) << 0) | ((buf0[0] & 0x0000FF00) << 8);
2980 tib40[1] = ((buf0[0] & 0x00FF0000) >> 16) | ((buf0[0] & 0xFF000000) >> 8);
2981 tib40[2] = ((buf0[1] & 0x000000FF) << 0) | ((buf0[1] & 0x0000FF00) << 8);
2982 tib40[3] = ((buf0[1] & 0x00FF0000) >> 16) | ((buf0[1] & 0xFF000000) >> 8);
2983 tib41[0] = ((buf0[2] & 0x000000FF) << 0) | ((buf0[2] & 0x0000FF00) << 8);
2984 tib41[1] = ((buf0[2] & 0x00FF0000) >> 16) | ((buf0[2] & 0xFF000000) >> 8);
2985 tib41[2] = ((buf0[3] & 0x000000FF) << 0) | ((buf0[3] & 0x0000FF00) << 8);
2986 tib41[3] = ((buf0[3] & 0x00FF0000) >> 16) | ((buf0[3] & 0xFF000000) >> 8);
2987
2988 buf0[0] = tib40[0] | (tib40[0] << 8);
2989 buf0[1] = tib40[1] | (tib40[1] << 8);
2990 buf0[2] = tib40[2] | (tib40[2] << 8);
2991 buf0[3] = tib40[3] | (tib40[3] << 8);
2992 buf1[0] = tib41[0] | (tib41[0] << 8);
2993 buf1[1] = tib41[1] | (tib41[1] << 8);
2994 buf1[2] = tib41[2] | (tib41[2] << 8);
2995 buf1[3] = tib41[3] | (tib41[3] << 8);
2996 #endif
2997
2998 out_len = out_len + out_len;
2999
3000 return out_len;
3001 }
3002
3003 inline u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3004 {
3005 if (in_len < 2) return (in_len);
3006
3007 #ifdef IS_NV
3008 buf0[0] = __byte_perm_S (buf0[0], 0, 0x3201);
3009 #endif
3010
3011 #if defined IS_AMD || defined IS_GENERIC
3012 buf0[0] = (buf0[0] & 0xFFFF0000) | ((buf0[0] << 8) & 0x0000FF00) | ((buf0[0] >> 8) & 0x000000FF);
3013 #endif
3014
3015 return in_len;
3016 }
3017
3018 inline u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3019 {
3020 if (in_len < 2) return (in_len);
3021
3022 #ifdef IS_NV
3023 switch (in_len)
3024 {
3025 case 2: buf0[0] = __byte_perm_S (buf0[0], 0, 0x5401);
3026 break;
3027 case 3: buf0[0] = __byte_perm_S (buf0[0], 0, 0x4120);
3028 break;
3029 case 4: buf0[0] = __byte_perm_S (buf0[0], 0, 0x2310);
3030 break;
3031 case 5: buf0[1] = __byte_perm_S (buf0[1], buf0[0], 0x7210);
3032 buf0[0] = __byte_perm_S (buf0[0], buf0[1], 0x4210);
3033 buf0[1] = __byte_perm_S (buf0[1], 0, 0x6543);
3034 break;
3035 case 6: buf0[1] = __byte_perm_S (buf0[1], 0, 0x5401);
3036 break;
3037 case 7: buf0[1] = __byte_perm_S (buf0[1], 0, 0x4120);
3038 break;
3039 case 8: buf0[1] = __byte_perm_S (buf0[1], 0, 0x2310);
3040 break;
3041 case 9: buf0[2] = __byte_perm_S (buf0[2], buf0[1], 0x7210);
3042 buf0[1] = __byte_perm_S (buf0[1], buf0[2], 0x4210);
3043 buf0[2] = __byte_perm_S (buf0[2], 0, 0x6543);
3044 break;
3045 case 10: buf0[2] = __byte_perm_S (buf0[2], 0, 0x5401);
3046 break;
3047 case 11: buf0[2] = __byte_perm_S (buf0[2], 0, 0x4120);
3048 break;
3049 case 12: buf0[2] = __byte_perm_S (buf0[2], 0, 0x2310);
3050 break;
3051 case 13: buf0[3] = __byte_perm_S (buf0[3], buf0[2], 0x7210);
3052 buf0[2] = __byte_perm_S (buf0[2], buf0[3], 0x4210);
3053 buf0[3] = __byte_perm_S (buf0[3], 0, 0x6543);
3054 break;
3055 case 14: buf0[3] = __byte_perm_S (buf0[3], 0, 0x5401);
3056 break;
3057 case 15: buf0[3] = __byte_perm_S (buf0[3], 0, 0x4120);
3058 break;
3059 case 16: buf0[3] = __byte_perm_S (buf0[3], 0, 0x2310);
3060 break;
3061 case 17: buf1[0] = __byte_perm_S (buf1[0], buf0[3], 0x7210);
3062 buf0[3] = __byte_perm_S (buf0[3], buf1[0], 0x4210);
3063 buf1[0] = __byte_perm_S (buf1[0], 0, 0x6543);
3064 break;
3065 case 18: buf1[0] = __byte_perm_S (buf1[0], 0, 0x5401);
3066 break;
3067 case 19: buf1[0] = __byte_perm_S (buf1[0], 0, 0x4120);
3068 break;
3069 case 20: buf1[0] = __byte_perm_S (buf1[0], 0, 0x2310);
3070 break;
3071 case 21: buf1[1] = __byte_perm_S (buf1[1], buf1[0], 0x7210);
3072 buf1[0] = __byte_perm_S (buf1[0], buf1[1], 0x4210);
3073 buf1[1] = __byte_perm_S (buf1[1], 0, 0x6543);
3074 break;
3075 case 22: buf1[1] = __byte_perm_S (buf1[1], 0, 0x5401);
3076 break;
3077 case 23: buf1[1] = __byte_perm_S (buf1[1], 0, 0x4120);
3078 break;
3079 case 24: buf1[1] = __byte_perm_S (buf1[1], 0, 0x2310);
3080 break;
3081 case 25: buf1[2] = __byte_perm_S (buf1[2], buf1[1], 0x7210);
3082 buf1[1] = __byte_perm_S (buf1[1], buf1[2], 0x4210);
3083 buf1[2] = __byte_perm_S (buf1[2], 0, 0x6543);
3084 break;
3085 case 26: buf1[2] = __byte_perm_S (buf1[2], 0, 0x5401);
3086 break;
3087 case 27: buf1[2] = __byte_perm_S (buf1[2], 0, 0x4120);
3088 break;
3089 case 28: buf1[2] = __byte_perm_S (buf1[2], 0, 0x2310);
3090 break;
3091 case 29: buf1[3] = __byte_perm_S (buf1[3], buf1[2], 0x7210);
3092 buf1[2] = __byte_perm_S (buf1[2], buf1[3], 0x4210);
3093 buf1[3] = __byte_perm_S (buf1[3], 0, 0x6543);
3094 break;
3095 case 30: buf1[3] = __byte_perm_S (buf1[3], 0, 0x5401);
3096 break;
3097 case 31: buf1[3] = __byte_perm_S (buf1[3], 0, 0x4120);
3098 break;
3099 }
3100 #endif
3101
3102 #if defined IS_AMD || defined IS_GENERIC
3103 switch (in_len)
3104 {
3105 case 2: buf0[0] = ((buf0[0] << 8) & 0x0000FF00) | ((buf0[0] >> 8) & 0x000000FF);
3106 break;
3107 case 3: buf0[0] = (buf0[0] & 0x000000FF) | ((buf0[0] << 8) & 0x00FF0000) | ((buf0[0] >> 8) & 0x0000FF00);
3108 break;
3109 case 4: buf0[0] = (buf0[0] & 0x0000FFFF) | ((buf0[0] << 8) & 0xFF000000) | ((buf0[0] >> 8) & 0x00FF0000);
3110 break;
3111 case 5: buf0[1] = (buf0[0] & 0xFF000000) | buf0[1];
3112 buf0[0] = (buf0[0] & 0x00FFFFFF) | (buf0[1] << 24);
3113 buf0[1] = (buf0[1] >> 24);
3114 break;
3115 case 6: buf0[1] = ((buf0[1] << 8) & 0x0000FF00) | ((buf0[1] >> 8) & 0x000000FF);
3116 break;
3117 case 7: buf0[1] = (buf0[1] & 0x000000FF) | ((buf0[1] << 8) & 0x00FF0000) | ((buf0[1] >> 8) & 0x0000FF00);
3118 break;
3119 case 8: buf0[1] = (buf0[1] & 0x0000FFFF) | ((buf0[1] << 8) & 0xFF000000) | ((buf0[1] >> 8) & 0x00FF0000);
3120 break;
3121 case 9: buf0[2] = (buf0[1] & 0xFF000000) | buf0[2];
3122 buf0[1] = (buf0[1] & 0x00FFFFFF) | (buf0[2] << 24);
3123 buf0[2] = (buf0[2] >> 24);
3124 break;
3125 case 10: buf0[2] = ((buf0[2] << 8) & 0x0000FF00) | ((buf0[2] >> 8) & 0x000000FF);
3126 break;
3127 case 11: buf0[2] = (buf0[2] & 0x000000FF) | ((buf0[2] << 8) & 0x00FF0000) | ((buf0[2] >> 8) & 0x0000FF00);
3128 break;
3129 case 12: buf0[2] = (buf0[2] & 0x0000FFFF) | ((buf0[2] << 8) & 0xFF000000) | ((buf0[2] >> 8) & 0x00FF0000);
3130 break;
3131 case 13: buf0[3] = (buf0[2] & 0xFF000000) | buf0[3];
3132 buf0[2] = (buf0[2] & 0x00FFFFFF) | (buf0[3] << 24);
3133 buf0[3] = (buf0[3] >> 24);
3134 break;
3135 case 14: buf0[3] = ((buf0[3] << 8) & 0x0000FF00) | ((buf0[3] >> 8) & 0x000000FF);
3136 break;
3137 case 15: buf0[3] = (buf0[3] & 0x000000FF) | ((buf0[3] << 8) & 0x00FF0000) | ((buf0[3] >> 8) & 0x0000FF00);
3138 break;
3139 case 16: buf0[3] = (buf0[3] & 0x0000FFFF) | ((buf0[3] << 8) & 0xFF000000) | ((buf0[3] >> 8) & 0x00FF0000);
3140 break;
3141 case 17: buf1[0] = (buf0[3] & 0xFF000000) | buf1[0];
3142 buf0[3] = (buf0[3] & 0x00FFFFFF) | (buf1[0] << 24);
3143 buf1[0] = (buf1[0] >> 24);
3144 break;
3145 case 18: buf1[0] = ((buf1[0] << 8) & 0x0000FF00) | ((buf1[0] >> 8) & 0x000000FF);
3146 break;
3147 case 19: buf1[0] = (buf1[0] & 0x000000FF) | ((buf1[0] << 8) & 0x00FF0000) | ((buf1[0] >> 8) & 0x0000FF00);
3148 break;
3149 case 20: buf1[0] = (buf1[0] & 0x0000FFFF) | ((buf1[0] << 8) & 0xFF000000) | ((buf1[0] >> 8) & 0x00FF0000);
3150 break;
3151 case 21: buf1[1] = (buf1[0] & 0xFF000000) | buf1[1];
3152 buf1[0] = (buf1[0] & 0x00FFFFFF) | (buf1[1] << 24);
3153 buf1[1] = (buf1[1] >> 24);
3154 break;
3155 case 22: buf1[1] = ((buf1[1] << 8) & 0x0000FF00) | ((buf1[1] >> 8) & 0x000000FF);
3156 break;
3157 case 23: buf1[1] = (buf1[1] & 0x000000FF) | ((buf1[1] << 8) & 0x00FF0000) | ((buf1[1] >> 8) & 0x0000FF00);
3158 break;
3159 case 24: buf1[1] = (buf1[1] & 0x0000FFFF) | ((buf1[1] << 8) & 0xFF000000) | ((buf1[1] >> 8) & 0x00FF0000);
3160 break;
3161 case 25: buf1[2] = (buf1[1] & 0xFF000000) | buf1[2];
3162 buf1[1] = (buf1[1] & 0x00FFFFFF) | (buf1[2] << 24);
3163 buf1[2] = (buf1[2] >> 24);
3164 break;
3165 case 26: buf1[2] = ((buf1[2] << 8) & 0x0000FF00) | ((buf1[2] >> 8) & 0x000000FF);
3166 break;
3167 case 27: buf1[2] = (buf1[2] & 0x000000FF) | ((buf1[2] << 8) & 0x00FF0000) | ((buf1[2] >> 8) & 0x0000FF00);
3168 break;
3169 case 28: buf1[2] = (buf1[2] & 0x0000FFFF) | ((buf1[2] << 8) & 0xFF000000) | ((buf1[2] >> 8) & 0x00FF0000);
3170 break;
3171 case 29: buf1[3] = (buf1[2] & 0xFF000000) | buf1[3];
3172 buf1[2] = (buf1[2] & 0x00FFFFFF) | (buf1[3] << 24);
3173 buf1[3] = (buf1[3] >> 24);
3174 break;
3175 case 30: buf1[3] = ((buf1[3] << 8) & 0x0000FF00) | ((buf1[3] >> 8) & 0x000000FF);
3176 break;
3177 case 31: buf1[3] = (buf1[3] & 0x000000FF) | ((buf1[3] << 8) & 0x00FF0000) | ((buf1[3] >> 8) & 0x0000FF00);
3178 break;
3179 }
3180 #endif
3181
3182 return in_len;
3183 }
3184
3185 inline u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3186 {
3187 if (p0 >= in_len) return (in_len);
3188 if (p1 >= in_len) return (in_len);
3189
3190 u32 tmp0 = 0;
3191 u32 tmp1 = 0;
3192
3193 #ifdef IS_NV
3194 switch (p0)
3195 {
3196 case 0: tmp0 = __byte_perm_S (buf0[0], 0, 0x6540);
3197 break;
3198 case 1: tmp0 = __byte_perm_S (buf0[0], 0, 0x6541);
3199 break;
3200 case 2: tmp0 = __byte_perm_S (buf0[0], 0, 0x6542);
3201 break;
3202 case 3: tmp0 = __byte_perm_S (buf0[0], 0, 0x6543);
3203 break;
3204 case 4: tmp0 = __byte_perm_S (buf0[1], 0, 0x6540);
3205 break;
3206 case 5: tmp0 = __byte_perm_S (buf0[1], 0, 0x6541);
3207 break;
3208 case 6: tmp0 = __byte_perm_S (buf0[1], 0, 0x6542);
3209 break;
3210 case 7: tmp0 = __byte_perm_S (buf0[1], 0, 0x6543);
3211 break;
3212 case 8: tmp0 = __byte_perm_S (buf0[2], 0, 0x6540);
3213 break;
3214 case 9: tmp0 = __byte_perm_S (buf0[2], 0, 0x6541);
3215 break;
3216 case 10: tmp0 = __byte_perm_S (buf0[2], 0, 0x6542);
3217 break;
3218 case 11: tmp0 = __byte_perm_S (buf0[2], 0, 0x6543);
3219 break;
3220 case 12: tmp0 = __byte_perm_S (buf0[3], 0, 0x6540);
3221 break;
3222 case 13: tmp0 = __byte_perm_S (buf0[3], 0, 0x6541);
3223 break;
3224 case 14: tmp0 = __byte_perm_S (buf0[3], 0, 0x6542);
3225 break;
3226 case 15: tmp0 = __byte_perm_S (buf0[3], 0, 0x6543);
3227 break;
3228 case 16: tmp0 = __byte_perm_S (buf1[0], 0, 0x6540);
3229 break;
3230 case 17: tmp0 = __byte_perm_S (buf1[0], 0, 0x6541);
3231 break;
3232 case 18: tmp0 = __byte_perm_S (buf1[0], 0, 0x6542);
3233 break;
3234 case 19: tmp0 = __byte_perm_S (buf1[0], 0, 0x6543);
3235 break;
3236 case 20: tmp0 = __byte_perm_S (buf1[1], 0, 0x6540);
3237 break;
3238 case 21: tmp0 = __byte_perm_S (buf1[1], 0, 0x6541);
3239 break;
3240 case 22: tmp0 = __byte_perm_S (buf1[1], 0, 0x6542);
3241 break;
3242 case 23: tmp0 = __byte_perm_S (buf1[1], 0, 0x6543);
3243 break;
3244 case 24: tmp0 = __byte_perm_S (buf1[2], 0, 0x6540);
3245 break;
3246 case 25: tmp0 = __byte_perm_S (buf1[2], 0, 0x6541);
3247 break;
3248 case 26: tmp0 = __byte_perm_S (buf1[2], 0, 0x6542);
3249 break;
3250 case 27: tmp0 = __byte_perm_S (buf1[2], 0, 0x6543);
3251 break;
3252 case 28: tmp0 = __byte_perm_S (buf1[3], 0, 0x6540);
3253 break;
3254 case 29: tmp0 = __byte_perm_S (buf1[3], 0, 0x6541);
3255 break;
3256 case 30: tmp0 = __byte_perm_S (buf1[3], 0, 0x6542);
3257 break;
3258 case 31: tmp0 = __byte_perm_S (buf1[3], 0, 0x6543);
3259 break;
3260 }
3261
3262 switch (p1)
3263 {
3264 case 0: tmp1 = __byte_perm_S (buf0[0], 0, 0x6540);
3265 buf0[0] = __byte_perm_S (tmp0, buf0[0], 0x7650);
3266 break;
3267 case 1: tmp1 = __byte_perm_S (buf0[0], 0, 0x6541);
3268 buf0[0] = __byte_perm_S (tmp0, buf0[0], 0x7604);
3269 break;
3270 case 2: tmp1 = __byte_perm_S (buf0[0], 0, 0x6542);
3271 buf0[0] = __byte_perm_S (tmp0, buf0[0], 0x7054);
3272 break;
3273 case 3: tmp1 = __byte_perm_S (buf0[0], 0, 0x6543);
3274 buf0[0] = __byte_perm_S (tmp0, buf0[0], 0x0654);
3275 break;
3276 case 4: tmp1 = __byte_perm_S (buf0[1], 0, 0x6540);
3277 buf0[1] = __byte_perm_S (tmp0, buf0[1], 0x7650);
3278 break;
3279 case 5: tmp1 = __byte_perm_S (buf0[1], 0, 0x6541);
3280 buf0[1] = __byte_perm_S (tmp0, buf0[1], 0x7604);
3281 break;
3282 case 6: tmp1 = __byte_perm_S (buf0[1], 0, 0x6542);
3283 buf0[1] = __byte_perm_S (tmp0, buf0[1], 0x7054);
3284 break;
3285 case 7: tmp1 = __byte_perm_S (buf0[1], 0, 0x6543);
3286 buf0[1] = __byte_perm_S (tmp0, buf0[1], 0x0654);
3287 break;
3288 case 8: tmp1 = __byte_perm_S (buf0[2], 0, 0x6540);
3289 buf0[2] = __byte_perm_S (tmp0, buf0[2], 0x7650);
3290 break;
3291 case 9: tmp1 = __byte_perm_S (buf0[2], 0, 0x6541);
3292 buf0[2] = __byte_perm_S (tmp0, buf0[2], 0x7604);
3293 break;
3294 case 10: tmp1 = __byte_perm_S (buf0[2], 0, 0x6542);
3295 buf0[2] = __byte_perm_S (tmp0, buf0[2], 0x7054);
3296 break;
3297 case 11: tmp1 = __byte_perm_S (buf0[2], 0, 0x6543);
3298 buf0[2] = __byte_perm_S (tmp0, buf0[2], 0x0654);
3299 break;
3300 case 12: tmp1 = __byte_perm_S (buf0[3], 0, 0x6540);
3301 buf0[3] = __byte_perm_S (tmp0, buf0[3], 0x7650);
3302 break;
3303 case 13: tmp1 = __byte_perm_S (buf0[3], 0, 0x6541);
3304 buf0[3] = __byte_perm_S (tmp0, buf0[3], 0x7604);
3305 break;
3306 case 14: tmp1 = __byte_perm_S (buf0[3], 0, 0x6542);
3307 buf0[3] = __byte_perm_S (tmp0, buf0[3], 0x7054);
3308 break;
3309 case 15: tmp1 = __byte_perm_S (buf0[3], 0, 0x6543);
3310 buf0[3] = __byte_perm_S (tmp0, buf0[3], 0x0654);
3311 break;
3312 case 16: tmp1 = __byte_perm_S (buf1[0], 0, 0x6540);
3313 buf1[0] = __byte_perm_S (tmp0, buf1[0], 0x7650);
3314 break;
3315 case 17: tmp1 = __byte_perm_S (buf1[0], 0, 0x6541);
3316 buf1[0] = __byte_perm_S (tmp0, buf1[0], 0x7604);
3317 break;
3318 case 18: tmp1 = __byte_perm_S (buf1[0], 0, 0x6542);
3319 buf1[0] = __byte_perm_S (tmp0, buf1[0], 0x7054);
3320 break;
3321 case 19: tmp1 = __byte_perm_S (buf1[0], 0, 0x6543);
3322 buf1[0] = __byte_perm_S (tmp0, buf1[0], 0x0654);
3323 break;
3324 case 20: tmp1 = __byte_perm_S (buf1[1], 0, 0x6540);
3325 buf1[1] = __byte_perm_S (tmp0, buf1[1], 0x7650);
3326 break;
3327 case 21: tmp1 = __byte_perm_S (buf1[1], 0, 0x6541);
3328 buf1[1] = __byte_perm_S (tmp0, buf1[1], 0x7604);
3329 break;
3330 case 22: tmp1 = __byte_perm_S (buf1[1], 0, 0x6542);
3331 buf1[1] = __byte_perm_S (tmp0, buf1[1], 0x7054);
3332 break;
3333 case 23: tmp1 = __byte_perm_S (buf1[1], 0, 0x6543);
3334 buf1[1] = __byte_perm_S (tmp0, buf1[1], 0x0654);
3335 break;
3336 case 24: tmp1 = __byte_perm_S (buf1[2], 0, 0x6540);
3337 buf1[2] = __byte_perm_S (tmp0, buf1[2], 0x7650);
3338 break;
3339 case 25: tmp1 = __byte_perm_S (buf1[2], 0, 0x6541);
3340 buf1[2] = __byte_perm_S (tmp0, buf1[2], 0x7604);
3341 break;
3342 case 26: tmp1 = __byte_perm_S (buf1[2], 0, 0x6542);
3343 buf1[2] = __byte_perm_S (tmp0, buf1[2], 0x7054);
3344 break;
3345 case 27: tmp1 = __byte_perm_S (buf1[2], 0, 0x6543);
3346 buf1[2] = __byte_perm_S (tmp0, buf1[2], 0x0654);
3347 break;
3348 case 28: tmp1 = __byte_perm_S (buf1[3], 0, 0x6540);
3349 buf1[3] = __byte_perm_S (tmp0, buf1[3], 0x7650);
3350 break;
3351 case 29: tmp1 = __byte_perm_S (buf1[3], 0, 0x6541);
3352 buf1[3] = __byte_perm_S (tmp0, buf1[3], 0x7604);
3353 break;
3354 case 30: tmp1 = __byte_perm_S (buf1[3], 0, 0x6542);
3355 buf1[3] = __byte_perm_S (tmp0, buf1[3], 0x7054);
3356 break;
3357 case 31: tmp1 = __byte_perm_S (buf1[3], 0, 0x6543);
3358 buf1[3] = __byte_perm_S (tmp0, buf1[3], 0x0654);
3359 break;
3360 }
3361
3362 switch (p0)
3363 {
3364 case 0: buf0[0] = __byte_perm_S (tmp1, buf0[0], 0x7650);
3365 break;
3366 case 1: buf0[0] = __byte_perm_S (tmp1, buf0[0], 0x7604);
3367 break;
3368 case 2: buf0[0] = __byte_perm_S (tmp1, buf0[0], 0x7054);
3369 break;
3370 case 3: buf0[0] = __byte_perm_S (tmp1, buf0[0], 0x0654);
3371 break;
3372 case 4: buf0[1] = __byte_perm_S (tmp1, buf0[1], 0x7650);
3373 break;
3374 case 5: buf0[1] = __byte_perm_S (tmp1, buf0[1], 0x7604);
3375 break;
3376 case 6: buf0[1] = __byte_perm_S (tmp1, buf0[1], 0x7054);
3377 break;
3378 case 7: buf0[1] = __byte_perm_S (tmp1, buf0[1], 0x0654);
3379 break;
3380 case 8: buf0[2] = __byte_perm_S (tmp1, buf0[2], 0x7650);
3381 break;
3382 case 9: buf0[2] = __byte_perm_S (tmp1, buf0[2], 0x7604);
3383 break;
3384 case 10: buf0[2] = __byte_perm_S (tmp1, buf0[2], 0x7054);
3385 break;
3386 case 11: buf0[2] = __byte_perm_S (tmp1, buf0[2], 0x0654);
3387 break;
3388 case 12: buf0[3] = __byte_perm_S (tmp1, buf0[3], 0x7650);
3389 break;
3390 case 13: buf0[3] = __byte_perm_S (tmp1, buf0[3], 0x7604);
3391 break;
3392 case 14: buf0[3] = __byte_perm_S (tmp1, buf0[3], 0x7054);
3393 break;
3394 case 15: buf0[3] = __byte_perm_S (tmp1, buf0[3], 0x0654);
3395 break;
3396 case 16: buf1[0] = __byte_perm_S (tmp1, buf1[0], 0x7650);
3397 break;
3398 case 17: buf1[0] = __byte_perm_S (tmp1, buf1[0], 0x7604);
3399 break;
3400 case 18: buf1[0] = __byte_perm_S (tmp1, buf1[0], 0x7054);
3401 break;
3402 case 19: buf1[0] = __byte_perm_S (tmp1, buf1[0], 0x0654);
3403 break;
3404 case 20: buf1[1] = __byte_perm_S (tmp1, buf1[1], 0x7650);
3405 break;
3406 case 21: buf1[1] = __byte_perm_S (tmp1, buf1[1], 0x7604);
3407 break;
3408 case 22: buf1[1] = __byte_perm_S (tmp1, buf1[1], 0x7054);
3409 break;
3410 case 23: buf1[1] = __byte_perm_S (tmp1, buf1[1], 0x0654);
3411 break;
3412 case 24: buf1[2] = __byte_perm_S (tmp1, buf1[2], 0x7650);
3413 break;
3414 case 25: buf1[2] = __byte_perm_S (tmp1, buf1[2], 0x7604);
3415 break;
3416 case 26: buf1[2] = __byte_perm_S (tmp1, buf1[2], 0x7054);
3417 break;
3418 case 27: buf1[2] = __byte_perm_S (tmp1, buf1[2], 0x0654);
3419 break;
3420 case 28: buf1[3] = __byte_perm_S (tmp1, buf1[3], 0x7650);
3421 break;
3422 case 29: buf1[3] = __byte_perm_S (tmp1, buf1[3], 0x7604);
3423 break;
3424 case 30: buf1[3] = __byte_perm_S (tmp1, buf1[3], 0x7054);
3425 break;
3426 case 31: buf1[3] = __byte_perm_S (tmp1, buf1[3], 0x0654);
3427 break;
3428 }
3429 #endif
3430
3431 #if defined IS_AMD || defined IS_GENERIC
3432 switch (p0)
3433 {
3434 case 0: tmp0 = (buf0[0] >> 0) & 0xFF;
3435 break;
3436 case 1: tmp0 = (buf0[0] >> 8) & 0xFF;
3437 break;
3438 case 2: tmp0 = (buf0[0] >> 16) & 0xFF;
3439 break;
3440 case 3: tmp0 = (buf0[0] >> 24) & 0xFF;
3441 break;
3442 case 4: tmp0 = (buf0[1] >> 0) & 0xFF;
3443 break;
3444 case 5: tmp0 = (buf0[1] >> 8) & 0xFF;
3445 break;
3446 case 6: tmp0 = (buf0[1] >> 16) & 0xFF;
3447 break;
3448 case 7: tmp0 = (buf0[1] >> 24) & 0xFF;
3449 break;
3450 case 8: tmp0 = (buf0[2] >> 0) & 0xFF;
3451 break;
3452 case 9: tmp0 = (buf0[2] >> 8) & 0xFF;
3453 break;
3454 case 10: tmp0 = (buf0[2] >> 16) & 0xFF;
3455 break;
3456 case 11: tmp0 = (buf0[2] >> 24) & 0xFF;
3457 break;
3458 case 12: tmp0 = (buf0[3] >> 0) & 0xFF;
3459 break;
3460 case 13: tmp0 = (buf0[3] >> 8) & 0xFF;
3461 break;
3462 case 14: tmp0 = (buf0[3] >> 16) & 0xFF;
3463 break;
3464 case 15: tmp0 = (buf0[3] >> 24) & 0xFF;
3465 break;
3466 case 16: tmp0 = (buf1[0] >> 0) & 0xFF;
3467 break;
3468 case 17: tmp0 = (buf1[0] >> 8) & 0xFF;
3469 break;
3470 case 18: tmp0 = (buf1[0] >> 16) & 0xFF;
3471 break;
3472 case 19: tmp0 = (buf1[0] >> 24) & 0xFF;
3473 break;
3474 case 20: tmp0 = (buf1[1] >> 0) & 0xFF;
3475 break;
3476 case 21: tmp0 = (buf1[1] >> 8) & 0xFF;
3477 break;
3478 case 22: tmp0 = (buf1[1] >> 16) & 0xFF;
3479 break;
3480 case 23: tmp0 = (buf1[1] >> 24) & 0xFF;
3481 break;
3482 case 24: tmp0 = (buf1[2] >> 0) & 0xFF;
3483 break;
3484 case 25: tmp0 = (buf1[2] >> 8) & 0xFF;
3485 break;
3486 case 26: tmp0 = (buf1[2] >> 16) & 0xFF;
3487 break;
3488 case 27: tmp0 = (buf1[2] >> 24) & 0xFF;
3489 break;
3490 case 28: tmp0 = (buf1[3] >> 0) & 0xFF;
3491 break;
3492 case 29: tmp0 = (buf1[3] >> 8) & 0xFF;
3493 break;
3494 case 30: tmp0 = (buf1[3] >> 16) & 0xFF;
3495 break;
3496 case 31: tmp0 = (buf1[3] >> 24) & 0xFF;
3497 break;
3498 }
3499
3500 switch (p1)
3501 {
3502 case 0: tmp1 = (buf0[0] >> 0) & 0xff;
3503 buf0[0] = (buf0[0] & 0xffffff00) | tmp0 << 0;
3504 break;
3505 case 1: tmp1 = (buf0[0] >> 8) & 0xff;
3506 buf0[0] = (buf0[0] & 0xffff00ff) | tmp0 << 8;
3507 break;
3508 case 2: tmp1 = (buf0[0] >> 16) & 0xff;
3509 buf0[0] = (buf0[0] & 0xff00ffff) | tmp0 << 16;
3510 break;
3511 case 3: tmp1 = (buf0[0] >> 24) & 0xff;
3512 buf0[0] = (buf0[0] & 0x00ffffff) | tmp0 << 24;
3513 break;
3514 case 4: tmp1 = (buf0[1] >> 0) & 0xff;
3515 buf0[1] = (buf0[1] & 0xffffff00) | tmp0 << 0;
3516 break;
3517 case 5: tmp1 = (buf0[1] >> 8) & 0xff;
3518 buf0[1] = (buf0[1] & 0xffff00ff) | tmp0 << 8;
3519 break;
3520 case 6: tmp1 = (buf0[1] >> 16) & 0xff;
3521 buf0[1] = (buf0[1] & 0xff00ffff) | tmp0 << 16;
3522 break;
3523 case 7: tmp1 = (buf0[1] >> 24) & 0xff;
3524 buf0[1] = (buf0[1] & 0x00ffffff) | tmp0 << 24;
3525 break;
3526 case 8: tmp1 = (buf0[2] >> 0) & 0xff;
3527 buf0[2] = (buf0[2] & 0xffffff00) | tmp0 << 0;
3528 break;
3529 case 9: tmp1 = (buf0[2] >> 8) & 0xff;
3530 buf0[2] = (buf0[2] & 0xffff00ff) | tmp0 << 8;
3531 break;
3532 case 10: tmp1 = (buf0[2] >> 16) & 0xff;
3533 buf0[2] = (buf0[2] & 0xff00ffff) | tmp0 << 16;
3534 break;
3535 case 11: tmp1 = (buf0[2] >> 24) & 0xff;
3536 buf0[2] = (buf0[2] & 0x00ffffff) | tmp0 << 24;
3537 break;
3538 case 12: tmp1 = (buf0[3] >> 0) & 0xff;
3539 buf0[3] = (buf0[3] & 0xffffff00) | tmp0 << 0;
3540 break;
3541 case 13: tmp1 = (buf0[3] >> 8) & 0xff;
3542 buf0[3] = (buf0[3] & 0xffff00ff) | tmp0 << 8;
3543 break;
3544 case 14: tmp1 = (buf0[3] >> 16) & 0xff;
3545 buf0[3] = (buf0[3] & 0xff00ffff) | tmp0 << 16;
3546 break;
3547 case 15: tmp1 = (buf0[3] >> 24) & 0xff;
3548 buf0[3] = (buf0[3] & 0x00ffffff) | tmp0 << 24;
3549 break;
3550 case 16: tmp1 = (buf1[0] >> 0) & 0xff;
3551 buf1[0] = (buf1[0] & 0xffffff00) | tmp0 << 0;
3552 break;
3553 case 17: tmp1 = (buf1[0] >> 8) & 0xff;
3554 buf1[0] = (buf1[0] & 0xffff00ff) | tmp0 << 8;
3555 break;
3556 case 18: tmp1 = (buf1[0] >> 16) & 0xff;
3557 buf1[0] = (buf1[0] & 0xff00ffff) | tmp0 << 16;
3558 break;
3559 case 19: tmp1 = (buf1[0] >> 24) & 0xff;
3560 buf1[0] = (buf1[0] & 0x00ffffff) | tmp0 << 24;
3561 break;
3562 case 20: tmp1 = (buf1[1] >> 0) & 0xff;
3563 buf1[1] = (buf1[1] & 0xffffff00) | tmp0 << 0;
3564 break;
3565 case 21: tmp1 = (buf1[1] >> 8) & 0xff;
3566 buf1[1] = (buf1[1] & 0xffff00ff) | tmp0 << 8;
3567 break;
3568 case 22: tmp1 = (buf1[1] >> 16) & 0xff;
3569 buf1[1] = (buf1[1] & 0xff00ffff) | tmp0 << 16;
3570 break;
3571 case 23: tmp1 = (buf1[1] >> 24) & 0xff;
3572 buf1[1] = (buf1[1] & 0x00ffffff) | tmp0 << 24;
3573 break;
3574 case 24: tmp1 = (buf1[2] >> 0) & 0xff;
3575 buf1[2] = (buf1[2] & 0xffffff00) | tmp0 << 0;
3576 break;
3577 case 25: tmp1 = (buf1[2] >> 8) & 0xff;
3578 buf1[2] = (buf1[2] & 0xffff00ff) | tmp0 << 8;
3579 break;
3580 case 26: tmp1 = (buf1[2] >> 16) & 0xff;
3581 buf1[2] = (buf1[2] & 0xff00ffff) | tmp0 << 16;
3582 break;
3583 case 27: tmp1 = (buf1[2] >> 24) & 0xff;
3584 buf1[2] = (buf1[2] & 0x00ffffff) | tmp0 << 24;
3585 break;
3586 case 28: tmp1 = (buf1[3] >> 0) & 0xff;
3587 buf1[3] = (buf1[3] & 0xffffff00) | tmp0 << 0;
3588 break;
3589 case 29: tmp1 = (buf1[3] >> 8) & 0xff;
3590 buf1[3] = (buf1[3] & 0xffff00ff) | tmp0 << 8;
3591 break;
3592 case 30: tmp1 = (buf1[3] >> 16) & 0xff;
3593 buf1[3] = (buf1[3] & 0xff00ffff) | tmp0 << 16;
3594 break;
3595 case 31: tmp1 = (buf1[3] >> 24) & 0xff;
3596 buf1[3] = (buf1[3] & 0x00ffffff) | tmp0 << 24;
3597 break;
3598 }
3599
3600 switch (p0)
3601 {
3602 case 0: buf0[0] = (buf0[0] & 0xffffff00) | tmp1 << 0;
3603 break;
3604 case 1: buf0[0] = (buf0[0] & 0xffff00ff) | tmp1 << 8;
3605 break;
3606 case 2: buf0[0] = (buf0[0] & 0xff00ffff) | tmp1 << 16;
3607 break;
3608 case 3: buf0[0] = (buf0[0] & 0x00ffffff) | tmp1 << 24;
3609 break;
3610 case 4: buf0[1] = (buf0[1] & 0xffffff00) | tmp1 << 0;
3611 break;
3612 case 5: buf0[1] = (buf0[1] & 0xffff00ff) | tmp1 << 8;
3613 break;
3614 case 6: buf0[1] = (buf0[1] & 0xff00ffff) | tmp1 << 16;
3615 break;
3616 case 7: buf0[1] = (buf0[1] & 0x00ffffff) | tmp1 << 24;
3617 break;
3618 case 8: buf0[2] = (buf0[2] & 0xffffff00) | tmp1 << 0;
3619 break;
3620 case 9: buf0[2] = (buf0[2] & 0xffff00ff) | tmp1 << 8;
3621 break;
3622 case 10: buf0[2] = (buf0[2] & 0xff00ffff) | tmp1 << 16;
3623 break;
3624 case 11: buf0[2] = (buf0[2] & 0x00ffffff) | tmp1 << 24;
3625 break;
3626 case 12: buf0[3] = (buf0[3] & 0xffffff00) | tmp1 << 0;
3627 break;
3628 case 13: buf0[3] = (buf0[3] & 0xffff00ff) | tmp1 << 8;
3629 break;
3630 case 14: buf0[3] = (buf0[3] & 0xff00ffff) | tmp1 << 16;
3631 break;
3632 case 15: buf0[3] = (buf0[3] & 0x00ffffff) | tmp1 << 24;
3633 break;
3634 case 16: buf1[0] = (buf1[0] & 0xffffff00) | tmp1 << 0;
3635 break;
3636 case 17: buf1[0] = (buf1[0] & 0xffff00ff) | tmp1 << 8;
3637 break;
3638 case 18: buf1[0] = (buf1[0] & 0xff00ffff) | tmp1 << 16;
3639 break;
3640 case 19: buf1[0] = (buf1[0] & 0x00ffffff) | tmp1 << 24;
3641 break;
3642 case 20: buf1[1] = (buf1[1] & 0xffffff00) | tmp1 << 0;
3643 break;
3644 case 21: buf1[1] = (buf1[1] & 0xffff00ff) | tmp1 << 8;
3645 break;
3646 case 22: buf1[1] = (buf1[1] & 0xff00ffff) | tmp1 << 16;
3647 break;
3648 case 23: buf1[1] = (buf1[1] & 0x00ffffff) | tmp1 << 24;
3649 break;
3650 case 24: buf1[2] = (buf1[2] & 0xffffff00) | tmp1 << 0;
3651 break;
3652 case 25: buf1[2] = (buf1[2] & 0xffff00ff) | tmp1 << 8;
3653 break;
3654 case 26: buf1[2] = (buf1[2] & 0xff00ffff) | tmp1 << 16;
3655 break;
3656 case 27: buf1[2] = (buf1[2] & 0x00ffffff) | tmp1 << 24;
3657 break;
3658 case 28: buf1[3] = (buf1[3] & 0xffffff00) | tmp1 << 0;
3659 break;
3660 case 29: buf1[3] = (buf1[3] & 0xffff00ff) | tmp1 << 8;
3661 break;
3662 case 30: buf1[3] = (buf1[3] & 0xff00ffff) | tmp1 << 16;
3663 break;
3664 case 31: buf1[3] = (buf1[3] & 0x00ffffff) | tmp1 << 24;
3665 break;
3666 }
3667 #endif
3668
3669 return in_len;
3670 }
3671
3672 inline u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3673 {
3674 if (p0 >= in_len) return (in_len);
3675
3676 const u32 mr = 0xffu << ((p0 & 3) * 8);
3677 const u32 ml = ~mr;
3678
3679 switch (p0 / 4)
3680 {
3681 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) << 1) & mr); break;
3682 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) << 1) & mr); break;
3683 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) << 1) & mr); break;
3684 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) << 1) & mr); break;
3685 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) << 1) & mr); break;
3686 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) << 1) & mr); break;
3687 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) << 1) & mr); break;
3688 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) << 1) & mr); break;
3689 }
3690
3691 return in_len;
3692 }
3693
3694 inline u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3695 {
3696 if (p0 >= in_len) return (in_len);
3697
3698 const u32 mr = 0xffu << ((p0 & 3) * 8);
3699 const u32 ml = ~mr;
3700
3701 switch (p0 / 4)
3702 {
3703 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) >> 1) & mr); break;
3704 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) >> 1) & mr); break;
3705 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) >> 1) & mr); break;
3706 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) >> 1) & mr); break;
3707 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) >> 1) & mr); break;
3708 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) >> 1) & mr); break;
3709 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) >> 1) & mr); break;
3710 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) >> 1) & mr); break;
3711 }
3712
3713 return in_len;
3714 }
3715
3716 inline u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3717 {
3718 if (p0 >= in_len) return (in_len);
3719
3720 const u32 mr = 0xffu << ((p0 & 3) * 8);
3721 const u32 ml = ~mr;
3722
3723 const u32 n = 0x01010101 & mr;
3724
3725 switch (p0 / 4)
3726 {
3727 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) + n) & mr); break;
3728 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) + n) & mr); break;
3729 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) + n) & mr); break;
3730 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) + n) & mr); break;
3731 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) + n) & mr); break;
3732 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) + n) & mr); break;
3733 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) + n) & mr); break;
3734 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) + n) & mr); break;
3735 }
3736
3737 return in_len;
3738 }
3739
3740 inline u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3741 {
3742 if (p0 >= in_len) return (in_len);
3743
3744 const u32 mr = 0xffu << ((p0 & 3) * 8);
3745 const u32 ml = ~mr;
3746
3747 const u32 n = 0x01010101 & mr;
3748
3749 switch (p0 / 4)
3750 {
3751 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) - n) & mr); break;
3752 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) - n) & mr); break;
3753 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) - n) & mr); break;
3754 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) - n) & mr); break;
3755 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) - n) & mr); break;
3756 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) - n) & mr); break;
3757 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) - n) & mr); break;
3758 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) - n) & mr); break;
3759 }
3760
3761 return in_len;
3762 }
3763
3764 inline u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3765 {
3766 if ((p0 + 1) >= in_len) return (in_len);
3767
3768 u32 tib40[4];
3769 u32 tib41[4];
3770
3771 lshift_block (buf0, buf1, tib40, tib41);
3772
3773 const u32 mr = 0xffu << ((p0 & 3) * 8);
3774 const u32 ml = ~mr;
3775
3776 switch (p0 / 4)
3777 {
3778 case 0: buf0[0] = (buf0[0] & ml) | (tib40[0] & mr); break;
3779 case 1: buf0[1] = (buf0[1] & ml) | (tib40[1] & mr); break;
3780 case 2: buf0[2] = (buf0[2] & ml) | (tib40[2] & mr); break;
3781 case 3: buf0[3] = (buf0[3] & ml) | (tib40[3] & mr); break;
3782 case 4: buf1[0] = (buf1[0] & ml) | (tib41[0] & mr); break;
3783 case 5: buf1[1] = (buf1[1] & ml) | (tib41[1] & mr); break;
3784 case 6: buf1[2] = (buf1[2] & ml) | (tib41[2] & mr); break;
3785 case 7: buf1[3] = (buf1[3] & ml) | (tib41[3] & mr); break;
3786 }
3787
3788 return in_len;
3789 }
3790
/**
 * Replace the byte at position p0 with the byte found at the same position
 * in an rshift_block() copy of the buffer.
 * NOTE(review): presumably rshift_block() moves every byte one position
 * towards the end, which makes this "replace with the preceding
 * character" - confirm against rshift_block().
 *
 * p0      byte position to overwrite
 * p1      unused
 * buf0    first four 32 bit words of the candidate, modified in place
 * buf1    last four 32 bit words of the candidate, modified in place
 * in_len  current candidate length in bytes
 *
 * Returns in_len; this rule never changes the length. Nothing is modified
 * when p0 is 0 or when p0 >= in_len.
 */
inline u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if ((p0 == 0) || (p0 >= in_len)) return (in_len);

  u32 shifted0[4];
  u32 shifted1[4];

  rshift_block (buf0, buf1, shifted0, shifted1);

  // take_mask covers the byte (p0 & 3) inside the 32 bit word that holds
  // position p0, keep_mask covers the three remaining bytes of that word

  const u32 take_mask = 0xffu << ((p0 & 3) * 8);
  const u32 keep_mask = ~take_mask;

  // explicit switch instead of a computed index: every array access uses
  // a constant subscript; positions outside 0..31 match no case

  switch (p0 / 4)
  {
    case 0: buf0[0] = (shifted0[0] & take_mask) | (buf0[0] & keep_mask); break;
    case 1: buf0[1] = (shifted0[1] & take_mask) | (buf0[1] & keep_mask); break;
    case 2: buf0[2] = (shifted0[2] & take_mask) | (buf0[2] & keep_mask); break;
    case 3: buf0[3] = (shifted0[3] & take_mask) | (buf0[3] & keep_mask); break;
    case 4: buf1[0] = (shifted1[0] & take_mask) | (buf1[0] & keep_mask); break;
    case 5: buf1[1] = (shifted1[1] & take_mask) | (buf1[1] & keep_mask); break;
    case 6: buf1[2] = (shifted1[2] & take_mask) | (buf1[2] & keep_mask); break;
    case 7: buf1[3] = (shifted1[3] & take_mask) | (buf1[3] & keep_mask); break;
  }

  return (in_len);
}
3819
/**
 * Duplicate the first p0 bytes of the candidate at its front.
 *
 * p0      number of leading bytes to duplicate
 * p1      unused
 * buf0    first four 32 bit words of the candidate, modified in place
 * buf1    last four 32 bit words of the candidate, modified in place
 * in_len  current candidate length in bytes
 *
 * Returns in_len + p0 on success. The candidate is left untouched and
 * in_len is returned when p0 > in_len or when (in_len + p0) >= 32.
 */
inline u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (p0 > in_len) return (in_len);

  if ((in_len + p0) >= 32) return (in_len);

  // snapshot of the candidate, cut down to its first p0 bytes

  u32 head0[4];
  u32 head1[4];

  head0[0] = buf0[0]; head1[0] = buf1[0];
  head0[1] = buf0[1]; head1[1] = buf1[1];
  head0[2] = buf0[2]; head1[2] = buf1[2];
  head0[3] = buf0[3]; head1[3] = buf1[3];

  truncate_right (head0, head1, p0);

  // make room at the front
  // NOTE(review): assumes rshift_block_N() moves the content p0 bytes
  // towards the end, leaves zero bytes behind and supports in-place use -
  // confirm against rshift_block_N()

  rshift_block_N (buf0, buf1, buf0, buf1, p0);

  // drop the saved head into the freed positions

  buf0[0] |= head0[0];
  buf0[1] |= head0[1];
  buf0[2] |= head0[2];
  buf0[3] |= head0[3];
  buf1[0] |= head1[0];
  buf1[1] |= head1[1];
  buf1[2] |= head1[2];
  buf1[3] |= head1[3];

  return (in_len + p0);
}
3857
/**
 * Duplicate the last p0 bytes of the candidate at its end.
 *
 * p0      number of trailing bytes to duplicate
 * p1      unused
 * buf0    first four 32 bit words of the candidate, modified in place
 * buf1    last four 32 bit words of the candidate, modified in place
 * in_len  current candidate length in bytes
 *
 * Returns in_len + p0 on success. The candidate is left untouched and
 * in_len is returned when p0 > in_len or when (in_len + p0) >= 32.
 */
inline u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  if (p0 > in_len) return (in_len);

  if ((in_len + p0) >= 32) return (in_len);

  // shifted copy of the candidate
  // NOTE(review): assumes rshift_block_N() moves the content p0 bytes
  // towards the end, so the old tail ends up right behind position
  // in_len - confirm against rshift_block_N()

  u32 tail0[4];
  u32 tail1[4];

  rshift_block_N (buf0, buf1, tail0, tail1, p0);

  // clear everything in front of position in_len so that only the part
  // which extends past the old end of the candidate is left

  truncate_left (tail0, tail1, in_len);

  buf0[0] |= tail0[0];
  buf0[1] |= tail0[1];
  buf0[2] |= tail0[2];
  buf0[3] |= tail0[3];
  buf1[0] |= tail1[0];
  buf1[1] |= tail1[1];
  buf1[2] |= tail1[2];
  buf1[3] |= tail1[3];

  return (in_len + p0);
}
3886
/**
 * Title-case the candidate: lowercase every ASCII letter, then uppercase
 * the first letter and every letter that directly follows a space.
 *
 * generate_cmask() yields 0x20 in each byte that holds an ASCII letter,
 * so OR-ing the mask lowercases and AND-ing with its complement
 * uppercases.
 *
 * p0      unused
 * p1      unused
 * buf0    first four 32 bit words of the candidate, modified in place
 * buf1    last four 32 bit words of the candidate, modified in place
 * in_len  current candidate length in bytes
 *
 * Returns in_len; this rule never changes the length. The uppercase step
 * is only compiled in when IS_NV, IS_AMD or IS_GENERIC is defined.
 */
inline u32 rule_op_mangle_title (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  // step 1: lowercase all letters

  buf0[0] |= generate_cmask (buf0[0]);
  buf0[1] |= generate_cmask (buf0[1]);
  buf0[2] |= generate_cmask (buf0[2]);
  buf0[3] |= generate_cmask (buf0[3]);
  buf1[0] |= generate_cmask (buf1[0]);
  buf1[1] |= generate_cmask (buf1[1]);
  buf1[2] |= generate_cmask (buf1[2]);
  buf1[3] |= generate_cmask (buf1[3]);

  #ifdef IS_NV

  // step 2 (NV): uppercase the very first character ...

  buf0[0] &= ~(0x00000020 & generate_cmask (buf0[0]));

  // ... then walk the candidate: if the byte at position pos is a space,
  // uppercase the byte at position pos + 1 (which lives in the next word
  // whenever pos is the last byte of a word). Position 31 has no
  // successor and therefore no case.
  // NOTE(review): the selectors 0x6540..0x6543 are assumed to make
  // __byte_perm_S() return byte 0..3 of its first argument, zero
  // extended - confirm against __byte_perm_S()

  for (u32 pos = 0; pos < in_len; pos++)
  {
    switch (pos)
    {
      case  0: if (__byte_perm_S (buf0[0], 0, 0x6540) == ' ') buf0[0] &= ~(0x00002000 & generate_cmask (buf0[0])); break;
      case  1: if (__byte_perm_S (buf0[0], 0, 0x6541) == ' ') buf0[0] &= ~(0x00200000 & generate_cmask (buf0[0])); break;
      case  2: if (__byte_perm_S (buf0[0], 0, 0x6542) == ' ') buf0[0] &= ~(0x20000000 & generate_cmask (buf0[0])); break;
      case  3: if (__byte_perm_S (buf0[0], 0, 0x6543) == ' ') buf0[1] &= ~(0x00000020 & generate_cmask (buf0[1])); break;
      case  4: if (__byte_perm_S (buf0[1], 0, 0x6540) == ' ') buf0[1] &= ~(0x00002000 & generate_cmask (buf0[1])); break;
      case  5: if (__byte_perm_S (buf0[1], 0, 0x6541) == ' ') buf0[1] &= ~(0x00200000 & generate_cmask (buf0[1])); break;
      case  6: if (__byte_perm_S (buf0[1], 0, 0x6542) == ' ') buf0[1] &= ~(0x20000000 & generate_cmask (buf0[1])); break;
      case  7: if (__byte_perm_S (buf0[1], 0, 0x6543) == ' ') buf0[2] &= ~(0x00000020 & generate_cmask (buf0[2])); break;
      case  8: if (__byte_perm_S (buf0[2], 0, 0x6540) == ' ') buf0[2] &= ~(0x00002000 & generate_cmask (buf0[2])); break;
      case  9: if (__byte_perm_S (buf0[2], 0, 0x6541) == ' ') buf0[2] &= ~(0x00200000 & generate_cmask (buf0[2])); break;
      case 10: if (__byte_perm_S (buf0[2], 0, 0x6542) == ' ') buf0[2] &= ~(0x20000000 & generate_cmask (buf0[2])); break;
      case 11: if (__byte_perm_S (buf0[2], 0, 0x6543) == ' ') buf0[3] &= ~(0x00000020 & generate_cmask (buf0[3])); break;
      case 12: if (__byte_perm_S (buf0[3], 0, 0x6540) == ' ') buf0[3] &= ~(0x00002000 & generate_cmask (buf0[3])); break;
      case 13: if (__byte_perm_S (buf0[3], 0, 0x6541) == ' ') buf0[3] &= ~(0x00200000 & generate_cmask (buf0[3])); break;
      case 14: if (__byte_perm_S (buf0[3], 0, 0x6542) == ' ') buf0[3] &= ~(0x20000000 & generate_cmask (buf0[3])); break;
      case 15: if (__byte_perm_S (buf0[3], 0, 0x6543) == ' ') buf1[0] &= ~(0x00000020 & generate_cmask (buf1[0])); break;
      case 16: if (__byte_perm_S (buf1[0], 0, 0x6540) == ' ') buf1[0] &= ~(0x00002000 & generate_cmask (buf1[0])); break;
      case 17: if (__byte_perm_S (buf1[0], 0, 0x6541) == ' ') buf1[0] &= ~(0x00200000 & generate_cmask (buf1[0])); break;
      case 18: if (__byte_perm_S (buf1[0], 0, 0x6542) == ' ') buf1[0] &= ~(0x20000000 & generate_cmask (buf1[0])); break;
      case 19: if (__byte_perm_S (buf1[0], 0, 0x6543) == ' ') buf1[1] &= ~(0x00000020 & generate_cmask (buf1[1])); break;
      case 20: if (__byte_perm_S (buf1[1], 0, 0x6540) == ' ') buf1[1] &= ~(0x00002000 & generate_cmask (buf1[1])); break;
      case 21: if (__byte_perm_S (buf1[1], 0, 0x6541) == ' ') buf1[1] &= ~(0x00200000 & generate_cmask (buf1[1])); break;
      case 22: if (__byte_perm_S (buf1[1], 0, 0x6542) == ' ') buf1[1] &= ~(0x20000000 & generate_cmask (buf1[1])); break;
      case 23: if (__byte_perm_S (buf1[1], 0, 0x6543) == ' ') buf1[2] &= ~(0x00000020 & generate_cmask (buf1[2])); break;
      case 24: if (__byte_perm_S (buf1[2], 0, 0x6540) == ' ') buf1[2] &= ~(0x00002000 & generate_cmask (buf1[2])); break;
      case 25: if (__byte_perm_S (buf1[2], 0, 0x6541) == ' ') buf1[2] &= ~(0x00200000 & generate_cmask (buf1[2])); break;
      case 26: if (__byte_perm_S (buf1[2], 0, 0x6542) == ' ') buf1[2] &= ~(0x20000000 & generate_cmask (buf1[2])); break;
      case 27: if (__byte_perm_S (buf1[2], 0, 0x6543) == ' ') buf1[3] &= ~(0x00000020 & generate_cmask (buf1[3])); break;
      case 28: if (__byte_perm_S (buf1[3], 0, 0x6540) == ' ') buf1[3] &= ~(0x00002000 & generate_cmask (buf1[3])); break;
      case 29: if (__byte_perm_S (buf1[3], 0, 0x6541) == ' ') buf1[3] &= ~(0x00200000 & generate_cmask (buf1[3])); break;
      case 30: if (__byte_perm_S (buf1[3], 0, 0x6542) == ' ') buf1[3] &= ~(0x20000000 & generate_cmask (buf1[3])); break;
    }
  }

  #endif

  #if defined IS_AMD || defined IS_GENERIC

  // step 2 (AMD/generic): build a byte mask that is 0xff wherever the
  // candidate holds a space and 0x00 everywhere else

  const uchar4 space_x4 = (uchar4) (' ');
  const uchar4 clear_x4 = (uchar4) (0x00);
  const uchar4 match_x4 = (uchar4) (0xff);

  u32 upper0[4];
  u32 upper1[4];

  upper0[0] = as_uint (select (clear_x4, match_x4, as_uchar4 (buf0[0]) == space_x4));
  upper0[1] = as_uint (select (clear_x4, match_x4, as_uchar4 (buf0[1]) == space_x4));
  upper0[2] = as_uint (select (clear_x4, match_x4, as_uchar4 (buf0[2]) == space_x4));
  upper0[3] = as_uint (select (clear_x4, match_x4, as_uchar4 (buf0[3]) == space_x4));
  upper1[0] = as_uint (select (clear_x4, match_x4, as_uchar4 (buf1[0]) == space_x4));
  upper1[1] = as_uint (select (clear_x4, match_x4, as_uchar4 (buf1[1]) == space_x4));
  upper1[2] = as_uint (select (clear_x4, match_x4, as_uchar4 (buf1[2]) == space_x4));
  upper1[3] = as_uint (select (clear_x4, match_x4, as_uchar4 (buf1[3]) == space_x4));

  // move each mark onto the byte that follows the space
  // NOTE(review): assumes rshift_block() moves the content one byte
  // towards the end - confirm against rshift_block()

  rshift_block (upper0, upper1, upper0, upper1);

  // the very first byte is always a candidate for uppercasing

  upper0[0] |= 0xff;

  // uppercase every marked byte that holds a letter

  buf0[0] &= ~(generate_cmask (buf0[0]) & upper0[0]);
  buf0[1] &= ~(generate_cmask (buf0[1]) & upper0[1]);
  buf0[2] &= ~(generate_cmask (buf0[2]) & upper0[2]);
  buf0[3] &= ~(generate_cmask (buf0[3]) & upper0[3]);
  buf1[0] &= ~(generate_cmask (buf1[0]) & upper1[0]);
  buf1[1] &= ~(generate_cmask (buf1[1]) & upper1[1]);
  buf1[2] &= ~(generate_cmask (buf1[2]) & upper1[2]);
  buf1[3] &= ~(generate_cmask (buf1[3]) & upper1[3]);

  #endif

  return (in_len);
}
4040
/**
 * Dispatch a single rule operation to its rule_op_mangle_* handler.
 *
 * name    rule opcode, one of the RULE_OP_MANGLE_* constants
 * p0      first rule parameter, forwarded unchanged
 * p1      second rule parameter, forwarded unchanged
 * buf0    first four 32 bit words of the candidate, modified in place
 * buf1    last four 32 bit words of the candidate, modified in place
 * in_len  current candidate length in bytes
 *
 * Returns the length reported by the handler. An opcode without a case
 * below leaves the candidate untouched and returns in_len.
 */
inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
{
  // RULE_OP_MANGLE_PURGECHAR and RULE_OP_MANGLE_TOGGLECASE_REC have no
  // case here on purpose; they fall through to the "unchanged" return

  switch (name)
  {
    case RULE_OP_MANGLE_LREST:            return rule_op_mangle_lrest            (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_UREST:            return rule_op_mangle_urest            (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_LREST_UFIRST:     return rule_op_mangle_lrest_ufirst     (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_UREST_LFIRST:     return rule_op_mangle_urest_lfirst     (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_TREST:            return rule_op_mangle_trest            (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_TOGGLE_AT:        return rule_op_mangle_toggle_at        (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_REVERSE:          return rule_op_mangle_reverse          (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DUPEWORD:         return rule_op_mangle_dupeword         (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DUPEWORD_TIMES:   return rule_op_mangle_dupeword_times   (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_REFLECT:          return rule_op_mangle_reflect          (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_APPEND:           return rule_op_mangle_append           (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_PREPEND:          return rule_op_mangle_prepend          (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_ROTATE_LEFT:      return rule_op_mangle_rotate_left      (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_ROTATE_RIGHT:     return rule_op_mangle_rotate_right     (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DELETE_FIRST:     return rule_op_mangle_delete_first     (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DELETE_LAST:      return rule_op_mangle_delete_last      (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DELETE_AT:        return rule_op_mangle_delete_at        (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_EXTRACT:          return rule_op_mangle_extract          (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_OMIT:             return rule_op_mangle_omit             (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_INSERT:           return rule_op_mangle_insert           (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_OVERSTRIKE:       return rule_op_mangle_overstrike       (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_TRUNCATE_AT:      return rule_op_mangle_truncate_at      (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_REPLACE:          return rule_op_mangle_replace          (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DUPECHAR_FIRST:   return rule_op_mangle_dupechar_first   (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DUPECHAR_LAST:    return rule_op_mangle_dupechar_last    (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DUPECHAR_ALL:     return rule_op_mangle_dupechar_all     (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_SWITCH_FIRST:     return rule_op_mangle_switch_first     (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_SWITCH_LAST:      return rule_op_mangle_switch_last      (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_SWITCH_AT:        return rule_op_mangle_switch_at        (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_CHR_SHIFTL:       return rule_op_mangle_chr_shiftl       (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_CHR_SHIFTR:       return rule_op_mangle_chr_shiftr       (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_CHR_INCR:         return rule_op_mangle_chr_incr         (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_CHR_DECR:         return rule_op_mangle_chr_decr         (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_REPLACE_NP1:      return rule_op_mangle_replace_np1      (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_REPLACE_NM1:      return rule_op_mangle_replace_nm1      (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DUPEBLOCK_FIRST:  return rule_op_mangle_dupeblock_first  (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_DUPEBLOCK_LAST:   return rule_op_mangle_dupeblock_last   (p0, p1, buf0, buf1, in_len);
    case RULE_OP_MANGLE_TITLE:            return rule_op_mangle_title            (p0, p1, buf0, buf1, in_len);
  }

  return (in_len);
}
4091
/**
 * Run a zero-terminated list of packed rule commands over one candidate.
 *
 * Each command word is decoded as: bits 0..7 opcode, bits 8..15 first
 * parameter, bits 16..23 second parameter. Processing stops at the first
 * command word that equals 0.
 *
 * cmds  command list in global memory; must contain a 0 terminator
 * buf0  first four 32 bit words of the candidate, modified in place
 * buf1  last four 32 bit words of the candidate, modified in place
 * len   initial candidate length in bytes
 *
 * Returns the candidate length after the last command was applied.
 */
inline u32 apply_rules (const __global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len)
{
  u32 cur_len = len;

  for (u32 pos = 0; ; pos++)
  {
    const u32 cmd = cmds[pos];

    if (cmd == 0) break;

    const u32 opcode = (cmd >>  0) & 0xff;
    const u32 arg0   = (cmd >>  8) & 0xff;
    const u32 arg1   = (cmd >> 16) & 0xff;

    // the length returned by one rule is the input length of the next

    cur_len = apply_rule (opcode, arg0, arg1, buf0, buf1, cur_len);
  }

  return (cur_len);
}
4109
/**
 * Apply VECT_SIZE consecutive rule sets to the same base word and pack
 * the results into vector words, one rule set per vector component.
 *
 * pw_buf0    first four 32 bit words of the base word (not modified)
 * pw_buf1    last four 32 bit words of the base word (not modified)
 * pw_len     length of the base word in bytes
 * rules_buf  rule sets in global memory
 * il_pos     index of the rule set used for component 0; component i
 *            uses rules_buf[il_pos + i]
 * w0         output, first four words of the mangled candidates
 * w1         output, last four words of the mangled candidates
 *
 * Returns the resulting length of each candidate, one per component.
 */
inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x w0[4], u32x w1[4])
{
  #if VECT_SIZE == 1

  // single component: mangle directly inside the output buffers

  w0[0] = pw_buf0[0]; w1[0] = pw_buf1[0];
  w0[1] = pw_buf0[1]; w1[1] = pw_buf1[1];
  w0[2] = pw_buf0[2]; w1[2] = pw_buf1[2];
  w0[3] = pw_buf0[3]; w1[3] = pw_buf1[3];

  return apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len);

  #else

  // stores the scalar result of lane number idx into vector component
  // .s<comp> of every output word and of the length vector

  #define APPLY_RULES_VECT_LANE(idx,comp) \
    case idx:                             \
      w0[0].s##comp   = lane0[0];         \
      w0[1].s##comp   = lane0[1];         \
      w0[2].s##comp   = lane0[2];         \
      w0[3].s##comp   = lane0[3];         \
      w1[0].s##comp   = lane1[0];         \
      w1[1].s##comp   = lane1[1];         \
      w1[2].s##comp   = lane1[2];         \
      w1[3].s##comp   = lane1[3];         \
      out_len.s##comp = lane_len;         \
      break;

  u32x out_len = 0;

  #ifdef _unroll
  #pragma unroll
  #endif
  for (int lane = 0; lane < VECT_SIZE; lane++)
  {
    // every lane starts from a fresh scalar copy of the base word

    u32 lane0[4];
    u32 lane1[4];

    lane0[0] = pw_buf0[0]; lane1[0] = pw_buf1[0];
    lane0[1] = pw_buf0[1]; lane1[1] = pw_buf1[1];
    lane0[2] = pw_buf0[2]; lane1[2] = pw_buf1[2];
    lane0[3] = pw_buf0[3]; lane1[3] = pw_buf1[3];

    const u32 lane_len = apply_rules (rules_buf[il_pos + lane].cmds, lane0, lane1, pw_len);

    // vector components can only be named statically, hence the switch

    switch (lane)
    {
      #if VECT_SIZE >= 2
      APPLY_RULES_VECT_LANE ( 0, 0)
      APPLY_RULES_VECT_LANE ( 1, 1)
      #endif

      #if VECT_SIZE >= 4
      APPLY_RULES_VECT_LANE ( 2, 2)
      APPLY_RULES_VECT_LANE ( 3, 3)
      #endif

      #if VECT_SIZE >= 8
      APPLY_RULES_VECT_LANE ( 4, 4)
      APPLY_RULES_VECT_LANE ( 5, 5)
      APPLY_RULES_VECT_LANE ( 6, 6)
      APPLY_RULES_VECT_LANE ( 7, 7)
      #endif

      #if VECT_SIZE >= 16
      APPLY_RULES_VECT_LANE ( 8, 8)
      APPLY_RULES_VECT_LANE ( 9, 9)
      APPLY_RULES_VECT_LANE (10, a)
      APPLY_RULES_VECT_LANE (11, b)
      APPLY_RULES_VECT_LANE (12, c)
      APPLY_RULES_VECT_LANE (13, d)
      APPLY_RULES_VECT_LANE (14, e)
      APPLY_RULES_VECT_LANE (15, f)
      #endif
    }
  }

  #undef APPLY_RULES_VECT_LANE

  return out_len;

  #endif
}