Prepare for VeraCrypt integration
[hashcat.git] / OpenCL / rp.c
1 /**
2 * Authors.....: Jens Steube <jens.steube@gmail.com>
3 * magnum <john.magnum@hushmail.com>
4 *
5 * License.....: MIT
6 */
7
/* Forward declarations of three functions whose definitions are not part of this excerpt. */
/* NOTE(review): presumably defined further down in this file - confirm. */
8 inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len);
9 inline u32 apply_rules (const __global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len);
10 inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x w0[4], u32x w1[4]);
11
12 inline u32 generate_cmask (u32 buf)
13 {
14 const u32 rmask = ((buf & 0x40404040) >> 1)
15 & ~((buf & 0x80808080) >> 2);
16
17 const u32 hmask = (buf & 0x1f1f1f1f) + 0x05050505;
18 const u32 lmask = (buf & 0x1f1f1f1f) + 0x1f1f1f1f;
19
20 return rmask & ~hmask & lmask;
21 }
22
23 inline void truncate_right (u32 w0[4], u32 w1[4], const u32 len)
24 {
25 const u32 tmp = (1 << ((len % 4) * 8)) - 1;
26
27 switch (len / 4)
28 {
29 case 0: w0[0] &= tmp;
30 w0[1] = 0;
31 w0[2] = 0;
32 w0[3] = 0;
33 w1[0] = 0;
34 w1[1] = 0;
35 w1[2] = 0;
36 w1[3] = 0;
37 break;
38 case 1: w0[1] &= tmp;
39 w0[2] = 0;
40 w0[3] = 0;
41 w1[0] = 0;
42 w1[1] = 0;
43 w1[2] = 0;
44 w1[3] = 0;
45 break;
46 case 2: w0[2] &= tmp;
47 w0[3] = 0;
48 w1[0] = 0;
49 w1[1] = 0;
50 w1[2] = 0;
51 w1[3] = 0;
52 break;
53 case 3: w0[3] &= tmp;
54 w1[0] = 0;
55 w1[1] = 0;
56 w1[2] = 0;
57 w1[3] = 0;
58 break;
59 case 4: w1[0] &= tmp;
60 w1[1] = 0;
61 w1[2] = 0;
62 w1[3] = 0;
63 break;
64 case 5: w1[1] &= tmp;
65 w1[2] = 0;
66 w1[3] = 0;
67 break;
68 case 6: w1[2] &= tmp;
69 w1[3] = 0;
70 break;
71 case 7: w1[3] &= tmp;
72 break;
73 }
74 }
75
76 inline void truncate_left (u32 w0[4], u32 w1[4], const u32 len)
77 {
78 const u32 tmp = ~((1 << ((len % 4) * 8)) - 1);
79
80 switch (len / 4)
81 {
82 case 0: w0[0] &= tmp;
83 break;
84 case 1: w0[0] = 0;
85 w0[1] &= tmp;
86 break;
87 case 2: w0[0] = 0;
88 w0[1] = 0;
89 w0[2] &= tmp;
90 break;
91 case 3: w0[0] = 0;
92 w0[1] = 0;
93 w0[2] = 0;
94 w0[3] &= tmp;
95 break;
96 case 4: w0[0] = 0;
97 w0[1] = 0;
98 w0[2] = 0;
99 w0[3] = 0;
100 w1[0] &= tmp;
101 break;
102 case 5: w0[0] = 0;
103 w0[1] = 0;
104 w0[2] = 0;
105 w0[3] = 0;
106 w1[0] = 0;
107 w1[1] &= tmp;
108 break;
109 case 6: w0[0] = 0;
110 w0[1] = 0;
111 w0[2] = 0;
112 w0[3] = 0;
113 w1[0] = 0;
114 w1[1] = 0;
115 w1[2] &= tmp;
116 break;
117 case 7: w0[0] = 0;
118 w0[1] = 0;
119 w0[2] = 0;
120 w0[3] = 0;
121 w1[0] = 0;
122 w1[1] = 0;
123 w1[2] = 0;
124 w1[3] &= tmp;
125 break;
126 }
127 }
128
129 inline void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4])
130 {
131 #ifdef IS_NV
132 out0[0] = __byte_perm_S (in0[0], in0[1], 0x4321);
133 out0[1] = __byte_perm_S (in0[1], in0[2], 0x4321);
134 out0[2] = __byte_perm_S (in0[2], in0[3], 0x4321);
135 out0[3] = __byte_perm_S (in0[3], in1[0], 0x4321);
136 out1[0] = __byte_perm_S (in1[0], in1[1], 0x4321);
137 out1[1] = __byte_perm_S (in1[1], in1[2], 0x4321);
138 out1[2] = __byte_perm_S (in1[2], in1[3], 0x4321);
139 out1[3] = __byte_perm_S (in1[3], 0, 0x4321);
140 #endif
141
142 #if defined IS_AMD || defined IS_GENERIC
143 out0[0] = amd_bytealign_S (in0[1], in0[0], 1);
144 out0[1] = amd_bytealign_S (in0[2], in0[1], 1);
145 out0[2] = amd_bytealign_S (in0[3], in0[2], 1);
146 out0[3] = amd_bytealign_S (in1[0], in0[3], 1);
147 out1[0] = amd_bytealign_S (in1[1], in1[0], 1);
148 out1[1] = amd_bytealign_S (in1[2], in1[1], 1);
149 out1[2] = amd_bytealign_S (in1[3], in1[2], 1);
150 out1[3] = amd_bytealign_S ( 0, in1[3], 1);
151 #endif
152 }
153
154 inline void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4])
155 {
156 #ifdef IS_NV
157 out1[3] = __byte_perm_S (in1[2], in1[3], 0x6543);
158 out1[2] = __byte_perm_S (in1[1], in1[2], 0x6543);
159 out1[1] = __byte_perm_S (in1[0], in1[1], 0x6543);
160 out1[0] = __byte_perm_S (in0[3], in1[0], 0x6543);
161 out0[3] = __byte_perm_S (in0[2], in0[3], 0x6543);
162 out0[2] = __byte_perm_S (in0[1], in0[2], 0x6543);
163 out0[1] = __byte_perm_S (in0[0], in0[1], 0x6543);
164 out0[0] = __byte_perm_S ( 0, in0[0], 0x6543);
165 #endif
166
167 #if defined IS_AMD || defined IS_GENERIC
168 out1[3] = amd_bytealign_S (in1[3], in1[2], 3);
169 out1[2] = amd_bytealign_S (in1[2], in1[1], 3);
170 out1[1] = amd_bytealign_S (in1[1], in1[0], 3);
171 out1[0] = amd_bytealign_S (in1[0], in0[3], 3);
172 out0[3] = amd_bytealign_S (in0[3], in0[2], 3);
173 out0[2] = amd_bytealign_S (in0[2], in0[1], 3);
174 out0[1] = amd_bytealign_S (in0[1], in0[0], 3);
175 out0[0] = amd_bytealign_S (in0[0], 0, 3);
176 #endif
177 }
178
179 inline void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num)
180 {
181 #ifdef IS_NV
182 switch (num)
183 {
184 case 0: out0[0] = in0[0];
185 out0[1] = in0[1];
186 out0[2] = in0[2];
187 out0[3] = in0[3];
188 out1[0] = in1[0];
189 out1[1] = in1[1];
190 out1[2] = in1[2];
191 out1[3] = in1[3];
192 break;
193 case 1: out0[0] = __byte_perm_S (in0[0], in0[1], 0x4321);
194 out0[1] = __byte_perm_S (in0[1], in0[2], 0x4321);
195 out0[2] = __byte_perm_S (in0[2], in0[3], 0x4321);
196 out0[3] = __byte_perm_S (in0[3], in1[0], 0x4321);
197 out1[0] = __byte_perm_S (in1[0], in1[1], 0x4321);
198 out1[1] = __byte_perm_S (in1[1], in1[2], 0x4321);
199 out1[2] = __byte_perm_S (in1[2], in1[3], 0x4321);
200 out1[3] = __byte_perm_S (in1[3], 0, 0x4321);
201 break;
202 case 2: out0[0] = __byte_perm_S (in0[0], in0[1], 0x5432);
203 out0[1] = __byte_perm_S (in0[1], in0[2], 0x5432);
204 out0[2] = __byte_perm_S (in0[2], in0[3], 0x5432);
205 out0[3] = __byte_perm_S (in0[3], in1[0], 0x5432);
206 out1[0] = __byte_perm_S (in1[0], in1[1], 0x5432);
207 out1[1] = __byte_perm_S (in1[1], in1[2], 0x5432);
208 out1[2] = __byte_perm_S (in1[2], in1[3], 0x5432);
209 out1[3] = __byte_perm_S (in1[3], 0, 0x5432);
210 break;
211 case 3: out0[0] = __byte_perm_S (in0[0], in0[1], 0x6543);
212 out0[1] = __byte_perm_S (in0[1], in0[2], 0x6543);
213 out0[2] = __byte_perm_S (in0[2], in0[3], 0x6543);
214 out0[3] = __byte_perm_S (in0[3], in1[0], 0x6543);
215 out1[0] = __byte_perm_S (in1[0], in1[1], 0x6543);
216 out1[1] = __byte_perm_S (in1[1], in1[2], 0x6543);
217 out1[2] = __byte_perm_S (in1[2], in1[3], 0x6543);
218 out1[3] = __byte_perm_S (in1[3], 0, 0x6543);
219 break;
220 case 4: out0[0] = in0[1];
221 out0[1] = in0[2];
222 out0[2] = in0[3];
223 out0[3] = in1[0];
224 out1[0] = in1[1];
225 out1[1] = in1[2];
226 out1[2] = in1[3];
227 out1[3] = 0;
228 break;
229 case 5: out0[0] = __byte_perm_S (in0[1], in0[2], 0x4321);
230 out0[1] = __byte_perm_S (in0[2], in0[3], 0x4321);
231 out0[2] = __byte_perm_S (in0[3], in1[0], 0x4321);
232 out0[3] = __byte_perm_S (in1[0], in1[1], 0x4321);
233 out1[0] = __byte_perm_S (in1[1], in1[2], 0x4321);
234 out1[1] = __byte_perm_S (in1[2], in1[3], 0x4321);
235 out1[2] = __byte_perm_S (in1[3], 0, 0x4321);
236 out1[3] = 0;
237 break;
238 case 6: out0[0] = __byte_perm_S (in0[1], in0[2], 0x5432);
239 out0[1] = __byte_perm_S (in0[2], in0[3], 0x5432);
240 out0[2] = __byte_perm_S (in0[3], in1[0], 0x5432);
241 out0[3] = __byte_perm_S (in1[0], in1[1], 0x5432);
242 out1[0] = __byte_perm_S (in1[1], in1[2], 0x5432);
243 out1[1] = __byte_perm_S (in1[2], in1[3], 0x5432);
244 out1[2] = __byte_perm_S (in1[3], 0, 0x5432);
245 out1[3] = 0;
246 break;
247 case 7: out0[0] = __byte_perm_S (in0[1], in0[2], 0x6543);
248 out0[1] = __byte_perm_S (in0[2], in0[3], 0x6543);
249 out0[2] = __byte_perm_S (in0[3], in1[0], 0x6543);
250 out0[3] = __byte_perm_S (in1[0], in1[1], 0x6543);
251 out1[0] = __byte_perm_S (in1[1], in1[2], 0x6543);
252 out1[1] = __byte_perm_S (in1[2], in1[3], 0x6543);
253 out1[2] = __byte_perm_S (in1[3], 0, 0x6543);
254 out1[3] = 0;
255 break;
256 case 8: out0[0] = in0[2];
257 out0[1] = in0[3];
258 out0[2] = in1[0];
259 out0[3] = in1[1];
260 out1[0] = in1[2];
261 out1[1] = in1[3];
262 out1[2] = 0;
263 out1[3] = 0;
264 break;
265 case 9: out0[0] = __byte_perm_S (in0[2], in0[3], 0x4321);
266 out0[1] = __byte_perm_S (in0[3], in1[0], 0x4321);
267 out0[2] = __byte_perm_S (in1[0], in1[1], 0x4321);
268 out0[3] = __byte_perm_S (in1[1], in1[2], 0x4321);
269 out1[0] = __byte_perm_S (in1[2], in1[3], 0x4321);
270 out1[1] = __byte_perm_S (in1[3], 0, 0x4321);
271 out1[2] = 0;
272 out1[3] = 0;
273 break;
274 case 10: out0[0] = __byte_perm_S (in0[2], in0[3], 0x5432);
275 out0[1] = __byte_perm_S (in0[3], in1[0], 0x5432);
276 out0[2] = __byte_perm_S (in1[0], in1[1], 0x5432);
277 out0[3] = __byte_perm_S (in1[1], in1[2], 0x5432);
278 out1[0] = __byte_perm_S (in1[2], in1[3], 0x5432);
279 out1[1] = __byte_perm_S (in1[3], 0, 0x5432);
280 out1[2] = 0;
281 out1[3] = 0;
282 break;
283 case 11: out0[0] = __byte_perm_S (in0[2], in0[3], 0x6543);
284 out0[1] = __byte_perm_S (in0[3], in1[0], 0x6543);
285 out0[2] = __byte_perm_S (in1[0], in1[1], 0x6543);
286 out0[3] = __byte_perm_S (in1[1], in1[2], 0x6543);
287 out1[0] = __byte_perm_S (in1[2], in1[3], 0x6543);
288 out1[1] = __byte_perm_S (in1[3], 0, 0x6543);
289 out1[2] = 0;
290 out1[3] = 0;
291 break;
292 case 12: out0[0] = in0[3];
293 out0[1] = in1[0];
294 out0[2] = in1[1];
295 out0[3] = in1[2];
296 out1[0] = in1[3];
297 out1[1] = 0;
298 out1[2] = 0;
299 out1[3] = 0;
300 break;
301 case 13:
302 out0[0] = __byte_perm_S (in0[3], in1[0], 0x4321);
303 out0[1] = __byte_perm_S (in1[0], in1[1], 0x4321);
304 out0[2] = __byte_perm_S (in1[1], in1[2], 0x4321);
305 out0[3] = __byte_perm_S (in1[2], in1[3], 0x4321);
306 out1[0] = __byte_perm_S (in1[3], 0, 0x4321);
307 out1[1] = 0;
308 out1[2] = 0;
309 out1[3] = 0;
310 break;
311 case 14: out0[0] = __byte_perm_S (in0[3], in1[0], 0x5432);
312 out0[1] = __byte_perm_S (in1[0], in1[1], 0x5432);
313 out0[2] = __byte_perm_S (in1[1], in1[2], 0x5432);
314 out0[3] = __byte_perm_S (in1[2], in1[3], 0x5432);
315 out1[0] = __byte_perm_S (in1[3], 0, 0x5432);
316 out1[1] = 0;
317 out1[2] = 0;
318 out1[3] = 0;
319 break;
320 case 15: out0[0] = __byte_perm_S (in0[3], in1[0], 0x6543);
321 out0[1] = __byte_perm_S (in1[0], in1[1], 0x6543);
322 out0[2] = __byte_perm_S (in1[1], in1[2], 0x6543);
323 out0[3] = __byte_perm_S (in1[2], in1[3], 0x6543);
324 out1[0] = __byte_perm_S (in1[3], 0, 0x6543);
325 out1[1] = 0;
326 out1[2] = 0;
327 out1[3] = 0;
328 break;
329 case 16: out0[0] = in1[0];
330 out0[1] = in1[1];
331 out0[2] = in1[2];
332 out0[3] = in1[3];
333 out1[0] = 0;
334 out1[1] = 0;
335 out1[2] = 0;
336 out1[3] = 0;
337 break;
338 case 17: out0[0] = __byte_perm_S (in1[0], in1[1], 0x4321);
339 out0[1] = __byte_perm_S (in1[1], in1[2], 0x4321);
340 out0[2] = __byte_perm_S (in1[2], in1[3], 0x4321);
341 out0[3] = __byte_perm_S (in1[3], 0, 0x4321);
342 out1[0] = 0;
343 out1[1] = 0;
344 out1[2] = 0;
345 out1[3] = 0;
346 break;
347 case 18: out0[0] = __byte_perm_S (in1[0], in1[1], 0x5432);
348 out0[1] = __byte_perm_S (in1[1], in1[2], 0x5432);
349 out0[2] = __byte_perm_S (in1[2], in1[3], 0x5432);
350 out0[3] = __byte_perm_S (in1[3], 0, 0x5432);
351 out1[0] = 0;
352 out1[1] = 0;
353 out1[2] = 0;
354 out1[3] = 0;
355 break;
356 case 19: out0[0] = __byte_perm_S (in1[0], in1[1], 0x6543);
357 out0[1] = __byte_perm_S (in1[1], in1[2], 0x6543);
358 out0[2] = __byte_perm_S (in1[2], in1[3], 0x6543);
359 out0[3] = __byte_perm_S (in1[3], 0, 0x6543);
360 out1[0] = 0;
361 out1[1] = 0;
362 out1[2] = 0;
363 out1[3] = 0;
364 break;
365 case 20: out0[0] = in1[1];
366 out0[1] = in1[2];
367 out0[2] = in1[3];
368 out0[3] = 0;
369 out1[0] = 0;
370 out1[1] = 0;
371 out1[2] = 0;
372 out1[3] = 0;
373 break;
374 case 21: out0[0] = __byte_perm_S (in1[1], in1[2], 0x4321);
375 out0[1] = __byte_perm_S (in1[2], in1[3], 0x4321);
376 out0[2] = __byte_perm_S (in1[3], 0, 0x4321);
377 out0[3] = 0;
378 out1[0] = 0;
379 out1[1] = 0;
380 out1[2] = 0;
381 out1[3] = 0;
382 break;
383 case 22: out0[0] = __byte_perm_S (in1[1], in1[2], 0x5432);
384 out0[1] = __byte_perm_S (in1[2], in1[3], 0x5432);
385 out0[2] = __byte_perm_S (in1[3], 0, 0x5432);
386 out0[3] = 0;
387 out1[0] = 0;
388 out1[1] = 0;
389 out1[2] = 0;
390 out1[3] = 0;
391 break;
392 case 23: out0[0] = __byte_perm_S (in1[1], in1[2], 0x6543);
393 out0[1] = __byte_perm_S (in1[2], in1[3], 0x6543);
394 out0[2] = __byte_perm_S (in1[3], 0, 0x6543);
395 out0[3] = 0;
396 out1[0] = 0;
397 out1[1] = 0;
398 out1[2] = 0;
399 out1[3] = 0;
400 break;
401 case 24: out0[0] = in1[2];
402 out0[1] = in1[3];
403 out0[2] = 0;
404 out0[3] = 0;
405 out1[0] = 0;
406 out1[1] = 0;
407 out1[2] = 0;
408 out1[3] = 0;
409 break;
410 case 25: out0[0] = __byte_perm_S (in1[2], in1[3], 0x4321);
411 out0[1] = __byte_perm_S (in1[3], 0, 0x4321);
412 out0[2] = 0;
413 out0[3] = 0;
414 out1[0] = 0;
415 out1[1] = 0;
416 out1[2] = 0;
417 out1[3] = 0;
418 break;
419 case 26: out0[0] = __byte_perm_S (in1[2], in1[3], 0x5432);
420 out0[1] = __byte_perm_S (in1[3], 0, 0x5432);
421 out0[2] = 0;
422 out0[3] = 0;
423 out1[0] = 0;
424 out1[1] = 0;
425 out1[2] = 0;
426 out1[3] = 0;
427 break;
428 case 27: out0[0] = __byte_perm_S (in1[2], in1[3], 0x6543);
429 out0[1] = __byte_perm_S (in1[3], 0, 0x6543);
430 out0[2] = 0;
431 out0[3] = 0;
432 out1[0] = 0;
433 out1[1] = 0;
434 out1[2] = 0;
435 out1[3] = 0;
436 break;
437 case 28: out0[0] = in1[3];
438 out0[1] = 0;
439 out0[2] = 0;
440 out0[3] = 0;
441 out1[0] = 0;
442 out1[1] = 0;
443 out1[2] = 0;
444 out1[3] = 0;
445 break;
446 case 29: out0[0] = __byte_perm_S (in1[3], 0, 0x4321);
447 out0[1] = 0;
448 out0[2] = 0;
449 out0[3] = 0;
450 out1[0] = 0;
451 out1[1] = 0;
452 out1[2] = 0;
453 out1[3] = 0;
454 break;
455 case 30: out0[0] = __byte_perm_S (in1[3], 0, 0x5432);
456 out0[1] = 0;
457 out0[2] = 0;
458 out0[3] = 0;
459 out1[0] = 0;
460 out1[1] = 0;
461 out1[2] = 0;
462 out1[3] = 0;
463 break;
464 case 31: out0[0] = __byte_perm_S (in1[3], 0, 0x6543);
465 out0[1] = 0;
466 out0[2] = 0;
467 out0[3] = 0;
468 out1[0] = 0;
469 out1[1] = 0;
470 out1[2] = 0;
471 out1[3] = 0;
472 break;
473 }
474 #endif
475
476 #if defined IS_AMD || defined IS_GENERIC
477 switch (num)
478 {
479 case 0: out0[0] = in0[0];
480 out0[1] = in0[1];
481 out0[2] = in0[2];
482 out0[3] = in0[3];
483 out1[0] = in1[0];
484 out1[1] = in1[1];
485 out1[2] = in1[2];
486 out1[3] = in1[3];
487 break;
488 case 1: out0[0] = amd_bytealign_S (in0[1], in0[0], 1);
489 out0[1] = amd_bytealign_S (in0[2], in0[1], 1);
490 out0[2] = amd_bytealign_S (in0[3], in0[2], 1);
491 out0[3] = amd_bytealign_S (in1[0], in0[3], 1);
492 out1[0] = amd_bytealign_S (in1[1], in1[0], 1);
493 out1[1] = amd_bytealign_S (in1[2], in1[1], 1);
494 out1[2] = amd_bytealign_S (in1[3], in1[2], 1);
495 out1[3] = amd_bytealign_S ( 0, in1[3], 1);
496 break;
497 case 2: out0[0] = amd_bytealign_S (in0[1], in0[0], 2);
498 out0[1] = amd_bytealign_S (in0[2], in0[1], 2);
499 out0[2] = amd_bytealign_S (in0[3], in0[2], 2);
500 out0[3] = amd_bytealign_S (in1[0], in0[3], 2);
501 out1[0] = amd_bytealign_S (in1[1], in1[0], 2);
502 out1[1] = amd_bytealign_S (in1[2], in1[1], 2);
503 out1[2] = amd_bytealign_S (in1[3], in1[2], 2);
504 out1[3] = amd_bytealign_S ( 0, in1[3], 2);
505 break;
506 case 3: out0[0] = amd_bytealign_S (in0[1], in0[0], 3);
507 out0[1] = amd_bytealign_S (in0[2], in0[1], 3);
508 out0[2] = amd_bytealign_S (in0[3], in0[2], 3);
509 out0[3] = amd_bytealign_S (in1[0], in0[3], 3);
510 out1[0] = amd_bytealign_S (in1[1], in1[0], 3);
511 out1[1] = amd_bytealign_S (in1[2], in1[1], 3);
512 out1[2] = amd_bytealign_S (in1[3], in1[2], 3);
513 out1[3] = amd_bytealign_S ( 0, in1[3], 3);
514 break;
515 case 4: out0[0] = in0[1];
516 out0[1] = in0[2];
517 out0[2] = in0[3];
518 out0[3] = in1[0];
519 out1[0] = in1[1];
520 out1[1] = in1[2];
521 out1[2] = in1[3];
522 out1[3] = 0;
523 break;
524 case 5: out0[0] = amd_bytealign_S (in0[2], in0[1], 1);
525 out0[1] = amd_bytealign_S (in0[3], in0[2], 1);
526 out0[2] = amd_bytealign_S (in1[0], in0[3], 1);
527 out0[3] = amd_bytealign_S (in1[1], in1[0], 1);
528 out1[0] = amd_bytealign_S (in1[2], in1[1], 1);
529 out1[1] = amd_bytealign_S (in1[3], in1[2], 1);
530 out1[2] = amd_bytealign_S ( 0, in1[3], 1);
531 out1[3] = 0;
532 break;
533 case 6: out0[0] = amd_bytealign_S (in0[2], in0[1], 2);
534 out0[1] = amd_bytealign_S (in0[3], in0[2], 2);
535 out0[2] = amd_bytealign_S (in1[0], in0[3], 2);
536 out0[3] = amd_bytealign_S (in1[1], in1[0], 2);
537 out1[0] = amd_bytealign_S (in1[2], in1[1], 2);
538 out1[1] = amd_bytealign_S (in1[3], in1[2], 2);
539 out1[2] = amd_bytealign_S ( 0, in1[3], 2);
540 out1[3] = 0;
541 break;
542 case 7: out0[0] = amd_bytealign_S (in0[2], in0[1], 3);
543 out0[1] = amd_bytealign_S (in0[3], in0[2], 3);
544 out0[2] = amd_bytealign_S (in1[0], in0[3], 3);
545 out0[3] = amd_bytealign_S (in1[1], in1[0], 3);
546 out1[0] = amd_bytealign_S (in1[2], in1[1], 3);
547 out1[1] = amd_bytealign_S (in1[3], in1[2], 3);
548 out1[2] = amd_bytealign_S ( 0, in1[3], 3);
549 out1[3] = 0;
550 break;
551 case 8: out0[0] = in0[2];
552 out0[1] = in0[3];
553 out0[2] = in1[0];
554 out0[3] = in1[1];
555 out1[0] = in1[2];
556 out1[1] = in1[3];
557 out1[2] = 0;
558 out1[3] = 0;
559 break;
560 case 9: out0[0] = amd_bytealign_S (in0[3], in0[2], 1);
561 out0[1] = amd_bytealign_S (in1[0], in0[3], 1);
562 out0[2] = amd_bytealign_S (in1[1], in1[0], 1);
563 out0[3] = amd_bytealign_S (in1[2], in1[1], 1);
564 out1[0] = amd_bytealign_S (in1[3], in1[2], 1);
565 out1[1] = amd_bytealign_S ( 0, in1[3], 1);
566 out1[2] = 0;
567 out1[3] = 0;
568 break;
569 case 10: out0[0] = amd_bytealign_S (in0[3], in0[2], 2);
570 out0[1] = amd_bytealign_S (in1[0], in0[3], 2);
571 out0[2] = amd_bytealign_S (in1[1], in1[0], 2);
572 out0[3] = amd_bytealign_S (in1[2], in1[1], 2);
573 out1[0] = amd_bytealign_S (in1[3], in1[2], 2);
574 out1[1] = amd_bytealign_S ( 0, in1[3], 2);
575 out1[2] = 0;
576 out1[3] = 0;
577 break;
578 case 11: out0[0] = amd_bytealign_S (in0[3], in0[2], 3);
579 out0[1] = amd_bytealign_S (in1[0], in0[3], 3);
580 out0[2] = amd_bytealign_S (in1[1], in1[0], 3);
581 out0[3] = amd_bytealign_S (in1[2], in1[1], 3);
582 out1[0] = amd_bytealign_S (in1[3], in1[2], 3);
583 out1[1] = amd_bytealign_S ( 0, in1[3], 3);
584 out1[2] = 0;
585 out1[3] = 0;
586 break;
587 case 12: out0[0] = in0[3];
588 out0[1] = in1[0];
589 out0[2] = in1[1];
590 out0[3] = in1[2];
591 out1[0] = in1[3];
592 out1[1] = 0;
593 out1[2] = 0;
594 out1[3] = 0;
595 break;
596 case 13: out0[0] = amd_bytealign_S (in1[0], in0[3], 1);
597 out0[1] = amd_bytealign_S (in1[1], in1[0], 1);
598 out0[2] = amd_bytealign_S (in1[2], in1[1], 1);
599 out0[3] = amd_bytealign_S (in1[3], in1[2], 1);
600 out1[0] = amd_bytealign_S ( 0, in1[3], 1);
601 out1[1] = 0;
602 out1[2] = 0;
603 out1[3] = 0;
604 break;
605 case 14: out0[0] = amd_bytealign_S (in1[0], in0[3], 2);
606 out0[1] = amd_bytealign_S (in1[1], in1[0], 2);
607 out0[2] = amd_bytealign_S (in1[2], in1[1], 2);
608 out0[3] = amd_bytealign_S (in1[3], in1[2], 2);
609 out1[0] = amd_bytealign_S ( 0, in1[3], 2);
610 out1[1] = 0;
611 out1[2] = 0;
612 out1[3] = 0;
613 break;
614 case 15: out0[0] = amd_bytealign_S (in1[0], in0[3], 3);
615 out0[1] = amd_bytealign_S (in1[1], in1[0], 3);
616 out0[2] = amd_bytealign_S (in1[2], in1[1], 3);
617 out0[3] = amd_bytealign_S (in1[3], in1[2], 3);
618 out1[0] = amd_bytealign_S ( 0, in1[3], 3);
619 out1[1] = 0;
620 out1[2] = 0;
621 out1[3] = 0;
622 break;
623 case 16: out0[0] = in1[0];
624 out0[1] = in1[1];
625 out0[2] = in1[2];
626 out0[3] = in1[3];
627 out1[0] = 0;
628 out1[1] = 0;
629 out1[2] = 0;
630 out1[3] = 0;
631 break;
632 case 17: out0[0] = amd_bytealign_S (in1[1], in1[0], 1);
633 out0[1] = amd_bytealign_S (in1[2], in1[1], 1);
634 out0[2] = amd_bytealign_S (in1[3], in1[2], 1);
635 out0[3] = amd_bytealign_S ( 0, in1[3], 1);
636 out1[0] = 0;
637 out1[1] = 0;
638 out1[2] = 0;
639 out1[3] = 0;
640 break;
641 case 18: out0[0] = amd_bytealign_S (in1[1], in1[0], 2);
642 out0[1] = amd_bytealign_S (in1[2], in1[1], 2);
643 out0[2] = amd_bytealign_S (in1[3], in1[2], 2);
644 out0[3] = amd_bytealign_S ( 0, in1[3], 2);
645 out1[0] = 0;
646 out1[1] = 0;
647 out1[2] = 0;
648 out1[3] = 0;
649 break;
650 case 19: out0[0] = amd_bytealign_S (in1[1], in1[0], 3);
651 out0[1] = amd_bytealign_S (in1[2], in1[1], 3);
652 out0[2] = amd_bytealign_S (in1[3], in1[2], 3);
653 out0[3] = amd_bytealign_S ( 0, in1[3], 3);
654 out1[0] = 0;
655 out1[1] = 0;
656 out1[2] = 0;
657 out1[3] = 0;
658 break;
659 case 20: out0[0] = in1[1];
660 out0[1] = in1[2];
661 out0[2] = in1[3];
662 out0[3] = 0;
663 out1[0] = 0;
664 out1[1] = 0;
665 out1[2] = 0;
666 out1[3] = 0;
667 break;
668 case 21: out0[0] = amd_bytealign_S (in1[2], in1[1], 1);
669 out0[1] = amd_bytealign_S (in1[3], in1[2], 1);
670 out0[2] = amd_bytealign_S ( 0, in1[3], 1);
671 out0[3] = 0;
672 out1[0] = 0;
673 out1[1] = 0;
674 out1[2] = 0;
675 out1[3] = 0;
676 break;
677 case 22: out0[0] = amd_bytealign_S (in1[2], in1[1], 2);
678 out0[1] = amd_bytealign_S (in1[3], in1[2], 2);
679 out0[2] = amd_bytealign_S ( 0, in1[3], 2);
680 out0[3] = 0;
681 out1[0] = 0;
682 out1[1] = 0;
683 out1[2] = 0;
684 out1[3] = 0;
685 break;
686 case 23: out0[0] = amd_bytealign_S (in1[2], in1[1], 3);
687 out0[1] = amd_bytealign_S (in1[3], in1[2], 3);
688 out0[2] = amd_bytealign_S ( 0, in1[3], 3);
689 out0[3] = 0;
690 out1[0] = 0;
691 out1[1] = 0;
692 out1[2] = 0;
693 out1[3] = 0;
694 break;
695 case 24: out0[0] = in1[2];
696 out0[1] = in1[3];
697 out0[2] = 0;
698 out0[3] = 0;
699 out1[0] = 0;
700 out1[1] = 0;
701 out1[2] = 0;
702 out1[3] = 0;
703 break;
704 case 25: out0[0] = amd_bytealign_S (in1[3], in1[2], 1);
705 out0[1] = amd_bytealign_S ( 0, in1[3], 1);
706 out0[2] = 0;
707 out0[3] = 0;
708 out1[0] = 0;
709 out1[1] = 0;
710 out1[2] = 0;
711 out1[3] = 0;
712 break;
713 case 26: out0[0] = amd_bytealign_S (in1[3], in1[2], 2);
714 out0[1] = amd_bytealign_S ( 0, in1[3], 2);
715 out0[2] = 0;
716 out0[3] = 0;
717 out1[0] = 0;
718 out1[1] = 0;
719 out1[2] = 0;
720 out1[3] = 0;
721 break;
722 case 27: out0[0] = amd_bytealign_S (in1[3], in1[2], 3);
723 out0[1] = amd_bytealign_S ( 0, in1[3], 3);
724 out0[2] = 0;
725 out0[3] = 0;
726 out1[0] = 0;
727 out1[1] = 0;
728 out1[2] = 0;
729 out1[3] = 0;
730 break;
731 case 28: out0[0] = in1[3];
732 out0[1] = 0;
733 out0[2] = 0;
734 out0[3] = 0;
735 out1[0] = 0;
736 out1[1] = 0;
737 out1[2] = 0;
738 out1[3] = 0;
739 break;
740 case 29: out0[0] = amd_bytealign_S ( 0, in1[3], 1);
741 out0[1] = 0;
742 out0[2] = 0;
743 out0[3] = 0;
744 out1[0] = 0;
745 out1[1] = 0;
746 out1[2] = 0;
747 out1[3] = 0;
748 break;
749 case 30: out0[0] = amd_bytealign_S ( 0, in1[3], 2);
750 out0[1] = 0;
751 out0[2] = 0;
752 out0[3] = 0;
753 out1[0] = 0;
754 out1[1] = 0;
755 out1[2] = 0;
756 out1[3] = 0;
757 break;
758 case 31: out0[0] = amd_bytealign_S ( 0, in1[3], 3);
759 out0[1] = 0;
760 out0[2] = 0;
761 out0[3] = 0;
762 out1[0] = 0;
763 out1[1] = 0;
764 out1[2] = 0;
765 out1[3] = 0;
766 break;
767 }
768 #endif
769 }
770
771 inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num)
772 {
773 #ifdef IS_NV
774 switch (num)
775 {
776 case 0: out1[3] = in1[3];
777 out1[2] = in1[2];
778 out1[1] = in1[1];
779 out1[0] = in1[0];
780 out0[3] = in0[3];
781 out0[2] = in0[2];
782 out0[1] = in0[1];
783 out0[0] = in0[0];
784 break;
785 case 1: out1[3] = __byte_perm_S (in1[2], in1[3], 0x6543);
786 out1[2] = __byte_perm_S (in1[1], in1[2], 0x6543);
787 out1[1] = __byte_perm_S (in1[0], in1[1], 0x6543);
788 out1[0] = __byte_perm_S (in0[3], in1[0], 0x6543);
789 out0[3] = __byte_perm_S (in0[2], in0[3], 0x6543);
790 out0[2] = __byte_perm_S (in0[1], in0[2], 0x6543);
791 out0[1] = __byte_perm_S (in0[0], in0[1], 0x6543);
792 out0[0] = __byte_perm_S ( 0, in0[0], 0x6543);
793 break;
794 case 2: out1[3] = __byte_perm_S (in1[2], in1[3], 0x5432);
795 out1[2] = __byte_perm_S (in1[1], in1[2], 0x5432);
796 out1[1] = __byte_perm_S (in1[0], in1[1], 0x5432);
797 out1[0] = __byte_perm_S (in0[3], in1[0], 0x5432);
798 out0[3] = __byte_perm_S (in0[2], in0[3], 0x5432);
799 out0[2] = __byte_perm_S (in0[1], in0[2], 0x5432);
800 out0[1] = __byte_perm_S (in0[0], in0[1], 0x5432);
801 out0[0] = __byte_perm_S ( 0, in0[0], 0x5432);
802 break;
803 case 3: out1[3] = __byte_perm_S (in1[2], in1[3], 0x4321);
804 out1[2] = __byte_perm_S (in1[1], in1[2], 0x4321);
805 out1[1] = __byte_perm_S (in1[0], in1[1], 0x4321);
806 out1[0] = __byte_perm_S (in0[3], in1[0], 0x4321);
807 out0[3] = __byte_perm_S (in0[2], in0[3], 0x4321);
808 out0[2] = __byte_perm_S (in0[1], in0[2], 0x4321);
809 out0[1] = __byte_perm_S (in0[0], in0[1], 0x4321);
810 out0[0] = __byte_perm_S ( 0, in0[0], 0x4321);
811 break;
812 case 4: out1[3] = in1[2];
813 out1[2] = in1[1];
814 out1[1] = in1[0];
815 out1[0] = in0[3];
816 out0[3] = in0[2];
817 out0[2] = in0[1];
818 out0[1] = in0[0];
819 out0[0] = 0;
820 break;
821 case 5: out1[3] = __byte_perm_S (in1[1], in1[2], 0x6543);
822 out1[2] = __byte_perm_S (in1[0], in1[1], 0x6543);
823 out1[1] = __byte_perm_S (in0[3], in1[0], 0x6543);
824 out1[0] = __byte_perm_S (in0[2], in0[3], 0x6543);
825 out0[3] = __byte_perm_S (in0[1], in0[2], 0x6543);
826 out0[2] = __byte_perm_S (in0[0], in0[1], 0x6543);
827 out0[1] = __byte_perm_S ( 0, in0[0], 0x6543);
828 out0[0] = 0;
829 break;
830 case 6: out1[3] = __byte_perm_S (in1[1], in1[2], 0x5432);
831 out1[2] = __byte_perm_S (in1[0], in1[1], 0x5432);
832 out1[1] = __byte_perm_S (in0[3], in1[0], 0x5432);
833 out1[0] = __byte_perm_S (in0[2], in0[3], 0x5432);
834 out0[3] = __byte_perm_S (in0[1], in0[2], 0x5432);
835 out0[2] = __byte_perm_S (in0[0], in0[1], 0x5432);
836 out0[1] = __byte_perm_S ( 0, in0[0], 0x5432);
837 out0[0] = 0;
838 break;
839 case 7: out1[3] = __byte_perm_S (in1[1], in1[2], 0x4321);
840 out1[2] = __byte_perm_S (in1[0], in1[1], 0x4321);
841 out1[1] = __byte_perm_S (in0[3], in1[0], 0x4321);
842 out1[0] = __byte_perm_S (in0[2], in0[3], 0x4321);
843 out0[3] = __byte_perm_S (in0[1], in0[2], 0x4321);
844 out0[2] = __byte_perm_S (in0[0], in0[1], 0x4321);
845 out0[1] = __byte_perm_S ( 0, in0[0], 0x4321);
846 out0[0] = 0;
847 break;
848 case 8: out1[3] = in1[1];
849 out1[2] = in1[0];
850 out1[1] = in0[3];
851 out1[0] = in0[2];
852 out0[3] = in0[1];
853 out0[2] = in0[0];
854 out0[1] = 0;
855 out0[0] = 0;
856 break;
857 case 9: out1[3] = __byte_perm_S (in1[0], in1[1], 0x6543);
858 out1[2] = __byte_perm_S (in0[3], in1[0], 0x6543);
859 out1[1] = __byte_perm_S (in0[2], in0[3], 0x6543);
860 out1[0] = __byte_perm_S (in0[1], in0[2], 0x6543);
861 out0[3] = __byte_perm_S (in0[0], in0[1], 0x6543);
862 out0[2] = __byte_perm_S ( 0, in0[0], 0x6543);
863 out0[1] = 0;
864 out0[0] = 0;
865 break;
866 case 10: out1[3] = __byte_perm_S (in1[0], in1[1], 0x5432);
867 out1[2] = __byte_perm_S (in0[3], in1[0], 0x5432);
868 out1[1] = __byte_perm_S (in0[2], in0[3], 0x5432);
869 out1[0] = __byte_perm_S (in0[1], in0[2], 0x5432);
870 out0[3] = __byte_perm_S (in0[0], in0[1], 0x5432);
871 out0[2] = __byte_perm_S ( 0, in0[0], 0x5432);
872 out0[1] = 0;
873 out0[0] = 0;
874 break;
875 case 11: out1[3] = __byte_perm_S (in1[0], in1[1], 0x4321);
876 out1[2] = __byte_perm_S (in0[3], in1[0], 0x4321);
877 out1[1] = __byte_perm_S (in0[2], in0[3], 0x4321);
878 out1[0] = __byte_perm_S (in0[1], in0[2], 0x4321);
879 out0[3] = __byte_perm_S (in0[0], in0[1], 0x4321);
880 out0[2] = __byte_perm_S ( 0, in0[0], 0x4321);
881 out0[1] = 0;
882 out0[0] = 0;
883 break;
884 case 12: out1[3] = in1[0];
885 out1[2] = in0[3];
886 out1[1] = in0[2];
887 out1[0] = in0[1];
888 out0[3] = in0[0];
889 out0[2] = 0;
890 out0[1] = 0;
891 out0[0] = 0;
892 break;
893 case 13: out1[3] = __byte_perm_S (in0[3], in1[0], 0x6543);
894 out1[2] = __byte_perm_S (in0[2], in0[3], 0x6543);
895 out1[1] = __byte_perm_S (in0[1], in0[2], 0x6543);
896 out1[0] = __byte_perm_S (in0[0], in0[1], 0x6543);
897 out0[3] = __byte_perm_S ( 0, in0[0], 0x6543);
898 out0[2] = 0;
899 out0[1] = 0;
900 out0[0] = 0;
901 break;
902 case 14: out1[3] = __byte_perm_S (in0[3], in1[0], 0x5432);
903 out1[2] = __byte_perm_S (in0[2], in0[3], 0x5432);
904 out1[1] = __byte_perm_S (in0[1], in0[2], 0x5432);
905 out1[0] = __byte_perm_S (in0[0], in0[1], 0x5432);
906 out0[3] = __byte_perm_S ( 0, in0[0], 0x5432);
907 out0[2] = 0;
908 out0[1] = 0;
909 out0[0] = 0;
910 break;
911 case 15: out1[3] = __byte_perm_S (in0[3], in1[0], 0x4321);
912 out1[2] = __byte_perm_S (in0[2], in0[3], 0x4321);
913 out1[1] = __byte_perm_S (in0[1], in0[2], 0x4321);
914 out1[0] = __byte_perm_S (in0[0], in0[1], 0x4321);
915 out0[3] = __byte_perm_S ( 0, in0[0], 0x4321);
916 out0[2] = 0;
917 out0[1] = 0;
918 out0[0] = 0;
919 break;
920 case 16: out1[3] = in0[3];
921 out1[2] = in0[2];
922 out1[1] = in0[1];
923 out1[0] = in0[0];
924 out0[3] = 0;
925 out0[2] = 0;
926 out0[1] = 0;
927 out0[0] = 0;
928 break;
929 case 17: out1[3] = __byte_perm_S (in0[2], in0[3], 0x6543);
930 out1[2] = __byte_perm_S (in0[1], in0[2], 0x6543);
931 out1[1] = __byte_perm_S (in0[0], in0[1], 0x6543);
932 out1[0] = __byte_perm_S ( 0, in0[0], 0x6543);
933 out0[3] = 0;
934 out0[2] = 0;
935 out0[1] = 0;
936 out0[0] = 0;
937 break;
938 case 18: out1[3] = __byte_perm_S (in0[2], in0[3], 0x5432);
939 out1[2] = __byte_perm_S (in0[1], in0[2], 0x5432);
940 out1[1] = __byte_perm_S (in0[0], in0[1], 0x5432);
941 out1[0] = __byte_perm_S ( 0, in0[0], 0x5432);
942 out0[3] = 0;
943 out0[2] = 0;
944 out0[1] = 0;
945 out0[0] = 0;
946 break;
947 case 19: out1[3] = __byte_perm_S (in0[2], in0[3], 0x4321);
948 out1[2] = __byte_perm_S (in0[1], in0[2], 0x4321);
949 out1[1] = __byte_perm_S (in0[0], in0[1], 0x4321);
950 out1[0] = __byte_perm_S ( 0, in0[0], 0x4321);
951 out0[3] = 0;
952 out0[2] = 0;
953 out0[1] = 0;
954 out0[0] = 0;
955 break;
956 case 20: out1[3] = in0[2];
957 out1[2] = in0[1];
958 out1[1] = in0[0];
959 out1[0] = 0;
960 out0[3] = 0;
961 out0[2] = 0;
962 out0[1] = 0;
963 out0[0] = 0;
964 break;
965 case 21: out1[3] = __byte_perm_S (in0[1], in0[2], 0x6543);
966 out1[2] = __byte_perm_S (in0[0], in0[1], 0x6543);
967 out1[1] = __byte_perm_S ( 0, in0[0], 0x6543);
968 out1[0] = 0;
969 out0[3] = 0;
970 out0[2] = 0;
971 out0[1] = 0;
972 out0[0] = 0;
973 break;
974 case 22: out1[3] = __byte_perm_S (in0[1], in0[2], 0x5432);
975 out1[2] = __byte_perm_S (in0[0], in0[1], 0x5432);
976 out1[1] = __byte_perm_S ( 0, in0[0], 0x5432);
977 out1[0] = 0;
978 out0[3] = 0;
979 out0[2] = 0;
980 out0[1] = 0;
981 out0[0] = 0;
982 break;
983 case 23: out1[3] = __byte_perm_S (in0[1], in0[2], 0x4321);
984 out1[2] = __byte_perm_S (in0[0], in0[1], 0x4321);
985 out1[1] = __byte_perm_S ( 0, in0[0], 0x4321);
986 out1[0] = 0;
987 out0[3] = 0;
988 out0[2] = 0;
989 out0[1] = 0;
990 out0[0] = 0;
991 break;
992 case 24: out1[3] = in0[1];
993 out1[2] = in0[0];
994 out1[1] = 0;
995 out1[0] = 0;
996 out0[3] = 0;
997 out0[2] = 0;
998 out0[1] = 0;
999 out0[0] = 0;
1000 break;
1001 case 25: out1[3] = __byte_perm_S (in0[0], in0[1], 0x6543);
1002 out1[2] = __byte_perm_S ( 0, in0[0], 0x6543);
1003 out1[1] = 0;
1004 out1[0] = 0;
1005 out0[3] = 0;
1006 out0[2] = 0;
1007 out0[1] = 0;
1008 out0[0] = 0;
1009 break;
1010 case 26: out1[3] = __byte_perm_S (in0[0], in0[1], 0x5432);
1011 out1[2] = __byte_perm_S ( 0, in0[0], 0x5432);
1012 out1[1] = 0;
1013 out1[0] = 0;
1014 out0[3] = 0;
1015 out0[2] = 0;
1016 out0[1] = 0;
1017 out0[0] = 0;
1018 break;
1019 case 27: out1[3] = __byte_perm_S (in0[0], in0[1], 0x4321);
1020 out1[2] = __byte_perm_S ( 0, in0[0], 0x4321);
1021 out1[1] = 0;
1022 out1[0] = 0;
1023 out0[3] = 0;
1024 out0[2] = 0;
1025 out0[1] = 0;
1026 out0[0] = 0;
1027 break;
1028 case 28: out1[3] = in0[0];
1029 out1[2] = 0;
1030 out1[1] = 0;
1031 out1[0] = 0;
1032 out0[3] = 0;
1033 out0[2] = 0;
1034 out0[1] = 0;
1035 out0[0] = 0;
1036 break;
1037 case 29: out1[3] = __byte_perm_S ( 0, in0[0], 0x6543);
1038 out1[2] = 0;
1039 out1[1] = 0;
1040 out1[0] = 0;
1041 out0[3] = 0;
1042 out0[2] = 0;
1043 out0[1] = 0;
1044 out0[0] = 0;
1045 break;
1046 case 30: out1[3] = __byte_perm_S ( 0, in0[0], 0x5432);
1047 out1[2] = 0;
1048 out1[1] = 0;
1049 out1[0] = 0;
1050 out0[3] = 0;
1051 out0[2] = 0;
1052 out0[1] = 0;
1053 out0[0] = 0;
1054 break;
1055 case 31: out1[3] = __byte_perm_S ( 0, in0[0], 0x4321);
1056 out1[2] = 0;
1057 out1[1] = 0;
1058 out1[0] = 0;
1059 out0[3] = 0;
1060 out0[2] = 0;
1061 out0[1] = 0;
1062 out0[0] = 0;
1063 break;
1064 }
1065 #endif
1066
1067 #if defined IS_AMD || defined IS_GENERIC
1068 switch (num)
1069 {
1070 case 0: out1[3] = in1[3];
1071 out1[2] = in1[2];
1072 out1[1] = in1[1];
1073 out1[0] = in1[0];
1074 out0[3] = in0[3];
1075 out0[2] = in0[2];
1076 out0[1] = in0[1];
1077 out0[0] = in0[0];
1078 break;
1079 case 1: out1[3] = amd_bytealign_S (in1[3], in1[2], 3);
1080 out1[2] = amd_bytealign_S (in1[2], in1[1], 3);
1081 out1[1] = amd_bytealign_S (in1[1], in1[0], 3);
1082 out1[0] = amd_bytealign_S (in1[0], in0[3], 3);
1083 out0[3] = amd_bytealign_S (in0[3], in0[2], 3);
1084 out0[2] = amd_bytealign_S (in0[2], in0[1], 3);
1085 out0[1] = amd_bytealign_S (in0[1], in0[0], 3);
1086 out0[0] = amd_bytealign_S (in0[0], 0, 3);
1087 break;
1088 case 2: out1[3] = amd_bytealign_S (in1[3], in1[2], 2);
1089 out1[2] = amd_bytealign_S (in1[2], in1[1], 2);
1090 out1[1] = amd_bytealign_S (in1[1], in1[0], 2);
1091 out1[0] = amd_bytealign_S (in1[0], in0[3], 2);
1092 out0[3] = amd_bytealign_S (in0[3], in0[2], 2);
1093 out0[2] = amd_bytealign_S (in0[2], in0[1], 2);
1094 out0[1] = amd_bytealign_S (in0[1], in0[0], 2);
1095 out0[0] = amd_bytealign_S (in0[0], 0, 2);
1096 break;
1097 case 3: out1[3] = amd_bytealign_S (in1[3], in1[2], 1);
1098 out1[2] = amd_bytealign_S (in1[2], in1[1], 1);
1099 out1[1] = amd_bytealign_S (in1[1], in1[0], 1);
1100 out1[0] = amd_bytealign_S (in1[0], in0[3], 1);
1101 out0[3] = amd_bytealign_S (in0[3], in0[2], 1);
1102 out0[2] = amd_bytealign_S (in0[2], in0[1], 1);
1103 out0[1] = amd_bytealign_S (in0[1], in0[0], 1);
1104 out0[0] = amd_bytealign_S (in0[0], 0, 1);
1105 break;
1106 case 4: out1[3] = in1[2];
1107 out1[2] = in1[1];
1108 out1[1] = in1[0];
1109 out1[0] = in0[3];
1110 out0[3] = in0[2];
1111 out0[2] = in0[1];
1112 out0[1] = in0[0];
1113 out0[0] = 0;
1114 break;
1115 case 5: out1[3] = amd_bytealign_S (in1[2], in1[1], 3);
1116 out1[2] = amd_bytealign_S (in1[1], in1[0], 3);
1117 out1[1] = amd_bytealign_S (in1[0], in0[3], 3);
1118 out1[0] = amd_bytealign_S (in0[3], in0[2], 3);
1119 out0[3] = amd_bytealign_S (in0[2], in0[1], 3);
1120 out0[2] = amd_bytealign_S (in0[1], in0[0], 3);
1121 out0[1] = amd_bytealign_S (in0[0], 0, 3);
1122 out0[0] = 0;
1123 break;
1124 case 6: out1[3] = amd_bytealign_S (in1[2], in1[1], 2);
1125 out1[2] = amd_bytealign_S (in1[1], in1[0], 2);
1126 out1[1] = amd_bytealign_S (in1[0], in0[3], 2);
1127 out1[0] = amd_bytealign_S (in0[3], in0[2], 2);
1128 out0[3] = amd_bytealign_S (in0[2], in0[1], 2);
1129 out0[2] = amd_bytealign_S (in0[1], in0[0], 2);
1130 out0[1] = amd_bytealign_S (in0[0], 0, 2);
1131 out0[0] = 0;
1132 break;
1133 case 7: out1[3] = amd_bytealign_S (in1[2], in1[1], 1);
1134 out1[2] = amd_bytealign_S (in1[1], in1[0], 1);
1135 out1[1] = amd_bytealign_S (in1[0], in0[3], 1);
1136 out1[0] = amd_bytealign_S (in0[3], in0[2], 1);
1137 out0[3] = amd_bytealign_S (in0[2], in0[1], 1);
1138 out0[2] = amd_bytealign_S (in0[1], in0[0], 1);
1139 out0[1] = amd_bytealign_S (in0[0], 0, 1);
1140 out0[0] = 0;
1141 break;
1142 case 8: out1[3] = in1[1];
1143 out1[2] = in1[0];
1144 out1[1] = in0[3];
1145 out1[0] = in0[2];
1146 out0[3] = in0[1];
1147 out0[2] = in0[0];
1148 out0[1] = 0;
1149 out0[0] = 0;
1150 break;
1151 case 9: out1[3] = amd_bytealign_S (in1[1], in1[0], 3);
1152 out1[2] = amd_bytealign_S (in1[0], in0[3], 3);
1153 out1[1] = amd_bytealign_S (in0[3], in0[2], 3);
1154 out1[0] = amd_bytealign_S (in0[2], in0[1], 3);
1155 out0[3] = amd_bytealign_S (in0[1], in0[0], 3);
1156 out0[2] = amd_bytealign_S (in0[0], 0, 3);
1157 out0[1] = 0;
1158 out0[0] = 0;
1159 break;
1160 case 10: out1[3] = amd_bytealign_S (in1[1], in1[0], 2);
1161 out1[2] = amd_bytealign_S (in1[0], in0[3], 2);
1162 out1[1] = amd_bytealign_S (in0[3], in0[2], 2);
1163 out1[0] = amd_bytealign_S (in0[2], in0[1], 2);
1164 out0[3] = amd_bytealign_S (in0[1], in0[0], 2);
1165 out0[2] = amd_bytealign_S (in0[0], 0, 2);
1166 out0[1] = 0;
1167 out0[0] = 0;
1168 break;
1169 case 11: out1[3] = amd_bytealign_S (in1[1], in1[0], 1);
1170 out1[2] = amd_bytealign_S (in1[0], in0[3], 1);
1171 out1[1] = amd_bytealign_S (in0[3], in0[2], 1);
1172 out1[0] = amd_bytealign_S (in0[2], in0[1], 1);
1173 out0[3] = amd_bytealign_S (in0[1], in0[0], 1);
1174 out0[2] = amd_bytealign_S (in0[0], 0, 1);
1175 out0[1] = 0;
1176 out0[0] = 0;
1177 break;
1178 case 12: out1[3] = in1[0];
1179 out1[2] = in0[3];
1180 out1[1] = in0[2];
1181 out1[0] = in0[1];
1182 out0[3] = in0[0];
1183 out0[2] = 0;
1184 out0[1] = 0;
1185 out0[0] = 0;
1186 break;
1187 case 13: out1[3] = amd_bytealign_S (in1[0], in0[3], 3);
1188 out1[2] = amd_bytealign_S (in0[3], in0[2], 3);
1189 out1[1] = amd_bytealign_S (in0[2], in0[1], 3);
1190 out1[0] = amd_bytealign_S (in0[1], in0[0], 3);
1191 out0[3] = amd_bytealign_S (in0[0], 0, 3);
1192 out0[2] = 0;
1193 out0[1] = 0;
1194 out0[0] = 0;
1195 break;
1196 case 14: out1[3] = amd_bytealign_S (in1[0], in0[3], 2);
1197 out1[2] = amd_bytealign_S (in0[3], in0[2], 2);
1198 out1[1] = amd_bytealign_S (in0[2], in0[1], 2);
1199 out1[0] = amd_bytealign_S (in0[1], in0[0], 2);
1200 out0[3] = amd_bytealign_S (in0[0], 0, 2);
1201 out0[2] = 0;
1202 out0[1] = 0;
1203 out0[0] = 0;
1204 break;
1205 case 15: out1[3] = amd_bytealign_S (in1[0], in0[3], 1);
1206 out1[2] = amd_bytealign_S (in0[3], in0[2], 1);
1207 out1[1] = amd_bytealign_S (in0[2], in0[1], 1);
1208 out1[0] = amd_bytealign_S (in0[1], in0[0], 1);
1209 out0[3] = amd_bytealign_S (in0[0], 0, 1);
1210 out0[2] = 0;
1211 out0[1] = 0;
1212 out0[0] = 0;
1213 break;
1214 case 16: out1[3] = in0[3];
1215 out1[2] = in0[2];
1216 out1[1] = in0[1];
1217 out1[0] = in0[0];
1218 out0[3] = 0;
1219 out0[2] = 0;
1220 out0[1] = 0;
1221 out0[0] = 0;
1222 break;
1223 case 17: out1[3] = amd_bytealign_S (in0[3], in0[2], 3);
1224 out1[2] = amd_bytealign_S (in0[2], in0[1], 3);
1225 out1[1] = amd_bytealign_S (in0[1], in0[0], 3);
1226 out1[0] = amd_bytealign_S (in0[0], 0, 3);
1227 out0[3] = 0;
1228 out0[2] = 0;
1229 out0[1] = 0;
1230 out0[0] = 0;
1231 break;
1232 case 18: out1[3] = amd_bytealign_S (in0[3], in0[2], 2);
1233 out1[2] = amd_bytealign_S (in0[2], in0[1], 2);
1234 out1[1] = amd_bytealign_S (in0[1], in0[0], 2);
1235 out1[0] = amd_bytealign_S (in0[0], 0, 2);
1236 out0[3] = 0;
1237 out0[2] = 0;
1238 out0[1] = 0;
1239 out0[0] = 0;
1240 break;
1241 case 19: out1[3] = amd_bytealign_S (in0[3], in0[2], 1);
1242 out1[2] = amd_bytealign_S (in0[2], in0[1], 1);
1243 out1[1] = amd_bytealign_S (in0[1], in0[0], 1);
1244 out1[0] = amd_bytealign_S (in0[0], 0, 1);
1245 out0[3] = 0;
1246 out0[2] = 0;
1247 out0[1] = 0;
1248 out0[0] = 0;
1249 break;
1250 case 20: out1[3] = in0[2];
1251 out1[2] = in0[1];
1252 out1[1] = in0[0];
1253 out1[0] = 0;
1254 out0[3] = 0;
1255 out0[2] = 0;
1256 out0[1] = 0;
1257 out0[0] = 0;
1258 break;
1259 case 21: out1[3] = amd_bytealign_S (in0[2], in0[1], 3);
1260 out1[2] = amd_bytealign_S (in0[1], in0[0], 3);
1261 out1[1] = amd_bytealign_S (in0[0], 0, 3);
1262 out1[0] = 0;
1263 out0[3] = 0;
1264 out0[2] = 0;
1265 out0[1] = 0;
1266 out0[0] = 0;
1267 break;
1268 case 22: out1[3] = amd_bytealign_S (in0[2], in0[1], 2);
1269 out1[2] = amd_bytealign_S (in0[1], in0[0], 2);
1270 out1[1] = amd_bytealign_S (in0[0], 0, 2);
1271 out1[0] = 0;
1272 out0[3] = 0;
1273 out0[2] = 0;
1274 out0[1] = 0;
1275 out0[0] = 0;
1276 break;
1277 case 23: out1[3] = amd_bytealign_S (in0[2], in0[1], 1);
1278 out1[2] = amd_bytealign_S (in0[1], in0[0], 1);
1279 out1[1] = amd_bytealign_S (in0[0], 0, 1);
1280 out1[0] = 0;
1281 out0[3] = 0;
1282 out0[2] = 0;
1283 out0[1] = 0;
1284 out0[0] = 0;
1285 break;
1286 case 24: out1[3] = in0[1];
1287 out1[2] = in0[0];
1288 out1[1] = 0;
1289 out1[0] = 0;
1290 out0[3] = 0;
1291 out0[2] = 0;
1292 out0[1] = 0;
1293 out0[0] = 0;
1294 break;
1295 case 25: out1[3] = amd_bytealign_S (in0[1], in0[0], 3);
1296 out1[2] = amd_bytealign_S (in0[0], 0, 3);
1297 out1[1] = 0;
1298 out1[0] = 0;
1299 out0[3] = 0;
1300 out0[2] = 0;
1301 out0[1] = 0;
1302 out0[0] = 0;
1303 break;
1304 case 26: out1[3] = amd_bytealign_S (in0[1], in0[0], 2);
1305 out1[2] = amd_bytealign_S (in0[0], 0, 2);
1306 out1[1] = 0;
1307 out1[0] = 0;
1308 out0[3] = 0;
1309 out0[2] = 0;
1310 out0[1] = 0;
1311 out0[0] = 0;
1312 break;
1313 case 27: out1[3] = amd_bytealign_S (in0[1], in0[0], 1);
1314 out1[2] = amd_bytealign_S (in0[0], 0, 1);
1315 out1[1] = 0;
1316 out1[0] = 0;
1317 out0[3] = 0;
1318 out0[2] = 0;
1319 out0[1] = 0;
1320 out0[0] = 0;
1321 break;
1322 case 28: out1[3] = in0[0];
1323 out1[2] = 0;
1324 out1[1] = 0;
1325 out1[0] = 0;
1326 out0[3] = 0;
1327 out0[2] = 0;
1328 out0[1] = 0;
1329 out0[0] = 0;
1330 break;
1331 case 29: out1[3] = amd_bytealign_S (in0[0], 0, 3);
1332 out1[2] = 0;
1333 out1[1] = 0;
1334 out1[0] = 0;
1335 out0[3] = 0;
1336 out0[2] = 0;
1337 out0[1] = 0;
1338 out0[0] = 0;
1339 break;
1340 case 30: out1[3] = amd_bytealign_S (in0[0], 0, 2);
1341 out1[2] = 0;
1342 out1[1] = 0;
1343 out1[0] = 0;
1344 out0[3] = 0;
1345 out0[2] = 0;
1346 out0[1] = 0;
1347 out0[0] = 0;
1348 break;
1349 case 31: out1[3] = amd_bytealign_S (in0[0], 0, 1);
1350 out1[2] = 0;
1351 out1[1] = 0;
1352 out1[0] = 0;
1353 out0[3] = 0;
1354 out0[2] = 0;
1355 out0[1] = 0;
1356 out0[0] = 0;
1357 break;
1358 }
1359 #endif
1360 }
1361
1362 inline void append_block1 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32 src_r0)
1363 {
1364 u32 tmp[2];
1365
1366 switch (offset & 3)
1367 {
1368 case 0: tmp[0] = src_r0;
1369 tmp[1] = 0;
1370 break;
1371 case 1: tmp[0] = src_r0 << 8;
1372 tmp[1] = src_r0 >> 24;
1373 break;
1374 case 2: tmp[0] = src_r0 << 16;
1375 tmp[1] = src_r0 >> 16;
1376 break;
1377 case 3: tmp[0] = src_r0 << 24;
1378 tmp[1] = src_r0 >> 8;
1379 break;
1380 }
1381
1382 switch (offset / 4)
1383 {
1384 case 0: dst0[0] |= tmp[0];
1385 dst0[1] = tmp[1];
1386 break;
1387 case 1: dst0[1] |= tmp[0];
1388 dst0[2] = tmp[1];
1389 break;
1390 case 2: dst0[2] |= tmp[0];
1391 dst0[3] = tmp[1];
1392 break;
1393 case 3: dst0[3] |= tmp[0];
1394 dst1[0] = tmp[1];
1395 break;
1396 case 4: dst1[0] |= tmp[0];
1397 dst1[1] = tmp[1];
1398 break;
1399 case 5: dst1[1] |= tmp[0];
1400 dst1[2] = tmp[1];
1401 break;
1402 case 6: dst1[2] |= tmp[0];
1403 dst1[3] = tmp[1];
1404 break;
1405 case 7: dst1[3] |= tmp[0];
1406 break;
1407 }
1408 }
1409
1410 inline void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4])
1411 {
1412 /*
1413 #ifdef IS_NV
1414 switch (offset)
1415 {
1416 case 0:
1417 dst0[0] = src_r0[0];
1418 dst0[1] = src_r0[1];
1419 dst0[2] = src_r0[2];
1420 dst0[3] = src_r0[3];
1421 dst1[0] = src_r1[0];
1422 dst1[1] = src_r1[1];
1423 dst1[2] = src_r1[2];
1424 dst1[3] = src_r1[3];
1425 break;
1426
1427 case 1:
1428 dst0[0] = __byte_perm_S (src_l0[0], src_r0[0], 0x6540);
1429 dst0[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1430 dst0[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1431 dst0[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1432 dst1[0] = __byte_perm_S (src_r0[3], src_r1[0], 0x6543);
1433 dst1[1] = __byte_perm_S (src_r1[0], src_r1[1], 0x6543);
1434 dst1[2] = __byte_perm_S (src_r1[1], src_r1[2], 0x6543);
1435 dst1[3] = __byte_perm_S (src_r1[2], src_r1[3], 0x6543);
1436 break;
1437
1438 case 2:
1439 dst0[0] = __byte_perm_S (src_l0[0], src_r0[0], 0x5410);
1440 dst0[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1441 dst0[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1442 dst0[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1443 dst1[0] = __byte_perm_S (src_r0[3], src_r1[0], 0x5432);
1444 dst1[1] = __byte_perm_S (src_r1[0], src_r1[1], 0x5432);
1445 dst1[2] = __byte_perm_S (src_r1[1], src_r1[2], 0x5432);
1446 dst1[3] = __byte_perm_S (src_r1[2], src_r1[3], 0x5432);
1447 break;
1448
1449 case 3:
1450 dst0[0] = __byte_perm_S (src_l0[0], src_r0[0], 0x4210);
1451 dst0[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1452 dst0[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1453 dst0[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1454 dst1[0] = __byte_perm_S (src_r0[3], src_r1[0], 0x4321);
1455 dst1[1] = __byte_perm_S (src_r1[0], src_r1[1], 0x4321);
1456 dst1[2] = __byte_perm_S (src_r1[1], src_r1[2], 0x4321);
1457 dst1[3] = __byte_perm_S (src_r1[2], src_r1[3], 0x4321);
1458 break;
1459
1460 case 4:
1461 dst0[1] = src_r0[0];
1462 dst0[2] = src_r0[1];
1463 dst0[3] = src_r0[2];
1464 dst1[0] = src_r0[3];
1465 dst1[1] = src_r1[0];
1466 dst1[2] = src_r1[1];
1467 dst1[3] = src_r1[2];
1468 break;
1469
1470 case 5:
1471 dst0[1] = __byte_perm_S (src_l0[1], src_r0[0], 0x6540);
1472 dst0[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1473 dst0[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1474 dst1[0] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1475 dst1[1] = __byte_perm_S (src_r0[3], src_r1[0], 0x6543);
1476 dst1[2] = __byte_perm_S (src_r1[0], src_r1[1], 0x6543);
1477 dst1[3] = __byte_perm_S (src_r1[1], src_r1[2], 0x6543);
1478 break;
1479
1480 case 6:
1481 dst0[1] = __byte_perm_S (src_l0[1], src_r0[0], 0x5410);
1482 dst0[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1483 dst0[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1484 dst1[0] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1485 dst1[1] = __byte_perm_S (src_r0[3], src_r1[0], 0x5432);
1486 dst1[2] = __byte_perm_S (src_r1[0], src_r1[1], 0x5432);
1487 dst1[3] = __byte_perm_S (src_r1[1], src_r1[2], 0x5432);
1488 break;
1489
1490 case 7:
1491 dst0[1] = __byte_perm_S (src_l0[1], src_r0[0], 0x4210);
1492 dst0[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1493 dst0[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1494 dst1[0] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1495 dst1[1] = __byte_perm_S (src_r0[3], src_r1[0], 0x4321);
1496 dst1[2] = __byte_perm_S (src_r1[0], src_r1[1], 0x4321);
1497 dst1[3] = __byte_perm_S (src_r1[1], src_r1[2], 0x4321);
1498 break;
1499
1500 case 8:
1501 dst0[2] = src_r0[0];
1502 dst0[3] = src_r0[1];
1503 dst1[0] = src_r0[2];
1504 dst1[1] = src_r0[3];
1505 dst1[2] = src_r1[0];
1506 dst1[3] = src_r1[1];
1507 break;
1508
1509 case 9:
1510 dst0[2] = __byte_perm_S (src_l0[2], src_r0[0], 0x6540);
1511 dst0[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1512 dst1[0] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1513 dst1[1] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1514 dst1[2] = __byte_perm_S (src_r0[3], src_r1[0], 0x6543);
1515 dst1[3] = __byte_perm_S (src_r1[0], src_r1[1], 0x6543);
1516 break;
1517
1518 case 10:
1519 dst0[2] = __byte_perm_S (src_l0[2], src_r0[0], 0x5410);
1520 dst0[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1521 dst1[0] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1522 dst1[1] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1523 dst1[2] = __byte_perm_S (src_r0[3], src_r1[0], 0x5432);
1524 dst1[3] = __byte_perm_S (src_r1[0], src_r1[1], 0x5432);
1525 break;
1526
1527 case 11:
1528 dst0[2] = __byte_perm_S (src_l0[2], src_r0[0], 0x4210);
1529 dst0[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1530 dst1[0] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1531 dst1[1] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1532 dst1[2] = __byte_perm_S (src_r0[3], src_r1[0], 0x4321);
1533 dst1[3] = __byte_perm_S (src_r1[0], src_r1[1], 0x4321);
1534 break;
1535
1536 case 12:
1537 dst0[3] = src_r0[0];
1538 dst1[0] = src_r0[1];
1539 dst1[1] = src_r0[2];
1540 dst1[2] = src_r0[3];
1541 dst1[3] = src_r1[0];
1542 break;
1543
1544 case 13:
1545 dst0[3] = __byte_perm_S (src_l0[3], src_r0[0], 0x6540);
1546 dst1[0] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1547 dst1[1] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1548 dst1[2] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1549 dst1[3] = __byte_perm_S (src_r0[3], src_r1[0], 0x6543);
1550 break;
1551
1552 case 14:
1553 dst0[3] = __byte_perm_S (src_l0[3], src_r0[0], 0x5410);
1554 dst1[0] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1555 dst1[1] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1556 dst1[2] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1557 dst1[3] = __byte_perm_S (src_r0[3], src_r1[0], 0x5432);
1558 break;
1559
1560 case 15:
1561 dst0[3] = __byte_perm_S (src_l0[3], src_r0[0], 0x4210);
1562 dst1[0] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1563 dst1[1] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1564 dst1[2] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1565 dst1[3] = __byte_perm_S (src_r0[3], src_r1[0], 0x4321);
1566 break;
1567
1568 case 16:
1569 dst1[0] = src_r0[0];
1570 dst1[1] = src_r0[1];
1571 dst1[2] = src_r0[2];
1572 dst1[3] = src_r0[3];
1573 break;
1574
1575 case 17:
1576 dst1[0] = __byte_perm_S (src_l1[0], src_r0[0], 0x6540);
1577 dst1[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1578 dst1[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1579 dst1[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x6543);
1580 break;
1581
1582 case 18:
1583 dst1[0] = __byte_perm_S (src_l1[0], src_r0[0], 0x5410);
1584 dst1[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1585 dst1[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1586 dst1[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x5432);
1587 break;
1588
1589 case 19:
1590 dst1[0] = __byte_perm_S (src_l1[0], src_r0[0], 0x4210);
1591 dst1[1] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1592 dst1[2] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1593 dst1[3] = __byte_perm_S (src_r0[2], src_r0[3], 0x4321);
1594 break;
1595
1596 case 20:
1597 dst1[1] = src_r0[0];
1598 dst1[2] = src_r0[1];
1599 dst1[3] = src_r0[2];
1600 break;
1601
1602 case 21:
1603 dst1[1] = __byte_perm_S (src_l1[1], src_r0[0], 0x6540);
1604 dst1[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1605 dst1[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x6543);
1606 break;
1607
1608 case 22:
1609 dst1[1] = __byte_perm_S (src_l1[1], src_r0[0], 0x5410);
1610 dst1[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1611 dst1[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x5432);
1612 break;
1613
1614 case 23:
1615 dst1[1] = __byte_perm_S (src_l1[1], src_r0[0], 0x4210);
1616 dst1[2] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1617 dst1[3] = __byte_perm_S (src_r0[1], src_r0[2], 0x4321);
1618 break;
1619
1620 case 24:
1621 dst1[2] = src_r0[0];
1622 dst1[3] = src_r0[1];
1623 break;
1624
1625 case 25:
1626 dst1[2] = __byte_perm_S (src_l1[2], src_r0[0], 0x6540);
1627 dst1[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x6543);
1628 break;
1629
1630 case 26:
1631 dst1[2] = __byte_perm_S (src_l1[2], src_r0[0], 0x5410);
1632 dst1[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x5432);
1633 break;
1634
1635 case 27:
1636 dst1[2] = __byte_perm_S (src_l1[2], src_r0[0], 0x4210);
1637 dst1[3] = __byte_perm_S (src_r0[0], src_r0[1], 0x4321);
1638 break;
1639
1640 case 28:
1641 dst1[3] = src_r0[0];
1642 break;
1643
1644 case 29:
1645 dst1[3] = __byte_perm_S (src_l1[3], src_r0[0], 0x6540);
1646 break;
1647
1648 case 30:
1649 dst1[3] = __byte_perm_S (src_l1[3], src_r0[0], 0x5410);
1650 break;
1651
1652 case 31:
1653 dst1[3] = __byte_perm_S (src_l1[3], src_r0[0], 0x4210);
1654 break;
1655 }
1656 #endif
1657
1658 #if defined IS_AMD || defined IS_GENERIC
1659 */
1660 switch (offset)
1661 {
1662 case 31:
1663 dst1[3] = src_l1[3] | src_r0[0] << 24;
1664 break;
1665 case 30:
1666 dst1[3] = src_l1[3] | src_r0[0] << 16;
1667 break;
1668 case 29:
1669 dst1[3] = src_l1[3] | src_r0[0] << 8;
1670 break;
1671 case 28:
1672 dst1[3] = src_r0[0];
1673 break;
1674 case 27:
1675 dst1[3] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1676 dst1[2] = src_l1[2] | src_r0[0] << 24;
1677 break;
1678 case 26:
1679 dst1[3] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1680 dst1[2] = src_l1[2] | src_r0[0] << 16;
1681 break;
1682 case 25:
1683 dst1[3] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1684 dst1[2] = src_l1[2] | src_r0[0] << 8;
1685 break;
1686 case 24:
1687 dst1[3] = src_r0[1];
1688 dst1[2] = src_r0[0];
1689 break;
1690 case 23:
1691 dst1[3] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1692 dst1[2] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1693 dst1[1] = src_l1[1] | src_r0[0] << 24;
1694 break;
1695 case 22:
1696 dst1[3] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1697 dst1[2] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1698 dst1[1] = src_l1[1] | src_r0[0] << 16;
1699 break;
1700 case 21:
1701 dst1[3] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1702 dst1[2] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1703 dst1[1] = src_l1[1] | src_r0[0] << 8;
1704 break;
1705 case 20:
1706 dst1[3] = src_r0[2];
1707 dst1[2] = src_r0[1];
1708 dst1[1] = src_r0[0];
1709 break;
1710 case 19:
1711 dst1[3] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1712 dst1[2] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1713 dst1[1] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1714 dst1[0] = src_l1[0] | src_r0[0] << 24;
1715 break;
1716 case 18:
1717 dst1[3] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1718 dst1[2] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1719 dst1[1] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1720 dst1[0] = src_l1[0] | src_r0[0] << 16;
1721 break;
1722 case 17:
1723 dst1[3] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1724 dst1[2] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1725 dst1[1] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1726 dst1[0] = src_l1[0] | src_r0[0] << 8;
1727 break;
1728 case 16:
1729 dst1[3] = src_r0[3];
1730 dst1[2] = src_r0[2];
1731 dst1[1] = src_r0[1];
1732 dst1[0] = src_r0[0];
1733 break;
1734 case 15:
1735 dst1[3] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
1736 dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1737 dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1738 dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1739 dst0[3] = src_l0[3] | src_r0[0] << 24;
1740 break;
1741 case 14:
1742 dst1[3] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
1743 dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1744 dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1745 dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1746 dst0[3] = src_l0[3] | src_r0[0] << 16;
1747 break;
1748 case 13:
1749 dst1[3] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
1750 dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1751 dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1752 dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1753 dst0[3] = src_l0[3] | src_r0[0] << 8;
1754 break;
1755 case 12:
1756 dst1[3] = src_r1[0];
1757 dst1[2] = src_r0[3];
1758 dst1[1] = src_r0[2];
1759 dst1[0] = src_r0[1];
1760 dst0[3] = src_r0[0];
1761 break;
1762 case 11:
1763 dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
1764 dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
1765 dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1766 dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1767 dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1768 dst0[2] = src_l0[2] | src_r0[0] << 24;
1769 break;
1770 case 10:
1771 dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
1772 dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
1773 dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1774 dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1775 dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1776 dst0[2] = src_l0[2] | src_r0[0] << 16;
1777 break;
1778 case 9:
1779 dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
1780 dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
1781 dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1782 dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1783 dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1784 dst0[2] = src_l0[2] | src_r0[0] << 8;
1785 break;
1786 case 8:
1787 dst1[3] = src_r1[1];
1788 dst1[2] = src_r1[0];
1789 dst1[1] = src_r0[3];
1790 dst1[0] = src_r0[2];
1791 dst0[3] = src_r0[1];
1792 dst0[2] = src_r0[0];
1793 break;
1794 case 7:
1795 dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 1);
1796 dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
1797 dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
1798 dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1799 dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1800 dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1801 dst0[1] = src_l0[1] | src_r0[0] << 24;
1802 break;
1803 case 6:
1804 dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 2);
1805 dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
1806 dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
1807 dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1808 dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1809 dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1810 dst0[1] = src_l0[1] | src_r0[0] << 16;
1811 break;
1812 case 5:
1813 dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 3);
1814 dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
1815 dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
1816 dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1817 dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1818 dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1819 dst0[1] = src_l0[1] | src_r0[0] << 8;
1820 break;
1821 case 4:
1822 dst1[3] = src_r1[2];
1823 dst1[2] = src_r1[1];
1824 dst1[1] = src_r1[0];
1825 dst1[0] = src_r0[3];
1826 dst0[3] = src_r0[2];
1827 dst0[2] = src_r0[1];
1828 dst0[1] = src_r0[0];
1829 break;
1830 case 3:
1831 dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 1);
1832 dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 1);
1833 dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
1834 dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
1835 dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
1836 dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
1837 dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
1838 dst0[0] = src_l0[0] | src_r0[0] << 24;
1839 break;
1840 case 2:
1841 dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 2);
1842 dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 2);
1843 dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
1844 dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
1845 dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
1846 dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
1847 dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
1848 dst0[0] = src_l0[0] | src_r0[0] << 16;
1849 break;
1850 case 1:
1851 dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 3);
1852 dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 3);
1853 dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
1854 dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
1855 dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
1856 dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
1857 dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
1858 dst0[0] = src_l0[0] | src_r0[0] << 8;
1859 break;
1860 case 0:
1861 dst1[3] = src_r1[3];
1862 dst1[2] = src_r1[2];
1863 dst1[1] = src_r1[1];
1864 dst1[0] = src_r1[0];
1865 dst0[3] = src_r0[3];
1866 dst0[2] = src_r0[2];
1867 dst0[1] = src_r0[1];
1868 dst0[0] = src_r0[0];
1869 break;
1870 }
1871 // #endif
1872 }
1873
1874 inline void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len)
1875 {
1876 rshift_block_N (in0, in1, out0, out1, 32 - len);
1877
1878 u32 tib40[4];
1879 u32 tib41[4];
1880
1881 tib40[0] = out1[3];
1882 tib40[1] = out1[2];
1883 tib40[2] = out1[1];
1884 tib40[3] = out1[0];
1885 tib41[0] = out0[3];
1886 tib41[1] = out0[2];
1887 tib41[2] = out0[1];
1888 tib41[3] = out0[0];
1889
1890 out0[0] = swap32_S (tib40[0]);
1891 out0[1] = swap32_S (tib40[1]);
1892 out0[2] = swap32_S (tib40[2]);
1893 out0[3] = swap32_S (tib40[3]);
1894 out1[0] = swap32_S (tib41[0]);
1895 out1[1] = swap32_S (tib41[1]);
1896 out1[2] = swap32_S (tib41[2]);
1897 out1[3] = swap32_S (tib41[3]);
1898 }
1899
1900 inline u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1901 {
1902 buf0[0] |= (generate_cmask (buf0[0]));
1903 buf0[1] |= (generate_cmask (buf0[1]));
1904 buf0[2] |= (generate_cmask (buf0[2]));
1905 buf0[3] |= (generate_cmask (buf0[3]));
1906 buf1[0] |= (generate_cmask (buf1[0]));
1907 buf1[1] |= (generate_cmask (buf1[1]));
1908 buf1[2] |= (generate_cmask (buf1[2]));
1909 buf1[3] |= (generate_cmask (buf1[3]));
1910
1911 return in_len;
1912 }
1913
1914 inline u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1915 {
1916 buf0[0] &= ~(generate_cmask (buf0[0]));
1917 buf0[1] &= ~(generate_cmask (buf0[1]));
1918 buf0[2] &= ~(generate_cmask (buf0[2]));
1919 buf0[3] &= ~(generate_cmask (buf0[3]));
1920 buf1[0] &= ~(generate_cmask (buf1[0]));
1921 buf1[1] &= ~(generate_cmask (buf1[1]));
1922 buf1[2] &= ~(generate_cmask (buf1[2]));
1923 buf1[3] &= ~(generate_cmask (buf1[3]));
1924
1925 return in_len;
1926 }
1927
1928 inline u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1929 {
1930 rule_op_mangle_lrest (p0, p1, buf0, buf1, in_len);
1931
1932 buf0[0] &= ~(0x00000020 & generate_cmask (buf0[0]));
1933
1934 return in_len;
1935 }
1936
1937 inline u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1938 {
1939 rule_op_mangle_urest (p0, p1, buf0, buf1, in_len);
1940
1941 buf0[0] |= (0x00000020 & generate_cmask (buf0[0]));
1942
1943 return in_len;
1944 }
1945
1946 inline u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1947 {
1948 buf0[0] ^= (generate_cmask (buf0[0]));
1949 buf0[1] ^= (generate_cmask (buf0[1]));
1950 buf0[2] ^= (generate_cmask (buf0[2]));
1951 buf0[3] ^= (generate_cmask (buf0[3]));
1952 buf1[0] ^= (generate_cmask (buf1[0]));
1953 buf1[1] ^= (generate_cmask (buf1[1]));
1954 buf1[2] ^= (generate_cmask (buf1[2]));
1955 buf1[3] ^= (generate_cmask (buf1[3]));
1956
1957 return in_len;
1958 }
1959
1960 inline u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1961 {
1962 if (p0 >= in_len) return (in_len);
1963
1964 const u32 tmp = 0x20u << ((p0 & 3) * 8);
1965
1966 switch (p0 / 4)
1967 {
1968 case 0: buf0[0] ^= (tmp & generate_cmask (buf0[0])); break;
1969 case 1: buf0[1] ^= (tmp & generate_cmask (buf0[1])); break;
1970 case 2: buf0[2] ^= (tmp & generate_cmask (buf0[2])); break;
1971 case 3: buf0[3] ^= (tmp & generate_cmask (buf0[3])); break;
1972 case 4: buf1[0] ^= (tmp & generate_cmask (buf1[0])); break;
1973 case 5: buf1[1] ^= (tmp & generate_cmask (buf1[1])); break;
1974 case 6: buf1[2] ^= (tmp & generate_cmask (buf1[2])); break;
1975 case 7: buf1[3] ^= (tmp & generate_cmask (buf1[3])); break;
1976 }
1977
1978 return in_len;
1979 }
1980
1981 inline u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1982 {
1983 reverse_block (buf0, buf1, buf0, buf1, in_len);
1984
1985 return in_len;
1986 }
1987
1988 inline u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
1989 {
1990 if ((in_len + in_len) >= 32) return (in_len);
1991
1992 u32 out_len = in_len;
1993
1994 append_block8 (out_len, buf0, buf1, buf0, buf1, buf0, buf1);
1995
1996 out_len += in_len;
1997
1998 return out_len;
1999 }
2000
2001 inline u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2002 {
2003 if (((in_len * p0) + in_len) >= 32) return (in_len);
2004
2005 u32 out_len = in_len;
2006
2007 u32 tib40[4];
2008 u32 tib41[4];
2009
2010 tib40[0] = buf0[0];
2011 tib40[1] = buf0[1];
2012 tib40[2] = buf0[2];
2013 tib40[3] = buf0[3];
2014 tib41[0] = buf1[0];
2015 tib41[1] = buf1[1];
2016 tib41[2] = buf1[2];
2017 tib41[3] = buf1[3];
2018
2019 for (u32 i = 0; i < p0; i++)
2020 {
2021 append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
2022
2023 out_len += in_len;
2024 }
2025
2026 return out_len;
2027 }
2028
2029 inline u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2030 {
2031 if ((in_len + in_len) >= 32) return (in_len);
2032
2033 u32 out_len = in_len;
2034
2035 u32 tib40[4];
2036 u32 tib41[4];
2037
2038 reverse_block (buf0, buf1, tib40, tib41, out_len);
2039
2040 append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
2041
2042 out_len += in_len;
2043
2044 return out_len;
2045 }
2046
2047 inline u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2048 {
2049 if ((in_len + 1) >= 32) return (in_len);
2050
2051 u32 out_len = in_len;
2052
2053 append_block1 (out_len, buf0, buf1, p0);
2054
2055 out_len++;
2056
2057 return out_len;
2058 }
2059
2060 inline u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2061 {
2062 if ((in_len + 1) >= 32) return (in_len);
2063
2064 u32 out_len = in_len;
2065
2066 rshift_block (buf0, buf1, buf0, buf1);
2067
2068 buf0[0] = buf0[0] | p0;
2069
2070 out_len++;
2071
2072 return out_len;
2073 }
2074
2075 inline u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2076 {
2077 if (in_len == 0) return (in_len);
2078
2079 const u32 in_len1 = in_len - 1;
2080
2081 const u32 sh = (in_len1 & 3) * 8;
2082
2083 const u32 tmp = (buf0[0] & 0xff) << sh;
2084
2085 lshift_block (buf0, buf1, buf0, buf1);
2086
2087 switch (in_len1 / 4)
2088 {
2089 case 0: buf0[0] |= tmp; break;
2090 case 1: buf0[1] |= tmp; break;
2091 case 2: buf0[2] |= tmp; break;
2092 case 3: buf0[3] |= tmp; break;
2093 case 4: buf1[0] |= tmp; break;
2094 case 5: buf1[1] |= tmp; break;
2095 case 6: buf1[2] |= tmp; break;
2096 case 7: buf1[3] |= tmp; break;
2097 }
2098
2099 return in_len;
2100 }
2101
2102 inline u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2103 {
2104 if (in_len == 0) return (in_len);
2105
2106 const u32 in_len1 = in_len - 1;
2107
2108 const u32 sh = (in_len1 & 3) * 8;
2109
2110 u32 tmp = 0;
2111
2112 switch (in_len1 / 4)
2113 {
2114 case 0: tmp = (buf0[0] >> sh) & 0xff; break;
2115 case 1: tmp = (buf0[1] >> sh) & 0xff; break;
2116 case 2: tmp = (buf0[2] >> sh) & 0xff; break;
2117 case 3: tmp = (buf0[3] >> sh) & 0xff; break;
2118 case 4: tmp = (buf1[0] >> sh) & 0xff; break;
2119 case 5: tmp = (buf1[1] >> sh) & 0xff; break;
2120 case 6: tmp = (buf1[2] >> sh) & 0xff; break;
2121 case 7: tmp = (buf1[3] >> sh) & 0xff; break;
2122 }
2123
2124 rshift_block (buf0, buf1, buf0, buf1);
2125
2126 buf0[0] |= tmp;
2127
2128 truncate_right (buf0, buf1, in_len);
2129
2130 return in_len;
2131 }
2132
2133 inline u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2134 {
2135 if (in_len == 0) return (in_len);
2136
2137 const u32 in_len1 = in_len - 1;
2138
2139 lshift_block (buf0, buf1, buf0, buf1);
2140
2141 return in_len1;
2142 }
2143
2144 inline u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2145 {
2146 if (in_len == 0) return (in_len);
2147
2148 const u32 in_len1 = in_len - 1;
2149
2150 const u32 tmp = (1 << ((in_len1 & 3) * 8)) - 1;
2151
2152 switch (in_len1 / 4)
2153 {
2154 case 0: buf0[0] &= tmp; break;
2155 case 1: buf0[1] &= tmp; break;
2156 case 2: buf0[2] &= tmp; break;
2157 case 3: buf0[3] &= tmp; break;
2158 case 4: buf1[0] &= tmp; break;
2159 case 5: buf1[1] &= tmp; break;
2160 case 6: buf1[2] &= tmp; break;
2161 case 7: buf1[3] &= tmp; break;
2162 }
2163
2164 return in_len1;
2165 }
2166
2167 inline u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2168 {
2169 if (p0 >= in_len) return (in_len);
2170
2171 u32 out_len = in_len;
2172
2173 u32 tib40[4];
2174 u32 tib41[4];
2175
2176 lshift_block (buf0, buf1, tib40, tib41);
2177
2178 const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
2179 const u32 mr = ~ml;
2180
2181 switch (p0 / 4)
2182 {
2183 case 0: buf0[0] = (buf0[0] & ml)
2184 | (tib40[0] & mr);
2185 buf0[1] = tib40[1];
2186 buf0[2] = tib40[2];
2187 buf0[3] = tib40[3];
2188 buf1[0] = tib41[0];
2189 buf1[1] = tib41[1];
2190 buf1[2] = tib41[2];
2191 buf1[3] = tib41[3];
2192 break;
2193 case 1: buf0[1] = (buf0[1] & ml)
2194 | (tib40[1] & mr);
2195 buf0[2] = tib40[2];
2196 buf0[3] = tib40[3];
2197 buf1[0] = tib41[0];
2198 buf1[1] = tib41[1];
2199 buf1[2] = tib41[2];
2200 buf1[3] = tib41[3];
2201 break;
2202 case 2: buf0[2] = (buf0[2] & ml)
2203 | (tib40[2] & mr);
2204 buf0[3] = tib40[3];
2205 buf1[0] = tib41[0];
2206 buf1[1] = tib41[1];
2207 buf1[2] = tib41[2];
2208 buf1[3] = tib41[3];
2209 break;
2210 case 3: buf0[3] = (buf0[3] & ml)
2211 | (tib40[3] & mr);
2212 buf1[0] = tib41[0];
2213 buf1[1] = tib41[1];
2214 buf1[2] = tib41[2];
2215 buf1[3] = tib41[3];
2216 break;
2217 case 4: buf1[0] = (buf1[0] & ml)
2218 | (tib41[0] & mr);
2219 buf1[1] = tib41[1];
2220 buf1[2] = tib41[2];
2221 buf1[3] = tib41[3];
2222 break;
2223 case 5: buf1[1] = (buf1[1] & ml)
2224 | (tib41[1] & mr);
2225 buf1[2] = tib41[2];
2226 buf1[3] = tib41[3];
2227 break;
2228 case 6: buf1[2] = (buf1[2] & ml)
2229 | (tib41[2] & mr);
2230 buf1[3] = tib41[3];
2231 break;
2232 case 7: buf1[3] = (buf1[3] & ml)
2233 | (tib41[3] & mr);
2234 break;
2235 }
2236
2237 out_len--;
2238
2239 return out_len;
2240 }
2241
2242 inline u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2243 {
2244 if (p0 >= in_len) return (in_len);
2245
2246 if ((p0 + p1) > in_len) return (in_len);
2247
2248 u32 out_len = p1;
2249
2250 lshift_block_N (buf0, buf1, buf0, buf1, p0);
2251
2252 truncate_right (buf0, buf1, out_len);
2253
2254 return out_len;
2255 }
2256
2257 inline u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2258 {
2259 if (p0 >= in_len) return (in_len);
2260
2261 if ((p0 + p1) > in_len) return (in_len);
2262
2263 u32 out_len = in_len;
2264
2265 u32 tib40[4];
2266 u32 tib41[4];
2267
2268 tib40[0] = 0;
2269 tib40[1] = 0;
2270 tib40[2] = 0;
2271 tib40[3] = 0;
2272 tib41[0] = 0;
2273 tib41[1] = 0;
2274 tib41[2] = 0;
2275 tib41[3] = 0;
2276
2277 lshift_block_N (buf0, buf1, tib40, tib41, p1);
2278
2279 const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
2280 const u32 mr = ~ml;
2281
2282 switch (p0 / 4)
2283 {
2284 case 0: buf0[0] = (buf0[0] & ml)
2285 | (tib40[0] & mr);
2286 buf0[1] = tib40[1];
2287 buf0[2] = tib40[2];
2288 buf0[3] = tib40[3];
2289 buf1[0] = tib41[0];
2290 buf1[1] = tib41[1];
2291 buf1[2] = tib41[2];
2292 buf1[3] = tib41[3];
2293 break;
2294 case 1: buf0[1] = (buf0[1] & ml)
2295 | (tib40[1] & mr);
2296 buf0[2] = tib40[2];
2297 buf0[3] = tib40[3];
2298 buf1[0] = tib41[0];
2299 buf1[1] = tib41[1];
2300 buf1[2] = tib41[2];
2301 buf1[3] = tib41[3];
2302 break;
2303 case 2: buf0[2] = (buf0[2] & ml)
2304 | (tib40[2] & mr);
2305 buf0[3] = tib40[3];
2306 buf1[0] = tib41[0];
2307 buf1[1] = tib41[1];
2308 buf1[2] = tib41[2];
2309 buf1[3] = tib41[3];
2310 break;
2311 case 3: buf0[3] = (buf0[3] & ml)
2312 | (tib40[3] & mr);
2313 buf1[0] = tib41[0];
2314 buf1[1] = tib41[1];
2315 buf1[2] = tib41[2];
2316 buf1[3] = tib41[3];
2317 break;
2318 case 4: buf1[0] = (buf1[0] & ml)
2319 | (tib41[0] & mr);
2320 buf1[1] = tib41[1];
2321 buf1[2] = tib41[2];
2322 buf1[3] = tib41[3];
2323 break;
2324 case 5: buf1[1] = (buf1[1] & ml)
2325 | (tib41[1] & mr);
2326 buf1[2] = tib41[2];
2327 buf1[3] = tib41[3];
2328 break;
2329 case 6: buf1[2] = (buf1[2] & ml)
2330 | (tib41[2] & mr);
2331 buf1[3] = tib41[3];
2332 break;
2333 case 7: buf1[3] = (buf1[3] & ml)
2334 | (tib41[3] & mr);
2335 break;
2336 }
2337
2338 out_len -= p1;
2339
2340 return out_len;
2341 }
2342
2343 inline u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2344 {
2345 if (p0 > in_len) return (in_len);
2346
2347 if ((in_len + 1) >= 32) return (in_len);
2348
2349 u32 out_len = in_len;
2350
2351 u32 tib40[4];
2352 u32 tib41[4];
2353
2354 rshift_block (buf0, buf1, tib40, tib41);
2355
2356 const u32 p1n = p1 << ((p0 & 3) * 8);
2357
2358 const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
2359
2360 const u32 mr = 0xffffff00 << ((p0 & 3) * 8);
2361
2362 switch (p0 / 4)
2363 {
2364 case 0: buf0[0] = (buf0[0] & ml) | p1n | (tib40[0] & mr);
2365 buf0[1] = tib40[1];
2366 buf0[2] = tib40[2];
2367 buf0[3] = tib40[3];
2368 buf1[0] = tib41[0];
2369 buf1[1] = tib41[1];
2370 buf1[2] = tib41[2];
2371 buf1[3] = tib41[3];
2372 break;
2373 case 1: buf0[1] = (buf0[1] & ml) | p1n | (tib40[1] & mr);
2374 buf0[2] = tib40[2];
2375 buf0[3] = tib40[3];
2376 buf1[0] = tib41[0];
2377 buf1[1] = tib41[1];
2378 buf1[2] = tib41[2];
2379 buf1[3] = tib41[3];
2380 break;
2381 case 2: buf0[2] = (buf0[2] & ml) | p1n | (tib40[2] & mr);
2382 buf0[3] = tib40[3];
2383 buf1[0] = tib41[0];
2384 buf1[1] = tib41[1];
2385 buf1[2] = tib41[2];
2386 buf1[3] = tib41[3];
2387 break;
2388 case 3: buf0[3] = (buf0[3] & ml) | p1n | (tib40[3] & mr);
2389 buf1[0] = tib41[0];
2390 buf1[1] = tib41[1];
2391 buf1[2] = tib41[2];
2392 buf1[3] = tib41[3];
2393 break;
2394 case 4: buf1[0] = (buf1[0] & ml) | p1n | (tib41[0] & mr);
2395 buf1[1] = tib41[1];
2396 buf1[2] = tib41[2];
2397 buf1[3] = tib41[3];
2398 break;
2399 case 5: buf1[1] = (buf1[1] & ml) | p1n | (tib41[1] & mr);
2400 buf1[2] = tib41[2];
2401 buf1[3] = tib41[3];
2402 break;
2403 case 6: buf1[2] = (buf1[2] & ml) | p1n | (tib41[2] & mr);
2404 buf1[3] = tib41[3];
2405 break;
2406 case 7: buf1[3] = (buf1[3] & ml) | p1n | (tib41[3] & mr);
2407 break;
2408 }
2409
2410 out_len++;
2411
2412 return out_len;
2413 }
2414
2415 inline u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2416 {
2417 if (p0 >= in_len) return (in_len);
2418
2419 const u32 p1n = p1 << ((p0 & 3) * 8);
2420
2421 const u32 m = ~(0xffu << ((p0 & 3) * 8));
2422
2423 switch (p0 / 4)
2424 {
2425 case 0: buf0[0] = (buf0[0] & m) | p1n; break;
2426 case 1: buf0[1] = (buf0[1] & m) | p1n; break;
2427 case 2: buf0[2] = (buf0[2] & m) | p1n; break;
2428 case 3: buf0[3] = (buf0[3] & m) | p1n; break;
2429 case 4: buf1[0] = (buf1[0] & m) | p1n; break;
2430 case 5: buf1[1] = (buf1[1] & m) | p1n; break;
2431 case 6: buf1[2] = (buf1[2] & m) | p1n; break;
2432 case 7: buf1[3] = (buf1[3] & m) | p1n; break;
2433 }
2434
2435 return in_len;
2436 }
2437
2438 inline u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2439 {
2440 if (p0 >= in_len) return (in_len);
2441
2442 truncate_right (buf0, buf1, p0);
2443
2444 return p0;
2445 }
2446
2447 inline u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2448 {
2449 #ifdef IS_NV
2450 for (u32 i = 0; i < in_len; i++)
2451 {
2452 switch (i)
2453 {
2454 case 0: if ((__byte_perm_S (buf0[0], 0, 0x6540)) == p0) buf0[0] = __byte_perm_S (p1, buf0[0], 0x7650);
2455 break;
2456 case 1: if ((__byte_perm_S (buf0[0], 0, 0x6541)) == p0) buf0[0] = __byte_perm_S (p1, buf0[0], 0x7604);
2457 break;
2458 case 2: if ((__byte_perm_S (buf0[0], 0, 0x6542)) == p0) buf0[0] = __byte_perm_S (p1, buf0[0], 0x7054);
2459 break;
2460 case 3: if ((__byte_perm_S (buf0[0], 0, 0x6543)) == p0) buf0[0] = __byte_perm_S (p1, buf0[0], 0x0654);
2461 break;
2462 case 4: if ((__byte_perm_S (buf0[1], 0, 0x6540)) == p0) buf0[1] = __byte_perm_S (p1, buf0[1], 0x7650);
2463 break;
2464 case 5: if ((__byte_perm_S (buf0[1], 0, 0x6541)) == p0) buf0[1] = __byte_perm_S (p1, buf0[1], 0x7604);
2465 break;
2466 case 6: if ((__byte_perm_S (buf0[1], 0, 0x6542)) == p0) buf0[1] = __byte_perm_S (p1, buf0[1], 0x7054);
2467 break;
2468 case 7: if ((__byte_perm_S (buf0[1], 0, 0x6543)) == p0) buf0[1] = __byte_perm_S (p1, buf0[1], 0x0654);
2469 break;
2470 case 8: if ((__byte_perm_S (buf0[2], 0, 0x6540)) == p0) buf0[2] = __byte_perm_S (p1, buf0[2], 0x7650);
2471 break;
2472 case 9: if ((__byte_perm_S (buf0[2], 0, 0x6541)) == p0) buf0[2] = __byte_perm_S (p1, buf0[2], 0x7604);
2473 break;
2474 case 10: if ((__byte_perm_S (buf0[2], 0, 0x6542)) == p0) buf0[2] = __byte_perm_S (p1, buf0[2], 0x7054);
2475 break;
2476 case 11: if ((__byte_perm_S (buf0[2], 0, 0x6543)) == p0) buf0[2] = __byte_perm_S (p1, buf0[2], 0x0654);
2477 break;
2478 case 12: if ((__byte_perm_S (buf0[3], 0, 0x6540)) == p0) buf0[3] = __byte_perm_S (p1, buf0[3], 0x7650);
2479 break;
2480 case 13: if ((__byte_perm_S (buf0[3], 0, 0x6541)) == p0) buf0[3] = __byte_perm_S (p1, buf0[3], 0x7604);
2481 break;
2482 case 14: if ((__byte_perm_S (buf0[3], 0, 0x6542)) == p0) buf0[3] = __byte_perm_S (p1, buf0[3], 0x7054);
2483 break;
2484 case 15: if ((__byte_perm_S (buf0[3], 0, 0x6543)) == p0) buf0[3] = __byte_perm_S (p1, buf0[3], 0x0654);
2485 break;
2486 case 16: if ((__byte_perm_S (buf1[0], 0, 0x6540)) == p0) buf1[0] = __byte_perm_S (p1, buf1[0], 0x7650);
2487 break;
2488 case 17: if ((__byte_perm_S (buf1[0], 0, 0x6541)) == p0) buf1[0] = __byte_perm_S (p1, buf1[0], 0x7604);
2489 break;
2490 case 18: if ((__byte_perm_S (buf1[0], 0, 0x6542)) == p0) buf1[0] = __byte_perm_S (p1, buf1[0], 0x7054);
2491 break;
2492 case 19: if ((__byte_perm_S (buf1[0], 0, 0x6543)) == p0) buf1[0] = __byte_perm_S (p1, buf1[0], 0x0654);
2493 break;
2494 case 20: if ((__byte_perm_S (buf1[1], 0, 0x6540)) == p0) buf1[1] = __byte_perm_S (p1, buf1[1], 0x7650);
2495 break;
2496 case 21: if ((__byte_perm_S (buf1[1], 0, 0x6541)) == p0) buf1[1] = __byte_perm_S (p1, buf1[1], 0x7604);
2497 break;
2498 case 22: if ((__byte_perm_S (buf1[1], 0, 0x6542)) == p0) buf1[1] = __byte_perm_S (p1, buf1[1], 0x7054);
2499 break;
2500 case 23: if ((__byte_perm_S (buf1[1], 0, 0x6543)) == p0) buf1[1] = __byte_perm_S (p1, buf1[1], 0x0654);
2501 break;
2502 case 24: if ((__byte_perm_S (buf1[2], 0, 0x6540)) == p0) buf1[2] = __byte_perm_S (p1, buf1[2], 0x7650);
2503 break;
2504 case 25: if ((__byte_perm_S (buf1[2], 0, 0x6541)) == p0) buf1[2] = __byte_perm_S (p1, buf1[2], 0x7604);
2505 break;
2506 case 26: if ((__byte_perm_S (buf1[2], 0, 0x6542)) == p0) buf1[2] = __byte_perm_S (p1, buf1[2], 0x7054);
2507 break;
2508 case 27: if ((__byte_perm_S (buf1[2], 0, 0x6543)) == p0) buf1[2] = __byte_perm_S (p1, buf1[2], 0x0654);
2509 break;
2510 case 28: if ((__byte_perm_S (buf1[3], 0, 0x6540)) == p0) buf1[3] = __byte_perm_S (p1, buf1[3], 0x7650);
2511 break;
2512 case 29: if ((__byte_perm_S (buf1[3], 0, 0x6541)) == p0) buf1[3] = __byte_perm_S (p1, buf1[3], 0x7604);
2513 break;
2514 case 30: if ((__byte_perm_S (buf1[3], 0, 0x6542)) == p0) buf1[3] = __byte_perm_S (p1, buf1[3], 0x7054);
2515 break;
2516 case 31: if ((__byte_perm_S (buf1[3], 0, 0x6543)) == p0) buf1[3] = __byte_perm_S (p1, buf1[3], 0x0654);
2517 break;
2518 }
2519 }
2520 #endif
2521
2522 #if defined IS_AMD || defined IS_GENERIC
2523 const uchar4 tmp0 = (uchar4) (p0);
2524 const uchar4 tmp1 = (uchar4) (p1);
2525
2526 uchar4 tmp;
2527
2528 tmp = as_uchar4 (buf0[0]); tmp = select (tmp, tmp1, tmp == tmp0); buf0[0] = as_uint (tmp);
2529 tmp = as_uchar4 (buf0[1]); tmp = select (tmp, tmp1, tmp == tmp0); buf0[1] = as_uint (tmp);
2530 tmp = as_uchar4 (buf0[2]); tmp = select (tmp, tmp1, tmp == tmp0); buf0[2] = as_uint (tmp);
2531 tmp = as_uchar4 (buf0[3]); tmp = select (tmp, tmp1, tmp == tmp0); buf0[3] = as_uint (tmp);
2532 tmp = as_uchar4 (buf1[0]); tmp = select (tmp, tmp1, tmp == tmp0); buf1[0] = as_uint (tmp);
2533 tmp = as_uchar4 (buf1[1]); tmp = select (tmp, tmp1, tmp == tmp0); buf1[1] = as_uint (tmp);
2534 tmp = as_uchar4 (buf1[2]); tmp = select (tmp, tmp1, tmp == tmp0); buf1[2] = as_uint (tmp);
2535 tmp = as_uchar4 (buf1[3]); tmp = select (tmp, tmp1, tmp == tmp0); buf1[3] = as_uint (tmp);
2536 #endif
2537
2538 return in_len;
2539 }
2540
2541 inline u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2542 {
2543 // TODO
2544 return in_len;
2545 }
2546
2547 inline u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2548 {
2549 // TODO
2550 return in_len;
2551 }
2552
2553 inline u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2554 {
2555 if ( in_len == 0) return (in_len);
2556 if ((in_len + p0) >= 32) return (in_len);
2557
2558 u32 out_len = in_len;
2559
2560 const u32 tmp = buf0[0] & 0xFF;
2561
2562 rshift_block_N (buf0, buf1, buf0, buf1, p0);
2563
2564 #ifdef IS_NV
2565 switch (p0)
2566 {
2567 case 1: buf0[0] |= tmp;
2568 break;
2569 case 2: buf0[0] |= __byte_perm_S (tmp, 0, 0x5400);
2570 break;
2571 case 3: buf0[0] |= __byte_perm_S (tmp, 0, 0x4000);
2572 break;
2573 case 4: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2574 break;
2575 case 5: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2576 buf0[1] |= tmp;
2577 break;
2578 case 6: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2579 buf0[1] |= __byte_perm_S (tmp, 0, 0x5400);
2580 break;
2581 case 7: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2582 buf0[1] |= __byte_perm_S (tmp, 0, 0x4000);
2583 break;
2584 case 8: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2585 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2586 break;
2587 case 9: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2588 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2589 buf0[2] |= tmp;
2590 break;
2591 case 10: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2592 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2593 buf0[2] |= __byte_perm_S (tmp, 0, 0x5400);
2594 break;
2595 case 11: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2596 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2597 buf0[2] |= __byte_perm_S (tmp, 0, 0x4000);
2598 break;
2599 case 12: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2600 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2601 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2602 break;
2603 case 13: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2604 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2605 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2606 buf0[3] |= tmp;
2607 break;
2608 case 14: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2609 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2610 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2611 buf0[3] |= __byte_perm_S (tmp, 0, 0x5400);
2612 break;
2613 case 15: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2614 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2615 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2616 buf0[3] |= __byte_perm_S (tmp, 0, 0x4000);
2617 break;
2618 case 16: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2619 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2620 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2621 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2622 break;
2623 case 17: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2624 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2625 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2626 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2627 buf1[0] |= tmp;
2628 break;
2629 case 18: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2630 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2631 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2632 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2633 buf1[0] |= __byte_perm_S (tmp, 0, 0x5400);
2634 break;
2635 case 19: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2636 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2637 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2638 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2639 buf1[0] |= __byte_perm_S (tmp, 0, 0x4000);
2640 break;
2641 case 20: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2642 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2643 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2644 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2645 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2646 break;
2647 case 21: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2648 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2649 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2650 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2651 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2652 buf1[1] |= tmp;
2653 break;
2654 case 22: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2655 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2656 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2657 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2658 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2659 buf1[1] |= __byte_perm_S (tmp, 0, 0x5400);
2660 break;
2661 case 23: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2662 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2663 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2664 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2665 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2666 buf1[1] |= __byte_perm_S (tmp, 0, 0x4000);
2667 break;
2668 case 24: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2669 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2670 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2671 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2672 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2673 buf1[1] |= __byte_perm_S (tmp, 0, 0x0000);
2674 break;
2675 case 25: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2676 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2677 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2678 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2679 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2680 buf1[1] |= __byte_perm_S (tmp, 0, 0x0000);
2681 buf1[2] |= tmp;
2682 break;
2683 case 26: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2684 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2685 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2686 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2687 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2688 buf1[1] |= __byte_perm_S (tmp, 0, 0x0000);
2689 buf1[2] |= __byte_perm_S (tmp, 0, 0x5400);
2690 break;
2691 case 27: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2692 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2693 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2694 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2695 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2696 buf1[1] |= __byte_perm_S (tmp, 0, 0x0000);
2697 buf1[2] |= __byte_perm_S (tmp, 0, 0x4000);
2698 break;
2699 case 28: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2700 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2701 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2702 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2703 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2704 buf1[1] |= __byte_perm_S (tmp, 0, 0x0000);
2705 buf1[2] |= __byte_perm_S (tmp, 0, 0x0000);
2706 break;
2707 case 29: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2708 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2709 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2710 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2711 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2712 buf1[1] |= __byte_perm_S (tmp, 0, 0x0000);
2713 buf1[2] |= __byte_perm_S (tmp, 0, 0x0000);
2714 buf1[3] |= tmp;
2715 break;
2716 case 30: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2717 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2718 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2719 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2720 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2721 buf1[1] |= __byte_perm_S (tmp, 0, 0x0000);
2722 buf1[2] |= __byte_perm_S (tmp, 0, 0x0000);
2723 buf1[3] |= __byte_perm_S (tmp, 0, 0x5400);
2724 break;
2725 case 31: buf0[0] |= __byte_perm_S (tmp, 0, 0x0000);
2726 buf0[1] |= __byte_perm_S (tmp, 0, 0x0000);
2727 buf0[2] |= __byte_perm_S (tmp, 0, 0x0000);
2728 buf0[3] |= __byte_perm_S (tmp, 0, 0x0000);
2729 buf1[0] |= __byte_perm_S (tmp, 0, 0x0000);
2730 buf1[1] |= __byte_perm_S (tmp, 0, 0x0000);
2731 buf1[2] |= __byte_perm_S (tmp, 0, 0x0000);
2732 buf1[3] |= __byte_perm_S (tmp, 0, 0x4000);
2733 break;
2734 }
2735 #endif
2736
2737 #if defined IS_AMD || defined IS_GENERIC
2738 switch (p0)
2739 {
2740 case 1: buf0[0] |= tmp << 0;
2741 break;
2742 case 2: buf0[0] |= tmp << 0 | tmp << 8;
2743 break;
2744 case 3: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16;
2745 break;
2746 case 4: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2747 break;
2748 case 5: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2749 buf0[1] |= tmp << 0;
2750 break;
2751 case 6: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2752 buf0[1] |= tmp << 0 | tmp << 8;
2753 break;
2754 case 7: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2755 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16;
2756 break;
2757 case 8: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2758 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2759 break;
2760 case 9: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2761 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2762 buf0[2] |= tmp << 0;
2763 break;
2764 case 10: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2765 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2766 buf0[2] |= tmp << 0 | tmp << 8;
2767 break;
2768 case 11: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2769 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2770 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16;
2771 break;
2772 case 12: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2773 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2774 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2775 break;
2776 case 13: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2777 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2778 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2779 buf0[3] |= tmp << 0;
2780 break;
2781 case 14: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2782 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2783 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2784 buf0[3] |= tmp << 0 | tmp << 8;
2785 break;
2786 case 15: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2787 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2788 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2789 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16;
2790 break;
2791 case 16: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2792 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2793 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2794 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2795 break;
2796 case 17: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2797 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2798 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2799 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2800 buf1[0] |= tmp << 0;
2801 break;
2802 case 18: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2803 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2804 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2805 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2806 buf1[0] |= tmp << 0 | tmp << 8;
2807 break;
2808 case 19: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2809 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2810 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2811 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2812 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16;
2813 break;
2814 case 20: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2815 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2816 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2817 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2818 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2819 break;
2820 case 21: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2821 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2822 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2823 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2824 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2825 buf1[1] |= tmp << 0;
2826 break;
2827 case 22: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2828 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2829 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2830 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2831 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2832 buf1[1] |= tmp << 0 | tmp << 8;
2833 break;
2834 case 23: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2835 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2836 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2837 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2838 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2839 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16;
2840 break;
2841 case 24: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2842 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2843 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2844 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2845 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2846 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2847 break;
2848 case 25: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2849 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2850 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2851 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2852 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2853 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2854 buf1[2] |= tmp << 0;
2855 break;
2856 case 26: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2857 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2858 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2859 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2860 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2861 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2862 buf1[2] |= tmp << 0 | tmp << 8;
2863 break;
2864 case 27: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2865 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2866 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2867 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2868 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2869 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2870 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16;
2871 break;
2872 case 28: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2873 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2874 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2875 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2876 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2877 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2878 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2879 break;
2880 case 29: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2881 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2882 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2883 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2884 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2885 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2886 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2887 buf1[3] |= tmp << 0;
2888 break;
2889 case 30: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2890 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2891 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2892 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2893 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2894 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2895 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2896 buf1[3] |= tmp << 0 | tmp << 8;
2897 break;
2898 case 31: buf0[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2899 buf0[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2900 buf0[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2901 buf0[3] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2902 buf1[0] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2903 buf1[1] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2904 buf1[2] |= tmp << 0 | tmp << 8 | tmp << 16 | tmp << 24;
2905 buf1[3] |= tmp << 0 | tmp << 8 | tmp << 16;
2906 break;
2907 }
2908 #endif
2909
2910 out_len += p0;
2911
2912 return out_len;
2913 }
2914
2915 inline u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2916 {
2917 if ( in_len == 0) return (in_len);
2918 if ((in_len + p0) >= 32) return (in_len);
2919
2920 const u32 in_len1 = in_len - 1;
2921
2922 const u32 sh = (in_len1 & 3) * 8;
2923
2924 u32 tmp = 0;
2925
2926 switch (in_len1 / 4)
2927 {
2928 case 0: tmp = (buf0[0] >> sh) & 0xff; break;
2929 case 1: tmp = (buf0[1] >> sh) & 0xff; break;
2930 case 2: tmp = (buf0[2] >> sh) & 0xff; break;
2931 case 3: tmp = (buf0[3] >> sh) & 0xff; break;
2932 case 4: tmp = (buf1[0] >> sh) & 0xff; break;
2933 case 5: tmp = (buf1[1] >> sh) & 0xff; break;
2934 case 6: tmp = (buf1[2] >> sh) & 0xff; break;
2935 case 7: tmp = (buf1[3] >> sh) & 0xff; break;
2936 }
2937
2938 u32 out_len = in_len;
2939
2940 for (u32 i = 0; i < p0; i++)
2941 {
2942 append_block1 (out_len, buf0, buf1, tmp);
2943
2944 out_len++;
2945 }
2946
2947 return out_len;
2948 }
2949
2950 inline u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
2951 {
2952 if ( in_len == 0) return (in_len);
2953 if ((in_len + in_len) >= 32) return (in_len);
2954
2955 u32 out_len = in_len;
2956
2957 u32 tib40[4];
2958 u32 tib41[4];
2959
2960 #ifdef IS_NV
2961 tib40[0] = __byte_perm_S (buf0[0], 0, 0x1100);
2962 tib40[1] = __byte_perm_S (buf0[0], 0, 0x3322);
2963 tib40[2] = __byte_perm_S (buf0[1], 0, 0x1100);
2964 tib40[3] = __byte_perm_S (buf0[1], 0, 0x3322);
2965 tib41[0] = __byte_perm_S (buf0[2], 0, 0x1100);
2966 tib41[1] = __byte_perm_S (buf0[2], 0, 0x3322);
2967 tib41[2] = __byte_perm_S (buf0[3], 0, 0x1100);
2968 tib41[3] = __byte_perm_S (buf0[3], 0, 0x3322);
2969
2970 buf0[0] = tib40[0];
2971 buf0[1] = tib40[1];
2972 buf0[2] = tib40[2];
2973 buf0[3] = tib40[3];
2974 buf1[0] = tib41[0];
2975 buf1[1] = tib41[1];
2976 buf1[2] = tib41[2];
2977 buf1[3] = tib41[3];
2978 #endif
2979
2980 #if defined IS_AMD || defined IS_GENERIC
2981 tib40[0] = ((buf0[0] & 0x000000FF) << 0) | ((buf0[0] & 0x0000FF00) << 8);
2982 tib40[1] = ((buf0[0] & 0x00FF0000) >> 16) | ((buf0[0] & 0xFF000000) >> 8);
2983 tib40[2] = ((buf0[1] & 0x000000FF) << 0) | ((buf0[1] & 0x0000FF00) << 8);
2984 tib40[3] = ((buf0[1] & 0x00FF0000) >> 16) | ((buf0[1] & 0xFF000000) >> 8);
2985 tib41[0] = ((buf0[2] & 0x000000FF) << 0) | ((buf0[2] & 0x0000FF00) << 8);
2986 tib41[1] = ((buf0[2] & 0x00FF0000) >> 16) | ((buf0[2] & 0xFF000000) >> 8);
2987 tib41[2] = ((buf0[3] & 0x000000FF) << 0) | ((buf0[3] & 0x0000FF00) << 8);
2988 tib41[3] = ((buf0[3] & 0x00FF0000) >> 16) | ((buf0[3] & 0xFF000000) >> 8);
2989
2990 buf0[0] = tib40[0] | (tib40[0] << 8);
2991 buf0[1] = tib40[1] | (tib40[1] << 8);
2992 buf0[2] = tib40[2] | (tib40[2] << 8);
2993 buf0[3] = tib40[3] | (tib40[3] << 8);
2994 buf1[0] = tib41[0] | (tib41[0] << 8);
2995 buf1[1] = tib41[1] | (tib41[1] << 8);
2996 buf1[2] = tib41[2] | (tib41[2] << 8);
2997 buf1[3] = tib41[3] | (tib41[3] << 8);
2998 #endif
2999
3000 out_len = out_len + out_len;
3001
3002 return out_len;
3003 }
3004
3005 inline u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3006 {
3007 if (in_len < 2) return (in_len);
3008
3009 #ifdef IS_NV
3010 buf0[0] = __byte_perm_S (buf0[0], 0, 0x3201);
3011 #endif
3012
3013 #if defined IS_AMD || defined IS_GENERIC
3014 buf0[0] = (buf0[0] & 0xFFFF0000) | ((buf0[0] << 8) & 0x0000FF00) | ((buf0[0] >> 8) & 0x000000FF);
3015 #endif
3016
3017 return in_len;
3018 }
3019
3020 inline u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3021 {
3022 if (in_len < 2) return (in_len);
3023
3024 #ifdef IS_NV
3025 switch (in_len)
3026 {
3027 case 2: buf0[0] = __byte_perm_S (buf0[0], 0, 0x5401);
3028 break;
3029 case 3: buf0[0] = __byte_perm_S (buf0[0], 0, 0x4120);
3030 break;
3031 case 4: buf0[0] = __byte_perm_S (buf0[0], 0, 0x2310);
3032 break;
3033 case 5: buf0[1] = __byte_perm_S (buf0[1], buf0[0], 0x7210);
3034 buf0[0] = __byte_perm_S (buf0[0], buf0[1], 0x4210);
3035 buf0[1] = __byte_perm_S (buf0[1], 0, 0x6543);
3036 break;
3037 case 6: buf0[1] = __byte_perm_S (buf0[1], 0, 0x5401);
3038 break;
3039 case 7: buf0[1] = __byte_perm_S (buf0[1], 0, 0x4120);
3040 break;
3041 case 8: buf0[1] = __byte_perm_S (buf0[1], 0, 0x2310);
3042 break;
3043 case 9: buf0[2] = __byte_perm_S (buf0[2], buf0[1], 0x7210);
3044 buf0[1] = __byte_perm_S (buf0[1], buf0[2], 0x4210);
3045 buf0[2] = __byte_perm_S (buf0[2], 0, 0x6543);
3046 break;
3047 case 10: buf0[2] = __byte_perm_S (buf0[2], 0, 0x5401);
3048 break;
3049 case 11: buf0[2] = __byte_perm_S (buf0[2], 0, 0x4120);
3050 break;
3051 case 12: buf0[2] = __byte_perm_S (buf0[2], 0, 0x2310);
3052 break;
3053 case 13: buf0[3] = __byte_perm_S (buf0[3], buf0[2], 0x7210);
3054 buf0[2] = __byte_perm_S (buf0[2], buf0[3], 0x4210);
3055 buf0[3] = __byte_perm_S (buf0[3], 0, 0x6543);
3056 break;
3057 case 14: buf0[3] = __byte_perm_S (buf0[3], 0, 0x5401);
3058 break;
3059 case 15: buf0[3] = __byte_perm_S (buf0[3], 0, 0x4120);
3060 break;
3061 case 16: buf0[3] = __byte_perm_S (buf0[3], 0, 0x2310);
3062 break;
3063 case 17: buf1[0] = __byte_perm_S (buf1[0], buf0[3], 0x7210);
3064 buf0[3] = __byte_perm_S (buf0[3], buf1[0], 0x4210);
3065 buf1[0] = __byte_perm_S (buf1[0], 0, 0x6543);
3066 break;
3067 case 18: buf1[0] = __byte_perm_S (buf1[0], 0, 0x5401);
3068 break;
3069 case 19: buf1[0] = __byte_perm_S (buf1[0], 0, 0x4120);
3070 break;
3071 case 20: buf1[0] = __byte_perm_S (buf1[0], 0, 0x2310);
3072 break;
3073 case 21: buf1[1] = __byte_perm_S (buf1[1], buf1[0], 0x7210);
3074 buf1[0] = __byte_perm_S (buf1[0], buf1[1], 0x4210);
3075 buf1[1] = __byte_perm_S (buf1[1], 0, 0x6543);
3076 break;
3077 case 22: buf1[1] = __byte_perm_S (buf1[1], 0, 0x5401);
3078 break;
3079 case 23: buf1[1] = __byte_perm_S (buf1[1], 0, 0x4120);
3080 break;
3081 case 24: buf1[1] = __byte_perm_S (buf1[1], 0, 0x2310);
3082 break;
3083 case 25: buf1[2] = __byte_perm_S (buf1[2], buf1[1], 0x7210);
3084 buf1[1] = __byte_perm_S (buf1[1], buf1[2], 0x4210);
3085 buf1[2] = __byte_perm_S (buf1[2], 0, 0x6543);
3086 break;
3087 case 26: buf1[2] = __byte_perm_S (buf1[2], 0, 0x5401);
3088 break;
3089 case 27: buf1[2] = __byte_perm_S (buf1[2], 0, 0x4120);
3090 break;
3091 case 28: buf1[2] = __byte_perm_S (buf1[2], 0, 0x2310);
3092 break;
3093 case 29: buf1[3] = __byte_perm_S (buf1[3], buf1[2], 0x7210);
3094 buf1[2] = __byte_perm_S (buf1[2], buf1[3], 0x4210);
3095 buf1[3] = __byte_perm_S (buf1[3], 0, 0x6543);
3096 break;
3097 case 30: buf1[3] = __byte_perm_S (buf1[3], 0, 0x5401);
3098 break;
3099 case 31: buf1[3] = __byte_perm_S (buf1[3], 0, 0x4120);
3100 break;
3101 }
3102 #endif
3103
3104 #if defined IS_AMD || defined IS_GENERIC
3105 switch (in_len)
3106 {
3107 case 2: buf0[0] = ((buf0[0] << 8) & 0x0000FF00) | ((buf0[0] >> 8) & 0x000000FF);
3108 break;
3109 case 3: buf0[0] = (buf0[0] & 0x000000FF) | ((buf0[0] << 8) & 0x00FF0000) | ((buf0[0] >> 8) & 0x0000FF00);
3110 break;
3111 case 4: buf0[0] = (buf0[0] & 0x0000FFFF) | ((buf0[0] << 8) & 0xFF000000) | ((buf0[0] >> 8) & 0x00FF0000);
3112 break;
3113 case 5: buf0[1] = (buf0[0] & 0xFF000000) | buf0[1];
3114 buf0[0] = (buf0[0] & 0x00FFFFFF) | (buf0[1] << 24);
3115 buf0[1] = (buf0[1] >> 24);
3116 break;
3117 case 6: buf0[1] = ((buf0[1] << 8) & 0x0000FF00) | ((buf0[1] >> 8) & 0x000000FF);
3118 break;
3119 case 7: buf0[1] = (buf0[1] & 0x000000FF) | ((buf0[1] << 8) & 0x00FF0000) | ((buf0[1] >> 8) & 0x0000FF00);
3120 break;
3121 case 8: buf0[1] = (buf0[1] & 0x0000FFFF) | ((buf0[1] << 8) & 0xFF000000) | ((buf0[1] >> 8) & 0x00FF0000);
3122 break;
3123 case 9: buf0[2] = (buf0[1] & 0xFF000000) | buf0[2];
3124 buf0[1] = (buf0[1] & 0x00FFFFFF) | (buf0[2] << 24);
3125 buf0[2] = (buf0[2] >> 24);
3126 break;
3127 case 10: buf0[2] = ((buf0[2] << 8) & 0x0000FF00) | ((buf0[2] >> 8) & 0x000000FF);
3128 break;
3129 case 11: buf0[2] = (buf0[2] & 0x000000FF) | ((buf0[2] << 8) & 0x00FF0000) | ((buf0[2] >> 8) & 0x0000FF00);
3130 break;
3131 case 12: buf0[2] = (buf0[2] & 0x0000FFFF) | ((buf0[2] << 8) & 0xFF000000) | ((buf0[2] >> 8) & 0x00FF0000);
3132 break;
3133 case 13: buf0[3] = (buf0[2] & 0xFF000000) | buf0[3];
3134 buf0[2] = (buf0[2] & 0x00FFFFFF) | (buf0[3] << 24);
3135 buf0[3] = (buf0[3] >> 24);
3136 break;
3137 case 14: buf0[3] = ((buf0[3] << 8) & 0x0000FF00) | ((buf0[3] >> 8) & 0x000000FF);
3138 break;
3139 case 15: buf0[3] = (buf0[3] & 0x000000FF) | ((buf0[3] << 8) & 0x00FF0000) | ((buf0[3] >> 8) & 0x0000FF00);
3140 break;
3141 case 16: buf0[3] = (buf0[3] & 0x0000FFFF) | ((buf0[3] << 8) & 0xFF000000) | ((buf0[3] >> 8) & 0x00FF0000);
3142 break;
3143 case 17: buf1[0] = (buf0[3] & 0xFF000000) | buf1[0];
3144 buf0[3] = (buf0[3] & 0x00FFFFFF) | (buf1[0] << 24);
3145 buf1[0] = (buf1[0] >> 24);
3146 break;
3147 case 18: buf1[0] = ((buf1[0] << 8) & 0x0000FF00) | ((buf1[0] >> 8) & 0x000000FF);
3148 break;
3149 case 19: buf1[0] = (buf1[0] & 0x000000FF) | ((buf1[0] << 8) & 0x00FF0000) | ((buf1[0] >> 8) & 0x0000FF00);
3150 break;
3151 case 20: buf1[0] = (buf1[0] & 0x0000FFFF) | ((buf1[0] << 8) & 0xFF000000) | ((buf1[0] >> 8) & 0x00FF0000);
3152 break;
3153 case 21: buf1[1] = (buf1[0] & 0xFF000000) | buf1[1];
3154 buf1[0] = (buf1[0] & 0x00FFFFFF) | (buf1[1] << 24);
3155 buf1[1] = (buf1[1] >> 24);
3156 break;
3157 case 22: buf1[1] = ((buf1[1] << 8) & 0x0000FF00) | ((buf1[1] >> 8) & 0x000000FF);
3158 break;
3159 case 23: buf1[1] = (buf1[1] & 0x000000FF) | ((buf1[1] << 8) & 0x00FF0000) | ((buf1[1] >> 8) & 0x0000FF00);
3160 break;
3161 case 24: buf1[1] = (buf1[1] & 0x0000FFFF) | ((buf1[1] << 8) & 0xFF000000) | ((buf1[1] >> 8) & 0x00FF0000);
3162 break;
3163 case 25: buf1[2] = (buf1[1] & 0xFF000000) | buf1[2];
3164 buf1[1] = (buf1[1] & 0x00FFFFFF) | (buf1[2] << 24);
3165 buf1[2] = (buf1[2] >> 24);
3166 break;
3167 case 26: buf1[2] = ((buf1[2] << 8) & 0x0000FF00) | ((buf1[2] >> 8) & 0x000000FF);
3168 break;
3169 case 27: buf1[2] = (buf1[2] & 0x000000FF) | ((buf1[2] << 8) & 0x00FF0000) | ((buf1[2] >> 8) & 0x0000FF00);
3170 break;
3171 case 28: buf1[2] = (buf1[2] & 0x0000FFFF) | ((buf1[2] << 8) & 0xFF000000) | ((buf1[2] >> 8) & 0x00FF0000);
3172 break;
3173 case 29: buf1[3] = (buf1[2] & 0xFF000000) | buf1[3];
3174 buf1[2] = (buf1[2] & 0x00FFFFFF) | (buf1[3] << 24);
3175 buf1[3] = (buf1[3] >> 24);
3176 break;
3177 case 30: buf1[3] = ((buf1[3] << 8) & 0x0000FF00) | ((buf1[3] >> 8) & 0x000000FF);
3178 break;
3179 case 31: buf1[3] = (buf1[3] & 0x000000FF) | ((buf1[3] << 8) & 0x00FF0000) | ((buf1[3] >> 8) & 0x0000FF00);
3180 break;
3181 }
3182 #endif
3183
3184 return in_len;
3185 }
3186
3187 inline u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3188 {
3189 if (p0 >= in_len) return (in_len);
3190 if (p1 >= in_len) return (in_len);
3191
3192 u32 tmp0 = 0;
3193 u32 tmp1 = 0;
3194
3195 #ifdef IS_NV
3196 switch (p0)
3197 {
3198 case 0: tmp0 = __byte_perm_S (buf0[0], 0, 0x6540);
3199 break;
3200 case 1: tmp0 = __byte_perm_S (buf0[0], 0, 0x6541);
3201 break;
3202 case 2: tmp0 = __byte_perm_S (buf0[0], 0, 0x6542);
3203 break;
3204 case 3: tmp0 = __byte_perm_S (buf0[0], 0, 0x6543);
3205 break;
3206 case 4: tmp0 = __byte_perm_S (buf0[1], 0, 0x6540);
3207 break;
3208 case 5: tmp0 = __byte_perm_S (buf0[1], 0, 0x6541);
3209 break;
3210 case 6: tmp0 = __byte_perm_S (buf0[1], 0, 0x6542);
3211 break;
3212 case 7: tmp0 = __byte_perm_S (buf0[1], 0, 0x6543);
3213 break;
3214 case 8: tmp0 = __byte_perm_S (buf0[2], 0, 0x6540);
3215 break;
3216 case 9: tmp0 = __byte_perm_S (buf0[2], 0, 0x6541);
3217 break;
3218 case 10: tmp0 = __byte_perm_S (buf0[2], 0, 0x6542);
3219 break;
3220 case 11: tmp0 = __byte_perm_S (buf0[2], 0, 0x6543);
3221 break;
3222 case 12: tmp0 = __byte_perm_S (buf0[3], 0, 0x6540);
3223 break;
3224 case 13: tmp0 = __byte_perm_S (buf0[3], 0, 0x6541);
3225 break;
3226 case 14: tmp0 = __byte_perm_S (buf0[3], 0, 0x6542);
3227 break;
3228 case 15: tmp0 = __byte_perm_S (buf0[3], 0, 0x6543);
3229 break;
3230 case 16: tmp0 = __byte_perm_S (buf1[0], 0, 0x6540);
3231 break;
3232 case 17: tmp0 = __byte_perm_S (buf1[0], 0, 0x6541);
3233 break;
3234 case 18: tmp0 = __byte_perm_S (buf1[0], 0, 0x6542);
3235 break;
3236 case 19: tmp0 = __byte_perm_S (buf1[0], 0, 0x6543);
3237 break;
3238 case 20: tmp0 = __byte_perm_S (buf1[1], 0, 0x6540);
3239 break;
3240 case 21: tmp0 = __byte_perm_S (buf1[1], 0, 0x6541);
3241 break;
3242 case 22: tmp0 = __byte_perm_S (buf1[1], 0, 0x6542);
3243 break;
3244 case 23: tmp0 = __byte_perm_S (buf1[1], 0, 0x6543);
3245 break;
3246 case 24: tmp0 = __byte_perm_S (buf1[2], 0, 0x6540);
3247 break;
3248 case 25: tmp0 = __byte_perm_S (buf1[2], 0, 0x6541);
3249 break;
3250 case 26: tmp0 = __byte_perm_S (buf1[2], 0, 0x6542);
3251 break;
3252 case 27: tmp0 = __byte_perm_S (buf1[2], 0, 0x6543);
3253 break;
3254 case 28: tmp0 = __byte_perm_S (buf1[3], 0, 0x6540);
3255 break;
3256 case 29: tmp0 = __byte_perm_S (buf1[3], 0, 0x6541);
3257 break;
3258 case 30: tmp0 = __byte_perm_S (buf1[3], 0, 0x6542);
3259 break;
3260 case 31: tmp0 = __byte_perm_S (buf1[3], 0, 0x6543);
3261 break;
3262 }
3263
3264 switch (p1)
3265 {
3266 case 0: tmp1 = __byte_perm_S (buf0[0], 0, 0x6540);
3267 buf0[0] = __byte_perm_S (tmp0, buf0[0], 0x7650);
3268 break;
3269 case 1: tmp1 = __byte_perm_S (buf0[0], 0, 0x6541);
3270 buf0[0] = __byte_perm_S (tmp0, buf0[0], 0x7604);
3271 break;
3272 case 2: tmp1 = __byte_perm_S (buf0[0], 0, 0x6542);
3273 buf0[0] = __byte_perm_S (tmp0, buf0[0], 0x7054);
3274 break;
3275 case 3: tmp1 = __byte_perm_S (buf0[0], 0, 0x6543);
3276 buf0[0] = __byte_perm_S (tmp0, buf0[0], 0x0654);
3277 break;
3278 case 4: tmp1 = __byte_perm_S (buf0[1], 0, 0x6540);
3279 buf0[1] = __byte_perm_S (tmp0, buf0[1], 0x7650);
3280 break;
3281 case 5: tmp1 = __byte_perm_S (buf0[1], 0, 0x6541);
3282 buf0[1] = __byte_perm_S (tmp0, buf0[1], 0x7604);
3283 break;
3284 case 6: tmp1 = __byte_perm_S (buf0[1], 0, 0x6542);
3285 buf0[1] = __byte_perm_S (tmp0, buf0[1], 0x7054);
3286 break;
3287 case 7: tmp1 = __byte_perm_S (buf0[1], 0, 0x6543);
3288 buf0[1] = __byte_perm_S (tmp0, buf0[1], 0x0654);
3289 break;
3290 case 8: tmp1 = __byte_perm_S (buf0[2], 0, 0x6540);
3291 buf0[2] = __byte_perm_S (tmp0, buf0[2], 0x7650);
3292 break;
3293 case 9: tmp1 = __byte_perm_S (buf0[2], 0, 0x6541);
3294 buf0[2] = __byte_perm_S (tmp0, buf0[2], 0x7604);
3295 break;
3296 case 10: tmp1 = __byte_perm_S (buf0[2], 0, 0x6542);
3297 buf0[2] = __byte_perm_S (tmp0, buf0[2], 0x7054);
3298 break;
3299 case 11: tmp1 = __byte_perm_S (buf0[2], 0, 0x6543);
3300 buf0[2] = __byte_perm_S (tmp0, buf0[2], 0x0654);
3301 break;
3302 case 12: tmp1 = __byte_perm_S (buf0[3], 0, 0x6540);
3303 buf0[3] = __byte_perm_S (tmp0, buf0[3], 0x7650);
3304 break;
3305 case 13: tmp1 = __byte_perm_S (buf0[3], 0, 0x6541);
3306 buf0[3] = __byte_perm_S (tmp0, buf0[3], 0x7604);
3307 break;
3308 case 14: tmp1 = __byte_perm_S (buf0[3], 0, 0x6542);
3309 buf0[3] = __byte_perm_S (tmp0, buf0[3], 0x7054);
3310 break;
3311 case 15: tmp1 = __byte_perm_S (buf0[3], 0, 0x6543);
3312 buf0[3] = __byte_perm_S (tmp0, buf0[3], 0x0654);
3313 break;
3314 case 16: tmp1 = __byte_perm_S (buf1[0], 0, 0x6540);
3315 buf1[0] = __byte_perm_S (tmp0, buf1[0], 0x7650);
3316 break;
3317 case 17: tmp1 = __byte_perm_S (buf1[0], 0, 0x6541);
3318 buf1[0] = __byte_perm_S (tmp0, buf1[0], 0x7604);
3319 break;
3320 case 18: tmp1 = __byte_perm_S (buf1[0], 0, 0x6542);
3321 buf1[0] = __byte_perm_S (tmp0, buf1[0], 0x7054);
3322 break;
3323 case 19: tmp1 = __byte_perm_S (buf1[0], 0, 0x6543);
3324 buf1[0] = __byte_perm_S (tmp0, buf1[0], 0x0654);
3325 break;
3326 case 20: tmp1 = __byte_perm_S (buf1[1], 0, 0x6540);
3327 buf1[1] = __byte_perm_S (tmp0, buf1[1], 0x7650);
3328 break;
3329 case 21: tmp1 = __byte_perm_S (buf1[1], 0, 0x6541);
3330 buf1[1] = __byte_perm_S (tmp0, buf1[1], 0x7604);
3331 break;
3332 case 22: tmp1 = __byte_perm_S (buf1[1], 0, 0x6542);
3333 buf1[1] = __byte_perm_S (tmp0, buf1[1], 0x7054);
3334 break;
3335 case 23: tmp1 = __byte_perm_S (buf1[1], 0, 0x6543);
3336 buf1[1] = __byte_perm_S (tmp0, buf1[1], 0x0654);
3337 break;
3338 case 24: tmp1 = __byte_perm_S (buf1[2], 0, 0x6540);
3339 buf1[2] = __byte_perm_S (tmp0, buf1[2], 0x7650);
3340 break;
3341 case 25: tmp1 = __byte_perm_S (buf1[2], 0, 0x6541);
3342 buf1[2] = __byte_perm_S (tmp0, buf1[2], 0x7604);
3343 break;
3344 case 26: tmp1 = __byte_perm_S (buf1[2], 0, 0x6542);
3345 buf1[2] = __byte_perm_S (tmp0, buf1[2], 0x7054);
3346 break;
3347 case 27: tmp1 = __byte_perm_S (buf1[2], 0, 0x6543);
3348 buf1[2] = __byte_perm_S (tmp0, buf1[2], 0x0654);
3349 break;
3350 case 28: tmp1 = __byte_perm_S (buf1[3], 0, 0x6540);
3351 buf1[3] = __byte_perm_S (tmp0, buf1[3], 0x7650);
3352 break;
3353 case 29: tmp1 = __byte_perm_S (buf1[3], 0, 0x6541);
3354 buf1[3] = __byte_perm_S (tmp0, buf1[3], 0x7604);
3355 break;
3356 case 30: tmp1 = __byte_perm_S (buf1[3], 0, 0x6542);
3357 buf1[3] = __byte_perm_S (tmp0, buf1[3], 0x7054);
3358 break;
3359 case 31: tmp1 = __byte_perm_S (buf1[3], 0, 0x6543);
3360 buf1[3] = __byte_perm_S (tmp0, buf1[3], 0x0654);
3361 break;
3362 }
3363
3364 switch (p0)
3365 {
3366 case 0: buf0[0] = __byte_perm_S (tmp1, buf0[0], 0x7650);
3367 break;
3368 case 1: buf0[0] = __byte_perm_S (tmp1, buf0[0], 0x7604);
3369 break;
3370 case 2: buf0[0] = __byte_perm_S (tmp1, buf0[0], 0x7054);
3371 break;
3372 case 3: buf0[0] = __byte_perm_S (tmp1, buf0[0], 0x0654);
3373 break;
3374 case 4: buf0[1] = __byte_perm_S (tmp1, buf0[1], 0x7650);
3375 break;
3376 case 5: buf0[1] = __byte_perm_S (tmp1, buf0[1], 0x7604);
3377 break;
3378 case 6: buf0[1] = __byte_perm_S (tmp1, buf0[1], 0x7054);
3379 break;
3380 case 7: buf0[1] = __byte_perm_S (tmp1, buf0[1], 0x0654);
3381 break;
3382 case 8: buf0[2] = __byte_perm_S (tmp1, buf0[2], 0x7650);
3383 break;
3384 case 9: buf0[2] = __byte_perm_S (tmp1, buf0[2], 0x7604);
3385 break;
3386 case 10: buf0[2] = __byte_perm_S (tmp1, buf0[2], 0x7054);
3387 break;
3388 case 11: buf0[2] = __byte_perm_S (tmp1, buf0[2], 0x0654);
3389 break;
3390 case 12: buf0[3] = __byte_perm_S (tmp1, buf0[3], 0x7650);
3391 break;
3392 case 13: buf0[3] = __byte_perm_S (tmp1, buf0[3], 0x7604);
3393 break;
3394 case 14: buf0[3] = __byte_perm_S (tmp1, buf0[3], 0x7054);
3395 break;
3396 case 15: buf0[3] = __byte_perm_S (tmp1, buf0[3], 0x0654);
3397 break;
3398 case 16: buf1[0] = __byte_perm_S (tmp1, buf1[0], 0x7650);
3399 break;
3400 case 17: buf1[0] = __byte_perm_S (tmp1, buf1[0], 0x7604);
3401 break;
3402 case 18: buf1[0] = __byte_perm_S (tmp1, buf1[0], 0x7054);
3403 break;
3404 case 19: buf1[0] = __byte_perm_S (tmp1, buf1[0], 0x0654);
3405 break;
3406 case 20: buf1[1] = __byte_perm_S (tmp1, buf1[1], 0x7650);
3407 break;
3408 case 21: buf1[1] = __byte_perm_S (tmp1, buf1[1], 0x7604);
3409 break;
3410 case 22: buf1[1] = __byte_perm_S (tmp1, buf1[1], 0x7054);
3411 break;
3412 case 23: buf1[1] = __byte_perm_S (tmp1, buf1[1], 0x0654);
3413 break;
3414 case 24: buf1[2] = __byte_perm_S (tmp1, buf1[2], 0x7650);
3415 break;
3416 case 25: buf1[2] = __byte_perm_S (tmp1, buf1[2], 0x7604);
3417 break;
3418 case 26: buf1[2] = __byte_perm_S (tmp1, buf1[2], 0x7054);
3419 break;
3420 case 27: buf1[2] = __byte_perm_S (tmp1, buf1[2], 0x0654);
3421 break;
3422 case 28: buf1[3] = __byte_perm_S (tmp1, buf1[3], 0x7650);
3423 break;
3424 case 29: buf1[3] = __byte_perm_S (tmp1, buf1[3], 0x7604);
3425 break;
3426 case 30: buf1[3] = __byte_perm_S (tmp1, buf1[3], 0x7054);
3427 break;
3428 case 31: buf1[3] = __byte_perm_S (tmp1, buf1[3], 0x0654);
3429 break;
3430 }
3431 #endif
3432
3433 #if defined IS_AMD || defined IS_GENERIC
3434 switch (p0)
3435 {
3436 case 0: tmp0 = (buf0[0] >> 0) & 0xFF;
3437 break;
3438 case 1: tmp0 = (buf0[0] >> 8) & 0xFF;
3439 break;
3440 case 2: tmp0 = (buf0[0] >> 16) & 0xFF;
3441 break;
3442 case 3: tmp0 = (buf0[0] >> 24) & 0xFF;
3443 break;
3444 case 4: tmp0 = (buf0[1] >> 0) & 0xFF;
3445 break;
3446 case 5: tmp0 = (buf0[1] >> 8) & 0xFF;
3447 break;
3448 case 6: tmp0 = (buf0[1] >> 16) & 0xFF;
3449 break;
3450 case 7: tmp0 = (buf0[1] >> 24) & 0xFF;
3451 break;
3452 case 8: tmp0 = (buf0[2] >> 0) & 0xFF;
3453 break;
3454 case 9: tmp0 = (buf0[2] >> 8) & 0xFF;
3455 break;
3456 case 10: tmp0 = (buf0[2] >> 16) & 0xFF;
3457 break;
3458 case 11: tmp0 = (buf0[2] >> 24) & 0xFF;
3459 break;
3460 case 12: tmp0 = (buf0[3] >> 0) & 0xFF;
3461 break;
3462 case 13: tmp0 = (buf0[3] >> 8) & 0xFF;
3463 break;
3464 case 14: tmp0 = (buf0[3] >> 16) & 0xFF;
3465 break;
3466 case 15: tmp0 = (buf0[3] >> 24) & 0xFF;
3467 break;
3468 case 16: tmp0 = (buf1[0] >> 0) & 0xFF;
3469 break;
3470 case 17: tmp0 = (buf1[0] >> 8) & 0xFF;
3471 break;
3472 case 18: tmp0 = (buf1[0] >> 16) & 0xFF;
3473 break;
3474 case 19: tmp0 = (buf1[0] >> 24) & 0xFF;
3475 break;
3476 case 20: tmp0 = (buf1[1] >> 0) & 0xFF;
3477 break;
3478 case 21: tmp0 = (buf1[1] >> 8) & 0xFF;
3479 break;
3480 case 22: tmp0 = (buf1[1] >> 16) & 0xFF;
3481 break;
3482 case 23: tmp0 = (buf1[1] >> 24) & 0xFF;
3483 break;
3484 case 24: tmp0 = (buf1[2] >> 0) & 0xFF;
3485 break;
3486 case 25: tmp0 = (buf1[2] >> 8) & 0xFF;
3487 break;
3488 case 26: tmp0 = (buf1[2] >> 16) & 0xFF;
3489 break;
3490 case 27: tmp0 = (buf1[2] >> 24) & 0xFF;
3491 break;
3492 case 28: tmp0 = (buf1[3] >> 0) & 0xFF;
3493 break;
3494 case 29: tmp0 = (buf1[3] >> 8) & 0xFF;
3495 break;
3496 case 30: tmp0 = (buf1[3] >> 16) & 0xFF;
3497 break;
3498 case 31: tmp0 = (buf1[3] >> 24) & 0xFF;
3499 break;
3500 }
3501
3502 switch (p1)
3503 {
3504 case 0: tmp1 = (buf0[0] >> 0) & 0xff;
3505 buf0[0] = (buf0[0] & 0xffffff00) | tmp0 << 0;
3506 break;
3507 case 1: tmp1 = (buf0[0] >> 8) & 0xff;
3508 buf0[0] = (buf0[0] & 0xffff00ff) | tmp0 << 8;
3509 break;
3510 case 2: tmp1 = (buf0[0] >> 16) & 0xff;
3511 buf0[0] = (buf0[0] & 0xff00ffff) | tmp0 << 16;
3512 break;
3513 case 3: tmp1 = (buf0[0] >> 24) & 0xff;
3514 buf0[0] = (buf0[0] & 0x00ffffff) | tmp0 << 24;
3515 break;
3516 case 4: tmp1 = (buf0[1] >> 0) & 0xff;
3517 buf0[1] = (buf0[1] & 0xffffff00) | tmp0 << 0;
3518 break;
3519 case 5: tmp1 = (buf0[1] >> 8) & 0xff;
3520 buf0[1] = (buf0[1] & 0xffff00ff) | tmp0 << 8;
3521 break;
3522 case 6: tmp1 = (buf0[1] >> 16) & 0xff;
3523 buf0[1] = (buf0[1] & 0xff00ffff) | tmp0 << 16;
3524 break;
3525 case 7: tmp1 = (buf0[1] >> 24) & 0xff;
3526 buf0[1] = (buf0[1] & 0x00ffffff) | tmp0 << 24;
3527 break;
3528 case 8: tmp1 = (buf0[2] >> 0) & 0xff;
3529 buf0[2] = (buf0[2] & 0xffffff00) | tmp0 << 0;
3530 break;
3531 case 9: tmp1 = (buf0[2] >> 8) & 0xff;
3532 buf0[2] = (buf0[2] & 0xffff00ff) | tmp0 << 8;
3533 break;
3534 case 10: tmp1 = (buf0[2] >> 16) & 0xff;
3535 buf0[2] = (buf0[2] & 0xff00ffff) | tmp0 << 16;
3536 break;
3537 case 11: tmp1 = (buf0[2] >> 24) & 0xff;
3538 buf0[2] = (buf0[2] & 0x00ffffff) | tmp0 << 24;
3539 break;
3540 case 12: tmp1 = (buf0[3] >> 0) & 0xff;
3541 buf0[3] = (buf0[3] & 0xffffff00) | tmp0 << 0;
3542 break;
3543 case 13: tmp1 = (buf0[3] >> 8) & 0xff;
3544 buf0[3] = (buf0[3] & 0xffff00ff) | tmp0 << 8;
3545 break;
3546 case 14: tmp1 = (buf0[3] >> 16) & 0xff;
3547 buf0[3] = (buf0[3] & 0xff00ffff) | tmp0 << 16;
3548 break;
3549 case 15: tmp1 = (buf0[3] >> 24) & 0xff;
3550 buf0[3] = (buf0[3] & 0x00ffffff) | tmp0 << 24;
3551 break;
3552 case 16: tmp1 = (buf1[0] >> 0) & 0xff;
3553 buf1[0] = (buf1[0] & 0xffffff00) | tmp0 << 0;
3554 break;
3555 case 17: tmp1 = (buf1[0] >> 8) & 0xff;
3556 buf1[0] = (buf1[0] & 0xffff00ff) | tmp0 << 8;
3557 break;
3558 case 18: tmp1 = (buf1[0] >> 16) & 0xff;
3559 buf1[0] = (buf1[0] & 0xff00ffff) | tmp0 << 16;
3560 break;
3561 case 19: tmp1 = (buf1[0] >> 24) & 0xff;
3562 buf1[0] = (buf1[0] & 0x00ffffff) | tmp0 << 24;
3563 break;
3564 case 20: tmp1 = (buf1[1] >> 0) & 0xff;
3565 buf1[1] = (buf1[1] & 0xffffff00) | tmp0 << 0;
3566 break;
3567 case 21: tmp1 = (buf1[1] >> 8) & 0xff;
3568 buf1[1] = (buf1[1] & 0xffff00ff) | tmp0 << 8;
3569 break;
3570 case 22: tmp1 = (buf1[1] >> 16) & 0xff;
3571 buf1[1] = (buf1[1] & 0xff00ffff) | tmp0 << 16;
3572 break;
3573 case 23: tmp1 = (buf1[1] >> 24) & 0xff;
3574 buf1[1] = (buf1[1] & 0x00ffffff) | tmp0 << 24;
3575 break;
3576 case 24: tmp1 = (buf1[2] >> 0) & 0xff;
3577 buf1[2] = (buf1[2] & 0xffffff00) | tmp0 << 0;
3578 break;
3579 case 25: tmp1 = (buf1[2] >> 8) & 0xff;
3580 buf1[2] = (buf1[2] & 0xffff00ff) | tmp0 << 8;
3581 break;
3582 case 26: tmp1 = (buf1[2] >> 16) & 0xff;
3583 buf1[2] = (buf1[2] & 0xff00ffff) | tmp0 << 16;
3584 break;
3585 case 27: tmp1 = (buf1[2] >> 24) & 0xff;
3586 buf1[2] = (buf1[2] & 0x00ffffff) | tmp0 << 24;
3587 break;
3588 case 28: tmp1 = (buf1[3] >> 0) & 0xff;
3589 buf1[3] = (buf1[3] & 0xffffff00) | tmp0 << 0;
3590 break;
3591 case 29: tmp1 = (buf1[3] >> 8) & 0xff;
3592 buf1[3] = (buf1[3] & 0xffff00ff) | tmp0 << 8;
3593 break;
3594 case 30: tmp1 = (buf1[3] >> 16) & 0xff;
3595 buf1[3] = (buf1[3] & 0xff00ffff) | tmp0 << 16;
3596 break;
3597 case 31: tmp1 = (buf1[3] >> 24) & 0xff;
3598 buf1[3] = (buf1[3] & 0x00ffffff) | tmp0 << 24;
3599 break;
3600 }
3601
3602 switch (p0)
3603 {
3604 case 0: buf0[0] = (buf0[0] & 0xffffff00) | tmp1 << 0;
3605 break;
3606 case 1: buf0[0] = (buf0[0] & 0xffff00ff) | tmp1 << 8;
3607 break;
3608 case 2: buf0[0] = (buf0[0] & 0xff00ffff) | tmp1 << 16;
3609 break;
3610 case 3: buf0[0] = (buf0[0] & 0x00ffffff) | tmp1 << 24;
3611 break;
3612 case 4: buf0[1] = (buf0[1] & 0xffffff00) | tmp1 << 0;
3613 break;
3614 case 5: buf0[1] = (buf0[1] & 0xffff00ff) | tmp1 << 8;
3615 break;
3616 case 6: buf0[1] = (buf0[1] & 0xff00ffff) | tmp1 << 16;
3617 break;
3618 case 7: buf0[1] = (buf0[1] & 0x00ffffff) | tmp1 << 24;
3619 break;
3620 case 8: buf0[2] = (buf0[2] & 0xffffff00) | tmp1 << 0;
3621 break;
3622 case 9: buf0[2] = (buf0[2] & 0xffff00ff) | tmp1 << 8;
3623 break;
3624 case 10: buf0[2] = (buf0[2] & 0xff00ffff) | tmp1 << 16;
3625 break;
3626 case 11: buf0[2] = (buf0[2] & 0x00ffffff) | tmp1 << 24;
3627 break;
3628 case 12: buf0[3] = (buf0[3] & 0xffffff00) | tmp1 << 0;
3629 break;
3630 case 13: buf0[3] = (buf0[3] & 0xffff00ff) | tmp1 << 8;
3631 break;
3632 case 14: buf0[3] = (buf0[3] & 0xff00ffff) | tmp1 << 16;
3633 break;
3634 case 15: buf0[3] = (buf0[3] & 0x00ffffff) | tmp1 << 24;
3635 break;
3636 case 16: buf1[0] = (buf1[0] & 0xffffff00) | tmp1 << 0;
3637 break;
3638 case 17: buf1[0] = (buf1[0] & 0xffff00ff) | tmp1 << 8;
3639 break;
3640 case 18: buf1[0] = (buf1[0] & 0xff00ffff) | tmp1 << 16;
3641 break;
3642 case 19: buf1[0] = (buf1[0] & 0x00ffffff) | tmp1 << 24;
3643 break;
3644 case 20: buf1[1] = (buf1[1] & 0xffffff00) | tmp1 << 0;
3645 break;
3646 case 21: buf1[1] = (buf1[1] & 0xffff00ff) | tmp1 << 8;
3647 break;
3648 case 22: buf1[1] = (buf1[1] & 0xff00ffff) | tmp1 << 16;
3649 break;
3650 case 23: buf1[1] = (buf1[1] & 0x00ffffff) | tmp1 << 24;
3651 break;
3652 case 24: buf1[2] = (buf1[2] & 0xffffff00) | tmp1 << 0;
3653 break;
3654 case 25: buf1[2] = (buf1[2] & 0xffff00ff) | tmp1 << 8;
3655 break;
3656 case 26: buf1[2] = (buf1[2] & 0xff00ffff) | tmp1 << 16;
3657 break;
3658 case 27: buf1[2] = (buf1[2] & 0x00ffffff) | tmp1 << 24;
3659 break;
3660 case 28: buf1[3] = (buf1[3] & 0xffffff00) | tmp1 << 0;
3661 break;
3662 case 29: buf1[3] = (buf1[3] & 0xffff00ff) | tmp1 << 8;
3663 break;
3664 case 30: buf1[3] = (buf1[3] & 0xff00ffff) | tmp1 << 16;
3665 break;
3666 case 31: buf1[3] = (buf1[3] & 0x00ffffff) | tmp1 << 24;
3667 break;
3668 }
3669 #endif
3670
3671 return in_len;
3672 }
3673
3674 inline u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3675 {
3676 if (p0 >= in_len) return (in_len);
3677
3678 const u32 mr = 0xffu << ((p0 & 3) * 8);
3679 const u32 ml = ~mr;
3680
3681 switch (p0 / 4)
3682 {
3683 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) << 1) & mr); break;
3684 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) << 1) & mr); break;
3685 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) << 1) & mr); break;
3686 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) << 1) & mr); break;
3687 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) << 1) & mr); break;
3688 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) << 1) & mr); break;
3689 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) << 1) & mr); break;
3690 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) << 1) & mr); break;
3691 }
3692
3693 return in_len;
3694 }
3695
3696 inline u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3697 {
3698 if (p0 >= in_len) return (in_len);
3699
3700 const u32 mr = 0xffu << ((p0 & 3) * 8);
3701 const u32 ml = ~mr;
3702
3703 switch (p0 / 4)
3704 {
3705 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) >> 1) & mr); break;
3706 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) >> 1) & mr); break;
3707 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) >> 1) & mr); break;
3708 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) >> 1) & mr); break;
3709 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) >> 1) & mr); break;
3710 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) >> 1) & mr); break;
3711 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) >> 1) & mr); break;
3712 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) >> 1) & mr); break;
3713 }
3714
3715 return in_len;
3716 }
3717
3718 inline u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3719 {
3720 if (p0 >= in_len) return (in_len);
3721
3722 const u32 mr = 0xffu << ((p0 & 3) * 8);
3723 const u32 ml = ~mr;
3724
3725 const u32 n = 0x01010101 & mr;
3726
3727 switch (p0 / 4)
3728 {
3729 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) + n) & mr); break;
3730 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) + n) & mr); break;
3731 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) + n) & mr); break;
3732 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) + n) & mr); break;
3733 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) + n) & mr); break;
3734 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) + n) & mr); break;
3735 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) + n) & mr); break;
3736 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) + n) & mr); break;
3737 }
3738
3739 return in_len;
3740 }
3741
3742 inline u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3743 {
3744 if (p0 >= in_len) return (in_len);
3745
3746 const u32 mr = 0xffu << ((p0 & 3) * 8);
3747 const u32 ml = ~mr;
3748
3749 const u32 n = 0x01010101 & mr;
3750
3751 switch (p0 / 4)
3752 {
3753 case 0: buf0[0] = (buf0[0] & ml) | (((buf0[0] & mr) - n) & mr); break;
3754 case 1: buf0[1] = (buf0[1] & ml) | (((buf0[1] & mr) - n) & mr); break;
3755 case 2: buf0[2] = (buf0[2] & ml) | (((buf0[2] & mr) - n) & mr); break;
3756 case 3: buf0[3] = (buf0[3] & ml) | (((buf0[3] & mr) - n) & mr); break;
3757 case 4: buf1[0] = (buf1[0] & ml) | (((buf1[0] & mr) - n) & mr); break;
3758 case 5: buf1[1] = (buf1[1] & ml) | (((buf1[1] & mr) - n) & mr); break;
3759 case 6: buf1[2] = (buf1[2] & ml) | (((buf1[2] & mr) - n) & mr); break;
3760 case 7: buf1[3] = (buf1[3] & ml) | (((buf1[3] & mr) - n) & mr); break;
3761 }
3762
3763 return in_len;
3764 }
3765
3766 inline u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3767 {
3768 if ((p0 + 1) >= in_len) return (in_len);
3769
3770 u32 tib40[4];
3771 u32 tib41[4];
3772
3773 lshift_block (buf0, buf1, tib40, tib41);
3774
3775 const u32 mr = 0xffu << ((p0 & 3) * 8);
3776 const u32 ml = ~mr;
3777
3778 switch (p0 / 4)
3779 {
3780 case 0: buf0[0] = (buf0[0] & ml) | (tib40[0] & mr); break;
3781 case 1: buf0[1] = (buf0[1] & ml) | (tib40[1] & mr); break;
3782 case 2: buf0[2] = (buf0[2] & ml) | (tib40[2] & mr); break;
3783 case 3: buf0[3] = (buf0[3] & ml) | (tib40[3] & mr); break;
3784 case 4: buf1[0] = (buf1[0] & ml) | (tib41[0] & mr); break;
3785 case 5: buf1[1] = (buf1[1] & ml) | (tib41[1] & mr); break;
3786 case 6: buf1[2] = (buf1[2] & ml) | (tib41[2] & mr); break;
3787 case 7: buf1[3] = (buf1[3] & ml) | (tib41[3] & mr); break;
3788 }
3789
3790 return in_len;
3791 }
3792
3793 inline u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3794 {
3795 if (p0 == 0) return (in_len);
3796
3797 if (p0 >= in_len) return (in_len);
3798
3799 u32 tib40[4];
3800 u32 tib41[4];
3801
3802 rshift_block (buf0, buf1, tib40, tib41);
3803
3804 const u32 mr = 0xffu << ((p0 & 3) * 8);
3805 const u32 ml = ~mr;
3806
3807 switch (p0 / 4)
3808 {
3809 case 0: buf0[0] = (buf0[0] & ml) | (tib40[0] & mr); break;
3810 case 1: buf0[1] = (buf0[1] & ml) | (tib40[1] & mr); break;
3811 case 2: buf0[2] = (buf0[2] & ml) | (tib40[2] & mr); break;
3812 case 3: buf0[3] = (buf0[3] & ml) | (tib40[3] & mr); break;
3813 case 4: buf1[0] = (buf1[0] & ml) | (tib41[0] & mr); break;
3814 case 5: buf1[1] = (buf1[1] & ml) | (tib41[1] & mr); break;
3815 case 6: buf1[2] = (buf1[2] & ml) | (tib41[2] & mr); break;
3816 case 7: buf1[3] = (buf1[3] & ml) | (tib41[3] & mr); break;
3817 }
3818
3819 return in_len;
3820 }
3821
3822 inline u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3823 {
3824 if (p0 > in_len) return (in_len);
3825
3826 if ((in_len + p0) >= 32) return (in_len);
3827
3828 u32 out_len = in_len;
3829
3830 u32 tib40[4];
3831 u32 tib41[4];
3832
3833 tib40[0] = buf0[0];
3834 tib40[1] = buf0[1];
3835 tib40[2] = buf0[2];
3836 tib40[3] = buf0[3];
3837 tib41[0] = buf1[0];
3838 tib41[1] = buf1[1];
3839 tib41[2] = buf1[2];
3840 tib41[3] = buf1[3];
3841
3842 truncate_right (tib40, tib41, p0);
3843
3844 rshift_block_N (buf0, buf1, buf0, buf1, p0);
3845
3846 buf0[0] |= tib40[0];
3847 buf0[1] |= tib40[1];
3848 buf0[2] |= tib40[2];
3849 buf0[3] |= tib40[3];
3850 buf1[0] |= tib41[0];
3851 buf1[1] |= tib41[1];
3852 buf1[2] |= tib41[2];
3853 buf1[3] |= tib41[3];
3854
3855 out_len += p0;
3856
3857 return out_len;
3858 }
3859
3860 inline u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3861 {
3862 if (p0 > in_len) return (in_len);
3863
3864 if ((in_len + p0) >= 32) return (in_len);
3865
3866 u32 out_len = in_len;
3867
3868 u32 tib40[4];
3869 u32 tib41[4];
3870
3871 rshift_block_N (buf0, buf1, tib40, tib41, p0);
3872
3873 truncate_left (tib40, tib41, out_len);
3874
3875 buf0[0] |= tib40[0];
3876 buf0[1] |= tib40[1];
3877 buf0[2] |= tib40[2];
3878 buf0[3] |= tib40[3];
3879 buf1[0] |= tib41[0];
3880 buf1[1] |= tib41[1];
3881 buf1[2] |= tib41[2];
3882 buf1[3] |= tib41[3];
3883
3884 out_len += p0;
3885
3886 return out_len;
3887 }
3888
3889 inline u32 rule_op_mangle_title (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
3890 {
3891 buf0[0] |= (generate_cmask (buf0[0]));
3892 buf0[1] |= (generate_cmask (buf0[1]));
3893 buf0[2] |= (generate_cmask (buf0[2]));
3894 buf0[3] |= (generate_cmask (buf0[3]));
3895 buf1[0] |= (generate_cmask (buf1[0]));
3896 buf1[1] |= (generate_cmask (buf1[1]));
3897 buf1[2] |= (generate_cmask (buf1[2]));
3898 buf1[3] |= (generate_cmask (buf1[3]));
3899
3900 #ifdef IS_NV
3901 buf0[0] &= ~(0x00000020 & generate_cmask (buf0[0]));
3902
3903 for (u32 i = 0; i < in_len; i++)
3904 {
3905 u32 tmp0;
3906 u32 tmp1;
3907
3908 switch (i)
3909 {
3910 case 0: tmp0 = __byte_perm_S (buf0[0], 0, 0x6540);
3911 tmp1 = ~(0x00002000 & generate_cmask (buf0[0])); break;
3912 case 1: tmp0 = __byte_perm_S (buf0[0], 0, 0x6541);
3913 tmp1 = ~(0x00200000 & generate_cmask (buf0[0])); break;
3914 case 2: tmp0 = __byte_perm_S (buf0[0], 0, 0x6542);
3915 tmp1 = ~(0x20000000 & generate_cmask (buf0[0])); break;
3916 case 3: tmp0 = __byte_perm_S (buf0[0], 0, 0x6543);
3917 tmp1 = ~(0x00000020 & generate_cmask (buf0[1])); break;
3918 case 4: tmp0 = __byte_perm_S (buf0[1], 0, 0x6540);
3919 tmp1 = ~(0x00002000 & generate_cmask (buf0[1])); break;
3920 case 5: tmp0 = __byte_perm_S (buf0[1], 0, 0x6541);
3921 tmp1 = ~(0x00200000 & generate_cmask (buf0[1])); break;
3922 case 6: tmp0 = __byte_perm_S (buf0[1], 0, 0x6542);
3923 tmp1 = ~(0x20000000 & generate_cmask (buf0[1])); break;
3924 case 7: tmp0 = __byte_perm_S (buf0[1], 0, 0x6543);
3925 tmp1 = ~(0x00000020 & generate_cmask (buf0[2])); break;
3926 case 8: tmp0 = __byte_perm_S (buf0[2], 0, 0x6540);
3927 tmp1 = ~(0x00002000 & generate_cmask (buf0[2])); break;
3928 case 9: tmp0 = __byte_perm_S (buf0[2], 0, 0x6541);
3929 tmp1 = ~(0x00200000 & generate_cmask (buf0[2])); break;
3930 case 10: tmp0 = __byte_perm_S (buf0[2], 0, 0x6542);
3931 tmp1 = ~(0x20000000 & generate_cmask (buf0[2])); break;
3932 case 11: tmp0 = __byte_perm_S (buf0[2], 0, 0x6543);
3933 tmp1 = ~(0x00000020 & generate_cmask (buf0[3])); break;
3934 case 12: tmp0 = __byte_perm_S (buf0[3], 0, 0x6540);
3935 tmp1 = ~(0x00002000 & generate_cmask (buf0[3])); break;
3936 case 13: tmp0 = __byte_perm_S (buf0[3], 0, 0x6541);
3937 tmp1 = ~(0x00200000 & generate_cmask (buf0[3])); break;
3938 case 14: tmp0 = __byte_perm_S (buf0[3], 0, 0x6542);
3939 tmp1 = ~(0x20000000 & generate_cmask (buf0[3])); break;
3940 case 15: tmp0 = __byte_perm_S (buf0[3], 0, 0x6543);
3941 tmp1 = ~(0x00000020 & generate_cmask (buf1[0])); break;
3942 case 16: tmp0 = __byte_perm_S (buf1[0], 0, 0x6540);
3943 tmp1 = ~(0x00002000 & generate_cmask (buf1[0])); break;
3944 case 17: tmp0 = __byte_perm_S (buf1[0], 0, 0x6541);
3945 tmp1 = ~(0x00200000 & generate_cmask (buf1[0])); break;
3946 case 18: tmp0 = __byte_perm_S (buf1[0], 0, 0x6542);
3947 tmp1 = ~(0x20000000 & generate_cmask (buf1[0])); break;
3948 case 19: tmp0 = __byte_perm_S (buf1[0], 0, 0x6543);
3949 tmp1 = ~(0x00000020 & generate_cmask (buf1[1])); break;
3950 case 20: tmp0 = __byte_perm_S (buf1[1], 0, 0x6540);
3951 tmp1 = ~(0x00002000 & generate_cmask (buf1[1])); break;
3952 case 21: tmp0 = __byte_perm_S (buf1[1], 0, 0x6541);
3953 tmp1 = ~(0x00200000 & generate_cmask (buf1[1])); break;
3954 case 22: tmp0 = __byte_perm_S (buf1[1], 0, 0x6542);
3955 tmp1 = ~(0x20000000 & generate_cmask (buf1[1])); break;
3956 case 23: tmp0 = __byte_perm_S (buf1[1], 0, 0x6543);
3957 tmp1 = ~(0x00000020 & generate_cmask (buf1[2])); break;
3958 case 24: tmp0 = __byte_perm_S (buf1[2], 0, 0x6540);
3959 tmp1 = ~(0x00002000 & generate_cmask (buf1[2])); break;
3960 case 25: tmp0 = __byte_perm_S (buf1[2], 0, 0x6541);
3961 tmp1 = ~(0x00200000 & generate_cmask (buf1[2])); break;
3962 case 26: tmp0 = __byte_perm_S (buf1[2], 0, 0x6542);
3963 tmp1 = ~(0x20000000 & generate_cmask (buf1[2])); break;
3964 case 27: tmp0 = __byte_perm_S (buf1[2], 0, 0x6543);
3965 tmp1 = ~(0x00000020 & generate_cmask (buf1[3])); break;
3966 case 28: tmp0 = __byte_perm_S (buf1[3], 0, 0x6540);
3967 tmp1 = ~(0x00002000 & generate_cmask (buf1[3])); break;
3968 case 29: tmp0 = __byte_perm_S (buf1[3], 0, 0x6541);
3969 tmp1 = ~(0x00200000 & generate_cmask (buf1[3])); break;
3970 case 30: tmp0 = __byte_perm_S (buf1[3], 0, 0x6542);
3971 tmp1 = ~(0x20000000 & generate_cmask (buf1[3])); break;
3972 }
3973
3974 if (i < 3)
3975 {
3976 if (tmp0 == ' ') buf0[0] &= tmp1 ;
3977 }
3978 else if (i < 7)
3979 {
3980 if (tmp0 == ' ') buf0[1] &= tmp1 ;
3981 }
3982 else if (i < 11)
3983 {
3984 if (tmp0 == ' ') buf0[2] &= tmp1 ;
3985 }
3986 else if (i < 15)
3987 {
3988 if (tmp0 == ' ') buf0[3] &= tmp1 ;
3989 }
3990 else if (i < 19)
3991 {
3992 if (tmp0 == ' ') buf1[0] &= tmp1 ;
3993 }
3994 else if (i < 23)
3995 {
3996 if (tmp0 == ' ') buf1[1] &= tmp1 ;
3997 }
3998 else if (i < 27)
3999 {
4000 if (tmp0 == ' ') buf1[2] &= tmp1 ;
4001 }
4002 else if (i < 31)
4003 {
4004 if (tmp0 == ' ') buf1[3] &= tmp1 ;
4005 }
4006 }
4007 #endif
4008
4009 #if defined IS_AMD || defined IS_GENERIC
4010 u32 tib40[4];
4011 u32 tib41[4];
4012
4013 const uchar4 tmp0 = (uchar4) (' ');
4014 const uchar4 tmp1 = (uchar4) (0x00);
4015 const uchar4 tmp2 = (uchar4) (0xff);
4016
4017 uchar4 tmp;
4018
4019 tmp = as_uchar4 (buf0[0]); tmp = select (tmp1, tmp2, tmp == tmp0); tib40[0] = as_uint (tmp);
4020 tmp = as_uchar4 (buf0[1]); tmp = select (tmp1, tmp2, tmp == tmp0); tib40[1] = as_uint (tmp);
4021 tmp = as_uchar4 (buf0[2]); tmp = select (tmp1, tmp2, tmp == tmp0); tib40[2] = as_uint (tmp);
4022 tmp = as_uchar4 (buf0[3]); tmp = select (tmp1, tmp2, tmp == tmp0); tib40[3] = as_uint (tmp);
4023 tmp = as_uchar4 (buf1[0]); tmp = select (tmp1, tmp2, tmp == tmp0); tib41[0] = as_uint (tmp);
4024 tmp = as_uchar4 (buf1[1]); tmp = select (tmp1, tmp2, tmp == tmp0); tib41[1] = as_uint (tmp);
4025 tmp = as_uchar4 (buf1[2]); tmp = select (tmp1, tmp2, tmp == tmp0); tib41[2] = as_uint (tmp);
4026 tmp = as_uchar4 (buf1[3]); tmp = select (tmp1, tmp2, tmp == tmp0); tib41[3] = as_uint (tmp);
4027
4028 rshift_block (tib40, tib41, tib40, tib41); tib40[0] |= 0xff;
4029
4030 buf0[0] &= ~(generate_cmask (buf0[0]) & tib40[0]);
4031 buf0[1] &= ~(generate_cmask (buf0[1]) & tib40[1]);
4032 buf0[2] &= ~(generate_cmask (buf0[2]) & tib40[2]);
4033 buf0[3] &= ~(generate_cmask (buf0[3]) & tib40[3]);
4034 buf1[0] &= ~(generate_cmask (buf1[0]) & tib41[0]);
4035 buf1[1] &= ~(generate_cmask (buf1[1]) & tib41[1]);
4036 buf1[2] &= ~(generate_cmask (buf1[2]) & tib41[2]);
4037 buf1[3] &= ~(generate_cmask (buf1[3]) & tib41[3]);
4038 #endif
4039
4040 return in_len;
4041 }
4042
4043 inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len)
4044 {
4045 u32 out_len = in_len;
4046
4047 switch (name)
4048 {
4049 case RULE_OP_MANGLE_LREST: out_len = rule_op_mangle_lrest (p0, p1, buf0, buf1, out_len); break;
4050 case RULE_OP_MANGLE_UREST: out_len = rule_op_mangle_urest (p0, p1, buf0, buf1, out_len); break;
4051 case RULE_OP_MANGLE_LREST_UFIRST: out_len = rule_op_mangle_lrest_ufirst (p0, p1, buf0, buf1, out_len); break;
4052 case RULE_OP_MANGLE_UREST_LFIRST: out_len = rule_op_mangle_urest_lfirst (p0, p1, buf0, buf1, out_len); break;
4053 case RULE_OP_MANGLE_TREST: out_len = rule_op_mangle_trest (p0, p1, buf0, buf1, out_len); break;
4054 case RULE_OP_MANGLE_TOGGLE_AT: out_len = rule_op_mangle_toggle_at (p0, p1, buf0, buf1, out_len); break;
4055 case RULE_OP_MANGLE_REVERSE: out_len = rule_op_mangle_reverse (p0, p1, buf0, buf1, out_len); break;
4056 case RULE_OP_MANGLE_DUPEWORD: out_len = rule_op_mangle_dupeword (p0, p1, buf0, buf1, out_len); break;
4057 case RULE_OP_MANGLE_DUPEWORD_TIMES: out_len = rule_op_mangle_dupeword_times (p0, p1, buf0, buf1, out_len); break;
4058 case RULE_OP_MANGLE_REFLECT: out_len = rule_op_mangle_reflect (p0, p1, buf0, buf1, out_len); break;
4059 case RULE_OP_MANGLE_APPEND: out_len = rule_op_mangle_append (p0, p1, buf0, buf1, out_len); break;
4060 case RULE_OP_MANGLE_PREPEND: out_len = rule_op_mangle_prepend (p0, p1, buf0, buf1, out_len); break;
4061 case RULE_OP_MANGLE_ROTATE_LEFT: out_len = rule_op_mangle_rotate_left (p0, p1, buf0, buf1, out_len); break;
4062 case RULE_OP_MANGLE_ROTATE_RIGHT: out_len = rule_op_mangle_rotate_right (p0, p1, buf0, buf1, out_len); break;
4063 case RULE_OP_MANGLE_DELETE_FIRST: out_len = rule_op_mangle_delete_first (p0, p1, buf0, buf1, out_len); break;
4064 case RULE_OP_MANGLE_DELETE_LAST: out_len = rule_op_mangle_delete_last (p0, p1, buf0, buf1, out_len); break;
4065 case RULE_OP_MANGLE_DELETE_AT: out_len = rule_op_mangle_delete_at (p0, p1, buf0, buf1, out_len); break;
4066 case RULE_OP_MANGLE_EXTRACT: out_len = rule_op_mangle_extract (p0, p1, buf0, buf1, out_len); break;
4067 case RULE_OP_MANGLE_OMIT: out_len = rule_op_mangle_omit (p0, p1, buf0, buf1, out_len); break;
4068 case RULE_OP_MANGLE_INSERT: out_len = rule_op_mangle_insert (p0, p1, buf0, buf1, out_len); break;
4069 case RULE_OP_MANGLE_OVERSTRIKE: out_len = rule_op_mangle_overstrike (p0, p1, buf0, buf1, out_len); break;
4070 case RULE_OP_MANGLE_TRUNCATE_AT: out_len = rule_op_mangle_truncate_at (p0, p1, buf0, buf1, out_len); break;
4071 case RULE_OP_MANGLE_REPLACE: out_len = rule_op_mangle_replace (p0, p1, buf0, buf1, out_len); break;
4072 //case RULE_OP_MANGLE_PURGECHAR: out_len = rule_op_mangle_purgechar (p0, p1, buf0, buf1, out_len); break;
4073 //case RULE_OP_MANGLE_TOGGLECASE_REC: out_len = rule_op_mangle_togglecase_rec (p0, p1, buf0, buf1, out_len); break;
4074 case RULE_OP_MANGLE_DUPECHAR_FIRST: out_len = rule_op_mangle_dupechar_first (p0, p1, buf0, buf1, out_len); break;
4075 case RULE_OP_MANGLE_DUPECHAR_LAST: out_len = rule_op_mangle_dupechar_last (p0, p1, buf0, buf1, out_len); break;
4076 case RULE_OP_MANGLE_DUPECHAR_ALL: out_len = rule_op_mangle_dupechar_all (p0, p1, buf0, buf1, out_len); break;
4077 case RULE_OP_MANGLE_SWITCH_FIRST: out_len = rule_op_mangle_switch_first (p0, p1, buf0, buf1, out_len); break;
4078 case RULE_OP_MANGLE_SWITCH_LAST: out_len = rule_op_mangle_switch_last (p0, p1, buf0, buf1, out_len); break;
4079 case RULE_OP_MANGLE_SWITCH_AT: out_len = rule_op_mangle_switch_at (p0, p1, buf0, buf1, out_len); break;
4080 case RULE_OP_MANGLE_CHR_SHIFTL: out_len = rule_op_mangle_chr_shiftl (p0, p1, buf0, buf1, out_len); break;
4081 case RULE_OP_MANGLE_CHR_SHIFTR: out_len = rule_op_mangle_chr_shiftr (p0, p1, buf0, buf1, out_len); break;
4082 case RULE_OP_MANGLE_CHR_INCR: out_len = rule_op_mangle_chr_incr (p0, p1, buf0, buf1, out_len); break;
4083 case RULE_OP_MANGLE_CHR_DECR: out_len = rule_op_mangle_chr_decr (p0, p1, buf0, buf1, out_len); break;
4084 case RULE_OP_MANGLE_REPLACE_NP1: out_len = rule_op_mangle_replace_np1 (p0, p1, buf0, buf1, out_len); break;
4085 case RULE_OP_MANGLE_REPLACE_NM1: out_len = rule_op_mangle_replace_nm1 (p0, p1, buf0, buf1, out_len); break;
4086 case RULE_OP_MANGLE_DUPEBLOCK_FIRST: out_len = rule_op_mangle_dupeblock_first (p0, p1, buf0, buf1, out_len); break;
4087 case RULE_OP_MANGLE_DUPEBLOCK_LAST: out_len = rule_op_mangle_dupeblock_last (p0, p1, buf0, buf1, out_len); break;
4088 case RULE_OP_MANGLE_TITLE: out_len = rule_op_mangle_title (p0, p1, buf0, buf1, out_len); break;
4089 }
4090
4091 return out_len;
4092 }
4093
4094 inline u32 apply_rules (const __global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len)
4095 {
4096 u32 out_len = len;
4097
4098 for (u32 i = 0; cmds[i] != 0; i++)
4099 {
4100 const u32 cmd = cmds[i];
4101
4102 const u32 name = (cmd >> 0) & 0xff;
4103 const u32 p0 = (cmd >> 8) & 0xff;
4104 const u32 p1 = (cmd >> 16) & 0xff;
4105
4106 out_len = apply_rule (name, p0, p1, buf0, buf1, out_len);
4107 }
4108
4109 return out_len;
4110 }
4111
4112 inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x w0[4], u32x w1[4])
4113 {
4114 #if VECT_SIZE == 1
4115
4116 w0[0] = pw_buf0[0];
4117 w0[1] = pw_buf0[1];
4118 w0[2] = pw_buf0[2];
4119 w0[3] = pw_buf0[3];
4120 w1[0] = pw_buf1[0];
4121 w1[1] = pw_buf1[1];
4122 w1[2] = pw_buf1[2];
4123 w1[3] = pw_buf1[3];
4124
4125 return apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len);
4126
4127 #else
4128
4129 u32x out_len = 0;
4130
4131 #ifdef _unroll
4132 #pragma unroll
4133 #endif
4134 for (int i = 0; i < VECT_SIZE; i++)
4135 {
4136 u32 tmp0[4];
4137 u32 tmp1[4];
4138
4139 tmp0[0] = pw_buf0[0];
4140 tmp0[1] = pw_buf0[1];
4141 tmp0[2] = pw_buf0[2];
4142 tmp0[3] = pw_buf0[3];
4143 tmp1[0] = pw_buf1[0];
4144 tmp1[1] = pw_buf1[1];
4145 tmp1[2] = pw_buf1[2];
4146 tmp1[3] = pw_buf1[3];
4147
4148 const u32 tmp_len = apply_rules (rules_buf[il_pos + i].cmds, tmp0, tmp1, pw_len);
4149
4150 switch (i)
4151 {
4152 #if VECT_SIZE >= 2
4153 case 0:
4154 w0[0].s0 = tmp0[0];
4155 w0[1].s0 = tmp0[1];
4156 w0[2].s0 = tmp0[2];
4157 w0[3].s0 = tmp0[3];
4158 w1[0].s0 = tmp1[0];
4159 w1[1].s0 = tmp1[1];
4160 w1[2].s0 = tmp1[2];
4161 w1[3].s0 = tmp1[3];
4162 out_len.s0 = tmp_len;
4163 break;
4164
4165 case 1:
4166 w0[0].s1 = tmp0[0];
4167 w0[1].s1 = tmp0[1];
4168 w0[2].s1 = tmp0[2];
4169 w0[3].s1 = tmp0[3];
4170 w1[0].s1 = tmp1[0];
4171 w1[1].s1 = tmp1[1];
4172 w1[2].s1 = tmp1[2];
4173 w1[3].s1 = tmp1[3];
4174 out_len.s1 = tmp_len;
4175 break;
4176 #endif
4177
4178 #if VECT_SIZE >= 4
4179 case 2:
4180 w0[0].s2 = tmp0[0];
4181 w0[1].s2 = tmp0[1];
4182 w0[2].s2 = tmp0[2];
4183 w0[3].s2 = tmp0[3];
4184 w1[0].s2 = tmp1[0];
4185 w1[1].s2 = tmp1[1];
4186 w1[2].s2 = tmp1[2];
4187 w1[3].s2 = tmp1[3];
4188 out_len.s2 = tmp_len;
4189 break;
4190
4191 case 3:
4192 w0[0].s3 = tmp0[0];
4193 w0[1].s3 = tmp0[1];
4194 w0[2].s3 = tmp0[2];
4195 w0[3].s3 = tmp0[3];
4196 w1[0].s3 = tmp1[0];
4197 w1[1].s3 = tmp1[1];
4198 w1[2].s3 = tmp1[2];
4199 w1[3].s3 = tmp1[3];
4200 out_len.s3 = tmp_len;
4201 break;
4202 #endif
4203
4204 #if VECT_SIZE >= 8
4205 case 4:
4206 w0[0].s4 = tmp0[0];
4207 w0[1].s4 = tmp0[1];
4208 w0[2].s4 = tmp0[2];
4209 w0[3].s4 = tmp0[3];
4210 w1[0].s4 = tmp1[0];
4211 w1[1].s4 = tmp1[1];
4212 w1[2].s4 = tmp1[2];
4213 w1[3].s4 = tmp1[3];
4214 out_len.s4 = tmp_len;
4215 break;
4216
4217 case 5:
4218 w0[0].s5 = tmp0[0];
4219 w0[1].s5 = tmp0[1];
4220 w0[2].s5 = tmp0[2];
4221 w0[3].s5 = tmp0[3];
4222 w1[0].s5 = tmp1[0];
4223 w1[1].s5 = tmp1[1];
4224 w1[2].s5 = tmp1[2];
4225 w1[3].s5 = tmp1[3];
4226 out_len.s5 = tmp_len;
4227 break;
4228
4229 case 6:
4230 w0[0].s6 = tmp0[0];
4231 w0[1].s6 = tmp0[1];
4232 w0[2].s6 = tmp0[2];
4233 w0[3].s6 = tmp0[3];
4234 w1[0].s6 = tmp1[0];
4235 w1[1].s6 = tmp1[1];
4236 w1[2].s6 = tmp1[2];
4237 w1[3].s6 = tmp1[3];
4238 out_len.s6 = tmp_len;
4239 break;
4240
4241 case 7:
4242 w0[0].s7 = tmp0[0];
4243 w0[1].s7 = tmp0[1];
4244 w0[2].s7 = tmp0[2];
4245 w0[3].s7 = tmp0[3];
4246 w1[0].s7 = tmp1[0];
4247 w1[1].s7 = tmp1[1];
4248 w1[2].s7 = tmp1[2];
4249 w1[3].s7 = tmp1[3];
4250 out_len.s7 = tmp_len;
4251 break;
4252 #endif
4253
4254 #if VECT_SIZE >= 16
4255 case 8:
4256 w0[0].s8 = tmp0[0];
4257 w0[1].s8 = tmp0[1];
4258 w0[2].s8 = tmp0[2];
4259 w0[3].s8 = tmp0[3];
4260 w1[0].s8 = tmp1[0];
4261 w1[1].s8 = tmp1[1];
4262 w1[2].s8 = tmp1[2];
4263 w1[3].s8 = tmp1[3];
4264 out_len.s8 = tmp_len;
4265 break;
4266
4267 case 9:
4268 w0[0].s9 = tmp0[0];
4269 w0[1].s9 = tmp0[1];
4270 w0[2].s9 = tmp0[2];
4271 w0[3].s9 = tmp0[3];
4272 w1[0].s9 = tmp1[0];
4273 w1[1].s9 = tmp1[1];
4274 w1[2].s9 = tmp1[2];
4275 w1[3].s9 = tmp1[3];
4276 out_len.s9 = tmp_len;
4277 break;
4278
4279 case 10:
4280 w0[0].sa = tmp0[0];
4281 w0[1].sa = tmp0[1];
4282 w0[2].sa = tmp0[2];
4283 w0[3].sa = tmp0[3];
4284 w1[0].sa = tmp1[0];
4285 w1[1].sa = tmp1[1];
4286 w1[2].sa = tmp1[2];
4287 w1[3].sa = tmp1[3];
4288 out_len.sa = tmp_len;
4289 break;
4290
4291 case 11:
4292 w0[0].sb = tmp0[0];
4293 w0[1].sb = tmp0[1];
4294 w0[2].sb = tmp0[2];
4295 w0[3].sb = tmp0[3];
4296 w1[0].sb = tmp1[0];
4297 w1[1].sb = tmp1[1];
4298 w1[2].sb = tmp1[2];
4299 w1[3].sb = tmp1[3];
4300 out_len.sb = tmp_len;
4301 break;
4302
4303 case 12:
4304 w0[0].sc = tmp0[0];
4305 w0[1].sc = tmp0[1];
4306 w0[2].sc = tmp0[2];
4307 w0[3].sc = tmp0[3];
4308 w1[0].sc = tmp1[0];
4309 w1[1].sc = tmp1[1];
4310 w1[2].sc = tmp1[2];
4311 w1[3].sc = tmp1[3];
4312 out_len.sc = tmp_len;
4313 break;
4314
4315 case 13:
4316 w0[0].sd = tmp0[0];
4317 w0[1].sd = tmp0[1];
4318 w0[2].sd = tmp0[2];
4319 w0[3].sd = tmp0[3];
4320 w1[0].sd = tmp1[0];
4321 w1[1].sd = tmp1[1];
4322 w1[2].sd = tmp1[2];
4323 w1[3].sd = tmp1[3];
4324 out_len.sd = tmp_len;
4325 break;
4326
4327 case 14:
4328 w0[0].se = tmp0[0];
4329 w0[1].se = tmp0[1];
4330 w0[2].se = tmp0[2];
4331 w0[3].se = tmp0[3];
4332 w1[0].se = tmp1[0];
4333 w1[1].se = tmp1[1];
4334 w1[2].se = tmp1[2];
4335 w1[3].se = tmp1[3];
4336 out_len.se = tmp_len;
4337 break;
4338
4339 case 15:
4340 w0[0].sf = tmp0[0];
4341 w0[1].sf = tmp0[1];
4342 w0[2].sf = tmp0[2];
4343 w0[3].sf = tmp0[3];
4344 w1[0].sf = tmp1[0];
4345 w1[1].sf = tmp1[1];
4346 w1[2].sf = tmp1[2];
4347 w1[3].sf = tmp1[3];
4348 out_len.sf = tmp_len;
4349 break;
4350 #endif
4351 }
4352 }
4353
4354 return out_len;
4355
4356 #endif
4357 }