2 * Author......: Jens Steube <jens.steube@gmail.com>
// generate_pw: decodes the index `val` into one candidate and packs its
// characters into the 32-bit words of pw_buf.
//
// NOTE(review): only part of the function body is visible in this excerpt.
// The braces, the declarations of i and j, any initialisation of pw_buf and
// whatever consumes `next` are not shown; confirm against the full file.
//
// Parameters (as used by the visible lines):
//   pw_buf          output words; bytes are OR-ed in, so bits already set are kept
//   root_css_buf    indexed by pw_r_len to select the first charset
//   markov_css_buf  indexed by (position * CHARSIZ + emitted key) to select each following charset
//   pw_l_len        number of characters produced by the loop
//   pw_r_len        position at which the first produced character is stored
//   mask80          AND-mask applied to the 0xff byte written at the final position
//   bits14, bits15  when non-zero, (pw_l_len + pw_r_len) * 8 is stored in pw_buf[14] / pw_buf[15]
//   val             index being decoded
12 __device__ static void generate_pw (u32 pw_buf[16], cs_t *root_css_buf, cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val)
// The first charset is taken from the root table at index pw_r_len.
31 cs_t *cs = &root_css_buf[pw_r_len];
// i counts produced characters; j is the storage position and starts at pw_r_len.
36 for (i = 0, j = pw_r_len; i < pw_l_len; i++, j++)
38 const u32 len = cs->cs_len;
// Split val by the size of the current charset: the remainder selects the
// entry, the quotient is kept in `next`.
// NOTE(review): the statement that uses `next` is not visible here —
// presumably it becomes the new val for the following position; confirm.
// NOTE(review): no guard against len == 0 is visible in this excerpt.
40 const u64 next = val / len;
41 const u64 pos = val % len;
45 const u32 key = cs->cs_buf[pos];
// Position j is stored in word j / 4. The shift (3 - j % 4) * 8 places the
// first character of each word in its most significant byte.
// NOTE(review): no check that j / 4 stays below 16 is visible; confirm that
// callers bound pw_l_len + pw_r_len accordingly.
47 const u32 jd4 = j / 4;
48 const u32 jm4 = j % 4;
50 pw_buf[jd4] |= key << ((3 - jm4) * 8);
// The charset for the next position depends on j and on the key just emitted.
52 cs = &markov_css_buf[(j * CHARSIZ) + key];
// Same word/shift computation for index j, then OR in those bits of a 0xff
// byte at that position that are also set in mask80 (nothing is written when
// mask80 is 0).
// NOTE(review): presumably this runs after the loop, with
// j == pw_l_len + pw_r_len — the redeclared jd4/jm4 suggest a new scope, but
// the braces are not visible in this excerpt.
55 const u32 jd4 = j / 4;
56 const u32 jm4 = j % 4;
58 pw_buf[jd4] |= (0xff << ((3 - jm4) * 8)) & mask80;
// Optionally store the combined length multiplied by 8 (presumably a bit
// count — confirm) in word 14 and/or word 15.
60 if (bits14) pw_buf[14] = (pw_l_len + pw_r_len) * 8;
61 if (bits15) pw_buf[15] = (pw_l_len + pw_r_len) * 8;
// l_markov: kernel in which each thread generates four candidates and stores
// them interleaved into one pws_buf_l entry.
//
// The thread index is computed from the x dimension only; threads whose index
// is >= gid_max return without writing. __launch_bounds__ (256, 1) declares
// at most 256 threads per block and at least one resident block per SM.
//
// Parameters (as used by the visible lines):
//   pws_buf_l       output; entry gid receives 16 four-component words plus pw_len
//   root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15
//                   forwarded unchanged to generate_pw
//   off             base index; thread gid decodes off + gid * 4 + 0 .. 3
//   gid_max         number of threads that do work
//
// NOTE(review): the declarations of pw_buf0 .. pw_buf3 and the braces are not
// visible in this excerpt; confirm against the full file.
64 extern "C" __global__ void __launch_bounds__ (256, 1) l_markov (pw_t *pws_buf_l, cs_t *root_css_buf, cs_t *markov_css_buf, const u64 off, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u32 gid_max)
66 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
// Excess threads of the last block have nothing to do.
68 if (gid >= gid_max) return;
// Each thread covers four consecutive indices starting at gid * 4.
70 const u32 gid4 = gid * 4;
// One generate_pw call per candidate, each into its own buffer.
77 generate_pw (pw_buf0, root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15, off + gid4 + 0);
78 generate_pw (pw_buf1, root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15, off + gid4 + 1);
79 generate_pw (pw_buf2, root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15, off + gid4 + 2);
80 generate_pw (pw_buf3, root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15, off + gid4 + 3);
// Interleave the four candidates: word i of candidates 0 .. 3 goes to the
// x / y / z / w components of output word i.
83 for (int i = 0; i < 16; i++)
85 pws_buf_l[gid].i[i].x = pw_buf0[i];
86 pws_buf_l[gid].i[i].y = pw_buf1[i];
87 pws_buf_l[gid].i[i].z = pw_buf2[i];
88 pws_buf_l[gid].i[i].w = pw_buf3[i];
// A single length is recorded for the four candidates of this entry.
91 pws_buf_l[gid].pw_len = pw_l_len + pw_r_len;
// r_markov: kernel in which each thread generates four candidates and stores
// only the first 32-bit word of each into pws_buf_r.
//
// Threads whose index is >= gid_max return without writing. Thread gid writes
// the entries gid * 4 + 0 .. gid * 4 + 3.
//
// In every visible call, pw_r_len is passed in generate_pw's pw_l_len slot,
// and 0 is passed for generate_pw's pw_r_len, mask80, bits14 and bits15. The
// kernel's own mask80 / bits14 / bits15 parameters are therefore not used by
// the visible lines.
//
// NOTE(review): the declaration of pw_buf and the braces are not visible in
// this excerpt. Only pw_buf[0] is stored, so presumably pw_r_len is at most 4
// — confirm with the host code.
94 extern "C" __global__ void __launch_bounds__ (256, 1) r_markov (bf_t *pws_buf_r, cs_t *root_css_buf, cs_t *markov_css_buf, const u64 off, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u32 gid_max)
96 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
// Excess threads of the last block have nothing to do.
98 if (gid >= gid_max) return;
// Each thread covers four consecutive indices starting at gid * 4.
100 const u32 gid4 = gid * 4;
// Candidate 0: generate into pw_buf, keep only its first word.
104 generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_r_len, 0, 0, 0, 0, off + gid4 + 0);
106 pws_buf_r[gid4 + 0].i = pw_buf[0];
// Candidate 1: the same buffer is reused for each generate_pw call.
108 generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_r_len, 0, 0, 0, 0, off + gid4 + 1);
110 pws_buf_r[gid4 + 1].i = pw_buf[0];
// Candidate 2.
112 generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_r_len, 0, 0, 0, 0, off + gid4 + 2);
114 pws_buf_r[gid4 + 2].i = pw_buf[0];
// Candidate 3.
116 generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_r_len, 0, 0, 0, 0, off + gid4 + 3);
118 pws_buf_r[gid4 + 3].i = pw_buf[0];
// C_markov: kernel in which each thread generates four candidates and stores
// the first eight 32-bit words and the length of each into pws_buf.
//
// Threads whose index is >= gid_max return without writing. Thread gid writes
// the entries gid * 4 + 0 .. gid * 4 + 3.
//
// Parameters (as used by the visible lines):
//   pws_buf         output; each written entry receives i[0..7] and pw_len
//   root_css_buf, markov_css_buf, mask80, bits14, bits15
//                   forwarded unchanged to generate_pw
//   off             base index; thread gid decodes off + gid * 4 + 0 .. 3
//   pw_len          passed in generate_pw's pw_l_len slot, with 0 as its pw_r_len
//   gid_max         number of threads that do work
//
// NOTE(review): generate_pw may write pw_buf[14] / pw_buf[15] when bits14 /
// bits15 are set, but only words 0 .. 7 are copied out here; confirm that
// this is intended.
// NOTE(review): the declaration of pw_buf and the braces are not visible in
// this excerpt.
121 extern "C" __global__ void __launch_bounds__ (256, 1) C_markov (comb_t *pws_buf, cs_t *root_css_buf, cs_t *markov_css_buf, const u64 off, const u32 pw_len, const u32 mask80, const u32 bits14, const u32 bits15, const u32 gid_max)
123 const u32 gid = (blockIdx.x * blockDim.x) + threadIdx.x;
// Excess threads of the last block have nothing to do.
125 if (gid >= gid_max) return;
// Each thread covers four consecutive indices starting at gid * 4.
127 const u32 gid4 = gid * 4;
// Candidate 0: generate, then copy words 0 .. 7 and the length.
131 generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_len, 0, mask80, bits14, bits15, off + gid4 + 0);
133 pws_buf[gid4 + 0].i[ 0] = pw_buf[ 0];
134 pws_buf[gid4 + 0].i[ 1] = pw_buf[ 1];
135 pws_buf[gid4 + 0].i[ 2] = pw_buf[ 2];
136 pws_buf[gid4 + 0].i[ 3] = pw_buf[ 3];
137 pws_buf[gid4 + 0].i[ 4] = pw_buf[ 4];
138 pws_buf[gid4 + 0].i[ 5] = pw_buf[ 5];
139 pws_buf[gid4 + 0].i[ 6] = pw_buf[ 6];
140 pws_buf[gid4 + 0].i[ 7] = pw_buf[ 7];
142 pws_buf[gid4 + 0].pw_len = pw_len;
// Candidate 1: the same buffer is reused for each generate_pw call.
144 generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_len, 0, mask80, bits14, bits15, off + gid4 + 1);
146 pws_buf[gid4 + 1].i[ 0] = pw_buf[ 0];
147 pws_buf[gid4 + 1].i[ 1] = pw_buf[ 1];
148 pws_buf[gid4 + 1].i[ 2] = pw_buf[ 2];
149 pws_buf[gid4 + 1].i[ 3] = pw_buf[ 3];
150 pws_buf[gid4 + 1].i[ 4] = pw_buf[ 4];
151 pws_buf[gid4 + 1].i[ 5] = pw_buf[ 5];
152 pws_buf[gid4 + 1].i[ 6] = pw_buf[ 6];
153 pws_buf[gid4 + 1].i[ 7] = pw_buf[ 7];
155 pws_buf[gid4 + 1].pw_len = pw_len;
// Candidate 2.
157 generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_len, 0, mask80, bits14, bits15, off + gid4 + 2);
159 pws_buf[gid4 + 2].i[ 0] = pw_buf[ 0];
160 pws_buf[gid4 + 2].i[ 1] = pw_buf[ 1];
161 pws_buf[gid4 + 2].i[ 2] = pw_buf[ 2];
162 pws_buf[gid4 + 2].i[ 3] = pw_buf[ 3];
163 pws_buf[gid4 + 2].i[ 4] = pw_buf[ 4];
164 pws_buf[gid4 + 2].i[ 5] = pw_buf[ 5];
165 pws_buf[gid4 + 2].i[ 6] = pw_buf[ 6];
166 pws_buf[gid4 + 2].i[ 7] = pw_buf[ 7];
168 pws_buf[gid4 + 2].pw_len = pw_len;
// Candidate 3.
170 generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_len, 0, mask80, bits14, bits15, off + gid4 + 3);
172 pws_buf[gid4 + 3].i[ 0] = pw_buf[ 0];
173 pws_buf[gid4 + 3].i[ 1] = pw_buf[ 1];
174 pws_buf[gid4 + 3].i[ 2] = pw_buf[ 2];
175 pws_buf[gid4 + 3].i[ 3] = pw_buf[ 3];
176 pws_buf[gid4 + 3].i[ 4] = pw_buf[ 4];
177 pws_buf[gid4 + 3].i[ 5] = pw_buf[ 5];
178 pws_buf[gid4 + 3].i[ 6] = pw_buf[ 6];
179 pws_buf[gid4 + 3].i[ 7] = pw_buf[ 7];
181 pws_buf[gid4 + 3].pw_len = pw_len;