typedef uint u32;
typedef ulong u64;
-static u32 swap32 (const u32 v)
-{
- return (as_uint (as_uchar4 (v).s3210));
-}
+#define allx(r) r
-static u64 swap64 (const u64 v)
+/*
+static u32 allx (const u32 r)
{
- return (as_ulong (as_uchar8 (v).s76543210));
+ return r;
}
+*/
-#ifdef IS_AMD
-#endif
-
-#ifdef IS_NV
-static u32 __byte_perm (const u32 a, const u32 b, const u32 c)
+static inline u32 l32_from_64 (u64 a)
{
- u32 r;
-
- asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
+ const u32 r = (uint) (a);
return r;
}
-static u32 lut3_2d (const u32 a, const u32 b, const u32 c)
+static inline u32 h32_from_64 (u64 a)
{
- u32 r;
+ a >>= 32;
- asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+ const u32 r = (uint) (a);
return r;
}
-static u32 lut3_39 (const u32 a, const u32 b, const u32 c)
+static inline u64 hl32_to_64 (const u32 a, const u32 b)
{
- u32 r;
+ return as_ulong ((uint2) (b, a));
+}
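+/*
+A quick sanity sketch of the split/join helpers above (values are
+illustrative only):
+
+  l32_from_64 (0x0123456789abcdefUL) == 0x89abcdef   // low word
+  h32_from_64 (0x0123456789abcdefUL) == 0x01234567   // high word
+  hl32_to_64  (0x01234567, 0x89abcdef) == 0x0123456789abcdefUL
+
+hl32_to_64 builds the uint2 as (b, a) because as_ulong takes .s0 as
+the least significant word on a little-endian device.
+*/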
- asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+#ifdef IS_AMD
+static inline u32 swap32 (const u32 v)
+{
+ return (as_uint (as_uchar4 (v).s3210));
+}
- return r;
+static inline u64 swap64 (const u64 v)
+{
+ return (as_ulong (as_uchar8 (v).s76543210));
}
+#endif
-static u32 lut3_59 (const u32 a, const u32 b, const u32 c)
+#ifdef IS_NV
+static inline u32 swap32 (const u32 v)
{
u32 r;
- asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+ asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));
return r;
}
-static u32 lut3_96 (const u32 a, const u32 b, const u32 c)
+static inline u64 swap64 (const u64 v)
{
- u32 r;
+ u32 il;
+ u32 ir;
- asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+ asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));
+
+ u32 tl;
+ u32 tr;
+
+ asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
+ asm ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));
+
+ u64 r;
+
+ asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));
return r;
}
+#endif
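+/*
+prmt.b32 d, a, b, sel picks one byte per selector nibble out of the
+byte pair {b:a}, nibble 0 being the least significant byte of a.
+With b = 0 the selector 0x0123 reverses the bytes of a, so
+(illustrative) swap32 (0x11223344) == 0x44332211; swap64 byte-swaps
+each half and exchanges the halves when repacking with mov.b64.
+*/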
-static u32 lut3_e4 (const u32 a, const u32 b, const u32 c)
+#ifdef IS_GENERIC
+static inline u32 swap32 (const u32 v)
{
- u32 r;
+ return (as_uint (as_uchar4 (v).s3210));
+}
- asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+static inline u64 swap64 (const u64 v)
+{
+ return (as_ulong (as_uchar8 (v).s76543210));
+}
+#endif
- return r;
+#ifdef IS_AMD
+static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
+{
+ return amd_bfe (a, b, c);
}
+#endif
-static u32 lut3_e8 (const u32 a, const u32 b, const u32 c)
+#ifdef IS_NV
+static inline u32 __byte_perm (const u32 a, const u32 b, const u32 c)
{
u32 r;
- asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+ asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
return r;
}
-static u32 lut3_ca (const u32 a, const u32 b, const u32 c)
+static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
u32 r;
- asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+ asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
return r;
}
+#if CUDA_ARCH >= 350
+static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
-#endif
+ asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));
-static bool allx (const u32 r)
-{
return r;
}
-
-static u32 l32_from_64 (u64 a)
+#else
+static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
- const u32 r = (uint) (a);
-
- return r;
+ return __byte_perm (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
}
+#endif
+#endif
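+/*
+Both NV paths emulate amd_bytealign (a, b, c): the low 32 bits of
+the 64-bit pair {a:b} shifted right by (c & 3) bytes. On sm_35+ the
+funnel shift does this directly; below that, shifting the nibble
+pattern 0x76543210 right by c*4 leaves the byte indices c..c+3 as
+the __byte_perm selector, e.g. c = 1 yields 0x4321.
+*/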
-static u32 h32_from_64 (u64 a)
+#ifdef IS_GENERIC
+static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
- a >>= 32;
+ #define BIT(x) (1 << (x))
+ #define BIT_MASK(x) (BIT (x) - 1)
+ #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z))
- const u32 r = (uint) (a);
-
- return r;
+ return BFE (a, b, c);
}
-static u64 hl32_to_64 (const u32 a, const u32 b)
+static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
- return as_ulong ((uint2) (b, a));
+ return (u32) (((((u64) a) << 32) | (u64) b) >> ((c & 3) * 8));
}
+#endif
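+/*
+BFE (x, y, z) extracts z bits of x starting at bit y, e.g.
+(illustrative) __bfe (0xdeadbeef, 8, 8) == 0xbe. The generic
+amd_bytealign widens to 64 bits first so the selected byte window
+may straddle both input words.
+*/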
#ifdef IS_AMD
-
-static u32 rotr32 (const u32 a, const u32 n)
+static inline u32 rotr32 (const u32 a, const u32 n)
{
return rotate (a, 32 - n);
}
-static u32 rotl32 (const u32 a, const u32 n)
+static inline u32 rotl32 (const u32 a, const u32 n)
{
return rotate (a, n);
}
-static u64 rotr64 (const u64 a, const u32 n)
+static inline u64 rotr64 (const u64 a, const u32 n)
{
  uint2 a2 = as_uint2 (a);

  uint2 t;

  // amd_bitalign (hi, lo, s) yields the low word of {hi:lo} >> (s mod 32),
  // so each result half funnel-shifts the two input halves, swapped for n >= 32
  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32)
                   : amd_bitalign (a2.s1, a2.s0, n);
  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32)
                   : amd_bitalign (a2.s0, a2.s1, n);

  return as_ulong (t);
}
-static u64 rotl64 (const u64 a, const u32 n)
+static inline u64 rotl64 (const u64 a, const u32 n)
{
return rotr64 (a, 64 - n);
}
-
#endif
#ifdef IS_NV
-
#if CUDA_ARCH >= 350
+/*
+This version reduces the number of registers, but for some unknown reason the whole kernel becomes slower. Instruction cache monster?
+static inline u32 rotr32 (const u32 a, const u32 n)
+{
+ u32 r;
-static u32 rotr32 (const u32 a, const u32 n)
+ switch (n & 31)
+ {
+ case 0: asm ("shf.r.wrap.b32 %0, %1, %1, 0;" : "=r"(r) : "r"(a)); break;
+ case 1: asm ("shf.r.wrap.b32 %0, %1, %1, 1;" : "=r"(r) : "r"(a)); break;
+ case 2: asm ("shf.r.wrap.b32 %0, %1, %1, 2;" : "=r"(r) : "r"(a)); break;
+ case 3: asm ("shf.r.wrap.b32 %0, %1, %1, 3;" : "=r"(r) : "r"(a)); break;
+ case 4: asm ("shf.r.wrap.b32 %0, %1, %1, 4;" : "=r"(r) : "r"(a)); break;
+ case 5: asm ("shf.r.wrap.b32 %0, %1, %1, 5;" : "=r"(r) : "r"(a)); break;
+ case 6: asm ("shf.r.wrap.b32 %0, %1, %1, 6;" : "=r"(r) : "r"(a)); break;
+ case 7: asm ("shf.r.wrap.b32 %0, %1, %1, 7;" : "=r"(r) : "r"(a)); break;
+ case 8: asm ("shf.r.wrap.b32 %0, %1, %1, 8;" : "=r"(r) : "r"(a)); break;
+ case 9: asm ("shf.r.wrap.b32 %0, %1, %1, 9;" : "=r"(r) : "r"(a)); break;
+ case 10: asm ("shf.r.wrap.b32 %0, %1, %1, 10;" : "=r"(r) : "r"(a)); break;
+ case 11: asm ("shf.r.wrap.b32 %0, %1, %1, 11;" : "=r"(r) : "r"(a)); break;
+ case 12: asm ("shf.r.wrap.b32 %0, %1, %1, 12;" : "=r"(r) : "r"(a)); break;
+ case 13: asm ("shf.r.wrap.b32 %0, %1, %1, 13;" : "=r"(r) : "r"(a)); break;
+ case 14: asm ("shf.r.wrap.b32 %0, %1, %1, 14;" : "=r"(r) : "r"(a)); break;
+ case 15: asm ("shf.r.wrap.b32 %0, %1, %1, 15;" : "=r"(r) : "r"(a)); break;
+ case 16: asm ("shf.r.wrap.b32 %0, %1, %1, 16;" : "=r"(r) : "r"(a)); break;
+ case 17: asm ("shf.r.wrap.b32 %0, %1, %1, 17;" : "=r"(r) : "r"(a)); break;
+ case 18: asm ("shf.r.wrap.b32 %0, %1, %1, 18;" : "=r"(r) : "r"(a)); break;
+ case 19: asm ("shf.r.wrap.b32 %0, %1, %1, 19;" : "=r"(r) : "r"(a)); break;
+ case 20: asm ("shf.r.wrap.b32 %0, %1, %1, 20;" : "=r"(r) : "r"(a)); break;
+ case 21: asm ("shf.r.wrap.b32 %0, %1, %1, 21;" : "=r"(r) : "r"(a)); break;
+ case 22: asm ("shf.r.wrap.b32 %0, %1, %1, 22;" : "=r"(r) : "r"(a)); break;
+ case 23: asm ("shf.r.wrap.b32 %0, %1, %1, 23;" : "=r"(r) : "r"(a)); break;
+ case 24: asm ("shf.r.wrap.b32 %0, %1, %1, 24;" : "=r"(r) : "r"(a)); break;
+ case 25: asm ("shf.r.wrap.b32 %0, %1, %1, 25;" : "=r"(r) : "r"(a)); break;
+ case 26: asm ("shf.r.wrap.b32 %0, %1, %1, 26;" : "=r"(r) : "r"(a)); break;
+ case 27: asm ("shf.r.wrap.b32 %0, %1, %1, 27;" : "=r"(r) : "r"(a)); break;
+ case 28: asm ("shf.r.wrap.b32 %0, %1, %1, 28;" : "=r"(r) : "r"(a)); break;
+ case 29: asm ("shf.r.wrap.b32 %0, %1, %1, 29;" : "=r"(r) : "r"(a)); break;
+ case 30: asm ("shf.r.wrap.b32 %0, %1, %1, 30;" : "=r"(r) : "r"(a)); break;
+ case 31: asm ("shf.r.wrap.b32 %0, %1, %1, 31;" : "=r"(r) : "r"(a)); break;
+ }
+
+ return r;
+}
+*/
+
+static inline u32 rotr32 (const u32 a, const u32 n)
{
  u32 r;

  // a single-register funnel shift is a rotate right
  asm ("shf.r.wrap.b32 %0, %1, %1, %2;" : "=r"(r) : "r"(a), "r"(n));

  return r;
}
-static u32 rotl32 (const u32 a, const u32 n)
+static inline u32 rotl32 (const u32 a, const u32 n)
{
return rotr32 (a, 32 - n);
}
-static u64 rotr64 (const u64 a, const u32 n)
+static inline u64 rotr64 (const u64 a, const u32 n)
{
  u32 il;
  u32 ir;

  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));

  u32 tl;
  u32 tr;

  // funnel-shift each half against the other; operands swap for n >= 32
  if (n >= 32)
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
  }
  else
  {
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
  }

  u64 r;

  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));

  return r;
}
-static u64 rotl64 (const u64 a, const u32 n)
+static inline u64 rotl64 (const u64 a, const u32 n)
{
return rotr64 (a, 64 - n);
}
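+/*
+shf.r.wrap.b32 d, lo, hi, n shifts the 64-bit pair {hi:lo} right by
+n (mod 32) and keeps the low word. Feeding the same register as both
+halves turns it into a 32-bit rotate; the 64-bit rotate funnel-shifts
+the two halves against each other and swaps their roles once n >= 32.
+*/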
-
#else
+static inline u32 rotr32 (const u32 a, const u32 n)
+{
+ return rotate (a, 32 - n);
+}
-static u32 rotr32 (const u32 a, const u32 n)
+static inline u32 rotl32 (const u32 a, const u32 n)
{
- return (((a) >> (n)) | ((a) << (32 - (n))));
+ return rotate (a, n);
}
-static u32 rotl32 (const u32 a, const u32 n)
+static inline u64 rotr64 (const u64 a, const u32 n)
{
- return rotr32 (a, 32 - n);
+ return rotate (a, (u64) 64 - n);
}
-static u64 rotr64 (const u64 a, const u32 n)
+static inline u64 rotl64 (const u64 a, const u32 n)
{
- return (((a) >> (n)) | ((a) << (64 - (n))));
+ return rotate (a, (u64) n);
}
+#endif
+#endif
-static u64 rotl64 (const u64 a, const u32 n)
+#ifdef IS_GENERIC
+static inline u32 rotr32 (const u32 a, const u32 n)
{
- return rotr64 (a, 64 - n);
+ return rotate (a, 32 - n);
}
+static inline u32 rotl32 (const u32 a, const u32 n)
+{
+ return rotate (a, n);
+}
+
+static inline u64 rotr64 (const u64 a, const u32 n)
+{
+ return rotate (a, (u64) 64 - n);
+}
+
+static inline u64 rotl64 (const u64 a, const u32 n)
+{
+ return rotate (a, (u64) n);
+}
+#endif
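+/*
+rotate () is the OpenCL built-in left rotation, so right rotates are
+expressed through the complement width, e.g. rotr32 (x, 7) ==
+rotate (x, 25u) and rotr64 (x, 7) == rotate (x, (u64) 57).
+*/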
+
+#ifdef IS_NV
+#if CUDA_ARCH >= 500
+static inline u32 lut3_2d (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_39 (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_59 (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_96 (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_e4 (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_e8 (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
+
+static inline u32 lut3_ca (const u32 a, const u32 b, const u32 c)
+{
+ u32 r;
+
+ asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
+
+ return r;
+}
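+
+/*
+The lop3.b32 immediate is the truth table of its three inputs: bit
+(a << 2 | b << 1 | c) of the immediate is the output bit. Hence
+0x96 == a ^ b ^ c, 0xe8 == (a & b) | (a & c) | (b & c), and
+0xca == bitselect (c, b, a), the XOR3/MAJ/select patterns common in
+hash kernels; the remaining constants encode other fixed three-input
+functions the same way.
+*/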
#endif
#endif
u32 digest_buf[8];
#elif defined _MS_DRSR_
u32 digest_buf[8];
+ #elif defined _ANDROIDFDE_SAMSUNG_
+ u32 digest_buf[8];
+ #elif defined _RAR5_
+ u32 digest_buf[4];
#endif
} digest_t;
typedef struct
{
+ u32 E[18];
+
u32 P[18];
u32 S0[256];
typedef struct
{
- u32 key;
+ u32 key;
u64 val;
} hcstat_table_t;
typedef struct
{
- u32 cmds[15];
-
-} gpu_rule_t;
+ u32 cmds[0x100];
-/*
-typedef struct
-{
- u32 plain_buf[16];
- u32 plailen;
-
-} plain_t;
-*/
+} kernel_rule_t;
typedef struct
{