#define VECT_SIZE 1
#endif
-#define CONCAT(a, b) a##b
+#define CONCAT(a, b) a##b
#define VTYPE(type, width) CONCAT(type, width)
#if VECT_SIZE == 1
return r;
}
-#if CUDA_ARCH >= 350
// amd_bytealign: computes a 32-bit result from a, b and the low two bits of c
// (both branches mask c with 3), using one of two implementations selected at
// compile time by CUDA_ARCH.
// NOTE(review): the name suggests this mirrors the AMD OpenCL amd_bytealign
// built-in - confirm against callers.
inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
u32 r;
// CUDA_ARCH >= 350: use the PTX funnel-shift instruction directly.
+ #if CUDA_ARCH >= 350
+
// shf.r.wrap.b32 is given b as its first source, a as its second source and
// (c & 3) * 8 as the shift amount in bits (0, 8, 16 or 24); per the PTX ISA
// this shifts the 64-bit value a:b right and keeps the low 32 bits.
asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"((c & 3) * 8));
+ #else
+
// Fallback for other CUDA_ARCH values: the selector passed to __byte_perm_S
// evaluates to 0x3210, 0x4321, 0x5432 or 0x6543 for (c & 3) == 0, 1, 2, 3.
// NOTE(review): __byte_perm_S is defined outside this view; presumably a
// wrapper around the byte-permute intrinsic - verify it matches the branch above.
+ r = __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
+
+ #endif
+
return r;
}
-#else
-inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
-{
- return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
-}
-#endif
-
#endif
#ifdef IS_GENERIC