avx512.c 18 KB


  1. /* -*- linux-c -*- --------------------------------------------------------
  2. *
  3. * Copyright (C) 2016 Intel Corporation
  4. *
  5. * Author: Gayatri Kammela <[email protected]>
  6. * Author: Megha Dey <[email protected]>
  7. *
  8. * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
  9. * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU General Public License as published by
  13. * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
  14. * Boston MA 02111-1307, USA; either version 2 of the License, or
  15. * (at your option) any later version; incorporated herein by reference.
  16. *
  17. * -----------------------------------------------------------------------
  18. */
  19. /*
  20. * AVX512 implementation of RAID-6 syndrome functions
  21. *
  22. */
  23. #ifdef CONFIG_AS_AVX512
  24. #include <linux/raid/pq.h>
  25. #include "x86.h"
  26. static const struct raid6_avx512_constants {
  27. u64 x1d[8];
  28. } raid6_avx512_constants __aligned(512) = {
  29. { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
  30. 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
  31. 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
  32. 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
  33. };
  34. static int raid6_have_avx512(void)
  35. {
  36. return boot_cpu_has(X86_FEATURE_AVX2) &&
  37. boot_cpu_has(X86_FEATURE_AVX) &&
  38. boot_cpu_has(X86_FEATURE_AVX512F) &&
  39. boot_cpu_has(X86_FEATURE_AVX512BW) &&
  40. boot_cpu_has(X86_FEATURE_AVX512VL) &&
  41. boot_cpu_has(X86_FEATURE_AVX512DQ);
  42. }
  43. static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
  44. {
  45. u8 **dptr = (u8 **)ptrs;
  46. u8 *p, *q;
  47. int d, z, z0;
  48. z0 = disks - 3; /* Highest data disk */
  49. p = dptr[z0+1]; /* XOR parity */
  50. q = dptr[z0+2]; /* RS syndrome */
  51. kernel_fpu_begin();
  52. asm volatile("vmovdqa64 %0,%%zmm0\n\t"
  53. "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
  54. :
  55. : "m" (raid6_avx512_constants.x1d[0]));
  56. for (d = 0; d < bytes; d += 64) {
  57. asm volatile("prefetchnta %0\n\t"
  58. "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
  59. "prefetchnta %1\n\t"
  60. "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
  61. "vmovdqa64 %1,%%zmm6"
  62. :
  63. : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
  64. for (z = z0-2; z >= 0; z--) {
  65. asm volatile("prefetchnta %0\n\t"
  66. "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
  67. "vpmovm2b %%k1,%%zmm5\n\t"
  68. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  69. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  70. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  71. "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
  72. "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
  73. "vmovdqa64 %0,%%zmm6"
  74. :
  75. : "m" (dptr[z][d]));
  76. }
  77. asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
  78. "vpmovm2b %%k1,%%zmm5\n\t"
  79. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  80. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  81. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  82. "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
  83. "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
  84. "vmovntdq %%zmm2,%0\n\t"
  85. "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
  86. "vmovntdq %%zmm4,%1\n\t"
  87. "vpxorq %%zmm4,%%zmm4,%%zmm4"
  88. :
  89. : "m" (p[d]), "m" (q[d]));
  90. }
  91. asm volatile("sfence" : : : "memory");
  92. kernel_fpu_end();
  93. }
  94. static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
  95. size_t bytes, void **ptrs)
  96. {
  97. u8 **dptr = (u8 **)ptrs;
  98. u8 *p, *q;
  99. int d, z, z0;
  100. z0 = stop; /* P/Q right side optimization */
  101. p = dptr[disks-2]; /* XOR parity */
  102. q = dptr[disks-1]; /* RS syndrome */
  103. kernel_fpu_begin();
  104. asm volatile("vmovdqa64 %0,%%zmm0"
  105. : : "m" (raid6_avx512_constants.x1d[0]));
  106. for (d = 0 ; d < bytes ; d += 64) {
  107. asm volatile("vmovdqa64 %0,%%zmm4\n\t"
  108. "vmovdqa64 %1,%%zmm2\n\t"
  109. "vpxorq %%zmm4,%%zmm2,%%zmm2"
  110. :
  111. : "m" (dptr[z0][d]), "m" (p[d]));
  112. /* P/Q data pages */
  113. for (z = z0-1 ; z >= start ; z--) {
  114. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  115. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  116. "vpmovm2b %%k1,%%zmm5\n\t"
  117. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  118. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  119. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  120. "vmovdqa64 %0,%%zmm5\n\t"
  121. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  122. "vpxorq %%zmm5,%%zmm4,%%zmm4"
  123. :
  124. : "m" (dptr[z][d]));
  125. }
  126. /* P/Q left side optimization */
  127. for (z = start-1 ; z >= 0 ; z--) {
  128. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  129. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  130. "vpmovm2b %%k1,%%zmm5\n\t"
  131. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  132. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  133. "vpxorq %%zmm5,%%zmm4,%%zmm4"
  134. :
  135. : );
  136. }
  137. asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
  138. /* Don't use movntdq for r/w memory area < cache line */
  139. "vmovdqa64 %%zmm4,%0\n\t"
  140. "vmovdqa64 %%zmm2,%1"
  141. :
  142. : "m" (q[d]), "m" (p[d]));
  143. }
  144. asm volatile("sfence" : : : "memory");
  145. kernel_fpu_end();
  146. }
  147. const struct raid6_calls raid6_avx512x1 = {
  148. raid6_avx5121_gen_syndrome,
  149. raid6_avx5121_xor_syndrome,
  150. raid6_have_avx512,
  151. "avx512x1",
  152. 1 /* Has cache hints */
  153. };
  154. /*
  155. * Unrolled-by-2 AVX512 implementation
  156. */
  157. static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
  158. {
  159. u8 **dptr = (u8 **)ptrs;
  160. u8 *p, *q;
  161. int d, z, z0;
  162. z0 = disks - 3; /* Highest data disk */
  163. p = dptr[z0+1]; /* XOR parity */
  164. q = dptr[z0+2]; /* RS syndrome */
  165. kernel_fpu_begin();
  166. asm volatile("vmovdqa64 %0,%%zmm0\n\t"
  167. "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
  168. :
  169. : "m" (raid6_avx512_constants.x1d[0]));
  170. /* We uniformly assume a single prefetch covers at least 64 bytes */
  171. for (d = 0; d < bytes; d += 128) {
  172. asm volatile("prefetchnta %0\n\t"
  173. "prefetchnta %1\n\t"
  174. "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
  175. "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */
  176. "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
  177. "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */
  178. :
  179. : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
  180. for (z = z0-1; z >= 0; z--) {
  181. asm volatile("prefetchnta %0\n\t"
  182. "prefetchnta %1\n\t"
  183. "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
  184. "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
  185. "vpmovm2b %%k1,%%zmm5\n\t"
  186. "vpmovm2b %%k2,%%zmm7\n\t"
  187. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  188. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  189. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  190. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  191. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  192. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  193. "vmovdqa64 %0,%%zmm5\n\t"
  194. "vmovdqa64 %1,%%zmm7\n\t"
  195. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  196. "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
  197. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  198. "vpxorq %%zmm7,%%zmm6,%%zmm6"
  199. :
  200. : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
  201. }
  202. asm volatile("vmovntdq %%zmm2,%0\n\t"
  203. "vmovntdq %%zmm3,%1\n\t"
  204. "vmovntdq %%zmm4,%2\n\t"
  205. "vmovntdq %%zmm6,%3"
  206. :
  207. : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
  208. "m" (q[d+64]));
  209. }
  210. asm volatile("sfence" : : : "memory");
  211. kernel_fpu_end();
  212. }
  213. static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
  214. size_t bytes, void **ptrs)
  215. {
  216. u8 **dptr = (u8 **)ptrs;
  217. u8 *p, *q;
  218. int d, z, z0;
  219. z0 = stop; /* P/Q right side optimization */
  220. p = dptr[disks-2]; /* XOR parity */
  221. q = dptr[disks-1]; /* RS syndrome */
  222. kernel_fpu_begin();
  223. asm volatile("vmovdqa64 %0,%%zmm0"
  224. : : "m" (raid6_avx512_constants.x1d[0]));
  225. for (d = 0 ; d < bytes ; d += 128) {
  226. asm volatile("vmovdqa64 %0,%%zmm4\n\t"
  227. "vmovdqa64 %1,%%zmm6\n\t"
  228. "vmovdqa64 %2,%%zmm2\n\t"
  229. "vmovdqa64 %3,%%zmm3\n\t"
  230. "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
  231. "vpxorq %%zmm6,%%zmm3,%%zmm3"
  232. :
  233. : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
  234. "m" (p[d]), "m" (p[d+64]));
  235. /* P/Q data pages */
  236. for (z = z0-1 ; z >= start ; z--) {
  237. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  238. "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
  239. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  240. "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
  241. "vpmovm2b %%k1,%%zmm5\n\t"
  242. "vpmovm2b %%k2,%%zmm7\n\t"
  243. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  244. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  245. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  246. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  247. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  248. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  249. "vmovdqa64 %0,%%zmm5\n\t"
  250. "vmovdqa64 %1,%%zmm7\n\t"
  251. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  252. "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
  253. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  254. "vpxorq %%zmm7,%%zmm6,%%zmm6"
  255. :
  256. : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
  257. }
  258. /* P/Q left side optimization */
  259. for (z = start-1 ; z >= 0 ; z--) {
  260. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  261. "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
  262. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  263. "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
  264. "vpmovm2b %%k1,%%zmm5\n\t"
  265. "vpmovm2b %%k2,%%zmm7\n\t"
  266. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  267. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  268. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  269. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  270. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  271. "vpxorq %%zmm7,%%zmm6,%%zmm6"
  272. :
  273. : );
  274. }
  275. asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
  276. "vpxorq %1,%%zmm6,%%zmm6\n\t"
  277. /* Don't use movntdq for r/w
  278. * memory area < cache line
  279. */
  280. "vmovdqa64 %%zmm4,%0\n\t"
  281. "vmovdqa64 %%zmm6,%1\n\t"
  282. "vmovdqa64 %%zmm2,%2\n\t"
  283. "vmovdqa64 %%zmm3,%3"
  284. :
  285. : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
  286. "m" (p[d+64]));
  287. }
  288. asm volatile("sfence" : : : "memory");
  289. kernel_fpu_end();
  290. }
  291. const struct raid6_calls raid6_avx512x2 = {
  292. raid6_avx5122_gen_syndrome,
  293. raid6_avx5122_xor_syndrome,
  294. raid6_have_avx512,
  295. "avx512x2",
  296. 1 /* Has cache hints */
  297. };
  298. #ifdef CONFIG_X86_64
  /*
   * Unrolled-by-4 AVX512 implementation
   */
  302. static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
  303. {
  304. u8 **dptr = (u8 **)ptrs;
  305. u8 *p, *q;
  306. int d, z, z0;
  307. z0 = disks - 3; /* Highest data disk */
  308. p = dptr[z0+1]; /* XOR parity */
  309. q = dptr[z0+2]; /* RS syndrome */
  310. kernel_fpu_begin();
  311. asm volatile("vmovdqa64 %0,%%zmm0\n\t"
  312. "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */
  313. "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */
  314. "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */
  315. "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */
  316. "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */
  317. "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */
  318. "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */
  319. "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */
  320. "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */
  321. :
  322. : "m" (raid6_avx512_constants.x1d[0]));
  323. for (d = 0; d < bytes; d += 256) {
  324. for (z = z0; z >= 0; z--) {
  325. asm volatile("prefetchnta %0\n\t"
  326. "prefetchnta %1\n\t"
  327. "prefetchnta %2\n\t"
  328. "prefetchnta %3\n\t"
  329. "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
  330. "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
  331. "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
  332. "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
  333. "vpmovm2b %%k1,%%zmm5\n\t"
  334. "vpmovm2b %%k2,%%zmm7\n\t"
  335. "vpmovm2b %%k3,%%zmm13\n\t"
  336. "vpmovm2b %%k4,%%zmm15\n\t"
  337. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  338. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  339. "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
  340. "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
  341. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  342. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  343. "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
  344. "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
  345. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  346. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  347. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  348. "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
  349. "vmovdqa64 %0,%%zmm5\n\t"
  350. "vmovdqa64 %1,%%zmm7\n\t"
  351. "vmovdqa64 %2,%%zmm13\n\t"
  352. "vmovdqa64 %3,%%zmm15\n\t"
  353. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  354. "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
  355. "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
  356. "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
  357. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  358. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  359. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  360. "vpxorq %%zmm15,%%zmm14,%%zmm14"
  361. :
  362. : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
  363. "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
  364. }
  365. asm volatile("vmovntdq %%zmm2,%0\n\t"
  366. "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
  367. "vmovntdq %%zmm3,%1\n\t"
  368. "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
  369. "vmovntdq %%zmm10,%2\n\t"
  370. "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
  371. "vmovntdq %%zmm11,%3\n\t"
  372. "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
  373. "vmovntdq %%zmm4,%4\n\t"
  374. "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
  375. "vmovntdq %%zmm6,%5\n\t"
  376. "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
  377. "vmovntdq %%zmm12,%6\n\t"
  378. "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
  379. "vmovntdq %%zmm14,%7\n\t"
  380. "vpxorq %%zmm14,%%zmm14,%%zmm14"
  381. :
  382. : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
  383. "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
  384. "m" (q[d+128]), "m" (q[d+192]));
  385. }
  386. asm volatile("sfence" : : : "memory");
  387. kernel_fpu_end();
  388. }
  389. static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
  390. size_t bytes, void **ptrs)
  391. {
  392. u8 **dptr = (u8 **)ptrs;
  393. u8 *p, *q;
  394. int d, z, z0;
  395. z0 = stop; /* P/Q right side optimization */
  396. p = dptr[disks-2]; /* XOR parity */
  397. q = dptr[disks-1]; /* RS syndrome */
  398. kernel_fpu_begin();
  399. asm volatile("vmovdqa64 %0,%%zmm0"
  400. :: "m" (raid6_avx512_constants.x1d[0]));
  401. for (d = 0 ; d < bytes ; d += 256) {
  402. asm volatile("vmovdqa64 %0,%%zmm4\n\t"
  403. "vmovdqa64 %1,%%zmm6\n\t"
  404. "vmovdqa64 %2,%%zmm12\n\t"
  405. "vmovdqa64 %3,%%zmm14\n\t"
  406. "vmovdqa64 %4,%%zmm2\n\t"
  407. "vmovdqa64 %5,%%zmm3\n\t"
  408. "vmovdqa64 %6,%%zmm10\n\t"
  409. "vmovdqa64 %7,%%zmm11\n\t"
  410. "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
  411. "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
  412. "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
  413. "vpxorq %%zmm14,%%zmm11,%%zmm11"
  414. :
  415. : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
  416. "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
  417. "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
  418. "m" (p[d+192]));
  419. /* P/Q data pages */
  420. for (z = z0-1 ; z >= start ; z--) {
  421. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  422. "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
  423. "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
  424. "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
  425. "prefetchnta %0\n\t"
  426. "prefetchnta %2\n\t"
  427. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  428. "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
  429. "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
  430. "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
  431. "vpmovm2b %%k1,%%zmm5\n\t"
  432. "vpmovm2b %%k2,%%zmm7\n\t"
  433. "vpmovm2b %%k3,%%zmm13\n\t"
  434. "vpmovm2b %%k4,%%zmm15\n\t"
  435. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  436. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  437. "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
  438. "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
  439. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  440. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  441. "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
  442. "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
  443. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  444. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  445. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  446. "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
  447. "vmovdqa64 %0,%%zmm5\n\t"
  448. "vmovdqa64 %1,%%zmm7\n\t"
  449. "vmovdqa64 %2,%%zmm13\n\t"
  450. "vmovdqa64 %3,%%zmm15\n\t"
  451. "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
  452. "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
  453. "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
  454. "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
  455. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  456. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  457. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  458. "vpxorq %%zmm15,%%zmm14,%%zmm14"
  459. :
  460. : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
  461. "m" (dptr[z][d+128]),
  462. "m" (dptr[z][d+192]));
  463. }
  464. asm volatile("prefetchnta %0\n\t"
  465. "prefetchnta %1\n\t"
  466. :
  467. : "m" (q[d]), "m" (q[d+128]));
  468. /* P/Q left side optimization */
  469. for (z = start-1 ; z >= 0 ; z--) {
  470. asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
  471. "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
  472. "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
  473. "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
  474. "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
  475. "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
  476. "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
  477. "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
  478. "vpmovm2b %%k1,%%zmm5\n\t"
  479. "vpmovm2b %%k2,%%zmm7\n\t"
  480. "vpmovm2b %%k3,%%zmm13\n\t"
  481. "vpmovm2b %%k4,%%zmm15\n\t"
  482. "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
  483. "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
  484. "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
  485. "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
  486. "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
  487. "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
  488. "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
  489. "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
  490. "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
  491. "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
  492. "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
  493. "vpxorq %%zmm15,%%zmm14,%%zmm14"
  494. :
  495. : );
  496. }
  497. asm volatile("vmovntdq %%zmm2,%0\n\t"
  498. "vmovntdq %%zmm3,%1\n\t"
  499. "vmovntdq %%zmm10,%2\n\t"
  500. "vmovntdq %%zmm11,%3\n\t"
  501. "vpxorq %4,%%zmm4,%%zmm4\n\t"
  502. "vpxorq %5,%%zmm6,%%zmm6\n\t"
  503. "vpxorq %6,%%zmm12,%%zmm12\n\t"
  504. "vpxorq %7,%%zmm14,%%zmm14\n\t"
  505. "vmovntdq %%zmm4,%4\n\t"
  506. "vmovntdq %%zmm6,%5\n\t"
  507. "vmovntdq %%zmm12,%6\n\t"
  508. "vmovntdq %%zmm14,%7"
  509. :
  510. : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
  511. "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
  512. "m" (q[d+128]), "m" (q[d+192]));
  513. }
  514. asm volatile("sfence" : : : "memory");
  515. kernel_fpu_end();
  516. }
  517. const struct raid6_calls raid6_avx512x4 = {
  518. raid6_avx5124_gen_syndrome,
  519. raid6_avx5124_xor_syndrome,
  520. raid6_have_avx512,
  521. "avx512x4",
  522. 1 /* Has cache hints */
  523. };
  524. #endif
  525. #endif /* CONFIG_AS_AVX512 */