kryo3xx_arm64_edac.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. /* Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
  2. *
  3. * This program is free software; you can redistribute it and/or modify
  4. * it under the terms of the GNU General Public License version 2 and
  5. * only version 2 as published by the Free Software Foundation.
  6. *
  7. * This program is distributed in the hope that it will be useful,
  8. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. * GNU General Public License for more details.
  11. */
  12. #include <linux/kernel.h>
  13. #include <linux/edac.h>
  14. #include <linux/of_device.h>
  15. #include <linux/platform_device.h>
  16. #include <linux/smp.h>
  17. #include <linux/cpu.h>
  18. #include <linux/cpu_pm.h>
  19. #include <linux/interrupt.h>
  20. #include <linux/of_irq.h>
  21. #include <asm/cputype.h>
  22. #include "edac_core.h"
  23. #ifdef CONFIG_EDAC_KRYO3XX_ARM64_POLL
  24. static int poll_msec = 1000;
  25. module_param(poll_msec, int, 0444);
  26. #endif
  27. #ifdef CONFIG_EDAC_KRYO3XX_ARM64_PANIC_ON_CE
  28. static bool panic_on_ce = 1;
  29. #else
  30. static bool panic_on_ce;
  31. #endif
  32. module_param_named(panic_on_ce, panic_on_ce, bool, 0664);
  33. #ifdef CONFIG_EDAC_KRYO3XX_ARM64_PANIC_ON_UE
  34. #define ARM64_ERP_PANIC_ON_UE 1
  35. #else
  36. #define ARM64_ERP_PANIC_ON_UE 0
  37. #endif
  38. #define L1 0x0
  39. #define L2 0x1
  40. #define L3 0x2
  41. #define EDAC_CPU "kryo3xx_edac"
  /*
   * Field extractors for the error record status/misc register values.
   * The argument is parenthesised so that expressions with lower-precedence
   * operators (e.g. "a | b") are shifted/masked as a whole.
   */
  #define KRYO3XX_ERRXSTATUS_VALID(a)     (((a) >> 30) & 0x1)     /* bit 30 */
  #define KRYO3XX_ERRXSTATUS_UE(a)        (((a) >> 29) & 0x1)     /* bit 29 */
  #define KRYO3XX_ERRXSTATUS_SERR(a)      ((a) & 0xFF)            /* bits 7:0 */
  #define KRYO3XX_ERRXMISC_LVL(a)         (((a) >> 1) & 0x7)      /* bits 3:1 */
  #define KRYO3XX_ERRXMISC_WAY(a)         (((a) >> 28) & 0xF)     /* bits 31:28 */
  47. static inline void set_errxctlr_el1(void)
  48. {
  49. u64 val = 0x10f;
  50. asm volatile("msr s3_0_c5_c4_1, %0" : : "r" (val));
  51. }
  52. static inline void set_errxmisc_overflow(void)
  53. {
  54. u64 val = 0x7F7F00000000ULL;
  55. asm volatile("msr s3_0_c5_c5_0, %0" : : "r" (val));
  56. }
  57. static inline void write_errselr_el1(u64 val)
  58. {
  59. asm volatile("msr s3_0_c5_c3_1, %0" : : "r" (val));
  60. }
  61. static inline u64 read_errxstatus_el1(void)
  62. {
  63. u64 val;
  64. asm volatile("mrs %0, s3_0_c5_c4_2" : "=r" (val));
  65. return val;
  66. }
  67. static inline u64 read_errxmisc_el1(void)
  68. {
  69. u64 val;
  70. asm volatile("mrs %0, s3_0_c5_c5_0" : "=r" (val));
  71. return val;
  72. }
  73. static inline void clear_errxstatus_valid(u64 val)
  74. {
  75. asm volatile("msr s3_0_c5_c4_2, %0" : : "r" (val));
  76. }
  /* One reportable error kind: its log text and the EDAC routine to call. */
  struct errors_edac {
          /* Message passed through to the reporting routine. */
          const char * const msg;
          /* Reporting routine, invoked with (device, instance, block, text). */
          void (*func)(struct edac_device_ctl_info *ctl,
                       int instance, int block, const char *text);
  };
  82. static const struct errors_edac errors[] = {
  83. {"Kryo3xx L1 Correctable Error", edac_device_handle_ce },
  84. {"Kryo3xx L1 Uncorrectable Error", edac_device_handle_ue },
  85. {"Kryo3xx L2 Correctable Error", edac_device_handle_ce },
  86. {"Kryo3xx L2 Uncorrectable Error", edac_device_handle_ue },
  87. {"L3 Correctable Error", edac_device_handle_ce },
  88. {"L3 Uncorrectable Error", edac_device_handle_ue },
  89. };
  /* Indices into errors[]; the order must match the table. */
  enum {
          KRYO3XX_L1_CE = 0,
          KRYO3XX_L1_UE = 1,
          KRYO3XX_L2_CE = 2,
          KRYO3XX_L2_UE = 3,
          KRYO3XX_L3_CE = 4,
          KRYO3XX_L3_UE = 5,
  };

  /* Values of the low status byte (SERR field) that dump_err_reg() decodes. */
  #define DATA_BUF_ERR            0x2
  #define CACHE_DATA_ERR          0x6
  #define CACHE_TAG_DIRTY_ERR     0x7
  #define TLB_PARITY_ERR_DATA     0x8
  #define TLB_PARITY_ERR_TAG      0x9
  #define BUS_ERROR               0x12
  102. struct erp_drvdata {
  103. struct edac_device_ctl_info *edev_ctl;
  104. struct erp_drvdata __percpu **erp_cpu_drvdata;
  105. struct notifier_block nb_pm;
  106. int ppi;
  107. };
  108. static struct erp_drvdata *panic_handler_drvdata;
  109. static DEFINE_SPINLOCK(local_handler_lock);
  110. static void l1_l2_irq_enable(void *info)
  111. {
  112. int irq = *(int *)info;
  113. enable_percpu_irq(irq, IRQ_TYPE_LEVEL_HIGH);
  114. }
  115. static int request_erp_irq(struct platform_device *pdev, const char *propname,
  116. const char *desc, irq_handler_t handler,
  117. void *ed, int percpu)
  118. {
  119. int rc;
  120. struct resource *r;
  121. struct erp_drvdata *drv = ed;
  122. r = platform_get_resource_byname(pdev, IORESOURCE_IRQ, propname);
  123. if (!r) {
  124. pr_err("ARM64 CPU ERP: Could not find <%s> IRQ property. Proceeding anyway.\n",
  125. propname);
  126. goto out;
  127. }
  128. if (!percpu) {
  129. rc = devm_request_threaded_irq(&pdev->dev, r->start, NULL,
  130. handler,
  131. IRQF_ONESHOT | IRQF_TRIGGER_HIGH,
  132. desc,
  133. ed);
  134. if (rc) {
  135. pr_err("ARM64 CPU ERP: Failed to request IRQ %d: %d (%s / %s). Proceeding anyway.\n",
  136. (int) r->start, rc, propname, desc);
  137. goto out;
  138. }
  139. } else {
  140. drv->erp_cpu_drvdata = alloc_percpu(struct erp_drvdata *);
  141. if (!drv->erp_cpu_drvdata) {
  142. pr_err("Failed to allocate percpu erp data\n");
  143. goto out;
  144. }
  145. *raw_cpu_ptr(drv->erp_cpu_drvdata) = drv;
  146. rc = request_percpu_irq(r->start, handler, desc,
  147. drv->erp_cpu_drvdata);
  148. if (rc) {
  149. pr_err("ARM64 CPU ERP: Failed to request IRQ %d: %d (%s / %s). Proceeding anyway.\n",
  150. (int) r->start, rc, propname, desc);
  151. goto out_free;
  152. }
  153. drv->ppi = r->start;
  154. on_each_cpu(l1_l2_irq_enable, &(r->start), 1);
  155. }
  156. return 0;
  157. out_free:
  158. free_percpu(drv->erp_cpu_drvdata);
  159. drv->erp_cpu_drvdata = NULL;
  160. out:
  161. return -EINVAL;
  162. }
  163. static void dump_err_reg(int errorcode, int level, u64 errxstatus, u64 errxmisc,
  164. struct edac_device_ctl_info *edev_ctl)
  165. {
  166. edac_printk(KERN_CRIT, EDAC_CPU, "ERRXSTATUS_EL1: %llx\n", errxstatus);
  167. edac_printk(KERN_CRIT, EDAC_CPU, "ERRXMISC_EL1: %llx\n", errxmisc);
  168. edac_printk(KERN_CRIT, EDAC_CPU, "Cache level: L%d\n", level + 1);
  169. switch (KRYO3XX_ERRXSTATUS_SERR(errxstatus)) {
  170. case DATA_BUF_ERR:
  171. edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from internal data buffer\n");
  172. break;
  173. case CACHE_DATA_ERR:
  174. edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from cache data RAM\n");
  175. break;
  176. case CACHE_TAG_DIRTY_ERR:
  177. edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from cache tag or dirty RAM\n");
  178. break;
  179. case TLB_PARITY_ERR_DATA:
  180. edac_printk(KERN_CRIT, EDAC_CPU, "Parity error on TLB RAM\n");
  181. break;
  182. case TLB_PARITY_ERR_TAG:
  183. edac_printk(KERN_CRIT, EDAC_CPU, "Parity error on TLB DATA\n");
  184. case BUS_ERROR:
  185. edac_printk(KERN_CRIT, EDAC_CPU, "Bus Error\n");
  186. break;
  187. }
  188. if (level == L3)
  189. edac_printk(KERN_CRIT, EDAC_CPU,
  190. "Way: %d\n", (int) KRYO3XX_ERRXMISC_WAY(errxmisc));
  191. else
  192. edac_printk(KERN_CRIT, EDAC_CPU,
  193. "Way: %d\n", (int) KRYO3XX_ERRXMISC_WAY(errxmisc) >> 2);
  194. edev_ctl->panic_on_ce = panic_on_ce;
  195. errors[errorcode].func(edev_ctl, smp_processor_id(),
  196. level, errors[errorcode].msg);
  197. }
  198. static void kryo3xx_parse_l1_l2_cache_error(u64 errxstatus, u64 errxmisc,
  199. struct edac_device_ctl_info *edev_ctl)
  200. {
  201. switch (KRYO3XX_ERRXMISC_LVL(errxmisc)) {
  202. case L1:
  203. if (KRYO3XX_ERRXSTATUS_UE(errxstatus))
  204. dump_err_reg(KRYO3XX_L1_UE, L1, errxstatus, errxmisc,
  205. edev_ctl);
  206. else
  207. dump_err_reg(KRYO3XX_L1_CE, L1, errxstatus, errxmisc,
  208. edev_ctl);
  209. break;
  210. case L2:
  211. if (KRYO3XX_ERRXSTATUS_UE(errxstatus))
  212. dump_err_reg(KRYO3XX_L2_UE, L2, errxstatus, errxmisc,
  213. edev_ctl);
  214. else
  215. dump_err_reg(KRYO3XX_L2_CE, L2, errxstatus, errxmisc,
  216. edev_ctl);
  217. break;
  218. }
  219. }
  220. static void kryo3xx_check_l1_l2_ecc(void *info)
  221. {
  222. struct edac_device_ctl_info *edev_ctl = info;
  223. u64 errxstatus = 0;
  224. u64 errxmisc = 0;
  225. unsigned long flags;
  226. spin_lock_irqsave(&local_handler_lock, flags);
  227. write_errselr_el1(0);
  228. errxstatus = read_errxstatus_el1();
  229. if (KRYO3XX_ERRXSTATUS_VALID(errxstatus)) {
  230. errxmisc = read_errxmisc_el1();
  231. edac_printk(KERN_CRIT, EDAC_CPU,
  232. "Kryo3xx CPU%d detected a L1/L2 cache error\n",
  233. smp_processor_id());
  234. kryo3xx_parse_l1_l2_cache_error(errxstatus, errxmisc, edev_ctl);
  235. clear_errxstatus_valid(errxstatus);
  236. }
  237. spin_unlock_irqrestore(&local_handler_lock, flags);
  238. }
  239. static bool l3_is_bus_error(u64 errxstatus)
  240. {
  241. if (KRYO3XX_ERRXSTATUS_SERR(errxstatus) == BUS_ERROR) {
  242. edac_printk(KERN_CRIT, EDAC_CPU, "Bus Error\n");
  243. return true;
  244. }
  245. return false;
  246. }
  247. static void kryo3xx_check_l3_scu_error(struct edac_device_ctl_info *edev_ctl)
  248. {
  249. u64 errxstatus = 0;
  250. u64 errxmisc = 0;
  251. unsigned long flags;
  252. spin_lock_irqsave(&local_handler_lock, flags);
  253. write_errselr_el1(1);
  254. errxstatus = read_errxstatus_el1();
  255. errxmisc = read_errxmisc_el1();
  256. if (KRYO3XX_ERRXSTATUS_VALID(errxstatus) &&
  257. KRYO3XX_ERRXMISC_LVL(errxmisc) == L3) {
  258. if (l3_is_bus_error(errxstatus)) {
  259. if (edev_ctl->panic_on_ue)
  260. panic("Causing panic due to Bus Error\n");
  261. goto out;
  262. }
  263. if (KRYO3XX_ERRXSTATUS_UE(errxstatus)) {
  264. edac_printk(KERN_CRIT, EDAC_CPU, "Detected L3 uncorrectable error\n");
  265. dump_err_reg(KRYO3XX_L3_UE, L3, errxstatus, errxmisc,
  266. edev_ctl);
  267. } else {
  268. edac_printk(KERN_CRIT, EDAC_CPU, "Detected L3 correctable error\n");
  269. dump_err_reg(KRYO3XX_L3_CE, L3, errxstatus, errxmisc,
  270. edev_ctl);
  271. }
  272. clear_errxstatus_valid(errxstatus);
  273. }
  274. out:
  275. spin_unlock_irqrestore(&local_handler_lock, flags);
  276. }
  277. void kryo3xx_poll_cache_errors(struct edac_device_ctl_info *edev_ctl)
  278. {
  279. int cpu;
  280. if (edev_ctl == NULL)
  281. edev_ctl = panic_handler_drvdata->edev_ctl;
  282. kryo3xx_check_l3_scu_error(edev_ctl);
  283. for_each_possible_cpu(cpu)
  284. smp_call_function_single(cpu, kryo3xx_check_l1_l2_ecc,
  285. edev_ctl, 0);
  286. }
  287. static irqreturn_t kryo3xx_l1_l2_handler(int irq, void *drvdata)
  288. {
  289. kryo3xx_check_l1_l2_ecc(panic_handler_drvdata->edev_ctl);
  290. return IRQ_HANDLED;
  291. }
  292. static irqreturn_t kryo3xx_l3_scu_handler(int irq, void *drvdata)
  293. {
  294. struct erp_drvdata *drv = drvdata;
  295. struct edac_device_ctl_info *edev_ctl = drv->edev_ctl;
  296. kryo3xx_check_l3_scu_error(edev_ctl);
  297. return IRQ_HANDLED;
  298. }
  299. static void initialize_registers(void *info)
  300. {
  301. set_errxctlr_el1();
  302. set_errxmisc_overflow();
  303. }
  304. static void init_regs_on_cpu(bool all_cpus)
  305. {
  306. int cpu;
  307. write_errselr_el1(0);
  308. if (all_cpus) {
  309. for_each_possible_cpu(cpu)
  310. smp_call_function_single(cpu, initialize_registers,
  311. NULL, 1);
  312. } else
  313. initialize_registers(NULL);
  314. write_errselr_el1(1);
  315. initialize_registers(NULL);
  316. }
  317. static int kryo3xx_pmu_cpu_pm_notify(struct notifier_block *self,
  318. unsigned long action, void *v)
  319. {
  320. switch (action) {
  321. case CPU_PM_EXIT:
  322. init_regs_on_cpu(false);
  323. kryo3xx_check_l3_scu_error(panic_handler_drvdata->edev_ctl);
  324. kryo3xx_check_l1_l2_ecc(panic_handler_drvdata->edev_ctl);
  325. break;
  326. }
  327. return NOTIFY_OK;
  328. }
  329. static int kryo3xx_cpu_erp_probe(struct platform_device *pdev)
  330. {
  331. struct device *dev = &pdev->dev;
  332. struct erp_drvdata *drv;
  333. int rc = 0;
  334. int fail = 0;
  335. init_regs_on_cpu(true);
  336. drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
  337. if (!drv)
  338. return -ENOMEM;
  339. drv->edev_ctl = edac_device_alloc_ctl_info(0, "cpu",
  340. num_possible_cpus(), "L", 3, 1, NULL, 0,
  341. edac_device_alloc_index());
  342. if (!drv->edev_ctl)
  343. return -ENOMEM;
  344. #ifdef CONFIG_EDAC_KRYO3XX_ARM64_POLL
  345. drv->edev_ctl->edac_check = kryo3xx_poll_cache_errors;
  346. drv->edev_ctl->poll_msec = poll_msec;
  347. drv->edev_ctl->defer_work = 1;
  348. #endif
  349. drv->edev_ctl->dev = dev;
  350. drv->edev_ctl->mod_name = dev_name(dev);
  351. drv->edev_ctl->dev_name = dev_name(dev);
  352. drv->edev_ctl->ctl_name = "cache";
  353. drv->edev_ctl->panic_on_ce = panic_on_ce;
  354. drv->edev_ctl->panic_on_ue = ARM64_ERP_PANIC_ON_UE;
  355. drv->nb_pm.notifier_call = kryo3xx_pmu_cpu_pm_notify;
  356. platform_set_drvdata(pdev, drv);
  357. rc = edac_device_add_device(drv->edev_ctl);
  358. if (rc)
  359. goto out_mem;
  360. panic_handler_drvdata = drv;
  361. if (request_erp_irq(pdev, "l1-l2-faultirq",
  362. "KRYO3XX L1-L2 ECC FAULTIRQ",
  363. kryo3xx_l1_l2_handler, drv, 1))
  364. fail++;
  365. if (request_erp_irq(pdev, "l3-scu-faultirq",
  366. "KRYO3XX L3-SCU ECC FAULTIRQ",
  367. kryo3xx_l3_scu_handler, drv, 0))
  368. fail++;
  369. if (fail == of_irq_count(dev->of_node)) {
  370. pr_err("KRYO3XX ERP: Could not request any IRQs. Giving up.\n");
  371. rc = -ENODEV;
  372. goto out_dev;
  373. }
  374. cpu_pm_register_notifier(&(drv->nb_pm));
  375. return 0;
  376. out_dev:
  377. edac_device_del_device(dev);
  378. out_mem:
  379. edac_device_free_ctl_info(drv->edev_ctl);
  380. return rc;
  381. }
  382. static int kryo3xx_cpu_erp_remove(struct platform_device *pdev)
  383. {
  384. struct erp_drvdata *drv = dev_get_drvdata(&pdev->dev);
  385. struct edac_device_ctl_info *edac_ctl = drv->edev_ctl;
  386. if (drv->erp_cpu_drvdata != NULL) {
  387. free_percpu_irq(drv->ppi, drv->erp_cpu_drvdata);
  388. free_percpu(drv->erp_cpu_drvdata);
  389. }
  390. edac_device_del_device(edac_ctl->dev);
  391. edac_device_free_ctl_info(edac_ctl);
  392. return 0;
  393. }
  394. static const struct of_device_id kryo3xx_cpu_erp_match_table[] = {
  395. { .compatible = "arm,arm64-kryo3xx-cpu-erp" },
  396. { }
  397. };
  398. static struct platform_driver kryo3xx_cpu_erp_driver = {
  399. .probe = kryo3xx_cpu_erp_probe,
  400. .remove = kryo3xx_cpu_erp_remove,
  401. .driver = {
  402. .name = "kryo3xx_cpu_cache_erp",
  403. .owner = THIS_MODULE,
  404. .of_match_table = of_match_ptr(kryo3xx_cpu_erp_match_table),
  405. },
  406. };
  407. static int __init kryo3xx_cpu_erp_init(void)
  408. {
  409. return platform_driver_register(&kryo3xx_cpu_erp_driver);
  410. }
  411. module_init(kryo3xx_cpu_erp_init);
  412. static void __exit kryo3xx_cpu_erp_exit(void)
  413. {
  414. platform_driver_unregister(&kryo3xx_cpu_erp_driver);
  415. }
  416. module_exit(kryo3xx_cpu_erp_exit);
  417. MODULE_LICENSE("GPL v2");
  418. MODULE_DESCRIPTION("Kryo3xx EDAC driver");