eventfd.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. /*
  2. * fs/eventfd.c
  3. *
  4. * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
  5. *
  6. */
  7. #include <linux/file.h>
  8. #include <linux/poll.h>
  9. #include <linux/init.h>
  10. #include <linux/fs.h>
  11. #include <linux/sched.h>
  12. #include <linux/kernel.h>
  13. #include <linux/slab.h>
  14. #include <linux/list.h>
  15. #include <linux/spinlock.h>
  16. #include <linux/anon_inodes.h>
  17. #include <linux/syscalls.h>
  18. #include <linux/export.h>
  19. #include <linux/kref.h>
  20. #include <linux/eventfd.h>
  21. #include <linux/proc_fs.h>
  22. #include <linux/seq_file.h>
  23. struct eventfd_ctx {
  24. struct kref kref;
  25. wait_queue_head_t wqh;
  26. /*
  27. * Every time that a write(2) is performed on an eventfd, the
  28. * value of the __u64 being written is added to "count" and a
  29. * wakeup is performed on "wqh". A read(2) will return the "count"
  30. * value to userspace, and will reset "count" to zero. The kernel
  31. * side eventfd_signal() also, adds to the "count" counter and
  32. * issue a wakeup.
  33. */
  34. __u64 count;
  35. unsigned int flags;
  36. };
  37. /**
  38. * eventfd_signal - Adds @n to the eventfd counter.
  39. * @ctx: [in] Pointer to the eventfd context.
  40. * @n: [in] Value of the counter to be added to the eventfd internal counter.
  41. * The value cannot be negative.
  42. *
  43. * This function is supposed to be called by the kernel in paths that do not
  44. * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
  45. * value, and we signal this as overflow condition by returning a POLLERR
  46. * to poll(2).
  47. *
  48. * Returns the amount by which the counter was incremented. This will be less
  49. * than @n if the counter has overflowed.
  50. */
  51. __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
  52. {
  53. unsigned long flags;
  54. spin_lock_irqsave(&ctx->wqh.lock, flags);
  55. if (ULLONG_MAX - ctx->count < n)
  56. n = ULLONG_MAX - ctx->count;
  57. ctx->count += n;
  58. if (waitqueue_active(&ctx->wqh))
  59. wake_up_locked_poll(&ctx->wqh, POLLIN);
  60. spin_unlock_irqrestore(&ctx->wqh.lock, flags);
  61. return n;
  62. }
  63. EXPORT_SYMBOL_GPL(eventfd_signal);
  64. static void eventfd_free_ctx(struct eventfd_ctx *ctx)
  65. {
  66. kfree(ctx);
  67. }
  68. static void eventfd_free(struct kref *kref)
  69. {
  70. struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
  71. eventfd_free_ctx(ctx);
  72. }
  73. /**
  74. * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
  75. * @ctx: [in] Pointer to the eventfd context.
  76. *
  77. * Returns: In case of success, returns a pointer to the eventfd context.
  78. */
  79. struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
  80. {
  81. kref_get(&ctx->kref);
  82. return ctx;
  83. }
  84. EXPORT_SYMBOL_GPL(eventfd_ctx_get);
  85. /**
  86. * eventfd_ctx_put - Releases a reference to the internal eventfd context.
  87. * @ctx: [in] Pointer to eventfd context.
  88. *
  89. * The eventfd context reference must have been previously acquired either
  90. * with eventfd_ctx_get() or eventfd_ctx_fdget().
  91. */
  92. void eventfd_ctx_put(struct eventfd_ctx *ctx)
  93. {
  94. kref_put(&ctx->kref, eventfd_free);
  95. }
  96. EXPORT_SYMBOL_GPL(eventfd_ctx_put);
  97. static int eventfd_release(struct inode *inode, struct file *file)
  98. {
  99. struct eventfd_ctx *ctx = file->private_data;
  100. wake_up_poll(&ctx->wqh, POLLHUP);
  101. eventfd_ctx_put(ctx);
  102. return 0;
  103. }
  104. static unsigned int eventfd_poll(struct file *file, poll_table *wait)
  105. {
  106. struct eventfd_ctx *ctx = file->private_data;
  107. unsigned int events = 0;
  108. u64 count;
  109. poll_wait(file, &ctx->wqh, wait);
  110. /*
  111. * All writes to ctx->count occur within ctx->wqh.lock. This read
  112. * can be done outside ctx->wqh.lock because we know that poll_wait
  113. * takes that lock (through add_wait_queue) if our caller will sleep.
  114. *
  115. * The read _can_ therefore seep into add_wait_queue's critical
  116. * section, but cannot move above it! add_wait_queue's spin_lock acts
  117. * as an acquire barrier and ensures that the read be ordered properly
  118. * against the writes. The following CAN happen and is safe:
  119. *
  120. * poll write
  121. * ----------------- ------------
  122. * lock ctx->wqh.lock (in poll_wait)
  123. * count = ctx->count
  124. * __add_wait_queue
  125. * unlock ctx->wqh.lock
  126. * lock ctx->qwh.lock
  127. * ctx->count += n
  128. * if (waitqueue_active)
  129. * wake_up_locked_poll
  130. * unlock ctx->qwh.lock
  131. * eventfd_poll returns 0
  132. *
  133. * but the following, which would miss a wakeup, cannot happen:
  134. *
  135. * poll write
  136. * ----------------- ------------
  137. * count = ctx->count (INVALID!)
  138. * lock ctx->qwh.lock
  139. * ctx->count += n
  140. * **waitqueue_active is false**
  141. * **no wake_up_locked_poll!**
  142. * unlock ctx->qwh.lock
  143. * lock ctx->wqh.lock (in poll_wait)
  144. * __add_wait_queue
  145. * unlock ctx->wqh.lock
  146. * eventfd_poll returns 0
  147. */
  148. count = READ_ONCE(ctx->count);
  149. if (count > 0)
  150. events |= POLLIN;
  151. if (count == ULLONG_MAX)
  152. events |= POLLERR;
  153. if (ULLONG_MAX - 1 > count)
  154. events |= POLLOUT;
  155. return events;
  156. }
  157. static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
  158. {
  159. *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
  160. ctx->count -= *cnt;
  161. }
  162. /**
  163. * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
  164. * @ctx: [in] Pointer to eventfd context.
  165. * @wait: [in] Wait queue to be removed.
  166. * @cnt: [out] Pointer to the 64-bit counter value.
  167. *
  168. * Returns %0 if successful, or the following error codes:
  169. *
  170. * -EAGAIN : The operation would have blocked.
  171. *
  172. * This is used to atomically remove a wait queue entry from the eventfd wait
  173. * queue head, and read/reset the counter value.
  174. */
  175. int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
  176. __u64 *cnt)
  177. {
  178. unsigned long flags;
  179. spin_lock_irqsave(&ctx->wqh.lock, flags);
  180. eventfd_ctx_do_read(ctx, cnt);
  181. __remove_wait_queue(&ctx->wqh, wait);
  182. if (*cnt != 0 && waitqueue_active(&ctx->wqh))
  183. wake_up_locked_poll(&ctx->wqh, POLLOUT);
  184. spin_unlock_irqrestore(&ctx->wqh.lock, flags);
  185. return *cnt != 0 ? 0 : -EAGAIN;
  186. }
  187. EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
  188. /**
  189. * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
  190. * @ctx: [in] Pointer to eventfd context.
  191. * @no_wait: [in] Different from zero if the operation should not block.
  192. * @cnt: [out] Pointer to the 64-bit counter value.
  193. *
  194. * Returns %0 if successful, or the following error codes:
  195. *
  196. * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
  197. * -ERESTARTSYS : A signal interrupted the wait operation.
  198. *
  199. * If @no_wait is zero, the function might sleep until the eventfd internal
  200. * counter becomes greater than zero.
  201. */
  202. ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
  203. {
  204. ssize_t res;
  205. DECLARE_WAITQUEUE(wait, current);
  206. spin_lock_irq(&ctx->wqh.lock);
  207. *cnt = 0;
  208. res = -EAGAIN;
  209. if (ctx->count > 0)
  210. res = 0;
  211. else if (!no_wait) {
  212. __add_wait_queue(&ctx->wqh, &wait);
  213. for (;;) {
  214. set_current_state(TASK_INTERRUPTIBLE);
  215. if (ctx->count > 0) {
  216. res = 0;
  217. break;
  218. }
  219. if (signal_pending(current)) {
  220. res = -ERESTARTSYS;
  221. break;
  222. }
  223. spin_unlock_irq(&ctx->wqh.lock);
  224. schedule();
  225. spin_lock_irq(&ctx->wqh.lock);
  226. }
  227. __remove_wait_queue(&ctx->wqh, &wait);
  228. __set_current_state(TASK_RUNNING);
  229. }
  230. if (likely(res == 0)) {
  231. eventfd_ctx_do_read(ctx, cnt);
  232. if (waitqueue_active(&ctx->wqh))
  233. wake_up_locked_poll(&ctx->wqh, POLLOUT);
  234. }
  235. spin_unlock_irq(&ctx->wqh.lock);
  236. return res;
  237. }
  238. EXPORT_SYMBOL_GPL(eventfd_ctx_read);
  239. static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
  240. loff_t *ppos)
  241. {
  242. struct eventfd_ctx *ctx = file->private_data;
  243. ssize_t res;
  244. __u64 cnt;
  245. if (count < sizeof(cnt))
  246. return -EINVAL;
  247. res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
  248. if (res < 0)
  249. return res;
  250. return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
  251. }
  252. static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
  253. loff_t *ppos)
  254. {
  255. struct eventfd_ctx *ctx = file->private_data;
  256. ssize_t res;
  257. __u64 ucnt;
  258. DECLARE_WAITQUEUE(wait, current);
  259. if (count < sizeof(ucnt))
  260. return -EINVAL;
  261. if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
  262. return -EFAULT;
  263. if (ucnt == ULLONG_MAX)
  264. return -EINVAL;
  265. spin_lock_irq(&ctx->wqh.lock);
  266. res = -EAGAIN;
  267. if (ULLONG_MAX - ctx->count > ucnt)
  268. res = sizeof(ucnt);
  269. else if (!(file->f_flags & O_NONBLOCK)) {
  270. __add_wait_queue(&ctx->wqh, &wait);
  271. for (res = 0;;) {
  272. set_current_state(TASK_INTERRUPTIBLE);
  273. if (ULLONG_MAX - ctx->count > ucnt) {
  274. res = sizeof(ucnt);
  275. break;
  276. }
  277. if (signal_pending(current)) {
  278. res = -ERESTARTSYS;
  279. break;
  280. }
  281. spin_unlock_irq(&ctx->wqh.lock);
  282. schedule();
  283. spin_lock_irq(&ctx->wqh.lock);
  284. }
  285. __remove_wait_queue(&ctx->wqh, &wait);
  286. __set_current_state(TASK_RUNNING);
  287. }
  288. if (likely(res > 0)) {
  289. ctx->count += ucnt;
  290. if (waitqueue_active(&ctx->wqh))
  291. wake_up_locked_poll(&ctx->wqh, POLLIN);
  292. }
  293. spin_unlock_irq(&ctx->wqh.lock);
  294. return res;
  295. }
  296. #ifdef CONFIG_PROC_FS
  297. static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
  298. {
  299. struct eventfd_ctx *ctx = f->private_data;
  300. spin_lock_irq(&ctx->wqh.lock);
  301. seq_printf(m, "eventfd-count: %16llx\n",
  302. (unsigned long long)ctx->count);
  303. spin_unlock_irq(&ctx->wqh.lock);
  304. }
  305. #endif
  306. static const struct file_operations eventfd_fops = {
  307. #ifdef CONFIG_PROC_FS
  308. .show_fdinfo = eventfd_show_fdinfo,
  309. #endif
  310. .release = eventfd_release,
  311. .poll = eventfd_poll,
  312. .read = eventfd_read,
  313. .write = eventfd_write,
  314. .llseek = noop_llseek,
  315. };
  316. /**
  317. * eventfd_fget - Acquire a reference of an eventfd file descriptor.
  318. * @fd: [in] Eventfd file descriptor.
  319. *
  320. * Returns a pointer to the eventfd file structure in case of success, or the
  321. * following error pointer:
  322. *
  323. * -EBADF : Invalid @fd file descriptor.
  324. * -EINVAL : The @fd file descriptor is not an eventfd file.
  325. */
  326. struct file *eventfd_fget(int fd)
  327. {
  328. struct file *file;
  329. file = fget(fd);
  330. if (!file)
  331. return ERR_PTR(-EBADF);
  332. if (file->f_op != &eventfd_fops) {
  333. fput(file);
  334. return ERR_PTR(-EINVAL);
  335. }
  336. return file;
  337. }
  338. EXPORT_SYMBOL_GPL(eventfd_fget);
  339. /**
  340. * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
  341. * @fd: [in] Eventfd file descriptor.
  342. *
  343. * Returns a pointer to the internal eventfd context, otherwise the error
  344. * pointers returned by the following functions:
  345. *
  346. * eventfd_fget
  347. */
  348. struct eventfd_ctx *eventfd_ctx_fdget(int fd)
  349. {
  350. struct eventfd_ctx *ctx;
  351. struct fd f = fdget(fd);
  352. if (!f.file)
  353. return ERR_PTR(-EBADF);
  354. ctx = eventfd_ctx_fileget(f.file);
  355. fdput(f);
  356. return ctx;
  357. }
  358. EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
  359. /**
  360. * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
  361. * @file: [in] Eventfd file pointer.
  362. *
  363. * Returns a pointer to the internal eventfd context, otherwise the error
  364. * pointer:
  365. *
  366. * -EINVAL : The @fd file descriptor is not an eventfd file.
  367. */
  368. struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
  369. {
  370. if (file->f_op != &eventfd_fops)
  371. return ERR_PTR(-EINVAL);
  372. return eventfd_ctx_get(file->private_data);
  373. }
  374. EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
  375. /**
  376. * eventfd_file_create - Creates an eventfd file pointer.
  377. * @count: Initial eventfd counter value.
  378. * @flags: Flags for the eventfd file.
  379. *
  380. * This function creates an eventfd file pointer, w/out installing it into
  381. * the fd table. This is useful when the eventfd file is used during the
  382. * initialization of data structures that require extra setup after the eventfd
  383. * creation. So the eventfd creation is split into the file pointer creation
  384. * phase, and the file descriptor installation phase.
  385. * In this way races with userspace closing the newly installed file descriptor
  386. * can be avoided.
  387. * Returns an eventfd file pointer, or a proper error pointer.
  388. */
  389. struct file *eventfd_file_create(unsigned int count, int flags)
  390. {
  391. struct file *file;
  392. struct eventfd_ctx *ctx;
  393. /* Check the EFD_* constants for consistency. */
  394. BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
  395. BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
  396. if (flags & ~EFD_FLAGS_SET)
  397. return ERR_PTR(-EINVAL);
  398. ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
  399. if (!ctx)
  400. return ERR_PTR(-ENOMEM);
  401. kref_init(&ctx->kref);
  402. init_waitqueue_head(&ctx->wqh);
  403. ctx->count = count;
  404. ctx->flags = flags;
  405. file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
  406. O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
  407. if (IS_ERR(file))
  408. eventfd_free_ctx(ctx);
  409. return file;
  410. }
  411. SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
  412. {
  413. int fd, error;
  414. struct file *file;
  415. error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
  416. if (error < 0)
  417. return error;
  418. fd = error;
  419. file = eventfd_file_create(count, flags);
  420. if (IS_ERR(file)) {
  421. error = PTR_ERR(file);
  422. goto err_put_unused_fd;
  423. }
  424. fd_install(fd, file);
  425. return fd;
  426. err_put_unused_fd:
  427. put_unused_fd(fd);
  428. return error;
  429. }
  430. SYSCALL_DEFINE1(eventfd, unsigned int, count)
  431. {
  432. return sys_eventfd2(count, 0);
  433. }