blkback.c

  1. /******************************************************************************
  2. *
  3. * Back-end of the driver for virtual block devices. This portion of the
  4. * driver exports a 'unified' block-device interface that can be accessed
  5. * by any operating system that implements a compatible front end. A
  6. * reference front-end implementation can be found in:
  7. * drivers/block/xen-blkfront.c
  8. *
  9. * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
  10. * Copyright (c) 2005, Christopher Clark
  11. *
  12. * This program is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU General Public License version 2
  14. * as published by the Free Software Foundation; or, when distributed
  15. * separately from the Linux kernel or incorporated into other
  16. * software packages, subject to the following license:
  17. *
  18. * Permission is hereby granted, free of charge, to any person obtaining a copy
  19. * of this source file (the "Software"), to deal in the Software without
  20. * restriction, including without limitation the rights to use, copy, modify,
  21. * merge, publish, distribute, sublicense, and/or sell copies of the Software,
  22. * and to permit persons to whom the Software is furnished to do so, subject to
  23. * the following conditions:
  24. *
  25. * The above copyright notice and this permission notice shall be included in
  26. * all copies or substantial portions of the Software.
  27. *
  28. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  29. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  30. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  31. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  32. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  33. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  34. * IN THE SOFTWARE.
  35. */
  36. #define pr_fmt(fmt) "xen-blkback: " fmt
  37. #include <linux/spinlock.h>
  38. #include <linux/kthread.h>
  39. #include <linux/list.h>
  40. #include <linux/delay.h>
  41. #include <linux/freezer.h>
  42. #include <linux/bitmap.h>
  43. #include <xen/events.h>
  44. #include <xen/page.h>
  45. #include <xen/xen.h>
  46. #include <asm/xen/hypervisor.h>
  47. #include <asm/xen/hypercall.h>
  48. #include <xen/balloon.h>
  49. #include <xen/grant_table.h>
  50. #include "common.h"
  51. /*
  52. * Maximum number of unused free pages to keep in the internal buffer.
  53. * Setting this to a value too low will reduce memory used in each backend,
  54. * but can have a performance penalty.
  55. *
  56. * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
  57. * be set to a lower value that might degrade performance on some intensive
  58. * IO workloads.
  59. */
  60. static int xen_blkif_max_buffer_pages = 1024;
  61. module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
  62. MODULE_PARM_DESC(max_buffer_pages,
  63. "Maximum number of free pages to keep in each block backend buffer");
  64. /*
  65. * Maximum number of grants to map persistently in blkback. For maximum
  66. * performance this should be the total number of grants that can be used
  67. * to fill the ring, but since this might become too high, especially with
  68. * the use of indirect descriptors, we set it to a value that provides good
  69. * performance without using too much memory.
  70. *
  71. * When the list of persistent grants is full we clean it up using a LRU
  72. * algorithm.
  73. */
  74. static int xen_blkif_max_pgrants = 1056;
  75. module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
  76. MODULE_PARM_DESC(max_persistent_grants,
  77. "Maximum number of grants to map persistently");
  78. /*
  79. * Maximum number of rings/queues blkback supports; allow as many queues as there
  80. * are CPUs if the user has not specified a value.
  81. */
  82. unsigned int xenblk_max_queues;
  83. module_param_named(max_queues, xenblk_max_queues, uint, 0644);
  84. MODULE_PARM_DESC(max_queues,
  85. "Maximum number of hardware queues per virtual disk." \
  86. "By default it is the number of online CPUs.");
  87. /*
  88. * Maximum order of pages to be used for the shared ring between front and
  89. * backend, 4KB page granularity is used.
  90. */
  91. unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
  92. module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
  93. MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
  94. /*
  95. * The LRU mechanism to clean the lists of persistent grants needs to
  96. * be executed periodically. The time interval between consecutive executions
  97. * of the purge mechanism is set in ms.
  98. */
  99. #define LRU_INTERVAL 100
  100. /*
  101. * When the persistent grants list is full we will remove unused grants
  102. * from the list. The percentage of grants to be removed at each LRU
  103. * execution.
  104. */
  105. #define LRU_PERCENT_CLEAN 5
  106. /* Run-time switchable: /sys/module/blkback/parameters/ */
  107. static unsigned int log_stats;
  108. module_param(log_stats, int, 0644);
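/*
 * Example of run-time tuning (the exact sysfs directory depends on how the
 * module is named when built; commonly /sys/module/xen_blkback/parameters/):
 *   echo 1 > /sys/module/xen_blkback/parameters/log_stats
 */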
  109. #define BLKBACK_INVALID_HANDLE (~0)
  110. /* Number of free pages to remove on each call to gnttab_free_pages */
  111. #define NUM_BATCH_FREE_PAGES 10
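/*
 * Pop one page from the ring's free-page pool; if the pool is empty, fall
 * back to allocating a fresh page via gnttab_alloc_pages().
 */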
  112. static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
  113. {
  114. unsigned long flags;
  115. spin_lock_irqsave(&ring->free_pages_lock, flags);
  116. if (list_empty(&ring->free_pages)) {
  117. BUG_ON(ring->free_pages_num != 0);
  118. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  119. return gnttab_alloc_pages(1, page);
  120. }
  121. BUG_ON(ring->free_pages_num == 0);
  122. page[0] = list_first_entry(&ring->free_pages, struct page, lru);
  123. list_del(&page[0]->lru);
  124. ring->free_pages_num--;
  125. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  126. return 0;
  127. }
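/* Return 'num' pages to the ring's free-page pool. */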
  128. static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
  129. int num)
  130. {
  131. unsigned long flags;
  132. int i;
  133. spin_lock_irqsave(&ring->free_pages_lock, flags);
  134. for (i = 0; i < num; i++)
  135. list_add(&page[i]->lru, &ring->free_pages);
  136. ring->free_pages_num += num;
  137. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  138. }
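/* Shrink the free-page pool down to at most 'num' pages, releasing the rest. */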
  139. static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
  140. {
  141. /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
  142. struct page *page[NUM_BATCH_FREE_PAGES];
  143. unsigned int num_pages = 0;
  144. unsigned long flags;
  145. spin_lock_irqsave(&ring->free_pages_lock, flags);
  146. while (ring->free_pages_num > num) {
  147. BUG_ON(list_empty(&ring->free_pages));
  148. page[num_pages] = list_first_entry(&ring->free_pages,
  149. struct page, lru);
  150. list_del(&page[num_pages]->lru);
  151. ring->free_pages_num--;
  152. if (++num_pages == NUM_BATCH_FREE_PAGES) {
  153. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  154. gnttab_free_pages(num_pages, page);
  155. spin_lock_irqsave(&ring->free_pages_lock, flags);
  156. num_pages = 0;
  157. }
  158. }
  159. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  160. if (num_pages != 0)
  161. gnttab_free_pages(num_pages, page);
  162. }
  163. #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
  164. static int do_block_io_op(struct xen_blkif_ring *ring);
  165. static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
  166. struct blkif_request *req,
  167. struct pending_req *pending_req);
  168. static void make_response(struct xen_blkif_ring *ring, u64 id,
  169. unsigned short op, int st);
  170. #define foreach_grant_safe(pos, n, rbtree, node) \
  171. for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
  172. (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
  173. &(pos)->node != NULL; \
  174. (pos) = container_of(n, typeof(*(pos)), node), \
  175. (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
  176. /*
  177. * We don't need locking around the persistent grant helpers
  178. * because blkback uses a single thread for each backend, so we
  179. * can be sure that these functions will never be called recursively.
  180. *
  181. * The only exception to that is put_persistent_gnt, which can be called
  182. * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
  183. * bit operations to modify the flags of a persistent grant and to count
  184. * the number of used grants.
  185. */
  186. static int add_persistent_gnt(struct xen_blkif_ring *ring,
  187. struct persistent_gnt *persistent_gnt)
  188. {
  189. struct rb_node **new = NULL, *parent = NULL;
  190. struct persistent_gnt *this;
  191. struct xen_blkif *blkif = ring->blkif;
  192. if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
  193. if (!blkif->vbd.overflow_max_grants)
  194. blkif->vbd.overflow_max_grants = 1;
  195. return -EBUSY;
  196. }
  197. /* Figure out where to put new node */
  198. new = &ring->persistent_gnts.rb_node;
  199. while (*new) {
  200. this = container_of(*new, struct persistent_gnt, node);
  201. parent = *new;
  202. if (persistent_gnt->gnt < this->gnt)
  203. new = &((*new)->rb_left);
  204. else if (persistent_gnt->gnt > this->gnt)
  205. new = &((*new)->rb_right);
  206. else {
  207. pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
  208. return -EINVAL;
  209. }
  210. }
  211. bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
  212. set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
  213. /* Add new node and rebalance tree. */
  214. rb_link_node(&(persistent_gnt->node), parent, new);
  215. rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
  216. ring->persistent_gnt_c++;
  217. atomic_inc(&ring->persistent_gnt_in_use);
  218. return 0;
  219. }
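/*
 * Look up 'gref' in the ring's red-black tree of persistent grants and mark
 * it active; returns NULL if it is not in the tree or already in use.
 */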
  220. static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
  221. grant_ref_t gref)
  222. {
  223. struct persistent_gnt *data;
  224. struct rb_node *node = NULL;
  225. node = ring->persistent_gnts.rb_node;
  226. while (node) {
  227. data = container_of(node, struct persistent_gnt, node);
  228. if (gref < data->gnt)
  229. node = node->rb_left;
  230. else if (gref > data->gnt)
  231. node = node->rb_right;
  232. else {
  233. if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
  234. pr_alert_ratelimited("requesting a grant already in use\n");
  235. return NULL;
  236. }
  237. set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
  238. atomic_inc(&ring->persistent_gnt_in_use);
  239. return data;
  240. }
  241. }
  242. return NULL;
  243. }
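/* Mark a persistent grant as no longer in use; it stays in the rb-tree. */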
  244. static void put_persistent_gnt(struct xen_blkif_ring *ring,
  245. struct persistent_gnt *persistent_gnt)
  246. {
  247. if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
  248. pr_alert_ratelimited("freeing a grant already unused\n");
  249. set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
  250. clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
  251. atomic_dec(&ring->persistent_gnt_in_use);
  252. }
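/*
 * Unmap and free every persistent grant in 'root', batching the unmap
 * operations and returning the pages to the free pool.
 */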
  253. static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
  254. unsigned int num)
  255. {
  256. struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  257. struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  258. struct persistent_gnt *persistent_gnt;
  259. struct rb_node *n;
  260. int segs_to_unmap = 0;
  261. struct gntab_unmap_queue_data unmap_data;
  262. unmap_data.pages = pages;
  263. unmap_data.unmap_ops = unmap;
  264. unmap_data.kunmap_ops = NULL;
  265. foreach_grant_safe(persistent_gnt, n, root, node) {
  266. BUG_ON(persistent_gnt->handle ==
  267. BLKBACK_INVALID_HANDLE);
  268. gnttab_set_unmap_op(&unmap[segs_to_unmap],
  269. (unsigned long) pfn_to_kaddr(page_to_pfn(
  270. persistent_gnt->page)),
  271. GNTMAP_host_map,
  272. persistent_gnt->handle);
  273. pages[segs_to_unmap] = persistent_gnt->page;
  274. if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
  275. !rb_next(&persistent_gnt->node)) {
  276. unmap_data.count = segs_to_unmap;
  277. BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
  278. put_free_pages(ring, pages, segs_to_unmap);
  279. segs_to_unmap = 0;
  280. }
  281. rb_erase(&persistent_gnt->node, root);
  282. kfree(persistent_gnt);
  283. num--;
  284. }
  285. BUG_ON(num != 0);
  286. }
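/*
 * Deferred work scheduled by purge_persistent_gnt(): unmap the grants that
 * were moved to persistent_purge_list and return their pages to the pool.
 */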
  287. void xen_blkbk_unmap_purged_grants(struct work_struct *work)
  288. {
  289. struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  290. struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  291. struct persistent_gnt *persistent_gnt;
  292. int segs_to_unmap = 0;
  293. struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
  294. struct gntab_unmap_queue_data unmap_data;
  295. unmap_data.pages = pages;
  296. unmap_data.unmap_ops = unmap;
  297. unmap_data.kunmap_ops = NULL;
  298. while(!list_empty(&ring->persistent_purge_list)) {
  299. persistent_gnt = list_first_entry(&ring->persistent_purge_list,
  300. struct persistent_gnt,
  301. remove_node);
  302. list_del(&persistent_gnt->remove_node);
  303. gnttab_set_unmap_op(&unmap[segs_to_unmap],
  304. vaddr(persistent_gnt->page),
  305. GNTMAP_host_map,
  306. persistent_gnt->handle);
  307. pages[segs_to_unmap] = persistent_gnt->page;
  308. if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
  309. unmap_data.count = segs_to_unmap;
  310. BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
  311. put_free_pages(ring, pages, segs_to_unmap);
  312. segs_to_unmap = 0;
  313. }
  314. kfree(persistent_gnt);
  315. }
  316. if (segs_to_unmap > 0) {
  317. unmap_data.count = segs_to_unmap;
  318. BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
  319. put_free_pages(ring, pages, segs_to_unmap);
  320. }
  321. }
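/*
 * LRU cleanup of the persistent-grant tree: when the tree is full, move a
 * batch of unused grants to persistent_purge_list and schedule the work
 * item that actually unmaps them.
 */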
  322. static void purge_persistent_gnt(struct xen_blkif_ring *ring)
  323. {
  324. struct persistent_gnt *persistent_gnt;
  325. struct rb_node *n;
  326. unsigned int num_clean, total;
  327. bool scan_used = false, clean_used = false;
  328. struct rb_root *root;
  329. if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
  330. (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
  331. !ring->blkif->vbd.overflow_max_grants)) {
  332. goto out;
  333. }
  334. if (work_busy(&ring->persistent_purge_work)) {
  335. pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
  336. goto out;
  337. }
  338. num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
  339. num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
  340. num_clean = min(ring->persistent_gnt_c, num_clean);
  341. if ((num_clean == 0) ||
  342. (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
  343. goto out;
  344. /*
  345. * At this point, we can be sure that there will be no calls
  346. * to get_persistent_gnt (because we are executing this code from
  347. * xen_blkif_schedule); there can only be calls to put_persistent_gnt,
  348. * which means that the number of currently used grants will go down,
  349. * but never up, so we will always be able to remove the requested
  350. * number of grants.
  351. */
  352. total = num_clean;
  353. pr_debug("Going to purge %u persistent grants\n", num_clean);
  354. BUG_ON(!list_empty(&ring->persistent_purge_list));
  355. root = &ring->persistent_gnts;
  356. purge_list:
  357. foreach_grant_safe(persistent_gnt, n, root, node) {
  358. BUG_ON(persistent_gnt->handle ==
  359. BLKBACK_INVALID_HANDLE);
  360. if (clean_used) {
  361. clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
  362. continue;
  363. }
  364. if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
  365. continue;
  366. if (!scan_used &&
  367. (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
  368. continue;
  369. rb_erase(&persistent_gnt->node, root);
  370. list_add(&persistent_gnt->remove_node,
  371. &ring->persistent_purge_list);
  372. if (--num_clean == 0)
  373. goto finished;
  374. }
  375. /*
  376. * If we get here it means we also need to start cleaning
  377. * grants that were used since the last purge in order to cope
  378. * with the requested number
  379. */
  380. if (!scan_used && !clean_used) {
  381. pr_debug("Still missing %u purged frames\n", num_clean);
  382. scan_used = true;
  383. goto purge_list;
  384. }
  385. finished:
  386. if (!clean_used) {
  387. pr_debug("Finished scanning for grants to clean, removing used flag\n");
  388. clean_used = true;
  389. goto purge_list;
  390. }
  391. ring->persistent_gnt_c -= (total - num_clean);
  392. ring->blkif->vbd.overflow_max_grants = 0;
  393. /* We can defer this work */
  394. schedule_work(&ring->persistent_purge_work);
  395. pr_debug("Purged %u/%u\n", (total - num_clean), total);
  396. out:
  397. return;
  398. }
  399. /*
  400. * Retrieve a free pending_req structure from the 'pending_free' list.
  401. */
  402. static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
  403. {
  404. struct pending_req *req = NULL;
  405. unsigned long flags;
  406. spin_lock_irqsave(&ring->pending_free_lock, flags);
  407. if (!list_empty(&ring->pending_free)) {
  408. req = list_entry(ring->pending_free.next, struct pending_req,
  409. free_list);
  410. list_del(&req->free_list);
  411. }
  412. spin_unlock_irqrestore(&ring->pending_free_lock, flags);
  413. return req;
  414. }
  415. /*
  416. * Return the 'pending_req' structure to the free pool. We also
  417. * wake up the thread if it was waiting for a free pending_req.
  418. */
  419. static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
  420. {
  421. unsigned long flags;
  422. int was_empty;
  423. spin_lock_irqsave(&ring->pending_free_lock, flags);
  424. was_empty = list_empty(&ring->pending_free);
  425. list_add(&req->free_list, &ring->pending_free);
  426. spin_unlock_irqrestore(&ring->pending_free_lock, flags);
  427. if (was_empty)
  428. wake_up(&ring->pending_free_wq);
  429. }
  430. /*
  431. * Routines for managing virtual block devices (vbds).
  432. */
  433. static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
  434. int operation)
  435. {
  436. struct xen_vbd *vbd = &blkif->vbd;
  437. int rc = -EACCES;
  438. if ((operation != REQ_OP_READ) && vbd->readonly)
  439. goto out;
  440. if (likely(req->nr_sects)) {
  441. blkif_sector_t end = req->sector_number + req->nr_sects;
  442. if (unlikely(end < req->sector_number))
  443. goto out;
  444. if (unlikely(end > vbd_sz(vbd)))
  445. goto out;
  446. }
  447. req->dev = vbd->pdevice;
  448. req->bdev = vbd->bdev;
  449. rc = 0;
  450. out:
  451. return rc;
  452. }
  453. static void xen_vbd_resize(struct xen_blkif *blkif)
  454. {
  455. struct xen_vbd *vbd = &blkif->vbd;
  456. struct xenbus_transaction xbt;
  457. int err;
  458. struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
  459. unsigned long long new_size = vbd_sz(vbd);
  460. pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
  461. blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
  462. pr_info("VBD Resize: new size %llu\n", new_size);
  463. vbd->size = new_size;
  464. again:
  465. err = xenbus_transaction_start(&xbt);
  466. if (err) {
  467. pr_warn("Error starting transaction\n");
  468. return;
  469. }
  470. err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
  471. (unsigned long long)vbd_sz(vbd));
  472. if (err) {
  473. pr_warn("Error writing new size\n");
  474. goto abort;
  475. }
  476. /*
  477. * Write the current state; we will use this to synchronize
  478. * the front-end. If the current state is "connected" the
  479. * front-end will get the new size information online.
  480. */
  481. err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
  482. if (err) {
  483. pr_warn("Error writing the state\n");
  484. goto abort;
  485. }
  486. err = xenbus_transaction_end(xbt, 0);
  487. if (err == -EAGAIN)
  488. goto again;
  489. if (err)
  490. pr_warn("Error ending transaction\n");
  491. return;
  492. abort:
  493. xenbus_transaction_end(xbt, 1);
  494. }
  495. /*
  496. * Notification from the guest OS.
  497. */
  498. static void blkif_notify_work(struct xen_blkif_ring *ring)
  499. {
  500. ring->waiting_reqs = 1;
  501. wake_up(&ring->wq);
  502. }
  503. irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
  504. {
  505. blkif_notify_work(dev_id);
  506. return IRQ_HANDLED;
  507. }
  508. /*
  509. * SCHEDULER FUNCTIONS
  510. */
  511. static void print_stats(struct xen_blkif_ring *ring)
  512. {
  513. pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
  514. " | ds %4llu | pg: %4u/%4d\n",
  515. current->comm, ring->st_oo_req,
  516. ring->st_rd_req, ring->st_wr_req,
  517. ring->st_f_req, ring->st_ds_req,
  518. ring->persistent_gnt_c,
  519. xen_blkif_max_pgrants);
  520. ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
  521. ring->st_rd_req = 0;
  522. ring->st_wr_req = 0;
  523. ring->st_oo_req = 0;
  524. ring->st_ds_req = 0;
  525. }
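/*
 * Main loop of the per-ring kernel thread: wait for requests, process them,
 * and periodically purge persistent grants and shrink the free-page pool.
 */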
  526. int xen_blkif_schedule(void *arg)
  527. {
  528. struct xen_blkif_ring *ring = arg;
  529. struct xen_blkif *blkif = ring->blkif;
  530. struct xen_vbd *vbd = &blkif->vbd;
  531. unsigned long timeout;
  532. int ret;
  533. set_freezable();
  534. while (!kthread_should_stop()) {
  535. if (try_to_freeze())
  536. continue;
  537. if (unlikely(vbd->size != vbd_sz(vbd)))
  538. xen_vbd_resize(blkif);
  539. timeout = msecs_to_jiffies(LRU_INTERVAL);
  540. timeout = wait_event_interruptible_timeout(
  541. ring->wq,
  542. ring->waiting_reqs || kthread_should_stop(),
  543. timeout);
  544. if (timeout == 0)
  545. goto purge_gnt_list;
  546. timeout = wait_event_interruptible_timeout(
  547. ring->pending_free_wq,
  548. !list_empty(&ring->pending_free) ||
  549. kthread_should_stop(),
  550. timeout);
  551. if (timeout == 0)
  552. goto purge_gnt_list;
  553. ring->waiting_reqs = 0;
  554. smp_mb(); /* clear flag *before* checking for work */
  555. ret = do_block_io_op(ring);
  556. if (ret > 0)
  557. ring->waiting_reqs = 1;
  558. if (ret == -EACCES)
  559. wait_event_interruptible(ring->shutdown_wq,
  560. kthread_should_stop());
  561. purge_gnt_list:
  562. if (blkif->vbd.feature_gnt_persistent &&
  563. time_after(jiffies, ring->next_lru)) {
  564. purge_persistent_gnt(ring);
  565. ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
  566. }
  567. /* Shrink if we have more than xen_blkif_max_buffer_pages */
  568. shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
  569. if (log_stats && time_after(jiffies, ring->st_print))
  570. print_stats(ring);
  571. }
  572. /* Drain pending purge work */
  573. flush_work(&ring->persistent_purge_work);
  574. if (log_stats)
  575. print_stats(ring);
  576. ring->xenblkd = NULL;
  577. return 0;
  578. }
  579. /*
  580. * Remove persistent grants and empty the pool of free pages
  581. */
  582. void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
  583. {
  584. /* Free all persistent grant pages */
  585. if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
  586. free_persistent_gnts(ring, &ring->persistent_gnts,
  587. ring->persistent_gnt_c);
  588. BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
  589. ring->persistent_gnt_c = 0;
  590. /* Since we are shutting down remove all pages from the buffer */
  591. shrink_free_pagepool(ring, 0 /* All */);
  592. }
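/*
 * Build unmap ops for the non-persistent grants in 'pages'; persistent
 * grants are simply released back to the tree. Returns the number of unmap
 * ops prepared.
 */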
  593. static unsigned int xen_blkbk_unmap_prepare(
  594. struct xen_blkif_ring *ring,
  595. struct grant_page **pages,
  596. unsigned int num,
  597. struct gnttab_unmap_grant_ref *unmap_ops,
  598. struct page **unmap_pages)
  599. {
  600. unsigned int i, invcount = 0;
  601. for (i = 0; i < num; i++) {
  602. if (pages[i]->persistent_gnt != NULL) {
  603. put_persistent_gnt(ring, pages[i]->persistent_gnt);
  604. continue;
  605. }
  606. if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
  607. continue;
  608. unmap_pages[invcount] = pages[i]->page;
  609. gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
  610. GNTMAP_host_map, pages[i]->handle);
  611. pages[i]->handle = BLKBACK_INVALID_HANDLE;
  612. invcount++;
  613. }
  614. return invcount;
  615. }
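/*
 * Completion callback for the asynchronous unmap issued by
 * xen_blkbk_unmap_and_respond(): recycle the pages, send the response and
 * release the request and blkif references.
 */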
  616. static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
  617. {
  618. struct pending_req *pending_req = (struct pending_req *)(data->data);
  619. struct xen_blkif_ring *ring = pending_req->ring;
  620. struct xen_blkif *blkif = ring->blkif;
  621. /* BUG_ON used to reproduce existing behaviour,
  622. but is this the best way to deal with this? */
  623. BUG_ON(result);
  624. put_free_pages(ring, data->pages, data->count);
  625. make_response(ring, pending_req->id,
  626. pending_req->operation, pending_req->status);
  627. free_req(ring, pending_req);
  628. /*
  629. * Make sure the request is freed before releasing blkif,
  630. * or there could be a race between free_req and the
  631. * cleanup done in xen_blkif_free during shutdown.
  632. *
  633. * NB: The fact that we might try to wake up pending_free_wq
  634. * before drain_complete (in case there's a drain going on)
  635. * is not a problem with our current implementation
  636. * because we can be sure there's no thread waiting on
  637. * pending_free_wq if there's a drain going on, but it has
  638. * to be taken into account if the current model is changed.
  639. */
  640. if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
  641. complete(&blkif->drain_complete);
  642. }
  643. xen_blkif_put(blkif);
  644. }
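/*
 * Kick off an asynchronous unmap of a completed request's grants; the
 * response is sent from the completion callback above.
 */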
  645. static void xen_blkbk_unmap_and_respond(struct pending_req *req)
  646. {
  647. struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
  648. struct xen_blkif_ring *ring = req->ring;
  649. struct grant_page **pages = req->segments;
  650. unsigned int invcount;
  651. invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
  652. req->unmap, req->unmap_pages);
  653. work->data = req;
  654. work->done = xen_blkbk_unmap_and_respond_callback;
  655. work->unmap_ops = req->unmap;
  656. work->kunmap_ops = NULL;
  657. work->pages = req->unmap_pages;
  658. work->count = invcount;
  659. gnttab_unmap_refs_async(&req->gnttab_unmap_data);
  660. }
  661. /*
  662. * Unmap the grant references.
  663. *
  664. * This could accumulate ops up to the batch size to reduce the number
  665. * of hypercalls, but since this is only used in error paths there's
  666. * no real need.
  667. */
  668. static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
  669. struct grant_page *pages[],
  670. int num)
  671. {
  672. struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  673. struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  674. unsigned int invcount = 0;
  675. int ret;
  676. while (num) {
  677. unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
  678. invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
  679. unmap, unmap_pages);
  680. if (invcount) {
  681. ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
  682. BUG_ON(ret);
  683. put_free_pages(ring, unmap_pages, invcount);
  684. }
  685. pages += batch;
  686. num -= batch;
  687. }
  688. }
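/*
 * Map the frontend's grant references for 'pages', reusing persistent
 * grants where possible and adding newly mapped grants to the persistent
 * tree while there is room.
 */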
  689. static int xen_blkbk_map(struct xen_blkif_ring *ring,
  690. struct grant_page *pages[],
  691. int num, bool ro)
  692. {
  693. struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  694. struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  695. struct persistent_gnt *persistent_gnt = NULL;
  696. phys_addr_t addr = 0;
  697. int i, seg_idx, new_map_idx;
  698. int segs_to_map = 0;
  699. int ret = 0;
  700. int last_map = 0, map_until = 0;
  701. int use_persistent_gnts;
  702. struct xen_blkif *blkif = ring->blkif;
  703. use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
  704. /*
  705. * Fill out preq.nr_sects with the proper number of sectors, and set up
  706. * map[..] with the PFN of the page in our domain and the
  707. * corresponding grant reference for each page.
  708. */
  709. again:
  710. for (i = map_until; i < num; i++) {
  711. uint32_t flags;
  712. if (use_persistent_gnts) {
  713. persistent_gnt = get_persistent_gnt(
  714. ring,
  715. pages[i]->gref);
  716. }
  717. if (persistent_gnt) {
  718. /*
  719. * We are using persistent grants and
  720. * the grant is already mapped
  721. */
  722. pages[i]->page = persistent_gnt->page;
  723. pages[i]->persistent_gnt = persistent_gnt;
  724. } else {
  725. if (get_free_page(ring, &pages[i]->page))
  726. goto out_of_memory;
  727. addr = vaddr(pages[i]->page);
  728. pages_to_gnt[segs_to_map] = pages[i]->page;
  729. pages[i]->persistent_gnt = NULL;
  730. flags = GNTMAP_host_map;
  731. if (!use_persistent_gnts && ro)
  732. flags |= GNTMAP_readonly;
  733. gnttab_set_map_op(&map[segs_to_map++], addr,
  734. flags, pages[i]->gref,
  735. blkif->domid);
  736. }
  737. map_until = i + 1;
  738. if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
  739. break;
  740. }
  741. if (segs_to_map) {
  742. ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
  743. BUG_ON(ret);
  744. }
  745. /*
  746. * Now swizzle the MFN in our domain with the MFN from the other domain
  747. * so that when we access vaddr(pending_req,i) it has the contents of
  748. * the page from the other domain.
  749. */
  750. for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
  751. if (!pages[seg_idx]->persistent_gnt) {
  752. /* This is a newly mapped grant */
  753. BUG_ON(new_map_idx >= segs_to_map);
  754. if (unlikely(map[new_map_idx].status != 0)) {
  755. pr_debug("invalid buffer -- could not remap it\n");
  756. put_free_pages(ring, &pages[seg_idx]->page, 1);
  757. pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
  758. ret |= 1;
  759. goto next;
  760. }
  761. pages[seg_idx]->handle = map[new_map_idx].handle;
  762. } else {
  763. continue;
  764. }
  765. if (use_persistent_gnts &&
  766. ring->persistent_gnt_c < xen_blkif_max_pgrants) {
  767. /*
  768. * We are using persistent grants, the grant is
  769. * not mapped but we might have room for it.
  770. */
  771. persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
  772. GFP_KERNEL);
  773. if (!persistent_gnt) {
  774. /*
  775. * If we don't have enough memory to
  776. * allocate the persistent_gnt struct
  777. * map this grant non-persistently
  778. */
  779. goto next;
  780. }
  781. persistent_gnt->gnt = map[new_map_idx].ref;
  782. persistent_gnt->handle = map[new_map_idx].handle;
  783. persistent_gnt->page = pages[seg_idx]->page;
  784. if (add_persistent_gnt(ring,
  785. persistent_gnt)) {
  786. kfree(persistent_gnt);
  787. persistent_gnt = NULL;
  788. goto next;
  789. }
  790. pages[seg_idx]->persistent_gnt = persistent_gnt;
  791. pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
  792. persistent_gnt->gnt, ring->persistent_gnt_c,
  793. xen_blkif_max_pgrants);
  794. goto next;
  795. }
  796. if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
  797. blkif->vbd.overflow_max_grants = 1;
  798. pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
  799. blkif->domid, blkif->vbd.handle);
  800. }
  801. /*
  802. * We could not map this grant persistently, so use it as
  803. * a non-persistent grant.
  804. */
  805. next:
  806. new_map_idx++;
  807. }
  808. segs_to_map = 0;
  809. last_map = map_until;
  810. if (map_until != num)
  811. goto again;
  812. return ret;
  813. out_of_memory:
  814. pr_alert("%s: out of memory\n", __func__);
  815. put_free_pages(ring, pages_to_gnt, segs_to_map);
  816. return -ENOMEM;
  817. }
  818. static int xen_blkbk_map_seg(struct pending_req *pending_req)
  819. {
  820. int rc;
  821. rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
  822. pending_req->nr_segs,
  823. (pending_req->operation != BLKIF_OP_READ));
  824. return rc;
  825. }
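/*
 * Map the indirect descriptor pages and copy the segment information they
 * contain into 'seg', validating each segment's sector range.
 */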
  826. static int xen_blkbk_parse_indirect(struct blkif_request *req,
  827. struct pending_req *pending_req,
  828. struct seg_buf seg[],
  829. struct phys_req *preq)
  830. {
  831. struct grant_page **pages = pending_req->indirect_pages;
  832. struct xen_blkif_ring *ring = pending_req->ring;
  833. int indirect_grefs, rc, n, nseg, i;
  834. struct blkif_request_segment *segments = NULL;
  835. nseg = pending_req->nr_segs;
  836. indirect_grefs = INDIRECT_PAGES(nseg);
  837. BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
  838. for (i = 0; i < indirect_grefs; i++)
  839. pages[i]->gref = req->u.indirect.indirect_grefs[i];
  840. rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
  841. if (rc)
  842. goto unmap;
  843. for (n = 0, i = 0; n < nseg; n++) {
  844. uint8_t first_sect, last_sect;
  845. if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
  846. /* Map indirect segments */
  847. if (segments)
  848. kunmap_atomic(segments);
  849. segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
  850. }
  851. i = n % SEGS_PER_INDIRECT_FRAME;
  852. pending_req->segments[n]->gref = segments[i].gref;
  853. first_sect = READ_ONCE(segments[i].first_sect);
  854. last_sect = READ_ONCE(segments[i].last_sect);
  855. if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
  856. rc = -EINVAL;
  857. goto unmap;
  858. }
  859. seg[n].nsec = last_sect - first_sect + 1;
  860. seg[n].offset = first_sect << 9;
  861. preq->nr_sects += seg[n].nsec;
  862. }
  863. unmap:
  864. if (segments)
  865. kunmap_atomic(segments);
  866. xen_blkbk_unmap(ring, pages, indirect_grefs);
  867. return rc;
  868. }
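/* Handle a BLKIF_OP_DISCARD request by issuing a (possibly secure) discard. */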
  869. static int dispatch_discard_io(struct xen_blkif_ring *ring,
  870. struct blkif_request *req)
  871. {
  872. int err = 0;
  873. int status = BLKIF_RSP_OKAY;
  874. struct xen_blkif *blkif = ring->blkif;
  875. struct block_device *bdev = blkif->vbd.bdev;
  876. unsigned long secure;
  877. struct phys_req preq;
  878. xen_blkif_get(blkif);
  879. preq.sector_number = req->u.discard.sector_number;
  880. preq.nr_sects = req->u.discard.nr_sectors;
  881. err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
  882. if (err) {
  883. pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
  884. preq.sector_number,
  885. preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
  886. goto fail_response;
  887. }
  888. ring->st_ds_req++;
  889. secure = (blkif->vbd.discard_secure &&
  890. (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
  891. BLKDEV_DISCARD_SECURE : 0;
  892. err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
  893. req->u.discard.nr_sectors,
  894. GFP_KERNEL, secure);
  895. fail_response:
  896. if (err == -EOPNOTSUPP) {
  897. pr_debug("discard op failed, not supported\n");
  898. status = BLKIF_RSP_EOPNOTSUPP;
  899. } else if (err)
  900. status = BLKIF_RSP_ERROR;
  901. make_response(ring, req->u.discard.id, req->operation, status);
  902. xen_blkif_put(blkif);
  903. return err;
  904. }
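/* Reject requests with an unknown operation: respond with BLKIF_RSP_EOPNOTSUPP. */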
  905. static int dispatch_other_io(struct xen_blkif_ring *ring,
  906. struct blkif_request *req,
  907. struct pending_req *pending_req)
  908. {
  909. free_req(ring, pending_req);
  910. make_response(ring, req->u.other.id, req->operation,
  911. BLKIF_RSP_EOPNOTSUPP);
  912. return -EIO;
  913. }
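/* Wait until all in-flight I/O on this ring has completed (used for barriers). */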
  914. static void xen_blk_drain_io(struct xen_blkif_ring *ring)
  915. {
  916. struct xen_blkif *blkif = ring->blkif;
  917. atomic_set(&blkif->drain, 1);
  918. do {
  919. if (atomic_read(&ring->inflight) == 0)
  920. break;
  921. wait_for_completion_interruptible_timeout(
  922. &blkif->drain_complete, HZ);
  923. if (!atomic_read(&blkif->drain))
  924. break;
  925. } while (!kthread_should_stop());
  926. atomic_set(&blkif->drain, 0);
  927. }
  928. /*
  929. * Completion callback on the bios, invoked via bio->bi_end_io().
  930. */
  931. static void __end_block_io_op(struct pending_req *pending_req, int error)
  932. {
  933. /* An error fails the entire request. */
  934. if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
  935. (error == -EOPNOTSUPP)) {
  936. pr_debug("flush diskcache op failed, not supported\n");
  937. xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
  938. pending_req->status = BLKIF_RSP_EOPNOTSUPP;
  939. } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
  940. (error == -EOPNOTSUPP)) {
  941. pr_debug("write barrier op failed, not supported\n");
  942. xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
  943. pending_req->status = BLKIF_RSP_EOPNOTSUPP;
  944. } else if (error) {
  945. pr_debug("Buffer not up-to-date at end of operation,"
  946. " error=%d\n", error);
  947. pending_req->status = BLKIF_RSP_ERROR;
  948. }
  949. /*
  950. * If all of the bio's have completed it is time to unmap
  951. * the grant references associated with 'request' and provide
  952. * the proper response on the ring.
  953. */
  954. if (atomic_dec_and_test(&pending_req->pendcnt))
  955. xen_blkbk_unmap_and_respond(pending_req);
  956. }
  957. /*
  958. * bio callback.
  959. */
  960. static void end_block_io_op(struct bio *bio)
  961. {
  962. __end_block_io_op(bio->bi_private, bio->bi_error);
  963. bio_put(bio);
  964. }
  965. /*
  966. * Function to copy the 'struct blkif_request' from the ring buffer
  967. * (which has the sectors we want, number of them, grant references, etc),
  968. * and transmute it to the block API to hand it over to the proper block disk.
  969. */
  970. static int
  971. __do_block_io_op(struct xen_blkif_ring *ring)
  972. {
  973. union blkif_back_rings *blk_rings = &ring->blk_rings;
  974. struct blkif_request req;
  975. struct pending_req *pending_req;
  976. RING_IDX rc, rp;
  977. int more_to_do = 0;
  978. rc = blk_rings->common.req_cons;
  979. rp = blk_rings->common.sring->req_prod;
  980. rmb(); /* Ensure we see queued requests up to 'rp'. */
  981. if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
  982. rc = blk_rings->common.rsp_prod_pvt;
  983. pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
  984. rp, rc, rp - rc, ring->blkif->vbd.pdevice);
  985. return -EACCES;
  986. }
  987. while (rc != rp) {
  988. if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
  989. break;
  990. if (kthread_should_stop()) {
  991. more_to_do = 1;
  992. break;
  993. }
  994. pending_req = alloc_req(ring);
  995. if (NULL == pending_req) {
  996. ring->st_oo_req++;
  997. more_to_do = 1;
  998. break;
  999. }
  1000. switch (ring->blkif->blk_protocol) {
  1001. case BLKIF_PROTOCOL_NATIVE:
  1002. memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
  1003. break;
  1004. case BLKIF_PROTOCOL_X86_32:
  1005. blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
  1006. break;
  1007. case BLKIF_PROTOCOL_X86_64:
  1008. blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
  1009. break;
  1010. default:
  1011. BUG();
  1012. }
  1013. blk_rings->common.req_cons = ++rc; /* before make_response() */
  1014. /* Apply all sanity checks to /private copy/ of request. */
  1015. barrier();
  1016. switch (req.operation) {
  1017. case BLKIF_OP_READ:
  1018. case BLKIF_OP_WRITE:
  1019. case BLKIF_OP_WRITE_BARRIER:
  1020. case BLKIF_OP_FLUSH_DISKCACHE:
  1021. case BLKIF_OP_INDIRECT:
  1022. if (dispatch_rw_block_io(ring, &req, pending_req))
  1023. goto done;
  1024. break;
  1025. case BLKIF_OP_DISCARD:
  1026. free_req(ring, pending_req);
  1027. if (dispatch_discard_io(ring, &req))
  1028. goto done;
  1029. break;
  1030. default:
  1031. if (dispatch_other_io(ring, &req, pending_req))
  1032. goto done;
  1033. break;
  1034. }
  1035. /* Yield point for this unbounded loop. */
  1036. cond_resched();
  1037. }
  1038. done:
  1039. return more_to_do;
  1040. }
  1041. static int
  1042. do_block_io_op(struct xen_blkif_ring *ring)
  1043. {
  1044. union blkif_back_rings *blk_rings = &ring->blk_rings;
  1045. int more_to_do;
  1046. do {
  1047. more_to_do = __do_block_io_op(ring);
  1048. if (more_to_do)
  1049. break;
  1050. RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
  1051. } while (more_to_do);
  1052. return more_to_do;
  1053. }
  1054. /*
  1055. * Transmute the 'struct blkif_request' into a proper 'struct bio'
  1056. * and call 'submit_bio' to pass it to the underlying storage.
  1057. */
  1058. static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
  1059. struct blkif_request *req,
  1060. struct pending_req *pending_req)
  1061. {
  1062. struct phys_req preq;
  1063. struct seg_buf *seg = pending_req->seg;
  1064. unsigned int nseg;
  1065. struct bio *bio = NULL;
  1066. struct bio **biolist = pending_req->biolist;
  1067. int i, nbio = 0;
  1068. int operation;
  1069. int operation_flags = 0;
  1070. struct blk_plug plug;
  1071. bool drain = false;
  1072. struct grant_page **pages = pending_req->segments;
  1073. unsigned short req_operation;
  1074. req_operation = req->operation == BLKIF_OP_INDIRECT ?
  1075. req->u.indirect.indirect_op : req->operation;
  1076. if ((req->operation == BLKIF_OP_INDIRECT) &&
  1077. (req_operation != BLKIF_OP_READ) &&
  1078. (req_operation != BLKIF_OP_WRITE)) {
  1079. pr_debug("Invalid indirect operation (%u)\n", req_operation);
  1080. goto fail_response;
  1081. }
  1082. switch (req_operation) {
  1083. case BLKIF_OP_READ:
  1084. ring->st_rd_req++;
  1085. operation = REQ_OP_READ;
  1086. break;
  1087. case BLKIF_OP_WRITE:
  1088. ring->st_wr_req++;
  1089. operation = REQ_OP_WRITE;
  1090. operation_flags = WRITE_ODIRECT;
  1091. break;
  1092. case BLKIF_OP_WRITE_BARRIER:
  1093. drain = true;
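/* fall through: a barrier is implemented as a drain plus a flush */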
  1094. case BLKIF_OP_FLUSH_DISKCACHE:
  1095. ring->st_f_req++;
  1096. operation = REQ_OP_WRITE;
  1097. operation_flags = WRITE_FLUSH;
  1098. break;
  1099. default:
  1100. operation = 0; /* make gcc happy */
  1101. goto fail_response;
  1102. break;
  1103. }
  1104. /* Check that the number of segments is sane. */
  1105. nseg = req->operation == BLKIF_OP_INDIRECT ?
  1106. req->u.indirect.nr_segments : req->u.rw.nr_segments;
  1107. if (unlikely(nseg == 0 && operation_flags != WRITE_FLUSH) ||
  1108. unlikely((req->operation != BLKIF_OP_INDIRECT) &&
  1109. (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
  1110. unlikely((req->operation == BLKIF_OP_INDIRECT) &&
  1111. (nseg > MAX_INDIRECT_SEGMENTS))) {
  1112. pr_debug("Bad number of segments in request (%d)\n", nseg);
  1113. /* Haven't submitted any bio's yet. */
  1114. goto fail_response;
  1115. }
  1116. preq.nr_sects = 0;
  1117. pending_req->ring = ring;
  1118. pending_req->id = req->u.rw.id;
  1119. pending_req->operation = req_operation;
  1120. pending_req->status = BLKIF_RSP_OKAY;
  1121. pending_req->nr_segs = nseg;
  1122. if (req->operation != BLKIF_OP_INDIRECT) {
  1123. preq.dev = req->u.rw.handle;
  1124. preq.sector_number = req->u.rw.sector_number;
  1125. for (i = 0; i < nseg; i++) {
  1126. pages[i]->gref = req->u.rw.seg[i].gref;
  1127. seg[i].nsec = req->u.rw.seg[i].last_sect -
  1128. req->u.rw.seg[i].first_sect + 1;
  1129. seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
  1130. if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
  1131. (req->u.rw.seg[i].last_sect <
  1132. req->u.rw.seg[i].first_sect))
  1133. goto fail_response;
  1134. preq.nr_sects += seg[i].nsec;
  1135. }
  1136. } else {
  1137. preq.dev = req->u.indirect.handle;
  1138. preq.sector_number = req->u.indirect.sector_number;
  1139. if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
  1140. goto fail_response;
  1141. }
  1142. if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
  1143. pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
  1144. operation == REQ_OP_READ ? "read" : "write",
  1145. preq.sector_number,
  1146. preq.sector_number + preq.nr_sects,
  1147. ring->blkif->vbd.pdevice);
  1148. goto fail_response;
  1149. }
  1150. /*
  1151. * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
  1152. * is set there.
  1153. */
  1154. for (i = 0; i < nseg; i++) {
  1155. if (((int)preq.sector_number|(int)seg[i].nsec) &
  1156. ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
  1157. pr_debug("Misaligned I/O request from domain %d\n",
  1158. ring->blkif->domid);
  1159. goto fail_response;
  1160. }
  1161. }
  1162. /* Wait on all outstanding I/O's and once that has been completed
  1163. * issue the WRITE_FLUSH.
  1164. */
  1165. if (drain)
  1166. xen_blk_drain_io(pending_req->ring);
  1167. /*
  1168. * If we have failed at this point, we need to undo the M2P override,
  1169. * set gnttab_set_unmap_op on all of the grant references and perform
  1170. * the hypercall to unmap the grants - that is all done in
  1171. * xen_blkbk_unmap.
  1172. */
  1173. if (xen_blkbk_map_seg(pending_req))
  1174. goto fail_flush;
  1175. /*
  1176. * This corresponding xen_blkif_put is done in __end_block_io_op, or
  1177. * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
  1178. */
  1179. xen_blkif_get(ring->blkif);
  1180. atomic_inc(&ring->inflight);
  1181. for (i = 0; i < nseg; i++) {
  1182. while ((bio == NULL) ||
  1183. (bio_add_page(bio,
  1184. pages[i]->page,
  1185. seg[i].nsec << 9,
  1186. seg[i].offset) == 0)) {
  1187. int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
  1188. bio = bio_alloc(GFP_KERNEL, nr_iovecs);
  1189. if (unlikely(bio == NULL))
  1190. goto fail_put_bio;
  1191. biolist[nbio++] = bio;
  1192. bio->bi_bdev = preq.bdev;
  1193. bio->bi_private = pending_req;
  1194. bio->bi_end_io = end_block_io_op;
  1195. bio->bi_iter.bi_sector = preq.sector_number;
  1196. bio_set_op_attrs(bio, operation, operation_flags);
  1197. }
  1198. preq.sector_number += seg[i].nsec;
  1199. }
  1200. /* This will be hit if the operation was a flush or discard. */
  1201. if (!bio) {
  1202. BUG_ON(operation_flags != WRITE_FLUSH);
  1203. bio = bio_alloc(GFP_KERNEL, 0);
  1204. if (unlikely(bio == NULL))
  1205. goto fail_put_bio;
  1206. biolist[nbio++] = bio;
  1207. bio->bi_bdev = preq.bdev;
  1208. bio->bi_private = pending_req;
  1209. bio->bi_end_io = end_block_io_op;
  1210. bio_set_op_attrs(bio, operation, operation_flags);
  1211. }
  1212. atomic_set(&pending_req->pendcnt, nbio);
  1213. blk_start_plug(&plug);
  1214. for (i = 0; i < nbio; i++)
  1215. submit_bio(biolist[i]);
  1216. /* Let the I/Os go.. */
  1217. blk_finish_plug(&plug);
  1218. if (operation == REQ_OP_READ)
  1219. ring->st_rd_sect += preq.nr_sects;
  1220. else if (operation == REQ_OP_WRITE)
  1221. ring->st_wr_sect += preq.nr_sects;
  1222. return 0;
  1223. fail_flush:
  1224. xen_blkbk_unmap(ring, pending_req->segments,
  1225. pending_req->nr_segs);
  1226. fail_response:
  1227. /* Haven't submitted any bio's yet. */
  1228. make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
  1229. free_req(ring, pending_req);
  1230. msleep(1); /* back off a bit */
  1231. return -EIO;
  1232. fail_put_bio:
  1233. for (i = 0; i < nbio; i++)
  1234. bio_put(biolist[i]);
  1235. atomic_set(&pending_req->pendcnt, 1);
  1236. __end_block_io_op(pending_req, -EINVAL);
  1237. msleep(1); /* back off a bit */
  1238. return -EIO;
  1239. }
  1240. /*
  1241. * Put a response on the ring on how the operation fared.
  1242. */
  1243. static void make_response(struct xen_blkif_ring *ring, u64 id,
  1244. unsigned short op, int st)
  1245. {
  1246. struct blkif_response *resp;
  1247. unsigned long flags;
  1248. union blkif_back_rings *blk_rings;
  1249. int notify;
  1250. spin_lock_irqsave(&ring->blk_ring_lock, flags);
  1251. blk_rings = &ring->blk_rings;
  1252. /* Place on the response ring for the relevant domain. */
  1253. switch (ring->blkif->blk_protocol) {
  1254. case BLKIF_PROTOCOL_NATIVE:
  1255. resp = RING_GET_RESPONSE(&blk_rings->native,
  1256. blk_rings->native.rsp_prod_pvt);
  1257. break;
  1258. case BLKIF_PROTOCOL_X86_32:
  1259. resp = RING_GET_RESPONSE(&blk_rings->x86_32,
  1260. blk_rings->x86_32.rsp_prod_pvt);
  1261. break;
  1262. case BLKIF_PROTOCOL_X86_64:
  1263. resp = RING_GET_RESPONSE(&blk_rings->x86_64,
  1264. blk_rings->x86_64.rsp_prod_pvt);
  1265. break;
  1266. default:
  1267. BUG();
  1268. }
  1269. resp->id = id;
  1270. resp->operation = op;
  1271. resp->status = st;
  1272. blk_rings->common.rsp_prod_pvt++;
  1273. RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
  1274. spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
  1275. if (notify)
  1276. notify_remote_via_irq(ring->irq);
  1277. }
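/*
 * Module init: validate the module parameters, then register the blkif
 * interface and the xenbus backend driver.
 */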
  1278. static int __init xen_blkif_init(void)
  1279. {
  1280. int rc = 0;
  1281. if (!xen_domain())
  1282. return -ENODEV;
  1283. if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
  1284. pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
  1285. xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
  1286. xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
  1287. }
  1288. if (xenblk_max_queues == 0)
  1289. xenblk_max_queues = num_online_cpus();
  1290. rc = xen_blkif_interface_init();
  1291. if (rc)
  1292. goto failed_init;
  1293. rc = xen_blkif_xenbus_init();
  1294. if (rc)
  1295. goto failed_init;
  1296. failed_init:
  1297. return rc;
  1298. }
  1299. module_init(xen_blkif_init);
  1300. MODULE_LICENSE("Dual BSD/GPL");
  1301. MODULE_ALIAS("xen-backend:vbd");