drivers/md/dm-writecache.c

DEFINITIONS

This source file includes the following definitions:
  1. wc_lock
  2. wc_unlock
  3. persistent_memory_claim
  4. persistent_memory_claim
  5. persistent_memory_release
  6. persistent_memory_page
  7. persistent_memory_page_offset
  8. persistent_memory_flush_cache
  9. persistent_memory_invalidate_cache
  10. sb
  11. memory_data
  12. cache_sector
  13. read_original_sector
  14. read_seq_count
  15. clear_seq_count
  16. write_original_sector_seq_count
  17. writecache_flush_all_metadata
  18. writecache_flush_region
  19. writecache_notify_io
  20. writecache_wait_for_ios
  21. ssd_commit_flushed
  22. writecache_commit_flushed
  23. writecache_disk_flush
  24. writecache_find_entry
  25. writecache_insert_entry
  26. writecache_unlink
  27. writecache_add_to_freelist
  28. writecache_verify_watermark
  29. writecache_pop_from_freelist
  30. writecache_free_entry
  31. writecache_wait_on_freelist
  32. writecache_poison_lists
  33. writecache_flush_entry
  34. writecache_entry_is_committed
  35. writecache_flush
  36. writecache_flush_work
  37. writecache_autocommit_timer
  38. writecache_schedule_autocommit
  39. writecache_discard
  40. writecache_wait_for_writeback
  41. writecache_suspend
  42. writecache_alloc_entries
  43. writecache_read_metadata
  44. writecache_resume
  45. process_flush_mesg
  46. process_flush_on_suspend_mesg
  47. writecache_message
  48. bio_copy_block
  49. writecache_flush_thread
  50. writecache_offload_bio
  51. writecache_map
  52. writecache_end_io
  53. writecache_iterate_devices
  54. writecache_io_hints
  55. writecache_writeback_endio
  56. writecache_copy_endio
  57. __writecache_endio_pmem
  58. __writecache_endio_ssd
  59. writecache_endio_thread
  60. wc_add_block
  61. __writeback_throttle
  62. __writecache_writeback_pmem
  63. __writecache_writeback_ssd
  64. writecache_writeback
  65. calculate_memory_size
  66. init_memory
  67. writecache_dtr
  68. writecache_ctr
  69. writecache_status
  70. dm_writecache_init
  71. dm_writecache_exit

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2018 Red Hat. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7 
   8 #include <linux/device-mapper.h>
   9 #include <linux/module.h>
  10 #include <linux/init.h>
  11 #include <linux/vmalloc.h>
  12 #include <linux/kthread.h>
  13 #include <linux/dm-io.h>
  14 #include <linux/dm-kcopyd.h>
  15 #include <linux/dax.h>
  16 #include <linux/pfn_t.h>
  17 #include <linux/libnvdimm.h>
  18 
  19 #define DM_MSG_PREFIX "writecache"
  20 
  21 #define HIGH_WATERMARK                  50
  22 #define LOW_WATERMARK                   45
  23 #define MAX_WRITEBACK_JOBS              0
  24 #define ENDIO_LATENCY                   16
  25 #define WRITEBACK_LATENCY               64
  26 #define AUTOCOMMIT_BLOCKS_SSD           65536
  27 #define AUTOCOMMIT_BLOCKS_PMEM          64
  28 #define AUTOCOMMIT_MSEC                 1000
  29 
  30 #define BITMAP_GRANULARITY      65536
  31 #if BITMAP_GRANULARITY < PAGE_SIZE
  32 #undef BITMAP_GRANULARITY
  33 #define BITMAP_GRANULARITY      PAGE_SIZE
  34 #endif
  35 
  36 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
  37 #define DM_WRITECACHE_HAS_PMEM
  38 #endif
  39 
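     /*
      * pmem_assign() stores a value into persistent memory: with the pmem API
      * available it goes through memcpy_flushcache() so the new value is
      * pushed out of the CPU cache, otherwise it is a plain assignment.
      */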
  40 #ifdef DM_WRITECACHE_HAS_PMEM
  41 #define pmem_assign(dest, src)                                  \
  42 do {                                                            \
  43         typeof(dest) uniq = (src);                              \
  44         memcpy_flushcache(&(dest), &uniq, sizeof(dest));        \
  45 } while (0)
  46 #else
  47 #define pmem_assign(dest, src)  ((dest) = (src))
  48 #endif
  49 
  50 #if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
  51 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  52 #endif
  53 
  54 #define MEMORY_SUPERBLOCK_MAGIC         0x23489321
  55 #define MEMORY_SUPERBLOCK_VERSION       1
  56 
  57 struct wc_memory_entry {
  58         __le64 original_sector;
  59         __le64 seq_count;
  60 };
  61 
  62 struct wc_memory_superblock {
  63         union {
  64                 struct {
  65                         __le32 magic;
  66                         __le32 version;
  67                         __le32 block_size;
  68                         __le32 pad;
  69                         __le64 n_blocks;
  70                         __le64 seq_count;
  71                 };
  72                 __le64 padding[8];
  73         };
  74         struct wc_memory_entry entries[0];
  75 };
  76 
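     /*
      * In-core descriptor of one cache block. rb_node and lru link the entry
      * into the lookup tree and the LRU (or free) lists; "index" is the
      * block's position in the cache. With hardware-error handling enabled,
      * original_sector and seq_count are shadowed here so the driver does not
      * have to re-read possibly faulty persistent memory.
      */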
  77 struct wc_entry {
  78         struct rb_node rb_node;
  79         struct list_head lru;
  80         unsigned short wc_list_contiguous;
  81         bool write_in_progress
  82 #if BITS_PER_LONG == 64
  83                 :1
  84 #endif
  85         ;
  86         unsigned long index
  87 #if BITS_PER_LONG == 64
  88                 :47
  89 #endif
  90         ;
  91 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  92         uint64_t original_sector;
  93         uint64_t seq_count;
  94 #endif
  95 };
  96 
  97 #ifdef DM_WRITECACHE_HAS_PMEM
  98 #define WC_MODE_PMEM(wc)                        ((wc)->pmem_mode)
  99 #define WC_MODE_FUA(wc)                         ((wc)->writeback_fua)
 100 #else
 101 #define WC_MODE_PMEM(wc)                        false
 102 #define WC_MODE_FUA(wc)                         false
 103 #endif
 104 #define WC_MODE_SORT_FREELIST(wc)               (!WC_MODE_PMEM(wc))
 105 
 106 struct dm_writecache {
 107         struct mutex lock;
 108         struct list_head lru;
 109         union {
 110                 struct list_head freelist;
 111                 struct {
 112                         struct rb_root freetree;
 113                         struct wc_entry *current_free;
 114                 };
 115         };
 116         struct rb_root tree;
 117 
 118         size_t freelist_size;
 119         size_t writeback_size;
 120         size_t freelist_high_watermark;
 121         size_t freelist_low_watermark;
 122 
 123         unsigned uncommitted_blocks;
 124         unsigned autocommit_blocks;
 125         unsigned max_writeback_jobs;
 126 
 127         int error;
 128 
 129         unsigned long autocommit_jiffies;
 130         struct timer_list autocommit_timer;
 131         struct wait_queue_head freelist_wait;
 132 
 133         atomic_t bio_in_progress[2];
 134         struct wait_queue_head bio_in_progress_wait[2];
 135 
 136         struct dm_target *ti;
 137         struct dm_dev *dev;
 138         struct dm_dev *ssd_dev;
 139         sector_t start_sector;
 140         void *memory_map;
 141         uint64_t memory_map_size;
 142         size_t metadata_sectors;
 143         size_t n_blocks;
 144         uint64_t seq_count;
 145         void *block_start;
 146         struct wc_entry *entries;
 147         unsigned block_size;
 148         unsigned char block_size_bits;
 149 
 150         bool pmem_mode:1;
 151         bool writeback_fua:1;
 152 
 153         bool overwrote_committed:1;
 154         bool memory_vmapped:1;
 155 
 156         bool high_wm_percent_set:1;
 157         bool low_wm_percent_set:1;
 158         bool max_writeback_jobs_set:1;
 159         bool autocommit_blocks_set:1;
 160         bool autocommit_time_set:1;
 161         bool writeback_fua_set:1;
 162         bool flush_on_suspend:1;
 163 
 164         unsigned writeback_all;
 165         struct workqueue_struct *writeback_wq;
 166         struct work_struct writeback_work;
 167         struct work_struct flush_work;
 168 
 169         struct dm_io_client *dm_io;
 170 
 171         raw_spinlock_t endio_list_lock;
 172         struct list_head endio_list;
 173         struct task_struct *endio_thread;
 174 
 175         struct task_struct *flush_thread;
 176         struct bio_list flush_list;
 177 
 178         struct dm_kcopyd_client *dm_kcopyd;
 179         unsigned long *dirty_bitmap;
 180         unsigned dirty_bitmap_size;
 181 
 182         struct bio_set bio_set;
 183         mempool_t copy_pool;
 184 };
 185 
 186 #define WB_LIST_INLINE          16
 187 
 188 struct writeback_struct {
 189         struct list_head endio_entry;
 190         struct dm_writecache *wc;
 191         struct wc_entry **wc_list;
 192         unsigned wc_list_n;
 193         struct wc_entry *wc_list_inline[WB_LIST_INLINE];
 194         struct bio bio;
 195 };
 196 
 197 struct copy_struct {
 198         struct list_head endio_entry;
 199         struct dm_writecache *wc;
 200         struct wc_entry *e;
 201         unsigned n_entries;
 202         int error;
 203 };
 204 
 205 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
 206                                             "A percentage of time allocated for data copying");
 207 
 208 static void wc_lock(struct dm_writecache *wc)
 209 {
 210         mutex_lock(&wc->lock);
 211 }
 212 
 213 static void wc_unlock(struct dm_writecache *wc)
 214 {
 215         mutex_unlock(&wc->lock);
 216 }
 217 
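     /*
      * Map the cache device's persistent memory into the kernel address space
      * via DAX. If dax_direct_access() cannot return the whole range in one
      * piece, the individual pages are collected and vmap()ed instead
      * (memory_vmapped is set so the mapping is torn down with vunmap() on
      * release).
      */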
 218 #ifdef DM_WRITECACHE_HAS_PMEM
 219 static int persistent_memory_claim(struct dm_writecache *wc)
 220 {
 221         int r;
 222         loff_t s;
 223         long p, da;
 224         pfn_t pfn;
 225         int id;
 226         struct page **pages;
 227 
 228         wc->memory_vmapped = false;
 229 
 230         if (!wc->ssd_dev->dax_dev) {
 231                 r = -EOPNOTSUPP;
 232                 goto err1;
 233         }
 234         s = wc->memory_map_size;
 235         p = s >> PAGE_SHIFT;
 236         if (!p) {
 237                 r = -EINVAL;
 238                 goto err1;
 239         }
 240         if (p != s >> PAGE_SHIFT) {
 241                 r = -EOVERFLOW;
 242                 goto err1;
 243         }
 244 
 245         id = dax_read_lock();
 246 
 247         da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
 248         if (da < 0) {
 249                 wc->memory_map = NULL;
 250                 r = da;
 251                 goto err2;
 252         }
 253         if (!pfn_t_has_page(pfn)) {
 254                 wc->memory_map = NULL;
 255                 r = -EOPNOTSUPP;
 256                 goto err2;
 257         }
 258         if (da != p) {
 259                 long i;
 260                 wc->memory_map = NULL;
 261                 pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
 262                 if (!pages) {
 263                         r = -ENOMEM;
 264                         goto err2;
 265                 }
 266                 i = 0;
 267                 do {
 268                         long daa;
 269                         daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
 270                                                 NULL, &pfn);
 271                         if (daa <= 0) {
 272                                 r = daa ? daa : -EINVAL;
 273                                 goto err3;
 274                         }
 275                         if (!pfn_t_has_page(pfn)) {
 276                                 r = -EOPNOTSUPP;
 277                                 goto err3;
 278                         }
 279                         while (daa-- && i < p) {
 280                                 pages[i++] = pfn_t_to_page(pfn);
 281                                 pfn.val++;
 282                         }
 283                 } while (i < p);
 284                 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
 285                 if (!wc->memory_map) {
 286                         r = -ENOMEM;
 287                         goto err3;
 288                 }
 289                 kvfree(pages);
 290                 wc->memory_vmapped = true;
 291         }
 292 
 293         dax_read_unlock(id);
 294 
 295         wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
 296         wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
 297 
 298         return 0;
 299 err3:
 300         kvfree(pages);
 301 err2:
 302         dax_read_unlock(id);
 303 err1:
 304         return r;
 305 }
 306 #else
 307 static int persistent_memory_claim(struct dm_writecache *wc)
 308 {
 309         BUG();
 310 }
 311 #endif
 312 
 313 static void persistent_memory_release(struct dm_writecache *wc)
 314 {
 315         if (wc->memory_vmapped)
 316                 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
 317 }
 318 
 319 static struct page *persistent_memory_page(void *addr)
 320 {
 321         if (is_vmalloc_addr(addr))
 322                 return vmalloc_to_page(addr);
 323         else
 324                 return virt_to_page(addr);
 325 }
 326 
 327 static unsigned persistent_memory_page_offset(void *addr)
 328 {
 329         return (unsigned long)addr & (PAGE_SIZE - 1);
 330 }
 331 
 332 static void persistent_memory_flush_cache(void *ptr, size_t size)
 333 {
 334         if (is_vmalloc_addr(ptr))
 335                 flush_kernel_vmap_range(ptr, size);
 336 }
 337 
 338 static void persistent_memory_invalidate_cache(void *ptr, size_t size)
 339 {
 340         if (is_vmalloc_addr(ptr))
 341                 invalidate_kernel_vmap_range(ptr, size);
 342 }
 343 
 344 static struct wc_memory_superblock *sb(struct dm_writecache *wc)
 345 {
 346         return wc->memory_map;
 347 }
 348 
 349 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
 350 {
 351         return &sb(wc)->entries[e->index];
 352 }
 353 
 354 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
 355 {
 356         return (char *)wc->block_start + (e->index << wc->block_size_bits);
 357 }
 358 
 359 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
 360 {
 361         return wc->start_sector + wc->metadata_sectors +
 362                 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
 363 }
 364 
 365 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
 366 {
 367 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 368         return e->original_sector;
 369 #else
 370         return le64_to_cpu(memory_entry(wc, e)->original_sector);
 371 #endif
 372 }
 373 
 374 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
 375 {
 376 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 377         return e->seq_count;
 378 #else
 379         return le64_to_cpu(memory_entry(wc, e)->seq_count);
 380 #endif
 381 }
 382 
 383 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
 384 {
 385 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 386         e->seq_count = -1;
 387 #endif
 388         pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
 389 }
 390 
 391 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
 392                                             uint64_t original_sector, uint64_t seq_count)
 393 {
 394         struct wc_memory_entry me;
 395 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 396         e->original_sector = original_sector;
 397         e->seq_count = seq_count;
 398 #endif
 399         me.original_sector = cpu_to_le64(original_sector);
 400         me.seq_count = cpu_to_le64(seq_count);
 401         pmem_assign(*memory_entry(wc, e), me);
 402 }
 403 
 404 #define writecache_error(wc, err, msg, arg...)                          \
 405 do {                                                                    \
 406         if (!cmpxchg(&(wc)->error, 0, err))                             \
 407                 DMERR(msg, ##arg);                                      \
 408         wake_up(&(wc)->freelist_wait);                                  \
 409 } while (0)
 410 
 411 #define writecache_has_error(wc)        (unlikely(READ_ONCE((wc)->error)))
 412 
 413 static void writecache_flush_all_metadata(struct dm_writecache *wc)
 414 {
 415         if (!WC_MODE_PMEM(wc))
 416                 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
 417 }
 418 
 419 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
 420 {
 421         if (!WC_MODE_PMEM(wc))
 422                 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
 423                           wc->dirty_bitmap);
 424 }
 425 
 426 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
 427 
 428 struct io_notify {
 429         struct dm_writecache *wc;
 430         struct completion c;
 431         atomic_t count;
 432 };
 433 
 434 static void writecache_notify_io(unsigned long error, void *context)
 435 {
 436         struct io_notify *endio = context;
 437 
 438         if (unlikely(error != 0))
 439                 writecache_error(endio->wc, -EIO, "error writing metadata");
 440         BUG_ON(atomic_read(&endio->count) <= 0);
 441         if (atomic_dec_and_test(&endio->count))
 442                 complete(&endio->c);
 443 }
 444 
 445 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
 446 {
 447         wait_event(wc->bio_in_progress_wait[direction],
 448                    !atomic_read(&wc->bio_in_progress[direction]));
 449 }
 450 
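     /*
      * SSD mode commit: write out every metadata region marked in the dirty
      * bitmap using async dm-io, wait for those writes (and optionally for
      * in-flight bios), issue a disk flush and clear the bitmap.
      */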
 451 static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 452 {
 453         struct dm_io_region region;
 454         struct dm_io_request req;
 455         struct io_notify endio = {
 456                 wc,
 457                 COMPLETION_INITIALIZER_ONSTACK(endio.c),
 458                 ATOMIC_INIT(1),
 459         };
 460         unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
 461         unsigned i = 0;
 462 
 463         while (1) {
 464                 unsigned j;
 465                 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
 466                 if (unlikely(i == bitmap_bits))
 467                         break;
 468                 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
 469 
 470                 region.bdev = wc->ssd_dev->bdev;
 471                 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
 472                 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
 473 
 474                 if (unlikely(region.sector >= wc->metadata_sectors))
 475                         break;
 476                 if (unlikely(region.sector + region.count > wc->metadata_sectors))
 477                         region.count = wc->metadata_sectors - region.sector;
 478 
 479                 region.sector += wc->start_sector;
 480                 atomic_inc(&endio.count);
 481                 req.bi_op = REQ_OP_WRITE;
 482                 req.bi_op_flags = REQ_SYNC;
 483                 req.mem.type = DM_IO_VMA;
 484                 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
 485                 req.client = wc->dm_io;
 486                 req.notify.fn = writecache_notify_io;
 487                 req.notify.context = &endio;
 488 
 489                 /* writing via async dm-io (implied by notify.fn above) won't return an error */
 490                 (void) dm_io(&req, 1, &region, NULL);
 491                 i = j;
 492         }
 493 
 494         writecache_notify_io(0, &endio);
 495         wait_for_completion_io(&endio.c);
 496 
 497         if (wait_for_ios)
 498                 writecache_wait_for_ios(wc, WRITE);
 499 
 500         writecache_disk_flush(wc, wc->ssd_dev);
 501 
 502         memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
 503 }
 504 
 505 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 506 {
 507         if (WC_MODE_PMEM(wc))
 508                 wmb();
 509         else
 510                 ssd_commit_flushed(wc, wait_for_ios);
 511 }
 512 
 513 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
 514 {
 515         int r;
 516         struct dm_io_region region;
 517         struct dm_io_request req;
 518 
 519         region.bdev = dev->bdev;
 520         region.sector = 0;
 521         region.count = 0;
 522         req.bi_op = REQ_OP_WRITE;
 523         req.bi_op_flags = REQ_PREFLUSH;
 524         req.mem.type = DM_IO_KMEM;
 525         req.mem.ptr.addr = NULL;
 526         req.client = wc->dm_io;
 527         req.notify.fn = NULL;
 528 
 529         r = dm_io(&req, 1, &region, NULL);
 530         if (unlikely(r))
 531                 writecache_error(wc, r, "error flushing metadata: %d", r);
 532 }
 533 
 534 #define WFE_RETURN_FOLLOWING    1
 535 #define WFE_LOWEST_SEQ          2
 536 
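     /*
      * Look up the entry for "block". With WFE_RETURN_FOLLOWING a miss returns
      * the entry with the next higher original sector instead of NULL. Several
      * entries may exist for the same block (older, superseded copies):
      * WFE_LOWEST_SEQ selects the one with the lowest sequence count,
      * otherwise the newest one is returned.
      */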
 537 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
 538                                               uint64_t block, int flags)
 539 {
 540         struct wc_entry *e;
 541         struct rb_node *node = wc->tree.rb_node;
 542 
 543         if (unlikely(!node))
 544                 return NULL;
 545 
 546         while (1) {
 547                 e = container_of(node, struct wc_entry, rb_node);
 548                 if (read_original_sector(wc, e) == block)
 549                         break;
 550 
 551                 node = (read_original_sector(wc, e) >= block ?
 552                         e->rb_node.rb_left : e->rb_node.rb_right);
 553                 if (unlikely(!node)) {
 554                         if (!(flags & WFE_RETURN_FOLLOWING))
 555                                 return NULL;
 556                         if (read_original_sector(wc, e) >= block) {
 557                                 return e;
 558                         } else {
 559                                 node = rb_next(&e->rb_node);
 560                                 if (unlikely(!node))
 561                                         return NULL;
 562                                 e = container_of(node, struct wc_entry, rb_node);
 563                                 return e;
 564                         }
 565                 }
 566         }
 567 
 568         while (1) {
 569                 struct wc_entry *e2;
 570                 if (flags & WFE_LOWEST_SEQ)
 571                         node = rb_prev(&e->rb_node);
 572                 else
 573                         node = rb_next(&e->rb_node);
 574                 if (unlikely(!node))
 575                         return e;
 576                 e2 = container_of(node, struct wc_entry, rb_node);
 577                 if (read_original_sector(wc, e2) != block)
 578                         return e;
 579                 e = e2;
 580         }
 581 }
 582 
 583 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
 584 {
 585         struct wc_entry *e;
 586         struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
 587 
 588         while (*node) {
 589                 e = container_of(*node, struct wc_entry, rb_node);
 590                 parent = &e->rb_node;
 591                 if (read_original_sector(wc, e) > read_original_sector(wc, ins))
 592                         node = &parent->rb_left;
 593                 else
 594                         node = &parent->rb_right;
 595         }
 596         rb_link_node(&ins->rb_node, parent, node);
 597         rb_insert_color(&ins->rb_node, &wc->tree);
 598         list_add(&ins->lru, &wc->lru);
 599 }
 600 
 601 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
 602 {
 603         list_del(&e->lru);
 604         rb_erase(&e->rb_node, &wc->tree);
 605 }
 606 
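     /*
      * Free entries are kept on a plain list in pmem mode, but in an rb-tree
      * sorted by entry address in SSD mode (WC_MODE_SORT_FREELIST), so blocks
      * are handed out in roughly ascending cache-device order.
      */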
 607 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
 608 {
 609         if (WC_MODE_SORT_FREELIST(wc)) {
 610                 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
 611                 if (unlikely(!*node))
 612                         wc->current_free = e;
 613                 while (*node) {
 614                         parent = *node;
 615                         if (&e->rb_node < *node)
 616                                 node = &parent->rb_left;
 617                         else
 618                                 node = &parent->rb_right;
 619                 }
 620                 rb_link_node(&e->rb_node, parent, node);
 621                 rb_insert_color(&e->rb_node, &wc->freetree);
 622         } else {
 623                 list_add_tail(&e->lru, &wc->freelist);
 624         }
 625         wc->freelist_size++;
 626 }
 627 
 628 static inline void writecache_verify_watermark(struct dm_writecache *wc)
 629 {
 630         if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
 631                 queue_work(wc->writeback_wq, &wc->writeback_work);
 632 }
 633 
 634 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
 635 {
 636         struct wc_entry *e;
 637 
 638         if (WC_MODE_SORT_FREELIST(wc)) {
 639                 struct rb_node *next;
 640                 if (unlikely(!wc->current_free))
 641                         return NULL;
 642                 e = wc->current_free;
 643                 next = rb_next(&e->rb_node);
 644                 rb_erase(&e->rb_node, &wc->freetree);
 645                 if (unlikely(!next))
 646                         next = rb_first(&wc->freetree);
 647                 wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
 648         } else {
 649                 if (unlikely(list_empty(&wc->freelist)))
 650                         return NULL;
 651                 e = container_of(wc->freelist.next, struct wc_entry, lru);
 652                 list_del(&e->lru);
 653         }
 654         wc->freelist_size--;
 655 
 656         writecache_verify_watermark(wc);
 657 
 658         return e;
 659 }
 660 
 661 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
 662 {
 663         writecache_unlink(wc, e);
 664         writecache_add_to_freelist(wc, e);
 665         clear_seq_count(wc, e);
 666         writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
 667         if (unlikely(waitqueue_active(&wc->freelist_wait)))
 668                 wake_up(&wc->freelist_wait);
 669 }
 670 
 671 static void writecache_wait_on_freelist(struct dm_writecache *wc)
 672 {
 673         DEFINE_WAIT(wait);
 674 
 675         prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
 676         wc_unlock(wc);
 677         io_schedule();
 678         finish_wait(&wc->freelist_wait, &wait);
 679         wc_lock(wc);
 680 }
 681 
 682 static void writecache_poison_lists(struct dm_writecache *wc)
 683 {
 684         /*
 685          * Catch incorrect access to these values while the device is suspended.
 686          */
 687         memset(&wc->tree, -1, sizeof wc->tree);
 688         wc->lru.next = LIST_POISON1;
 689         wc->lru.prev = LIST_POISON2;
 690         wc->freelist.next = LIST_POISON1;
 691         wc->freelist.prev = LIST_POISON2;
 692 }
 693 
 694 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
 695 {
 696         writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
 697         if (WC_MODE_PMEM(wc))
 698                 writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
 699 }
 700 
 701 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
 702 {
 703         return read_seq_count(wc, e) < wc->seq_count;
 704 }
 705 
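     /*
      * Commit all uncommitted entries: flush their metadata (and, for pmem,
      * their data), then bump and persist the superblock sequence count so the
      * entries become committed. Older entries superseded by the ones just
      * committed are finally returned to the freelist.
      */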
 706 static void writecache_flush(struct dm_writecache *wc)
 707 {
 708         struct wc_entry *e, *e2;
 709         bool need_flush_after_free;
 710 
 711         wc->uncommitted_blocks = 0;
 712         del_timer(&wc->autocommit_timer);
 713 
 714         if (list_empty(&wc->lru))
 715                 return;
 716 
 717         e = container_of(wc->lru.next, struct wc_entry, lru);
 718         if (writecache_entry_is_committed(wc, e)) {
 719                 if (wc->overwrote_committed) {
 720                         writecache_wait_for_ios(wc, WRITE);
 721                         writecache_disk_flush(wc, wc->ssd_dev);
 722                         wc->overwrote_committed = false;
 723                 }
 724                 return;
 725         }
 726         while (1) {
 727                 writecache_flush_entry(wc, e);
 728                 if (unlikely(e->lru.next == &wc->lru))
 729                         break;
 730                 e2 = container_of(e->lru.next, struct wc_entry, lru);
 731                 if (writecache_entry_is_committed(wc, e2))
 732                         break;
 733                 e = e2;
 734                 cond_resched();
 735         }
 736         writecache_commit_flushed(wc, true);
 737 
 738         wc->seq_count++;
 739         pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
 740         writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
 741         writecache_commit_flushed(wc, false);
 742 
 743         wc->overwrote_committed = false;
 744 
 745         need_flush_after_free = false;
 746         while (1) {
 747                 /* Free another committed entry with lower seq-count */
 748                 struct rb_node *rb_node = rb_prev(&e->rb_node);
 749 
 750                 if (rb_node) {
 751                         e2 = container_of(rb_node, struct wc_entry, rb_node);
 752                         if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
 753                             likely(!e2->write_in_progress)) {
 754                                 writecache_free_entry(wc, e2);
 755                                 need_flush_after_free = true;
 756                         }
 757                 }
 758                 if (unlikely(e->lru.prev == &wc->lru))
 759                         break;
 760                 e = container_of(e->lru.prev, struct wc_entry, lru);
 761                 cond_resched();
 762         }
 763 
 764         if (need_flush_after_free)
 765                 writecache_commit_flushed(wc, false);
 766 }
 767 
 768 static void writecache_flush_work(struct work_struct *work)
 769 {
 770         struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
 771 
 772         wc_lock(wc);
 773         writecache_flush(wc);
 774         wc_unlock(wc);
 775 }
 776 
 777 static void writecache_autocommit_timer(struct timer_list *t)
 778 {
 779         struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
 780         if (!writecache_has_error(wc))
 781                 queue_work(wc->writeback_wq, &wc->flush_work);
 782 }
 783 
 784 static void writecache_schedule_autocommit(struct dm_writecache *wc)
 785 {
 786         if (!timer_pending(&wc->autocommit_timer))
 787                 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
 788 }
 789 
 790 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
 791 {
 792         struct wc_entry *e;
 793         bool discarded_something = false;
 794 
 795         e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
 796         if (unlikely(!e))
 797                 return;
 798 
 799         while (read_original_sector(wc, e) < end) {
 800                 struct rb_node *node = rb_next(&e->rb_node);
 801 
 802                 if (likely(!e->write_in_progress)) {
 803                         if (!discarded_something) {
 804                                 writecache_wait_for_ios(wc, READ);
 805                                 writecache_wait_for_ios(wc, WRITE);
 806                                 discarded_something = true;
 807                         }
 808                         writecache_free_entry(wc, e);
 809                 }
 810 
 811                 if (unlikely(!node))
 812                         break;
 813 
 814                 e = container_of(node, struct wc_entry, rb_node);
 815         }
 816 
 817         if (discarded_something)
 818                 writecache_commit_flushed(wc, false);
 819 }
 820 
 821 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
 822 {
 823         if (wc->writeback_size) {
 824                 writecache_wait_on_freelist(wc);
 825                 return true;
 826         }
 827         return false;
 828 }
 829 
 830 static void writecache_suspend(struct dm_target *ti)
 831 {
 832         struct dm_writecache *wc = ti->private;
 833         bool flush_on_suspend;
 834 
 835         del_timer_sync(&wc->autocommit_timer);
 836 
 837         wc_lock(wc);
 838         writecache_flush(wc);
 839         flush_on_suspend = wc->flush_on_suspend;
 840         if (flush_on_suspend) {
 841                 wc->flush_on_suspend = false;
 842                 wc->writeback_all++;
 843                 queue_work(wc->writeback_wq, &wc->writeback_work);
 844         }
 845         wc_unlock(wc);
 846 
 847         drain_workqueue(wc->writeback_wq);
 848 
 849         wc_lock(wc);
 850         if (flush_on_suspend)
 851                 wc->writeback_all--;
 852         while (writecache_wait_for_writeback(wc));
 853 
 854         if (WC_MODE_PMEM(wc))
 855                 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
 856 
 857         writecache_poison_lists(wc);
 858 
 859         wc_unlock(wc);
 860 }
 861 
 862 static int writecache_alloc_entries(struct dm_writecache *wc)
 863 {
 864         size_t b;
 865 
 866         if (wc->entries)
 867                 return 0;
 868         wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
 869         if (!wc->entries)
 870                 return -ENOMEM;
 871         for (b = 0; b < wc->n_blocks; b++) {
 872                 struct wc_entry *e = &wc->entries[b];
 873                 e->index = b;
 874                 e->write_in_progress = false;
 875                 cond_resched();
 876         }
 877 
 878         return 0;
 879 }
 880 
 881 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
 882 {
 883         struct dm_io_region region;
 884         struct dm_io_request req;
 885 
 886         region.bdev = wc->ssd_dev->bdev;
 887         region.sector = wc->start_sector;
 888         region.count = n_sectors;
 889         req.bi_op = REQ_OP_READ;
 890         req.bi_op_flags = REQ_SYNC;
 891         req.mem.type = DM_IO_VMA;
 892         req.mem.ptr.vma = (char *)wc->memory_map;
 893         req.client = wc->dm_io;
 894         req.notify.fn = NULL;
 895 
 896         return dm_io(&req, 1, &region, NULL);
 897 }
 898 
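     /*
      * Rebuild the in-core rb-tree, LRU list and freelist from the persistent
      * metadata when the target is resumed. Entries that were never committed
      * go back to the freelist; where two committed entries map the same
      * original sector, only the one with the higher sequence count is kept.
      */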
 899 static void writecache_resume(struct dm_target *ti)
 900 {
 901         struct dm_writecache *wc = ti->private;
 902         size_t b;
 903         bool need_flush = false;
 904         __le64 sb_seq_count;
 905         int r;
 906 
 907         wc_lock(wc);
 908 
 909         if (WC_MODE_PMEM(wc)) {
 910                 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
 911         } else {
 912                 r = writecache_read_metadata(wc, wc->metadata_sectors);
 913                 if (r) {
 914                         size_t sb_entries_offset;
 915                         writecache_error(wc, r, "unable to read metadata: %d", r);
 916                         sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
 917                         memset((char *)wc->memory_map + sb_entries_offset, -1,
 918                                (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
 919                 }
 920         }
 921 
 922         wc->tree = RB_ROOT;
 923         INIT_LIST_HEAD(&wc->lru);
 924         if (WC_MODE_SORT_FREELIST(wc)) {
 925                 wc->freetree = RB_ROOT;
 926                 wc->current_free = NULL;
 927         } else {
 928                 INIT_LIST_HEAD(&wc->freelist);
 929         }
 930         wc->freelist_size = 0;
 931 
 932         r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
 933         if (r) {
 934                 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
 935                 sb_seq_count = cpu_to_le64(0);
 936         }
 937         wc->seq_count = le64_to_cpu(sb_seq_count);
 938 
 939 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 940         for (b = 0; b < wc->n_blocks; b++) {
 941                 struct wc_entry *e = &wc->entries[b];
 942                 struct wc_memory_entry wme;
 943                 if (writecache_has_error(wc)) {
 944                         e->original_sector = -1;
 945                         e->seq_count = -1;
 946                         continue;
 947                 }
 948                 r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
 949                 if (r) {
 950                         writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
 951                                          (unsigned long)b, r);
 952                         e->original_sector = -1;
 953                         e->seq_count = -1;
 954                 } else {
 955                         e->original_sector = le64_to_cpu(wme.original_sector);
 956                         e->seq_count = le64_to_cpu(wme.seq_count);
 957                 }
 958                 cond_resched();
 959         }
 960 #endif
 961         for (b = 0; b < wc->n_blocks; b++) {
 962                 struct wc_entry *e = &wc->entries[b];
 963                 if (!writecache_entry_is_committed(wc, e)) {
 964                         if (read_seq_count(wc, e) != -1) {
 965 erase_this:
 966                                 clear_seq_count(wc, e);
 967                                 need_flush = true;
 968                         }
 969                         writecache_add_to_freelist(wc, e);
 970                 } else {
 971                         struct wc_entry *old;
 972 
 973                         old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
 974                         if (!old) {
 975                                 writecache_insert_entry(wc, e);
 976                         } else {
 977                                 if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
 978                                         writecache_error(wc, -EINVAL,
 979                                                  "two identical entries, position %llu, sector %llu, sequence %llu",
 980                                                  (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
 981                                                  (unsigned long long)read_seq_count(wc, e));
 982                                 }
 983                                 if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
 984                                         goto erase_this;
 985                                 } else {
 986                                         writecache_free_entry(wc, old);
 987                                         writecache_insert_entry(wc, e);
 988                                         need_flush = true;
 989                                 }
 990                         }
 991                 }
 992                 cond_resched();
 993         }
 994 
 995         if (need_flush) {
 996                 writecache_flush_all_metadata(wc);
 997                 writecache_commit_flushed(wc, false);
 998         }
 999 
1000         writecache_verify_watermark(wc);
1001 
1002         wc_unlock(wc);
1003 }
1004 
1005 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1006 {
1007         if (argc != 1)
1008                 return -EINVAL;
1009 
1010         wc_lock(wc);
1011         if (dm_suspended(wc->ti)) {
1012                 wc_unlock(wc);
1013                 return -EBUSY;
1014         }
1015         if (writecache_has_error(wc)) {
1016                 wc_unlock(wc);
1017                 return -EIO;
1018         }
1019 
1020         writecache_flush(wc);
1021         wc->writeback_all++;
1022         queue_work(wc->writeback_wq, &wc->writeback_work);
1023         wc_unlock(wc);
1024 
1025         flush_workqueue(wc->writeback_wq);
1026 
1027         wc_lock(wc);
1028         wc->writeback_all--;
1029         if (writecache_has_error(wc)) {
1030                 wc_unlock(wc);
1031                 return -EIO;
1032         }
1033         wc_unlock(wc);
1034 
1035         return 0;
1036 }
1037 
1038 static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1039 {
1040         if (argc != 1)
1041                 return -EINVAL;
1042 
1043         wc_lock(wc);
1044         wc->flush_on_suspend = true;
1045         wc_unlock(wc);
1046 
1047         return 0;
1048 }
1049 
1050 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1051                               char *result, unsigned maxlen)
1052 {
1053         int r = -EINVAL;
1054         struct dm_writecache *wc = ti->private;
1055 
1056         if (!strcasecmp(argv[0], "flush"))
1057                 r = process_flush_mesg(argc, argv, wc);
1058         else if (!strcasecmp(argv[0], "flush_on_suspend"))
1059                 r = process_flush_on_suspend_mesg(argc, argv, wc);
1060         else
1061                 DMERR("unrecognised message received: %s", argv[0]);
1062 
1063         return r;
1064 }
1065 
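     /*
      * Copy one cache block between a bio and persistent memory, one bio_vec
      * at a time. Reads use memcpy_mcsafe() so a pmem media error is reported
      * as an I/O error instead of crashing; writes go through
      * memcpy_flushcache() so the data is pushed out of the CPU cache.
      */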
1066 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1067 {
1068         void *buf;
1069         unsigned long flags;
1070         unsigned size;
1071         int rw = bio_data_dir(bio);
1072         unsigned remaining_size = wc->block_size;
1073 
1074         do {
1075                 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1076                 buf = bvec_kmap_irq(&bv, &flags);
1077                 size = bv.bv_len;
1078                 if (unlikely(size > remaining_size))
1079                         size = remaining_size;
1080 
1081                 if (rw == READ) {
1082                         int r;
1083                         r = memcpy_mcsafe(buf, data, size);
1084                         flush_dcache_page(bio_page(bio));
1085                         if (unlikely(r)) {
1086                                 writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1087                                 bio->bi_status = BLK_STS_IOERR;
1088                         }
1089                 } else {
1090                         flush_dcache_page(bio_page(bio));
1091                         memcpy_flushcache(data, buf, size);
1092                 }
1093 
1094                 bvec_kunmap_irq(buf, &flags);
1095 
1096                 data = (char *)data + size;
1097                 remaining_size -= size;
1098                 bio_advance(bio, size);
1099         } while (unlikely(remaining_size));
1100 }
1101 
1102 static int writecache_flush_thread(void *data)
1103 {
1104         struct dm_writecache *wc = data;
1105 
1106         while (1) {
1107                 struct bio *bio;
1108 
1109                 wc_lock(wc);
1110                 bio = bio_list_pop(&wc->flush_list);
1111                 if (!bio) {
1112                         set_current_state(TASK_INTERRUPTIBLE);
1113                         wc_unlock(wc);
1114 
1115                         if (unlikely(kthread_should_stop())) {
1116                                 set_current_state(TASK_RUNNING);
1117                                 break;
1118                         }
1119 
1120                         schedule();
1121                         continue;
1122                 }
1123 
1124                 if (bio_op(bio) == REQ_OP_DISCARD) {
1125                         writecache_discard(wc, bio->bi_iter.bi_sector,
1126                                            bio_end_sector(bio));
1127                         wc_unlock(wc);
1128                         bio_set_dev(bio, wc->dev->bdev);
1129                         generic_make_request(bio);
1130                 } else {
1131                         writecache_flush(wc);
1132                         wc_unlock(wc);
1133                         if (writecache_has_error(wc))
1134                                 bio->bi_status = BLK_STS_IOERR;
1135                         bio_endio(bio);
1136                 }
1137         }
1138 
1139         return 0;
1140 }
1141 
1142 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1143 {
1144         if (bio_list_empty(&wc->flush_list))
1145                 wake_up_process(wc->flush_thread);
1146         bio_list_add(&wc->flush_list, bio);
1147 }
1148 
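     /*
      * Main bio mapping routine. Flushes and discards are handled inline in
      * pmem mode or offloaded to the flush thread in SSD mode. Reads are
      * served from the cache when the block is present and otherwise remapped
      * to the origin device; writes always go into the cache, allocating
      * entries from the freelist (waiting for writeback when it is empty).
      */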
1149 static int writecache_map(struct dm_target *ti, struct bio *bio)
1150 {
1151         struct wc_entry *e;
1152         struct dm_writecache *wc = ti->private;
1153 
1154         bio->bi_private = NULL;
1155 
1156         wc_lock(wc);
1157 
1158         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1159                 if (writecache_has_error(wc))
1160                         goto unlock_error;
1161                 if (WC_MODE_PMEM(wc)) {
1162                         writecache_flush(wc);
1163                         if (writecache_has_error(wc))
1164                                 goto unlock_error;
1165                         goto unlock_submit;
1166                 } else {
1167                         writecache_offload_bio(wc, bio);
1168                         goto unlock_return;
1169                 }
1170         }
1171 
1172         bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1173 
1174         if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1175                                 (wc->block_size / 512 - 1)) != 0)) {
1176                 DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1177                       (unsigned long long)bio->bi_iter.bi_sector,
1178                       bio->bi_iter.bi_size, wc->block_size);
1179                 goto unlock_error;
1180         }
1181 
1182         if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1183                 if (writecache_has_error(wc))
1184                         goto unlock_error;
1185                 if (WC_MODE_PMEM(wc)) {
1186                         writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1187                         goto unlock_remap_origin;
1188                 } else {
1189                         writecache_offload_bio(wc, bio);
1190                         goto unlock_return;
1191                 }
1192         }
1193 
1194         if (bio_data_dir(bio) == READ) {
1195 read_next_block:
1196                 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1197                 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1198                         if (WC_MODE_PMEM(wc)) {
1199                                 bio_copy_block(wc, bio, memory_data(wc, e));
1200                                 if (bio->bi_iter.bi_size)
1201                                         goto read_next_block;
1202                                 goto unlock_submit;
1203                         } else {
1204                                 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1205                                 bio_set_dev(bio, wc->ssd_dev->bdev);
1206                                 bio->bi_iter.bi_sector = cache_sector(wc, e);
1207                                 if (!writecache_entry_is_committed(wc, e))
1208                                         writecache_wait_for_ios(wc, WRITE);
1209                                 goto unlock_remap;
1210                         }
1211                 } else {
1212                         if (e) {
1213                                 sector_t next_boundary =
1214                                         read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1215                                 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1216                                         dm_accept_partial_bio(bio, next_boundary);
1217                                 }
1218                         }
1219                         goto unlock_remap_origin;
1220                 }
1221         } else {
1222                 do {
1223                         if (writecache_has_error(wc))
1224                                 goto unlock_error;
1225                         e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1226                         if (e) {
1227                                 if (!writecache_entry_is_committed(wc, e))
1228                                         goto bio_copy;
1229                                 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1230                                         wc->overwrote_committed = true;
1231                                         goto bio_copy;
1232                                 }
1233                         }
1234                         e = writecache_pop_from_freelist(wc);
1235                         if (unlikely(!e)) {
1236                                 writecache_wait_on_freelist(wc);
1237                                 continue;
1238                         }
1239                         write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1240                         writecache_insert_entry(wc, e);
1241                         wc->uncommitted_blocks++;
1242 bio_copy:
1243                         if (WC_MODE_PMEM(wc)) {
1244                                 bio_copy_block(wc, bio, memory_data(wc, e));
1245                         } else {
1246                                 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1247                                 bio_set_dev(bio, wc->ssd_dev->bdev);
1248                                 bio->bi_iter.bi_sector = cache_sector(wc, e);
1249                                 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1250                                         wc->uncommitted_blocks = 0;
1251                                         queue_work(wc->writeback_wq, &wc->flush_work);
1252                                 } else {
1253                                         writecache_schedule_autocommit(wc);
1254                                 }
1255                                 goto unlock_remap;
1256                         }
1257                 } while (bio->bi_iter.bi_size);
1258 
1259                 if (unlikely(bio->bi_opf & REQ_FUA ||
1260                              wc->uncommitted_blocks >= wc->autocommit_blocks))
1261                         writecache_flush(wc);
1262                 else
1263                         writecache_schedule_autocommit(wc);
1264                 goto unlock_submit;
1265         }
1266 
1267 unlock_remap_origin:
1268         bio_set_dev(bio, wc->dev->bdev);
1269         wc_unlock(wc);
1270         return DM_MAPIO_REMAPPED;
1271 
1272 unlock_remap:
1273         /* make sure that writecache_end_io decrements bio_in_progress: */
1274         bio->bi_private = (void *)1;
1275         atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1276         wc_unlock(wc);
1277         return DM_MAPIO_REMAPPED;
1278 
1279 unlock_submit:
1280         wc_unlock(wc);
1281         bio_endio(bio);
1282         return DM_MAPIO_SUBMITTED;
1283 
1284 unlock_return:
1285         wc_unlock(wc);
1286         return DM_MAPIO_SUBMITTED;
1287 
1288 unlock_error:
1289         wc_unlock(wc);
1290         bio_io_error(bio);
1291         return DM_MAPIO_SUBMITTED;
1292 }
1293 
1294 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1295 {
1296         struct dm_writecache *wc = ti->private;
1297 
1298         if (bio->bi_private != NULL) {
1299                 int dir = bio_data_dir(bio);
1300                 if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1301                         if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1302                                 wake_up(&wc->bio_in_progress_wait[dir]);
1303         }
1304         return 0;
1305 }
1306 
1307 static int writecache_iterate_devices(struct dm_target *ti,
1308                                       iterate_devices_callout_fn fn, void *data)
1309 {
1310         struct dm_writecache *wc = ti->private;
1311 
1312         return fn(ti, wc->dev, 0, ti->len, data);
1313 }
1314 
1315 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1316 {
1317         struct dm_writecache *wc = ti->private;
1318 
1319         if (limits->logical_block_size < wc->block_size)
1320                 limits->logical_block_size = wc->block_size;
1321 
1322         if (limits->physical_block_size < wc->block_size)
1323                 limits->physical_block_size = wc->block_size;
1324 
1325         if (limits->io_min < wc->block_size)
1326                 limits->io_min = wc->block_size;
1327 }
1328 
1329 
1330 static void writecache_writeback_endio(struct bio *bio)
1331 {
1332         struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1333         struct dm_writecache *wc = wb->wc;
1334         unsigned long flags;
1335 
1336         raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1337         if (unlikely(list_empty(&wc->endio_list)))
1338                 wake_up_process(wc->endio_thread);
1339         list_add_tail(&wb->endio_entry, &wc->endio_list);
1340         raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1341 }
1342 
1343 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1344 {
1345         struct copy_struct *c = ptr;
1346         struct dm_writecache *wc = c->wc;
1347 
1348         c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1349 
1350         raw_spin_lock_irq(&wc->endio_list_lock);
1351         if (unlikely(list_empty(&wc->endio_list)))
1352                 wake_up_process(wc->endio_thread);
1353         list_add_tail(&c->endio_entry, &wc->endio_list);
1354         raw_spin_unlock_irq(&wc->endio_list_lock);
1355 }
1356 
1357 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1358 {
1359         unsigned i;
1360         struct writeback_struct *wb;
1361         struct wc_entry *e;
1362         unsigned long n_walked = 0;
1363 
1364         do {
1365                 wb = list_entry(list->next, struct writeback_struct, endio_entry);
1366                 list_del(&wb->endio_entry);
1367 
1368                 if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1369                         writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1370                                         "write error %d", wb->bio.bi_status);
1371                 i = 0;
1372                 do {
1373                         e = wb->wc_list[i];
1374                         BUG_ON(!e->write_in_progress);
1375                         e->write_in_progress = false;
1376                         INIT_LIST_HEAD(&e->lru);
1377                         if (!writecache_has_error(wc))
1378                                 writecache_free_entry(wc, e);
1379                         BUG_ON(!wc->writeback_size);
1380                         wc->writeback_size--;
1381                         n_walked++;
1382                         if (unlikely(n_walked >= ENDIO_LATENCY)) {
1383                                 writecache_commit_flushed(wc, false);
1384                                 wc_unlock(wc);
1385                                 wc_lock(wc);
1386                                 n_walked = 0;
1387                         }
1388                 } while (++i < wb->wc_list_n);
1389 
1390                 if (wb->wc_list != wb->wc_list_inline)
1391                         kfree(wb->wc_list);
1392                 bio_put(&wb->bio);
1393         } while (!list_empty(list));
1394 }
1395 
1396 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1397 {
1398         struct copy_struct *c;
1399         struct wc_entry *e;
1400 
1401         do {
1402                 c = list_entry(list->next, struct copy_struct, endio_entry);
1403                 list_del(&c->endio_entry);
1404 
1405                 if (unlikely(c->error))
1406                         writecache_error(wc, c->error, "copy error");
1407 
1408                 e = c->e;
1409                 do {
1410                         BUG_ON(!e->write_in_progress);
1411                         e->write_in_progress = false;
1412                         INIT_LIST_HEAD(&e->lru);
1413                         if (!writecache_has_error(wc))
1414                                 writecache_free_entry(wc, e);
1415 
1416                         BUG_ON(!wc->writeback_size);
1417                         wc->writeback_size--;
1418                         e++;
1419                 } while (--c->n_entries);
1420                 mempool_free(c, &wc->copy_pool);
1421         } while (!list_empty(list));
1422 }
1423 
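     /*
      * The endio thread: sleep until writeback completions are queued,
      * detach the whole endio list under the spinlock, flush the origin
      * device (unless FUA writeback is used), process the completions and
      * commit the metadata.
      */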
1424 static int writecache_endio_thread(void *data)
1425 {
1426         struct dm_writecache *wc = data;
1427 
1428         while (1) {
1429                 struct list_head list;
1430 
1431                 raw_spin_lock_irq(&wc->endio_list_lock);
1432                 if (!list_empty(&wc->endio_list))
1433                         goto pop_from_list;
1434                 set_current_state(TASK_INTERRUPTIBLE);
1435                 raw_spin_unlock_irq(&wc->endio_list_lock);
1436 
1437                 if (unlikely(kthread_should_stop())) {
1438                         set_current_state(TASK_RUNNING);
1439                         break;
1440                 }
1441 
1442                 schedule();
1443 
1444                 continue;
1445 
1446 pop_from_list:
1447                 list = wc->endio_list;
1448                 list.next->prev = list.prev->next = &list;
1449                 INIT_LIST_HEAD(&wc->endio_list);
1450                 raw_spin_unlock_irq(&wc->endio_list_lock);
1451 
1452                 if (!WC_MODE_FUA(wc))
1453                         writecache_disk_flush(wc, wc->dev);
1454 
1455                 wc_lock(wc);
1456 
1457                 if (WC_MODE_PMEM(wc)) {
1458                         __writecache_endio_pmem(wc, &list);
1459                 } else {
1460                         __writecache_endio_ssd(wc, &list);
1461                         writecache_wait_for_ios(wc, READ);
1462                 }
1463 
1464                 writecache_commit_flushed(wc, false);
1465 
1466                 wc_unlock(wc);
1467         }
1468 
1469         return 0;
1470 }
1471 
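     /*
      * Flush the cached block from the CPU cache and add its persistent
      * memory page to the writeback bio. Returns false if bio_add_page()
      * could not add the page.
      */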
1472 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
1473 {
1474         struct dm_writecache *wc = wb->wc;
1475         unsigned block_size = wc->block_size;
1476         void *address = memory_data(wc, e);
1477 
1478         persistent_memory_flush_cache(address, block_size);
1479         return bio_add_page(&wb->bio, persistent_memory_page(address),
1480                             block_size, persistent_memory_page_offset(address)) != 0;
1481 }
1482 
1483 struct writeback_list {
1484         struct list_head list;
1485         size_t size;
1486 };
1487 
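     /*
      * If max_writeback_jobs is set, wait until the number of blocks already
      * under writeback (not counting the ones still queued locally on wbl)
      * drops below the limit.
      */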
1488 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1489 {
1490         if (unlikely(wc->max_writeback_jobs)) {
1491                 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1492                         wc_lock(wc);
1493                         while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1494                                 writecache_wait_on_freelist(wc);
1495                         wc_unlock(wc);
1496                 }
1497         }
1498         cond_resched();
1499 }
1500 
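     /*
      * Writeback for pmem mode: take entries from the tail of the writeback
      * list and pack runs with consecutive original sectors into a single
      * bio (falling back to the inline wc_list if the array allocation
      * fails). On a previous error the bio is completed with BLK_STS_IOERR
      * instead of being submitted to the origin device.
      */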
1501 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1502 {
1503         struct wc_entry *e, *f;
1504         struct bio *bio;
1505         struct writeback_struct *wb;
1506         unsigned max_pages;
1507 
1508         while (wbl->size) {
1509                 wbl->size--;
1510                 e = container_of(wbl->list.prev, struct wc_entry, lru);
1511                 list_del(&e->lru);
1512 
1513                 max_pages = e->wc_list_contiguous;
1514 
1515                 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1516                 wb = container_of(bio, struct writeback_struct, bio);
1517                 wb->wc = wc;
1518                 bio->bi_end_io = writecache_writeback_endio;
1519                 bio_set_dev(bio, wc->dev->bdev);
1520                 bio->bi_iter.bi_sector = read_original_sector(wc, e);
1521                 if (max_pages <= WB_LIST_INLINE ||
1522                     unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1523                                                            GFP_NOIO | __GFP_NORETRY |
1524                                                            __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1525                         wb->wc_list = wb->wc_list_inline;
1526                         max_pages = WB_LIST_INLINE;
1527                 }
1528 
1529                 BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
1530 
1531                 wb->wc_list[0] = e;
1532                 wb->wc_list_n = 1;
1533 
1534                 while (wbl->size && wb->wc_list_n < max_pages) {
1535                         f = container_of(wbl->list.prev, struct wc_entry, lru);
1536                         if (read_original_sector(wc, f) !=
1537                             read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1538                                 break;
1539                         if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
1540                                 break;
1541                         wbl->size--;
1542                         list_del(&f->lru);
1543                         wb->wc_list[wb->wc_list_n++] = f;
1544                         e = f;
1545                 }
1546                 bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1547                 if (writecache_has_error(wc)) {
1548                         bio->bi_status = BLK_STS_IOERR;
1549                         bio_endio(bio);
1550                 } else {
1551                         submit_bio(bio);
1552                 }
1553 
1554                 __writeback_throttle(wc, wbl);
1555         }
1556 }
1557 
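     /*
      * Writeback for SSD mode: issue one dm_kcopyd_copy() per run of
      * physically consecutive cache blocks, copying from the cache device to
      * the origin device; completion is handled by writecache_copy_endio().
      */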
1558 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1559 {
1560         struct wc_entry *e, *f;
1561         struct dm_io_region from, to;
1562         struct copy_struct *c;
1563 
1564         while (wbl->size) {
1565                 unsigned n_sectors;
1566 
1567                 wbl->size--;
1568                 e = container_of(wbl->list.prev, struct wc_entry, lru);
1569                 list_del(&e->lru);
1570 
1571                 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1572 
1573                 from.bdev = wc->ssd_dev->bdev;
1574                 from.sector = cache_sector(wc, e);
1575                 from.count = n_sectors;
1576                 to.bdev = wc->dev->bdev;
1577                 to.sector = read_original_sector(wc, e);
1578                 to.count = n_sectors;
1579 
1580                 c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1581                 c->wc = wc;
1582                 c->e = e;
1583                 c->n_entries = e->wc_list_contiguous;
1584 
1585                 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1586                         wbl->size--;
1587                         f = container_of(wbl->list.prev, struct wc_entry, lru);
1588                         BUG_ON(f != e + 1);
1589                         list_del(&f->lru);
1590                         e = f;
1591                 }
1592 
1593                 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1594 
1595                 __writeback_throttle(wc, wbl);
1596         }
1597 }
1598 
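     /*
      * The writeback work function. While the freelist is below the low
      * watermark (or writeback_all is set), pick entries from the LRU tail
      * (or walk the rb-tree when flushing everything), skip entries whose
      * older duplicate is still being written back, and grow each selected
      * entry with following entries that are contiguous on the origin device
      * (and, in SSD mode, physically contiguous in the cache), up to
      * BIO_MAX_PAGES. The collected list is then handed to the pmem or SSD
      * writeback routine under a block plug.
      */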
1599 static void writecache_writeback(struct work_struct *work)
1600 {
1601         struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1602         struct blk_plug plug;
1603         struct wc_entry *f, *g, *e = NULL;
1604         struct rb_node *node, *next_node;
1605         struct list_head skipped;
1606         struct writeback_list wbl;
1607         unsigned long n_walked;
1608 
1609         wc_lock(wc);
1610 restart:
1611         if (writecache_has_error(wc)) {
1612                 wc_unlock(wc);
1613                 return;
1614         }
1615 
1616         if (unlikely(wc->writeback_all)) {
1617                 if (writecache_wait_for_writeback(wc))
1618                         goto restart;
1619         }
1620 
1621         if (wc->overwrote_committed) {
1622                 writecache_wait_for_ios(wc, WRITE);
1623         }
1624 
1625         n_walked = 0;
1626         INIT_LIST_HEAD(&skipped);
1627         INIT_LIST_HEAD(&wbl.list);
1628         wbl.size = 0;
1629         while (!list_empty(&wc->lru) &&
1630                (wc->writeback_all ||
1631                 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
1632 
1633                 n_walked++;
1634                 if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1635                     likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
1636                         queue_work(wc->writeback_wq, &wc->writeback_work);
1637                         break;
1638                 }
1639 
1640                 if (unlikely(wc->writeback_all)) {
1641                         if (unlikely(!e)) {
1642                                 writecache_flush(wc);
1643                                 e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
1644                         } else
1645                                 e = g;
1646                 } else
1647                         e = container_of(wc->lru.prev, struct wc_entry, lru);
1648                 BUG_ON(e->write_in_progress);
1649                 if (unlikely(!writecache_entry_is_committed(wc, e))) {
1650                         writecache_flush(wc);
1651                 }
1652                 node = rb_prev(&e->rb_node);
1653                 if (node) {
1654                         f = container_of(node, struct wc_entry, rb_node);
1655                         if (unlikely(read_original_sector(wc, f) ==
1656                                      read_original_sector(wc, e))) {
1657                                 BUG_ON(!f->write_in_progress);
1658                                 list_del(&e->lru);
1659                                 list_add(&e->lru, &skipped);
1660                                 cond_resched();
1661                                 continue;
1662                         }
1663                 }
1664                 wc->writeback_size++;
1665                 list_del(&e->lru);
1666                 list_add(&e->lru, &wbl.list);
1667                 wbl.size++;
1668                 e->write_in_progress = true;
1669                 e->wc_list_contiguous = 1;
1670 
1671                 f = e;
1672 
1673                 while (1) {
1674                         next_node = rb_next(&f->rb_node);
1675                         if (unlikely(!next_node))
1676                                 break;
1677                         g = container_of(next_node, struct wc_entry, rb_node);
1678                         if (unlikely(read_original_sector(wc, g) ==
1679                             read_original_sector(wc, f))) {
1680                                 f = g;
1681                                 continue;
1682                         }
1683                         if (read_original_sector(wc, g) !=
1684                             read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1685                                 break;
1686                         if (unlikely(g->write_in_progress))
1687                                 break;
1688                         if (unlikely(!writecache_entry_is_committed(wc, g)))
1689                                 break;
1690 
1691                         if (!WC_MODE_PMEM(wc)) {
1692                                 if (g != f + 1)
1693                                         break;
1694                         }
1695 
1696                         n_walked++;
1697                         //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1698                         //      break;
1699 
1700                         wc->writeback_size++;
1701                         list_del(&g->lru);
1702                         list_add(&g->lru, &wbl.list);
1703                         wbl.size++;
1704                         g->write_in_progress = true;
1705                         g->wc_list_contiguous = BIO_MAX_PAGES;
1706                         f = g;
1707                         e->wc_list_contiguous++;
1708                         if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
1709                                 if (unlikely(wc->writeback_all)) {
1710                                         next_node = rb_next(&f->rb_node);
1711                                         if (likely(next_node))
1712                                                 g = container_of(next_node, struct wc_entry, rb_node);
1713                                 }
1714                                 break;
1715                         }
1716                 }
1717                 cond_resched();
1718         }
1719 
1720         if (!list_empty(&skipped)) {
1721                 list_splice_tail(&skipped, &wc->lru);
1722                 /*
1723                  * If we didn't make any progress, we must wait until some
1724                  * writeback finishes to avoid burning CPU in a loop
1725                  */
1726                 if (unlikely(!wbl.size))
1727                         writecache_wait_for_writeback(wc);
1728         }
1729 
1730         wc_unlock(wc);
1731 
1732         blk_start_plug(&plug);
1733 
1734         if (WC_MODE_PMEM(wc))
1735                 __writecache_writeback_pmem(wc, &wbl);
1736         else
1737                 __writecache_writeback_ssd(wc, &wbl);
1738 
1739         blk_finish_plug(&plug);
1740 
1741         if (unlikely(wc->writeback_all)) {
1742                 wc_lock(wc);
1743                 while (writecache_wait_for_writeback(wc));
1744                 wc_unlock(wc);
1745         }
1746 }
1747 
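     /*
      * Given the cache device size and the block size, compute how many
      * cache blocks fit after the superblock and the per-block metadata
      * entries. Optionally return the number of data blocks and the number
      * of metadata blocks.
      */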
1748 static int calculate_memory_size(uint64_t device_size, unsigned block_size,
1749                                  size_t *n_blocks_p, size_t *n_metadata_blocks_p)
1750 {
1751         uint64_t n_blocks, offset;
1752         struct wc_entry e;
1753 
1754         n_blocks = device_size;
1755         do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
1756 
1757         while (1) {
1758                 if (!n_blocks)
1759                         return -ENOSPC;
1760                 /* Verify the following entries[n_blocks] won't overflow */
1761                 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
1762                                  sizeof(struct wc_memory_entry)))
1763                         return -EFBIG;
1764                 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
1765                 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
1766                 if (offset + n_blocks * block_size <= device_size)
1767                         break;
1768                 n_blocks--;
1769         }
1770 
1771         /* check if the bit field overflows */
1772         e.index = n_blocks;
1773         if (e.index != n_blocks)
1774                 return -EFBIG;
1775 
1776         if (n_blocks_p)
1777                 *n_blocks_p = n_blocks;
1778         if (n_metadata_blocks_p)
1779                 *n_metadata_blocks_p = offset >> __ffs(block_size);
1780         return 0;
1781 }
1782 
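     /*
      * Format a fresh cache: write the superblock fields, mark every entry
      * empty and commit everything before the magic number is written, so
      * that an interrupted initialization leaves no valid magic behind.
      */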
1783 static int init_memory(struct dm_writecache *wc)
1784 {
1785         size_t b;
1786         int r;
1787 
1788         r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
1789         if (r)
1790                 return r;
1791 
1792         r = writecache_alloc_entries(wc);
1793         if (r)
1794                 return r;
1795 
1796         for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
1797                 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
1798         pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
1799         pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
1800         pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
1801         pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
1802 
1803         for (b = 0; b < wc->n_blocks; b++) {
1804                 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
1805                 cond_resched();
1806         }
1807 
1808         writecache_flush_all_metadata(wc);
1809         writecache_commit_flushed(wc, false);
1810         pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
1811         writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
1812         writecache_commit_flushed(wc, false);
1813 
1814         return 0;
1815 }
1816 
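     /*
      * Destructor: stop the kernel threads and release every resource the
      * constructor may have allocated. Safe to call on a partially
      * constructed target, which is why writecache_ctr() jumps here on
      * error.
      */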
1817 static void writecache_dtr(struct dm_target *ti)
1818 {
1819         struct dm_writecache *wc = ti->private;
1820 
1821         if (!wc)
1822                 return;
1823 
1824         if (wc->endio_thread)
1825                 kthread_stop(wc->endio_thread);
1826 
1827         if (wc->flush_thread)
1828                 kthread_stop(wc->flush_thread);
1829 
1830         bioset_exit(&wc->bio_set);
1831 
1832         mempool_exit(&wc->copy_pool);
1833 
1834         if (wc->writeback_wq)
1835                 destroy_workqueue(wc->writeback_wq);
1836 
1837         if (wc->dev)
1838                 dm_put_device(ti, wc->dev);
1839 
1840         if (wc->ssd_dev)
1841                 dm_put_device(ti, wc->ssd_dev);
1842 
1843         if (wc->entries)
1844                 vfree(wc->entries);
1845 
1846         if (wc->memory_map) {
1847                 if (WC_MODE_PMEM(wc))
1848                         persistent_memory_release(wc);
1849                 else
1850                         vfree(wc->memory_map);
1851         }
1852 
1853         if (wc->dm_kcopyd)
1854                 dm_kcopyd_client_destroy(wc->dm_kcopyd);
1855 
1856         if (wc->dm_io)
1857                 dm_io_client_destroy(wc->dm_io);
1858 
1859         if (wc->dirty_bitmap)
1860                 vfree(wc->dirty_bitmap);
1861 
1862         kfree(wc);
1863 }
1864 
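     /*
      * Constructor. Arguments:
      *    p|s <origin device> <cache device> <block size>
      *    [<#feature args> <feature args>...]
      * Allocate the in-core structures, map (pmem) or read (SSD) the cache
      * metadata, validate or initialize the superblock and compute the
      * freelist watermarks from the high/low watermark percentages.
      */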
1865 static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1866 {
1867         struct dm_writecache *wc;
1868         struct dm_arg_set as;
1869         const char *string;
1870         unsigned opt_params;
1871         size_t offset, data_size;
1872         int i, r;
1873         char dummy;
1874         int high_wm_percent = HIGH_WATERMARK;
1875         int low_wm_percent = LOW_WATERMARK;
1876         uint64_t x;
1877         struct wc_memory_superblock s;
1878 
1879         static struct dm_arg _args[] = {
1880                 {0, 10, "Invalid number of feature args"},
1881         };
1882 
1883         as.argc = argc;
1884         as.argv = argv;
1885 
1886         wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
1887         if (!wc) {
1888                 ti->error = "Cannot allocate writecache structure";
1889                 r = -ENOMEM;
1890                 goto bad;
1891         }
1892         ti->private = wc;
1893         wc->ti = ti;
1894 
1895         mutex_init(&wc->lock);
1896         writecache_poison_lists(wc);
1897         init_waitqueue_head(&wc->freelist_wait);
1898         timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
1899 
1900         for (i = 0; i < 2; i++) {
1901                 atomic_set(&wc->bio_in_progress[i], 0);
1902                 init_waitqueue_head(&wc->bio_in_progress_wait[i]);
1903         }
1904 
1905         wc->dm_io = dm_io_client_create();
1906         if (IS_ERR(wc->dm_io)) {
1907                 r = PTR_ERR(wc->dm_io);
1908                 ti->error = "Unable to allocate dm-io client";
1909                 wc->dm_io = NULL;
1910                 goto bad;
1911         }
1912 
1913         wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
1914         if (!wc->writeback_wq) {
1915                 r = -ENOMEM;
1916                 ti->error = "Could not allocate writeback workqueue";
1917                 goto bad;
1918         }
1919         INIT_WORK(&wc->writeback_work, writecache_writeback);
1920         INIT_WORK(&wc->flush_work, writecache_flush_work);
1921 
1922         raw_spin_lock_init(&wc->endio_list_lock);
1923         INIT_LIST_HEAD(&wc->endio_list);
1924         wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
1925         if (IS_ERR(wc->endio_thread)) {
1926                 r = PTR_ERR(wc->endio_thread);
1927                 wc->endio_thread = NULL;
1928                 ti->error = "Couldn't spawn endio thread";
1929                 goto bad;
1930         }
1931         wake_up_process(wc->endio_thread);
1932 
1933         /*
1934          * Parse the mode (pmem or ssd)
1935          */
1936         string = dm_shift_arg(&as);
1937         if (!string)
1938                 goto bad_arguments;
1939 
1940         if (!strcasecmp(string, "s")) {
1941                 wc->pmem_mode = false;
1942         } else if (!strcasecmp(string, "p")) {
1943 #ifdef DM_WRITECACHE_HAS_PMEM
1944                 wc->pmem_mode = true;
1945                 wc->writeback_fua = true;
1946 #else
1947                 /*
1948                  * If the architecture doesn't support persistent memory or
1949                  * the kernel doesn't support any DAX drivers, this driver can
1950                  * only be used in SSD-only mode.
1951                  */
1952                 r = -EOPNOTSUPP;
1953                 ti->error = "Persistent memory or DAX not supported on this system";
1954                 goto bad;
1955 #endif
1956         } else {
1957                 goto bad_arguments;
1958         }
1959 
1960         if (WC_MODE_PMEM(wc)) {
1961                 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
1962                                 offsetof(struct writeback_struct, bio),
1963                                 BIOSET_NEED_BVECS);
1964                 if (r) {
1965                         ti->error = "Could not allocate bio set";
1966                         goto bad;
1967                 }
1968         } else {
1969                 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
1970                 if (r) {
1971                         ti->error = "Could not allocate mempool";
1972                         goto bad;
1973                 }
1974         }
1975 
1976         /*
1977          * Parse the origin data device
1978          */
1979         string = dm_shift_arg(&as);
1980         if (!string)
1981                 goto bad_arguments;
1982         r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
1983         if (r) {
1984                 ti->error = "Origin data device lookup failed";
1985                 goto bad;
1986         }
1987 
1988         /*
1989          * Parse cache data device (be it pmem or ssd)
1990          */
1991         string = dm_shift_arg(&as);
1992         if (!string)
1993                 goto bad_arguments;
1994 
1995         r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
1996         if (r) {
1997                 ti->error = "Cache data device lookup failed";
1998                 goto bad;
1999         }
2000         wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
2001 
2002         /*
2003          * Parse the cache block size
2004          */
2005         string = dm_shift_arg(&as);
2006         if (!string)
2007                 goto bad_arguments;
2008         if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
2009             wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
2010             (wc->block_size & (wc->block_size - 1))) {
2011                 r = -EINVAL;
2012                 ti->error = "Invalid block size";
2013                 goto bad;
2014         }
2015         if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
2016             wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
2017                 r = -EINVAL;
2018                 ti->error = "Block size is smaller than device logical block size";
2019                 goto bad;
2020         }
2021         wc->block_size_bits = __ffs(wc->block_size);
2022 
2023         wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
2024         wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
2025         wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
2026 
2027         /*
2028          * Parse optional arguments
2029          */
2030         r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2031         if (r)
2032                 goto bad;
2033 
2034         while (opt_params) {
2035                 string = dm_shift_arg(&as), opt_params--;
2036                 if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
2037                         unsigned long long start_sector;
2038                         string = dm_shift_arg(&as), opt_params--;
2039                         if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
2040                                 goto invalid_optional;
2041                         wc->start_sector = start_sector;
2042                         if (wc->start_sector != start_sector ||
2043                             wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
2044                                 goto invalid_optional;
2045                 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
2046                         string = dm_shift_arg(&as), opt_params--;
2047                         if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
2048                                 goto invalid_optional;
2049                         if (high_wm_percent < 0 || high_wm_percent > 100)
2050                                 goto invalid_optional;
2051                         wc->high_wm_percent_set = true;
2052                 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2053                         string = dm_shift_arg(&as), opt_params--;
2054                         if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2055                                 goto invalid_optional;
2056                         if (low_wm_percent < 0 || low_wm_percent > 100)
2057                                 goto invalid_optional;
2058                         wc->low_wm_percent_set = true;
2059                 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2060                         string = dm_shift_arg(&as), opt_params--;
2061                         if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2062                                 goto invalid_optional;
2063                         wc->max_writeback_jobs_set = true;
2064                 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2065                         string = dm_shift_arg(&as), opt_params--;
2066                         if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2067                                 goto invalid_optional;
2068                         wc->autocommit_blocks_set = true;
2069                 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2070                         unsigned autocommit_msecs;
2071                         string = dm_shift_arg(&as), opt_params--;
2072                         if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2073                                 goto invalid_optional;
2074                         if (autocommit_msecs > 3600000)
2075                                 goto invalid_optional;
2076                         wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2077                         wc->autocommit_time_set = true;
2078                 } else if (!strcasecmp(string, "fua")) {
2079                         if (WC_MODE_PMEM(wc)) {
2080                                 wc->writeback_fua = true;
2081                                 wc->writeback_fua_set = true;
2082                         } else goto invalid_optional;
2083                 } else if (!strcasecmp(string, "nofua")) {
2084                         if (WC_MODE_PMEM(wc)) {
2085                                 wc->writeback_fua = false;
2086                                 wc->writeback_fua_set = true;
2087                         } else goto invalid_optional;
2088                 } else {
2089 invalid_optional:
2090                         r = -EINVAL;
2091                         ti->error = "Invalid optional argument";
2092                         goto bad;
2093                 }
2094         }
2095 
2096         if (high_wm_percent < low_wm_percent) {
2097                 r = -EINVAL;
2098                 ti->error = "High watermark must be greater than or equal to low watermark";
2099                 goto bad;
2100         }
2101 
2102         if (WC_MODE_PMEM(wc)) {
2103                 r = persistent_memory_claim(wc);
2104                 if (r) {
2105                         ti->error = "Unable to map persistent memory for cache";
2106                         goto bad;
2107                 }
2108         } else {
2109                 size_t n_blocks, n_metadata_blocks;
2110                 uint64_t n_bitmap_bits;
2111 
2112                 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2113 
2114                 bio_list_init(&wc->flush_list);
2115                 wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2116                 if (IS_ERR(wc->flush_thread)) {
2117                         r = PTR_ERR(wc->flush_thread);
2118                         wc->flush_thread = NULL;
2119                         ti->error = "Couldn't spawn flush thread";
2120                         goto bad;
2121                 }
2122                 wake_up_process(wc->flush_thread);
2123 
2124                 r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2125                                           &n_blocks, &n_metadata_blocks);
2126                 if (r) {
2127                         ti->error = "Invalid device size";
2128                         goto bad;
2129                 }
2130 
2131                 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2132                                  BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2133                 /* this is a limitation of the test_bit functions */
2134                 if (n_bitmap_bits > 1U << 31) {
2135                         r = -EFBIG;
2136                         ti->error = "Invalid device size";
2137                         goto bad;
2138                 }
2139 
2140                 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2141                 if (!wc->memory_map) {
2142                         r = -ENOMEM;
2143                         ti->error = "Unable to allocate memory for metadata";
2144                         goto bad;
2145                 }
2146 
2147                 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2148                 if (IS_ERR(wc->dm_kcopyd)) {
2149                         r = PTR_ERR(wc->dm_kcopyd);
2150                         ti->error = "Unable to allocate dm-kcopyd client";
2151                         wc->dm_kcopyd = NULL;
2152                         goto bad;
2153                 }
2154 
2155                 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2156                 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2157                         BITS_PER_LONG * sizeof(unsigned long);
2158                 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2159                 if (!wc->dirty_bitmap) {
2160                         r = -ENOMEM;
2161                         ti->error = "Unable to allocate dirty bitmap";
2162                         goto bad;
2163                 }
2164 
2165                 r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
2166                 if (r) {
2167                         ti->error = "Unable to read first block of metadata";
2168                         goto bad;
2169                 }
2170         }
2171 
2172         r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2173         if (r) {
2174                 ti->error = "Hardware memory error when reading superblock";
2175                 goto bad;
2176         }
2177         if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2178                 r = init_memory(wc);
2179                 if (r) {
2180                         ti->error = "Unable to initialize device";
2181                         goto bad;
2182                 }
2183                 r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2184                 if (r) {
2185                         ti->error = "Hardware memory error when reading superblock";
2186                         goto bad;
2187                 }
2188         }
2189 
2190         if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2191                 ti->error = "Invalid magic in the superblock";
2192                 r = -EINVAL;
2193                 goto bad;
2194         }
2195 
2196         if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2197                 ti->error = "Invalid version in the superblock";
2198                 r = -EINVAL;
2199                 goto bad;
2200         }
2201 
2202         if (le32_to_cpu(s.block_size) != wc->block_size) {
2203                 ti->error = "Block size does not match superblock";
2204                 r = -EINVAL;
2205                 goto bad;
2206         }
2207 
2208         wc->n_blocks = le64_to_cpu(s.n_blocks);
2209 
2210         offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2211         if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2212 overflow:
2213                 ti->error = "Overflow in size calculation";
2214                 r = -EINVAL;
2215                 goto bad;
2216         }
2217         offset += sizeof(struct wc_memory_superblock);
2218         if (offset < sizeof(struct wc_memory_superblock))
2219                 goto overflow;
2220         offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2221         data_size = wc->n_blocks * (size_t)wc->block_size;
2222         if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2223             (offset + data_size < offset))
2224                 goto overflow;
2225         if (offset + data_size > wc->memory_map_size) {
2226                 ti->error = "Memory area is too small";
2227                 r = -EINVAL;
2228                 goto bad;
2229         }
2230 
2231         wc->metadata_sectors = offset >> SECTOR_SHIFT;
2232         wc->block_start = (char *)sb(wc) + offset;
2233 
2234         x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2235         x += 50;
2236         do_div(x, 100);
2237         wc->freelist_high_watermark = x;
2238         x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2239         x += 50;
2240         do_div(x, 100);
2241         wc->freelist_low_watermark = x;
2242 
2243         r = writecache_alloc_entries(wc);
2244         if (r) {
2245                 ti->error = "Cannot allocate memory";
2246                 goto bad;
2247         }
2248 
2249         ti->num_flush_bios = 1;
2250         ti->flush_supported = true;
2251         ti->num_discard_bios = 1;
2252 
2253         if (WC_MODE_PMEM(wc))
2254                 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2255 
2256         return 0;
2257 
2258 bad_arguments:
2259         r = -EINVAL;
2260         ti->error = "Bad arguments";
2261 bad:
2262         writecache_dtr(ti);
2263         return r;
2264 }
2265 
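     /*
      * Status: report the error state and the total/free/under-writeback
      * block counts for STATUSTYPE_INFO, and reconstruct the table line
      * (including any non-default optional arguments) for STATUSTYPE_TABLE.
      */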
2266 static void writecache_status(struct dm_target *ti, status_type_t type,
2267                               unsigned status_flags, char *result, unsigned maxlen)
2268 {
2269         struct dm_writecache *wc = ti->private;
2270         unsigned extra_args;
2271         unsigned sz = 0;
2272         uint64_t x;
2273 
2274         switch (type) {
2275         case STATUSTYPE_INFO:
2276                 DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2277                        (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2278                        (unsigned long long)wc->writeback_size);
2279                 break;
2280         case STATUSTYPE_TABLE:
2281                 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2282                                 wc->dev->name, wc->ssd_dev->name, wc->block_size);
2283                 extra_args = 0;
2284                 if (wc->start_sector)
2285                         extra_args += 2;
2286                 if (wc->high_wm_percent_set)
2287                         extra_args += 2;
2288                 if (wc->low_wm_percent_set)
2289                         extra_args += 2;
2290                 if (wc->max_writeback_jobs_set)
2291                         extra_args += 2;
2292                 if (wc->autocommit_blocks_set)
2293                         extra_args += 2;
2294                 if (wc->autocommit_time_set)
2295                         extra_args += 2;
2296                 if (wc->writeback_fua_set)
2297                         extra_args++;
2298 
2299                 DMEMIT("%u", extra_args);
2300                 if (wc->start_sector)
2301                         DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
2302                 if (wc->high_wm_percent_set) {
2303                         x = (uint64_t)wc->freelist_high_watermark * 100;
2304                         x += wc->n_blocks / 2;
2305                         do_div(x, (size_t)wc->n_blocks);
2306                         DMEMIT(" high_watermark %u", 100 - (unsigned)x);
2307                 }
2308                 if (wc->low_wm_percent_set) {
2309                         x = (uint64_t)wc->freelist_low_watermark * 100;
2310                         x += wc->n_blocks / 2;
2311                         do_div(x, (size_t)wc->n_blocks);
2312                         DMEMIT(" low_watermark %u", 100 - (unsigned)x);
2313                 }
2314                 if (wc->max_writeback_jobs_set)
2315                         DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2316                 if (wc->autocommit_blocks_set)
2317                         DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2318                 if (wc->autocommit_time_set)
2319                         DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
2320                 if (wc->writeback_fua_set)
2321                         DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2322                 break;
2323         }
2324 }
2325 
2326 static struct target_type writecache_target = {
2327         .name                   = "writecache",
2328         .version                = {1, 1, 1},
2329         .module                 = THIS_MODULE,
2330         .ctr                    = writecache_ctr,
2331         .dtr                    = writecache_dtr,
2332         .status                 = writecache_status,
2333         .postsuspend            = writecache_suspend,
2334         .resume                 = writecache_resume,
2335         .message                = writecache_message,
2336         .map                    = writecache_map,
2337         .end_io                 = writecache_end_io,
2338         .iterate_devices        = writecache_iterate_devices,
2339         .io_hints               = writecache_io_hints,
2340 };
2341 
2342 static int __init dm_writecache_init(void)
2343 {
2344         int r;
2345 
2346         r = dm_register_target(&writecache_target);
2347         if (r < 0) {
2348                 DMERR("register failed %d", r);
2349                 return r;
2350         }
2351 
2352         return 0;
2353 }
2354 
2355 static void __exit dm_writecache_exit(void)
2356 {
2357         dm_unregister_target(&writecache_target);
2358 }
2359 
2360 module_init(dm_writecache_init);
2361 module_exit(dm_writecache_exit);
2362 
2363 MODULE_DESCRIPTION(DM_NAME " writecache target");
2364 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2365 MODULE_LICENSE("GPL");
