1/* 2 * Copyright (C) 2012 Red Hat, Inc. 3 * 4 * This file is released under the GPL. 5 */ 6 7#include "dm-array.h" 8#include "dm-space-map.h" 9#include "dm-transaction-manager.h" 10 11#include <linux/export.h> 12#include <linux/device-mapper.h> 13 14#define DM_MSG_PREFIX "array" 15 16/*----------------------------------------------------------------*/ 17 18/* 19 * The array is implemented as a fully populated btree, which points to 20 * blocks that contain the packed values. This is more space efficient 21 * than just using a btree since we don't store 1 key per value. 22 */ 23struct array_block { 24 __le32 csum; 25 __le32 max_entries; 26 __le32 nr_entries; 27 __le32 value_size; 28 __le64 blocknr; /* Block this node is supposed to live in. */ 29} __packed; 30 31/*----------------------------------------------------------------*/ 32 33/* 34 * Validator methods. As usual we calculate a checksum, and also write the 35 * block location into the header (paranoia about ssds remapping areas by 36 * mistake). 37 */ 38#define CSUM_XOR 595846735 39 40static void array_block_prepare_for_write(struct dm_block_validator *v, 41 struct dm_block *b, 42 size_t size_of_block) 43{ 44 struct array_block *bh_le = dm_block_data(b); 45 46 bh_le->blocknr = cpu_to_le64(dm_block_location(b)); 47 bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, 48 size_of_block - sizeof(__le32), 49 CSUM_XOR)); 50} 51 52static int array_block_check(struct dm_block_validator *v, 53 struct dm_block *b, 54 size_t size_of_block) 55{ 56 struct array_block *bh_le = dm_block_data(b); 57 __le32 csum_disk; 58 59 if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) { 60 DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu", 61 (unsigned long long) le64_to_cpu(bh_le->blocknr), 62 (unsigned long long) dm_block_location(b)); 63 return -ENOTBLK; 64 } 65 66 csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, 67 size_of_block - sizeof(__le32), 68 CSUM_XOR)); 69 if (csum_disk != bh_le->csum) { 70 DMERR_LIMIT("array_block_check failed: csum %u != wanted %u", 71 (unsigned) le32_to_cpu(csum_disk), 72 (unsigned) le32_to_cpu(bh_le->csum)); 73 return -EILSEQ; 74 } 75 76 return 0; 77} 78 79static struct dm_block_validator array_validator = { 80 .name = "array", 81 .prepare_for_write = array_block_prepare_for_write, 82 .check = array_block_check 83}; 84 85/*----------------------------------------------------------------*/ 86 87/* 88 * Functions for manipulating the array blocks. 89 */ 90 91/* 92 * Returns a pointer to a value within an array block. 93 * 94 * index - The index into _this_ specific block. 95 */ 96static void *element_at(struct dm_array_info *info, struct array_block *ab, 97 unsigned index) 98{ 99 unsigned char *entry = (unsigned char *) (ab + 1); 100 101 entry += index * info->value_type.size; 102 103 return entry; 104} 105 106/* 107 * Utility function that calls one of the value_type methods on every value 108 * in an array block. 109 */ 110static void on_entries(struct dm_array_info *info, struct array_block *ab, 111 void (*fn)(void *, const void *)) 112{ 113 unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); 114 115 for (i = 0; i < nr_entries; i++) 116 fn(info->value_type.context, element_at(info, ab, i)); 117} 118 119/* 120 * Increment every value in an array block. 121 */ 122static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab) 123{ 124 struct dm_btree_value_type *vt = &info->value_type; 125 126 if (vt->inc) 127 on_entries(info, ab, vt->inc); 128} 129 130/* 131 * Decrement every value in an array block. 132 */ 133static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab) 134{ 135 struct dm_btree_value_type *vt = &info->value_type; 136 137 if (vt->dec) 138 on_entries(info, ab, vt->dec); 139} 140 141/* 142 * Each array block can hold this many values. 143 */ 144static uint32_t calc_max_entries(size_t value_size, size_t size_of_block) 145{ 146 return (size_of_block - sizeof(struct array_block)) / value_size; 147} 148 149/* 150 * Allocate a new array block. The caller will need to unlock block. 151 */ 152static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, 153 uint32_t max_entries, 154 struct dm_block **block, struct array_block **ab) 155{ 156 int r; 157 158 r = dm_tm_new_block(info->btree_info.tm, &array_validator, block); 159 if (r) 160 return r; 161 162 (*ab) = dm_block_data(*block); 163 (*ab)->max_entries = cpu_to_le32(max_entries); 164 (*ab)->nr_entries = cpu_to_le32(0); 165 (*ab)->value_size = cpu_to_le32(info->value_type.size); 166 167 return 0; 168} 169 170/* 171 * Pad an array block out with a particular value. Every instance will 172 * cause an increment of the value_type. new_nr must always be more than 173 * the current number of entries. 174 */ 175static void fill_ablock(struct dm_array_info *info, struct array_block *ab, 176 const void *value, unsigned new_nr) 177{ 178 unsigned i; 179 uint32_t nr_entries; 180 struct dm_btree_value_type *vt = &info->value_type; 181 182 BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); 183 BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); 184 185 nr_entries = le32_to_cpu(ab->nr_entries); 186 for (i = nr_entries; i < new_nr; i++) { 187 if (vt->inc) 188 vt->inc(vt->context, value); 189 memcpy(element_at(info, ab, i), value, vt->size); 190 } 191 ab->nr_entries = cpu_to_le32(new_nr); 192} 193 194/* 195 * Remove some entries from the back of an array block. Every value 196 * removed will be decremented. new_nr must be <= the current number of 197 * entries. 198 */ 199static void trim_ablock(struct dm_array_info *info, struct array_block *ab, 200 unsigned new_nr) 201{ 202 unsigned i; 203 uint32_t nr_entries; 204 struct dm_btree_value_type *vt = &info->value_type; 205 206 BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); 207 BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); 208 209 nr_entries = le32_to_cpu(ab->nr_entries); 210 for (i = nr_entries; i > new_nr; i--) 211 if (vt->dec) 212 vt->dec(vt->context, element_at(info, ab, i - 1)); 213 ab->nr_entries = cpu_to_le32(new_nr); 214} 215 216/* 217 * Read locks a block, and coerces it to an array block. The caller must 218 * unlock 'block' when finished. 219 */ 220static int get_ablock(struct dm_array_info *info, dm_block_t b, 221 struct dm_block **block, struct array_block **ab) 222{ 223 int r; 224 225 r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block); 226 if (r) 227 return r; 228 229 *ab = dm_block_data(*block); 230 return 0; 231} 232 233/* 234 * Unlocks an array block. 235 */ 236static int unlock_ablock(struct dm_array_info *info, struct dm_block *block) 237{ 238 return dm_tm_unlock(info->btree_info.tm, block); 239} 240 241/*----------------------------------------------------------------*/ 242 243/* 244 * Btree manipulation. 245 */ 246 247/* 248 * Looks up an array block in the btree, and then read locks it. 249 * 250 * index is the index of the index of the array_block, (ie. the array index 251 * / max_entries). 252 */ 253static int lookup_ablock(struct dm_array_info *info, dm_block_t root, 254 unsigned index, struct dm_block **block, 255 struct array_block **ab) 256{ 257 int r; 258 uint64_t key = index; 259 __le64 block_le; 260 261 r = dm_btree_lookup(&info->btree_info, root, &key, &block_le); 262 if (r) 263 return r; 264 265 return get_ablock(info, le64_to_cpu(block_le), block, ab); 266} 267 268/* 269 * Insert an array block into the btree. The block is _not_ unlocked. 270 */ 271static int insert_ablock(struct dm_array_info *info, uint64_t index, 272 struct dm_block *block, dm_block_t *root) 273{ 274 __le64 block_le = cpu_to_le64(dm_block_location(block)); 275 276 __dm_bless_for_disk(block_le); 277 return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root); 278} 279 280/* 281 * Looks up an array block in the btree. Then shadows it, and updates the 282 * btree to point to this new shadow. 'root' is an input/output parameter 283 * for both the current root block, and the new one. 284 */ 285static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, 286 unsigned index, struct dm_block **block, 287 struct array_block **ab) 288{ 289 int r, inc; 290 uint64_t key = index; 291 dm_block_t b; 292 __le64 block_le; 293 294 /* 295 * lookup 296 */ 297 r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le); 298 if (r) 299 return r; 300 b = le64_to_cpu(block_le); 301 302 /* 303 * shadow 304 */ 305 r = dm_tm_shadow_block(info->btree_info.tm, b, 306 &array_validator, block, &inc); 307 if (r) 308 return r; 309 310 *ab = dm_block_data(*block); 311 if (inc) 312 inc_ablock_entries(info, *ab); 313 314 /* 315 * Reinsert. 316 * 317 * The shadow op will often be a noop. Only insert if it really 318 * copied data. 319 */ 320 if (dm_block_location(*block) != b) { 321 /* 322 * dm_tm_shadow_block will have already decremented the old 323 * block, but it is still referenced by the btree. We 324 * increment to stop the insert decrementing it below zero 325 * when overwriting the old value. 326 */ 327 dm_tm_inc(info->btree_info.tm, b); 328 r = insert_ablock(info, index, *block, root); 329 } 330 331 return r; 332} 333 334/* 335 * Allocate an new array block, and fill it with some values. 336 */ 337static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block, 338 uint32_t max_entries, 339 unsigned block_index, uint32_t nr, 340 const void *value, dm_block_t *root) 341{ 342 int r; 343 struct dm_block *block; 344 struct array_block *ab; 345 346 r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); 347 if (r) 348 return r; 349 350 fill_ablock(info, ab, value, nr); 351 r = insert_ablock(info, block_index, block, root); 352 unlock_ablock(info, block); 353 354 return r; 355} 356 357static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block, 358 unsigned begin_block, unsigned end_block, 359 unsigned max_entries, const void *value, 360 dm_block_t *root) 361{ 362 int r = 0; 363 364 for (; !r && begin_block != end_block; begin_block++) 365 r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root); 366 367 return r; 368} 369 370/* 371 * There are a bunch of functions involved with resizing an array. This 372 * structure holds information that commonly needed by them. Purely here 373 * to reduce parameter count. 374 */ 375struct resize { 376 /* 377 * Describes the array. 378 */ 379 struct dm_array_info *info; 380 381 /* 382 * The current root of the array. This gets updated. 383 */ 384 dm_block_t root; 385 386 /* 387 * Metadata block size. Used to calculate the nr entries in an 388 * array block. 389 */ 390 size_t size_of_block; 391 392 /* 393 * Maximum nr entries in an array block. 394 */ 395 unsigned max_entries; 396 397 /* 398 * nr of completely full blocks in the array. 399 * 400 * 'old' refers to before the resize, 'new' after. 401 */ 402 unsigned old_nr_full_blocks, new_nr_full_blocks; 403 404 /* 405 * Number of entries in the final block. 0 iff only full blocks in 406 * the array. 407 */ 408 unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block; 409 410 /* 411 * The default value used when growing the array. 412 */ 413 const void *value; 414}; 415 416/* 417 * Removes a consecutive set of array blocks from the btree. The values 418 * in block are decremented as a side effect of the btree remove. 419 * 420 * begin_index - the index of the first array block to remove. 421 * end_index - the one-past-the-end value. ie. this block is not removed. 422 */ 423static int drop_blocks(struct resize *resize, unsigned begin_index, 424 unsigned end_index) 425{ 426 int r; 427 428 while (begin_index != end_index) { 429 uint64_t key = begin_index++; 430 r = dm_btree_remove(&resize->info->btree_info, resize->root, 431 &key, &resize->root); 432 if (r) 433 return r; 434 } 435 436 return 0; 437} 438 439/* 440 * Calculates how many blocks are needed for the array. 441 */ 442static unsigned total_nr_blocks_needed(unsigned nr_full_blocks, 443 unsigned nr_entries_in_last_block) 444{ 445 return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0); 446} 447 448/* 449 * Shrink an array. 450 */ 451static int shrink(struct resize *resize) 452{ 453 int r; 454 unsigned begin, end; 455 struct dm_block *block; 456 struct array_block *ab; 457 458 /* 459 * Lose some blocks from the back? 460 */ 461 if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) { 462 begin = total_nr_blocks_needed(resize->new_nr_full_blocks, 463 resize->new_nr_entries_in_last_block); 464 end = total_nr_blocks_needed(resize->old_nr_full_blocks, 465 resize->old_nr_entries_in_last_block); 466 467 r = drop_blocks(resize, begin, end); 468 if (r) 469 return r; 470 } 471 472 /* 473 * Trim the new tail block 474 */ 475 if (resize->new_nr_entries_in_last_block) { 476 r = shadow_ablock(resize->info, &resize->root, 477 resize->new_nr_full_blocks, &block, &ab); 478 if (r) 479 return r; 480 481 trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block); 482 unlock_ablock(resize->info, block); 483 } 484 485 return 0; 486} 487 488/* 489 * Grow an array. 490 */ 491static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries) 492{ 493 int r; 494 struct dm_block *block; 495 struct array_block *ab; 496 497 r = shadow_ablock(resize->info, &resize->root, 498 resize->old_nr_full_blocks, &block, &ab); 499 if (r) 500 return r; 501 502 fill_ablock(resize->info, ab, resize->value, new_nr_entries); 503 unlock_ablock(resize->info, block); 504 505 return r; 506} 507 508static int grow_add_tail_block(struct resize *resize) 509{ 510 return insert_new_ablock(resize->info, resize->size_of_block, 511 resize->max_entries, 512 resize->new_nr_full_blocks, 513 resize->new_nr_entries_in_last_block, 514 resize->value, &resize->root); 515} 516 517static int grow_needs_more_blocks(struct resize *resize) 518{ 519 int r; 520 unsigned old_nr_blocks = resize->old_nr_full_blocks; 521 522 if (resize->old_nr_entries_in_last_block > 0) { 523 old_nr_blocks++; 524 525 r = grow_extend_tail_block(resize, resize->max_entries); 526 if (r) 527 return r; 528 } 529 530 r = insert_full_ablocks(resize->info, resize->size_of_block, 531 old_nr_blocks, 532 resize->new_nr_full_blocks, 533 resize->max_entries, resize->value, 534 &resize->root); 535 if (r) 536 return r; 537 538 if (resize->new_nr_entries_in_last_block) 539 r = grow_add_tail_block(resize); 540 541 return r; 542} 543 544static int grow(struct resize *resize) 545{ 546 if (resize->new_nr_full_blocks > resize->old_nr_full_blocks) 547 return grow_needs_more_blocks(resize); 548 549 else if (resize->old_nr_entries_in_last_block) 550 return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block); 551 552 else 553 return grow_add_tail_block(resize); 554} 555 556/*----------------------------------------------------------------*/ 557 558/* 559 * These are the value_type functions for the btree elements, which point 560 * to array blocks. 561 */ 562static void block_inc(void *context, const void *value) 563{ 564 __le64 block_le; 565 struct dm_array_info *info = context; 566 567 memcpy(&block_le, value, sizeof(block_le)); 568 dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); 569} 570 571static void block_dec(void *context, const void *value) 572{ 573 int r; 574 uint64_t b; 575 __le64 block_le; 576 uint32_t ref_count; 577 struct dm_block *block; 578 struct array_block *ab; 579 struct dm_array_info *info = context; 580 581 memcpy(&block_le, value, sizeof(block_le)); 582 b = le64_to_cpu(block_le); 583 584 r = dm_tm_ref(info->btree_info.tm, b, &ref_count); 585 if (r) { 586 DMERR_LIMIT("couldn't get reference count for block %llu", 587 (unsigned long long) b); 588 return; 589 } 590 591 if (ref_count == 1) { 592 /* 593 * We're about to drop the last reference to this ablock. 594 * So we need to decrement the ref count of the contents. 595 */ 596 r = get_ablock(info, b, &block, &ab); 597 if (r) { 598 DMERR_LIMIT("couldn't get array block %llu", 599 (unsigned long long) b); 600 return; 601 } 602 603 dec_ablock_entries(info, ab); 604 unlock_ablock(info, block); 605 } 606 607 dm_tm_dec(info->btree_info.tm, b); 608} 609 610static int block_equal(void *context, const void *value1, const void *value2) 611{ 612 return !memcmp(value1, value2, sizeof(__le64)); 613} 614 615/*----------------------------------------------------------------*/ 616 617void dm_array_info_init(struct dm_array_info *info, 618 struct dm_transaction_manager *tm, 619 struct dm_btree_value_type *vt) 620{ 621 struct dm_btree_value_type *bvt = &info->btree_info.value_type; 622 623 memcpy(&info->value_type, vt, sizeof(info->value_type)); 624 info->btree_info.tm = tm; 625 info->btree_info.levels = 1; 626 627 bvt->context = info; 628 bvt->size = sizeof(__le64); 629 bvt->inc = block_inc; 630 bvt->dec = block_dec; 631 bvt->equal = block_equal; 632} 633EXPORT_SYMBOL_GPL(dm_array_info_init); 634 635int dm_array_empty(struct dm_array_info *info, dm_block_t *root) 636{ 637 return dm_btree_empty(&info->btree_info, root); 638} 639EXPORT_SYMBOL_GPL(dm_array_empty); 640 641static int array_resize(struct dm_array_info *info, dm_block_t root, 642 uint32_t old_size, uint32_t new_size, 643 const void *value, dm_block_t *new_root) 644{ 645 int r; 646 struct resize resize; 647 648 if (old_size == new_size) { 649 *new_root = root; 650 return 0; 651 } 652 653 resize.info = info; 654 resize.root = root; 655 resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); 656 resize.max_entries = calc_max_entries(info->value_type.size, 657 resize.size_of_block); 658 659 resize.old_nr_full_blocks = old_size / resize.max_entries; 660 resize.old_nr_entries_in_last_block = old_size % resize.max_entries; 661 resize.new_nr_full_blocks = new_size / resize.max_entries; 662 resize.new_nr_entries_in_last_block = new_size % resize.max_entries; 663 resize.value = value; 664 665 r = ((new_size > old_size) ? grow : shrink)(&resize); 666 if (r) 667 return r; 668 669 *new_root = resize.root; 670 return 0; 671} 672 673int dm_array_resize(struct dm_array_info *info, dm_block_t root, 674 uint32_t old_size, uint32_t new_size, 675 const void *value, dm_block_t *new_root) 676 __dm_written_to_disk(value) 677{ 678 int r = array_resize(info, root, old_size, new_size, value, new_root); 679 __dm_unbless_for_disk(value); 680 return r; 681} 682EXPORT_SYMBOL_GPL(dm_array_resize); 683 684int dm_array_del(struct dm_array_info *info, dm_block_t root) 685{ 686 return dm_btree_del(&info->btree_info, root); 687} 688EXPORT_SYMBOL_GPL(dm_array_del); 689 690int dm_array_get_value(struct dm_array_info *info, dm_block_t root, 691 uint32_t index, void *value_le) 692{ 693 int r; 694 struct dm_block *block; 695 struct array_block *ab; 696 size_t size_of_block; 697 unsigned entry, max_entries; 698 699 size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); 700 max_entries = calc_max_entries(info->value_type.size, size_of_block); 701 702 r = lookup_ablock(info, root, index / max_entries, &block, &ab); 703 if (r) 704 return r; 705 706 entry = index % max_entries; 707 if (entry >= le32_to_cpu(ab->nr_entries)) 708 r = -ENODATA; 709 else 710 memcpy(value_le, element_at(info, ab, entry), 711 info->value_type.size); 712 713 unlock_ablock(info, block); 714 return r; 715} 716EXPORT_SYMBOL_GPL(dm_array_get_value); 717 718static int array_set_value(struct dm_array_info *info, dm_block_t root, 719 uint32_t index, const void *value, dm_block_t *new_root) 720{ 721 int r; 722 struct dm_block *block; 723 struct array_block *ab; 724 size_t size_of_block; 725 unsigned max_entries; 726 unsigned entry; 727 void *old_value; 728 struct dm_btree_value_type *vt = &info->value_type; 729 730 size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); 731 max_entries = calc_max_entries(info->value_type.size, size_of_block); 732 733 r = shadow_ablock(info, &root, index / max_entries, &block, &ab); 734 if (r) 735 return r; 736 *new_root = root; 737 738 entry = index % max_entries; 739 if (entry >= le32_to_cpu(ab->nr_entries)) { 740 r = -ENODATA; 741 goto out; 742 } 743 744 old_value = element_at(info, ab, entry); 745 if (vt->dec && 746 (!vt->equal || !vt->equal(vt->context, old_value, value))) { 747 vt->dec(vt->context, old_value); 748 if (vt->inc) 749 vt->inc(vt->context, value); 750 } 751 752 memcpy(old_value, value, info->value_type.size); 753 754out: 755 unlock_ablock(info, block); 756 return r; 757} 758 759int dm_array_set_value(struct dm_array_info *info, dm_block_t root, 760 uint32_t index, const void *value, dm_block_t *new_root) 761 __dm_written_to_disk(value) 762{ 763 int r; 764 765 r = array_set_value(info, root, index, value, new_root); 766 __dm_unbless_for_disk(value); 767 return r; 768} 769EXPORT_SYMBOL_GPL(dm_array_set_value); 770 771struct walk_info { 772 struct dm_array_info *info; 773 int (*fn)(void *context, uint64_t key, void *leaf); 774 void *context; 775}; 776 777static int walk_ablock(void *context, uint64_t *keys, void *leaf) 778{ 779 struct walk_info *wi = context; 780 781 int r; 782 unsigned i; 783 __le64 block_le; 784 unsigned nr_entries, max_entries; 785 struct dm_block *block; 786 struct array_block *ab; 787 788 memcpy(&block_le, leaf, sizeof(block_le)); 789 r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab); 790 if (r) 791 return r; 792 793 max_entries = le32_to_cpu(ab->max_entries); 794 nr_entries = le32_to_cpu(ab->nr_entries); 795 for (i = 0; i < nr_entries; i++) { 796 r = wi->fn(wi->context, keys[0] * max_entries + i, 797 element_at(wi->info, ab, i)); 798 799 if (r) 800 break; 801 } 802 803 unlock_ablock(wi->info, block); 804 return r; 805} 806 807int dm_array_walk(struct dm_array_info *info, dm_block_t root, 808 int (*fn)(void *, uint64_t key, void *leaf), 809 void *context) 810{ 811 struct walk_info wi; 812 813 wi.info = info; 814 wi.fn = fn; 815 wi.context = context; 816 817 return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi); 818} 819EXPORT_SYMBOL_GPL(dm_array_walk); 820 821/*----------------------------------------------------------------*/ 822