/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */
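/*
 * The sysctl tables below expose the two module-wide limits as
 * /proc/sys/dev/raid/speed_limit_min and speed_limit_max, while
 * speed_min()/speed_max() let an individual array override them through
 * its sync_speed_min/sync_speed_max sysfs attributes whenever those are
 * set to a non-zero value.
 */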

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/* bio_clone_mddev
 * like bio_clone, but with a local bio set
 */

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	struct bio *b;

	if (!mddev || !mddev->bio_set)
		return bio_alloc(gfp_mask, nr_iovecs);

	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
	if (!b)
		return NULL;
	return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
			    struct mddev *mddev)
{
	if (!mddev || !mddev->bio_set)
		return bio_clone(bio, gfp_mask);

	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
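
/*
 * A sketch of how for_each_mddev() is typically used (real users such as
 * md_exit() and md_notify_reboot() appear later in this file):
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		...	(a reference to the current mddev is held here)
 *	}
 *
 * Anything that breaks out of the loop early still owns a reference to
 * the current mddev and must drop it with mddev_put().
 */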

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device
 * is being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static void md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;
	int cpu;

	if (mddev == NULL || mddev->pers == NULL
	    || !mddev->ready) {
		bio_io_error(bio);
		return;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
		return;
	}
	smp_rmb(); /* Ensure implications of 'active' are visible */
	rcu_read_lock();
	if (mddev->suspended) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	/* bio could be mergeable after passing to underlayer */
	bio->bi_rw &= ~REQ_NOMERGE;
	mddev->pers->make_request(mddev, bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
	part_stat_unlock();

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	BUG_ON(mddev->suspended);
	mddev->suspended = 1;
	synchronize_rcu();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	mddev->suspended = 0;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
}

static int md_mergeable_bvec(struct request_queue *q,
			     struct bvec_merge_data *bvm,
			     struct bio_vec *biovec)
{
	struct mddev *mddev = q->queuedata;
	int ret;
	rcu_read_lock();
	if (mddev->suspended) {
		/* Must always allow one vec */
		if (bvm->bi_size == 0)
			ret = biovec->bv_len;
		else
			ret = 0;
	} else {
		struct md_personality *pers = mddev->pers;
		if (pers && pers->mergeable_bvec)
			ret = pers->mergeable_bvec(mddev, bvm, biovec);
		else
			ret = biovec->bv_len;
	}
	rcu_read_unlock();
	return ret;
}
/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio, int err)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_FLUSH, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	if (bio->bi_iter.bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio, 0);
	else {
		bio->bi_rw &= ~REQ_FLUSH;
		mddev->pers->make_request(mddev, bio);
	}

	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);
}
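
/*
 * Flush handling in brief: md_flush_request() (below) parks the incoming
 * REQ_FLUSH bio in mddev->flush_bio and schedules submit_flushes(), which
 * sends an empty WRITE_FLUSH to every in-sync rdev; once all of those
 * complete, md_submit_flush_data() either ends an empty barrier bio or
 * re-issues the original bio to the personality with REQ_FLUSH cleared.
 */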

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio,
			    mddev->lock);
	mddev->flush_bio = bio;
	spin_unlock_irq(&mddev->lock);

	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);

void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct mddev *mddev = cb->data;
	md_wakeup_thread(mddev->thread);
	kfree(cb);
}
EXPORT_SYMBOL(md_unplug);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(struct mddev *mddev)
{
	struct bio_set *bs = NULL;

	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);
		bs = mddev->bio_set;
		mddev->bio_set = NULL;
		if (mddev->gendisk) {
			/* We did a probe so need to clean up.  Call
			 * queue_work inside the spinlock so that
			 * flush_workqueue() after mddev_find will
			 * succeed in waiting for the work to be done.
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
			queue_work(md_misc_wq, &mddev->del_work);
		} else
			kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
	if (bs)
		bioset_free(bs);
}

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	init_timer(&mddev->safemode_timer);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}

static struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So hold sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}
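
/*
 * rdev->sb_page holds the raw superblock image for one member device: it
 * is allocated by alloc_disk_sb(), filled by read_disk_sb() or by the
 * sync_super methods further down, and released again in md_rdev_clear().
 */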
static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -ENOMEM;
	}

	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	kfree(rdev->badblocks.page);
	rdev->badblocks.page = NULL;
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio, int error)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		printk("md: super_written gets error=%d, uptodate=%d\n",
		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
		md_error(mddev, rdev);
	}

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	bio_put(bio);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	atomic_inc(&mddev->pending_writes);
	submit_bio(WRITE_FLUSH_FUA, bio);
}

void md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
}
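
/*
 * A sketch of the usual superblock-update pattern (the real caller is
 * md_update_sb(), later in this file): queue one md_super_write() per
 * member device, then wait for the whole batch to complete.
 *
 *	rdev_for_each(rdev, mddev)
 *		md_super_write(mddev, rdev, rdev->sb_start,
 *			       rdev->sb_size, rdev->sb_page);
 *	md_super_wait(mddev);
 */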

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int rw, bool metadata_op)
{
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
	int ret;

	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
		rdev->meta_bdev : rdev->bdev;
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);
	submit_bio_wait(rw, bio);

	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	char b[BDEVNAME_SIZE];

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
		bdevname(rdev->bdev,b));
	return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *        so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type {
	char			*name;
	struct module		*owner;
	int			(*load_super)(struct md_rdev *rdev,
					      struct md_rdev *refdev,
					      int minor_version);
	int			(*validate_super)(struct mddev *mddev,
						  struct md_rdev *rdev);
	void			(*sync_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	unsigned long long	(*rdev_size_change)(struct md_rdev *rdev,
						    sector_t num_sectors);
	int			(*allow_new_offset)(struct md_rdev *rdev,
						    unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (2ULL << 32) - 2;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
			    int acknowledged);
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
		       bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

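	/* sb_size is 256 bytes of header plus two bytes per device role,
	 * rounded up to the bdev's logical block size just below.
	 */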
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, READ, true))
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (md_set_badblocks(&rdev->badblocks,
					     sector, count, 1) == 0)
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = 0xffff;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case 0xffff: /* spare */
			break;
		case 0xfffe: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		default:
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	if (rdev->raid_disk >= 0 &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		u64 *bbp = (u64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

retry:
			seq = read_seqbegin(&bb->lock);

			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(0xfffe);

	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(0xfffe);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(0xffff);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = rdev->sb_start;
	sb->sb_csum = calc_sb_1_csum(sb);
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;

}

static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
	 * beyond end of badblocks
	 * beyond write-intent bitmap
	 */
	if (rdev->sb_start + (32+4)*2 > new_offset)
		return 0;
	bitmap = rdev->mddev->bitmap;
	if (bitmap && !rdev->mddev->bitmap_info.file &&
	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
		return 0;
	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
		return 0;

	return 1;
}

static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};

static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
{
	if (mddev->sync_super) {
		mddev->sync_super(mddev, rdev);
		return;
	}

	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));

	super_types[mddev->major_version].sync_super(mddev, rdev);
}

static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
{
	struct md_rdev *rdev, *rdev2;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev1)
		rdev_for_each_rcu(rdev2, mddev2)
			if (rdev->bdev->bd_contains ==
			    rdev2->bdev->bd_contains) {
				rcu_read_unlock();
				return 1;
			}
	rcu_read_unlock();
	return 0;
}

static LIST_HEAD(pending_raid_disks);

/*
 * Try to register data integrity profile for an mddev
 *
 * This is called when an array is started and after a disk has been kicked
 * from the array. It only succeeds if all working and active component devices
 * are integrity capable with matching profiles.
1954 */ 1955int md_integrity_register(struct mddev *mddev) 1956{ 1957 struct md_rdev *rdev, *reference = NULL; 1958 1959 if (list_empty(&mddev->disks)) 1960 return 0; /* nothing to do */ 1961 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1962 return 0; /* shouldn't register, or already is */ 1963 rdev_for_each(rdev, mddev) { 1964 /* skip spares and non-functional disks */ 1965 if (test_bit(Faulty, &rdev->flags)) 1966 continue; 1967 if (rdev->raid_disk < 0) 1968 continue; 1969 if (!reference) { 1970 /* Use the first rdev as the reference */ 1971 reference = rdev; 1972 continue; 1973 } 1974 /* does this rdev's profile match the reference profile? */ 1975 if (blk_integrity_compare(reference->bdev->bd_disk, 1976 rdev->bdev->bd_disk) < 0) 1977 return -EINVAL; 1978 } 1979 if (!reference || !bdev_get_integrity(reference->bdev)) 1980 return 0; 1981 /* 1982 * All component devices are integrity capable and have matching 1983 * profiles, register the common profile for the md device. 1984 */ 1985 if (blk_integrity_register(mddev->gendisk, 1986 bdev_get_integrity(reference->bdev)) != 0) { 1987 printk(KERN_ERR "md: failed to register integrity for %s\n", 1988 mdname(mddev)); 1989 return -EINVAL; 1990 } 1991 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); 1992 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 1993 printk(KERN_ERR "md: failed to create integrity pool for %s\n", 1994 mdname(mddev)); 1995 return -EINVAL; 1996 } 1997 return 0; 1998} 1999EXPORT_SYMBOL(md_integrity_register); 2000 2001/* Disable data integrity if non-capable/non-matching disk is being added */ 2002void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2003{ 2004 struct blk_integrity *bi_rdev; 2005 struct blk_integrity *bi_mddev; 2006 2007 if (!mddev->gendisk) 2008 return; 2009 2010 bi_rdev = bdev_get_integrity(rdev->bdev); 2011 bi_mddev = blk_get_integrity(mddev->gendisk); 2012 2013 if (!bi_mddev) /* nothing to do */ 2014 return; 2015 if (rdev->raid_disk < 0) /* skip spares */ 2016 return; 2017 if (bi_rdev && blk_integrity_compare(mddev->gendisk, 2018 rdev->bdev->bd_disk) >= 0) 2019 return; 2020 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); 2021 blk_integrity_unregister(mddev->gendisk); 2022} 2023EXPORT_SYMBOL(md_integrity_add_rdev); 2024 2025static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2026{ 2027 char b[BDEVNAME_SIZE]; 2028 struct kobject *ko; 2029 char *s; 2030 int err; 2031 2032 /* prevent duplicates */ 2033 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2034 return -EEXIST; 2035 2036 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2037 if (rdev->sectors && (mddev->dev_sectors == 0 || 2038 rdev->sectors < mddev->dev_sectors)) { 2039 if (mddev->pers) { 2040 /* Cannot change size, so fail 2041 * If mddev->level <= 0, then we don't care 2042 * about aligning sizes (e.g. linear) 2043 */ 2044 if (mddev->level > 0) 2045 return -ENOSPC; 2046 } else 2047 mddev->dev_sectors = rdev->sectors; 2048 } 2049 2050 /* Verify rdev->desc_nr is unique. 
2051 * If it is -1, assign a free number, else 2052 * check number is not in use 2053 */ 2054 rcu_read_lock(); 2055 if (rdev->desc_nr < 0) { 2056 int choice = 0; 2057 if (mddev->pers) 2058 choice = mddev->raid_disks; 2059 while (md_find_rdev_nr_rcu(mddev, choice)) 2060 choice++; 2061 rdev->desc_nr = choice; 2062 } else { 2063 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2064 rcu_read_unlock(); 2065 return -EBUSY; 2066 } 2067 } 2068 rcu_read_unlock(); 2069 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2070 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 2071 mdname(mddev), mddev->max_disks); 2072 return -EBUSY; 2073 } 2074 bdevname(rdev->bdev,b); 2075 while ( (s=strchr(b, '/')) != NULL) 2076 *s = '!'; 2077 2078 rdev->mddev = mddev; 2079 printk(KERN_INFO "md: bind<%s>\n", b); 2080 2081 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2082 goto fail; 2083 2084 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2085 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2086 /* failure here is OK */; 2087 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2088 2089 list_add_rcu(&rdev->same_set, &mddev->disks); 2090 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2091 2092 /* May as well allow recovery to be retried once */ 2093 mddev->recovery_disabled++; 2094 2095 return 0; 2096 2097 fail: 2098 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 2099 b, mdname(mddev)); 2100 return err; 2101} 2102 2103static void md_delayed_delete(struct work_struct *ws) 2104{ 2105 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2106 kobject_del(&rdev->kobj); 2107 kobject_put(&rdev->kobj); 2108} 2109 2110static void unbind_rdev_from_array(struct md_rdev *rdev) 2111{ 2112 char b[BDEVNAME_SIZE]; 2113 2114 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2115 list_del_rcu(&rdev->same_set); 2116 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2117 rdev->mddev = NULL; 2118 sysfs_remove_link(&rdev->kobj, "block"); 2119 sysfs_put(rdev->sysfs_state); 2120 rdev->sysfs_state = NULL; 2121 rdev->badblocks.count = 0; 2122 /* We need to delay this, otherwise we can deadlock when 2123 * writing to 'remove' to "dev/state". We also need 2124 * to delay it due to rcu usage. 2125 */ 2126 synchronize_rcu(); 2127 INIT_WORK(&rdev->del_work, md_delayed_delete); 2128 kobject_get(&rdev->kobj); 2129 queue_work(md_misc_wq, &rdev->del_work); 2130} 2131 2132/* 2133 * prevent the device from being mounted, repartitioned or 2134 * otherwise reused by a RAID array (or any other kernel 2135 * subsystem), by bd_claiming the device. 2136 */ 2137static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2138{ 2139 int err = 0; 2140 struct block_device *bdev; 2141 char b[BDEVNAME_SIZE]; 2142 2143 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2144 shared ? 
(struct md_rdev *)lock_rdev : rdev); 2145 if (IS_ERR(bdev)) { 2146 printk(KERN_ERR "md: could not open %s.\n", 2147 __bdevname(dev, b)); 2148 return PTR_ERR(bdev); 2149 } 2150 rdev->bdev = bdev; 2151 return err; 2152} 2153 2154static void unlock_rdev(struct md_rdev *rdev) 2155{ 2156 struct block_device *bdev = rdev->bdev; 2157 rdev->bdev = NULL; 2158 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2159} 2160 2161void md_autodetect_dev(dev_t dev); 2162 2163static void export_rdev(struct md_rdev *rdev) 2164{ 2165 char b[BDEVNAME_SIZE]; 2166 2167 printk(KERN_INFO "md: export_rdev(%s)\n", 2168 bdevname(rdev->bdev,b)); 2169 md_rdev_clear(rdev); 2170#ifndef MODULE 2171 if (test_bit(AutoDetected, &rdev->flags)) 2172 md_autodetect_dev(rdev->bdev->bd_dev); 2173#endif 2174 unlock_rdev(rdev); 2175 kobject_put(&rdev->kobj); 2176} 2177 2178void md_kick_rdev_from_array(struct md_rdev *rdev) 2179{ 2180 unbind_rdev_from_array(rdev); 2181 export_rdev(rdev); 2182} 2183EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); 2184 2185static void export_array(struct mddev *mddev) 2186{ 2187 struct md_rdev *rdev; 2188 2189 while (!list_empty(&mddev->disks)) { 2190 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2191 same_set); 2192 md_kick_rdev_from_array(rdev); 2193 } 2194 mddev->raid_disks = 0; 2195 mddev->major_version = 0; 2196} 2197 2198static void sync_sbs(struct mddev *mddev, int nospares) 2199{ 2200 /* Update each superblock (in-memory image), but 2201 * if we are allowed to, skip spares which already 2202 * have the right event counter, or have one earlier 2203 * (which would mean they aren't being marked as dirty 2204 * with the rest of the array) 2205 */ 2206 struct md_rdev *rdev; 2207 rdev_for_each(rdev, mddev) { 2208 if (rdev->sb_events == mddev->events || 2209 (nospares && 2210 rdev->raid_disk < 0 && 2211 rdev->sb_events+1 == mddev->events)) { 2212 /* Don't update this superblock */ 2213 rdev->sb_loaded = 2; 2214 } else { 2215 sync_super(mddev, rdev); 2216 rdev->sb_loaded = 1; 2217 } 2218 } 2219} 2220 2221void md_update_sb(struct mddev *mddev, int force_change) 2222{ 2223 struct md_rdev *rdev; 2224 int sync_req; 2225 int nospares = 0; 2226 int any_badblocks_changed = 0; 2227 2228 if (mddev->ro) { 2229 if (force_change) 2230 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2231 return; 2232 } 2233repeat: 2234 /* First make sure individual recovery_offsets are correct */ 2235 rdev_for_each(rdev, mddev) { 2236 if (rdev->raid_disk >= 0 && 2237 mddev->delta_disks >= 0 && 2238 !test_bit(In_sync, &rdev->flags) && 2239 mddev->curr_resync_completed > rdev->recovery_offset) 2240 rdev->recovery_offset = mddev->curr_resync_completed; 2241 2242 } 2243 if (!mddev->persistent) { 2244 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2245 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2246 if (!mddev->external) { 2247 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2248 rdev_for_each(rdev, mddev) { 2249 if (rdev->badblocks.changed) { 2250 rdev->badblocks.changed = 0; 2251 md_ack_all_badblocks(&rdev->badblocks); 2252 md_error(mddev, rdev); 2253 } 2254 clear_bit(Blocked, &rdev->flags); 2255 clear_bit(BlockedBadBlocks, &rdev->flags); 2256 wake_up(&rdev->blocked_wait); 2257 } 2258 } 2259 wake_up(&mddev->sb_wait); 2260 return; 2261 } 2262 2263 spin_lock(&mddev->lock); 2264 2265 mddev->utime = get_seconds(); 2266 2267 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2268 force_change = 1; 2269 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2270 /* just a clean<-> dirty transition, possibly leave spares alone, 2271 * though if 
events isn't the right even/odd, we will have to do 2272 * spares after all 2273 */ 2274 nospares = 1; 2275 if (force_change) 2276 nospares = 0; 2277 if (mddev->degraded) 2278 /* If the array is degraded, then skipping spares is both 2279 * dangerous and fairly pointless. 2280 * Dangerous because a device that was removed from the array 2281 * might have a event_count that still looks up-to-date, 2282 * so it can be re-added without a resync. 2283 * Pointless because if there are any spares to skip, 2284 * then a recovery will happen and soon that array won't 2285 * be degraded any more and the spare can go back to sleep then. 2286 */ 2287 nospares = 0; 2288 2289 sync_req = mddev->in_sync; 2290 2291 /* If this is just a dirty<->clean transition, and the array is clean 2292 * and 'events' is odd, we can roll back to the previous clean state */ 2293 if (nospares 2294 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2295 && mddev->can_decrease_events 2296 && mddev->events != 1) { 2297 mddev->events--; 2298 mddev->can_decrease_events = 0; 2299 } else { 2300 /* otherwise we have to go forward and ... */ 2301 mddev->events ++; 2302 mddev->can_decrease_events = nospares; 2303 } 2304 2305 /* 2306 * This 64-bit counter should never wrap. 2307 * Either we are in around ~1 trillion A.C., assuming 2308 * 1 reboot per second, or we have a bug... 2309 */ 2310 WARN_ON(mddev->events == 0); 2311 2312 rdev_for_each(rdev, mddev) { 2313 if (rdev->badblocks.changed) 2314 any_badblocks_changed++; 2315 if (test_bit(Faulty, &rdev->flags)) 2316 set_bit(FaultRecorded, &rdev->flags); 2317 } 2318 2319 sync_sbs(mddev, nospares); 2320 spin_unlock(&mddev->lock); 2321 2322 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2323 mdname(mddev), mddev->in_sync); 2324 2325 bitmap_update_sb(mddev->bitmap); 2326 rdev_for_each(rdev, mddev) { 2327 char b[BDEVNAME_SIZE]; 2328 2329 if (rdev->sb_loaded != 1) 2330 continue; /* no noise on spare devices */ 2331 2332 if (!test_bit(Faulty, &rdev->flags)) { 2333 md_super_write(mddev,rdev, 2334 rdev->sb_start, rdev->sb_size, 2335 rdev->sb_page); 2336 pr_debug("md: (write) %s's sb offset: %llu\n", 2337 bdevname(rdev->bdev, b), 2338 (unsigned long long)rdev->sb_start); 2339 rdev->sb_events = mddev->events; 2340 if (rdev->badblocks.size) { 2341 md_super_write(mddev, rdev, 2342 rdev->badblocks.sector, 2343 rdev->badblocks.size << 9, 2344 rdev->bb_page); 2345 rdev->badblocks.size = 0; 2346 } 2347 2348 } else 2349 pr_debug("md: %s (skipping faulty)\n", 2350 bdevname(rdev->bdev, b)); 2351 2352 if (mddev->level == LEVEL_MULTIPATH) 2353 /* only need to write one superblock... 
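 * (presumably because every MULTIPATH member is just another path to the
 *  same underlying device, so a single superblock write is seen through
 *  all of them)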
*/ 2354 break; 2355 } 2356 md_super_wait(mddev); 2357 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2358 2359 spin_lock(&mddev->lock); 2360 if (mddev->in_sync != sync_req || 2361 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2362 /* have to write it out again */ 2363 spin_unlock(&mddev->lock); 2364 goto repeat; 2365 } 2366 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2367 spin_unlock(&mddev->lock); 2368 wake_up(&mddev->sb_wait); 2369 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2370 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2371 2372 rdev_for_each(rdev, mddev) { 2373 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2374 clear_bit(Blocked, &rdev->flags); 2375 2376 if (any_badblocks_changed) 2377 md_ack_all_badblocks(&rdev->badblocks); 2378 clear_bit(BlockedBadBlocks, &rdev->flags); 2379 wake_up(&rdev->blocked_wait); 2380 } 2381} 2382EXPORT_SYMBOL(md_update_sb); 2383 2384static int add_bound_rdev(struct md_rdev *rdev) 2385{ 2386 struct mddev *mddev = rdev->mddev; 2387 int err = 0; 2388 2389 if (!mddev->pers->hot_remove_disk) { 2390 /* If there is hot_add_disk but no hot_remove_disk 2391 * then added disks for geometry changes, 2392 * and should be added immediately. 2393 */ 2394 super_types[mddev->major_version]. 2395 validate_super(mddev, rdev); 2396 err = mddev->pers->hot_add_disk(mddev, rdev); 2397 if (err) { 2398 unbind_rdev_from_array(rdev); 2399 export_rdev(rdev); 2400 return err; 2401 } 2402 } 2403 sysfs_notify_dirent_safe(rdev->sysfs_state); 2404 2405 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2406 if (mddev->degraded) 2407 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2408 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2409 md_new_event(mddev); 2410 md_wakeup_thread(mddev->thread); 2411 return 0; 2412} 2413 2414/* words written to sysfs files may, or may not, be \n terminated. 2415 * We want to accept with case. For this we use cmd_match. 2416 */ 2417static int cmd_match(const char *cmd, const char *str) 2418{ 2419 /* See if cmd, written into a sysfs file, matches 2420 * str. 
They must either be the same, or cmd can 2421 * have a trailing newline 2422 */ 2423 while (*cmd && *str && *cmd == *str) { 2424 cmd++; 2425 str++; 2426 } 2427 if (*cmd == '\n') 2428 cmd++; 2429 if (*str || *cmd) 2430 return 0; 2431 return 1; 2432} 2433 2434struct rdev_sysfs_entry { 2435 struct attribute attr; 2436 ssize_t (*show)(struct md_rdev *, char *); 2437 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2438}; 2439 2440static ssize_t 2441state_show(struct md_rdev *rdev, char *page) 2442{ 2443 char *sep = ""; 2444 size_t len = 0; 2445 unsigned long flags = ACCESS_ONCE(rdev->flags); 2446 2447 if (test_bit(Faulty, &flags) || 2448 rdev->badblocks.unacked_exist) { 2449 len+= sprintf(page+len, "%sfaulty",sep); 2450 sep = ","; 2451 } 2452 if (test_bit(In_sync, &flags)) { 2453 len += sprintf(page+len, "%sin_sync",sep); 2454 sep = ","; 2455 } 2456 if (test_bit(WriteMostly, &flags)) { 2457 len += sprintf(page+len, "%swrite_mostly",sep); 2458 sep = ","; 2459 } 2460 if (test_bit(Blocked, &flags) || 2461 (rdev->badblocks.unacked_exist 2462 && !test_bit(Faulty, &flags))) { 2463 len += sprintf(page+len, "%sblocked", sep); 2464 sep = ","; 2465 } 2466 if (!test_bit(Faulty, &flags) && 2467 !test_bit(In_sync, &flags)) { 2468 len += sprintf(page+len, "%sspare", sep); 2469 sep = ","; 2470 } 2471 if (test_bit(WriteErrorSeen, &flags)) { 2472 len += sprintf(page+len, "%swrite_error", sep); 2473 sep = ","; 2474 } 2475 if (test_bit(WantReplacement, &flags)) { 2476 len += sprintf(page+len, "%swant_replacement", sep); 2477 sep = ","; 2478 } 2479 if (test_bit(Replacement, &flags)) { 2480 len += sprintf(page+len, "%sreplacement", sep); 2481 sep = ","; 2482 } 2483 2484 return len+sprintf(page+len, "\n"); 2485} 2486 2487static ssize_t 2488state_store(struct md_rdev *rdev, const char *buf, size_t len) 2489{ 2490 /* can write 2491 * faulty - simulates an error 2492 * remove - disconnects the device 2493 * writemostly - sets write_mostly 2494 * -writemostly - clears write_mostly 2495 * blocked - sets the Blocked flags 2496 * -blocked - clears the Blocked and possibly simulates an error 2497 * insync - sets Insync providing device isn't active 2498 * -insync - clear Insync for a device with a slot assigned, 2499 * so that it gets rebuilt based on bitmap 2500 * write_error - sets WriteErrorSeen 2501 * -write_error - clears WriteErrorSeen 2502 */ 2503 int err = -EINVAL; 2504 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2505 md_error(rdev->mddev, rdev); 2506 if (test_bit(Faulty, &rdev->flags)) 2507 err = 0; 2508 else 2509 err = -EBUSY; 2510 } else if (cmd_match(buf, "remove")) { 2511 if (rdev->raid_disk >= 0) 2512 err = -EBUSY; 2513 else { 2514 struct mddev *mddev = rdev->mddev; 2515 if (mddev_is_clustered(mddev)) 2516 md_cluster_ops->remove_disk(mddev, rdev); 2517 md_kick_rdev_from_array(rdev); 2518 if (mddev_is_clustered(mddev)) 2519 md_cluster_ops->metadata_update_start(mddev); 2520 if (mddev->pers) 2521 md_update_sb(mddev, 1); 2522 md_new_event(mddev); 2523 if (mddev_is_clustered(mddev)) 2524 md_cluster_ops->metadata_update_finish(mddev); 2525 err = 0; 2526 } 2527 } else if (cmd_match(buf, "writemostly")) { 2528 set_bit(WriteMostly, &rdev->flags); 2529 err = 0; 2530 } else if (cmd_match(buf, "-writemostly")) { 2531 clear_bit(WriteMostly, &rdev->flags); 2532 err = 0; 2533 } else if (cmd_match(buf, "blocked")) { 2534 set_bit(Blocked, &rdev->flags); 2535 err = 0; 2536 } else if (cmd_match(buf, "-blocked")) { 2537 if (!test_bit(Faulty, &rdev->flags) && 2538 rdev->badblocks.unacked_exist) { 2539 /* 
metadata handler doesn't understand badblocks, 2540 * so we need to fail the device 2541 */ 2542 md_error(rdev->mddev, rdev); 2543 } 2544 clear_bit(Blocked, &rdev->flags); 2545 clear_bit(BlockedBadBlocks, &rdev->flags); 2546 wake_up(&rdev->blocked_wait); 2547 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2548 md_wakeup_thread(rdev->mddev->thread); 2549 2550 err = 0; 2551 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2552 set_bit(In_sync, &rdev->flags); 2553 err = 0; 2554 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { 2555 if (rdev->mddev->pers == NULL) { 2556 clear_bit(In_sync, &rdev->flags); 2557 rdev->saved_raid_disk = rdev->raid_disk; 2558 rdev->raid_disk = -1; 2559 err = 0; 2560 } 2561 } else if (cmd_match(buf, "write_error")) { 2562 set_bit(WriteErrorSeen, &rdev->flags); 2563 err = 0; 2564 } else if (cmd_match(buf, "-write_error")) { 2565 clear_bit(WriteErrorSeen, &rdev->flags); 2566 err = 0; 2567 } else if (cmd_match(buf, "want_replacement")) { 2568 /* Any non-spare device that is not a replacement can 2569 * become want_replacement at any time, but we then need to 2570 * check if recovery is needed. 2571 */ 2572 if (rdev->raid_disk >= 0 && 2573 !test_bit(Replacement, &rdev->flags)) 2574 set_bit(WantReplacement, &rdev->flags); 2575 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2576 md_wakeup_thread(rdev->mddev->thread); 2577 err = 0; 2578 } else if (cmd_match(buf, "-want_replacement")) { 2579 /* Clearing 'want_replacement' is always allowed. 2580 * Once replacements starts it is too late though. 2581 */ 2582 err = 0; 2583 clear_bit(WantReplacement, &rdev->flags); 2584 } else if (cmd_match(buf, "replacement")) { 2585 /* Can only set a device as a replacement when array has not 2586 * yet been started. Once running, replacement is automatic 2587 * from spares, or by assigning 'slot'. 2588 */ 2589 if (rdev->mddev->pers) 2590 err = -EBUSY; 2591 else { 2592 set_bit(Replacement, &rdev->flags); 2593 err = 0; 2594 } 2595 } else if (cmd_match(buf, "-replacement")) { 2596 /* Similarly, can only clear Replacement before start */ 2597 if (rdev->mddev->pers) 2598 err = -EBUSY; 2599 else { 2600 clear_bit(Replacement, &rdev->flags); 2601 err = 0; 2602 } 2603 } else if (cmd_match(buf, "re-add")) { 2604 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { 2605 /* clear_bit is performed _after_ all the devices 2606 * have their local Faulty bit cleared. If any writes 2607 * happen in the meantime in the local node, they 2608 * will land in the local bitmap, which will be synced 2609 * by this node eventually 2610 */ 2611 if (!mddev_is_clustered(rdev->mddev) || 2612 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 2613 clear_bit(Faulty, &rdev->flags); 2614 err = add_bound_rdev(rdev); 2615 } 2616 } else 2617 err = -EBUSY; 2618 } 2619 if (!err) 2620 sysfs_notify_dirent_safe(rdev->sysfs_state); 2621 return err ? 
err : len; 2622} 2623static struct rdev_sysfs_entry rdev_state = 2624__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 2625 2626static ssize_t 2627errors_show(struct md_rdev *rdev, char *page) 2628{ 2629 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2630} 2631 2632static ssize_t 2633errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2634{ 2635 char *e; 2636 unsigned long n = simple_strtoul(buf, &e, 10); 2637 if (*buf && (*e == 0 || *e == '\n')) { 2638 atomic_set(&rdev->corrected_errors, n); 2639 return len; 2640 } 2641 return -EINVAL; 2642} 2643static struct rdev_sysfs_entry rdev_errors = 2644__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2645 2646static ssize_t 2647slot_show(struct md_rdev *rdev, char *page) 2648{ 2649 if (rdev->raid_disk < 0) 2650 return sprintf(page, "none\n"); 2651 else 2652 return sprintf(page, "%d\n", rdev->raid_disk); 2653} 2654 2655static ssize_t 2656slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2657{ 2658 char *e; 2659 int err; 2660 int slot = simple_strtoul(buf, &e, 10); 2661 if (strncmp(buf, "none", 4)==0) 2662 slot = -1; 2663 else if (e==buf || (*e && *e!= '\n')) 2664 return -EINVAL; 2665 if (rdev->mddev->pers && slot == -1) { 2666 /* Setting 'slot' on an active array requires also 2667 * updating the 'rd%d' link, and communicating 2668 * with the personality with ->hot_*_disk. 2669 * For now we only support removing 2670 * failed/spare devices. This normally happens automatically, 2671 * but not when the metadata is externally managed. 2672 */ 2673 if (rdev->raid_disk == -1) 2674 return -EEXIST; 2675 /* personality does all needed checks */ 2676 if (rdev->mddev->pers->hot_remove_disk == NULL) 2677 return -EINVAL; 2678 clear_bit(Blocked, &rdev->flags); 2679 remove_and_add_spares(rdev->mddev, rdev); 2680 if (rdev->raid_disk >= 0) 2681 return -EBUSY; 2682 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2683 md_wakeup_thread(rdev->mddev->thread); 2684 } else if (rdev->mddev->pers) { 2685 /* Activating a spare .. or possibly reactivating 2686 * if we ever get bitmaps working here. 2687 */ 2688 2689 if (rdev->raid_disk != -1) 2690 return -EBUSY; 2691 2692 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2693 return -EBUSY; 2694 2695 if (rdev->mddev->pers->hot_add_disk == NULL) 2696 return -EINVAL; 2697 2698 if (slot >= rdev->mddev->raid_disks && 2699 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2700 return -ENOSPC; 2701 2702 rdev->raid_disk = slot; 2703 if (test_bit(In_sync, &rdev->flags)) 2704 rdev->saved_raid_disk = slot; 2705 else 2706 rdev->saved_raid_disk = -1; 2707 clear_bit(In_sync, &rdev->flags); 2708 clear_bit(Bitmap_sync, &rdev->flags); 2709 err = rdev->mddev->pers-> 2710 hot_add_disk(rdev->mddev, rdev); 2711 if (err) { 2712 rdev->raid_disk = -1; 2713 return err; 2714 } else 2715 sysfs_notify_dirent_safe(rdev->sysfs_state); 2716 if (sysfs_link_rdev(rdev->mddev, rdev)) 2717 /* failure here is OK */; 2718 /* don't wakeup anyone, leave that to userspace. 
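 * (Illustrative only: a management tool normally drives this path by
 *  writing the desired role to the rdev's sysfs "slot" attribute, e.g.
 *  "echo 2 > /sys/block/md0/md/dev-sdc/slot"; device and array names
 *  here are examples, not taken from this file.)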
*/ 2719 } else { 2720 if (slot >= rdev->mddev->raid_disks && 2721 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2722 return -ENOSPC; 2723 rdev->raid_disk = slot; 2724 /* assume it is working */ 2725 clear_bit(Faulty, &rdev->flags); 2726 clear_bit(WriteMostly, &rdev->flags); 2727 set_bit(In_sync, &rdev->flags); 2728 sysfs_notify_dirent_safe(rdev->sysfs_state); 2729 } 2730 return len; 2731} 2732 2733static struct rdev_sysfs_entry rdev_slot = 2734__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2735 2736static ssize_t 2737offset_show(struct md_rdev *rdev, char *page) 2738{ 2739 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2740} 2741 2742static ssize_t 2743offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2744{ 2745 unsigned long long offset; 2746 if (kstrtoull(buf, 10, &offset) < 0) 2747 return -EINVAL; 2748 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2749 return -EBUSY; 2750 if (rdev->sectors && rdev->mddev->external) 2751 /* Must set offset before size, so overlap checks 2752 * can be sane */ 2753 return -EBUSY; 2754 rdev->data_offset = offset; 2755 rdev->new_data_offset = offset; 2756 return len; 2757} 2758 2759static struct rdev_sysfs_entry rdev_offset = 2760__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2761 2762static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2763{ 2764 return sprintf(page, "%llu\n", 2765 (unsigned long long)rdev->new_data_offset); 2766} 2767 2768static ssize_t new_offset_store(struct md_rdev *rdev, 2769 const char *buf, size_t len) 2770{ 2771 unsigned long long new_offset; 2772 struct mddev *mddev = rdev->mddev; 2773 2774 if (kstrtoull(buf, 10, &new_offset) < 0) 2775 return -EINVAL; 2776 2777 if (mddev->sync_thread || 2778 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 2779 return -EBUSY; 2780 if (new_offset == rdev->data_offset) 2781 /* reset is always permitted */ 2782 ; 2783 else if (new_offset > rdev->data_offset) { 2784 /* must not push array size beyond rdev_sectors */ 2785 if (new_offset - rdev->data_offset 2786 + mddev->dev_sectors > rdev->sectors) 2787 return -E2BIG; 2788 } 2789 /* Metadata worries about other space details. */ 2790 2791 /* decreasing the offset is inconsistent with a backwards 2792 * reshape. 2793 */ 2794 if (new_offset < rdev->data_offset && 2795 mddev->reshape_backwards) 2796 return -EINVAL; 2797 /* Increasing offset is inconsistent with forwards 2798 * reshape. reshape_direction should be set to 2799 * 'backwards' first. 
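 * (Illustrative: that normally means writing "backwards" to the array's
 *  reshape_direction attribute, e.g.
 *  "echo backwards > /sys/block/md0/md/reshape_direction", before this
 *  new_offset attribute is written; the path shown is an example.)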
2800 */ 2801 if (new_offset > rdev->data_offset && 2802 !mddev->reshape_backwards) 2803 return -EINVAL; 2804 2805 if (mddev->pers && mddev->persistent && 2806 !super_types[mddev->major_version] 2807 .allow_new_offset(rdev, new_offset)) 2808 return -E2BIG; 2809 rdev->new_data_offset = new_offset; 2810 if (new_offset > rdev->data_offset) 2811 mddev->reshape_backwards = 1; 2812 else if (new_offset < rdev->data_offset) 2813 mddev->reshape_backwards = 0; 2814 2815 return len; 2816} 2817static struct rdev_sysfs_entry rdev_new_offset = 2818__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 2819 2820static ssize_t 2821rdev_size_show(struct md_rdev *rdev, char *page) 2822{ 2823 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2824} 2825 2826static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2827{ 2828 /* check if two start/length pairs overlap */ 2829 if (s1+l1 <= s2) 2830 return 0; 2831 if (s2+l2 <= s1) 2832 return 0; 2833 return 1; 2834} 2835 2836static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 2837{ 2838 unsigned long long blocks; 2839 sector_t new; 2840 2841 if (kstrtoull(buf, 10, &blocks) < 0) 2842 return -EINVAL; 2843 2844 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 2845 return -EINVAL; /* sector conversion overflow */ 2846 2847 new = blocks * 2; 2848 if (new != blocks * 2) 2849 return -EINVAL; /* unsigned long long to sector_t overflow */ 2850 2851 *sectors = new; 2852 return 0; 2853} 2854 2855static ssize_t 2856rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 2857{ 2858 struct mddev *my_mddev = rdev->mddev; 2859 sector_t oldsectors = rdev->sectors; 2860 sector_t sectors; 2861 2862 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2863 return -EINVAL; 2864 if (rdev->data_offset != rdev->new_data_offset) 2865 return -EINVAL; /* too confusing */ 2866 if (my_mddev->pers && rdev->raid_disk >= 0) { 2867 if (my_mddev->persistent) { 2868 sectors = super_types[my_mddev->major_version]. 2869 rdev_size_change(rdev, sectors); 2870 if (!sectors) 2871 return -EBUSY; 2872 } else if (!sectors) 2873 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 2874 rdev->data_offset; 2875 if (!my_mddev->pers->resize) 2876 /* Cannot change size for RAID0 or Linear etc */ 2877 return -EINVAL; 2878 } 2879 if (sectors < my_mddev->dev_sectors) 2880 return -EINVAL; /* component must fit device */ 2881 2882 rdev->sectors = sectors; 2883 if (sectors > oldsectors && my_mddev->external) { 2884 /* Need to check that all other rdevs with the same 2885 * ->bdev do not overlap. 'rcu' is sufficient to walk 2886 * the rdev lists safely. 2887 * This check does not provide a hard guarantee, it 2888 * just helps avoid dangerous mistakes. 2889 */ 2890 struct mddev *mddev; 2891 int overlap = 0; 2892 struct list_head *tmp; 2893 2894 rcu_read_lock(); 2895 for_each_mddev(mddev, tmp) { 2896 struct md_rdev *rdev2; 2897 2898 rdev_for_each(rdev2, mddev) 2899 if (rdev->bdev == rdev2->bdev && 2900 rdev != rdev2 && 2901 overlaps(rdev->data_offset, rdev->sectors, 2902 rdev2->data_offset, 2903 rdev2->sectors)) { 2904 overlap = 1; 2905 break; 2906 } 2907 if (overlap) { 2908 mddev_put(mddev); 2909 break; 2910 } 2911 } 2912 rcu_read_unlock(); 2913 if (overlap) { 2914 /* Someone else could have slipped in a size 2915 * change here, but doing so is just silly.
2916 * We put oldsectors back because we *know* it is 2917 * safe, and trust userspace not to race with 2918 * itself 2919 */ 2920 rdev->sectors = oldsectors; 2921 return -EBUSY; 2922 } 2923 } 2924 return len; 2925} 2926 2927static struct rdev_sysfs_entry rdev_size = 2928__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2929 2930static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 2931{ 2932 unsigned long long recovery_start = rdev->recovery_offset; 2933 2934 if (test_bit(In_sync, &rdev->flags) || 2935 recovery_start == MaxSector) 2936 return sprintf(page, "none\n"); 2937 2938 return sprintf(page, "%llu\n", recovery_start); 2939} 2940 2941static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 2942{ 2943 unsigned long long recovery_start; 2944 2945 if (cmd_match(buf, "none")) 2946 recovery_start = MaxSector; 2947 else if (kstrtoull(buf, 10, &recovery_start)) 2948 return -EINVAL; 2949 2950 if (rdev->mddev->pers && 2951 rdev->raid_disk >= 0) 2952 return -EBUSY; 2953 2954 rdev->recovery_offset = recovery_start; 2955 if (recovery_start == MaxSector) 2956 set_bit(In_sync, &rdev->flags); 2957 else 2958 clear_bit(In_sync, &rdev->flags); 2959 return len; 2960} 2961 2962static struct rdev_sysfs_entry rdev_recovery_start = 2963__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 2964 2965static ssize_t 2966badblocks_show(struct badblocks *bb, char *page, int unack); 2967static ssize_t 2968badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); 2969 2970static ssize_t bb_show(struct md_rdev *rdev, char *page) 2971{ 2972 return badblocks_show(&rdev->badblocks, page, 0); 2973} 2974static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 2975{ 2976 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 2977 /* Maybe that ack was all we needed */ 2978 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 2979 wake_up(&rdev->blocked_wait); 2980 return rv; 2981} 2982static struct rdev_sysfs_entry rdev_bad_blocks = 2983__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 2984 2985static ssize_t ubb_show(struct md_rdev *rdev, char *page) 2986{ 2987 return badblocks_show(&rdev->badblocks, page, 1); 2988} 2989static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 2990{ 2991 return badblocks_store(&rdev->badblocks, page, len, 1); 2992} 2993static struct rdev_sysfs_entry rdev_unack_bad_blocks = 2994__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 2995 2996static struct attribute *rdev_default_attrs[] = { 2997 &rdev_state.attr, 2998 &rdev_errors.attr, 2999 &rdev_slot.attr, 3000 &rdev_offset.attr, 3001 &rdev_new_offset.attr, 3002 &rdev_size.attr, 3003 &rdev_recovery_start.attr, 3004 &rdev_bad_blocks.attr, 3005 &rdev_unack_bad_blocks.attr, 3006 NULL, 3007}; 3008static ssize_t 3009rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3010{ 3011 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3012 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3013 3014 if (!entry->show) 3015 return -EIO; 3016 if (!rdev->mddev) 3017 return -EBUSY; 3018 return entry->show(rdev, page); 3019} 3020 3021static ssize_t 3022rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3023 const char *page, size_t length) 3024{ 3025 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3026 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 
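	/*
	 * Every rdev attribute write funnels through this ->store wrapper:
	 * take the owning mddev's reconfig mutex, re-check that the rdev was
	 * not unbound from the array while we slept, and only then call the
	 * individual attribute's ->store() method.
	 */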
3027 ssize_t rv; 3028 struct mddev *mddev = rdev->mddev; 3029 3030 if (!entry->store) 3031 return -EIO; 3032 if (!capable(CAP_SYS_ADMIN)) 3033 return -EACCES; 3034 rv = mddev ? mddev_lock(mddev): -EBUSY; 3035 if (!rv) { 3036 if (rdev->mddev == NULL) 3037 rv = -EBUSY; 3038 else 3039 rv = entry->store(rdev, page, length); 3040 mddev_unlock(mddev); 3041 } 3042 return rv; 3043} 3044 3045static void rdev_free(struct kobject *ko) 3046{ 3047 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3048 kfree(rdev); 3049} 3050static const struct sysfs_ops rdev_sysfs_ops = { 3051 .show = rdev_attr_show, 3052 .store = rdev_attr_store, 3053}; 3054static struct kobj_type rdev_ktype = { 3055 .release = rdev_free, 3056 .sysfs_ops = &rdev_sysfs_ops, 3057 .default_attrs = rdev_default_attrs, 3058}; 3059 3060int md_rdev_init(struct md_rdev *rdev) 3061{ 3062 rdev->desc_nr = -1; 3063 rdev->saved_raid_disk = -1; 3064 rdev->raid_disk = -1; 3065 rdev->flags = 0; 3066 rdev->data_offset = 0; 3067 rdev->new_data_offset = 0; 3068 rdev->sb_events = 0; 3069 rdev->last_read_error.tv_sec = 0; 3070 rdev->last_read_error.tv_nsec = 0; 3071 rdev->sb_loaded = 0; 3072 rdev->bb_page = NULL; 3073 atomic_set(&rdev->nr_pending, 0); 3074 atomic_set(&rdev->read_errors, 0); 3075 atomic_set(&rdev->corrected_errors, 0); 3076 3077 INIT_LIST_HEAD(&rdev->same_set); 3078 init_waitqueue_head(&rdev->blocked_wait); 3079 3080 /* Add space to store bad block list. 3081 * This reserves the space even on arrays where it cannot 3082 * be used - I wonder if that matters 3083 */ 3084 rdev->badblocks.count = 0; 3085 rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ 3086 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); 3087 seqlock_init(&rdev->badblocks.lock); 3088 if (rdev->badblocks.page == NULL) 3089 return -ENOMEM; 3090 3091 return 0; 3092} 3093EXPORT_SYMBOL_GPL(md_rdev_init); 3094/* 3095 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3096 * 3097 * mark the device faulty if: 3098 * 3099 * - the device is nonexistent (zero size) 3100 * - the device has no valid superblock 3101 * 3102 * a faulty rdev _never_ has rdev->sb set. 3103 */ 3104static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3105{ 3106 char b[BDEVNAME_SIZE]; 3107 int err; 3108 struct md_rdev *rdev; 3109 sector_t size; 3110 3111 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3112 if (!rdev) { 3113 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 3114 return ERR_PTR(-ENOMEM); 3115 } 3116 3117 err = md_rdev_init(rdev); 3118 if (err) 3119 goto abort_free; 3120 err = alloc_disk_sb(rdev); 3121 if (err) 3122 goto abort_free; 3123 3124 err = lock_rdev(rdev, newdev, super_format == -2); 3125 if (err) 3126 goto abort_free; 3127 3128 kobject_init(&rdev->kobj, &rdev_ktype); 3129 3130 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3131 if (!size) { 3132 printk(KERN_WARNING 3133 "md: %s has zero or unknown size, marking faulty!\n", 3134 bdevname(rdev->bdev,b)); 3135 err = -EINVAL; 3136 goto abort_free; 3137 } 3138 3139 if (super_format >= 0) { 3140 err = super_types[super_format]. 
3141 load_super(rdev, NULL, super_minor); 3142 if (err == -EINVAL) { 3143 printk(KERN_WARNING 3144 "md: %s does not have a valid v%d.%d " 3145 "superblock, not importing!\n", 3146 bdevname(rdev->bdev,b), 3147 super_format, super_minor); 3148 goto abort_free; 3149 } 3150 if (err < 0) { 3151 printk(KERN_WARNING 3152 "md: could not read %s's sb, not importing!\n", 3153 bdevname(rdev->bdev,b)); 3154 goto abort_free; 3155 } 3156 } 3157 3158 return rdev; 3159 3160abort_free: 3161 if (rdev->bdev) 3162 unlock_rdev(rdev); 3163 md_rdev_clear(rdev); 3164 kfree(rdev); 3165 return ERR_PTR(err); 3166} 3167 3168/* 3169 * Check a full RAID array for plausibility 3170 */ 3171 3172static void analyze_sbs(struct mddev *mddev) 3173{ 3174 int i; 3175 struct md_rdev *rdev, *freshest, *tmp; 3176 char b[BDEVNAME_SIZE]; 3177 3178 freshest = NULL; 3179 rdev_for_each_safe(rdev, tmp, mddev) 3180 switch (super_types[mddev->major_version]. 3181 load_super(rdev, freshest, mddev->minor_version)) { 3182 case 1: 3183 freshest = rdev; 3184 break; 3185 case 0: 3186 break; 3187 default: 3188 printk( KERN_ERR \ 3189 "md: fatal superblock inconsistency in %s" 3190 " -- removing from array\n", 3191 bdevname(rdev->bdev,b)); 3192 md_kick_rdev_from_array(rdev); 3193 } 3194 3195 super_types[mddev->major_version]. 3196 validate_super(mddev, freshest); 3197 3198 i = 0; 3199 rdev_for_each_safe(rdev, tmp, mddev) { 3200 if (mddev->max_disks && 3201 (rdev->desc_nr >= mddev->max_disks || 3202 i > mddev->max_disks)) { 3203 printk(KERN_WARNING 3204 "md: %s: %s: only %d devices permitted\n", 3205 mdname(mddev), bdevname(rdev->bdev, b), 3206 mddev->max_disks); 3207 md_kick_rdev_from_array(rdev); 3208 continue; 3209 } 3210 if (rdev != freshest) { 3211 if (super_types[mddev->major_version]. 3212 validate_super(mddev, rdev)) { 3213 printk(KERN_WARNING "md: kicking non-fresh %s" 3214 " from array!\n", 3215 bdevname(rdev->bdev,b)); 3216 md_kick_rdev_from_array(rdev); 3217 continue; 3218 } 3219 /* No device should have a Candidate flag 3220 * when reading devices 3221 */ 3222 if (test_bit(Candidate, &rdev->flags)) { 3223 pr_info("md: kicking Cluster Candidate %s from array!\n", 3224 bdevname(rdev->bdev, b)); 3225 md_kick_rdev_from_array(rdev); 3226 } 3227 } 3228 if (mddev->level == LEVEL_MULTIPATH) { 3229 rdev->desc_nr = i++; 3230 rdev->raid_disk = rdev->desc_nr; 3231 set_bit(In_sync, &rdev->flags); 3232 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { 3233 rdev->raid_disk = -1; 3234 clear_bit(In_sync, &rdev->flags); 3235 } 3236 } 3237} 3238 3239/* Read a fixed-point number. 3240 * Numbers in sysfs attributes should be in "standard" units where 3241 * possible, so time should be in seconds. 3242 * However we internally use a a much smaller unit such as 3243 * milliseconds or jiffies. 3244 * This function takes a decimal number with a possible fractional 3245 * component, and produces an integer which is the result of 3246 * multiplying that number by 10^'scale'. 3247 * all without any floating-point arithmetic. 3248 */ 3249int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3250{ 3251 unsigned long result = 0; 3252 long decimals = -1; 3253 while (isdigit(*cp) || (*cp == '.' 
&& decimals < 0)) { 3254 if (*cp == '.') 3255 decimals = 0; 3256 else if (decimals < scale) { 3257 unsigned int value; 3258 value = *cp - '0'; 3259 result = result * 10 + value; 3260 if (decimals >= 0) 3261 decimals++; 3262 } 3263 cp++; 3264 } 3265 if (*cp == '\n') 3266 cp++; 3267 if (*cp) 3268 return -EINVAL; 3269 if (decimals < 0) 3270 decimals = 0; 3271 while (decimals < scale) { 3272 result *= 10; 3273 decimals ++; 3274 } 3275 *res = result; 3276 return 0; 3277} 3278 3279static void md_safemode_timeout(unsigned long data); 3280 3281static ssize_t 3282safe_delay_show(struct mddev *mddev, char *page) 3283{ 3284 int msec = (mddev->safemode_delay*1000)/HZ; 3285 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3286} 3287static ssize_t 3288safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3289{ 3290 unsigned long msec; 3291 3292 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3293 return -EINVAL; 3294 if (msec == 0) 3295 mddev->safemode_delay = 0; 3296 else { 3297 unsigned long old_delay = mddev->safemode_delay; 3298 unsigned long new_delay = (msec*HZ)/1000; 3299 3300 if (new_delay == 0) 3301 new_delay = 1; 3302 mddev->safemode_delay = new_delay; 3303 if (new_delay < old_delay || old_delay == 0) 3304 mod_timer(&mddev->safemode_timer, jiffies+1); 3305 } 3306 return len; 3307} 3308static struct md_sysfs_entry md_safe_delay = 3309__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3310 3311static ssize_t 3312level_show(struct mddev *mddev, char *page) 3313{ 3314 struct md_personality *p; 3315 int ret; 3316 spin_lock(&mddev->lock); 3317 p = mddev->pers; 3318 if (p) 3319 ret = sprintf(page, "%s\n", p->name); 3320 else if (mddev->clevel[0]) 3321 ret = sprintf(page, "%s\n", mddev->clevel); 3322 else if (mddev->level != LEVEL_NONE) 3323 ret = sprintf(page, "%d\n", mddev->level); 3324 else 3325 ret = 0; 3326 spin_unlock(&mddev->lock); 3327 return ret; 3328} 3329 3330static ssize_t 3331level_store(struct mddev *mddev, const char *buf, size_t len) 3332{ 3333 char clevel[16]; 3334 ssize_t rv; 3335 size_t slen = len; 3336 struct md_personality *pers, *oldpers; 3337 long level; 3338 void *priv, *oldpriv; 3339 struct md_rdev *rdev; 3340 3341 if (slen == 0 || slen >= sizeof(clevel)) 3342 return -EINVAL; 3343 3344 rv = mddev_lock(mddev); 3345 if (rv) 3346 return rv; 3347 3348 if (mddev->pers == NULL) { 3349 strncpy(mddev->clevel, buf, slen); 3350 if (mddev->clevel[slen-1] == '\n') 3351 slen--; 3352 mddev->clevel[slen] = 0; 3353 mddev->level = LEVEL_NONE; 3354 rv = len; 3355 goto out_unlock; 3356 } 3357 rv = -EROFS; 3358 if (mddev->ro) 3359 goto out_unlock; 3360 3361 /* request to change the personality. Need to ensure: 3362 * - array is not engaged in resync/recovery/reshape 3363 * - old personality can be suspended 3364 * - new personality will access other array. 
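 *
 * Illustrative trigger (an example, not taken from this file): an admin
 * converts an array in place with something like
 *	echo raid6 > /sys/block/md0/md/level
 * which only succeeds if a personality registered for "raid6" provides a
 * ->takeover() that accepts the current geometry.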
3365 */ 3366 3367 rv = -EBUSY; 3368 if (mddev->sync_thread || 3369 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3370 mddev->reshape_position != MaxSector || 3371 mddev->sysfs_active) 3372 goto out_unlock; 3373 3374 rv = -EINVAL; 3375 if (!mddev->pers->quiesce) { 3376 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 3377 mdname(mddev), mddev->pers->name); 3378 goto out_unlock; 3379 } 3380 3381 /* Now find the new personality */ 3382 strncpy(clevel, buf, slen); 3383 if (clevel[slen-1] == '\n') 3384 slen--; 3385 clevel[slen] = 0; 3386 if (kstrtol(clevel, 10, &level)) 3387 level = LEVEL_NONE; 3388 3389 if (request_module("md-%s", clevel) != 0) 3390 request_module("md-level-%s", clevel); 3391 spin_lock(&pers_lock); 3392 pers = find_pers(level, clevel); 3393 if (!pers || !try_module_get(pers->owner)) { 3394 spin_unlock(&pers_lock); 3395 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 3396 rv = -EINVAL; 3397 goto out_unlock; 3398 } 3399 spin_unlock(&pers_lock); 3400 3401 if (pers == mddev->pers) { 3402 /* Nothing to do! */ 3403 module_put(pers->owner); 3404 rv = len; 3405 goto out_unlock; 3406 } 3407 if (!pers->takeover) { 3408 module_put(pers->owner); 3409 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3410 mdname(mddev), clevel); 3411 rv = -EINVAL; 3412 goto out_unlock; 3413 } 3414 3415 rdev_for_each(rdev, mddev) 3416 rdev->new_raid_disk = rdev->raid_disk; 3417 3418 /* ->takeover must set new_* and/or delta_disks 3419 * if it succeeds, and may set them when it fails. 3420 */ 3421 priv = pers->takeover(mddev); 3422 if (IS_ERR(priv)) { 3423 mddev->new_level = mddev->level; 3424 mddev->new_layout = mddev->layout; 3425 mddev->new_chunk_sectors = mddev->chunk_sectors; 3426 mddev->raid_disks -= mddev->delta_disks; 3427 mddev->delta_disks = 0; 3428 mddev->reshape_backwards = 0; 3429 module_put(pers->owner); 3430 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3431 mdname(mddev), clevel); 3432 rv = PTR_ERR(priv); 3433 goto out_unlock; 3434 } 3435 3436 /* Looks like we have a winner */ 3437 mddev_suspend(mddev); 3438 mddev_detach(mddev); 3439 3440 spin_lock(&mddev->lock); 3441 oldpers = mddev->pers; 3442 oldpriv = mddev->private; 3443 mddev->pers = pers; 3444 mddev->private = priv; 3445 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3446 mddev->level = mddev->new_level; 3447 mddev->layout = mddev->new_layout; 3448 mddev->chunk_sectors = mddev->new_chunk_sectors; 3449 mddev->delta_disks = 0; 3450 mddev->reshape_backwards = 0; 3451 mddev->degraded = 0; 3452 spin_unlock(&mddev->lock); 3453 3454 if (oldpers->sync_request == NULL && 3455 mddev->external) { 3456 /* We are converting from a no-redundancy array 3457 * to a redundancy array and metadata is managed 3458 * externally so we need to be sure that writes 3459 * won't block due to a need to transition 3460 * clean->dirty 3461 * until external management is started. 
3462 */ 3463 mddev->in_sync = 0; 3464 mddev->safemode_delay = 0; 3465 mddev->safemode = 0; 3466 } 3467 3468 oldpers->free(mddev, oldpriv); 3469 3470 if (oldpers->sync_request == NULL && 3471 pers->sync_request != NULL) { 3472 /* need to add the md_redundancy_group */ 3473 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3474 printk(KERN_WARNING 3475 "md: cannot register extra attributes for %s\n", 3476 mdname(mddev)); 3477 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3478 } 3479 if (oldpers->sync_request != NULL && 3480 pers->sync_request == NULL) { 3481 /* need to remove the md_redundancy_group */ 3482 if (mddev->to_remove == NULL) 3483 mddev->to_remove = &md_redundancy_group; 3484 } 3485 3486 rdev_for_each(rdev, mddev) { 3487 if (rdev->raid_disk < 0) 3488 continue; 3489 if (rdev->new_raid_disk >= mddev->raid_disks) 3490 rdev->new_raid_disk = -1; 3491 if (rdev->new_raid_disk == rdev->raid_disk) 3492 continue; 3493 sysfs_unlink_rdev(mddev, rdev); 3494 } 3495 rdev_for_each(rdev, mddev) { 3496 if (rdev->raid_disk < 0) 3497 continue; 3498 if (rdev->new_raid_disk == rdev->raid_disk) 3499 continue; 3500 rdev->raid_disk = rdev->new_raid_disk; 3501 if (rdev->raid_disk < 0) 3502 clear_bit(In_sync, &rdev->flags); 3503 else { 3504 if (sysfs_link_rdev(mddev, rdev)) 3505 printk(KERN_WARNING "md: cannot register rd%d" 3506 " for %s after level change\n", 3507 rdev->raid_disk, mdname(mddev)); 3508 } 3509 } 3510 3511 if (pers->sync_request == NULL) { 3512 /* this is now an array without redundancy, so 3513 * it must always be in_sync 3514 */ 3515 mddev->in_sync = 1; 3516 del_timer_sync(&mddev->safemode_timer); 3517 } 3518 blk_set_stacking_limits(&mddev->queue->limits); 3519 pers->run(mddev); 3520 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3521 mddev_resume(mddev); 3522 if (!mddev->thread) 3523 md_update_sb(mddev, 1); 3524 sysfs_notify(&mddev->kobj, NULL, "level"); 3525 md_new_event(mddev); 3526 rv = len; 3527out_unlock: 3528 mddev_unlock(mddev); 3529 return rv; 3530} 3531 3532static struct md_sysfs_entry md_level = 3533__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3534 3535static ssize_t 3536layout_show(struct mddev *mddev, char *page) 3537{ 3538 /* just a number, not meaningful for all levels */ 3539 if (mddev->reshape_position != MaxSector && 3540 mddev->layout != mddev->new_layout) 3541 return sprintf(page, "%d (%d)\n", 3542 mddev->new_layout, mddev->layout); 3543 return sprintf(page, "%d\n", mddev->layout); 3544} 3545 3546static ssize_t 3547layout_store(struct mddev *mddev, const char *buf, size_t len) 3548{ 3549 char *e; 3550 unsigned long n = simple_strtoul(buf, &e, 10); 3551 int err; 3552 3553 if (!*buf || (*e && *e != '\n')) 3554 return -EINVAL; 3555 err = mddev_lock(mddev); 3556 if (err) 3557 return err; 3558 3559 if (mddev->pers) { 3560 if (mddev->pers->check_reshape == NULL) 3561 err = -EBUSY; 3562 else if (mddev->ro) 3563 err = -EROFS; 3564 else { 3565 mddev->new_layout = n; 3566 err = mddev->pers->check_reshape(mddev); 3567 if (err) 3568 mddev->new_layout = mddev->layout; 3569 } 3570 } else { 3571 mddev->new_layout = n; 3572 if (mddev->reshape_position == MaxSector) 3573 mddev->layout = n; 3574 } 3575 mddev_unlock(mddev); 3576 return err ?: len; 3577} 3578static struct md_sysfs_entry md_layout = 3579__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3580 3581static ssize_t 3582raid_disks_show(struct mddev *mddev, char *page) 3583{ 3584 if (mddev->raid_disks == 0) 3585 return 0; 3586 if (mddev->reshape_position != MaxSector && 3587 
mddev->delta_disks != 0) 3588 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3589 mddev->raid_disks - mddev->delta_disks); 3590 return sprintf(page, "%d\n", mddev->raid_disks); 3591} 3592 3593static int update_raid_disks(struct mddev *mddev, int raid_disks); 3594 3595static ssize_t 3596raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3597{ 3598 char *e; 3599 int err; 3600 unsigned long n = simple_strtoul(buf, &e, 10); 3601 3602 if (!*buf || (*e && *e != '\n')) 3603 return -EINVAL; 3604 3605 err = mddev_lock(mddev); 3606 if (err) 3607 return err; 3608 if (mddev->pers) 3609 err = update_raid_disks(mddev, n); 3610 else if (mddev->reshape_position != MaxSector) { 3611 struct md_rdev *rdev; 3612 int olddisks = mddev->raid_disks - mddev->delta_disks; 3613 3614 err = -EINVAL; 3615 rdev_for_each(rdev, mddev) { 3616 if (olddisks < n && 3617 rdev->data_offset < rdev->new_data_offset) 3618 goto out_unlock; 3619 if (olddisks > n && 3620 rdev->data_offset > rdev->new_data_offset) 3621 goto out_unlock; 3622 } 3623 err = 0; 3624 mddev->delta_disks = n - olddisks; 3625 mddev->raid_disks = n; 3626 mddev->reshape_backwards = (mddev->delta_disks < 0); 3627 } else 3628 mddev->raid_disks = n; 3629out_unlock: 3630 mddev_unlock(mddev); 3631 return err ? err : len; 3632} 3633static struct md_sysfs_entry md_raid_disks = 3634__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3635 3636static ssize_t 3637chunk_size_show(struct mddev *mddev, char *page) 3638{ 3639 if (mddev->reshape_position != MaxSector && 3640 mddev->chunk_sectors != mddev->new_chunk_sectors) 3641 return sprintf(page, "%d (%d)\n", 3642 mddev->new_chunk_sectors << 9, 3643 mddev->chunk_sectors << 9); 3644 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3645} 3646 3647static ssize_t 3648chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3649{ 3650 int err; 3651 char *e; 3652 unsigned long n = simple_strtoul(buf, &e, 10); 3653 3654 if (!*buf || (*e && *e != '\n')) 3655 return -EINVAL; 3656 3657 err = mddev_lock(mddev); 3658 if (err) 3659 return err; 3660 if (mddev->pers) { 3661 if (mddev->pers->check_reshape == NULL) 3662 err = -EBUSY; 3663 else if (mddev->ro) 3664 err = -EROFS; 3665 else { 3666 mddev->new_chunk_sectors = n >> 9; 3667 err = mddev->pers->check_reshape(mddev); 3668 if (err) 3669 mddev->new_chunk_sectors = mddev->chunk_sectors; 3670 } 3671 } else { 3672 mddev->new_chunk_sectors = n >> 9; 3673 if (mddev->reshape_position == MaxSector) 3674 mddev->chunk_sectors = n >> 9; 3675 } 3676 mddev_unlock(mddev); 3677 return err ?: len; 3678} 3679static struct md_sysfs_entry md_chunk_size = 3680__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3681 3682static ssize_t 3683resync_start_show(struct mddev *mddev, char *page) 3684{ 3685 if (mddev->recovery_cp == MaxSector) 3686 return sprintf(page, "none\n"); 3687 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3688} 3689 3690static ssize_t 3691resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3692{ 3693 int err; 3694 char *e; 3695 unsigned long long n = simple_strtoull(buf, &e, 10); 3696 3697 err = mddev_lock(mddev); 3698 if (err) 3699 return err; 3700 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3701 err = -EBUSY; 3702 else if (cmd_match(buf, "none")) 3703 n = MaxSector; 3704 else if (!*buf || (*e && *e != '\n')) 3705 err = -EINVAL; 3706 3707 if (!err) { 3708 mddev->recovery_cp = n; 3709 if (mddev->pers) 3710 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 
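		/*
		 * Illustrative use (an assumption, not from this file):
		 * "echo 0 > /sys/block/md0/md/resync_start" on an array that
		 * is not running (or is frozen) records that resync must be
		 * repeated from the first sector when the array next starts.
		 */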
3711 } 3712 mddev_unlock(mddev); 3713 return err ?: len; 3714} 3715static struct md_sysfs_entry md_resync_start = 3716__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 3717 resync_start_show, resync_start_store); 3718 3719/* 3720 * The array state can be: 3721 * 3722 * clear 3723 * No devices, no size, no level 3724 * Equivalent to STOP_ARRAY ioctl 3725 * inactive 3726 * May have some settings, but array is not active 3727 * all IO results in error 3728 * When written, doesn't tear down array, but just stops it 3729 * suspended (not supported yet) 3730 * All IO requests will block. The array can be reconfigured. 3731 * Writing this, if accepted, will block until array is quiescent 3732 * readonly 3733 * no resync can happen. no superblocks get written. 3734 * write requests fail 3735 * read-auto 3736 * like readonly, but behaves like 'clean' on a write request. 3737 * 3738 * clean - no pending writes, but otherwise active. 3739 * When written to inactive array, starts without resync 3740 * If a write request arrives then 3741 * if metadata is known, mark 'dirty' and switch to 'active'. 3742 * if not known, block and switch to write-pending 3743 * If written to an active array that has pending writes, then fails. 3744 * active 3745 * fully active: IO and resync can be happening. 3746 * When written to inactive array, starts with resync 3747 * 3748 * write-pending 3749 * clean, but writes are blocked waiting for 'active' to be written. 3750 * 3751 * active-idle 3752 * like active, but no writes have been seen for a while (100msec). 3753 * 3754 */ 3755enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3756 write_pending, active_idle, bad_word}; 3757static char *array_states[] = { 3758 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3759 "write-pending", "active-idle", NULL }; 3760 3761static int match_word(const char *word, char **list) 3762{ 3763 int n; 3764 for (n=0; list[n]; n++) 3765 if (cmd_match(word, list[n])) 3766 break; 3767 return n; 3768} 3769 3770static ssize_t 3771array_state_show(struct mddev *mddev, char *page) 3772{ 3773 enum array_state st = inactive; 3774 3775 if (mddev->pers) 3776 switch(mddev->ro) { 3777 case 1: 3778 st = readonly; 3779 break; 3780 case 2: 3781 st = read_auto; 3782 break; 3783 case 0: 3784 if (mddev->in_sync) 3785 st = clean; 3786 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 3787 st = write_pending; 3788 else if (mddev->safemode) 3789 st = active_idle; 3790 else 3791 st = active; 3792 } 3793 else { 3794 if (list_empty(&mddev->disks) && 3795 mddev->raid_disks == 0 && 3796 mddev->dev_sectors == 0) 3797 st = clear; 3798 else 3799 st = inactive; 3800 } 3801 return sprintf(page, "%s\n", array_states[st]); 3802} 3803 3804static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 3805static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 3806static int do_md_run(struct mddev *mddev); 3807static int restart_array(struct mddev *mddev); 3808 3809static ssize_t 3810array_state_store(struct mddev *mddev, const char *buf, size_t len) 3811{ 3812 int err; 3813 enum array_state st = match_word(buf, array_states); 3814 3815 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { 3816 /* don't take reconfig_mutex when toggling between 3817 * clean and active 3818 */ 3819 spin_lock(&mddev->lock); 3820 if (st == active) { 3821 restart_array(mddev); 3822 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3823 wake_up(&mddev->sb_wait); 3824 err = 0; 3825 } else /* 
st == clean */ { 3826 restart_array(mddev); 3827 if (atomic_read(&mddev->writes_pending) == 0) { 3828 if (mddev->in_sync == 0) { 3829 mddev->in_sync = 1; 3830 if (mddev->safemode == 1) 3831 mddev->safemode = 0; 3832 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3833 } 3834 err = 0; 3835 } else 3836 err = -EBUSY; 3837 } 3838 spin_unlock(&mddev->lock); 3839 return err ?: len; 3840 } 3841 err = mddev_lock(mddev); 3842 if (err) 3843 return err; 3844 err = -EINVAL; 3845 switch(st) { 3846 case bad_word: 3847 break; 3848 case clear: 3849 /* stopping an active array */ 3850 err = do_md_stop(mddev, 0, NULL); 3851 break; 3852 case inactive: 3853 /* stopping an active array */ 3854 if (mddev->pers) 3855 err = do_md_stop(mddev, 2, NULL); 3856 else 3857 err = 0; /* already inactive */ 3858 break; 3859 case suspended: 3860 break; /* not supported yet */ 3861 case readonly: 3862 if (mddev->pers) 3863 err = md_set_readonly(mddev, NULL); 3864 else { 3865 mddev->ro = 1; 3866 set_disk_ro(mddev->gendisk, 1); 3867 err = do_md_run(mddev); 3868 } 3869 break; 3870 case read_auto: 3871 if (mddev->pers) { 3872 if (mddev->ro == 0) 3873 err = md_set_readonly(mddev, NULL); 3874 else if (mddev->ro == 1) 3875 err = restart_array(mddev); 3876 if (err == 0) { 3877 mddev->ro = 2; 3878 set_disk_ro(mddev->gendisk, 0); 3879 } 3880 } else { 3881 mddev->ro = 2; 3882 err = do_md_run(mddev); 3883 } 3884 break; 3885 case clean: 3886 if (mddev->pers) { 3887 restart_array(mddev); 3888 spin_lock(&mddev->lock); 3889 if (atomic_read(&mddev->writes_pending) == 0) { 3890 if (mddev->in_sync == 0) { 3891 mddev->in_sync = 1; 3892 if (mddev->safemode == 1) 3893 mddev->safemode = 0; 3894 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3895 } 3896 err = 0; 3897 } else 3898 err = -EBUSY; 3899 spin_unlock(&mddev->lock); 3900 } else 3901 err = -EINVAL; 3902 break; 3903 case active: 3904 if (mddev->pers) { 3905 restart_array(mddev); 3906 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3907 wake_up(&mddev->sb_wait); 3908 err = 0; 3909 } else { 3910 mddev->ro = 0; 3911 set_disk_ro(mddev->gendisk, 0); 3912 err = do_md_run(mddev); 3913 } 3914 break; 3915 case write_pending: 3916 case active_idle: 3917 /* these cannot be set */ 3918 break; 3919 } 3920 3921 if (!err) { 3922 if (mddev->hold_active == UNTIL_IOCTL) 3923 mddev->hold_active = 0; 3924 sysfs_notify_dirent_safe(mddev->sysfs_state); 3925 } 3926 mddev_unlock(mddev); 3927 return err ?: len; 3928} 3929static struct md_sysfs_entry md_array_state = 3930__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3931 3932static ssize_t 3933max_corrected_read_errors_show(struct mddev *mddev, char *page) { 3934 return sprintf(page, "%d\n", 3935 atomic_read(&mddev->max_corr_read_errors)); 3936} 3937 3938static ssize_t 3939max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 3940{ 3941 char *e; 3942 unsigned long n = simple_strtoul(buf, &e, 10); 3943 3944 if (*buf && (*e == 0 || *e == '\n')) { 3945 atomic_set(&mddev->max_corr_read_errors, n); 3946 return len; 3947 } 3948 return -EINVAL; 3949} 3950 3951static struct md_sysfs_entry max_corr_read_errors = 3952__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 3953 max_corrected_read_errors_store); 3954 3955static ssize_t 3956null_show(struct mddev *mddev, char *page) 3957{ 3958 return -EINVAL; 3959} 3960 3961static ssize_t 3962new_dev_store(struct mddev *mddev, const char *buf, size_t len) 3963{ 3964 /* buf must be %d:%d\n? 
giving major and minor numbers */ 3965 /* The new device is added to the array. 3966 * If the array has a persistent superblock, we read the 3967 * superblock to initialise info and check validity. 3968 * Otherwise, only checking done is that in bind_rdev_to_array, 3969 * which mainly checks size. 3970 */ 3971 char *e; 3972 int major = simple_strtoul(buf, &e, 10); 3973 int minor; 3974 dev_t dev; 3975 struct md_rdev *rdev; 3976 int err; 3977 3978 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3979 return -EINVAL; 3980 minor = simple_strtoul(e+1, &e, 10); 3981 if (*e && *e != '\n') 3982 return -EINVAL; 3983 dev = MKDEV(major, minor); 3984 if (major != MAJOR(dev) || 3985 minor != MINOR(dev)) 3986 return -EOVERFLOW; 3987 3988 flush_workqueue(md_misc_wq); 3989 3990 err = mddev_lock(mddev); 3991 if (err) 3992 return err; 3993 if (mddev->persistent) { 3994 rdev = md_import_device(dev, mddev->major_version, 3995 mddev->minor_version); 3996 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 3997 struct md_rdev *rdev0 3998 = list_entry(mddev->disks.next, 3999 struct md_rdev, same_set); 4000 err = super_types[mddev->major_version] 4001 .load_super(rdev, rdev0, mddev->minor_version); 4002 if (err < 0) 4003 goto out; 4004 } 4005 } else if (mddev->external) 4006 rdev = md_import_device(dev, -2, -1); 4007 else 4008 rdev = md_import_device(dev, -1, -1); 4009 4010 if (IS_ERR(rdev)) { 4011 mddev_unlock(mddev); 4012 return PTR_ERR(rdev); 4013 } 4014 err = bind_rdev_to_array(rdev, mddev); 4015 out: 4016 if (err) 4017 export_rdev(rdev); 4018 mddev_unlock(mddev); 4019 return err ? err : len; 4020} 4021 4022static struct md_sysfs_entry md_new_device = 4023__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4024 4025static ssize_t 4026bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4027{ 4028 char *end; 4029 unsigned long chunk, end_chunk; 4030 int err; 4031 4032 err = mddev_lock(mddev); 4033 if (err) 4034 return err; 4035 if (!mddev->bitmap) 4036 goto out; 4037 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4038 while (*buf) { 4039 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4040 if (buf == end) break; 4041 if (*end == '-') { /* range */ 4042 buf = end + 1; 4043 end_chunk = simple_strtoul(buf, &end, 0); 4044 if (buf == end) break; 4045 } 4046 if (*end && !isspace(*end)) break; 4047 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4048 buf = skip_spaces(end); 4049 } 4050 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4051out: 4052 mddev_unlock(mddev); 4053 return len; 4054} 4055 4056static struct md_sysfs_entry md_bitmap = 4057__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4058 4059static ssize_t 4060size_show(struct mddev *mddev, char *page) 4061{ 4062 return sprintf(page, "%llu\n", 4063 (unsigned long long)mddev->dev_sectors / 2); 4064} 4065 4066static int update_size(struct mddev *mddev, sector_t num_sectors); 4067 4068static ssize_t 4069size_store(struct mddev *mddev, const char *buf, size_t len) 4070{ 4071 /* If array is inactive, we can reduce the component size, but 4072 * not increase it (except from 0). 
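 * The value shown and accepted here is in units of 1K blocks:
 * size_show() above prints dev_sectors/2, and strict_blocks_to_sectors()
 * below turns the written value back into 512-byte sectors.  For
 * example, writing "1048576" asks for a 1 GiB (2097152-sector)
 * component size.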
4073 * If array is active, we can try an on-line resize 4074 */ 4075 sector_t sectors; 4076 int err = strict_blocks_to_sectors(buf, &sectors); 4077 4078 if (err < 0) 4079 return err; 4080 err = mddev_lock(mddev); 4081 if (err) 4082 return err; 4083 if (mddev->pers) { 4084 if (mddev_is_clustered(mddev)) 4085 md_cluster_ops->metadata_update_start(mddev); 4086 err = update_size(mddev, sectors); 4087 md_update_sb(mddev, 1); 4088 if (mddev_is_clustered(mddev)) 4089 md_cluster_ops->metadata_update_finish(mddev); 4090 } else { 4091 if (mddev->dev_sectors == 0 || 4092 mddev->dev_sectors > sectors) 4093 mddev->dev_sectors = sectors; 4094 else 4095 err = -ENOSPC; 4096 } 4097 mddev_unlock(mddev); 4098 return err ? err : len; 4099} 4100 4101static struct md_sysfs_entry md_size = 4102__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4103 4104/* Metadata version. 4105 * This is one of 4106 * 'none' for arrays with no metadata (good luck...) 4107 * 'external' for arrays with externally managed metadata, 4108 * or N.M for internally known formats 4109 */ 4110static ssize_t 4111metadata_show(struct mddev *mddev, char *page) 4112{ 4113 if (mddev->persistent) 4114 return sprintf(page, "%d.%d\n", 4115 mddev->major_version, mddev->minor_version); 4116 else if (mddev->external) 4117 return sprintf(page, "external:%s\n", mddev->metadata_type); 4118 else 4119 return sprintf(page, "none\n"); 4120} 4121 4122static ssize_t 4123metadata_store(struct mddev *mddev, const char *buf, size_t len) 4124{ 4125 int major, minor; 4126 char *e; 4127 int err; 4128 /* Changing the details of 'external' metadata is 4129 * always permitted. Otherwise there must be 4130 * no devices attached to the array. 4131 */ 4132 4133 err = mddev_lock(mddev); 4134 if (err) 4135 return err; 4136 err = -EBUSY; 4137 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4138 ; 4139 else if (!list_empty(&mddev->disks)) 4140 goto out_unlock; 4141 4142 err = 0; 4143 if (cmd_match(buf, "none")) { 4144 mddev->persistent = 0; 4145 mddev->external = 0; 4146 mddev->major_version = 0; 4147 mddev->minor_version = 90; 4148 goto out_unlock; 4149 } 4150 if (strncmp(buf, "external:", 9) == 0) { 4151 size_t namelen = len-9; 4152 if (namelen >= sizeof(mddev->metadata_type)) 4153 namelen = sizeof(mddev->metadata_type)-1; 4154 strncpy(mddev->metadata_type, buf+9, namelen); 4155 mddev->metadata_type[namelen] = 0; 4156 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4157 mddev->metadata_type[--namelen] = 0; 4158 mddev->persistent = 0; 4159 mddev->external = 1; 4160 mddev->major_version = 0; 4161 mddev->minor_version = 90; 4162 goto out_unlock; 4163 } 4164 major = simple_strtoul(buf, &e, 10); 4165 err = -EINVAL; 4166 if (e==buf || *e != '.') 4167 goto out_unlock; 4168 buf = e+1; 4169 minor = simple_strtoul(buf, &e, 10); 4170 if (e==buf || (*e && *e != '\n') ) 4171 goto out_unlock; 4172 err = -ENOENT; 4173 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4174 goto out_unlock; 4175 mddev->major_version = major; 4176 mddev->minor_version = minor; 4177 mddev->persistent = 1; 4178 mddev->external = 0; 4179 err = 0; 4180out_unlock: 4181 mddev_unlock(mddev); 4182 return err ?: len; 4183} 4184 4185static struct md_sysfs_entry md_metadata = 4186__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4187 4188static ssize_t 4189action_show(struct mddev *mddev, char *page) 4190{ 4191 char *type = "idle"; 4192 unsigned long recovery = mddev->recovery; 4193 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4194 
type = "frozen"; 4195 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4196 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4197 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4198 type = "reshape"; 4199 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4200 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4201 type = "resync"; 4202 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4203 type = "check"; 4204 else 4205 type = "repair"; 4206 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4207 type = "recover"; 4208 } 4209 return sprintf(page, "%s\n", type); 4210} 4211 4212static ssize_t 4213action_store(struct mddev *mddev, const char *page, size_t len) 4214{ 4215 if (!mddev->pers || !mddev->pers->sync_request) 4216 return -EINVAL; 4217 4218 4219 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4220 if (cmd_match(page, "frozen")) 4221 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4222 else 4223 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4224 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4225 mddev_lock(mddev) == 0) { 4226 flush_workqueue(md_misc_wq); 4227 if (mddev->sync_thread) { 4228 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4229 md_reap_sync_thread(mddev); 4230 } 4231 mddev_unlock(mddev); 4232 } 4233 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4234 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 4235 return -EBUSY; 4236 else if (cmd_match(page, "resync")) 4237 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4238 else if (cmd_match(page, "recover")) { 4239 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4240 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4241 } else if (cmd_match(page, "reshape")) { 4242 int err; 4243 if (mddev->pers->start_reshape == NULL) 4244 return -EINVAL; 4245 err = mddev_lock(mddev); 4246 if (!err) { 4247 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4248 err = mddev->pers->start_reshape(mddev); 4249 mddev_unlock(mddev); 4250 } 4251 if (err) 4252 return err; 4253 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4254 } else { 4255 if (cmd_match(page, "check")) 4256 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4257 else if (!cmd_match(page, "repair")) 4258 return -EINVAL; 4259 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4260 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4261 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4262 } 4263 if (mddev->ro == 2) { 4264 /* A write to sync_action is enough to justify 4265 * canceling read-auto mode 4266 */ 4267 mddev->ro = 0; 4268 md_wakeup_thread(mddev->sync_thread); 4269 } 4270 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4271 md_wakeup_thread(mddev->thread); 4272 sysfs_notify_dirent_safe(mddev->sysfs_action); 4273 return len; 4274} 4275 4276static struct md_sysfs_entry md_scan_mode = 4277__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4278 4279static ssize_t 4280last_sync_action_show(struct mddev *mddev, char *page) 4281{ 4282 return sprintf(page, "%s\n", mddev->last_sync_action); 4283} 4284 4285static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4286 4287static ssize_t 4288mismatch_cnt_show(struct mddev *mddev, char *page) 4289{ 4290 return sprintf(page, "%llu\n", 4291 (unsigned long long) 4292 atomic64_read(&mddev->resync_mismatches)); 4293} 4294 4295static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4296 4297static ssize_t 4298sync_min_show(struct mddev *mddev, char *page) 4299{ 4300 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4301 mddev->sync_speed_min ? 
"local": "system"); 4302} 4303 4304static ssize_t 4305sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4306{ 4307 int min; 4308 char *e; 4309 if (strncmp(buf, "system", 6)==0) { 4310 mddev->sync_speed_min = 0; 4311 return len; 4312 } 4313 min = simple_strtoul(buf, &e, 10); 4314 if (buf == e || (*e && *e != '\n') || min <= 0) 4315 return -EINVAL; 4316 mddev->sync_speed_min = min; 4317 return len; 4318} 4319 4320static struct md_sysfs_entry md_sync_min = 4321__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4322 4323static ssize_t 4324sync_max_show(struct mddev *mddev, char *page) 4325{ 4326 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4327 mddev->sync_speed_max ? "local": "system"); 4328} 4329 4330static ssize_t 4331sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4332{ 4333 int max; 4334 char *e; 4335 if (strncmp(buf, "system", 6)==0) { 4336 mddev->sync_speed_max = 0; 4337 return len; 4338 } 4339 max = simple_strtoul(buf, &e, 10); 4340 if (buf == e || (*e && *e != '\n') || max <= 0) 4341 return -EINVAL; 4342 mddev->sync_speed_max = max; 4343 return len; 4344} 4345 4346static struct md_sysfs_entry md_sync_max = 4347__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4348 4349static ssize_t 4350degraded_show(struct mddev *mddev, char *page) 4351{ 4352 return sprintf(page, "%d\n", mddev->degraded); 4353} 4354static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4355 4356static ssize_t 4357sync_force_parallel_show(struct mddev *mddev, char *page) 4358{ 4359 return sprintf(page, "%d\n", mddev->parallel_resync); 4360} 4361 4362static ssize_t 4363sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4364{ 4365 long n; 4366 4367 if (kstrtol(buf, 10, &n)) 4368 return -EINVAL; 4369 4370 if (n != 0 && n != 1) 4371 return -EINVAL; 4372 4373 mddev->parallel_resync = n; 4374 4375 if (mddev->sync_thread) 4376 wake_up(&resync_wait); 4377 4378 return len; 4379} 4380 4381/* force parallel resync, even with shared block devices */ 4382static struct md_sysfs_entry md_sync_force_parallel = 4383__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4384 sync_force_parallel_show, sync_force_parallel_store); 4385 4386static ssize_t 4387sync_speed_show(struct mddev *mddev, char *page) 4388{ 4389 unsigned long resync, dt, db; 4390 if (mddev->curr_resync == 0) 4391 return sprintf(page, "none\n"); 4392 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4393 dt = (jiffies - mddev->resync_mark) / HZ; 4394 if (!dt) dt++; 4395 db = resync - mddev->resync_mark_cnt; 4396 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4397} 4398 4399static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4400 4401static ssize_t 4402sync_completed_show(struct mddev *mddev, char *page) 4403{ 4404 unsigned long long max_sectors, resync; 4405 4406 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4407 return sprintf(page, "none\n"); 4408 4409 if (mddev->curr_resync == 1 || 4410 mddev->curr_resync == 2) 4411 return sprintf(page, "delayed\n"); 4412 4413 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4414 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4415 max_sectors = mddev->resync_max_sectors; 4416 else 4417 max_sectors = mddev->dev_sectors; 4418 4419 resync = mddev->curr_resync_completed; 4420 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4421} 4422 4423static struct md_sysfs_entry md_sync_completed = 4424 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 4425 
4426static ssize_t 4427min_sync_show(struct mddev *mddev, char *page) 4428{ 4429 return sprintf(page, "%llu\n", 4430 (unsigned long long)mddev->resync_min); 4431} 4432static ssize_t 4433min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4434{ 4435 unsigned long long min; 4436 int err; 4437 4438 if (kstrtoull(buf, 10, &min)) 4439 return -EINVAL; 4440 4441 spin_lock(&mddev->lock); 4442 err = -EINVAL; 4443 if (min > mddev->resync_max) 4444 goto out_unlock; 4445 4446 err = -EBUSY; 4447 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4448 goto out_unlock; 4449 4450 /* Round down to multiple of 4K for safety */ 4451 mddev->resync_min = round_down(min, 8); 4452 err = 0; 4453 4454out_unlock: 4455 spin_unlock(&mddev->lock); 4456 return err ?: len; 4457} 4458 4459static struct md_sysfs_entry md_min_sync = 4460__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4461 4462static ssize_t 4463max_sync_show(struct mddev *mddev, char *page) 4464{ 4465 if (mddev->resync_max == MaxSector) 4466 return sprintf(page, "max\n"); 4467 else 4468 return sprintf(page, "%llu\n", 4469 (unsigned long long)mddev->resync_max); 4470} 4471static ssize_t 4472max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4473{ 4474 int err; 4475 spin_lock(&mddev->lock); 4476 if (strncmp(buf, "max", 3) == 0) 4477 mddev->resync_max = MaxSector; 4478 else { 4479 unsigned long long max; 4480 int chunk; 4481 4482 err = -EINVAL; 4483 if (kstrtoull(buf, 10, &max)) 4484 goto out_unlock; 4485 if (max < mddev->resync_min) 4486 goto out_unlock; 4487 4488 err = -EBUSY; 4489 if (max < mddev->resync_max && 4490 mddev->ro == 0 && 4491 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4492 goto out_unlock; 4493 4494 /* Must be a multiple of chunk_size */ 4495 chunk = mddev->chunk_sectors; 4496 if (chunk) { 4497 sector_t temp = max; 4498 4499 err = -EINVAL; 4500 if (sector_div(temp, chunk)) 4501 goto out_unlock; 4502 } 4503 mddev->resync_max = max; 4504 } 4505 wake_up(&mddev->recovery_wait); 4506 err = 0; 4507out_unlock: 4508 spin_unlock(&mddev->lock); 4509 return err ?: len; 4510} 4511 4512static struct md_sysfs_entry md_max_sync = 4513__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4514 4515static ssize_t 4516suspend_lo_show(struct mddev *mddev, char *page) 4517{ 4518 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4519} 4520 4521static ssize_t 4522suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4523{ 4524 char *e; 4525 unsigned long long new = simple_strtoull(buf, &e, 10); 4526 unsigned long long old; 4527 int err; 4528 4529 if (buf == e || (*e && *e != '\n')) 4530 return -EINVAL; 4531 4532 err = mddev_lock(mddev); 4533 if (err) 4534 return err; 4535 err = -EINVAL; 4536 if (mddev->pers == NULL || 4537 mddev->pers->quiesce == NULL) 4538 goto unlock; 4539 old = mddev->suspend_lo; 4540 mddev->suspend_lo = new; 4541 if (new >= old) 4542 /* Shrinking suspended region */ 4543 mddev->pers->quiesce(mddev, 2); 4544 else { 4545 /* Expanding suspended region - need to wait */ 4546 mddev->pers->quiesce(mddev, 1); 4547 mddev->pers->quiesce(mddev, 0); 4548 } 4549 err = 0; 4550unlock: 4551 mddev_unlock(mddev); 4552 return err ?: len; 4553} 4554static struct md_sysfs_entry md_suspend_lo = 4555__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4556 4557static ssize_t 4558suspend_hi_show(struct mddev *mddev, char *page) 4559{ 4560 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 4561} 4562 4563static ssize_t 
4564suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4565{ 4566 char *e; 4567 unsigned long long new = simple_strtoull(buf, &e, 10); 4568 unsigned long long old; 4569 int err; 4570 4571 if (buf == e || (*e && *e != '\n')) 4572 return -EINVAL; 4573 4574 err = mddev_lock(mddev); 4575 if (err) 4576 return err; 4577 err = -EINVAL; 4578 if (mddev->pers == NULL || 4579 mddev->pers->quiesce == NULL) 4580 goto unlock; 4581 old = mddev->suspend_hi; 4582 mddev->suspend_hi = new; 4583 if (new <= old) 4584 /* Shrinking suspended region */ 4585 mddev->pers->quiesce(mddev, 2); 4586 else { 4587 /* Expanding suspended region - need to wait */ 4588 mddev->pers->quiesce(mddev, 1); 4589 mddev->pers->quiesce(mddev, 0); 4590 } 4591 err = 0; 4592unlock: 4593 mddev_unlock(mddev); 4594 return err ?: len; 4595} 4596static struct md_sysfs_entry md_suspend_hi = 4597__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4598 4599static ssize_t 4600reshape_position_show(struct mddev *mddev, char *page) 4601{ 4602 if (mddev->reshape_position != MaxSector) 4603 return sprintf(page, "%llu\n", 4604 (unsigned long long)mddev->reshape_position); 4605 strcpy(page, "none\n"); 4606 return 5; 4607} 4608 4609static ssize_t 4610reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4611{ 4612 struct md_rdev *rdev; 4613 char *e; 4614 int err; 4615 unsigned long long new = simple_strtoull(buf, &e, 10); 4616 4617 if (buf == e || (*e && *e != '\n')) 4618 return -EINVAL; 4619 err = mddev_lock(mddev); 4620 if (err) 4621 return err; 4622 err = -EBUSY; 4623 if (mddev->pers) 4624 goto unlock; 4625 mddev->reshape_position = new; 4626 mddev->delta_disks = 0; 4627 mddev->reshape_backwards = 0; 4628 mddev->new_level = mddev->level; 4629 mddev->new_layout = mddev->layout; 4630 mddev->new_chunk_sectors = mddev->chunk_sectors; 4631 rdev_for_each(rdev, mddev) 4632 rdev->new_data_offset = rdev->data_offset; 4633 err = 0; 4634unlock: 4635 mddev_unlock(mddev); 4636 return err ?: len; 4637} 4638 4639static struct md_sysfs_entry md_reshape_position = 4640__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4641 reshape_position_store); 4642 4643static ssize_t 4644reshape_direction_show(struct mddev *mddev, char *page) 4645{ 4646 return sprintf(page, "%s\n", 4647 mddev->reshape_backwards ? 
"backwards" : "forwards"); 4648} 4649 4650static ssize_t 4651reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4652{ 4653 int backwards = 0; 4654 int err; 4655 4656 if (cmd_match(buf, "forwards")) 4657 backwards = 0; 4658 else if (cmd_match(buf, "backwards")) 4659 backwards = 1; 4660 else 4661 return -EINVAL; 4662 if (mddev->reshape_backwards == backwards) 4663 return len; 4664 4665 err = mddev_lock(mddev); 4666 if (err) 4667 return err; 4668 /* check if we are allowed to change */ 4669 if (mddev->delta_disks) 4670 err = -EBUSY; 4671 else if (mddev->persistent && 4672 mddev->major_version == 0) 4673 err = -EINVAL; 4674 else 4675 mddev->reshape_backwards = backwards; 4676 mddev_unlock(mddev); 4677 return err ?: len; 4678} 4679 4680static struct md_sysfs_entry md_reshape_direction = 4681__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4682 reshape_direction_store); 4683 4684static ssize_t 4685array_size_show(struct mddev *mddev, char *page) 4686{ 4687 if (mddev->external_size) 4688 return sprintf(page, "%llu\n", 4689 (unsigned long long)mddev->array_sectors/2); 4690 else 4691 return sprintf(page, "default\n"); 4692} 4693 4694static ssize_t 4695array_size_store(struct mddev *mddev, const char *buf, size_t len) 4696{ 4697 sector_t sectors; 4698 int err; 4699 4700 err = mddev_lock(mddev); 4701 if (err) 4702 return err; 4703 4704 if (strncmp(buf, "default", 7) == 0) { 4705 if (mddev->pers) 4706 sectors = mddev->pers->size(mddev, 0, 0); 4707 else 4708 sectors = mddev->array_sectors; 4709 4710 mddev->external_size = 0; 4711 } else { 4712 if (strict_blocks_to_sectors(buf, §ors) < 0) 4713 err = -EINVAL; 4714 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 4715 err = -E2BIG; 4716 else 4717 mddev->external_size = 1; 4718 } 4719 4720 if (!err) { 4721 mddev->array_sectors = sectors; 4722 if (mddev->pers) { 4723 set_capacity(mddev->gendisk, mddev->array_sectors); 4724 revalidate_disk(mddev->gendisk); 4725 } 4726 } 4727 mddev_unlock(mddev); 4728 return err ?: len; 4729} 4730 4731static struct md_sysfs_entry md_array_size = 4732__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 4733 array_size_store); 4734 4735static struct attribute *md_default_attrs[] = { 4736 &md_level.attr, 4737 &md_layout.attr, 4738 &md_raid_disks.attr, 4739 &md_chunk_size.attr, 4740 &md_size.attr, 4741 &md_resync_start.attr, 4742 &md_metadata.attr, 4743 &md_new_device.attr, 4744 &md_safe_delay.attr, 4745 &md_array_state.attr, 4746 &md_reshape_position.attr, 4747 &md_reshape_direction.attr, 4748 &md_array_size.attr, 4749 &max_corr_read_errors.attr, 4750 NULL, 4751}; 4752 4753static struct attribute *md_redundancy_attrs[] = { 4754 &md_scan_mode.attr, 4755 &md_last_scan_mode.attr, 4756 &md_mismatches.attr, 4757 &md_sync_min.attr, 4758 &md_sync_max.attr, 4759 &md_sync_speed.attr, 4760 &md_sync_force_parallel.attr, 4761 &md_sync_completed.attr, 4762 &md_min_sync.attr, 4763 &md_max_sync.attr, 4764 &md_suspend_lo.attr, 4765 &md_suspend_hi.attr, 4766 &md_bitmap.attr, 4767 &md_degraded.attr, 4768 NULL, 4769}; 4770static struct attribute_group md_redundancy_group = { 4771 .name = NULL, 4772 .attrs = md_redundancy_attrs, 4773}; 4774 4775static ssize_t 4776md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4777{ 4778 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4779 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4780 ssize_t rv; 4781 4782 if (!entry->show) 4783 return -EIO; 4784 spin_lock(&all_mddevs_lock); 4785 if 
(list_empty(&mddev->all_mddevs)) { 4786 spin_unlock(&all_mddevs_lock); 4787 return -EBUSY; 4788 } 4789 mddev_get(mddev); 4790 spin_unlock(&all_mddevs_lock); 4791 4792 rv = entry->show(mddev, page); 4793 mddev_put(mddev); 4794 return rv; 4795} 4796 4797static ssize_t 4798md_attr_store(struct kobject *kobj, struct attribute *attr, 4799 const char *page, size_t length) 4800{ 4801 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4802 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4803 ssize_t rv; 4804 4805 if (!entry->store) 4806 return -EIO; 4807 if (!capable(CAP_SYS_ADMIN)) 4808 return -EACCES; 4809 spin_lock(&all_mddevs_lock); 4810 if (list_empty(&mddev->all_mddevs)) { 4811 spin_unlock(&all_mddevs_lock); 4812 return -EBUSY; 4813 } 4814 mddev_get(mddev); 4815 spin_unlock(&all_mddevs_lock); 4816 rv = entry->store(mddev, page, length); 4817 mddev_put(mddev); 4818 return rv; 4819} 4820 4821static void md_free(struct kobject *ko) 4822{ 4823 struct mddev *mddev = container_of(ko, struct mddev, kobj); 4824 4825 if (mddev->sysfs_state) 4826 sysfs_put(mddev->sysfs_state); 4827 4828 if (mddev->queue) 4829 blk_cleanup_queue(mddev->queue); 4830 if (mddev->gendisk) { 4831 del_gendisk(mddev->gendisk); 4832 put_disk(mddev->gendisk); 4833 } 4834 4835 kfree(mddev); 4836} 4837 4838static const struct sysfs_ops md_sysfs_ops = { 4839 .show = md_attr_show, 4840 .store = md_attr_store, 4841}; 4842static struct kobj_type md_ktype = { 4843 .release = md_free, 4844 .sysfs_ops = &md_sysfs_ops, 4845 .default_attrs = md_default_attrs, 4846}; 4847 4848int mdp_major = 0; 4849 4850static void mddev_delayed_delete(struct work_struct *ws) 4851{ 4852 struct mddev *mddev = container_of(ws, struct mddev, del_work); 4853 4854 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4855 kobject_del(&mddev->kobj); 4856 kobject_put(&mddev->kobj); 4857} 4858 4859static int md_alloc(dev_t dev, char *name) 4860{ 4861 static DEFINE_MUTEX(disks_mutex); 4862 struct mddev *mddev = mddev_find(dev); 4863 struct gendisk *disk; 4864 int partitioned; 4865 int shift; 4866 int unit; 4867 int error; 4868 4869 if (!mddev) 4870 return -ENODEV; 4871 4872 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 4873 shift = partitioned ? MdpMinorShift : 0; 4874 unit = MINOR(mddev->unit) >> shift; 4875 4876 /* wait for any previous instance of this device to be 4877 * completely removed (mddev_delayed_delete). 4878 */ 4879 flush_workqueue(md_misc_wq); 4880 4881 mutex_lock(&disks_mutex); 4882 error = -EEXIST; 4883 if (mddev->gendisk) 4884 goto abort; 4885 4886 if (name) { 4887 /* Need to ensure that 'name' is not a duplicate. 
4888 */ 4889 struct mddev *mddev2; 4890 spin_lock(&all_mddevs_lock); 4891 4892 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 4893 if (mddev2->gendisk && 4894 strcmp(mddev2->gendisk->disk_name, name) == 0) { 4895 spin_unlock(&all_mddevs_lock); 4896 goto abort; 4897 } 4898 spin_unlock(&all_mddevs_lock); 4899 } 4900 4901 error = -ENOMEM; 4902 mddev->queue = blk_alloc_queue(GFP_KERNEL); 4903 if (!mddev->queue) 4904 goto abort; 4905 mddev->queue->queuedata = mddev; 4906 4907 blk_queue_make_request(mddev->queue, md_make_request); 4908 blk_set_stacking_limits(&mddev->queue->limits); 4909 4910 disk = alloc_disk(1 << shift); 4911 if (!disk) { 4912 blk_cleanup_queue(mddev->queue); 4913 mddev->queue = NULL; 4914 goto abort; 4915 } 4916 disk->major = MAJOR(mddev->unit); 4917 disk->first_minor = unit << shift; 4918 if (name) 4919 strcpy(disk->disk_name, name); 4920 else if (partitioned) 4921 sprintf(disk->disk_name, "md_d%d", unit); 4922 else 4923 sprintf(disk->disk_name, "md%d", unit); 4924 disk->fops = &md_fops; 4925 disk->private_data = mddev; 4926 disk->queue = mddev->queue; 4927 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); 4928 /* Allow extended partitions. This makes the 4929 * 'mdp' device redundant, but we can't really 4930 * remove it now. 4931 */ 4932 disk->flags |= GENHD_FL_EXT_DEVT; 4933 mddev->gendisk = disk; 4934 /* As soon as we call add_disk(), another thread could get 4935 * through to md_open, so make sure it doesn't get too far 4936 */ 4937 mutex_lock(&mddev->open_mutex); 4938 add_disk(disk); 4939 4940 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 4941 &disk_to_dev(disk)->kobj, "%s", "md"); 4942 if (error) { 4943 /* This isn't possible, but as kobject_init_and_add is marked 4944 * __must_check, we must do something with the result 4945 */ 4946 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 4947 disk->disk_name); 4948 error = 0; 4949 } 4950 if (mddev->kobj.sd && 4951 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 4952 printk(KERN_DEBUG "pointless warning\n"); 4953 mutex_unlock(&mddev->open_mutex); 4954 abort: 4955 mutex_unlock(&disks_mutex); 4956 if (!error && mddev->kobj.sd) { 4957 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4958 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 4959 } 4960 mddev_put(mddev); 4961 return error; 4962} 4963 4964static struct kobject *md_probe(dev_t dev, int *part, void *data) 4965{ 4966 md_alloc(dev, NULL); 4967 return NULL; 4968} 4969 4970static int add_named_array(const char *val, struct kernel_param *kp) 4971{ 4972 /* val must be "md_*" where * is not all digits. 4973 * We allocate an array with a large free minor number, and 4974 * set the name to val. val must not already be an active name. 
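 * For example, writing "md_home" to this handler (conventionally wired
 * up as the md_mod "new_array" module parameter, i.e.
 * /sys/module/md_mod/parameters/new_array) allocates an array with a
 * free minor whose gendisk is named "md_home".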
4975 */ 4976 int len = strlen(val); 4977 char buf[DISK_NAME_LEN]; 4978 4979 while (len && val[len-1] == '\n') 4980 len--; 4981 if (len >= DISK_NAME_LEN) 4982 return -E2BIG; 4983 strlcpy(buf, val, len+1); 4984 if (strncmp(buf, "md_", 3) != 0) 4985 return -EINVAL; 4986 return md_alloc(0, buf); 4987} 4988 4989static void md_safemode_timeout(unsigned long data) 4990{ 4991 struct mddev *mddev = (struct mddev *) data; 4992 4993 if (!atomic_read(&mddev->writes_pending)) { 4994 mddev->safemode = 1; 4995 if (mddev->external) 4996 sysfs_notify_dirent_safe(mddev->sysfs_state); 4997 } 4998 md_wakeup_thread(mddev->thread); 4999} 5000 5001static int start_dirty_degraded; 5002 5003int md_run(struct mddev *mddev) 5004{ 5005 int err; 5006 struct md_rdev *rdev; 5007 struct md_personality *pers; 5008 5009 if (list_empty(&mddev->disks)) 5010 /* cannot run an array with no devices.. */ 5011 return -EINVAL; 5012 5013 if (mddev->pers) 5014 return -EBUSY; 5015 /* Cannot run until previous stop completes properly */ 5016 if (mddev->sysfs_active) 5017 return -EBUSY; 5018 5019 /* 5020 * Analyze all RAID superblock(s) 5021 */ 5022 if (!mddev->raid_disks) { 5023 if (!mddev->persistent) 5024 return -EINVAL; 5025 analyze_sbs(mddev); 5026 } 5027 5028 if (mddev->level != LEVEL_NONE) 5029 request_module("md-level-%d", mddev->level); 5030 else if (mddev->clevel[0]) 5031 request_module("md-%s", mddev->clevel); 5032 5033 /* 5034 * Drop all container device buffers, from now on 5035 * the only valid external interface is through the md 5036 * device. 5037 */ 5038 rdev_for_each(rdev, mddev) { 5039 if (test_bit(Faulty, &rdev->flags)) 5040 continue; 5041 sync_blockdev(rdev->bdev); 5042 invalidate_bdev(rdev->bdev); 5043 5044 /* perform some consistency tests on the device. 5045 * We don't want the data to overlap the metadata, 5046 * Internal Bitmap issues have been handled elsewhere. 5047 */ 5048 if (rdev->meta_bdev) { 5049 /* Nothing to check */; 5050 } else if (rdev->data_offset < rdev->sb_start) { 5051 if (mddev->dev_sectors && 5052 rdev->data_offset + mddev->dev_sectors 5053 > rdev->sb_start) { 5054 printk("md: %s: data overlaps metadata\n", 5055 mdname(mddev)); 5056 return -EINVAL; 5057 } 5058 } else { 5059 if (rdev->sb_start + rdev->sb_size/512 5060 > rdev->data_offset) { 5061 printk("md: %s: metadata overlaps data\n", 5062 mdname(mddev)); 5063 return -EINVAL; 5064 } 5065 } 5066 sysfs_notify_dirent_safe(rdev->sysfs_state); 5067 } 5068 5069 if (mddev->bio_set == NULL) 5070 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 5071 5072 spin_lock(&pers_lock); 5073 pers = find_pers(mddev->level, mddev->clevel); 5074 if (!pers || !try_module_get(pers->owner)) { 5075 spin_unlock(&pers_lock); 5076 if (mddev->level != LEVEL_NONE) 5077 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 5078 mddev->level); 5079 else 5080 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 5081 mddev->clevel); 5082 return -EINVAL; 5083 } 5084 spin_unlock(&pers_lock); 5085 if (mddev->level != pers->level) { 5086 mddev->level = pers->level; 5087 mddev->new_level = pers->level; 5088 } 5089 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5090 5091 if (mddev->reshape_position != MaxSector && 5092 pers->start_reshape == NULL) { 5093 /* This personality cannot handle reshaping... */ 5094 module_put(pers->owner); 5095 return -EINVAL; 5096 } 5097 5098 if (pers->sync_request) { 5099 /* Warn if this is a potentially silly 5100 * configuration. 
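 * ("silly" meaning two members of a redundant array sit on partitions
 * of the same underlying physical disk, so a single disk failure can
 * take out more than one member at once.)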
5101 */ 5102 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5103 struct md_rdev *rdev2; 5104 int warned = 0; 5105 5106 rdev_for_each(rdev, mddev) 5107 rdev_for_each(rdev2, mddev) { 5108 if (rdev < rdev2 && 5109 rdev->bdev->bd_contains == 5110 rdev2->bdev->bd_contains) { 5111 printk(KERN_WARNING 5112 "%s: WARNING: %s appears to be" 5113 " on the same physical disk as" 5114 " %s.\n", 5115 mdname(mddev), 5116 bdevname(rdev->bdev,b), 5117 bdevname(rdev2->bdev,b2)); 5118 warned = 1; 5119 } 5120 } 5121 5122 if (warned) 5123 printk(KERN_WARNING 5124 "True protection against single-disk" 5125 " failure might be compromised.\n"); 5126 } 5127 5128 mddev->recovery = 0; 5129 /* may be over-ridden by personality */ 5130 mddev->resync_max_sectors = mddev->dev_sectors; 5131 5132 mddev->ok_start_degraded = start_dirty_degraded; 5133 5134 if (start_readonly && mddev->ro == 0) 5135 mddev->ro = 2; /* read-only, but switch on first write */ 5136 5137 err = pers->run(mddev); 5138 if (err) 5139 printk(KERN_ERR "md: pers->run() failed ...\n"); 5140 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5141 WARN_ONCE(!mddev->external_size, "%s: default size too small," 5142 " but 'external_size' not in effect?\n", __func__); 5143 printk(KERN_ERR 5144 "md: invalid array_size %llu > default size %llu\n", 5145 (unsigned long long)mddev->array_sectors / 2, 5146 (unsigned long long)pers->size(mddev, 0, 0) / 2); 5147 err = -EINVAL; 5148 } 5149 if (err == 0 && pers->sync_request && 5150 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5151 struct bitmap *bitmap; 5152 5153 bitmap = bitmap_create(mddev, -1); 5154 if (IS_ERR(bitmap)) { 5155 err = PTR_ERR(bitmap); 5156 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5157 mdname(mddev), err); 5158 } else 5159 mddev->bitmap = bitmap; 5160 5161 } 5162 if (err) { 5163 mddev_detach(mddev); 5164 if (mddev->private) 5165 pers->free(mddev, mddev->private); 5166 mddev->private = NULL; 5167 module_put(pers->owner); 5168 bitmap_destroy(mddev); 5169 return err; 5170 } 5171 if (mddev->queue) { 5172 mddev->queue->backing_dev_info.congested_data = mddev; 5173 mddev->queue->backing_dev_info.congested_fn = md_congested; 5174 blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec); 5175 } 5176 if (pers->sync_request) { 5177 if (mddev->kobj.sd && 5178 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5179 printk(KERN_WARNING 5180 "md: cannot register extra attributes for %s\n", 5181 mdname(mddev)); 5182 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5183 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5184 mddev->ro = 0; 5185 5186 atomic_set(&mddev->writes_pending,0); 5187 atomic_set(&mddev->max_corr_read_errors, 5188 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5189 mddev->safemode = 0; 5190 mddev->safemode_timer.function = md_safemode_timeout; 5191 mddev->safemode_timer.data = (unsigned long) mddev; 5192 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5193 mddev->in_sync = 1; 5194 smp_wmb(); 5195 spin_lock(&mddev->lock); 5196 mddev->pers = pers; 5197 mddev->ready = 1; 5198 spin_unlock(&mddev->lock); 5199 rdev_for_each(rdev, mddev) 5200 if (rdev->raid_disk >= 0) 5201 if (sysfs_link_rdev(mddev, rdev)) 5202 /* failure here is OK */; 5203 5204 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5205 5206 if (mddev->flags & MD_UPDATE_SB_FLAGS) 5207 md_update_sb(mddev, 0); 5208 5209 md_new_event(mddev); 5210 sysfs_notify_dirent_safe(mddev->sysfs_state); 5211 sysfs_notify_dirent_safe(mddev->sysfs_action); 5212 
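	/* the 'degraded' attribute is notified next so pollers re-read it too */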
sysfs_notify(&mddev->kobj, NULL, "degraded"); 5213 return 0; 5214} 5215EXPORT_SYMBOL_GPL(md_run); 5216 5217static int do_md_run(struct mddev *mddev) 5218{ 5219 int err; 5220 5221 err = md_run(mddev); 5222 if (err) 5223 goto out; 5224 err = bitmap_load(mddev); 5225 if (err) { 5226 bitmap_destroy(mddev); 5227 goto out; 5228 } 5229 5230 md_wakeup_thread(mddev->thread); 5231 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5232 5233 set_capacity(mddev->gendisk, mddev->array_sectors); 5234 revalidate_disk(mddev->gendisk); 5235 mddev->changed = 1; 5236 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5237out: 5238 return err; 5239} 5240 5241static int restart_array(struct mddev *mddev) 5242{ 5243 struct gendisk *disk = mddev->gendisk; 5244 5245 /* Complain if it has no devices */ 5246 if (list_empty(&mddev->disks)) 5247 return -ENXIO; 5248 if (!mddev->pers) 5249 return -EINVAL; 5250 if (!mddev->ro) 5251 return -EBUSY; 5252 mddev->safemode = 0; 5253 mddev->ro = 0; 5254 set_disk_ro(disk, 0); 5255 printk(KERN_INFO "md: %s switched to read-write mode.\n", 5256 mdname(mddev)); 5257 /* Kick recovery or resync if necessary */ 5258 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5259 md_wakeup_thread(mddev->thread); 5260 md_wakeup_thread(mddev->sync_thread); 5261 sysfs_notify_dirent_safe(mddev->sysfs_state); 5262 return 0; 5263} 5264 5265static void md_clean(struct mddev *mddev) 5266{ 5267 mddev->array_sectors = 0; 5268 mddev->external_size = 0; 5269 mddev->dev_sectors = 0; 5270 mddev->raid_disks = 0; 5271 mddev->recovery_cp = 0; 5272 mddev->resync_min = 0; 5273 mddev->resync_max = MaxSector; 5274 mddev->reshape_position = MaxSector; 5275 mddev->external = 0; 5276 mddev->persistent = 0; 5277 mddev->level = LEVEL_NONE; 5278 mddev->clevel[0] = 0; 5279 mddev->flags = 0; 5280 mddev->ro = 0; 5281 mddev->metadata_type[0] = 0; 5282 mddev->chunk_sectors = 0; 5283 mddev->ctime = mddev->utime = 0; 5284 mddev->layout = 0; 5285 mddev->max_disks = 0; 5286 mddev->events = 0; 5287 mddev->can_decrease_events = 0; 5288 mddev->delta_disks = 0; 5289 mddev->reshape_backwards = 0; 5290 mddev->new_level = LEVEL_NONE; 5291 mddev->new_layout = 0; 5292 mddev->new_chunk_sectors = 0; 5293 mddev->curr_resync = 0; 5294 atomic64_set(&mddev->resync_mismatches, 0); 5295 mddev->suspend_lo = mddev->suspend_hi = 0; 5296 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5297 mddev->recovery = 0; 5298 mddev->in_sync = 0; 5299 mddev->changed = 0; 5300 mddev->degraded = 0; 5301 mddev->safemode = 0; 5302 mddev->private = NULL; 5303 mddev->merge_check_needed = 0; 5304 mddev->bitmap_info.offset = 0; 5305 mddev->bitmap_info.default_offset = 0; 5306 mddev->bitmap_info.default_space = 0; 5307 mddev->bitmap_info.chunksize = 0; 5308 mddev->bitmap_info.daemon_sleep = 0; 5309 mddev->bitmap_info.max_write_behind = 0; 5310} 5311 5312static void __md_stop_writes(struct mddev *mddev) 5313{ 5314 if (mddev_is_clustered(mddev)) 5315 md_cluster_ops->metadata_update_start(mddev); 5316 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5317 flush_workqueue(md_misc_wq); 5318 if (mddev->sync_thread) { 5319 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5320 md_reap_sync_thread(mddev); 5321 } 5322 5323 del_timer_sync(&mddev->safemode_timer); 5324 5325 bitmap_flush(mddev); 5326 md_super_wait(mddev); 5327 5328 if (mddev->ro == 0 && 5329 (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { 5330 /* mark array as shutdown cleanly */ 5331 mddev->in_sync = 1; 5332 md_update_sb(mddev, 1); 5333 } 5334 if (mddev_is_clustered(mddev)) 
5335 md_cluster_ops->metadata_update_finish(mddev); 5336} 5337 5338void md_stop_writes(struct mddev *mddev) 5339{ 5340 mddev_lock_nointr(mddev); 5341 __md_stop_writes(mddev); 5342 mddev_unlock(mddev); 5343} 5344EXPORT_SYMBOL_GPL(md_stop_writes); 5345 5346static void mddev_detach(struct mddev *mddev) 5347{ 5348 struct bitmap *bitmap = mddev->bitmap; 5349 /* wait for behind writes to complete */ 5350 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 5351 printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", 5352 mdname(mddev)); 5353 /* need to kick something here to make sure I/O goes? */ 5354 wait_event(bitmap->behind_wait, 5355 atomic_read(&bitmap->behind_writes) == 0); 5356 } 5357 if (mddev->pers && mddev->pers->quiesce) { 5358 mddev->pers->quiesce(mddev, 1); 5359 mddev->pers->quiesce(mddev, 0); 5360 } 5361 md_unregister_thread(&mddev->thread); 5362 if (mddev->queue) 5363 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5364} 5365 5366static void __md_stop(struct mddev *mddev) 5367{ 5368 struct md_personality *pers = mddev->pers; 5369 mddev_detach(mddev); 5370 /* Ensure ->event_work is done */ 5371 flush_workqueue(md_misc_wq); 5372 spin_lock(&mddev->lock); 5373 mddev->ready = 0; 5374 mddev->pers = NULL; 5375 spin_unlock(&mddev->lock); 5376 pers->free(mddev, mddev->private); 5377 mddev->private = NULL; 5378 if (pers->sync_request && mddev->to_remove == NULL) 5379 mddev->to_remove = &md_redundancy_group; 5380 module_put(pers->owner); 5381 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5382} 5383 5384void md_stop(struct mddev *mddev) 5385{ 5386 /* stop the array and free an attached data structures. 5387 * This is called from dm-raid 5388 */ 5389 __md_stop(mddev); 5390 bitmap_destroy(mddev); 5391 if (mddev->bio_set) 5392 bioset_free(mddev->bio_set); 5393} 5394 5395EXPORT_SYMBOL_GPL(md_stop); 5396 5397static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5398{ 5399 int err = 0; 5400 int did_freeze = 0; 5401 5402 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5403 did_freeze = 1; 5404 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5405 md_wakeup_thread(mddev->thread); 5406 } 5407 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5408 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5409 if (mddev->sync_thread) 5410 /* Thread might be blocked waiting for metadata update 5411 * which will now never happen */ 5412 wake_up_process(mddev->sync_thread->tsk); 5413 5414 mddev_unlock(mddev); 5415 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 5416 &mddev->recovery)); 5417 mddev_lock_nointr(mddev); 5418 5419 mutex_lock(&mddev->open_mutex); 5420 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5421 mddev->sync_thread || 5422 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5423 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5424 printk("md: %s still in use.\n",mdname(mddev)); 5425 if (did_freeze) { 5426 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5427 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5428 md_wakeup_thread(mddev->thread); 5429 } 5430 err = -EBUSY; 5431 goto out; 5432 } 5433 if (mddev->pers) { 5434 __md_stop_writes(mddev); 5435 5436 err = -ENXIO; 5437 if (mddev->ro==1) 5438 goto out; 5439 mddev->ro = 1; 5440 set_disk_ro(mddev->gendisk, 1); 5441 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5442 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5443 md_wakeup_thread(mddev->thread); 5444 sysfs_notify_dirent_safe(mddev->sysfs_state); 5445 err = 0; 5446 } 5447out: 5448 
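	/* both the -EBUSY and the success paths funnel through here to drop open_mutex */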
mutex_unlock(&mddev->open_mutex); 5449 return err; 5450} 5451 5452/* mode: 5453 * 0 - completely stop and dis-assemble array 5454 * 2 - stop but do not disassemble array 5455 */ 5456static int do_md_stop(struct mddev *mddev, int mode, 5457 struct block_device *bdev) 5458{ 5459 struct gendisk *disk = mddev->gendisk; 5460 struct md_rdev *rdev; 5461 int did_freeze = 0; 5462 5463 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5464 did_freeze = 1; 5465 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5466 md_wakeup_thread(mddev->thread); 5467 } 5468 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5469 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5470 if (mddev->sync_thread) 5471 /* Thread might be blocked waiting for metadata update 5472 * which will now never happen */ 5473 wake_up_process(mddev->sync_thread->tsk); 5474 5475 mddev_unlock(mddev); 5476 wait_event(resync_wait, (mddev->sync_thread == NULL && 5477 !test_bit(MD_RECOVERY_RUNNING, 5478 &mddev->recovery))); 5479 mddev_lock_nointr(mddev); 5480 5481 mutex_lock(&mddev->open_mutex); 5482 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5483 mddev->sysfs_active || 5484 mddev->sync_thread || 5485 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5486 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5487 printk("md: %s still in use.\n",mdname(mddev)); 5488 mutex_unlock(&mddev->open_mutex); 5489 if (did_freeze) { 5490 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5491 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5492 md_wakeup_thread(mddev->thread); 5493 } 5494 return -EBUSY; 5495 } 5496 if (mddev->pers) { 5497 if (mddev->ro) 5498 set_disk_ro(disk, 0); 5499 5500 __md_stop_writes(mddev); 5501 __md_stop(mddev); 5502 mddev->queue->merge_bvec_fn = NULL; 5503 mddev->queue->backing_dev_info.congested_fn = NULL; 5504 5505 /* tell userspace to handle 'inactive' */ 5506 sysfs_notify_dirent_safe(mddev->sysfs_state); 5507 5508 rdev_for_each(rdev, mddev) 5509 if (rdev->raid_disk >= 0) 5510 sysfs_unlink_rdev(mddev, rdev); 5511 5512 set_capacity(disk, 0); 5513 mutex_unlock(&mddev->open_mutex); 5514 mddev->changed = 1; 5515 revalidate_disk(disk); 5516 5517 if (mddev->ro) 5518 mddev->ro = 0; 5519 } else 5520 mutex_unlock(&mddev->open_mutex); 5521 /* 5522 * Free resources if final stop 5523 */ 5524 if (mode == 0) { 5525 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 5526 5527 bitmap_destroy(mddev); 5528 if (mddev->bitmap_info.file) { 5529 struct file *f = mddev->bitmap_info.file; 5530 spin_lock(&mddev->lock); 5531 mddev->bitmap_info.file = NULL; 5532 spin_unlock(&mddev->lock); 5533 fput(f); 5534 } 5535 mddev->bitmap_info.offset = 0; 5536 5537 export_array(mddev); 5538 5539 md_clean(mddev); 5540 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5541 if (mddev->hold_active == UNTIL_STOP) 5542 mddev->hold_active = 0; 5543 } 5544 blk_integrity_unregister(disk); 5545 md_new_event(mddev); 5546 sysfs_notify_dirent_safe(mddev->sysfs_state); 5547 return 0; 5548} 5549 5550#ifndef MODULE 5551static void autorun_array(struct mddev *mddev) 5552{ 5553 struct md_rdev *rdev; 5554 int err; 5555 5556 if (list_empty(&mddev->disks)) 5557 return; 5558 5559 printk(KERN_INFO "md: running: "); 5560 5561 rdev_for_each(rdev, mddev) { 5562 char b[BDEVNAME_SIZE]; 5563 printk("<%s>", bdevname(rdev->bdev,b)); 5564 } 5565 printk("\n"); 5566 5567 err = do_md_run(mddev); 5568 if (err) { 5569 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5570 do_md_stop(mddev, 0, NULL); 5571 } 5572} 5573 5574/* 5575 * lets try to run arrays 
based on all disks that have arrived 5576 * until now. (those are in pending_raid_disks) 5577 * 5578 * the method: pick the first pending disk, collect all disks with 5579 * the same UUID, remove all from the pending list and put them into 5580 * the 'same_array' list. Then order this list based on superblock 5581 * update time (freshest comes first), kick out 'old' disks and 5582 * compare superblocks. If everything's fine then run it. 5583 * 5584 * If "unit" is allocated, then bump its reference count 5585 */ 5586static void autorun_devices(int part) 5587{ 5588 struct md_rdev *rdev0, *rdev, *tmp; 5589 struct mddev *mddev; 5590 char b[BDEVNAME_SIZE]; 5591 5592 printk(KERN_INFO "md: autorun ...\n"); 5593 while (!list_empty(&pending_raid_disks)) { 5594 int unit; 5595 dev_t dev; 5596 LIST_HEAD(candidates); 5597 rdev0 = list_entry(pending_raid_disks.next, 5598 struct md_rdev, same_set); 5599 5600 printk(KERN_INFO "md: considering %s ...\n", 5601 bdevname(rdev0->bdev,b)); 5602 INIT_LIST_HEAD(&candidates); 5603 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 5604 if (super_90_load(rdev, rdev0, 0) >= 0) { 5605 printk(KERN_INFO "md: adding %s ...\n", 5606 bdevname(rdev->bdev,b)); 5607 list_move(&rdev->same_set, &candidates); 5608 } 5609 /* 5610 * now we have a set of devices, with all of them having 5611 * mostly sane superblocks. It's time to allocate the 5612 * mddev. 5613 */ 5614 if (part) { 5615 dev = MKDEV(mdp_major, 5616 rdev0->preferred_minor << MdpMinorShift); 5617 unit = MINOR(dev) >> MdpMinorShift; 5618 } else { 5619 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 5620 unit = MINOR(dev); 5621 } 5622 if (rdev0->preferred_minor != unit) { 5623 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 5624 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 5625 break; 5626 } 5627 5628 md_probe(dev, NULL, NULL); 5629 mddev = mddev_find(dev); 5630 if (!mddev || !mddev->gendisk) { 5631 if (mddev) 5632 mddev_put(mddev); 5633 printk(KERN_ERR 5634 "md: cannot allocate memory for md drive.\n"); 5635 break; 5636 } 5637 if (mddev_lock(mddev)) 5638 printk(KERN_WARNING "md: %s locked, cannot run\n", 5639 mdname(mddev)); 5640 else if (mddev->raid_disks || mddev->major_version 5641 || !list_empty(&mddev->disks)) { 5642 printk(KERN_WARNING 5643 "md: %s already running, cannot run %s\n", 5644 mdname(mddev), bdevname(rdev0->bdev,b)); 5645 mddev_unlock(mddev); 5646 } else { 5647 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 5648 mddev->persistent = 1; 5649 rdev_for_each_list(rdev, tmp, &candidates) { 5650 list_del_init(&rdev->same_set); 5651 if (bind_rdev_to_array(rdev, mddev)) 5652 export_rdev(rdev); 5653 } 5654 autorun_array(mddev); 5655 mddev_unlock(mddev); 5656 } 5657 /* on success, candidates will be empty, on error 5658 * it won't... 5659 */ 5660 rdev_for_each_list(rdev, tmp, &candidates) { 5661 list_del_init(&rdev->same_set); 5662 export_rdev(rdev); 5663 } 5664 mddev_put(mddev); 5665 } 5666 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 5667} 5668#endif /* !MODULE */ 5669 5670static int get_version(void __user *arg) 5671{ 5672 mdu_version_t ver; 5673 5674 ver.major = MD_MAJOR_VERSION; 5675 ver.minor = MD_MINOR_VERSION; 5676 ver.patchlevel = MD_PATCHLEVEL_VERSION; 5677 5678 if (copy_to_user(arg, &ver, sizeof(ver))) 5679 return -EFAULT; 5680 5681 return 0; 5682} 5683 5684static int get_array_info(struct mddev *mddev, void __user *arg) 5685{ 5686 mdu_array_info_t info; 5687 int nr,working,insync,failed,spare; 5688 struct md_rdev *rdev; 5689 5690 nr = working = insync = failed = spare = 0; 5691 rcu_read_lock(); 5692 rdev_for_each_rcu(rdev, mddev) { 5693 nr++; 5694 if (test_bit(Faulty, &rdev->flags)) 5695 failed++; 5696 else { 5697 working++; 5698 if (test_bit(In_sync, &rdev->flags)) 5699 insync++; 5700 else 5701 spare++; 5702 } 5703 } 5704 rcu_read_unlock(); 5705 5706 info.major_version = mddev->major_version; 5707 info.minor_version = mddev->minor_version; 5708 info.patch_version = MD_PATCHLEVEL_VERSION; 5709 info.ctime = mddev->ctime; 5710 info.level = mddev->level; 5711 info.size = mddev->dev_sectors / 2; 5712 if (info.size != mddev->dev_sectors / 2) /* overflow */ 5713 info.size = -1; 5714 info.nr_disks = nr; 5715 info.raid_disks = mddev->raid_disks; 5716 info.md_minor = mddev->md_minor; 5717 info.not_persistent= !mddev->persistent; 5718 5719 info.utime = mddev->utime; 5720 info.state = 0; 5721 if (mddev->in_sync) 5722 info.state = (1<<MD_SB_CLEAN); 5723 if (mddev->bitmap && mddev->bitmap_info.offset) 5724 info.state |= (1<<MD_SB_BITMAP_PRESENT); 5725 if (mddev_is_clustered(mddev)) 5726 info.state |= (1<<MD_SB_CLUSTERED); 5727 info.active_disks = insync; 5728 info.working_disks = working; 5729 info.failed_disks = failed; 5730 info.spare_disks = spare; 5731 5732 info.layout = mddev->layout; 5733 info.chunk_size = mddev->chunk_sectors << 9; 5734 5735 if (copy_to_user(arg, &info, sizeof(info))) 5736 return -EFAULT; 5737 5738 return 0; 5739} 5740 5741static int get_bitmap_file(struct mddev *mddev, void __user * arg) 5742{ 5743 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5744 char *ptr; 5745 int err; 5746 5747 file = kzalloc(sizeof(*file), GFP_NOIO); 5748 if (!file) 5749 return -ENOMEM; 5750 5751 err = 0; 5752 spin_lock(&mddev->lock); 5753 /* bitmap disabled, zero the first byte and copy out */ 5754 if (!mddev->bitmap_info.file) 5755 file->pathname[0] = '\0'; 5756 else if ((ptr = d_path(&mddev->bitmap_info.file->f_path, 5757 file->pathname, sizeof(file->pathname))), 5758 IS_ERR(ptr)) 5759 err = PTR_ERR(ptr); 5760 else 5761 memmove(file->pathname, ptr, 5762 sizeof(file->pathname)-(ptr-file->pathname)); 5763 spin_unlock(&mddev->lock); 5764 5765 if (err == 0 && 5766 copy_to_user(arg, file, sizeof(*file))) 5767 err = -EFAULT; 5768 5769 kfree(file); 5770 return err; 5771} 5772 5773static int get_disk_info(struct mddev *mddev, void __user * arg) 5774{ 5775 mdu_disk_info_t info; 5776 struct md_rdev *rdev; 5777 5778 if (copy_from_user(&info, arg, sizeof(info))) 5779 return -EFAULT; 5780 5781 rcu_read_lock(); 5782 rdev = md_find_rdev_nr_rcu(mddev, info.number); 5783 if (rdev) { 5784 info.major = MAJOR(rdev->bdev->bd_dev); 5785 info.minor = MINOR(rdev->bdev->bd_dev); 5786 info.raid_disk = rdev->raid_disk; 5787 info.state = 0; 5788 if (test_bit(Faulty, &rdev->flags)) 5789 info.state |= (1<<MD_DISK_FAULTY); 5790 else if (test_bit(In_sync, &rdev->flags)) { 5791 info.state |= (1<<MD_DISK_ACTIVE); 5792 info.state |= (1<<MD_DISK_SYNC); 5793 } 5794 if (test_bit(WriteMostly, &rdev->flags)) 5795 
info.state |= (1<<MD_DISK_WRITEMOSTLY); 5796 } else { 5797 info.major = info.minor = 0; 5798 info.raid_disk = -1; 5799 info.state = (1<<MD_DISK_REMOVED); 5800 } 5801 rcu_read_unlock(); 5802 5803 if (copy_to_user(arg, &info, sizeof(info))) 5804 return -EFAULT; 5805 5806 return 0; 5807} 5808 5809static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) 5810{ 5811 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5812 struct md_rdev *rdev; 5813 dev_t dev = MKDEV(info->major,info->minor); 5814 5815 if (mddev_is_clustered(mddev) && 5816 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 5817 pr_err("%s: Cannot add to clustered mddev.\n", 5818 mdname(mddev)); 5819 return -EINVAL; 5820 } 5821 5822 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 5823 return -EOVERFLOW; 5824 5825 if (!mddev->raid_disks) { 5826 int err; 5827 /* expecting a device which has a superblock */ 5828 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 5829 if (IS_ERR(rdev)) { 5830 printk(KERN_WARNING 5831 "md: md_import_device returned %ld\n", 5832 PTR_ERR(rdev)); 5833 return PTR_ERR(rdev); 5834 } 5835 if (!list_empty(&mddev->disks)) { 5836 struct md_rdev *rdev0 5837 = list_entry(mddev->disks.next, 5838 struct md_rdev, same_set); 5839 err = super_types[mddev->major_version] 5840 .load_super(rdev, rdev0, mddev->minor_version); 5841 if (err < 0) { 5842 printk(KERN_WARNING 5843 "md: %s has different UUID to %s\n", 5844 bdevname(rdev->bdev,b), 5845 bdevname(rdev0->bdev,b2)); 5846 export_rdev(rdev); 5847 return -EINVAL; 5848 } 5849 } 5850 err = bind_rdev_to_array(rdev, mddev); 5851 if (err) 5852 export_rdev(rdev); 5853 return err; 5854 } 5855 5856 /* 5857 * add_new_disk can be used once the array is assembled 5858 * to add "hot spares". They must already have a superblock 5859 * written 5860 */ 5861 if (mddev->pers) { 5862 int err; 5863 if (!mddev->pers->hot_add_disk) { 5864 printk(KERN_WARNING 5865 "%s: personality does not support diskops!\n", 5866 mdname(mddev)); 5867 return -EINVAL; 5868 } 5869 if (mddev->persistent) 5870 rdev = md_import_device(dev, mddev->major_version, 5871 mddev->minor_version); 5872 else 5873 rdev = md_import_device(dev, -1, -1); 5874 if (IS_ERR(rdev)) { 5875 printk(KERN_WARNING 5876 "md: md_import_device returned %ld\n", 5877 PTR_ERR(rdev)); 5878 return PTR_ERR(rdev); 5879 } 5880 /* set saved_raid_disk if appropriate */ 5881 if (!mddev->persistent) { 5882 if (info->state & (1<<MD_DISK_SYNC) && 5883 info->raid_disk < mddev->raid_disks) { 5884 rdev->raid_disk = info->raid_disk; 5885 set_bit(In_sync, &rdev->flags); 5886 clear_bit(Bitmap_sync, &rdev->flags); 5887 } else 5888 rdev->raid_disk = -1; 5889 rdev->saved_raid_disk = rdev->raid_disk; 5890 } else 5891 super_types[mddev->major_version]. 5892 validate_super(mddev, rdev); 5893 if ((info->state & (1<<MD_DISK_SYNC)) && 5894 rdev->raid_disk != info->raid_disk) { 5895 /* This was a hot-add request, but events doesn't 5896 * match, so reject it. 
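 * (validate_super() placed the device in a different slot than the
 * caller requested, i.e. the on-disk superblock is stale with respect
 * to the running array.)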
5897 */ 5898 export_rdev(rdev); 5899 return -EINVAL; 5900 } 5901 5902 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 5903 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5904 set_bit(WriteMostly, &rdev->flags); 5905 else 5906 clear_bit(WriteMostly, &rdev->flags); 5907 5908 /* 5909 * check whether the device shows up in other nodes 5910 */ 5911 if (mddev_is_clustered(mddev)) { 5912 if (info->state & (1 << MD_DISK_CANDIDATE)) { 5913 /* Through --cluster-confirm */ 5914 set_bit(Candidate, &rdev->flags); 5915 err = md_cluster_ops->new_disk_ack(mddev, true); 5916 if (err) { 5917 export_rdev(rdev); 5918 return err; 5919 } 5920 } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 5921 /* --add initiated by this node */ 5922 err = md_cluster_ops->add_new_disk_start(mddev, rdev); 5923 if (err) { 5924 md_cluster_ops->add_new_disk_finish(mddev); 5925 export_rdev(rdev); 5926 return err; 5927 } 5928 } 5929 } 5930 5931 rdev->raid_disk = -1; 5932 err = bind_rdev_to_array(rdev, mddev); 5933 if (err) 5934 export_rdev(rdev); 5935 else 5936 err = add_bound_rdev(rdev); 5937 if (mddev_is_clustered(mddev) && 5938 (info->state & (1 << MD_DISK_CLUSTER_ADD))) 5939 md_cluster_ops->add_new_disk_finish(mddev); 5940 return err; 5941 } 5942 5943 /* otherwise, add_new_disk is only allowed 5944 * for major_version==0 superblocks 5945 */ 5946 if (mddev->major_version != 0) { 5947 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 5948 mdname(mddev)); 5949 return -EINVAL; 5950 } 5951 5952 if (!(info->state & (1<<MD_DISK_FAULTY))) { 5953 int err; 5954 rdev = md_import_device(dev, -1, 0); 5955 if (IS_ERR(rdev)) { 5956 printk(KERN_WARNING 5957 "md: error, md_import_device() returned %ld\n", 5958 PTR_ERR(rdev)); 5959 return PTR_ERR(rdev); 5960 } 5961 rdev->desc_nr = info->number; 5962 if (info->raid_disk < mddev->raid_disks) 5963 rdev->raid_disk = info->raid_disk; 5964 else 5965 rdev->raid_disk = -1; 5966 5967 if (rdev->raid_disk < mddev->raid_disks) 5968 if (info->state & (1<<MD_DISK_SYNC)) 5969 set_bit(In_sync, &rdev->flags); 5970 5971 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5972 set_bit(WriteMostly, &rdev->flags); 5973 5974 if (!mddev->persistent) { 5975 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 5976 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 5977 } else 5978 rdev->sb_start = calc_dev_sboffset(rdev); 5979 rdev->sectors = rdev->sb_start; 5980 5981 err = bind_rdev_to_array(rdev, mddev); 5982 if (err) { 5983 export_rdev(rdev); 5984 return err; 5985 } 5986 } 5987 5988 return 0; 5989} 5990 5991static int hot_remove_disk(struct mddev *mddev, dev_t dev) 5992{ 5993 char b[BDEVNAME_SIZE]; 5994 struct md_rdev *rdev; 5995 5996 rdev = find_rdev(mddev, dev); 5997 if (!rdev) 5998 return -ENXIO; 5999 6000 if (mddev_is_clustered(mddev)) 6001 md_cluster_ops->metadata_update_start(mddev); 6002 6003 clear_bit(Blocked, &rdev->flags); 6004 remove_and_add_spares(mddev, rdev); 6005 6006 if (rdev->raid_disk >= 0) 6007 goto busy; 6008 6009 if (mddev_is_clustered(mddev)) 6010 md_cluster_ops->remove_disk(mddev, rdev); 6011 6012 md_kick_rdev_from_array(rdev); 6013 md_update_sb(mddev, 1); 6014 md_new_event(mddev); 6015 6016 if (mddev_is_clustered(mddev)) 6017 md_cluster_ops->metadata_update_finish(mddev); 6018 6019 return 0; 6020busy: 6021 if (mddev_is_clustered(mddev)) 6022 md_cluster_ops->metadata_update_cancel(mddev); 6023 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 6024 bdevname(rdev->bdev,b), mdname(mddev)); 6025 return -EBUSY; 6026} 6027 6028static int hot_add_disk(struct 
mddev *mddev, dev_t dev) 6029{ 6030 char b[BDEVNAME_SIZE]; 6031 int err; 6032 struct md_rdev *rdev; 6033 6034 if (!mddev->pers) 6035 return -ENODEV; 6036 6037 if (mddev->major_version != 0) { 6038 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 6039 " version-0 superblocks.\n", 6040 mdname(mddev)); 6041 return -EINVAL; 6042 } 6043 if (!mddev->pers->hot_add_disk) { 6044 printk(KERN_WARNING 6045 "%s: personality does not support diskops!\n", 6046 mdname(mddev)); 6047 return -EINVAL; 6048 } 6049 6050 rdev = md_import_device(dev, -1, 0); 6051 if (IS_ERR(rdev)) { 6052 printk(KERN_WARNING 6053 "md: error, md_import_device() returned %ld\n", 6054 PTR_ERR(rdev)); 6055 return -EINVAL; 6056 } 6057 6058 if (mddev->persistent) 6059 rdev->sb_start = calc_dev_sboffset(rdev); 6060 else 6061 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6062 6063 rdev->sectors = rdev->sb_start; 6064 6065 if (test_bit(Faulty, &rdev->flags)) { 6066 printk(KERN_WARNING 6067 "md: can not hot-add faulty %s disk to %s!\n", 6068 bdevname(rdev->bdev,b), mdname(mddev)); 6069 err = -EINVAL; 6070 goto abort_export; 6071 } 6072 6073 if (mddev_is_clustered(mddev)) 6074 md_cluster_ops->metadata_update_start(mddev); 6075 clear_bit(In_sync, &rdev->flags); 6076 rdev->desc_nr = -1; 6077 rdev->saved_raid_disk = -1; 6078 err = bind_rdev_to_array(rdev, mddev); 6079 if (err) 6080 goto abort_clustered; 6081 6082 /* 6083 * The rest should better be atomic, we can have disk failures 6084 * noticed in interrupt contexts ... 6085 */ 6086 6087 rdev->raid_disk = -1; 6088 6089 md_update_sb(mddev, 1); 6090 6091 if (mddev_is_clustered(mddev)) 6092 md_cluster_ops->metadata_update_finish(mddev); 6093 /* 6094 * Kick recovery, maybe this spare has to be added to the 6095 * array immediately. 6096 */ 6097 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6098 md_wakeup_thread(mddev->thread); 6099 md_new_event(mddev); 6100 return 0; 6101 6102abort_clustered: 6103 if (mddev_is_clustered(mddev)) 6104 md_cluster_ops->metadata_update_cancel(mddev); 6105abort_export: 6106 export_rdev(rdev); 6107 return err; 6108} 6109 6110static int set_bitmap_file(struct mddev *mddev, int fd) 6111{ 6112 int err = 0; 6113 6114 if (mddev->pers) { 6115 if (!mddev->pers->quiesce || !mddev->thread) 6116 return -EBUSY; 6117 if (mddev->recovery || mddev->sync_thread) 6118 return -EBUSY; 6119 /* we should be able to change the bitmap.. 
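 *
 * Illustrative userspace sketch (not part of this driver; md_fd and
 * bitmap_fd are placeholder descriptors for /dev/mdX and a regular,
 * writable bitmap file): passing an open fd attaches a file-backed
 * bitmap, passing -1 detaches it again:
 *
 *	ioctl(md_fd, SET_BITMAP_FILE, bitmap_fd);
 *	ioctl(md_fd, SET_BITMAP_FILE, -1);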
*/ 6120 } 6121 6122 if (fd >= 0) { 6123 struct inode *inode; 6124 struct file *f; 6125 6126 if (mddev->bitmap || mddev->bitmap_info.file) 6127 return -EEXIST; /* cannot add when bitmap is present */ 6128 f = fget(fd); 6129 6130 if (f == NULL) { 6131 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 6132 mdname(mddev)); 6133 return -EBADF; 6134 } 6135 6136 inode = f->f_mapping->host; 6137 if (!S_ISREG(inode->i_mode)) { 6138 printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", 6139 mdname(mddev)); 6140 err = -EBADF; 6141 } else if (!(f->f_mode & FMODE_WRITE)) { 6142 printk(KERN_ERR "%s: error: bitmap file must open for write\n", 6143 mdname(mddev)); 6144 err = -EBADF; 6145 } else if (atomic_read(&inode->i_writecount) != 1) { 6146 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 6147 mdname(mddev)); 6148 err = -EBUSY; 6149 } 6150 if (err) { 6151 fput(f); 6152 return err; 6153 } 6154 mddev->bitmap_info.file = f; 6155 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6156 } else if (mddev->bitmap == NULL) 6157 return -ENOENT; /* cannot remove what isn't there */ 6158 err = 0; 6159 if (mddev->pers) { 6160 mddev->pers->quiesce(mddev, 1); 6161 if (fd >= 0) { 6162 struct bitmap *bitmap; 6163 6164 bitmap = bitmap_create(mddev, -1); 6165 if (!IS_ERR(bitmap)) { 6166 mddev->bitmap = bitmap; 6167 err = bitmap_load(mddev); 6168 } else 6169 err = PTR_ERR(bitmap); 6170 } 6171 if (fd < 0 || err) { 6172 bitmap_destroy(mddev); 6173 fd = -1; /* make sure to put the file */ 6174 } 6175 mddev->pers->quiesce(mddev, 0); 6176 } 6177 if (fd < 0) { 6178 struct file *f = mddev->bitmap_info.file; 6179 if (f) { 6180 spin_lock(&mddev->lock); 6181 mddev->bitmap_info.file = NULL; 6182 spin_unlock(&mddev->lock); 6183 fput(f); 6184 } 6185 } 6186 6187 return err; 6188} 6189 6190/* 6191 * set_array_info is used two different ways 6192 * The original usage is when creating a new array. 6193 * In this usage, raid_disks is > 0 and it together with 6194 * level, size, not_persistent,layout,chunksize determine the 6195 * shape of the array. 6196 * This will always create an array with a type-0.90.0 superblock. 6197 * The newer usage is when assembling an array. 6198 * In this case raid_disks will be 0, and the major_version field is 6199 * use to determine which style super-blocks are to be found on the devices. 6200 * The minor and patch _version numbers are also kept incase the 6201 * super_block handler wishes to interpret them. 6202 */ 6203static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) 6204{ 6205 6206 if (info->raid_disks == 0) { 6207 /* just setting version number for superblock loading */ 6208 if (info->major_version < 0 || 6209 info->major_version >= ARRAY_SIZE(super_types) || 6210 super_types[info->major_version].name == NULL) { 6211 /* maybe try to auto-load a module? */ 6212 printk(KERN_INFO 6213 "md: superblock version %d not known\n", 6214 info->major_version); 6215 return -EINVAL; 6216 } 6217 mddev->major_version = info->major_version; 6218 mddev->minor_version = info->minor_version; 6219 mddev->patch_version = info->patch_version; 6220 mddev->persistent = !info->not_persistent; 6221 /* ensure mddev_put doesn't delete this now that there 6222 * is some minimal configuration. 
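 * (mddev_put() is expected to free an idle mddev only when it has no
 * raid_disks, an empty ->disks list, ctime == 0 and hold_active clear,
 * so recording ctime below is what pins this half-configured array.)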
6223 */ 6224 mddev->ctime = get_seconds(); 6225 return 0; 6226 } 6227 mddev->major_version = MD_MAJOR_VERSION; 6228 mddev->minor_version = MD_MINOR_VERSION; 6229 mddev->patch_version = MD_PATCHLEVEL_VERSION; 6230 mddev->ctime = get_seconds(); 6231 6232 mddev->level = info->level; 6233 mddev->clevel[0] = 0; 6234 mddev->dev_sectors = 2 * (sector_t)info->size; 6235 mddev->raid_disks = info->raid_disks; 6236 /* don't set md_minor, it is determined by which /dev/md* was 6237 * openned 6238 */ 6239 if (info->state & (1<<MD_SB_CLEAN)) 6240 mddev->recovery_cp = MaxSector; 6241 else 6242 mddev->recovery_cp = 0; 6243 mddev->persistent = ! info->not_persistent; 6244 mddev->external = 0; 6245 6246 mddev->layout = info->layout; 6247 mddev->chunk_sectors = info->chunk_size >> 9; 6248 6249 mddev->max_disks = MD_SB_DISKS; 6250 6251 if (mddev->persistent) 6252 mddev->flags = 0; 6253 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6254 6255 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 6256 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 6257 mddev->bitmap_info.offset = 0; 6258 6259 mddev->reshape_position = MaxSector; 6260 6261 /* 6262 * Generate a 128 bit UUID 6263 */ 6264 get_random_bytes(mddev->uuid, 16); 6265 6266 mddev->new_level = mddev->level; 6267 mddev->new_chunk_sectors = mddev->chunk_sectors; 6268 mddev->new_layout = mddev->layout; 6269 mddev->delta_disks = 0; 6270 mddev->reshape_backwards = 0; 6271 6272 return 0; 6273} 6274 6275void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 6276{ 6277 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 6278 6279 if (mddev->external_size) 6280 return; 6281 6282 mddev->array_sectors = array_sectors; 6283} 6284EXPORT_SYMBOL(md_set_array_sectors); 6285 6286static int update_size(struct mddev *mddev, sector_t num_sectors) 6287{ 6288 struct md_rdev *rdev; 6289 int rv; 6290 int fit = (num_sectors == 0); 6291 6292 if (mddev->pers->resize == NULL) 6293 return -EINVAL; 6294 /* The "num_sectors" is the number of sectors of each device that 6295 * is used. This can only make sense for arrays with redundancy. 6296 * linear and raid0 always use whatever space is available. We can only 6297 * consider changing this number if no resync or reconstruction is 6298 * happening, and if the new size is acceptable. It must fit before the 6299 * sb_start or, if that is <data_offset, it must fit before the size 6300 * of each device. If num_sectors is zero, we find the largest size 6301 * that fits. 
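 *
 * For scale (illustrative numbers only): the ioctl path below passes
 * info->size * 2, so a userspace request of size=524288 (KiB, i.e.
 * 512MiB per device) arrives here as num_sectors == 1048576, while
 * size=0 keeps the "largest size that fits" behaviour.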
6302 */ 6303 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6304 mddev->sync_thread) 6305 return -EBUSY; 6306 if (mddev->ro) 6307 return -EROFS; 6308 6309 rdev_for_each(rdev, mddev) { 6310 sector_t avail = rdev->sectors; 6311 6312 if (fit && (num_sectors == 0 || num_sectors > avail)) 6313 num_sectors = avail; 6314 if (avail < num_sectors) 6315 return -ENOSPC; 6316 } 6317 rv = mddev->pers->resize(mddev, num_sectors); 6318 if (!rv) 6319 revalidate_disk(mddev->gendisk); 6320 return rv; 6321} 6322 6323static int update_raid_disks(struct mddev *mddev, int raid_disks) 6324{ 6325 int rv; 6326 struct md_rdev *rdev; 6327 /* change the number of raid disks */ 6328 if (mddev->pers->check_reshape == NULL) 6329 return -EINVAL; 6330 if (mddev->ro) 6331 return -EROFS; 6332 if (raid_disks <= 0 || 6333 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6334 return -EINVAL; 6335 if (mddev->sync_thread || 6336 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6337 mddev->reshape_position != MaxSector) 6338 return -EBUSY; 6339 6340 rdev_for_each(rdev, mddev) { 6341 if (mddev->raid_disks < raid_disks && 6342 rdev->data_offset < rdev->new_data_offset) 6343 return -EINVAL; 6344 if (mddev->raid_disks > raid_disks && 6345 rdev->data_offset > rdev->new_data_offset) 6346 return -EINVAL; 6347 } 6348 6349 mddev->delta_disks = raid_disks - mddev->raid_disks; 6350 if (mddev->delta_disks < 0) 6351 mddev->reshape_backwards = 1; 6352 else if (mddev->delta_disks > 0) 6353 mddev->reshape_backwards = 0; 6354 6355 rv = mddev->pers->check_reshape(mddev); 6356 if (rv < 0) { 6357 mddev->delta_disks = 0; 6358 mddev->reshape_backwards = 0; 6359 } 6360 return rv; 6361} 6362 6363/* 6364 * update_array_info is used to change the configuration of an 6365 * on-line array. 6366 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 6367 * fields in the info are checked against the array. 6368 * Any differences that cannot be handled will cause an error. 6369 * Normally, only one change can be managed at a time. 6370 */ 6371static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 6372{ 6373 int rv = 0; 6374 int cnt = 0; 6375 int state = 0; 6376 6377 /* calculate expected state,ignoring low bits */ 6378 if (mddev->bitmap && mddev->bitmap_info.offset) 6379 state |= (1 << MD_SB_BITMAP_PRESENT); 6380 6381 if (mddev->major_version != info->major_version || 6382 mddev->minor_version != info->minor_version || 6383/* mddev->patch_version != info->patch_version || */ 6384 mddev->ctime != info->ctime || 6385 mddev->level != info->level || 6386/* mddev->layout != info->layout || */ 6387 mddev->persistent != !info->not_persistent || 6388 mddev->chunk_sectors != info->chunk_size >> 9 || 6389 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 6390 ((state^info->state) & 0xfffffe00) 6391 ) 6392 return -EINVAL; 6393 /* Check there is only one change */ 6394 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6395 cnt++; 6396 if (mddev->raid_disks != info->raid_disks) 6397 cnt++; 6398 if (mddev->layout != info->layout) 6399 cnt++; 6400 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 6401 cnt++; 6402 if (cnt == 0) 6403 return 0; 6404 if (cnt > 1) 6405 return -EINVAL; 6406 6407 if (mddev->layout != info->layout) { 6408 /* Change layout 6409 * we don't need to do anything at the md level, the 6410 * personality will take care of it all. 
6411 */ 6412 if (mddev->pers->check_reshape == NULL) 6413 return -EINVAL; 6414 else { 6415 mddev->new_layout = info->layout; 6416 rv = mddev->pers->check_reshape(mddev); 6417 if (rv) 6418 mddev->new_layout = mddev->layout; 6419 return rv; 6420 } 6421 } 6422 if (mddev_is_clustered(mddev)) 6423 md_cluster_ops->metadata_update_start(mddev); 6424 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6425 rv = update_size(mddev, (sector_t)info->size * 2); 6426 6427 if (mddev->raid_disks != info->raid_disks) 6428 rv = update_raid_disks(mddev, info->raid_disks); 6429 6430 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 6431 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 6432 rv = -EINVAL; 6433 goto err; 6434 } 6435 if (mddev->recovery || mddev->sync_thread) { 6436 rv = -EBUSY; 6437 goto err; 6438 } 6439 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 6440 struct bitmap *bitmap; 6441 /* add the bitmap */ 6442 if (mddev->bitmap) { 6443 rv = -EEXIST; 6444 goto err; 6445 } 6446 if (mddev->bitmap_info.default_offset == 0) { 6447 rv = -EINVAL; 6448 goto err; 6449 } 6450 mddev->bitmap_info.offset = 6451 mddev->bitmap_info.default_offset; 6452 mddev->bitmap_info.space = 6453 mddev->bitmap_info.default_space; 6454 mddev->pers->quiesce(mddev, 1); 6455 bitmap = bitmap_create(mddev, -1); 6456 if (!IS_ERR(bitmap)) { 6457 mddev->bitmap = bitmap; 6458 rv = bitmap_load(mddev); 6459 } else 6460 rv = PTR_ERR(bitmap); 6461 if (rv) 6462 bitmap_destroy(mddev); 6463 mddev->pers->quiesce(mddev, 0); 6464 } else { 6465 /* remove the bitmap */ 6466 if (!mddev->bitmap) { 6467 rv = -ENOENT; 6468 goto err; 6469 } 6470 if (mddev->bitmap->storage.file) { 6471 rv = -EINVAL; 6472 goto err; 6473 } 6474 mddev->pers->quiesce(mddev, 1); 6475 bitmap_destroy(mddev); 6476 mddev->pers->quiesce(mddev, 0); 6477 mddev->bitmap_info.offset = 0; 6478 } 6479 } 6480 md_update_sb(mddev, 1); 6481 if (mddev_is_clustered(mddev)) 6482 md_cluster_ops->metadata_update_finish(mddev); 6483 return rv; 6484err: 6485 if (mddev_is_clustered(mddev)) 6486 md_cluster_ops->metadata_update_cancel(mddev); 6487 return rv; 6488} 6489 6490static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6491{ 6492 struct md_rdev *rdev; 6493 int err = 0; 6494 6495 if (mddev->pers == NULL) 6496 return -ENODEV; 6497 6498 rcu_read_lock(); 6499 rdev = find_rdev_rcu(mddev, dev); 6500 if (!rdev) 6501 err = -ENODEV; 6502 else { 6503 md_error(mddev, rdev); 6504 if (!test_bit(Faulty, &rdev->flags)) 6505 err = -EBUSY; 6506 } 6507 rcu_read_unlock(); 6508 return err; 6509} 6510 6511/* 6512 * We have a problem here : there is no easy way to give a CHS 6513 * virtual geometry. We currently pretend that we have a 2 heads 6514 * 4 sectors (with a BIG number of cylinders...). This drives 6515 * dosfs just mad... 
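 * (For scale: with the heads=2/sectors=4 fake geometry used below, a
 * "cylinder" is 8 sectors = 4KiB, so a 1TiB array of 2147483648
 * sectors reports 268435456 cylinders.)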
;-) 6516 */ 6517static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6518{ 6519 struct mddev *mddev = bdev->bd_disk->private_data; 6520 6521 geo->heads = 2; 6522 geo->sectors = 4; 6523 geo->cylinders = mddev->array_sectors / 8; 6524 return 0; 6525} 6526 6527static inline bool md_ioctl_valid(unsigned int cmd) 6528{ 6529 switch (cmd) { 6530 case ADD_NEW_DISK: 6531 case BLKROSET: 6532 case GET_ARRAY_INFO: 6533 case GET_BITMAP_FILE: 6534 case GET_DISK_INFO: 6535 case HOT_ADD_DISK: 6536 case HOT_REMOVE_DISK: 6537 case RAID_AUTORUN: 6538 case RAID_VERSION: 6539 case RESTART_ARRAY_RW: 6540 case RUN_ARRAY: 6541 case SET_ARRAY_INFO: 6542 case SET_BITMAP_FILE: 6543 case SET_DISK_FAULTY: 6544 case STOP_ARRAY: 6545 case STOP_ARRAY_RO: 6546 case CLUSTERED_DISK_NACK: 6547 return true; 6548 default: 6549 return false; 6550 } 6551} 6552 6553static int md_ioctl(struct block_device *bdev, fmode_t mode, 6554 unsigned int cmd, unsigned long arg) 6555{ 6556 int err = 0; 6557 void __user *argp = (void __user *)arg; 6558 struct mddev *mddev = NULL; 6559 int ro; 6560 6561 if (!md_ioctl_valid(cmd)) 6562 return -ENOTTY; 6563 6564 switch (cmd) { 6565 case RAID_VERSION: 6566 case GET_ARRAY_INFO: 6567 case GET_DISK_INFO: 6568 break; 6569 default: 6570 if (!capable(CAP_SYS_ADMIN)) 6571 return -EACCES; 6572 } 6573 6574 /* 6575 * Commands dealing with the RAID driver but not any 6576 * particular array: 6577 */ 6578 switch (cmd) { 6579 case RAID_VERSION: 6580 err = get_version(argp); 6581 goto out; 6582 6583#ifndef MODULE 6584 case RAID_AUTORUN: 6585 err = 0; 6586 autostart_arrays(arg); 6587 goto out; 6588#endif 6589 default:; 6590 } 6591 6592 /* 6593 * Commands creating/starting a new array: 6594 */ 6595 6596 mddev = bdev->bd_disk->private_data; 6597 6598 if (!mddev) { 6599 BUG(); 6600 goto out; 6601 } 6602 6603 /* Some actions do not requires the mutex */ 6604 switch (cmd) { 6605 case GET_ARRAY_INFO: 6606 if (!mddev->raid_disks && !mddev->external) 6607 err = -ENODEV; 6608 else 6609 err = get_array_info(mddev, argp); 6610 goto out; 6611 6612 case GET_DISK_INFO: 6613 if (!mddev->raid_disks && !mddev->external) 6614 err = -ENODEV; 6615 else 6616 err = get_disk_info(mddev, argp); 6617 goto out; 6618 6619 case SET_DISK_FAULTY: 6620 err = set_disk_faulty(mddev, new_decode_dev(arg)); 6621 goto out; 6622 6623 case GET_BITMAP_FILE: 6624 err = get_bitmap_file(mddev, argp); 6625 goto out; 6626 6627 } 6628 6629 if (cmd == ADD_NEW_DISK) 6630 /* need to ensure md_delayed_delete() has completed */ 6631 flush_workqueue(md_misc_wq); 6632 6633 if (cmd == HOT_REMOVE_DISK) 6634 /* need to ensure recovery thread has run */ 6635 wait_event_interruptible_timeout(mddev->sb_wait, 6636 !test_bit(MD_RECOVERY_NEEDED, 6637 &mddev->flags), 6638 msecs_to_jiffies(5000)); 6639 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 6640 /* Need to flush page cache, and ensure no-one else opens 6641 * and writes 6642 */ 6643 mutex_lock(&mddev->open_mutex); 6644 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 6645 mutex_unlock(&mddev->open_mutex); 6646 err = -EBUSY; 6647 goto out; 6648 } 6649 set_bit(MD_STILL_CLOSED, &mddev->flags); 6650 mutex_unlock(&mddev->open_mutex); 6651 sync_blockdev(bdev); 6652 } 6653 err = mddev_lock(mddev); 6654 if (err) { 6655 printk(KERN_INFO 6656 "md: ioctl lock interrupted, reason %d, cmd %d\n", 6657 err, cmd); 6658 goto out; 6659 } 6660 6661 if (cmd == SET_ARRAY_INFO) { 6662 mdu_array_info_t info; 6663 if (!arg) 6664 memset(&info, 0, sizeof(info)); 6665 else if (copy_from_user(&info, argp, 
sizeof(info))) { 6666 err = -EFAULT; 6667 goto unlock; 6668 } 6669 if (mddev->pers) { 6670 err = update_array_info(mddev, &info); 6671 if (err) { 6672 printk(KERN_WARNING "md: couldn't update" 6673 " array info. %d\n", err); 6674 goto unlock; 6675 } 6676 goto unlock; 6677 } 6678 if (!list_empty(&mddev->disks)) { 6679 printk(KERN_WARNING 6680 "md: array %s already has disks!\n", 6681 mdname(mddev)); 6682 err = -EBUSY; 6683 goto unlock; 6684 } 6685 if (mddev->raid_disks) { 6686 printk(KERN_WARNING 6687 "md: array %s already initialised!\n", 6688 mdname(mddev)); 6689 err = -EBUSY; 6690 goto unlock; 6691 } 6692 err = set_array_info(mddev, &info); 6693 if (err) { 6694 printk(KERN_WARNING "md: couldn't set" 6695 " array info. %d\n", err); 6696 goto unlock; 6697 } 6698 goto unlock; 6699 } 6700 6701 /* 6702 * Commands querying/configuring an existing array: 6703 */ 6704 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 6705 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 6706 if ((!mddev->raid_disks && !mddev->external) 6707 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 6708 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 6709 && cmd != GET_BITMAP_FILE) { 6710 err = -ENODEV; 6711 goto unlock; 6712 } 6713 6714 /* 6715 * Commands even a read-only array can execute: 6716 */ 6717 switch (cmd) { 6718 case RESTART_ARRAY_RW: 6719 err = restart_array(mddev); 6720 goto unlock; 6721 6722 case STOP_ARRAY: 6723 err = do_md_stop(mddev, 0, bdev); 6724 goto unlock; 6725 6726 case STOP_ARRAY_RO: 6727 err = md_set_readonly(mddev, bdev); 6728 goto unlock; 6729 6730 case HOT_REMOVE_DISK: 6731 err = hot_remove_disk(mddev, new_decode_dev(arg)); 6732 goto unlock; 6733 6734 case ADD_NEW_DISK: 6735 /* We can support ADD_NEW_DISK on read-only arrays 6736 * on if we are re-adding a preexisting device. 6737 * So require mddev->pers and MD_DISK_SYNC. 6738 */ 6739 if (mddev->pers) { 6740 mdu_disk_info_t info; 6741 if (copy_from_user(&info, argp, sizeof(info))) 6742 err = -EFAULT; 6743 else if (!(info.state & (1<<MD_DISK_SYNC))) 6744 /* Need to clear read-only for this */ 6745 break; 6746 else 6747 err = add_new_disk(mddev, &info); 6748 goto unlock; 6749 } 6750 break; 6751 6752 case BLKROSET: 6753 if (get_user(ro, (int __user *)(arg))) { 6754 err = -EFAULT; 6755 goto unlock; 6756 } 6757 err = -EINVAL; 6758 6759 /* if the bdev is going readonly the value of mddev->ro 6760 * does not matter, no writes are coming 6761 */ 6762 if (ro) 6763 goto unlock; 6764 6765 /* are we are already prepared for writes? */ 6766 if (mddev->ro != 1) 6767 goto unlock; 6768 6769 /* transitioning to readauto need only happen for 6770 * arrays that call md_write_start 6771 */ 6772 if (mddev->pers) { 6773 err = restart_array(mddev); 6774 if (err == 0) { 6775 mddev->ro = 2; 6776 set_disk_ro(mddev->gendisk, 0); 6777 } 6778 } 6779 goto unlock; 6780 } 6781 6782 /* 6783 * The remaining ioctls are changing the state of the 6784 * superblock, so we do not allow them on read-only arrays. 6785 */ 6786 if (mddev->ro && mddev->pers) { 6787 if (mddev->ro == 2) { 6788 mddev->ro = 0; 6789 sysfs_notify_dirent_safe(mddev->sysfs_state); 6790 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6791 /* mddev_unlock will wake thread */ 6792 /* If a device failed while we were read-only, we 6793 * need to make sure the metadata is updated now. 
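 * The block below therefore drops the mddev lock, waits on sb_wait for
 * MD_CHANGE_DEVS and MD_CHANGE_PENDING to clear (i.e. for the md thread
 * to push the superblocks out), and retakes the lock before the
 * write-path ioctl is allowed to proceed.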
6794 */ 6795 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 6796 mddev_unlock(mddev); 6797 wait_event(mddev->sb_wait, 6798 !test_bit(MD_CHANGE_DEVS, &mddev->flags) && 6799 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6800 mddev_lock_nointr(mddev); 6801 } 6802 } else { 6803 err = -EROFS; 6804 goto unlock; 6805 } 6806 } 6807 6808 switch (cmd) { 6809 case ADD_NEW_DISK: 6810 { 6811 mdu_disk_info_t info; 6812 if (copy_from_user(&info, argp, sizeof(info))) 6813 err = -EFAULT; 6814 else 6815 err = add_new_disk(mddev, &info); 6816 goto unlock; 6817 } 6818 6819 case CLUSTERED_DISK_NACK: 6820 if (mddev_is_clustered(mddev)) 6821 md_cluster_ops->new_disk_ack(mddev, false); 6822 else 6823 err = -EINVAL; 6824 goto unlock; 6825 6826 case HOT_ADD_DISK: 6827 err = hot_add_disk(mddev, new_decode_dev(arg)); 6828 goto unlock; 6829 6830 case RUN_ARRAY: 6831 err = do_md_run(mddev); 6832 goto unlock; 6833 6834 case SET_BITMAP_FILE: 6835 err = set_bitmap_file(mddev, (int)arg); 6836 goto unlock; 6837 6838 default: 6839 err = -EINVAL; 6840 goto unlock; 6841 } 6842 6843unlock: 6844 if (mddev->hold_active == UNTIL_IOCTL && 6845 err != -EINVAL) 6846 mddev->hold_active = 0; 6847 mddev_unlock(mddev); 6848out: 6849 return err; 6850} 6851#ifdef CONFIG_COMPAT 6852static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 6853 unsigned int cmd, unsigned long arg) 6854{ 6855 switch (cmd) { 6856 case HOT_REMOVE_DISK: 6857 case HOT_ADD_DISK: 6858 case SET_DISK_FAULTY: 6859 case SET_BITMAP_FILE: 6860 /* These take in integer arg, do not convert */ 6861 break; 6862 default: 6863 arg = (unsigned long)compat_ptr(arg); 6864 break; 6865 } 6866 6867 return md_ioctl(bdev, mode, cmd, arg); 6868} 6869#endif /* CONFIG_COMPAT */ 6870 6871static int md_open(struct block_device *bdev, fmode_t mode) 6872{ 6873 /* 6874 * Succeed if we can lock the mddev, which confirms that 6875 * it isn't being stopped right now. 6876 */ 6877 struct mddev *mddev = mddev_find(bdev->bd_dev); 6878 int err; 6879 6880 if (!mddev) 6881 return -ENODEV; 6882 6883 if (mddev->gendisk != bdev->bd_disk) { 6884 /* we are racing with mddev_put which is discarding this 6885 * bd_disk. 
6886 */ 6887 mddev_put(mddev); 6888 /* Wait until bdev->bd_disk is definitely gone */ 6889 flush_workqueue(md_misc_wq); 6890 /* Then retry the open from the top */ 6891 return -ERESTARTSYS; 6892 } 6893 BUG_ON(mddev != bdev->bd_disk->private_data); 6894 6895 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 6896 goto out; 6897 6898 err = 0; 6899 atomic_inc(&mddev->openers); 6900 clear_bit(MD_STILL_CLOSED, &mddev->flags); 6901 mutex_unlock(&mddev->open_mutex); 6902 6903 check_disk_change(bdev); 6904 out: 6905 return err; 6906} 6907 6908static void md_release(struct gendisk *disk, fmode_t mode) 6909{ 6910 struct mddev *mddev = disk->private_data; 6911 6912 BUG_ON(!mddev); 6913 atomic_dec(&mddev->openers); 6914 mddev_put(mddev); 6915} 6916 6917static int md_media_changed(struct gendisk *disk) 6918{ 6919 struct mddev *mddev = disk->private_data; 6920 6921 return mddev->changed; 6922} 6923 6924static int md_revalidate(struct gendisk *disk) 6925{ 6926 struct mddev *mddev = disk->private_data; 6927 6928 mddev->changed = 0; 6929 return 0; 6930} 6931static const struct block_device_operations md_fops = 6932{ 6933 .owner = THIS_MODULE, 6934 .open = md_open, 6935 .release = md_release, 6936 .ioctl = md_ioctl, 6937#ifdef CONFIG_COMPAT 6938 .compat_ioctl = md_compat_ioctl, 6939#endif 6940 .getgeo = md_getgeo, 6941 .media_changed = md_media_changed, 6942 .revalidate_disk= md_revalidate, 6943}; 6944 6945static int md_thread(void *arg) 6946{ 6947 struct md_thread *thread = arg; 6948 6949 /* 6950 * md_thread is a 'system-thread', it's priority should be very 6951 * high. We avoid resource deadlocks individually in each 6952 * raid personality. (RAID5 does preallocation) We also use RR and 6953 * the very same RT priority as kswapd, thus we will never get 6954 * into a priority inversion deadlock. 6955 * 6956 * we definitely have to have equal or higher priority than 6957 * bdflush, otherwise bdflush will deadlock if there are too 6958 * many dirty RAID5 blocks. 6959 */ 6960 6961 allow_signal(SIGKILL); 6962 while (!kthread_should_stop()) { 6963 6964 /* We need to wait INTERRUPTIBLE so that 6965 * we don't add to the load-average. 
6966 * That means we need to be sure no signals are 6967 * pending 6968 */ 6969 if (signal_pending(current)) 6970 flush_signals(current); 6971 6972 wait_event_interruptible_timeout 6973 (thread->wqueue, 6974 test_bit(THREAD_WAKEUP, &thread->flags) 6975 || kthread_should_stop(), 6976 thread->timeout); 6977 6978 clear_bit(THREAD_WAKEUP, &thread->flags); 6979 if (!kthread_should_stop()) 6980 thread->run(thread); 6981 } 6982 6983 return 0; 6984} 6985 6986void md_wakeup_thread(struct md_thread *thread) 6987{ 6988 if (thread) { 6989 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 6990 set_bit(THREAD_WAKEUP, &thread->flags); 6991 wake_up(&thread->wqueue); 6992 } 6993} 6994EXPORT_SYMBOL(md_wakeup_thread); 6995 6996struct md_thread *md_register_thread(void (*run) (struct md_thread *), 6997 struct mddev *mddev, const char *name) 6998{ 6999 struct md_thread *thread; 7000 7001 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 7002 if (!thread) 7003 return NULL; 7004 7005 init_waitqueue_head(&thread->wqueue); 7006 7007 thread->run = run; 7008 thread->mddev = mddev; 7009 thread->timeout = MAX_SCHEDULE_TIMEOUT; 7010 thread->tsk = kthread_run(md_thread, thread, 7011 "%s_%s", 7012 mdname(thread->mddev), 7013 name); 7014 if (IS_ERR(thread->tsk)) { 7015 kfree(thread); 7016 return NULL; 7017 } 7018 return thread; 7019} 7020EXPORT_SYMBOL(md_register_thread); 7021 7022void md_unregister_thread(struct md_thread **threadp) 7023{ 7024 struct md_thread *thread = *threadp; 7025 if (!thread) 7026 return; 7027 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 7028 /* Locking ensures that mddev_unlock does not wake_up a 7029 * non-existent thread 7030 */ 7031 spin_lock(&pers_lock); 7032 *threadp = NULL; 7033 spin_unlock(&pers_lock); 7034 7035 kthread_stop(thread->tsk); 7036 kfree(thread); 7037} 7038EXPORT_SYMBOL(md_unregister_thread); 7039 7040void md_error(struct mddev *mddev, struct md_rdev *rdev) 7041{ 7042 if (!rdev || test_bit(Faulty, &rdev->flags)) 7043 return; 7044 7045 if (!mddev->pers || !mddev->pers->error_handler) 7046 return; 7047 mddev->pers->error_handler(mddev,rdev); 7048 if (mddev->degraded) 7049 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7050 sysfs_notify_dirent_safe(rdev->sysfs_state); 7051 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7052 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7053 md_wakeup_thread(mddev->thread); 7054 if (mddev->event_work.func) 7055 queue_work(md_misc_wq, &mddev->event_work); 7056 md_new_event_inintr(mddev); 7057} 7058EXPORT_SYMBOL(md_error); 7059 7060/* seq_file implementation /proc/mdstat */ 7061 7062static void status_unused(struct seq_file *seq) 7063{ 7064 int i = 0; 7065 struct md_rdev *rdev; 7066 7067 seq_printf(seq, "unused devices: "); 7068 7069 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 7070 char b[BDEVNAME_SIZE]; 7071 i++; 7072 seq_printf(seq, "%s ", 7073 bdevname(rdev->bdev,b)); 7074 } 7075 if (!i) 7076 seq_printf(seq, "<none>"); 7077 7078 seq_printf(seq, "\n"); 7079} 7080 7081static void status_resync(struct seq_file *seq, struct mddev *mddev) 7082{ 7083 sector_t max_sectors, resync, res; 7084 unsigned long dt, db; 7085 sector_t rt; 7086 int scale; 7087 unsigned int per_milli; 7088 7089 if (mddev->curr_resync <= 3) 7090 resync = 0; 7091 else 7092 resync = mddev->curr_resync 7093 - atomic_read(&mddev->recovery_active); 7094 7095 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7096 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7097 max_sectors = mddev->resync_max_sectors; 7098 else 7099 
max_sectors = mddev->dev_sectors; 7100 7101 WARN_ON(max_sectors == 0); 7102 /* Pick 'scale' such that (resync>>scale)*1000 will fit 7103 * in a sector_t, and (max_sectors>>scale) will fit in a 7104 * u32, as those are the requirements for sector_div. 7105 * Thus 'scale' must be at least 10 7106 */ 7107 scale = 10; 7108 if (sizeof(sector_t) > sizeof(unsigned long)) { 7109 while ( max_sectors/2 > (1ULL<<(scale+32))) 7110 scale++; 7111 } 7112 res = (resync>>scale)*1000; 7113 sector_div(res, (u32)((max_sectors>>scale)+1)); 7114 7115 per_milli = res; 7116 { 7117 int i, x = per_milli/50, y = 20-x; 7118 seq_printf(seq, "["); 7119 for (i = 0; i < x; i++) 7120 seq_printf(seq, "="); 7121 seq_printf(seq, ">"); 7122 for (i = 0; i < y; i++) 7123 seq_printf(seq, "."); 7124 seq_printf(seq, "] "); 7125 } 7126 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 7127 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 7128 "reshape" : 7129 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 7130 "check" : 7131 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 7132 "resync" : "recovery"))), 7133 per_milli/10, per_milli % 10, 7134 (unsigned long long) resync/2, 7135 (unsigned long long) max_sectors/2); 7136 7137 /* 7138 * dt: time from mark until now 7139 * db: blocks written from mark until now 7140 * rt: remaining time 7141 * 7142 * rt is a sector_t, so could be 32bit or 64bit. 7143 * So we divide before multiply in case it is 32bit and close 7144 * to the limit. 7145 * We scale the divisor (db) by 32 to avoid losing precision 7146 * near the end of resync when the number of remaining sectors 7147 * is close to 'db'. 7148 * We then divide rt by 32 after multiplying by db to compensate. 7149 * The '+1' avoids division by zero if db is very small. 7150 */ 7151 dt = ((jiffies - mddev->resync_mark) / HZ); 7152 if (!dt) dt++; 7153 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 7154 - mddev->resync_mark_cnt; 7155 7156 rt = max_sectors - resync; /* number of remaining sectors */ 7157 sector_div(rt, db/32+1); 7158 rt *= dt; 7159 rt >>= 5; 7160 7161 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 7162 ((unsigned long)rt % 60)/6); 7163 7164 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 7165} 7166 7167static void *md_seq_start(struct seq_file *seq, loff_t *pos) 7168{ 7169 struct list_head *tmp; 7170 loff_t l = *pos; 7171 struct mddev *mddev; 7172 7173 if (l >= 0x10000) 7174 return NULL; 7175 if (!l--) 7176 /* header */ 7177 return (void*)1; 7178 7179 spin_lock(&all_mddevs_lock); 7180 list_for_each(tmp,&all_mddevs) 7181 if (!l--) { 7182 mddev = list_entry(tmp, struct mddev, all_mddevs); 7183 mddev_get(mddev); 7184 spin_unlock(&all_mddevs_lock); 7185 return mddev; 7186 } 7187 spin_unlock(&all_mddevs_lock); 7188 if (!l--) 7189 return (void*)2;/* tail */ 7190 return NULL; 7191} 7192 7193static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 7194{ 7195 struct list_head *tmp; 7196 struct mddev *next_mddev, *mddev = v; 7197 7198 ++*pos; 7199 if (v == (void*)2) 7200 return NULL; 7201 7202 spin_lock(&all_mddevs_lock); 7203 if (v == (void*)1) 7204 tmp = all_mddevs.next; 7205 else 7206 tmp = mddev->all_mddevs.next; 7207 if (tmp != &all_mddevs) 7208 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 7209 else { 7210 next_mddev = (void*)2; 7211 *pos = 0x10000; 7212 } 7213 spin_unlock(&all_mddevs_lock); 7214 7215 if (v != (void*)1) 7216 mddev_put(mddev); 7217 return next_mddev; 7218 7219} 7220 7221static void md_seq_stop(struct seq_file *seq, void *v) 7222{ 7223 struct mddev *mddev = v; 
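	/* The seq iterator above hands out (void *)1 for the
	 * "Personalities :" header row and (void *)2 for the trailing
	 * "unused devices:" row; only the real mddev pointers in between
	 * carry a reference from mddev_get(), which is why the put below
	 * skips the two sentinel values.
	 */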
7224 7225 if (mddev && v != (void*)1 && v != (void*)2) 7226 mddev_put(mddev); 7227} 7228 7229static int md_seq_show(struct seq_file *seq, void *v) 7230{ 7231 struct mddev *mddev = v; 7232 sector_t sectors; 7233 struct md_rdev *rdev; 7234 7235 if (v == (void*)1) { 7236 struct md_personality *pers; 7237 seq_printf(seq, "Personalities : "); 7238 spin_lock(&pers_lock); 7239 list_for_each_entry(pers, &pers_list, list) 7240 seq_printf(seq, "[%s] ", pers->name); 7241 7242 spin_unlock(&pers_lock); 7243 seq_printf(seq, "\n"); 7244 seq->poll_event = atomic_read(&md_event_count); 7245 return 0; 7246 } 7247 if (v == (void*)2) { 7248 status_unused(seq); 7249 return 0; 7250 } 7251 7252 spin_lock(&mddev->lock); 7253 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7254 seq_printf(seq, "%s : %sactive", mdname(mddev), 7255 mddev->pers ? "" : "in"); 7256 if (mddev->pers) { 7257 if (mddev->ro==1) 7258 seq_printf(seq, " (read-only)"); 7259 if (mddev->ro==2) 7260 seq_printf(seq, " (auto-read-only)"); 7261 seq_printf(seq, " %s", mddev->pers->name); 7262 } 7263 7264 sectors = 0; 7265 rcu_read_lock(); 7266 rdev_for_each_rcu(rdev, mddev) { 7267 char b[BDEVNAME_SIZE]; 7268 seq_printf(seq, " %s[%d]", 7269 bdevname(rdev->bdev,b), rdev->desc_nr); 7270 if (test_bit(WriteMostly, &rdev->flags)) 7271 seq_printf(seq, "(W)"); 7272 if (test_bit(Faulty, &rdev->flags)) { 7273 seq_printf(seq, "(F)"); 7274 continue; 7275 } 7276 if (rdev->raid_disk < 0) 7277 seq_printf(seq, "(S)"); /* spare */ 7278 if (test_bit(Replacement, &rdev->flags)) 7279 seq_printf(seq, "(R)"); 7280 sectors += rdev->sectors; 7281 } 7282 rcu_read_unlock(); 7283 7284 if (!list_empty(&mddev->disks)) { 7285 if (mddev->pers) 7286 seq_printf(seq, "\n %llu blocks", 7287 (unsigned long long) 7288 mddev->array_sectors / 2); 7289 else 7290 seq_printf(seq, "\n %llu blocks", 7291 (unsigned long long)sectors / 2); 7292 } 7293 if (mddev->persistent) { 7294 if (mddev->major_version != 0 || 7295 mddev->minor_version != 90) { 7296 seq_printf(seq," super %d.%d", 7297 mddev->major_version, 7298 mddev->minor_version); 7299 } 7300 } else if (mddev->external) 7301 seq_printf(seq, " super external:%s", 7302 mddev->metadata_type); 7303 else 7304 seq_printf(seq, " super non-persistent"); 7305 7306 if (mddev->pers) { 7307 mddev->pers->status(seq, mddev); 7308 seq_printf(seq, "\n "); 7309 if (mddev->pers->sync_request) { 7310 if (mddev->curr_resync > 2) { 7311 status_resync(seq, mddev); 7312 seq_printf(seq, "\n "); 7313 } else if (mddev->curr_resync >= 1) 7314 seq_printf(seq, "\tresync=DELAYED\n "); 7315 else if (mddev->recovery_cp < MaxSector) 7316 seq_printf(seq, "\tresync=PENDING\n "); 7317 } 7318 } else 7319 seq_printf(seq, "\n "); 7320 7321 bitmap_status(seq, mddev->bitmap); 7322 7323 seq_printf(seq, "\n"); 7324 } 7325 spin_unlock(&mddev->lock); 7326 7327 return 0; 7328} 7329 7330static const struct seq_operations md_seq_ops = { 7331 .start = md_seq_start, 7332 .next = md_seq_next, 7333 .stop = md_seq_stop, 7334 .show = md_seq_show, 7335}; 7336 7337static int md_seq_open(struct inode *inode, struct file *file) 7338{ 7339 struct seq_file *seq; 7340 int error; 7341 7342 error = seq_open(file, &md_seq_ops); 7343 if (error) 7344 return error; 7345 7346 seq = file->private_data; 7347 seq->poll_event = atomic_read(&md_event_count); 7348 return error; 7349} 7350 7351static int md_unloading; 7352static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 7353{ 7354 struct seq_file *seq = filp->private_data; 7355 int mask; 7356 7357 if (md_unloading) 7358 
return POLLIN|POLLRDNORM|POLLERR|POLLPRI; 7359 poll_wait(filp, &md_event_waiters, wait); 7360 7361 /* always allow read */ 7362 mask = POLLIN | POLLRDNORM; 7363 7364 if (seq->poll_event != atomic_read(&md_event_count)) 7365 mask |= POLLERR | POLLPRI; 7366 return mask; 7367} 7368 7369static const struct file_operations md_seq_fops = { 7370 .owner = THIS_MODULE, 7371 .open = md_seq_open, 7372 .read = seq_read, 7373 .llseek = seq_lseek, 7374 .release = seq_release_private, 7375 .poll = mdstat_poll, 7376}; 7377 7378int register_md_personality(struct md_personality *p) 7379{ 7380 printk(KERN_INFO "md: %s personality registered for level %d\n", 7381 p->name, p->level); 7382 spin_lock(&pers_lock); 7383 list_add_tail(&p->list, &pers_list); 7384 spin_unlock(&pers_lock); 7385 return 0; 7386} 7387EXPORT_SYMBOL(register_md_personality); 7388 7389int unregister_md_personality(struct md_personality *p) 7390{ 7391 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 7392 spin_lock(&pers_lock); 7393 list_del_init(&p->list); 7394 spin_unlock(&pers_lock); 7395 return 0; 7396} 7397EXPORT_SYMBOL(unregister_md_personality); 7398 7399int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) 7400{ 7401 if (md_cluster_ops != NULL) 7402 return -EALREADY; 7403 spin_lock(&pers_lock); 7404 md_cluster_ops = ops; 7405 md_cluster_mod = module; 7406 spin_unlock(&pers_lock); 7407 return 0; 7408} 7409EXPORT_SYMBOL(register_md_cluster_operations); 7410 7411int unregister_md_cluster_operations(void) 7412{ 7413 spin_lock(&pers_lock); 7414 md_cluster_ops = NULL; 7415 spin_unlock(&pers_lock); 7416 return 0; 7417} 7418EXPORT_SYMBOL(unregister_md_cluster_operations); 7419 7420int md_setup_cluster(struct mddev *mddev, int nodes) 7421{ 7422 int err; 7423 7424 err = request_module("md-cluster"); 7425 if (err) { 7426 pr_err("md-cluster module not found.\n"); 7427 return err; 7428 } 7429 7430 spin_lock(&pers_lock); 7431 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 7432 spin_unlock(&pers_lock); 7433 return -ENOENT; 7434 } 7435 spin_unlock(&pers_lock); 7436 7437 return md_cluster_ops->join(mddev, nodes); 7438} 7439 7440void md_cluster_stop(struct mddev *mddev) 7441{ 7442 if (!md_cluster_ops) 7443 return; 7444 md_cluster_ops->leave(mddev); 7445 module_put(md_cluster_mod); 7446} 7447 7448static int is_mddev_idle(struct mddev *mddev, int init) 7449{ 7450 struct md_rdev *rdev; 7451 int idle; 7452 int curr_events; 7453 7454 idle = 1; 7455 rcu_read_lock(); 7456 rdev_for_each_rcu(rdev, mddev) { 7457 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 7458 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 7459 (int)part_stat_read(&disk->part0, sectors[1]) - 7460 atomic_read(&disk->sync_io); 7461 /* sync IO will cause sync_io to increase before the disk_stats 7462 * as sync_io is counted when a request starts, and 7463 * disk_stats is counted when it completes. 7464 * So resync activity will cause curr_events to be smaller than 7465 * when there was no such activity. 7466 * non-sync IO will cause disk_stat to increase without 7467 * increasing sync_io so curr_events will (eventually) 7468 * be larger than it was before. Once it becomes 7469 * substantially larger, the test below will cause 7470 * the array to appear non-idle, and resync will slow 7471 * down. 
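 * For instance (illustrative numbers only): if a member disk has
 * seen 10000 sectors of total I/O in its part_stat counters, of
 * which 9990 were submitted as sync_io by resync, curr_events is
 * 10; only when curr_events drifts more than 64 sectors ahead of
 * rdev->last_events does that disk make the array look non-idle.
 *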
7472 * If there is a lot of outstanding resync activity when 7473 * we set last_event to curr_events, then all that activity 7474 * completing might cause the array to appear non-idle 7475 * and resync will be slowed down even though there might 7476 * not have been non-resync activity. This will only 7477 * happen once though. 'last_events' will soon reflect 7478 * the state where there is little or no outstanding 7479 * resync requests, and further resync activity will 7480 * always make curr_events less than last_events. 7481 * 7482 */ 7483 if (init || curr_events - rdev->last_events > 64) { 7484 rdev->last_events = curr_events; 7485 idle = 0; 7486 } 7487 } 7488 rcu_read_unlock(); 7489 return idle; 7490} 7491 7492void md_done_sync(struct mddev *mddev, int blocks, int ok) 7493{ 7494 /* another "blocks" (512byte) blocks have been synced */ 7495 atomic_sub(blocks, &mddev->recovery_active); 7496 wake_up(&mddev->recovery_wait); 7497 if (!ok) { 7498 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7499 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 7500 md_wakeup_thread(mddev->thread); 7501 // stop recovery, signal do_sync .... 7502 } 7503} 7504EXPORT_SYMBOL(md_done_sync); 7505 7506/* md_write_start(mddev, bi) 7507 * If we need to update some array metadata (e.g. 'active' flag 7508 * in superblock) before writing, schedule a superblock update 7509 * and wait for it to complete. 7510 */ 7511void md_write_start(struct mddev *mddev, struct bio *bi) 7512{ 7513 int did_change = 0; 7514 if (bio_data_dir(bi) != WRITE) 7515 return; 7516 7517 BUG_ON(mddev->ro == 1); 7518 if (mddev->ro == 2) { 7519 /* need to switch to read/write */ 7520 mddev->ro = 0; 7521 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7522 md_wakeup_thread(mddev->thread); 7523 md_wakeup_thread(mddev->sync_thread); 7524 did_change = 1; 7525 } 7526 atomic_inc(&mddev->writes_pending); 7527 if (mddev->safemode == 1) 7528 mddev->safemode = 0; 7529 if (mddev->in_sync) { 7530 spin_lock(&mddev->lock); 7531 if (mddev->in_sync) { 7532 mddev->in_sync = 0; 7533 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7534 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7535 md_wakeup_thread(mddev->thread); 7536 did_change = 1; 7537 } 7538 spin_unlock(&mddev->lock); 7539 } 7540 if (did_change) 7541 sysfs_notify_dirent_safe(mddev->sysfs_state); 7542 wait_event(mddev->sb_wait, 7543 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 7544} 7545EXPORT_SYMBOL(md_write_start); 7546 7547void md_write_end(struct mddev *mddev) 7548{ 7549 if (atomic_dec_and_test(&mddev->writes_pending)) { 7550 if (mddev->safemode == 2) 7551 md_wakeup_thread(mddev->thread); 7552 else if (mddev->safemode_delay) 7553 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 7554 } 7555} 7556EXPORT_SYMBOL(md_write_end); 7557 7558/* md_allow_write(mddev) 7559 * Calling this ensures that the array is marked 'active' so that writes 7560 * may proceed without blocking. It is important to call this before 7561 * attempting a GFP_KERNEL allocation while holding the mddev lock. 7562 * Must be called with mddev_lock held. 7563 * 7564 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 7565 * is dropped, so return -EAGAIN after notifying userspace. 
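 *
 * One possible caller pattern (sketch only; "new" and "size" are
 * placeholder names, and real callers differ in how they react to
 * -EAGAIN):
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;
 *	new = kzalloc(size, GFP_KERNEL);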
7566 */ 7567 int md_allow_write(struct mddev *mddev) 7568 { 7569 if (!mddev->pers) 7570 return 0; 7571 if (mddev->ro) 7572 return 0; 7573 if (!mddev->pers->sync_request) 7574 return 0; 7575 7576 spin_lock(&mddev->lock); 7577 if (mddev->in_sync) { 7578 mddev->in_sync = 0; 7579 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7580 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7581 if (mddev->safemode_delay && 7582 mddev->safemode == 0) 7583 mddev->safemode = 1; 7584 spin_unlock(&mddev->lock); 7585 if (mddev_is_clustered(mddev)) 7586 md_cluster_ops->metadata_update_start(mddev); 7587 md_update_sb(mddev, 0); 7588 if (mddev_is_clustered(mddev)) 7589 md_cluster_ops->metadata_update_finish(mddev); 7590 sysfs_notify_dirent_safe(mddev->sysfs_state); 7591 } else 7592 spin_unlock(&mddev->lock); 7593 7594 if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 7595 return -EAGAIN; 7596 else 7597 return 0; 7598 } 7599 EXPORT_SYMBOL_GPL(md_allow_write); 7600 7601 #define SYNC_MARKS 10 7602 #define SYNC_MARK_STEP (3*HZ) 7603 #define UPDATE_FREQUENCY (5*60*HZ) 7604 void md_do_sync(struct md_thread *thread) 7605 { 7606 struct mddev *mddev = thread->mddev; 7607 struct mddev *mddev2; 7608 unsigned int currspeed = 0, 7609 window; 7610 sector_t max_sectors,j, io_sectors, recovery_done; 7611 unsigned long mark[SYNC_MARKS]; 7612 unsigned long update_time; 7613 sector_t mark_cnt[SYNC_MARKS]; 7614 int last_mark,m; 7615 struct list_head *tmp; 7616 sector_t last_check; 7617 int skipped = 0; 7618 struct md_rdev *rdev; 7619 char *desc, *action = NULL; 7620 struct blk_plug plug; 7621 7622 /* just in case the thread restarts... */ 7623 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7624 return; 7625 if (mddev->ro) {/* never try to sync a read-only array */ 7626 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7627 return; 7628 } 7629 7630 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7631 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 7632 desc = "data-check"; 7633 action = "check"; 7634 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7635 desc = "requested-resync"; 7636 action = "repair"; 7637 } else 7638 desc = "resync"; 7639 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7640 desc = "reshape"; 7641 else 7642 desc = "recovery"; 7643 7644 mddev->last_sync_action = action ?: desc; 7645 7646 /* we overload curr_resync somewhat here. 7647 * 0 == not engaged in resync at all 7648 * 2 == checking that there is no conflict with another sync 7649 * 1 == like 2, but have yielded to allow conflicting resync to 7650 * commence 7651 * other == active in resync - this many blocks 7652 * 7653 * Before starting a resync we must have set curr_resync to 7654 * 2, and then checked that every "conflicting" array has curr_resync 7655 * less than ours. When we find one that is the same or higher 7656 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 7657 * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure). 7658 * This will mean we have to start checking from the beginning again.
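 *
 * For example (illustrative): if md0 and md1 share a physical disk and
 * both want to resync, each first sets curr_resync to 2; the loop below
 * then makes the mddev with the lower address drop back to 1 and wait
 * on resync_wait, while the higher-addressed one keeps 2 and proceeds.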
7659 * 7660 */ 7661 7662 do { 7663 mddev->curr_resync = 2; 7664 7665 try_again: 7666 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7667 goto skip; 7668 for_each_mddev(mddev2, tmp) { 7669 if (mddev2 == mddev) 7670 continue; 7671 if (!mddev->parallel_resync 7672 && mddev2->curr_resync 7673 && match_mddev_units(mddev, mddev2)) { 7674 DEFINE_WAIT(wq); 7675 if (mddev < mddev2 && mddev->curr_resync == 2) { 7676 /* arbitrarily yield */ 7677 mddev->curr_resync = 1; 7678 wake_up(&resync_wait); 7679 } 7680 if (mddev > mddev2 && mddev->curr_resync == 1) 7681 /* no need to wait here, we can wait the next 7682 * time 'round when curr_resync == 2 7683 */ 7684 continue; 7685 /* We need to wait 'interruptible' so as not to 7686 * contribute to the load average, and not to 7687 * be caught by 'softlockup' 7688 */ 7689 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7690 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7691 mddev2->curr_resync >= mddev->curr_resync) { 7692 printk(KERN_INFO "md: delaying %s of %s" 7693 " until %s has finished (they" 7694 " share one or more physical units)\n", 7695 desc, mdname(mddev), mdname(mddev2)); 7696 mddev_put(mddev2); 7697 if (signal_pending(current)) 7698 flush_signals(current); 7699 schedule(); 7700 finish_wait(&resync_wait, &wq); 7701 goto try_again; 7702 } 7703 finish_wait(&resync_wait, &wq); 7704 } 7705 } 7706 } while (mddev->curr_resync < 2); 7707 7708 j = 0; 7709 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7710 /* resync follows the size requested by the personality, 7711 * which defaults to physical size, but can be virtual size 7712 */ 7713 max_sectors = mddev->resync_max_sectors; 7714 atomic64_set(&mddev->resync_mismatches, 0); 7715 /* we don't use the checkpoint if there's a bitmap */ 7716 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7717 j = mddev->resync_min; 7718 else if (!mddev->bitmap) 7719 j = mddev->recovery_cp; 7720 7721 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7722 max_sectors = mddev->resync_max_sectors; 7723 else { 7724 /* recovery follows the physical size of devices */ 7725 max_sectors = mddev->dev_sectors; 7726 j = MaxSector; 7727 rcu_read_lock(); 7728 rdev_for_each_rcu(rdev, mddev) 7729 if (rdev->raid_disk >= 0 && 7730 !test_bit(Faulty, &rdev->flags) && 7731 !test_bit(In_sync, &rdev->flags) && 7732 rdev->recovery_offset < j) 7733 j = rdev->recovery_offset; 7734 rcu_read_unlock(); 7735 7736 /* If there is a bitmap, we need to make sure all 7737 * writes that started before we added a spare 7738 * complete before we start doing a recovery. 7739 * Otherwise the write might complete and (via 7740 * bitmap_endwrite) set a bit in the bitmap after the 7741 * recovery has checked that bit and skipped that 7742 * region. 
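 * (The quiesce(1)/quiesce(0) pair below acts purely as a write barrier:
 * it waits for writes that were already in flight to complete and then
 * immediately lets normal I/O continue.)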
7743 */ 7744 if (mddev->bitmap) { 7745 mddev->pers->quiesce(mddev, 1); 7746 mddev->pers->quiesce(mddev, 0); 7747 } 7748 } 7749 7750 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 7751 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 7752 " %d KB/sec/disk.\n", speed_min(mddev)); 7753 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 7754 "(but not more than %d KB/sec) for %s.\n", 7755 speed_max(mddev), desc); 7756 7757 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 7758 7759 io_sectors = 0; 7760 for (m = 0; m < SYNC_MARKS; m++) { 7761 mark[m] = jiffies; 7762 mark_cnt[m] = io_sectors; 7763 } 7764 last_mark = 0; 7765 mddev->resync_mark = mark[last_mark]; 7766 mddev->resync_mark_cnt = mark_cnt[last_mark]; 7767 7768 /* 7769 * Tune reconstruction: 7770 */ 7771 window = 32*(PAGE_SIZE/512); 7772 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", 7773 window/2, (unsigned long long)max_sectors/2); 7774 7775 atomic_set(&mddev->recovery_active, 0); 7776 last_check = 0; 7777 7778 if (j>2) { 7779 printk(KERN_INFO 7780 "md: resuming %s of %s from checkpoint.\n", 7781 desc, mdname(mddev)); 7782 mddev->curr_resync = j; 7783 } else 7784 mddev->curr_resync = 3; /* no longer delayed */ 7785 mddev->curr_resync_completed = j; 7786 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7787 md_new_event(mddev); 7788 update_time = jiffies; 7789 7790 if (mddev_is_clustered(mddev)) 7791 md_cluster_ops->resync_start(mddev, j, max_sectors); 7792 7793 blk_start_plug(&plug); 7794 while (j < max_sectors) { 7795 sector_t sectors; 7796 7797 skipped = 0; 7798 7799 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7800 ((mddev->curr_resync > mddev->curr_resync_completed && 7801 (mddev->curr_resync - mddev->curr_resync_completed) 7802 > (max_sectors >> 4)) || 7803 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 7804 (j - mddev->curr_resync_completed)*2 7805 >= mddev->resync_max - mddev->curr_resync_completed 7806 )) { 7807 /* time to update curr_resync_completed */ 7808 wait_event(mddev->recovery_wait, 7809 atomic_read(&mddev->recovery_active) == 0); 7810 mddev->curr_resync_completed = j; 7811 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 7812 j > mddev->recovery_cp) 7813 mddev->recovery_cp = j; 7814 update_time = jiffies; 7815 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7816 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7817 } 7818 7819 while (j >= mddev->resync_max && 7820 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7821 /* As this condition is controlled by user-space, 7822 * we can block indefinitely, so use '_interruptible' 7823 * to avoid triggering warnings. 
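 * (This is the wait behind the sync_max sysfs file, e.g.
 * "echo max > /sys/block/mdX/md/sync_max"; raising the limit is
 * expected to wake recovery_wait so the loop below can continue.)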
7824 */ 7825 flush_signals(current); /* just in case */ 7826 wait_event_interruptible(mddev->recovery_wait, 7827 mddev->resync_max > j 7828 || test_bit(MD_RECOVERY_INTR, 7829 &mddev->recovery)); 7830 } 7831 7832 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7833 break; 7834 7835 sectors = mddev->pers->sync_request(mddev, j, &skipped); 7836 if (sectors == 0) { 7837 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7838 break; 7839 } 7840 7841 if (!skipped) { /* actual IO requested */ 7842 io_sectors += sectors; 7843 atomic_add(sectors, &mddev->recovery_active); 7844 } 7845 7846 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7847 break; 7848 7849 j += sectors; 7850 if (j > 2) 7851 mddev->curr_resync = j; 7852 if (mddev_is_clustered(mddev)) 7853 md_cluster_ops->resync_info_update(mddev, j, max_sectors); 7854 mddev->curr_mark_cnt = io_sectors; 7855 if (last_check == 0) 7856 /* this is the earliest that rebuild will be 7857 * visible in /proc/mdstat 7858 */ 7859 md_new_event(mddev); 7860 7861 if (last_check + window > io_sectors || j == max_sectors) 7862 continue; 7863 7864 last_check = io_sectors; 7865 repeat: 7866 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 7867 /* step marks */ 7868 int next = (last_mark+1) % SYNC_MARKS; 7869 7870 mddev->resync_mark = mark[next]; 7871 mddev->resync_mark_cnt = mark_cnt[next]; 7872 mark[next] = jiffies; 7873 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 7874 last_mark = next; 7875 } 7876 7877 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7878 break; 7879 7880 /* 7881 * this loop exits only if either when we are slower than 7882 * the 'hard' speed limit, or the system was IO-idle for 7883 * a jiffy. 7884 * the system might be non-idle CPU-wise, but we only care 7885 * about not overloading the IO subsystem. (things like an 7886 * e2fsck being done on the RAID array should execute fast) 7887 */ 7888 cond_resched(); 7889 7890 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 7891 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 7892 /((jiffies-mddev->resync_mark)/HZ +1) +1; 7893 7894 if (currspeed > speed_min(mddev)) { 7895 if (currspeed > speed_max(mddev)) { 7896 msleep(500); 7897 goto repeat; 7898 } 7899 if (!is_mddev_idle(mddev, 0)) { 7900 /* 7901 * Give other IO more of a chance. 7902 * The faster the devices, the less we wait. 7903 */ 7904 wait_event(mddev->recovery_wait, 7905 !atomic_read(&mddev->recovery_active)); 7906 } 7907 } 7908 } 7909 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, 7910 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 7911 ? 
"interrupted" : "done"); 7912 /* 7913 * this also signals 'finished resyncing' to md_stop 7914 */ 7915 blk_finish_plug(&plug); 7916 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7917 7918 /* tell personality that we are finished */ 7919 mddev->pers->sync_request(mddev, max_sectors, &skipped); 7920 7921 if (mddev_is_clustered(mddev)) 7922 md_cluster_ops->resync_finish(mddev); 7923 7924 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 7925 mddev->curr_resync > 2) { 7926 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7927 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7928 if (mddev->curr_resync >= mddev->recovery_cp) { 7929 printk(KERN_INFO 7930 "md: checkpointing %s of %s.\n", 7931 desc, mdname(mddev)); 7932 if (test_bit(MD_RECOVERY_ERROR, 7933 &mddev->recovery)) 7934 mddev->recovery_cp = 7935 mddev->curr_resync_completed; 7936 else 7937 mddev->recovery_cp = 7938 mddev->curr_resync; 7939 } 7940 } else 7941 mddev->recovery_cp = MaxSector; 7942 } else { 7943 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7944 mddev->curr_resync = MaxSector; 7945 rcu_read_lock(); 7946 rdev_for_each_rcu(rdev, mddev) 7947 if (rdev->raid_disk >= 0 && 7948 mddev->delta_disks >= 0 && 7949 !test_bit(Faulty, &rdev->flags) && 7950 !test_bit(In_sync, &rdev->flags) && 7951 rdev->recovery_offset < mddev->curr_resync) 7952 rdev->recovery_offset = mddev->curr_resync; 7953 rcu_read_unlock(); 7954 } 7955 } 7956 skip: 7957 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7958 7959 spin_lock(&mddev->lock); 7960 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7961 /* We completed so min/max setting can be forgotten if used. */ 7962 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7963 mddev->resync_min = 0; 7964 mddev->resync_max = MaxSector; 7965 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7966 mddev->resync_min = mddev->curr_resync_completed; 7967 mddev->curr_resync = 0; 7968 spin_unlock(&mddev->lock); 7969 7970 wake_up(&resync_wait); 7971 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 7972 md_wakeup_thread(mddev->thread); 7973 return; 7974} 7975EXPORT_SYMBOL_GPL(md_do_sync); 7976 7977static int remove_and_add_spares(struct mddev *mddev, 7978 struct md_rdev *this) 7979{ 7980 struct md_rdev *rdev; 7981 int spares = 0; 7982 int removed = 0; 7983 7984 rdev_for_each(rdev, mddev) 7985 if ((this == NULL || rdev == this) && 7986 rdev->raid_disk >= 0 && 7987 !test_bit(Blocked, &rdev->flags) && 7988 (test_bit(Faulty, &rdev->flags) || 7989 ! test_bit(In_sync, &rdev->flags)) && 7990 atomic_read(&rdev->nr_pending)==0) { 7991 if (mddev->pers->hot_remove_disk( 7992 mddev, rdev) == 0) { 7993 sysfs_unlink_rdev(mddev, rdev); 7994 rdev->raid_disk = -1; 7995 removed++; 7996 } 7997 } 7998 if (removed && mddev->kobj.sd) 7999 sysfs_notify(&mddev->kobj, NULL, "degraded"); 8000 8001 if (this) 8002 goto no_add; 8003 8004 rdev_for_each(rdev, mddev) { 8005 if (rdev->raid_disk >= 0 && 8006 !test_bit(In_sync, &rdev->flags) && 8007 !test_bit(Faulty, &rdev->flags)) 8008 spares++; 8009 if (rdev->raid_disk >= 0) 8010 continue; 8011 if (test_bit(Faulty, &rdev->flags)) 8012 continue; 8013 if (mddev->ro && 8014 ! 
(rdev->saved_raid_disk >= 0 && 8015 !test_bit(Bitmap_sync, &rdev->flags))) 8016 continue; 8017 8018 rdev->recovery_offset = 0; 8019 if (mddev->pers-> 8020 hot_add_disk(mddev, rdev) == 0) { 8021 if (sysfs_link_rdev(mddev, rdev)) 8022 /* failure here is OK */; 8023 spares++; 8024 md_new_event(mddev); 8025 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8026 } 8027 } 8028no_add: 8029 if (removed) 8030 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8031 return spares; 8032} 8033 8034static void md_start_sync(struct work_struct *ws) 8035{ 8036 struct mddev *mddev = container_of(ws, struct mddev, del_work); 8037 8038 mddev->sync_thread = md_register_thread(md_do_sync, 8039 mddev, 8040 "resync"); 8041 if (!mddev->sync_thread) { 8042 printk(KERN_ERR "%s: could not start resync" 8043 " thread...\n", 8044 mdname(mddev)); 8045 /* leave the spares where they are, it shouldn't hurt */ 8046 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8047 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8048 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8049 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8050 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8051 wake_up(&resync_wait); 8052 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8053 &mddev->recovery)) 8054 if (mddev->sysfs_action) 8055 sysfs_notify_dirent_safe(mddev->sysfs_action); 8056 } else 8057 md_wakeup_thread(mddev->sync_thread); 8058 sysfs_notify_dirent_safe(mddev->sysfs_action); 8059 md_new_event(mddev); 8060} 8061 8062/* 8063 * This routine is regularly called by all per-raid-array threads to 8064 * deal with generic issues like resync and super-block update. 8065 * Raid personalities that don't have a thread (linear/raid0) do not 8066 * need this as they never do any recovery or update the superblock. 8067 * 8068 * It does not do any resync itself, but rather "forks" off other threads 8069 * to do that as needed. 8070 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 8071 * "->recovery" and create a thread at ->sync_thread. 8072 * When the thread finishes it sets MD_RECOVERY_DONE 8073 * and wakes up this thread which will reap the thread and finish up. 8074 * This thread also removes any faulty devices (with nr_pending == 0). 8075 * 8076 * The overall approach is: 8077 * 1/ if the superblock needs updating, update it. 8078 * 2/ If a recovery thread is running, don't do anything else. 8079 * 3/ If recovery has finished, clean up, possibly marking spares active. 8080 * 4/ If there are any faulty devices, remove them. 8081 * 5/ If array is degraded, try to add spare devices. 8082 * 6/ If array has spares or is not in-sync, start a resync thread. 8083 */ 8084void md_check_recovery(struct mddev *mddev) 8085{ 8086 if (mddev->suspended) 8087 return; 8088 8089 if (mddev->bitmap) 8090 bitmap_daemon_work(mddev); 8091 8092 if (signal_pending(current)) { 8093 if (mddev->pers->sync_request && !mddev->external) { 8094 printk(KERN_INFO "md: %s in immediate safe mode\n", 8095 mdname(mddev)); 8096 mddev->safemode = 2; 8097 } 8098 flush_signals(current); 8099 } 8100 8101 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 8102 return; 8103 if ( ! ( 8104 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || 8105 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8106 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8107 (mddev->external == 0 && mddev->safemode == 1) || 8108 (mddev->safemode == 2 && !
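		/*
		 * 'immediate safe mode' was requested above on a pending
		 * signal: take action once writes have drained on a fully
		 * recovered array that is not yet marked in_sync.
		 */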
atomic_read(&mddev->writes_pending) 8109 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 8110 )) 8111 return; 8112 8113 if (mddev_trylock(mddev)) { 8114 int spares = 0; 8115 8116 if (mddev->ro) { 8117 /* On a read-only array we can: 8118 * - remove failed devices 8119 * - add already-in_sync devices if the array itself 8120 * is in-sync. 8121 * As we only add devices that are already in-sync, 8122 * we can activate the spares immediately. 8123 */ 8124 remove_and_add_spares(mddev, NULL); 8125 /* There is no thread, but we need to call 8126 * ->spare_active and clear saved_raid_disk 8127 */ 8128 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8129 md_reap_sync_thread(mddev); 8130 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8131 goto unlock; 8132 } 8133 8134 if (!mddev->external) { 8135 int did_change = 0; 8136 spin_lock(&mddev->lock); 8137 if (mddev->safemode && 8138 !atomic_read(&mddev->writes_pending) && 8139 !mddev->in_sync && 8140 mddev->recovery_cp == MaxSector) { 8141 mddev->in_sync = 1; 8142 did_change = 1; 8143 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 8144 } 8145 if (mddev->safemode == 1) 8146 mddev->safemode = 0; 8147 spin_unlock(&mddev->lock); 8148 if (did_change) 8149 sysfs_notify_dirent_safe(mddev->sysfs_state); 8150 } 8151 8152 if (mddev->flags & MD_UPDATE_SB_FLAGS) { 8153 if (mddev_is_clustered(mddev)) 8154 md_cluster_ops->metadata_update_start(mddev); 8155 md_update_sb(mddev, 0); 8156 if (mddev_is_clustered(mddev)) 8157 md_cluster_ops->metadata_update_finish(mddev); 8158 } 8159 8160 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 8161 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 8162 /* resync/recovery still happening */ 8163 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8164 goto unlock; 8165 } 8166 if (mddev->sync_thread) { 8167 md_reap_sync_thread(mddev); 8168 goto unlock; 8169 } 8170 /* Set RUNNING before clearing NEEDED to avoid 8171 * any transients in the value of "sync_action". 8172 */ 8173 mddev->curr_resync_completed = 0; 8174 spin_lock(&mddev->lock); 8175 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8176 spin_unlock(&mddev->lock); 8177 /* Clear some bits that don't mean anything, but 8178 * might be left set 8179 */ 8180 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 8181 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8182 8183 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8184 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 8185 goto not_running; 8186 /* no recovery is running. 8187 * remove any failed drives, then 8188 * add spares if possible. 8189 * Spares are also removed and re-added, to allow 8190 * the personality to fail the re-add. 8191 */ 8192 8193 if (mddev->reshape_position != MaxSector) { 8194 if (mddev->pers->check_reshape == NULL || 8195 mddev->pers->check_reshape(mddev) != 0) 8196 /* Cannot proceed */ 8197 goto not_running; 8198 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8199 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8200 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 8201 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8202 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8203 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8204 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8205 } else if (mddev->recovery_cp < MaxSector) { 8206 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8207 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8208 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 8209 /* nothing to be done ... 
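			 * (no reshape is pending, there are no spares to
			 * rebuild, recovery_cp shows no resync is needed,
			 * and no sync was explicitly requested)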
*/ 8210 goto not_running; 8211 8212 if (mddev->pers->sync_request) { 8213 if (spares) { 8214 /* We are adding a device or devices to an array 8215 * which has the bitmap stored on all devices. 8216 * So make sure all bitmap pages get written 8217 */ 8218 bitmap_write_all(mddev->bitmap); 8219 } 8220 INIT_WORK(&mddev->del_work, md_start_sync); 8221 queue_work(md_misc_wq, &mddev->del_work); 8222 goto unlock; 8223 } 8224 not_running: 8225 if (!mddev->sync_thread) { 8226 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8227 wake_up(&resync_wait); 8228 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8229 &mddev->recovery)) 8230 if (mddev->sysfs_action) 8231 sysfs_notify_dirent_safe(mddev->sysfs_action); 8232 } 8233 unlock: 8234 wake_up(&mddev->sb_wait); 8235 mddev_unlock(mddev); 8236 } 8237} 8238EXPORT_SYMBOL(md_check_recovery); 8239 8240void md_reap_sync_thread(struct mddev *mddev) 8241{ 8242 struct md_rdev *rdev; 8243 8244 /* resync has finished, collect result */ 8245 md_unregister_thread(&mddev->sync_thread); 8246 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8247 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8248 /* success...*/ 8249 /* activate any spares */ 8250 if (mddev->pers->spare_active(mddev)) { 8251 sysfs_notify(&mddev->kobj, NULL, 8252 "degraded"); 8253 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8254 } 8255 } 8256 if (mddev_is_clustered(mddev)) 8257 md_cluster_ops->metadata_update_start(mddev); 8258 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8259 mddev->pers->finish_reshape) 8260 mddev->pers->finish_reshape(mddev); 8261 8262 /* If array is no-longer degraded, then any saved_raid_disk 8263 * information must be scrapped. 8264 */ 8265 if (!mddev->degraded) 8266 rdev_for_each(rdev, mddev) 8267 rdev->saved_raid_disk = -1; 8268 8269 md_update_sb(mddev, 1); 8270 if (mddev_is_clustered(mddev)) 8271 md_cluster_ops->metadata_update_finish(mddev); 8272 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8273 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8274 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8275 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8276 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8277 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8278 wake_up(&resync_wait); 8279 /* flag recovery needed just to double check */ 8280 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8281 sysfs_notify_dirent_safe(mddev->sysfs_action); 8282 md_new_event(mddev); 8283 if (mddev->event_work.func) 8284 queue_work(md_misc_wq, &mddev->event_work); 8285} 8286EXPORT_SYMBOL(md_reap_sync_thread); 8287 8288void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 8289{ 8290 sysfs_notify_dirent_safe(rdev->sysfs_state); 8291 wait_event_timeout(rdev->blocked_wait, 8292 !test_bit(Blocked, &rdev->flags) && 8293 !test_bit(BlockedBadBlocks, &rdev->flags), 8294 msecs_to_jiffies(5000)); 8295 rdev_dec_pending(rdev, mddev); 8296} 8297EXPORT_SYMBOL(md_wait_for_blocked_rdev); 8298 8299void md_finish_reshape(struct mddev *mddev) 8300{ 8301 /* called be personality module when reshape completes. */ 8302 struct md_rdev *rdev; 8303 8304 rdev_for_each(rdev, mddev) { 8305 if (rdev->data_offset > rdev->new_data_offset) 8306 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 8307 else 8308 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 8309 rdev->data_offset = rdev->new_data_offset; 8310 } 8311} 8312EXPORT_SYMBOL(md_finish_reshape); 8313 8314/* Bad block management. 
8315 * We can record which blocks on each device are 'bad' and so just 8316 * fail those blocks, or that stripe, rather than the whole device. 8317 * Entries in the bad-block table are 64bits wide. This comprises: 8318 * Length of bad-range, in sectors: 0-511 for lengths 1-512 8319 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) 8320 * A 'shift' can be set so that larger blocks are tracked and 8321 * consequently larger devices can be covered. 8322 * 'Acknowledged' flag - 1 bit. - the most significant bit. 8323 * 8324 * Locking of the bad-block table uses a seqlock so md_is_badblock 8325 * might need to retry if it is very unlucky. 8326 * We will sometimes want to check for bad blocks in a bi_end_io function, 8327 * so we use the write_seqlock_irq variant. 8328 * 8329 * When looking for a bad block we specify a range and want to 8330 * know if any block in the range is bad. So we binary-search 8331 * to the last range that starts at-or-before the given endpoint, 8332 * (or "before the sector after the target range") 8333 * then see if it ends after the given start. 8334 * We return 8335 * 0 if there are no known bad blocks in the range 8336 * 1 if there are known bad block which are all acknowledged 8337 * -1 if there are bad blocks which have not yet been acknowledged in metadata. 8338 * plus the start/length of the first bad section we overlap. 8339 */ 8340int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, 8341 sector_t *first_bad, int *bad_sectors) 8342{ 8343 int hi; 8344 int lo; 8345 u64 *p = bb->page; 8346 int rv; 8347 sector_t target = s + sectors; 8348 unsigned seq; 8349 8350 if (bb->shift > 0) { 8351 /* round the start down, and the end up */ 8352 s >>= bb->shift; 8353 target += (1<<bb->shift) - 1; 8354 target >>= bb->shift; 8355 sectors = target - s; 8356 } 8357 /* 'target' is now the first block after the bad range */ 8358 8359retry: 8360 seq = read_seqbegin(&bb->lock); 8361 lo = 0; 8362 rv = 0; 8363 hi = bb->count; 8364 8365 /* Binary search between lo and hi for 'target' 8366 * i.e. for the last range that starts before 'target' 8367 */ 8368 /* INVARIANT: ranges before 'lo' and at-or-after 'hi' 8369 * are known not to be the last range before target. 8370 * VARIANT: hi-lo is the number of possible 8371 * ranges, and decreases until it reaches 1 8372 */ 8373 while (hi - lo > 1) { 8374 int mid = (lo + hi) / 2; 8375 sector_t a = BB_OFFSET(p[mid]); 8376 if (a < target) 8377 /* This could still be the one, earlier ranges 8378 * could not. */ 8379 lo = mid; 8380 else 8381 /* This and later ranges are definitely out. */ 8382 hi = mid; 8383 } 8384 /* 'lo' might be the last that started before target, but 'hi' isn't */ 8385 if (hi > lo) { 8386 /* need to check all range that end after 's' to see if 8387 * any are unacknowledged. 8388 */ 8389 while (lo >= 0 && 8390 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { 8391 if (BB_OFFSET(p[lo]) < target) { 8392 /* starts before the end, and finishes after 8393 * the start, so they must overlap 8394 */ 8395 if (rv != -1 && BB_ACK(p[lo])) 8396 rv = 1; 8397 else 8398 rv = -1; 8399 *first_bad = BB_OFFSET(p[lo]); 8400 *bad_sectors = BB_LEN(p[lo]); 8401 } 8402 lo--; 8403 } 8404 } 8405 8406 if (read_seqretry(&bb->lock, seq)) 8407 goto retry; 8408 8409 return rv; 8410} 8411EXPORT_SYMBOL_GPL(md_is_badblock); 8412 8413/* 8414 * Add a range of bad blocks to the table. 8415 * This might extend the table, or might contract it 8416 * if two adjacent ranges can be merged. 
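 * For example, an existing entry for sectors 100..107 and a new range
 * starting at 108 of length 8 collapse into one entry covering 100..115
 * (acknowledged only if both pieces are), as long as the combined length
 * stays within BB_MAX_LEN.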
8417 * We binary-search to find the 'insertion' point, then 8418 * decide how best to handle it. 8419 */ 8420static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 8421 int acknowledged) 8422{ 8423 u64 *p; 8424 int lo, hi; 8425 int rv = 1; 8426 unsigned long flags; 8427 8428 if (bb->shift < 0) 8429 /* badblocks are disabled */ 8430 return 0; 8431 8432 if (bb->shift) { 8433 /* round the start down, and the end up */ 8434 sector_t next = s + sectors; 8435 s >>= bb->shift; 8436 next += (1<<bb->shift) - 1; 8437 next >>= bb->shift; 8438 sectors = next - s; 8439 } 8440 8441 write_seqlock_irqsave(&bb->lock, flags); 8442 8443 p = bb->page; 8444 lo = 0; 8445 hi = bb->count; 8446 /* Find the last range that starts at-or-before 's' */ 8447 while (hi - lo > 1) { 8448 int mid = (lo + hi) / 2; 8449 sector_t a = BB_OFFSET(p[mid]); 8450 if (a <= s) 8451 lo = mid; 8452 else 8453 hi = mid; 8454 } 8455 if (hi > lo && BB_OFFSET(p[lo]) > s) 8456 hi = lo; 8457 8458 if (hi > lo) { 8459 /* we found a range that might merge with the start 8460 * of our new range 8461 */ 8462 sector_t a = BB_OFFSET(p[lo]); 8463 sector_t e = a + BB_LEN(p[lo]); 8464 int ack = BB_ACK(p[lo]); 8465 if (e >= s) { 8466 /* Yes, we can merge with a previous range */ 8467 if (s == a && s + sectors >= e) 8468 /* new range covers old */ 8469 ack = acknowledged; 8470 else 8471 ack = ack && acknowledged; 8472 8473 if (e < s + sectors) 8474 e = s + sectors; 8475 if (e - a <= BB_MAX_LEN) { 8476 p[lo] = BB_MAKE(a, e-a, ack); 8477 s = e; 8478 } else { 8479 /* does not all fit in one range, 8480 * make p[lo] maximal 8481 */ 8482 if (BB_LEN(p[lo]) != BB_MAX_LEN) 8483 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); 8484 s = a + BB_MAX_LEN; 8485 } 8486 sectors = e - s; 8487 } 8488 } 8489 if (sectors && hi < bb->count) { 8490 /* 'hi' points to the first range that starts after 's'. 8491 * Maybe we can merge with the start of that range */ 8492 sector_t a = BB_OFFSET(p[hi]); 8493 sector_t e = a + BB_LEN(p[hi]); 8494 int ack = BB_ACK(p[hi]); 8495 if (a <= s + sectors) { 8496 /* merging is possible */ 8497 if (e <= s + sectors) { 8498 /* full overlap */ 8499 e = s + sectors; 8500 ack = acknowledged; 8501 } else 8502 ack = ack && acknowledged; 8503 8504 a = s; 8505 if (e - a <= BB_MAX_LEN) { 8506 p[hi] = BB_MAKE(a, e-a, ack); 8507 s = e; 8508 } else { 8509 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); 8510 s = a + BB_MAX_LEN; 8511 } 8512 sectors = e - s; 8513 lo = hi; 8514 hi++; 8515 } 8516 } 8517 if (sectors == 0 && hi < bb->count) { 8518 /* we might be able to combine lo and hi */ 8519 /* Note: 's' is at the end of 'lo' */ 8520 sector_t a = BB_OFFSET(p[hi]); 8521 int lolen = BB_LEN(p[lo]); 8522 int hilen = BB_LEN(p[hi]); 8523 int newlen = lolen + hilen - (s - a); 8524 if (s >= a && newlen < BB_MAX_LEN) { 8525 /* yes, we can combine them */ 8526 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); 8527 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); 8528 memmove(p + hi, p + hi + 1, 8529 (bb->count - hi - 1) * 8); 8530 bb->count--; 8531 } 8532 } 8533 while (sectors) { 8534 /* didn't merge (it all). 
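		 * Whatever is left is stored below in chunks of at most
		 * BB_MAX_LEN sectors, so a 1300-sector leftover becomes
		 * entries of 512, 512 and 276 sectors.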
8535 * Need to add a range just before 'hi' */ 8536 if (bb->count >= MD_MAX_BADBLOCKS) { 8537 /* No room for more */ 8538 rv = 0; 8539 break; 8540 } else { 8541 int this_sectors = sectors; 8542 memmove(p + hi + 1, p + hi, 8543 (bb->count - hi) * 8); 8544 bb->count++; 8545 8546 if (this_sectors > BB_MAX_LEN) 8547 this_sectors = BB_MAX_LEN; 8548 p[hi] = BB_MAKE(s, this_sectors, acknowledged); 8549 sectors -= this_sectors; 8550 s += this_sectors; 8551 } 8552 } 8553 8554 bb->changed = 1; 8555 if (!acknowledged) 8556 bb->unacked_exist = 1; 8557 write_sequnlock_irqrestore(&bb->lock, flags); 8558 8559 return rv; 8560} 8561 8562int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8563 int is_new) 8564{ 8565 int rv; 8566 if (is_new) 8567 s += rdev->new_data_offset; 8568 else 8569 s += rdev->data_offset; 8570 rv = md_set_badblocks(&rdev->badblocks, 8571 s, sectors, 0); 8572 if (rv) { 8573 /* Make sure they get written out promptly */ 8574 sysfs_notify_dirent_safe(rdev->sysfs_state); 8575 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 8576 md_wakeup_thread(rdev->mddev->thread); 8577 } 8578 return rv; 8579} 8580EXPORT_SYMBOL_GPL(rdev_set_badblocks); 8581 8582/* 8583 * Remove a range of bad blocks from the table. 8584 * This may involve extending the table if we split a region, 8585 * but it must not fail. So if the table becomes full, we just 8586 * drop the remove request. 8587 */ 8588static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) 8589{ 8590 u64 *p; 8591 int lo, hi; 8592 sector_t target = s + sectors; 8593 int rv = 0; 8594 8595 if (bb->shift > 0) { 8596 /* When clearing we round the start up and the end down. 8597 * This should not matter as the shift should align with 8598 * the block size and no rounding should ever be needed. 8599 * However it is better to think a block is bad when it 8600 * isn't than to think a block is not bad when it is. 8601 */ 8602 s += (1<<bb->shift) - 1; 8603 s >>= bb->shift; 8604 target >>= bb->shift; 8605 sectors = target - s; 8606 } 8607 8608 write_seqlock_irq(&bb->lock); 8609 8610 p = bb->page; 8611 lo = 0; 8612 hi = bb->count; 8613 /* Find the last range that starts before 'target' */ 8614 while (hi - lo > 1) { 8615 int mid = (lo + hi) / 2; 8616 sector_t a = BB_OFFSET(p[mid]); 8617 if (a < target) 8618 lo = mid; 8619 else 8620 hi = mid; 8621 } 8622 if (hi > lo) { 8623 /* p[lo] is the last range that could overlap the 8624 * current range. Earlier ranges could also overlap, 8625 * but only this one can overlap the end of the range. 8626 */ 8627 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { 8628 /* Partial overlap, leave the tail of this range */ 8629 int ack = BB_ACK(p[lo]); 8630 sector_t a = BB_OFFSET(p[lo]); 8631 sector_t end = a + BB_LEN(p[lo]); 8632 8633 if (a < s) { 8634 /* we need to split this range */ 8635 if (bb->count >= MD_MAX_BADBLOCKS) { 8636 rv = -ENOSPC; 8637 goto out; 8638 } 8639 memmove(p+lo+1, p+lo, (bb->count - lo) * 8); 8640 bb->count++; 8641 p[lo] = BB_MAKE(a, s-a, ack); 8642 lo++; 8643 } 8644 p[lo] = BB_MAKE(target, end - target, ack); 8645 /* there is no longer an overlap */ 8646 hi = lo; 8647 lo--; 8648 } 8649 while (lo >= 0 && 8650 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { 8651 /* This range does overlap */ 8652 if (BB_OFFSET(p[lo]) < s) { 8653 /* Keep the early parts of this range. */ 8654 int ack = BB_ACK(p[lo]); 8655 sector_t start = BB_OFFSET(p[lo]); 8656 p[lo] = BB_MAKE(start, s - start, ack); 8657 /* now 'lo' no longer overlaps, so the scan can stop here
*/ 8658 break; 8659 } 8660 lo--; 8661 } 8662 /* 'lo' is strictly before, 'hi' is strictly after, 8663 * anything between needs to be discarded 8664 */ 8665 if (hi - lo > 1) { 8666 memmove(p+lo+1, p+hi, (bb->count - hi) * 8); 8667 bb->count -= (hi - lo - 1); 8668 } 8669 } 8670 8671 bb->changed = 1; 8672out: 8673 write_sequnlock_irq(&bb->lock); 8674 return rv; 8675} 8676 8677int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8678 int is_new) 8679{ 8680 if (is_new) 8681 s += rdev->new_data_offset; 8682 else 8683 s += rdev->data_offset; 8684 return md_clear_badblocks(&rdev->badblocks, 8685 s, sectors); 8686} 8687EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8688 8689/* 8690 * Acknowledge all bad blocks in a list. 8691 * This only succeeds if ->changed is clear. It is used by 8692 * in-kernel metadata updates 8693 */ 8694void md_ack_all_badblocks(struct badblocks *bb) 8695{ 8696 if (bb->page == NULL || bb->changed) 8697 /* no point even trying */ 8698 return; 8699 write_seqlock_irq(&bb->lock); 8700 8701 if (bb->changed == 0 && bb->unacked_exist) { 8702 u64 *p = bb->page; 8703 int i; 8704 for (i = 0; i < bb->count ; i++) { 8705 if (!BB_ACK(p[i])) { 8706 sector_t start = BB_OFFSET(p[i]); 8707 int len = BB_LEN(p[i]); 8708 p[i] = BB_MAKE(start, len, 1); 8709 } 8710 } 8711 bb->unacked_exist = 0; 8712 } 8713 write_sequnlock_irq(&bb->lock); 8714} 8715EXPORT_SYMBOL_GPL(md_ack_all_badblocks); 8716 8717/* sysfs access to bad-blocks list. 8718 * We present two files. 8719 * 'bad-blocks' lists sector numbers and lengths of ranges that 8720 * are recorded as bad. The list is truncated to fit within 8721 * the one-page limit of sysfs. 8722 * Writing "sector length" to this file adds an acknowledged 8723 * bad block list. 8724 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 8725 * been acknowledged. Writing to this file adds bad blocks 8726 * without acknowledging them. This is largely for testing. 8727 */ 8728 8729static ssize_t 8730badblocks_show(struct badblocks *bb, char *page, int unack) 8731{ 8732 size_t len; 8733 int i; 8734 u64 *p = bb->page; 8735 unsigned seq; 8736 8737 if (bb->shift < 0) 8738 return 0; 8739 8740retry: 8741 seq = read_seqbegin(&bb->lock); 8742 8743 len = 0; 8744 i = 0; 8745 8746 while (len < PAGE_SIZE && i < bb->count) { 8747 sector_t s = BB_OFFSET(p[i]); 8748 unsigned int length = BB_LEN(p[i]); 8749 int ack = BB_ACK(p[i]); 8750 i++; 8751 8752 if (unack && ack) 8753 continue; 8754 8755 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", 8756 (unsigned long long)s << bb->shift, 8757 length << bb->shift); 8758 } 8759 if (unack && len == 0) 8760 bb->unacked_exist = 0; 8761 8762 if (read_seqretry(&bb->lock, seq)) 8763 goto retry; 8764 8765 return len; 8766} 8767 8768#define DO_DEBUG 1 8769 8770static ssize_t 8771badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) 8772{ 8773 unsigned long long sector; 8774 int length; 8775 char newline; 8776#ifdef DO_DEBUG 8777 /* Allow clearing via sysfs *only* for testing/debugging. 
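	 * For example, writing "12345 8" records an 8-sector bad range
	 * starting at sector 12345, and (with this debug hook) writing
	 * "-12345 8" removes that range again.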
8778 * Normally only a successful write may clear a badblock 8779 */ 8780 int clear = 0; 8781 if (page[0] == '-') { 8782 clear = 1; 8783 page++; 8784 } 8785#endif /* DO_DEBUG */ 8786 8787 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { 8788 case 3: 8789 if (newline != '\n') 8790 return -EINVAL; 8791 case 2: 8792 if (length <= 0) 8793 return -EINVAL; 8794 break; 8795 default: 8796 return -EINVAL; 8797 } 8798 8799#ifdef DO_DEBUG 8800 if (clear) { 8801 md_clear_badblocks(bb, sector, length); 8802 return len; 8803 } 8804#endif /* DO_DEBUG */ 8805 if (md_set_badblocks(bb, sector, length, !unack)) 8806 return len; 8807 else 8808 return -ENOSPC; 8809} 8810 8811static int md_notify_reboot(struct notifier_block *this, 8812 unsigned long code, void *x) 8813{ 8814 struct list_head *tmp; 8815 struct mddev *mddev; 8816 int need_delay = 0; 8817 8818 for_each_mddev(mddev, tmp) { 8819 if (mddev_trylock(mddev)) { 8820 if (mddev->pers) 8821 __md_stop_writes(mddev); 8822 if (mddev->persistent) 8823 mddev->safemode = 2; 8824 mddev_unlock(mddev); 8825 } 8826 need_delay = 1; 8827 } 8828 /* 8829 * certain more exotic SCSI devices are known to be 8830 * volatile wrt too early system reboots. While the 8831 * right place to handle this issue is the given 8832 * driver, we do want to have a safe RAID driver ... 8833 */ 8834 if (need_delay) 8835 mdelay(1000*1); 8836 8837 return NOTIFY_DONE; 8838} 8839 8840static struct notifier_block md_notifier = { 8841 .notifier_call = md_notify_reboot, 8842 .next = NULL, 8843 .priority = INT_MAX, /* before any real devices */ 8844}; 8845 8846static void md_geninit(void) 8847{ 8848 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 8849 8850 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 8851} 8852 8853static int __init md_init(void) 8854{ 8855 int ret = -ENOMEM; 8856 8857 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 8858 if (!md_wq) 8859 goto err_wq; 8860 8861 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 8862 if (!md_misc_wq) 8863 goto err_misc_wq; 8864 8865 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) 8866 goto err_md; 8867 8868 if ((ret = register_blkdev(0, "mdp")) < 0) 8869 goto err_mdp; 8870 mdp_major = ret; 8871 8872 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE, 8873 md_probe, NULL, NULL); 8874 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 8875 md_probe, NULL, NULL); 8876 8877 register_reboot_notifier(&md_notifier); 8878 raid_table_header = register_sysctl_table(raid_root_table); 8879 8880 md_geninit(); 8881 return 0; 8882 8883err_mdp: 8884 unregister_blkdev(MD_MAJOR, "md"); 8885err_md: 8886 destroy_workqueue(md_misc_wq); 8887err_misc_wq: 8888 destroy_workqueue(md_wq); 8889err_wq: 8890 return ret; 8891} 8892 8893void md_reload_sb(struct mddev *mddev) 8894{ 8895 struct md_rdev *rdev, *tmp; 8896 8897 rdev_for_each_safe(rdev, tmp, mddev) { 8898 rdev->sb_loaded = 0; 8899 ClearPageUptodate(rdev->sb_page); 8900 } 8901 mddev->raid_disks = 0; 8902 analyze_sbs(mddev); 8903 rdev_for_each_safe(rdev, tmp, mddev) { 8904 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 8905 /* since we don't write to faulty devices, we figure out if the 8906 * disk is faulty by comparing events 8907 */ 8908 if (mddev->events > sb->events) 8909 set_bit(Faulty, &rdev->flags); 8910 } 8911 8912} 8913EXPORT_SYMBOL(md_reload_sb); 8914 8915#ifndef MODULE 8916 8917/* 8918 * Searches all registered partitions for autorun RAID arrays 8919 * at boot time.
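 * Devices announced via md_autodetect_dev() are queued on
 * all_detected_devices; autostart_arrays() later imports them and hands
 * the result to autorun_devices().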
8920 */ 8921 8922static LIST_HEAD(all_detected_devices); 8923struct detected_devices_node { 8924 struct list_head list; 8925 dev_t dev; 8926}; 8927 8928void md_autodetect_dev(dev_t dev) 8929{ 8930 struct detected_devices_node *node_detected_dev; 8931 8932 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 8933 if (node_detected_dev) { 8934 node_detected_dev->dev = dev; 8935 list_add_tail(&node_detected_dev->list, &all_detected_devices); 8936 } else { 8937 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 8938 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 8939 } 8940} 8941 8942static void autostart_arrays(int part) 8943{ 8944 struct md_rdev *rdev; 8945 struct detected_devices_node *node_detected_dev; 8946 dev_t dev; 8947 int i_scanned, i_passed; 8948 8949 i_scanned = 0; 8950 i_passed = 0; 8951 8952 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 8953 8954 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 8955 i_scanned++; 8956 node_detected_dev = list_entry(all_detected_devices.next, 8957 struct detected_devices_node, list); 8958 list_del(&node_detected_dev->list); 8959 dev = node_detected_dev->dev; 8960 kfree(node_detected_dev); 8961 rdev = md_import_device(dev,0, 90); 8962 if (IS_ERR(rdev)) 8963 continue; 8964 8965 if (test_bit(Faulty, &rdev->flags)) 8966 continue; 8967 8968 set_bit(AutoDetected, &rdev->flags); 8969 list_add(&rdev->same_set, &pending_raid_disks); 8970 i_passed++; 8971 } 8972 8973 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 8974 i_scanned, i_passed); 8975 8976 autorun_devices(part); 8977} 8978 8979#endif /* !MODULE */ 8980 8981static __exit void md_exit(void) 8982{ 8983 struct mddev *mddev; 8984 struct list_head *tmp; 8985 int delay = 1; 8986 8987 blk_unregister_region(MKDEV(MD_MAJOR,0), 512); 8988 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 8989 8990 unregister_blkdev(MD_MAJOR,"md"); 8991 unregister_blkdev(mdp_major, "mdp"); 8992 unregister_reboot_notifier(&md_notifier); 8993 unregister_sysctl_table(raid_table_header); 8994 8995 /* We cannot unload the modules while some process is 8996 * waiting for us in select() or poll() - wake them up 8997 */ 8998 md_unloading = 1; 8999 while (waitqueue_active(&md_event_waiters)) { 9000 /* not safe to leave yet */ 9001 wake_up(&md_event_waiters); 9002 msleep(delay); 9003 delay += delay; 9004 } 9005 remove_proc_entry("mdstat", NULL); 9006 9007 for_each_mddev(mddev, tmp) { 9008 export_array(mddev); 9009 mddev->hold_active = 0; 9010 } 9011 destroy_workqueue(md_misc_wq); 9012 destroy_workqueue(md_wq); 9013} 9014 9015subsys_initcall(md_init); 9016module_exit(md_exit) 9017 9018static int get_ro(char *buffer, struct kernel_param *kp) 9019{ 9020 return sprintf(buffer, "%d", start_readonly); 9021} 9022static int set_ro(const char *val, struct kernel_param *kp) 9023{ 9024 char *e; 9025 int num = simple_strtoul(val, &e, 10); 9026 if (*val && (*e == '\0' || *e == '\n')) { 9027 start_readonly = num; 9028 return 0; 9029 } 9030 return -EINVAL; 9031} 9032 9033module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 9034module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 9035module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 9036 9037MODULE_LICENSE("GPL"); 9038MODULE_DESCRIPTION("MD RAID framework"); 9039MODULE_ALIAS("md"); 9040MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 9041
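
/*
 * Illustration only, not part of the driver: a minimal user-space sketch
 * of the kind of select()/poll() consumer that md_exit() above must wake
 * before the module can unload.  It assumes the usual mdstat convention
 * that an array event is reported as an exceptional condition (POLLPRI)
 * on /proc/mdstat, and that the file is re-read from the start after
 * each wakeup.
 *
 *	#include <fcntl.h>
 *	#include <poll.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char buf[4096];
 *		ssize_t n;
 *		int fd = open("/proc/mdstat", O_RDONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		for (;;) {
 *			struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *
 *			// dump the current state, then wait for a change
 *			lseek(fd, 0, SEEK_SET);
 *			while ((n = read(fd, buf, sizeof(buf))) > 0)
 *				fwrite(buf, 1, n, stdout);
 *			if (poll(&pfd, 1, -1) < 0)
 *				break;
 *		}
 *		close(fd);
 *		return 0;
 *	}
 */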