1/*
2   md.c : Multiple Devices driver for Linux
3     Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5     completely rewritten, based on the MD driver code from Marc Zyngier
6
7   Changes:
8
9   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13   - kmod support by: Cyrus Durgin
14   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17   - lots of fixes and improvements to the RAID1/RAID5 and generic
18     RAID code (such as request based resynchronization):
19
20     Neil Brown <neilb@cse.unsw.edu.au>.
21
22   - persistent bitmap code
23     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25   This program is free software; you can redistribute it and/or modify
26   it under the terms of the GNU General Public License as published by
27   the Free Software Foundation; either version 2, or (at your option)
28   any later version.
29
30   You should have received a copy of the GNU General Public License
31   (for example /usr/src/linux/COPYING); if not, write to the Free
32   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33*/
34
35#include <linux/kthread.h>
36#include <linux/blkdev.h>
37#include <linux/sysctl.h>
38#include <linux/seq_file.h>
39#include <linux/fs.h>
40#include <linux/poll.h>
41#include <linux/ctype.h>
42#include <linux/string.h>
43#include <linux/hdreg.h>
44#include <linux/proc_fs.h>
45#include <linux/random.h>
46#include <linux/module.h>
47#include <linux/reboot.h>
48#include <linux/file.h>
49#include <linux/compat.h>
50#include <linux/delay.h>
51#include <linux/raid/md_p.h>
52#include <linux/raid/md_u.h>
53#include <linux/slab.h>
54#include "md.h"
55#include "bitmap.h"
56#include "md-cluster.h"
57
58#ifndef MODULE
59static void autostart_arrays(int part);
60#endif
61
62/* pers_list is a list of registered personalities protected
63 * by pers_lock.
64 * pers_lock does extra service to protect accesses to
65 * mddev->thread when the mutex cannot be held.
66 */
67static LIST_HEAD(pers_list);
68static DEFINE_SPINLOCK(pers_lock);
69
70struct md_cluster_operations *md_cluster_ops;
71EXPORT_SYMBOL(md_cluster_ops);
72struct module *md_cluster_mod;
73EXPORT_SYMBOL(md_cluster_mod);
74
75static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
76static struct workqueue_struct *md_wq;
77static struct workqueue_struct *md_misc_wq;
78
79static int remove_and_add_spares(struct mddev *mddev,
80				 struct md_rdev *this);
81static void mddev_detach(struct mddev *mddev);
82
83/*
84 * Default number of read corrections we'll attempt on an rdev
85 * before ejecting it from the array. We divide the read error
86 * count by 2 for every hour elapsed between read errors.
87 */
88#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
89/*
90 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
91 * is 1000 KB/sec, so the extra system load does not show up that much.
92 * Increase it if you want to have more _guaranteed_ speed. Note that
93 * the RAID driver will use the maximum available bandwidth if the IO
94 * subsystem is idle. There is also an 'absolute maximum' reconstruction
95 * speed limit - in case reconstruction slows down your system despite
96 * idle IO detection.
97 *
98 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
99 * or /sys/block/mdX/md/sync_speed_{min,max}
100 */
101
102static int sysctl_speed_limit_min = 1000;
103static int sysctl_speed_limit_max = 200000;
104static inline int speed_min(struct mddev *mddev)
105{
106	return mddev->sync_speed_min ?
107		mddev->sync_speed_min : sysctl_speed_limit_min;
108}
109
110static inline int speed_max(struct mddev *mddev)
111{
112	return mddev->sync_speed_max ?
113		mddev->sync_speed_max : sysctl_speed_limit_max;
114}
115
116static struct ctl_table_header *raid_table_header;
117
118static struct ctl_table raid_table[] = {
119	{
120		.procname	= "speed_limit_min",
121		.data		= &sysctl_speed_limit_min,
122		.maxlen		= sizeof(int),
123		.mode		= S_IRUGO|S_IWUSR,
124		.proc_handler	= proc_dointvec,
125	},
126	{
127		.procname	= "speed_limit_max",
128		.data		= &sysctl_speed_limit_max,
129		.maxlen		= sizeof(int),
130		.mode		= S_IRUGO|S_IWUSR,
131		.proc_handler	= proc_dointvec,
132	},
133	{ }
134};
135
136static struct ctl_table raid_dir_table[] = {
137	{
138		.procname	= "raid",
139		.maxlen		= 0,
140		.mode		= S_IRUGO|S_IXUGO,
141		.child		= raid_table,
142	},
143	{ }
144};
145
146static struct ctl_table raid_root_table[] = {
147	{
148		.procname	= "dev",
149		.maxlen		= 0,
150		.mode		= 0555,
151		.child		= raid_dir_table,
152	},
153	{  }
154};
155
156static const struct block_device_operations md_fops;
157
158static int start_readonly;
159
160/* bio_clone_mddev
161 * like bio_clone, but with a local bio set
162 */
163
164struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
165			    struct mddev *mddev)
166{
167	struct bio *b;
168
169	if (!mddev || !mddev->bio_set)
170		return bio_alloc(gfp_mask, nr_iovecs);
171
172	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
173	if (!b)
174		return NULL;
175	return b;
176}
177EXPORT_SYMBOL_GPL(bio_alloc_mddev);
178
179struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
180			    struct mddev *mddev)
181{
182	if (!mddev || !mddev->bio_set)
183		return bio_clone(bio, gfp_mask);
184
185	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
186}
187EXPORT_SYMBOL_GPL(bio_clone_mddev);
188
189/*
190 * We have a system wide 'event count' that is incremented
191 * on any 'interesting' event, and readers of /proc/mdstat
192 * can use 'poll' or 'select' to find out when the event
193 * count increases.
194 *
195 * Events are:
196 *  start array, stop array, error, add device, remove device,
197 *  start build, activate spare
198 */
199static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
200static atomic_t md_event_count;
201void md_new_event(struct mddev *mddev)
202{
203	atomic_inc(&md_event_count);
204	wake_up(&md_event_waiters);
205}
206EXPORT_SYMBOL_GPL(md_new_event);
207
208/* Alternate version that can be called from interrupts
209 * when calling sysfs_notify isn't needed.
210 */
211static void md_new_event_inintr(struct mddev *mddev)
212{
213	atomic_inc(&md_event_count);
214	wake_up(&md_event_waiters);
215}
216
217/*
218 * Enables to iterate over all existing md arrays
219 * all_mddevs_lock protects this list.
220 */
221static LIST_HEAD(all_mddevs);
222static DEFINE_SPINLOCK(all_mddevs_lock);
223
224/*
225 * iterates through all used mddevs in the system.
226 * We take care to grab the all_mddevs_lock whenever navigating
227 * the list, and to always hold a refcount when unlocked.
228 * Any code which breaks out of this loop while own
229 * a reference to the current mddev and must mddev_put it.
230 */
231#define for_each_mddev(_mddev,_tmp)					\
232									\
233	for (({ spin_lock(&all_mddevs_lock);				\
234		_tmp = all_mddevs.next;					\
235		_mddev = NULL;});					\
236	     ({ if (_tmp != &all_mddevs)				\
237			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
238		spin_unlock(&all_mddevs_lock);				\
239		if (_mddev) mddev_put(_mddev);				\
240		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
241		_tmp != &all_mddevs;});					\
242	     ({ spin_lock(&all_mddevs_lock);				\
243		_tmp = _tmp->next;})					\
244		)
245
246/* Rather than calling directly into the personality make_request function,
247 * IO requests come here first so that we can check if the device is
248 * being suspended pending a reconfiguration.
249 * We hold a refcount over the call to ->make_request.  By the time that
250 * call has finished, the bio has been linked into some internal structure
251 * and so is visible to ->quiesce(), so we don't need the refcount any more.
252 */
253static void md_make_request(struct request_queue *q, struct bio *bio)
254{
255	const int rw = bio_data_dir(bio);
256	struct mddev *mddev = q->queuedata;
257	unsigned int sectors;
258	int cpu;
259
260	if (mddev == NULL || mddev->pers == NULL
261	    || !mddev->ready) {
262		bio_io_error(bio);
263		return;
264	}
265	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
266		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
267		return;
268	}
269	smp_rmb(); /* Ensure implications of  'active' are visible */
270	rcu_read_lock();
271	if (mddev->suspended) {
272		DEFINE_WAIT(__wait);
273		for (;;) {
274			prepare_to_wait(&mddev->sb_wait, &__wait,
275					TASK_UNINTERRUPTIBLE);
276			if (!mddev->suspended)
277				break;
278			rcu_read_unlock();
279			schedule();
280			rcu_read_lock();
281		}
282		finish_wait(&mddev->sb_wait, &__wait);
283	}
284	atomic_inc(&mddev->active_io);
285	rcu_read_unlock();
286
287	/*
288	 * save the sectors now since our bio can
289	 * go away inside make_request
290	 */
291	sectors = bio_sectors(bio);
292	/* bio could be mergeable after passing to underlayer */
293	bio->bi_rw &= ~REQ_NOMERGE;
294	mddev->pers->make_request(mddev, bio);
295
296	cpu = part_stat_lock();
297	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
298	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
299	part_stat_unlock();
300
301	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
302		wake_up(&mddev->sb_wait);
303}
304
305/* mddev_suspend makes sure no new requests are submitted
306 * to the device, and that any requests that have been submitted
307 * are completely handled.
308 * Once mddev_detach() is called and completes, the module will be
309 * completely unused.
310 */
311void mddev_suspend(struct mddev *mddev)
312{
313	BUG_ON(mddev->suspended);
314	mddev->suspended = 1;
315	synchronize_rcu();
316	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
317	mddev->pers->quiesce(mddev, 1);
318
319	del_timer_sync(&mddev->safemode_timer);
320}
321EXPORT_SYMBOL_GPL(mddev_suspend);
322
323void mddev_resume(struct mddev *mddev)
324{
325	mddev->suspended = 0;
326	wake_up(&mddev->sb_wait);
327	mddev->pers->quiesce(mddev, 0);
328
329	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
330	md_wakeup_thread(mddev->thread);
331	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
332}
333EXPORT_SYMBOL_GPL(mddev_resume);
334
335int mddev_congested(struct mddev *mddev, int bits)
336{
337	struct md_personality *pers = mddev->pers;
338	int ret = 0;
339
340	rcu_read_lock();
341	if (mddev->suspended)
342		ret = 1;
343	else if (pers && pers->congested)
344		ret = pers->congested(mddev, bits);
345	rcu_read_unlock();
346	return ret;
347}
348EXPORT_SYMBOL_GPL(mddev_congested);
349static int md_congested(void *data, int bits)
350{
351	struct mddev *mddev = data;
352	return mddev_congested(mddev, bits);
353}
354
355static int md_mergeable_bvec(struct request_queue *q,
356			     struct bvec_merge_data *bvm,
357			     struct bio_vec *biovec)
358{
359	struct mddev *mddev = q->queuedata;
360	int ret;
361	rcu_read_lock();
362	if (mddev->suspended) {
363		/* Must always allow one vec */
364		if (bvm->bi_size == 0)
365			ret = biovec->bv_len;
366		else
367			ret = 0;
368	} else {
369		struct md_personality *pers = mddev->pers;
370		if (pers && pers->mergeable_bvec)
371			ret = pers->mergeable_bvec(mddev, bvm, biovec);
372		else
373			ret = biovec->bv_len;
374	}
375	rcu_read_unlock();
376	return ret;
377}
378/*
379 * Generic flush handling for md
380 */
381
382static void md_end_flush(struct bio *bio, int err)
383{
384	struct md_rdev *rdev = bio->bi_private;
385	struct mddev *mddev = rdev->mddev;
386
387	rdev_dec_pending(rdev, mddev);
388
389	if (atomic_dec_and_test(&mddev->flush_pending)) {
390		/* The pre-request flush has finished */
391		queue_work(md_wq, &mddev->flush_work);
392	}
393	bio_put(bio);
394}
395
396static void md_submit_flush_data(struct work_struct *ws);
397
398static void submit_flushes(struct work_struct *ws)
399{
400	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
401	struct md_rdev *rdev;
402
403	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
404	atomic_set(&mddev->flush_pending, 1);
405	rcu_read_lock();
406	rdev_for_each_rcu(rdev, mddev)
407		if (rdev->raid_disk >= 0 &&
408		    !test_bit(Faulty, &rdev->flags)) {
409			/* Take two references, one is dropped
410			 * when request finishes, one after
411			 * we reclaim rcu_read_lock
412			 */
413			struct bio *bi;
414			atomic_inc(&rdev->nr_pending);
415			atomic_inc(&rdev->nr_pending);
416			rcu_read_unlock();
417			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
418			bi->bi_end_io = md_end_flush;
419			bi->bi_private = rdev;
420			bi->bi_bdev = rdev->bdev;
421			atomic_inc(&mddev->flush_pending);
422			submit_bio(WRITE_FLUSH, bi);
423			rcu_read_lock();
424			rdev_dec_pending(rdev, mddev);
425		}
426	rcu_read_unlock();
427	if (atomic_dec_and_test(&mddev->flush_pending))
428		queue_work(md_wq, &mddev->flush_work);
429}
430
431static void md_submit_flush_data(struct work_struct *ws)
432{
433	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
434	struct bio *bio = mddev->flush_bio;
435
436	if (bio->bi_iter.bi_size == 0)
437		/* an empty barrier - all done */
438		bio_endio(bio, 0);
439	else {
440		bio->bi_rw &= ~REQ_FLUSH;
441		mddev->pers->make_request(mddev, bio);
442	}
443
444	mddev->flush_bio = NULL;
445	wake_up(&mddev->sb_wait);
446}
447
448void md_flush_request(struct mddev *mddev, struct bio *bio)
449{
450	spin_lock_irq(&mddev->lock);
451	wait_event_lock_irq(mddev->sb_wait,
452			    !mddev->flush_bio,
453			    mddev->lock);
454	mddev->flush_bio = bio;
455	spin_unlock_irq(&mddev->lock);
456
457	INIT_WORK(&mddev->flush_work, submit_flushes);
458	queue_work(md_wq, &mddev->flush_work);
459}
460EXPORT_SYMBOL(md_flush_request);
461
462void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
463{
464	struct mddev *mddev = cb->data;
465	md_wakeup_thread(mddev->thread);
466	kfree(cb);
467}
468EXPORT_SYMBOL(md_unplug);
469
470static inline struct mddev *mddev_get(struct mddev *mddev)
471{
472	atomic_inc(&mddev->active);
473	return mddev;
474}
475
476static void mddev_delayed_delete(struct work_struct *ws);
477
478static void mddev_put(struct mddev *mddev)
479{
480	struct bio_set *bs = NULL;
481
482	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
483		return;
484	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
485	    mddev->ctime == 0 && !mddev->hold_active) {
486		/* Array is not configured at all, and not held active,
487		 * so destroy it */
488		list_del_init(&mddev->all_mddevs);
489		bs = mddev->bio_set;
490		mddev->bio_set = NULL;
491		if (mddev->gendisk) {
492			/* We did a probe so need to clean up.  Call
493			 * queue_work inside the spinlock so that
494			 * flush_workqueue() after mddev_find will
495			 * succeed in waiting for the work to be done.
496			 */
497			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
498			queue_work(md_misc_wq, &mddev->del_work);
499		} else
500			kfree(mddev);
501	}
502	spin_unlock(&all_mddevs_lock);
503	if (bs)
504		bioset_free(bs);
505}
506
507void mddev_init(struct mddev *mddev)
508{
509	mutex_init(&mddev->open_mutex);
510	mutex_init(&mddev->reconfig_mutex);
511	mutex_init(&mddev->bitmap_info.mutex);
512	INIT_LIST_HEAD(&mddev->disks);
513	INIT_LIST_HEAD(&mddev->all_mddevs);
514	init_timer(&mddev->safemode_timer);
515	atomic_set(&mddev->active, 1);
516	atomic_set(&mddev->openers, 0);
517	atomic_set(&mddev->active_io, 0);
518	spin_lock_init(&mddev->lock);
519	atomic_set(&mddev->flush_pending, 0);
520	init_waitqueue_head(&mddev->sb_wait);
521	init_waitqueue_head(&mddev->recovery_wait);
522	mddev->reshape_position = MaxSector;
523	mddev->reshape_backwards = 0;
524	mddev->last_sync_action = "none";
525	mddev->resync_min = 0;
526	mddev->resync_max = MaxSector;
527	mddev->level = LEVEL_NONE;
528}
529EXPORT_SYMBOL_GPL(mddev_init);
530
531static struct mddev *mddev_find(dev_t unit)
532{
533	struct mddev *mddev, *new = NULL;
534
535	if (unit && MAJOR(unit) != MD_MAJOR)
536		unit &= ~((1<<MdpMinorShift)-1);
537
538 retry:
539	spin_lock(&all_mddevs_lock);
540
541	if (unit) {
542		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
543			if (mddev->unit == unit) {
544				mddev_get(mddev);
545				spin_unlock(&all_mddevs_lock);
546				kfree(new);
547				return mddev;
548			}
549
550		if (new) {
551			list_add(&new->all_mddevs, &all_mddevs);
552			spin_unlock(&all_mddevs_lock);
553			new->hold_active = UNTIL_IOCTL;
554			return new;
555		}
556	} else if (new) {
557		/* find an unused unit number */
558		static int next_minor = 512;
559		int start = next_minor;
560		int is_free = 0;
561		int dev = 0;
562		while (!is_free) {
563			dev = MKDEV(MD_MAJOR, next_minor);
564			next_minor++;
565			if (next_minor > MINORMASK)
566				next_minor = 0;
567			if (next_minor == start) {
568				/* Oh dear, all in use. */
569				spin_unlock(&all_mddevs_lock);
570				kfree(new);
571				return NULL;
572			}
573
574			is_free = 1;
575			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
576				if (mddev->unit == dev) {
577					is_free = 0;
578					break;
579				}
580		}
581		new->unit = dev;
582		new->md_minor = MINOR(dev);
583		new->hold_active = UNTIL_STOP;
584		list_add(&new->all_mddevs, &all_mddevs);
585		spin_unlock(&all_mddevs_lock);
586		return new;
587	}
588	spin_unlock(&all_mddevs_lock);
589
590	new = kzalloc(sizeof(*new), GFP_KERNEL);
591	if (!new)
592		return NULL;
593
594	new->unit = unit;
595	if (MAJOR(unit) == MD_MAJOR)
596		new->md_minor = MINOR(unit);
597	else
598		new->md_minor = MINOR(unit) >> MdpMinorShift;
599
600	mddev_init(new);
601
602	goto retry;
603}
604
605static struct attribute_group md_redundancy_group;
606
607void mddev_unlock(struct mddev *mddev)
608{
609	if (mddev->to_remove) {
610		/* These cannot be removed under reconfig_mutex as
611		 * an access to the files will try to take reconfig_mutex
612		 * while holding the file unremovable, which leads to
613		 * a deadlock.
614		 * So hold set sysfs_active while the remove in happeing,
615		 * and anything else which might set ->to_remove or my
616		 * otherwise change the sysfs namespace will fail with
617		 * -EBUSY if sysfs_active is still set.
618		 * We set sysfs_active under reconfig_mutex and elsewhere
619		 * test it under the same mutex to ensure its correct value
620		 * is seen.
621		 */
622		struct attribute_group *to_remove = mddev->to_remove;
623		mddev->to_remove = NULL;
624		mddev->sysfs_active = 1;
625		mutex_unlock(&mddev->reconfig_mutex);
626
627		if (mddev->kobj.sd) {
628			if (to_remove != &md_redundancy_group)
629				sysfs_remove_group(&mddev->kobj, to_remove);
630			if (mddev->pers == NULL ||
631			    mddev->pers->sync_request == NULL) {
632				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
633				if (mddev->sysfs_action)
634					sysfs_put(mddev->sysfs_action);
635				mddev->sysfs_action = NULL;
636			}
637		}
638		mddev->sysfs_active = 0;
639	} else
640		mutex_unlock(&mddev->reconfig_mutex);
641
642	/* As we've dropped the mutex we need a spinlock to
643	 * make sure the thread doesn't disappear
644	 */
645	spin_lock(&pers_lock);
646	md_wakeup_thread(mddev->thread);
647	spin_unlock(&pers_lock);
648}
649EXPORT_SYMBOL_GPL(mddev_unlock);
650
651struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
652{
653	struct md_rdev *rdev;
654
655	rdev_for_each_rcu(rdev, mddev)
656		if (rdev->desc_nr == nr)
657			return rdev;
658
659	return NULL;
660}
661EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
662
663static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
664{
665	struct md_rdev *rdev;
666
667	rdev_for_each(rdev, mddev)
668		if (rdev->bdev->bd_dev == dev)
669			return rdev;
670
671	return NULL;
672}
673
674static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
675{
676	struct md_rdev *rdev;
677
678	rdev_for_each_rcu(rdev, mddev)
679		if (rdev->bdev->bd_dev == dev)
680			return rdev;
681
682	return NULL;
683}
684
685static struct md_personality *find_pers(int level, char *clevel)
686{
687	struct md_personality *pers;
688	list_for_each_entry(pers, &pers_list, list) {
689		if (level != LEVEL_NONE && pers->level == level)
690			return pers;
691		if (strcmp(pers->name, clevel)==0)
692			return pers;
693	}
694	return NULL;
695}
696
697/* return the offset of the super block in 512byte sectors */
698static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
699{
700	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
701	return MD_NEW_SIZE_SECTORS(num_sectors);
702}
703
704static int alloc_disk_sb(struct md_rdev *rdev)
705{
706	rdev->sb_page = alloc_page(GFP_KERNEL);
707	if (!rdev->sb_page) {
708		printk(KERN_ALERT "md: out of memory.\n");
709		return -ENOMEM;
710	}
711
712	return 0;
713}
714
715void md_rdev_clear(struct md_rdev *rdev)
716{
717	if (rdev->sb_page) {
718		put_page(rdev->sb_page);
719		rdev->sb_loaded = 0;
720		rdev->sb_page = NULL;
721		rdev->sb_start = 0;
722		rdev->sectors = 0;
723	}
724	if (rdev->bb_page) {
725		put_page(rdev->bb_page);
726		rdev->bb_page = NULL;
727	}
728	kfree(rdev->badblocks.page);
729	rdev->badblocks.page = NULL;
730}
731EXPORT_SYMBOL_GPL(md_rdev_clear);
732
733static void super_written(struct bio *bio, int error)
734{
735	struct md_rdev *rdev = bio->bi_private;
736	struct mddev *mddev = rdev->mddev;
737
738	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
739		printk("md: super_written gets error=%d, uptodate=%d\n",
740		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
741		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
742		md_error(mddev, rdev);
743	}
744
745	if (atomic_dec_and_test(&mddev->pending_writes))
746		wake_up(&mddev->sb_wait);
747	bio_put(bio);
748}
749
750void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
751		   sector_t sector, int size, struct page *page)
752{
753	/* write first size bytes of page to sector of rdev
754	 * Increment mddev->pending_writes before returning
755	 * and decrement it on completion, waking up sb_wait
756	 * if zero is reached.
757	 * If an error occurred, call md_error
758	 */
759	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
760
761	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
762	bio->bi_iter.bi_sector = sector;
763	bio_add_page(bio, page, size, 0);
764	bio->bi_private = rdev;
765	bio->bi_end_io = super_written;
766
767	atomic_inc(&mddev->pending_writes);
768	submit_bio(WRITE_FLUSH_FUA, bio);
769}
770
771void md_super_wait(struct mddev *mddev)
772{
773	/* wait for all superblock writes that were scheduled to complete */
774	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
775}
776
777int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
778		 struct page *page, int rw, bool metadata_op)
779{
780	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
781	int ret;
782
783	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
784		rdev->meta_bdev : rdev->bdev;
785	if (metadata_op)
786		bio->bi_iter.bi_sector = sector + rdev->sb_start;
787	else if (rdev->mddev->reshape_position != MaxSector &&
788		 (rdev->mddev->reshape_backwards ==
789		  (sector >= rdev->mddev->reshape_position)))
790		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
791	else
792		bio->bi_iter.bi_sector = sector + rdev->data_offset;
793	bio_add_page(bio, page, size, 0);
794	submit_bio_wait(rw, bio);
795
796	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
797	bio_put(bio);
798	return ret;
799}
800EXPORT_SYMBOL_GPL(sync_page_io);
801
802static int read_disk_sb(struct md_rdev *rdev, int size)
803{
804	char b[BDEVNAME_SIZE];
805
806	if (rdev->sb_loaded)
807		return 0;
808
809	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
810		goto fail;
811	rdev->sb_loaded = 1;
812	return 0;
813
814fail:
815	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
816		bdevname(rdev->bdev,b));
817	return -EINVAL;
818}
819
820static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
821{
822	return	sb1->set_uuid0 == sb2->set_uuid0 &&
823		sb1->set_uuid1 == sb2->set_uuid1 &&
824		sb1->set_uuid2 == sb2->set_uuid2 &&
825		sb1->set_uuid3 == sb2->set_uuid3;
826}
827
828static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
829{
830	int ret;
831	mdp_super_t *tmp1, *tmp2;
832
833	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
834	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
835
836	if (!tmp1 || !tmp2) {
837		ret = 0;
838		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
839		goto abort;
840	}
841
842	*tmp1 = *sb1;
843	*tmp2 = *sb2;
844
845	/*
846	 * nr_disks is not constant
847	 */
848	tmp1->nr_disks = 0;
849	tmp2->nr_disks = 0;
850
851	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
852abort:
853	kfree(tmp1);
854	kfree(tmp2);
855	return ret;
856}
857
858static u32 md_csum_fold(u32 csum)
859{
860	csum = (csum & 0xffff) + (csum >> 16);
861	return (csum & 0xffff) + (csum >> 16);
862}
863
864static unsigned int calc_sb_csum(mdp_super_t *sb)
865{
866	u64 newcsum = 0;
867	u32 *sb32 = (u32*)sb;
868	int i;
869	unsigned int disk_csum, csum;
870
871	disk_csum = sb->sb_csum;
872	sb->sb_csum = 0;
873
874	for (i = 0; i < MD_SB_BYTES/4 ; i++)
875		newcsum += sb32[i];
876	csum = (newcsum & 0xffffffff) + (newcsum>>32);
877
878#ifdef CONFIG_ALPHA
879	/* This used to use csum_partial, which was wrong for several
880	 * reasons including that different results are returned on
881	 * different architectures.  It isn't critical that we get exactly
882	 * the same return value as before (we always csum_fold before
883	 * testing, and that removes any differences).  However as we
884	 * know that csum_partial always returned a 16bit value on
885	 * alphas, do a fold to maximise conformity to previous behaviour.
886	 */
887	sb->sb_csum = md_csum_fold(disk_csum);
888#else
889	sb->sb_csum = disk_csum;
890#endif
891	return csum;
892}
893
894/*
895 * Handle superblock details.
896 * We want to be able to handle multiple superblock formats
897 * so we have a common interface to them all, and an array of
898 * different handlers.
899 * We rely on user-space to write the initial superblock, and support
900 * reading and updating of superblocks.
901 * Interface methods are:
902 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
903 *      loads and validates a superblock on dev.
904 *      if refdev != NULL, compare superblocks on both devices
905 *    Return:
906 *      0 - dev has a superblock that is compatible with refdev
907 *      1 - dev has a superblock that is compatible and newer than refdev
908 *          so dev should be used as the refdev in future
909 *     -EINVAL superblock incompatible or invalid
910 *     -othererror e.g. -EIO
911 *
912 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
913 *      Verify that dev is acceptable into mddev.
914 *       The first time, mddev->raid_disks will be 0, and data from
915 *       dev should be merged in.  Subsequent calls check that dev
916 *       is new enough.  Return 0 or -EINVAL
917 *
918 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
919 *     Update the superblock for rdev with data in mddev
920 *     This does not write to disc.
921 *
922 */
923
924struct super_type  {
925	char		    *name;
926	struct module	    *owner;
927	int		    (*load_super)(struct md_rdev *rdev,
928					  struct md_rdev *refdev,
929					  int minor_version);
930	int		    (*validate_super)(struct mddev *mddev,
931					      struct md_rdev *rdev);
932	void		    (*sync_super)(struct mddev *mddev,
933					  struct md_rdev *rdev);
934	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
935						sector_t num_sectors);
936	int		    (*allow_new_offset)(struct md_rdev *rdev,
937						unsigned long long new_offset);
938};
939
940/*
941 * Check that the given mddev has no bitmap.
942 *
943 * This function is called from the run method of all personalities that do not
944 * support bitmaps. It prints an error message and returns non-zero if mddev
945 * has a bitmap. Otherwise, it returns 0.
946 *
947 */
948int md_check_no_bitmap(struct mddev *mddev)
949{
950	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
951		return 0;
952	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
953		mdname(mddev), mddev->pers->name);
954	return 1;
955}
956EXPORT_SYMBOL(md_check_no_bitmap);
957
958/*
959 * load_super for 0.90.0
960 */
961static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
962{
963	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
964	mdp_super_t *sb;
965	int ret;
966
967	/*
968	 * Calculate the position of the superblock (512byte sectors),
969	 * it's at the end of the disk.
970	 *
971	 * It also happens to be a multiple of 4Kb.
972	 */
973	rdev->sb_start = calc_dev_sboffset(rdev);
974
975	ret = read_disk_sb(rdev, MD_SB_BYTES);
976	if (ret) return ret;
977
978	ret = -EINVAL;
979
980	bdevname(rdev->bdev, b);
981	sb = page_address(rdev->sb_page);
982
983	if (sb->md_magic != MD_SB_MAGIC) {
984		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
985		       b);
986		goto abort;
987	}
988
989	if (sb->major_version != 0 ||
990	    sb->minor_version < 90 ||
991	    sb->minor_version > 91) {
992		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
993			sb->major_version, sb->minor_version,
994			b);
995		goto abort;
996	}
997
998	if (sb->raid_disks <= 0)
999		goto abort;
1000
1001	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1002		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
1003			b);
1004		goto abort;
1005	}
1006
1007	rdev->preferred_minor = sb->md_minor;
1008	rdev->data_offset = 0;
1009	rdev->new_data_offset = 0;
1010	rdev->sb_size = MD_SB_BYTES;
1011	rdev->badblocks.shift = -1;
1012
1013	if (sb->level == LEVEL_MULTIPATH)
1014		rdev->desc_nr = -1;
1015	else
1016		rdev->desc_nr = sb->this_disk.number;
1017
1018	if (!refdev) {
1019		ret = 1;
1020	} else {
1021		__u64 ev1, ev2;
1022		mdp_super_t *refsb = page_address(refdev->sb_page);
1023		if (!uuid_equal(refsb, sb)) {
1024			printk(KERN_WARNING "md: %s has different UUID to %s\n",
1025				b, bdevname(refdev->bdev,b2));
1026			goto abort;
1027		}
1028		if (!sb_equal(refsb, sb)) {
1029			printk(KERN_WARNING "md: %s has same UUID"
1030			       " but different superblock to %s\n",
1031			       b, bdevname(refdev->bdev, b2));
1032			goto abort;
1033		}
1034		ev1 = md_event(sb);
1035		ev2 = md_event(refsb);
1036		if (ev1 > ev2)
1037			ret = 1;
1038		else
1039			ret = 0;
1040	}
1041	rdev->sectors = rdev->sb_start;
1042	/* Limit to 4TB as metadata cannot record more than that.
1043	 * (not needed for Linear and RAID0 as metadata doesn't
1044	 * record this size)
1045	 */
1046	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1047		rdev->sectors = (2ULL << 32) - 2;
1048
1049	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1050		/* "this cannot possibly happen" ... */
1051		ret = -EINVAL;
1052
1053 abort:
1054	return ret;
1055}
1056
1057/*
1058 * validate_super for 0.90.0
1059 */
1060static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1061{
1062	mdp_disk_t *desc;
1063	mdp_super_t *sb = page_address(rdev->sb_page);
1064	__u64 ev1 = md_event(sb);
1065
1066	rdev->raid_disk = -1;
1067	clear_bit(Faulty, &rdev->flags);
1068	clear_bit(In_sync, &rdev->flags);
1069	clear_bit(Bitmap_sync, &rdev->flags);
1070	clear_bit(WriteMostly, &rdev->flags);
1071
1072	if (mddev->raid_disks == 0) {
1073		mddev->major_version = 0;
1074		mddev->minor_version = sb->minor_version;
1075		mddev->patch_version = sb->patch_version;
1076		mddev->external = 0;
1077		mddev->chunk_sectors = sb->chunk_size >> 9;
1078		mddev->ctime = sb->ctime;
1079		mddev->utime = sb->utime;
1080		mddev->level = sb->level;
1081		mddev->clevel[0] = 0;
1082		mddev->layout = sb->layout;
1083		mddev->raid_disks = sb->raid_disks;
1084		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1085		mddev->events = ev1;
1086		mddev->bitmap_info.offset = 0;
1087		mddev->bitmap_info.space = 0;
1088		/* bitmap can use 60 K after the 4K superblocks */
1089		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1090		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1091		mddev->reshape_backwards = 0;
1092
1093		if (mddev->minor_version >= 91) {
1094			mddev->reshape_position = sb->reshape_position;
1095			mddev->delta_disks = sb->delta_disks;
1096			mddev->new_level = sb->new_level;
1097			mddev->new_layout = sb->new_layout;
1098			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1099			if (mddev->delta_disks < 0)
1100				mddev->reshape_backwards = 1;
1101		} else {
1102			mddev->reshape_position = MaxSector;
1103			mddev->delta_disks = 0;
1104			mddev->new_level = mddev->level;
1105			mddev->new_layout = mddev->layout;
1106			mddev->new_chunk_sectors = mddev->chunk_sectors;
1107		}
1108
1109		if (sb->state & (1<<MD_SB_CLEAN))
1110			mddev->recovery_cp = MaxSector;
1111		else {
1112			if (sb->events_hi == sb->cp_events_hi &&
1113				sb->events_lo == sb->cp_events_lo) {
1114				mddev->recovery_cp = sb->recovery_cp;
1115			} else
1116				mddev->recovery_cp = 0;
1117		}
1118
1119		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1120		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1121		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1122		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1123
1124		mddev->max_disks = MD_SB_DISKS;
1125
1126		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1127		    mddev->bitmap_info.file == NULL) {
1128			mddev->bitmap_info.offset =
1129				mddev->bitmap_info.default_offset;
1130			mddev->bitmap_info.space =
1131				mddev->bitmap_info.default_space;
1132		}
1133
1134	} else if (mddev->pers == NULL) {
1135		/* Insist on good event counter while assembling, except
1136		 * for spares (which don't need an event count) */
1137		++ev1;
1138		if (sb->disks[rdev->desc_nr].state & (
1139			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1140			if (ev1 < mddev->events)
1141				return -EINVAL;
1142	} else if (mddev->bitmap) {
1143		/* if adding to array with a bitmap, then we can accept an
1144		 * older device ... but not too old.
1145		 */
1146		if (ev1 < mddev->bitmap->events_cleared)
1147			return 0;
1148		if (ev1 < mddev->events)
1149			set_bit(Bitmap_sync, &rdev->flags);
1150	} else {
1151		if (ev1 < mddev->events)
1152			/* just a hot-add of a new device, leave raid_disk at -1 */
1153			return 0;
1154	}
1155
1156	if (mddev->level != LEVEL_MULTIPATH) {
1157		desc = sb->disks + rdev->desc_nr;
1158
1159		if (desc->state & (1<<MD_DISK_FAULTY))
1160			set_bit(Faulty, &rdev->flags);
1161		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1162			    desc->raid_disk < mddev->raid_disks */) {
1163			set_bit(In_sync, &rdev->flags);
1164			rdev->raid_disk = desc->raid_disk;
1165			rdev->saved_raid_disk = desc->raid_disk;
1166		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1167			/* active but not in sync implies recovery up to
1168			 * reshape position.  We don't know exactly where
1169			 * that is, so set to zero for now */
1170			if (mddev->minor_version >= 91) {
1171				rdev->recovery_offset = 0;
1172				rdev->raid_disk = desc->raid_disk;
1173			}
1174		}
1175		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1176			set_bit(WriteMostly, &rdev->flags);
1177	} else /* MULTIPATH are always insync */
1178		set_bit(In_sync, &rdev->flags);
1179	return 0;
1180}
1181
1182/*
1183 * sync_super for 0.90.0
1184 */
1185static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1186{
1187	mdp_super_t *sb;
1188	struct md_rdev *rdev2;
1189	int next_spare = mddev->raid_disks;
1190
1191	/* make rdev->sb match mddev data..
1192	 *
1193	 * 1/ zero out disks
1194	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1195	 * 3/ any empty disks < next_spare become removed
1196	 *
1197	 * disks[0] gets initialised to REMOVED because
1198	 * we cannot be sure from other fields if it has
1199	 * been initialised or not.
1200	 */
1201	int i;
1202	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1203
1204	rdev->sb_size = MD_SB_BYTES;
1205
1206	sb = page_address(rdev->sb_page);
1207
1208	memset(sb, 0, sizeof(*sb));
1209
1210	sb->md_magic = MD_SB_MAGIC;
1211	sb->major_version = mddev->major_version;
1212	sb->patch_version = mddev->patch_version;
1213	sb->gvalid_words  = 0; /* ignored */
1214	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1215	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1216	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1217	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1218
1219	sb->ctime = mddev->ctime;
1220	sb->level = mddev->level;
1221	sb->size = mddev->dev_sectors / 2;
1222	sb->raid_disks = mddev->raid_disks;
1223	sb->md_minor = mddev->md_minor;
1224	sb->not_persistent = 0;
1225	sb->utime = mddev->utime;
1226	sb->state = 0;
1227	sb->events_hi = (mddev->events>>32);
1228	sb->events_lo = (u32)mddev->events;
1229
1230	if (mddev->reshape_position == MaxSector)
1231		sb->minor_version = 90;
1232	else {
1233		sb->minor_version = 91;
1234		sb->reshape_position = mddev->reshape_position;
1235		sb->new_level = mddev->new_level;
1236		sb->delta_disks = mddev->delta_disks;
1237		sb->new_layout = mddev->new_layout;
1238		sb->new_chunk = mddev->new_chunk_sectors << 9;
1239	}
1240	mddev->minor_version = sb->minor_version;
1241	if (mddev->in_sync)
1242	{
1243		sb->recovery_cp = mddev->recovery_cp;
1244		sb->cp_events_hi = (mddev->events>>32);
1245		sb->cp_events_lo = (u32)mddev->events;
1246		if (mddev->recovery_cp == MaxSector)
1247			sb->state = (1<< MD_SB_CLEAN);
1248	} else
1249		sb->recovery_cp = 0;
1250
1251	sb->layout = mddev->layout;
1252	sb->chunk_size = mddev->chunk_sectors << 9;
1253
1254	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1255		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1256
1257	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1258	rdev_for_each(rdev2, mddev) {
1259		mdp_disk_t *d;
1260		int desc_nr;
1261		int is_active = test_bit(In_sync, &rdev2->flags);
1262
1263		if (rdev2->raid_disk >= 0 &&
1264		    sb->minor_version >= 91)
1265			/* we have nowhere to store the recovery_offset,
1266			 * but if it is not below the reshape_position,
1267			 * we can piggy-back on that.
1268			 */
1269			is_active = 1;
1270		if (rdev2->raid_disk < 0 ||
1271		    test_bit(Faulty, &rdev2->flags))
1272			is_active = 0;
1273		if (is_active)
1274			desc_nr = rdev2->raid_disk;
1275		else
1276			desc_nr = next_spare++;
1277		rdev2->desc_nr = desc_nr;
1278		d = &sb->disks[rdev2->desc_nr];
1279		nr_disks++;
1280		d->number = rdev2->desc_nr;
1281		d->major = MAJOR(rdev2->bdev->bd_dev);
1282		d->minor = MINOR(rdev2->bdev->bd_dev);
1283		if (is_active)
1284			d->raid_disk = rdev2->raid_disk;
1285		else
1286			d->raid_disk = rdev2->desc_nr; /* compatibility */
1287		if (test_bit(Faulty, &rdev2->flags))
1288			d->state = (1<<MD_DISK_FAULTY);
1289		else if (is_active) {
1290			d->state = (1<<MD_DISK_ACTIVE);
1291			if (test_bit(In_sync, &rdev2->flags))
1292				d->state |= (1<<MD_DISK_SYNC);
1293			active++;
1294			working++;
1295		} else {
1296			d->state = 0;
1297			spare++;
1298			working++;
1299		}
1300		if (test_bit(WriteMostly, &rdev2->flags))
1301			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1302	}
1303	/* now set the "removed" and "faulty" bits on any missing devices */
1304	for (i=0 ; i < mddev->raid_disks ; i++) {
1305		mdp_disk_t *d = &sb->disks[i];
1306		if (d->state == 0 && d->number == 0) {
1307			d->number = i;
1308			d->raid_disk = i;
1309			d->state = (1<<MD_DISK_REMOVED);
1310			d->state |= (1<<MD_DISK_FAULTY);
1311			failed++;
1312		}
1313	}
1314	sb->nr_disks = nr_disks;
1315	sb->active_disks = active;
1316	sb->working_disks = working;
1317	sb->failed_disks = failed;
1318	sb->spare_disks = spare;
1319
1320	sb->this_disk = sb->disks[rdev->desc_nr];
1321	sb->sb_csum = calc_sb_csum(sb);
1322}
1323
1324/*
1325 * rdev_size_change for 0.90.0
1326 */
1327static unsigned long long
1328super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1329{
1330	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1331		return 0; /* component must fit device */
1332	if (rdev->mddev->bitmap_info.offset)
1333		return 0; /* can't move bitmap */
1334	rdev->sb_start = calc_dev_sboffset(rdev);
1335	if (!num_sectors || num_sectors > rdev->sb_start)
1336		num_sectors = rdev->sb_start;
1337	/* Limit to 4TB as metadata cannot record more than that.
1338	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1339	 */
1340	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1341		num_sectors = (2ULL << 32) - 2;
1342	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1343		       rdev->sb_page);
1344	md_super_wait(rdev->mddev);
1345	return num_sectors;
1346}
1347
1348static int
1349super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1350{
1351	/* non-zero offset changes not possible with v0.90 */
1352	return new_offset == 0;
1353}
1354
1355/*
1356 * version 1 superblock
1357 */
1358
1359static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1360{
1361	__le32 disk_csum;
1362	u32 csum;
1363	unsigned long long newcsum;
1364	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1365	__le32 *isuper = (__le32*)sb;
1366
1367	disk_csum = sb->sb_csum;
1368	sb->sb_csum = 0;
1369	newcsum = 0;
1370	for (; size >= 4; size -= 4)
1371		newcsum += le32_to_cpu(*isuper++);
1372
1373	if (size == 2)
1374		newcsum += le16_to_cpu(*(__le16*) isuper);
1375
1376	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1377	sb->sb_csum = disk_csum;
1378	return cpu_to_le32(csum);
1379}
1380
1381static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1382			    int acknowledged);
1383static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1384{
1385	struct mdp_superblock_1 *sb;
1386	int ret;
1387	sector_t sb_start;
1388	sector_t sectors;
1389	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1390	int bmask;
1391
1392	/*
1393	 * Calculate the position of the superblock in 512byte sectors.
1394	 * It is always aligned to a 4K boundary and
1395	 * depeding on minor_version, it can be:
1396	 * 0: At least 8K, but less than 12K, from end of device
1397	 * 1: At start of device
1398	 * 2: 4K from start of device.
1399	 */
1400	switch(minor_version) {
1401	case 0:
1402		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1403		sb_start -= 8*2;
1404		sb_start &= ~(sector_t)(4*2-1);
1405		break;
1406	case 1:
1407		sb_start = 0;
1408		break;
1409	case 2:
1410		sb_start = 8;
1411		break;
1412	default:
1413		return -EINVAL;
1414	}
1415	rdev->sb_start = sb_start;
1416
1417	/* superblock is rarely larger than 1K, but it can be larger,
1418	 * and it is safe to read 4k, so we do that
1419	 */
1420	ret = read_disk_sb(rdev, 4096);
1421	if (ret) return ret;
1422
1423	sb = page_address(rdev->sb_page);
1424
1425	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1426	    sb->major_version != cpu_to_le32(1) ||
1427	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1428	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1429	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1430		return -EINVAL;
1431
1432	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1433		printk("md: invalid superblock checksum on %s\n",
1434			bdevname(rdev->bdev,b));
1435		return -EINVAL;
1436	}
1437	if (le64_to_cpu(sb->data_size) < 10) {
1438		printk("md: data_size too small on %s\n",
1439		       bdevname(rdev->bdev,b));
1440		return -EINVAL;
1441	}
1442	if (sb->pad0 ||
1443	    sb->pad3[0] ||
1444	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1445		/* Some padding is non-zero, might be a new feature */
1446		return -EINVAL;
1447
1448	rdev->preferred_minor = 0xffff;
1449	rdev->data_offset = le64_to_cpu(sb->data_offset);
1450	rdev->new_data_offset = rdev->data_offset;
1451	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1452	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1453		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1454	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1455
1456	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1457	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1458	if (rdev->sb_size & bmask)
1459		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1460
1461	if (minor_version
1462	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1463		return -EINVAL;
1464	if (minor_version
1465	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1466		return -EINVAL;
1467
1468	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1469		rdev->desc_nr = -1;
1470	else
1471		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1472
1473	if (!rdev->bb_page) {
1474		rdev->bb_page = alloc_page(GFP_KERNEL);
1475		if (!rdev->bb_page)
1476			return -ENOMEM;
1477	}
1478	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1479	    rdev->badblocks.count == 0) {
1480		/* need to load the bad block list.
1481		 * Currently we limit it to one page.
1482		 */
1483		s32 offset;
1484		sector_t bb_sector;
1485		u64 *bbp;
1486		int i;
1487		int sectors = le16_to_cpu(sb->bblog_size);
1488		if (sectors > (PAGE_SIZE / 512))
1489			return -EINVAL;
1490		offset = le32_to_cpu(sb->bblog_offset);
1491		if (offset == 0)
1492			return -EINVAL;
1493		bb_sector = (long long)offset;
1494		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1495				  rdev->bb_page, READ, true))
1496			return -EIO;
1497		bbp = (u64 *)page_address(rdev->bb_page);
1498		rdev->badblocks.shift = sb->bblog_shift;
1499		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1500			u64 bb = le64_to_cpu(*bbp);
1501			int count = bb & (0x3ff);
1502			u64 sector = bb >> 10;
1503			sector <<= sb->bblog_shift;
1504			count <<= sb->bblog_shift;
1505			if (bb + 1 == 0)
1506				break;
1507			if (md_set_badblocks(&rdev->badblocks,
1508					     sector, count, 1) == 0)
1509				return -EINVAL;
1510		}
1511	} else if (sb->bblog_offset != 0)
1512		rdev->badblocks.shift = 0;
1513
1514	if (!refdev) {
1515		ret = 1;
1516	} else {
1517		__u64 ev1, ev2;
1518		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1519
1520		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1521		    sb->level != refsb->level ||
1522		    sb->layout != refsb->layout ||
1523		    sb->chunksize != refsb->chunksize) {
1524			printk(KERN_WARNING "md: %s has strangely different"
1525				" superblock to %s\n",
1526				bdevname(rdev->bdev,b),
1527				bdevname(refdev->bdev,b2));
1528			return -EINVAL;
1529		}
1530		ev1 = le64_to_cpu(sb->events);
1531		ev2 = le64_to_cpu(refsb->events);
1532
1533		if (ev1 > ev2)
1534			ret = 1;
1535		else
1536			ret = 0;
1537	}
1538	if (minor_version) {
1539		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1540		sectors -= rdev->data_offset;
1541	} else
1542		sectors = rdev->sb_start;
1543	if (sectors < le64_to_cpu(sb->data_size))
1544		return -EINVAL;
1545	rdev->sectors = le64_to_cpu(sb->data_size);
1546	return ret;
1547}
1548
1549static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1550{
1551	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1552	__u64 ev1 = le64_to_cpu(sb->events);
1553
1554	rdev->raid_disk = -1;
1555	clear_bit(Faulty, &rdev->flags);
1556	clear_bit(In_sync, &rdev->flags);
1557	clear_bit(Bitmap_sync, &rdev->flags);
1558	clear_bit(WriteMostly, &rdev->flags);
1559
1560	if (mddev->raid_disks == 0) {
1561		mddev->major_version = 1;
1562		mddev->patch_version = 0;
1563		mddev->external = 0;
1564		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1565		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1566		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1567		mddev->level = le32_to_cpu(sb->level);
1568		mddev->clevel[0] = 0;
1569		mddev->layout = le32_to_cpu(sb->layout);
1570		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1571		mddev->dev_sectors = le64_to_cpu(sb->size);
1572		mddev->events = ev1;
1573		mddev->bitmap_info.offset = 0;
1574		mddev->bitmap_info.space = 0;
1575		/* Default location for bitmap is 1K after superblock
1576		 * using 3K - total of 4K
1577		 */
1578		mddev->bitmap_info.default_offset = 1024 >> 9;
1579		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1580		mddev->reshape_backwards = 0;
1581
1582		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1583		memcpy(mddev->uuid, sb->set_uuid, 16);
1584
1585		mddev->max_disks =  (4096-256)/2;
1586
1587		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1588		    mddev->bitmap_info.file == NULL) {
1589			mddev->bitmap_info.offset =
1590				(__s32)le32_to_cpu(sb->bitmap_offset);
1591			/* Metadata doesn't record how much space is available.
1592			 * For 1.0, we assume we can use up to the superblock
1593			 * if before, else to 4K beyond superblock.
1594			 * For others, assume no change is possible.
1595			 */
1596			if (mddev->minor_version > 0)
1597				mddev->bitmap_info.space = 0;
1598			else if (mddev->bitmap_info.offset > 0)
1599				mddev->bitmap_info.space =
1600					8 - mddev->bitmap_info.offset;
1601			else
1602				mddev->bitmap_info.space =
1603					-mddev->bitmap_info.offset;
1604		}
1605
1606		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1607			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1608			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1609			mddev->new_level = le32_to_cpu(sb->new_level);
1610			mddev->new_layout = le32_to_cpu(sb->new_layout);
1611			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1612			if (mddev->delta_disks < 0 ||
1613			    (mddev->delta_disks == 0 &&
1614			     (le32_to_cpu(sb->feature_map)
1615			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1616				mddev->reshape_backwards = 1;
1617		} else {
1618			mddev->reshape_position = MaxSector;
1619			mddev->delta_disks = 0;
1620			mddev->new_level = mddev->level;
1621			mddev->new_layout = mddev->layout;
1622			mddev->new_chunk_sectors = mddev->chunk_sectors;
1623		}
1624
1625	} else if (mddev->pers == NULL) {
1626		/* Insist of good event counter while assembling, except for
1627		 * spares (which don't need an event count) */
1628		++ev1;
1629		if (rdev->desc_nr >= 0 &&
1630		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1631		    le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1632			if (ev1 < mddev->events)
1633				return -EINVAL;
1634	} else if (mddev->bitmap) {
1635		/* If adding to array with a bitmap, then we can accept an
1636		 * older device, but not too old.
1637		 */
1638		if (ev1 < mddev->bitmap->events_cleared)
1639			return 0;
1640		if (ev1 < mddev->events)
1641			set_bit(Bitmap_sync, &rdev->flags);
1642	} else {
1643		if (ev1 < mddev->events)
1644			/* just a hot-add of a new device, leave raid_disk at -1 */
1645			return 0;
1646	}
1647	if (mddev->level != LEVEL_MULTIPATH) {
1648		int role;
1649		if (rdev->desc_nr < 0 ||
1650		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1651			role = 0xffff;
1652			rdev->desc_nr = -1;
1653		} else
1654			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1655		switch(role) {
1656		case 0xffff: /* spare */
1657			break;
1658		case 0xfffe: /* faulty */
1659			set_bit(Faulty, &rdev->flags);
1660			break;
1661		default:
1662			rdev->saved_raid_disk = role;
1663			if ((le32_to_cpu(sb->feature_map) &
1664			     MD_FEATURE_RECOVERY_OFFSET)) {
1665				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1666				if (!(le32_to_cpu(sb->feature_map) &
1667				      MD_FEATURE_RECOVERY_BITMAP))
1668					rdev->saved_raid_disk = -1;
1669			} else
1670				set_bit(In_sync, &rdev->flags);
1671			rdev->raid_disk = role;
1672			break;
1673		}
1674		if (sb->devflags & WriteMostly1)
1675			set_bit(WriteMostly, &rdev->flags);
1676		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1677			set_bit(Replacement, &rdev->flags);
1678	} else /* MULTIPATH are always insync */
1679		set_bit(In_sync, &rdev->flags);
1680
1681	return 0;
1682}
1683
1684static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1685{
1686	struct mdp_superblock_1 *sb;
1687	struct md_rdev *rdev2;
1688	int max_dev, i;
1689	/* make rdev->sb match mddev and rdev data. */
1690
1691	sb = page_address(rdev->sb_page);
1692
1693	sb->feature_map = 0;
1694	sb->pad0 = 0;
1695	sb->recovery_offset = cpu_to_le64(0);
1696	memset(sb->pad3, 0, sizeof(sb->pad3));
1697
1698	sb->utime = cpu_to_le64((__u64)mddev->utime);
1699	sb->events = cpu_to_le64(mddev->events);
1700	if (mddev->in_sync)
1701		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1702	else
1703		sb->resync_offset = cpu_to_le64(0);
1704
1705	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1706
1707	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1708	sb->size = cpu_to_le64(mddev->dev_sectors);
1709	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1710	sb->level = cpu_to_le32(mddev->level);
1711	sb->layout = cpu_to_le32(mddev->layout);
1712
1713	if (test_bit(WriteMostly, &rdev->flags))
1714		sb->devflags |= WriteMostly1;
1715	else
1716		sb->devflags &= ~WriteMostly1;
1717	sb->data_offset = cpu_to_le64(rdev->data_offset);
1718	sb->data_size = cpu_to_le64(rdev->sectors);
1719
1720	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1721		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1722		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1723	}
1724
1725	if (rdev->raid_disk >= 0 &&
1726	    !test_bit(In_sync, &rdev->flags)) {
1727		sb->feature_map |=
1728			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1729		sb->recovery_offset =
1730			cpu_to_le64(rdev->recovery_offset);
1731		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1732			sb->feature_map |=
1733				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1734	}
1735	if (test_bit(Replacement, &rdev->flags))
1736		sb->feature_map |=
1737			cpu_to_le32(MD_FEATURE_REPLACEMENT);
1738
1739	if (mddev->reshape_position != MaxSector) {
1740		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1741		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1742		sb->new_layout = cpu_to_le32(mddev->new_layout);
1743		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1744		sb->new_level = cpu_to_le32(mddev->new_level);
1745		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1746		if (mddev->delta_disks == 0 &&
1747		    mddev->reshape_backwards)
1748			sb->feature_map
1749				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1750		if (rdev->new_data_offset != rdev->data_offset) {
1751			sb->feature_map
1752				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1753			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1754							     - rdev->data_offset));
1755		}
1756	}
1757
1758	if (rdev->badblocks.count == 0)
1759		/* Nothing to do for bad blocks*/ ;
1760	else if (sb->bblog_offset == 0)
1761		/* Cannot record bad blocks on this device */
1762		md_error(mddev, rdev);
1763	else {
1764		struct badblocks *bb = &rdev->badblocks;
1765		u64 *bbp = (u64 *)page_address(rdev->bb_page);
1766		u64 *p = bb->page;
1767		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1768		if (bb->changed) {
1769			unsigned seq;
1770
1771retry:
1772			seq = read_seqbegin(&bb->lock);
1773
1774			memset(bbp, 0xff, PAGE_SIZE);
1775
1776			for (i = 0 ; i < bb->count ; i++) {
1777				u64 internal_bb = p[i];
1778				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1779						| BB_LEN(internal_bb));
1780				bbp[i] = cpu_to_le64(store_bb);
1781			}
1782			bb->changed = 0;
1783			if (read_seqretry(&bb->lock, seq))
1784				goto retry;
1785
1786			bb->sector = (rdev->sb_start +
1787				      (int)le32_to_cpu(sb->bblog_offset));
1788			bb->size = le16_to_cpu(sb->bblog_size);
1789		}
1790	}
1791
1792	max_dev = 0;
1793	rdev_for_each(rdev2, mddev)
1794		if (rdev2->desc_nr+1 > max_dev)
1795			max_dev = rdev2->desc_nr+1;
1796
1797	if (max_dev > le32_to_cpu(sb->max_dev)) {
1798		int bmask;
1799		sb->max_dev = cpu_to_le32(max_dev);
1800		rdev->sb_size = max_dev * 2 + 256;
1801		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1802		if (rdev->sb_size & bmask)
1803			rdev->sb_size = (rdev->sb_size | bmask) + 1;
1804	} else
1805		max_dev = le32_to_cpu(sb->max_dev);
1806
1807	for (i=0; i<max_dev;i++)
1808		sb->dev_roles[i] = cpu_to_le16(0xfffe);
1809
1810	rdev_for_each(rdev2, mddev) {
1811		i = rdev2->desc_nr;
1812		if (test_bit(Faulty, &rdev2->flags))
1813			sb->dev_roles[i] = cpu_to_le16(0xfffe);
1814		else if (test_bit(In_sync, &rdev2->flags))
1815			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1816		else if (rdev2->raid_disk >= 0)
1817			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1818		else
1819			sb->dev_roles[i] = cpu_to_le16(0xffff);
1820	}
1821
1822	sb->sb_csum = calc_sb_1_csum(sb);
1823}
1824
1825static unsigned long long
1826super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1827{
1828	struct mdp_superblock_1 *sb;
1829	sector_t max_sectors;
1830	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1831		return 0; /* component must fit device */
1832	if (rdev->data_offset != rdev->new_data_offset)
1833		return 0; /* too confusing */
1834	if (rdev->sb_start < rdev->data_offset) {
1835		/* minor versions 1 and 2; superblock before data */
1836		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1837		max_sectors -= rdev->data_offset;
1838		if (!num_sectors || num_sectors > max_sectors)
1839			num_sectors = max_sectors;
1840	} else if (rdev->mddev->bitmap_info.offset) {
1841		/* minor version 0 with bitmap we can't move */
1842		return 0;
1843	} else {
1844		/* minor version 0; superblock after data */
1845		sector_t sb_start;
1846		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1847		sb_start &= ~(sector_t)(4*2 - 1);
1848		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1849		if (!num_sectors || num_sectors > max_sectors)
1850			num_sectors = max_sectors;
1851		rdev->sb_start = sb_start;
1852	}
1853	sb = page_address(rdev->sb_page);
1854	sb->data_size = cpu_to_le64(num_sectors);
1855	sb->super_offset = rdev->sb_start;
1856	sb->sb_csum = calc_sb_1_csum(sb);
1857	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1858		       rdev->sb_page);
1859	md_super_wait(rdev->mddev);
1860	return num_sectors;
1861
1862}
1863
1864static int
1865super_1_allow_new_offset(struct md_rdev *rdev,
1866			 unsigned long long new_offset)
1867{
1868	/* All necessary checks on new >= old have been done */
1869	struct bitmap *bitmap;
1870	if (new_offset >= rdev->data_offset)
1871		return 1;
1872
1873	/* with 1.0 metadata, there is no metadata to tread on
1874	 * so we can always move back */
1875	if (rdev->mddev->minor_version == 0)
1876		return 1;
1877
1878	/* otherwise we must be sure not to step on
1879	 * any metadata, so stay:
1880	 * 36K beyond start of superblock
1881	 * beyond end of badblocks
1882	 * beyond write-intent bitmap
1883	 */
1884	if (rdev->sb_start + (32+4)*2 > new_offset)
1885		return 0;
1886	bitmap = rdev->mddev->bitmap;
1887	if (bitmap && !rdev->mddev->bitmap_info.file &&
1888	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
1889	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1890		return 0;
1891	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1892		return 0;
1893
1894	return 1;
1895}
1896
1897static struct super_type super_types[] = {
1898	[0] = {
1899		.name	= "0.90.0",
1900		.owner	= THIS_MODULE,
1901		.load_super	    = super_90_load,
1902		.validate_super	    = super_90_validate,
1903		.sync_super	    = super_90_sync,
1904		.rdev_size_change   = super_90_rdev_size_change,
1905		.allow_new_offset   = super_90_allow_new_offset,
1906	},
1907	[1] = {
1908		.name	= "md-1",
1909		.owner	= THIS_MODULE,
1910		.load_super	    = super_1_load,
1911		.validate_super	    = super_1_validate,
1912		.sync_super	    = super_1_sync,
1913		.rdev_size_change   = super_1_rdev_size_change,
1914		.allow_new_offset   = super_1_allow_new_offset,
1915	},
1916};
1917
1918static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1919{
1920	if (mddev->sync_super) {
1921		mddev->sync_super(mddev, rdev);
1922		return;
1923	}
1924
1925	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1926
1927	super_types[mddev->major_version].sync_super(mddev, rdev);
1928}
1929
1930static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1931{
1932	struct md_rdev *rdev, *rdev2;
1933
1934	rcu_read_lock();
1935	rdev_for_each_rcu(rdev, mddev1)
1936		rdev_for_each_rcu(rdev2, mddev2)
1937			if (rdev->bdev->bd_contains ==
1938			    rdev2->bdev->bd_contains) {
1939				rcu_read_unlock();
1940				return 1;
1941			}
1942	rcu_read_unlock();
1943	return 0;
1944}
1945
1946static LIST_HEAD(pending_raid_disks);
1947
1948/*
1949 * Try to register data integrity profile for an mddev
1950 *
1951 * This is called when an array is started and after a disk has been kicked
1952 * from the array. It only succeeds if all working and active component devices
1953 * are integrity capable with matching profiles.
1954 */
1955int md_integrity_register(struct mddev *mddev)
1956{
1957	struct md_rdev *rdev, *reference = NULL;
1958
1959	if (list_empty(&mddev->disks))
1960		return 0; /* nothing to do */
1961	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1962		return 0; /* shouldn't register, or already is */
1963	rdev_for_each(rdev, mddev) {
1964		/* skip spares and non-functional disks */
1965		if (test_bit(Faulty, &rdev->flags))
1966			continue;
1967		if (rdev->raid_disk < 0)
1968			continue;
1969		if (!reference) {
1970			/* Use the first rdev as the reference */
1971			reference = rdev;
1972			continue;
1973		}
1974		/* does this rdev's profile match the reference profile? */
1975		if (blk_integrity_compare(reference->bdev->bd_disk,
1976				rdev->bdev->bd_disk) < 0)
1977			return -EINVAL;
1978	}
1979	if (!reference || !bdev_get_integrity(reference->bdev))
1980		return 0;
1981	/*
1982	 * All component devices are integrity capable and have matching
1983	 * profiles, register the common profile for the md device.
1984	 */
1985	if (blk_integrity_register(mddev->gendisk,
1986			bdev_get_integrity(reference->bdev)) != 0) {
1987		printk(KERN_ERR "md: failed to register integrity for %s\n",
1988			mdname(mddev));
1989		return -EINVAL;
1990	}
1991	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
1992	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
1993		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
1994		       mdname(mddev));
1995		return -EINVAL;
1996	}
1997	return 0;
1998}
1999EXPORT_SYMBOL(md_integrity_register);
2000
2001/* Disable data integrity if non-capable/non-matching disk is being added */
2002void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2003{
2004	struct blk_integrity *bi_rdev;
2005	struct blk_integrity *bi_mddev;
2006
2007	if (!mddev->gendisk)
2008		return;
2009
2010	bi_rdev = bdev_get_integrity(rdev->bdev);
2011	bi_mddev = blk_get_integrity(mddev->gendisk);
2012
2013	if (!bi_mddev) /* nothing to do */
2014		return;
2015	if (rdev->raid_disk < 0) /* skip spares */
2016		return;
2017	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
2018					     rdev->bdev->bd_disk) >= 0)
2019		return;
2020	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
2021	blk_integrity_unregister(mddev->gendisk);
2022}
2023EXPORT_SYMBOL(md_integrity_add_rdev);
2024
2025static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2026{
2027	char b[BDEVNAME_SIZE];
2028	struct kobject *ko;
2029	char *s;
2030	int err;
2031
2032	/* prevent duplicates */
2033	if (find_rdev(mddev, rdev->bdev->bd_dev))
2034		return -EEXIST;
2035
2036	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2037	if (rdev->sectors && (mddev->dev_sectors == 0 ||
2038			rdev->sectors < mddev->dev_sectors)) {
2039		if (mddev->pers) {
2040			/* Cannot change size, so fail
2041			 * If mddev->level <= 0, then we don't care
2042			 * about aligning sizes (e.g. linear)
2043			 */
2044			if (mddev->level > 0)
2045				return -ENOSPC;
2046		} else
2047			mddev->dev_sectors = rdev->sectors;
2048	}
2049
2050	/* Verify rdev->desc_nr is unique.
2051	 * If it is -1, assign a free number, else
2052	 * check number is not in use
2053	 */
2054	rcu_read_lock();
2055	if (rdev->desc_nr < 0) {
2056		int choice = 0;
2057		if (mddev->pers)
2058			choice = mddev->raid_disks;
2059		while (md_find_rdev_nr_rcu(mddev, choice))
2060			choice++;
2061		rdev->desc_nr = choice;
2062	} else {
2063		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2064			rcu_read_unlock();
2065			return -EBUSY;
2066		}
2067	}
2068	rcu_read_unlock();
2069	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2070		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
2071		       mdname(mddev), mddev->max_disks);
2072		return -EBUSY;
2073	}
2074	bdevname(rdev->bdev,b);
2075	while ( (s=strchr(b, '/')) != NULL)
2076		*s = '!';
2077
2078	rdev->mddev = mddev;
2079	printk(KERN_INFO "md: bind<%s>\n", b);
2080
2081	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2082		goto fail;
2083
2084	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2085	if (sysfs_create_link(&rdev->kobj, ko, "block"))
2086		/* failure here is OK */;
2087	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2088
2089	list_add_rcu(&rdev->same_set, &mddev->disks);
2090	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2091
2092	/* May as well allow recovery to be retried once */
2093	mddev->recovery_disabled++;
2094
2095	return 0;
2096
2097 fail:
2098	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
2099	       b, mdname(mddev));
2100	return err;
2101}
2102
2103static void md_delayed_delete(struct work_struct *ws)
2104{
2105	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2106	kobject_del(&rdev->kobj);
2107	kobject_put(&rdev->kobj);
2108}
2109
2110static void unbind_rdev_from_array(struct md_rdev *rdev)
2111{
2112	char b[BDEVNAME_SIZE];
2113
2114	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2115	list_del_rcu(&rdev->same_set);
2116	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
2117	rdev->mddev = NULL;
2118	sysfs_remove_link(&rdev->kobj, "block");
2119	sysfs_put(rdev->sysfs_state);
2120	rdev->sysfs_state = NULL;
2121	rdev->badblocks.count = 0;
2122	/* We need to delay this, otherwise we can deadlock when
2123	 * writing to 'remove' to "dev/state".  We also need
2124	 * to delay it due to rcu usage.
2125	 */
2126	synchronize_rcu();
2127	INIT_WORK(&rdev->del_work, md_delayed_delete);
2128	kobject_get(&rdev->kobj);
2129	queue_work(md_misc_wq, &rdev->del_work);
2130}
2131
2132/*
2133 * prevent the device from being mounted, repartitioned or
2134 * otherwise reused by a RAID array (or any other kernel
2135 * subsystem), by bd_claiming the device.
2136 */
2137static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2138{
2139	int err = 0;
2140	struct block_device *bdev;
2141	char b[BDEVNAME_SIZE];
2142
2143	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2144				 shared ? (struct md_rdev *)lock_rdev : rdev);
2145	if (IS_ERR(bdev)) {
2146		printk(KERN_ERR "md: could not open %s.\n",
2147			__bdevname(dev, b));
2148		return PTR_ERR(bdev);
2149	}
2150	rdev->bdev = bdev;
2151	return err;
2152}
2153
2154static void unlock_rdev(struct md_rdev *rdev)
2155{
2156	struct block_device *bdev = rdev->bdev;
2157	rdev->bdev = NULL;
2158	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2159}
2160
2161void md_autodetect_dev(dev_t dev);
2162
2163static void export_rdev(struct md_rdev *rdev)
2164{
2165	char b[BDEVNAME_SIZE];
2166
2167	printk(KERN_INFO "md: export_rdev(%s)\n",
2168		bdevname(rdev->bdev,b));
2169	md_rdev_clear(rdev);
2170#ifndef MODULE
2171	if (test_bit(AutoDetected, &rdev->flags))
2172		md_autodetect_dev(rdev->bdev->bd_dev);
2173#endif
2174	unlock_rdev(rdev);
2175	kobject_put(&rdev->kobj);
2176}
2177
2178void md_kick_rdev_from_array(struct md_rdev *rdev)
2179{
2180	unbind_rdev_from_array(rdev);
2181	export_rdev(rdev);
2182}
2183EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2184
2185static void export_array(struct mddev *mddev)
2186{
2187	struct md_rdev *rdev;
2188
2189	while (!list_empty(&mddev->disks)) {
2190		rdev = list_first_entry(&mddev->disks, struct md_rdev,
2191					same_set);
2192		md_kick_rdev_from_array(rdev);
2193	}
2194	mddev->raid_disks = 0;
2195	mddev->major_version = 0;
2196}
2197
2198static void sync_sbs(struct mddev *mddev, int nospares)
2199{
2200	/* Update each superblock (in-memory image), but
2201	 * if we are allowed to, skip spares which already
2202	 * have the right event counter, or have one earlier
2203	 * (which would mean they aren't being marked as dirty
2204	 * with the rest of the array)
2205	 */
2206	struct md_rdev *rdev;
2207	rdev_for_each(rdev, mddev) {
2208		if (rdev->sb_events == mddev->events ||
2209		    (nospares &&
2210		     rdev->raid_disk < 0 &&
2211		     rdev->sb_events+1 == mddev->events)) {
2212			/* Don't update this superblock */
2213			rdev->sb_loaded = 2;
2214		} else {
2215			sync_super(mddev, rdev);
2216			rdev->sb_loaded = 1;
2217		}
2218	}
2219}
2220
2221void md_update_sb(struct mddev *mddev, int force_change)
2222{
2223	struct md_rdev *rdev;
2224	int sync_req;
2225	int nospares = 0;
2226	int any_badblocks_changed = 0;
2227
2228	if (mddev->ro) {
2229		if (force_change)
2230			set_bit(MD_CHANGE_DEVS, &mddev->flags);
2231		return;
2232	}
2233repeat:
2234	/* First make sure individual recovery_offsets are correct */
2235	rdev_for_each(rdev, mddev) {
2236		if (rdev->raid_disk >= 0 &&
2237		    mddev->delta_disks >= 0 &&
2238		    !test_bit(In_sync, &rdev->flags) &&
2239		    mddev->curr_resync_completed > rdev->recovery_offset)
2240				rdev->recovery_offset = mddev->curr_resync_completed;
2241
2242	}
2243	if (!mddev->persistent) {
2244		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2245		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2246		if (!mddev->external) {
2247			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2248			rdev_for_each(rdev, mddev) {
2249				if (rdev->badblocks.changed) {
2250					rdev->badblocks.changed = 0;
2251					md_ack_all_badblocks(&rdev->badblocks);
2252					md_error(mddev, rdev);
2253				}
2254				clear_bit(Blocked, &rdev->flags);
2255				clear_bit(BlockedBadBlocks, &rdev->flags);
2256				wake_up(&rdev->blocked_wait);
2257			}
2258		}
2259		wake_up(&mddev->sb_wait);
2260		return;
2261	}
2262
2263	spin_lock(&mddev->lock);
2264
2265	mddev->utime = get_seconds();
2266
2267	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2268		force_change = 1;
2269	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2270		/* just a clean<-> dirty transition, possibly leave spares alone,
2271		 * though if events isn't the right even/odd, we will have to do
2272		 * spares after all
2273		 */
2274		nospares = 1;
2275	if (force_change)
2276		nospares = 0;
2277	if (mddev->degraded)
2278		/* If the array is degraded, then skipping spares is both
2279		 * dangerous and fairly pointless.
2280		 * Dangerous because a device that was removed from the array
2281		 * might have a event_count that still looks up-to-date,
2282		 * so it can be re-added without a resync.
2283		 * Pointless because if there are any spares to skip,
2284		 * then a recovery will happen and soon that array won't
2285		 * be degraded any more and the spare can go back to sleep then.
2286		 */
2287		nospares = 0;
2288
2289	sync_req = mddev->in_sync;
2290
2291	/* If this is just a dirty<->clean transition, and the array is clean
2292	 * and 'events' is odd, we can roll back to the previous clean state */
2293	if (nospares
2294	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2295	    && mddev->can_decrease_events
2296	    && mddev->events != 1) {
2297		mddev->events--;
2298		mddev->can_decrease_events = 0;
2299	} else {
2300		/* otherwise we have to go forward and ... */
2301		mddev->events ++;
2302		mddev->can_decrease_events = nospares;
2303	}
2304
2305	/*
2306	 * This 64-bit counter should never wrap.
2307	 * Either we are in around ~1 trillion A.C., assuming
2308	 * 1 reboot per second, or we have a bug...
2309	 */
2310	WARN_ON(mddev->events == 0);
2311
2312	rdev_for_each(rdev, mddev) {
2313		if (rdev->badblocks.changed)
2314			any_badblocks_changed++;
2315		if (test_bit(Faulty, &rdev->flags))
2316			set_bit(FaultRecorded, &rdev->flags);
2317	}
2318
2319	sync_sbs(mddev, nospares);
2320	spin_unlock(&mddev->lock);
2321
2322	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2323		 mdname(mddev), mddev->in_sync);
2324
2325	bitmap_update_sb(mddev->bitmap);
2326	rdev_for_each(rdev, mddev) {
2327		char b[BDEVNAME_SIZE];
2328
2329		if (rdev->sb_loaded != 1)
2330			continue; /* no noise on spare devices */
2331
2332		if (!test_bit(Faulty, &rdev->flags)) {
2333			md_super_write(mddev,rdev,
2334				       rdev->sb_start, rdev->sb_size,
2335				       rdev->sb_page);
2336			pr_debug("md: (write) %s's sb offset: %llu\n",
2337				 bdevname(rdev->bdev, b),
2338				 (unsigned long long)rdev->sb_start);
2339			rdev->sb_events = mddev->events;
2340			if (rdev->badblocks.size) {
2341				md_super_write(mddev, rdev,
2342					       rdev->badblocks.sector,
2343					       rdev->badblocks.size << 9,
2344					       rdev->bb_page);
2345				rdev->badblocks.size = 0;
2346			}
2347
2348		} else
2349			pr_debug("md: %s (skipping faulty)\n",
2350				 bdevname(rdev->bdev, b));
2351
2352		if (mddev->level == LEVEL_MULTIPATH)
2353			/* only need to write one superblock... */
2354			break;
2355	}
2356	md_super_wait(mddev);
2357	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2358
2359	spin_lock(&mddev->lock);
2360	if (mddev->in_sync != sync_req ||
2361	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2362		/* have to write it out again */
2363		spin_unlock(&mddev->lock);
2364		goto repeat;
2365	}
2366	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2367	spin_unlock(&mddev->lock);
2368	wake_up(&mddev->sb_wait);
2369	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2370		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2371
2372	rdev_for_each(rdev, mddev) {
2373		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2374			clear_bit(Blocked, &rdev->flags);
2375
2376		if (any_badblocks_changed)
2377			md_ack_all_badblocks(&rdev->badblocks);
2378		clear_bit(BlockedBadBlocks, &rdev->flags);
2379		wake_up(&rdev->blocked_wait);
2380	}
2381}
2382EXPORT_SYMBOL(md_update_sb);
2383
2384static int add_bound_rdev(struct md_rdev *rdev)
2385{
2386	struct mddev *mddev = rdev->mddev;
2387	int err = 0;
2388
2389	if (!mddev->pers->hot_remove_disk) {
2390		/* If there is hot_add_disk but no hot_remove_disk
2391		 * then added disks for geometry changes,
2392		 * and should be added immediately.
2393		 */
2394		super_types[mddev->major_version].
2395			validate_super(mddev, rdev);
2396		err = mddev->pers->hot_add_disk(mddev, rdev);
2397		if (err) {
2398			unbind_rdev_from_array(rdev);
2399			export_rdev(rdev);
2400			return err;
2401		}
2402	}
2403	sysfs_notify_dirent_safe(rdev->sysfs_state);
2404
2405	set_bit(MD_CHANGE_DEVS, &mddev->flags);
2406	if (mddev->degraded)
2407		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2408	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2409	md_new_event(mddev);
2410	md_wakeup_thread(mddev->thread);
2411	return 0;
2412}
2413
2414/* words written to sysfs files may, or may not, be \n terminated.
2415 * We want to accept with case. For this we use cmd_match.
2416 */
2417static int cmd_match(const char *cmd, const char *str)
2418{
2419	/* See if cmd, written into a sysfs file, matches
2420	 * str.  They must either be the same, or cmd can
2421	 * have a trailing newline
2422	 */
2423	while (*cmd && *str && *cmd == *str) {
2424		cmd++;
2425		str++;
2426	}
2427	if (*cmd == '\n')
2428		cmd++;
2429	if (*str || *cmd)
2430		return 0;
2431	return 1;
2432}
2433
2434struct rdev_sysfs_entry {
2435	struct attribute attr;
2436	ssize_t (*show)(struct md_rdev *, char *);
2437	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2438};
2439
2440static ssize_t
2441state_show(struct md_rdev *rdev, char *page)
2442{
2443	char *sep = "";
2444	size_t len = 0;
2445	unsigned long flags = ACCESS_ONCE(rdev->flags);
2446
2447	if (test_bit(Faulty, &flags) ||
2448	    rdev->badblocks.unacked_exist) {
2449		len+= sprintf(page+len, "%sfaulty",sep);
2450		sep = ",";
2451	}
2452	if (test_bit(In_sync, &flags)) {
2453		len += sprintf(page+len, "%sin_sync",sep);
2454		sep = ",";
2455	}
2456	if (test_bit(WriteMostly, &flags)) {
2457		len += sprintf(page+len, "%swrite_mostly",sep);
2458		sep = ",";
2459	}
2460	if (test_bit(Blocked, &flags) ||
2461	    (rdev->badblocks.unacked_exist
2462	     && !test_bit(Faulty, &flags))) {
2463		len += sprintf(page+len, "%sblocked", sep);
2464		sep = ",";
2465	}
2466	if (!test_bit(Faulty, &flags) &&
2467	    !test_bit(In_sync, &flags)) {
2468		len += sprintf(page+len, "%sspare", sep);
2469		sep = ",";
2470	}
2471	if (test_bit(WriteErrorSeen, &flags)) {
2472		len += sprintf(page+len, "%swrite_error", sep);
2473		sep = ",";
2474	}
2475	if (test_bit(WantReplacement, &flags)) {
2476		len += sprintf(page+len, "%swant_replacement", sep);
2477		sep = ",";
2478	}
2479	if (test_bit(Replacement, &flags)) {
2480		len += sprintf(page+len, "%sreplacement", sep);
2481		sep = ",";
2482	}
2483
2484	return len+sprintf(page+len, "\n");
2485}
2486
2487static ssize_t
2488state_store(struct md_rdev *rdev, const char *buf, size_t len)
2489{
2490	/* can write
2491	 *  faulty  - simulates an error
2492	 *  remove  - disconnects the device
2493	 *  writemostly - sets write_mostly
2494	 *  -writemostly - clears write_mostly
2495	 *  blocked - sets the Blocked flags
2496	 *  -blocked - clears the Blocked and possibly simulates an error
2497	 *  insync - sets Insync providing device isn't active
2498	 *  -insync - clear Insync for a device with a slot assigned,
2499	 *            so that it gets rebuilt based on bitmap
2500	 *  write_error - sets WriteErrorSeen
2501	 *  -write_error - clears WriteErrorSeen
2502	 */
2503	int err = -EINVAL;
2504	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2505		md_error(rdev->mddev, rdev);
2506		if (test_bit(Faulty, &rdev->flags))
2507			err = 0;
2508		else
2509			err = -EBUSY;
2510	} else if (cmd_match(buf, "remove")) {
2511		if (rdev->raid_disk >= 0)
2512			err = -EBUSY;
2513		else {
2514			struct mddev *mddev = rdev->mddev;
2515			if (mddev_is_clustered(mddev))
2516				md_cluster_ops->remove_disk(mddev, rdev);
2517			md_kick_rdev_from_array(rdev);
2518			if (mddev_is_clustered(mddev))
2519				md_cluster_ops->metadata_update_start(mddev);
2520			if (mddev->pers)
2521				md_update_sb(mddev, 1);
2522			md_new_event(mddev);
2523			if (mddev_is_clustered(mddev))
2524				md_cluster_ops->metadata_update_finish(mddev);
2525			err = 0;
2526		}
2527	} else if (cmd_match(buf, "writemostly")) {
2528		set_bit(WriteMostly, &rdev->flags);
2529		err = 0;
2530	} else if (cmd_match(buf, "-writemostly")) {
2531		clear_bit(WriteMostly, &rdev->flags);
2532		err = 0;
2533	} else if (cmd_match(buf, "blocked")) {
2534		set_bit(Blocked, &rdev->flags);
2535		err = 0;
2536	} else if (cmd_match(buf, "-blocked")) {
2537		if (!test_bit(Faulty, &rdev->flags) &&
2538		    rdev->badblocks.unacked_exist) {
2539			/* metadata handler doesn't understand badblocks,
2540			 * so we need to fail the device
2541			 */
2542			md_error(rdev->mddev, rdev);
2543		}
2544		clear_bit(Blocked, &rdev->flags);
2545		clear_bit(BlockedBadBlocks, &rdev->flags);
2546		wake_up(&rdev->blocked_wait);
2547		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2548		md_wakeup_thread(rdev->mddev->thread);
2549
2550		err = 0;
2551	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2552		set_bit(In_sync, &rdev->flags);
2553		err = 0;
2554	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
2555		if (rdev->mddev->pers == NULL) {
2556			clear_bit(In_sync, &rdev->flags);
2557			rdev->saved_raid_disk = rdev->raid_disk;
2558			rdev->raid_disk = -1;
2559			err = 0;
2560		}
2561	} else if (cmd_match(buf, "write_error")) {
2562		set_bit(WriteErrorSeen, &rdev->flags);
2563		err = 0;
2564	} else if (cmd_match(buf, "-write_error")) {
2565		clear_bit(WriteErrorSeen, &rdev->flags);
2566		err = 0;
2567	} else if (cmd_match(buf, "want_replacement")) {
2568		/* Any non-spare device that is not a replacement can
2569		 * become want_replacement at any time, but we then need to
2570		 * check if recovery is needed.
2571		 */
2572		if (rdev->raid_disk >= 0 &&
2573		    !test_bit(Replacement, &rdev->flags))
2574			set_bit(WantReplacement, &rdev->flags);
2575		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2576		md_wakeup_thread(rdev->mddev->thread);
2577		err = 0;
2578	} else if (cmd_match(buf, "-want_replacement")) {
2579		/* Clearing 'want_replacement' is always allowed.
2580		 * Once replacements starts it is too late though.
2581		 */
2582		err = 0;
2583		clear_bit(WantReplacement, &rdev->flags);
2584	} else if (cmd_match(buf, "replacement")) {
2585		/* Can only set a device as a replacement when array has not
2586		 * yet been started.  Once running, replacement is automatic
2587		 * from spares, or by assigning 'slot'.
2588		 */
2589		if (rdev->mddev->pers)
2590			err = -EBUSY;
2591		else {
2592			set_bit(Replacement, &rdev->flags);
2593			err = 0;
2594		}
2595	} else if (cmd_match(buf, "-replacement")) {
2596		/* Similarly, can only clear Replacement before start */
2597		if (rdev->mddev->pers)
2598			err = -EBUSY;
2599		else {
2600			clear_bit(Replacement, &rdev->flags);
2601			err = 0;
2602		}
2603	} else if (cmd_match(buf, "re-add")) {
2604		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
2605			/* clear_bit is performed _after_ all the devices
2606			 * have their local Faulty bit cleared. If any writes
2607			 * happen in the meantime in the local node, they
2608			 * will land in the local bitmap, which will be synced
2609			 * by this node eventually
2610			 */
2611			if (!mddev_is_clustered(rdev->mddev) ||
2612			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2613				clear_bit(Faulty, &rdev->flags);
2614				err = add_bound_rdev(rdev);
2615			}
2616		} else
2617			err = -EBUSY;
2618	}
2619	if (!err)
2620		sysfs_notify_dirent_safe(rdev->sysfs_state);
2621	return err ? err : len;
2622}
2623static struct rdev_sysfs_entry rdev_state =
2624__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2625
2626static ssize_t
2627errors_show(struct md_rdev *rdev, char *page)
2628{
2629	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2630}
2631
2632static ssize_t
2633errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2634{
2635	char *e;
2636	unsigned long n = simple_strtoul(buf, &e, 10);
2637	if (*buf && (*e == 0 || *e == '\n')) {
2638		atomic_set(&rdev->corrected_errors, n);
2639		return len;
2640	}
2641	return -EINVAL;
2642}
2643static struct rdev_sysfs_entry rdev_errors =
2644__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2645
2646static ssize_t
2647slot_show(struct md_rdev *rdev, char *page)
2648{
2649	if (rdev->raid_disk < 0)
2650		return sprintf(page, "none\n");
2651	else
2652		return sprintf(page, "%d\n", rdev->raid_disk);
2653}
2654
2655static ssize_t
2656slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2657{
2658	char *e;
2659	int err;
2660	int slot = simple_strtoul(buf, &e, 10);
2661	if (strncmp(buf, "none", 4)==0)
2662		slot = -1;
2663	else if (e==buf || (*e && *e!= '\n'))
2664		return -EINVAL;
2665	if (rdev->mddev->pers && slot == -1) {
2666		/* Setting 'slot' on an active array requires also
2667		 * updating the 'rd%d' link, and communicating
2668		 * with the personality with ->hot_*_disk.
2669		 * For now we only support removing
2670		 * failed/spare devices.  This normally happens automatically,
2671		 * but not when the metadata is externally managed.
2672		 */
2673		if (rdev->raid_disk == -1)
2674			return -EEXIST;
2675		/* personality does all needed checks */
2676		if (rdev->mddev->pers->hot_remove_disk == NULL)
2677			return -EINVAL;
2678		clear_bit(Blocked, &rdev->flags);
2679		remove_and_add_spares(rdev->mddev, rdev);
2680		if (rdev->raid_disk >= 0)
2681			return -EBUSY;
2682		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2683		md_wakeup_thread(rdev->mddev->thread);
2684	} else if (rdev->mddev->pers) {
2685		/* Activating a spare .. or possibly reactivating
2686		 * if we ever get bitmaps working here.
2687		 */
2688
2689		if (rdev->raid_disk != -1)
2690			return -EBUSY;
2691
2692		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2693			return -EBUSY;
2694
2695		if (rdev->mddev->pers->hot_add_disk == NULL)
2696			return -EINVAL;
2697
2698		if (slot >= rdev->mddev->raid_disks &&
2699		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2700			return -ENOSPC;
2701
2702		rdev->raid_disk = slot;
2703		if (test_bit(In_sync, &rdev->flags))
2704			rdev->saved_raid_disk = slot;
2705		else
2706			rdev->saved_raid_disk = -1;
2707		clear_bit(In_sync, &rdev->flags);
2708		clear_bit(Bitmap_sync, &rdev->flags);
2709		err = rdev->mddev->pers->
2710			hot_add_disk(rdev->mddev, rdev);
2711		if (err) {
2712			rdev->raid_disk = -1;
2713			return err;
2714		} else
2715			sysfs_notify_dirent_safe(rdev->sysfs_state);
2716		if (sysfs_link_rdev(rdev->mddev, rdev))
2717			/* failure here is OK */;
2718		/* don't wakeup anyone, leave that to userspace. */
2719	} else {
2720		if (slot >= rdev->mddev->raid_disks &&
2721		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2722			return -ENOSPC;
2723		rdev->raid_disk = slot;
2724		/* assume it is working */
2725		clear_bit(Faulty, &rdev->flags);
2726		clear_bit(WriteMostly, &rdev->flags);
2727		set_bit(In_sync, &rdev->flags);
2728		sysfs_notify_dirent_safe(rdev->sysfs_state);
2729	}
2730	return len;
2731}
2732
2733static struct rdev_sysfs_entry rdev_slot =
2734__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2735
2736static ssize_t
2737offset_show(struct md_rdev *rdev, char *page)
2738{
2739	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2740}
2741
2742static ssize_t
2743offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2744{
2745	unsigned long long offset;
2746	if (kstrtoull(buf, 10, &offset) < 0)
2747		return -EINVAL;
2748	if (rdev->mddev->pers && rdev->raid_disk >= 0)
2749		return -EBUSY;
2750	if (rdev->sectors && rdev->mddev->external)
2751		/* Must set offset before size, so overlap checks
2752		 * can be sane */
2753		return -EBUSY;
2754	rdev->data_offset = offset;
2755	rdev->new_data_offset = offset;
2756	return len;
2757}
2758
2759static struct rdev_sysfs_entry rdev_offset =
2760__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2761
2762static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2763{
2764	return sprintf(page, "%llu\n",
2765		       (unsigned long long)rdev->new_data_offset);
2766}
2767
2768static ssize_t new_offset_store(struct md_rdev *rdev,
2769				const char *buf, size_t len)
2770{
2771	unsigned long long new_offset;
2772	struct mddev *mddev = rdev->mddev;
2773
2774	if (kstrtoull(buf, 10, &new_offset) < 0)
2775		return -EINVAL;
2776
2777	if (mddev->sync_thread ||
2778	    test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
2779		return -EBUSY;
2780	if (new_offset == rdev->data_offset)
2781		/* reset is always permitted */
2782		;
2783	else if (new_offset > rdev->data_offset) {
2784		/* must not push array size beyond rdev_sectors */
2785		if (new_offset - rdev->data_offset
2786		    + mddev->dev_sectors > rdev->sectors)
2787				return -E2BIG;
2788	}
2789	/* Metadata worries about other space details. */
2790
2791	/* decreasing the offset is inconsistent with a backwards
2792	 * reshape.
2793	 */
2794	if (new_offset < rdev->data_offset &&
2795	    mddev->reshape_backwards)
2796		return -EINVAL;
2797	/* Increasing offset is inconsistent with forwards
2798	 * reshape.  reshape_direction should be set to
2799	 * 'backwards' first.
2800	 */
2801	if (new_offset > rdev->data_offset &&
2802	    !mddev->reshape_backwards)
2803		return -EINVAL;
2804
2805	if (mddev->pers && mddev->persistent &&
2806	    !super_types[mddev->major_version]
2807	    .allow_new_offset(rdev, new_offset))
2808		return -E2BIG;
2809	rdev->new_data_offset = new_offset;
2810	if (new_offset > rdev->data_offset)
2811		mddev->reshape_backwards = 1;
2812	else if (new_offset < rdev->data_offset)
2813		mddev->reshape_backwards = 0;
2814
2815	return len;
2816}
2817static struct rdev_sysfs_entry rdev_new_offset =
2818__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2819
2820static ssize_t
2821rdev_size_show(struct md_rdev *rdev, char *page)
2822{
2823	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2824}
2825
2826static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2827{
2828	/* check if two start/length pairs overlap */
2829	if (s1+l1 <= s2)
2830		return 0;
2831	if (s2+l2 <= s1)
2832		return 0;
2833	return 1;
2834}
2835
2836static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2837{
2838	unsigned long long blocks;
2839	sector_t new;
2840
2841	if (kstrtoull(buf, 10, &blocks) < 0)
2842		return -EINVAL;
2843
2844	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2845		return -EINVAL; /* sector conversion overflow */
2846
2847	new = blocks * 2;
2848	if (new != blocks * 2)
2849		return -EINVAL; /* unsigned long long to sector_t overflow */
2850
2851	*sectors = new;
2852	return 0;
2853}
2854
2855static ssize_t
2856rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2857{
2858	struct mddev *my_mddev = rdev->mddev;
2859	sector_t oldsectors = rdev->sectors;
2860	sector_t sectors;
2861
2862	if (strict_blocks_to_sectors(buf, &sectors) < 0)
2863		return -EINVAL;
2864	if (rdev->data_offset != rdev->new_data_offset)
2865		return -EINVAL; /* too confusing */
2866	if (my_mddev->pers && rdev->raid_disk >= 0) {
2867		if (my_mddev->persistent) {
2868			sectors = super_types[my_mddev->major_version].
2869				rdev_size_change(rdev, sectors);
2870			if (!sectors)
2871				return -EBUSY;
2872		} else if (!sectors)
2873			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2874				rdev->data_offset;
2875		if (!my_mddev->pers->resize)
2876			/* Cannot change size for RAID0 or Linear etc */
2877			return -EINVAL;
2878	}
2879	if (sectors < my_mddev->dev_sectors)
2880		return -EINVAL; /* component must fit device */
2881
2882	rdev->sectors = sectors;
2883	if (sectors > oldsectors && my_mddev->external) {
2884		/* Need to check that all other rdevs with the same
2885		 * ->bdev do not overlap.  'rcu' is sufficient to walk
2886		 * the rdev lists safely.
2887		 * This check does not provide a hard guarantee, it
2888		 * just helps avoid dangerous mistakes.
2889		 */
2890		struct mddev *mddev;
2891		int overlap = 0;
2892		struct list_head *tmp;
2893
2894		rcu_read_lock();
2895		for_each_mddev(mddev, tmp) {
2896			struct md_rdev *rdev2;
2897
2898			rdev_for_each(rdev2, mddev)
2899				if (rdev->bdev == rdev2->bdev &&
2900				    rdev != rdev2 &&
2901				    overlaps(rdev->data_offset, rdev->sectors,
2902					     rdev2->data_offset,
2903					     rdev2->sectors)) {
2904					overlap = 1;
2905					break;
2906				}
2907			if (overlap) {
2908				mddev_put(mddev);
2909				break;
2910			}
2911		}
2912		rcu_read_unlock();
2913		if (overlap) {
2914			/* Someone else could have slipped in a size
2915			 * change here, but doing so is just silly.
2916			 * We put oldsectors back because we *know* it is
2917			 * safe, and trust userspace not to race with
2918			 * itself
2919			 */
2920			rdev->sectors = oldsectors;
2921			return -EBUSY;
2922		}
2923	}
2924	return len;
2925}
2926
2927static struct rdev_sysfs_entry rdev_size =
2928__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2929
2930static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
2931{
2932	unsigned long long recovery_start = rdev->recovery_offset;
2933
2934	if (test_bit(In_sync, &rdev->flags) ||
2935	    recovery_start == MaxSector)
2936		return sprintf(page, "none\n");
2937
2938	return sprintf(page, "%llu\n", recovery_start);
2939}
2940
2941static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
2942{
2943	unsigned long long recovery_start;
2944
2945	if (cmd_match(buf, "none"))
2946		recovery_start = MaxSector;
2947	else if (kstrtoull(buf, 10, &recovery_start))
2948		return -EINVAL;
2949
2950	if (rdev->mddev->pers &&
2951	    rdev->raid_disk >= 0)
2952		return -EBUSY;
2953
2954	rdev->recovery_offset = recovery_start;
2955	if (recovery_start == MaxSector)
2956		set_bit(In_sync, &rdev->flags);
2957	else
2958		clear_bit(In_sync, &rdev->flags);
2959	return len;
2960}
2961
2962static struct rdev_sysfs_entry rdev_recovery_start =
2963__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2964
2965static ssize_t
2966badblocks_show(struct badblocks *bb, char *page, int unack);
2967static ssize_t
2968badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
2969
2970static ssize_t bb_show(struct md_rdev *rdev, char *page)
2971{
2972	return badblocks_show(&rdev->badblocks, page, 0);
2973}
2974static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
2975{
2976	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2977	/* Maybe that ack was all we needed */
2978	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2979		wake_up(&rdev->blocked_wait);
2980	return rv;
2981}
2982static struct rdev_sysfs_entry rdev_bad_blocks =
2983__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
2984
2985static ssize_t ubb_show(struct md_rdev *rdev, char *page)
2986{
2987	return badblocks_show(&rdev->badblocks, page, 1);
2988}
2989static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
2990{
2991	return badblocks_store(&rdev->badblocks, page, len, 1);
2992}
2993static struct rdev_sysfs_entry rdev_unack_bad_blocks =
2994__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
2995
2996static struct attribute *rdev_default_attrs[] = {
2997	&rdev_state.attr,
2998	&rdev_errors.attr,
2999	&rdev_slot.attr,
3000	&rdev_offset.attr,
3001	&rdev_new_offset.attr,
3002	&rdev_size.attr,
3003	&rdev_recovery_start.attr,
3004	&rdev_bad_blocks.attr,
3005	&rdev_unack_bad_blocks.attr,
3006	NULL,
3007};
3008static ssize_t
3009rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3010{
3011	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3012	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3013
3014	if (!entry->show)
3015		return -EIO;
3016	if (!rdev->mddev)
3017		return -EBUSY;
3018	return entry->show(rdev, page);
3019}
3020
3021static ssize_t
3022rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3023	      const char *page, size_t length)
3024{
3025	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3026	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3027	ssize_t rv;
3028	struct mddev *mddev = rdev->mddev;
3029
3030	if (!entry->store)
3031		return -EIO;
3032	if (!capable(CAP_SYS_ADMIN))
3033		return -EACCES;
3034	rv = mddev ? mddev_lock(mddev): -EBUSY;
3035	if (!rv) {
3036		if (rdev->mddev == NULL)
3037			rv = -EBUSY;
3038		else
3039			rv = entry->store(rdev, page, length);
3040		mddev_unlock(mddev);
3041	}
3042	return rv;
3043}
3044
3045static void rdev_free(struct kobject *ko)
3046{
3047	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3048	kfree(rdev);
3049}
3050static const struct sysfs_ops rdev_sysfs_ops = {
3051	.show		= rdev_attr_show,
3052	.store		= rdev_attr_store,
3053};
3054static struct kobj_type rdev_ktype = {
3055	.release	= rdev_free,
3056	.sysfs_ops	= &rdev_sysfs_ops,
3057	.default_attrs	= rdev_default_attrs,
3058};
3059
3060int md_rdev_init(struct md_rdev *rdev)
3061{
3062	rdev->desc_nr = -1;
3063	rdev->saved_raid_disk = -1;
3064	rdev->raid_disk = -1;
3065	rdev->flags = 0;
3066	rdev->data_offset = 0;
3067	rdev->new_data_offset = 0;
3068	rdev->sb_events = 0;
3069	rdev->last_read_error.tv_sec  = 0;
3070	rdev->last_read_error.tv_nsec = 0;
3071	rdev->sb_loaded = 0;
3072	rdev->bb_page = NULL;
3073	atomic_set(&rdev->nr_pending, 0);
3074	atomic_set(&rdev->read_errors, 0);
3075	atomic_set(&rdev->corrected_errors, 0);
3076
3077	INIT_LIST_HEAD(&rdev->same_set);
3078	init_waitqueue_head(&rdev->blocked_wait);
3079
3080	/* Add space to store bad block list.
3081	 * This reserves the space even on arrays where it cannot
3082	 * be used - I wonder if that matters
3083	 */
3084	rdev->badblocks.count = 0;
3085	rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
3086	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3087	seqlock_init(&rdev->badblocks.lock);
3088	if (rdev->badblocks.page == NULL)
3089		return -ENOMEM;
3090
3091	return 0;
3092}
3093EXPORT_SYMBOL_GPL(md_rdev_init);
3094/*
3095 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3096 *
3097 * mark the device faulty if:
3098 *
3099 *   - the device is nonexistent (zero size)
3100 *   - the device has no valid superblock
3101 *
3102 * a faulty rdev _never_ has rdev->sb set.
3103 */
3104static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3105{
3106	char b[BDEVNAME_SIZE];
3107	int err;
3108	struct md_rdev *rdev;
3109	sector_t size;
3110
3111	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3112	if (!rdev) {
3113		printk(KERN_ERR "md: could not alloc mem for new device!\n");
3114		return ERR_PTR(-ENOMEM);
3115	}
3116
3117	err = md_rdev_init(rdev);
3118	if (err)
3119		goto abort_free;
3120	err = alloc_disk_sb(rdev);
3121	if (err)
3122		goto abort_free;
3123
3124	err = lock_rdev(rdev, newdev, super_format == -2);
3125	if (err)
3126		goto abort_free;
3127
3128	kobject_init(&rdev->kobj, &rdev_ktype);
3129
3130	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3131	if (!size) {
3132		printk(KERN_WARNING
3133			"md: %s has zero or unknown size, marking faulty!\n",
3134			bdevname(rdev->bdev,b));
3135		err = -EINVAL;
3136		goto abort_free;
3137	}
3138
3139	if (super_format >= 0) {
3140		err = super_types[super_format].
3141			load_super(rdev, NULL, super_minor);
3142		if (err == -EINVAL) {
3143			printk(KERN_WARNING
3144				"md: %s does not have a valid v%d.%d "
3145			       "superblock, not importing!\n",
3146				bdevname(rdev->bdev,b),
3147			       super_format, super_minor);
3148			goto abort_free;
3149		}
3150		if (err < 0) {
3151			printk(KERN_WARNING
3152				"md: could not read %s's sb, not importing!\n",
3153				bdevname(rdev->bdev,b));
3154			goto abort_free;
3155		}
3156	}
3157
3158	return rdev;
3159
3160abort_free:
3161	if (rdev->bdev)
3162		unlock_rdev(rdev);
3163	md_rdev_clear(rdev);
3164	kfree(rdev);
3165	return ERR_PTR(err);
3166}
3167
3168/*
3169 * Check a full RAID array for plausibility
3170 */
3171
3172static void analyze_sbs(struct mddev *mddev)
3173{
3174	int i;
3175	struct md_rdev *rdev, *freshest, *tmp;
3176	char b[BDEVNAME_SIZE];
3177
3178	freshest = NULL;
3179	rdev_for_each_safe(rdev, tmp, mddev)
3180		switch (super_types[mddev->major_version].
3181			load_super(rdev, freshest, mddev->minor_version)) {
3182		case 1:
3183			freshest = rdev;
3184			break;
3185		case 0:
3186			break;
3187		default:
3188			printk( KERN_ERR \
3189				"md: fatal superblock inconsistency in %s"
3190				" -- removing from array\n",
3191				bdevname(rdev->bdev,b));
3192			md_kick_rdev_from_array(rdev);
3193		}
3194
3195	super_types[mddev->major_version].
3196		validate_super(mddev, freshest);
3197
3198	i = 0;
3199	rdev_for_each_safe(rdev, tmp, mddev) {
3200		if (mddev->max_disks &&
3201		    (rdev->desc_nr >= mddev->max_disks ||
3202		     i > mddev->max_disks)) {
3203			printk(KERN_WARNING
3204			       "md: %s: %s: only %d devices permitted\n",
3205			       mdname(mddev), bdevname(rdev->bdev, b),
3206			       mddev->max_disks);
3207			md_kick_rdev_from_array(rdev);
3208			continue;
3209		}
3210		if (rdev != freshest) {
3211			if (super_types[mddev->major_version].
3212			    validate_super(mddev, rdev)) {
3213				printk(KERN_WARNING "md: kicking non-fresh %s"
3214					" from array!\n",
3215					bdevname(rdev->bdev,b));
3216				md_kick_rdev_from_array(rdev);
3217				continue;
3218			}
3219			/* No device should have a Candidate flag
3220			 * when reading devices
3221			 */
3222			if (test_bit(Candidate, &rdev->flags)) {
3223				pr_info("md: kicking Cluster Candidate %s from array!\n",
3224					bdevname(rdev->bdev, b));
3225				md_kick_rdev_from_array(rdev);
3226			}
3227		}
3228		if (mddev->level == LEVEL_MULTIPATH) {
3229			rdev->desc_nr = i++;
3230			rdev->raid_disk = rdev->desc_nr;
3231			set_bit(In_sync, &rdev->flags);
3232		} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3233			rdev->raid_disk = -1;
3234			clear_bit(In_sync, &rdev->flags);
3235		}
3236	}
3237}
3238
3239/* Read a fixed-point number.
3240 * Numbers in sysfs attributes should be in "standard" units where
3241 * possible, so time should be in seconds.
3242 * However we internally use a a much smaller unit such as
3243 * milliseconds or jiffies.
3244 * This function takes a decimal number with a possible fractional
3245 * component, and produces an integer which is the result of
3246 * multiplying that number by 10^'scale'.
3247 * all without any floating-point arithmetic.
3248 */
3249int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3250{
3251	unsigned long result = 0;
3252	long decimals = -1;
3253	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3254		if (*cp == '.')
3255			decimals = 0;
3256		else if (decimals < scale) {
3257			unsigned int value;
3258			value = *cp - '0';
3259			result = result * 10 + value;
3260			if (decimals >= 0)
3261				decimals++;
3262		}
3263		cp++;
3264	}
3265	if (*cp == '\n')
3266		cp++;
3267	if (*cp)
3268		return -EINVAL;
3269	if (decimals < 0)
3270		decimals = 0;
3271	while (decimals < scale) {
3272		result *= 10;
3273		decimals ++;
3274	}
3275	*res = result;
3276	return 0;
3277}
3278
3279static void md_safemode_timeout(unsigned long data);
3280
3281static ssize_t
3282safe_delay_show(struct mddev *mddev, char *page)
3283{
3284	int msec = (mddev->safemode_delay*1000)/HZ;
3285	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3286}
3287static ssize_t
3288safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3289{
3290	unsigned long msec;
3291
3292	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3293		return -EINVAL;
3294	if (msec == 0)
3295		mddev->safemode_delay = 0;
3296	else {
3297		unsigned long old_delay = mddev->safemode_delay;
3298		unsigned long new_delay = (msec*HZ)/1000;
3299
3300		if (new_delay == 0)
3301			new_delay = 1;
3302		mddev->safemode_delay = new_delay;
3303		if (new_delay < old_delay || old_delay == 0)
3304			mod_timer(&mddev->safemode_timer, jiffies+1);
3305	}
3306	return len;
3307}
3308static struct md_sysfs_entry md_safe_delay =
3309__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3310
3311static ssize_t
3312level_show(struct mddev *mddev, char *page)
3313{
3314	struct md_personality *p;
3315	int ret;
3316	spin_lock(&mddev->lock);
3317	p = mddev->pers;
3318	if (p)
3319		ret = sprintf(page, "%s\n", p->name);
3320	else if (mddev->clevel[0])
3321		ret = sprintf(page, "%s\n", mddev->clevel);
3322	else if (mddev->level != LEVEL_NONE)
3323		ret = sprintf(page, "%d\n", mddev->level);
3324	else
3325		ret = 0;
3326	spin_unlock(&mddev->lock);
3327	return ret;
3328}
3329
3330static ssize_t
3331level_store(struct mddev *mddev, const char *buf, size_t len)
3332{
3333	char clevel[16];
3334	ssize_t rv;
3335	size_t slen = len;
3336	struct md_personality *pers, *oldpers;
3337	long level;
3338	void *priv, *oldpriv;
3339	struct md_rdev *rdev;
3340
3341	if (slen == 0 || slen >= sizeof(clevel))
3342		return -EINVAL;
3343
3344	rv = mddev_lock(mddev);
3345	if (rv)
3346		return rv;
3347
3348	if (mddev->pers == NULL) {
3349		strncpy(mddev->clevel, buf, slen);
3350		if (mddev->clevel[slen-1] == '\n')
3351			slen--;
3352		mddev->clevel[slen] = 0;
3353		mddev->level = LEVEL_NONE;
3354		rv = len;
3355		goto out_unlock;
3356	}
3357	rv = -EROFS;
3358	if (mddev->ro)
3359		goto out_unlock;
3360
3361	/* request to change the personality.  Need to ensure:
3362	 *  - array is not engaged in resync/recovery/reshape
3363	 *  - old personality can be suspended
3364	 *  - new personality will access other array.
3365	 */
3366
3367	rv = -EBUSY;
3368	if (mddev->sync_thread ||
3369	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3370	    mddev->reshape_position != MaxSector ||
3371	    mddev->sysfs_active)
3372		goto out_unlock;
3373
3374	rv = -EINVAL;
3375	if (!mddev->pers->quiesce) {
3376		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
3377		       mdname(mddev), mddev->pers->name);
3378		goto out_unlock;
3379	}
3380
3381	/* Now find the new personality */
3382	strncpy(clevel, buf, slen);
3383	if (clevel[slen-1] == '\n')
3384		slen--;
3385	clevel[slen] = 0;
3386	if (kstrtol(clevel, 10, &level))
3387		level = LEVEL_NONE;
3388
3389	if (request_module("md-%s", clevel) != 0)
3390		request_module("md-level-%s", clevel);
3391	spin_lock(&pers_lock);
3392	pers = find_pers(level, clevel);
3393	if (!pers || !try_module_get(pers->owner)) {
3394		spin_unlock(&pers_lock);
3395		printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
3396		rv = -EINVAL;
3397		goto out_unlock;
3398	}
3399	spin_unlock(&pers_lock);
3400
3401	if (pers == mddev->pers) {
3402		/* Nothing to do! */
3403		module_put(pers->owner);
3404		rv = len;
3405		goto out_unlock;
3406	}
3407	if (!pers->takeover) {
3408		module_put(pers->owner);
3409		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
3410		       mdname(mddev), clevel);
3411		rv = -EINVAL;
3412		goto out_unlock;
3413	}
3414
3415	rdev_for_each(rdev, mddev)
3416		rdev->new_raid_disk = rdev->raid_disk;
3417
3418	/* ->takeover must set new_* and/or delta_disks
3419	 * if it succeeds, and may set them when it fails.
3420	 */
3421	priv = pers->takeover(mddev);
3422	if (IS_ERR(priv)) {
3423		mddev->new_level = mddev->level;
3424		mddev->new_layout = mddev->layout;
3425		mddev->new_chunk_sectors = mddev->chunk_sectors;
3426		mddev->raid_disks -= mddev->delta_disks;
3427		mddev->delta_disks = 0;
3428		mddev->reshape_backwards = 0;
3429		module_put(pers->owner);
3430		printk(KERN_WARNING "md: %s: %s would not accept array\n",
3431		       mdname(mddev), clevel);
3432		rv = PTR_ERR(priv);
3433		goto out_unlock;
3434	}
3435
3436	/* Looks like we have a winner */
3437	mddev_suspend(mddev);
3438	mddev_detach(mddev);
3439
3440	spin_lock(&mddev->lock);
3441	oldpers = mddev->pers;
3442	oldpriv = mddev->private;
3443	mddev->pers = pers;
3444	mddev->private = priv;
3445	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3446	mddev->level = mddev->new_level;
3447	mddev->layout = mddev->new_layout;
3448	mddev->chunk_sectors = mddev->new_chunk_sectors;
3449	mddev->delta_disks = 0;
3450	mddev->reshape_backwards = 0;
3451	mddev->degraded = 0;
3452	spin_unlock(&mddev->lock);
3453
3454	if (oldpers->sync_request == NULL &&
3455	    mddev->external) {
3456		/* We are converting from a no-redundancy array
3457		 * to a redundancy array and metadata is managed
3458		 * externally so we need to be sure that writes
3459		 * won't block due to a need to transition
3460		 *      clean->dirty
3461		 * until external management is started.
3462		 */
3463		mddev->in_sync = 0;
3464		mddev->safemode_delay = 0;
3465		mddev->safemode = 0;
3466	}
3467
3468	oldpers->free(mddev, oldpriv);
3469
3470	if (oldpers->sync_request == NULL &&
3471	    pers->sync_request != NULL) {
3472		/* need to add the md_redundancy_group */
3473		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3474			printk(KERN_WARNING
3475			       "md: cannot register extra attributes for %s\n",
3476			       mdname(mddev));
3477		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3478	}
3479	if (oldpers->sync_request != NULL &&
3480	    pers->sync_request == NULL) {
3481		/* need to remove the md_redundancy_group */
3482		if (mddev->to_remove == NULL)
3483			mddev->to_remove = &md_redundancy_group;
3484	}
3485
3486	rdev_for_each(rdev, mddev) {
3487		if (rdev->raid_disk < 0)
3488			continue;
3489		if (rdev->new_raid_disk >= mddev->raid_disks)
3490			rdev->new_raid_disk = -1;
3491		if (rdev->new_raid_disk == rdev->raid_disk)
3492			continue;
3493		sysfs_unlink_rdev(mddev, rdev);
3494	}
3495	rdev_for_each(rdev, mddev) {
3496		if (rdev->raid_disk < 0)
3497			continue;
3498		if (rdev->new_raid_disk == rdev->raid_disk)
3499			continue;
3500		rdev->raid_disk = rdev->new_raid_disk;
3501		if (rdev->raid_disk < 0)
3502			clear_bit(In_sync, &rdev->flags);
3503		else {
3504			if (sysfs_link_rdev(mddev, rdev))
3505				printk(KERN_WARNING "md: cannot register rd%d"
3506				       " for %s after level change\n",
3507				       rdev->raid_disk, mdname(mddev));
3508		}
3509	}
3510
3511	if (pers->sync_request == NULL) {
3512		/* this is now an array without redundancy, so
3513		 * it must always be in_sync
3514		 */
3515		mddev->in_sync = 1;
3516		del_timer_sync(&mddev->safemode_timer);
3517	}
3518	blk_set_stacking_limits(&mddev->queue->limits);
3519	pers->run(mddev);
3520	set_bit(MD_CHANGE_DEVS, &mddev->flags);
3521	mddev_resume(mddev);
3522	if (!mddev->thread)
3523		md_update_sb(mddev, 1);
3524	sysfs_notify(&mddev->kobj, NULL, "level");
3525	md_new_event(mddev);
3526	rv = len;
3527out_unlock:
3528	mddev_unlock(mddev);
3529	return rv;
3530}
3531
3532static struct md_sysfs_entry md_level =
3533__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3534
3535static ssize_t
3536layout_show(struct mddev *mddev, char *page)
3537{
3538	/* just a number, not meaningful for all levels */
3539	if (mddev->reshape_position != MaxSector &&
3540	    mddev->layout != mddev->new_layout)
3541		return sprintf(page, "%d (%d)\n",
3542			       mddev->new_layout, mddev->layout);
3543	return sprintf(page, "%d\n", mddev->layout);
3544}
3545
3546static ssize_t
3547layout_store(struct mddev *mddev, const char *buf, size_t len)
3548{
3549	char *e;
3550	unsigned long n = simple_strtoul(buf, &e, 10);
3551	int err;
3552
3553	if (!*buf || (*e && *e != '\n'))
3554		return -EINVAL;
3555	err = mddev_lock(mddev);
3556	if (err)
3557		return err;
3558
3559	if (mddev->pers) {
3560		if (mddev->pers->check_reshape == NULL)
3561			err = -EBUSY;
3562		else if (mddev->ro)
3563			err = -EROFS;
3564		else {
3565			mddev->new_layout = n;
3566			err = mddev->pers->check_reshape(mddev);
3567			if (err)
3568				mddev->new_layout = mddev->layout;
3569		}
3570	} else {
3571		mddev->new_layout = n;
3572		if (mddev->reshape_position == MaxSector)
3573			mddev->layout = n;
3574	}
3575	mddev_unlock(mddev);
3576	return err ?: len;
3577}
3578static struct md_sysfs_entry md_layout =
3579__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3580
3581static ssize_t
3582raid_disks_show(struct mddev *mddev, char *page)
3583{
3584	if (mddev->raid_disks == 0)
3585		return 0;
3586	if (mddev->reshape_position != MaxSector &&
3587	    mddev->delta_disks != 0)
3588		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3589			       mddev->raid_disks - mddev->delta_disks);
3590	return sprintf(page, "%d\n", mddev->raid_disks);
3591}
3592
3593static int update_raid_disks(struct mddev *mddev, int raid_disks);
3594
3595static ssize_t
3596raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3597{
3598	char *e;
3599	int err;
3600	unsigned long n = simple_strtoul(buf, &e, 10);
3601
3602	if (!*buf || (*e && *e != '\n'))
3603		return -EINVAL;
3604
3605	err = mddev_lock(mddev);
3606	if (err)
3607		return err;
3608	if (mddev->pers)
3609		err = update_raid_disks(mddev, n);
3610	else if (mddev->reshape_position != MaxSector) {
3611		struct md_rdev *rdev;
3612		int olddisks = mddev->raid_disks - mddev->delta_disks;
3613
3614		err = -EINVAL;
3615		rdev_for_each(rdev, mddev) {
3616			if (olddisks < n &&
3617			    rdev->data_offset < rdev->new_data_offset)
3618				goto out_unlock;
3619			if (olddisks > n &&
3620			    rdev->data_offset > rdev->new_data_offset)
3621				goto out_unlock;
3622		}
3623		err = 0;
3624		mddev->delta_disks = n - olddisks;
3625		mddev->raid_disks = n;
3626		mddev->reshape_backwards = (mddev->delta_disks < 0);
3627	} else
3628		mddev->raid_disks = n;
3629out_unlock:
3630	mddev_unlock(mddev);
3631	return err ? err : len;
3632}
3633static struct md_sysfs_entry md_raid_disks =
3634__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3635
3636static ssize_t
3637chunk_size_show(struct mddev *mddev, char *page)
3638{
3639	if (mddev->reshape_position != MaxSector &&
3640	    mddev->chunk_sectors != mddev->new_chunk_sectors)
3641		return sprintf(page, "%d (%d)\n",
3642			       mddev->new_chunk_sectors << 9,
3643			       mddev->chunk_sectors << 9);
3644	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3645}
3646
3647static ssize_t
3648chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3649{
3650	int err;
3651	char *e;
3652	unsigned long n = simple_strtoul(buf, &e, 10);
3653
3654	if (!*buf || (*e && *e != '\n'))
3655		return -EINVAL;
3656
3657	err = mddev_lock(mddev);
3658	if (err)
3659		return err;
3660	if (mddev->pers) {
3661		if (mddev->pers->check_reshape == NULL)
3662			err = -EBUSY;
3663		else if (mddev->ro)
3664			err = -EROFS;
3665		else {
3666			mddev->new_chunk_sectors = n >> 9;
3667			err = mddev->pers->check_reshape(mddev);
3668			if (err)
3669				mddev->new_chunk_sectors = mddev->chunk_sectors;
3670		}
3671	} else {
3672		mddev->new_chunk_sectors = n >> 9;
3673		if (mddev->reshape_position == MaxSector)
3674			mddev->chunk_sectors = n >> 9;
3675	}
3676	mddev_unlock(mddev);
3677	return err ?: len;
3678}
3679static struct md_sysfs_entry md_chunk_size =
3680__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3681
3682static ssize_t
3683resync_start_show(struct mddev *mddev, char *page)
3684{
3685	if (mddev->recovery_cp == MaxSector)
3686		return sprintf(page, "none\n");
3687	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3688}
3689
3690static ssize_t
3691resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3692{
3693	int err;
3694	char *e;
3695	unsigned long long n = simple_strtoull(buf, &e, 10);
3696
3697	err = mddev_lock(mddev);
3698	if (err)
3699		return err;
3700	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3701		err = -EBUSY;
3702	else if (cmd_match(buf, "none"))
3703		n = MaxSector;
3704	else if (!*buf || (*e && *e != '\n'))
3705		err = -EINVAL;
3706
3707	if (!err) {
3708		mddev->recovery_cp = n;
3709		if (mddev->pers)
3710			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3711	}
3712	mddev_unlock(mddev);
3713	return err ?: len;
3714}
3715static struct md_sysfs_entry md_resync_start =
3716__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
3717		resync_start_show, resync_start_store);
3718
3719/*
3720 * The array state can be:
3721 *
3722 * clear
3723 *     No devices, no size, no level
3724 *     Equivalent to STOP_ARRAY ioctl
3725 * inactive
3726 *     May have some settings, but array is not active
3727 *        all IO results in error
3728 *     When written, doesn't tear down array, but just stops it
3729 * suspended (not supported yet)
3730 *     All IO requests will block. The array can be reconfigured.
3731 *     Writing this, if accepted, will block until array is quiescent
3732 * readonly
3733 *     no resync can happen.  no superblocks get written.
3734 *     write requests fail
3735 * read-auto
3736 *     like readonly, but behaves like 'clean' on a write request.
3737 *
3738 * clean - no pending writes, but otherwise active.
3739 *     When written to inactive array, starts without resync
3740 *     If a write request arrives then
3741 *       if metadata is known, mark 'dirty' and switch to 'active'.
3742 *       if not known, block and switch to write-pending
3743 *     If written to an active array that has pending writes, then fails.
3744 * active
3745 *     fully active: IO and resync can be happening.
3746 *     When written to inactive array, starts with resync
3747 *
3748 * write-pending
3749 *     clean, but writes are blocked waiting for 'active' to be written.
3750 *
3751 * active-idle
3752 *     like active, but no writes have been seen for a while (100msec).
3753 *
3754 */
3755enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3756		   write_pending, active_idle, bad_word};
3757static char *array_states[] = {
3758	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3759	"write-pending", "active-idle", NULL };
3760
3761static int match_word(const char *word, char **list)
3762{
3763	int n;
3764	for (n=0; list[n]; n++)
3765		if (cmd_match(word, list[n]))
3766			break;
3767	return n;
3768}
3769
3770static ssize_t
3771array_state_show(struct mddev *mddev, char *page)
3772{
3773	enum array_state st = inactive;
3774
3775	if (mddev->pers)
3776		switch(mddev->ro) {
3777		case 1:
3778			st = readonly;
3779			break;
3780		case 2:
3781			st = read_auto;
3782			break;
3783		case 0:
3784			if (mddev->in_sync)
3785				st = clean;
3786			else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
3787				st = write_pending;
3788			else if (mddev->safemode)
3789				st = active_idle;
3790			else
3791				st = active;
3792		}
3793	else {
3794		if (list_empty(&mddev->disks) &&
3795		    mddev->raid_disks == 0 &&
3796		    mddev->dev_sectors == 0)
3797			st = clear;
3798		else
3799			st = inactive;
3800	}
3801	return sprintf(page, "%s\n", array_states[st]);
3802}
3803
3804static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
3805static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
3806static int do_md_run(struct mddev *mddev);
3807static int restart_array(struct mddev *mddev);
3808
3809static ssize_t
3810array_state_store(struct mddev *mddev, const char *buf, size_t len)
3811{
3812	int err;
3813	enum array_state st = match_word(buf, array_states);
3814
3815	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
3816		/* don't take reconfig_mutex when toggling between
3817		 * clean and active
3818		 */
3819		spin_lock(&mddev->lock);
3820		if (st == active) {
3821			restart_array(mddev);
3822			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
3823			wake_up(&mddev->sb_wait);
3824			err = 0;
3825		} else /* st == clean */ {
3826			restart_array(mddev);
3827			if (atomic_read(&mddev->writes_pending) == 0) {
3828				if (mddev->in_sync == 0) {
3829					mddev->in_sync = 1;
3830					if (mddev->safemode == 1)
3831						mddev->safemode = 0;
3832					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3833				}
3834				err = 0;
3835			} else
3836				err = -EBUSY;
3837		}
3838		spin_unlock(&mddev->lock);
3839		return err ?: len;
3840	}
3841	err = mddev_lock(mddev);
3842	if (err)
3843		return err;
3844	err = -EINVAL;
3845	switch(st) {
3846	case bad_word:
3847		break;
3848	case clear:
3849		/* stopping an active array */
3850		err = do_md_stop(mddev, 0, NULL);
3851		break;
3852	case inactive:
3853		/* stopping an active array */
3854		if (mddev->pers)
3855			err = do_md_stop(mddev, 2, NULL);
3856		else
3857			err = 0; /* already inactive */
3858		break;
3859	case suspended:
3860		break; /* not supported yet */
3861	case readonly:
3862		if (mddev->pers)
3863			err = md_set_readonly(mddev, NULL);
3864		else {
3865			mddev->ro = 1;
3866			set_disk_ro(mddev->gendisk, 1);
3867			err = do_md_run(mddev);
3868		}
3869		break;
3870	case read_auto:
3871		if (mddev->pers) {
3872			if (mddev->ro == 0)
3873				err = md_set_readonly(mddev, NULL);
3874			else if (mddev->ro == 1)
3875				err = restart_array(mddev);
3876			if (err == 0) {
3877				mddev->ro = 2;
3878				set_disk_ro(mddev->gendisk, 0);
3879			}
3880		} else {
3881			mddev->ro = 2;
3882			err = do_md_run(mddev);
3883		}
3884		break;
3885	case clean:
3886		if (mddev->pers) {
3887			restart_array(mddev);
3888			spin_lock(&mddev->lock);
3889			if (atomic_read(&mddev->writes_pending) == 0) {
3890				if (mddev->in_sync == 0) {
3891					mddev->in_sync = 1;
3892					if (mddev->safemode == 1)
3893						mddev->safemode = 0;
3894					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3895				}
3896				err = 0;
3897			} else
3898				err = -EBUSY;
3899			spin_unlock(&mddev->lock);
3900		} else
3901			err = -EINVAL;
3902		break;
3903	case active:
3904		if (mddev->pers) {
3905			restart_array(mddev);
3906			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
3907			wake_up(&mddev->sb_wait);
3908			err = 0;
3909		} else {
3910			mddev->ro = 0;
3911			set_disk_ro(mddev->gendisk, 0);
3912			err = do_md_run(mddev);
3913		}
3914		break;
3915	case write_pending:
3916	case active_idle:
3917		/* these cannot be set */
3918		break;
3919	}
3920
3921	if (!err) {
3922		if (mddev->hold_active == UNTIL_IOCTL)
3923			mddev->hold_active = 0;
3924		sysfs_notify_dirent_safe(mddev->sysfs_state);
3925	}
3926	mddev_unlock(mddev);
3927	return err ?: len;
3928}
3929static struct md_sysfs_entry md_array_state =
3930__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3931
3932static ssize_t
3933max_corrected_read_errors_show(struct mddev *mddev, char *page) {
3934	return sprintf(page, "%d\n",
3935		       atomic_read(&mddev->max_corr_read_errors));
3936}
3937
3938static ssize_t
3939max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
3940{
3941	char *e;
3942	unsigned long n = simple_strtoul(buf, &e, 10);
3943
3944	if (*buf && (*e == 0 || *e == '\n')) {
3945		atomic_set(&mddev->max_corr_read_errors, n);
3946		return len;
3947	}
3948	return -EINVAL;
3949}
3950
3951static struct md_sysfs_entry max_corr_read_errors =
3952__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3953	max_corrected_read_errors_store);
3954
3955static ssize_t
3956null_show(struct mddev *mddev, char *page)
3957{
3958	return -EINVAL;
3959}
3960
3961static ssize_t
3962new_dev_store(struct mddev *mddev, const char *buf, size_t len)
3963{
3964	/* buf must be %d:%d\n? giving major and minor numbers */
3965	/* The new device is added to the array.
3966	 * If the array has a persistent superblock, we read the
3967	 * superblock to initialise info and check validity.
3968	 * Otherwise, only checking done is that in bind_rdev_to_array,
3969	 * which mainly checks size.
3970	 */
3971	char *e;
3972	int major = simple_strtoul(buf, &e, 10);
3973	int minor;
3974	dev_t dev;
3975	struct md_rdev *rdev;
3976	int err;
3977
3978	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3979		return -EINVAL;
3980	minor = simple_strtoul(e+1, &e, 10);
3981	if (*e && *e != '\n')
3982		return -EINVAL;
3983	dev = MKDEV(major, minor);
3984	if (major != MAJOR(dev) ||
3985	    minor != MINOR(dev))
3986		return -EOVERFLOW;
3987
3988	flush_workqueue(md_misc_wq);
3989
3990	err = mddev_lock(mddev);
3991	if (err)
3992		return err;
3993	if (mddev->persistent) {
3994		rdev = md_import_device(dev, mddev->major_version,
3995					mddev->minor_version);
3996		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3997			struct md_rdev *rdev0
3998				= list_entry(mddev->disks.next,
3999					     struct md_rdev, same_set);
4000			err = super_types[mddev->major_version]
4001				.load_super(rdev, rdev0, mddev->minor_version);
4002			if (err < 0)
4003				goto out;
4004		}
4005	} else if (mddev->external)
4006		rdev = md_import_device(dev, -2, -1);
4007	else
4008		rdev = md_import_device(dev, -1, -1);
4009
4010	if (IS_ERR(rdev)) {
4011		mddev_unlock(mddev);
4012		return PTR_ERR(rdev);
4013	}
4014	err = bind_rdev_to_array(rdev, mddev);
4015 out:
4016	if (err)
4017		export_rdev(rdev);
4018	mddev_unlock(mddev);
4019	return err ? err : len;
4020}
4021
4022static struct md_sysfs_entry md_new_device =
4023__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4024
4025static ssize_t
4026bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4027{
4028	char *end;
4029	unsigned long chunk, end_chunk;
4030	int err;
4031
4032	err = mddev_lock(mddev);
4033	if (err)
4034		return err;
4035	if (!mddev->bitmap)
4036		goto out;
4037	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4038	while (*buf) {
4039		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4040		if (buf == end) break;
4041		if (*end == '-') { /* range */
4042			buf = end + 1;
4043			end_chunk = simple_strtoul(buf, &end, 0);
4044			if (buf == end) break;
4045		}
4046		if (*end && !isspace(*end)) break;
4047		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4048		buf = skip_spaces(end);
4049	}
4050	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4051out:
4052	mddev_unlock(mddev);
4053	return len;
4054}
4055
4056static struct md_sysfs_entry md_bitmap =
4057__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4058
4059static ssize_t
4060size_show(struct mddev *mddev, char *page)
4061{
4062	return sprintf(page, "%llu\n",
4063		(unsigned long long)mddev->dev_sectors / 2);
4064}
4065
4066static int update_size(struct mddev *mddev, sector_t num_sectors);
4067
4068static ssize_t
4069size_store(struct mddev *mddev, const char *buf, size_t len)
4070{
4071	/* If array is inactive, we can reduce the component size, but
4072	 * not increase it (except from 0).
4073	 * If array is active, we can try an on-line resize
4074	 */
4075	sector_t sectors;
4076	int err = strict_blocks_to_sectors(buf, &sectors);
4077
4078	if (err < 0)
4079		return err;
4080	err = mddev_lock(mddev);
4081	if (err)
4082		return err;
4083	if (mddev->pers) {
4084		if (mddev_is_clustered(mddev))
4085			md_cluster_ops->metadata_update_start(mddev);
4086		err = update_size(mddev, sectors);
4087		md_update_sb(mddev, 1);
4088		if (mddev_is_clustered(mddev))
4089			md_cluster_ops->metadata_update_finish(mddev);
4090	} else {
4091		if (mddev->dev_sectors == 0 ||
4092		    mddev->dev_sectors > sectors)
4093			mddev->dev_sectors = sectors;
4094		else
4095			err = -ENOSPC;
4096	}
4097	mddev_unlock(mddev);
4098	return err ? err : len;
4099}
4100
4101static struct md_sysfs_entry md_size =
4102__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4103
4104/* Metadata version.
4105 * This is one of
4106 *   'none' for arrays with no metadata (good luck...)
4107 *   'external' for arrays with externally managed metadata,
4108 * or N.M for internally known formats
4109 */
4110static ssize_t
4111metadata_show(struct mddev *mddev, char *page)
4112{
4113	if (mddev->persistent)
4114		return sprintf(page, "%d.%d\n",
4115			       mddev->major_version, mddev->minor_version);
4116	else if (mddev->external)
4117		return sprintf(page, "external:%s\n", mddev->metadata_type);
4118	else
4119		return sprintf(page, "none\n");
4120}
4121
4122static ssize_t
4123metadata_store(struct mddev *mddev, const char *buf, size_t len)
4124{
4125	int major, minor;
4126	char *e;
4127	int err;
4128	/* Changing the details of 'external' metadata is
4129	 * always permitted.  Otherwise there must be
4130	 * no devices attached to the array.
4131	 */
4132
4133	err = mddev_lock(mddev);
4134	if (err)
4135		return err;
4136	err = -EBUSY;
4137	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4138		;
4139	else if (!list_empty(&mddev->disks))
4140		goto out_unlock;
4141
4142	err = 0;
4143	if (cmd_match(buf, "none")) {
4144		mddev->persistent = 0;
4145		mddev->external = 0;
4146		mddev->major_version = 0;
4147		mddev->minor_version = 90;
4148		goto out_unlock;
4149	}
4150	if (strncmp(buf, "external:", 9) == 0) {
4151		size_t namelen = len-9;
4152		if (namelen >= sizeof(mddev->metadata_type))
4153			namelen = sizeof(mddev->metadata_type)-1;
4154		strncpy(mddev->metadata_type, buf+9, namelen);
4155		mddev->metadata_type[namelen] = 0;
4156		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4157			mddev->metadata_type[--namelen] = 0;
4158		mddev->persistent = 0;
4159		mddev->external = 1;
4160		mddev->major_version = 0;
4161		mddev->minor_version = 90;
4162		goto out_unlock;
4163	}
4164	major = simple_strtoul(buf, &e, 10);
4165	err = -EINVAL;
4166	if (e==buf || *e != '.')
4167		goto out_unlock;
4168	buf = e+1;
4169	minor = simple_strtoul(buf, &e, 10);
4170	if (e==buf || (*e && *e != '\n') )
4171		goto out_unlock;
4172	err = -ENOENT;
4173	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4174		goto out_unlock;
4175	mddev->major_version = major;
4176	mddev->minor_version = minor;
4177	mddev->persistent = 1;
4178	mddev->external = 0;
4179	err = 0;
4180out_unlock:
4181	mddev_unlock(mddev);
4182	return err ?: len;
4183}
4184
4185static struct md_sysfs_entry md_metadata =
4186__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4187
4188static ssize_t
4189action_show(struct mddev *mddev, char *page)
4190{
4191	char *type = "idle";
4192	unsigned long recovery = mddev->recovery;
4193	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4194		type = "frozen";
4195	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4196	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4197		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4198			type = "reshape";
4199		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4200			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4201				type = "resync";
4202			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4203				type = "check";
4204			else
4205				type = "repair";
4206		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4207			type = "recover";
4208	}
4209	return sprintf(page, "%s\n", type);
4210}
4211
4212static ssize_t
4213action_store(struct mddev *mddev, const char *page, size_t len)
4214{
4215	if (!mddev->pers || !mddev->pers->sync_request)
4216		return -EINVAL;
4217
4218
4219	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4220		if (cmd_match(page, "frozen"))
4221			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4222		else
4223			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4224		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4225		    mddev_lock(mddev) == 0) {
4226			flush_workqueue(md_misc_wq);
4227			if (mddev->sync_thread) {
4228				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4229				md_reap_sync_thread(mddev);
4230			}
4231			mddev_unlock(mddev);
4232		}
4233	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4234		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
4235		return -EBUSY;
4236	else if (cmd_match(page, "resync"))
4237		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4238	else if (cmd_match(page, "recover")) {
4239		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4240		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4241	} else if (cmd_match(page, "reshape")) {
4242		int err;
4243		if (mddev->pers->start_reshape == NULL)
4244			return -EINVAL;
4245		err = mddev_lock(mddev);
4246		if (!err) {
4247			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4248			err = mddev->pers->start_reshape(mddev);
4249			mddev_unlock(mddev);
4250		}
4251		if (err)
4252			return err;
4253		sysfs_notify(&mddev->kobj, NULL, "degraded");
4254	} else {
4255		if (cmd_match(page, "check"))
4256			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4257		else if (!cmd_match(page, "repair"))
4258			return -EINVAL;
4259		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4260		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4261		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4262	}
4263	if (mddev->ro == 2) {
4264		/* A write to sync_action is enough to justify
4265		 * canceling read-auto mode
4266		 */
4267		mddev->ro = 0;
4268		md_wakeup_thread(mddev->sync_thread);
4269	}
4270	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4271	md_wakeup_thread(mddev->thread);
4272	sysfs_notify_dirent_safe(mddev->sysfs_action);
4273	return len;
4274}
4275
4276static struct md_sysfs_entry md_scan_mode =
4277__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4278
4279static ssize_t
4280last_sync_action_show(struct mddev *mddev, char *page)
4281{
4282	return sprintf(page, "%s\n", mddev->last_sync_action);
4283}
4284
4285static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4286
4287static ssize_t
4288mismatch_cnt_show(struct mddev *mddev, char *page)
4289{
4290	return sprintf(page, "%llu\n",
4291		       (unsigned long long)
4292		       atomic64_read(&mddev->resync_mismatches));
4293}
4294
4295static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4296
4297static ssize_t
4298sync_min_show(struct mddev *mddev, char *page)
4299{
4300	return sprintf(page, "%d (%s)\n", speed_min(mddev),
4301		       mddev->sync_speed_min ? "local": "system");
4302}
4303
4304static ssize_t
4305sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4306{
4307	int min;
4308	char *e;
4309	if (strncmp(buf, "system", 6)==0) {
4310		mddev->sync_speed_min = 0;
4311		return len;
4312	}
4313	min = simple_strtoul(buf, &e, 10);
4314	if (buf == e || (*e && *e != '\n') || min <= 0)
4315		return -EINVAL;
4316	mddev->sync_speed_min = min;
4317	return len;
4318}
4319
4320static struct md_sysfs_entry md_sync_min =
4321__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4322
4323static ssize_t
4324sync_max_show(struct mddev *mddev, char *page)
4325{
4326	return sprintf(page, "%d (%s)\n", speed_max(mddev),
4327		       mddev->sync_speed_max ? "local": "system");
4328}
4329
4330static ssize_t
4331sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4332{
4333	int max;
4334	char *e;
4335	if (strncmp(buf, "system", 6)==0) {
4336		mddev->sync_speed_max = 0;
4337		return len;
4338	}
4339	max = simple_strtoul(buf, &e, 10);
4340	if (buf == e || (*e && *e != '\n') || max <= 0)
4341		return -EINVAL;
4342	mddev->sync_speed_max = max;
4343	return len;
4344}
4345
4346static struct md_sysfs_entry md_sync_max =
4347__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4348
4349static ssize_t
4350degraded_show(struct mddev *mddev, char *page)
4351{
4352	return sprintf(page, "%d\n", mddev->degraded);
4353}
4354static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4355
4356static ssize_t
4357sync_force_parallel_show(struct mddev *mddev, char *page)
4358{
4359	return sprintf(page, "%d\n", mddev->parallel_resync);
4360}
4361
4362static ssize_t
4363sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4364{
4365	long n;
4366
4367	if (kstrtol(buf, 10, &n))
4368		return -EINVAL;
4369
4370	if (n != 0 && n != 1)
4371		return -EINVAL;
4372
4373	mddev->parallel_resync = n;
4374
4375	if (mddev->sync_thread)
4376		wake_up(&resync_wait);
4377
4378	return len;
4379}
4380
4381/* force parallel resync, even with shared block devices */
4382static struct md_sysfs_entry md_sync_force_parallel =
4383__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4384       sync_force_parallel_show, sync_force_parallel_store);
4385
4386static ssize_t
4387sync_speed_show(struct mddev *mddev, char *page)
4388{
4389	unsigned long resync, dt, db;
4390	if (mddev->curr_resync == 0)
4391		return sprintf(page, "none\n");
4392	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4393	dt = (jiffies - mddev->resync_mark) / HZ;
4394	if (!dt) dt++;
4395	db = resync - mddev->resync_mark_cnt;
4396	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4397}
4398
4399static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4400
4401static ssize_t
4402sync_completed_show(struct mddev *mddev, char *page)
4403{
4404	unsigned long long max_sectors, resync;
4405
4406	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4407		return sprintf(page, "none\n");
4408
4409	if (mddev->curr_resync == 1 ||
4410	    mddev->curr_resync == 2)
4411		return sprintf(page, "delayed\n");
4412
4413	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4414	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4415		max_sectors = mddev->resync_max_sectors;
4416	else
4417		max_sectors = mddev->dev_sectors;
4418
4419	resync = mddev->curr_resync_completed;
4420	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4421}
4422
4423static struct md_sysfs_entry md_sync_completed =
4424	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4425
4426static ssize_t
4427min_sync_show(struct mddev *mddev, char *page)
4428{
4429	return sprintf(page, "%llu\n",
4430		       (unsigned long long)mddev->resync_min);
4431}
4432static ssize_t
4433min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4434{
4435	unsigned long long min;
4436	int err;
4437
4438	if (kstrtoull(buf, 10, &min))
4439		return -EINVAL;
4440
4441	spin_lock(&mddev->lock);
4442	err = -EINVAL;
4443	if (min > mddev->resync_max)
4444		goto out_unlock;
4445
4446	err = -EBUSY;
4447	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4448		goto out_unlock;
4449
4450	/* Round down to multiple of 4K for safety */
4451	mddev->resync_min = round_down(min, 8);
4452	err = 0;
4453
4454out_unlock:
4455	spin_unlock(&mddev->lock);
4456	return err ?: len;
4457}
4458
4459static struct md_sysfs_entry md_min_sync =
4460__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4461
4462static ssize_t
4463max_sync_show(struct mddev *mddev, char *page)
4464{
4465	if (mddev->resync_max == MaxSector)
4466		return sprintf(page, "max\n");
4467	else
4468		return sprintf(page, "%llu\n",
4469			       (unsigned long long)mddev->resync_max);
4470}
4471static ssize_t
4472max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4473{
4474	int err;
4475	spin_lock(&mddev->lock);
4476	if (strncmp(buf, "max", 3) == 0)
4477		mddev->resync_max = MaxSector;
4478	else {
4479		unsigned long long max;
4480		int chunk;
4481
4482		err = -EINVAL;
4483		if (kstrtoull(buf, 10, &max))
4484			goto out_unlock;
4485		if (max < mddev->resync_min)
4486			goto out_unlock;
4487
4488		err = -EBUSY;
4489		if (max < mddev->resync_max &&
4490		    mddev->ro == 0 &&
4491		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4492			goto out_unlock;
4493
4494		/* Must be a multiple of chunk_size */
4495		chunk = mddev->chunk_sectors;
4496		if (chunk) {
4497			sector_t temp = max;
4498
4499			err = -EINVAL;
4500			if (sector_div(temp, chunk))
4501				goto out_unlock;
4502		}
4503		mddev->resync_max = max;
4504	}
4505	wake_up(&mddev->recovery_wait);
4506	err = 0;
4507out_unlock:
4508	spin_unlock(&mddev->lock);
4509	return err ?: len;
4510}
4511
4512static struct md_sysfs_entry md_max_sync =
4513__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4514
4515static ssize_t
4516suspend_lo_show(struct mddev *mddev, char *page)
4517{
4518	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4519}
4520
4521static ssize_t
4522suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4523{
4524	char *e;
4525	unsigned long long new = simple_strtoull(buf, &e, 10);
4526	unsigned long long old;
4527	int err;
4528
4529	if (buf == e || (*e && *e != '\n'))
4530		return -EINVAL;
4531
4532	err = mddev_lock(mddev);
4533	if (err)
4534		return err;
4535	err = -EINVAL;
4536	if (mddev->pers == NULL ||
4537	    mddev->pers->quiesce == NULL)
4538		goto unlock;
4539	old = mddev->suspend_lo;
4540	mddev->suspend_lo = new;
4541	if (new >= old)
4542		/* Shrinking suspended region */
4543		mddev->pers->quiesce(mddev, 2);
4544	else {
4545		/* Expanding suspended region - need to wait */
4546		mddev->pers->quiesce(mddev, 1);
4547		mddev->pers->quiesce(mddev, 0);
4548	}
4549	err = 0;
4550unlock:
4551	mddev_unlock(mddev);
4552	return err ?: len;
4553}
4554static struct md_sysfs_entry md_suspend_lo =
4555__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4556
4557static ssize_t
4558suspend_hi_show(struct mddev *mddev, char *page)
4559{
4560	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4561}
4562
4563static ssize_t
4564suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4565{
4566	char *e;
4567	unsigned long long new = simple_strtoull(buf, &e, 10);
4568	unsigned long long old;
4569	int err;
4570
4571	if (buf == e || (*e && *e != '\n'))
4572		return -EINVAL;
4573
4574	err = mddev_lock(mddev);
4575	if (err)
4576		return err;
4577	err = -EINVAL;
4578	if (mddev->pers == NULL ||
4579	    mddev->pers->quiesce == NULL)
4580		goto unlock;
4581	old = mddev->suspend_hi;
4582	mddev->suspend_hi = new;
4583	if (new <= old)
4584		/* Shrinking suspended region */
4585		mddev->pers->quiesce(mddev, 2);
4586	else {
4587		/* Expanding suspended region - need to wait */
4588		mddev->pers->quiesce(mddev, 1);
4589		mddev->pers->quiesce(mddev, 0);
4590	}
4591	err = 0;
4592unlock:
4593	mddev_unlock(mddev);
4594	return err ?: len;
4595}
4596static struct md_sysfs_entry md_suspend_hi =
4597__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4598
4599static ssize_t
4600reshape_position_show(struct mddev *mddev, char *page)
4601{
4602	if (mddev->reshape_position != MaxSector)
4603		return sprintf(page, "%llu\n",
4604			       (unsigned long long)mddev->reshape_position);
4605	strcpy(page, "none\n");
4606	return 5;
4607}
4608
4609static ssize_t
4610reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4611{
4612	struct md_rdev *rdev;
4613	char *e;
4614	int err;
4615	unsigned long long new = simple_strtoull(buf, &e, 10);
4616
4617	if (buf == e || (*e && *e != '\n'))
4618		return -EINVAL;
4619	err = mddev_lock(mddev);
4620	if (err)
4621		return err;
4622	err = -EBUSY;
4623	if (mddev->pers)
4624		goto unlock;
4625	mddev->reshape_position = new;
4626	mddev->delta_disks = 0;
4627	mddev->reshape_backwards = 0;
4628	mddev->new_level = mddev->level;
4629	mddev->new_layout = mddev->layout;
4630	mddev->new_chunk_sectors = mddev->chunk_sectors;
4631	rdev_for_each(rdev, mddev)
4632		rdev->new_data_offset = rdev->data_offset;
4633	err = 0;
4634unlock:
4635	mddev_unlock(mddev);
4636	return err ?: len;
4637}
4638
4639static struct md_sysfs_entry md_reshape_position =
4640__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4641       reshape_position_store);
4642
4643static ssize_t
4644reshape_direction_show(struct mddev *mddev, char *page)
4645{
4646	return sprintf(page, "%s\n",
4647		       mddev->reshape_backwards ? "backwards" : "forwards");
4648}
4649
4650static ssize_t
4651reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4652{
4653	int backwards = 0;
4654	int err;
4655
4656	if (cmd_match(buf, "forwards"))
4657		backwards = 0;
4658	else if (cmd_match(buf, "backwards"))
4659		backwards = 1;
4660	else
4661		return -EINVAL;
4662	if (mddev->reshape_backwards == backwards)
4663		return len;
4664
4665	err = mddev_lock(mddev);
4666	if (err)
4667		return err;
4668	/* check if we are allowed to change */
4669	if (mddev->delta_disks)
4670		err = -EBUSY;
4671	else if (mddev->persistent &&
4672	    mddev->major_version == 0)
4673		err =  -EINVAL;
4674	else
4675		mddev->reshape_backwards = backwards;
4676	mddev_unlock(mddev);
4677	return err ?: len;
4678}
4679
4680static struct md_sysfs_entry md_reshape_direction =
4681__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4682       reshape_direction_store);
4683
4684static ssize_t
4685array_size_show(struct mddev *mddev, char *page)
4686{
4687	if (mddev->external_size)
4688		return sprintf(page, "%llu\n",
4689			       (unsigned long long)mddev->array_sectors/2);
4690	else
4691		return sprintf(page, "default\n");
4692}
4693
4694static ssize_t
4695array_size_store(struct mddev *mddev, const char *buf, size_t len)
4696{
4697	sector_t sectors;
4698	int err;
4699
4700	err = mddev_lock(mddev);
4701	if (err)
4702		return err;
4703
4704	if (strncmp(buf, "default", 7) == 0) {
4705		if (mddev->pers)
4706			sectors = mddev->pers->size(mddev, 0, 0);
4707		else
4708			sectors = mddev->array_sectors;
4709
4710		mddev->external_size = 0;
4711	} else {
4712		if (strict_blocks_to_sectors(buf, &sectors) < 0)
4713			err = -EINVAL;
4714		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4715			err = -E2BIG;
4716		else
4717			mddev->external_size = 1;
4718	}
4719
4720	if (!err) {
4721		mddev->array_sectors = sectors;
4722		if (mddev->pers) {
4723			set_capacity(mddev->gendisk, mddev->array_sectors);
4724			revalidate_disk(mddev->gendisk);
4725		}
4726	}
4727	mddev_unlock(mddev);
4728	return err ?: len;
4729}
4730
4731static struct md_sysfs_entry md_array_size =
4732__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4733       array_size_store);
4734
4735static struct attribute *md_default_attrs[] = {
4736	&md_level.attr,
4737	&md_layout.attr,
4738	&md_raid_disks.attr,
4739	&md_chunk_size.attr,
4740	&md_size.attr,
4741	&md_resync_start.attr,
4742	&md_metadata.attr,
4743	&md_new_device.attr,
4744	&md_safe_delay.attr,
4745	&md_array_state.attr,
4746	&md_reshape_position.attr,
4747	&md_reshape_direction.attr,
4748	&md_array_size.attr,
4749	&max_corr_read_errors.attr,
4750	NULL,
4751};
4752
4753static struct attribute *md_redundancy_attrs[] = {
4754	&md_scan_mode.attr,
4755	&md_last_scan_mode.attr,
4756	&md_mismatches.attr,
4757	&md_sync_min.attr,
4758	&md_sync_max.attr,
4759	&md_sync_speed.attr,
4760	&md_sync_force_parallel.attr,
4761	&md_sync_completed.attr,
4762	&md_min_sync.attr,
4763	&md_max_sync.attr,
4764	&md_suspend_lo.attr,
4765	&md_suspend_hi.attr,
4766	&md_bitmap.attr,
4767	&md_degraded.attr,
4768	NULL,
4769};
4770static struct attribute_group md_redundancy_group = {
4771	.name = NULL,
4772	.attrs = md_redundancy_attrs,
4773};
4774
4775static ssize_t
4776md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4777{
4778	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4779	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4780	ssize_t rv;
4781
4782	if (!entry->show)
4783		return -EIO;
4784	spin_lock(&all_mddevs_lock);
4785	if (list_empty(&mddev->all_mddevs)) {
4786		spin_unlock(&all_mddevs_lock);
4787		return -EBUSY;
4788	}
4789	mddev_get(mddev);
4790	spin_unlock(&all_mddevs_lock);
4791
4792	rv = entry->show(mddev, page);
4793	mddev_put(mddev);
4794	return rv;
4795}
4796
4797static ssize_t
4798md_attr_store(struct kobject *kobj, struct attribute *attr,
4799	      const char *page, size_t length)
4800{
4801	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4802	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4803	ssize_t rv;
4804
4805	if (!entry->store)
4806		return -EIO;
4807	if (!capable(CAP_SYS_ADMIN))
4808		return -EACCES;
4809	spin_lock(&all_mddevs_lock);
4810	if (list_empty(&mddev->all_mddevs)) {
4811		spin_unlock(&all_mddevs_lock);
4812		return -EBUSY;
4813	}
4814	mddev_get(mddev);
4815	spin_unlock(&all_mddevs_lock);
4816	rv = entry->store(mddev, page, length);
4817	mddev_put(mddev);
4818	return rv;
4819}
4820
4821static void md_free(struct kobject *ko)
4822{
4823	struct mddev *mddev = container_of(ko, struct mddev, kobj);
4824
4825	if (mddev->sysfs_state)
4826		sysfs_put(mddev->sysfs_state);
4827
4828	if (mddev->queue)
4829		blk_cleanup_queue(mddev->queue);
4830	if (mddev->gendisk) {
4831		del_gendisk(mddev->gendisk);
4832		put_disk(mddev->gendisk);
4833	}
4834
4835	kfree(mddev);
4836}
4837
4838static const struct sysfs_ops md_sysfs_ops = {
4839	.show	= md_attr_show,
4840	.store	= md_attr_store,
4841};
4842static struct kobj_type md_ktype = {
4843	.release	= md_free,
4844	.sysfs_ops	= &md_sysfs_ops,
4845	.default_attrs	= md_default_attrs,
4846};
4847
4848int mdp_major = 0;
4849
4850static void mddev_delayed_delete(struct work_struct *ws)
4851{
4852	struct mddev *mddev = container_of(ws, struct mddev, del_work);
4853
4854	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4855	kobject_del(&mddev->kobj);
4856	kobject_put(&mddev->kobj);
4857}
4858
4859static int md_alloc(dev_t dev, char *name)
4860{
4861	static DEFINE_MUTEX(disks_mutex);
4862	struct mddev *mddev = mddev_find(dev);
4863	struct gendisk *disk;
4864	int partitioned;
4865	int shift;
4866	int unit;
4867	int error;
4868
4869	if (!mddev)
4870		return -ENODEV;
4871
4872	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4873	shift = partitioned ? MdpMinorShift : 0;
4874	unit = MINOR(mddev->unit) >> shift;
4875
4876	/* wait for any previous instance of this device to be
4877	 * completely removed (mddev_delayed_delete).
4878	 */
4879	flush_workqueue(md_misc_wq);
4880
4881	mutex_lock(&disks_mutex);
4882	error = -EEXIST;
4883	if (mddev->gendisk)
4884		goto abort;
4885
4886	if (name) {
4887		/* Need to ensure that 'name' is not a duplicate.
4888		 */
4889		struct mddev *mddev2;
4890		spin_lock(&all_mddevs_lock);
4891
4892		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4893			if (mddev2->gendisk &&
4894			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
4895				spin_unlock(&all_mddevs_lock);
4896				goto abort;
4897			}
4898		spin_unlock(&all_mddevs_lock);
4899	}
4900
4901	error = -ENOMEM;
4902	mddev->queue = blk_alloc_queue(GFP_KERNEL);
4903	if (!mddev->queue)
4904		goto abort;
4905	mddev->queue->queuedata = mddev;
4906
4907	blk_queue_make_request(mddev->queue, md_make_request);
4908	blk_set_stacking_limits(&mddev->queue->limits);
4909
4910	disk = alloc_disk(1 << shift);
4911	if (!disk) {
4912		blk_cleanup_queue(mddev->queue);
4913		mddev->queue = NULL;
4914		goto abort;
4915	}
4916	disk->major = MAJOR(mddev->unit);
4917	disk->first_minor = unit << shift;
4918	if (name)
4919		strcpy(disk->disk_name, name);
4920	else if (partitioned)
4921		sprintf(disk->disk_name, "md_d%d", unit);
4922	else
4923		sprintf(disk->disk_name, "md%d", unit);
4924	disk->fops = &md_fops;
4925	disk->private_data = mddev;
4926	disk->queue = mddev->queue;
4927	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
4928	/* Allow extended partitions.  This makes the
4929	 * 'mdp' device redundant, but we can't really
4930	 * remove it now.
4931	 */
4932	disk->flags |= GENHD_FL_EXT_DEVT;
4933	mddev->gendisk = disk;
4934	/* As soon as we call add_disk(), another thread could get
4935	 * through to md_open, so make sure it doesn't get too far
4936	 */
4937	mutex_lock(&mddev->open_mutex);
4938	add_disk(disk);
4939
4940	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4941				     &disk_to_dev(disk)->kobj, "%s", "md");
4942	if (error) {
4943		/* This isn't possible, but as kobject_init_and_add is marked
4944		 * __must_check, we must do something with the result
4945		 */
4946		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4947		       disk->disk_name);
4948		error = 0;
4949	}
4950	if (mddev->kobj.sd &&
4951	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4952		printk(KERN_DEBUG "pointless warning\n");
4953	mutex_unlock(&mddev->open_mutex);
4954 abort:
4955	mutex_unlock(&disks_mutex);
4956	if (!error && mddev->kobj.sd) {
4957		kobject_uevent(&mddev->kobj, KOBJ_ADD);
4958		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
4959	}
4960	mddev_put(mddev);
4961	return error;
4962}
4963
4964static struct kobject *md_probe(dev_t dev, int *part, void *data)
4965{
4966	md_alloc(dev, NULL);
4967	return NULL;
4968}
4969
4970static int add_named_array(const char *val, struct kernel_param *kp)
4971{
4972	/* val must be "md_*" where * is not all digits.
4973	 * We allocate an array with a large free minor number, and
4974	 * set the name to val.  val must not already be an active name.
4975	 */
4976	int len = strlen(val);
4977	char buf[DISK_NAME_LEN];
4978
4979	while (len && val[len-1] == '\n')
4980		len--;
4981	if (len >= DISK_NAME_LEN)
4982		return -E2BIG;
4983	strlcpy(buf, val, len+1);
4984	if (strncmp(buf, "md_", 3) != 0)
4985		return -EINVAL;
4986	return md_alloc(0, buf);
4987}
4988
4989static void md_safemode_timeout(unsigned long data)
4990{
4991	struct mddev *mddev = (struct mddev *) data;
4992
4993	if (!atomic_read(&mddev->writes_pending)) {
4994		mddev->safemode = 1;
4995		if (mddev->external)
4996			sysfs_notify_dirent_safe(mddev->sysfs_state);
4997	}
4998	md_wakeup_thread(mddev->thread);
4999}
5000
5001static int start_dirty_degraded;
5002
5003int md_run(struct mddev *mddev)
5004{
5005	int err;
5006	struct md_rdev *rdev;
5007	struct md_personality *pers;
5008
5009	if (list_empty(&mddev->disks))
5010		/* cannot run an array with no devices.. */
5011		return -EINVAL;
5012
5013	if (mddev->pers)
5014		return -EBUSY;
5015	/* Cannot run until previous stop completes properly */
5016	if (mddev->sysfs_active)
5017		return -EBUSY;
5018
5019	/*
5020	 * Analyze all RAID superblock(s)
5021	 */
5022	if (!mddev->raid_disks) {
5023		if (!mddev->persistent)
5024			return -EINVAL;
5025		analyze_sbs(mddev);
5026	}
5027
5028	if (mddev->level != LEVEL_NONE)
5029		request_module("md-level-%d", mddev->level);
5030	else if (mddev->clevel[0])
5031		request_module("md-%s", mddev->clevel);
5032
5033	/*
5034	 * Drop all container device buffers, from now on
5035	 * the only valid external interface is through the md
5036	 * device.
5037	 */
5038	rdev_for_each(rdev, mddev) {
5039		if (test_bit(Faulty, &rdev->flags))
5040			continue;
5041		sync_blockdev(rdev->bdev);
5042		invalidate_bdev(rdev->bdev);
5043
5044		/* perform some consistency tests on the device.
5045		 * We don't want the data to overlap the metadata,
5046		 * Internal Bitmap issues have been handled elsewhere.
5047		 */
5048		if (rdev->meta_bdev) {
5049			/* Nothing to check */;
5050		} else if (rdev->data_offset < rdev->sb_start) {
5051			if (mddev->dev_sectors &&
5052			    rdev->data_offset + mddev->dev_sectors
5053			    > rdev->sb_start) {
5054				printk("md: %s: data overlaps metadata\n",
5055				       mdname(mddev));
5056				return -EINVAL;
5057			}
5058		} else {
5059			if (rdev->sb_start + rdev->sb_size/512
5060			    > rdev->data_offset) {
5061				printk("md: %s: metadata overlaps data\n",
5062				       mdname(mddev));
5063				return -EINVAL;
5064			}
5065		}
5066		sysfs_notify_dirent_safe(rdev->sysfs_state);
5067	}
5068
5069	if (mddev->bio_set == NULL)
5070		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
5071
5072	spin_lock(&pers_lock);
5073	pers = find_pers(mddev->level, mddev->clevel);
5074	if (!pers || !try_module_get(pers->owner)) {
5075		spin_unlock(&pers_lock);
5076		if (mddev->level != LEVEL_NONE)
5077			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
5078			       mddev->level);
5079		else
5080			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
5081			       mddev->clevel);
5082		return -EINVAL;
5083	}
5084	spin_unlock(&pers_lock);
5085	if (mddev->level != pers->level) {
5086		mddev->level = pers->level;
5087		mddev->new_level = pers->level;
5088	}
5089	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5090
5091	if (mddev->reshape_position != MaxSector &&
5092	    pers->start_reshape == NULL) {
5093		/* This personality cannot handle reshaping... */
5094		module_put(pers->owner);
5095		return -EINVAL;
5096	}
5097
5098	if (pers->sync_request) {
5099		/* Warn if this is a potentially silly
5100		 * configuration.
5101		 */
5102		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5103		struct md_rdev *rdev2;
5104		int warned = 0;
5105
5106		rdev_for_each(rdev, mddev)
5107			rdev_for_each(rdev2, mddev) {
5108				if (rdev < rdev2 &&
5109				    rdev->bdev->bd_contains ==
5110				    rdev2->bdev->bd_contains) {
5111					printk(KERN_WARNING
5112					       "%s: WARNING: %s appears to be"
5113					       " on the same physical disk as"
5114					       " %s.\n",
5115					       mdname(mddev),
5116					       bdevname(rdev->bdev,b),
5117					       bdevname(rdev2->bdev,b2));
5118					warned = 1;
5119				}
5120			}
5121
5122		if (warned)
5123			printk(KERN_WARNING
5124			       "True protection against single-disk"
5125			       " failure might be compromised.\n");
5126	}
5127
5128	mddev->recovery = 0;
5129	/* may be over-ridden by personality */
5130	mddev->resync_max_sectors = mddev->dev_sectors;
5131
5132	mddev->ok_start_degraded = start_dirty_degraded;
5133
5134	if (start_readonly && mddev->ro == 0)
5135		mddev->ro = 2; /* read-only, but switch on first write */
5136
5137	err = pers->run(mddev);
5138	if (err)
5139		printk(KERN_ERR "md: pers->run() failed ...\n");
5140	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5141		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
5142			  " but 'external_size' not in effect?\n", __func__);
5143		printk(KERN_ERR
5144		       "md: invalid array_size %llu > default size %llu\n",
5145		       (unsigned long long)mddev->array_sectors / 2,
5146		       (unsigned long long)pers->size(mddev, 0, 0) / 2);
5147		err = -EINVAL;
5148	}
5149	if (err == 0 && pers->sync_request &&
5150	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5151		struct bitmap *bitmap;
5152
5153		bitmap = bitmap_create(mddev, -1);
5154		if (IS_ERR(bitmap)) {
5155			err = PTR_ERR(bitmap);
5156			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
5157			       mdname(mddev), err);
5158		} else
5159			mddev->bitmap = bitmap;
5160
5161	}
5162	if (err) {
5163		mddev_detach(mddev);
5164		if (mddev->private)
5165			pers->free(mddev, mddev->private);
5166		mddev->private = NULL;
5167		module_put(pers->owner);
5168		bitmap_destroy(mddev);
5169		return err;
5170	}
5171	if (mddev->queue) {
5172		mddev->queue->backing_dev_info.congested_data = mddev;
5173		mddev->queue->backing_dev_info.congested_fn = md_congested;
5174		blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
5175	}
5176	if (pers->sync_request) {
5177		if (mddev->kobj.sd &&
5178		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5179			printk(KERN_WARNING
5180			       "md: cannot register extra attributes for %s\n",
5181			       mdname(mddev));
5182		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5183	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
5184		mddev->ro = 0;
5185
5186	atomic_set(&mddev->writes_pending,0);
5187	atomic_set(&mddev->max_corr_read_errors,
5188		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5189	mddev->safemode = 0;
5190	mddev->safemode_timer.function = md_safemode_timeout;
5191	mddev->safemode_timer.data = (unsigned long) mddev;
5192	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5193	mddev->in_sync = 1;
5194	smp_wmb();
5195	spin_lock(&mddev->lock);
5196	mddev->pers = pers;
5197	mddev->ready = 1;
5198	spin_unlock(&mddev->lock);
5199	rdev_for_each(rdev, mddev)
5200		if (rdev->raid_disk >= 0)
5201			if (sysfs_link_rdev(mddev, rdev))
5202				/* failure here is OK */;
5203
5204	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5205
5206	if (mddev->flags & MD_UPDATE_SB_FLAGS)
5207		md_update_sb(mddev, 0);
5208
5209	md_new_event(mddev);
5210	sysfs_notify_dirent_safe(mddev->sysfs_state);
5211	sysfs_notify_dirent_safe(mddev->sysfs_action);
5212	sysfs_notify(&mddev->kobj, NULL, "degraded");
5213	return 0;
5214}
5215EXPORT_SYMBOL_GPL(md_run);
5216
5217static int do_md_run(struct mddev *mddev)
5218{
5219	int err;
5220
5221	err = md_run(mddev);
5222	if (err)
5223		goto out;
5224	err = bitmap_load(mddev);
5225	if (err) {
5226		bitmap_destroy(mddev);
5227		goto out;
5228	}
5229
5230	md_wakeup_thread(mddev->thread);
5231	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5232
5233	set_capacity(mddev->gendisk, mddev->array_sectors);
5234	revalidate_disk(mddev->gendisk);
5235	mddev->changed = 1;
5236	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5237out:
5238	return err;
5239}
5240
5241static int restart_array(struct mddev *mddev)
5242{
5243	struct gendisk *disk = mddev->gendisk;
5244
5245	/* Complain if it has no devices */
5246	if (list_empty(&mddev->disks))
5247		return -ENXIO;
5248	if (!mddev->pers)
5249		return -EINVAL;
5250	if (!mddev->ro)
5251		return -EBUSY;
5252	mddev->safemode = 0;
5253	mddev->ro = 0;
5254	set_disk_ro(disk, 0);
5255	printk(KERN_INFO "md: %s switched to read-write mode.\n",
5256		mdname(mddev));
5257	/* Kick recovery or resync if necessary */
5258	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5259	md_wakeup_thread(mddev->thread);
5260	md_wakeup_thread(mddev->sync_thread);
5261	sysfs_notify_dirent_safe(mddev->sysfs_state);
5262	return 0;
5263}
5264
5265static void md_clean(struct mddev *mddev)
5266{
5267	mddev->array_sectors = 0;
5268	mddev->external_size = 0;
5269	mddev->dev_sectors = 0;
5270	mddev->raid_disks = 0;
5271	mddev->recovery_cp = 0;
5272	mddev->resync_min = 0;
5273	mddev->resync_max = MaxSector;
5274	mddev->reshape_position = MaxSector;
5275	mddev->external = 0;
5276	mddev->persistent = 0;
5277	mddev->level = LEVEL_NONE;
5278	mddev->clevel[0] = 0;
5279	mddev->flags = 0;
5280	mddev->ro = 0;
5281	mddev->metadata_type[0] = 0;
5282	mddev->chunk_sectors = 0;
5283	mddev->ctime = mddev->utime = 0;
5284	mddev->layout = 0;
5285	mddev->max_disks = 0;
5286	mddev->events = 0;
5287	mddev->can_decrease_events = 0;
5288	mddev->delta_disks = 0;
5289	mddev->reshape_backwards = 0;
5290	mddev->new_level = LEVEL_NONE;
5291	mddev->new_layout = 0;
5292	mddev->new_chunk_sectors = 0;
5293	mddev->curr_resync = 0;
5294	atomic64_set(&mddev->resync_mismatches, 0);
5295	mddev->suspend_lo = mddev->suspend_hi = 0;
5296	mddev->sync_speed_min = mddev->sync_speed_max = 0;
5297	mddev->recovery = 0;
5298	mddev->in_sync = 0;
5299	mddev->changed = 0;
5300	mddev->degraded = 0;
5301	mddev->safemode = 0;
5302	mddev->private = NULL;
5303	mddev->merge_check_needed = 0;
5304	mddev->bitmap_info.offset = 0;
5305	mddev->bitmap_info.default_offset = 0;
5306	mddev->bitmap_info.default_space = 0;
5307	mddev->bitmap_info.chunksize = 0;
5308	mddev->bitmap_info.daemon_sleep = 0;
5309	mddev->bitmap_info.max_write_behind = 0;
5310}
5311
5312static void __md_stop_writes(struct mddev *mddev)
5313{
5314	if (mddev_is_clustered(mddev))
5315		md_cluster_ops->metadata_update_start(mddev);
5316	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5317	flush_workqueue(md_misc_wq);
5318	if (mddev->sync_thread) {
5319		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5320		md_reap_sync_thread(mddev);
5321	}
5322
5323	del_timer_sync(&mddev->safemode_timer);
5324
5325	bitmap_flush(mddev);
5326	md_super_wait(mddev);
5327
5328	if (mddev->ro == 0 &&
5329	    (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
5330		/* mark array as shutdown cleanly */
5331		mddev->in_sync = 1;
5332		md_update_sb(mddev, 1);
5333	}
5334	if (mddev_is_clustered(mddev))
5335		md_cluster_ops->metadata_update_finish(mddev);
5336}
5337
5338void md_stop_writes(struct mddev *mddev)
5339{
5340	mddev_lock_nointr(mddev);
5341	__md_stop_writes(mddev);
5342	mddev_unlock(mddev);
5343}
5344EXPORT_SYMBOL_GPL(md_stop_writes);
5345
5346static void mddev_detach(struct mddev *mddev)
5347{
5348	struct bitmap *bitmap = mddev->bitmap;
5349	/* wait for behind writes to complete */
5350	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
5351		printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
5352		       mdname(mddev));
5353		/* need to kick something here to make sure I/O goes? */
5354		wait_event(bitmap->behind_wait,
5355			   atomic_read(&bitmap->behind_writes) == 0);
5356	}
5357	if (mddev->pers && mddev->pers->quiesce) {
5358		mddev->pers->quiesce(mddev, 1);
5359		mddev->pers->quiesce(mddev, 0);
5360	}
5361	md_unregister_thread(&mddev->thread);
5362	if (mddev->queue)
5363		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5364}
5365
5366static void __md_stop(struct mddev *mddev)
5367{
5368	struct md_personality *pers = mddev->pers;
5369	mddev_detach(mddev);
5370	/* Ensure ->event_work is done */
5371	flush_workqueue(md_misc_wq);
5372	spin_lock(&mddev->lock);
5373	mddev->ready = 0;
5374	mddev->pers = NULL;
5375	spin_unlock(&mddev->lock);
5376	pers->free(mddev, mddev->private);
5377	mddev->private = NULL;
5378	if (pers->sync_request && mddev->to_remove == NULL)
5379		mddev->to_remove = &md_redundancy_group;
5380	module_put(pers->owner);
5381	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5382}
5383
5384void md_stop(struct mddev *mddev)
5385{
5386	/* stop the array and free an attached data structures.
5387	 * This is called from dm-raid
5388	 */
5389	__md_stop(mddev);
5390	bitmap_destroy(mddev);
5391	if (mddev->bio_set)
5392		bioset_free(mddev->bio_set);
5393}
5394
5395EXPORT_SYMBOL_GPL(md_stop);
5396
5397static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5398{
5399	int err = 0;
5400	int did_freeze = 0;
5401
5402	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5403		did_freeze = 1;
5404		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5405		md_wakeup_thread(mddev->thread);
5406	}
5407	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5408		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5409	if (mddev->sync_thread)
5410		/* Thread might be blocked waiting for metadata update
5411		 * which will now never happen */
5412		wake_up_process(mddev->sync_thread->tsk);
5413
5414	mddev_unlock(mddev);
5415	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
5416					  &mddev->recovery));
5417	mddev_lock_nointr(mddev);
5418
5419	mutex_lock(&mddev->open_mutex);
5420	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5421	    mddev->sync_thread ||
5422	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
5423	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5424		printk("md: %s still in use.\n",mdname(mddev));
5425		if (did_freeze) {
5426			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5427			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5428			md_wakeup_thread(mddev->thread);
5429		}
5430		err = -EBUSY;
5431		goto out;
5432	}
5433	if (mddev->pers) {
5434		__md_stop_writes(mddev);
5435
5436		err  = -ENXIO;
5437		if (mddev->ro==1)
5438			goto out;
5439		mddev->ro = 1;
5440		set_disk_ro(mddev->gendisk, 1);
5441		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5442		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5443		md_wakeup_thread(mddev->thread);
5444		sysfs_notify_dirent_safe(mddev->sysfs_state);
5445		err = 0;
5446	}
5447out:
5448	mutex_unlock(&mddev->open_mutex);
5449	return err;
5450}
5451
5452/* mode:
5453 *   0 - completely stop and dis-assemble array
5454 *   2 - stop but do not disassemble array
5455 */
5456static int do_md_stop(struct mddev *mddev, int mode,
5457		      struct block_device *bdev)
5458{
5459	struct gendisk *disk = mddev->gendisk;
5460	struct md_rdev *rdev;
5461	int did_freeze = 0;
5462
5463	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5464		did_freeze = 1;
5465		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5466		md_wakeup_thread(mddev->thread);
5467	}
5468	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5469		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5470	if (mddev->sync_thread)
5471		/* Thread might be blocked waiting for metadata update
5472		 * which will now never happen */
5473		wake_up_process(mddev->sync_thread->tsk);
5474
5475	mddev_unlock(mddev);
5476	wait_event(resync_wait, (mddev->sync_thread == NULL &&
5477				 !test_bit(MD_RECOVERY_RUNNING,
5478					   &mddev->recovery)));
5479	mddev_lock_nointr(mddev);
5480
5481	mutex_lock(&mddev->open_mutex);
5482	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5483	    mddev->sysfs_active ||
5484	    mddev->sync_thread ||
5485	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
5486	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5487		printk("md: %s still in use.\n",mdname(mddev));
5488		mutex_unlock(&mddev->open_mutex);
5489		if (did_freeze) {
5490			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5491			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5492			md_wakeup_thread(mddev->thread);
5493		}
5494		return -EBUSY;
5495	}
5496	if (mddev->pers) {
5497		if (mddev->ro)
5498			set_disk_ro(disk, 0);
5499
5500		__md_stop_writes(mddev);
5501		__md_stop(mddev);
5502		mddev->queue->merge_bvec_fn = NULL;
5503		mddev->queue->backing_dev_info.congested_fn = NULL;
5504
5505		/* tell userspace to handle 'inactive' */
5506		sysfs_notify_dirent_safe(mddev->sysfs_state);
5507
5508		rdev_for_each(rdev, mddev)
5509			if (rdev->raid_disk >= 0)
5510				sysfs_unlink_rdev(mddev, rdev);
5511
5512		set_capacity(disk, 0);
5513		mutex_unlock(&mddev->open_mutex);
5514		mddev->changed = 1;
5515		revalidate_disk(disk);
5516
5517		if (mddev->ro)
5518			mddev->ro = 0;
5519	} else
5520		mutex_unlock(&mddev->open_mutex);
5521	/*
5522	 * Free resources if final stop
5523	 */
5524	if (mode == 0) {
5525		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
5526
5527		bitmap_destroy(mddev);
5528		if (mddev->bitmap_info.file) {
5529			struct file *f = mddev->bitmap_info.file;
5530			spin_lock(&mddev->lock);
5531			mddev->bitmap_info.file = NULL;
5532			spin_unlock(&mddev->lock);
5533			fput(f);
5534		}
5535		mddev->bitmap_info.offset = 0;
5536
5537		export_array(mddev);
5538
5539		md_clean(mddev);
5540		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5541		if (mddev->hold_active == UNTIL_STOP)
5542			mddev->hold_active = 0;
5543	}
5544	blk_integrity_unregister(disk);
5545	md_new_event(mddev);
5546	sysfs_notify_dirent_safe(mddev->sysfs_state);
5547	return 0;
5548}
5549
5550#ifndef MODULE
5551static void autorun_array(struct mddev *mddev)
5552{
5553	struct md_rdev *rdev;
5554	int err;
5555
5556	if (list_empty(&mddev->disks))
5557		return;
5558
5559	printk(KERN_INFO "md: running: ");
5560
5561	rdev_for_each(rdev, mddev) {
5562		char b[BDEVNAME_SIZE];
5563		printk("<%s>", bdevname(rdev->bdev,b));
5564	}
5565	printk("\n");
5566
5567	err = do_md_run(mddev);
5568	if (err) {
5569		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5570		do_md_stop(mddev, 0, NULL);
5571	}
5572}
5573
5574/*
5575 * lets try to run arrays based on all disks that have arrived
5576 * until now. (those are in pending_raid_disks)
5577 *
5578 * the method: pick the first pending disk, collect all disks with
5579 * the same UUID, remove all from the pending list and put them into
5580 * the 'same_array' list. Then order this list based on superblock
5581 * update time (freshest comes first), kick out 'old' disks and
5582 * compare superblocks. If everything's fine then run it.
5583 *
5584 * If "unit" is allocated, then bump its reference count
5585 */
5586static void autorun_devices(int part)
5587{
5588	struct md_rdev *rdev0, *rdev, *tmp;
5589	struct mddev *mddev;
5590	char b[BDEVNAME_SIZE];
5591
5592	printk(KERN_INFO "md: autorun ...\n");
5593	while (!list_empty(&pending_raid_disks)) {
5594		int unit;
5595		dev_t dev;
5596		LIST_HEAD(candidates);
5597		rdev0 = list_entry(pending_raid_disks.next,
5598					 struct md_rdev, same_set);
5599
5600		printk(KERN_INFO "md: considering %s ...\n",
5601			bdevname(rdev0->bdev,b));
5602		INIT_LIST_HEAD(&candidates);
5603		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5604			if (super_90_load(rdev, rdev0, 0) >= 0) {
5605				printk(KERN_INFO "md:  adding %s ...\n",
5606					bdevname(rdev->bdev,b));
5607				list_move(&rdev->same_set, &candidates);
5608			}
5609		/*
5610		 * now we have a set of devices, with all of them having
5611		 * mostly sane superblocks. It's time to allocate the
5612		 * mddev.
5613		 */
5614		if (part) {
5615			dev = MKDEV(mdp_major,
5616				    rdev0->preferred_minor << MdpMinorShift);
5617			unit = MINOR(dev) >> MdpMinorShift;
5618		} else {
5619			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
5620			unit = MINOR(dev);
5621		}
5622		if (rdev0->preferred_minor != unit) {
5623			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
5624			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5625			break;
5626		}
5627
5628		md_probe(dev, NULL, NULL);
5629		mddev = mddev_find(dev);
5630		if (!mddev || !mddev->gendisk) {
5631			if (mddev)
5632				mddev_put(mddev);
5633			printk(KERN_ERR
5634				"md: cannot allocate memory for md drive.\n");
5635			break;
5636		}
5637		if (mddev_lock(mddev))
5638			printk(KERN_WARNING "md: %s locked, cannot run\n",
5639			       mdname(mddev));
5640		else if (mddev->raid_disks || mddev->major_version
5641			 || !list_empty(&mddev->disks)) {
5642			printk(KERN_WARNING
5643				"md: %s already running, cannot run %s\n",
5644				mdname(mddev), bdevname(rdev0->bdev,b));
5645			mddev_unlock(mddev);
5646		} else {
5647			printk(KERN_INFO "md: created %s\n", mdname(mddev));
5648			mddev->persistent = 1;
5649			rdev_for_each_list(rdev, tmp, &candidates) {
5650				list_del_init(&rdev->same_set);
5651				if (bind_rdev_to_array(rdev, mddev))
5652					export_rdev(rdev);
5653			}
5654			autorun_array(mddev);
5655			mddev_unlock(mddev);
5656		}
5657		/* on success, candidates will be empty, on error
5658		 * it won't...
5659		 */
5660		rdev_for_each_list(rdev, tmp, &candidates) {
5661			list_del_init(&rdev->same_set);
5662			export_rdev(rdev);
5663		}
5664		mddev_put(mddev);
5665	}
5666	printk(KERN_INFO "md: ... autorun DONE.\n");
5667}
5668#endif /* !MODULE */
5669
5670static int get_version(void __user *arg)
5671{
5672	mdu_version_t ver;
5673
5674	ver.major = MD_MAJOR_VERSION;
5675	ver.minor = MD_MINOR_VERSION;
5676	ver.patchlevel = MD_PATCHLEVEL_VERSION;
5677
5678	if (copy_to_user(arg, &ver, sizeof(ver)))
5679		return -EFAULT;
5680
5681	return 0;
5682}
5683
5684static int get_array_info(struct mddev *mddev, void __user *arg)
5685{
5686	mdu_array_info_t info;
5687	int nr,working,insync,failed,spare;
5688	struct md_rdev *rdev;
5689
5690	nr = working = insync = failed = spare = 0;
5691	rcu_read_lock();
5692	rdev_for_each_rcu(rdev, mddev) {
5693		nr++;
5694		if (test_bit(Faulty, &rdev->flags))
5695			failed++;
5696		else {
5697			working++;
5698			if (test_bit(In_sync, &rdev->flags))
5699				insync++;
5700			else
5701				spare++;
5702		}
5703	}
5704	rcu_read_unlock();
5705
5706	info.major_version = mddev->major_version;
5707	info.minor_version = mddev->minor_version;
5708	info.patch_version = MD_PATCHLEVEL_VERSION;
5709	info.ctime         = mddev->ctime;
5710	info.level         = mddev->level;
5711	info.size          = mddev->dev_sectors / 2;
5712	if (info.size != mddev->dev_sectors / 2) /* overflow */
5713		info.size = -1;
5714	info.nr_disks      = nr;
5715	info.raid_disks    = mddev->raid_disks;
5716	info.md_minor      = mddev->md_minor;
5717	info.not_persistent= !mddev->persistent;
5718
5719	info.utime         = mddev->utime;
5720	info.state         = 0;
5721	if (mddev->in_sync)
5722		info.state = (1<<MD_SB_CLEAN);
5723	if (mddev->bitmap && mddev->bitmap_info.offset)
5724		info.state |= (1<<MD_SB_BITMAP_PRESENT);
5725	if (mddev_is_clustered(mddev))
5726		info.state |= (1<<MD_SB_CLUSTERED);
5727	info.active_disks  = insync;
5728	info.working_disks = working;
5729	info.failed_disks  = failed;
5730	info.spare_disks   = spare;
5731
5732	info.layout        = mddev->layout;
5733	info.chunk_size    = mddev->chunk_sectors << 9;
5734
5735	if (copy_to_user(arg, &info, sizeof(info)))
5736		return -EFAULT;
5737
5738	return 0;
5739}
5740
5741static int get_bitmap_file(struct mddev *mddev, void __user * arg)
5742{
5743	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5744	char *ptr;
5745	int err;
5746
5747	file = kzalloc(sizeof(*file), GFP_NOIO);
5748	if (!file)
5749		return -ENOMEM;
5750
5751	err = 0;
5752	spin_lock(&mddev->lock);
5753	/* bitmap disabled, zero the first byte and copy out */
5754	if (!mddev->bitmap_info.file)
5755		file->pathname[0] = '\0';
5756	else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
5757			       file->pathname, sizeof(file->pathname))),
5758		 IS_ERR(ptr))
5759		err = PTR_ERR(ptr);
5760	else
5761		memmove(file->pathname, ptr,
5762			sizeof(file->pathname)-(ptr-file->pathname));
5763	spin_unlock(&mddev->lock);
5764
5765	if (err == 0 &&
5766	    copy_to_user(arg, file, sizeof(*file)))
5767		err = -EFAULT;
5768
5769	kfree(file);
5770	return err;
5771}
5772
5773static int get_disk_info(struct mddev *mddev, void __user * arg)
5774{
5775	mdu_disk_info_t info;
5776	struct md_rdev *rdev;
5777
5778	if (copy_from_user(&info, arg, sizeof(info)))
5779		return -EFAULT;
5780
5781	rcu_read_lock();
5782	rdev = md_find_rdev_nr_rcu(mddev, info.number);
5783	if (rdev) {
5784		info.major = MAJOR(rdev->bdev->bd_dev);
5785		info.minor = MINOR(rdev->bdev->bd_dev);
5786		info.raid_disk = rdev->raid_disk;
5787		info.state = 0;
5788		if (test_bit(Faulty, &rdev->flags))
5789			info.state |= (1<<MD_DISK_FAULTY);
5790		else if (test_bit(In_sync, &rdev->flags)) {
5791			info.state |= (1<<MD_DISK_ACTIVE);
5792			info.state |= (1<<MD_DISK_SYNC);
5793		}
5794		if (test_bit(WriteMostly, &rdev->flags))
5795			info.state |= (1<<MD_DISK_WRITEMOSTLY);
5796	} else {
5797		info.major = info.minor = 0;
5798		info.raid_disk = -1;
5799		info.state = (1<<MD_DISK_REMOVED);
5800	}
5801	rcu_read_unlock();
5802
5803	if (copy_to_user(arg, &info, sizeof(info)))
5804		return -EFAULT;
5805
5806	return 0;
5807}
5808
5809static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
5810{
5811	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5812	struct md_rdev *rdev;
5813	dev_t dev = MKDEV(info->major,info->minor);
5814
5815	if (mddev_is_clustered(mddev) &&
5816		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
5817		pr_err("%s: Cannot add to clustered mddev.\n",
5818			       mdname(mddev));
5819		return -EINVAL;
5820	}
5821
5822	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
5823		return -EOVERFLOW;
5824
5825	if (!mddev->raid_disks) {
5826		int err;
5827		/* expecting a device which has a superblock */
5828		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
5829		if (IS_ERR(rdev)) {
5830			printk(KERN_WARNING
5831				"md: md_import_device returned %ld\n",
5832				PTR_ERR(rdev));
5833			return PTR_ERR(rdev);
5834		}
5835		if (!list_empty(&mddev->disks)) {
5836			struct md_rdev *rdev0
5837				= list_entry(mddev->disks.next,
5838					     struct md_rdev, same_set);
5839			err = super_types[mddev->major_version]
5840				.load_super(rdev, rdev0, mddev->minor_version);
5841			if (err < 0) {
5842				printk(KERN_WARNING
5843					"md: %s has different UUID to %s\n",
5844					bdevname(rdev->bdev,b),
5845					bdevname(rdev0->bdev,b2));
5846				export_rdev(rdev);
5847				return -EINVAL;
5848			}
5849		}
5850		err = bind_rdev_to_array(rdev, mddev);
5851		if (err)
5852			export_rdev(rdev);
5853		return err;
5854	}
5855
5856	/*
5857	 * add_new_disk can be used once the array is assembled
5858	 * to add "hot spares".  They must already have a superblock
5859	 * written
5860	 */
5861	if (mddev->pers) {
5862		int err;
5863		if (!mddev->pers->hot_add_disk) {
5864			printk(KERN_WARNING
5865				"%s: personality does not support diskops!\n",
5866			       mdname(mddev));
5867			return -EINVAL;
5868		}
5869		if (mddev->persistent)
5870			rdev = md_import_device(dev, mddev->major_version,
5871						mddev->minor_version);
5872		else
5873			rdev = md_import_device(dev, -1, -1);
5874		if (IS_ERR(rdev)) {
5875			printk(KERN_WARNING
5876				"md: md_import_device returned %ld\n",
5877				PTR_ERR(rdev));
5878			return PTR_ERR(rdev);
5879		}
5880		/* set saved_raid_disk if appropriate */
5881		if (!mddev->persistent) {
5882			if (info->state & (1<<MD_DISK_SYNC)  &&
5883			    info->raid_disk < mddev->raid_disks) {
5884				rdev->raid_disk = info->raid_disk;
5885				set_bit(In_sync, &rdev->flags);
5886				clear_bit(Bitmap_sync, &rdev->flags);
5887			} else
5888				rdev->raid_disk = -1;
5889			rdev->saved_raid_disk = rdev->raid_disk;
5890		} else
5891			super_types[mddev->major_version].
5892				validate_super(mddev, rdev);
5893		if ((info->state & (1<<MD_DISK_SYNC)) &&
5894		     rdev->raid_disk != info->raid_disk) {
5895			/* This was a hot-add request, but events doesn't
5896			 * match, so reject it.
5897			 */
5898			export_rdev(rdev);
5899			return -EINVAL;
5900		}
5901
5902		clear_bit(In_sync, &rdev->flags); /* just to be sure */
5903		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5904			set_bit(WriteMostly, &rdev->flags);
5905		else
5906			clear_bit(WriteMostly, &rdev->flags);
5907
5908		/*
5909		 * check whether the device shows up in other nodes
5910		 */
5911		if (mddev_is_clustered(mddev)) {
5912			if (info->state & (1 << MD_DISK_CANDIDATE)) {
5913				/* Through --cluster-confirm */
5914				set_bit(Candidate, &rdev->flags);
5915				err = md_cluster_ops->new_disk_ack(mddev, true);
5916				if (err) {
5917					export_rdev(rdev);
5918					return err;
5919				}
5920			} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
5921				/* --add initiated by this node */
5922				err = md_cluster_ops->add_new_disk_start(mddev, rdev);
5923				if (err) {
5924					md_cluster_ops->add_new_disk_finish(mddev);
5925					export_rdev(rdev);
5926					return err;
5927				}
5928			}
5929		}
5930
5931		rdev->raid_disk = -1;
5932		err = bind_rdev_to_array(rdev, mddev);
5933		if (err)
5934			export_rdev(rdev);
5935		else
5936			err = add_bound_rdev(rdev);
5937		if (mddev_is_clustered(mddev) &&
5938				(info->state & (1 << MD_DISK_CLUSTER_ADD)))
5939			md_cluster_ops->add_new_disk_finish(mddev);
5940		return err;
5941	}
5942
5943	/* otherwise, add_new_disk is only allowed
5944	 * for major_version==0 superblocks
5945	 */
5946	if (mddev->major_version != 0) {
5947		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
5948		       mdname(mddev));
5949		return -EINVAL;
5950	}
5951
5952	if (!(info->state & (1<<MD_DISK_FAULTY))) {
5953		int err;
5954		rdev = md_import_device(dev, -1, 0);
5955		if (IS_ERR(rdev)) {
5956			printk(KERN_WARNING
5957				"md: error, md_import_device() returned %ld\n",
5958				PTR_ERR(rdev));
5959			return PTR_ERR(rdev);
5960		}
5961		rdev->desc_nr = info->number;
5962		if (info->raid_disk < mddev->raid_disks)
5963			rdev->raid_disk = info->raid_disk;
5964		else
5965			rdev->raid_disk = -1;
5966
5967		if (rdev->raid_disk < mddev->raid_disks)
5968			if (info->state & (1<<MD_DISK_SYNC))
5969				set_bit(In_sync, &rdev->flags);
5970
5971		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5972			set_bit(WriteMostly, &rdev->flags);
5973
5974		if (!mddev->persistent) {
5975			printk(KERN_INFO "md: nonpersistent superblock ...\n");
5976			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5977		} else
5978			rdev->sb_start = calc_dev_sboffset(rdev);
5979		rdev->sectors = rdev->sb_start;
5980
5981		err = bind_rdev_to_array(rdev, mddev);
5982		if (err) {
5983			export_rdev(rdev);
5984			return err;
5985		}
5986	}
5987
5988	return 0;
5989}
5990
5991static int hot_remove_disk(struct mddev *mddev, dev_t dev)
5992{
5993	char b[BDEVNAME_SIZE];
5994	struct md_rdev *rdev;
5995
5996	rdev = find_rdev(mddev, dev);
5997	if (!rdev)
5998		return -ENXIO;
5999
6000	if (mddev_is_clustered(mddev))
6001		md_cluster_ops->metadata_update_start(mddev);
6002
6003	clear_bit(Blocked, &rdev->flags);
6004	remove_and_add_spares(mddev, rdev);
6005
6006	if (rdev->raid_disk >= 0)
6007		goto busy;
6008
6009	if (mddev_is_clustered(mddev))
6010		md_cluster_ops->remove_disk(mddev, rdev);
6011
6012	md_kick_rdev_from_array(rdev);
6013	md_update_sb(mddev, 1);
6014	md_new_event(mddev);
6015
6016	if (mddev_is_clustered(mddev))
6017		md_cluster_ops->metadata_update_finish(mddev);
6018
6019	return 0;
6020busy:
6021	if (mddev_is_clustered(mddev))
6022		md_cluster_ops->metadata_update_cancel(mddev);
6023	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
6024		bdevname(rdev->bdev,b), mdname(mddev));
6025	return -EBUSY;
6026}
6027
6028static int hot_add_disk(struct mddev *mddev, dev_t dev)
6029{
6030	char b[BDEVNAME_SIZE];
6031	int err;
6032	struct md_rdev *rdev;
6033
6034	if (!mddev->pers)
6035		return -ENODEV;
6036
6037	if (mddev->major_version != 0) {
6038		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
6039			" version-0 superblocks.\n",
6040			mdname(mddev));
6041		return -EINVAL;
6042	}
6043	if (!mddev->pers->hot_add_disk) {
6044		printk(KERN_WARNING
6045			"%s: personality does not support diskops!\n",
6046			mdname(mddev));
6047		return -EINVAL;
6048	}
6049
6050	rdev = md_import_device(dev, -1, 0);
6051	if (IS_ERR(rdev)) {
6052		printk(KERN_WARNING
6053			"md: error, md_import_device() returned %ld\n",
6054			PTR_ERR(rdev));
6055		return -EINVAL;
6056	}
6057
6058	if (mddev->persistent)
6059		rdev->sb_start = calc_dev_sboffset(rdev);
6060	else
6061		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6062
6063	rdev->sectors = rdev->sb_start;
6064
6065	if (test_bit(Faulty, &rdev->flags)) {
6066		printk(KERN_WARNING
6067			"md: can not hot-add faulty %s disk to %s!\n",
6068			bdevname(rdev->bdev,b), mdname(mddev));
6069		err = -EINVAL;
6070		goto abort_export;
6071	}
6072
6073	if (mddev_is_clustered(mddev))
6074		md_cluster_ops->metadata_update_start(mddev);
6075	clear_bit(In_sync, &rdev->flags);
6076	rdev->desc_nr = -1;
6077	rdev->saved_raid_disk = -1;
6078	err = bind_rdev_to_array(rdev, mddev);
6079	if (err)
6080		goto abort_clustered;
6081
6082	/*
6083	 * The rest should better be atomic, we can have disk failures
6084	 * noticed in interrupt contexts ...
6085	 */
6086
6087	rdev->raid_disk = -1;
6088
6089	md_update_sb(mddev, 1);
6090
6091	if (mddev_is_clustered(mddev))
6092		md_cluster_ops->metadata_update_finish(mddev);
6093	/*
6094	 * Kick recovery, maybe this spare has to be added to the
6095	 * array immediately.
6096	 */
6097	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6098	md_wakeup_thread(mddev->thread);
6099	md_new_event(mddev);
6100	return 0;
6101
6102abort_clustered:
6103	if (mddev_is_clustered(mddev))
6104		md_cluster_ops->metadata_update_cancel(mddev);
6105abort_export:
6106	export_rdev(rdev);
6107	return err;
6108}
6109
6110static int set_bitmap_file(struct mddev *mddev, int fd)
6111{
6112	int err = 0;
6113
6114	if (mddev->pers) {
6115		if (!mddev->pers->quiesce || !mddev->thread)
6116			return -EBUSY;
6117		if (mddev->recovery || mddev->sync_thread)
6118			return -EBUSY;
6119		/* we should be able to change the bitmap.. */
6120	}
6121
6122	if (fd >= 0) {
6123		struct inode *inode;
6124		struct file *f;
6125
6126		if (mddev->bitmap || mddev->bitmap_info.file)
6127			return -EEXIST; /* cannot add when bitmap is present */
6128		f = fget(fd);
6129
6130		if (f == NULL) {
6131			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
6132			       mdname(mddev));
6133			return -EBADF;
6134		}
6135
6136		inode = f->f_mapping->host;
6137		if (!S_ISREG(inode->i_mode)) {
6138			printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
6139			       mdname(mddev));
6140			err = -EBADF;
6141		} else if (!(f->f_mode & FMODE_WRITE)) {
6142			printk(KERN_ERR "%s: error: bitmap file must open for write\n",
6143			       mdname(mddev));
6144			err = -EBADF;
6145		} else if (atomic_read(&inode->i_writecount) != 1) {
6146			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
6147			       mdname(mddev));
6148			err = -EBUSY;
6149		}
6150		if (err) {
6151			fput(f);
6152			return err;
6153		}
6154		mddev->bitmap_info.file = f;
6155		mddev->bitmap_info.offset = 0; /* file overrides offset */
6156	} else if (mddev->bitmap == NULL)
6157		return -ENOENT; /* cannot remove what isn't there */
6158	err = 0;
6159	if (mddev->pers) {
6160		mddev->pers->quiesce(mddev, 1);
6161		if (fd >= 0) {
6162			struct bitmap *bitmap;
6163
6164			bitmap = bitmap_create(mddev, -1);
6165			if (!IS_ERR(bitmap)) {
6166				mddev->bitmap = bitmap;
6167				err = bitmap_load(mddev);
6168			} else
6169				err = PTR_ERR(bitmap);
6170		}
6171		if (fd < 0 || err) {
6172			bitmap_destroy(mddev);
6173			fd = -1; /* make sure to put the file */
6174		}
6175		mddev->pers->quiesce(mddev, 0);
6176	}
6177	if (fd < 0) {
6178		struct file *f = mddev->bitmap_info.file;
6179		if (f) {
6180			spin_lock(&mddev->lock);
6181			mddev->bitmap_info.file = NULL;
6182			spin_unlock(&mddev->lock);
6183			fput(f);
6184		}
6185	}
6186
6187	return err;
6188}
6189
6190/*
6191 * set_array_info is used two different ways
6192 * The original usage is when creating a new array.
6193 * In this usage, raid_disks is > 0 and it together with
6194 *  level, size, not_persistent,layout,chunksize determine the
6195 *  shape of the array.
6196 *  This will always create an array with a type-0.90.0 superblock.
6197 * The newer usage is when assembling an array.
6198 *  In this case raid_disks will be 0, and the major_version field is
6199 *  use to determine which style super-blocks are to be found on the devices.
6200 *  The minor and patch _version numbers are also kept incase the
6201 *  super_block handler wishes to interpret them.
6202 */
6203static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6204{
6205
6206	if (info->raid_disks == 0) {
6207		/* just setting version number for superblock loading */
6208		if (info->major_version < 0 ||
6209		    info->major_version >= ARRAY_SIZE(super_types) ||
6210		    super_types[info->major_version].name == NULL) {
6211			/* maybe try to auto-load a module? */
6212			printk(KERN_INFO
6213				"md: superblock version %d not known\n",
6214				info->major_version);
6215			return -EINVAL;
6216		}
6217		mddev->major_version = info->major_version;
6218		mddev->minor_version = info->minor_version;
6219		mddev->patch_version = info->patch_version;
6220		mddev->persistent = !info->not_persistent;
6221		/* ensure mddev_put doesn't delete this now that there
6222		 * is some minimal configuration.
6223		 */
6224		mddev->ctime         = get_seconds();
6225		return 0;
6226	}
6227	mddev->major_version = MD_MAJOR_VERSION;
6228	mddev->minor_version = MD_MINOR_VERSION;
6229	mddev->patch_version = MD_PATCHLEVEL_VERSION;
6230	mddev->ctime         = get_seconds();
6231
6232	mddev->level         = info->level;
6233	mddev->clevel[0]     = 0;
6234	mddev->dev_sectors   = 2 * (sector_t)info->size;
6235	mddev->raid_disks    = info->raid_disks;
6236	/* don't set md_minor, it is determined by which /dev/md* was
6237	 * openned
6238	 */
6239	if (info->state & (1<<MD_SB_CLEAN))
6240		mddev->recovery_cp = MaxSector;
6241	else
6242		mddev->recovery_cp = 0;
6243	mddev->persistent    = ! info->not_persistent;
6244	mddev->external	     = 0;
6245
6246	mddev->layout        = info->layout;
6247	mddev->chunk_sectors = info->chunk_size >> 9;
6248
6249	mddev->max_disks     = MD_SB_DISKS;
6250
6251	if (mddev->persistent)
6252		mddev->flags         = 0;
6253	set_bit(MD_CHANGE_DEVS, &mddev->flags);
6254
6255	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6256	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6257	mddev->bitmap_info.offset = 0;
6258
6259	mddev->reshape_position = MaxSector;
6260
6261	/*
6262	 * Generate a 128 bit UUID
6263	 */
6264	get_random_bytes(mddev->uuid, 16);
6265
6266	mddev->new_level = mddev->level;
6267	mddev->new_chunk_sectors = mddev->chunk_sectors;
6268	mddev->new_layout = mddev->layout;
6269	mddev->delta_disks = 0;
6270	mddev->reshape_backwards = 0;
6271
6272	return 0;
6273}
6274
6275void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6276{
6277	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6278
6279	if (mddev->external_size)
6280		return;
6281
6282	mddev->array_sectors = array_sectors;
6283}
6284EXPORT_SYMBOL(md_set_array_sectors);
6285
6286static int update_size(struct mddev *mddev, sector_t num_sectors)
6287{
6288	struct md_rdev *rdev;
6289	int rv;
6290	int fit = (num_sectors == 0);
6291
6292	if (mddev->pers->resize == NULL)
6293		return -EINVAL;
6294	/* The "num_sectors" is the number of sectors of each device that
6295	 * is used.  This can only make sense for arrays with redundancy.
6296	 * linear and raid0 always use whatever space is available. We can only
6297	 * consider changing this number if no resync or reconstruction is
6298	 * happening, and if the new size is acceptable. It must fit before the
6299	 * sb_start or, if that is <data_offset, it must fit before the size
6300	 * of each device.  If num_sectors is zero, we find the largest size
6301	 * that fits.
6302	 */
6303	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6304	    mddev->sync_thread)
6305		return -EBUSY;
6306	if (mddev->ro)
6307		return -EROFS;
6308
6309	rdev_for_each(rdev, mddev) {
6310		sector_t avail = rdev->sectors;
6311
6312		if (fit && (num_sectors == 0 || num_sectors > avail))
6313			num_sectors = avail;
6314		if (avail < num_sectors)
6315			return -ENOSPC;
6316	}
6317	rv = mddev->pers->resize(mddev, num_sectors);
6318	if (!rv)
6319		revalidate_disk(mddev->gendisk);
6320	return rv;
6321}
6322
6323static int update_raid_disks(struct mddev *mddev, int raid_disks)
6324{
6325	int rv;
6326	struct md_rdev *rdev;
6327	/* change the number of raid disks */
6328	if (mddev->pers->check_reshape == NULL)
6329		return -EINVAL;
6330	if (mddev->ro)
6331		return -EROFS;
6332	if (raid_disks <= 0 ||
6333	    (mddev->max_disks && raid_disks >= mddev->max_disks))
6334		return -EINVAL;
6335	if (mddev->sync_thread ||
6336	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6337	    mddev->reshape_position != MaxSector)
6338		return -EBUSY;
6339
6340	rdev_for_each(rdev, mddev) {
6341		if (mddev->raid_disks < raid_disks &&
6342		    rdev->data_offset < rdev->new_data_offset)
6343			return -EINVAL;
6344		if (mddev->raid_disks > raid_disks &&
6345		    rdev->data_offset > rdev->new_data_offset)
6346			return -EINVAL;
6347	}
6348
6349	mddev->delta_disks = raid_disks - mddev->raid_disks;
6350	if (mddev->delta_disks < 0)
6351		mddev->reshape_backwards = 1;
6352	else if (mddev->delta_disks > 0)
6353		mddev->reshape_backwards = 0;
6354
6355	rv = mddev->pers->check_reshape(mddev);
6356	if (rv < 0) {
6357		mddev->delta_disks = 0;
6358		mddev->reshape_backwards = 0;
6359	}
6360	return rv;
6361}
6362
6363/*
6364 * update_array_info is used to change the configuration of an
6365 * on-line array.
6366 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
6367 * fields in the info are checked against the array.
6368 * Any differences that cannot be handled will cause an error.
6369 * Normally, only one change can be managed at a time.
6370 */
6371static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6372{
6373	int rv = 0;
6374	int cnt = 0;
6375	int state = 0;
6376
6377	/* calculate expected state,ignoring low bits */
6378	if (mddev->bitmap && mddev->bitmap_info.offset)
6379		state |= (1 << MD_SB_BITMAP_PRESENT);
6380
6381	if (mddev->major_version != info->major_version ||
6382	    mddev->minor_version != info->minor_version ||
6383/*	    mddev->patch_version != info->patch_version || */
6384	    mddev->ctime         != info->ctime         ||
6385	    mddev->level         != info->level         ||
6386/*	    mddev->layout        != info->layout        || */
6387	    mddev->persistent	 != !info->not_persistent ||
6388	    mddev->chunk_sectors != info->chunk_size >> 9 ||
6389	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
6390	    ((state^info->state) & 0xfffffe00)
6391		)
6392		return -EINVAL;
6393	/* Check there is only one change */
6394	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6395		cnt++;
6396	if (mddev->raid_disks != info->raid_disks)
6397		cnt++;
6398	if (mddev->layout != info->layout)
6399		cnt++;
6400	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
6401		cnt++;
6402	if (cnt == 0)
6403		return 0;
6404	if (cnt > 1)
6405		return -EINVAL;
6406
6407	if (mddev->layout != info->layout) {
6408		/* Change layout
6409		 * we don't need to do anything at the md level, the
6410		 * personality will take care of it all.
6411		 */
6412		if (mddev->pers->check_reshape == NULL)
6413			return -EINVAL;
6414		else {
6415			mddev->new_layout = info->layout;
6416			rv = mddev->pers->check_reshape(mddev);
6417			if (rv)
6418				mddev->new_layout = mddev->layout;
6419			return rv;
6420		}
6421	}
6422	if (mddev_is_clustered(mddev))
6423		md_cluster_ops->metadata_update_start(mddev);
6424	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6425		rv = update_size(mddev, (sector_t)info->size * 2);
6426
6427	if (mddev->raid_disks    != info->raid_disks)
6428		rv = update_raid_disks(mddev, info->raid_disks);
6429
6430	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
6431		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
6432			rv = -EINVAL;
6433			goto err;
6434		}
6435		if (mddev->recovery || mddev->sync_thread) {
6436			rv = -EBUSY;
6437			goto err;
6438		}
6439		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
6440			struct bitmap *bitmap;
6441			/* add the bitmap */
6442			if (mddev->bitmap) {
6443				rv = -EEXIST;
6444				goto err;
6445			}
6446			if (mddev->bitmap_info.default_offset == 0) {
6447				rv = -EINVAL;
6448				goto err;
6449			}
6450			mddev->bitmap_info.offset =
6451				mddev->bitmap_info.default_offset;
6452			mddev->bitmap_info.space =
6453				mddev->bitmap_info.default_space;
6454			mddev->pers->quiesce(mddev, 1);
6455			bitmap = bitmap_create(mddev, -1);
6456			if (!IS_ERR(bitmap)) {
6457				mddev->bitmap = bitmap;
6458				rv = bitmap_load(mddev);
6459			} else
6460				rv = PTR_ERR(bitmap);
6461			if (rv)
6462				bitmap_destroy(mddev);
6463			mddev->pers->quiesce(mddev, 0);
6464		} else {
6465			/* remove the bitmap */
6466			if (!mddev->bitmap) {
6467				rv = -ENOENT;
6468				goto err;
6469			}
6470			if (mddev->bitmap->storage.file) {
6471				rv = -EINVAL;
6472				goto err;
6473			}
6474			mddev->pers->quiesce(mddev, 1);
6475			bitmap_destroy(mddev);
6476			mddev->pers->quiesce(mddev, 0);
6477			mddev->bitmap_info.offset = 0;
6478		}
6479	}
6480	md_update_sb(mddev, 1);
6481	if (mddev_is_clustered(mddev))
6482		md_cluster_ops->metadata_update_finish(mddev);
6483	return rv;
6484err:
6485	if (mddev_is_clustered(mddev))
6486		md_cluster_ops->metadata_update_cancel(mddev);
6487	return rv;
6488}
6489
6490static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6491{
6492	struct md_rdev *rdev;
6493	int err = 0;
6494
6495	if (mddev->pers == NULL)
6496		return -ENODEV;
6497
6498	rcu_read_lock();
6499	rdev = find_rdev_rcu(mddev, dev);
6500	if (!rdev)
6501		err =  -ENODEV;
6502	else {
6503		md_error(mddev, rdev);
6504		if (!test_bit(Faulty, &rdev->flags))
6505			err = -EBUSY;
6506	}
6507	rcu_read_unlock();
6508	return err;
6509}
6510
6511/*
6512 * We have a problem here : there is no easy way to give a CHS
6513 * virtual geometry. We currently pretend that we have a 2 heads
6514 * 4 sectors (with a BIG number of cylinders...). This drives
6515 * dosfs just mad... ;-)
6516 */
6517static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6518{
6519	struct mddev *mddev = bdev->bd_disk->private_data;
6520
6521	geo->heads = 2;
6522	geo->sectors = 4;
6523	geo->cylinders = mddev->array_sectors / 8;
6524	return 0;
6525}
6526
6527static inline bool md_ioctl_valid(unsigned int cmd)
6528{
6529	switch (cmd) {
6530	case ADD_NEW_DISK:
6531	case BLKROSET:
6532	case GET_ARRAY_INFO:
6533	case GET_BITMAP_FILE:
6534	case GET_DISK_INFO:
6535	case HOT_ADD_DISK:
6536	case HOT_REMOVE_DISK:
6537	case RAID_AUTORUN:
6538	case RAID_VERSION:
6539	case RESTART_ARRAY_RW:
6540	case RUN_ARRAY:
6541	case SET_ARRAY_INFO:
6542	case SET_BITMAP_FILE:
6543	case SET_DISK_FAULTY:
6544	case STOP_ARRAY:
6545	case STOP_ARRAY_RO:
6546	case CLUSTERED_DISK_NACK:
6547		return true;
6548	default:
6549		return false;
6550	}
6551}
6552
6553static int md_ioctl(struct block_device *bdev, fmode_t mode,
6554			unsigned int cmd, unsigned long arg)
6555{
6556	int err = 0;
6557	void __user *argp = (void __user *)arg;
6558	struct mddev *mddev = NULL;
6559	int ro;
6560
6561	if (!md_ioctl_valid(cmd))
6562		return -ENOTTY;
6563
6564	switch (cmd) {
6565	case RAID_VERSION:
6566	case GET_ARRAY_INFO:
6567	case GET_DISK_INFO:
6568		break;
6569	default:
6570		if (!capable(CAP_SYS_ADMIN))
6571			return -EACCES;
6572	}
6573
6574	/*
6575	 * Commands dealing with the RAID driver but not any
6576	 * particular array:
6577	 */
6578	switch (cmd) {
6579	case RAID_VERSION:
6580		err = get_version(argp);
6581		goto out;
6582
6583#ifndef MODULE
6584	case RAID_AUTORUN:
6585		err = 0;
6586		autostart_arrays(arg);
6587		goto out;
6588#endif
6589	default:;
6590	}
6591
6592	/*
6593	 * Commands creating/starting a new array:
6594	 */
6595
6596	mddev = bdev->bd_disk->private_data;
6597
6598	if (!mddev) {
6599		BUG();
6600		goto out;
6601	}
6602
6603	/* Some actions do not requires the mutex */
6604	switch (cmd) {
6605	case GET_ARRAY_INFO:
6606		if (!mddev->raid_disks && !mddev->external)
6607			err = -ENODEV;
6608		else
6609			err = get_array_info(mddev, argp);
6610		goto out;
6611
6612	case GET_DISK_INFO:
6613		if (!mddev->raid_disks && !mddev->external)
6614			err = -ENODEV;
6615		else
6616			err = get_disk_info(mddev, argp);
6617		goto out;
6618
6619	case SET_DISK_FAULTY:
6620		err = set_disk_faulty(mddev, new_decode_dev(arg));
6621		goto out;
6622
6623	case GET_BITMAP_FILE:
6624		err = get_bitmap_file(mddev, argp);
6625		goto out;
6626
6627	}
6628
6629	if (cmd == ADD_NEW_DISK)
6630		/* need to ensure md_delayed_delete() has completed */
6631		flush_workqueue(md_misc_wq);
6632
6633	if (cmd == HOT_REMOVE_DISK)
6634		/* need to ensure recovery thread has run */
6635		wait_event_interruptible_timeout(mddev->sb_wait,
6636						 !test_bit(MD_RECOVERY_NEEDED,
6637							   &mddev->flags),
6638						 msecs_to_jiffies(5000));
6639	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
6640		/* Need to flush page cache, and ensure no-one else opens
6641		 * and writes
6642		 */
6643		mutex_lock(&mddev->open_mutex);
6644		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
6645			mutex_unlock(&mddev->open_mutex);
6646			err = -EBUSY;
6647			goto out;
6648		}
6649		set_bit(MD_STILL_CLOSED, &mddev->flags);
6650		mutex_unlock(&mddev->open_mutex);
6651		sync_blockdev(bdev);
6652	}
6653	err = mddev_lock(mddev);
6654	if (err) {
6655		printk(KERN_INFO
6656			"md: ioctl lock interrupted, reason %d, cmd %d\n",
6657			err, cmd);
6658		goto out;
6659	}
6660
6661	if (cmd == SET_ARRAY_INFO) {
6662		mdu_array_info_t info;
6663		if (!arg)
6664			memset(&info, 0, sizeof(info));
6665		else if (copy_from_user(&info, argp, sizeof(info))) {
6666			err = -EFAULT;
6667			goto unlock;
6668		}
6669		if (mddev->pers) {
6670			err = update_array_info(mddev, &info);
6671			if (err) {
6672				printk(KERN_WARNING "md: couldn't update"
6673				       " array info. %d\n", err);
6674				goto unlock;
6675			}
6676			goto unlock;
6677		}
6678		if (!list_empty(&mddev->disks)) {
6679			printk(KERN_WARNING
6680			       "md: array %s already has disks!\n",
6681			       mdname(mddev));
6682			err = -EBUSY;
6683			goto unlock;
6684		}
6685		if (mddev->raid_disks) {
6686			printk(KERN_WARNING
6687			       "md: array %s already initialised!\n",
6688			       mdname(mddev));
6689			err = -EBUSY;
6690			goto unlock;
6691		}
6692		err = set_array_info(mddev, &info);
6693		if (err) {
6694			printk(KERN_WARNING "md: couldn't set"
6695			       " array info. %d\n", err);
6696			goto unlock;
6697		}
6698		goto unlock;
6699	}
6700
6701	/*
6702	 * Commands querying/configuring an existing array:
6703	 */
6704	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
6705	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
6706	if ((!mddev->raid_disks && !mddev->external)
6707	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
6708	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
6709	    && cmd != GET_BITMAP_FILE) {
6710		err = -ENODEV;
6711		goto unlock;
6712	}
6713
6714	/*
6715	 * Commands even a read-only array can execute:
6716	 */
6717	switch (cmd) {
6718	case RESTART_ARRAY_RW:
6719		err = restart_array(mddev);
6720		goto unlock;
6721
6722	case STOP_ARRAY:
6723		err = do_md_stop(mddev, 0, bdev);
6724		goto unlock;
6725
6726	case STOP_ARRAY_RO:
6727		err = md_set_readonly(mddev, bdev);
6728		goto unlock;
6729
6730	case HOT_REMOVE_DISK:
6731		err = hot_remove_disk(mddev, new_decode_dev(arg));
6732		goto unlock;
6733
6734	case ADD_NEW_DISK:
6735		/* We can support ADD_NEW_DISK on read-only arrays
6736		 * on if we are re-adding a preexisting device.
6737		 * So require mddev->pers and MD_DISK_SYNC.
6738		 */
6739		if (mddev->pers) {
6740			mdu_disk_info_t info;
6741			if (copy_from_user(&info, argp, sizeof(info)))
6742				err = -EFAULT;
6743			else if (!(info.state & (1<<MD_DISK_SYNC)))
6744				/* Need to clear read-only for this */
6745				break;
6746			else
6747				err = add_new_disk(mddev, &info);
6748			goto unlock;
6749		}
6750		break;
6751
6752	case BLKROSET:
6753		if (get_user(ro, (int __user *)(arg))) {
6754			err = -EFAULT;
6755			goto unlock;
6756		}
6757		err = -EINVAL;
6758
6759		/* if the bdev is going readonly the value of mddev->ro
6760		 * does not matter, no writes are coming
6761		 */
6762		if (ro)
6763			goto unlock;
6764
6765		/* are we are already prepared for writes? */
6766		if (mddev->ro != 1)
6767			goto unlock;
6768
6769		/* transitioning to readauto need only happen for
6770		 * arrays that call md_write_start
6771		 */
6772		if (mddev->pers) {
6773			err = restart_array(mddev);
6774			if (err == 0) {
6775				mddev->ro = 2;
6776				set_disk_ro(mddev->gendisk, 0);
6777			}
6778		}
6779		goto unlock;
6780	}
6781
6782	/*
6783	 * The remaining ioctls are changing the state of the
6784	 * superblock, so we do not allow them on read-only arrays.
6785	 */
6786	if (mddev->ro && mddev->pers) {
6787		if (mddev->ro == 2) {
6788			mddev->ro = 0;
6789			sysfs_notify_dirent_safe(mddev->sysfs_state);
6790			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6791			/* mddev_unlock will wake thread */
6792			/* If a device failed while we were read-only, we
6793			 * need to make sure the metadata is updated now.
6794			 */
6795			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
6796				mddev_unlock(mddev);
6797				wait_event(mddev->sb_wait,
6798					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6799					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6800				mddev_lock_nointr(mddev);
6801			}
6802		} else {
6803			err = -EROFS;
6804			goto unlock;
6805		}
6806	}
6807
6808	switch (cmd) {
6809	case ADD_NEW_DISK:
6810	{
6811		mdu_disk_info_t info;
6812		if (copy_from_user(&info, argp, sizeof(info)))
6813			err = -EFAULT;
6814		else
6815			err = add_new_disk(mddev, &info);
6816		goto unlock;
6817	}
6818
6819	case CLUSTERED_DISK_NACK:
6820		if (mddev_is_clustered(mddev))
6821			md_cluster_ops->new_disk_ack(mddev, false);
6822		else
6823			err = -EINVAL;
6824		goto unlock;
6825
6826	case HOT_ADD_DISK:
6827		err = hot_add_disk(mddev, new_decode_dev(arg));
6828		goto unlock;
6829
6830	case RUN_ARRAY:
6831		err = do_md_run(mddev);
6832		goto unlock;
6833
6834	case SET_BITMAP_FILE:
6835		err = set_bitmap_file(mddev, (int)arg);
6836		goto unlock;
6837
6838	default:
6839		err = -EINVAL;
6840		goto unlock;
6841	}
6842
6843unlock:
6844	if (mddev->hold_active == UNTIL_IOCTL &&
6845	    err != -EINVAL)
6846		mddev->hold_active = 0;
6847	mddev_unlock(mddev);
6848out:
6849	return err;
6850}
6851#ifdef CONFIG_COMPAT
6852static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
6853		    unsigned int cmd, unsigned long arg)
6854{
6855	switch (cmd) {
6856	case HOT_REMOVE_DISK:
6857	case HOT_ADD_DISK:
6858	case SET_DISK_FAULTY:
6859	case SET_BITMAP_FILE:
6860		/* These take in integer arg, do not convert */
6861		break;
6862	default:
6863		arg = (unsigned long)compat_ptr(arg);
6864		break;
6865	}
6866
6867	return md_ioctl(bdev, mode, cmd, arg);
6868}
6869#endif /* CONFIG_COMPAT */
6870
6871static int md_open(struct block_device *bdev, fmode_t mode)
6872{
6873	/*
6874	 * Succeed if we can lock the mddev, which confirms that
6875	 * it isn't being stopped right now.
6876	 */
6877	struct mddev *mddev = mddev_find(bdev->bd_dev);
6878	int err;
6879
6880	if (!mddev)
6881		return -ENODEV;
6882
6883	if (mddev->gendisk != bdev->bd_disk) {
6884		/* we are racing with mddev_put which is discarding this
6885		 * bd_disk.
6886		 */
6887		mddev_put(mddev);
6888		/* Wait until bdev->bd_disk is definitely gone */
6889		flush_workqueue(md_misc_wq);
6890		/* Then retry the open from the top */
6891		return -ERESTARTSYS;
6892	}
6893	BUG_ON(mddev != bdev->bd_disk->private_data);
6894
6895	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
6896		goto out;
6897
6898	err = 0;
6899	atomic_inc(&mddev->openers);
6900	clear_bit(MD_STILL_CLOSED, &mddev->flags);
6901	mutex_unlock(&mddev->open_mutex);
6902
6903	check_disk_change(bdev);
6904 out:
6905	return err;
6906}
6907
6908static void md_release(struct gendisk *disk, fmode_t mode)
6909{
6910	struct mddev *mddev = disk->private_data;
6911
6912	BUG_ON(!mddev);
6913	atomic_dec(&mddev->openers);
6914	mddev_put(mddev);
6915}
6916
6917static int md_media_changed(struct gendisk *disk)
6918{
6919	struct mddev *mddev = disk->private_data;
6920
6921	return mddev->changed;
6922}
6923
6924static int md_revalidate(struct gendisk *disk)
6925{
6926	struct mddev *mddev = disk->private_data;
6927
6928	mddev->changed = 0;
6929	return 0;
6930}
6931static const struct block_device_operations md_fops =
6932{
6933	.owner		= THIS_MODULE,
6934	.open		= md_open,
6935	.release	= md_release,
6936	.ioctl		= md_ioctl,
6937#ifdef CONFIG_COMPAT
6938	.compat_ioctl	= md_compat_ioctl,
6939#endif
6940	.getgeo		= md_getgeo,
6941	.media_changed  = md_media_changed,
6942	.revalidate_disk= md_revalidate,
6943};
6944
6945static int md_thread(void *arg)
6946{
6947	struct md_thread *thread = arg;
6948
6949	/*
6950	 * md_thread is a 'system-thread', it's priority should be very
6951	 * high. We avoid resource deadlocks individually in each
6952	 * raid personality. (RAID5 does preallocation) We also use RR and
6953	 * the very same RT priority as kswapd, thus we will never get
6954	 * into a priority inversion deadlock.
6955	 *
6956	 * we definitely have to have equal or higher priority than
6957	 * bdflush, otherwise bdflush will deadlock if there are too
6958	 * many dirty RAID5 blocks.
6959	 */
6960
6961	allow_signal(SIGKILL);
6962	while (!kthread_should_stop()) {
6963
6964		/* We need to wait INTERRUPTIBLE so that
6965		 * we don't add to the load-average.
6966		 * That means we need to be sure no signals are
6967		 * pending
6968		 */
6969		if (signal_pending(current))
6970			flush_signals(current);
6971
6972		wait_event_interruptible_timeout
6973			(thread->wqueue,
6974			 test_bit(THREAD_WAKEUP, &thread->flags)
6975			 || kthread_should_stop(),
6976			 thread->timeout);
6977
6978		clear_bit(THREAD_WAKEUP, &thread->flags);
6979		if (!kthread_should_stop())
6980			thread->run(thread);
6981	}
6982
6983	return 0;
6984}
6985
6986void md_wakeup_thread(struct md_thread *thread)
6987{
6988	if (thread) {
6989		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
6990		set_bit(THREAD_WAKEUP, &thread->flags);
6991		wake_up(&thread->wqueue);
6992	}
6993}
6994EXPORT_SYMBOL(md_wakeup_thread);
6995
6996struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6997		struct mddev *mddev, const char *name)
6998{
6999	struct md_thread *thread;
7000
7001	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7002	if (!thread)
7003		return NULL;
7004
7005	init_waitqueue_head(&thread->wqueue);
7006
7007	thread->run = run;
7008	thread->mddev = mddev;
7009	thread->timeout = MAX_SCHEDULE_TIMEOUT;
7010	thread->tsk = kthread_run(md_thread, thread,
7011				  "%s_%s",
7012				  mdname(thread->mddev),
7013				  name);
7014	if (IS_ERR(thread->tsk)) {
7015		kfree(thread);
7016		return NULL;
7017	}
7018	return thread;
7019}
7020EXPORT_SYMBOL(md_register_thread);
7021
7022void md_unregister_thread(struct md_thread **threadp)
7023{
7024	struct md_thread *thread = *threadp;
7025	if (!thread)
7026		return;
7027	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7028	/* Locking ensures that mddev_unlock does not wake_up a
7029	 * non-existent thread
7030	 */
7031	spin_lock(&pers_lock);
7032	*threadp = NULL;
7033	spin_unlock(&pers_lock);
7034
7035	kthread_stop(thread->tsk);
7036	kfree(thread);
7037}
7038EXPORT_SYMBOL(md_unregister_thread);
7039
7040void md_error(struct mddev *mddev, struct md_rdev *rdev)
7041{
7042	if (!rdev || test_bit(Faulty, &rdev->flags))
7043		return;
7044
7045	if (!mddev->pers || !mddev->pers->error_handler)
7046		return;
7047	mddev->pers->error_handler(mddev,rdev);
7048	if (mddev->degraded)
7049		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7050	sysfs_notify_dirent_safe(rdev->sysfs_state);
7051	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7052	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7053	md_wakeup_thread(mddev->thread);
7054	if (mddev->event_work.func)
7055		queue_work(md_misc_wq, &mddev->event_work);
7056	md_new_event_inintr(mddev);
7057}
7058EXPORT_SYMBOL(md_error);
7059
7060/* seq_file implementation /proc/mdstat */
7061
7062static void status_unused(struct seq_file *seq)
7063{
7064	int i = 0;
7065	struct md_rdev *rdev;
7066
7067	seq_printf(seq, "unused devices: ");
7068
7069	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
7070		char b[BDEVNAME_SIZE];
7071		i++;
7072		seq_printf(seq, "%s ",
7073			      bdevname(rdev->bdev,b));
7074	}
7075	if (!i)
7076		seq_printf(seq, "<none>");
7077
7078	seq_printf(seq, "\n");
7079}
7080
7081static void status_resync(struct seq_file *seq, struct mddev *mddev)
7082{
7083	sector_t max_sectors, resync, res;
7084	unsigned long dt, db;
7085	sector_t rt;
7086	int scale;
7087	unsigned int per_milli;
7088
7089	if (mddev->curr_resync <= 3)
7090		resync = 0;
7091	else
7092		resync = mddev->curr_resync
7093			- atomic_read(&mddev->recovery_active);
7094
7095	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7096	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7097		max_sectors = mddev->resync_max_sectors;
7098	else
7099		max_sectors = mddev->dev_sectors;
7100
7101	WARN_ON(max_sectors == 0);
7102	/* Pick 'scale' such that (resync>>scale)*1000 will fit
7103	 * in a sector_t, and (max_sectors>>scale) will fit in a
7104	 * u32, as those are the requirements for sector_div.
7105	 * Thus 'scale' must be at least 10
7106	 */
7107	scale = 10;
7108	if (sizeof(sector_t) > sizeof(unsigned long)) {
7109		while ( max_sectors/2 > (1ULL<<(scale+32)))
7110			scale++;
7111	}
7112	res = (resync>>scale)*1000;
7113	sector_div(res, (u32)((max_sectors>>scale)+1));
7114
7115	per_milli = res;
7116	{
7117		int i, x = per_milli/50, y = 20-x;
7118		seq_printf(seq, "[");
7119		for (i = 0; i < x; i++)
7120			seq_printf(seq, "=");
7121		seq_printf(seq, ">");
7122		for (i = 0; i < y; i++)
7123			seq_printf(seq, ".");
7124		seq_printf(seq, "] ");
7125	}
7126	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7127		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
7128		    "reshape" :
7129		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
7130		     "check" :
7131		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
7132		      "resync" : "recovery"))),
7133		   per_milli/10, per_milli % 10,
7134		   (unsigned long long) resync/2,
7135		   (unsigned long long) max_sectors/2);
7136
7137	/*
7138	 * dt: time from mark until now
7139	 * db: blocks written from mark until now
7140	 * rt: remaining time
7141	 *
7142	 * rt is a sector_t, so could be 32bit or 64bit.
7143	 * So we divide before multiply in case it is 32bit and close
7144	 * to the limit.
7145	 * We scale the divisor (db) by 32 to avoid losing precision
7146	 * near the end of resync when the number of remaining sectors
7147	 * is close to 'db'.
7148	 * We then divide rt by 32 after multiplying by db to compensate.
7149	 * The '+1' avoids division by zero if db is very small.
7150	 */
7151	dt = ((jiffies - mddev->resync_mark) / HZ);
7152	if (!dt) dt++;
7153	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
7154		- mddev->resync_mark_cnt;
7155
7156	rt = max_sectors - resync;    /* number of remaining sectors */
7157	sector_div(rt, db/32+1);
7158	rt *= dt;
7159	rt >>= 5;
7160
7161	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
7162		   ((unsigned long)rt % 60)/6);
7163
7164	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7165}
7166
7167static void *md_seq_start(struct seq_file *seq, loff_t *pos)
7168{
7169	struct list_head *tmp;
7170	loff_t l = *pos;
7171	struct mddev *mddev;
7172
7173	if (l >= 0x10000)
7174		return NULL;
7175	if (!l--)
7176		/* header */
7177		return (void*)1;
7178
7179	spin_lock(&all_mddevs_lock);
7180	list_for_each(tmp,&all_mddevs)
7181		if (!l--) {
7182			mddev = list_entry(tmp, struct mddev, all_mddevs);
7183			mddev_get(mddev);
7184			spin_unlock(&all_mddevs_lock);
7185			return mddev;
7186		}
7187	spin_unlock(&all_mddevs_lock);
7188	if (!l--)
7189		return (void*)2;/* tail */
7190	return NULL;
7191}
7192
7193static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7194{
7195	struct list_head *tmp;
7196	struct mddev *next_mddev, *mddev = v;
7197
7198	++*pos;
7199	if (v == (void*)2)
7200		return NULL;
7201
7202	spin_lock(&all_mddevs_lock);
7203	if (v == (void*)1)
7204		tmp = all_mddevs.next;
7205	else
7206		tmp = mddev->all_mddevs.next;
7207	if (tmp != &all_mddevs)
7208		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
7209	else {
7210		next_mddev = (void*)2;
7211		*pos = 0x10000;
7212	}
7213	spin_unlock(&all_mddevs_lock);
7214
7215	if (v != (void*)1)
7216		mddev_put(mddev);
7217	return next_mddev;
7218
7219}
7220
7221static void md_seq_stop(struct seq_file *seq, void *v)
7222{
7223	struct mddev *mddev = v;
7224
7225	if (mddev && v != (void*)1 && v != (void*)2)
7226		mddev_put(mddev);
7227}
7228
7229static int md_seq_show(struct seq_file *seq, void *v)
7230{
7231	struct mddev *mddev = v;
7232	sector_t sectors;
7233	struct md_rdev *rdev;
7234
7235	if (v == (void*)1) {
7236		struct md_personality *pers;
7237		seq_printf(seq, "Personalities : ");
7238		spin_lock(&pers_lock);
7239		list_for_each_entry(pers, &pers_list, list)
7240			seq_printf(seq, "[%s] ", pers->name);
7241
7242		spin_unlock(&pers_lock);
7243		seq_printf(seq, "\n");
7244		seq->poll_event = atomic_read(&md_event_count);
7245		return 0;
7246	}
7247	if (v == (void*)2) {
7248		status_unused(seq);
7249		return 0;
7250	}
7251
7252	spin_lock(&mddev->lock);
7253	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7254		seq_printf(seq, "%s : %sactive", mdname(mddev),
7255						mddev->pers ? "" : "in");
7256		if (mddev->pers) {
7257			if (mddev->ro==1)
7258				seq_printf(seq, " (read-only)");
7259			if (mddev->ro==2)
7260				seq_printf(seq, " (auto-read-only)");
7261			seq_printf(seq, " %s", mddev->pers->name);
7262		}
7263
7264		sectors = 0;
7265		rcu_read_lock();
7266		rdev_for_each_rcu(rdev, mddev) {
7267			char b[BDEVNAME_SIZE];
7268			seq_printf(seq, " %s[%d]",
7269				bdevname(rdev->bdev,b), rdev->desc_nr);
7270			if (test_bit(WriteMostly, &rdev->flags))
7271				seq_printf(seq, "(W)");
7272			if (test_bit(Faulty, &rdev->flags)) {
7273				seq_printf(seq, "(F)");
7274				continue;
7275			}
7276			if (rdev->raid_disk < 0)
7277				seq_printf(seq, "(S)"); /* spare */
7278			if (test_bit(Replacement, &rdev->flags))
7279				seq_printf(seq, "(R)");
7280			sectors += rdev->sectors;
7281		}
7282		rcu_read_unlock();
7283
7284		if (!list_empty(&mddev->disks)) {
7285			if (mddev->pers)
7286				seq_printf(seq, "\n      %llu blocks",
7287					   (unsigned long long)
7288					   mddev->array_sectors / 2);
7289			else
7290				seq_printf(seq, "\n      %llu blocks",
7291					   (unsigned long long)sectors / 2);
7292		}
7293		if (mddev->persistent) {
7294			if (mddev->major_version != 0 ||
7295			    mddev->minor_version != 90) {
7296				seq_printf(seq," super %d.%d",
7297					   mddev->major_version,
7298					   mddev->minor_version);
7299			}
7300		} else if (mddev->external)
7301			seq_printf(seq, " super external:%s",
7302				   mddev->metadata_type);
7303		else
7304			seq_printf(seq, " super non-persistent");
7305
7306		if (mddev->pers) {
7307			mddev->pers->status(seq, mddev);
7308			seq_printf(seq, "\n      ");
7309			if (mddev->pers->sync_request) {
7310				if (mddev->curr_resync > 2) {
7311					status_resync(seq, mddev);
7312					seq_printf(seq, "\n      ");
7313				} else if (mddev->curr_resync >= 1)
7314					seq_printf(seq, "\tresync=DELAYED\n      ");
7315				else if (mddev->recovery_cp < MaxSector)
7316					seq_printf(seq, "\tresync=PENDING\n      ");
7317			}
7318		} else
7319			seq_printf(seq, "\n       ");
7320
7321		bitmap_status(seq, mddev->bitmap);
7322
7323		seq_printf(seq, "\n");
7324	}
7325	spin_unlock(&mddev->lock);
7326
7327	return 0;
7328}
7329
7330static const struct seq_operations md_seq_ops = {
7331	.start  = md_seq_start,
7332	.next   = md_seq_next,
7333	.stop   = md_seq_stop,
7334	.show   = md_seq_show,
7335};
7336
7337static int md_seq_open(struct inode *inode, struct file *file)
7338{
7339	struct seq_file *seq;
7340	int error;
7341
7342	error = seq_open(file, &md_seq_ops);
7343	if (error)
7344		return error;
7345
7346	seq = file->private_data;
7347	seq->poll_event = atomic_read(&md_event_count);
7348	return error;
7349}
7350
7351static int md_unloading;
7352static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7353{
7354	struct seq_file *seq = filp->private_data;
7355	int mask;
7356
7357	if (md_unloading)
7358		return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
7359	poll_wait(filp, &md_event_waiters, wait);
7360
7361	/* always allow read */
7362	mask = POLLIN | POLLRDNORM;
7363
7364	if (seq->poll_event != atomic_read(&md_event_count))
7365		mask |= POLLERR | POLLPRI;
7366	return mask;
7367}
7368
7369static const struct file_operations md_seq_fops = {
7370	.owner		= THIS_MODULE,
7371	.open           = md_seq_open,
7372	.read           = seq_read,
7373	.llseek         = seq_lseek,
7374	.release	= seq_release_private,
7375	.poll		= mdstat_poll,
7376};
7377
7378int register_md_personality(struct md_personality *p)
7379{
7380	printk(KERN_INFO "md: %s personality registered for level %d\n",
7381						p->name, p->level);
7382	spin_lock(&pers_lock);
7383	list_add_tail(&p->list, &pers_list);
7384	spin_unlock(&pers_lock);
7385	return 0;
7386}
7387EXPORT_SYMBOL(register_md_personality);
7388
7389int unregister_md_personality(struct md_personality *p)
7390{
7391	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
7392	spin_lock(&pers_lock);
7393	list_del_init(&p->list);
7394	spin_unlock(&pers_lock);
7395	return 0;
7396}
7397EXPORT_SYMBOL(unregister_md_personality);
7398
7399int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
7400{
7401	if (md_cluster_ops != NULL)
7402		return -EALREADY;
7403	spin_lock(&pers_lock);
7404	md_cluster_ops = ops;
7405	md_cluster_mod = module;
7406	spin_unlock(&pers_lock);
7407	return 0;
7408}
7409EXPORT_SYMBOL(register_md_cluster_operations);
7410
7411int unregister_md_cluster_operations(void)
7412{
7413	spin_lock(&pers_lock);
7414	md_cluster_ops = NULL;
7415	spin_unlock(&pers_lock);
7416	return 0;
7417}
7418EXPORT_SYMBOL(unregister_md_cluster_operations);
7419
7420int md_setup_cluster(struct mddev *mddev, int nodes)
7421{
7422	int err;
7423
7424	err = request_module("md-cluster");
7425	if (err) {
7426		pr_err("md-cluster module not found.\n");
7427		return err;
7428	}
7429
7430	spin_lock(&pers_lock);
7431	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
7432		spin_unlock(&pers_lock);
7433		return -ENOENT;
7434	}
7435	spin_unlock(&pers_lock);
7436
7437	return md_cluster_ops->join(mddev, nodes);
7438}
7439
7440void md_cluster_stop(struct mddev *mddev)
7441{
7442	if (!md_cluster_ops)
7443		return;
7444	md_cluster_ops->leave(mddev);
7445	module_put(md_cluster_mod);
7446}
7447
7448static int is_mddev_idle(struct mddev *mddev, int init)
7449{
7450	struct md_rdev *rdev;
7451	int idle;
7452	int curr_events;
7453
7454	idle = 1;
7455	rcu_read_lock();
7456	rdev_for_each_rcu(rdev, mddev) {
7457		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
7458		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7459			      (int)part_stat_read(&disk->part0, sectors[1]) -
7460			      atomic_read(&disk->sync_io);
7461		/* sync IO will cause sync_io to increase before the disk_stats
7462		 * as sync_io is counted when a request starts, and
7463		 * disk_stats is counted when it completes.
7464		 * So resync activity will cause curr_events to be smaller than
7465		 * when there was no such activity.
7466		 * non-sync IO will cause disk_stat to increase without
7467		 * increasing sync_io so curr_events will (eventually)
7468		 * be larger than it was before.  Once it becomes
7469		 * substantially larger, the test below will cause
7470		 * the array to appear non-idle, and resync will slow
7471		 * down.
7472		 * If there is a lot of outstanding resync activity when
7473		 * we set last_event to curr_events, then all that activity
7474		 * completing might cause the array to appear non-idle
7475		 * and resync will be slowed down even though there might
7476		 * not have been non-resync activity.  This will only
7477		 * happen once though.  'last_events' will soon reflect
7478		 * the state where there is little or no outstanding
7479		 * resync requests, and further resync activity will
7480		 * always make curr_events less than last_events.
7481		 *
7482		 */
7483		if (init || curr_events - rdev->last_events > 64) {
7484			rdev->last_events = curr_events;
7485			idle = 0;
7486		}
7487	}
7488	rcu_read_unlock();
7489	return idle;
7490}
7491
7492void md_done_sync(struct mddev *mddev, int blocks, int ok)
7493{
7494	/* another "blocks" (512byte) blocks have been synced */
7495	atomic_sub(blocks, &mddev->recovery_active);
7496	wake_up(&mddev->recovery_wait);
7497	if (!ok) {
7498		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7499		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7500		md_wakeup_thread(mddev->thread);
7501		// stop recovery, signal do_sync ....
7502	}
7503}
7504EXPORT_SYMBOL(md_done_sync);
7505
7506/* md_write_start(mddev, bi)
7507 * If we need to update some array metadata (e.g. 'active' flag
7508 * in superblock) before writing, schedule a superblock update
7509 * and wait for it to complete.
7510 */
7511void md_write_start(struct mddev *mddev, struct bio *bi)
7512{
7513	int did_change = 0;
7514	if (bio_data_dir(bi) != WRITE)
7515		return;
7516
7517	BUG_ON(mddev->ro == 1);
7518	if (mddev->ro == 2) {
7519		/* need to switch to read/write */
7520		mddev->ro = 0;
7521		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7522		md_wakeup_thread(mddev->thread);
7523		md_wakeup_thread(mddev->sync_thread);
7524		did_change = 1;
7525	}
7526	atomic_inc(&mddev->writes_pending);
7527	if (mddev->safemode == 1)
7528		mddev->safemode = 0;
7529	if (mddev->in_sync) {
7530		spin_lock(&mddev->lock);
7531		if (mddev->in_sync) {
7532			mddev->in_sync = 0;
7533			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7534			set_bit(MD_CHANGE_PENDING, &mddev->flags);
7535			md_wakeup_thread(mddev->thread);
7536			did_change = 1;
7537		}
7538		spin_unlock(&mddev->lock);
7539	}
7540	if (did_change)
7541		sysfs_notify_dirent_safe(mddev->sysfs_state);
7542	wait_event(mddev->sb_wait,
7543		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
7544}
7545EXPORT_SYMBOL(md_write_start);
7546
7547void md_write_end(struct mddev *mddev)
7548{
7549	if (atomic_dec_and_test(&mddev->writes_pending)) {
7550		if (mddev->safemode == 2)
7551			md_wakeup_thread(mddev->thread);
7552		else if (mddev->safemode_delay)
7553			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7554	}
7555}
7556EXPORT_SYMBOL(md_write_end);
7557
7558/* md_allow_write(mddev)
7559 * Calling this ensures that the array is marked 'active' so that writes
7560 * may proceed without blocking.  It is important to call this before
7561 * attempting a GFP_KERNEL allocation while holding the mddev lock.
7562 * Must be called with mddev_lock held.
7563 *
7564 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7565 * is dropped, so return -EAGAIN after notifying userspace.
7566 */
7567int md_allow_write(struct mddev *mddev)
7568{
7569	if (!mddev->pers)
7570		return 0;
7571	if (mddev->ro)
7572		return 0;
7573	if (!mddev->pers->sync_request)
7574		return 0;
7575
7576	spin_lock(&mddev->lock);
7577	if (mddev->in_sync) {
7578		mddev->in_sync = 0;
7579		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7580		set_bit(MD_CHANGE_PENDING, &mddev->flags);
7581		if (mddev->safemode_delay &&
7582		    mddev->safemode == 0)
7583			mddev->safemode = 1;
7584		spin_unlock(&mddev->lock);
7585		if (mddev_is_clustered(mddev))
7586			md_cluster_ops->metadata_update_start(mddev);
7587		md_update_sb(mddev, 0);
7588		if (mddev_is_clustered(mddev))
7589			md_cluster_ops->metadata_update_finish(mddev);
7590		sysfs_notify_dirent_safe(mddev->sysfs_state);
7591	} else
7592		spin_unlock(&mddev->lock);
7593
7594	if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7595		return -EAGAIN;
7596	else
7597		return 0;
7598}
7599EXPORT_SYMBOL_GPL(md_allow_write);
7600
7601#define SYNC_MARKS	10
7602#define	SYNC_MARK_STEP	(3*HZ)
7603#define UPDATE_FREQUENCY (5*60*HZ)
7604void md_do_sync(struct md_thread *thread)
7605{
7606	struct mddev *mddev = thread->mddev;
7607	struct mddev *mddev2;
7608	unsigned int currspeed = 0,
7609		 window;
7610	sector_t max_sectors,j, io_sectors, recovery_done;
7611	unsigned long mark[SYNC_MARKS];
7612	unsigned long update_time;
7613	sector_t mark_cnt[SYNC_MARKS];
7614	int last_mark,m;
7615	struct list_head *tmp;
7616	sector_t last_check;
7617	int skipped = 0;
7618	struct md_rdev *rdev;
7619	char *desc, *action = NULL;
7620	struct blk_plug plug;
7621
7622	/* just incase thread restarts... */
7623	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7624		return;
7625	if (mddev->ro) {/* never try to sync a read-only array */
7626		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7627		return;
7628	}
7629
7630	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7631		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
7632			desc = "data-check";
7633			action = "check";
7634		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7635			desc = "requested-resync";
7636			action = "repair";
7637		} else
7638			desc = "resync";
7639	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7640		desc = "reshape";
7641	else
7642		desc = "recovery";
7643
7644	mddev->last_sync_action = action ?: desc;
7645
7646	/* we overload curr_resync somewhat here.
7647	 * 0 == not engaged in resync at all
7648	 * 2 == checking that there is no conflict with another sync
7649	 * 1 == like 2, but have yielded to allow conflicting resync to
7650	 *		commense
7651	 * other == active in resync - this many blocks
7652	 *
7653	 * Before starting a resync we must have set curr_resync to
7654	 * 2, and then checked that every "conflicting" array has curr_resync
7655	 * less than ours.  When we find one that is the same or higher
7656	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
7657	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
7658	 * This will mean we have to start checking from the beginning again.
7659	 *
7660	 */
7661
7662	do {
7663		mddev->curr_resync = 2;
7664
7665	try_again:
7666		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7667			goto skip;
7668		for_each_mddev(mddev2, tmp) {
7669			if (mddev2 == mddev)
7670				continue;
7671			if (!mddev->parallel_resync
7672			&&  mddev2->curr_resync
7673			&&  match_mddev_units(mddev, mddev2)) {
7674				DEFINE_WAIT(wq);
7675				if (mddev < mddev2 && mddev->curr_resync == 2) {
7676					/* arbitrarily yield */
7677					mddev->curr_resync = 1;
7678					wake_up(&resync_wait);
7679				}
7680				if (mddev > mddev2 && mddev->curr_resync == 1)
7681					/* no need to wait here, we can wait the next
7682					 * time 'round when curr_resync == 2
7683					 */
7684					continue;
7685				/* We need to wait 'interruptible' so as not to
7686				 * contribute to the load average, and not to
7687				 * be caught by 'softlockup'
7688				 */
7689				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7690				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7691				    mddev2->curr_resync >= mddev->curr_resync) {
7692					printk(KERN_INFO "md: delaying %s of %s"
7693					       " until %s has finished (they"
7694					       " share one or more physical units)\n",
7695					       desc, mdname(mddev), mdname(mddev2));
7696					mddev_put(mddev2);
7697					if (signal_pending(current))
7698						flush_signals(current);
7699					schedule();
7700					finish_wait(&resync_wait, &wq);
7701					goto try_again;
7702				}
7703				finish_wait(&resync_wait, &wq);
7704			}
7705		}
7706	} while (mddev->curr_resync < 2);
7707
7708	j = 0;
7709	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7710		/* resync follows the size requested by the personality,
7711		 * which defaults to physical size, but can be virtual size
7712		 */
7713		max_sectors = mddev->resync_max_sectors;
7714		atomic64_set(&mddev->resync_mismatches, 0);
7715		/* we don't use the checkpoint if there's a bitmap */
7716		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7717			j = mddev->resync_min;
7718		else if (!mddev->bitmap)
7719			j = mddev->recovery_cp;
7720
7721	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7722		max_sectors = mddev->resync_max_sectors;
7723	else {
7724		/* recovery follows the physical size of devices */
7725		max_sectors = mddev->dev_sectors;
7726		j = MaxSector;
7727		rcu_read_lock();
7728		rdev_for_each_rcu(rdev, mddev)
7729			if (rdev->raid_disk >= 0 &&
7730			    !test_bit(Faulty, &rdev->flags) &&
7731			    !test_bit(In_sync, &rdev->flags) &&
7732			    rdev->recovery_offset < j)
7733				j = rdev->recovery_offset;
7734		rcu_read_unlock();
7735
7736		/* If there is a bitmap, we need to make sure all
7737		 * writes that started before we added a spare
7738		 * complete before we start doing a recovery.
7739		 * Otherwise the write might complete and (via
7740		 * bitmap_endwrite) set a bit in the bitmap after the
7741		 * recovery has checked that bit and skipped that
7742		 * region.
7743		 */
7744		if (mddev->bitmap) {
7745			mddev->pers->quiesce(mddev, 1);
7746			mddev->pers->quiesce(mddev, 0);
7747		}
7748	}
7749
7750	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
7751	printk(KERN_INFO "md: minimum _guaranteed_  speed:"
7752		" %d KB/sec/disk.\n", speed_min(mddev));
7753	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
7754	       "(but not more than %d KB/sec) for %s.\n",
7755	       speed_max(mddev), desc);
7756
7757	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
7758
7759	io_sectors = 0;
7760	for (m = 0; m < SYNC_MARKS; m++) {
7761		mark[m] = jiffies;
7762		mark_cnt[m] = io_sectors;
7763	}
7764	last_mark = 0;
7765	mddev->resync_mark = mark[last_mark];
7766	mddev->resync_mark_cnt = mark_cnt[last_mark];
7767
7768	/*
7769	 * Tune reconstruction:
7770	 */
7771	window = 32*(PAGE_SIZE/512);
7772	printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
7773		window/2, (unsigned long long)max_sectors/2);
7774
7775	atomic_set(&mddev->recovery_active, 0);
7776	last_check = 0;
7777
7778	if (j>2) {
7779		printk(KERN_INFO
7780		       "md: resuming %s of %s from checkpoint.\n",
7781		       desc, mdname(mddev));
7782		mddev->curr_resync = j;
7783	} else
7784		mddev->curr_resync = 3; /* no longer delayed */
7785	mddev->curr_resync_completed = j;
7786	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7787	md_new_event(mddev);
7788	update_time = jiffies;
7789
7790	if (mddev_is_clustered(mddev))
7791		md_cluster_ops->resync_start(mddev, j, max_sectors);
7792
7793	blk_start_plug(&plug);
7794	while (j < max_sectors) {
7795		sector_t sectors;
7796
7797		skipped = 0;
7798
7799		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7800		    ((mddev->curr_resync > mddev->curr_resync_completed &&
7801		      (mddev->curr_resync - mddev->curr_resync_completed)
7802		      > (max_sectors >> 4)) ||
7803		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7804		     (j - mddev->curr_resync_completed)*2
7805		     >= mddev->resync_max - mddev->curr_resync_completed
7806			    )) {
7807			/* time to update curr_resync_completed */
7808			wait_event(mddev->recovery_wait,
7809				   atomic_read(&mddev->recovery_active) == 0);
7810			mddev->curr_resync_completed = j;
7811			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7812			    j > mddev->recovery_cp)
7813				mddev->recovery_cp = j;
7814			update_time = jiffies;
7815			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7816			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7817		}
7818
7819		while (j >= mddev->resync_max &&
7820		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7821			/* As this condition is controlled by user-space,
7822			 * we can block indefinitely, so use '_interruptible'
7823			 * to avoid triggering warnings.
7824			 */
7825			flush_signals(current); /* just in case */
7826			wait_event_interruptible(mddev->recovery_wait,
7827						 mddev->resync_max > j
7828						 || test_bit(MD_RECOVERY_INTR,
7829							     &mddev->recovery));
7830		}
7831
7832		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7833			break;
7834
7835		sectors = mddev->pers->sync_request(mddev, j, &skipped);
7836		if (sectors == 0) {
7837			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7838			break;
7839		}
7840
7841		if (!skipped) { /* actual IO requested */
7842			io_sectors += sectors;
7843			atomic_add(sectors, &mddev->recovery_active);
7844		}
7845
7846		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7847			break;
7848
7849		j += sectors;
7850		if (j > 2)
7851			mddev->curr_resync = j;
7852		if (mddev_is_clustered(mddev))
7853			md_cluster_ops->resync_info_update(mddev, j, max_sectors);
7854		mddev->curr_mark_cnt = io_sectors;
7855		if (last_check == 0)
7856			/* this is the earliest that rebuild will be
7857			 * visible in /proc/mdstat
7858			 */
7859			md_new_event(mddev);
7860
7861		if (last_check + window > io_sectors || j == max_sectors)
7862			continue;
7863
7864		last_check = io_sectors;
7865	repeat:
7866		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
7867			/* step marks */
7868			int next = (last_mark+1) % SYNC_MARKS;
7869
7870			mddev->resync_mark = mark[next];
7871			mddev->resync_mark_cnt = mark_cnt[next];
7872			mark[next] = jiffies;
7873			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
7874			last_mark = next;
7875		}
7876
7877		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7878			break;
7879
7880		/*
7881		 * this loop exits only if either when we are slower than
7882		 * the 'hard' speed limit, or the system was IO-idle for
7883		 * a jiffy.
7884		 * the system might be non-idle CPU-wise, but we only care
7885		 * about not overloading the IO subsystem. (things like an
7886		 * e2fsck being done on the RAID array should execute fast)
7887		 */
7888		cond_resched();
7889
7890		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
7891		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
7892			/((jiffies-mddev->resync_mark)/HZ +1) +1;
7893
7894		if (currspeed > speed_min(mddev)) {
7895			if (currspeed > speed_max(mddev)) {
7896				msleep(500);
7897				goto repeat;
7898			}
7899			if (!is_mddev_idle(mddev, 0)) {
7900				/*
7901				 * Give other IO more of a chance.
7902				 * The faster the devices, the less we wait.
7903				 */
7904				wait_event(mddev->recovery_wait,
7905					   !atomic_read(&mddev->recovery_active));
7906			}
7907		}
7908	}
7909	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
7910	       test_bit(MD_RECOVERY_INTR, &mddev->recovery)
7911	       ? "interrupted" : "done");
7912	/*
7913	 * this also signals 'finished resyncing' to md_stop
7914	 */
7915	blk_finish_plug(&plug);
7916	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7917
7918	/* tell personality that we are finished */
7919	mddev->pers->sync_request(mddev, max_sectors, &skipped);
7920
7921	if (mddev_is_clustered(mddev))
7922		md_cluster_ops->resync_finish(mddev);
7923
7924	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
7925	    mddev->curr_resync > 2) {
7926		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7927			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7928				if (mddev->curr_resync >= mddev->recovery_cp) {
7929					printk(KERN_INFO
7930					       "md: checkpointing %s of %s.\n",
7931					       desc, mdname(mddev));
7932					if (test_bit(MD_RECOVERY_ERROR,
7933						&mddev->recovery))
7934						mddev->recovery_cp =
7935							mddev->curr_resync_completed;
7936					else
7937						mddev->recovery_cp =
7938							mddev->curr_resync;
7939				}
7940			} else
7941				mddev->recovery_cp = MaxSector;
7942		} else {
7943			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7944				mddev->curr_resync = MaxSector;
7945			rcu_read_lock();
7946			rdev_for_each_rcu(rdev, mddev)
7947				if (rdev->raid_disk >= 0 &&
7948				    mddev->delta_disks >= 0 &&
7949				    !test_bit(Faulty, &rdev->flags) &&
7950				    !test_bit(In_sync, &rdev->flags) &&
7951				    rdev->recovery_offset < mddev->curr_resync)
7952					rdev->recovery_offset = mddev->curr_resync;
7953			rcu_read_unlock();
7954		}
7955	}
7956 skip:
7957	set_bit(MD_CHANGE_DEVS, &mddev->flags);
7958
7959	spin_lock(&mddev->lock);
7960	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7961		/* We completed so min/max setting can be forgotten if used. */
7962		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7963			mddev->resync_min = 0;
7964		mddev->resync_max = MaxSector;
7965	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7966		mddev->resync_min = mddev->curr_resync_completed;
7967	mddev->curr_resync = 0;
7968	spin_unlock(&mddev->lock);
7969
7970	wake_up(&resync_wait);
7971	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7972	md_wakeup_thread(mddev->thread);
7973	return;
7974}
7975EXPORT_SYMBOL_GPL(md_do_sync);
7976
7977static int remove_and_add_spares(struct mddev *mddev,
7978				 struct md_rdev *this)
7979{
7980	struct md_rdev *rdev;
7981	int spares = 0;
7982	int removed = 0;
7983
7984	rdev_for_each(rdev, mddev)
7985		if ((this == NULL || rdev == this) &&
7986		    rdev->raid_disk >= 0 &&
7987		    !test_bit(Blocked, &rdev->flags) &&
7988		    (test_bit(Faulty, &rdev->flags) ||
7989		     ! test_bit(In_sync, &rdev->flags)) &&
7990		    atomic_read(&rdev->nr_pending)==0) {
7991			if (mddev->pers->hot_remove_disk(
7992				    mddev, rdev) == 0) {
7993				sysfs_unlink_rdev(mddev, rdev);
7994				rdev->raid_disk = -1;
7995				removed++;
7996			}
7997		}
7998	if (removed && mddev->kobj.sd)
7999		sysfs_notify(&mddev->kobj, NULL, "degraded");
8000
8001	if (this)
8002		goto no_add;
8003
8004	rdev_for_each(rdev, mddev) {
8005		if (rdev->raid_disk >= 0 &&
8006		    !test_bit(In_sync, &rdev->flags) &&
8007		    !test_bit(Faulty, &rdev->flags))
8008			spares++;
8009		if (rdev->raid_disk >= 0)
8010			continue;
8011		if (test_bit(Faulty, &rdev->flags))
8012			continue;
8013		if (mddev->ro &&
8014		    ! (rdev->saved_raid_disk >= 0 &&
8015		       !test_bit(Bitmap_sync, &rdev->flags)))
8016			continue;
8017
8018		rdev->recovery_offset = 0;
8019		if (mddev->pers->
8020		    hot_add_disk(mddev, rdev) == 0) {
8021			if (sysfs_link_rdev(mddev, rdev))
8022				/* failure here is OK */;
8023			spares++;
8024			md_new_event(mddev);
8025			set_bit(MD_CHANGE_DEVS, &mddev->flags);
8026		}
8027	}
8028no_add:
8029	if (removed)
8030		set_bit(MD_CHANGE_DEVS, &mddev->flags);
8031	return spares;
8032}
8033
8034static void md_start_sync(struct work_struct *ws)
8035{
8036	struct mddev *mddev = container_of(ws, struct mddev, del_work);
8037
8038	mddev->sync_thread = md_register_thread(md_do_sync,
8039						mddev,
8040						"resync");
8041	if (!mddev->sync_thread) {
8042		printk(KERN_ERR "%s: could not start resync"
8043		       " thread...\n",
8044		       mdname(mddev));
8045		/* leave the spares where they are, it shouldn't hurt */
8046		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8047		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8048		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8049		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8050		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8051		wake_up(&resync_wait);
8052		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8053				       &mddev->recovery))
8054			if (mddev->sysfs_action)
8055				sysfs_notify_dirent_safe(mddev->sysfs_action);
8056	} else
8057		md_wakeup_thread(mddev->sync_thread);
8058	sysfs_notify_dirent_safe(mddev->sysfs_action);
8059	md_new_event(mddev);
8060}
8061
8062/*
8063 * This routine is regularly called by all per-raid-array threads to
8064 * deal with generic issues like resync and super-block update.
8065 * Raid personalities that don't have a thread (linear/raid0) do not
8066 * need this as they never do any recovery or update the superblock.
8067 *
8068 * It does not do any resync itself, but rather "forks" off other threads
8069 * to do that as needed.
8070 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
8071 * "->recovery" and create a thread at ->sync_thread.
8072 * When the thread finishes it sets MD_RECOVERY_DONE
8073 * and wakeups up this thread which will reap the thread and finish up.
8074 * This thread also removes any faulty devices (with nr_pending == 0).
8075 *
8076 * The overall approach is:
8077 *  1/ if the superblock needs updating, update it.
8078 *  2/ If a recovery thread is running, don't do anything else.
8079 *  3/ If recovery has finished, clean up, possibly marking spares active.
8080 *  4/ If there are any faulty devices, remove them.
8081 *  5/ If array is degraded, try to add spares devices
8082 *  6/ If array has spares or is not in-sync, start a resync thread.
8083 */
8084void md_check_recovery(struct mddev *mddev)
8085{
8086	if (mddev->suspended)
8087		return;
8088
8089	if (mddev->bitmap)
8090		bitmap_daemon_work(mddev);
8091
8092	if (signal_pending(current)) {
8093		if (mddev->pers->sync_request && !mddev->external) {
8094			printk(KERN_INFO "md: %s in immediate safe mode\n",
8095			       mdname(mddev));
8096			mddev->safemode = 2;
8097		}
8098		flush_signals(current);
8099	}
8100
8101	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8102		return;
8103	if ( ! (
8104		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
8105		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8106		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8107		(mddev->external == 0 && mddev->safemode == 1) ||
8108		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
8109		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
8110		))
8111		return;
8112
8113	if (mddev_trylock(mddev)) {
8114		int spares = 0;
8115
8116		if (mddev->ro) {
8117			/* On a read-only array we can:
8118			 * - remove failed devices
8119			 * - add already-in_sync devices if the array itself
8120			 *   is in-sync.
8121			 * As we only add devices that are already in-sync,
8122			 * we can activate the spares immediately.
8123			 */
8124			remove_and_add_spares(mddev, NULL);
8125			/* There is no thread, but we need to call
8126			 * ->spare_active and clear saved_raid_disk
8127			 */
8128			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8129			md_reap_sync_thread(mddev);
8130			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8131			goto unlock;
8132		}
8133
8134		if (!mddev->external) {
8135			int did_change = 0;
8136			spin_lock(&mddev->lock);
8137			if (mddev->safemode &&
8138			    !atomic_read(&mddev->writes_pending) &&
8139			    !mddev->in_sync &&
8140			    mddev->recovery_cp == MaxSector) {
8141				mddev->in_sync = 1;
8142				did_change = 1;
8143				set_bit(MD_CHANGE_CLEAN, &mddev->flags);
8144			}
8145			if (mddev->safemode == 1)
8146				mddev->safemode = 0;
8147			spin_unlock(&mddev->lock);
8148			if (did_change)
8149				sysfs_notify_dirent_safe(mddev->sysfs_state);
8150		}
8151
8152		if (mddev->flags & MD_UPDATE_SB_FLAGS) {
8153			if (mddev_is_clustered(mddev))
8154				md_cluster_ops->metadata_update_start(mddev);
8155			md_update_sb(mddev, 0);
8156			if (mddev_is_clustered(mddev))
8157				md_cluster_ops->metadata_update_finish(mddev);
8158		}
8159
8160		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
8161		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
8162			/* resync/recovery still happening */
8163			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8164			goto unlock;
8165		}
8166		if (mddev->sync_thread) {
8167			md_reap_sync_thread(mddev);
8168			goto unlock;
8169		}
8170		/* Set RUNNING before clearing NEEDED to avoid
8171		 * any transients in the value of "sync_action".
8172		 */
8173		mddev->curr_resync_completed = 0;
8174		spin_lock(&mddev->lock);
8175		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8176		spin_unlock(&mddev->lock);
8177		/* Clear some bits that don't mean anything, but
8178		 * might be left set
8179		 */
8180		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
8181		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8182
8183		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8184		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
8185			goto not_running;
8186		/* no recovery is running.
8187		 * remove any failed drives, then
8188		 * add spares if possible.
8189		 * Spares are also removed and re-added, to allow
8190		 * the personality to fail the re-add.
8191		 */
8192
8193		if (mddev->reshape_position != MaxSector) {
8194			if (mddev->pers->check_reshape == NULL ||
8195			    mddev->pers->check_reshape(mddev) != 0)
8196				/* Cannot proceed */
8197				goto not_running;
8198			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8199			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8200		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
8201			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8202			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8203			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8204			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8205		} else if (mddev->recovery_cp < MaxSector) {
8206			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8207			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8208		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
8209			/* nothing to be done ... */
8210			goto not_running;
8211
8212		if (mddev->pers->sync_request) {
8213			if (spares) {
8214				/* We are adding a device or devices to an array
8215				 * which has the bitmap stored on all devices.
8216				 * So make sure all bitmap pages get written
8217				 */
8218				bitmap_write_all(mddev->bitmap);
8219			}
8220			INIT_WORK(&mddev->del_work, md_start_sync);
8221			queue_work(md_misc_wq, &mddev->del_work);
8222			goto unlock;
8223		}
8224	not_running:
8225		if (!mddev->sync_thread) {
8226			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8227			wake_up(&resync_wait);
8228			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8229					       &mddev->recovery))
8230				if (mddev->sysfs_action)
8231					sysfs_notify_dirent_safe(mddev->sysfs_action);
8232		}
8233	unlock:
8234		wake_up(&mddev->sb_wait);
8235		mddev_unlock(mddev);
8236	}
8237}
8238EXPORT_SYMBOL(md_check_recovery);
8239
8240void md_reap_sync_thread(struct mddev *mddev)
8241{
8242	struct md_rdev *rdev;
8243
8244	/* resync has finished, collect result */
8245	md_unregister_thread(&mddev->sync_thread);
8246	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8247	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8248		/* success...*/
8249		/* activate any spares */
8250		if (mddev->pers->spare_active(mddev)) {
8251			sysfs_notify(&mddev->kobj, NULL,
8252				     "degraded");
8253			set_bit(MD_CHANGE_DEVS, &mddev->flags);
8254		}
8255	}
8256	if (mddev_is_clustered(mddev))
8257		md_cluster_ops->metadata_update_start(mddev);
8258	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8259	    mddev->pers->finish_reshape)
8260		mddev->pers->finish_reshape(mddev);
8261
8262	/* If array is no-longer degraded, then any saved_raid_disk
8263	 * information must be scrapped.
8264	 */
8265	if (!mddev->degraded)
8266		rdev_for_each(rdev, mddev)
8267			rdev->saved_raid_disk = -1;
8268
8269	md_update_sb(mddev, 1);
8270	if (mddev_is_clustered(mddev))
8271		md_cluster_ops->metadata_update_finish(mddev);
8272	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8273	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8274	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8275	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8276	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8277	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8278	wake_up(&resync_wait);
8279	/* flag recovery needed just to double check */
8280	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8281	sysfs_notify_dirent_safe(mddev->sysfs_action);
8282	md_new_event(mddev);
8283	if (mddev->event_work.func)
8284		queue_work(md_misc_wq, &mddev->event_work);
8285}
8286EXPORT_SYMBOL(md_reap_sync_thread);
8287
8288void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
8289{
8290	sysfs_notify_dirent_safe(rdev->sysfs_state);
8291	wait_event_timeout(rdev->blocked_wait,
8292			   !test_bit(Blocked, &rdev->flags) &&
8293			   !test_bit(BlockedBadBlocks, &rdev->flags),
8294			   msecs_to_jiffies(5000));
8295	rdev_dec_pending(rdev, mddev);
8296}
8297EXPORT_SYMBOL(md_wait_for_blocked_rdev);
8298
8299void md_finish_reshape(struct mddev *mddev)
8300{
8301	/* called be personality module when reshape completes. */
8302	struct md_rdev *rdev;
8303
8304	rdev_for_each(rdev, mddev) {
8305		if (rdev->data_offset > rdev->new_data_offset)
8306			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
8307		else
8308			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
8309		rdev->data_offset = rdev->new_data_offset;
8310	}
8311}
8312EXPORT_SYMBOL(md_finish_reshape);
8313
8314/* Bad block management.
8315 * We can record which blocks on each device are 'bad' and so just
8316 * fail those blocks, or that stripe, rather than the whole device.
8317 * Entries in the bad-block table are 64bits wide.  This comprises:
8318 * Length of bad-range, in sectors: 0-511 for lengths 1-512
8319 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
8320 *  A 'shift' can be set so that larger blocks are tracked and
8321 *  consequently larger devices can be covered.
8322 * 'Acknowledged' flag - 1 bit. - the most significant bit.
8323 *
8324 * Locking of the bad-block table uses a seqlock so md_is_badblock
8325 * might need to retry if it is very unlucky.
8326 * We will sometimes want to check for bad blocks in a bi_end_io function,
8327 * so we use the write_seqlock_irq variant.
8328 *
8329 * When looking for a bad block we specify a range and want to
8330 * know if any block in the range is bad.  So we binary-search
8331 * to the last range that starts at-or-before the given endpoint,
8332 * (or "before the sector after the target range")
8333 * then see if it ends after the given start.
8334 * We return
8335 *  0 if there are no known bad blocks in the range
8336 *  1 if there are known bad block which are all acknowledged
8337 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
8338 * plus the start/length of the first bad section we overlap.
8339 */
8340int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
8341		   sector_t *first_bad, int *bad_sectors)
8342{
8343	int hi;
8344	int lo;
8345	u64 *p = bb->page;
8346	int rv;
8347	sector_t target = s + sectors;
8348	unsigned seq;
8349
8350	if (bb->shift > 0) {
8351		/* round the start down, and the end up */
8352		s >>= bb->shift;
8353		target += (1<<bb->shift) - 1;
8354		target >>= bb->shift;
8355		sectors = target - s;
8356	}
8357	/* 'target' is now the first block after the bad range */
8358
8359retry:
8360	seq = read_seqbegin(&bb->lock);
8361	lo = 0;
8362	rv = 0;
8363	hi = bb->count;
8364
8365	/* Binary search between lo and hi for 'target'
8366	 * i.e. for the last range that starts before 'target'
8367	 */
8368	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
8369	 * are known not to be the last range before target.
8370	 * VARIANT: hi-lo is the number of possible
8371	 * ranges, and decreases until it reaches 1
8372	 */
8373	while (hi - lo > 1) {
8374		int mid = (lo + hi) / 2;
8375		sector_t a = BB_OFFSET(p[mid]);
8376		if (a < target)
8377			/* This could still be the one, earlier ranges
8378			 * could not. */
8379			lo = mid;
8380		else
8381			/* This and later ranges are definitely out. */
8382			hi = mid;
8383	}
8384	/* 'lo' might be the last that started before target, but 'hi' isn't */
8385	if (hi > lo) {
8386		/* need to check all range that end after 's' to see if
8387		 * any are unacknowledged.
8388		 */
8389		while (lo >= 0 &&
8390		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8391			if (BB_OFFSET(p[lo]) < target) {
8392				/* starts before the end, and finishes after
8393				 * the start, so they must overlap
8394				 */
8395				if (rv != -1 && BB_ACK(p[lo]))
8396					rv = 1;
8397				else
8398					rv = -1;
8399				*first_bad = BB_OFFSET(p[lo]);
8400				*bad_sectors = BB_LEN(p[lo]);
8401			}
8402			lo--;
8403		}
8404	}
8405
8406	if (read_seqretry(&bb->lock, seq))
8407		goto retry;
8408
8409	return rv;
8410}
8411EXPORT_SYMBOL_GPL(md_is_badblock);
8412
8413/*
8414 * Add a range of bad blocks to the table.
8415 * This might extend the table, or might contract it
8416 * if two adjacent ranges can be merged.
8417 * We binary-search to find the 'insertion' point, then
8418 * decide how best to handle it.
8419 */
8420static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
8421			    int acknowledged)
8422{
8423	u64 *p;
8424	int lo, hi;
8425	int rv = 1;
8426	unsigned long flags;
8427
8428	if (bb->shift < 0)
8429		/* badblocks are disabled */
8430		return 0;
8431
8432	if (bb->shift) {
8433		/* round the start down, and the end up */
8434		sector_t next = s + sectors;
8435		s >>= bb->shift;
8436		next += (1<<bb->shift) - 1;
8437		next >>= bb->shift;
8438		sectors = next - s;
8439	}
8440
8441	write_seqlock_irqsave(&bb->lock, flags);
8442
8443	p = bb->page;
8444	lo = 0;
8445	hi = bb->count;
8446	/* Find the last range that starts at-or-before 's' */
8447	while (hi - lo > 1) {
8448		int mid = (lo + hi) / 2;
8449		sector_t a = BB_OFFSET(p[mid]);
8450		if (a <= s)
8451			lo = mid;
8452		else
8453			hi = mid;
8454	}
8455	if (hi > lo && BB_OFFSET(p[lo]) > s)
8456		hi = lo;
8457
8458	if (hi > lo) {
8459		/* we found a range that might merge with the start
8460		 * of our new range
8461		 */
8462		sector_t a = BB_OFFSET(p[lo]);
8463		sector_t e = a + BB_LEN(p[lo]);
8464		int ack = BB_ACK(p[lo]);
8465		if (e >= s) {
8466			/* Yes, we can merge with a previous range */
8467			if (s == a && s + sectors >= e)
8468				/* new range covers old */
8469				ack = acknowledged;
8470			else
8471				ack = ack && acknowledged;
8472
8473			if (e < s + sectors)
8474				e = s + sectors;
8475			if (e - a <= BB_MAX_LEN) {
8476				p[lo] = BB_MAKE(a, e-a, ack);
8477				s = e;
8478			} else {
8479				/* does not all fit in one range,
8480				 * make p[lo] maximal
8481				 */
8482				if (BB_LEN(p[lo]) != BB_MAX_LEN)
8483					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
8484				s = a + BB_MAX_LEN;
8485			}
8486			sectors = e - s;
8487		}
8488	}
8489	if (sectors && hi < bb->count) {
8490		/* 'hi' points to the first range that starts after 's'.
8491		 * Maybe we can merge with the start of that range */
8492		sector_t a = BB_OFFSET(p[hi]);
8493		sector_t e = a + BB_LEN(p[hi]);
8494		int ack = BB_ACK(p[hi]);
8495		if (a <= s + sectors) {
8496			/* merging is possible */
8497			if (e <= s + sectors) {
8498				/* full overlap */
8499				e = s + sectors;
8500				ack = acknowledged;
8501			} else
8502				ack = ack && acknowledged;
8503
8504			a = s;
8505			if (e - a <= BB_MAX_LEN) {
8506				p[hi] = BB_MAKE(a, e-a, ack);
8507				s = e;
8508			} else {
8509				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
8510				s = a + BB_MAX_LEN;
8511			}
8512			sectors = e - s;
8513			lo = hi;
8514			hi++;
8515		}
8516	}
8517	if (sectors == 0 && hi < bb->count) {
8518		/* we might be able to combine lo and hi */
8519		/* Note: 's' is at the end of 'lo' */
8520		sector_t a = BB_OFFSET(p[hi]);
8521		int lolen = BB_LEN(p[lo]);
8522		int hilen = BB_LEN(p[hi]);
8523		int newlen = lolen + hilen - (s - a);
8524		if (s >= a && newlen < BB_MAX_LEN) {
8525			/* yes, we can combine them */
8526			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
8527			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
8528			memmove(p + hi, p + hi + 1,
8529				(bb->count - hi - 1) * 8);
8530			bb->count--;
8531		}
8532	}
8533	while (sectors) {
8534		/* didn't merge (it all).
8535		 * Need to add a range just before 'hi' */
8536		if (bb->count >= MD_MAX_BADBLOCKS) {
8537			/* No room for more */
8538			rv = 0;
8539			break;
8540		} else {
8541			int this_sectors = sectors;
8542			memmove(p + hi + 1, p + hi,
8543				(bb->count - hi) * 8);
8544			bb->count++;
8545
8546			if (this_sectors > BB_MAX_LEN)
8547				this_sectors = BB_MAX_LEN;
8548			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
8549			sectors -= this_sectors;
8550			s += this_sectors;
8551		}
8552	}
8553
8554	bb->changed = 1;
8555	if (!acknowledged)
8556		bb->unacked_exist = 1;
8557	write_sequnlock_irqrestore(&bb->lock, flags);
8558
8559	return rv;
8560}
8561
8562int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8563		       int is_new)
8564{
8565	int rv;
8566	if (is_new)
8567		s += rdev->new_data_offset;
8568	else
8569		s += rdev->data_offset;
8570	rv = md_set_badblocks(&rdev->badblocks,
8571			      s, sectors, 0);
8572	if (rv) {
8573		/* Make sure they get written out promptly */
8574		sysfs_notify_dirent_safe(rdev->sysfs_state);
8575		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8576		md_wakeup_thread(rdev->mddev->thread);
8577	}
8578	return rv;
8579}
8580EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8581
8582/*
8583 * Remove a range of bad blocks from the table.
8584 * This may involve extending the table if we spilt a region,
8585 * but it must not fail.  So if the table becomes full, we just
8586 * drop the remove request.
8587 */
8588static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
8589{
8590	u64 *p;
8591	int lo, hi;
8592	sector_t target = s + sectors;
8593	int rv = 0;
8594
8595	if (bb->shift > 0) {
8596		/* When clearing we round the start up and the end down.
8597		 * This should not matter as the shift should align with
8598		 * the block size and no rounding should ever be needed.
8599		 * However it is better the think a block is bad when it
8600		 * isn't than to think a block is not bad when it is.
8601		 */
8602		s += (1<<bb->shift) - 1;
8603		s >>= bb->shift;
8604		target >>= bb->shift;
8605		sectors = target - s;
8606	}
8607
8608	write_seqlock_irq(&bb->lock);
8609
8610	p = bb->page;
8611	lo = 0;
8612	hi = bb->count;
8613	/* Find the last range that starts before 'target' */
8614	while (hi - lo > 1) {
8615		int mid = (lo + hi) / 2;
8616		sector_t a = BB_OFFSET(p[mid]);
8617		if (a < target)
8618			lo = mid;
8619		else
8620			hi = mid;
8621	}
8622	if (hi > lo) {
8623		/* p[lo] is the last range that could overlap the
8624		 * current range.  Earlier ranges could also overlap,
8625		 * but only this one can overlap the end of the range.
8626		 */
8627		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
8628			/* Partial overlap, leave the tail of this range */
8629			int ack = BB_ACK(p[lo]);
8630			sector_t a = BB_OFFSET(p[lo]);
8631			sector_t end = a + BB_LEN(p[lo]);
8632
8633			if (a < s) {
8634				/* we need to split this range */
8635				if (bb->count >= MD_MAX_BADBLOCKS) {
8636					rv = -ENOSPC;
8637					goto out;
8638				}
8639				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
8640				bb->count++;
8641				p[lo] = BB_MAKE(a, s-a, ack);
8642				lo++;
8643			}
8644			p[lo] = BB_MAKE(target, end - target, ack);
8645			/* there is no longer an overlap */
8646			hi = lo;
8647			lo--;
8648		}
8649		while (lo >= 0 &&
8650		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8651			/* This range does overlap */
8652			if (BB_OFFSET(p[lo]) < s) {
8653				/* Keep the early parts of this range. */
8654				int ack = BB_ACK(p[lo]);
8655				sector_t start = BB_OFFSET(p[lo]);
8656				p[lo] = BB_MAKE(start, s - start, ack);
8657				/* now low doesn't overlap, so.. */
8658				break;
8659			}
8660			lo--;
8661		}
8662		/* 'lo' is strictly before, 'hi' is strictly after,
8663		 * anything between needs to be discarded
8664		 */
8665		if (hi - lo > 1) {
8666			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
8667			bb->count -= (hi - lo - 1);
8668		}
8669	}
8670
8671	bb->changed = 1;
8672out:
8673	write_sequnlock_irq(&bb->lock);
8674	return rv;
8675}
8676
8677int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8678			 int is_new)
8679{
8680	if (is_new)
8681		s += rdev->new_data_offset;
8682	else
8683		s += rdev->data_offset;
8684	return md_clear_badblocks(&rdev->badblocks,
8685				  s, sectors);
8686}
8687EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8688
8689/*
8690 * Acknowledge all bad blocks in a list.
8691 * This only succeeds if ->changed is clear.  It is used by
8692 * in-kernel metadata updates
8693 */
8694void md_ack_all_badblocks(struct badblocks *bb)
8695{
8696	if (bb->page == NULL || bb->changed)
8697		/* no point even trying */
8698		return;
8699	write_seqlock_irq(&bb->lock);
8700
8701	if (bb->changed == 0 && bb->unacked_exist) {
8702		u64 *p = bb->page;
8703		int i;
8704		for (i = 0; i < bb->count ; i++) {
8705			if (!BB_ACK(p[i])) {
8706				sector_t start = BB_OFFSET(p[i]);
8707				int len = BB_LEN(p[i]);
8708				p[i] = BB_MAKE(start, len, 1);
8709			}
8710		}
8711		bb->unacked_exist = 0;
8712	}
8713	write_sequnlock_irq(&bb->lock);
8714}
8715EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
8716
8717/* sysfs access to bad-blocks list.
8718 * We present two files.
8719 * 'bad-blocks' lists sector numbers and lengths of ranges that
8720 *    are recorded as bad.  The list is truncated to fit within
8721 *    the one-page limit of sysfs.
8722 *    Writing "sector length" to this file adds an acknowledged
8723 *    bad block list.
8724 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
8725 *    been acknowledged.  Writing to this file adds bad blocks
8726 *    without acknowledging them.  This is largely for testing.
8727 */
8728
8729static ssize_t
8730badblocks_show(struct badblocks *bb, char *page, int unack)
8731{
8732	size_t len;
8733	int i;
8734	u64 *p = bb->page;
8735	unsigned seq;
8736
8737	if (bb->shift < 0)
8738		return 0;
8739
8740retry:
8741	seq = read_seqbegin(&bb->lock);
8742
8743	len = 0;
8744	i = 0;
8745
8746	while (len < PAGE_SIZE && i < bb->count) {
8747		sector_t s = BB_OFFSET(p[i]);
8748		unsigned int length = BB_LEN(p[i]);
8749		int ack = BB_ACK(p[i]);
8750		i++;
8751
8752		if (unack && ack)
8753			continue;
8754
8755		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
8756				(unsigned long long)s << bb->shift,
8757				length << bb->shift);
8758	}
8759	if (unack && len == 0)
8760		bb->unacked_exist = 0;
8761
8762	if (read_seqretry(&bb->lock, seq))
8763		goto retry;
8764
8765	return len;
8766}
8767
8768#define DO_DEBUG 1
8769
8770static ssize_t
8771badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8772{
8773	unsigned long long sector;
8774	int length;
8775	char newline;
8776#ifdef DO_DEBUG
8777	/* Allow clearing via sysfs *only* for testing/debugging.
8778	 * Normally only a successful write may clear a badblock
8779	 */
8780	int clear = 0;
8781	if (page[0] == '-') {
8782		clear = 1;
8783		page++;
8784	}
8785#endif /* DO_DEBUG */
8786
8787	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8788	case 3:
8789		if (newline != '\n')
8790			return -EINVAL;
8791	case 2:
8792		if (length <= 0)
8793			return -EINVAL;
8794		break;
8795	default:
8796		return -EINVAL;
8797	}
8798
8799#ifdef DO_DEBUG
8800	if (clear) {
8801		md_clear_badblocks(bb, sector, length);
8802		return len;
8803	}
8804#endif /* DO_DEBUG */
8805	if (md_set_badblocks(bb, sector, length, !unack))
8806		return len;
8807	else
8808		return -ENOSPC;
8809}
8810
8811static int md_notify_reboot(struct notifier_block *this,
8812			    unsigned long code, void *x)
8813{
8814	struct list_head *tmp;
8815	struct mddev *mddev;
8816	int need_delay = 0;
8817
8818	for_each_mddev(mddev, tmp) {
8819		if (mddev_trylock(mddev)) {
8820			if (mddev->pers)
8821				__md_stop_writes(mddev);
8822			if (mddev->persistent)
8823				mddev->safemode = 2;
8824			mddev_unlock(mddev);
8825		}
8826		need_delay = 1;
8827	}
8828	/*
8829	 * certain more exotic SCSI devices are known to be
8830	 * volatile wrt too early system reboots. While the
8831	 * right place to handle this issue is the given
8832	 * driver, we do want to have a safe RAID driver ...
8833	 */
8834	if (need_delay)
8835		mdelay(1000*1);
8836
8837	return NOTIFY_DONE;
8838}
8839
8840static struct notifier_block md_notifier = {
8841	.notifier_call	= md_notify_reboot,
8842	.next		= NULL,
8843	.priority	= INT_MAX, /* before any real devices */
8844};
8845
8846static void md_geninit(void)
8847{
8848	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8849
8850	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8851}
8852
8853static int __init md_init(void)
8854{
8855	int ret = -ENOMEM;
8856
8857	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
8858	if (!md_wq)
8859		goto err_wq;
8860
8861	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
8862	if (!md_misc_wq)
8863		goto err_misc_wq;
8864
8865	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
8866		goto err_md;
8867
8868	if ((ret = register_blkdev(0, "mdp")) < 0)
8869		goto err_mdp;
8870	mdp_major = ret;
8871
8872	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
8873			    md_probe, NULL, NULL);
8874	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
8875			    md_probe, NULL, NULL);
8876
8877	register_reboot_notifier(&md_notifier);
8878	raid_table_header = register_sysctl_table(raid_root_table);
8879
8880	md_geninit();
8881	return 0;
8882
8883err_mdp:
8884	unregister_blkdev(MD_MAJOR, "md");
8885err_md:
8886	destroy_workqueue(md_misc_wq);
8887err_misc_wq:
8888	destroy_workqueue(md_wq);
8889err_wq:
8890	return ret;
8891}
8892
8893void md_reload_sb(struct mddev *mddev)
8894{
8895	struct md_rdev *rdev, *tmp;
8896
8897	rdev_for_each_safe(rdev, tmp, mddev) {
8898		rdev->sb_loaded = 0;
8899		ClearPageUptodate(rdev->sb_page);
8900	}
8901	mddev->raid_disks = 0;
8902	analyze_sbs(mddev);
8903	rdev_for_each_safe(rdev, tmp, mddev) {
8904		struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
8905		/* since we don't write to faulty devices, we figure out if the
8906		 *  disk is faulty by comparing events
8907		 */
8908		if (mddev->events > sb->events)
8909			set_bit(Faulty, &rdev->flags);
8910	}
8911
8912}
8913EXPORT_SYMBOL(md_reload_sb);
8914
8915#ifndef MODULE
8916
8917/*
8918 * Searches all registered partitions for autorun RAID arrays
8919 * at boot time.
8920 */
8921
8922static LIST_HEAD(all_detected_devices);
8923struct detected_devices_node {
8924	struct list_head list;
8925	dev_t dev;
8926};
8927
8928void md_autodetect_dev(dev_t dev)
8929{
8930	struct detected_devices_node *node_detected_dev;
8931
8932	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
8933	if (node_detected_dev) {
8934		node_detected_dev->dev = dev;
8935		list_add_tail(&node_detected_dev->list, &all_detected_devices);
8936	} else {
8937		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8938			", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8939	}
8940}
8941
8942static void autostart_arrays(int part)
8943{
8944	struct md_rdev *rdev;
8945	struct detected_devices_node *node_detected_dev;
8946	dev_t dev;
8947	int i_scanned, i_passed;
8948
8949	i_scanned = 0;
8950	i_passed = 0;
8951
8952	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
8953
8954	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
8955		i_scanned++;
8956		node_detected_dev = list_entry(all_detected_devices.next,
8957					struct detected_devices_node, list);
8958		list_del(&node_detected_dev->list);
8959		dev = node_detected_dev->dev;
8960		kfree(node_detected_dev);
8961		rdev = md_import_device(dev,0, 90);
8962		if (IS_ERR(rdev))
8963			continue;
8964
8965		if (test_bit(Faulty, &rdev->flags))
8966			continue;
8967
8968		set_bit(AutoDetected, &rdev->flags);
8969		list_add(&rdev->same_set, &pending_raid_disks);
8970		i_passed++;
8971	}
8972
8973	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
8974						i_scanned, i_passed);
8975
8976	autorun_devices(part);
8977}
8978
8979#endif /* !MODULE */
8980
8981static __exit void md_exit(void)
8982{
8983	struct mddev *mddev;
8984	struct list_head *tmp;
8985	int delay = 1;
8986
8987	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
8988	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
8989
8990	unregister_blkdev(MD_MAJOR,"md");
8991	unregister_blkdev(mdp_major, "mdp");
8992	unregister_reboot_notifier(&md_notifier);
8993	unregister_sysctl_table(raid_table_header);
8994
8995	/* We cannot unload the modules while some process is
8996	 * waiting for us in select() or poll() - wake them up
8997	 */
8998	md_unloading = 1;
8999	while (waitqueue_active(&md_event_waiters)) {
9000		/* not safe to leave yet */
9001		wake_up(&md_event_waiters);
9002		msleep(delay);
9003		delay += delay;
9004	}
9005	remove_proc_entry("mdstat", NULL);
9006
9007	for_each_mddev(mddev, tmp) {
9008		export_array(mddev);
9009		mddev->hold_active = 0;
9010	}
9011	destroy_workqueue(md_misc_wq);
9012	destroy_workqueue(md_wq);
9013}
9014
9015subsys_initcall(md_init);
9016module_exit(md_exit)
9017
9018static int get_ro(char *buffer, struct kernel_param *kp)
9019{
9020	return sprintf(buffer, "%d", start_readonly);
9021}
9022static int set_ro(const char *val, struct kernel_param *kp)
9023{
9024	char *e;
9025	int num = simple_strtoul(val, &e, 10);
9026	if (*val && (*e == '\0' || *e == '\n')) {
9027		start_readonly = num;
9028		return 0;
9029	}
9030	return -EINVAL;
9031}
9032
9033module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
9034module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
9035module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
9036
9037MODULE_LICENSE("GPL");
9038MODULE_DESCRIPTION("MD RAID framework");
9039MODULE_ALIAS("md");
9040MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
9041