2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/drivers/md/md.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
    md.c : Multiple Devices driver for Linux
      Copyright (C) 1998, 1999, 2000 Ingo Molnar
@@ -22,14 +23,6 @@
    - persistent bitmap code
      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
 
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2, or (at your option)
-   any later version.
-
-   You should have received a copy of the GNU General Public License
-   (for example /usr/src/linux/COPYING); if not, write to the Free
-   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 
    Errors, Warnings, etc.
    Please use:
@@ -44,6 +37,7 @@
 
 */
 
+#include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/kthread.h>
 #include <linux/blkdev.h>
@@ -64,17 +58,15 @@
 #include <linux/delay.h>
 #include <linux/raid/md_p.h>
 #include <linux/raid/md_u.h>
+#include <linux/raid/detect.h>
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
+#include <linux/part_stat.h>
 
 #include <trace/events/block.h>
 #include "md.h"
 #include "md-bitmap.h"
 #include "md-cluster.h"
-
-#ifndef MODULE
-static void autostart_arrays(int part);
-#endif
 
 /* pers_list is a list of registered personalities protected
  * by pers_lock.
@@ -88,12 +80,12 @@
 
 struct md_cluster_operations *md_cluster_ops;
 EXPORT_SYMBOL(md_cluster_ops);
-struct module *md_cluster_mod;
-EXPORT_SYMBOL(md_cluster_mod);
+static struct module *md_cluster_mod;
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_misc_wq;
+static struct workqueue_struct *md_rdev_misc_wq;
 
 static int remove_and_add_spares(struct mddev *mddev,
                                  struct md_rdev *this);
@@ -105,6 +97,8 @@
  * count by 2 for every hour elapsed between read errors.
  */
 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
+/* Default safemode delay: 200 msec */
+#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
 /*
  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  * is 1000 KB/sec, so the extra system load does not show up that much.
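The new DEFAULT_SAFEMODE_DELAY macro just names the long-standing default that md_run() used inline: 200 ms converted to jiffies, plus one tick so the delay never rounds down to zero. A standalone C sketch of that arithmetic (the HZ values below are illustrative assumptions, not anything this patch sets):

#include <stdio.h>

/* 200 msec expressed in jiffies, rounded up by one tick */
static unsigned long safemode_delay(unsigned long hz)
{
        return (200 * hz) / 1000 + 1;
}

int main(void)
{
        unsigned long hzs[] = { 100, 250, 1000 };
        int i;

        for (i = 0; i < 3; i++)
                /* e.g. HZ=250 -> 51 jiffies, about 204 ms */
                printf("HZ=%-4lu -> %lu jiffies\n", hzs[i], safemode_delay(hzs[i]));
        return 0;
}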
@@ -130,6 +124,168 @@
 {
         return mddev->sync_speed_max ?
                 mddev->sync_speed_max : sysctl_speed_limit_max;
+}
+
+static void rdev_uninit_serial(struct md_rdev *rdev)
+{
+        if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
+                return;
+
+        kvfree(rdev->serial);
+        rdev->serial = NULL;
+}
+
+static void rdevs_uninit_serial(struct mddev *mddev)
+{
+        struct md_rdev *rdev;
+
+        rdev_for_each(rdev, mddev)
+                rdev_uninit_serial(rdev);
+}
+
+static int rdev_init_serial(struct md_rdev *rdev)
+{
+        /* serial_nums equals with BARRIER_BUCKETS_NR */
+        int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
+        struct serial_in_rdev *serial = NULL;
+
+        if (test_bit(CollisionCheck, &rdev->flags))
+                return 0;
+
+        serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
+                          GFP_KERNEL);
+        if (!serial)
+                return -ENOMEM;
+
+        for (i = 0; i < serial_nums; i++) {
+                struct serial_in_rdev *serial_tmp = &serial[i];
+
+                spin_lock_init(&serial_tmp->serial_lock);
+                serial_tmp->serial_rb = RB_ROOT_CACHED;
+                init_waitqueue_head(&serial_tmp->serial_io_wait);
+        }
+
+        rdev->serial = serial;
+        set_bit(CollisionCheck, &rdev->flags);
+
+        return 0;
+}
+
+static int rdevs_init_serial(struct mddev *mddev)
+{
+        struct md_rdev *rdev;
+        int ret = 0;
+
+        rdev_for_each(rdev, mddev) {
+                ret = rdev_init_serial(rdev);
+                if (ret)
+                        break;
+        }
+
+        /* Free all resources if pool is not existed */
+        if (ret && !mddev->serial_info_pool)
+                rdevs_uninit_serial(mddev);
+
+        return ret;
+}
+
+/*
+ * rdev needs to enable serial stuffs if it meets the conditions:
+ * 1. it is multi-queue device flaged with writemostly.
+ * 2. the write-behind mode is enabled.
+ */
+static int rdev_need_serial(struct md_rdev *rdev)
+{
+        return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
+                rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
+                test_bit(WriteMostly, &rdev->flags));
+}
+
+/*
+ * Init resource for rdev(s), then create serial_info_pool if:
+ * 1. rdev is the first device which return true from rdev_enable_serial.
+ * 2. rdev is NULL, means we want to enable serialization for all rdevs.
+ */
+void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+                              bool is_suspend)
+{
+        int ret = 0;
+
+        if (rdev && !rdev_need_serial(rdev) &&
+            !test_bit(CollisionCheck, &rdev->flags))
+                return;
+
+        if (!is_suspend)
+                mddev_suspend(mddev);
+
+        if (!rdev)
+                ret = rdevs_init_serial(mddev);
+        else
+                ret = rdev_init_serial(rdev);
+        if (ret)
+                goto abort;
+
+        if (mddev->serial_info_pool == NULL) {
+                /*
+                 * already in memalloc noio context by
+                 * mddev_suspend()
+                 */
+                mddev->serial_info_pool =
+                        mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
+                                                sizeof(struct serial_info));
+                if (!mddev->serial_info_pool) {
+                        rdevs_uninit_serial(mddev);
+                        pr_err("can't alloc memory pool for serialization\n");
+                }
+        }
+
+abort:
+        if (!is_suspend)
+                mddev_resume(mddev);
+}
+
+/*
+ * Free resource from rdev(s), and destroy serial_info_pool under conditions:
+ * 1. rdev is the last device flaged with CollisionCheck.
+ * 2. when bitmap is destroyed while policy is not enabled.
+ * 3. for disable policy, the pool is destroyed only when no rdev needs it.
+ */
+void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+                               bool is_suspend)
+{
+        if (rdev && !test_bit(CollisionCheck, &rdev->flags))
+                return;
+
+        if (mddev->serial_info_pool) {
+                struct md_rdev *temp;
+                int num = 0; /* used to track if other rdevs need the pool */
+
+                if (!is_suspend)
+                        mddev_suspend(mddev);
+                rdev_for_each(temp, mddev) {
+                        if (!rdev) {
+                                if (!mddev->serialize_policy ||
+                                    !rdev_need_serial(temp))
+                                        rdev_uninit_serial(temp);
+                                else
+                                        num++;
+                        } else if (temp != rdev &&
+                                   test_bit(CollisionCheck, &temp->flags))
+                                num++;
+                }
+
+                if (rdev)
+                        rdev_uninit_serial(rdev);
+
+                if (num)
+                        pr_info("The mempool could be used by other devices\n");
+                else {
+                        mempool_destroy(mddev->serial_info_pool);
+                        mddev->serial_info_pool = NULL;
+                }
+                if (!is_suspend)
+                        mddev_resume(mddev);
+        }
 }
 
 static struct ctl_table_header *raid_table_header;
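rdev_init_serial() sizes the per-rdev bucket array so that, per its comment, the count matches BARRIER_BUCKETS_NR: one page worth of atomic_t counters. A standalone sketch of that computation (PAGE_SHIFT = 12 and sizeof(atomic_t) = 4 are assumptions for the example, not values taken from this patch):

#include <stdio.h>

/* floor(log2(v)), mirroring the kernel's ilog2() for the small case here */
static int ilog2_u(unsigned int v)
{
        int l = -1;

        while (v) {
                v >>= 1;
                l++;
        }
        return l;
}

int main(void)
{
        const int page_shift = 12;      /* 4 KiB pages assumed */
        const int atomic_size = 4;      /* sizeof(atomic_t) assumed */
        int serial_nums = 1 << (page_shift - ilog2_u(atomic_size));

        /* 1 << (12 - 2) = 1024 buckets */
        printf("serial_nums = %d\n", serial_nums);
        return 0;
}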
@@ -172,8 +328,6 @@
         { }
 };
 
-static const struct block_device_operations md_fops;
-
 static int start_readonly;
 
 /*
@@ -189,15 +343,10 @@
 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
                             struct mddev *mddev)
 {
-        struct bio *b;
-
         if (!mddev || !bioset_initialized(&mddev->bio_set))
                 return bio_alloc(gfp_mask, nr_iovecs);
 
-        b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
-        if (!b)
-                return NULL;
-        return b;
+        return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
 }
 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 
@@ -310,20 +459,25 @@
 }
 EXPORT_SYMBOL(md_handle_request);
 
-static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t md_submit_bio(struct bio *bio)
 {
         const int rw = bio_data_dir(bio);
         const int sgrp = op_stat_group(bio_op(bio));
-        struct mddev *mddev = q->queuedata;
+        struct mddev *mddev = bio->bi_disk->private_data;
         unsigned int sectors;
-        int cpu;
-
-        blk_queue_split(q, &bio);
 
         if (mddev == NULL || mddev->pers == NULL) {
                 bio_io_error(bio);
                 return BLK_QC_T_NONE;
         }
+
+        if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
+                bio_io_error(bio);
+                return BLK_QC_T_NONE;
+        }
+
+        blk_queue_split(&bio);
+
         if (mddev->ro == 1 && unlikely(rw == WRITE)) {
                 if (bio_sectors(bio) != 0)
                         bio->bi_status = BLK_STS_IOERR;
@@ -341,9 +495,9 @@
 
         md_handle_request(mddev, bio);
 
-        cpu = part_stat_lock();
-        part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
-        part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
+        part_stat_lock();
+        part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
+        part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
         part_stat_unlock();
 
         return BLK_QC_T_NONE;
@@ -371,11 +525,15 @@
         wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
 
         del_timer_sync(&mddev->safemode_timer);
+        /* restrict memory reclaim I/O during raid array is suspend */
+        mddev->noio_flag = memalloc_noio_save();
 }
 EXPORT_SYMBOL_GPL(mddev_suspend);
 
 void mddev_resume(struct mddev *mddev)
 {
+        /* entred the memalloc scope from mddev_suspend() */
+        memalloc_noio_restore(mddev->noio_flag);
         lockdep_assert_held(&mddev->reconfig_mutex);
         if (--mddev->suspended)
                 return;
@@ -388,26 +546,6 @@
 }
 EXPORT_SYMBOL_GPL(mddev_resume);
 
-int mddev_congested(struct mddev *mddev, int bits)
-{
-        struct md_personality *pers = mddev->pers;
-        int ret = 0;
-
-        rcu_read_lock();
-        if (mddev->suspended)
-                ret = 1;
-        else if (pers && pers->congested)
-                ret = pers->congested(mddev, bits);
-        rcu_read_unlock();
-        return ret;
-}
-EXPORT_SYMBOL_GPL(mddev_congested);
-static int md_congested(void *data, int bits)
-{
-        struct mddev *mddev = data;
-        return mddev_congested(mddev, bits);
-}
-
 /*
  * Generic flush handling for md
  */
@@ -417,13 +555,14 @@
         struct md_rdev *rdev = bio->bi_private;
         struct mddev *mddev = rdev->mddev;
 
+        bio_put(bio);
+
         rdev_dec_pending(rdev, mddev);
 
         if (atomic_dec_and_test(&mddev->flush_pending)) {
                 /* The pre-request flush has finished */
                 queue_work(md_wq, &mddev->flush_work);
         }
-        bio_put(bio);
 }
 
 static void md_submit_flush_data(struct work_struct *ws);
@@ -709,7 +848,13 @@
                         sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
                         if (mddev->sysfs_action)
                                 sysfs_put(mddev->sysfs_action);
+                        if (mddev->sysfs_completed)
+                                sysfs_put(mddev->sysfs_completed);
+                        if (mddev->sysfs_degraded)
+                                sysfs_put(mddev->sysfs_degraded);
                         mddev->sysfs_action = NULL;
+                        mddev->sysfs_completed = NULL;
+                        mddev->sysfs_degraded = NULL;
                 }
         }
         mddev->sysfs_active = 0;
@@ -811,7 +956,8 @@
         struct mddev *mddev = rdev->mddev;
 
         if (bio->bi_status) {
-                pr_err("md: super_written gets error=%d\n", bio->bi_status);
+                pr_err("md: %s gets error=%d\n", __func__,
+                       blk_status_to_errno(bio->bi_status));
                 md_error(mddev, rdev);
                 if (!test_bit(Faulty, &rdev->flags)
                     && (bio->bi_opf & MD_FAILFAST)) {
@@ -821,10 +967,12 @@
         } else
                 clear_bit(LastDev, &rdev->flags);
 
+        bio_put(bio);
+
+        rdev_dec_pending(rdev, mddev);
+
         if (atomic_dec_and_test(&mddev->pending_writes))
                 wake_up(&mddev->sb_wait);
-        rdev_dec_pending(rdev, mddev);
-        bio_put(bio);
 }
 
 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
@@ -1066,6 +1214,7 @@
         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
         mdp_super_t *sb;
         int ret;
+        bool spare_disk = true;
 
         /*
          * Calculate the position of the superblock (512byte sectors),
@@ -1116,8 +1265,19 @@
         else
                 rdev->desc_nr = sb->this_disk.number;
 
+        /* not spare disk, or LEVEL_MULTIPATH */
+        if (sb->level == LEVEL_MULTIPATH ||
+            (rdev->desc_nr >= 0 &&
+             rdev->desc_nr < MD_SB_DISKS &&
+             sb->disks[rdev->desc_nr].state &
+             ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
+                spare_disk = false;
+
         if (!refdev) {
-                ret = 1;
+                if (!spare_disk)
+                        ret = 1;
+                else
+                        ret = 0;
         } else {
                 __u64 ev1, ev2;
                 mdp_super_t *refsb = page_address(refdev->sb_page);
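The spare-disk test above keys off the per-disk state word in the v0.90 superblock: a device counts as fresh only if it is marked active or in-sync. A minimal sketch of the bit test (the MD_DISK_ACTIVE = 1 and MD_DISK_SYNC = 2 bit numbers follow include/uapi/linux/raid/md_p.h, but are restated here as assumptions):

#include <stdio.h>

#define MD_DISK_ACTIVE  1
#define MD_DISK_SYNC    2

/* mirrors the condition that clears spare_disk in super_90_load() */
static int is_spare(unsigned int state)
{
        return !(state & ((1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)));
}

int main(void)
{
        printf("state=0x0 spare=%d\n", is_spare(0x0)); /* 1: fresh spare */
        printf("state=0x6 spare=%d\n", is_spare(0x6)); /* 0: active+sync */
        return 0;
}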
@@ -1133,7 +1293,8 @@
         }
         ev1 = md_event(sb);
         ev2 = md_event(refsb);
-        if (ev1 > ev2)
+
+        if (!spare_disk && ev1 > ev2)
                 ret = 1;
         else
                 ret = 0;
@@ -1143,8 +1304,7 @@
          * (not needed for Linear and RAID0 as metadata doesn't
          * record this size)
          */
-        if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
-            sb->level >= 1)
+        if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
                 rdev->sectors = (sector_t)(2ULL << 32) - 2;
 
         if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
@@ -1444,8 +1604,7 @@
         /* Limit to 4TB as metadata cannot record more than that.
          * 4TB == 2^32 KB, or 2*2^32 sectors.
          */
-        if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
-            rdev->mddev->level >= 1)
+        if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
                 num_sectors = (sector_t)(2ULL << 32) - 2;
         do {
                 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
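Both hunks drop the CONFIG_LBDAF guard and apply the clamp unconditionally: v0.90 metadata records a member's size as a 32-bit count of 1K blocks, so anything at or past 2^32 KB is cut back to just under 4 TB. The arithmetic as a standalone sketch:

#include <stdio.h>

int main(void)
{
        /* the same (2ULL << 32) - 2 expression, in 512-byte sectors */
        unsigned long long cap_sectors = (2ULL << 32) - 2;
        unsigned long long cap_kb = cap_sectors / 2;

        /* 8589934590 sectors = 4294967295 KB, i.e. 4 TiB minus 1 KiB */
        printf("cap = %llu sectors (%llu KB)\n", cap_sectors, cap_kb);
        return 0;
}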
@@ -1495,6 +1654,7 @@
         sector_t sectors;
         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
         int bmask;
+        bool spare_disk = true;
 
         /*
          * Calculate the position of the superblock in 512byte sectors.
@@ -1589,7 +1749,7 @@
                  */
                 s32 offset;
                 sector_t bb_sector;
-                u64 *bbp;
+                __le64 *bbp;
                 int i;
                 int sectors = le16_to_cpu(sb->bblog_size);
                 if (sectors > (PAGE_SIZE / 512))
@@ -1601,7 +1761,7 @@
                 if (!sync_page_io(rdev, bb_sector, sectors << 9,
                                   rdev->bb_page, REQ_OP_READ, 0, true))
                         return -EIO;
-                bbp = (u64 *)page_address(rdev->bb_page);
+                bbp = (__le64 *)page_address(rdev->bb_page);
                 rdev->badblocks.shift = sb->bblog_shift;
                 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
                         u64 bb = le64_to_cpu(*bbp);
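The __le64 type fix above concerns the on-disk bad-block log: one little-endian 64-bit word per entry, which the loop here converts with le64_to_cpu() before unpacking. A hypothetical pack/unpack sketch; the 54-bit sector / 10-bit length split is restated from super_1_load() as an assumption and is not part of this hunk, and endian conversion is elided:

#include <stdio.h>
#include <stdint.h>

/* assumed layout: upper 54 bits = start sector, low 10 bits = length */
static uint64_t bb_make(uint64_t sector, unsigned int len)
{
        return (sector << 10) | (len & 0x3ff);
}

int main(void)
{
        uint64_t bb = bb_make(123456789, 8);

        printf("sector=%llu len=%u\n",
               (unsigned long long)(bb >> 10), (unsigned int)(bb & 0x3ff));
        return 0;
}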
@@ -1628,8 +1788,19 @@
             sb->level != 0)
                 return -EINVAL;
 
+        /* not spare disk, or LEVEL_MULTIPATH */
+        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
+            (rdev->desc_nr >= 0 &&
+             rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
+             (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
+              le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
+                spare_disk = false;
+
         if (!refdev) {
-                ret = 1;
+                if (!spare_disk)
+                        ret = 1;
+                else
+                        ret = 0;
         } else {
                 __u64 ev1, ev2;
                 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
@@ -1646,7 +1817,7 @@
                 ev1 = le64_to_cpu(sb->events);
                 ev2 = le64_to_cpu(refsb->events);
 
-                if (ev1 > ev2)
+                if (!spare_disk && ev1 > ev2)
                         ret = 1;
                 else
                         ret = 0;
@@ -1928,7 +2099,7 @@
                         md_error(mddev, rdev);
                 else {
                         struct badblocks *bb = &rdev->badblocks;
-                        u64 *bbp = (u64 *)page_address(rdev->bb_page);
+                        __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
                         u64 *p = bb->page;
                         sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
                         if (bb->changed) {
@@ -2003,6 +2174,24 @@
         sb->sb_csum = calc_sb_1_csum(sb);
 }
 
+static sector_t super_1_choose_bm_space(sector_t dev_size)
+{
+        sector_t bm_space;
+
+        /* if the device is bigger than 8Gig, save 64k for bitmap
+         * usage, if bigger than 200Gig, save 128k
+         */
+        if (dev_size < 64*2)
+                bm_space = 0;
+        else if (dev_size - 64*2 >= 200*1024*1024*2)
+                bm_space = 128*2;
+        else if (dev_size - 4*2 > 8*1024*1024*2)
+                bm_space = 64*2;
+        else
+                bm_space = 4*2;
+        return bm_space;
+}
+
 static unsigned long long
 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 {
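super_1_choose_bm_space() works entirely in 512-byte sectors, so 64*2 is 64 KiB and 8*1024*1024*2 is 8 GiB. A standalone restatement with a few sample device sizes:

#include <stdio.h>
#include <stdint.h>

/* same thresholds as super_1_choose_bm_space(), units of 512B sectors */
static uint64_t choose_bm_space(uint64_t dev_size)
{
        if (dev_size < 64 * 2)
                return 0;
        if (dev_size - 64 * 2 >= 200ULL * 1024 * 1024 * 2)
                return 128 * 2;         /* roughly >= 200 GiB: 128 KiB */
        if (dev_size - 4 * 2 > 8ULL * 1024 * 1024 * 2)
                return 64 * 2;          /* roughly > 8 GiB: 64 KiB */
        return 4 * 2;                   /* small device: 4 KiB */
}

int main(void)
{
        uint64_t gib = 2ULL * 1024 * 1024;      /* sectors per GiB */
        uint64_t tests[] = { 100, 4 * gib, 16 * gib, 400 * gib };
        int i;

        for (i = 0; i < 4; i++)
                printf("%llu sectors -> bm_space %llu sectors\n",
                       (unsigned long long)tests[i],
                       (unsigned long long)choose_bm_space(tests[i]));
        return 0;
}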
@@ -2023,10 +2212,20 @@
                 return 0;
         } else {
                 /* minor version 0; superblock after data */
-                sector_t sb_start;
-                sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
+                sector_t sb_start, bm_space;
+                sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
+
+                /* 8K is for superblock */
+                sb_start = dev_size - 8*2;
                 sb_start &= ~(sector_t)(4*2 - 1);
-                max_sectors = rdev->sectors + sb_start - rdev->sb_start;
+
+                bm_space = super_1_choose_bm_space(dev_size);
+
+                /* Space that can be used to store date needs to decrease
+                 * superblock bitmap space and bad block space(4K)
+                 */
+                max_sectors = sb_start - bm_space - 4*2;
+
                 if (!num_sectors || num_sectors > max_sectors)
                         num_sectors = max_sectors;
                 rdev->sb_start = sb_start;
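For a v1.0 member (superblock at the end of the device), the new computation reserves, from the device end downward: 8 KiB for the superblock (aligned down to a 4 KiB boundary), the bitmap space chosen above, and 4 KiB of bad-block log. A sketch of the same arithmetic for an assumed 16 GiB member, reusing the choose_bm_space() sketch from before:

#include <stdio.h>
#include <stdint.h>

static uint64_t choose_bm_space(uint64_t dev_size)
{
        if (dev_size < 64 * 2)
                return 0;
        if (dev_size - 64 * 2 >= 200ULL * 1024 * 1024 * 2)
                return 128 * 2;
        if (dev_size - 4 * 2 > 8ULL * 1024 * 1024 * 2)
                return 64 * 2;
        return 4 * 2;
}

int main(void)
{
        uint64_t dev_size = 16ULL * 2 * 1024 * 1024;    /* 16 GiB, in sectors */
        uint64_t sb_start = dev_size - 8 * 2;           /* 8K for superblock */
        uint64_t max_sectors;

        sb_start &= ~(uint64_t)(4 * 2 - 1);             /* align down to 4K */

        /* data = everything below the superblock, minus bitmap and 4K BBL */
        max_sectors = sb_start - choose_bm_space(dev_size) - 4 * 2;

        printf("sb_start=%llu max_sectors=%llu\n",
               (unsigned long long)sb_start, (unsigned long long)max_sectors);
        return 0;
}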
@@ -2124,8 +2323,7 @@
                     test_bit(Journal, &rdev2->flags) ||
                     rdev2->raid_disk == -1)
                         continue;
-                if (rdev->bdev->bd_contains ==
-                    rdev2->bdev->bd_contains) {
+                if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
                         rcu_read_unlock();
                         return 1;
                 }
@@ -2193,14 +2391,12 @@
  */
 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
 {
-        struct blk_integrity *bi_rdev;
         struct blk_integrity *bi_mddev;
         char name[BDEVNAME_SIZE];
 
         if (!mddev->gendisk)
                 return 0;
 
-        bi_rdev = bdev_get_integrity(rdev->bdev);
         bi_mddev = blk_get_integrity(mddev->gendisk);
 
         if (!bi_mddev) /* nothing to do */
@@ -2276,13 +2472,20 @@
         rdev->mddev = mddev;
         pr_debug("md: bind<%s>\n", b);
 
+        if (mddev->raid_disks)
+                mddev_create_serial_pool(mddev, rdev, false);
+
         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
                 goto fail;
 
         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
-        if (sysfs_create_link(&rdev->kobj, ko, "block"))
-                /* failure here is OK */;
+        /* failure here is OK */
+        err = sysfs_create_link(&rdev->kobj, ko, "block");
         rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
+        rdev->sysfs_unack_badblocks =
+                sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
+        rdev->sysfs_badblocks =
+                sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
 
         list_add_rcu(&rdev->same_set, &mddev->disks);
         bd_link_disk_holder(rdev->bdev, mddev->gendisk);
@@ -2298,7 +2501,7 @@
         return err;
 }
 
-static void md_delayed_delete(struct work_struct *ws)
+static void rdev_delayed_delete(struct work_struct *ws)
 {
         struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
         kobject_del(&rdev->kobj);
@@ -2312,19 +2515,24 @@
         bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
         list_del_rcu(&rdev->same_set);
         pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
+        mddev_destroy_serial_pool(rdev->mddev, rdev, false);
         rdev->mddev = NULL;
         sysfs_remove_link(&rdev->kobj, "block");
         sysfs_put(rdev->sysfs_state);
+        sysfs_put(rdev->sysfs_unack_badblocks);
+        sysfs_put(rdev->sysfs_badblocks);
         rdev->sysfs_state = NULL;
+        rdev->sysfs_unack_badblocks = NULL;
+        rdev->sysfs_badblocks = NULL;
         rdev->badblocks.count = 0;
         /* We need to delay this, otherwise we can deadlock when
          * writing to 'remove' to "dev/state". We also need
          * to delay it due to rcu usage.
          */
         synchronize_rcu();
-        INIT_WORK(&rdev->del_work, md_delayed_delete);
+        INIT_WORK(&rdev->del_work, rdev_delayed_delete);
         kobject_get(&rdev->kobj);
-        queue_work(md_misc_wq, &rdev->del_work);
+        queue_work(md_rdev_misc_wq, &rdev->del_work);
 }
 
 /*
@@ -2336,12 +2544,12 @@
 {
         int err = 0;
         struct block_device *bdev;
-        char b[BDEVNAME_SIZE];
 
         bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
                                  shared ? (struct md_rdev *)lock_rdev : rdev);
         if (IS_ERR(bdev)) {
-                pr_warn("md: could not open %s.\n", __bdevname(dev, b));
+                pr_warn("md: could not open device unknown-block(%u,%u).\n",
+                        MAJOR(dev), MINOR(dev));
                 return PTR_ERR(bdev);
         }
         rdev->bdev = bdev;
@@ -2443,14 +2651,16 @@
 
 static bool does_sb_need_changing(struct mddev *mddev)
 {
-        struct md_rdev *rdev;
+        struct md_rdev *rdev = NULL, *iter;
         struct mdp_superblock_1 *sb;
         int role;
 
         /* Find a good rdev */
-        rdev_for_each(rdev, mddev)
-                if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
+        rdev_for_each(iter, mddev)
+                if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
+                        rdev = iter;
                         break;
+                }
 
         /* No good device found. */
         if (!rdev)
@@ -2660,7 +2870,7 @@
                 goto repeat;
         wake_up(&mddev->sb_wait);
         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
-                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+                sysfs_notify_dirent_safe(mddev->sysfs_completed);
 
         rdev_for_each(rdev, mddev) {
                 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
@@ -2793,7 +3003,11 @@
  *  -write_error - clears WriteErrorSeen
  *  {,-}failfast - set/clear FailFast
  */
+
+        struct mddev *mddev = rdev->mddev;
         int err = -EINVAL;
+        bool need_update_sb = false;
+
         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
                 md_error(rdev->mddev, rdev);
                 if (test_bit(Faulty, &rdev->flags))
@@ -2808,7 +3022,6 @@
                 if (rdev->raid_disk >= 0)
                         err = -EBUSY;
                 else {
-                        struct mddev *mddev = rdev->mddev;
                         err = 0;
                         if (mddev_is_clustered(mddev))
                                 err = md_cluster_ops->remove_disk(mddev, rdev);
@@ -2824,9 +3037,13 @@
                 }
         } else if (cmd_match(buf, "writemostly")) {
                 set_bit(WriteMostly, &rdev->flags);
+                mddev_create_serial_pool(rdev->mddev, rdev, false);
+                need_update_sb = true;
                 err = 0;
         } else if (cmd_match(buf, "-writemostly")) {
+                mddev_destroy_serial_pool(rdev->mddev, rdev, false);
                 clear_bit(WriteMostly, &rdev->flags);
+                need_update_sb = true;
                 err = 0;
         } else if (cmd_match(buf, "blocked")) {
                 set_bit(Blocked, &rdev->flags);
@@ -2852,9 +3069,11 @@
                 err = 0;
         } else if (cmd_match(buf, "failfast")) {
                 set_bit(FailFast, &rdev->flags);
+                need_update_sb = true;
                 err = 0;
         } else if (cmd_match(buf, "-failfast")) {
                 clear_bit(FailFast, &rdev->flags);
+                need_update_sb = true;
                 err = 0;
         } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
                    !test_bit(Journal, &rdev->flags)) {
@@ -2933,6 +3152,8 @@
                 clear_bit(ExternalBbl, &rdev->flags);
                 err = 0;
         }
+        if (need_update_sb)
+                md_update_sb(mddev, 1);
         if (!err)
                 sysfs_notify_dirent_safe(rdev->sysfs_state);
         return err ? err : len;
@@ -2986,6 +3207,9 @@
                 err = kstrtouint(buf, 10, (unsigned int *)&slot);
                 if (err < 0)
                         return err;
+                if (slot < 0)
+                        /* overflow */
+                        return -ENOSPC;
         }
         if (rdev->mddev->pers && slot == -1) {
                 /* Setting 'slot' on an active array requires also
@@ -3032,15 +3256,14 @@
                 rdev->saved_raid_disk = -1;
                 clear_bit(In_sync, &rdev->flags);
                 clear_bit(Bitmap_sync, &rdev->flags);
-                err = rdev->mddev->pers->
-                        hot_add_disk(rdev->mddev, rdev);
+                err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
                 if (err) {
                         rdev->raid_disk = -1;
                         return err;
                 } else
                         sysfs_notify_dirent_safe(rdev->sysfs_state);
-                if (sysfs_link_rdev(rdev->mddev, rdev))
-                        /* failure here is OK */;
+                /* failure here is OK */;
+                sysfs_link_rdev(rdev->mddev, rdev);
                 /* don't wakeup anyone, leave that to userspace. */
         } else {
                 if (slot >= rdev->mddev->raid_disks &&
@@ -3422,7 +3645,7 @@
         if (!entry->show)
                 return -EIO;
         if (!rdev->mddev)
-                return -EBUSY;
+                return -ENODEV;
         return entry->show(rdev, page);
 }
 
@@ -3439,10 +3662,10 @@
                 return -EIO;
         if (!capable(CAP_SYS_ADMIN))
                 return -EACCES;
-        rv = mddev ? mddev_lock(mddev): -EBUSY;
+        rv = mddev ? mddev_lock(mddev) : -ENODEV;
         if (!rv) {
                 if (rdev->mddev == NULL)
-                        rv = -EBUSY;
+                        rv = -ENODEV;
                 else
                         rv = entry->store(rdev, page, length);
                 mddev_unlock(mddev);
@@ -3563,7 +3786,7 @@
  * Check a full RAID array for plausibility
  */
 
-static void analyze_sbs(struct mddev *mddev)
+static int analyze_sbs(struct mddev *mddev)
 {
         int i;
         struct md_rdev *rdev, *freshest, *tmp;
@@ -3583,6 +3806,12 @@
                                 bdevname(rdev->bdev,b));
                         md_kick_rdev_from_array(rdev);
                 }
+
+        /* Cannot find a valid fresh disk */
+        if (!freshest) {
+                pr_warn("md: cannot find a valid disk\n");
+                return -EINVAL;
+        }
 
         super_types[mddev->major_version].
                 validate_super(mddev, freshest);
@@ -3618,6 +3847,8 @@
                         clear_bit(In_sync, &rdev->flags);
                 }
         }
+
+        return 0;
 }
 
 /* Read a fixed-point number.
@@ -3652,19 +3883,16 @@
                 return -EINVAL;
         if (decimals < 0)
                 decimals = 0;
-        while (decimals < scale) {
-                result *= 10;
-                decimals ++;
-        }
-        *res = result;
+        *res = result * int_pow(10, scale - decimals);
         return 0;
 }
 
 static ssize_t
 safe_delay_show(struct mddev *mddev, char *page)
 {
-        int msec = (mddev->safemode_delay*1000)/HZ;
-        return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
+        unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
+
+        return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
 }
 static ssize_t
 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
@@ -3676,7 +3904,7 @@
                 return -EINVAL;
         }
 
-        if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
+        if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
                 return -EINVAL;
         if (msec == 0)
                 mddev->safemode_delay = 0;
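strict_strtoul_scaled() parses a decimal such as "0.2" into an integer scaled by 10^scale (milliseconds for scale = 3); the old multiply loop becomes a single int_pow(), and safe_delay_store() now rejects inputs whose msec * HZ product would overflow. A minimal userspace model under those assumptions (HZ = 250 is illustrative, and this parser is a simplification of the kernel's):

#include <stdio.h>
#include <limits.h>

static unsigned long int_pow_u(unsigned long base, unsigned int exp)
{
        unsigned long r = 1;

        while (exp--)
                r *= base;
        return r;
}

/* parse "seconds[.fraction]" into an integer scaled by 10^scale */
static int strtoul_scaled(const char *s, unsigned long *res, int scale)
{
        unsigned long result = 0;
        int decimals = -1;      /* digits seen after the dot, -1 = no dot */
        char c;

        while ((c = *s++)) {
                if (c == '.' && decimals < 0) {
                        decimals = 0;
                        continue;
                }
                if (c < '0' || c > '9')
                        return -1;
                if (decimals < scale) {
                        result = result * 10 + (c - '0');
                        if (decimals >= 0)
                                decimals++;
                }
        }
        if (decimals < 0)
                decimals = 0;
        *res = result * int_pow_u(10, scale - decimals);
        return 0;
}

int main(void)
{
        const unsigned long HZ = 250;   /* assumed tick rate */
        unsigned long msec;

        if (strtoul_scaled("0.2", &msec, 3) < 0 || msec > UINT_MAX / HZ) {
                puts("rejected");
                return 1;
        }
        /* "0.2" -> 200 msec -> 50 jiffies at HZ=250 */
        printf("msec=%lu -> %lu jiffies at HZ=%lu\n", msec, msec * HZ / 1000, HZ);
        return 0;
}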
@@ -3861,6 +4089,8 @@
                         pr_warn("md: cannot register extra attributes for %s\n",
                                 mdname(mddev));
                 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+                mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
+                mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
         }
         if (oldpers->sync_request != NULL &&
             pers->sync_request == NULL) {
@@ -3908,7 +4138,7 @@
         mddev_resume(mddev);
         if (!mddev->thread)
                 md_update_sb(mddev, 1);
-        sysfs_notify(&mddev->kobj, NULL, "level");
+        sysfs_notify_dirent_safe(mddev->sysfs_level);
         md_new_event(mddev);
         rv = len;
 out_unlock:
@@ -4019,6 +4249,14 @@
 }
 static struct md_sysfs_entry md_raid_disks =
 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
+
+static ssize_t
+uuid_show(struct mddev *mddev, char *page)
+{
+        return sprintf(page, "%pU\n", mddev->uuid);
+}
+static struct md_sysfs_entry md_uuid =
+__ATTR(uuid, S_IRUGO, uuid_show, NULL);
 
 static ssize_t
 chunk_size_show(struct mddev *mddev, char *page)
@@ -4143,12 +4381,17 @@
  *     active-idle
  *         like active, but no writes have been seen for a while (100msec).
  *
+ *     broken
+ *         RAID0/LINEAR-only: same as clean, but array is missing a member.
+ *         It's useful because RAID0/LINEAR mounted-arrays aren't stopped
+ *         when a member is gone, so this state will at least alert the
+ *         user that something is wrong.
  */
 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
-                   write_pending, active_idle, bad_word};
+                   write_pending, active_idle, broken, bad_word};
 static char *array_states[] = {
         "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
-        "write-pending", "active-idle", NULL };
+        "write-pending", "active-idle", "broken", NULL };
 
 static int match_word(const char *word, char **list)
 {
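The "broken" state has to be added to the enum and the string table in lockstep, because the store path maps a written string to its index in array_states[]. A standalone model of that pairing (the match_word() body below is a simplified strcmp-based stand-in for the driver's cmd_match-based version):

#include <stdio.h>
#include <string.h>

enum array_state { clear, inactive, suspended, readonly, read_auto,
                   clean, active, write_pending, active_idle, broken,
                   bad_word };

static const char *array_states[] = {
        "clear", "inactive", "suspended", "readonly", "read-auto", "clean",
        "active", "write-pending", "active-idle", "broken", NULL };

/* returns the index of the matching string; the NULL slot maps to bad_word */
static int match_word(const char *word, const char **list)
{
        int n;

        for (n = 0; list[n]; n++)
                if (!strcmp(word, list[n]))
                        break;
        return n;
}

int main(void)
{
        printf("broken -> %d (enum broken = %d)\n",
               match_word("broken", array_states), (int)broken);
        return 0;
}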
@@ -4164,7 +4407,7 @@
 {
         enum array_state st = inactive;
 
-        if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags))
+        if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
                 switch(mddev->ro) {
                 case 1:
                         st = readonly;
@@ -4184,7 +4427,10 @@
                                 st = active;
                         spin_unlock(&mddev->lock);
                 }
-        else {
+
+                if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
+                        st = broken;
+        } else {
                 if (list_empty(&mddev->disks) &&
                     mddev->raid_disks == 0 &&
                     mddev->dev_sectors == 0)
@@ -4197,7 +4443,6 @@
 
 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
-static int do_md_run(struct mddev *mddev);
 static int restart_array(struct mddev *mddev);
 
 static ssize_t
@@ -4298,6 +4543,7 @@
                 break;
         case write_pending:
         case active_idle:
+        case broken:
                 /* these cannot be set */
                 break;
         }
@@ -4328,6 +4574,8 @@
         rv = kstrtouint(buf, 10, &n);
         if (rv < 0)
                 return rv;
+        if (n > INT_MAX)
+                return -EINVAL;
         atomic_set(&mddev->max_corr_read_errors, n);
         return len;
 }
@@ -4340,6 +4588,20 @@
 null_show(struct mddev *mddev, char *page)
 {
         return -EINVAL;
+}
+
+/* need to ensure rdev_delayed_delete() has completed */
+static void flush_rdev_wq(struct mddev *mddev)
+{
+        struct md_rdev *rdev;
+
+        rcu_read_lock();
+        rdev_for_each_rcu(rdev, mddev)
+                if (work_pending(&rdev->del_work)) {
+                        flush_workqueue(md_rdev_misc_wq);
+                        break;
+                }
+        rcu_read_unlock();
 }
 
 static ssize_t
@@ -4369,8 +4631,7 @@
             minor != MINOR(dev))
                 return -EOVERFLOW;
 
-        flush_workqueue(md_misc_wq);
-
+        flush_rdev_wq(mddev);
         err = mddev_lock(mddev);
         if (err)
                 return err;
@@ -4608,7 +4869,8 @@
                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
                     mddev_lock(mddev) == 0) {
-                        flush_workqueue(md_misc_wq);
+                        if (work_pending(&mddev->del_work))
+                                flush_workqueue(md_misc_wq);
                         if (mddev->sync_thread) {
                                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                                 md_reap_sync_thread(mddev);
@@ -4628,17 +4890,27 @@
                         return -EINVAL;
                 err = mddev_lock(mddev);
                 if (!err) {
-                        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+                        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
                                 err = -EBUSY;
-                        else {
+                        } else if (mddev->reshape_position == MaxSector ||
+                                   mddev->pers->check_reshape == NULL ||
+                                   mddev->pers->check_reshape(mddev)) {
                                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                                 err = mddev->pers->start_reshape(mddev);
+                        } else {
+                                /*
+                                 * If reshape is still in progress, and
+                                 * md_check_recovery() can continue to reshape,
+                                 * don't restart reshape because data can be
+                                 * corrupted for raid456.
+                                 */
+                                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                         }
                         mddev_unlock(mddev);
                 }
                 if (err)
                         return err;
-                sysfs_notify(&mddev->kobj, NULL, "degraded");
+                sysfs_notify_dirent_safe(mddev->sysfs_degraded);
         } else {
                 if (cmd_match(page, "check"))
                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -5113,7 +5385,7 @@
                 mddev->array_sectors = sectors;
                 if (mddev->pers) {
                         set_capacity(mddev->gendisk, mddev->array_sectors);
-                        revalidate_disk(mddev->gendisk);
+                        revalidate_disk_size(mddev->gendisk, true);
                 }
         }
         mddev_unlock(mddev);
@@ -5170,10 +5442,90 @@
 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
        consistency_policy_store);
 
+static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
+{
+        return sprintf(page, "%d\n", mddev->fail_last_dev);
+}
+
+/*
+ * Setting fail_last_dev to true to allow last device to be forcibly removed
+ * from RAID1/RAID10.
+ */
+static ssize_t
+fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
+{
+        int ret;
+        bool value;
+
+        ret = kstrtobool(buf, &value);
+        if (ret)
+                return ret;
+
+        if (value != mddev->fail_last_dev)
+                mddev->fail_last_dev = value;
+
+        return len;
+}
+static struct md_sysfs_entry md_fail_last_dev =
+__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
+       fail_last_dev_store);
+
+static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
+{
+        if (mddev->pers == NULL || (mddev->pers->level != 1))
+                return sprintf(page, "n/a\n");
+        else
+                return sprintf(page, "%d\n", mddev->serialize_policy);
+}
+
+/*
+ * Setting serialize_policy to true to enforce write IO is not reordered
+ * for raid1.
+ */
+static ssize_t
+serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
+{
+        int err;
+        bool value;
+
+        err = kstrtobool(buf, &value);
+        if (err)
+                return err;
+
+        if (value == mddev->serialize_policy)
+                return len;
+
+        err = mddev_lock(mddev);
+        if (err)
+                return err;
+        if (mddev->pers == NULL || (mddev->pers->level != 1)) {
+                pr_err("md: serialize_policy is only effective for raid1\n");
+                err = -EINVAL;
+                goto unlock;
+        }
+
+        mddev_suspend(mddev);
+        if (value)
+                mddev_create_serial_pool(mddev, NULL, true);
+        else
+                mddev_destroy_serial_pool(mddev, NULL, true);
+        mddev->serialize_policy = value;
+        mddev_resume(mddev);
+unlock:
+        mddev_unlock(mddev);
+        return err ?: len;
+}
+
+static struct md_sysfs_entry md_serialize_policy =
+__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
+       serialize_policy_store);
+
+
 static struct attribute *md_default_attrs[] = {
         &md_level.attr,
         &md_layout.attr,
         &md_raid_disks.attr,
+        &md_uuid.attr,
         &md_chunk_size.attr,
         &md_size.attr,
         &md_resync_start.attr,
@@ -5186,6 +5538,8 @@
         &md_array_size.attr,
         &max_corr_read_errors.attr,
         &md_consistency_policy.attr,
+        &md_fail_last_dev.attr,
+        &md_serialize_policy.attr,
         NULL,
 };
 
@@ -5263,6 +5617,8 @@
 
         if (mddev->sysfs_state)
                 sysfs_put(mddev->sysfs_state);
+        if (mddev->sysfs_level)
+                sysfs_put(mddev->sysfs_level);
 
         if (mddev->gendisk)
                 del_gendisk(mddev->gendisk);
@@ -5304,7 +5660,8 @@
 {
         if (mddev->writes_pending.percpu_count_ptr)
                 return 0;
-        if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+        if (percpu_ref_init(&mddev->writes_pending, no_op,
+                            PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
                 return -ENOMEM;
         /* We want to start with the refcount at zero */
         percpu_ref_put(&mddev->writes_pending);
@@ -5342,6 +5699,7 @@
          * completely removed (mddev_delayed_delete).
          */
         flush_workqueue(md_misc_wq);
+        flush_workqueue(md_rdev_misc_wq);
 
         mutex_lock(&disks_mutex);
         error = -EEXIST;
@@ -5369,12 +5727,10 @@
                 mddev->hold_active = UNTIL_STOP;
 
         error = -ENOMEM;
-        mddev->queue = blk_alloc_queue(GFP_KERNEL);
+        mddev->queue = blk_alloc_queue(NUMA_NO_NODE);
         if (!mddev->queue)
                 goto abort;
-        mddev->queue->queuedata = mddev;
 
-        blk_queue_make_request(mddev->queue, md_make_request);
         blk_set_stacking_limits(&mddev->queue->limits);
 
         disk = alloc_disk(1 << shift);
@@ -5400,6 +5756,7 @@
          * remove it now.
          */
         disk->flags |= GENHD_FL_EXT_DEVT;
+        disk->events |= DISK_EVENT_MEDIA_CHANGE;
         mddev->gendisk = disk;
         add_disk(disk);
 
@@ -5420,6 +5777,7 @@
         if (!error && mddev->kobj.sd) {
                 kobject_uevent(&mddev->kobj, KOBJ_ADD);
                 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
+                mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
         }
         mddev_put(mddev);
         return error;
@@ -5496,7 +5854,9 @@
         if (!mddev->raid_disks) {
                 if (!mddev->persistent)
                         return -EINVAL;
-                analyze_sbs(mddev);
+                err = analyze_sbs(mddev);
+                if (err)
+                        return -EINVAL;
         }
 
         if (mddev->level != LEVEL_NONE)
@@ -5601,8 +5961,8 @@
         rdev_for_each(rdev, mddev)
                 rdev_for_each(rdev2, mddev) {
                         if (rdev < rdev2 &&
-                            rdev->bdev->bd_contains ==
-                            rdev2->bdev->bd_contains) {
+                            rdev->bdev->bd_disk ==
+                            rdev2->bdev->bd_disk) {
                                 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
                                         mdname(mddev),
                                         bdevname(rdev->bdev,b),
@@ -5649,15 +6009,28 @@
                 mddev->bitmap = bitmap;
 
         }
-        if (err) {
-                mddev_detach(mddev);
-                if (mddev->private)
-                        pers->free(mddev, mddev->private);
-                mddev->private = NULL;
-                module_put(pers->owner);
-                md_bitmap_destroy(mddev);
-                goto abort;
+        if (err)
+                goto bitmap_abort;
+
+        if (mddev->bitmap_info.max_write_behind > 0) {
+                bool create_pool = false;
+
+                rdev_for_each(rdev, mddev) {
+                        if (test_bit(WriteMostly, &rdev->flags) &&
+                            rdev_init_serial(rdev))
+                                create_pool = true;
+                }
+                if (create_pool && mddev->serial_info_pool == NULL) {
+                        mddev->serial_info_pool =
+                                mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
+                                                    sizeof(struct serial_info));
+                        if (!mddev->serial_info_pool) {
+                                err = -ENOMEM;
+                                goto bitmap_abort;
+                        }
+                }
         }
+
         if (mddev->queue) {
                 bool nonrot = true;
 
@@ -5674,8 +6047,6 @@
                         blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
                 else
                         blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
-                mddev->queue->backing_dev_info->congested_data = mddev;
-                mddev->queue->backing_dev_info->congested_fn = md_congested;
         }
         if (pers->sync_request) {
                 if (mddev->kobj.sd &&
@@ -5683,6 +6054,8 @@
                         pr_warn("md: cannot register extra attributes for %s\n",
                                 mdname(mddev));
                 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
+                mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
+                mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
         } else if (mddev->ro == 2) /* auto-readonly not meaningful */
                 mddev->ro = 0;
 
@@ -5692,7 +6065,7 @@
         if (mddev_is_clustered(mddev))
                 mddev->safemode_delay = 0;
         else
-                mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
+                mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
         mddev->in_sync = 1;
         smp_wmb();
         spin_lock(&mddev->lock);
@@ -5700,8 +6073,7 @@
         spin_unlock(&mddev->lock);
         rdev_for_each(rdev, mddev)
                 if (rdev->raid_disk >= 0)
-                        if (sysfs_link_rdev(mddev, rdev))
-                                /* failure here is OK */;
+                        sysfs_link_rdev(mddev, rdev); /* failure here is OK */
 
         if (mddev->degraded && !mddev->ro)
                 /* This ensures that recovering status is reported immediately
@@ -5716,6 +6088,13 @@
         md_new_event(mddev);
         return 0;
 
+bitmap_abort:
+        mddev_detach(mddev);
+        if (mddev->private)
+                pers->free(mddev, mddev->private);
+        mddev->private = NULL;
+        module_put(pers->owner);
+        md_bitmap_destroy(mddev);
 abort:
         bioset_exit(&mddev->bio_set);
         bioset_exit(&mddev->sync_set);
@@ -5723,7 +6102,7 @@
 }
 EXPORT_SYMBOL_GPL(md_run);
 
-static int do_md_run(struct mddev *mddev)
+int do_md_run(struct mddev *mddev)
 {
         int err;
 
@@ -5747,13 +6126,13 @@
         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
         set_capacity(mddev->gendisk, mddev->array_sectors);
-        revalidate_disk(mddev->gendisk);
+        revalidate_disk_size(mddev->gendisk, true);
         clear_bit(MD_NOT_READY, &mddev->flags);
         mddev->changed = 1;
         kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
         sysfs_notify_dirent_safe(mddev->sysfs_state);
         sysfs_notify_dirent_safe(mddev->sysfs_action);
-        sysfs_notify(&mddev->kobj, NULL, "degraded");
+        sysfs_notify_dirent_safe(mddev->sysfs_degraded);
 out:
         clear_bit(MD_NOT_READY, &mddev->flags);
         return err;
@@ -5868,7 +6247,8 @@
 static void __md_stop_writes(struct mddev *mddev)
 {
         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-        flush_workqueue(md_misc_wq);
+        if (work_pending(&mddev->del_work))
+                flush_workqueue(md_misc_wq);
         if (mddev->sync_thread) {
                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                 md_reap_sync_thread(mddev);
@@ -5890,6 +6270,9 @@
                 mddev->in_sync = 1;
                 md_update_sb(mddev, 1);
         }
+        /* disable policy to guarantee rdevs free resources for serialization */
+        mddev->serialize_policy = 0;
+        mddev_destroy_serial_pool(mddev, NULL, true);
 }
 
 void md_stop_writes(struct mddev *mddev)
@@ -5918,7 +6301,8 @@
         md_bitmap_destroy(mddev);
         mddev_detach(mddev);
         /* Ensure ->event_work is done */
-        flush_workqueue(md_misc_wq);
+        if (mddev->event_work.func)
+                flush_workqueue(md_misc_wq);
         spin_lock(&mddev->lock);
         mddev->pers = NULL;
         spin_unlock(&mddev->lock);
@@ -5932,9 +6316,12 @@
 
 void md_stop(struct mddev *mddev)
 {
+        lockdep_assert_held(&mddev->reconfig_mutex);
+
         /* stop the array and free an attached data structures.
          * This is called from dm-raid
          */
+        __md_stop_writes(mddev);
         __md_stop(mddev);
         bioset_exit(&mddev->bio_set);
         bioset_exit(&mddev->sync_set);
@@ -6049,7 +6436,6 @@
 
         __md_stop_writes(mddev);
         __md_stop(mddev);
-        mddev->queue->backing_dev_info->congested_fn = NULL;
 
         /* tell userspace to handle 'inactive' */
         sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -6061,7 +6447,7 @@
         set_capacity(disk, 0);
         mutex_unlock(&mddev->open_mutex);
         mddev->changed = 1;
-        revalidate_disk(disk);
+        revalidate_disk_size(disk, true);
 
         if (mddev->ro)
                 mddev->ro = 0;
@@ -6352,7 +6738,7 @@
         return 0;
 }
 
-static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
+int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
 {
         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
         struct md_rdev *rdev;
@@ -6398,7 +6784,7 @@
         }
 
         /*
-         * add_new_disk can be used once the array is assembled
+         * md_add_new_disk can be used once the array is assembled
          * to add "hot spares". They must already have a superblock
          * written
          */
@@ -6511,7 +6897,7 @@
                 return err;
         }
 
-        /* otherwise, add_new_disk is only allowed
+        /* otherwise, md_add_new_disk is only allowed
          * for major_version==0 superblocks
          */
         if (mddev->major_version != 0) {
@@ -6758,7 +7144,7 @@
 }
 
 /*
- * set_array_info is used two different ways
+ * md_set_array_info is used two different ways
  * The original usage is when creating a new array.
  * In this usage, raid_disks is > 0 and it together with
  *  level, size, not_persistent,layout,chunksize determine the
@@ -6770,9 +7156,8 @@
  * The minor and patch _version numbers are also kept incase the
  * super_block handler wishes to interpret them.
  */
-static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
+int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
 {
-
         if (info->raid_disks == 0) {
                 /* just setting version number for superblock loading */
                 if (info->major_version < 0 ||
@@ -6894,7 +7279,7 @@
                         md_cluster_ops->update_size(mddev, old_dev_sectors);
                 else if (mddev->queue) {
                         set_capacity(mddev->gendisk, mddev->array_sectors);
-                        revalidate_disk(mddev->gendisk);
+                        revalidate_disk_size(mddev->gendisk, true);
                 }
         }
         return rv;
@@ -7061,6 +7446,8 @@
 
                 mddev->bitmap_info.nodes = 0;
                 md_cluster_ops->leave(mddev);
+                module_put(md_cluster_mod);
+                mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
         }
         mddev_suspend(mddev);
         md_bitmap_destroy(mddev);
@@ -7121,7 +7508,6 @@
         case GET_DISK_INFO:
         case HOT_ADD_DISK:
         case HOT_REMOVE_DISK:
-        case RAID_AUTORUN:
         case RAID_VERSION:
         case RESTART_ARRAY_RW:
         case RUN_ARRAY:
@@ -7167,13 +7553,6 @@
         case RAID_VERSION:
                 err = get_version(argp);
                 goto out;
-
-#ifndef MODULE
-        case RAID_AUTORUN:
-                err = 0;
-                autostart_arrays(arg);
-                goto out;
-#endif
         default:;
         }
 
@@ -7214,9 +7593,8 @@
 
         }
 
-        if (cmd == ADD_NEW_DISK)
-                /* need to ensure md_delayed_delete() has completed */
-                flush_workqueue(md_misc_wq);
+        if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
+                flush_rdev_wq(mddev);
 
         if (cmd == HOT_REMOVE_DISK)
                 /* need to ensure recovery thread has run */
@@ -7276,7 +7654,7 @@
                                 err = -EBUSY;
                                 goto unlock;
                         }
-                        err = set_array_info(mddev, &info);
+                        err = md_set_array_info(mddev, &info);
                         if (err) {
                                 pr_warn("md: couldn't set array info. %d\n", err);
                                 goto unlock;
@@ -7330,7 +7708,7 @@
                                 /* Need to clear read-only for this */
                                 break;
                         else
-                                err = add_new_disk(mddev, &info);
+                                err = md_add_new_disk(mddev, &info);
                         goto unlock;
                 }
                 break;
@@ -7398,7 +7776,7 @@
                         if (copy_from_user(&info, argp, sizeof(info)))
                                 err = -EFAULT;
                         else
-                                err = add_new_disk(mddev, &info);
+                                err = md_add_new_disk(mddev, &info);
                         goto unlock;
                 }
 
@@ -7493,7 +7871,7 @@
         atomic_inc(&mddev->openers);
         mutex_unlock(&mddev->open_mutex);
 
-        check_disk_change(bdev);
+        bdev_check_media_change(bdev);
 out:
         if (err)
                 mddev_put(mddev);
@@ -7509,23 +7887,21 @@
         mddev_put(mddev);
 }
 
-static int md_media_changed(struct gendisk *disk)
+static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
 {
         struct mddev *mddev = disk->private_data;
+        unsigned int ret = 0;
 
-        return mddev->changed;
-}
-
-static int md_revalidate(struct gendisk *disk)
-{
-        struct mddev *mddev = disk->private_data;
-
+        if (mddev->changed)
+                ret = DISK_EVENT_MEDIA_CHANGE;
         mddev->changed = 0;
-        return 0;
+        return ret;
 }
-static const struct block_device_operations md_fops =
+
+const struct block_device_operations md_fops =
 {
         .owner          = THIS_MODULE,
+        .submit_bio     = md_submit_bio,
         .open           = md_open,
         .release        = md_release,
         .ioctl          = md_ioctl,
@@ -7533,8 +7909,7 @@
         .compat_ioctl   = md_compat_ioctl,
 #endif
         .getgeo         = md_getgeo,
-        .media_changed  = md_media_changed,
-        .revalidate_disk= md_revalidate,
+        .check_events   = md_check_events,
 };
 
 static int md_thread(void *arg)
@@ -7618,17 +7993,22 @@
 
 void md_unregister_thread(struct md_thread **threadp)
 {
-        struct md_thread *thread = *threadp;
-        if (!thread)
-                return;
-        pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
-        /* Locking ensures that mddev_unlock does not wake_up a
+        struct md_thread *thread;
+
+        /*
+         * Locking ensures that mddev_unlock does not wake_up a
          * non-existent thread
          */
         spin_lock(&pers_lock);
+        thread = *threadp;
+        if (!thread) {
+                spin_unlock(&pers_lock);
+                return;
+        }
         *threadp = NULL;
         spin_unlock(&pers_lock);
 
+        pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
         kthread_stop(thread->tsk);
         kfree(thread);
 }
@@ -8006,13 +8386,12 @@
         return mask;
 }
 
-static const struct file_operations md_seq_fops = {
-        .owner          = THIS_MODULE,
-        .open           = md_seq_open,
-        .read           = seq_read,
-        .llseek         = seq_lseek,
-        .release        = seq_release,
-        .poll           = mdstat_poll,
+static const struct proc_ops mdstat_proc_ops = {
+        .proc_open      = md_seq_open,
+        .proc_read      = seq_read,
+        .proc_lseek     = seq_lseek,
+        .proc_release   = seq_release,
+        .proc_poll      = mdstat_poll,
 };
 
 int register_md_personality(struct md_personality *p)
@@ -8063,6 +8442,7 @@
 
 int md_setup_cluster(struct mddev *mddev, int nodes)
 {
+        int ret;
         if (!md_cluster_ops)
                 request_module("md-cluster");
         spin_lock(&pers_lock);
@@ -8074,7 +8454,10 @@
         }
         spin_unlock(&pers_lock);
 
-        return md_cluster_ops->join(mddev, nodes);
+        ret = md_cluster_ops->join(mddev, nodes);
+        if (!ret)
+                mddev->safemode_delay = 0;
+        return ret;
 }
 
 void md_cluster_stop(struct mddev *mddev)
@@ -8094,7 +8477,7 @@
         idle = 1;
         rcu_read_lock();
         rdev_for_each_rcu(rdev, mddev) {
-                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+                struct gendisk *disk = rdev->bdev->bd_disk;
                 curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
                               atomic_read(&disk->sync_io);
                 /* sync IO will cause sync_io to increase before the disk_stats
@@ -8273,8 +8656,7 @@
 {
         struct mddev *mddev = thread->mddev;
         struct mddev *mddev2;
-        unsigned int currspeed = 0,
-                window;
+        unsigned int currspeed = 0, window;
         sector_t max_sectors,j, io_sectors, recovery_done;
         unsigned long mark[SYNC_MARKS];
         unsigned long update_time;
@@ -8331,7 +8713,7 @@
  * 0 == not engaged in resync at all
  * 2 == checking that there is no conflict with another sync
  * 1 == like 2, but have yielded to allow conflicting resync to
- *    commense
+ *    commence
 * other == active in resync - this many blocks
 *
 * Before starting a resync we must have set curr_resync to
@@ -8405,9 +8787,17 @@
                 else if (!mddev->bitmap)
                         j = mddev->recovery_cp;
 
-        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
                 max_sectors = mddev->resync_max_sectors;
-        else {
+                /*
+                 * If the original node aborts reshaping then we continue the
+                 * reshaping, so set j again to avoid restart reshape from the
+                 * first beginning
+                 */
+                if (mddev_is_clustered(mddev) &&
+                    mddev->reshape_position != MaxSector)
+                        j = mddev->reshape_position;
+        } else {
                 /* recovery follows the physical size of devices */
                 max_sectors = mddev->dev_sectors;
                 j = MaxSector;
@@ -8454,7 +8844,7 @@
         /*
          * Tune reconstruction:
          */
-        window = 32*(PAGE_SIZE/512);
+        window = 32 * (PAGE_SIZE / 512);
         pr_debug("md: using %dk window, over a total of %lluk.\n",
                  window/2, (unsigned long long)max_sectors/2);
 
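The reformatted window expression is 32 pages expressed in 512-byte sectors; the driver's debug line prints it halved, i.e. in KiB. In concrete numbers (PAGE_SIZE = 4096 assumed for the example):

#include <stdio.h>

int main(void)
{
        unsigned int page_size = 4096;          /* assumed */
        unsigned int window = 32 * (page_size / 512);

        /* 32 * 8 = 256 sectors, printed by the driver as 128k */
        printf("window = %u sectors = %uk\n", window, window / 2);
        return 0;
}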
....@@ -8468,7 +8858,7 @@
84688858 } else
84698859 mddev->curr_resync = 3; /* no longer delayed */
84708860 mddev->curr_resync_completed = j;
8471
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8861
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
84728862 md_new_event(mddev);
84738863 update_time = jiffies;
84748864
....@@ -8496,7 +8886,7 @@
84968886 mddev->recovery_cp = j;
84978887 update_time = jiffies;
84988888 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8499
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8889
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
85008890 }
85018891
85028892 while (j >= mddev->resync_max &&
....@@ -8603,7 +8993,7 @@
86038993 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
86048994 mddev->curr_resync > 3) {
86058995 mddev->curr_resync_completed = mddev->curr_resync;
8606
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8996
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
86078997 }
86088998 mddev->pers->sync_request(mddev, max_sectors, &skipped);
86098999
....@@ -8658,8 +9048,10 @@
86589048 mddev_lock_nointr(mddev);
86599049 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
86609050 mddev_unlock(mddev);
8661
- set_capacity(mddev->gendisk, mddev->array_sectors);
8662
- revalidate_disk(mddev->gendisk);
9051
+ if (!mddev_is_clustered(mddev)) {
9052
+ set_capacity(mddev->gendisk, mddev->array_sectors);
9053
+ revalidate_disk_size(mddev->gendisk, true);
9054
+ }
86639055 }
86649056
86659057 spin_lock(&mddev->lock);
....@@ -8731,7 +9123,7 @@
87319123 }
87329124
87339125 if (removed && mddev->kobj.sd)
8734
- sysfs_notify(&mddev->kobj, NULL, "degraded");
9126
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
87359127
87369128 if (this && removed)
87379129 goto no_add;
....@@ -8758,10 +9150,9 @@
87589150
87599151 rdev->recovery_offset = 0;
87609152 }
8761
- if (mddev->pers->
8762
- hot_add_disk(mddev, rdev) == 0) {
8763
- if (sysfs_link_rdev(mddev, rdev))
8764
- /* failure here is OK */;
9153
+ if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9154
+ /* failure here is OK */
9155
+ sysfs_link_rdev(mddev, rdev);
87659156 if (!test_bit(Journal, &rdev->flags))
87669157 spares++;
87679158 md_new_event(mddev);
....@@ -9004,6 +9395,8 @@
90049395 void md_reap_sync_thread(struct mddev *mddev)
90059396 {
90069397 struct md_rdev *rdev;
9398
+ sector_t old_dev_sectors = mddev->dev_sectors;
9399
+ bool is_reshaped = false;
90079400
90089401 /* resync has finished, collect result */
90099402 md_unregister_thread(&mddev->sync_thread);
....@@ -9013,14 +9406,16 @@
90139406 /* success...*/
90149407 /* activate any spares */
90159408 if (mddev->pers->spare_active(mddev)) {
9016
- sysfs_notify(&mddev->kobj, NULL,
9017
- "degraded");
9409
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
90189410 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
90199411 }
90209412 }
90219413 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9022
- mddev->pers->finish_reshape)
9414
+ mddev->pers->finish_reshape) {
90239415 mddev->pers->finish_reshape(mddev);
9416
+ if (mddev_is_clustered(mddev))
9417
+ is_reshaped = true;
9418
+ }
90249419
90259420 /* If array is no-longer degraded, then any saved_raid_disk
90269421 * information must be scrapped.
....@@ -9041,9 +9436,18 @@
90419436 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
90429437 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
90439438 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9439
+ /*
9440
+ * We call md_cluster_ops->update_size here because sync_size could
9441
+ * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
9442
+ * so it is time to update the size across the cluster.
9443
+ */
9444
+ if (mddev_is_clustered(mddev) && is_reshaped
9445
+ && !test_bit(MD_CLOSING, &mddev->flags))
9446
+ md_cluster_ops->update_size(mddev, old_dev_sectors);
90449447 wake_up(&resync_wait);
90459448 /* flag recovery needed just to double check */
90469449 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9450
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
90479451 sysfs_notify_dirent_safe(mddev->sysfs_action);
90489452 md_new_event(mddev);
90499453 if (mddev->event_work.func)
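The update_size call above hands the saved pre-reshape size to the cluster backend so the other nodes can bring their view of the array in line. Judging from the call site, the operation's declaration in md-cluster.h is roughly:

    /* member of struct md_cluster_operations (sketch; other ops elided) */
    void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);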
....@@ -9093,8 +9497,7 @@
90939497 if (rv == 0) {
90949498 /* Make sure they get written out promptly */
90959499 if (test_bit(ExternalBbl, &rdev->flags))
9096
- sysfs_notify(&rdev->kobj, NULL,
9097
- "unacknowledged_bad_blocks");
9500
+ sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
90989501 sysfs_notify_dirent_safe(rdev->sysfs_state);
90999502 set_mask_bits(&mddev->sb_flags, 0,
91009503 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
....@@ -9115,7 +9518,7 @@
91159518 s += rdev->data_offset;
91169519 rv = badblocks_clear(&rdev->badblocks, s, sectors);
91179520 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9118
- sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
9521
+ sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
91199522 return rv;
91209523 }
91219524 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
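Note the s += rdev->data_offset translation in the hunk above: bad-block ranges are stored device-relative, so an array-relative sector must be shifted by the rdev's data offset before badblocks_clear() runs. A worked example with an assumed data_offset of 2048 sectors:

    /*
     * array-relative s        = 1000
     * rdev->data_offset       = 2048  (assumed for illustration)
     * sector passed to clear  = 1000 + 2048 = 3048
     */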
....@@ -9159,7 +9562,7 @@
91599562 {
91609563 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
91619564
9162
- proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
9565
+ proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
91639566 }
91649567
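Swapping md_seq_fops for mdstat_proc_ops follows the kernel-wide move of procfs entries from struct file_operations to the slimmer struct proc_ops. The table itself is not visible in this hunk; assuming it reuses md.c's existing seq_file plumbing, it plausibly looks like:

    static const struct proc_ops mdstat_proc_ops = {
            .proc_open      = md_seq_open,
            .proc_read      = seq_read,
            .proc_lseek     = seq_lseek,
            .proc_release   = seq_release,
            .proc_poll      = mdstat_poll,
    };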
91659568 static int __init md_init(void)
....@@ -9173,6 +9576,10 @@
91739576 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
91749577 if (!md_misc_wq)
91759578 goto err_misc_wq;
9579
+
9580
+ md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
9581
+ if (!md_rdev_misc_wq)
9582
+ goto err_rdev_misc_wq;
91769583
91779584 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
91789585 goto err_md;
....@@ -9195,6 +9602,8 @@
91959602 err_mdp:
91969603 unregister_blkdev(MD_MAJOR, "md");
91979604 err_md:
9605
+ destroy_workqueue(md_rdev_misc_wq);
9606
+err_rdev_misc_wq:
91989607 destroy_workqueue(md_misc_wq);
91999608 err_misc_wq:
92009609 destroy_workqueue(md_wq);
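The new err_rdev_misc_wq label keeps md_init's unwind path an exact mirror of its allocation order: each label releases only what was set up before the failing step and then falls through. Condensed to just the workqueues (flags abbreviated), the pattern is:

    static int __init example_init(void)
    {
            md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
            if (!md_wq)
                    goto err_wq;
            md_misc_wq = alloc_workqueue("md_misc", 0, 0);
            if (!md_misc_wq)
                    goto err_misc_wq;
            md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
            if (!md_rdev_misc_wq)
                    goto err_rdev_misc_wq;
            return 0;

    err_rdev_misc_wq:       /* md_rdev_misc_wq failed; undo md_misc_wq */
            destroy_workqueue(md_misc_wq);
    err_misc_wq:            /* md_misc_wq failed; undo md_wq */
            destroy_workqueue(md_wq);
    err_wq:
            return -ENOMEM;
    }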
....@@ -9240,8 +9649,12 @@
92409649 }
92419650
92429651 if (role != rdev2->raid_disk) {
9243
- /* got activated */
9244
- if (rdev2->raid_disk == -1 && role != 0xffff) {
9652
+ /*
9653
+ * The device got activated, unless a reshape is happening.
9654
+ */
9655
+ if (rdev2->raid_disk == -1 && role != 0xffff &&
9656
+ !(le32_to_cpu(sb->feature_map) &
9657
+ MD_FEATURE_RESHAPE_ACTIVE)) {
92459658 rdev2->saved_raid_disk = role;
92469659 ret = remove_and_add_spares(mddev, rdev2);
92479660 pr_info("Activated spare: %s\n",
....@@ -9250,7 +9663,6 @@
92509663 * perform resync with the new activated disk */
92519664 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
92529665 md_wakeup_thread(mddev->thread);
9253
-
92549666 }
92559667 /* device faulty
92569668 * We just want to do the minimum to mark the disk
....@@ -9268,6 +9680,30 @@
92689680 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
92699681 if (ret)
92709682 pr_warn("md: updating array disks failed. %d\n", ret);
9683
+ }
9684
+
9685
+ /*
9686
+ * Since mddev->delta_disks has already updated in update_raid_disks,
9687
+ * so it is time to check reshape.
9688
+ */
9689
+ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9690
+ (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9691
+ /*
9692
+ * reshape is happening in the remote node, we need to
9693
+ * update reshape_position and call start_reshape.
9694
+ */
9695
+ mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9696
+ if (mddev->pers->update_reshape_pos)
9697
+ mddev->pers->update_reshape_pos(mddev);
9698
+ if (mddev->pers->start_reshape)
9699
+ mddev->pers->start_reshape(mddev);
9700
+ } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9701
+ mddev->reshape_position != MaxSector &&
9702
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9703
+ /* The reshape has just finished on another node. */
9704
+ mddev->reshape_position = MaxSector;
9705
+ if (mddev->pers->update_reshape_pos)
9706
+ mddev->pers->update_reshape_pos(mddev);
92719707 }
92729708
92739709 /* Finally set the event to be up to date */
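Taken together, the two MD_RESYNCING_REMOTE branches above form a small state machine keyed on the superblock's reshape feature bit:

    /*
     * MD_FEATURE_RESHAPE_ACTIVE in sb  | local action
     * ---------------------------------+----------------------------------
     * set                              | adopt sb->reshape_position, then
     *                                  | update_reshape_pos() and
     *                                  | start_reshape()
     * clear, reshape_position valid    | remote node finished: reset
     *                                  | reshape_position to MaxSector and
     *                                  | update_reshape_pos()
     */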
....@@ -9315,7 +9751,7 @@
93159751 if (rdev->recovery_offset == MaxSector &&
93169752 !test_bit(In_sync, &rdev->flags) &&
93179753 mddev->pers->spare_active(mddev))
9318
- sysfs_notify(&mddev->kobj, NULL, "degraded");
9754
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
93199755
93209756 put_page(swapout);
93219757 return 0;
....@@ -9323,16 +9759,18 @@
93239759
93249760 void md_reload_sb(struct mddev *mddev, int nr)
93259761 {
9326
- struct md_rdev *rdev;
9762
+ struct md_rdev *rdev = NULL, *iter;
93279763 int err;
93289764
93299765 /* Find the rdev */
9330
- rdev_for_each_rcu(rdev, mddev) {
9331
- if (rdev->desc_nr == nr)
9766
+ rdev_for_each_rcu(iter, mddev) {
9767
+ if (iter->desc_nr == nr) {
9768
+ rdev = iter;
93329769 break;
9770
+ }
93339771 }
93349772
9335
- if (!rdev || rdev->desc_nr != nr) {
9773
+ if (!rdev) {
93369774 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
93379775 return;
93389776 }
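The iter/rdev split above applies the general rule that a list_for_each_entry()-style cursor must never be dereferenced after a loop that can finish without a break: on normal termination the cursor holds an address computed from the list head, not a real entry (the old rdev->desc_nr != nr check read exactly such a bogus entry). A minimal sketch of the safe pattern, using a hypothetical item type:

    #include <linux/list.h>

    struct item {
            struct list_head node;
            int id;
    };

    static struct item *find_item(struct list_head *items, int wanted)
    {
            struct item *found = NULL, *it;

            list_for_each_entry(it, items, node) {
                    if (it->id == wanted) {
                            found = it;     /* remember the match */
                            break;
                    }
            }
            return found;                   /* NULL when nothing matched */
    }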
....@@ -9378,7 +9816,7 @@
93789816 }
93799817 }
93809818
9381
-static void autostart_arrays(int part)
9819
+void md_autostart_arrays(int part)
93829820 {
93839821 struct md_rdev *rdev;
93849822 struct detected_devices_node *node_detected_dev;
....@@ -9457,6 +9895,7 @@
94579895 * destroy_workqueue() below will wait for that to complete.
94589896 */
94599897 }
9898
+ destroy_workqueue(md_rdev_misc_wq);
94609899 destroy_workqueue(md_misc_wq);
94619900 destroy_workqueue(md_wq);
94629901 }
....@@ -9466,7 +9905,7 @@
94669905
94679906 static int get_ro(char *buffer, const struct kernel_param *kp)
94689907 {
9469
- return sprintf(buffer, "%d", start_readonly);
9908
+ return sprintf(buffer, "%d\n", start_readonly);
94709909 }
94719910 static int set_ro(const char *val, const struct kernel_param *kp)
94729911 {