.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * raid10.c : Multiple Devices driver for Linux |
---|
3 | 4 | * |
---|
.. | .. |
---|
6 | 7 | * RAID-10 support for md. |
---|
7 | 8 | * |
---|
8 | 9 | * Base on code in raid1.c. See raid1.c for further copyright information. |
---|
9 | | - * |
---|
10 | | - * |
---|
11 | | - * This program is free software; you can redistribute it and/or modify |
---|
12 | | - * it under the terms of the GNU General Public License as published by |
---|
13 | | - * the Free Software Foundation; either version 2, or (at your option) |
---|
14 | | - * any later version. |
---|
15 | | - * |
---|
16 | | - * You should have received a copy of the GNU General Public License |
---|
17 | | - * (for example /usr/src/linux/COPYING); if not, write to the Free |
---|
18 | | - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
19 | 10 | */ |
---|
20 | 11 | |
---|
21 | 12 | #include <linux/slab.h> |
---|
.. | .. |
---|
25 | 16 | #include <linux/seq_file.h> |
---|
26 | 17 | #include <linux/ratelimit.h> |
---|
27 | 18 | #include <linux/kthread.h> |
---|
| 19 | +#include <linux/raid/md_p.h> |
---|
28 | 20 | #include <trace/events/block.h> |
---|
29 | 21 | #include "md.h" |
---|
30 | 22 | #include "raid10.h" |
---|
.. | .. |
---|
72 | 64 | * [B A] [D C] [B A] [E C D] |
---|
73 | 65 | */ |
---|
74 | 66 | |
---|
75 | | -/* |
---|
76 | | - * Number of guaranteed r10bios in case of extreme VM load: |
---|
77 | | - */ |
---|
78 | | -#define NR_RAID10_BIOS 256 |
---|
79 | | - |
---|
80 | | -/* when we get a read error on a read-only array, we redirect to another |
---|
81 | | - * device without failing the first device, or trying to over-write to |
---|
82 | | - * correct the read error. To keep track of bad blocks on a per-bio |
---|
83 | | - * level, we store IO_BLOCKED in the appropriate 'bios' pointer |
---|
84 | | - */ |
---|
85 | | -#define IO_BLOCKED ((struct bio *)1) |
---|
86 | | -/* When we successfully write to a known bad-block, we need to remove the |
---|
87 | | - * bad-block marking which must be done from process context. So we record |
---|
88 | | - * the success by setting devs[n].bio to IO_MADE_GOOD |
---|
89 | | - */ |
---|
90 | | -#define IO_MADE_GOOD ((struct bio *)2) |
---|
91 | | - |
---|
92 | | -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) |
---|
93 | | - |
---|
94 | | -/* When there are this many requests queued to be written by |
---|
95 | | - * the raid10 thread, we become 'congested' to provide back-pressure |
---|
96 | | - * for writeback. |
---|
97 | | - */ |
---|
98 | | -static int max_queued_requests = 1024; |
---|
99 | | - |
---|
100 | 67 | static void allow_barrier(struct r10conf *conf); |
---|
101 | 68 | static void lower_barrier(struct r10conf *conf); |
---|
102 | 69 | static int _enough(struct r10conf *conf, int previous, int ignore); |
---|
.. | .. |
---|
129 | 96 | /* allocate a r10bio with room for raid_disks entries in the |
---|
130 | 97 | * bios array */ |
---|
131 | 98 | return kzalloc(size, gfp_flags); |
---|
132 | | -} |
---|
133 | | - |
---|
134 | | -static void r10bio_pool_free(void *r10_bio, void *data) |
---|
135 | | -{ |
---|
136 | | - kfree(r10_bio); |
---|
137 | 99 | } |
---|
138 | 100 | |
---|
139 | 101 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) |
---|
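r10bio_pool_free() goes away here because raid1 and raid10 now share a single mempool free helper, rbio_pool_free(). Its presumed definition (in the shared raid1-10.c, not shown in this diff) is just a kfree() wrapper matching the mempool_free_t signature:

```c
/* Presumed shared helper from raid1-10.c; a plain kfree() wrapper. */
static inline void rbio_pool_free(void *rbio, void *data)
{
        kfree(rbio);
}
```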
.. | .. |
---|
241 | 203 | } |
---|
242 | 204 | kfree(rps); |
---|
243 | 205 | out_free_r10bio: |
---|
244 | | - r10bio_pool_free(r10_bio, conf); |
---|
| 206 | + rbio_pool_free(r10_bio, conf); |
---|
245 | 207 | return NULL; |
---|
246 | 208 | } |
---|
247 | 209 | |
---|
.. | .. |
---|
269 | 231 | /* resync pages array stored in the 1st bio's .bi_private */ |
---|
270 | 232 | kfree(rp); |
---|
271 | 233 | |
---|
272 | | - r10bio_pool_free(r10bio, conf); |
---|
| 234 | + rbio_pool_free(r10bio, conf); |
---|
273 | 235 | } |
---|
274 | 236 | |
---|
275 | 237 | static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) |
---|
.. | .. |
---|
503 | 465 | if (test_bit(FailFast, &rdev->flags) && |
---|
504 | 466 | (bio->bi_opf & MD_FAILFAST)) { |
---|
505 | 467 | md_error(rdev->mddev, rdev); |
---|
506 | | - if (!test_bit(Faulty, &rdev->flags)) |
---|
507 | | - /* This is the only remaining device, |
---|
508 | | - * We need to retry the write without |
---|
509 | | - * FailFast |
---|
510 | | - */ |
---|
511 | | - set_bit(R10BIO_WriteError, &r10_bio->state); |
---|
512 | | - else { |
---|
513 | | - r10_bio->devs[slot].bio = NULL; |
---|
514 | | - to_put = bio; |
---|
515 | | - dec_rdev = 1; |
---|
516 | | - } |
---|
517 | | - } else |
---|
| 468 | + } |
---|
| 469 | + |
---|
| 470 | + /* |
---|
| 471 | + * When the device is faulty, it is not necessary to |
---|
| 472 | + * handle write error. |
---|
| 473 | + */ |
---|
| 474 | + if (!test_bit(Faulty, &rdev->flags)) |
---|
518 | 475 | set_bit(R10BIO_WriteError, &r10_bio->state); |
---|
| 476 | + else { |
---|
| 477 | + /* Fail the request */ |
---|
| 478 | + set_bit(R10BIO_Degraded, &r10_bio->state); |
---|
| 479 | + r10_bio->devs[slot].bio = NULL; |
---|
| 480 | + to_put = bio; |
---|
| 481 | + dec_rdev = 1; |
---|
| 482 | + } |
---|
519 | 483 | } |
---|
520 | 484 | } else { |
---|
521 | 485 | /* |
---|
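With this change the FailFast branch only calls md_error(); whether to retry the write (R10BIO_WriteError) or give up on this leg is then decided, for every write error, solely on the Faulty bit. A condensed sketch of the resulting flow (simplified from the hunk, not a verbatim copy):

```c
/* Sketch only: condensed from the hunk above. */
if (test_bit(FailFast, &rdev->flags) && (bio->bi_opf & MD_FAILFAST))
        md_error(rdev->mddev, rdev);    /* may or may not mark rdev Faulty */

if (!test_bit(Faulty, &rdev->flags)) {
        /* Device still usable: retry the write without MD_FAILFAST. */
        set_bit(R10BIO_WriteError, &r10_bio->state);
} else {
        /* Device is dead: drop this leg and fail the request. */
        set_bit(R10BIO_Degraded, &r10_bio->state);
        r10_bio->devs[slot].bio = NULL;
        to_put = bio;
        dec_rdev = 1;
}
```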
.. | .. |
---|
745 | 709 | int sectors = r10_bio->sectors; |
---|
746 | 710 | int best_good_sectors; |
---|
747 | 711 | sector_t new_distance, best_dist; |
---|
748 | | - struct md_rdev *best_rdev, *rdev = NULL; |
---|
| 712 | + struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; |
---|
749 | 713 | int do_balance; |
---|
750 | | - int best_slot; |
---|
| 714 | + int best_dist_slot, best_pending_slot; |
---|
| 715 | + bool has_nonrot_disk = false; |
---|
| 716 | + unsigned int min_pending; |
---|
751 | 717 | struct geom *geo = &conf->geo; |
---|
752 | 718 | |
---|
753 | 719 | raid10_find_phys(conf, r10_bio); |
---|
754 | 720 | rcu_read_lock(); |
---|
755 | | - best_slot = -1; |
---|
756 | | - best_rdev = NULL; |
---|
| 721 | + best_dist_slot = -1; |
---|
| 722 | + min_pending = UINT_MAX; |
---|
| 723 | + best_dist_rdev = NULL; |
---|
| 724 | + best_pending_rdev = NULL; |
---|
757 | 725 | best_dist = MaxSector; |
---|
758 | 726 | best_good_sectors = 0; |
---|
759 | 727 | do_balance = 1; |
---|
.. | .. |
---|
775 | 743 | sector_t first_bad; |
---|
776 | 744 | int bad_sectors; |
---|
777 | 745 | sector_t dev_sector; |
---|
| 746 | + unsigned int pending; |
---|
| 747 | + bool nonrot; |
---|
778 | 748 | |
---|
779 | 749 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
---|
780 | 750 | continue; |
---|
781 | 751 | disk = r10_bio->devs[slot].devnum; |
---|
782 | 752 | rdev = rcu_dereference(conf->mirrors[disk].replacement); |
---|
783 | 753 | if (rdev == NULL || test_bit(Faulty, &rdev->flags) || |
---|
784 | | - r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) |
---|
| 754 | + r10_bio->devs[slot].addr + sectors > |
---|
| 755 | + rdev->recovery_offset) { |
---|
| 756 | + /* |
---|
| 757 | + * Read replacement first to prevent reading both rdev |
---|
| 758 | + * and replacement as NULL during replacement replace |
---|
| 759 | + * rdev. |
---|
| 760 | + */ |
---|
| 761 | + smp_mb(); |
---|
785 | 762 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
---|
| 763 | + } |
---|
786 | 764 | if (rdev == NULL || |
---|
787 | 765 | test_bit(Faulty, &rdev->flags)) |
---|
788 | 766 | continue; |
---|
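The smp_mb() added above orders the two rcu_dereference() calls so that, while a replacement is being promoted, a reader cannot observe both ->replacement and ->rdev as NULL. It presumably pairs with a barrier on the update side between installing the replacement as ->rdev and clearing ->replacement; a minimal sketch of the two sides under that assumption:

```c
/* Reader side (this hunk): replacement first, then a barrier, then rdev. */
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags)) {
        smp_mb();                       /* pairs with the update side below */
        rdev = rcu_dereference(conf->mirrors[disk].rdev);
}

/*
 * Update side (sketch, assuming the usual promotion sequence):
 *      mirror->rdev = mirror->replacement;
 *      smp_mb();
 *      mirror->replacement = NULL;
 * A reader that misses ->replacement is then guaranteed to see the
 * promoted ->rdev.
 */
```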
.. | .. |
---|
811 | 789 | first_bad - dev_sector; |
---|
812 | 790 | if (good_sectors > best_good_sectors) { |
---|
813 | 791 | best_good_sectors = good_sectors; |
---|
814 | | - best_slot = slot; |
---|
815 | | - best_rdev = rdev; |
---|
| 792 | + best_dist_slot = slot; |
---|
| 793 | + best_dist_rdev = rdev; |
---|
816 | 794 | } |
---|
817 | 795 | if (!do_balance) |
---|
818 | 796 | /* Must read from here */ |
---|
.. | .. |
---|
825 | 803 | if (!do_balance) |
---|
826 | 804 | break; |
---|
827 | 805 | |
---|
828 | | - if (best_slot >= 0) |
---|
| 806 | + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); |
---|
| 807 | + has_nonrot_disk |= nonrot; |
---|
| 808 | + pending = atomic_read(&rdev->nr_pending); |
---|
| 809 | + if (min_pending > pending && nonrot) { |
---|
| 810 | + min_pending = pending; |
---|
| 811 | + best_pending_slot = slot; |
---|
| 812 | + best_pending_rdev = rdev; |
---|
| 813 | + } |
---|
| 814 | + |
---|
| 815 | + if (best_dist_slot >= 0) |
---|
829 | 816 | /* At least 2 disks to choose from so failfast is OK */ |
---|
830 | 817 | set_bit(R10BIO_FailFast, &r10_bio->state); |
---|
831 | 818 | /* This optimisation is debatable, and completely destroys |
---|
832 | 819 | * sequential read speed for 'far copies' arrays. So only |
---|
833 | 820 | * keep it for 'near' arrays, and review those later. |
---|
834 | 821 | */ |
---|
835 | | - if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) |
---|
| 822 | + if (geo->near_copies > 1 && !pending) |
---|
836 | 823 | new_distance = 0; |
---|
837 | 824 | |
---|
838 | 825 | /* for far > 1 always use the lowest address */ |
---|
.. | .. |
---|
841 | 828 | else |
---|
842 | 829 | new_distance = abs(r10_bio->devs[slot].addr - |
---|
843 | 830 | conf->mirrors[disk].head_position); |
---|
| 831 | + |
---|
844 | 832 | if (new_distance < best_dist) { |
---|
845 | 833 | best_dist = new_distance; |
---|
846 | | - best_slot = slot; |
---|
847 | | - best_rdev = rdev; |
---|
| 834 | + best_dist_slot = slot; |
---|
| 835 | + best_dist_rdev = rdev; |
---|
848 | 836 | } |
---|
849 | 837 | } |
---|
850 | 838 | if (slot >= conf->copies) { |
---|
851 | | - slot = best_slot; |
---|
852 | | - rdev = best_rdev; |
---|
| 839 | + if (has_nonrot_disk) { |
---|
| 840 | + slot = best_pending_slot; |
---|
| 841 | + rdev = best_pending_rdev; |
---|
| 842 | + } else { |
---|
| 843 | + slot = best_dist_slot; |
---|
| 844 | + rdev = best_dist_rdev; |
---|
| 845 | + } |
---|
853 | 846 | } |
---|
854 | 847 | |
---|
855 | 848 | if (slot >= 0) { |
---|
.. | .. |
---|
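Taken together, these read_balance() hunks add an I/O-depth based policy alongside the existing head-distance policy: each candidate's nr_pending is sampled, the rotational flag of its queue is OR-ed into has_nonrot_disk, and the final pick prefers the least-busy non-rotational device when one exists. A condensed sketch of the selection (same variable names, details elided):

```c
/* Sketch of the per-slot bookkeeping and the final choice. */
nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
if (nonrot && pending < min_pending) {          /* least queued I/O wins on SSDs */
        min_pending = pending;
        best_pending_slot = slot;
        best_pending_rdev = rdev;
}
/* ... distance bookkeeping keeps best_dist_slot / best_dist_rdev ... */

if (slot >= conf->copies) {                     /* loop finished: pick a candidate */
        if (has_nonrot_disk) {
                slot = best_pending_slot;
                rdev = best_pending_rdev;
        } else {
                slot = best_dist_slot;
                rdev = best_dist_rdev;
        }
}
```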
861 | 854 | *max_sectors = best_good_sectors; |
---|
862 | 855 | |
---|
863 | 856 | return rdev; |
---|
864 | | -} |
---|
865 | | - |
---|
866 | | -static int raid10_congested(struct mddev *mddev, int bits) |
---|
867 | | -{ |
---|
868 | | - struct r10conf *conf = mddev->private; |
---|
869 | | - int i, ret = 0; |
---|
870 | | - |
---|
871 | | - if ((bits & (1 << WB_async_congested)) && |
---|
872 | | - conf->pending_count >= max_queued_requests) |
---|
873 | | - return 1; |
---|
874 | | - |
---|
875 | | - rcu_read_lock(); |
---|
876 | | - for (i = 0; |
---|
877 | | - (i < conf->geo.raid_disks || i < conf->prev.raid_disks) |
---|
878 | | - && ret == 0; |
---|
879 | | - i++) { |
---|
880 | | - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); |
---|
881 | | - if (rdev && !test_bit(Faulty, &rdev->flags)) { |
---|
882 | | - struct request_queue *q = bdev_get_queue(rdev->bdev); |
---|
883 | | - |
---|
884 | | - ret |= bdi_congested(q->backing_dev_info, bits); |
---|
885 | | - } |
---|
886 | | - } |
---|
887 | | - rcu_read_unlock(); |
---|
888 | | - return ret; |
---|
889 | 857 | } |
---|
890 | 858 | |
---|
891 | 859 | static void flush_pending_writes(struct r10conf *conf) |
---|
.. | .. |
---|
932 | 900 | /* Just ignore it */ |
---|
933 | 901 | bio_endio(bio); |
---|
934 | 902 | else |
---|
935 | | - generic_make_request(bio); |
---|
| 903 | + submit_bio_noacct(bio); |
---|
936 | 904 | bio = next; |
---|
| 905 | + cond_resched(); |
---|
937 | 906 | } |
---|
938 | 907 | blk_finish_plug(&plug); |
---|
939 | 908 | } else |
---|
.. | .. |
---|
995 | 964 | { |
---|
996 | 965 | spin_lock_irq(&conf->resync_lock); |
---|
997 | 966 | if (conf->barrier) { |
---|
| 967 | + struct bio_list *bio_list = current->bio_list; |
---|
998 | 968 | conf->nr_waiting++; |
---|
999 | 969 | /* Wait for the barrier to drop. |
---|
1000 | 970 | * However if there are already pending |
---|
.. | .. |
---|
1009 | 979 | wait_event_lock_irq(conf->wait_barrier, |
---|
1010 | 980 | !conf->barrier || |
---|
1011 | 981 | (atomic_read(&conf->nr_pending) && |
---|
1012 | | - current->bio_list && |
---|
1013 | | - (!bio_list_empty(¤t->bio_list[0]) || |
---|
1014 | | - !bio_list_empty(¤t->bio_list[1]))), |
---|
| 982 | + bio_list && |
---|
| 983 | + (!bio_list_empty(&bio_list[0]) || |
---|
| 984 | + !bio_list_empty(&bio_list[1]))) || |
---|
| 985 | + /* move on if recovery thread is |
---|
| 986 | + * blocked by us |
---|
| 987 | + */ |
---|
| 988 | + (conf->mddev->thread->tsk == current && |
---|
| 989 | + test_bit(MD_RECOVERY_RUNNING, |
---|
| 990 | + &conf->mddev->recovery) && |
---|
| 991 | + conf->nr_queued > 0), |
---|
1015 | 992 | conf->resync_lock); |
---|
1016 | 993 | conf->nr_waiting--; |
---|
1017 | 994 | if (!conf->nr_waiting) |
---|
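The extra condition added to wait_event_lock_irq() lets the raid10 management thread itself get past a raised barrier when recovery is running and requests are already queued for it; otherwise the thread that must retire those queued requests would be the one sleeping on the barrier, a deadlock. Schematically, the wait now completes when any of the following holds (sketch using the same fields as the hunk):

```c
/* Sketch of the conditions that end the wait in wait_barrier(). */
bool may_proceed =
        !conf->barrier ||                               /* barrier dropped          */
        (atomic_read(&conf->nr_pending) &&              /* caller already holds     */
         bio_list &&                                    /* pending bios on its own  */
         (!bio_list_empty(&bio_list[0]) ||              /* current->bio_list        */
          !bio_list_empty(&bio_list[1]))) ||
        (conf->mddev->thread->tsk == current &&         /* we are the md thread and */
         test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
         conf->nr_queued > 0);                          /* work is waiting on us    */
```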
.. | .. |
---|
1117 | 1094 | /* Just ignore it */ |
---|
1118 | 1095 | bio_endio(bio); |
---|
1119 | 1096 | else |
---|
1120 | | - generic_make_request(bio); |
---|
| 1097 | + submit_bio_noacct(bio); |
---|
1121 | 1098 | bio = next; |
---|
| 1099 | + cond_resched(); |
---|
1122 | 1100 | } |
---|
1123 | 1101 | kfree(plug); |
---|
| 1102 | +} |
---|
| 1103 | + |
---|
| 1104 | +/* |
---|
| 1105 | + * 1. Register the new request and wait if the reconstruction thread has put |
---|
| 1106 | + * up a bar for new requests. Continue immediately if no resync is active |
---|
| 1107 | + * currently. |
---|
| 1108 | + * 2. If IO spans the reshape position. Need to wait for reshape to pass. |
---|
| 1109 | + */ |
---|
| 1110 | +static void regular_request_wait(struct mddev *mddev, struct r10conf *conf, |
---|
| 1111 | + struct bio *bio, sector_t sectors) |
---|
| 1112 | +{ |
---|
| 1113 | + wait_barrier(conf); |
---|
| 1114 | + while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
---|
| 1115 | + bio->bi_iter.bi_sector < conf->reshape_progress && |
---|
| 1116 | + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { |
---|
| 1117 | + raid10_log(conf->mddev, "wait reshape"); |
---|
| 1118 | + allow_barrier(conf); |
---|
| 1119 | + wait_event(conf->wait_barrier, |
---|
| 1120 | + conf->reshape_progress <= bio->bi_iter.bi_sector || |
---|
| 1121 | + conf->reshape_progress >= bio->bi_iter.bi_sector + |
---|
| 1122 | + sectors); |
---|
| 1123 | + wait_barrier(conf); |
---|
| 1124 | + } |
---|
1124 | 1125 | } |
---|
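regular_request_wait() factors out the entry protocol shared by the read and write paths: take the barrier, and while the I/O straddles conf->reshape_progress during a reshape, drop the barrier, sleep until reshape has moved past the bio, and retake it. The straddle test is a simple interval check, illustrated here with a hypothetical predicate (not part of the patch):

```c
/*
 * Worked example: with conf->reshape_progress == 1000, a bio covering
 * sectors [990, 1010) starts below and ends above the reshape point, so
 * the caller must wait; a bio fully in [0, 1000) or fully in [1000, ...)
 * proceeds immediately.
 */
static bool spans_reshape_point(struct bio *bio, sector_t sectors,
                                sector_t reshape_progress)  /* hypothetical helper */
{
        return bio->bi_iter.bi_sector < reshape_progress &&
               bio->bi_iter.bi_sector + sectors > reshape_progress;
}
```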
1125 | 1126 | |
---|
1126 | 1127 | static void raid10_read_request(struct mddev *mddev, struct bio *bio, |
---|
.. | .. |
---|
1131 | 1132 | const int op = bio_op(bio); |
---|
1132 | 1133 | const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); |
---|
1133 | 1134 | int max_sectors; |
---|
1134 | | - sector_t sectors; |
---|
1135 | 1135 | struct md_rdev *rdev; |
---|
1136 | 1136 | char b[BDEVNAME_SIZE]; |
---|
1137 | 1137 | int slot = r10_bio->read_slot; |
---|
.. | .. |
---|
1165 | 1165 | } |
---|
1166 | 1166 | rcu_read_unlock(); |
---|
1167 | 1167 | } |
---|
1168 | | - /* |
---|
1169 | | - * Register the new request and wait if the reconstruction |
---|
1170 | | - * thread has put up a bar for new requests. |
---|
1171 | | - * Continue immediately if no resync is active currently. |
---|
1172 | | - */ |
---|
1173 | | - wait_barrier(conf); |
---|
1174 | 1168 | |
---|
1175 | | - sectors = r10_bio->sectors; |
---|
1176 | | - while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
---|
1177 | | - bio->bi_iter.bi_sector < conf->reshape_progress && |
---|
1178 | | - bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { |
---|
1179 | | - /* |
---|
1180 | | - * IO spans the reshape position. Need to wait for reshape to |
---|
1181 | | - * pass |
---|
1182 | | - */ |
---|
1183 | | - raid10_log(conf->mddev, "wait reshape"); |
---|
1184 | | - allow_barrier(conf); |
---|
1185 | | - wait_event(conf->wait_barrier, |
---|
1186 | | - conf->reshape_progress <= bio->bi_iter.bi_sector || |
---|
1187 | | - conf->reshape_progress >= bio->bi_iter.bi_sector + |
---|
1188 | | - sectors); |
---|
1189 | | - wait_barrier(conf); |
---|
1190 | | - } |
---|
1191 | | - |
---|
| 1169 | + regular_request_wait(mddev, conf, bio, r10_bio->sectors); |
---|
1192 | 1170 | rdev = read_balance(conf, r10_bio, &max_sectors); |
---|
1193 | 1171 | if (!rdev) { |
---|
1194 | 1172 | if (err_rdev) { |
---|
.. | .. |
---|
1209 | 1187 | gfp, &conf->bio_split); |
---|
1210 | 1188 | bio_chain(split, bio); |
---|
1211 | 1189 | allow_barrier(conf); |
---|
1212 | | - generic_make_request(bio); |
---|
| 1190 | + submit_bio_noacct(bio); |
---|
1213 | 1191 | wait_barrier(conf); |
---|
1214 | 1192 | bio = split; |
---|
1215 | 1193 | r10_bio->master_bio = bio; |
---|
.. | .. |
---|
1236 | 1214 | trace_block_bio_remap(read_bio->bi_disk->queue, |
---|
1237 | 1215 | read_bio, disk_devt(mddev->gendisk), |
---|
1238 | 1216 | r10_bio->sector); |
---|
1239 | | - generic_make_request(read_bio); |
---|
| 1217 | + submit_bio_noacct(read_bio); |
---|
1240 | 1218 | return; |
---|
1241 | 1219 | } |
---|
1242 | 1220 | |
---|
.. | .. |
---|
1333 | 1311 | finish_wait(&conf->wait_barrier, &w); |
---|
1334 | 1312 | } |
---|
1335 | 1313 | |
---|
1336 | | - /* |
---|
1337 | | - * Register the new request and wait if the reconstruction |
---|
1338 | | - * thread has put up a bar for new requests. |
---|
1339 | | - * Continue immediately if no resync is active currently. |
---|
1340 | | - */ |
---|
1341 | | - wait_barrier(conf); |
---|
1342 | | - |
---|
1343 | 1314 | sectors = r10_bio->sectors; |
---|
1344 | | - while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
---|
1345 | | - bio->bi_iter.bi_sector < conf->reshape_progress && |
---|
1346 | | - bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { |
---|
1347 | | - /* |
---|
1348 | | - * IO spans the reshape position. Need to wait for reshape to |
---|
1349 | | - * pass |
---|
1350 | | - */ |
---|
1351 | | - raid10_log(conf->mddev, "wait reshape"); |
---|
1352 | | - allow_barrier(conf); |
---|
1353 | | - wait_event(conf->wait_barrier, |
---|
1354 | | - conf->reshape_progress <= bio->bi_iter.bi_sector || |
---|
1355 | | - conf->reshape_progress >= bio->bi_iter.bi_sector + |
---|
1356 | | - sectors); |
---|
1357 | | - wait_barrier(conf); |
---|
1358 | | - } |
---|
1359 | | - |
---|
| 1315 | + regular_request_wait(mddev, conf, bio, sectors); |
---|
1360 | 1316 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
---|
1361 | 1317 | (mddev->reshape_backwards |
---|
1362 | 1318 | ? (bio->bi_iter.bi_sector < conf->reshape_safe && |
---|
.. | .. |
---|
1400 | 1356 | |
---|
1401 | 1357 | for (i = 0; i < conf->copies; i++) { |
---|
1402 | 1358 | int d = r10_bio->devs[i].devnum; |
---|
1403 | | - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); |
---|
1404 | | - struct md_rdev *rrdev = rcu_dereference( |
---|
1405 | | - conf->mirrors[d].replacement); |
---|
| 1359 | + struct md_rdev *rdev, *rrdev; |
---|
| 1360 | + |
---|
| 1361 | + rrdev = rcu_dereference(conf->mirrors[d].replacement); |
---|
| 1362 | + /* |
---|
| 1363 | + * Read replacement first to prevent reading both rdev and |
---|
| 1364 | + * replacement as NULL during replacement replace rdev. |
---|
| 1365 | + */ |
---|
| 1366 | + smp_mb(); |
---|
| 1367 | + rdev = rcu_dereference(conf->mirrors[d].rdev); |
---|
1406 | 1368 | if (rdev == rrdev) |
---|
1407 | 1369 | rrdev = NULL; |
---|
1408 | 1370 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { |
---|
.. | .. |
---|
1516 | 1478 | GFP_NOIO, &conf->bio_split); |
---|
1517 | 1479 | bio_chain(split, bio); |
---|
1518 | 1480 | allow_barrier(conf); |
---|
1519 | | - generic_make_request(bio); |
---|
| 1481 | + submit_bio_noacct(bio); |
---|
1520 | 1482 | wait_barrier(conf); |
---|
1521 | 1483 | bio = split; |
---|
1522 | 1484 | r10_bio->master_bio = bio; |
---|
.. | .. |
---|
1677 | 1639 | |
---|
1678 | 1640 | /* |
---|
1679 | 1641 | * If it is not operational, then we have already marked it as dead |
---|
1680 | | - * else if it is the last working disks, ignore the error, let the |
---|
1681 | | - * next level up know. |
---|
| 1642 | + * else if it is the last working disks with "fail_last_dev == false", |
---|
| 1643 | + * ignore the error, let the next level up know. |
---|
1682 | 1644 | * else mark the drive as failed |
---|
1683 | 1645 | */ |
---|
1684 | 1646 | spin_lock_irqsave(&conf->device_lock, flags); |
---|
1685 | | - if (test_bit(In_sync, &rdev->flags) |
---|
| 1647 | + if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev |
---|
1686 | 1648 | && !enough(conf, rdev->raid_disk)) { |
---|
1687 | 1649 | /* |
---|
1688 | 1650 | * Don't fail the drive, just return an IO error. |
---|
.. | .. |
---|
1863 | 1825 | int err = 0; |
---|
1864 | 1826 | int number = rdev->raid_disk; |
---|
1865 | 1827 | struct md_rdev **rdevp; |
---|
1866 | | - struct raid10_info *p = conf->mirrors + number; |
---|
| 1828 | + struct raid10_info *p; |
---|
1867 | 1829 | |
---|
1868 | 1830 | print_conf(conf); |
---|
| 1831 | + if (unlikely(number >= mddev->raid_disks)) |
---|
| 1832 | + return 0; |
---|
| 1833 | + p = conf->mirrors + number; |
---|
1869 | 1834 | if (rdev == p->rdev) |
---|
1870 | 1835 | rdevp = &p->rdev; |
---|
1871 | 1836 | else if (rdev == p->replacement) |
---|
.. | .. |
---|
2137 | 2102 | tbio->bi_opf |= MD_FAILFAST; |
---|
2138 | 2103 | tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; |
---|
2139 | 2104 | bio_set_dev(tbio, conf->mirrors[d].rdev->bdev); |
---|
2140 | | - generic_make_request(tbio); |
---|
| 2105 | + submit_bio_noacct(tbio); |
---|
2141 | 2106 | } |
---|
2142 | 2107 | |
---|
2143 | 2108 | /* Now write out to any replacement devices |
---|
.. | .. |
---|
2156 | 2121 | atomic_inc(&r10_bio->remaining); |
---|
2157 | 2122 | md_sync_acct(conf->mirrors[d].replacement->bdev, |
---|
2158 | 2123 | bio_sectors(tbio)); |
---|
2159 | | - generic_make_request(tbio); |
---|
| 2124 | + submit_bio_noacct(tbio); |
---|
2160 | 2125 | } |
---|
2161 | 2126 | |
---|
2162 | 2127 | done: |
---|
.. | .. |
---|
2263 | 2228 | { |
---|
2264 | 2229 | struct r10conf *conf = mddev->private; |
---|
2265 | 2230 | int d; |
---|
2266 | | - struct bio *wbio, *wbio2; |
---|
| 2231 | + struct bio *wbio = r10_bio->devs[1].bio; |
---|
| 2232 | + struct bio *wbio2 = r10_bio->devs[1].repl_bio; |
---|
| 2233 | + |
---|
| 2234 | + /* Need to test wbio2->bi_end_io before we call |
---|
| 2235 | + * submit_bio_noacct as if the former is NULL, |
---|
| 2236 | + * the latter is free to free wbio2. |
---|
| 2237 | + */ |
---|
| 2238 | + if (wbio2 && !wbio2->bi_end_io) |
---|
| 2239 | + wbio2 = NULL; |
---|
2267 | 2240 | |
---|
2268 | 2241 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { |
---|
2269 | 2242 | fix_recovery_read_error(r10_bio); |
---|
2270 | | - end_sync_request(r10_bio); |
---|
| 2243 | + if (wbio->bi_end_io) |
---|
| 2244 | + end_sync_request(r10_bio); |
---|
| 2245 | + if (wbio2) |
---|
| 2246 | + end_sync_request(r10_bio); |
---|
2271 | 2247 | return; |
---|
2272 | 2248 | } |
---|
2273 | 2249 | |
---|
.. | .. |
---|
2276 | 2252 | * and submit the write request |
---|
2277 | 2253 | */ |
---|
2278 | 2254 | d = r10_bio->devs[1].devnum; |
---|
2279 | | - wbio = r10_bio->devs[1].bio; |
---|
2280 | | - wbio2 = r10_bio->devs[1].repl_bio; |
---|
2281 | | - /* Need to test wbio2->bi_end_io before we call |
---|
2282 | | - * generic_make_request as if the former is NULL, |
---|
2283 | | - * the latter is free to free wbio2. |
---|
2284 | | - */ |
---|
2285 | | - if (wbio2 && !wbio2->bi_end_io) |
---|
2286 | | - wbio2 = NULL; |
---|
2287 | 2255 | if (wbio->bi_end_io) { |
---|
2288 | 2256 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
---|
2289 | 2257 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); |
---|
2290 | | - generic_make_request(wbio); |
---|
| 2258 | + submit_bio_noacct(wbio); |
---|
2291 | 2259 | } |
---|
2292 | 2260 | if (wbio2) { |
---|
2293 | 2261 | atomic_inc(&conf->mirrors[d].replacement->nr_pending); |
---|
2294 | 2262 | md_sync_acct(conf->mirrors[d].replacement->bdev, |
---|
2295 | 2263 | bio_sectors(wbio2)); |
---|
2296 | | - generic_make_request(wbio2); |
---|
| 2264 | + submit_bio_noacct(wbio2); |
---|
2297 | 2265 | } |
---|
2298 | 2266 | } |
---|
2299 | 2267 | |
---|
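Moving the wbio/wbio2 setup (and the bi_end_io test on wbio2) ahead of the error path matters twice over: after fix_recovery_read_error() each leg is only completed if it was actually set up, and wbio2 must be validated before wbio is submitted because completing wbio can free the r10_bio, and with it wbio2. A minimal sketch of the required ordering (simplified, accounting calls omitted):

```c
/* Sketch: snapshot and validate wbio2 *before* anything can complete wbio. */
struct bio *wbio  = r10_bio->devs[1].bio;
struct bio *wbio2 = r10_bio->devs[1].repl_bio;

if (wbio2 && !wbio2->bi_end_io)
        wbio2 = NULL;                   /* replacement leg was never set up */

if (wbio->bi_end_io)
        submit_bio_noacct(wbio);        /* may complete and free the r10_bio */
if (wbio2)
        submit_bio_noacct(wbio2);       /* safe: decision was made earlier */
```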
.. | .. |
---|
2927 | 2895 | * a number of r10_bio structures, one for each out-of-sync device. |
---|
2928 | 2896 | * As we setup these structures, we collect all bio's together into a list |
---|
2929 | 2897 | * which we then process collectively to add pages, and then process again |
---|
2930 | | - * to pass to generic_make_request. |
---|
| 2898 | + * to pass to submit_bio_noacct. |
---|
2931 | 2899 | * |
---|
2932 | 2900 | * The r10_bio structures are linked using a borrowed master_bio pointer. |
---|
2933 | 2901 | * This link is counted in ->remaining. When the r10_bio that points to NULL |
---|
.. | .. |
---|
2951 | 2919 | sector_t chunk_mask = conf->geo.chunk_mask; |
---|
2952 | 2920 | int page_idx = 0; |
---|
2953 | 2921 | |
---|
2954 | | - if (!mempool_initialized(&conf->r10buf_pool)) |
---|
2955 | | - if (init_resync(conf)) |
---|
2956 | | - return 0; |
---|
2957 | | - |
---|
2958 | 2922 | /* |
---|
2959 | 2923 | * Allow skipping a full rebuild for incremental assembly |
---|
2960 | 2924 | * of a clean array, like RAID1 does. |
---|
.. | .. |
---|
2969 | 2933 | *skipped = 1; |
---|
2970 | 2934 | return mddev->dev_sectors - sector_nr; |
---|
2971 | 2935 | } |
---|
| 2936 | + |
---|
| 2937 | + if (!mempool_initialized(&conf->r10buf_pool)) |
---|
| 2938 | + if (init_resync(conf)) |
---|
| 2939 | + return 0; |
---|
2972 | 2940 | |
---|
2973 | 2941 | skipped: |
---|
2974 | 2942 | max_sector = mddev->dev_sectors; |
---|
.. | .. |
---|
3084 | 3052 | sector_t sect; |
---|
3085 | 3053 | int must_sync; |
---|
3086 | 3054 | int any_working; |
---|
| 3055 | + int need_recover = 0; |
---|
3087 | 3056 | struct raid10_info *mirror = &conf->mirrors[i]; |
---|
3088 | 3057 | struct md_rdev *mrdev, *mreplace; |
---|
3089 | 3058 | |
---|
.. | .. |
---|
3091 | 3060 | mrdev = rcu_dereference(mirror->rdev); |
---|
3092 | 3061 | mreplace = rcu_dereference(mirror->replacement); |
---|
3093 | 3062 | |
---|
3094 | | - if ((mrdev == NULL || |
---|
3095 | | - test_bit(Faulty, &mrdev->flags) || |
---|
3096 | | - test_bit(In_sync, &mrdev->flags)) && |
---|
3097 | | - (mreplace == NULL || |
---|
3098 | | - test_bit(Faulty, &mreplace->flags))) { |
---|
| 3063 | + if (mrdev != NULL && |
---|
| 3064 | + !test_bit(Faulty, &mrdev->flags) && |
---|
| 3065 | + !test_bit(In_sync, &mrdev->flags)) |
---|
| 3066 | + need_recover = 1; |
---|
| 3067 | + if (mreplace && test_bit(Faulty, &mreplace->flags)) |
---|
| 3068 | + mreplace = NULL; |
---|
| 3069 | + |
---|
| 3070 | + if (!need_recover && !mreplace) { |
---|
3099 | 3071 | rcu_read_unlock(); |
---|
3100 | 3072 | continue; |
---|
3101 | 3073 | } |
---|
.. | .. |
---|
3111 | 3083 | rcu_read_unlock(); |
---|
3112 | 3084 | continue; |
---|
3113 | 3085 | } |
---|
3114 | | - if (mreplace && test_bit(Faulty, &mreplace->flags)) |
---|
3115 | | - mreplace = NULL; |
---|
3116 | 3086 | /* Unless we are doing a full sync, or a replacement |
---|
3117 | 3087 | * we only need to recover the block if it is set in |
---|
3118 | 3088 | * the bitmap |
---|
.. | .. |
---|
3218 | 3188 | r10_bio->devs[1].devnum = i; |
---|
3219 | 3189 | r10_bio->devs[1].addr = to_addr; |
---|
3220 | 3190 | |
---|
3221 | | - if (!test_bit(In_sync, &mrdev->flags)) { |
---|
| 3191 | + if (need_recover) { |
---|
3222 | 3192 | bio = r10_bio->devs[1].bio; |
---|
3223 | 3193 | bio->bi_next = biolist; |
---|
3224 | 3194 | biolist = bio; |
---|
.. | .. |
---|
3235 | 3205 | bio = r10_bio->devs[1].repl_bio; |
---|
3236 | 3206 | if (bio) |
---|
3237 | 3207 | bio->bi_end_io = NULL; |
---|
3238 | | - /* Note: if mreplace != NULL, then bio |
---|
| 3208 | + /* Note: if replace is not NULL, then bio |
---|
3239 | 3209 | * cannot be NULL as r10buf_pool_alloc will |
---|
3240 | 3210 | * have allocated it. |
---|
3241 | | - * So the second test here is pointless. |
---|
3242 | | - * But it keeps semantic-checkers happy, and |
---|
3243 | | - * this comment keeps human reviewers |
---|
3244 | | - * happy. |
---|
3245 | 3211 | */ |
---|
3246 | | - if (mreplace == NULL || bio == NULL || |
---|
3247 | | - test_bit(Faulty, &mreplace->flags)) |
---|
| 3212 | + if (!mreplace) |
---|
3248 | 3213 | break; |
---|
3249 | 3214 | bio->bi_next = biolist; |
---|
3250 | 3215 | biolist = bio; |
---|
.. | .. |
---|
3533 | 3498 | if (bio->bi_end_io == end_sync_read) { |
---|
3534 | 3499 | md_sync_acct_bio(bio, nr_sectors); |
---|
3535 | 3500 | bio->bi_status = 0; |
---|
3536 | | - generic_make_request(bio); |
---|
| 3501 | + submit_bio_noacct(bio); |
---|
3537 | 3502 | } |
---|
3538 | 3503 | } |
---|
3539 | 3504 | |
---|
.. | .. |
---|
3665 | 3630 | return nc*fc; |
---|
3666 | 3631 | } |
---|
3667 | 3632 | |
---|
| 3633 | +static void raid10_free_conf(struct r10conf *conf) |
---|
| 3634 | +{ |
---|
| 3635 | + if (!conf) |
---|
| 3636 | + return; |
---|
| 3637 | + |
---|
| 3638 | + mempool_exit(&conf->r10bio_pool); |
---|
| 3639 | + kfree(conf->mirrors); |
---|
| 3640 | + kfree(conf->mirrors_old); |
---|
| 3641 | + kfree(conf->mirrors_new); |
---|
| 3642 | + safe_put_page(conf->tmppage); |
---|
| 3643 | + bioset_exit(&conf->bio_split); |
---|
| 3644 | + kfree(conf); |
---|
| 3645 | +} |
---|
| 3646 | + |
---|
3668 | 3647 | static struct r10conf *setup_conf(struct mddev *mddev) |
---|
3669 | 3648 | { |
---|
3670 | 3649 | struct r10conf *conf = NULL; |
---|
.. | .. |
---|
3704 | 3683 | |
---|
3705 | 3684 | conf->geo = geo; |
---|
3706 | 3685 | conf->copies = copies; |
---|
3707 | | - err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc, |
---|
3708 | | - r10bio_pool_free, conf); |
---|
| 3686 | + err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, |
---|
| 3687 | + rbio_pool_free, conf); |
---|
3709 | 3688 | if (err) |
---|
3710 | 3689 | goto out; |
---|
3711 | 3690 | |
---|
.. | .. |
---|
3747 | 3726 | return conf; |
---|
3748 | 3727 | |
---|
3749 | 3728 | out: |
---|
3750 | | - if (conf) { |
---|
3751 | | - mempool_exit(&conf->r10bio_pool); |
---|
3752 | | - kfree(conf->mirrors); |
---|
3753 | | - safe_put_page(conf->tmppage); |
---|
3754 | | - bioset_exit(&conf->bio_split); |
---|
3755 | | - kfree(conf); |
---|
3756 | | - } |
---|
| 3729 | + raid10_free_conf(conf); |
---|
3757 | 3730 | return ERR_PTR(err); |
---|
| 3731 | +} |
---|
| 3732 | + |
---|
| 3733 | +static void raid10_set_io_opt(struct r10conf *conf) |
---|
| 3734 | +{ |
---|
| 3735 | + int raid_disks = conf->geo.raid_disks; |
---|
| 3736 | + |
---|
| 3737 | + if (!(conf->geo.raid_disks % conf->geo.near_copies)) |
---|
| 3738 | + raid_disks /= conf->geo.near_copies; |
---|
| 3739 | + blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * |
---|
| 3740 | + raid_disks); |
---|
3758 | 3741 | } |
---|
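raid10_set_io_opt() reports the optimal I/O size as one full stripe of data: the chunk size times the number of data disks, where the disk count is divided by near_copies only when it divides evenly. For example, with 4 disks, near_copies = 2 and 512 KiB chunks this gives io_opt = 512 KiB * (4 / 2) = 1 MiB; with 3 disks and near_copies = 2 (which does not divide evenly) it stays at 512 KiB * 3 = 1.5 MiB.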
3759 | 3742 | |
---|
3760 | 3743 | static int raid10_run(struct mddev *mddev) |
---|
3761 | 3744 | { |
---|
3762 | 3745 | struct r10conf *conf; |
---|
3763 | | - int i, disk_idx, chunk_size; |
---|
| 3746 | + int i, disk_idx; |
---|
3764 | 3747 | struct raid10_info *disk; |
---|
3765 | 3748 | struct md_rdev *rdev; |
---|
3766 | 3749 | sector_t size; |
---|
.. | .. |
---|
3781 | 3764 | if (!conf) |
---|
3782 | 3765 | goto out; |
---|
3783 | 3766 | |
---|
| 3767 | + mddev->thread = conf->thread; |
---|
| 3768 | + conf->thread = NULL; |
---|
| 3769 | + |
---|
3784 | 3770 | if (mddev_is_clustered(conf->mddev)) { |
---|
3785 | 3771 | int fc, fo; |
---|
3786 | 3772 | |
---|
.. | .. |
---|
3793 | 3779 | } |
---|
3794 | 3780 | } |
---|
3795 | 3781 | |
---|
3796 | | - mddev->thread = conf->thread; |
---|
3797 | | - conf->thread = NULL; |
---|
3798 | | - |
---|
3799 | | - chunk_size = mddev->chunk_sectors << 9; |
---|
3800 | 3782 | if (mddev->queue) { |
---|
3801 | 3783 | blk_queue_max_discard_sectors(mddev->queue, |
---|
3802 | 3784 | mddev->chunk_sectors); |
---|
3803 | 3785 | blk_queue_max_write_same_sectors(mddev->queue, 0); |
---|
3804 | 3786 | blk_queue_max_write_zeroes_sectors(mddev->queue, 0); |
---|
3805 | | - blk_queue_io_min(mddev->queue, chunk_size); |
---|
3806 | | - if (conf->geo.raid_disks % conf->geo.near_copies) |
---|
3807 | | - blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
---|
3808 | | - else |
---|
3809 | | - blk_queue_io_opt(mddev->queue, chunk_size * |
---|
3810 | | - (conf->geo.raid_disks / conf->geo.near_copies)); |
---|
| 3787 | + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); |
---|
| 3788 | + raid10_set_io_opt(conf); |
---|
3811 | 3789 | } |
---|
3812 | 3790 | |
---|
3813 | 3791 | rdev_for_each(rdev, mddev) { |
---|
.. | .. |
---|
3922 | 3900 | mddev->resync_max_sectors = size; |
---|
3923 | 3901 | set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); |
---|
3924 | 3902 | |
---|
3925 | | - if (mddev->queue) { |
---|
3926 | | - int stripe = conf->geo.raid_disks * |
---|
3927 | | - ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
---|
3928 | | - |
---|
3929 | | - /* Calculate max read-ahead size. |
---|
3930 | | - * We need to readahead at least twice a whole stripe.... |
---|
3931 | | - * maybe... |
---|
3932 | | - */ |
---|
3933 | | - stripe /= conf->geo.near_copies; |
---|
3934 | | - if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) |
---|
3935 | | - mddev->queue->backing_dev_info->ra_pages = 2 * stripe; |
---|
3936 | | - } |
---|
3937 | | - |
---|
3938 | 3903 | if (md_integrity_register(mddev)) |
---|
3939 | 3904 | goto out_free_conf; |
---|
3940 | 3905 | |
---|
.. | .. |
---|
3967 | 3932 | |
---|
3968 | 3933 | out_free_conf: |
---|
3969 | 3934 | md_unregister_thread(&mddev->thread); |
---|
3970 | | - mempool_exit(&conf->r10bio_pool); |
---|
3971 | | - safe_put_page(conf->tmppage); |
---|
3972 | | - kfree(conf->mirrors); |
---|
3973 | | - kfree(conf); |
---|
| 3935 | + raid10_free_conf(conf); |
---|
3974 | 3936 | mddev->private = NULL; |
---|
3975 | 3937 | out: |
---|
3976 | 3938 | return -EIO; |
---|
.. | .. |
---|
3978 | 3940 | |
---|
3979 | 3941 | static void raid10_free(struct mddev *mddev, void *priv) |
---|
3980 | 3942 | { |
---|
3981 | | - struct r10conf *conf = priv; |
---|
3982 | | - |
---|
3983 | | - mempool_exit(&conf->r10bio_pool); |
---|
3984 | | - safe_put_page(conf->tmppage); |
---|
3985 | | - kfree(conf->mirrors); |
---|
3986 | | - kfree(conf->mirrors_old); |
---|
3987 | | - kfree(conf->mirrors_new); |
---|
3988 | | - bioset_exit(&conf->bio_split); |
---|
3989 | | - kfree(conf); |
---|
| 3943 | + raid10_free_conf(priv); |
---|
3990 | 3944 | } |
---|
3991 | 3945 | |
---|
3992 | 3946 | static void raid10_quiesce(struct mddev *mddev, int quiesce) |
---|
.. | .. |
---|
4293 | 4247 | spin_unlock_irq(&conf->device_lock); |
---|
4294 | 4248 | |
---|
4295 | 4249 | if (mddev->delta_disks && mddev->bitmap) { |
---|
4296 | | - ret = md_bitmap_resize(mddev->bitmap, |
---|
4297 | | - raid10_size(mddev, 0, conf->geo.raid_disks), |
---|
4298 | | - 0, 0); |
---|
| 4250 | + struct mdp_superblock_1 *sb = NULL; |
---|
| 4251 | + sector_t oldsize, newsize; |
---|
| 4252 | + |
---|
| 4253 | + oldsize = raid10_size(mddev, 0, 0); |
---|
| 4254 | + newsize = raid10_size(mddev, 0, conf->geo.raid_disks); |
---|
| 4255 | + |
---|
| 4256 | + if (!mddev_is_clustered(mddev)) { |
---|
| 4257 | + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); |
---|
| 4258 | + if (ret) |
---|
| 4259 | + goto abort; |
---|
| 4260 | + else |
---|
| 4261 | + goto out; |
---|
| 4262 | + } |
---|
| 4263 | + |
---|
| 4264 | + rdev_for_each(rdev, mddev) { |
---|
| 4265 | + if (rdev->raid_disk > -1 && |
---|
| 4266 | + !test_bit(Faulty, &rdev->flags)) |
---|
| 4267 | + sb = page_address(rdev->sb_page); |
---|
| 4268 | + } |
---|
| 4269 | + |
---|
| 4270 | + /* |
---|
| 4271 | + * some node is already performing reshape, and no need to |
---|
| 4272 | + * call md_bitmap_resize again since it should be called when |
---|
| 4273 | + * receiving BITMAP_RESIZE msg |
---|
| 4274 | + */ |
---|
| 4275 | + if ((sb && (le32_to_cpu(sb->feature_map) & |
---|
| 4276 | + MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) |
---|
| 4277 | + goto out; |
---|
| 4278 | + |
---|
| 4279 | + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); |
---|
4299 | 4280 | if (ret) |
---|
4300 | 4281 | goto abort; |
---|
| 4282 | + |
---|
| 4283 | + ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); |
---|
| 4284 | + if (ret) { |
---|
| 4285 | + md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); |
---|
| 4286 | + goto abort; |
---|
| 4287 | + } |
---|
4301 | 4288 | } |
---|
| 4289 | +out: |
---|
4302 | 4290 | if (mddev->delta_disks > 0) { |
---|
4303 | 4291 | rdev_for_each(rdev, mddev) |
---|
4304 | 4292 | if (rdev->raid_disk < 0 && |
---|
.. | .. |
---|
4310 | 4298 | else |
---|
4311 | 4299 | rdev->recovery_offset = 0; |
---|
4312 | 4300 | |
---|
4313 | | - if (sysfs_link_rdev(mddev, rdev)) |
---|
4314 | | - /* Failure here is OK */; |
---|
| 4301 | + /* Failure here is OK */ |
---|
| 4302 | + sysfs_link_rdev(mddev, rdev); |
---|
4315 | 4303 | } |
---|
4316 | 4304 | } else if (rdev->raid_disk >= conf->prev.raid_disks |
---|
4317 | 4305 | && !test_bit(Faulty, &rdev->flags)) { |
---|
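For a clustered array the bitmap resize at the start of this hunk can no longer be one unconditional md_bitmap_resize() call: if another node already has MD_FEATURE_RESHAPE_ACTIVE set in its superblock, or the size is unchanged, this node must not resize again; otherwise the resize has to be propagated through md_cluster_ops->resize_bitmaps(), rolling back the local resize if that fails. A condensed view of the decision flow in raid10_start_reshape() (sketch; reshape_already_active_on_some_node stands for the sb->feature_map check in the hunk):

```c
/* Sketch of the bitmap-resize decision, error labels folded into returns. */
if (!mddev_is_clustered(mddev))
        return md_bitmap_resize(mddev->bitmap, newsize, 0, 0);

if (reshape_already_active_on_some_node || oldsize == newsize)
        return 0;               /* other node resizes via the BITMAP_RESIZE message */

ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
if (ret)
        return ret;
ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret)
        md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);  /* roll back local resize */
return ret;
```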
.. | .. |
---|
4457 | 4445 | sector_nr = conf->reshape_progress; |
---|
4458 | 4446 | if (sector_nr) { |
---|
4459 | 4447 | mddev->curr_resync_completed = sector_nr; |
---|
4460 | | - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
---|
| 4448 | + sysfs_notify_dirent_safe(mddev->sysfs_completed); |
---|
4461 | 4449 | *skipped = 1; |
---|
4462 | 4450 | return sector_nr; |
---|
4463 | 4451 | } |
---|
.. | .. |
---|
4486 | 4474 | last = conf->reshape_progress - 1; |
---|
4487 | 4475 | sector_nr = last & ~(sector_t)(conf->geo.chunk_mask |
---|
4488 | 4476 | & conf->prev.chunk_mask); |
---|
4489 | | - if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) |
---|
4490 | | - sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; |
---|
| 4477 | + if (sector_nr + RESYNC_SECTORS < last) |
---|
| 4478 | + sector_nr = last + 1 - RESYNC_SECTORS; |
---|
4491 | 4479 | } else { |
---|
4492 | 4480 | /* 'next' is after the last device address that we |
---|
4493 | 4481 | * might write to for this chunk in the new layout |
---|
.. | .. |
---|
4509 | 4497 | last = sector_nr | (conf->geo.chunk_mask |
---|
4510 | 4498 | & conf->prev.chunk_mask); |
---|
4511 | 4499 | |
---|
4512 | | - if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) |
---|
4513 | | - last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; |
---|
| 4500 | + if (sector_nr + RESYNC_SECTORS <= last) |
---|
| 4501 | + last = sector_nr + RESYNC_SECTORS - 1; |
---|
4514 | 4502 | } |
---|
4515 | 4503 | |
---|
4516 | 4504 | if (need_flush || |
---|
.. | .. |
---|
4575 | 4563 | r10_bio->master_bio = read_bio; |
---|
4576 | 4564 | r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; |
---|
4577 | 4565 | |
---|
| 4566 | + /* |
---|
| 4567 | + * Broadcast RESYNC message to other nodes, so all nodes would not |
---|
| 4568 | + * write to the region to avoid conflict. |
---|
| 4569 | + */ |
---|
| 4570 | + if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { |
---|
| 4571 | + struct mdp_superblock_1 *sb = NULL; |
---|
| 4572 | + int sb_reshape_pos = 0; |
---|
| 4573 | + |
---|
| 4574 | + conf->cluster_sync_low = sector_nr; |
---|
| 4575 | + conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; |
---|
| 4576 | + sb = page_address(rdev->sb_page); |
---|
| 4577 | + if (sb) { |
---|
| 4578 | + sb_reshape_pos = le64_to_cpu(sb->reshape_position); |
---|
| 4579 | + /* |
---|
| 4580 | + * Set cluster_sync_low again if next address for array |
---|
| 4581 | + * reshape is less than cluster_sync_low. Since we can't |
---|
| 4582 | + * update cluster_sync_low until it has finished reshape. |
---|
| 4583 | + */ |
---|
| 4584 | + if (sb_reshape_pos < conf->cluster_sync_low) |
---|
| 4585 | + conf->cluster_sync_low = sb_reshape_pos; |
---|
| 4586 | + } |
---|
| 4587 | + |
---|
| 4588 | + md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, |
---|
| 4589 | + conf->cluster_sync_high); |
---|
| 4590 | + } |
---|
| 4591 | + |
---|
4578 | 4592 | /* Now find the locations in the new layout */ |
---|
4579 | 4593 | __raid10_find_phys(&conf->geo, r10_bio); |
---|
4580 | 4594 | |
---|
.. | .. |
---|
4631 | 4645 | md_sync_acct_bio(read_bio, r10_bio->sectors); |
---|
4632 | 4646 | atomic_inc(&r10_bio->remaining); |
---|
4633 | 4647 | read_bio->bi_next = NULL; |
---|
4634 | | - generic_make_request(read_bio); |
---|
| 4648 | + submit_bio_noacct(read_bio); |
---|
4635 | 4649 | sectors_done += nr_sectors; |
---|
4636 | 4650 | if (sector_nr <= last) |
---|
4637 | 4651 | goto read_more; |
---|
.. | .. |
---|
4694 | 4708 | md_sync_acct_bio(b, r10_bio->sectors); |
---|
4695 | 4709 | atomic_inc(&r10_bio->remaining); |
---|
4696 | 4710 | b->bi_next = NULL; |
---|
4697 | | - generic_make_request(b); |
---|
| 4711 | + submit_bio_noacct(b); |
---|
4698 | 4712 | } |
---|
4699 | 4713 | end_reshape_request(r10_bio); |
---|
4700 | 4714 | } |
---|
.. | .. |
---|
4712 | 4726 | conf->reshape_safe = MaxSector; |
---|
4713 | 4727 | spin_unlock_irq(&conf->device_lock); |
---|
4714 | 4728 | |
---|
4715 | | - /* read-ahead size must cover two whole stripes, which is |
---|
4716 | | - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices |
---|
4717 | | - */ |
---|
4718 | | - if (conf->mddev->queue) { |
---|
4719 | | - int stripe = conf->geo.raid_disks * |
---|
4720 | | - ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); |
---|
4721 | | - stripe /= conf->geo.near_copies; |
---|
4722 | | - if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) |
---|
4723 | | - conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; |
---|
4724 | | - } |
---|
| 4729 | + if (conf->mddev->queue) |
---|
| 4730 | + raid10_set_io_opt(conf); |
---|
4725 | 4731 | conf->fullsync = 0; |
---|
| 4732 | +} |
---|
| 4733 | + |
---|
| 4734 | +static void raid10_update_reshape_pos(struct mddev *mddev) |
---|
| 4735 | +{ |
---|
| 4736 | + struct r10conf *conf = mddev->private; |
---|
| 4737 | + sector_t lo, hi; |
---|
| 4738 | + |
---|
| 4739 | + md_cluster_ops->resync_info_get(mddev, &lo, &hi); |
---|
| 4740 | + if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) |
---|
| 4741 | + || mddev->reshape_position == MaxSector) |
---|
| 4742 | + conf->reshape_progress = mddev->reshape_position; |
---|
| 4743 | + else |
---|
| 4744 | + WARN_ON_ONCE(1); |
---|
4726 | 4745 | } |
---|
4727 | 4746 | |
---|
4728 | 4747 | static int handle_reshape_read_error(struct mddev *mddev, |
---|
.. | .. |
---|
4736 | 4755 | int idx = 0; |
---|
4737 | 4756 | struct page **pages; |
---|
4738 | 4757 | |
---|
4739 | | - r10b = kmalloc(sizeof(*r10b) + |
---|
4740 | | - sizeof(struct r10dev) * conf->copies, GFP_NOIO); |
---|
| 4758 | + r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); |
---|
4741 | 4759 | if (!r10b) { |
---|
4742 | 4760 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
---|
4743 | 4761 | return -ENOMEM; |
---|
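The open-coded `sizeof(*r10b) + sizeof(struct r10dev) * conf->copies` is replaced by struct_size(), which computes the size of a structure ending in a trailing (flexible) array while saturating instead of wrapping on multiplication overflow. A generic illustration of the idiom, using a hypothetical struct rather than the raid10 one:

```c
#include <linux/overflow.h>     /* struct_size() */
#include <linux/slab.h>

struct item_set {
        unsigned int    count;
        u64             items[];        /* trailing flexible array member */
};

/* Equivalent to kmalloc(sizeof(*s) + n * sizeof(u64), GFP_KERNEL), but the
 * size computation cannot silently overflow. */
struct item_set *s = kmalloc(struct_size(s, items, n), GFP_KERNEL);
```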
.. | .. |
---|
4893 | 4911 | .check_reshape = raid10_check_reshape, |
---|
4894 | 4912 | .start_reshape = raid10_start_reshape, |
---|
4895 | 4913 | .finish_reshape = raid10_finish_reshape, |
---|
4896 | | - .congested = raid10_congested, |
---|
| 4914 | + .update_reshape_pos = raid10_update_reshape_pos, |
---|
4897 | 4915 | }; |
---|
4898 | 4916 | |
---|
4899 | 4917 | static int __init raid_init(void) |
---|