.. | .. |
---|
17 | 17 | #include <linux/sched/clock.h> |
---|
18 | 18 | #include <trace/events/bcache.h> |
---|
19 | 19 | |
---|
| 20 | +static void update_gc_after_writeback(struct cache_set *c) |
---|
| 21 | +{ |
---|
| 22 | + if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || |
---|
| 23 | + c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) |
---|
| 24 | + return; |
---|
| 25 | + |
---|
| 26 | + c->gc_after_writeback |= BCH_DO_AUTO_GC; |
---|
| 27 | +} |
---|
| 28 | + |
---|
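
The two-flag handshake added above works as follows: the user opts in by setting BCH_ENABLE_AUTO_GC (via sysfs in bcache), update_gc_after_writeback() latches BCH_DO_AUTO_GC once the in-use ratio crosses BCH_AUTO_GC_DIRTY_THRESHOLD, and the writeback thread later wakes gc only when both bits are set. A minimal userspace sketch of that handshake follows; the flag values, the threshold and the struct are placeholders for illustration, not the real bcache definitions.

```c
/*
 * Illustrative userspace model of the gc_after_writeback handshake.
 * Flag values, threshold and struct are placeholders, not bcache's.
 */
#include <stdio.h>

#define BCH_ENABLE_AUTO_GC		1	/* set by the user (sysfs knob) */
#define BCH_DO_AUTO_GC			2	/* latched when dirty usage is high */
#define BCH_AUTO_GC_DIRTY_THRESHOLD	50	/* percent in use, placeholder */

struct cache_set_model {
	unsigned int gc_after_writeback;
	unsigned int in_use;		/* models c->gc_stats.in_use */
};

/*
 * Mirrors update_gc_after_writeback(): latch BCH_DO_AUTO_GC only when
 * auto gc is enabled (and not already latched) and dirty usage is high.
 */
static void model_update_gc_after_writeback(struct cache_set_model *c)
{
	if (c->gc_after_writeback != BCH_ENABLE_AUTO_GC ||
	    c->in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
		return;

	c->gc_after_writeback |= BCH_DO_AUTO_GC;
}

int main(void)
{
	struct cache_set_model c = {
		.gc_after_writeback = BCH_ENABLE_AUTO_GC,	/* user enabled auto gc */
		.in_use = 63,					/* 63% of buckets in use */
	};

	model_update_gc_after_writeback(&c);

	/* The writeback thread only wakes gc when both bits are set. */
	if (c.gc_after_writeback == (BCH_ENABLE_AUTO_GC | BCH_DO_AUTO_GC))
		printf("gc would be woken up after writeback finishes\n");

	return 0;
}
```
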
20 | 29 | /* Rate limiting */ |
---|
21 | 30 | static uint64_t __calc_target_rate(struct cached_dev *dc) |
---|
22 | 31 | { |
---|
.. | .. |
---|
26 | 35 | * This is the size of the cache, minus the amount used for |
---|
27 | 36 | * flash-only devices |
---|
28 | 37 | */ |
---|
29 | | - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - |
---|
| 38 | + uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size - |
---|
30 | 39 | atomic_long_read(&c->flash_dev_dirty_sectors); |
---|
31 | 40 | |
---|
32 | 41 | /* |
---|
.. | .. |
---|
110 | 119 | dc->writeback_rate_target = target; |
---|
111 | 120 | } |
---|
112 | 121 | |
---|
| 122 | +static bool idle_counter_exceeded(struct cache_set *c) |
---|
| 123 | +{ |
---|
| 124 | + int counter, dev_nr; |
---|
| 125 | + |
---|
| 126 | + /* |
---|
| 127 | + * If c->idle_counter overflows (idle for a really long time), |
---|
| 128 | + * reset it to 0 and do not set the maximum rate this time, for |
---|
| 129 | + * code simplicity. |
---|
| 130 | + */ |
---|
| 131 | + counter = atomic_inc_return(&c->idle_counter); |
---|
| 132 | + if (counter <= 0) { |
---|
| 133 | + atomic_set(&c->idle_counter, 0); |
---|
| 134 | + return false; |
---|
| 135 | + } |
---|
| 136 | + |
---|
| 137 | + dev_nr = atomic_read(&c->attached_dev_nr); |
---|
| 138 | + if (dev_nr == 0) |
---|
| 139 | + return false; |
---|
| 140 | + |
---|
| 141 | + /* |
---|
| 142 | + * c->idle_counter is increased by the writeback threads of all |
---|
| 143 | + * attached backing devices; in order to represent a rough |
---|
| 144 | + * time period, the counter should be divided by dev_nr. |
---|
| 145 | + * Otherwise the idle time could not grow larger as more backing |
---|
| 146 | + * devices are attached. |
---|
| 147 | + * The following calculation is equivalent to checking |
---|
| 148 | + * (counter / dev_nr) < (dev_nr * 6) |
---|
| 149 | + */ |
---|
| 150 | + if (counter < (dev_nr * dev_nr * 6)) |
---|
| 151 | + return false; |
---|
| 152 | + |
---|
| 153 | + return true; |
---|
| 154 | +} |
---|
| 155 | + |
---|
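
The threshold check in idle_counter_exceeded() replaces the division mentioned in the comment with a multiplication: for non-negative counter and positive dev_nr, `counter < dev_nr * dev_nr * 6` and `counter / dev_nr < dev_nr * 6` select exactly the same values. A small userspace check of that equivalence (arbitrary test ranges, not bcache code):

```c
/*
 * Quick check that the multiplication used in idle_counter_exceeded()
 * matches the integer division it stands for.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	int dev_nr, counter;

	for (dev_nr = 1; dev_nr <= 8; dev_nr++) {
		for (counter = 0; counter <= 1000; counter++) {
			int by_mul = counter < dev_nr * dev_nr * 6;
			int by_div = (counter / dev_nr) < dev_nr * 6;

			assert(by_mul == by_div);
		}
	}

	/*
	 * e.g. with 2 attached devices the writeback rate is only maxed
	 * out once the shared counter reaches 2 * 2 * 6 = 24 increments.
	 */
	printf("equivalence holds; threshold for dev_nr = 2 is %d\n", 2 * 2 * 6);
	return 0;
}
```
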
| 156 | +/* |
---|
| 157 | + * Idle_counter is increased every time update_writeback_rate() is |
---|
| 158 | + * called. If all backing devices attached to the same cache set have |
---|
| 159 | + * identical dc->writeback_rate_update_seconds values, it is about 6 |
---|
| 160 | + * rounds of update_writeback_rate() on each backing device before |
---|
| 161 | + * c->at_max_writeback_rate is set to 1, and then the max writeback |
---|
| 162 | + * rate is set to each dc->writeback_rate.rate. |
---|
| 163 | + * In order to avoid the extra locking cost of counting the exact |
---|
| 164 | + * number of dirty cached devices, c->attached_dev_nr is used to |
---|
| 165 | + * calculate the idle threshold. It might be bigger if not all cached |
---|
| 166 | + * devices are in writeback mode, but it still works well with limited |
---|
| 167 | + * extra rounds of update_writeback_rate(). |
---|
| 168 | + */ |
---|
113 | 169 | static bool set_at_max_writeback_rate(struct cache_set *c, |
---|
114 | 170 | struct cached_dev *dc) |
---|
115 | 171 | { |
---|
116 | | - /* |
---|
117 | | - * Idle_counter is increased everytime when update_writeback_rate() is |
---|
118 | | - * called. If all backing devices attached to the same cache set have |
---|
119 | | - * identical dc->writeback_rate_update_seconds values, it is about 6 |
---|
120 | | - * rounds of update_writeback_rate() on each backing device before |
---|
121 | | - * c->at_max_writeback_rate is set to 1, and then max wrteback rate set |
---|
122 | | - * to each dc->writeback_rate.rate. |
---|
123 | | - * In order to avoid extra locking cost for counting exact dirty cached |
---|
124 | | - * devices number, c->attached_dev_nr is used to calculate the idle |
---|
125 | | - * throushold. It might be bigger if not all cached device are in write- |
---|
126 | | - * back mode, but it still works well with limited extra rounds of |
---|
127 | | - * update_writeback_rate(). |
---|
128 | | - */ |
---|
129 | | - if (atomic_inc_return(&c->idle_counter) < |
---|
130 | | - atomic_read(&c->attached_dev_nr) * 6) |
---|
| 172 | + /* Don't set max writeback rate if it is disabled */ |
---|
| 173 | + if (!c->idle_max_writeback_rate_enabled) |
---|
| 174 | + return false; |
---|
| 175 | + |
---|
| 176 | + /* Don't set max writeback rate if gc is running */ |
---|
| 177 | + if (!c->gc_mark_valid) |
---|
| 178 | + return false; |
---|
| 179 | + |
---|
| 180 | + if (!idle_counter_exceeded(c)) |
---|
131 | 181 | return false; |
---|
132 | 182 | |
---|
133 | 183 | if (atomic_read(&c->at_max_writeback_rate) != 1) |
---|
.. | .. |
---|
141 | 191 | dc->writeback_rate_change = 0; |
---|
142 | 192 | |
---|
143 | 193 | /* |
---|
144 | | - * Check c->idle_counter and c->at_max_writeback_rate agagain in case |
---|
145 | | - * new I/O arrives during before set_at_max_writeback_rate() returns. |
---|
146 | | - * Then the writeback rate is set to 1, and its new value should be |
---|
147 | | - * decided via __update_writeback_rate(). |
---|
| 194 | + * Check again in case new I/O arrives before |
---|
| 195 | + * set_at_max_writeback_rate() returns. |
---|
148 | 196 | */ |
---|
149 | | - if ((atomic_read(&c->idle_counter) < |
---|
150 | | - atomic_read(&c->attached_dev_nr) * 6) || |
---|
| 197 | + if (!idle_counter_exceeded(c) || |
---|
151 | 198 | !atomic_read(&c->at_max_writeback_rate)) |
---|
152 | 199 | return false; |
---|
153 | 200 | |
---|
.. | .. |
---|
167 | 214 | */ |
---|
168 | 215 | set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); |
---|
169 | 216 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ |
---|
170 | | - smp_mb(); |
---|
| 217 | + smp_mb__after_atomic(); |
---|
171 | 218 | |
---|
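
Replacing smp_mb() with smp_mb__after_atomic() relies on the barrier immediately following an atomic bitop (set_bit()/clear_bit()), which is exactly what that primitive is for; it can be cheaper on architectures where the atomic op already implies ordering. Below is a hedged userspace model of the same "publish a flag, fence, then read the other side's flag" pattern, with C11 atomics standing in for the kernel bitops and barrier; the flag names are made up, not the bcache ones.

```c
/*
 * Userspace model of the flag/barrier pattern: each side publishes its
 * own flag, issues a full fence right after the store, then reads the
 * other side's flag.  In the kernel the stores are set_bit()/clear_bit()
 * and the fence is smp_mb__after_atomic().
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int worker_running;
static atomic_int stop_requested;

static void *rate_worker(void *arg)
{
	(void)arg;

	atomic_store_explicit(&worker_running, 1, memory_order_relaxed);
	/* plays the role of smp_mb__after_atomic() after set_bit() */
	atomic_thread_fence(memory_order_seq_cst);

	if (!atomic_load_explicit(&stop_requested, memory_order_relaxed)) {
		/* ... one round of periodic work would run here ... */
	}

	atomic_store_explicit(&worker_running, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, rate_worker, NULL);

	atomic_store_explicit(&stop_requested, 1, memory_order_relaxed);
	/* pairs with the fence in rate_worker() */
	atomic_thread_fence(memory_order_seq_cst);
	while (atomic_load_explicit(&worker_running, memory_order_relaxed))
		;	/* wait until the worker is out of its critical part */

	pthread_join(tid, NULL);
	printf("done\n");
	return 0;
}
```
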
172 | 219 | /* |
---|
173 | 220 | * CACHE_SET_IO_DISABLE might be set via sysfs interface, |
---|
.. | .. |
---|
177 | 224 | test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { |
---|
178 | 225 | clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); |
---|
179 | 226 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ |
---|
180 | | - smp_mb(); |
---|
| 227 | + smp_mb__after_atomic(); |
---|
181 | 228 | return; |
---|
182 | 229 | } |
---|
183 | 230 | |
---|
.. | .. |
---|
191 | 238 | if (!set_at_max_writeback_rate(c, dc)) { |
---|
192 | 239 | down_read(&dc->writeback_lock); |
---|
193 | 240 | __update_writeback_rate(dc); |
---|
| 241 | + update_gc_after_writeback(c); |
---|
194 | 242 | up_read(&dc->writeback_lock); |
---|
195 | 243 | } |
---|
196 | 244 | } |
---|
.. | .. |
---|
212 | 260 | */ |
---|
213 | 261 | clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); |
---|
214 | 262 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ |
---|
215 | | - smp_mb(); |
---|
| 263 | + smp_mb__after_atomic(); |
---|
216 | 264 | } |
---|
217 | 265 | |
---|
218 | 266 | static unsigned int writeback_delay(struct cached_dev *dc, |
---|
.. | .. |
---|
442 | 490 | for (i = 0; i < nk; i++) { |
---|
443 | 491 | w = keys[i]; |
---|
444 | 492 | |
---|
445 | | - io = kzalloc(sizeof(struct dirty_io) + |
---|
446 | | - sizeof(struct bio_vec) * |
---|
447 | | - DIV_ROUND_UP(KEY_SIZE(&w->key), |
---|
448 | | - PAGE_SECTORS), |
---|
| 493 | + io = kzalloc(struct_size(io, bio.bi_inline_vecs, |
---|
| 494 | + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), |
---|
449 | 495 | GFP_KERNEL); |
---|
450 | 496 | if (!io) |
---|
451 | 497 | goto err; |
---|
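
struct_size(io, bio.bi_inline_vecs, n) computes sizeof(*io) plus n trailing bio_vec entries and saturates the arithmetic on overflow, replacing the open-coded sizeof expression removed above. A userspace sketch of the same header-plus-flexible-array sizing idea, with made-up stand-in types for struct dirty_io / struct bio_vec:

```c
/*
 * Userspace sketch of the sizing pattern that struct_size() expresses:
 * a header struct followed by a flexible array of vectors.  The kernel
 * macro additionally saturates the arithmetic on overflow.
 */
#include <stdio.h>
#include <stdlib.h>

struct example_vec {
	void *page;
	unsigned int len;
	unsigned int offset;
};

struct example_io {
	unsigned int nr_vecs;
	struct example_vec vecs[];	/* flexible array member */
};

static struct example_io *example_io_alloc(unsigned int nr_vecs)
{
	/* open-coded equivalent of kzalloc(struct_size(io, vecs, nr_vecs), ...) */
	struct example_io *io = calloc(1, sizeof(*io) +
				       (size_t)nr_vecs * sizeof(io->vecs[0]));

	if (io)
		io->nr_vecs = nr_vecs;
	return io;
}

int main(void)
{
	struct example_io *io = example_io_alloc(4);

	if (!io)
		return 1;
	printf("allocated header plus %u inline vectors\n", io->nr_vecs);
	free(io);
	return 0;
}
```
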
.. | .. |
---|
693 | 739 | up_write(&dc->writeback_lock); |
---|
694 | 740 | break; |
---|
695 | 741 | } |
---|
| 742 | + |
---|
| 743 | + /* |
---|
| 744 | + * When the dirty data ratio is high (e.g. 50%+), there |
---|
| 745 | + * might be heavy bucket fragmentation after writeback |
---|
| 746 | + * finishes, which hurts subsequent write performance. |
---|
| 747 | + * If users really care about write performance they |
---|
| 748 | + * may set BCH_ENABLE_AUTO_GC via sysfs; then, when |
---|
| 749 | + * BCH_DO_AUTO_GC is set, the garbage collection thread |
---|
| 750 | + * will be woken up here. After the moving gc, the shrunk |
---|
| 751 | + * btree and the discarded free bucket space on the SSD |
---|
| 752 | + * may help subsequent write requests. |
---|
| 753 | + */ |
---|
| 754 | + if (c->gc_after_writeback == |
---|
| 755 | + (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { |
---|
| 756 | + c->gc_after_writeback &= ~BCH_DO_AUTO_GC; |
---|
| 757 | + force_wake_up_gc(c); |
---|
| 758 | + } |
---|
696 | 759 | } |
---|
697 | 760 | |
---|
698 | 761 | up_write(&dc->writeback_lock); |
---|
.. | .. |
---|
724 | 787 | |
---|
725 | 788 | /* Init */ |
---|
726 | 789 | #define INIT_KEYS_EACH_TIME 500000 |
---|
727 | | -#define INIT_KEYS_SLEEP_MS 100 |
---|
728 | 790 | |
---|
729 | 791 | struct sectors_dirty_init { |
---|
730 | 792 | struct btree_op op; |
---|
731 | 793 | unsigned int inode; |
---|
732 | 794 | size_t count; |
---|
733 | | - struct bkey start; |
---|
734 | 795 | }; |
---|
735 | 796 | |
---|
736 | 797 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, |
---|
.. | .. |
---|
746 | 807 | KEY_START(k), KEY_SIZE(k)); |
---|
747 | 808 | |
---|
748 | 809 | op->count++; |
---|
749 | | - if (atomic_read(&b->c->search_inflight) && |
---|
750 | | - !(op->count % INIT_KEYS_EACH_TIME)) { |
---|
751 | | - bkey_copy_key(&op->start, k); |
---|
752 | | - return -EAGAIN; |
---|
753 | | - } |
---|
| 810 | + if (!(op->count % INIT_KEYS_EACH_TIME)) |
---|
| 811 | + cond_resched(); |
---|
754 | 812 | |
---|
755 | 813 | return MAP_CONTINUE; |
---|
756 | 814 | } |
---|
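
The -EAGAIN/retry dance is replaced by simply yielding the CPU every INIT_KEYS_EACH_TIME keys. A rough userspace sketch of that "process a batch, then yield" pattern, with sched_yield() standing in for cond_resched() and the key traversal reduced to a plain counter:

```c
/* Sketch of batching work and yielding periodically. */
#include <sched.h>
#include <stdio.h>

#define KEYS_EACH_TIME	500000

static void count_keys(unsigned long nr_keys)
{
	unsigned long count = 0;
	unsigned long i;

	for (i = 0; i < nr_keys; i++) {
		count++;			/* visit one key */
		if (!(count % KEYS_EACH_TIME))
			sched_yield();		/* give other tasks a chance */
	}
	printf("visited %lu keys\n", count);
}

int main(void)
{
	count_keys(2000000);
	return 0;
}
```
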
757 | 815 | |
---|
758 | | -void bch_sectors_dirty_init(struct bcache_device *d) |
---|
| 816 | +static int bch_root_node_dirty_init(struct cache_set *c, |
---|
| 817 | + struct bcache_device *d, |
---|
| 818 | + struct bkey *k) |
---|
759 | 819 | { |
---|
760 | 820 | struct sectors_dirty_init op; |
---|
761 | 821 | int ret; |
---|
.. | .. |
---|
763 | 823 | bch_btree_op_init(&op.op, -1); |
---|
764 | 824 | op.inode = d->id; |
---|
765 | 825 | op.count = 0; |
---|
766 | | - op.start = KEY(op.inode, 0, 0); |
---|
767 | 826 | |
---|
768 | | - do { |
---|
769 | | - ret = bch_btree_map_keys(&op.op, d->c, &op.start, |
---|
770 | | - sectors_dirty_init_fn, 0); |
---|
771 | | - if (ret == -EAGAIN) |
---|
772 | | - schedule_timeout_interruptible( |
---|
773 | | - msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); |
---|
774 | | - else if (ret < 0) { |
---|
775 | | - pr_warn("sectors dirty init failed, ret=%d!", ret); |
---|
776 | | - break; |
---|
| 827 | + ret = bcache_btree(map_keys_recurse, |
---|
| 828 | + k, |
---|
| 829 | + c->root, |
---|
| 830 | + &op.op, |
---|
| 831 | + &KEY(op.inode, 0, 0), |
---|
| 832 | + sectors_dirty_init_fn, |
---|
| 833 | + 0); |
---|
| 834 | + if (ret < 0) |
---|
| 835 | + pr_warn("sectors dirty init failed, ret=%d!\n", ret); |
---|
| 836 | + |
---|
| 837 | + /* |
---|
| 838 | + * The op may be added to cache_set's btree_cache_wait |
---|
| 839 | + * in mca_cannibalize(); we must ensure it is removed from |
---|
| 840 | + * the list and that btree_cache_alloc_lock is released |
---|
| 841 | + * before freeing the op memory. |
---|
| 842 | + * Otherwise, the btree_cache_wait list will be corrupted. |
---|
| 843 | + */ |
---|
| 844 | + bch_cannibalize_unlock(c); |
---|
| 845 | + finish_wait(&c->btree_cache_wait, &(&op.op)->wait); |
---|
| 846 | + |
---|
| 847 | + return ret; |
---|
| 848 | +} |
---|
| 849 | + |
---|
| 850 | +static int bch_dirty_init_thread(void *arg) |
---|
| 851 | +{ |
---|
| 852 | + struct dirty_init_thrd_info *info = arg; |
---|
| 853 | + struct bch_dirty_init_state *state = info->state; |
---|
| 854 | + struct cache_set *c = state->c; |
---|
| 855 | + struct btree_iter iter; |
---|
| 856 | + struct bkey *k, *p; |
---|
| 857 | + int cur_idx, prev_idx, skip_nr; |
---|
| 858 | + |
---|
| 859 | + k = p = NULL; |
---|
| 860 | + cur_idx = prev_idx = 0; |
---|
| 861 | + |
---|
| 862 | + bch_btree_iter_init(&c->root->keys, &iter, NULL); |
---|
| 863 | + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); |
---|
| 864 | + BUG_ON(!k); |
---|
| 865 | + |
---|
| 866 | + p = k; |
---|
| 867 | + |
---|
| 868 | + while (k) { |
---|
| 869 | + spin_lock(&state->idx_lock); |
---|
| 870 | + cur_idx = state->key_idx; |
---|
| 871 | + state->key_idx++; |
---|
| 872 | + spin_unlock(&state->idx_lock); |
---|
| 873 | + |
---|
| 874 | + skip_nr = cur_idx - prev_idx; |
---|
| 875 | + |
---|
| 876 | + while (skip_nr) { |
---|
| 877 | + k = bch_btree_iter_next_filter(&iter, |
---|
| 878 | + &c->root->keys, |
---|
| 879 | + bch_ptr_bad); |
---|
| 880 | + if (k) |
---|
| 881 | + p = k; |
---|
| 882 | + else { |
---|
| 883 | + atomic_set(&state->enough, 1); |
---|
| 884 | + /* Update state->enough earlier */ |
---|
| 885 | + smp_mb__after_atomic(); |
---|
| 886 | + goto out; |
---|
| 887 | + } |
---|
| 888 | + skip_nr--; |
---|
777 | 889 | } |
---|
778 | | - } while (ret == -EAGAIN); |
---|
| 890 | + |
---|
| 891 | + if (p) { |
---|
| 892 | + if (bch_root_node_dirty_init(c, state->d, p) < 0) |
---|
| 893 | + goto out; |
---|
| 894 | + } |
---|
| 895 | + |
---|
| 896 | + p = NULL; |
---|
| 897 | + prev_idx = cur_idx; |
---|
| 898 | + } |
---|
| 899 | + |
---|
| 900 | +out: |
---|
| 901 | + /* In order to wake up state->wait in time */ |
---|
| 902 | + smp_mb__before_atomic(); |
---|
| 903 | + if (atomic_dec_and_test(&state->started)) |
---|
| 904 | + wake_up(&state->wait); |
---|
| 905 | + |
---|
| 906 | + return 0; |
---|
| 907 | +} |
---|
| 908 | + |
---|
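
Each dirty-init worker claims the next root-key index under state->idx_lock, skips its iterator forward to that index, and then recurses into the corresponding subtree, so the root node's children are spread across the threads without further coordination. A simplified pthreads model of that index-claiming loop (names and the per-key work are made up for the illustration):

```c
/*
 * Simplified model of how the dirty-init workers divide the root node's
 * keys: claim an index under a lock, then process that subtree.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_KEYS		32
#define NR_WORKERS	4

struct shared_state {
	pthread_mutex_t idx_lock;
	int key_idx;			/* next root-key index to hand out */
};

static struct shared_state state = {
	.idx_lock = PTHREAD_MUTEX_INITIALIZER,
};

static void *dirty_init_worker(void *arg)
{
	long id = (long)arg;

	for (;;) {
		int cur_idx;

		/* claim the next unprocessed root-key index */
		pthread_mutex_lock(&state.idx_lock);
		cur_idx = state.key_idx++;
		pthread_mutex_unlock(&state.idx_lock);

		if (cur_idx >= NR_KEYS)		/* models state->enough */
			break;

		/*
		 * In bcache this is where the worker skips its iterator
		 * forward to cur_idx and calls bch_root_node_dirty_init()
		 * on that subtree.
		 */
		printf("worker %ld counts dirty sectors under root key %d\n",
		       id, cur_idx);
	}
	return NULL;
}

int main(void)
{
	pthread_t tids[NR_WORKERS];
	long i;

	for (i = 0; i < NR_WORKERS; i++)
		pthread_create(&tids[i], NULL, dirty_init_worker, (void *)i);
	for (i = 0; i < NR_WORKERS; i++)
		pthread_join(tids[i], NULL);
	return 0;
}
```
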
| 909 | +static int bch_btre_dirty_init_thread_nr(void) |
---|
| 910 | +{ |
---|
| 911 | + int n = num_online_cpus()/2; |
---|
| 912 | + |
---|
| 913 | + if (n == 0) |
---|
| 914 | + n = 1; |
---|
| 915 | + else if (n > BCH_DIRTY_INIT_THRD_MAX) |
---|
| 916 | + n = BCH_DIRTY_INIT_THRD_MAX; |
---|
| 917 | + |
---|
| 918 | + return n; |
---|
| 919 | +} |
---|
| 920 | + |
---|
| 921 | +void bch_sectors_dirty_init(struct bcache_device *d) |
---|
| 922 | +{ |
---|
| 923 | + int i; |
---|
| 924 | + struct bkey *k = NULL; |
---|
| 925 | + struct btree_iter iter; |
---|
| 926 | + struct sectors_dirty_init op; |
---|
| 927 | + struct cache_set *c = d->c; |
---|
| 928 | + struct bch_dirty_init_state state; |
---|
| 929 | + |
---|
| 930 | + /* Just count root keys if no leaf node */ |
---|
| 931 | + rw_lock(0, c->root, c->root->level); |
---|
| 932 | + if (c->root->level == 0) { |
---|
| 933 | + bch_btree_op_init(&op.op, -1); |
---|
| 934 | + op.inode = d->id; |
---|
| 935 | + op.count = 0; |
---|
| 936 | + |
---|
| 937 | + for_each_key_filter(&c->root->keys, |
---|
| 938 | + k, &iter, bch_ptr_invalid) |
---|
| 939 | + sectors_dirty_init_fn(&op.op, c->root, k); |
---|
| 940 | + |
---|
| 941 | + rw_unlock(0, c->root); |
---|
| 942 | + return; |
---|
| 943 | + } |
---|
| 944 | + |
---|
| 945 | + memset(&state, 0, sizeof(struct bch_dirty_init_state)); |
---|
| 946 | + state.c = c; |
---|
| 947 | + state.d = d; |
---|
| 948 | + state.total_threads = bch_btre_dirty_init_thread_nr(); |
---|
| 949 | + state.key_idx = 0; |
---|
| 950 | + spin_lock_init(&state.idx_lock); |
---|
| 951 | + atomic_set(&state.started, 0); |
---|
| 952 | + atomic_set(&state.enough, 0); |
---|
| 953 | + init_waitqueue_head(&state.wait); |
---|
| 954 | + |
---|
| 955 | + for (i = 0; i < state.total_threads; i++) { |
---|
| 956 | + /* Fetch latest state.enough earlier */ |
---|
| 957 | + smp_mb__before_atomic(); |
---|
| 958 | + if (atomic_read(&state.enough)) |
---|
| 959 | + break; |
---|
| 960 | + |
---|
| 961 | + state.infos[i].state = &state; |
---|
| 962 | + state.infos[i].thread = |
---|
| 963 | + kthread_run(bch_dirty_init_thread, &state.infos[i], |
---|
| 964 | + "bch_dirtcnt[%d]", i); |
---|
| 965 | + if (IS_ERR(state.infos[i].thread)) { |
---|
| 966 | + pr_err("fails to run thread bch_dirty_init[%d]\n", i); |
---|
| 967 | + for (--i; i >= 0; i--) |
---|
| 968 | + kthread_stop(state.infos[i].thread); |
---|
| 969 | + goto out; |
---|
| 970 | + } |
---|
| 971 | + atomic_inc(&state.started); |
---|
| 972 | + } |
---|
| 973 | + |
---|
| 974 | +out: |
---|
| 975 | + /* Must wait for all threads to stop. */ |
---|
| 976 | + wait_event(state.wait, atomic_read(&state.started) == 0); |
---|
| 977 | + rw_unlock(0, c->root); |
---|
779 | 978 | } |
---|
780 | 979 | |
---|
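
bch_sectors_dirty_init() counts the workers it managed to start in state.started and then sleeps on state.wait until the last worker drops the count to zero. A userspace sketch of that startup/teardown accounting, using a mutex/condvar pair in place of the kernel's atomic counter and waitqueue (the increment is moved before thread creation here to keep the toy version trivially race-free):

```c
/*
 * Sketch of "count started workers, wait until the last one signals".
 */
#include <pthread.h>
#include <stdio.h>

#define NR_WORKERS	3

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_done = PTHREAD_COND_INITIALIZER;
static int started;

static void *dirty_init_worker(void *arg)
{
	(void)arg;
	/* ... the per-thread dirty counting would run here ... */

	pthread_mutex_lock(&lock);
	if (--started == 0)			/* models atomic_dec_and_test() */
		pthread_cond_signal(&all_done);	/* models wake_up(&state->wait) */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tids[NR_WORKERS];
	int i;

	for (i = 0; i < NR_WORKERS; i++) {
		pthread_mutex_lock(&lock);
		started++;			/* models atomic_inc(&state.started) */
		pthread_mutex_unlock(&lock);
		pthread_create(&tids[i], NULL, dirty_init_worker, NULL);
	}

	/* models wait_event(state.wait, atomic_read(&state.started) == 0) */
	pthread_mutex_lock(&lock);
	while (started != 0)
		pthread_cond_wait(&all_done, &lock);
	pthread_mutex_unlock(&lock);

	for (i = 0; i < NR_WORKERS; i++)
		pthread_join(tids[i], NULL);
	printf("all dirty-init workers finished\n");
	return 0;
}
```
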
781 | 980 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
---|