2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/fs/fs-writeback.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/fs-writeback.c
  *
@@ -35,10 +36,6 @@
  */
 #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
 
-struct wb_completion {
-	atomic_t		cnt;
-};
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -57,19 +54,6 @@
 	struct list_head list;		/* pending work list */
 	struct wb_completion *done;	/* set if the caller waits */
 };
-
-/*
- * If one wants to wait for one or more wb_writeback_works, each work's
- * ->done should be set to a wb_completion defined using the following
- * macro. Once all work items are issued with wb_queue_work(), the caller
- * can wait for the completion of all using wb_wait_for_completion(). Work
- * items which are waited upon aren't freed automatically on completion.
- */
-#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
-	struct wb_completion cmpl = {					\
-		.cnt		= ATOMIC_INIT(1),			\
-	}
-
 
 /*
  * If an inode is constantly having its pages dirtied, but then the
@@ -181,8 +165,13 @@
 
 	if (work->auto_free)
 		kfree(work);
-	if (done && atomic_dec_and_test(&done->cnt))
-		wake_up_all(&wb->bdi->wb_waitq);
+	if (done) {
+		wait_queue_head_t *waitq = done->waitq;
+
+		/* @done can't be accessed after the following dec */
+		if (atomic_dec_and_test(&done->cnt))
+			wake_up_all(waitq);
+	}
 }
 
 static void wb_queue_work(struct bdi_writeback *wb,
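
Note: after this hunk a wb_completion no longer hard-codes the bdi's wb_waitq; each completion carries a pointer to the wait queue it should be woken on, so finish_writeback_work() only touches @done through that pointer before the final put. The structure and its initializer are not part of these hunks (they move out of this file), so the following is only a sketch of the shape implied by the done->waitq and DEFINE_WB_COMPLETION(done, bdi) usage in this patch, not the header's literal definition.

/* Sketch only, inferred from usage in this patch; not the real header. */
struct wb_completion {
	atomic_t		cnt;	/* one initial ref plus one per queued work */
	wait_queue_head_t	*waitq;	/* typically &bdi->wb_waitq */
};

#define DEFINE_WB_COMPLETION(cmpl, bdi)					\
	struct wb_completion cmpl = {					\
		.cnt	= ATOMIC_INIT(1),				\
		.waitq	= &(bdi)->wb_waitq,				\
	}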
@@ -206,28 +195,44 @@
 
 /**
  * wb_wait_for_completion - wait for completion of bdi_writeback_works
- * @bdi: bdi work items were issued to
  * @done: target wb_completion
  *
  * Wait for one or more work items issued to @bdi with their ->done field
- * set to @done, which should have been defined with
- * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
- * work items are completed. Work items which are waited upon aren't freed
+ * set to @done, which should have been initialized with
+ * DEFINE_WB_COMPLETION(). This function returns after all such work items
+ * are completed. Work items which are waited upon aren't freed
  * automatically on completion.
  */
-static void wb_wait_for_completion(struct backing_dev_info *bdi,
-				   struct wb_completion *done)
+void wb_wait_for_completion(struct wb_completion *done)
 {
 	atomic_dec(&done->cnt);		/* put down the initial count */
-	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+	wait_event(*done->waitq, !atomic_read(&done->cnt));
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
-/* parameters for foreign inode detection, see wb_detach_inode() */
+/*
+ * Parameters for foreign inode detection, see wbc_detach_inode() to see
+ * how they're used.
+ *
+ * These paramters are inherently heuristical as the detection target
+ * itself is fuzzy. All we want to do is detaching an inode from the
+ * current owner if it's being written to by some other cgroups too much.
+ *
+ * The current cgroup writeback is built on the assumption that multiple
+ * cgroups writing to the same inode concurrently is very rare and a mode
+ * of operation which isn't well supported. As such, the goal is not
+ * taking too long when a different cgroup takes over an inode while
+ * avoiding too aggressive flip-flops from occasional foreign writes.
+ *
+ * We record, very roughly, 2s worth of IO time history and if more than
+ * half of that is foreign, trigger the switch. The recording is quantized
+ * to 16 slots. To avoid tiny writes from swinging the decision too much,
+ * writes smaller than 1/8 of avg size are ignored.
+ */
 #define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
 #define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
-#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
 #define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
 
 #define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
@@ -237,6 +242,7 @@
 					/* if foreign slots >= 8, switch */
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect upto 5 slots */
+#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */
 
 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;
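
Note: the WB_FRN_* constants encode the heuristic described in the new comment. IO time is measured in units of 2^-13 s, the history window is 2 s split into 16 slots of about 125 ms each, and a switch needs at least half the window (8 slots, roughly one second of IO time) to be foreign; the WB_FRN_TIME_CUT_DIV change means rounds shorter than 1/8 of the running average are now ignored instead of 1/2. The small standalone illustration below just prints the derived numbers; WB_FRN_HIST_UNIT and WB_FRN_HIST_THR_SLOTS are defined elsewhere in the file and are reproduced here on that assumption.

/* Standalone illustration of what the WB_FRN_* constants work out to. */
#include <stdio.h>

#define WB_FRN_TIME_SHIFT	13				/* 1s ~ 2^13 units */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* ~2s window */
#define WB_FRN_HIST_SLOTS	16
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)

int main(void)
{
	/* each history slot covers 1024 units, i.e. about 125 ms of IO time */
	printf("slot width: %d units (~%d ms)\n", WB_FRN_HIST_UNIT,
	       (WB_FRN_HIST_UNIT * 1000) >> WB_FRN_TIME_SHIFT);
	/* a switch needs >= 8 foreign slots, i.e. ~1 s of mostly foreign IO */
	printf("switch threshold: %d of %d slots\n",
	       WB_FRN_HIST_THR_SLOTS, WB_FRN_HIST_SLOTS);
	return 0;
}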
@@ -352,9 +358,9 @@
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
-	struct radix_tree_iter iter;
+	XA_STATE(xas, &mapping->i_pages, 0);
+	struct page *page;
 	bool switched = false;
-	void **slot;
 
 	/*
 	 * If @inode switches cgwb membership while sync_inodes_sb() is
@@ -389,30 +395,25 @@
 	if (unlikely(inode->i_state & I_FREEING))
 		goto skip_switch;
 
+	trace_inode_switch_wbs(inode, old_wb, new_wb);
+
 	/*
 	 * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
 	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
 	 * pages actually under writeback.
 	 */
-	radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
-				   PAGECACHE_TAG_DIRTY) {
-		struct page *page = radix_tree_deref_slot_protected(slot,
-						&mapping->i_pages.xa_lock);
-		if (likely(page) && PageDirty(page)) {
+	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
+		if (PageDirty(page)) {
 			dec_wb_stat(old_wb, WB_RECLAIMABLE);
 			inc_wb_stat(new_wb, WB_RECLAIMABLE);
 		}
 	}
 
-	radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
-				   PAGECACHE_TAG_WRITEBACK) {
-		struct page *page = radix_tree_deref_slot_protected(slot,
-						&mapping->i_pages.xa_lock);
-		if (likely(page)) {
-			WARN_ON_ONCE(!PageWriteback(page));
-			dec_wb_stat(old_wb, WB_WRITEBACK);
-			inc_wb_stat(new_wb, WB_WRITEBACK);
-		}
+	xas_set(&xas, 0);
+	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
+		WARN_ON_ONCE(!PageWriteback(page));
+		dec_wb_stat(old_wb, WB_WRITEBACK);
+		inc_wb_stat(new_wb, WB_WRITEBACK);
 	}
 
 	wb_get(new_wb);
@@ -496,18 +497,15 @@
 	if (inode->i_state & I_WB_SWITCH)
 		return;
 
-	/*
-	 * Avoid starting new switches while sync_inodes_sb() is in
-	 * progress. Otherwise, if the down_write protected issue path
-	 * blocks heavily, we might end up starting a large number of
-	 * switches which will block on the rwsem.
-	 */
-	if (!down_read_trylock(&bdi->wb_switch_rwsem))
+	/* avoid queueing a new switch if too many are already in flight */
+	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
 		return;
 
 	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
 	if (!isw)
-		goto out_unlock;
+		return;
+
+	atomic_inc(&isw_nr_in_flight);
 
 	/* find and pin the new wb */
 	rcu_read_lock();
@@ -544,17 +542,13 @@
 	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
 	 */
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-
-	atomic_inc(&isw_nr_in_flight);
-
-	goto out_unlock;
+	return;
 
 out_free:
+	atomic_dec(&isw_nr_in_flight);
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
 	kfree(isw);
-out_unlock:
-	up_read(&bdi->wb_switch_rwsem);
 }
 
 /**
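
Note: the two hunks above replace the wb_switch_rwsem read-trylock throttle with a plain cap on in-flight switch attempts: inode_switch_wbs() bails out once more than WB_FRN_MAX_IN_FLIGHT (1024) switches are pending, takes the isw_nr_in_flight reference before anything can fail, and drops it on the error path (the matching decrement when a switch finishes lives outside these hunks). A generic sketch of that bounded-async-work pattern follows; the names are illustrative, not the kernel's.

/*
 * Generic sketch of the pattern: bound asynchronous work with an atomic
 * counter instead of taking a rwsem on the submission path.  Hypothetical
 * names; error handling trimmed.
 */
#include <linux/atomic.h>
#include <linux/workqueue.h>

#define MAX_IN_FLIGHT	1024

static atomic_t nr_in_flight = ATOMIC_INIT(0);

static bool try_queue_async(struct work_struct *work)
{
	if (atomic_read(&nr_in_flight) > MAX_IN_FLIGHT)
		return false;			/* too busy, skip silently */

	atomic_inc(&nr_in_flight);		/* account before queueing */
	if (!queue_work(system_wq, work)) {
		atomic_dec(&nr_in_flight);	/* undo if not queued */
		return false;
	}
	return true;
}

/* the work function must end by calling atomic_dec(&nr_in_flight) */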
@@ -598,6 +592,7 @@
 	if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
 		inode_switch_wbs(inode, wbc->wb_id);
 }
+EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
 
 /**
  * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -695,6 +690,9 @@
 	if (wbc->wb_id != max_id)
 		history |= (1U << slots) - 1;
 
+	if (history)
+		trace_inode_foreign_history(inode, wbc, history);
+
 	/*
 	 * Switch if the current wb isn't the consistent winner.
 	 * If there are multiple closely competing dirtiers, the
@@ -717,9 +715,10 @@
 	wb_put(wbc->wb);
 	wbc->wb = NULL;
 }
+EXPORT_SYMBOL_GPL(wbc_detach_inode);
 
 /**
- * wbc_account_io - account IO issued during writeback
+ * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
  * @wbc: writeback_control of the writeback in progress
  * @page: page being written out
  * @bytes: number of bytes being written out
@@ -728,8 +727,8 @@
 * controlled by @wbc. Keep the book for foreign inode detection. See
 * wbc_detach_inode().
 */
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
-		    size_t bytes)
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+			      size_t bytes)
 {
 	struct cgroup_subsys_state *css;
 	int id;
@@ -740,7 +739,7 @@
 	 * behind a slow cgroup. Ultimately, we want pageout() to kick off
 	 * regular writeback instead of writing things out itself.
 	 */
-	if (!wbc->wb)
+	if (!wbc->wb || wbc->no_cgroup_owner)
 		return;
 
 	css = mem_cgroup_css_from_page(page);
@@ -766,7 +765,7 @@
 	else
 		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
 }
-EXPORT_SYMBOL_GPL(wbc_account_io);
+EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
 
 /**
  * inode_congested - test whether an inode is congested
@@ -856,7 +855,7 @@
 restart:
 	rcu_read_lock();
 	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
-		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+		DEFINE_WB_COMPLETION(fallback_work_done, bdi);
 		struct wb_writeback_work fallback_work;
 		struct wb_writeback_work *work;
 		long nr_pages;
@@ -903,13 +902,96 @@
 		last_wb = wb;
 
 		rcu_read_unlock();
-		wb_wait_for_completion(bdi, &fallback_work_done);
+		wb_wait_for_completion(&fallback_work_done);
 		goto restart;
 	}
 	rcu_read_unlock();
 
 	if (last_wb)
 		wb_put(last_wb);
+}
+
+/**
+ * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
+ * @bdi_id: target bdi id
+ * @memcg_id: target memcg css id
+ * @nr: number of pages to write, 0 for best-effort dirty flushing
+ * @reason: reason why some writeback work initiated
+ * @done: target wb_completion
+ *
+ * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
+ * with the specified parameters.
+ */
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+			   enum wb_reason reason, struct wb_completion *done)
+{
+	struct backing_dev_info *bdi;
+	struct cgroup_subsys_state *memcg_css;
+	struct bdi_writeback *wb;
+	struct wb_writeback_work *work;
+	int ret;
+
+	/* lookup bdi and memcg */
+	bdi = bdi_get_by_id(bdi_id);
+	if (!bdi)
+		return -ENOENT;
+
+	rcu_read_lock();
+	memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
+	if (memcg_css && !css_tryget(memcg_css))
+		memcg_css = NULL;
+	rcu_read_unlock();
+	if (!memcg_css) {
+		ret = -ENOENT;
+		goto out_bdi_put;
+	}
+
+	/*
+	 * And find the associated wb. If the wb isn't there already
+	 * there's nothing to flush, don't create one.
+	 */
+	wb = wb_get_lookup(bdi, memcg_css);
+	if (!wb) {
+		ret = -ENOENT;
+		goto out_css_put;
+	}
+
+	/*
+	 * If @nr is zero, the caller is attempting to write out most of
+	 * the currently dirty pages. Let's take the current dirty page
+	 * count and inflate it by 25% which should be large enough to
+	 * flush out most dirty pages while avoiding getting livelocked by
+	 * concurrent dirtiers.
+	 */
+	if (!nr) {
+		unsigned long filepages, headroom, dirty, writeback;
+
+		mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
+				    &writeback);
+		nr = dirty * 10 / 8;
+	}
+
+	/* issue the writeback work */
+	work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+	if (work) {
+		work->nr_pages = nr;
+		work->sync_mode = WB_SYNC_NONE;
+		work->range_cyclic = 1;
+		work->reason = reason;
+		work->done = done;
+		work->auto_free = 1;
+		wb_queue_work(wb, work);
+		ret = 0;
+	} else {
+		ret = -ENOMEM;
+	}
+
+	wb_put(wb);
+out_css_put:
+	css_put(memcg_css);
+out_bdi_put:
+	bdi_put(bdi);
+	return ret;
 }
 
 /**
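
Note: cgroup_writeback_by_id() only queues the work; it does not wait. The wb is looked up but never created, a zero @nr is turned into roughly 125% of the wb's current dirty count (dirty * 10 / 8), and the work item is auto-freed with completion reported through the caller's wb_completion. A hedged sketch of a caller is below; bdi->id and WB_REASON_FOREIGN_FLUSH come from the same upstream series and are assumed here rather than shown in this patch.

/*
 * Sketch of a caller, assuming bdi->id and WB_REASON_FOREIGN_FLUSH from the
 * same series; cgroup_writeback_by_id() itself returns without waiting.
 */
static int flush_memcg_on_bdi(struct backing_dev_info *bdi, int memcg_id)
{
	DEFINE_WB_COMPLETION(done, bdi);
	int ret;

	/* nr == 0: best-effort, ~125% of the wb's currently dirty pages */
	ret = cgroup_writeback_by_id(bdi->id, memcg_id, 0,
				     WB_REASON_FOREIGN_FLUSH, &done);
	if (!ret)
		wb_wait_for_completion(&done);
	return ret;
}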
@@ -995,7 +1077,6 @@
 static unsigned long get_nr_dirty_pages(void)
 {
 	return global_node_page_state(NR_FILE_DIRTY) +
-		global_node_page_state(NR_UNSTABLE_NFS) +
 		get_nr_dirty_inodes();
 }
 
@@ -1053,6 +1134,7 @@
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&wb->list_lock);
 }
+EXPORT_SYMBOL(inode_io_list_del);
 
 /*
 * mark an inode as under writeback on the sb
@@ -1568,11 +1650,12 @@
 	};
 	unsigned long start_time = jiffies;
 	long write_chunk;
-	long wrote = 0;  /* count both pages and inodes */
+	long total_wrote = 0;  /* count both pages and inodes */
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct bdi_writeback *tmp_wb;
+		long wrote;
 
 		if (inode->i_sb != sb) {
 			if (work->sb) {
16481731
16491732 wbc_detach_inode(&wbc);
16501733 work->nr_pages -= write_chunk - wbc.nr_to_write;
1651
- wrote += write_chunk - wbc.nr_to_write;
1734
+ wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
1735
+ wrote = wrote < 0 ? 0 : wrote;
1736
+ total_wrote += wrote;
16521737
16531738 if (need_resched()) {
16541739 /*
....@@ -1670,7 +1755,7 @@
16701755 tmp_wb = inode_to_wb_and_lock_list(inode);
16711756 spin_lock(&inode->i_lock);
16721757 if (!(inode->i_state & I_DIRTY_ALL))
1673
- wrote++;
1758
+ total_wrote++;
16741759 requeue_inode(inode, tmp_wb, &wbc);
16751760 inode_sync_complete(inode);
16761761 spin_unlock(&inode->i_lock);
....@@ -1684,14 +1769,14 @@
16841769 * bail out to wb_writeback() often enough to check
16851770 * background threshold and other termination conditions.
16861771 */
1687
- if (wrote) {
1772
+ if (total_wrote) {
16881773 if (time_is_before_jiffies(start_time + HZ / 10UL))
16891774 break;
16901775 if (work->nr_pages <= 0)
16911776 break;
16921777 }
16931778 }
1694
- return wrote;
1779
+ return total_wrote;
16951780 }
16961781
16971782 static long __writeback_inodes_wb(struct bdi_writeback *wb,
....@@ -2110,7 +2195,7 @@
21102195 __initcall(start_dirtytime_writeback);
21112196
21122197 int dirtytime_interval_handler(struct ctl_table *table, int write,
2113
- void __user *buffer, size_t *lenp, loff_t *ppos)
2198
+ void *buffer, size_t *lenp, loff_t *ppos)
21142199 {
21152200 int ret;
21162201
@@ -2222,9 +2307,9 @@
 
 	wb = locked_inode_to_wb_and_lock_list(inode);
 
-	WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+	WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) &&
 	     !test_bit(WB_registered, &wb->state),
-	     "bdi-%s not registered\n", wb->bdi->name);
+	     "bdi-%s not registered\n", bdi_dev_name(wb->bdi));
 
 	inode->dirtied_when = jiffies;
 	if (dirtytime)
@@ -2247,7 +2332,8 @@
 			 * to make sure background write-back happens
 			 * later.
 			 */
-			if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+			if (wakeup_bdi &&
+			    (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
 				wb_wakeup_delayed(wb);
 			return;
 		}
@@ -2255,7 +2341,7 @@
 out_unlock_inode:
 	spin_unlock(&inode->i_lock);
 }
-EXPORT_SYMBOL(__mark_inode_dirty);
+EXPORT_SYMBOL_NS(__mark_inode_dirty, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /*
 * The @s_sync_lock is used to serialise concurrent sync operations
@@ -2354,7 +2440,8 @@
 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
 				     enum wb_reason reason, bool skip_if_busy)
 {
-	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct backing_dev_info *bdi = sb->s_bdi;
+	DEFINE_WB_COMPLETION(done, bdi);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
@@ -2363,14 +2450,13 @@
 		.nr_pages	= nr,
 		.reason		= reason,
 	};
-	struct backing_dev_info *bdi = sb->s_bdi;
 
 	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
-	wb_wait_for_completion(bdi, &done);
+	wb_wait_for_completion(&done);
 }
 
 /**
@@ -2421,7 +2507,7 @@
 	__writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
 	up_read(&sb->s_umount);
 }
-EXPORT_SYMBOL(try_to_writeback_inodes_sb);
+EXPORT_SYMBOL_NS(try_to_writeback_inodes_sb, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /**
 * sync_inodes_sb - sync sb inode pages
@@ -2432,7 +2518,8 @@
 */
 void sync_inodes_sb(struct super_block *sb)
 {
-	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct backing_dev_info *bdi = sb->s_bdi;
+	DEFINE_WB_COMPLETION(done, bdi);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
@@ -2442,7 +2529,6 @@
 		.reason		= WB_REASON_SYNC,
 		.for_sync	= 1,
 	};
-	struct backing_dev_info *bdi = sb->s_bdi;
 
 	/*
 	 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2456,7 +2542,7 @@
 	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
 	bdi_down_write_wb_switch_rwsem(bdi);
 	bdi_split_work_to_wbs(bdi, &work, false);
-	wb_wait_for_completion(bdi, &done);
+	wb_wait_for_completion(&done);
 	bdi_up_write_wb_switch_rwsem(bdi);
 
 	wait_sb_inodes(sb);
@@ -2482,13 +2568,13 @@
 		.range_end	= LLONG_MAX,
 	};
 
-	if (!mapping_cap_writeback_dirty(inode->i_mapping))
+	if (!mapping_can_writeback(inode->i_mapping))
 		wbc.nr_to_write = 0;
 
 	might_sleep();
 	return writeback_single_inode(inode, &wbc);
 }
-EXPORT_SYMBOL(write_inode_now);
+EXPORT_SYMBOL_NS(write_inode_now, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /**
 * sync_inode - write an inode and its pages to disk.
@@ -2525,4 +2611,4 @@
 
 	return sync_inode(inode, &wbc);
 }
-EXPORT_SYMBOL(sync_inode_metadata);
+EXPORT_SYMBOL_NS(sync_inode_metadata, ANDROID_GKI_VFS_EXPORT_ONLY);