hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/drivers/md/dm-thin-metadata.c
....@@ -28,7 +28,7 @@
2828 *
2929 * - A hierarchical btree, with 2 levels which effectively maps (thin
3030 * dev id, virtual block) -> block_time. Block time is a 64-bit
31
- * field holding the time in the low 24 bits, and block in the top 48
31
+ * field holding the time in the low 24 bits, and block in the top 40
3232 * bits.
3333 *
3434 * BTrees consist solely of btree_nodes, that fill a block. Some are
....@@ -189,6 +189,15 @@
189189 sector_t data_block_size;
190190
191191 /*
192
+ * Pre-commit callback.
193
+ *
194
+ * This allows the thin provisioning target to run a callback before
195
+ * the metadata are committed.
196
+ */
197
+ dm_pool_pre_commit_fn pre_commit_fn;
198
+ void *pre_commit_context;
199
+
200
+ /*
192201 * We reserve a section of the metadata for commit overhead.
193202 * All reported space does *not* include this.
194203 */
....@@ -200,6 +209,13 @@
200209 * operation possible in this state is the closing of the device.
201210 */
202211 bool fail_io:1;
212
+
213
+ /*
214
+ * Set once a thin-pool has been accessed through one of the interfaces
215
+ * that imply the pool is in-service (e.g. thin devices created/deleted,
216
+ * thin-pool message, metadata snapshots, etc).
217
+ */
218
+ bool in_service:1;
203219
204220 /*
205221 * Reading the space map roots can fail, so we read it into these
....@@ -363,6 +379,31 @@
363379 memcpy(&v2_le, value2_le, sizeof(v2_le));
364380
365381 return v1_le == v2_le;
382
+}
383
+
384
+/*----------------------------------------------------------------*/
385
+
386
+/*
387
+ * Variant that is used for in-core only changes or code that
388
+ * shouldn't put the pool in service on its own (e.g. commit).
389
+ */
390
+static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
391
+ __acquires(pmd->root_lock)
392
+{
393
+ down_write(&pmd->root_lock); /* write-locks root_lock only; pmd->in_service is left untouched */
394
+}
395
+
396
+static inline void pmd_write_lock(struct dm_pool_metadata *pmd) /* write-lock root_lock and mark the pool in service */
397
+{
398
+ pmd_write_lock_in_core(pmd); /* performs the down_write() on pmd->root_lock */
399
+ if (unlikely(!pmd->in_service))
400
+ pmd->in_service = true; /* set with root_lock held; the code that runs pre_commit_fn returns 0 early while this is false */
401
+}
402
+
403
+static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
404
+ __releases(pmd->root_lock)
405
+{
406
+ up_write(&pmd->root_lock); /* used to release after both pmd_write_lock() and pmd_write_lock_in_core() */
366407 }
367408
368409 /*----------------------------------------------------------------*/
....@@ -660,6 +701,15 @@
660701 goto bad_cleanup_data_sm;
661702 }
662703
704
+ /*
705
+ * When opening the pool metadata, setting the roots here is redundant
706
+ * because they are set again in __begin_transaction(). The pool abort
707
+ * path, however, needs the last transaction's roots at this point to
708
+ * avoid accessing a broken btree.
709
+ */
710
+ pmd->root = le64_to_cpu(disk_super->data_mapping_root);
711
+ pmd->details_root = le64_to_cpu(disk_super->device_details_root);
712
+
663713 __setup_btree_details(pmd);
664714 dm_bm_unlock(sblock);
665715
....@@ -712,13 +762,15 @@
712762 return r;
713763 }
714764
715
-static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
765
+static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
766
+ bool destroy_bm) /* false: pmd->bm is left alone and the caller disposes of it */
716767 {
717768 dm_sm_destroy(pmd->data_sm); /* space maps and transaction managers are torn down unconditionally */
718769 dm_sm_destroy(pmd->metadata_sm);
719770 dm_tm_destroy(pmd->nb_tm);
720771 dm_tm_destroy(pmd->tm);
721
- dm_block_manager_destroy(pmd->bm);
772
+ if (destroy_bm) /* dm_pool_abort_metadata() passes false and destroys the old bm itself after unlocking */
773
+ dm_block_manager_destroy(pmd->bm);
722774 }
723775
724776 static int __begin_transaction(struct dm_pool_metadata *pmd)
....@@ -773,7 +825,7 @@
773825 return r;
774826
775827 if (td->open_count)
776
- td->changed = 0;
828
+ td->changed = false;
777829 else {
778830 list_del(&td->list);
779831 kfree(td);
....@@ -793,6 +845,18 @@
793845 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
794846 */
795847 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
848
+ BUG_ON(!rwsem_is_locked(&pmd->root_lock));
849
+
850
+ if (unlikely(!pmd->in_service))
851
+ return 0;
852
+
853
+ if (pmd->pre_commit_fn) {
854
+ r = pmd->pre_commit_fn(pmd->pre_commit_context);
855
+ if (r < 0) {
856
+ DMERR("pre-commit callback failed");
857
+ return r;
858
+ }
859
+ }
796860
797861 r = __write_changed_details(pmd);
798862 if (r < 0)
....@@ -857,8 +921,11 @@
857921 pmd->time = 0;
858922 INIT_LIST_HEAD(&pmd->thin_devices);
859923 pmd->fail_io = false;
924
+ pmd->in_service = false;
860925 pmd->bdev = bdev;
861926 pmd->data_block_size = data_block_size;
927
+ pmd->pre_commit_fn = NULL;
928
+ pmd->pre_commit_context = NULL;
862929
863930 r = __create_persistent_data_objects(pmd, format_device);
864931 if (r) {
....@@ -901,15 +968,16 @@
901968 return -EBUSY;
902969 }
903970
971
+ pmd_write_lock_in_core(pmd);
904972 if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
905973 r = __commit_transaction(pmd);
906974 if (r < 0)
907975 DMWARN("%s: __commit_transaction() failed, error = %d",
908976 __func__, r);
909977 }
910
-
978
+ pmd_write_unlock(pmd);
911979 if (!pmd->fail_io)
912
- __destroy_persistent_data_objects(pmd);
980
+ __destroy_persistent_data_objects(pmd, true);
913981
914982 kfree(pmd);
915983 return 0;
....@@ -994,12 +1062,11 @@
9941062 int r;
9951063 dm_block_t dev_root;
9961064 uint64_t key = dev;
997
- struct disk_device_details details_le;
9981065 struct dm_thin_device *td;
9991066 __le64 value;
10001067
10011068 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1002
- &key, &details_le);
1069
+ &key, NULL);
10031070 if (!r)
10041071 return -EEXIST;
10051072
....@@ -1036,10 +1103,10 @@
10361103 {
10371104 int r = -EINVAL;
10381105
1039
- down_write(&pmd->root_lock);
1106
+ pmd_write_lock(pmd);
10401107 if (!pmd->fail_io)
10411108 r = __create_thin(pmd, dev);
1042
- up_write(&pmd->root_lock);
1109
+ pmd_write_unlock(pmd);
10431110
10441111 return r;
10451112 }
....@@ -1055,7 +1122,7 @@
10551122 if (r)
10561123 return r;
10571124
1058
- td->changed = 1;
1125
+ td->changed = true;
10591126 td->snapshotted_time = time;
10601127
10611128 snap->mapped_blocks = td->mapped_blocks;
....@@ -1072,12 +1139,11 @@
10721139 dm_block_t origin_root;
10731140 uint64_t key = origin, dev_key = dev;
10741141 struct dm_thin_device *td;
1075
- struct disk_device_details details_le;
10761142 __le64 value;
10771143
10781144 /* check this device is unused */
10791145 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1080
- &dev_key, &details_le);
1146
+ &dev_key, NULL);
10811147 if (!r)
10821148 return -EEXIST;
10831149
....@@ -1127,10 +1193,10 @@
11271193 {
11281194 int r = -EINVAL;
11291195
1130
- down_write(&pmd->root_lock);
1196
+ pmd_write_lock(pmd);
11311197 if (!pmd->fail_io)
11321198 r = __create_snap(pmd, dev, origin);
1133
- up_write(&pmd->root_lock);
1199
+ pmd_write_unlock(pmd);
11341200
11351201 return r;
11361202 }
....@@ -1170,10 +1236,10 @@
11701236 {
11711237 int r = -EINVAL;
11721238
1173
- down_write(&pmd->root_lock);
1239
+ pmd_write_lock(pmd);
11741240 if (!pmd->fail_io)
11751241 r = __delete_device(pmd, dev);
1176
- up_write(&pmd->root_lock);
1242
+ pmd_write_unlock(pmd);
11771243
11781244 return r;
11791245 }
....@@ -1184,7 +1250,7 @@
11841250 {
11851251 int r = -EINVAL;
11861252
1187
- down_write(&pmd->root_lock);
1253
+ pmd_write_lock(pmd);
11881254
11891255 if (pmd->fail_io)
11901256 goto out;
....@@ -1198,7 +1264,7 @@
11981264 r = 0;
11991265
12001266 out:
1201
- up_write(&pmd->root_lock);
1267
+ pmd_write_unlock(pmd);
12021268
12031269 return r;
12041270 }
....@@ -1229,7 +1295,12 @@
12291295 * We commit to ensure the btree roots which we increment in a
12301296 * moment are up to date.
12311297 */
1232
- __commit_transaction(pmd);
1298
+ r = __commit_transaction(pmd);
1299
+ if (r < 0) {
1300
+ DMWARN("%s: __commit_transaction() failed, error = %d",
1301
+ __func__, r);
1302
+ return r;
1303
+ }
12331304
12341305 /*
12351306 * Copy the superblock.
....@@ -1287,10 +1358,10 @@
12871358 {
12881359 int r = -EINVAL;
12891360
1290
- down_write(&pmd->root_lock);
1361
+ pmd_write_lock(pmd);
12911362 if (!pmd->fail_io)
12921363 r = __reserve_metadata_snap(pmd);
1293
- up_write(&pmd->root_lock);
1364
+ pmd_write_unlock(pmd);
12941365
12951366 return r;
12961367 }
....@@ -1335,10 +1406,10 @@
13351406 {
13361407 int r = -EINVAL;
13371408
1338
- down_write(&pmd->root_lock);
1409
+ pmd_write_lock(pmd);
13391410 if (!pmd->fail_io)
13401411 r = __release_metadata_snap(pmd);
1341
- up_write(&pmd->root_lock);
1412
+ pmd_write_unlock(pmd);
13421413
13431414 return r;
13441415 }
....@@ -1381,19 +1452,19 @@
13811452 {
13821453 int r = -EINVAL;
13831454
1384
- down_write(&pmd->root_lock);
1455
+ pmd_write_lock_in_core(pmd);
13851456 if (!pmd->fail_io)
13861457 r = __open_device(pmd, dev, 0, td);
1387
- up_write(&pmd->root_lock);
1458
+ pmd_write_unlock(pmd);
13881459
13891460 return r;
13901461 }
13911462
13921463 int dm_pool_close_thin_device(struct dm_thin_device *td)
13931464 {
1394
- down_write(&td->pmd->root_lock);
1465
+ pmd_write_lock_in_core(td->pmd); /* in-core variant: closing a device does not set pmd->in_service */
13951466 __close_device(td);
1396
- up_write(&td->pmd->root_lock);
1467
+ pmd_write_unlock(td->pmd);
13971468
13981469 return 0; /* always reports success */
13991470 }
....@@ -1562,7 +1633,7 @@
15621633 if (r)
15631634 return r;
15641635
1565
- td->changed = 1;
1636
+ td->changed = true;
15661637 if (inserted)
15671638 td->mapped_blocks++;
15681639
....@@ -1574,10 +1645,10 @@
15741645 {
15751646 int r = -EINVAL;
15761647
1577
- down_write(&td->pmd->root_lock);
1648
+ pmd_write_lock(td->pmd);
15781649 if (!td->pmd->fail_io)
15791650 r = __insert(td, block, data_block);
1580
- up_write(&td->pmd->root_lock);
1651
+ pmd_write_unlock(td->pmd);
15811652
15821653 return r;
15831654 }
....@@ -1593,7 +1664,7 @@
15931664 return r;
15941665
15951666 td->mapped_blocks--;
1596
- td->changed = 1;
1667
+ td->changed = true;
15971668
15981669 return 0;
15991670 }
....@@ -1647,7 +1718,7 @@
16471718 }
16481719
16491720 td->mapped_blocks -= total_count;
1650
- td->changed = 1;
1721
+ td->changed = true;
16511722
16521723 /*
16531724 * Reinsert the mapping tree.
....@@ -1661,10 +1732,10 @@
16611732 {
16621733 int r = -EINVAL;
16631734
1664
- down_write(&td->pmd->root_lock);
1735
+ pmd_write_lock(td->pmd);
16651736 if (!td->pmd->fail_io)
16661737 r = __remove(td, block);
1667
- up_write(&td->pmd->root_lock);
1738
+ pmd_write_unlock(td->pmd);
16681739
16691740 return r;
16701741 }
....@@ -1674,10 +1745,10 @@
16741745 {
16751746 int r = -EINVAL;
16761747
1677
- down_write(&td->pmd->root_lock);
1748
+ pmd_write_lock(td->pmd);
16781749 if (!td->pmd->fail_io)
16791750 r = __remove_range(td, begin, end);
1680
- up_write(&td->pmd->root_lock);
1751
+ pmd_write_unlock(td->pmd);
16811752
16821753 return r;
16831754 }
....@@ -1700,13 +1771,13 @@
17001771 {
17011772 int r = 0;
17021773
1703
- down_write(&pmd->root_lock);
1774
+ pmd_write_lock(pmd);
17041775 for (; b != e; b++) {
17051776 r = dm_sm_inc_block(pmd->data_sm, b);
17061777 if (r)
17071778 break;
17081779 }
1709
- up_write(&pmd->root_lock);
1780
+ pmd_write_unlock(pmd);
17101781
17111782 return r;
17121783 }
....@@ -1715,13 +1786,13 @@
17151786 {
17161787 int r = 0;
17171788
1718
- down_write(&pmd->root_lock);
1789
+ pmd_write_lock(pmd);
17191790 for (; b != e; b++) {
17201791 r = dm_sm_dec_block(pmd->data_sm, b);
17211792 if (r)
17221793 break;
17231794 }
1724
- up_write(&pmd->root_lock);
1795
+ pmd_write_unlock(pmd);
17251796
17261797 return r;
17271798 }
....@@ -1769,10 +1840,10 @@
17691840 {
17701841 int r = -EINVAL;
17711842
1772
- down_write(&pmd->root_lock);
1843
+ pmd_write_lock(pmd);
17731844 if (!pmd->fail_io)
17741845 r = dm_sm_new_block(pmd->data_sm, result);
1775
- up_write(&pmd->root_lock);
1846
+ pmd_write_unlock(pmd);
17761847
17771848 return r;
17781849 }
....@@ -1781,12 +1852,16 @@
17811852 {
17821853 int r = -EINVAL;
17831854
1784
- down_write(&pmd->root_lock);
1855
+ /*
1856
+ * Care is taken to not have commit be what
1857
+ * triggers putting the thin-pool in-service.
1858
+ */
1859
+ pmd_write_lock_in_core(pmd);
17851860 if (pmd->fail_io)
17861861 goto out;
17871862
17881863 r = __commit_transaction(pmd);
1789
- if (r <= 0)
1864
+ if (r < 0)
17901865 goto out;
17911866
17921867 /*
....@@ -1794,7 +1869,7 @@
17941869 */
17951870 r = __begin_transaction(pmd);
17961871 out:
1797
- up_write(&pmd->root_lock);
1872
+ pmd_write_unlock(pmd);
17981873 return r;
17991874 }
18001875
....@@ -1809,19 +1884,52 @@
18091884 int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
18101885 {
18111886 int r = -EINVAL;
1887
+ struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
18121888
1813
- down_write(&pmd->root_lock);
1814
- if (pmd->fail_io)
1889
+ /* fail_io is double-checked with pmd->root_lock held below */
1890
+ if (unlikely(pmd->fail_io))
1891
+ return r;
1892
+
1893
+ /*
1894
+ * Replacement block manager (new_bm) is created and old_bm destroyed outside of
1895
+ * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
1896
+ * shrinker associated with the block manager's bufio client vs pmd root_lock).
1897
+ * - must take shrinker_rwsem without holding pmd->root_lock
1898
+ */
1899
+ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
1900
+ THIN_MAX_CONCURRENT_LOCKS); /* result may be an ERR_PTR; it is checked with IS_ERR() under the lock below */
1901
+
1902
+ pmd_write_lock(pmd);
1903
+ if (pmd->fail_io) {
1904
+ pmd_write_unlock(pmd);
18151905 goto out; /* out: destroys the unused new_bm if it was created successfully */
1906
+ }
18161907
18171908 __set_abort_with_changes_flags(pmd);
1818
- __destroy_persistent_data_objects(pmd);
1819
- r = __create_persistent_data_objects(pmd, false);
1909
+ __destroy_persistent_data_objects(pmd, false); /* false: keep pmd->bm alive for now */
1910
+ old_bm = pmd->bm; /* remembered so it can be destroyed after root_lock is dropped */
1911
+ if (IS_ERR(new_bm)) {
1912
+ DMERR("could not create block manager during abort");
1913
+ pmd->bm = NULL; /* old_bm still refers to the previous manager */
1914
+ r = PTR_ERR(new_bm);
1915
+ goto out_unlock;
1916
+ }
1917
+
1918
+ pmd->bm = new_bm;
1919
+ r = __open_or_format_metadata(pmd, false);
1920
+ if (r) {
1921
+ pmd->bm = NULL; /* new_bm stays non-NULL here, so the out: path destroys it */
1922
+ goto out_unlock;
1923
+ }
1924
+ new_bm = NULL; /* success: pmd->bm now refers to it, so skip the destroy at out: */
1925
+out_unlock:
18201926 if (r)
18211927 pmd->fail_io = true; /* any failure after the old objects were destroyed flags fail_io */
1822
-
1928
+ pmd_write_unlock(pmd);
1929
+ dm_block_manager_destroy(old_bm); /* runs only after root_lock has been released */
18231930 out:
1824
- up_write(&pmd->root_lock);
1931
+ if (new_bm && !IS_ERR(new_bm))
1932
+ dm_block_manager_destroy(new_bm);
18251933
18261934 return r;
18271935 }
....@@ -1952,10 +2060,10 @@
19522060 {
19532061 int r = -EINVAL;
19542062
1955
- down_write(&pmd->root_lock);
2063
+ pmd_write_lock(pmd);
19562064 if (!pmd->fail_io)
19572065 r = __resize_space_map(pmd->data_sm, new_count);
1958
- up_write(&pmd->root_lock);
2066
+ pmd_write_unlock(pmd);
19592067
19602068 return r;
19612069 }
....@@ -1964,29 +2072,29 @@
19642072 {
19652073 int r = -EINVAL;
19662074
1967
- down_write(&pmd->root_lock);
2075
+ pmd_write_lock(pmd);
19682076 if (!pmd->fail_io) {
19692077 r = __resize_space_map(pmd->metadata_sm, new_count);
19702078 if (!r)
19712079 __set_metadata_reserve(pmd);
19722080 }
1973
- up_write(&pmd->root_lock);
2081
+ pmd_write_unlock(pmd);
19742082
19752083 return r;
19762084 }
19772085
19782086 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
19792087 {
1980
- down_write(&pmd->root_lock);
2088
+ pmd_write_lock_in_core(pmd); /* in-core variant: does not set pmd->in_service */
19812089 dm_bm_set_read_only(pmd->bm); /* performed with root_lock held for writing */
1982
- up_write(&pmd->root_lock);
2090
+ pmd_write_unlock(pmd);
19832091 }
19842092
19852093 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
19862094 {
1987
- down_write(&pmd->root_lock);
2095
+ pmd_write_lock_in_core(pmd); /* in-core variant: does not set pmd->in_service */
19882096 dm_bm_set_read_write(pmd->bm); /* performed with root_lock held for writing */
1989
- up_write(&pmd->root_lock);
2097
+ pmd_write_unlock(pmd);
19902098 }
19912099
19922100 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
....@@ -1994,13 +2102,26 @@
19942102 dm_sm_threshold_fn fn,
19952103 void *context)
19962104 {
1997
- int r;
2105
+ int r = -EINVAL;
19982106
1999
- down_write(&pmd->root_lock);
2000
- r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
2001
- up_write(&pmd->root_lock);
2107
+ pmd_write_lock_in_core(pmd);
2108
+ if (!pmd->fail_io) {
2109
+ r = dm_sm_register_threshold_callback(pmd->metadata_sm,
2110
+ threshold, fn, context);
2111
+ }
2112
+ pmd_write_unlock(pmd);
20022113
20032114 return r;
2115
+}
2116
+
2117
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
2118
+ dm_pool_pre_commit_fn fn,
2119
+ void *context)
2120
+{
2121
+ pmd_write_lock_in_core(pmd); /* in-core variant: registering a callback does not set pmd->in_service */
2122
+ pmd->pre_commit_fn = fn; /* when non-NULL it is called ahead of __write_changed_details(); a negative return is logged and returned */
2123
+ pmd->pre_commit_context = context; /* handed back as the callback's only argument */
2124
+ pmd_write_unlock(pmd);
20042125 }
20052126
20062127 int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
....@@ -2009,7 +2130,7 @@
20092130 struct dm_block *sblock;
20102131 struct thin_disk_superblock *disk_super;
20112132
2012
- down_write(&pmd->root_lock);
2133
+ pmd_write_lock(pmd);
20132134 if (pmd->fail_io)
20142135 goto out;
20152136
....@@ -2026,7 +2147,7 @@
20262147
20272148 dm_bm_unlock(sblock);
20282149 out:
2029
- up_write(&pmd->root_lock);
2150
+ pmd_write_unlock(pmd);
20302151 return r;
20312152 }
20322153