hc
2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/fs/incfs/data_mgmt.h
....@@ -10,49 +10,71 @@
1010 #include <linux/types.h>
1111 #include <linux/mutex.h>
1212 #include <linux/spinlock.h>
13
+#include <linux/rcupdate.h>
1314 #include <linux/completion.h>
1415 #include <linux/wait.h>
16
+#include <linux/zstd.h>
1517 #include <crypto/hash.h>
18
+#include <linux/rwsem.h>
1619
1720 #include <uapi/linux/incrementalfs.h>
1821
1922 #include "internal.h"
23
+#include "pseudo_files.h"
2024
2125 #define SEGMENTS_PER_FILE 3
2226
2327 enum LOG_RECORD_TYPE {
2428 FULL,
2529 SAME_FILE,
30
+ SAME_FILE_CLOSE_BLOCK,
31
+ SAME_FILE_CLOSE_BLOCK_SHORT,
2632 SAME_FILE_NEXT_BLOCK,
2733 SAME_FILE_NEXT_BLOCK_SHORT,
2834 };
2935
3036 struct full_record {
31
- enum LOG_RECORD_TYPE type : 2; /* FULL */
32
- u32 block_index : 30;
37
+ enum LOG_RECORD_TYPE type : 3; /* FULL */
38
+ u32 block_index : 29;
3339 incfs_uuid_t file_id;
3440 u64 absolute_ts_us;
35
-} __packed; /* 28 bytes */
41
+ uid_t uid;
42
+} __packed; /* 32 bytes */
3643
37
-struct same_file_record {
38
- enum LOG_RECORD_TYPE type : 2; /* SAME_FILE */
39
- u32 block_index : 30;
40
- u32 relative_ts_us; /* max 2^32 us ~= 1 hour (1:11:30) */
41
-} __packed; /* 12 bytes */
44
+struct same_file {
45
+ enum LOG_RECORD_TYPE type : 3; /* SAME_FILE */
46
+ u32 block_index : 29;
47
+ uid_t uid;
48
+ u16 relative_ts_us; /* max 2^16 us ~= 64 ms */
49
+} __packed; /* 10 bytes */
4250
43
-struct same_file_next_block {
44
- enum LOG_RECORD_TYPE type : 2; /* SAME_FILE_NEXT_BLOCK */
45
- u32 relative_ts_us : 30; /* max 2^30 us ~= 15 min (17:50) */
51
+struct same_file_close_block {
52
+ enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK */
53
+ u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
54
+ s16 block_index_delta;
4655 } __packed; /* 4 bytes */
4756
48
-struct same_file_next_block_short {
49
- enum LOG_RECORD_TYPE type : 2; /* SAME_FILE_NEXT_BLOCK_SHORT */
50
- u16 relative_ts_us : 14; /* max 2^14 us ~= 16 ms */
57
+struct same_file_close_block_short {
58
+ enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK_SHORT */
59
+ u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
60
+ s8 block_index_delta;
5161 } __packed; /* 2 bytes */
62
+
63
+struct same_file_next_block {
64
+ enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK */
65
+ u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
66
+} __packed; /* 2 bytes */
67
+
68
+struct same_file_next_block_short {
69
+ enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK_SHORT */
70
+ u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
71
+} __packed; /* 1 byte */
5272
5373 union log_record {
5474 struct full_record full_record;
55
- struct same_file_record same_file_record;
75
+ struct same_file same_file;
76
+ struct same_file_close_block same_file_close_block;
77
+ struct same_file_close_block_short same_file_close_block_short;
5678 struct same_file_next_block same_file_next_block;
5779 struct same_file_next_block_short same_file_next_block_short;
5880 };
....@@ -99,8 +121,8 @@
99121 unsigned int readahead_pages;
100122 unsigned int read_log_pages;
101123 unsigned int read_log_wakeup_count;
102
- bool no_backing_file_cache;
103
- bool no_backing_file_readahead;
124
+ bool report_uid;
125
+ char *sysfs_name;
104126 };
105127
106128 struct mount_info {
....@@ -109,6 +131,14 @@
109131 struct path mi_backing_dir_path;
110132
111133 struct dentry *mi_index_dir;
134
+ /* For stacking mounts: true if the index dir needs to be freed for
135
+ * this SB; otherwise it was created by a lower-level SB */
136
+ bool mi_index_free;
137
+
138
+ struct dentry *mi_incomplete_dir;
139
+ /* For stacking mounts: true if the incomplete dir needs to be freed
140
+ * for this SB. Similar to mi_index_free */
141
+ bool mi_incomplete_free;
112142
113143 const struct cred *mi_owner;
114144
....@@ -123,13 +153,13 @@
123153 wait_queue_head_t mi_pending_reads_notif_wq;
124154
125155 /*
126
- * Protects:
156
+ * Protects - RCU safe:
127157 * - reads_list_head
128158 * - mi_pending_reads_count
129159 * - mi_last_pending_read_number
130160 * - data_file_segment.reads_list_head
131161 */
132
- struct mutex mi_pending_reads_mutex;
162
+ spinlock_t pending_read_lock;
133163
134164 /* List of active pending_read objects */
135165 struct list_head mi_reads_list_head;
....@@ -146,11 +176,67 @@
146176 /* Temporary buffer for read logger. */
147177 struct read_log mi_log;
148178
149
- void *log_xattr;
150
- size_t log_xattr_size;
179
+ /* SELinux needs special xattrs on our pseudo files */
180
+ struct mem_range pseudo_file_xattr[PSEUDO_FILE_COUNT];
151181
152
- void *pending_read_xattr;
153
- size_t pending_read_xattr_size;
182
+ /* A queue of waiters who want to be notified about blocks_written */
183
+ wait_queue_head_t mi_blocks_written_notif_wq;
184
+
185
+ /* Number of blocks written since mount */
186
+ atomic_t mi_blocks_written;
187
+
188
+ /* Per UID read timeouts */
189
+ spinlock_t mi_per_uid_read_timeouts_lock;
190
+ struct incfs_per_uid_read_timeouts *mi_per_uid_read_timeouts;
191
+ int mi_per_uid_read_timeouts_size;
192
+
193
+ /* zstd workspace */
194
+ struct mutex mi_zstd_workspace_mutex;
195
+ void *mi_zstd_workspace;
196
+ ZSTD_DStream *mi_zstd_stream;
197
+ struct delayed_work mi_zstd_cleanup_work;
198
+
199
+ /* sysfs node */
200
+ struct incfs_sysfs_node *mi_sysfs_node;
201
+
202
+ /* Last error information */
203
+ struct mutex mi_le_mutex;
204
+ incfs_uuid_t mi_le_file_id;
205
+ u64 mi_le_time_us;
206
+ u32 mi_le_page;
207
+ u32 mi_le_errno;
208
+ uid_t mi_le_uid;
209
+
210
+ /* Number of reads timed out */
211
+ u32 mi_reads_failed_timed_out;
212
+
213
+ /* Number of reads failed because hash verification failed */
214
+ u32 mi_reads_failed_hash_verification;
215
+
216
+ /* Number of reads failed for another reason */
217
+ u32 mi_reads_failed_other;
218
+
219
+ /* Number of reads delayed because page had to be fetched */
220
+ u32 mi_reads_delayed_pending;
221
+
222
+ /* Total time waiting for pages to be fetched */
223
+ u64 mi_reads_delayed_pending_us;
224
+
225
+ /*
226
+ * Number of reads delayed because of per-uid min_time_us or
227
+ * min_pending_time_us settings
228
+ */
229
+ u32 mi_reads_delayed_min;
230
+
231
+ /* Total time waiting because of per-uid min_time_us or
232
+ * min_pending_time_us settings.
233
+ *
234
+ * Note that if a read is initially delayed because we have to wait for
235
+ * the page, then further delayed because of min_pending_time_us
236
+ * setting, this counter gets incremented by only the further delay
237
+ * time.
238
+ */
239
+ u64 mi_reads_delayed_min_us;
154240 };
155241
156242 struct data_file_block {
....@@ -172,17 +258,20 @@
172258
173259 int serial_number;
174260
261
+ uid_t uid;
262
+
175263 struct list_head mi_reads_list;
176264
177265 struct list_head segment_reads_list;
266
+
267
+ struct rcu_head rcu;
178268 };
179269
180270 struct data_file_segment {
181271 wait_queue_head_t new_data_arrival_wq;
182272
183273 /* Protects reads and writes from the blockmap */
184
- /* Good candidate for read/write mutex */
185
- struct mutex blockmap_mutex;
274
+ struct rw_semaphore rwsem;
186275
187276 /* List of active pending_read objects belonging to this segment */
188277 /* Protected by mount_info.pending_reads_mutex */
....@@ -232,11 +321,50 @@
232321 /* Total number of blocks, data + hash */
233322 int df_total_block_count;
234323
235
- struct file_attr n_attr;
324
+ /* For mapped files, the offset into the actual file */
325
+ loff_t df_mapped_offset;
236326
327
+ /* Number of data blocks written to file */
328
+ atomic_t df_data_blocks_written;
329
+
330
+ /* Number of data blocks in the status block */
331
+ u32 df_initial_data_blocks_written;
332
+
333
+ /* Number of hash blocks written to file */
334
+ atomic_t df_hash_blocks_written;
335
+
336
+ /* Number of hash blocks in the status block */
337
+ u32 df_initial_hash_blocks_written;
338
+
339
+ /* Offset to status metadata header */
340
+ loff_t df_status_offset;
341
+
342
+ /*
343
+ * Mutex acquired while enabling verity. Note that df_hash_tree is set
344
+ * by enable verity.
345
+ *
346
+ * The backing file mutex bc_mutex may be taken while this mutex is
347
+ * held.
348
+ */
349
+ struct mutex df_enable_verity;
350
+
351
+ /*
352
+ * Set either at construction time or during enabling verity. In the
353
+ * latter case, set via smp_store_release, so use smp_load_acquire to
354
+ * read it.
355
+ */
237356 struct mtree *df_hash_tree;
238357
358
+ /* Guaranteed set if df_hash_tree is set. */
239359 struct incfs_df_signature *df_signature;
360
+
361
+ /*
362
+ * The verity file digest, set when verity is enabled and the file has
363
+ * been opened
364
+ */
365
+ struct mem_range df_verity_file_digest;
366
+
367
+ struct incfs_df_verity_signature *df_verity_signature;
240368 };
241369
242370 struct dir_file {
....@@ -259,6 +387,23 @@
259387 struct path backing_path;
260388 };
261389
390
+enum FILL_PERMISSION {
391
+ CANT_FILL = 0,
392
+ CAN_FILL = 1,
393
+};
394
+
395
+struct incfs_file_data {
396
+ /* Does this file handle have INCFS_IOC_FILL_BLOCKS permission */
397
+ enum FILL_PERMISSION fd_fill_permission;
398
+
399
+ /* If INCFS_IOC_GET_FILLED_BLOCKS has been called, where are we */
400
+ int fd_get_block_pos;
401
+
402
+ /* And how many filled blocks are there up to that point */
403
+ int fd_filled_data_blocks;
404
+ int fd_filled_hash_blocks;
405
+};
406
+
262407 struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
263408 struct mount_options *options,
264409 struct path *backing_dir_path);
....@@ -268,19 +413,29 @@
268413
269414 void incfs_free_mount_info(struct mount_info *mi);
270415
416
+char *file_id_to_str(incfs_uuid_t id);
417
+struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name);
271418 struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf);
272419 void incfs_free_data_file(struct data_file *df);
273
-
274
-int incfs_scan_metadata_chain(struct data_file *df);
275420
276421 struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf);
277422 void incfs_free_dir_file(struct dir_file *dir);
278423
424
+struct incfs_read_data_file_timeouts {
425
+ u32 min_time_us;
426
+ u32 min_pending_time_us;
427
+ u32 max_pending_time_us;
428
+};
429
+
279430 ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f,
280
- int index, int timeout_ms,
281
- struct mem_range tmp);
431
+ int index, struct mem_range tmp,
432
+ struct incfs_read_data_file_timeouts *timeouts);
433
+
434
+ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst,
435
+ struct data_file *df, size_t offset);
282436
283437 int incfs_get_filled_blocks(struct data_file *df,
438
+ struct incfs_file_data *fd,
284439 struct incfs_get_filled_blocks_args *arg);
285440
286441 int incfs_read_file_signature(struct data_file *df, struct mem_range dst);
....@@ -300,11 +455,13 @@
300455 */
301456 int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound,
302457 struct incfs_pending_read_info *reads,
303
- int reads_size);
458
+ struct incfs_pending_read_info2 *reads2,
459
+ int reads_size, int *new_max_sn);
304460
305461 int incfs_collect_logged_reads(struct mount_info *mi,
306462 struct read_log_state *start_state,
307463 struct incfs_pending_read_info *reads,
464
+ struct incfs_pending_read_info2 *reads2,
308465 int reads_size);
309466 struct read_log_state incfs_get_log_state(struct mount_info *mi);
310467 int incfs_get_uncollected_logs_count(struct mount_info *mi,
....@@ -315,7 +472,7 @@
315472 if (!inode)
316473 return NULL;
317474
318
- if (inode->i_sb->s_magic != (long) INCFS_MAGIC_NUMBER) {
475
+ if (inode->i_sb->s_magic != INCFS_MAGIC_NUMBER) {
319476 /* This inode doesn't belong to us. */
320477 pr_warn_once("incfs: %s on an alien inode.", __func__);
321478 return NULL;
....@@ -388,7 +545,5 @@
388545 return 0;
389546 return 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE;
390547 }
391
-
392
-bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs);
393548
394549 #endif /* _INCFS_DATA_MGMT_H */