.. | .. |
---|
10 | 10 | #include <linux/types.h> |
---|
11 | 11 | #include <linux/mutex.h> |
---|
12 | 12 | #include <linux/spinlock.h> |
---|
| 13 | +#include <linux/rcupdate.h> |
---|
13 | 14 | #include <linux/completion.h> |
---|
14 | 15 | #include <linux/wait.h> |
---|
| 16 | +#include <linux/zstd.h> |
---|
15 | 17 | #include <crypto/hash.h> |
---|
| 18 | +#include <linux/rwsem.h> |
---|
16 | 19 | |
---|
17 | 20 | #include <uapi/linux/incrementalfs.h> |
---|
18 | 21 | |
---|
19 | 22 | #include "internal.h" |
---|
| 23 | +#include "pseudo_files.h" |
---|
20 | 24 | |
---|
21 | 25 | #define SEGMENTS_PER_FILE 3 |
---|
22 | 26 | |
---|
23 | 27 | enum LOG_RECORD_TYPE { |
---|
24 | 28 | FULL, |
---|
25 | 29 | SAME_FILE, |
---|
| 30 | + SAME_FILE_CLOSE_BLOCK, |
---|
| 31 | + SAME_FILE_CLOSE_BLOCK_SHORT, |
---|
26 | 32 | SAME_FILE_NEXT_BLOCK, |
---|
27 | 33 | SAME_FILE_NEXT_BLOCK_SHORT, |
---|
28 | 34 | }; |
---|
29 | 35 | |
---|
30 | 36 | struct full_record { |
---|
31 | | - enum LOG_RECORD_TYPE type : 2; /* FULL */ |
---|
32 | | - u32 block_index : 30; |
---|
| 37 | + enum LOG_RECORD_TYPE type : 3; /* FULL */ |
---|
| 38 | + u32 block_index : 29; |
---|
33 | 39 | incfs_uuid_t file_id; |
---|
34 | 40 | u64 absolute_ts_us; |
---|
35 | | -} __packed; /* 28 bytes */ |
---|
| 41 | + uid_t uid; |
---|
| 42 | +} __packed; /* 32 bytes */ |
---|
36 | 43 | |
---|
37 | | -struct same_file_record { |
---|
38 | | - enum LOG_RECORD_TYPE type : 2; /* SAME_FILE */ |
---|
39 | | - u32 block_index : 30; |
---|
40 | | - u32 relative_ts_us; /* max 2^32 us ~= 1 hour (1:11:30) */ |
---|
41 | | -} __packed; /* 12 bytes */ |
---|
| 44 | +struct same_file { |
---|
| 45 | + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE */ |
---|
| 46 | + u32 block_index : 29; |
---|
| 47 | + uid_t uid; |
---|
| 48 | + u16 relative_ts_us; /* max 2^16 us ~= 64 ms */ |
---|
| 49 | +} __packed; /* 10 bytes */ |
---|
42 | 50 | |
---|
43 | | -struct same_file_next_block { |
---|
44 | | - enum LOG_RECORD_TYPE type : 2; /* SAME_FILE_NEXT_BLOCK */ |
---|
45 | | - u32 relative_ts_us : 30; /* max 2^30 us ~= 15 min (17:50) */ |
---|
| 51 | +struct same_file_close_block { |
---|
| 52 | + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK */ |
---|
| 53 | + u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */ |
---|
| 54 | + s16 block_index_delta; |
---|
46 | 55 | } __packed; /* 4 bytes */ |
---|
47 | 56 | |
---|
48 | | -struct same_file_next_block_short { |
---|
49 | | - enum LOG_RECORD_TYPE type : 2; /* SAME_FILE_NEXT_BLOCK_SHORT */ |
---|
50 | | - u16 relative_ts_us : 14; /* max 2^14 us ~= 16 ms */ |
---|
| 57 | +struct same_file_close_block_short { |
---|
| 58 | + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK_SHORT */ |
---|
| 59 | + u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */ |
---|
| 60 | + s8 block_index_delta; |
---|
51 | 61 | } __packed; /* 2 bytes */ |
---|
| 62 | + |
---|
| 63 | +struct same_file_next_block { |
---|
| 64 | + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK */ |
---|
| 65 | + u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */ |
---|
| 66 | +} __packed; /* 2 bytes */ |
---|
| 67 | + |
---|
| 68 | +struct same_file_next_block_short { |
---|
| 69 | + enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK_SHORT */ |
---|
| 70 | + u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */ |
---|
| 71 | +} __packed; /* 1 byte */ |
---|
52 | 72 | |
---|
53 | 73 | union log_record { |
---|
54 | 74 | struct full_record full_record; |
---|
55 | | - struct same_file_record same_file_record; |
---|
| 75 | + struct same_file same_file; |
---|
| 76 | + struct same_file_close_block same_file_close_block; |
---|
| 77 | + struct same_file_close_block_short same_file_close_block_short; |
---|
56 | 78 | struct same_file_next_block same_file_next_block; |
---|
57 | 79 | struct same_file_next_block_short same_file_next_block_short; |
---|
58 | 80 | }; |
---|
.. | .. |
---|
99 | 121 | unsigned int readahead_pages; |
---|
100 | 122 | unsigned int read_log_pages; |
---|
101 | 123 | unsigned int read_log_wakeup_count; |
---|
102 | | - bool no_backing_file_cache; |
---|
103 | | - bool no_backing_file_readahead; |
---|
| 124 | + bool report_uid; |
---|
| 125 | + char *sysfs_name; |
---|
104 | 126 | }; |
---|
105 | 127 | |
---|
106 | 128 | struct mount_info { |
---|
.. | .. |
---|
109 | 131 | struct path mi_backing_dir_path; |
---|
110 | 132 | |
---|
111 | 133 | struct dentry *mi_index_dir; |
---|
| 134 | + /* For stacking mounts: if true, this SB must free the index dir; |
---|
| 135 | + * otherwise the index dir was created by a lower-level SB */ |
---|
| 136 | + bool mi_index_free; |
---|
| 137 | + |
---|
| 138 | + struct dentry *mi_incomplete_dir; |
---|
| 139 | + /* For stacking mounts: if true, this SB must free the incomplete |
---|
| 140 | + * dir. Similar to mi_index_free */ |
---|
| 141 | + bool mi_incomplete_free; |
---|
112 | 142 | |
---|
113 | 143 | const struct cred *mi_owner; |
---|
114 | 144 | |
---|
.. | .. |
---|
123 | 153 | wait_queue_head_t mi_pending_reads_notif_wq; |
---|
124 | 154 | |
---|
125 | 155 | /* |
---|
126 | | - * Protects: |
---|
| 156 | + * Protects - RCU safe: |
---|
127 | 157 | * - reads_list_head |
---|
128 | 158 | * - mi_pending_reads_count |
---|
129 | 159 | * - mi_last_pending_read_number |
---|
130 | 160 | * - data_file_segment.reads_list_head |
---|
131 | 161 | */ |
---|
132 | | - struct mutex mi_pending_reads_mutex; |
---|
| 162 | + spinlock_t pending_read_lock; |
---|
133 | 163 | |
---|
134 | 164 | /* List of active pending_read objects */ |
---|
135 | 165 | struct list_head mi_reads_list_head; |
---|
.. | .. |
---|
146 | 176 | /* Temporary buffer for read logger. */ |
---|
147 | 177 | struct read_log mi_log; |
---|
148 | 178 | |
---|
149 | | - void *log_xattr; |
---|
150 | | - size_t log_xattr_size; |
---|
| 179 | + /* SELinux needs special xattrs on our pseudo files */ |
---|
| 180 | + struct mem_range pseudo_file_xattr[PSEUDO_FILE_COUNT]; |
---|
151 | 181 | |
---|
152 | | - void *pending_read_xattr; |
---|
153 | | - size_t pending_read_xattr_size; |
---|
| 182 | + /* A queue of waiters who want to be notified about blocks_written */ |
---|
| 183 | + wait_queue_head_t mi_blocks_written_notif_wq; |
---|
| 184 | + |
---|
| 185 | + /* Number of blocks written since mount */ |
---|
| 186 | + atomic_t mi_blocks_written; |
---|
| 187 | + |
---|
| 188 | + /* Per UID read timeouts */ |
---|
| 189 | + spinlock_t mi_per_uid_read_timeouts_lock; |
---|
| 190 | + struct incfs_per_uid_read_timeouts *mi_per_uid_read_timeouts; |
---|
| 191 | + int mi_per_uid_read_timeouts_size; |
---|
| 192 | + |
---|
| 193 | + /* zstd workspace */ |
---|
| 194 | + struct mutex mi_zstd_workspace_mutex; |
---|
| 195 | + void *mi_zstd_workspace; |
---|
| 196 | + ZSTD_DStream *mi_zstd_stream; |
---|
| 197 | + struct delayed_work mi_zstd_cleanup_work; |
---|
| 198 | + |
---|
| 199 | + /* sysfs node */ |
---|
| 200 | + struct incfs_sysfs_node *mi_sysfs_node; |
---|
| 201 | + |
---|
| 202 | + /* Last error information */ |
---|
| 203 | + struct mutex mi_le_mutex; |
---|
| 204 | + incfs_uuid_t mi_le_file_id; |
---|
| 205 | + u64 mi_le_time_us; |
---|
| 206 | + u32 mi_le_page; |
---|
| 207 | + u32 mi_le_errno; |
---|
| 208 | + uid_t mi_le_uid; |
---|
| 209 | + |
---|
| 210 | + /* Number of reads timed out */ |
---|
| 211 | + u32 mi_reads_failed_timed_out; |
---|
| 212 | + |
---|
| 213 | + /* Number of reads failed because hash verification failed */ |
---|
| 214 | + u32 mi_reads_failed_hash_verification; |
---|
| 215 | + |
---|
| 216 | + /* Number of reads failed for another reason */ |
---|
| 217 | + u32 mi_reads_failed_other; |
---|
| 218 | + |
---|
| 219 | + /* Number of reads delayed because page had to be fetched */ |
---|
| 220 | + u32 mi_reads_delayed_pending; |
---|
| 221 | + |
---|
| 222 | + /* Total time waiting for pages to be fetched */ |
---|
| 223 | + u64 mi_reads_delayed_pending_us; |
---|
| 224 | + |
---|
| 225 | + /* |
---|
| 226 | + * Number of reads delayed because of per-uid min_time_us or |
---|
| 227 | + * min_pending_time_us settings |
---|
| 228 | + */ |
---|
| 229 | + u32 mi_reads_delayed_min; |
---|
| 230 | + |
---|
| 231 | + /* Total time waiting because of per-uid min_time_us or |
---|
| 232 | + * min_pending_time_us settings. |
---|
| 233 | + * |
---|
| 234 | + * Note that if a read is initially delayed because we have to wait for |
---|
| 235 | + * the page, then further delayed because of min_pending_time_us |
---|
| 236 | + * setting, this counter gets incremented by only the further delay |
---|
| 237 | + * time. |
---|
| 238 | + */ |
---|
| 239 | + u64 mi_reads_delayed_min_us; |
---|
154 | 240 | }; |
---|
155 | 241 | |
---|
156 | 242 | struct data_file_block { |
---|
.. | .. |
---|
172 | 258 | |
---|
173 | 259 | int serial_number; |
---|
174 | 260 | |
---|
| 261 | + uid_t uid; |
---|
| 262 | + |
---|
175 | 263 | struct list_head mi_reads_list; |
---|
176 | 264 | |
---|
177 | 265 | struct list_head segment_reads_list; |
---|
| 266 | + |
---|
| 267 | + struct rcu_head rcu; |
---|
178 | 268 | }; |
---|
179 | 269 | |
---|
180 | 270 | struct data_file_segment { |
---|
181 | 271 | wait_queue_head_t new_data_arrival_wq; |
---|
182 | 272 | |
---|
183 | 273 | /* Protects reads and writes from the blockmap */ |
---|
184 | | - /* Good candidate for read/write mutex */ |
---|
185 | | - struct mutex blockmap_mutex; |
---|
| 274 | + struct rw_semaphore rwsem; |
---|
186 | 275 | |
---|
187 | 276 | /* List of active pending_read objects belonging to this segment */ |
---|
188 | 277 | /* Protected by mount_info.pending_read_lock */ |
---|
.. | .. |
---|
232 | 321 | /* Total number of blocks, data + hash */ |
---|
233 | 322 | int df_total_block_count; |
---|
234 | 323 | |
---|
235 | | - struct file_attr n_attr; |
---|
| 324 | + /* For mapped files, the offset into the actual file */ |
---|
| 325 | + loff_t df_mapped_offset; |
---|
236 | 326 | |
---|
| 327 | + /* Number of data blocks written to file */ |
---|
| 328 | + atomic_t df_data_blocks_written; |
---|
| 329 | + |
---|
| 330 | + /* Number of data blocks in the status block */ |
---|
| 331 | + u32 df_initial_data_blocks_written; |
---|
| 332 | + |
---|
| 333 | + /* Number of hash blocks written to file */ |
---|
| 334 | + atomic_t df_hash_blocks_written; |
---|
| 335 | + |
---|
| 336 | + /* Number of hash blocks in the status block */ |
---|
| 337 | + u32 df_initial_hash_blocks_written; |
---|
| 338 | + |
---|
| 339 | + /* Offset to status metadata header */ |
---|
| 340 | + loff_t df_status_offset; |
---|
| 341 | + |
---|
| 342 | + /* |
---|
| 343 | + * Mutex acquired while enabling verity. Note that df_hash_tree is set |
---|
| 344 | + * by enable verity. |
---|
| 345 | + * |
---|
| 346 | + * The backing file mutex bc_mutex may be taken while this mutex is |
---|
| 347 | + * held. |
---|
| 348 | + */ |
---|
| 349 | + struct mutex df_enable_verity; |
---|
| 350 | + |
---|
| 351 | + /* |
---|
| 352 | + * Set either at construction time or during enabling verity. In the |
---|
| 353 | + * latter case, set via smp_store_release, so use smp_load_acquire to |
---|
| 354 | + * read it. |
---|
| 355 | + */ |
---|
237 | 356 | struct mtree *df_hash_tree; |
---|
238 | 357 | |
---|
| 358 | + /* Guaranteed set if df_hash_tree is set. */ |
---|
239 | 359 | struct incfs_df_signature *df_signature; |
---|
| 360 | + |
---|
| 361 | + /* |
---|
| 362 | + * The verity file digest, set when verity is enabled and the file has |
---|
| 363 | + * been opened |
---|
| 364 | + */ |
---|
| 365 | + struct mem_range df_verity_file_digest; |
---|
| 366 | + |
---|
| 367 | + struct incfs_df_verity_signature *df_verity_signature; |
---|
240 | 368 | }; |
---|
241 | 369 | |
---|
242 | 370 | struct dir_file { |
---|
.. | .. |
---|
259 | 387 | struct path backing_path; |
---|
260 | 388 | }; |
---|
261 | 389 | |
---|
| 390 | +enum FILL_PERMISSION { |
---|
| 391 | + CANT_FILL = 0, |
---|
| 392 | + CAN_FILL = 1, |
---|
| 393 | +}; |
---|
| 394 | + |
---|
| 395 | +struct incfs_file_data { |
---|
| 396 | + /* Does this file handle have INCFS_IOC_FILL_BLOCKS permission */ |
---|
| 397 | + enum FILL_PERMISSION fd_fill_permission; |
---|
| 398 | + |
---|
| 399 | + /* If INCFS_IOC_GET_FILLED_BLOCKS has been called, where are we */ |
---|
| 400 | + int fd_get_block_pos; |
---|
| 401 | + |
---|
| 402 | + /* And how many filled blocks are there up to that point */ |
---|
| 403 | + int fd_filled_data_blocks; |
---|
| 404 | + int fd_filled_hash_blocks; |
---|
| 405 | +}; |
---|
| 406 | + |
---|
262 | 407 | struct mount_info *incfs_alloc_mount_info(struct super_block *sb, |
---|
263 | 408 | struct mount_options *options, |
---|
264 | 409 | struct path *backing_dir_path); |
---|
.. | .. |
---|
268 | 413 | |
---|
269 | 414 | void incfs_free_mount_info(struct mount_info *mi); |
---|
270 | 415 | |
---|
| 416 | +char *file_id_to_str(incfs_uuid_t id); |
---|
| 417 | +struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name); |
---|
271 | 418 | struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf); |
---|
272 | 419 | void incfs_free_data_file(struct data_file *df); |
---|
273 | | - |
---|
274 | | -int incfs_scan_metadata_chain(struct data_file *df); |
---|
275 | 420 | |
---|
276 | 421 | struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf); |
---|
277 | 422 | void incfs_free_dir_file(struct dir_file *dir); |
---|
278 | 423 | |
---|
| 424 | +struct incfs_read_data_file_timeouts { |
---|
| 425 | + u32 min_time_us; |
---|
| 426 | + u32 min_pending_time_us; |
---|
| 427 | + u32 max_pending_time_us; |
---|
| 428 | +}; |
---|
| 429 | + |
---|
279 | 430 | ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f, |
---|
280 | | - int index, int timeout_ms, |
---|
281 | | - struct mem_range tmp); |
---|
| 431 | + int index, struct mem_range tmp, |
---|
| 432 | + struct incfs_read_data_file_timeouts *timeouts); |
---|
| 433 | + |
---|
| 434 | +ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst, |
---|
| 435 | + struct data_file *df, size_t offset); |
---|
282 | 436 | |
---|
283 | 437 | int incfs_get_filled_blocks(struct data_file *df, |
---|
| 438 | + struct incfs_file_data *fd, |
---|
284 | 439 | struct incfs_get_filled_blocks_args *arg); |
---|
285 | 440 | |
---|
286 | 441 | int incfs_read_file_signature(struct data_file *df, struct mem_range dst); |
---|
.. | .. |
---|
300 | 455 | */ |
---|
301 | 456 | int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound, |
---|
302 | 457 | struct incfs_pending_read_info *reads, |
---|
303 | | - int reads_size); |
---|
| 458 | + struct incfs_pending_read_info2 *reads2, |
---|
| 459 | + int reads_size, int *new_max_sn); |
---|
304 | 460 | |
---|
305 | 461 | int incfs_collect_logged_reads(struct mount_info *mi, |
---|
306 | 462 | struct read_log_state *start_state, |
---|
307 | 463 | struct incfs_pending_read_info *reads, |
---|
| 464 | + struct incfs_pending_read_info2 *reads2, |
---|
308 | 465 | int reads_size); |
---|
309 | 466 | struct read_log_state incfs_get_log_state(struct mount_info *mi); |
---|
310 | 467 | int incfs_get_uncollected_logs_count(struct mount_info *mi, |
---|
.. | .. |
---|
315 | 472 | if (!inode) |
---|
316 | 473 | return NULL; |
---|
317 | 474 | |
---|
318 | | - if (inode->i_sb->s_magic != (long) INCFS_MAGIC_NUMBER) { |
---|
| 475 | + if (inode->i_sb->s_magic != INCFS_MAGIC_NUMBER) { |
---|
319 | 476 | /* This inode doesn't belong to us. */ |
---|
320 | 477 | pr_warn_once("incfs: %s on an alien inode.", __func__); |
---|
321 | 478 | return NULL; |
---|
.. | .. |
---|
388 | 545 | return 0; |
---|
389 | 546 | return 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; |
---|
390 | 547 | } |
---|
391 | | - |
---|
392 | | -bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs); |
---|
393 | 548 | |
---|
394 | 549 | #endif /* _INCFS_DATA_MGMT_H */ |
---|