.. | .. |
---|
1 | 1 | // SPDX-License-Identifier: GPL-2.0 |
---|
2 | 2 | #include <linux/ceph/ceph_debug.h> |
---|
| 3 | +#include <linux/ceph/striper.h> |
---|
3 | 4 | |
---|
4 | 5 | #include <linux/module.h> |
---|
5 | 6 | #include <linux/sched.h> |
---|
.. | .. |
---|
9 | 10 | #include <linux/namei.h> |
---|
10 | 11 | #include <linux/writeback.h> |
---|
11 | 12 | #include <linux/falloc.h> |
---|
| 13 | +#include <linux/iversion.h> |
---|
| 14 | +#include <linux/ktime.h> |
---|
12 | 15 | |
---|
13 | 16 | #include "super.h" |
---|
14 | 17 | #include "mds_client.h" |
---|
15 | 18 | #include "cache.h" |
---|
| 19 | +#include "io.h" |
---|
| 20 | +#include "metric.h" |
---|
16 | 21 | |
---|
17 | 22 | static __le32 ceph_flags_sys2wire(u32 flags) |
---|
18 | 23 | { |
---|
.. | .. |
---|
177 | 182 | static struct ceph_mds_request * |
---|
178 | 183 | prepare_open_request(struct super_block *sb, int flags, int create_mode) |
---|
179 | 184 | { |
---|
180 | | - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); |
---|
181 | | - struct ceph_mds_client *mdsc = fsc->mdsc; |
---|
| 185 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); |
---|
182 | 186 | struct ceph_mds_request *req; |
---|
183 | 187 | int want_auth = USE_ANY_MDS; |
---|
184 | 188 | int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; |
---|
.. | .. |
---|
199 | 203 | static int ceph_init_file_info(struct inode *inode, struct file *file, |
---|
200 | 204 | int fmode, bool isdir) |
---|
201 | 205 | { |
---|
| 206 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
202 | 207 | struct ceph_file_info *fi; |
---|
203 | 208 | |
---|
204 | 209 | dout("%s %p %p 0%o (%s)\n", __func__, inode, file, |
---|
.. | .. |
---|
208 | 213 | if (isdir) { |
---|
209 | 214 | struct ceph_dir_file_info *dfi = |
---|
210 | 215 | kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); |
---|
211 | | - if (!dfi) { |
---|
212 | | - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ |
---|
| 216 | + if (!dfi) |
---|
213 | 217 | return -ENOMEM; |
---|
214 | | - } |
---|
215 | 218 | |
---|
216 | 219 | file->private_data = dfi; |
---|
217 | 220 | fi = &dfi->file_info; |
---|
.. | .. |
---|
219 | 222 | dfi->readdir_cache_idx = -1; |
---|
220 | 223 | } else { |
---|
221 | 224 | fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); |
---|
222 | | - if (!fi) { |
---|
223 | | - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ |
---|
| 225 | + if (!fi) |
---|
224 | 226 | return -ENOMEM; |
---|
225 | | - } |
---|
226 | 227 | |
---|
227 | 228 | file->private_data = fi; |
---|
228 | 229 | } |
---|
229 | 230 | |
---|
| 231 | + ceph_get_fmode(ci, fmode, 1); |
---|
230 | 232 | fi->fmode = fmode; |
---|
| 233 | + |
---|
231 | 234 | spin_lock_init(&fi->rw_contexts_lock); |
---|
232 | 235 | INIT_LIST_HEAD(&fi->rw_contexts); |
---|
| 236 | + fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); |
---|
233 | 237 | |
---|
234 | 238 | return 0; |
---|
235 | 239 | } |
---|
.. | .. |
---|
246 | 250 | case S_IFREG: |
---|
247 | 251 | ceph_fscache_register_inode_cookie(inode); |
---|
248 | 252 | ceph_fscache_file_set_cookie(inode, file); |
---|
| 253 | + fallthrough; |
---|
249 | 254 | case S_IFDIR: |
---|
250 | 255 | ret = ceph_init_file_info(inode, file, fmode, |
---|
251 | 256 | S_ISDIR(inode->i_mode)); |
---|
252 | | - if (ret) |
---|
253 | | - return ret; |
---|
254 | 257 | break; |
---|
255 | 258 | |
---|
256 | 259 | case S_IFLNK: |
---|
257 | 260 | dout("init_file %p %p 0%o (symlink)\n", inode, file, |
---|
258 | 261 | inode->i_mode); |
---|
259 | | - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ |
---|
260 | 262 | break; |
---|
261 | 263 | |
---|
262 | 264 | default: |
---|
.. | .. |
---|
266 | 268 | * we need to drop the open ref now, since we don't |
---|
267 | 269 | * have .release set to ceph_release. |
---|
268 | 270 | */ |
---|
269 | | - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ |
---|
270 | 271 | BUG_ON(inode->i_fop->release == ceph_release); |
---|
271 | 272 | |
---|
272 | 273 | /* call the proper open fop */ |
---|
.. | .. |
---|
278 | 279 | /* |
---|
279 | 280 | * try to renew caps after session gets killed. |
---|
280 | 281 | */ |
---|
281 | | -int ceph_renew_caps(struct inode *inode) |
---|
| 282 | +int ceph_renew_caps(struct inode *inode, int fmode) |
---|
282 | 283 | { |
---|
283 | | - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
---|
| 284 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); |
---|
284 | 285 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
285 | 286 | struct ceph_mds_request *req; |
---|
286 | 287 | int err, flags, wanted; |
---|
287 | 288 | |
---|
288 | 289 | spin_lock(&ci->i_ceph_lock); |
---|
| 290 | + __ceph_touch_fmode(ci, mdsc, fmode); |
---|
289 | 291 | wanted = __ceph_caps_file_wanted(ci); |
---|
290 | 292 | if (__ceph_is_any_real_caps(ci) && |
---|
291 | 293 | (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { |
---|
.. | .. |
---|
319 | 321 | req->r_inode = inode; |
---|
320 | 322 | ihold(inode); |
---|
321 | 323 | req->r_num_caps = 1; |
---|
322 | | - req->r_fmode = -1; |
---|
323 | 324 | |
---|
324 | 325 | err = ceph_mdsc_do_request(mdsc, NULL, req); |
---|
325 | 326 | ceph_mdsc_put_request(req); |
---|
.. | .. |
---|
365 | 366 | |
---|
366 | 367 | /* trivially open snapdir */ |
---|
367 | 368 | if (ceph_snap(inode) == CEPH_SNAPDIR) { |
---|
368 | | - spin_lock(&ci->i_ceph_lock); |
---|
369 | | - __ceph_get_fmode(ci, fmode); |
---|
370 | | - spin_unlock(&ci->i_ceph_lock); |
---|
371 | 369 | return ceph_init_file(inode, file, fmode); |
---|
372 | 370 | } |
---|
373 | 371 | |
---|
.. | .. |
---|
385 | 383 | dout("open %p fmode %d want %s issued %s using existing\n", |
---|
386 | 384 | inode, fmode, ceph_cap_string(wanted), |
---|
387 | 385 | ceph_cap_string(issued)); |
---|
388 | | - __ceph_get_fmode(ci, fmode); |
---|
| 386 | + __ceph_touch_fmode(ci, mdsc, fmode); |
---|
389 | 387 | spin_unlock(&ci->i_ceph_lock); |
---|
390 | 388 | |
---|
391 | 389 | /* adjust wanted? */ |
---|
.. | .. |
---|
397 | 395 | return ceph_init_file(inode, file, fmode); |
---|
398 | 396 | } else if (ceph_snap(inode) != CEPH_NOSNAP && |
---|
399 | 397 | (ci->i_snap_caps & wanted) == wanted) { |
---|
400 | | - __ceph_get_fmode(ci, fmode); |
---|
| 398 | + __ceph_touch_fmode(ci, mdsc, fmode); |
---|
401 | 399 | spin_unlock(&ci->i_ceph_lock); |
---|
402 | 400 | return ceph_init_file(inode, file, fmode); |
---|
403 | 401 | } |
---|
.. | .. |
---|
423 | 421 | return err; |
---|
424 | 422 | } |
---|
425 | 423 | |
---|
| 424 | +/* Clone the layout from a synchronous create, if the dir now has Dc caps */ |
---|
| 425 | +static void |
---|
| 426 | +cache_file_layout(struct inode *dst, struct inode *src) |
---|
| 427 | +{ |
---|
| 428 | + struct ceph_inode_info *cdst = ceph_inode(dst); |
---|
| 429 | + struct ceph_inode_info *csrc = ceph_inode(src); |
---|
| 430 | + |
---|
| 431 | + spin_lock(&cdst->i_ceph_lock); |
---|
| 432 | + if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && |
---|
| 433 | + !ceph_file_layout_is_valid(&cdst->i_cached_layout)) { |
---|
| 434 | + memcpy(&cdst->i_cached_layout, &csrc->i_layout, |
---|
| 435 | + sizeof(cdst->i_cached_layout)); |
---|
| 436 | + rcu_assign_pointer(cdst->i_cached_layout.pool_ns, |
---|
| 437 | + ceph_try_get_string(csrc->i_layout.pool_ns)); |
---|
| 438 | + } |
---|
| 439 | + spin_unlock(&cdst->i_ceph_lock); |
---|
| 440 | +} |
---|
| 441 | + |
---|
| 442 | +/* |
---|
| 443 | + * Try to set up an async create. We need caps, a file layout, an inode number, |
---|
| 444 | + * and either a lease on the dentry or complete dir info. If any of those |
---|
| 445 | + * criteria are not satisfied, then return false and the caller can go |
---|
| 446 | + * synchronous. |
---|
| 447 | + */ |
---|
| 448 | +static int try_prep_async_create(struct inode *dir, struct dentry *dentry, |
---|
| 449 | + struct ceph_file_layout *lo, u64 *pino) |
---|
| 450 | +{ |
---|
| 451 | + struct ceph_inode_info *ci = ceph_inode(dir); |
---|
| 452 | + struct ceph_dentry_info *di = ceph_dentry(dentry); |
---|
| 453 | + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; |
---|
| 454 | + u64 ino; |
---|
| 455 | + |
---|
| 456 | + spin_lock(&ci->i_ceph_lock); |
---|
| 457 | + /* No auth cap means no chance for Dc caps */ |
---|
| 458 | + if (!ci->i_auth_cap) |
---|
| 459 | + goto no_async; |
---|
| 460 | + |
---|
| 461 | + /* Any delegated inos? */ |
---|
| 462 | + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) |
---|
| 463 | + goto no_async; |
---|
| 464 | + |
---|
| 465 | + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) |
---|
| 466 | + goto no_async; |
---|
| 467 | + |
---|
| 468 | + if ((__ceph_caps_issued(ci, NULL) & want) != want) |
---|
| 469 | + goto no_async; |
---|
| 470 | + |
---|
| 471 | + if (d_in_lookup(dentry)) { |
---|
| 472 | + if (!__ceph_dir_is_complete(ci)) |
---|
| 473 | + goto no_async; |
---|
| 474 | + spin_lock(&dentry->d_lock); |
---|
| 475 | + di->lease_shared_gen = atomic_read(&ci->i_shared_gen); |
---|
| 476 | + spin_unlock(&dentry->d_lock); |
---|
| 477 | + } else if (atomic_read(&ci->i_shared_gen) != |
---|
| 478 | + READ_ONCE(di->lease_shared_gen)) { |
---|
| 479 | + goto no_async; |
---|
| 480 | + } |
---|
| 481 | + |
---|
| 482 | + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); |
---|
| 483 | + if (!ino) |
---|
| 484 | + goto no_async; |
---|
| 485 | + |
---|
| 486 | + *pino = ino; |
---|
| 487 | + ceph_take_cap_refs(ci, want, false); |
---|
| 488 | + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); |
---|
| 489 | + rcu_assign_pointer(lo->pool_ns, |
---|
| 490 | + ceph_try_get_string(ci->i_cached_layout.pool_ns)); |
---|
| 491 | + got = want; |
---|
| 492 | +no_async: |
---|
| 493 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 494 | + return got; |
---|
| 495 | +} |
---|
| 496 | + |
---|
| 497 | +static void restore_deleg_ino(struct inode *dir, u64 ino) |
---|
| 498 | +{ |
---|
| 499 | + struct ceph_inode_info *ci = ceph_inode(dir); |
---|
| 500 | + struct ceph_mds_session *s = NULL; |
---|
| 501 | + |
---|
| 502 | + spin_lock(&ci->i_ceph_lock); |
---|
| 503 | + if (ci->i_auth_cap) |
---|
| 504 | + s = ceph_get_mds_session(ci->i_auth_cap->session); |
---|
| 505 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 506 | + if (s) { |
---|
| 507 | + int err = ceph_restore_deleg_ino(s, ino); |
---|
| 508 | + if (err) |
---|
| 509 | + pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n", |
---|
| 510 | + ino, err); |
---|
| 511 | + ceph_put_mds_session(s); |
---|
| 512 | + } |
---|
| 513 | +} |
---|
| 514 | + |
---|
| 515 | +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, |
---|
| 516 | + struct ceph_mds_request *req) |
---|
| 517 | +{ |
---|
| 518 | + int result = req->r_err ? req->r_err : |
---|
| 519 | + le32_to_cpu(req->r_reply_info.head->result); |
---|
| 520 | + |
---|
| 521 | + if (result == -EJUKEBOX) |
---|
| 522 | + goto out; |
---|
| 523 | + |
---|
| 524 | + mapping_set_error(req->r_parent->i_mapping, result); |
---|
| 525 | + |
---|
| 526 | + if (result) { |
---|
| 527 | + struct dentry *dentry = req->r_dentry; |
---|
| 528 | + int pathlen = 0; |
---|
| 529 | + u64 base = 0; |
---|
| 530 | + char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, |
---|
| 531 | + &base, 0); |
---|
| 532 | + |
---|
| 533 | + ceph_dir_clear_complete(req->r_parent); |
---|
| 534 | + if (!d_unhashed(dentry)) |
---|
| 535 | + d_drop(dentry); |
---|
| 536 | + |
---|
| 537 | + /* FIXME: start returning I/O errors on all accesses? */ |
---|
| 538 | + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", |
---|
| 539 | + base, IS_ERR(path) ? "<<bad>>" : path, result); |
---|
| 540 | + ceph_mdsc_free_path(path, pathlen); |
---|
| 541 | + } |
---|
| 542 | + |
---|
| 543 | + if (req->r_target_inode) { |
---|
| 544 | + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); |
---|
| 545 | + u64 ino = ceph_vino(req->r_target_inode).ino; |
---|
| 546 | + |
---|
| 547 | + if (req->r_deleg_ino != ino) |
---|
| 548 | + pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", |
---|
| 549 | + __func__, req->r_err, req->r_deleg_ino, ino); |
---|
| 550 | + mapping_set_error(req->r_target_inode->i_mapping, result); |
---|
| 551 | + |
---|
| 552 | + spin_lock(&ci->i_ceph_lock); |
---|
| 553 | + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { |
---|
| 554 | + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; |
---|
| 555 | + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); |
---|
| 556 | + } |
---|
| 557 | + ceph_kick_flushing_inode_caps(req->r_session, ci); |
---|
| 558 | + spin_unlock(&ci->i_ceph_lock); |
---|
| 559 | + } else { |
---|
| 560 | + pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, |
---|
| 561 | + req->r_deleg_ino); |
---|
| 562 | + } |
---|
| 563 | +out: |
---|
| 564 | + ceph_mdsc_release_dir_caps(req); |
---|
| 565 | +} |
---|
| 566 | + |
---|
| 567 | +static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, |
---|
| 568 | + struct file *file, umode_t mode, |
---|
| 569 | + struct ceph_mds_request *req, |
---|
| 570 | + struct ceph_acl_sec_ctx *as_ctx, |
---|
| 571 | + struct ceph_file_layout *lo) |
---|
| 572 | +{ |
---|
| 573 | + int ret; |
---|
| 574 | + char xattr_buf[4]; |
---|
| 575 | + struct ceph_mds_reply_inode in = { }; |
---|
| 576 | + struct ceph_mds_reply_info_in iinfo = { .in = &in }; |
---|
| 577 | + struct ceph_inode_info *ci = ceph_inode(dir); |
---|
| 578 | + struct inode *inode; |
---|
| 579 | + struct timespec64 now; |
---|
| 580 | + struct ceph_string *pool_ns; |
---|
| 581 | + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); |
---|
| 582 | + struct ceph_vino vino = { .ino = req->r_deleg_ino, |
---|
| 583 | + .snap = CEPH_NOSNAP }; |
---|
| 584 | + |
---|
| 585 | + ktime_get_real_ts64(&now); |
---|
| 586 | + |
---|
| 587 | + inode = ceph_get_inode(dentry->d_sb, vino); |
---|
| 588 | + if (IS_ERR(inode)) |
---|
| 589 | + return PTR_ERR(inode); |
---|
| 590 | + |
---|
| 591 | + iinfo.inline_version = CEPH_INLINE_NONE; |
---|
| 592 | + iinfo.change_attr = 1; |
---|
| 593 | + ceph_encode_timespec64(&iinfo.btime, &now); |
---|
| 594 | + |
---|
| 595 | + if (req->r_pagelist) { |
---|
| 596 | + iinfo.xattr_len = req->r_pagelist->length; |
---|
| 597 | + iinfo.xattr_data = req->r_pagelist->mapped_tail; |
---|
| 598 | + } else { |
---|
| 599 | + /* fake it */ |
---|
| 600 | + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); |
---|
| 601 | + iinfo.xattr_data = xattr_buf; |
---|
| 602 | + memset(iinfo.xattr_data, 0, iinfo.xattr_len); |
---|
| 603 | + } |
---|
| 604 | + |
---|
| 605 | + in.ino = cpu_to_le64(vino.ino); |
---|
| 606 | + in.snapid = cpu_to_le64(CEPH_NOSNAP); |
---|
| 607 | + in.version = cpu_to_le64(1); // ??? |
---|
| 608 | + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); |
---|
| 609 | + in.cap.cap_id = cpu_to_le64(1); |
---|
| 610 | + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); |
---|
| 611 | + in.cap.flags = CEPH_CAP_FLAG_AUTH; |
---|
| 612 | + in.ctime = in.mtime = in.atime = iinfo.btime; |
---|
| 613 | + in.truncate_seq = cpu_to_le32(1); |
---|
| 614 | + in.truncate_size = cpu_to_le64(-1ULL); |
---|
| 615 | + in.xattr_version = cpu_to_le64(1); |
---|
| 616 | + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); |
---|
| 617 | + if (dir->i_mode & S_ISGID) { |
---|
| 618 | + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); |
---|
| 619 | + |
---|
| 620 | + /* Directories always inherit the setgid bit. */ |
---|
| 621 | + if (S_ISDIR(mode)) |
---|
| 622 | + mode |= S_ISGID; |
---|
| 623 | + else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && |
---|
| 624 | + !in_group_p(dir->i_gid) && |
---|
| 625 | + !capable_wrt_inode_uidgid(dir, CAP_FSETID)) |
---|
| 626 | + mode &= ~S_ISGID; |
---|
| 627 | + } else { |
---|
| 628 | + in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); |
---|
| 629 | + } |
---|
| 630 | + in.mode = cpu_to_le32((u32)mode); |
---|
| 631 | + |
---|
| 632 | + in.nlink = cpu_to_le32(1); |
---|
| 633 | + in.max_size = cpu_to_le64(lo->stripe_unit); |
---|
| 634 | + |
---|
| 635 | + ceph_file_layout_to_legacy(lo, &in.layout); |
---|
| 636 | + /* lo is private, so pool_ns can't change */ |
---|
| 637 | + pool_ns = rcu_dereference_raw(lo->pool_ns); |
---|
| 638 | + if (pool_ns) { |
---|
| 639 | + iinfo.pool_ns_len = pool_ns->len; |
---|
| 640 | + iinfo.pool_ns_data = pool_ns->str; |
---|
| 641 | + } |
---|
| 642 | + |
---|
| 643 | + down_read(&mdsc->snap_rwsem); |
---|
| 644 | + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, |
---|
| 645 | + req->r_fmode, NULL); |
---|
| 646 | + up_read(&mdsc->snap_rwsem); |
---|
| 647 | + if (ret) { |
---|
| 648 | + dout("%s failed to fill inode: %d\n", __func__, ret); |
---|
| 649 | + ceph_dir_clear_complete(dir); |
---|
| 650 | + if (!d_unhashed(dentry)) |
---|
| 651 | + d_drop(dentry); |
---|
| 652 | + if (inode->i_state & I_NEW) |
---|
| 653 | + discard_new_inode(inode); |
---|
| 654 | + } else { |
---|
| 655 | + struct dentry *dn; |
---|
| 656 | + |
---|
| 657 | + dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__, |
---|
| 658 | + vino.ino, ceph_ino(dir), dentry->d_name.name); |
---|
| 659 | + ceph_dir_clear_ordered(dir); |
---|
| 660 | + ceph_init_inode_acls(inode, as_ctx); |
---|
| 661 | + if (inode->i_state & I_NEW) { |
---|
| 662 | + /* |
---|
| 663 | + * If it's not I_NEW, then someone created this before |
---|
| 664 | + * we got here. Assume the server is aware of it at |
---|
| 665 | + * that point and don't worry about setting |
---|
| 666 | + * CEPH_I_ASYNC_CREATE. |
---|
| 667 | + */ |
---|
| 668 | + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; |
---|
| 669 | + unlock_new_inode(inode); |
---|
| 670 | + } |
---|
| 671 | + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { |
---|
| 672 | + if (!d_unhashed(dentry)) |
---|
| 673 | + d_drop(dentry); |
---|
| 674 | + dn = d_splice_alias(inode, dentry); |
---|
| 675 | + WARN_ON_ONCE(dn && dn != dentry); |
---|
| 676 | + } |
---|
| 677 | + file->f_mode |= FMODE_CREATED; |
---|
| 678 | + ret = finish_open(file, dentry, ceph_open); |
---|
| 679 | + } |
---|
| 680 | + return ret; |
---|
| 681 | +} |
---|
426 | 682 | |
---|
427 | 683 | /* |
---|
428 | 684 | * Do a lookup + open with a single request. If we get a non-existent |
---|
.. | .. |
---|
435 | 691 | struct ceph_mds_client *mdsc = fsc->mdsc; |
---|
436 | 692 | struct ceph_mds_request *req; |
---|
437 | 693 | struct dentry *dn; |
---|
438 | | - struct ceph_acls_info acls = {}; |
---|
| 694 | + struct ceph_acl_sec_ctx as_ctx = {}; |
---|
| 695 | + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); |
---|
439 | 696 | int mask; |
---|
440 | 697 | int err; |
---|
441 | 698 | |
---|
.. | .. |
---|
446 | 703 | if (dentry->d_name.len > NAME_MAX) |
---|
447 | 704 | return -ENAMETOOLONG; |
---|
448 | 705 | |
---|
| 706 | + /* |
---|
| 707 | + * Do not truncate the file, since atomic_open is called before the |
---|
| 708 | + * permission check. The caller will do the truncation afterward. |
---|
| 709 | + */ |
---|
| 710 | + flags &= ~O_TRUNC; |
---|
| 711 | + |
---|
449 | 712 | if (flags & O_CREAT) { |
---|
450 | 713 | if (ceph_quota_is_max_files_exceeded(dir)) |
---|
451 | 714 | return -EDQUOT; |
---|
452 | | - err = ceph_pre_init_acls(dir, &mode, &acls); |
---|
| 715 | + err = ceph_pre_init_acls(dir, &mode, &as_ctx); |
---|
453 | 716 | if (err < 0) |
---|
454 | 717 | return err; |
---|
| 718 | + err = ceph_security_init_secctx(dentry, mode, &as_ctx); |
---|
| 719 | + if (err < 0) |
---|
| 720 | + goto out_ctx; |
---|
| 721 | + /* Async create can't handle more than a page of xattrs */ |
---|
| 722 | + if (as_ctx.pagelist && |
---|
| 723 | + !list_is_singular(&as_ctx.pagelist->head)) |
---|
| 724 | + try_async = false; |
---|
| 725 | + } else if (!d_in_lookup(dentry)) { |
---|
| 726 | + /* If it's not being looked up, it's negative */ |
---|
| 727 | + return -ENOENT; |
---|
455 | 728 | } |
---|
456 | | - |
---|
| 729 | +retry: |
---|
457 | 730 | /* do the open */ |
---|
458 | 731 | req = prepare_open_request(dir->i_sb, flags, mode); |
---|
459 | 732 | if (IS_ERR(req)) { |
---|
460 | 733 | err = PTR_ERR(req); |
---|
461 | | - goto out_acl; |
---|
| 734 | + goto out_ctx; |
---|
462 | 735 | } |
---|
463 | 736 | req->r_dentry = dget(dentry); |
---|
464 | 737 | req->r_num_caps = 2; |
---|
| 738 | + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; |
---|
| 739 | + if (ceph_security_xattr_wanted(dir)) |
---|
| 740 | + mask |= CEPH_CAP_XATTR_SHARED; |
---|
| 741 | + req->r_args.open.mask = cpu_to_le32(mask); |
---|
| 742 | + req->r_parent = dir; |
---|
| 743 | + |
---|
465 | 744 | if (flags & O_CREAT) { |
---|
| 745 | + struct ceph_file_layout lo; |
---|
| 746 | + |
---|
466 | 747 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; |
---|
467 | 748 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; |
---|
468 | | - if (acls.pagelist) { |
---|
469 | | - req->r_pagelist = acls.pagelist; |
---|
470 | | - acls.pagelist = NULL; |
---|
| 749 | + if (as_ctx.pagelist) { |
---|
| 750 | + req->r_pagelist = as_ctx.pagelist; |
---|
| 751 | + as_ctx.pagelist = NULL; |
---|
| 752 | + } |
---|
| 753 | + if (try_async && |
---|
| 754 | + (req->r_dir_caps = |
---|
| 755 | + try_prep_async_create(dir, dentry, &lo, |
---|
| 756 | + &req->r_deleg_ino))) { |
---|
| 757 | + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); |
---|
| 758 | + req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); |
---|
| 759 | + req->r_callback = ceph_async_create_cb; |
---|
| 760 | + err = ceph_mdsc_submit_request(mdsc, dir, req); |
---|
| 761 | + if (!err) { |
---|
| 762 | + err = ceph_finish_async_create(dir, dentry, |
---|
| 763 | + file, mode, req, |
---|
| 764 | + &as_ctx, &lo); |
---|
| 765 | + } else if (err == -EJUKEBOX) { |
---|
| 766 | + restore_deleg_ino(dir, req->r_deleg_ino); |
---|
| 767 | + ceph_mdsc_put_request(req); |
---|
| 768 | + try_async = false; |
---|
| 769 | + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); |
---|
| 770 | + goto retry; |
---|
| 771 | + } |
---|
| 772 | + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); |
---|
| 773 | + goto out_req; |
---|
471 | 774 | } |
---|
472 | 775 | } |
---|
473 | 776 | |
---|
474 | | - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; |
---|
475 | | - if (ceph_security_xattr_wanted(dir)) |
---|
476 | | - mask |= CEPH_CAP_XATTR_SHARED; |
---|
477 | | - req->r_args.open.mask = cpu_to_le32(mask); |
---|
478 | | - |
---|
479 | | - req->r_parent = dir; |
---|
480 | 777 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); |
---|
481 | | - err = ceph_mdsc_do_request(mdsc, |
---|
482 | | - (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, |
---|
483 | | - req); |
---|
| 778 | + err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); |
---|
484 | 779 | err = ceph_handle_snapdir(req, dentry, err); |
---|
485 | 780 | if (err) |
---|
486 | 781 | goto out_req; |
---|
.. | .. |
---|
505 | 800 | } else { |
---|
506 | 801 | dout("atomic_open finish_open on dn %p\n", dn); |
---|
507 | 802 | if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { |
---|
508 | | - ceph_init_inode_acls(d_inode(dentry), &acls); |
---|
| 803 | + struct inode *newino = d_inode(dentry); |
---|
| 804 | + |
---|
| 805 | + cache_file_layout(dir, newino); |
---|
| 806 | + ceph_init_inode_acls(newino, &as_ctx); |
---|
509 | 807 | file->f_mode |= FMODE_CREATED; |
---|
510 | 808 | } |
---|
511 | 809 | err = finish_open(file, dentry, ceph_open); |
---|
512 | 810 | } |
---|
513 | 811 | out_req: |
---|
514 | | - if (!req->r_err && req->r_target_inode) |
---|
515 | | - ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); |
---|
516 | 812 | ceph_mdsc_put_request(req); |
---|
517 | | -out_acl: |
---|
518 | | - ceph_release_acls_info(&acls); |
---|
| 813 | +out_ctx: |
---|
| 814 | + ceph_release_acl_sec_ctx(&as_ctx); |
---|
519 | 815 | dout("atomic_open result=%d\n", err); |
---|
520 | 816 | return err; |
---|
521 | 817 | } |
---|
.. | .. |
---|
529 | 825 | dout("release inode %p dir file %p\n", inode, file); |
---|
530 | 826 | WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); |
---|
531 | 827 | |
---|
532 | | - ceph_put_fmode(ci, dfi->file_info.fmode); |
---|
| 828 | + ceph_put_fmode(ci, dfi->file_info.fmode, 1); |
---|
533 | 829 | |
---|
534 | 830 | if (dfi->last_readdir) |
---|
535 | 831 | ceph_mdsc_put_request(dfi->last_readdir); |
---|
.. | .. |
---|
541 | 837 | dout("release inode %p regular file %p\n", inode, file); |
---|
542 | 838 | WARN_ON(!list_empty(&fi->rw_contexts)); |
---|
543 | 839 | |
---|
544 | | - ceph_put_fmode(ci, fi->fmode); |
---|
| 840 | + ceph_put_fmode(ci, fi->fmode, 1); |
---|
| 841 | + |
---|
545 | 842 | kmem_cache_free(ceph_file_cachep, fi); |
---|
546 | 843 | } |
---|
547 | 844 | |
---|
.. | .. |
---|
557 | 854 | }; |
---|
558 | 855 | |
---|
559 | 856 | /* |
---|
560 | | - * Read a range of bytes striped over one or more objects. Iterate over |
---|
561 | | - * objects we stripe over. (That's not atomic, but good enough for now.) |
---|
| 857 | + * Completely synchronous read and write methods. Direct from __user |
---|
| 858 | + * buffer to osd, or directly to user pages (if O_DIRECT). |
---|
| 859 | + * |
---|
| 860 | + * If the read spans an object boundary, just do multiple reads. (That's not |
---|
| 861 | + * atomic, but good enough for now.) |
---|
562 | 862 | * |
---|
563 | 863 | * If we get a short result from the OSD, check against i_size; we need to |
---|
564 | 864 | * only return a short read to the caller if we hit EOF. |
---|
565 | 865 | */ |
---|
566 | | -static int striped_read(struct inode *inode, |
---|
567 | | - u64 pos, u64 len, |
---|
568 | | - struct page **pages, int num_pages, |
---|
569 | | - int page_align, int *checkeof) |
---|
570 | | -{ |
---|
571 | | - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
---|
572 | | - struct ceph_inode_info *ci = ceph_inode(inode); |
---|
573 | | - u64 this_len; |
---|
574 | | - loff_t i_size; |
---|
575 | | - int page_idx; |
---|
576 | | - int ret, read = 0; |
---|
577 | | - bool hit_stripe, was_short; |
---|
578 | | - |
---|
579 | | - /* |
---|
580 | | - * we may need to do multiple reads. not atomic, unfortunately. |
---|
581 | | - */ |
---|
582 | | -more: |
---|
583 | | - this_len = len; |
---|
584 | | - page_idx = (page_align + read) >> PAGE_SHIFT; |
---|
585 | | - ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), |
---|
586 | | - &ci->i_layout, pos, &this_len, |
---|
587 | | - ci->i_truncate_seq, ci->i_truncate_size, |
---|
588 | | - pages + page_idx, num_pages - page_idx, |
---|
589 | | - ((page_align + read) & ~PAGE_MASK)); |
---|
590 | | - if (ret == -ENOENT) |
---|
591 | | - ret = 0; |
---|
592 | | - hit_stripe = this_len < len; |
---|
593 | | - was_short = ret >= 0 && ret < this_len; |
---|
594 | | - dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read, |
---|
595 | | - ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); |
---|
596 | | - |
---|
597 | | - i_size = i_size_read(inode); |
---|
598 | | - if (ret >= 0) { |
---|
599 | | - if (was_short && (pos + ret < i_size)) { |
---|
600 | | - int zlen = min(this_len - ret, i_size - pos - ret); |
---|
601 | | - int zoff = page_align + read + ret; |
---|
602 | | - dout(" zero gap %llu to %llu\n", |
---|
603 | | - pos + ret, pos + ret + zlen); |
---|
604 | | - ceph_zero_page_vector_range(zoff, zlen, pages); |
---|
605 | | - ret += zlen; |
---|
606 | | - } |
---|
607 | | - |
---|
608 | | - read += ret; |
---|
609 | | - pos += ret; |
---|
610 | | - len -= ret; |
---|
611 | | - |
---|
612 | | - /* hit stripe and need continue*/ |
---|
613 | | - if (len && hit_stripe && pos < i_size) |
---|
614 | | - goto more; |
---|
615 | | - } |
---|
616 | | - |
---|
617 | | - if (read > 0) { |
---|
618 | | - ret = read; |
---|
619 | | - /* did we bounce off eof? */ |
---|
620 | | - if (pos + len > i_size) |
---|
621 | | - *checkeof = CHECK_EOF; |
---|
622 | | - } |
---|
623 | | - |
---|
624 | | - dout("striped_read returns %d\n", ret); |
---|
625 | | - return ret; |
---|
626 | | -} |
---|
627 | | - |
---|
628 | | -/* |
---|
629 | | - * Completely synchronous read and write methods. Direct from __user |
---|
630 | | - * buffer to osd, or directly to user pages (if O_DIRECT). |
---|
631 | | - * |
---|
632 | | - * If the read spans object boundary, just do multiple reads. |
---|
633 | | - */ |
---|
634 | 866 | static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, |
---|
635 | | - int *checkeof) |
---|
| 867 | + int *retry_op) |
---|
636 | 868 | { |
---|
637 | 869 | struct file *file = iocb->ki_filp; |
---|
638 | 870 | struct inode *inode = file_inode(file); |
---|
639 | | - struct page **pages; |
---|
640 | | - u64 off = iocb->ki_pos; |
---|
641 | | - int num_pages; |
---|
| 871 | + struct ceph_inode_info *ci = ceph_inode(inode); |
---|
| 872 | + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
---|
| 873 | + struct ceph_osd_client *osdc = &fsc->client->osdc; |
---|
642 | 874 | ssize_t ret; |
---|
643 | | - size_t len = iov_iter_count(to); |
---|
| 875 | + u64 off = iocb->ki_pos; |
---|
| 876 | + u64 len = iov_iter_count(to); |
---|
644 | 877 | |
---|
645 | 878 | dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, |
---|
646 | 879 | (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); |
---|
.. | .. |
---|
653 | 886 | * but it will at least behave sensibly when they are |
---|
654 | 887 | * in sequence. |
---|
655 | 888 | */ |
---|
656 | | - ret = filemap_write_and_wait_range(inode->i_mapping, off, |
---|
657 | | - off + len); |
---|
| 889 | + ret = filemap_write_and_wait_range(inode->i_mapping, |
---|
| 890 | + off, off + len - 1); |
---|
658 | 891 | if (ret < 0) |
---|
659 | 892 | return ret; |
---|
660 | 893 | |
---|
661 | | - if (unlikely(to->type & ITER_PIPE)) { |
---|
| 894 | + ret = 0; |
---|
| 895 | + while ((len = iov_iter_count(to)) > 0) { |
---|
| 896 | + struct ceph_osd_request *req; |
---|
| 897 | + struct page **pages; |
---|
| 898 | + int num_pages; |
---|
662 | 899 | size_t page_off; |
---|
663 | | - ret = iov_iter_get_pages_alloc(to, &pages, len, |
---|
664 | | - &page_off); |
---|
665 | | - if (ret <= 0) |
---|
666 | | - return -ENOMEM; |
---|
667 | | - num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); |
---|
| 900 | + u64 i_size; |
---|
| 901 | + bool more; |
---|
| 902 | + int idx; |
---|
| 903 | + size_t left; |
---|
668 | 904 | |
---|
669 | | - ret = striped_read(inode, off, ret, pages, num_pages, |
---|
670 | | - page_off, checkeof); |
---|
671 | | - if (ret > 0) { |
---|
672 | | - iov_iter_advance(to, ret); |
---|
673 | | - off += ret; |
---|
674 | | - } else { |
---|
675 | | - iov_iter_advance(to, 0); |
---|
| 905 | + req = ceph_osdc_new_request(osdc, &ci->i_layout, |
---|
| 906 | + ci->i_vino, off, &len, 0, 1, |
---|
| 907 | + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, |
---|
| 908 | + NULL, ci->i_truncate_seq, |
---|
| 909 | + ci->i_truncate_size, false); |
---|
| 910 | + if (IS_ERR(req)) { |
---|
| 911 | + ret = PTR_ERR(req); |
---|
| 912 | + break; |
---|
676 | 913 | } |
---|
677 | | - ceph_put_page_vector(pages, num_pages, false); |
---|
678 | | - } else { |
---|
| 914 | + |
---|
| 915 | + more = len < iov_iter_count(to); |
---|
| 916 | + |
---|
679 | 917 | num_pages = calc_pages_for(off, len); |
---|
| 918 | + page_off = off & ~PAGE_MASK; |
---|
680 | 919 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); |
---|
681 | | - if (IS_ERR(pages)) |
---|
682 | | - return PTR_ERR(pages); |
---|
| 920 | + if (IS_ERR(pages)) { |
---|
| 921 | + ceph_osdc_put_request(req); |
---|
| 922 | + ret = PTR_ERR(pages); |
---|
| 923 | + break; |
---|
| 924 | + } |
---|
683 | 925 | |
---|
684 | | - ret = striped_read(inode, off, len, pages, num_pages, |
---|
685 | | - (off & ~PAGE_MASK), checkeof); |
---|
686 | | - if (ret > 0) { |
---|
687 | | - int l, k = 0; |
---|
688 | | - size_t left = ret; |
---|
| 926 | + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, |
---|
| 927 | + false, false); |
---|
| 928 | + ret = ceph_osdc_start_request(osdc, req, false); |
---|
| 929 | + if (!ret) |
---|
| 930 | + ret = ceph_osdc_wait_request(osdc, req); |
---|
689 | 931 | |
---|
690 | | - while (left) { |
---|
691 | | - size_t page_off = off & ~PAGE_MASK; |
---|
692 | | - size_t copy = min_t(size_t, left, |
---|
693 | | - PAGE_SIZE - page_off); |
---|
694 | | - l = copy_page_to_iter(pages[k++], page_off, |
---|
695 | | - copy, to); |
---|
696 | | - off += l; |
---|
697 | | - left -= l; |
---|
698 | | - if (l < copy) |
---|
699 | | - break; |
---|
| 932 | + ceph_update_read_latency(&fsc->mdsc->metric, |
---|
| 933 | + req->r_start_latency, |
---|
| 934 | + req->r_end_latency, |
---|
| 935 | + ret); |
---|
| 936 | + |
---|
| 937 | + ceph_osdc_put_request(req); |
---|
| 938 | + |
---|
| 939 | + i_size = i_size_read(inode); |
---|
| 940 | + dout("sync_read %llu~%llu got %zd i_size %llu%s\n", |
---|
| 941 | + off, len, ret, i_size, (more ? " MORE" : "")); |
---|
| 942 | + |
---|
| 943 | + if (ret == -ENOENT) |
---|
| 944 | + ret = 0; |
---|
| 945 | + if (ret >= 0 && ret < len && (off + ret < i_size)) { |
---|
| 946 | + int zlen = min(len - ret, i_size - off - ret); |
---|
| 947 | + int zoff = page_off + ret; |
---|
| 948 | + dout("sync_read zero gap %llu~%llu\n", |
---|
| 949 | + off + ret, off + ret + zlen); |
---|
| 950 | + ceph_zero_page_vector_range(zoff, zlen, pages); |
---|
| 951 | + ret += zlen; |
---|
| 952 | + } |
---|
| 953 | + |
---|
| 954 | + idx = 0; |
---|
| 955 | + left = ret > 0 ? ret : 0; |
---|
| 956 | + while (left > 0) { |
---|
| 957 | + size_t len, copied; |
---|
| 958 | + page_off = off & ~PAGE_MASK; |
---|
| 959 | + len = min_t(size_t, left, PAGE_SIZE - page_off); |
---|
| 960 | + SetPageUptodate(pages[idx]); |
---|
| 961 | + copied = copy_page_to_iter(pages[idx++], |
---|
| 962 | + page_off, len, to); |
---|
| 963 | + off += copied; |
---|
| 964 | + left -= copied; |
---|
| 965 | + if (copied < len) { |
---|
| 966 | + ret = -EFAULT; |
---|
| 967 | + break; |
---|
700 | 968 | } |
---|
701 | 969 | } |
---|
702 | 970 | ceph_release_page_vector(pages, num_pages); |
---|
| 971 | + |
---|
| 972 | + if (ret < 0) { |
---|
| 973 | + if (ret == -EBLOCKLISTED) |
---|
| 974 | + fsc->blocklisted = true; |
---|
| 975 | + break; |
---|
| 976 | + } |
---|
| 977 | + |
---|
| 978 | + if (off >= i_size || !more) |
---|
| 979 | + break; |
---|
703 | 980 | } |
---|
704 | 981 | |
---|
705 | 982 | if (off > iocb->ki_pos) { |
---|
| 983 | + if (ret >= 0 && |
---|
| 984 | + iov_iter_count(to) > 0 && off >= i_size_read(inode)) |
---|
| 985 | + *retry_op = CHECK_EOF; |
---|
706 | 986 | ret = off - iocb->ki_pos; |
---|
707 | 987 | iocb->ki_pos = off; |
---|
708 | 988 | } |
---|
709 | 989 | |
---|
710 | | - dout("sync_read result %zd\n", ret); |
---|
| 990 | + dout("sync_read result %zd retry_op %d\n", ret, *retry_op); |
---|
711 | 991 | return ret; |
---|
712 | 992 | } |
---|
713 | 993 | |
---|
.. | .. |
---|
739 | 1019 | |
---|
740 | 1020 | if (!atomic_dec_and_test(&aio_req->pending_reqs)) |
---|
741 | 1021 | return; |
---|
| 1022 | + |
---|
| 1023 | + if (aio_req->iocb->ki_flags & IOCB_DIRECT) |
---|
| 1024 | + inode_dio_end(inode); |
---|
742 | 1025 | |
---|
743 | 1026 | ret = aio_req->error; |
---|
744 | 1027 | if (!ret) |
---|
.. | .. |
---|
780 | 1063 | struct inode *inode = req->r_inode; |
---|
781 | 1064 | struct ceph_aio_request *aio_req = req->r_priv; |
---|
782 | 1065 | struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); |
---|
| 1066 | + struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; |
---|
783 | 1067 | |
---|
784 | 1068 | BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); |
---|
785 | 1069 | BUG_ON(!osd_data->num_bvecs); |
---|
786 | 1070 | |
---|
787 | 1071 | dout("ceph_aio_complete_req %p rc %d bytes %u\n", |
---|
788 | 1072 | inode, rc, osd_data->bvec_pos.iter.bi_size); |
---|
| 1073 | + |
---|
| 1074 | + /* r_start_latency == 0 means the request was not submitted */ |
---|
| 1075 | + if (req->r_start_latency) { |
---|
| 1076 | + if (aio_req->write) |
---|
| 1077 | + ceph_update_write_latency(metric, req->r_start_latency, |
---|
| 1078 | + req->r_end_latency, rc); |
---|
| 1079 | + else |
---|
| 1080 | + ceph_update_read_latency(metric, req->r_start_latency, |
---|
| 1081 | + req->r_end_latency, rc); |
---|
| 1082 | + } |
---|
789 | 1083 | |
---|
790 | 1084 | if (rc == -EOLDSNAPC) { |
---|
791 | 1085 | struct ceph_aio_work *aio_work; |
---|
.. | .. |
---|
795 | 1089 | if (aio_work) { |
---|
796 | 1090 | INIT_WORK(&aio_work->work, ceph_aio_retry_work); |
---|
797 | 1091 | aio_work->req = req; |
---|
798 | | - queue_work(ceph_inode_to_client(inode)->wb_wq, |
---|
| 1092 | + queue_work(ceph_inode_to_client(inode)->inode_wq, |
---|
799 | 1093 | &aio_work->work); |
---|
800 | 1094 | return; |
---|
801 | 1095 | } |
---|
.. | .. |
---|
821 | 1115 | aio_req->total_len = rc + zlen; |
---|
822 | 1116 | } |
---|
823 | 1117 | |
---|
824 | | - iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs, |
---|
| 1118 | + iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, |
---|
825 | 1119 | osd_data->num_bvecs, |
---|
826 | 1120 | osd_data->bvec_pos.iter.bi_size); |
---|
827 | 1121 | iov_iter_advance(&i, rc); |
---|
.. | .. |
---|
865 | 1159 | } |
---|
866 | 1160 | spin_unlock(&ci->i_ceph_lock); |
---|
867 | 1161 | |
---|
868 | | - req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, |
---|
| 1162 | + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, |
---|
869 | 1163 | false, GFP_NOFS); |
---|
870 | 1164 | if (!req) { |
---|
871 | 1165 | ret = -ENOMEM; |
---|
.. | .. |
---|
877 | 1171 | ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); |
---|
878 | 1172 | ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); |
---|
879 | 1173 | |
---|
| 1174 | + req->r_ops[0] = orig_req->r_ops[0]; |
---|
| 1175 | + |
---|
| 1176 | + req->r_mtime = aio_req->mtime; |
---|
| 1177 | + req->r_data_offset = req->r_ops[0].extent.offset; |
---|
| 1178 | + |
---|
880 | 1179 | ret = ceph_osdc_alloc_messages(req, GFP_NOFS); |
---|
881 | 1180 | if (ret) { |
---|
882 | 1181 | ceph_osdc_put_request(req); |
---|
883 | 1182 | req = orig_req; |
---|
884 | 1183 | goto out; |
---|
885 | 1184 | } |
---|
886 | | - |
---|
887 | | - req->r_ops[0] = orig_req->r_ops[0]; |
---|
888 | | - |
---|
889 | | - req->r_mtime = aio_req->mtime; |
---|
890 | | - req->r_data_offset = req->r_ops[0].extent.offset; |
---|
891 | 1185 | |
---|
892 | 1186 | ceph_osdc_put_request(orig_req); |
---|
893 | 1187 | |
---|
.. | .. |
---|
915 | 1209 | struct inode *inode = file_inode(file); |
---|
916 | 1210 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
917 | 1211 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
---|
| 1212 | + struct ceph_client_metric *metric = &fsc->mdsc->metric; |
---|
918 | 1213 | struct ceph_vino vino; |
---|
919 | 1214 | struct ceph_osd_request *req; |
---|
920 | 1215 | struct bio_vec *bvecs; |
---|
921 | 1216 | struct ceph_aio_request *aio_req = NULL; |
---|
922 | 1217 | int num_pages = 0; |
---|
923 | 1218 | int flags; |
---|
924 | | - int ret; |
---|
| 1219 | + int ret = 0; |
---|
925 | 1220 | struct timespec64 mtime = current_time(inode); |
---|
926 | 1221 | size_t count = iov_iter_count(iter); |
---|
927 | 1222 | loff_t pos = iocb->ki_pos; |
---|
.. | .. |
---|
933 | 1228 | |
---|
934 | 1229 | dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", |
---|
935 | 1230 | (write ? "write" : "read"), file, pos, (unsigned)count, |
---|
936 | | - snapc, snapc->seq); |
---|
937 | | - |
---|
938 | | - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); |
---|
939 | | - if (ret < 0) |
---|
940 | | - return ret; |
---|
| 1231 | + snapc, snapc ? snapc->seq : 0); |
---|
941 | 1232 | |
---|
942 | 1233 | if (write) { |
---|
943 | 1234 | int ret2 = invalidate_inode_pages2_range(inode->i_mapping, |
---|
944 | 1235 | pos >> PAGE_SHIFT, |
---|
945 | | - (pos + count) >> PAGE_SHIFT); |
---|
| 1236 | + (pos + count - 1) >> PAGE_SHIFT); |
---|
946 | 1237 | if (ret2 < 0) |
---|
947 | 1238 | dout("invalidate_inode_pages2_range returned %d\n", ret2); |
---|
948 | 1239 | |
---|
.. | .. |
---|
1010 | 1301 | * may block. |
---|
1011 | 1302 | */ |
---|
1012 | 1303 | truncate_inode_pages_range(inode->i_mapping, pos, |
---|
1013 | | - (pos+len) | (PAGE_SIZE - 1)); |
---|
| 1304 | + PAGE_ALIGN(pos + len) - 1); |
---|
1014 | 1305 | |
---|
1015 | 1306 | req->r_mtime = mtime; |
---|
1016 | 1307 | } |
---|
.. | .. |
---|
1025 | 1316 | req->r_callback = ceph_aio_complete_req; |
---|
1026 | 1317 | req->r_inode = inode; |
---|
1027 | 1318 | req->r_priv = aio_req; |
---|
1028 | | - list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); |
---|
| 1319 | + list_add_tail(&req->r_private_item, &aio_req->osd_reqs); |
---|
1029 | 1320 | |
---|
1030 | 1321 | pos += len; |
---|
1031 | 1322 | continue; |
---|
.. | .. |
---|
1034 | 1325 | ret = ceph_osdc_start_request(req->r_osdc, req, false); |
---|
1035 | 1326 | if (!ret) |
---|
1036 | 1327 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); |
---|
| 1328 | + |
---|
| 1329 | + if (write) |
---|
| 1330 | + ceph_update_write_latency(metric, req->r_start_latency, |
---|
| 1331 | + req->r_end_latency, ret); |
---|
| 1332 | + else |
---|
| 1333 | + ceph_update_read_latency(metric, req->r_start_latency, |
---|
| 1334 | + req->r_end_latency, ret); |
---|
1037 | 1335 | |
---|
1038 | 1336 | size = i_size_read(inode); |
---|
1039 | 1337 | if (!write) { |
---|
.. | .. |
---|
1044 | 1342 | int zlen = min_t(size_t, len - ret, |
---|
1045 | 1343 | size - pos - ret); |
---|
1046 | 1344 | |
---|
1047 | | - iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages, |
---|
1048 | | - len); |
---|
| 1345 | + iov_iter_bvec(&i, READ, bvecs, num_pages, len); |
---|
1049 | 1346 | iov_iter_advance(&i, ret); |
---|
1050 | 1347 | iov_iter_zero(zlen, &i); |
---|
1051 | 1348 | ret += zlen; |
---|
.. | .. |
---|
1083 | 1380 | CEPH_CAP_FILE_RD); |
---|
1084 | 1381 | |
---|
1085 | 1382 | list_splice(&aio_req->osd_reqs, &osd_reqs); |
---|
| 1383 | + inode_dio_begin(inode); |
---|
1086 | 1384 | while (!list_empty(&osd_reqs)) { |
---|
1087 | 1385 | req = list_first_entry(&osd_reqs, |
---|
1088 | 1386 | struct ceph_osd_request, |
---|
1089 | | - r_unsafe_item); |
---|
1090 | | - list_del_init(&req->r_unsafe_item); |
---|
| 1387 | + r_private_item); |
---|
| 1388 | + list_del_init(&req->r_private_item); |
---|
1091 | 1389 | if (ret >= 0) |
---|
1092 | 1390 | ret = ceph_osdc_start_request(req->r_osdc, |
---|
1093 | 1391 | req, false); |
---|
.. | .. |
---|
1139 | 1437 | dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", |
---|
1140 | 1438 | file, pos, (unsigned)count, snapc, snapc->seq); |
---|
1141 | 1439 | |
---|
1142 | | - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); |
---|
| 1440 | + ret = filemap_write_and_wait_range(inode->i_mapping, |
---|
| 1441 | + pos, pos + count - 1); |
---|
1143 | 1442 | if (ret < 0) |
---|
1144 | 1443 | return ret; |
---|
1145 | 1444 | |
---|
1146 | 1445 | ret = invalidate_inode_pages2_range(inode->i_mapping, |
---|
1147 | 1446 | pos >> PAGE_SHIFT, |
---|
1148 | | - (pos + count) >> PAGE_SHIFT); |
---|
| 1447 | + (pos + count - 1) >> PAGE_SHIFT); |
---|
1149 | 1448 | if (ret < 0) |
---|
1150 | 1449 | dout("invalidate_inode_pages2_range returned %d\n", ret); |
---|
1151 | 1450 | |
---|
.. | .. |
---|
1205 | 1504 | if (!ret) |
---|
1206 | 1505 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); |
---|
1207 | 1506 | |
---|
| 1507 | + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, |
---|
| 1508 | + req->r_end_latency, ret); |
---|
1208 | 1509 | out: |
---|
1209 | 1510 | ceph_osdc_put_request(req); |
---|
1210 | 1511 | if (ret != 0) { |
---|
.. | .. |
---|
1247 | 1548 | struct inode *inode = file_inode(filp); |
---|
1248 | 1549 | struct ceph_inode_info *ci = ceph_inode(inode); |
---|
1249 | 1550 | struct page *pinned_page = NULL; |
---|
| 1551 | + bool direct_lock = iocb->ki_flags & IOCB_DIRECT; |
---|
1250 | 1552 | ssize_t ret; |
---|
1251 | 1553 | int want, got = 0; |
---|
1252 | 1554 | int retry_op = 0, read = 0; |
---|
.. | .. |
---|
1255 | 1557 | dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", |
---|
1256 | 1558 | inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); |
---|
1257 | 1559 | |
---|
| 1560 | + if (direct_lock) |
---|
| 1561 | + ceph_start_io_direct(inode); |
---|
| 1562 | + else |
---|
| 1563 | + ceph_start_io_read(inode); |
---|
| 1564 | + |
---|
1258 | 1565 | if (fi->fmode & CEPH_FILE_MODE_LAZY) |
---|
1259 | 1566 | want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; |
---|
1260 | 1567 | else |
---|
1261 | 1568 | want = CEPH_CAP_FILE_CACHE; |
---|
1262 | | - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); |
---|
1263 | | - if (ret < 0) |
---|
| 1569 | + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, |
---|
| 1570 | + &got, &pinned_page); |
---|
| 1571 | + if (ret < 0) { |
---|
| 1572 | + if (iocb->ki_flags & IOCB_DIRECT) |
---|
| 1573 | + ceph_end_io_direct(inode); |
---|
| 1574 | + else |
---|
| 1575 | + ceph_end_io_read(inode); |
---|
1264 | 1576 | return ret; |
---|
| 1577 | + } |
---|
1265 | 1578 | |
---|
1266 | 1579 | if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || |
---|
1267 | 1580 | (iocb->ki_flags & IOCB_DIRECT) || |
---|
.. | .. |
---|
1292 | 1605 | ret = generic_file_read_iter(iocb, to); |
---|
1293 | 1606 | ceph_del_rw_context(fi, &rw_ctx); |
---|
1294 | 1607 | } |
---|
| 1608 | + |
---|
1295 | 1609 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", |
---|
1296 | 1610 | inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); |
---|
1297 | 1611 | if (pinned_page) { |
---|
.. | .. |
---|
1299 | 1613 | pinned_page = NULL; |
---|
1300 | 1614 | } |
---|
1301 | 1615 | ceph_put_cap_refs(ci, got); |
---|
| 1616 | + |
---|
| 1617 | + if (direct_lock) |
---|
| 1618 | + ceph_end_io_direct(inode); |
---|
| 1619 | + else |
---|
| 1620 | + ceph_end_io_read(inode); |
---|
| 1621 | + |
---|
1302 | 1622 | if (retry_op > HAVE_RETRIED && ret >= 0) { |
---|
1303 | 1623 | int statret; |
---|
1304 | 1624 | struct page *page = NULL; |
---|
.. | .. |
---|
1388 | 1708 | struct ceph_cap_flush *prealloc_cf; |
---|
1389 | 1709 | ssize_t count, written = 0; |
---|
1390 | 1710 | int err, want, got; |
---|
| 1711 | + bool direct_lock = false; |
---|
1391 | 1712 | u32 map_flags; |
---|
1392 | 1713 | u64 pool_flags; |
---|
1393 | 1714 | loff_t pos; |
---|
.. | .. |
---|
1400 | 1721 | if (!prealloc_cf) |
---|
1401 | 1722 | return -ENOMEM; |
---|
1402 | 1723 | |
---|
| 1724 | + if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT) |
---|
| 1725 | + direct_lock = true; |
---|
| 1726 | + |
---|
1403 | 1727 | retry_snap: |
---|
1404 | | - inode_lock(inode); |
---|
| 1728 | + if (direct_lock) |
---|
| 1729 | + ceph_start_io_direct(inode); |
---|
| 1730 | + else |
---|
| 1731 | + ceph_start_io_write(inode); |
---|
1405 | 1732 | |
---|
1406 | 1733 | /* We can write back this queue in page reclaim */ |
---|
1407 | 1734 | current->backing_dev_info = inode_to_bdi(inode); |
---|
.. | .. |
---|
1430 | 1757 | goto out; |
---|
1431 | 1758 | } |
---|
1432 | 1759 | |
---|
1433 | | - err = file_remove_privs(file); |
---|
1434 | | - if (err) |
---|
1435 | | - goto out; |
---|
1436 | | - |
---|
1437 | | - err = file_update_time(file); |
---|
1438 | | - if (err) |
---|
1439 | | - goto out; |
---|
1440 | | - |
---|
1441 | | - if (ci->i_inline_version != CEPH_INLINE_NONE) { |
---|
1442 | | - err = ceph_uninline_data(file, NULL); |
---|
1443 | | - if (err < 0) |
---|
1444 | | - goto out; |
---|
1445 | | - } |
---|
1446 | | - |
---|
1447 | 1760 | down_read(&osdc->lock); |
---|
1448 | 1761 | map_flags = osdc->osdmap->flags; |
---|
1449 | 1762 | pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); |
---|
.. | .. |
---|
1454 | 1767 | goto out; |
---|
1455 | 1768 | } |
---|
1456 | 1769 | |
---|
| 1770 | + err = file_remove_privs(file); |
---|
| 1771 | + if (err) |
---|
| 1772 | + goto out; |
---|
| 1773 | + |
---|
| 1774 | + if (ci->i_inline_version != CEPH_INLINE_NONE) { |
---|
| 1775 | + err = ceph_uninline_data(file, NULL); |
---|
| 1776 | + if (err < 0) |
---|
| 1777 | + goto out; |
---|
| 1778 | + } |
---|
| 1779 | + |
---|
1457 | 1780 | dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", |
---|
1458 | 1781 | inode, ceph_vinop(inode), pos, count, i_size_read(inode)); |
---|
1459 | 1782 | if (fi->fmode & CEPH_FILE_MODE_LAZY) |
---|
.. | .. |
---|
1461 | 1784 | else |
---|
1462 | 1785 | want = CEPH_CAP_FILE_BUFFER; |
---|
1463 | 1786 | got = 0; |
---|
1464 | | - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, |
---|
| 1787 | + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, |
---|
1465 | 1788 | &got, NULL); |
---|
1466 | 1789 | if (err < 0) |
---|
1467 | 1790 | goto out; |
---|
| 1791 | + |
---|
| 1792 | + err = file_update_time(file); |
---|
| 1793 | + if (err) |
---|
| 1794 | + goto out_caps; |
---|
| 1795 | + |
---|
| 1796 | + inode_inc_iversion_raw(inode); |
---|
1468 | 1797 | |
---|
1469 | 1798 | dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", |
---|
1470 | 1799 | inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); |
---|
.. | .. |
---|
1474 | 1803 | (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { |
---|
1475 | 1804 | struct ceph_snap_context *snapc; |
---|
1476 | 1805 | struct iov_iter data; |
---|
1477 | | - inode_unlock(inode); |
---|
1478 | 1806 | |
---|
1479 | 1807 | spin_lock(&ci->i_ceph_lock); |
---|
1480 | 1808 | if (__ceph_have_pending_cap_snap(ci)) { |
---|
.. | .. |
---|
1496 | 1824 | &prealloc_cf); |
---|
1497 | 1825 | else |
---|
1498 | 1826 | written = ceph_sync_write(iocb, &data, pos, snapc); |
---|
| 1827 | + if (direct_lock) |
---|
| 1828 | + ceph_end_io_direct(inode); |
---|
| 1829 | + else |
---|
| 1830 | + ceph_end_io_write(inode); |
---|
1499 | 1831 | if (written > 0) |
---|
1500 | 1832 | iov_iter_advance(from, written); |
---|
1501 | 1833 | ceph_put_snap_context(snapc); |
---|
.. | .. |
---|
1510 | 1842 | written = generic_perform_write(file, from, pos); |
---|
1511 | 1843 | if (likely(written >= 0)) |
---|
1512 | 1844 | iocb->ki_pos = pos + written; |
---|
1513 | | - inode_unlock(inode); |
---|
| 1845 | + ceph_end_io_write(inode); |
---|
1514 | 1846 | } |
---|
1515 | 1847 | |
---|
1516 | 1848 | if (written >= 0) { |
---|
.. | .. |
---|
1524 | 1856 | if (dirty) |
---|
1525 | 1857 | __mark_inode_dirty(inode, dirty); |
---|
1526 | 1858 | if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) |
---|
1527 | | - ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); |
---|
| 1859 | + ceph_check_caps(ci, 0, NULL); |
---|
1528 | 1860 | } |
---|
1529 | 1861 | |
---|
1530 | 1862 | dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", |
---|
.. | .. |
---|
1546 | 1878 | } |
---|
1547 | 1879 | |
---|
1548 | 1880 | goto out_unlocked; |
---|
1549 | | - |
---|
| 1881 | +out_caps: |
---|
| 1882 | + ceph_put_cap_refs(ci, got); |
---|
1550 | 1883 | out: |
---|
1551 | | - inode_unlock(inode); |
---|
| 1884 | + if (direct_lock) |
---|
| 1885 | + ceph_end_io_direct(inode); |
---|
| 1886 | + else |
---|
| 1887 | + ceph_end_io_write(inode); |
---|
1552 | 1888 | out_unlocked: |
---|
1553 | 1889 | ceph_free_cap_flush(prealloc_cf); |
---|
1554 | 1890 | current->backing_dev_info = NULL; |
---|
.. | .. |
---|
1786 | 2122 | else |
---|
1787 | 2123 | want = CEPH_CAP_FILE_BUFFER; |
---|
1788 | 2124 | |
---|
1789 | | - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); |
---|
| 2125 | + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); |
---|
1790 | 2126 | if (ret < 0) |
---|
1791 | 2127 | goto unlock; |
---|
1792 | 2128 | |
---|
.. | .. |
---|
1810 | 2146 | return ret; |
---|
1811 | 2147 | } |
---|
1812 | 2148 | |
---|
| 2149 | +/* |
---|
| 2150 | + * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for |
---|
| 2151 | + * src_ci. Two attempts are made to obtain both caps, and an error is returned if |
---|
| 2152 | + * this fails; zero is returned on success. |
---|
| 2153 | + */ |
---|
| 2154 | +static int get_rd_wr_caps(struct file *src_filp, int *src_got, |
---|
| 2155 | + struct file *dst_filp, |
---|
| 2156 | + loff_t dst_endoff, int *dst_got) |
---|
| 2157 | +{ |
---|
| 2158 | + int ret = 0; |
---|
| 2159 | + bool retrying = false; |
---|
| 2160 | + |
---|
| 2161 | +retry_caps: |
---|
| 2162 | + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, |
---|
| 2163 | + dst_endoff, dst_got, NULL); |
---|
| 2164 | + if (ret < 0) |
---|
| 2165 | + return ret; |
---|
| 2166 | + |
---|
| 2167 | + /* |
---|
| 2168 | + * Since we're already holding the FILE_WR capability for the dst file, |
---|
| 2169 | + * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some |
---|
| 2170 | + * retry dance instead to try to get both capabilities. |
---|
| 2171 | + */ |
---|
| 2172 | + ret = ceph_try_get_caps(file_inode(src_filp), |
---|
| 2173 | + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, |
---|
| 2174 | + false, src_got); |
---|
| 2175 | + if (ret <= 0) { |
---|
| 2176 | + /* Start by dropping dst_ci caps and getting src_ci caps */ |
---|
| 2177 | + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); |
---|
| 2178 | + if (retrying) { |
---|
| 2179 | + if (!ret) |
---|
| 2180 | + /* ceph_try_get_caps masks EAGAIN */ |
---|
| 2181 | + ret = -EAGAIN; |
---|
| 2182 | + return ret; |
---|
| 2183 | + } |
---|
| 2184 | + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, |
---|
| 2185 | + CEPH_CAP_FILE_SHARED, -1, src_got, NULL); |
---|
| 2186 | + if (ret < 0) |
---|
| 2187 | + return ret; |
---|
| 2188 | + /* ... drop src_ci caps too, and retry */ |
---|
| 2189 | + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); |
---|
| 2190 | + retrying = true; |
---|
| 2191 | + goto retry_caps; |
---|
| 2192 | + } |
---|
| 2193 | + return ret; |
---|
| 2194 | +} |
---|
| 2195 | + |
---|
| 2196 | +static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, |
---|
| 2197 | + struct ceph_inode_info *dst_ci, int dst_got) |
---|
| 2198 | +{ |
---|
| 2199 | + ceph_put_cap_refs(src_ci, src_got); |
---|
| 2200 | + ceph_put_cap_refs(dst_ci, dst_got); |
---|
| 2201 | +} |
---|
| 2202 | + |
---|
| 2203 | +/* |
---|
| 2204 | + * This function does several size-related checks, returning an error if: |
---|
| 2205 | + * - source file is smaller than off+len |
---|
| 2206 | + * - destination file size is not OK (inode_newsize_ok()) |
---|
| 2207 | + * - max bytes quota is exceeded |
---|
| 2208 | + */ |
---|
| 2209 | +static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, |
---|
| 2210 | + loff_t src_off, loff_t dst_off, size_t len) |
---|
| 2211 | +{ |
---|
| 2212 | + loff_t size, endoff; |
---|
| 2213 | + |
---|
| 2214 | + size = i_size_read(src_inode); |
---|
| 2215 | + /* |
---|
| 2216 | + * Don't copy beyond source file EOF. Instead of simply setting length |
---|
| 2217 | + * to (size - src_off), just drop to VFS default implementation, as the |
---|
| 2218 | + * local i_size may be stale due to other clients writing to the source |
---|
| 2219 | + * inode. |
---|
| 2220 | + */ |
---|
| 2221 | + if (src_off + len > size) { |
---|
| 2222 | + dout("Copy beyond EOF (%llu + %zu > %llu)\n", |
---|
| 2223 | + src_off, len, size); |
---|
| 2224 | + return -EOPNOTSUPP; |
---|
| 2225 | + } |
---|
| 2226 | + size = i_size_read(dst_inode); |
---|
| 2227 | + |
---|
| 2228 | + endoff = dst_off + len; |
---|
| 2229 | + if (inode_newsize_ok(dst_inode, endoff)) |
---|
| 2230 | + return -EOPNOTSUPP; |
---|
| 2231 | + |
---|
| 2232 | + if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) |
---|
| 2233 | + return -EDQUOT; |
---|
| 2234 | + |
---|
| 2235 | + return 0; |
---|
| 2236 | +} |
---|
| 2237 | + |
---|
| 2238 | +static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, |
---|
| 2239 | + struct ceph_inode_info *dst_ci, u64 *dst_off, |
---|
| 2240 | + struct ceph_fs_client *fsc, |
---|
| 2241 | + size_t len, unsigned int flags) |
---|
| 2242 | +{ |
---|
| 2243 | + struct ceph_object_locator src_oloc, dst_oloc; |
---|
| 2244 | + struct ceph_object_id src_oid, dst_oid; |
---|
| 2245 | + size_t bytes = 0; |
---|
| 2246 | + u64 src_objnum, src_objoff, dst_objnum, dst_objoff; |
---|
| 2247 | + u32 src_objlen, dst_objlen; |
---|
| 2248 | + u32 object_size = src_ci->i_layout.object_size; |
---|
| 2249 | + int ret; |
---|
| 2250 | + |
---|
| 2251 | + src_oloc.pool = src_ci->i_layout.pool_id; |
---|
| 2252 | + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); |
---|
| 2253 | + dst_oloc.pool = dst_ci->i_layout.pool_id; |
---|
| 2254 | + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); |
---|
| 2255 | + |
---|
| 2256 | + while (len >= object_size) { |
---|
| 2257 | + ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, |
---|
| 2258 | + object_size, &src_objnum, |
---|
| 2259 | + &src_objoff, &src_objlen); |
---|
| 2260 | + ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off, |
---|
| 2261 | + object_size, &dst_objnum, |
---|
| 2262 | + &dst_objoff, &dst_objlen); |
---|
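| | + /* Build the object names ("<ino hex>.<objnum hex>") for source and destination */
---|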
| 2263 | + ceph_oid_init(&src_oid); |
---|
| 2264 | + ceph_oid_printf(&src_oid, "%llx.%08llx", |
---|
| 2265 | + src_ci->i_vino.ino, src_objnum); |
---|
| 2266 | + ceph_oid_init(&dst_oid); |
---|
| 2267 | + ceph_oid_printf(&dst_oid, "%llx.%08llx", |
---|
| 2268 | + dst_ci->i_vino.ino, dst_objnum); |
---|
| 2269 | + /* Do an object remote copy */ |
---|
| 2270 | + ret = ceph_osdc_copy_from(&fsc->client->osdc, |
---|
| 2271 | + src_ci->i_vino.snap, 0, |
---|
| 2272 | + &src_oid, &src_oloc, |
---|
| 2273 | + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | |
---|
| 2274 | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, |
---|
| 2275 | + &dst_oid, &dst_oloc, |
---|
| 2276 | + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | |
---|
| 2277 | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, |
---|
| 2278 | + dst_ci->i_truncate_seq, |
---|
| 2279 | + dst_ci->i_truncate_size, |
---|
| 2280 | + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); |
---|
| 2281 | + if (ret) { |
---|
| 2282 | + if (ret == -EOPNOTSUPP) { |
---|
| 2283 | + fsc->have_copy_from2 = false; |
---|
| 2284 | + pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); |
---|
| 2285 | + } |
---|
| 2286 | + dout("ceph_osdc_copy_from returned %d\n", ret); |
---|
| 2287 | + if (!bytes) |
---|
| 2288 | + bytes = ret; |
---|
| 2289 | + goto out; |
---|
| 2290 | + } |
---|
| 2291 | + len -= object_size; |
---|
| 2292 | + bytes += object_size; |
---|
| 2293 | + *src_off += object_size; |
---|
| 2294 | + *dst_off += object_size; |
---|
| 2295 | + } |
---|
| 2296 | + |
---|
| 2297 | +out: |
---|
| 2298 | + ceph_oloc_destroy(&src_oloc); |
---|
| 2299 | + ceph_oloc_destroy(&dst_oloc); |
---|
| 2300 | + return bytes; |
---|
| 2301 | +} |
---|
| 2302 | + |
---|
| 2303 | +static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, |
---|
| 2304 | + struct file *dst_file, loff_t dst_off, |
---|
| 2305 | + size_t len, unsigned int flags) |
---|
| 2306 | +{ |
---|
| 2307 | + struct inode *src_inode = file_inode(src_file); |
---|
| 2308 | + struct inode *dst_inode = file_inode(dst_file); |
---|
| 2309 | + struct ceph_inode_info *src_ci = ceph_inode(src_inode); |
---|
| 2310 | + struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); |
---|
| 2311 | + struct ceph_cap_flush *prealloc_cf; |
---|
| 2312 | + struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); |
---|
| 2313 | + loff_t size; |
---|
| 2314 | + ssize_t ret = -EIO, bytes; |
---|
| 2315 | + u64 src_objnum, dst_objnum, src_objoff, dst_objoff; |
---|
| 2316 | + u32 src_objlen, dst_objlen; |
---|
| 2317 | + int src_got = 0, dst_got = 0, err, dirty; |
---|
| 2318 | + |
---|
| 2319 | + if (src_inode->i_sb != dst_inode->i_sb) { |
---|
| 2320 | + struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); |
---|
| 2321 | + |
---|
| 2322 | + if (ceph_fsid_compare(&src_fsc->client->fsid, |
---|
| 2323 | + &dst_fsc->client->fsid)) { |
---|
| 2324 | + dout("Copying files across clusters: src: %pU dst: %pU\n", |
---|
| 2325 | + &src_fsc->client->fsid, &dst_fsc->client->fsid); |
---|
| 2326 | + return -EXDEV; |
---|
| 2327 | + } |
---|
| 2328 | + } |
---|
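| | + /* The destination must not be a snapshot (snapshots are read-only) */
---|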
| 2329 | + if (ceph_snap(dst_inode) != CEPH_NOSNAP) |
---|
| 2330 | + return -EROFS; |
---|
| 2331 | + |
---|
| 2332 | + /* |
---|
| 2333 | + * Some of the checks below will return -EOPNOTSUPP, which will force a |
---|
| 2334 | + * fallback to the default VFS copy_file_range implementation. This is |
---|
| 2335 | + * desirable in several cases (e.g. when 'len' is smaller than the
---|
| 2336 | + * object size, or in cases where the fallback would simply be more
---|
| 2337 | + * efficient).
---|
| 2338 | + */ |
---|
| 2339 | + |
---|
| 2340 | + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) |
---|
| 2341 | + return -EOPNOTSUPP; |
---|
| 2342 | + |
---|
| 2343 | + if (!src_fsc->have_copy_from2) |
---|
| 2344 | + return -EOPNOTSUPP; |
---|
| 2345 | + |
---|
| 2346 | + /* |
---|
| 2347 | + * Striped file layouts require that we copy partial objects, but the |
---|
| 2348 | + * OSD copy-from operation only supports full-object copies. Limit |
---|
| 2349 | + * this to non-striped file layouts for now. |
---|
| 2350 | + */ |
---|
| 2351 | + if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || |
---|
| 2352 | + (src_ci->i_layout.stripe_count != 1) || |
---|
| 2353 | + (dst_ci->i_layout.stripe_count != 1) || |
---|
| 2354 | + (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { |
---|
| 2355 | + dout("Invalid src/dst files layout\n"); |
---|
| 2356 | + return -EOPNOTSUPP; |
---|
| 2357 | + } |
---|
| 2358 | + |
---|
| 2359 | + if (len < src_ci->i_layout.object_size) |
---|
| 2360 | + return -EOPNOTSUPP; /* no remote copy will be done */ |
---|
| 2361 | + |
---|
| 2362 | + prealloc_cf = ceph_alloc_cap_flush(); |
---|
| 2363 | + if (!prealloc_cf) |
---|
| 2364 | + return -ENOMEM; |
---|
| 2365 | + |
---|
| 2366 | + /* Start by sync'ing the source and destination files */ |
---|
| 2367 | + ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); |
---|
| 2368 | + if (ret < 0) { |
---|
| 2369 | + dout("failed to write src file (%zd)\n", ret); |
---|
| 2370 | + goto out; |
---|
| 2371 | + } |
---|
| 2372 | + ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len)); |
---|
| 2373 | + if (ret < 0) { |
---|
| 2374 | + dout("failed to write dst file (%zd)\n", ret); |
---|
| 2375 | + goto out; |
---|
| 2376 | + } |
---|
| 2377 | + |
---|
| 2378 | + /* |
---|
| 2379 | + * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other |
---|
| 2380 | + * clients may have dirty data in their caches. And OSDs know nothing |
---|
| 2381 | + * about caps, so they can't safely do the remote object copies. |
---|
| 2382 | + */ |
---|
| 2383 | + err = get_rd_wr_caps(src_file, &src_got, |
---|
| 2384 | + dst_file, (dst_off + len), &dst_got); |
---|
| 2385 | + if (err < 0) { |
---|
| 2386 | + dout("get_rd_wr_caps returned %d\n", err); |
---|
| 2387 | + ret = -EOPNOTSUPP; |
---|
| 2388 | + goto out; |
---|
| 2389 | + } |
---|
| 2390 | + |
---|
| 2391 | + ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); |
---|
| 2392 | + if (ret < 0) |
---|
| 2393 | + goto out_caps; |
---|
| 2394 | + |
---|
| 2395 | + /* Drop dst file cached pages */ |
---|
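| | + /* (the OSD-side object copies below bypass the page cache) */
---|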
| 2396 | + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, |
---|
| 2397 | + dst_off >> PAGE_SHIFT, |
---|
| 2398 | + (dst_off + len) >> PAGE_SHIFT); |
---|
| 2399 | + if (ret < 0) { |
---|
| 2400 | + dout("Failed to invalidate inode pages (%zd)\n", ret); |
---|
| 2401 | + ret = 0; /* XXX */ |
---|
| 2402 | + } |
---|
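| | + /* Map file offsets to object number/offset so we can check object alignment */
---|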
| 2403 | + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, |
---|
| 2404 | + src_ci->i_layout.object_size, |
---|
| 2405 | + &src_objnum, &src_objoff, &src_objlen); |
---|
| 2406 | + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, |
---|
| 2407 | + dst_ci->i_layout.object_size, |
---|
| 2408 | + &dst_objnum, &dst_objoff, &dst_objlen); |
---|
| 2409 | + /* object-level offsets need to be the same */
---|
| 2410 | + if (src_objoff != dst_objoff) { |
---|
| 2411 | + ret = -EOPNOTSUPP; |
---|
| 2412 | + goto out_caps; |
---|
| 2413 | + } |
---|
| 2414 | + |
---|
| 2415 | + /* |
---|
| 2416 | + * Do a manual copy if the object offset isn't object aligned. |
---|
| 2417 | + * 'src_objlen' contains the bytes left until the end of the object, |
---|
| 2418 | + * starting at src_off.
---|
| 2419 | + */ |
---|
| 2420 | + if (src_objoff) { |
---|
| 2421 | + dout("Initial partial copy of %u bytes\n", src_objlen); |
---|
| 2422 | + |
---|
| 2423 | + /* |
---|
| 2424 | + * we need to temporarily drop all caps as we'll be calling |
---|
| 2425 | + * {read,write}_iter, which will get caps again. |
---|
| 2426 | + */ |
---|
| 2427 | + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); |
---|
| 2428 | + ret = do_splice_direct(src_file, &src_off, dst_file, |
---|
| 2429 | + &dst_off, src_objlen, flags); |
---|
| 2430 | + /* Abort on short copies or on error */ |
---|
| 2431 | + if (ret < src_objlen) { |
---|
| 2432 | + dout("Failed partial copy (%zd)\n", ret); |
---|
| 2433 | + goto out; |
---|
| 2434 | + } |
---|
| 2435 | + len -= ret; |
---|
| 2436 | + err = get_rd_wr_caps(src_file, &src_got, |
---|
| 2437 | + dst_file, (dst_off + len), &dst_got); |
---|
| 2438 | + if (err < 0) |
---|
| 2439 | + goto out; |
---|
| 2440 | + err = is_file_size_ok(src_inode, dst_inode, |
---|
| 2441 | + src_off, dst_off, len); |
---|
| 2442 | + if (err < 0) |
---|
| 2443 | + goto out_caps; |
---|
| 2444 | + } |
---|
| 2445 | + |
---|
| 2446 | + size = i_size_read(dst_inode); |
---|
| 2447 | + bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off, |
---|
| 2448 | + src_fsc, len, flags); |
---|
| 2449 | + if (bytes <= 0) { |
---|
| 2450 | + if (!ret) |
---|
| 2451 | + ret = bytes; |
---|
| 2452 | + goto out_caps; |
---|
| 2453 | + } |
---|
| 2454 | + dout("Copied %zu bytes out of %zu\n", bytes, len); |
---|
| 2455 | + len -= bytes; |
---|
| 2456 | + ret += bytes; |
---|
| 2457 | + |
---|
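| | + /* Update mtime/ctime and bump the change attribute on the destination */
---|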
| 2458 | + file_update_time(dst_file); |
---|
| 2459 | + inode_inc_iversion_raw(dst_inode); |
---|
| 2460 | + |
---|
| 2461 | + if (dst_off > size) { |
---|
| 2462 | + /* Let the MDS know about dst file size change */ |
---|
| 2463 | + if (ceph_inode_set_size(dst_inode, dst_off) || |
---|
| 2464 | + ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) |
---|
| 2465 | + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); |
---|
| 2466 | + } |
---|
| 2467 | + /* Mark Fw dirty */ |
---|
| 2468 | + spin_lock(&dst_ci->i_ceph_lock); |
---|
| 2469 | + dst_ci->i_inline_version = CEPH_INLINE_NONE; |
---|
| 2470 | + dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); |
---|
| 2471 | + spin_unlock(&dst_ci->i_ceph_lock); |
---|
| 2472 | + if (dirty) |
---|
| 2473 | + __mark_inode_dirty(dst_inode, dirty); |
---|
| 2474 | + |
---|
| 2475 | +out_caps: |
---|
| 2476 | + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); |
---|
| 2477 | + |
---|
| 2478 | + /* |
---|
| 2479 | + * Do the final manual copy if we still have some bytes left, unless |
---|
| 2480 | + * there were errors in remote object copies (len >= object_size). |
---|
| 2481 | + */ |
---|
| 2482 | + if (len && (len < src_ci->i_layout.object_size)) { |
---|
| 2483 | + dout("Final partial copy of %zu bytes\n", len); |
---|
| 2484 | + bytes = do_splice_direct(src_file, &src_off, dst_file, |
---|
| 2485 | + &dst_off, len, flags); |
---|
| 2486 | + if (bytes > 0) |
---|
| 2487 | + ret += bytes; |
---|
| 2488 | + else |
---|
| 2489 | + dout("Failed partial copy (%zd)\n", bytes); |
---|
| 2490 | + } |
---|
| 2491 | + |
---|
| 2492 | +out: |
---|
| 2493 | + ceph_free_cap_flush(prealloc_cf); |
---|
| 2494 | + |
---|
| 2495 | + return ret; |
---|
| 2496 | +} |
---|
| 2497 | + |
---|
| 2498 | +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, |
---|
| 2499 | + struct file *dst_file, loff_t dst_off, |
---|
| 2500 | + size_t len, unsigned int flags) |
---|
| 2501 | +{ |
---|
| 2502 | + ssize_t ret; |
---|
| 2503 | + |
---|
| 2504 | + ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off, |
---|
| 2505 | + len, flags); |
---|
| 2506 | + |
---|
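| | + /* Fall back to the generic VFS copy when remote object copies can't be used */
---|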
| 2507 | + if (ret == -EOPNOTSUPP || ret == -EXDEV) |
---|
| 2508 | + ret = generic_copy_file_range(src_file, src_off, dst_file, |
---|
| 2509 | + dst_off, len, flags); |
---|
| 2510 | + return ret; |
---|
| 2511 | +} |
---|
| 2512 | + |
---|
1813 | 2513 | const struct file_operations ceph_file_fops = { |
---|
1814 | 2514 | .open = ceph_open, |
---|
1815 | 2515 | .release = ceph_release, |
---|
.. | .. |
---|
1824 | 2524 | .splice_read = generic_file_splice_read, |
---|
1825 | 2525 | .splice_write = iter_file_splice_write, |
---|
1826 | 2526 | .unlocked_ioctl = ceph_ioctl, |
---|
1827 | | - .compat_ioctl = ceph_ioctl, |
---|
| 2527 | + .compat_ioctl = compat_ptr_ioctl, |
---|
1828 | 2528 | .fallocate = ceph_fallocate, |
---|
| 2529 | + .copy_file_range = ceph_copy_file_range, |
---|
1829 | 2530 | }; |
---|
1830 | | - |
---|