@@ -34 +34 @@
 #include <linux/ceph/cls_lock_client.h>
 #include <linux/ceph/striper.h>
 #include <linux/ceph/decode.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
 #include <linux/bsearch.h>

 #include <linux/kernel.h>
@@ -115 +115 @@
 #define RBD_FEATURE_LAYERING		(1ULL<<0)
 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
+#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)

 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
+				 RBD_FEATURE_OBJECT_MAP |	\
+				 RBD_FEATURE_FAST_DIFF |	\
+				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

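[Illustrative aside, not part of the patch: feature bits like these are
typically validated at map time by masking the image's feature set against
what the driver advertises (RBD_FEATURES_SUPPORTED, shown later in this
diff).  A sketch of that check -- the warning text and errno here are
assumptions, not taken from this commit:]

	u64 unsup = rbd_dev->header.features & ~RBD_FEATURES_SUPPORTED;

	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);	/* assumed handling */
		return -ENXIO;
	}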
@@ -201 +207 @@
 	struct list_head	node;
 };

+struct pending_result {
+	int			result;		/* first nonzero result */
+	int			num_pending;
+};
+
 struct rbd_img_request;

 enum obj_request_type {
@@ -214 +225 @@
 	OBJ_OP_READ = 1,
 	OBJ_OP_WRITE,
 	OBJ_OP_DISCARD,
+	OBJ_OP_ZEROOUT,
+};
+
+#define RBD_OBJ_FLAG_DELETION			(1U << 0)
+#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
+#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
+#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
+#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
+
+enum rbd_obj_read_state {
+	RBD_OBJ_READ_START = 1,
+	RBD_OBJ_READ_OBJECT,
+	RBD_OBJ_READ_PARENT,
 };

 /*
  * Writes go through the following state machine to deal with
  * layering:
  *
- *                       need copyup
- * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
- *        |     ^                              |
- *        v     \------------------------------/
- *      done
- *        ^
- *        |
- * RBD_OBJ_WRITE_FLAT
+ *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
+ *            .                 |                                    .
+ *            .                 v                                    .
+ *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
+ *            .                 |                    .               .
+ *            .                 v                    v (deep-copyup  .
+ *  (image    .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
+ * flattened) v                 |                    .               .
+ *            .                 v                    .               .
+ *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
+ *                              |                        not needed) v
+ *                              v                                    .
+ *                            done . . . . . . . . . . . . . . . . . .
+ *                              ^
+ *                              |
+ *                    RBD_OBJ_WRITE_FLAT
  *
  * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
- * there is a parent or not.
+ * assert_exists guard is needed or not (in some cases it's not needed
+ * even if there is a parent).
  */
 enum rbd_obj_write_state {
-	RBD_OBJ_WRITE_FLAT = 1,
-	RBD_OBJ_WRITE_GUARD,
+	RBD_OBJ_WRITE_START = 1,
+	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
+	RBD_OBJ_WRITE_OBJECT,
+	__RBD_OBJ_WRITE_COPYUP,
 	RBD_OBJ_WRITE_COPYUP,
+	RBD_OBJ_WRITE_POST_OBJECT_MAP,
+};
+
+enum rbd_obj_copyup_state {
+	RBD_OBJ_COPYUP_START = 1,
+	RBD_OBJ_COPYUP_READ_PARENT,
+	__RBD_OBJ_COPYUP_OBJECT_MAPS,
+	RBD_OBJ_COPYUP_OBJECT_MAPS,
+	__RBD_OBJ_COPYUP_WRITE_OBJECT,
+	RBD_OBJ_COPYUP_WRITE_OBJECT,
 };
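[Illustrative aside, not part of the patch: the double-underscore entries
(__RBD_OBJ_WRITE_COPYUP, __RBD_OBJ_COPYUP_OBJECT_MAPS, ...) read as
"requests issued, completions outstanding", with the plain-named state
entered once the last completion arrives.  A hypothetical advance step in
that style, built on the pending_result_dec() helper this diff adds
further down:]

	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
		if (!pending_result_dec(&obj_req->pending, &result))
			return false;	/* more completions outstanding */
		obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
		/* fall through and act on the aggregated result */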
@@ -240 +285 @@

 struct rbd_obj_request {
 	struct ceph_object_extent ex;
+	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
 	union {
-		bool			tried_parent;	/* for reads */
+		enum rbd_obj_read_state	 read_state;	/* for reads */
 		enum rbd_obj_write_state write_state;	/* for writes */
 	};

@@ -257 +303 @@
 			u32		bvec_idx;
 		};
 	};
+
+	enum rbd_obj_copyup_state copyup_state;
 	struct bio_vec		*copyup_bvecs;
 	u32			copyup_bvec_count;

-	struct ceph_osd_request	*osd_req;
+	struct list_head	osd_reqs;	/* w/ r_private_item */

-	u64			xferred;	/* bytes transferred */
-	int			result;
-
+	struct mutex		state_mutex;
+	struct pending_result	pending;
 	struct kref		kref;
 };

@@ -273 +320 @@
 	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
 };

+enum rbd_img_state {
+	RBD_IMG_START = 1,
+	RBD_IMG_EXCLUSIVE_LOCK,
+	__RBD_IMG_OBJECT_REQUESTS,
+	RBD_IMG_OBJECT_REQUESTS,
+};
+
 struct rbd_img_request {
 	struct rbd_device	*rbd_dev;
 	enum obj_operation_type	op_type;
 	enum obj_request_type	data_type;
 	unsigned long		flags;
+	enum rbd_img_state	state;
 	union {
 		u64			snap_id;	/* for reads */
 		struct ceph_snap_context *snapc;	/* for writes */
 	};
-	union {
-		struct request		*rq;		/* block request */
-		struct rbd_obj_request	*obj_request;	/* obj req initiator */
-	};
-	spinlock_t		completion_lock;
-	u64			xferred;/* aggregate bytes transferred */
-	int			result;	/* first nonzero obj_request result */
+	struct rbd_obj_request	*obj_request;	/* obj req initiator */

+	struct list_head	lock_item;
 	struct list_head	object_extents;	/* obj_req.ex structs */
-	u32			obj_request_count;
-	u32			pending_count;

-	struct kref		kref;
+	struct mutex		state_mutex;
+	struct pending_result	pending;
+	struct work_struct	work;
+	int			work_result;
 };

 #define for_each_obj_request(ireq, oreq) \
@@ -322 +373 @@

 struct rbd_mapping {
 	u64	size;
-	u64	features;
 };

 /*
@@ -367 +417 @@
 	struct work_struct	released_lock_work;
 	struct delayed_work	lock_dwork;
 	struct work_struct	unlock_work;
-	wait_queue_head_t	lock_waitq;
+	spinlock_t		lock_lists_lock;
+	struct list_head	acquiring_list;
+	struct list_head	running_list;
+	struct completion	acquire_wait;
+	int			acquire_err;
+	struct completion	releasing_wait;
+
+	spinlock_t		object_map_lock;
+	u8			*object_map;
+	u64			object_map_size;	/* in objects */
+	u64			object_map_flags;

 	struct workqueue_struct	*task_wq;

@@ -395 +455 @@
  * Flag bits for rbd_dev->flags:
  * - REMOVING (which is coupled with rbd_dev->open_count) is protected
  *   by rbd_dev->lock
- * - BLACKLISTED is protected by rbd_dev->lock_rwsem
  */
 enum rbd_dev_flags {
-	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
+	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
 	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
-	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
+	RBD_DEV_FLAG_READONLY,	/* -o ro or snapshot */
 };

 static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
@@ -421 +480 @@

 static struct workqueue_struct *rbd_wq;

+static struct ceph_snap_context rbd_empty_snapc = {
+	.nref = REFCOUNT_INIT(1),
+};
+
 /*
  * single-major requires >= 0.75 version of userspace rbd utility.
  */
@@ -428 +491 @@
 module_param(single_major, bool, 0444);
 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

-static ssize_t rbd_add(struct bus_type *bus, const char *buf,
-		       size_t count);
-static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
-			  size_t count);
-static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
-				    size_t count);
-static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
-				       size_t count);
+static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
+static ssize_t remove_store(struct bus_type *bus, const char *buf,
+			    size_t count);
+static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+				      size_t count);
+static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+					 size_t count);
 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

 static int rbd_dev_id_to_minor(int dev_id)
@@ -448 +510 @@
 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
 }

+static bool rbd_is_ro(struct rbd_device *rbd_dev)
+{
+	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
+}
+
+static bool rbd_is_snap(struct rbd_device *rbd_dev)
+{
+	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
+}
+
 static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
 {
+	lockdep_assert_held(&rbd_dev->lock_rwsem);
+
 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
 }
@@ -464 +538 @@
 	return is_lock_owner;
 }

-static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
+static ssize_t supported_features_show(struct bus_type *bus, char *buf)
 {
 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
 }

-static BUS_ATTR(add, 0200, NULL, rbd_add);
-static BUS_ATTR(remove, 0200, NULL, rbd_remove);
-static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
-static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
-static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
+static BUS_ATTR_WO(add);
+static BUS_ATTR_WO(remove);
+static BUS_ATTR_WO(add_single_major);
+static BUS_ATTR_WO(remove_single_major);
+static BUS_ATTR_RO(supported_features);
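[Illustrative aside, not part of the patch: the renames above exist because
BUS_ATTR_WO()/BUS_ATTR_RO() derive both the sysfs attribute variable and
its callback name from a single token.  Roughly, per the definitions in
<linux/device.h>:]

	/* BUS_ATTR_WO(add) expands to approximately: */
	static struct bus_attribute bus_attr_add = __ATTR_WO(add);
	/*
	 * ...which wires .store = add_store, so the handler formerly
	 * known as rbd_add() must now be named add_store().
	 */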

 static struct attribute *rbd_bus_attrs[] = {
 	&bus_attr_add.attr,
@@ -565 +639 @@
 				u64 snap_id);
 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 				 u8 *order, u64 *snap_size);
-static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
-				     u64 *snap_features);
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
+
+static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
+static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
+
+/*
+ * Return true if nothing else is pending.
+ */
+static bool pending_result_dec(struct pending_result *pending, int *result)
+{
+	rbd_assert(pending->num_pending > 0);
+
+	if (*result && !pending->result)
+		pending->result = *result;
+	if (--pending->num_pending)
+		return false;
+
+	*result = pending->result;
+	return true;
+}
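[Illustrative aside, not part of the patch: a hypothetical caller of the
fan-out/fan-in pattern this helper enables.  num_pending is primed to the
number of sub-requests issued; each completion decrements it, and only the
last one sees true and receives the first nonzero result to act on:]

	/* before issuing 'cnt' sub-requests: */
	obj_req->pending.result = 0;
	obj_req->pending.num_pending = cnt;

	/* in each completion path, under obj_req->state_mutex: */
	if (!pending_result_dec(&obj_req->pending, &result))
		return;		/* not the last completion */
	/* last one: 'result' now holds the first nonzero sub-result */
	rbd_obj_handle_request(obj_req, result);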

 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
@@ -607 +699 @@
 	if (get_user(ro, (int __user *)arg))
 		return -EFAULT;

-	/* Snapshots can't be marked read-write */
-	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
-		return -EROFS;
+	/*
+	 * Both images mapped read-only and snapshots can't be marked
+	 * read-write.
+	 */
+	if (!ro) {
+		if (rbd_is_ro(rbd_dev))
+			return -EROFS;
+
+		rbd_assert(!rbd_is_snap(rbd_dev));
+	}

 	/* Let blkdev_roset() handle it */
 	return -ENOTTY;
@@ -733 +832 @@
  */
 enum {
 	Opt_queue_depth,
+	Opt_alloc_size,
 	Opt_lock_timeout,
-	Opt_last_int,
 	/* int args above */
 	Opt_pool_ns,
-	Opt_last_string,
+	Opt_compression_hint,
 	/* string args above */
 	Opt_read_only,
 	Opt_read_write,
 	Opt_lock_on_read,
 	Opt_exclusive,
 	Opt_notrim,
-	Opt_err
 };

-static match_table_t rbd_opts_tokens = {
-	{Opt_queue_depth, "queue_depth=%d"},
-	{Opt_lock_timeout, "lock_timeout=%d"},
-	/* int args above */
-	{Opt_pool_ns, "_pool_ns=%s"},
-	/* string args above */
-	{Opt_read_only, "read_only"},
-	{Opt_read_only, "ro"},		/* Alternate spelling */
-	{Opt_read_write, "read_write"},
-	{Opt_read_write, "rw"},		/* Alternate spelling */
-	{Opt_lock_on_read, "lock_on_read"},
-	{Opt_exclusive, "exclusive"},
-	{Opt_notrim, "notrim"},
-	{Opt_err, NULL}
+enum {
+	Opt_compression_hint_none,
+	Opt_compression_hint_compressible,
+	Opt_compression_hint_incompressible,
+};
+
+static const struct constant_table rbd_param_compression_hint[] = {
+	{"none",		Opt_compression_hint_none},
+	{"compressible",	Opt_compression_hint_compressible},
+	{"incompressible",	Opt_compression_hint_incompressible},
+	{}
+};
+
+static const struct fs_parameter_spec rbd_parameters[] = {
+	fsparam_u32	("alloc_size",		Opt_alloc_size),
+	fsparam_enum	("compression_hint",	Opt_compression_hint,
+			 rbd_param_compression_hint),
+	fsparam_flag	("exclusive",		Opt_exclusive),
+	fsparam_flag	("lock_on_read",	Opt_lock_on_read),
+	fsparam_u32	("lock_timeout",	Opt_lock_timeout),
+	fsparam_flag	("notrim",		Opt_notrim),
+	fsparam_string	("_pool_ns",		Opt_pool_ns),
+	fsparam_u32	("queue_depth",		Opt_queue_depth),
+	fsparam_flag	("read_only",		Opt_read_only),
+	fsparam_flag	("read_write",		Opt_read_write),
+	fsparam_flag	("ro",			Opt_read_only),
+	fsparam_flag	("rw",			Opt_read_write),
+	{}
 };
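[Illustrative aside, not part of the patch: a sketch of how a table like
rbd_parameters is consumed with the fs_parser API.  The function name and
log prefix are assumptions; flags carry no argument, u32 values arrive in
result.uint_32, and enum parameters resolve through the constant_table:]

	static int rbd_parse_param(struct fs_parameter *param,
				   struct rbd_parse_opts_ctx *pctx)
	{
		struct p_log log = {.prefix = "rbd"};
		struct fs_parse_result result;
		int token;

		token = __fs_parse(&log, rbd_parameters, param, &result);
		if (token < 0)
			return token;

		switch (token) {
		case Opt_queue_depth:
			if (result.uint_32 < 1)
				return inval_plog(&log,
						  "queue_depth out of range");
			pctx->opts->queue_depth = result.uint_32;
			break;
		case Opt_read_only:
			pctx->opts->read_only = true;
			break;
		/* ... remaining tokens ... */
		}
		return 0;
	}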

 struct rbd_options {
 	int	queue_depth;
+	int	alloc_size;
 	unsigned long	lock_timeout;
 	bool	read_only;
 	bool	lock_on_read;
 	bool	exclusive;
 	bool	trim;
+
+	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
 };

 #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
+#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
 #define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
 #define RBD_READ_ONLY_DEFAULT	false
 #define RBD_LOCK_ON_READ_DEFAULT false
 #define RBD_EXCLUSIVE_DEFAULT	false
 #define RBD_TRIM_DEFAULT	true

-struct parse_rbd_opts_ctx {
+struct rbd_parse_opts_ctx {
 	struct rbd_spec		*spec;
+	struct ceph_options	*copts;
 	struct rbd_options	*opts;
 };
-
-static int parse_rbd_opts_token(char *c, void *private)
-{
-	struct parse_rbd_opts_ctx *pctx = private;
-	substring_t argstr[MAX_OPT_ARGS];
-	int token, intval, ret;
-
-	token = match_token(c, rbd_opts_tokens, argstr);
-	if (token < Opt_last_int) {
-		ret = match_int(&argstr[0], &intval);
-		if (ret < 0) {
-			pr_err("bad option arg (not int) at '%s'\n", c);
-			return ret;
-		}
-		dout("got int token %d val %d\n", token, intval);
-	} else if (token > Opt_last_int && token < Opt_last_string) {
-		dout("got string token %d val %s\n", token, argstr[0].from);
-	} else {
-		dout("got token %d\n", token);
-	}
-
-	switch (token) {
-	case Opt_queue_depth:
-		if (intval < 1) {
-			pr_err("queue_depth out of range\n");
-			return -EINVAL;
-		}
-		pctx->opts->queue_depth = intval;
-		break;
-	case Opt_lock_timeout:
-		/* 0 is "wait forever" (i.e. infinite timeout) */
-		if (intval < 0 || intval > INT_MAX / 1000) {
-			pr_err("lock_timeout out of range\n");
-			return -EINVAL;
-		}
-		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
-		break;
-	case Opt_pool_ns:
-		kfree(pctx->spec->pool_ns);
-		pctx->spec->pool_ns = match_strdup(argstr);
-		if (!pctx->spec->pool_ns)
-			return -ENOMEM;
-		break;
-	case Opt_read_only:
-		pctx->opts->read_only = true;
-		break;
-	case Opt_read_write:
-		pctx->opts->read_only = false;
-		break;
-	case Opt_lock_on_read:
-		pctx->opts->lock_on_read = true;
-		break;
-	case Opt_exclusive:
-		pctx->opts->exclusive = true;
-		break;
-	case Opt_notrim:
-		pctx->opts->trim = false;
-		break;
-	default:
-		/* libceph prints "bad option" msg */
-		return -EINVAL;
-	}
-
-	return 0;
-}

 static char* obj_op_name(enum obj_operation_type op_type)
 {
@@ -858 +910 @@
 		return "write";
 	case OBJ_OP_DISCARD:
 		return "discard";
+	case OBJ_OP_ZEROOUT:
+		return "zeroout";
 	default:
 		return "???";
 	}
@@ -891 +945 @@
 	kref_put(&rbdc->kref, rbd_client_release);
 }

-static int wait_for_latest_osdmap(struct ceph_client *client)
-{
-	u64 newest_epoch;
-	int ret;
-
-	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
-	if (ret)
-		return ret;
-
-	if (client->osdc.osdmap->epoch >= newest_epoch)
-		return 0;
-
-	ceph_osdc_maybe_request_map(&client->osdc);
-	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
-				     client->options->mount_timeout);
-}
-
 /*
  * Get a ceph client with specific addr and configuration, if one does
  * not exist create it.  Either way, ceph_opts is consumed by this
@@ -918 +955 @@
 	struct rbd_client *rbdc;
 	int ret;

-	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
+	mutex_lock(&client_mutex);
 	rbdc = rbd_client_find(ceph_opts);
 	if (rbdc) {
 		ceph_destroy_options(ceph_opts);
@@ -927 +964 @@
 		 * Using an existing client.  Make sure ->pg_pools is up to
 		 * date before we look up the pool id in do_rbd_add().
 		 */
-		ret = wait_for_latest_osdmap(rbdc->client);
+		ret = ceph_wait_for_latest_osdmap(rbdc->client,
+					rbdc->client->options->mount_timeout);
 		if (ret) {
 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
 			rbd_put_client(rbdc);
@@ -1213 +1251 @@
 	return 0;
 }

-static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
-			     u64 *snap_features)
-{
-	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
-	if (snap_id == CEPH_NOSNAP) {
-		*snap_features = rbd_dev->header.features;
-	} else if (rbd_dev->image_format == 1) {
-		*snap_features = 0;	/* No features for format 1 */
-	} else {
-		u64 features = 0;
-		int ret;
-
-		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
-		if (ret)
-			return ret;
-
-		*snap_features = features;
-	}
-	return 0;
-}
-
 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
 {
 	u64 snap_id = rbd_dev->spec->snap_id;
 	u64 size = 0;
-	u64 features = 0;
 	int ret;

 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
 	if (ret)
 		return ret;
-	ret = rbd_snap_features(rbd_dev, snap_id, &features);
-	if (ret)
-		return ret;

 	rbd_dev->mapping.size = size;
-	rbd_dev->mapping.features = features;
-
 	return 0;
 }

 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
 {
 	rbd_dev->mapping.size = 0;
-	rbd_dev->mapping.features = 0;
 }

 static void zero_bvec(struct bio_vec *bv)
@@ -1300 +1310 @@
 static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
 			       u32 bytes)
 {
+	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
+
 	switch (obj_req->img_request->data_type) {
 	case OBJ_REQUEST_BIO:
 		zero_bios(&obj_req->bio_pos, off, bytes);
@@ -1309 +1321 @@
 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
 		break;
 	default:
-		rbd_assert(0);
+		BUG();
 	}
 }

@@ -1322 +1334 @@
 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
 }

-static void rbd_img_request_get(struct rbd_img_request *img_request)
-{
-	dout("%s: img %p (was %d)\n", __func__, img_request,
-	     kref_read(&img_request->kref));
-	kref_get(&img_request->kref);
-}
-
-static void rbd_img_request_destroy(struct kref *kref);
-static void rbd_img_request_put(struct rbd_img_request *img_request)
-{
-	rbd_assert(img_request != NULL);
-	dout("%s: img %p (was %d)\n", __func__, img_request,
-	     kref_read(&img_request->kref));
-	kref_put(&img_request->kref, rbd_img_request_destroy);
-}
-
 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
 					   struct rbd_obj_request *obj_request)
 {
@@ -1345 +1341 @@

 	/* Image request now owns object's original reference */
 	obj_request->img_request = img_request;
-	img_request->obj_request_count++;
-	img_request->pending_count++;
 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 }

@@ -1355 +1349 @@
 {
 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 	list_del(&obj_request->ex.oe_item);
-	rbd_assert(img_request->obj_request_count > 0);
-	img_request->obj_request_count--;
 	rbd_assert(obj_request->img_request == img_request);
 	rbd_obj_request_put(obj_request);
 }

-static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
+static void rbd_osd_submit(struct ceph_osd_request *osd_req)
 {
-	struct ceph_osd_request *osd_req = obj_request->osd_req;
+	struct rbd_obj_request *obj_req = osd_req->r_priv;

-	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
-	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
-	     obj_request->ex.oe_len, osd_req);
+	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
+	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
+	     obj_req->ex.oe_off, obj_req->ex.oe_len);
 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
 }

@@ -1379 +1371 @@
 static void img_request_layered_set(struct rbd_img_request *img_request)
 {
 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
-	smp_mb();
-}
-
-static void img_request_layered_clear(struct rbd_img_request *img_request)
-{
-	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
-	smp_mb();
 }

 static bool img_request_layered_test(struct rbd_img_request *img_request)
 {
-	smp_mb();
 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
 }

@@ -1410 +1394 @@
 	       rbd_dev->layout.object_size;
 }

+/*
+ * Must be called after rbd_obj_calc_img_extents().
+ */
+static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
+{
+	if (!obj_req->num_img_extents ||
+	    (rbd_obj_is_entire(obj_req) &&
+	     !obj_req->img_request->snapc->num_snaps))
+		return false;
+
+	return true;
+}
+
 static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
 {
 	return ceph_file_extents_bytes(obj_req->img_extents,
@@ -1423 +1420 @@
 		return false;
 	case OBJ_OP_WRITE:
 	case OBJ_OP_DISCARD:
+	case OBJ_OP_ZEROOUT:
 		return true;
 	default:
 		BUG();
 	}
 }

-static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
-
 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int result;

 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
 	     osd_req->r_result, obj_req);
-	rbd_assert(osd_req == obj_req->osd_req);

-	obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
-	if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
-		obj_req->xferred = osd_req->r_result;
+	/*
+	 * Writes aren't allowed to return a data payload.  In some
+	 * guarded write cases (e.g. stat + zero on an empty object)
+	 * a stat response makes it through, but we don't care.
+	 */
+	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
+		result = 0;
 	else
-		/*
-		 * Writes aren't allowed to return a data payload.  In some
-		 * guarded write cases (e.g. stat + zero on an empty object)
-		 * a stat response makes it through, but we don't care.
-		 */
-		obj_req->xferred = 0;
+		result = osd_req->r_result;

-	rbd_obj_handle_request(obj_req);
+	rbd_obj_handle_request(obj_req, result);
 }

-static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
+static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
 {
-	struct ceph_osd_request *osd_req = obj_request->osd_req;
+	struct rbd_obj_request *obj_request = osd_req->r_priv;
+	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
+	struct ceph_options *opt = rbd_dev->rbd_client->client->options;

-	osd_req->r_flags = CEPH_OSD_FLAG_READ;
+	osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
 	osd_req->r_snapid = obj_request->img_request->snap_id;
 }

-static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
+static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
 {
-	struct ceph_osd_request *osd_req = obj_request->osd_req;
+	struct rbd_obj_request *obj_request = osd_req->r_priv;

 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
 	ktime_get_real_ts64(&osd_req->r_mtime);
@@ -1471 +1468 @@
 }

 static struct ceph_osd_request *
-rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
+__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
+			  struct ceph_snap_context *snapc, int num_ops)
 {
-	struct rbd_img_request *img_req = obj_req->img_request;
-	struct rbd_device *rbd_dev = img_req->rbd_dev;
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct ceph_osd_request *req;
 	const char *name_format = rbd_dev->image_format == 1 ?
 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
+	int ret;

-	req = ceph_osdc_alloc_request(osdc,
-			(rbd_img_is_write(img_req) ? img_req->snapc : NULL),
-			num_ops, false, GFP_NOIO);
+	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
 	if (!req)
-		return NULL;
+		return ERR_PTR(-ENOMEM);

+	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
 	req->r_callback = rbd_osd_req_callback;
 	req->r_priv = obj_req;

@@ -1496 +1493 @@
 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;

-	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
-			rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
-		goto err_req;
-
-	if (ceph_osdc_alloc_messages(req, GFP_NOIO))
-		goto err_req;
+	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+			       rbd_dev->header.object_prefix,
+			       obj_req->ex.oe_objno);
+	if (ret)
+		return ERR_PTR(ret);

 	return req;
-
-err_req:
-	ceph_osdc_put_request(req);
-	return NULL;
 }

-static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
+static struct ceph_osd_request *
+rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
 {
-	ceph_osdc_put_request(osd_req);
+	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
+					 num_ops);
 }

 static struct rbd_obj_request *rbd_obj_request_create(void)
@@ -1524 +1518 @@
 		return NULL;

 	ceph_object_extent_init(&obj_request->ex);
+	INIT_LIST_HEAD(&obj_request->osd_reqs);
+	mutex_init(&obj_request->state_mutex);
 	kref_init(&obj_request->kref);

 	dout("%s %p\n", __func__, obj_request);
@@ -1533 +1529 @@
 static void rbd_obj_request_destroy(struct kref *kref)
 {
 	struct rbd_obj_request *obj_request;
+	struct ceph_osd_request *osd_req;
 	u32 i;

 	obj_request = container_of(kref, struct rbd_obj_request, kref);

 	dout("%s: obj %p\n", __func__, obj_request);

-	if (obj_request->osd_req)
-		rbd_osd_req_destroy(obj_request->osd_req);
+	while (!list_empty(&obj_request->osd_reqs)) {
+		osd_req = list_first_entry(&obj_request->osd_reqs,
+				    struct ceph_osd_request, r_private_item);
+		list_del_init(&osd_req->r_private_item);
+		ceph_osdc_put_request(osd_req);
+	}

 	switch (obj_request->img_request->data_type) {
 	case OBJ_REQUEST_NODATA:
@@ -1551 +1552 @@
 		kfree(obj_request->bvec_pos.bvecs);
 		break;
 	default:
-		rbd_assert(0);
+		BUG();
 	}

 	kfree(obj_request->img_extents);
@@ -1617 +1618 @@
 	if (!rbd_dev->parent_spec)
 		return false;

-	down_read(&rbd_dev->header_rwsem);
 	if (rbd_dev->parent_overlap)
 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
-	up_read(&rbd_dev->header_rwsem);

 	if (counter < 0)
 		rbd_warn(rbd_dev, "parent reference overflow");
@@ -1628 +1627 @@
 	return counter > 0;
 }

-/*
- * Caller is responsible for filling in the list of object requests
- * that comprises the image request, and the Linux request pointer
- * (if there is one).
- */
-static struct rbd_img_request *rbd_img_request_create(
-					struct rbd_device *rbd_dev,
-					enum obj_operation_type op_type,
-					struct ceph_snap_context *snapc)
+static void rbd_img_request_init(struct rbd_img_request *img_request,
+				 struct rbd_device *rbd_dev,
+				 enum obj_operation_type op_type)
 {
-	struct rbd_img_request *img_request;
-
-	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
-	if (!img_request)
-		return NULL;
+	memset(img_request, 0, sizeof(*img_request));

 	img_request->rbd_dev = rbd_dev;
 	img_request->op_type = op_type;
-	if (!rbd_img_is_write(img_request))
-		img_request->snap_id = rbd_dev->spec->snap_id;
-	else
-		img_request->snapc = snapc;

-	if (rbd_dev_parent_get(rbd_dev))
-		img_request_layered_set(img_request);
-
-	spin_lock_init(&img_request->completion_lock);
+	INIT_LIST_HEAD(&img_request->lock_item);
 	INIT_LIST_HEAD(&img_request->object_extents);
-	kref_init(&img_request->kref);
-
-	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
-	     obj_op_name(op_type), img_request);
-	return img_request;
+	mutex_init(&img_request->state_mutex);
 }

-static void rbd_img_request_destroy(struct kref *kref)
+static void rbd_img_capture_header(struct rbd_img_request *img_req)
 {
-	struct rbd_img_request *img_request;
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+	lockdep_assert_held(&rbd_dev->header_rwsem);
+
+	if (rbd_img_is_write(img_req))
+		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+	else
+		img_req->snap_id = rbd_dev->spec->snap_id;
+
+	if (rbd_dev_parent_get(rbd_dev))
+		img_request_layered_set(img_req);
+}
+
+static void rbd_img_request_destroy(struct rbd_img_request *img_request)
+{
 	struct rbd_obj_request *obj_request;
 	struct rbd_obj_request *next_obj_request;

-	img_request = container_of(kref, struct rbd_img_request, kref);
-
 	dout("%s: img %p\n", __func__, img_request);

+	WARN_ON(!list_empty(&img_request->lock_item));
 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
 		rbd_img_obj_request_del(img_request, obj_request);
-	rbd_assert(img_request->obj_request_count == 0);

-	if (img_request_layered_test(img_request)) {
-		img_request_layered_clear(img_request);
+	if (img_request_layered_test(img_request))
 		rbd_dev_parent_put(img_request->rbd_dev);
-	}

 	if (rbd_img_is_write(img_request))
 		ceph_put_snap_context(img_request->snapc);

-	kmem_cache_free(rbd_img_request_cache, img_request);
+	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
+		kmem_cache_free(rbd_img_request_cache, img_request);
+}
+
+#define BITS_PER_OBJ	2
+#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
+#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
+
+static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
+				   u64 *index, u8 *shift)
+{
+	u32 off;
+
+	rbd_assert(objno < rbd_dev->object_map_size);
+	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
+	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
+}
+
+static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u64 index;
+	u8 shift;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
+}
+
+static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
+{
+	u64 index;
+	u8 shift;
+	u8 *p;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	rbd_assert(!(val & ~OBJ_MASK));
+
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	p = &rbd_dev->object_map[index];
+	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
+}
+
+static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	spin_unlock(&rbd_dev->object_map_lock);
+	return state;
+}
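[Illustrative aside, not part of the patch: with BITS_PER_OBJ == 2 each
byte packs four object states, most significant bits first.  Working
through __rbd_object_map_index() for a few object numbers:]

	/* objno 0 -> index 0, shift 6   (bits 7:6 of byte 0) */
	/* objno 3 -> index 0, shift 0   (bits 1:0 of byte 0) */
	/* objno 5 -> index 1, shift 4   (bits 5:4 of byte 1) */

	/* so a lookup is: (object_map[index] >> shift) & OBJ_MASK */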
+
+static bool use_object_map(struct rbd_device *rbd_dev)
+{
+	/*
+	 * An image mapped read-only can't use the object map -- it isn't
+	 * loaded because the header lock isn't acquired.  Someone else can
+	 * write to the image and update the object map behind our back.
+	 *
+	 * A snapshot can't be written to, so using the object map is always
+	 * safe.
+	 */
+	if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
+		return false;
+
+	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
+		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
+}
+
+static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	/* fall back to default logic if object map is disabled or invalid */
+	if (!use_object_map(rbd_dev))
+		return true;
+
+	state = rbd_object_map_get(rbd_dev, objno);
+	return state != OBJECT_NONEXISTENT;
+}
+
+static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
+				struct ceph_object_id *oid)
+{
+	if (snap_id == CEPH_NOSNAP)
+		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id);
+	else
+		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id, snap_id);
+}
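[Illustrative aside, not part of the patch: assuming RBD_OBJECT_MAP_PREFIX
is "rbd_object_map.", this produces object names like the following for an
image with id "abc123":]

	HEAD:          rbd_object_map.abc123
	snapshot 0x12: rbd_object_map.abc123.0000000000000012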
+
+static int rbd_object_map_lock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	u8 lock_type;
+	char *lock_tag;
+	struct ceph_locker *lockers;
+	u32 num_lockers;
+	bool broke_lock = false;
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+again:
+	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
+	if (ret != -EBUSY || broke_lock) {
+		if (ret == -EEXIST)
+			ret = 0; /* already locked by myself */
+		if (ret)
+			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
+		return ret;
+	}
+
+	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
+				 RBD_LOCK_NAME, &lock_type, &lock_tag,
+				 &lockers, &num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
+		return ret;
+	}
+
+	kfree(lock_tag);
+	if (num_lockers == 0)
+		goto again;
+
+	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
+		 ENTITY_NAME(lockers[0].id.name));
+
+	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
+				  RBD_LOCK_NAME, lockers[0].id.cookie,
+				  &lockers[0].id.name);
+	ceph_free_lockers(lockers, num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
+		return ret;
+	}
+
+	broke_lock = true;
+	goto again;
+}
+
+static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			      "");
+	if (ret && ret != -ENOENT)
+		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
+}
+
+static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
+{
+	u8 struct_v;
+	u32 struct_len;
+	u32 header_len;
+	void *header_end;
+	int ret;
+
+	ceph_decode_32_safe(p, end, header_len, e_inval);
+	header_end = *p + header_len;
+
+	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
+				  &struct_len);
+	if (ret)
+		return ret;
+
+	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
+
+	*p = header_end;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
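[Illustrative aside, not part of the patch: the on-wire layout this decoder
expects, reconstructed from the calls above (ceph_start_decoding() consumes
a version byte, a compat byte and a length):]

	le32  header_len          /* length of the BitVector header */
	  u8    struct_v
	  u8    struct_compat
	  le32  struct_len        /* consumed by ceph_start_decoding() */
	  le64  object_map_size   /* number of objects */
	<header_end = start of header + header_len>
	packed 2-bit object states follow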
|---|
| 1864 | + |
|---|
| 1865 | +static int __rbd_object_map_load(struct rbd_device *rbd_dev) |
|---|
| 1866 | +{ |
|---|
| 1867 | + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
|---|
| 1868 | + CEPH_DEFINE_OID_ONSTACK(oid); |
|---|
| 1869 | + struct page **pages; |
|---|
| 1870 | + void *p, *end; |
|---|
| 1871 | + size_t reply_len; |
|---|
| 1872 | + u64 num_objects; |
|---|
| 1873 | + u64 object_map_bytes; |
|---|
| 1874 | + u64 object_map_size; |
|---|
| 1875 | + int num_pages; |
|---|
| 1876 | + int ret; |
|---|
| 1877 | + |
|---|
| 1878 | + rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size); |
|---|
| 1879 | + |
|---|
| 1880 | + num_objects = ceph_get_num_objects(&rbd_dev->layout, |
|---|
| 1881 | + rbd_dev->mapping.size); |
|---|
| 1882 | + object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ, |
|---|
| 1883 | + BITS_PER_BYTE); |
|---|
| 1884 | + num_pages = calc_pages_for(0, object_map_bytes) + 1; |
|---|
| 1885 | + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); |
|---|
| 1886 | + if (IS_ERR(pages)) |
|---|
| 1887 | + return PTR_ERR(pages); |
|---|
| 1888 | + |
|---|
| 1889 | + reply_len = num_pages * PAGE_SIZE; |
|---|
| 1890 | + rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid); |
|---|
| 1891 | + ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc, |
|---|
| 1892 | + "rbd", "object_map_load", CEPH_OSD_FLAG_READ, |
|---|
| 1893 | + NULL, 0, pages, &reply_len); |
|---|
| 1894 | + if (ret) |
|---|
| 1895 | + goto out; |
|---|
| 1896 | + |
|---|
| 1897 | + p = page_address(pages[0]); |
|---|
| 1898 | + end = p + min(reply_len, (size_t)PAGE_SIZE); |
|---|
| 1899 | + ret = decode_object_map_header(&p, end, &object_map_size); |
|---|
| 1900 | + if (ret) |
|---|
| 1901 | + goto out; |
|---|
| 1902 | + |
|---|
| 1903 | + if (object_map_size != num_objects) { |
|---|
| 1904 | + rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu", |
|---|
| 1905 | + object_map_size, num_objects); |
|---|
| 1906 | + ret = -EINVAL; |
|---|
| 1907 | + goto out; |
|---|
| 1908 | + } |
|---|
| 1909 | + |
|---|
| 1910 | + if (offset_in_page(p) + object_map_bytes > reply_len) { |
|---|
| 1911 | + ret = -EINVAL; |
|---|
| 1912 | + goto out; |
|---|
| 1913 | + } |
|---|
| 1914 | + |
|---|
| 1915 | + rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL); |
|---|
| 1916 | + if (!rbd_dev->object_map) { |
|---|
| 1917 | + ret = -ENOMEM; |
|---|
| 1918 | + goto out; |
|---|
| 1919 | + } |
|---|
| 1920 | + |
|---|
| 1921 | + rbd_dev->object_map_size = object_map_size; |
|---|
| 1922 | + ceph_copy_from_page_vector(pages, rbd_dev->object_map, |
|---|
| 1923 | + offset_in_page(p), object_map_bytes); |
|---|
| 1924 | + |
|---|
| 1925 | +out: |
|---|
| 1926 | + ceph_release_page_vector(pages, num_pages); |
|---|
| 1927 | + return ret; |
|---|
| 1928 | +} |
|---|
| 1929 | + |
|---|
| 1930 | +static void rbd_object_map_free(struct rbd_device *rbd_dev) |
|---|
| 1931 | +{ |
|---|
| 1932 | + kvfree(rbd_dev->object_map); |
|---|
| 1933 | + rbd_dev->object_map = NULL; |
|---|
| 1934 | + rbd_dev->object_map_size = 0; |
|---|
| 1935 | +} |
|---|
| 1936 | + |
|---|
| 1937 | +static int rbd_object_map_load(struct rbd_device *rbd_dev) |
|---|
| 1938 | +{ |
|---|
| 1939 | + int ret; |
|---|
| 1940 | + |
|---|
| 1941 | + ret = __rbd_object_map_load(rbd_dev); |
|---|
| 1942 | + if (ret) |
|---|
| 1943 | + return ret; |
|---|
| 1944 | + |
|---|
| 1945 | + ret = rbd_dev_v2_get_flags(rbd_dev); |
|---|
| 1946 | + if (ret) { |
|---|
| 1947 | + rbd_object_map_free(rbd_dev); |
|---|
| 1948 | + return ret; |
|---|
| 1949 | + } |
|---|
| 1950 | + |
|---|
| 1951 | + if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID) |
|---|
| 1952 | + rbd_warn(rbd_dev, "object map is invalid"); |
|---|
| 1953 | + |
|---|
| 1954 | + return 0; |
|---|
| 1955 | +} |
|---|
| 1956 | + |
|---|
| 1957 | +static int rbd_object_map_open(struct rbd_device *rbd_dev) |
|---|
| 1958 | +{ |
|---|
| 1959 | + int ret; |
|---|
| 1960 | + |
|---|
| 1961 | + ret = rbd_object_map_lock(rbd_dev); |
|---|
| 1962 | + if (ret) |
|---|
| 1963 | + return ret; |
|---|
| 1964 | + |
|---|
| 1965 | + ret = rbd_object_map_load(rbd_dev); |
|---|
| 1966 | + if (ret) { |
|---|
| 1967 | + rbd_object_map_unlock(rbd_dev); |
|---|
| 1968 | + return ret; |
|---|
| 1969 | + } |
|---|
| 1970 | + |
|---|
| 1971 | + return 0; |
|---|
| 1972 | +} |
|---|
| 1973 | + |
|---|
| 1974 | +static void rbd_object_map_close(struct rbd_device *rbd_dev) |
|---|
| 1975 | +{ |
|---|
| 1976 | + rbd_object_map_free(rbd_dev); |
|---|
| 1977 | + rbd_object_map_unlock(rbd_dev); |
|---|
| 1978 | +} |
|---|
| 1979 | + |
|---|
| 1980 | +/* |
|---|
| 1981 | + * This function needs snap_id (or more precisely just something to |
|---|
| 1982 | + * distinguish between HEAD and snapshot object maps), new_state and |
|---|
| 1983 | + * current_state that were passed to rbd_object_map_update(). |
|---|
| 1984 | + * |
|---|
| 1985 | + * To avoid allocating and stashing a context we piggyback on the OSD |
|---|
| 1986 | + * request. A HEAD update has two ops (assert_locked + update), a snapshot |
|---|
| 1987 | + * update has just one (update). For new_state and current_state we decode |
|---|
| 1988 | + * our own object_map_update op, encoded in rbd_cls_object_map_update(). |
|---|
| 1989 | + */ |
|---|
| 1990 | +static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req, |
|---|
| 1991 | + struct ceph_osd_request *osd_req) |
|---|
| 1992 | +{ |
|---|
| 1993 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 1994 | + struct ceph_osd_data *osd_data; |
|---|
| 1995 | + u64 objno; |
|---|
| 1996 | + u8 state, new_state, current_state; |
|---|
| 1997 | + bool has_current_state; |
|---|
| 1998 | + void *p; |
|---|
| 1999 | + |
|---|
| 2000 | + if (osd_req->r_result) |
|---|
| 2001 | + return osd_req->r_result; |
|---|
| 2002 | + |
|---|
| 2003 | + /* |
|---|
| 2004 | + * Nothing to do for a snapshot object map. |
|---|
| 2005 | + */ |
|---|
| 2006 | + if (osd_req->r_num_ops == 1) |
|---|
| 2007 | + return 0; |
|---|
| 2008 | + |
|---|
| 2009 | + /* |
|---|
| 2010 | + * Update in-memory HEAD object map. |
|---|
| 2011 | + */ |
|---|
| 2012 | + rbd_assert(osd_req->r_num_ops == 2); |
|---|
| 2013 | + osd_data = osd_req_op_data(osd_req, 1, cls, request_data); |
|---|
| 2014 | + rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES); |
|---|
| 2015 | + |
|---|
| 2016 | + p = page_address(osd_data->pages[0]); |
|---|
| 2017 | + objno = ceph_decode_64(&p); |
|---|
| 2018 | + rbd_assert(objno == obj_req->ex.oe_objno); |
|---|
| 2019 | + rbd_assert(ceph_decode_64(&p) == objno + 1); |
|---|
| 2020 | + new_state = ceph_decode_8(&p); |
|---|
| 2021 | + has_current_state = ceph_decode_8(&p); |
|---|
| 2022 | + if (has_current_state) |
|---|
| 2023 | + current_state = ceph_decode_8(&p); |
|---|
| 2024 | + |
|---|
| 2025 | + spin_lock(&rbd_dev->object_map_lock); |
|---|
| 2026 | + state = __rbd_object_map_get(rbd_dev, objno); |
|---|
| 2027 | + if (!has_current_state || current_state == state || |
|---|
| 2028 | + (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) |
|---|
| 2029 | + __rbd_object_map_set(rbd_dev, objno, new_state); |
|---|
| 2030 | + spin_unlock(&rbd_dev->object_map_lock); |
|---|
| 2031 | + |
|---|
| 2032 | + return 0; |
|---|
| 2033 | +} |
|---|
| 2034 | + |
|---|
| 2035 | +static void rbd_object_map_callback(struct ceph_osd_request *osd_req) |
|---|
| 2036 | +{ |
|---|
| 2037 | + struct rbd_obj_request *obj_req = osd_req->r_priv; |
|---|
| 2038 | + int result; |
|---|
| 2039 | + |
|---|
| 2040 | + dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, |
|---|
| 2041 | + osd_req->r_result, obj_req); |
|---|
| 2042 | + |
|---|
| 2043 | + result = rbd_object_map_update_finish(obj_req, osd_req); |
|---|
| 2044 | + rbd_obj_handle_request(obj_req, result); |
|---|
| 2045 | +} |
|---|
| 2046 | + |
|---|
| 2047 | +static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state) |
|---|
| 2048 | +{ |
|---|
| 2049 | + u8 state = rbd_object_map_get(rbd_dev, objno); |
|---|
| 2050 | + |
|---|
| 2051 | + if (state == new_state || |
|---|
| 2052 | + (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) || |
|---|
| 2053 | + (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) |
|---|
| 2054 | + return false; |
|---|
| 2055 | + |
|---|
| 2056 | + return true; |
|---|
| 2057 | +} |
|---|
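
update_needed() reads one 2-bit state per object: BITS_PER_OBJ == 2 follows from the sizing math in __rbd_object_map_load(), while the OBJECT_* values and the high-bits-first packing below are assumptions matching the cls_rbd/BitVector convention. A hedged userspace model of the accessors the filter relies on:

```c
/* Userspace model of the 2-bit-per-object map behind update_needed(). */
#include <stdint.h>
#include <stdio.h>

enum { OBJECT_NONEXISTENT, OBJECT_EXISTS, OBJECT_PENDING, OBJECT_EXISTS_CLEAN };

#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(8 / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)

static unsigned int obj_shift(uint64_t objno)
{
	/* first object of each byte lives in the highest-order bits */
	return (OBJS_PER_BYTE - 1 - objno % OBJS_PER_BYTE) * BITS_PER_OBJ;
}

static uint8_t map_get(const uint8_t *map, uint64_t objno)
{
	return (map[objno / OBJS_PER_BYTE] >> obj_shift(objno)) & OBJ_MASK;
}

static void map_set(uint8_t *map, uint64_t objno, uint8_t state)
{
	unsigned int shift = obj_shift(objno);
	uint8_t *byte = &map[objno / OBJS_PER_BYTE];

	*byte = (*byte & ~(OBJ_MASK << shift)) | (state << shift);
}

int main(void)
{
	uint8_t map[2] = { 0 };

	map_set(map, 5, OBJECT_EXISTS);
	printf("objno 5 state %u, map bytes %02x %02x\n",
	       map_get(map, 5), map[0], map[1]);	/* 1, 00 10 */
	return 0;
}
```
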
| 2058 | + |
|---|
| 2059 | +static int rbd_cls_object_map_update(struct ceph_osd_request *req, |
|---|
| 2060 | + int which, u64 objno, u8 new_state, |
|---|
| 2061 | + const u8 *current_state) |
|---|
| 2062 | +{ |
|---|
| 2063 | + struct page **pages; |
|---|
| 2064 | + void *p, *start; |
|---|
| 2065 | + int ret; |
|---|
| 2066 | + |
|---|
| 2067 | + ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update"); |
|---|
| 2068 | + if (ret) |
|---|
| 2069 | + return ret; |
|---|
| 2070 | + |
|---|
| 2071 | + pages = ceph_alloc_page_vector(1, GFP_NOIO); |
|---|
| 2072 | + if (IS_ERR(pages)) |
|---|
| 2073 | + return PTR_ERR(pages); |
|---|
| 2074 | + |
|---|
| 2075 | + p = start = page_address(pages[0]); |
|---|
| 2076 | + ceph_encode_64(&p, objno); |
|---|
| 2077 | + ceph_encode_64(&p, objno + 1); |
|---|
| 2078 | + ceph_encode_8(&p, new_state); |
|---|
| 2079 | + if (current_state) { |
|---|
| 2080 | + ceph_encode_8(&p, 1); |
|---|
| 2081 | + ceph_encode_8(&p, *current_state); |
|---|
| 2082 | + } else { |
|---|
| 2083 | + ceph_encode_8(&p, 0); |
|---|
| 2084 | + } |
|---|
| 2085 | + |
|---|
| 2086 | + osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0, |
|---|
| 2087 | + false, true); |
|---|
| 2088 | + return 0; |
|---|
| 2089 | +} |
|---|
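
A userspace sketch of the payload assembled above: the half-open object range, the new state, and an optional current state behind a presence byte, at most 19 bytes:

```c
/*
 * Mirrors the encoding in rbd_cls_object_map_update():
 * [le64 start_objno][le64 end_objno][u8 new_state][u8 has_current]
 * [u8 current_state, only if has_current].
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t *put_le64(uint8_t *p, uint64_t v)
{
	int i;

	for (i = 0; i < 8; i++)
		*p++ = v >> (8 * i);
	return p;
}

static size_t encode_update(uint8_t *buf, uint64_t objno, uint8_t new_state,
			    const uint8_t *current_state)
{
	uint8_t *p = buf;

	p = put_le64(p, objno);		/* half-open range [objno, objno + 1) */
	p = put_le64(p, objno + 1);
	*p++ = new_state;
	*p++ = current_state ? 1 : 0;
	if (current_state)
		*p++ = *current_state;
	return p - buf;
}

int main(void)
{
	uint8_t buf[19];
	uint8_t cur = 2;	/* OBJECT_PENDING, per the cls_rbd convention */

	printf("guarded update: %zu bytes\n", encode_update(buf, 42, 0, &cur));
	printf("plain update:   %zu bytes\n", encode_update(buf, 42, 1, NULL));
	return 0;
}
```
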
| 2090 | + |
|---|
| 2091 | +/* |
|---|
| 2092 | + * Return: |
|---|
| 2093 | + * 0 - object map update sent |
|---|
| 2094 | + * 1 - object map update isn't needed |
|---|
| 2095 | + * <0 - error |
|---|
| 2096 | + */ |
|---|
| 2097 | +static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id, |
|---|
| 2098 | + u8 new_state, const u8 *current_state) |
|---|
| 2099 | +{ |
|---|
| 2100 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 2101 | + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
|---|
| 2102 | + struct ceph_osd_request *req; |
|---|
| 2103 | + int num_ops = 1; |
|---|
| 2104 | + int which = 0; |
|---|
| 2105 | + int ret; |
|---|
| 2106 | + |
|---|
| 2107 | + if (snap_id == CEPH_NOSNAP) { |
|---|
| 2108 | + if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state)) |
|---|
| 2109 | + return 1; |
|---|
| 2110 | + |
|---|
| 2111 | + num_ops++; /* assert_locked */ |
|---|
| 2112 | + } |
|---|
| 2113 | + |
|---|
| 2114 | + req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO); |
|---|
| 2115 | + if (!req) |
|---|
| 2116 | + return -ENOMEM; |
|---|
| 2117 | + |
|---|
| 2118 | + list_add_tail(&req->r_private_item, &obj_req->osd_reqs); |
|---|
| 2119 | + req->r_callback = rbd_object_map_callback; |
|---|
| 2120 | + req->r_priv = obj_req; |
|---|
| 2121 | + |
|---|
| 2122 | + rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid); |
|---|
| 2123 | + ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); |
|---|
| 2124 | + req->r_flags = CEPH_OSD_FLAG_WRITE; |
|---|
| 2125 | + ktime_get_real_ts64(&req->r_mtime); |
|---|
| 2126 | + |
|---|
| 2127 | + if (snap_id == CEPH_NOSNAP) { |
|---|
| 2128 | + /* |
|---|
| 2129 | + * Protect against possible race conditions during lock |
|---|
| 2130 | + * ownership transitions. |
|---|
| 2131 | + */ |
|---|
| 2132 | + ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME, |
|---|
| 2133 | + CEPH_CLS_LOCK_EXCLUSIVE, "", ""); |
|---|
| 2134 | + if (ret) |
|---|
| 2135 | + return ret; |
|---|
| 2136 | + } |
|---|
| 2137 | + |
|---|
| 2138 | + ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno, |
|---|
| 2139 | + new_state, current_state); |
|---|
| 2140 | + if (ret) |
|---|
| 2141 | + return ret; |
|---|
| 2142 | + |
|---|
| 2143 | + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); |
|---|
| 2144 | + if (ret) |
|---|
| 2145 | + return ret; |
|---|
| 2146 | + |
|---|
| 2147 | + ceph_osdc_start_request(osdc, req, false); |
|---|
| 2148 | + return 0; |
|---|
| 1689 | 2149 | } |
|---|
| 1690 | 2150 | |
|---|
| 1691 | 2151 | static void prune_extents(struct ceph_file_extent *img_extents, |
|---|
| .. | .. |
|---|
| 1735 | 2195 | return 0; |
|---|
| 1736 | 2196 | } |
|---|
| 1737 | 2197 | |
|---|
| 1738 | | -static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) |
|---|
| 2198 | +static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which) |
|---|
| 1739 | 2199 | { |
|---|
| 2200 | + struct rbd_obj_request *obj_req = osd_req->r_priv; |
|---|
| 2201 | + |
|---|
| 1740 | 2202 | switch (obj_req->img_request->data_type) { |
|---|
| 1741 | 2203 | case OBJ_REQUEST_BIO: |
|---|
| 1742 | | - osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, |
|---|
| 2204 | + osd_req_op_extent_osd_data_bio(osd_req, which, |
|---|
| 1743 | 2205 | &obj_req->bio_pos, |
|---|
| 1744 | 2206 | obj_req->ex.oe_len); |
|---|
| 1745 | 2207 | break; |
|---|
| .. | .. |
|---|
| 1748 | 2210 | rbd_assert(obj_req->bvec_pos.iter.bi_size == |
|---|
| 1749 | 2211 | obj_req->ex.oe_len); |
|---|
| 1750 | 2212 | rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); |
|---|
| 1751 | | - osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, |
|---|
| 2213 | + osd_req_op_extent_osd_data_bvec_pos(osd_req, which, |
|---|
| 1752 | 2214 | &obj_req->bvec_pos); |
|---|
| 1753 | 2215 | break; |
|---|
| 1754 | 2216 | default: |
|---|
| 1755 | | - rbd_assert(0); |
|---|
| 2217 | + BUG(); |
|---|
| 1756 | 2218 | } |
|---|
| 1757 | 2219 | } |
|---|
| 1758 | 2220 | |
|---|
| 1759 | | -static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) |
|---|
| 1760 | | -{ |
|---|
| 1761 | | - obj_req->osd_req = rbd_osd_req_create(obj_req, 1); |
|---|
| 1762 | | - if (!obj_req->osd_req) |
|---|
| 1763 | | - return -ENOMEM; |
|---|
| 1764 | | - |
|---|
| 1765 | | - osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ, |
|---|
| 1766 | | - obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); |
|---|
| 1767 | | - rbd_osd_req_setup_data(obj_req, 0); |
|---|
| 1768 | | - |
|---|
| 1769 | | - rbd_osd_req_format_read(obj_req); |
|---|
| 1770 | | - return 0; |
|---|
| 1771 | | -} |
|---|
| 1772 | | - |
|---|
| 1773 | | -static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, |
|---|
| 1774 | | - unsigned int which) |
|---|
| 2221 | +static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which) |
|---|
| 1775 | 2222 | { |
|---|
| 1776 | 2223 | struct page **pages; |
|---|
| 1777 | 2224 | |
|---|
| .. | .. |
|---|
| 1787 | 2234 | if (IS_ERR(pages)) |
|---|
| 1788 | 2235 | return PTR_ERR(pages); |
|---|
| 1789 | 2236 | |
|---|
| 1790 | | - osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0); |
|---|
| 1791 | | - osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages, |
|---|
| 2237 | + osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0); |
|---|
| 2238 | + osd_req_op_raw_data_in_pages(osd_req, which, pages, |
|---|
| 1792 | 2239 | 8 + sizeof(struct ceph_timespec), |
|---|
| 1793 | 2240 | 0, false, true); |
|---|
| 1794 | 2241 | return 0; |
|---|
| 1795 | 2242 | } |
|---|
| 1796 | 2243 | |
|---|
| 1797 | | -static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, |
|---|
| 1798 | | - unsigned int which) |
|---|
| 2244 | +static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which, |
|---|
| 2245 | + u32 bytes) |
|---|
| 1799 | 2246 | { |
|---|
| 2247 | + struct rbd_obj_request *obj_req = osd_req->r_priv; |
|---|
| 2248 | + int ret; |
|---|
| 2249 | + |
|---|
| 2250 | + ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup"); |
|---|
| 2251 | + if (ret) |
|---|
| 2252 | + return ret; |
|---|
| 2253 | + |
|---|
| 2254 | + osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs, |
|---|
| 2255 | + obj_req->copyup_bvec_count, bytes); |
|---|
| 2256 | + return 0; |
|---|
| 2257 | +} |
|---|
| 2258 | + |
|---|
| 2259 | +static int rbd_obj_init_read(struct rbd_obj_request *obj_req) |
|---|
| 2260 | +{ |
|---|
| 2261 | + obj_req->read_state = RBD_OBJ_READ_START; |
|---|
| 2262 | + return 0; |
|---|
| 2263 | +} |
|---|
| 2264 | + |
|---|
| 2265 | +static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, |
|---|
| 2266 | + int which) |
|---|
| 2267 | +{ |
|---|
| 2268 | + struct rbd_obj_request *obj_req = osd_req->r_priv; |
|---|
| 1800 | 2269 | struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 1801 | 2270 | u16 opcode; |
|---|
| 1802 | 2271 | |
|---|
| 1803 | | - osd_req_op_alloc_hint_init(obj_req->osd_req, which++, |
|---|
| 1804 | | - rbd_dev->layout.object_size, |
|---|
| 1805 | | - rbd_dev->layout.object_size); |
|---|
| 2272 | + if (!use_object_map(rbd_dev) || |
|---|
| 2273 | + !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) { |
|---|
| 2274 | + osd_req_op_alloc_hint_init(osd_req, which++, |
|---|
| 2275 | + rbd_dev->layout.object_size, |
|---|
| 2276 | + rbd_dev->layout.object_size, |
|---|
| 2277 | + rbd_dev->opts->alloc_hint_flags); |
|---|
| 2278 | + } |
|---|
| 1806 | 2279 | |
|---|
| 1807 | 2280 | if (rbd_obj_is_entire(obj_req)) |
|---|
| 1808 | 2281 | opcode = CEPH_OSD_OP_WRITEFULL; |
|---|
| 1809 | 2282 | else |
|---|
| 1810 | 2283 | opcode = CEPH_OSD_OP_WRITE; |
|---|
| 1811 | 2284 | |
|---|
| 1812 | | - osd_req_op_extent_init(obj_req->osd_req, which, opcode, |
|---|
| 2285 | + osd_req_op_extent_init(osd_req, which, opcode, |
|---|
| 1813 | 2286 | obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); |
|---|
| 1814 | | - rbd_osd_req_setup_data(obj_req, which++); |
|---|
| 1815 | | - |
|---|
| 1816 | | - rbd_assert(which == obj_req->osd_req->r_num_ops); |
|---|
| 1817 | | - rbd_osd_req_format_write(obj_req); |
|---|
| 2287 | + rbd_osd_setup_data(osd_req, which); |
|---|
| 1818 | 2288 | } |
|---|
| 1819 | 2289 | |
|---|
| 1820 | | -static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) |
|---|
| 2290 | +static int rbd_obj_init_write(struct rbd_obj_request *obj_req) |
|---|
| 1821 | 2291 | { |
|---|
| 1822 | | - unsigned int num_osd_ops, which = 0; |
|---|
| 1823 | 2292 | int ret; |
|---|
| 1824 | 2293 | |
|---|
| 1825 | 2294 | /* reverse map the entire object onto the parent */ |
|---|
| .. | .. |
|---|
| 1827 | 2296 | if (ret) |
|---|
| 1828 | 2297 | return ret; |
|---|
| 1829 | 2298 | |
|---|
| 1830 | | - if (obj_req->num_img_extents) { |
|---|
| 1831 | | - obj_req->write_state = RBD_OBJ_WRITE_GUARD; |
|---|
| 1832 | | - num_osd_ops = 3; /* stat + setallochint + write/writefull */ |
|---|
| 1833 | | - } else { |
|---|
| 1834 | | - obj_req->write_state = RBD_OBJ_WRITE_FLAT; |
|---|
| 1835 | | - num_osd_ops = 2; /* setallochint + write/writefull */ |
|---|
| 1836 | | - } |
|---|
| 2299 | + if (rbd_obj_copyup_enabled(obj_req)) |
|---|
| 2300 | + obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; |
|---|
| 1837 | 2301 | |
|---|
| 1838 | | - obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); |
|---|
| 1839 | | - if (!obj_req->osd_req) |
|---|
| 1840 | | - return -ENOMEM; |
|---|
| 1841 | | - |
|---|
| 1842 | | - if (obj_req->num_img_extents) { |
|---|
| 1843 | | - ret = __rbd_obj_setup_stat(obj_req, which++); |
|---|
| 1844 | | - if (ret) |
|---|
| 1845 | | - return ret; |
|---|
| 1846 | | - } |
|---|
| 1847 | | - |
|---|
| 1848 | | - __rbd_obj_setup_write(obj_req, which); |
|---|
| 2302 | + obj_req->write_state = RBD_OBJ_WRITE_START; |
|---|
| 1849 | 2303 | return 0; |
|---|
| 1850 | 2304 | } |
|---|
| 1851 | 2305 | |
|---|
| 1852 | | -static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, |
|---|
| 1853 | | - unsigned int which) |
|---|
| 2306 | +static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) |
|---|
| 1854 | 2307 | { |
|---|
| 2308 | + return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE : |
|---|
| 2309 | + CEPH_OSD_OP_ZERO; |
|---|
| 2310 | +} |
|---|
| 2311 | + |
|---|
| 2312 | +static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req, |
|---|
| 2313 | + int which) |
|---|
| 2314 | +{ |
|---|
| 2315 | + struct rbd_obj_request *obj_req = osd_req->r_priv; |
|---|
| 2316 | + |
|---|
| 2317 | + if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { |
|---|
| 2318 | + rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION); |
|---|
| 2319 | + osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0); |
|---|
| 2320 | + } else { |
|---|
| 2321 | + osd_req_op_extent_init(osd_req, which, |
|---|
| 2322 | + truncate_or_zero_opcode(obj_req), |
|---|
| 2323 | + obj_req->ex.oe_off, obj_req->ex.oe_len, |
|---|
| 2324 | + 0, 0); |
|---|
| 2325 | + } |
|---|
| 2326 | +} |
|---|
| 2327 | + |
|---|
| 2328 | +static int rbd_obj_init_discard(struct rbd_obj_request *obj_req) |
|---|
| 2329 | +{ |
|---|
| 2330 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 2331 | + u64 off, next_off; |
|---|
| 2332 | + int ret; |
|---|
| 2333 | + |
|---|
| 2334 | + /* |
|---|
| 2335 | + * Align the range to alloc_size boundary and punt on discards |
|---|
| 2336 | + * that are too small to free up any space. |
|---|
| 2337 | + * |
|---|
| 2338 | + * alloc_size == object_size && is_tail() is a special case for |
|---|
| 2339 | + * filestore with filestore_punch_hole = false, needed to allow |
|---|
| 2340 | + * truncate (in addition to delete). |
|---|
| 2341 | + */ |
|---|
| 2342 | + if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size || |
|---|
| 2343 | + !rbd_obj_is_tail(obj_req)) { |
|---|
| 2344 | + off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size); |
|---|
| 2345 | + next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len, |
|---|
| 2346 | + rbd_dev->opts->alloc_size); |
|---|
| 2347 | + if (off >= next_off) |
|---|
| 2348 | + return 1; |
|---|
| 2349 | + |
|---|
| 2350 | + dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, |
|---|
| 2351 | + obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, |
|---|
| 2352 | + off, next_off - off); |
|---|
| 2353 | + obj_req->ex.oe_off = off; |
|---|
| 2354 | + obj_req->ex.oe_len = next_off - off; |
|---|
| 2355 | + } |
|---|
| 2356 | + |
|---|
| 2357 | + /* reverse map the entire object onto the parent */ |
|---|
| 2358 | + ret = rbd_obj_calc_img_extents(obj_req, true); |
|---|
| 2359 | + if (ret) |
|---|
| 2360 | + return ret; |
|---|
| 2361 | + |
|---|
| 2362 | + obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; |
|---|
| 2363 | + if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) |
|---|
| 2364 | + obj_req->flags |= RBD_OBJ_FLAG_DELETION; |
|---|
| 2365 | + |
|---|
| 2366 | + obj_req->write_state = RBD_OBJ_WRITE_START; |
|---|
| 2367 | + return 0; |
|---|
| 2368 | +} |
|---|
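
A worked example of the trimming, as a userspace sketch with an assumed alloc_size of 64 KiB: only whole allocation units are worth discarding, so the range shrinks inward and punts when nothing whole remains.

```c
/* Models the round_up()/round_down() trimming in rbd_obj_init_discard(). */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ROUND_UP(x, a)		((((x) + (a) - 1) / (a)) * (a))
#define ROUND_DOWN(x, a)	(((x) / (a)) * (a))

int main(void)
{
	uint64_t alloc_size = 64 * 1024;
	uint64_t oe_off = 10 * 1024;

	/* 10K~200K: shrinks to 64K~128K, the aligned middle */
	uint64_t off = ROUND_UP(oe_off, alloc_size);
	uint64_t next_off = ROUND_DOWN(oe_off + 200 * 1024, alloc_size);

	printf("trimmed: %" PRIu64 "~%" PRIu64 "\n", off, next_off - off);

	/* 10K~100K: no whole 64K unit inside, off >= next_off -> punt */
	next_off = ROUND_DOWN(oe_off + 100 * 1024, alloc_size);
	printf("punt: %s\n", off >= next_off ? "yes" : "no");
	return 0;
}
```
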
| 2369 | + |
|---|
| 2370 | +static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req, |
|---|
| 2371 | + int which) |
|---|
| 2372 | +{ |
|---|
| 2373 | + struct rbd_obj_request *obj_req = osd_req->r_priv; |
|---|
| 1855 | 2374 | u16 opcode; |
|---|
| 1856 | 2375 | |
|---|
| 1857 | 2376 | if (rbd_obj_is_entire(obj_req)) { |
|---|
| 1858 | 2377 | if (obj_req->num_img_extents) { |
|---|
| 1859 | | - osd_req_op_init(obj_req->osd_req, which++, |
|---|
| 1860 | | - CEPH_OSD_OP_CREATE, 0); |
|---|
| 2378 | + if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)) |
|---|
| 2379 | + osd_req_op_init(osd_req, which++, |
|---|
| 2380 | + CEPH_OSD_OP_CREATE, 0); |
|---|
| 1861 | 2381 | opcode = CEPH_OSD_OP_TRUNCATE; |
|---|
| 1862 | 2382 | } else { |
|---|
| 1863 | | - osd_req_op_init(obj_req->osd_req, which++, |
|---|
| 2383 | + rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION); |
|---|
| 2384 | + osd_req_op_init(osd_req, which++, |
|---|
| 1864 | 2385 | CEPH_OSD_OP_DELETE, 0); |
|---|
| 1865 | 2386 | opcode = 0; |
|---|
| 1866 | 2387 | } |
|---|
| 1867 | | - } else if (rbd_obj_is_tail(obj_req)) { |
|---|
| 1868 | | - opcode = CEPH_OSD_OP_TRUNCATE; |
|---|
| 1869 | 2388 | } else { |
|---|
| 1870 | | - opcode = CEPH_OSD_OP_ZERO; |
|---|
| 2389 | + opcode = truncate_or_zero_opcode(obj_req); |
|---|
| 1871 | 2390 | } |
|---|
| 1872 | 2391 | |
|---|
| 1873 | 2392 | if (opcode) |
|---|
| 1874 | | - osd_req_op_extent_init(obj_req->osd_req, which++, opcode, |
|---|
| 2393 | + osd_req_op_extent_init(osd_req, which, opcode, |
|---|
| 1875 | 2394 | obj_req->ex.oe_off, obj_req->ex.oe_len, |
|---|
| 1876 | 2395 | 0, 0); |
|---|
| 1877 | | - |
|---|
| 1878 | | - rbd_assert(which == obj_req->osd_req->r_num_ops); |
|---|
| 1879 | | - rbd_osd_req_format_write(obj_req); |
|---|
| 1880 | 2396 | } |
|---|
| 1881 | 2397 | |
|---|
| 1882 | | -static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) |
|---|
| 2398 | +static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req) |
|---|
| 1883 | 2399 | { |
|---|
| 1884 | | - unsigned int num_osd_ops, which = 0; |
|---|
| 1885 | 2400 | int ret; |
|---|
| 1886 | 2401 | |
|---|
| 1887 | 2402 | /* reverse map the entire object onto the parent */ |
|---|
| .. | .. |
|---|
| 1889 | 2404 | if (ret) |
|---|
| 1890 | 2405 | return ret; |
|---|
| 1891 | 2406 | |
|---|
| 1892 | | - if (rbd_obj_is_entire(obj_req)) { |
|---|
| 1893 | | - obj_req->write_state = RBD_OBJ_WRITE_FLAT; |
|---|
| 1894 | | - if (obj_req->num_img_extents) |
|---|
| 1895 | | - num_osd_ops = 2; /* create + truncate */ |
|---|
| 1896 | | - else |
|---|
| 1897 | | - num_osd_ops = 1; /* delete */ |
|---|
| 1898 | | - } else { |
|---|
| 1899 | | - if (obj_req->num_img_extents) { |
|---|
| 1900 | | - obj_req->write_state = RBD_OBJ_WRITE_GUARD; |
|---|
| 1901 | | - num_osd_ops = 2; /* stat + truncate/zero */ |
|---|
| 1902 | | - } else { |
|---|
| 1903 | | - obj_req->write_state = RBD_OBJ_WRITE_FLAT; |
|---|
| 1904 | | - num_osd_ops = 1; /* truncate/zero */ |
|---|
| 1905 | | - } |
|---|
| 2407 | + if (rbd_obj_copyup_enabled(obj_req)) |
|---|
| 2408 | + obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; |
|---|
| 2409 | + if (!obj_req->num_img_extents) { |
|---|
| 2410 | + obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; |
|---|
| 2411 | + if (rbd_obj_is_entire(obj_req)) |
|---|
| 2412 | + obj_req->flags |= RBD_OBJ_FLAG_DELETION; |
|---|
| 1906 | 2413 | } |
|---|
| 1907 | 2414 | |
|---|
| 1908 | | - obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); |
|---|
| 1909 | | - if (!obj_req->osd_req) |
|---|
| 1910 | | - return -ENOMEM; |
|---|
| 1911 | | - |
|---|
| 1912 | | - if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) { |
|---|
| 1913 | | - ret = __rbd_obj_setup_stat(obj_req, which++); |
|---|
| 1914 | | - if (ret) |
|---|
| 1915 | | - return ret; |
|---|
| 1916 | | - } |
|---|
| 1917 | | - |
|---|
| 1918 | | - __rbd_obj_setup_discard(obj_req, which); |
|---|
| 2415 | + obj_req->write_state = RBD_OBJ_WRITE_START; |
|---|
| 1919 | 2416 | return 0; |
|---|
| 1920 | 2417 | } |
|---|
| 1921 | 2418 | |
|---|
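
OBJ_OP_ZEROOUT is new here. Presumably (the dispatch is not part of this hunk) it serves REQ_OP_WRITE_ZEROES while OBJ_OP_DISCARD keeps serving REQ_OP_DISCARD; the behavioral split falls out of the two init functions above: discard may punt misaligned pieces, zeroout must zero the exact range. A hypothetical sketch of that mapping:

```c
/*
 * Hypothetical block-layer dispatch; the REQ_OP_WRITE_ZEROES ->
 * OBJ_OP_ZEROOUT mapping is an assumption based on the semantics
 * above, not something shown in this hunk.
 */
#include <stdio.h>

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

enum req_op { REQ_OP_READ, REQ_OP_WRITE, REQ_OP_DISCARD, REQ_OP_WRITE_ZEROES };

static enum obj_operation_type to_obj_op(enum req_op op)
{
	switch (op) {
	case REQ_OP_WRITE:
		return OBJ_OP_WRITE;
	case REQ_OP_DISCARD:
		return OBJ_OP_DISCARD;		/* may shrink or drop pieces */
	case REQ_OP_WRITE_ZEROES:
		return OBJ_OP_ZEROOUT;		/* must zero exactly */
	default:
		return OBJ_OP_READ;
	}
}

int main(void)
{
	printf("WRITE_ZEROES -> %d (OBJ_OP_ZEROOUT)\n",
	       to_obj_op(REQ_OP_WRITE_ZEROES));
	return 0;
}
```
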
| 2419 | +static int count_write_ops(struct rbd_obj_request *obj_req) |
|---|
| 2420 | +{ |
|---|
| 2421 | + struct rbd_img_request *img_req = obj_req->img_request; |
|---|
| 2422 | + |
|---|
| 2423 | + switch (img_req->op_type) { |
|---|
| 2424 | + case OBJ_OP_WRITE: |
|---|
| 2425 | + if (!use_object_map(img_req->rbd_dev) || |
|---|
| 2426 | + !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) |
|---|
| 2427 | + return 2; /* setallochint + write/writefull */ |
|---|
| 2428 | + |
|---|
| 2429 | + return 1; /* write/writefull */ |
|---|
| 2430 | + case OBJ_OP_DISCARD: |
|---|
| 2431 | + return 1; /* delete/truncate/zero */ |
|---|
| 2432 | + case OBJ_OP_ZEROOUT: |
|---|
| 2433 | + if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents && |
|---|
| 2434 | + !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)) |
|---|
| 2435 | + return 2; /* create + truncate */ |
|---|
| 2436 | + |
|---|
| 2437 | + return 1; /* delete/truncate/zero */ |
|---|
| 2438 | + default: |
|---|
| 2439 | + BUG(); |
|---|
| 2440 | + } |
|---|
| 2441 | +} |
|---|
| 2442 | + |
|---|
| 2443 | +static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, |
|---|
| 2444 | + int which) |
|---|
| 2445 | +{ |
|---|
| 2446 | + struct rbd_obj_request *obj_req = osd_req->r_priv; |
|---|
| 2447 | + |
|---|
| 2448 | + switch (obj_req->img_request->op_type) { |
|---|
| 2449 | + case OBJ_OP_WRITE: |
|---|
| 2450 | + __rbd_osd_setup_write_ops(osd_req, which); |
|---|
| 2451 | + break; |
|---|
| 2452 | + case OBJ_OP_DISCARD: |
|---|
| 2453 | + __rbd_osd_setup_discard_ops(osd_req, which); |
|---|
| 2454 | + break; |
|---|
| 2455 | + case OBJ_OP_ZEROOUT: |
|---|
| 2456 | + __rbd_osd_setup_zeroout_ops(osd_req, which); |
|---|
| 2457 | + break; |
|---|
| 2458 | + default: |
|---|
| 2459 | + BUG(); |
|---|
| 2460 | + } |
|---|
| 2461 | +} |
|---|
| 2462 | + |
|---|
| 1922 | 2463 | /* |
|---|
| 1923 | | - * For each object request in @img_req, allocate an OSD request, add |
|---|
| 1924 | | - * individual OSD ops and prepare them for submission. The number of |
|---|
| 1925 | | - * OSD ops depends on op_type and the overlap point (if any). |
|---|
| 2464 | + * Prune the list of object requests (adjust offset and/or length, drop |
|---|
| 2465 | + * redundant requests). Prepare object request state machines and image |
|---|
| 2466 | + * request state machine for execution. |
|---|
| 1926 | 2467 | */ |
|---|
| 1927 | 2468 | static int __rbd_img_fill_request(struct rbd_img_request *img_req) |
|---|
| 1928 | 2469 | { |
|---|
| 1929 | | - struct rbd_obj_request *obj_req; |
|---|
| 2470 | + struct rbd_obj_request *obj_req, *next_obj_req; |
|---|
| 1930 | 2471 | int ret; |
|---|
| 1931 | 2472 | |
|---|
| 1932 | | - for_each_obj_request(img_req, obj_req) { |
|---|
| 2473 | + for_each_obj_request_safe(img_req, obj_req, next_obj_req) { |
|---|
| 1933 | 2474 | switch (img_req->op_type) { |
|---|
| 1934 | 2475 | case OBJ_OP_READ: |
|---|
| 1935 | | - ret = rbd_obj_setup_read(obj_req); |
|---|
| 2476 | + ret = rbd_obj_init_read(obj_req); |
|---|
| 1936 | 2477 | break; |
|---|
| 1937 | 2478 | case OBJ_OP_WRITE: |
|---|
| 1938 | | - ret = rbd_obj_setup_write(obj_req); |
|---|
| 2479 | + ret = rbd_obj_init_write(obj_req); |
|---|
| 1939 | 2480 | break; |
|---|
| 1940 | 2481 | case OBJ_OP_DISCARD: |
|---|
| 1941 | | - ret = rbd_obj_setup_discard(obj_req); |
|---|
| 2482 | + ret = rbd_obj_init_discard(obj_req); |
|---|
| 2483 | + break; |
|---|
| 2484 | + case OBJ_OP_ZEROOUT: |
|---|
| 2485 | + ret = rbd_obj_init_zeroout(obj_req); |
|---|
| 1942 | 2486 | break; |
|---|
| 1943 | 2487 | default: |
|---|
| 1944 | | - rbd_assert(0); |
|---|
| 2488 | + BUG(); |
|---|
| 1945 | 2489 | } |
|---|
| 1946 | | - if (ret) |
|---|
| 2490 | + if (ret < 0) |
|---|
| 1947 | 2491 | return ret; |
|---|
| 2492 | + if (ret > 0) { |
|---|
| 2493 | + rbd_img_obj_request_del(img_req, obj_req); |
|---|
| 2494 | + continue; |
|---|
| 2495 | + } |
|---|
| 1948 | 2496 | } |
|---|
| 1949 | 2497 | |
|---|
| 2498 | + img_req->state = RBD_IMG_START; |
|---|
| 1950 | 2499 | return 0; |
|---|
| 1951 | 2500 | } |
|---|
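
The per-op init helpers above share a keep/drop/error convention: negative fails the whole image request, zero keeps the object request, positive (e.g. a fully-trimmed discard) prunes it from the list. A small userspace model of the loop's contract:

```c
/* Models the <0 / 0 / >0 convention used by __rbd_img_fill_request(). */
#include <stdio.h>

static int init_one(int len, int alloc_size)
{
	return len < alloc_size;	/* 1: too small to matter, drop */
}

int main(void)
{
	int lens[] = { 4096, 131072, 512 };
	int i, ret, kept = 0;

	for (i = 0; i < 3; i++) {
		ret = init_one(lens[i], 65536);
		if (ret < 0)
			return 1;	/* error: abort the image request */
		if (ret > 0)
			continue;	/* prune this object request */
		kept++;
	}
	printf("kept %d of 3 object requests\n", kept);	/* 1 */
	return 0;
}
```
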
| 1952 | 2501 | |
|---|
| .. | .. |
|---|
| 2235 | 2784 | &it); |
|---|
| 2236 | 2785 | } |
|---|
| 2237 | 2786 | |
|---|
| 2238 | | -static void rbd_img_request_submit(struct rbd_img_request *img_request) |
|---|
| 2787 | +static void rbd_img_handle_request_work(struct work_struct *work) |
|---|
| 2239 | 2788 | { |
|---|
| 2240 | | - struct rbd_obj_request *obj_request; |
|---|
| 2789 | + struct rbd_img_request *img_req = |
|---|
| 2790 | + container_of(work, struct rbd_img_request, work); |
|---|
| 2241 | 2791 | |
|---|
| 2242 | | - dout("%s: img %p\n", __func__, img_request); |
|---|
| 2792 | + rbd_img_handle_request(img_req, img_req->work_result); |
|---|
| 2793 | +} |
|---|
| 2243 | 2794 | |
|---|
| 2244 | | - rbd_img_request_get(img_request); |
|---|
| 2245 | | - for_each_obj_request(img_request, obj_request) |
|---|
| 2246 | | - rbd_obj_request_submit(obj_request); |
|---|
| 2795 | +static void rbd_img_schedule(struct rbd_img_request *img_req, int result) |
|---|
| 2796 | +{ |
|---|
| 2797 | + INIT_WORK(&img_req->work, rbd_img_handle_request_work); |
|---|
| 2798 | + img_req->work_result = result; |
|---|
| 2799 | + queue_work(rbd_wq, &img_req->work); |
|---|
| 2800 | +} |
|---|
| 2247 | 2801 | |
|---|
| 2248 | | - rbd_img_request_put(img_request); |
|---|
| 2802 | +static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req) |
|---|
| 2803 | +{ |
|---|
| 2804 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 2805 | + |
|---|
| 2806 | + if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) { |
|---|
| 2807 | + obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST; |
|---|
| 2808 | + return true; |
|---|
| 2809 | + } |
|---|
| 2810 | + |
|---|
| 2811 | + dout("%s %p objno %llu assuming dne\n", __func__, obj_req, |
|---|
| 2812 | + obj_req->ex.oe_objno); |
|---|
| 2813 | + return false; |
|---|
| 2814 | +} |
|---|
| 2815 | + |
|---|
| 2816 | +static int rbd_obj_read_object(struct rbd_obj_request *obj_req) |
|---|
| 2817 | +{ |
|---|
| 2818 | + struct ceph_osd_request *osd_req; |
|---|
| 2819 | + int ret; |
|---|
| 2820 | + |
|---|
| 2821 | + osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1); |
|---|
| 2822 | + if (IS_ERR(osd_req)) |
|---|
| 2823 | + return PTR_ERR(osd_req); |
|---|
| 2824 | + |
|---|
| 2825 | + osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ, |
|---|
| 2826 | + obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); |
|---|
| 2827 | + rbd_osd_setup_data(osd_req, 0); |
|---|
| 2828 | + rbd_osd_format_read(osd_req); |
|---|
| 2829 | + |
|---|
| 2830 | + ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); |
|---|
| 2831 | + if (ret) |
|---|
| 2832 | + return ret; |
|---|
| 2833 | + |
|---|
| 2834 | + rbd_osd_submit(osd_req); |
|---|
| 2835 | + return 0; |
|---|
| 2249 | 2836 | } |
|---|
| 2250 | 2837 | |
|---|
| 2251 | 2838 | static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) |
|---|
| 2252 | 2839 | { |
|---|
| 2253 | 2840 | struct rbd_img_request *img_req = obj_req->img_request; |
|---|
| 2841 | + struct rbd_device *parent = img_req->rbd_dev->parent; |
|---|
| 2254 | 2842 | struct rbd_img_request *child_img_req; |
|---|
| 2255 | 2843 | int ret; |
|---|
| 2256 | 2844 | |
|---|
| 2257 | | - child_img_req = rbd_img_request_create(img_req->rbd_dev->parent, |
|---|
| 2258 | | - OBJ_OP_READ, NULL); |
|---|
| 2845 | + child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); |
|---|
| 2259 | 2846 | if (!child_img_req) |
|---|
| 2260 | 2847 | return -ENOMEM; |
|---|
| 2261 | 2848 | |
|---|
| 2849 | + rbd_img_request_init(child_img_req, parent, OBJ_OP_READ); |
|---|
| 2262 | 2850 | __set_bit(IMG_REQ_CHILD, &child_img_req->flags); |
|---|
| 2263 | 2851 | child_img_req->obj_request = obj_req; |
|---|
| 2852 | + |
|---|
| 2853 | + down_read(&parent->header_rwsem); |
|---|
| 2854 | + rbd_img_capture_header(child_img_req); |
|---|
| 2855 | + up_read(&parent->header_rwsem); |
|---|
| 2856 | + |
|---|
| 2857 | + dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, |
|---|
| 2858 | + obj_req); |
|---|
| 2264 | 2859 | |
|---|
| 2265 | 2860 | if (!rbd_img_is_write(img_req)) { |
|---|
| 2266 | 2861 | switch (img_req->data_type) { |
|---|
| .. | .. |
|---|
| 2278 | 2873 | &obj_req->bvec_pos); |
|---|
| 2279 | 2874 | break; |
|---|
| 2280 | 2875 | default: |
|---|
| 2281 | | - rbd_assert(0); |
|---|
| 2876 | + BUG(); |
|---|
| 2282 | 2877 | } |
|---|
| 2283 | 2878 | } else { |
|---|
| 2284 | 2879 | ret = rbd_img_fill_from_bvecs(child_img_req, |
|---|
| .. | .. |
|---|
| 2287 | 2882 | obj_req->copyup_bvecs); |
|---|
| 2288 | 2883 | } |
|---|
| 2289 | 2884 | if (ret) { |
|---|
| 2290 | | - rbd_img_request_put(child_img_req); |
|---|
| 2885 | + rbd_img_request_destroy(child_img_req); |
|---|
| 2291 | 2886 | return ret; |
|---|
| 2292 | 2887 | } |
|---|
| 2293 | 2888 | |
|---|
| 2294 | | - rbd_img_request_submit(child_img_req); |
|---|
| 2889 | + /* avoid parent chain recursion */ |
|---|
| 2890 | + rbd_img_schedule(child_img_req, 0); |
|---|
| 2295 | 2891 | return 0; |
|---|
| 2296 | 2892 | } |
|---|
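
Scheduling the child request on rbd_wq rather than running it inline bounds the stack depth: with a long clone chain, inline submission and completion would nest one frame per ancestor layer. A userspace model of the effect, assuming a plain FIFO in place of the workqueue:

```c
/* Why deferral flattens parent-chain recursion: drain a FIFO instead
 * of recursing, so the "stack" is the queue, not the call stack. */
#include <stdio.h>

#define MAX_LAYERS 16

static int fifo[MAX_LAYERS];
static int head, tail;

static void schedule_layer(int layer)
{
	fifo[tail++] = layer;		/* models queue_work(rbd_wq, ...) */
}

int main(void)
{
	schedule_layer(5);		/* image with 5 ancestor layers */
	while (head < tail) {
		int layer = fifo[head++];

		printf("handling layer %d (stack depth still 1)\n", layer);
		if (layer > 0)
			schedule_layer(layer - 1);	/* defer the parent */
	}
	return 0;
}
```
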
| 2297 | 2893 | |
|---|
| 2298 | | -static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) |
|---|
| 2894 | +static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result) |
|---|
| 2299 | 2895 | { |
|---|
| 2300 | 2896 | struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 2301 | 2897 | int ret; |
|---|
| 2302 | 2898 | |
|---|
| 2303 | | - if (obj_req->result == -ENOENT && |
|---|
| 2304 | | - rbd_dev->parent_overlap && !obj_req->tried_parent) { |
|---|
| 2305 | | - /* reverse map this object extent onto the parent */ |
|---|
| 2306 | | - ret = rbd_obj_calc_img_extents(obj_req, false); |
|---|
| 2899 | +again: |
|---|
| 2900 | + switch (obj_req->read_state) { |
|---|
| 2901 | + case RBD_OBJ_READ_START: |
|---|
| 2902 | + rbd_assert(!*result); |
|---|
| 2903 | + |
|---|
| 2904 | + if (!rbd_obj_may_exist(obj_req)) { |
|---|
| 2905 | + *result = -ENOENT; |
|---|
| 2906 | + obj_req->read_state = RBD_OBJ_READ_OBJECT; |
|---|
| 2907 | + goto again; |
|---|
| 2908 | + } |
|---|
| 2909 | + |
|---|
| 2910 | + ret = rbd_obj_read_object(obj_req); |
|---|
| 2307 | 2911 | if (ret) { |
|---|
| 2308 | | - obj_req->result = ret; |
|---|
| 2912 | + *result = ret; |
|---|
| 2309 | 2913 | return true; |
|---|
| 2310 | 2914 | } |
|---|
| 2311 | | - |
|---|
| 2312 | | - if (obj_req->num_img_extents) { |
|---|
| 2313 | | - obj_req->tried_parent = true; |
|---|
| 2314 | | - ret = rbd_obj_read_from_parent(obj_req); |
|---|
| 2915 | + obj_req->read_state = RBD_OBJ_READ_OBJECT; |
|---|
| 2916 | + return false; |
|---|
| 2917 | + case RBD_OBJ_READ_OBJECT: |
|---|
| 2918 | + if (*result == -ENOENT && rbd_dev->parent_overlap) { |
|---|
| 2919 | + /* reverse map this object extent onto the parent */ |
|---|
| 2920 | + ret = rbd_obj_calc_img_extents(obj_req, false); |
|---|
| 2315 | 2921 | if (ret) { |
|---|
| 2316 | | - obj_req->result = ret; |
|---|
| 2922 | + *result = ret; |
|---|
| 2317 | 2923 | return true; |
|---|
| 2318 | 2924 | } |
|---|
| 2319 | | - return false; |
|---|
| 2925 | + if (obj_req->num_img_extents) { |
|---|
| 2926 | + ret = rbd_obj_read_from_parent(obj_req); |
|---|
| 2927 | + if (ret) { |
|---|
| 2928 | + *result = ret; |
|---|
| 2929 | + return true; |
|---|
| 2930 | + } |
|---|
| 2931 | + obj_req->read_state = RBD_OBJ_READ_PARENT; |
|---|
| 2932 | + return false; |
|---|
| 2933 | + } |
|---|
| 2320 | 2934 | } |
|---|
| 2935 | + |
|---|
| 2936 | + /* |
|---|
| 2937 | + * -ENOENT means a hole in the image -- zero-fill the entire |
|---|
| 2938 | + * length of the request. A short read also implies zero-fill |
|---|
| 2939 | + * to the end of the request. |
|---|
| 2940 | + */ |
|---|
| 2941 | + if (*result == -ENOENT) { |
|---|
| 2942 | + rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len); |
|---|
| 2943 | + *result = 0; |
|---|
| 2944 | + } else if (*result >= 0) { |
|---|
| 2945 | + if (*result < obj_req->ex.oe_len) |
|---|
| 2946 | + rbd_obj_zero_range(obj_req, *result, |
|---|
| 2947 | + obj_req->ex.oe_len - *result); |
|---|
| 2948 | + else |
|---|
| 2949 | + rbd_assert(*result == obj_req->ex.oe_len); |
|---|
| 2950 | + *result = 0; |
|---|
| 2951 | + } |
|---|
| 2952 | + return true; |
|---|
| 2953 | + case RBD_OBJ_READ_PARENT: |
|---|
| 2954 | + /* |
|---|
| 2955 | + * The parent image is read only up to the overlap -- zero-fill |
|---|
| 2956 | + * from the overlap to the end of the request. |
|---|
| 2957 | + */ |
|---|
| 2958 | + if (!*result) { |
|---|
| 2959 | + u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req); |
|---|
| 2960 | + |
|---|
| 2961 | + if (obj_overlap < obj_req->ex.oe_len) |
|---|
| 2962 | + rbd_obj_zero_range(obj_req, obj_overlap, |
|---|
| 2963 | + obj_req->ex.oe_len - obj_overlap); |
|---|
| 2964 | + } |
|---|
| 2965 | + return true; |
|---|
| 2966 | + default: |
|---|
| 2967 | + BUG(); |
|---|
| 2968 | + } |
|---|
| 2969 | +} |
|---|
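
Concretely, for an 8 KiB object extent: -ENOENT zero-fills all of it, a 4 KiB short read zero-fills the trailing 4 KiB, and a full read fills nothing. The tail-fill rule in isolation, as a userspace sketch:

```c
/* Models the zero-fill arithmetic in the RBD_OBJ_READ_OBJECT case. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* bytes that must be zeroed at the tail of the object extent */
static uint32_t tail_zero_bytes(int result, uint32_t oe_len)
{
	if (result == -ENOENT)			/* hole in the image */
		return oe_len;
	if (result >= 0 && (uint32_t)result < oe_len)
		return oe_len - result;		/* short read */
	return 0;
}

int main(void)
{
	printf("%u %u %u\n",
	       tail_zero_bytes(-ENOENT, 8192),	/* 8192 */
	       tail_zero_bytes(4096, 8192),	/* 4096 */
	       tail_zero_bytes(8192, 8192));	/* 0 */
	return 0;
}
```
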
| 2970 | + |
|---|
| 2971 | +static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req) |
|---|
| 2972 | +{ |
|---|
| 2973 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 2974 | + |
|---|
| 2975 | + if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) |
|---|
| 2976 | + obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST; |
|---|
| 2977 | + |
|---|
| 2978 | + if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) && |
|---|
| 2979 | + (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) { |
|---|
| 2980 | + dout("%s %p noop for nonexistent\n", __func__, obj_req); |
|---|
| 2981 | + return true; |
|---|
| 2321 | 2982 | } |
|---|
| 2322 | 2983 | |
|---|
| 2323 | | - /* |
|---|
| 2324 | | - * -ENOENT means a hole in the image -- zero-fill the entire |
|---|
| 2325 | | - * length of the request. A short read also implies zero-fill |
|---|
| 2326 | | - * to the end of the request. In both cases we update xferred |
|---|
| 2327 | | - * count to indicate the whole request was satisfied. |
|---|
| 2328 | | - */ |
|---|
| 2329 | | - if (obj_req->result == -ENOENT || |
|---|
| 2330 | | - (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) { |
|---|
| 2331 | | - rbd_assert(!obj_req->xferred || !obj_req->result); |
|---|
| 2332 | | - rbd_obj_zero_range(obj_req, obj_req->xferred, |
|---|
| 2333 | | - obj_req->ex.oe_len - obj_req->xferred); |
|---|
| 2334 | | - obj_req->result = 0; |
|---|
| 2335 | | - obj_req->xferred = obj_req->ex.oe_len; |
|---|
| 2984 | + return false; |
|---|
| 2985 | +} |
|---|
| 2986 | + |
|---|
| 2987 | +/* |
|---|
| 2988 | + * Return: |
|---|
| 2989 | + * 0 - object map update sent |
|---|
| 2990 | + * 1 - object map update isn't needed |
|---|
| 2991 | + * <0 - error |
|---|
| 2992 | + */ |
|---|
| 2993 | +static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req) |
|---|
| 2994 | +{ |
|---|
| 2995 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 2996 | + u8 new_state; |
|---|
| 2997 | + |
|---|
| 2998 | + if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) |
|---|
| 2999 | + return 1; |
|---|
| 3000 | + |
|---|
| 3001 | + if (obj_req->flags & RBD_OBJ_FLAG_DELETION) |
|---|
| 3002 | + new_state = OBJECT_PENDING; |
|---|
| 3003 | + else |
|---|
| 3004 | + new_state = OBJECT_EXISTS; |
|---|
| 3005 | + |
|---|
| 3006 | + return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL); |
|---|
| 3007 | +} |
|---|
| 3008 | + |
|---|
| 3009 | +static int rbd_obj_write_object(struct rbd_obj_request *obj_req) |
|---|
| 3010 | +{ |
|---|
| 3011 | + struct ceph_osd_request *osd_req; |
|---|
| 3012 | + int num_ops = count_write_ops(obj_req); |
|---|
| 3013 | + int which = 0; |
|---|
| 3014 | + int ret; |
|---|
| 3015 | + |
|---|
| 3016 | + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) |
|---|
| 3017 | + num_ops++; /* stat */ |
|---|
| 3018 | + |
|---|
| 3019 | + osd_req = rbd_obj_add_osd_request(obj_req, num_ops); |
|---|
| 3020 | + if (IS_ERR(osd_req)) |
|---|
| 3021 | + return PTR_ERR(osd_req); |
|---|
| 3022 | + |
|---|
| 3023 | + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { |
|---|
| 3024 | + ret = rbd_osd_setup_stat(osd_req, which++); |
|---|
| 3025 | + if (ret) |
|---|
| 3026 | + return ret; |
|---|
| 2336 | 3027 | } |
|---|
| 2337 | 3028 | |
|---|
| 2338 | | - return true; |
|---|
| 3029 | + rbd_osd_setup_write_ops(osd_req, which); |
|---|
| 3030 | + rbd_osd_format_write(osd_req); |
|---|
| 3031 | + |
|---|
| 3032 | + ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); |
|---|
| 3033 | + if (ret) |
|---|
| 3034 | + return ret; |
|---|
| 3035 | + |
|---|
| 3036 | + rbd_osd_submit(osd_req); |
|---|
| 3037 | + return 0; |
|---|
| 2339 | 3038 | } |
|---|
| 2340 | 3039 | |
|---|
| 2341 | 3040 | /* |
|---|
| .. | .. |
|---|
| 2356 | 3055 | return true; |
|---|
| 2357 | 3056 | } |
|---|
| 2358 | 3057 | |
|---|
| 2359 | | -static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) |
|---|
| 3058 | +#define MODS_ONLY U32_MAX |
|---|
| 3059 | + |
|---|
| 3060 | +static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req, |
|---|
| 3061 | + u32 bytes) |
|---|
| 2360 | 3062 | { |
|---|
| 2361 | | - unsigned int num_osd_ops = obj_req->osd_req->r_num_ops; |
|---|
| 3063 | + struct ceph_osd_request *osd_req; |
|---|
| 2362 | 3064 | int ret; |
|---|
| 2363 | 3065 | |
|---|
| 2364 | 3066 | dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); |
|---|
| 2365 | | - rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); |
|---|
| 2366 | | - rbd_osd_req_destroy(obj_req->osd_req); |
|---|
| 3067 | + rbd_assert(bytes > 0 && bytes != MODS_ONLY); |
|---|
| 2367 | 3068 | |
|---|
| 2368 | | - /* |
|---|
| 2369 | | - * Create a copyup request with the same number of OSD ops as |
|---|
| 2370 | | - * the original request. The original request was stat + op(s), |
|---|
| 2371 | | - * the new copyup request will be copyup + the same op(s). |
|---|
| 2372 | | - */ |
|---|
| 2373 | | - obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); |
|---|
| 2374 | | - if (!obj_req->osd_req) |
|---|
| 2375 | | - return -ENOMEM; |
|---|
| 3069 | + osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1); |
|---|
| 3070 | + if (IS_ERR(osd_req)) |
|---|
| 3071 | + return PTR_ERR(osd_req); |
|---|
| 2376 | 3072 | |
|---|
| 2377 | | - ret = osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd", |
|---|
| 2378 | | - "copyup"); |
|---|
| 3073 | + ret = rbd_osd_setup_copyup(osd_req, 0, bytes); |
|---|
| 2379 | 3074 | if (ret) |
|---|
| 2380 | 3075 | return ret; |
|---|
| 2381 | 3076 | |
|---|
| 2382 | | - /* |
|---|
| 2383 | | - * Only send non-zero copyup data to save some I/O and network |
|---|
| 2384 | | - * bandwidth -- zero copyup data is equivalent to the object not |
|---|
| 2385 | | - * existing. |
|---|
| 2386 | | - */ |
|---|
| 2387 | | - if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { |
|---|
| 2388 | | - dout("%s obj_req %p detected zeroes\n", __func__, obj_req); |
|---|
| 2389 | | - bytes = 0; |
|---|
| 2390 | | - } |
|---|
| 2391 | | - osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, |
|---|
| 2392 | | - obj_req->copyup_bvecs, |
|---|
| 2393 | | - obj_req->copyup_bvec_count, |
|---|
| 2394 | | - bytes); |
|---|
| 3077 | + rbd_osd_format_write(osd_req); |
|---|
| 2395 | 3078 | |
|---|
| 2396 | | - switch (obj_req->img_request->op_type) { |
|---|
| 2397 | | - case OBJ_OP_WRITE: |
|---|
| 2398 | | - __rbd_obj_setup_write(obj_req, 1); |
|---|
| 2399 | | - break; |
|---|
| 2400 | | - case OBJ_OP_DISCARD: |
|---|
| 2401 | | - rbd_assert(!rbd_obj_is_entire(obj_req)); |
|---|
| 2402 | | - __rbd_obj_setup_discard(obj_req, 1); |
|---|
| 2403 | | - break; |
|---|
| 2404 | | - default: |
|---|
| 2405 | | - rbd_assert(0); |
|---|
| 3079 | + ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); |
|---|
| 3080 | + if (ret) |
|---|
| 3081 | + return ret; |
|---|
| 3082 | + |
|---|
| 3083 | + rbd_osd_submit(osd_req); |
|---|
| 3084 | + return 0; |
|---|
| 3085 | +} |
|---|
| 3086 | + |
|---|
| 3087 | +static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req, |
|---|
| 3088 | + u32 bytes) |
|---|
| 3089 | +{ |
|---|
| 3090 | + struct ceph_osd_request *osd_req; |
|---|
| 3091 | + int num_ops = count_write_ops(obj_req); |
|---|
| 3092 | + int which = 0; |
|---|
| 3093 | + int ret; |
|---|
| 3094 | + |
|---|
| 3095 | + dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); |
|---|
| 3096 | + |
|---|
| 3097 | + if (bytes != MODS_ONLY) |
|---|
| 3098 | + num_ops++; /* copyup */ |
|---|
| 3099 | + |
|---|
| 3100 | + osd_req = rbd_obj_add_osd_request(obj_req, num_ops); |
|---|
| 3101 | + if (IS_ERR(osd_req)) |
|---|
| 3102 | + return PTR_ERR(osd_req); |
|---|
| 3103 | + |
|---|
| 3104 | + if (bytes != MODS_ONLY) { |
|---|
| 3105 | + ret = rbd_osd_setup_copyup(osd_req, which++, bytes); |
|---|
| 3106 | + if (ret) |
|---|
| 3107 | + return ret; |
|---|
| 2406 | 3108 | } |
|---|
| 2407 | 3109 | |
|---|
| 2408 | | - rbd_obj_request_submit(obj_req); |
|---|
| 3110 | + rbd_osd_setup_write_ops(osd_req, which); |
|---|
| 3111 | + rbd_osd_format_write(osd_req); |
|---|
| 3112 | + |
|---|
| 3113 | + ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); |
|---|
| 3114 | + if (ret) |
|---|
| 3115 | + return ret; |
|---|
| 3116 | + |
|---|
| 3117 | + rbd_osd_submit(osd_req); |
|---|
| 2409 | 3118 | return 0; |
|---|
| 2410 | 3119 | } |
|---|
| 2411 | 3120 | |
|---|
| .. | .. |
|---|
| 2437 | 3146 | return 0; |
|---|
| 2438 | 3147 | } |
|---|
| 2439 | 3148 | |
|---|
| 2440 | | -static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) |
|---|
| 3149 | +/* |
|---|
| 3150 | + * The target object doesn't exist. Read the data for the entire |
|---|
| 3151 | + * target object up to the overlap point (if any) from the parent, |
|---|
| 3152 | + * so we can use it for a copyup. |
|---|
| 3153 | + */ |
|---|
| 3154 | +static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req) |
|---|
| 2441 | 3155 | { |
|---|
| 2442 | 3156 | struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 2443 | 3157 | int ret; |
|---|
| .. | .. |
|---|
| 2448 | 3162 | if (!obj_req->num_img_extents) { |
|---|
| 2449 | 3163 | /* |
|---|
| 2450 | 3164 | * The overlap has become 0 (most likely because the |
|---|
| 2451 | | - * image has been flattened). Use rbd_obj_issue_copyup() |
|---|
| 2452 | | - * to re-submit the original write request -- the copyup |
|---|
| 2453 | | - * operation itself will be a no-op, since someone must |
|---|
| 2454 | | - * have populated the child object while we weren't |
|---|
| 2455 | | - * looking. Move to WRITE_FLAT state as we'll be done |
|---|
| 2456 | | - * with the operation once the null copyup completes. |
|---|
| 3165 | + * image has been flattened). Re-submit the original write |
|---|
| 3166 | + * request -- pass MODS_ONLY since the copyup isn't needed |
|---|
| 3167 | + * anymore. |
|---|
| 2457 | 3168 | */ |
|---|
| 2458 | | - obj_req->write_state = RBD_OBJ_WRITE_FLAT; |
|---|
| 2459 | | - return rbd_obj_issue_copyup(obj_req, 0); |
|---|
| 3169 | + return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY); |
|---|
| 2460 | 3170 | } |
|---|
| 2461 | 3171 | |
|---|
| 2462 | 3172 | ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); |
|---|
| 2463 | 3173 | if (ret) |
|---|
| 2464 | 3174 | return ret; |
|---|
| 2465 | 3175 | |
|---|
| 2466 | | - obj_req->write_state = RBD_OBJ_WRITE_COPYUP; |
|---|
| 2467 | 3176 | return rbd_obj_read_from_parent(obj_req); |
|---|
| 2468 | 3177 | } |
|---|
| 2469 | 3178 | |
|---|
| 2470 | | -static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) |
|---|
| 3179 | +static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req) |
|---|
| 2471 | 3180 | { |
|---|
| 3181 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 3182 | + struct ceph_snap_context *snapc = obj_req->img_request->snapc; |
|---|
| 3183 | + u8 new_state; |
|---|
| 3184 | + u32 i; |
|---|
| 3185 | + int ret; |
|---|
| 3186 | + |
|---|
| 3187 | + rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending); |
|---|
| 3188 | + |
|---|
| 3189 | + if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) |
|---|
| 3190 | + return; |
|---|
| 3191 | + |
|---|
| 3192 | + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS) |
|---|
| 3193 | + return; |
|---|
| 3194 | + |
|---|
| 3195 | + for (i = 0; i < snapc->num_snaps; i++) { |
|---|
| 3196 | + if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) && |
|---|
| 3197 | + i + 1 < snapc->num_snaps) |
|---|
| 3198 | + new_state = OBJECT_EXISTS_CLEAN; |
|---|
| 3199 | + else |
|---|
| 3200 | + new_state = OBJECT_EXISTS; |
|---|
| 3201 | + |
|---|
| 3202 | + ret = rbd_object_map_update(obj_req, snapc->snaps[i], |
|---|
| 3203 | + new_state, NULL); |
|---|
| 3204 | + if (ret < 0) { |
|---|
| 3205 | + obj_req->pending.result = ret; |
|---|
| 3206 | + return; |
|---|
| 3207 | + } |
|---|
| 3208 | + |
|---|
| 3209 | + rbd_assert(!ret); |
|---|
| 3210 | + obj_req->pending.num_pending++; |
|---|
| 3211 | + } |
|---|
| 3212 | +} |
|---|
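
The object map updates fan out one request per snapshot and are folded back through obj_req->pending. A userspace model of that bookkeeping, assuming pending_result_dec() keeps the first nonzero result and signals completion when the count reaches zero (its definition sits outside this hunk):

```c
/* Fan-out/fan-in model of the pending counter used above. */
#include <stdbool.h>
#include <stdio.h>

struct batch {
	int result;		/* first nonzero completion result */
	int num_pending;
};

static bool batch_dec(struct batch *b, int *result)
{
	if (*result && !b->result)
		b->result = *result;
	if (--b->num_pending)
		return false;
	*result = b->result;	/* the whole batch's outcome */
	return true;
}

int main(void)
{
	struct batch b = { 0, 3 };	/* e.g. updates for 3 snapshots */
	int r;

	r = 0;
	batch_dec(&b, &r);
	r = -5;
	batch_dec(&b, &r);		/* first error wins */
	r = 0;
	if (batch_dec(&b, &r))
		printf("batch done, result %d\n", r);	/* -5 */
	return 0;
}
```
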
| 3213 | + |
|---|
| 3214 | +static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req) |
|---|
| 3215 | +{ |
|---|
| 3216 | + u32 bytes = rbd_obj_img_extents_bytes(obj_req); |
|---|
| 3217 | + int ret; |
|---|
| 3218 | + |
|---|
| 3219 | + rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending); |
|---|
| 3220 | + |
|---|
| 3221 | + /* |
|---|
| 3222 | + * Only send non-zero copyup data to save some I/O and network |
|---|
| 3223 | + * bandwidth -- zero copyup data is equivalent to the object not |
|---|
| 3224 | + * existing. |
|---|
| 3225 | + */ |
|---|
| 3226 | + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS) |
|---|
| 3227 | + bytes = 0; |
|---|
| 3228 | + |
|---|
| 3229 | + if (obj_req->img_request->snapc->num_snaps && bytes > 0) { |
|---|
| 3230 | + /* |
|---|
| 3231 | + * Send a copyup request with an empty snapshot context to |
|---|
| 3232 | + * deep-copyup the object through all existing snapshots. |
|---|
| 3233 | + * A second request with the current snapshot context will be |
|---|
| 3234 | + * sent for the actual modification. |
|---|
| 3235 | + */ |
|---|
| 3236 | + ret = rbd_obj_copyup_empty_snapc(obj_req, bytes); |
|---|
| 3237 | + if (ret) { |
|---|
| 3238 | + obj_req->pending.result = ret; |
|---|
| 3239 | + return; |
|---|
| 3240 | + } |
|---|
| 3241 | + |
|---|
| 3242 | + obj_req->pending.num_pending++; |
|---|
| 3243 | + bytes = MODS_ONLY; |
|---|
| 3244 | + } |
|---|
| 3245 | + |
|---|
| 3246 | + ret = rbd_obj_copyup_current_snapc(obj_req, bytes); |
|---|
| 3247 | + if (ret) { |
|---|
| 3248 | + obj_req->pending.result = ret; |
|---|
| 3249 | + return; |
|---|
| 3250 | + } |
|---|
| 3251 | + |
|---|
| 3252 | + obj_req->pending.num_pending++; |
|---|
| 3253 | +} |
|---|
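
Condensed, the fan-out above: a snapshotted clone with non-zero parent data issues two requests (deep-copyup under an empty snapc, then the modification under the current snapc with MODS_ONLY); otherwise a single request carries everything. A userspace sketch of the decision:

```c
/* Models rbd_obj_copyup_write_object()'s one-vs-two request fan-out. */
#include <stdbool.h>
#include <stdio.h>

#define MODS_ONLY 0xffffffffu	/* the U32_MAX sentinel used above */

static void copyup_requests(unsigned int num_snaps, bool zeros,
			    unsigned int bytes)
{
	int nreq = 0;

	if (zeros)
		bytes = 0;	/* zero copyup data == object absent */

	if (num_snaps && bytes > 0) {
		printf("req %d: empty snapc, copyup(%u)  /* deep-copyup */\n",
		       ++nreq, bytes);
		bytes = MODS_ONLY;	/* second request: mods only */
	}

	if (bytes == MODS_ONLY)
		printf("req %d: current snapc, write ops only\n", ++nreq);
	else
		printf("req %d: current snapc, copyup(%u) + write ops\n",
		       ++nreq, bytes);
}

int main(void)
{
	copyup_requests(2, false, 4096);	/* two requests */
	copyup_requests(0, false, 4096);	/* one request */
	copyup_requests(2, true, 4096);		/* zeros: one request */
	return 0;
}
```
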
| 3254 | + |
|---|
| 3255 | +static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result) |
|---|
| 3256 | +{ |
|---|
| 3257 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 2472 | 3258 | int ret; |
|---|
| 2473 | 3259 | |
|---|
| 2474 | 3260 | again: |
|---|
| 2475 | | - switch (obj_req->write_state) { |
|---|
| 2476 | | - case RBD_OBJ_WRITE_GUARD: |
|---|
| 2477 | | - rbd_assert(!obj_req->xferred); |
|---|
| 2478 | | - if (obj_req->result == -ENOENT) { |
|---|
| 2479 | | - /* |
|---|
| 2480 | | - * The target object doesn't exist. Read the data for |
|---|
| 2481 | | - * the entire target object up to the overlap point (if |
|---|
| 2482 | | - * any) from the parent, so we can use it for a copyup. |
|---|
| 2483 | | - */ |
|---|
| 2484 | | - ret = rbd_obj_handle_write_guard(obj_req); |
|---|
| 2485 | | - if (ret) { |
|---|
| 2486 | | - obj_req->result = ret; |
|---|
| 2487 | | - return true; |
|---|
| 2488 | | - } |
|---|
| 2489 | | - return false; |
|---|
| 2490 | | - } |
|---|
| 2491 | | - /* fall through */ |
|---|
| 2492 | | - case RBD_OBJ_WRITE_FLAT: |
|---|
| 2493 | | - if (!obj_req->result) |
|---|
| 2494 | | - /* |
|---|
| 2495 | | - * There is no such thing as a successful short |
|---|
| 2496 | | - * write -- indicate the whole request was satisfied. |
|---|
| 2497 | | - */ |
|---|
| 2498 | | - obj_req->xferred = obj_req->ex.oe_len; |
|---|
| 2499 | | - return true; |
|---|
| 2500 | | - case RBD_OBJ_WRITE_COPYUP: |
|---|
| 2501 | | - obj_req->write_state = RBD_OBJ_WRITE_GUARD; |
|---|
| 2502 | | - if (obj_req->result) |
|---|
| 2503 | | - goto again; |
|---|
| 3261 | + switch (obj_req->copyup_state) { |
|---|
| 3262 | + case RBD_OBJ_COPYUP_START: |
|---|
| 3263 | + rbd_assert(!*result); |
|---|
| 2504 | 3264 | |
|---|
| 2505 | | - rbd_assert(obj_req->xferred); |
|---|
| 2506 | | - ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); |
|---|
| 3265 | + ret = rbd_obj_copyup_read_parent(obj_req); |
|---|
| 2507 | 3266 | if (ret) { |
|---|
| 2508 | | - obj_req->result = ret; |
|---|
| 2509 | | - obj_req->xferred = 0; |
|---|
| 3267 | + *result = ret; |
|---|
| 2510 | 3268 | return true; |
|---|
| 2511 | 3269 | } |
|---|
| 3270 | + if (obj_req->num_img_extents) |
|---|
| 3271 | + obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT; |
|---|
| 3272 | + else |
|---|
| 3273 | + obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT; |
|---|
| 2512 | 3274 | return false; |
|---|
| 3275 | + case RBD_OBJ_COPYUP_READ_PARENT: |
|---|
| 3276 | + if (*result) |
|---|
| 3277 | + return true; |
|---|
| 3278 | + |
|---|
| 3279 | + if (is_zero_bvecs(obj_req->copyup_bvecs, |
|---|
| 3280 | + rbd_obj_img_extents_bytes(obj_req))) { |
|---|
| 3281 | + dout("%s %p detected zeros\n", __func__, obj_req); |
|---|
| 3282 | + obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS; |
|---|
| 3283 | + } |
|---|
| 3284 | + |
|---|
| 3285 | + rbd_obj_copyup_object_maps(obj_req); |
|---|
| 3286 | + if (!obj_req->pending.num_pending) { |
|---|
| 3287 | + *result = obj_req->pending.result; |
|---|
| 3288 | + obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS; |
|---|
| 3289 | + goto again; |
|---|
| 3290 | + } |
|---|
| 3291 | + obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS; |
|---|
| 3292 | + return false; |
|---|
| 3293 | + case __RBD_OBJ_COPYUP_OBJECT_MAPS: |
|---|
| 3294 | + if (!pending_result_dec(&obj_req->pending, result)) |
|---|
| 3295 | + return false; |
|---|
| 3296 | + fallthrough; |
|---|
| 3297 | + case RBD_OBJ_COPYUP_OBJECT_MAPS: |
|---|
| 3298 | + if (*result) { |
|---|
| 3299 | + rbd_warn(rbd_dev, "snap object map update failed: %d", |
|---|
| 3300 | + *result); |
|---|
| 3301 | + return true; |
|---|
| 3302 | + } |
|---|
| 3303 | + |
|---|
| 3304 | + rbd_obj_copyup_write_object(obj_req); |
|---|
| 3305 | + if (!obj_req->pending.num_pending) { |
|---|
| 3306 | + *result = obj_req->pending.result; |
|---|
| 3307 | + obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT; |
|---|
| 3308 | + goto again; |
|---|
| 3309 | + } |
|---|
| 3310 | + obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT; |
|---|
| 3311 | + return false; |
|---|
| 3312 | + case __RBD_OBJ_COPYUP_WRITE_OBJECT: |
|---|
| 3313 | + if (!pending_result_dec(&obj_req->pending, result)) |
|---|
| 3314 | + return false; |
|---|
| 3315 | + fallthrough; |
|---|
| 3316 | + case RBD_OBJ_COPYUP_WRITE_OBJECT: |
|---|
| 3317 | + return true; |
|---|
| 2513 | 3318 | default: |
|---|
| 2514 | 3319 | BUG(); |
|---|
| 2515 | 3320 | } |
|---|
| 2516 | 3321 | } |
|---|
| 2517 | 3322 | |
|---|
| 2518 | 3323 | /* |
|---|
| 2519 | | - * Returns true if @obj_req is completed, or false otherwise. |
|---|
| 3324 | + * Return: |
|---|
| 3325 | + * 0 - object map update sent |
|---|
| 3326 | + * 1 - object map update isn't needed |
|---|
| 3327 | + * <0 - error |
|---|
| 2520 | 3328 | */ |
|---|
| 2521 | | -static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) |
|---|
| 3329 | +static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req) |
|---|
| 2522 | 3330 | { |
|---|
| 2523 | | - switch (obj_req->img_request->op_type) { |
|---|
| 2524 | | - case OBJ_OP_READ: |
|---|
| 2525 | | - return rbd_obj_handle_read(obj_req); |
|---|
| 2526 | | - case OBJ_OP_WRITE: |
|---|
| 2527 | | - return rbd_obj_handle_write(obj_req); |
|---|
| 2528 | | - case OBJ_OP_DISCARD: |
|---|
| 2529 | | - if (rbd_obj_handle_write(obj_req)) { |
|---|
| 2530 | | - /* |
|---|
| 2531 | | - * Hide -ENOENT from delete/truncate/zero -- discarding |
|---|
| 2532 | | - * a non-existent object is not a problem. |
|---|
| 2533 | | - */ |
|---|
| 2534 | | - if (obj_req->result == -ENOENT) { |
|---|
| 2535 | | - obj_req->result = 0; |
|---|
| 2536 | | - obj_req->xferred = obj_req->ex.oe_len; |
|---|
| 2537 | | - } |
|---|
| 3331 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 3332 | + u8 current_state = OBJECT_PENDING; |
|---|
| 3333 | + |
|---|
| 3334 | + if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) |
|---|
| 3335 | + return 1; |
|---|
| 3336 | + |
|---|
| 3337 | + if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION)) |
|---|
| 3338 | + return 1; |
|---|
| 3339 | + |
|---|
| 3340 | + return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT, |
|---|
| 3341 | + &current_state);
|---|
| 3342 | +} |
|---|
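
The `Return:` block above documents a convention several helpers in this patch share (`rbd_obj_write_pre_object_map()` and `rbd_img_exclusive_lock()` use it too): `<0` is an error, `1` means nothing was sent so the caller may advance immediately, and `0` means a reply will arrive asynchronously. A sketch of the call-site shape, with a hypothetical `send_update()` standing in for the real helper:

```c
/* Illustrative only: dispatching on the <0 / 0 / 1 convention. */
#include <stdio.h>

static int send_update(int scenario)
{
	return scenario;	/* -1: error, 0: update sent, 1: not needed */
}

int main(void)
{
	for (int scenario = -1; scenario <= 1; scenario++) {
		int ret = send_update(scenario);

		if (ret < 0)
			printf("%2d: fail the request\n", ret);
		else if (ret > 0)
			printf("%2d: advance the state machine now (goto again)\n", ret);
		else
			printf("%2d: wait for the completion callback\n", ret);
	}
	return 0;
}
```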
| 3343 | + |
|---|
| 3344 | +static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result) |
|---|
| 3345 | +{ |
|---|
| 3346 | + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; |
|---|
| 3347 | + int ret; |
|---|
| 3348 | + |
|---|
| 3349 | +again: |
|---|
| 3350 | + switch (obj_req->write_state) { |
|---|
| 3351 | + case RBD_OBJ_WRITE_START: |
|---|
| 3352 | + rbd_assert(!*result); |
|---|
| 3353 | + |
|---|
| 3354 | + if (rbd_obj_write_is_noop(obj_req)) |
|---|
| 3355 | + return true; |
|---|
| 3356 | + |
|---|
| 3357 | + ret = rbd_obj_write_pre_object_map(obj_req); |
|---|
| 3358 | + if (ret < 0) { |
|---|
| 3359 | + *result = ret; |
|---|
| 2538 | 3360 | return true; |
|---|
| 2539 | 3361 | } |
|---|
| 3362 | + obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP; |
|---|
| 3363 | + if (ret > 0) |
|---|
| 3364 | + goto again; |
|---|
| 2540 | 3365 | return false; |
|---|
| 3366 | + case RBD_OBJ_WRITE_PRE_OBJECT_MAP: |
|---|
| 3367 | + if (*result) { |
|---|
| 3368 | + rbd_warn(rbd_dev, "pre object map update failed: %d", |
|---|
| 3369 | + *result); |
|---|
| 3370 | + return true; |
|---|
| 3371 | + } |
|---|
| 3372 | + ret = rbd_obj_write_object(obj_req); |
|---|
| 3373 | + if (ret) { |
|---|
| 3374 | + *result = ret; |
|---|
| 3375 | + return true; |
|---|
| 3376 | + } |
|---|
| 3377 | + obj_req->write_state = RBD_OBJ_WRITE_OBJECT; |
|---|
| 3378 | + return false; |
|---|
| 3379 | + case RBD_OBJ_WRITE_OBJECT: |
|---|
| 3380 | + if (*result == -ENOENT) { |
|---|
| 3381 | + if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { |
|---|
| 3382 | + *result = 0; |
|---|
| 3383 | + obj_req->copyup_state = RBD_OBJ_COPYUP_START; |
|---|
| 3384 | + obj_req->write_state = __RBD_OBJ_WRITE_COPYUP; |
|---|
| 3385 | + goto again; |
|---|
| 3386 | + } |
|---|
| 3387 | + /* |
|---|
| 3388 | + * On a non-existent object: delete returns -ENOENT,
|---|
| 3389 | + * truncate and zero return 0.
|---|
| 3390 | + */ |
|---|
| 3391 | + if (obj_req->flags & RBD_OBJ_FLAG_DELETION) |
|---|
| 3392 | + *result = 0; |
|---|
| 3393 | + } |
|---|
| 3394 | + if (*result) |
|---|
| 3395 | + return true; |
|---|
| 3396 | + |
|---|
| 3397 | + obj_req->write_state = RBD_OBJ_WRITE_COPYUP; |
|---|
| 3398 | + goto again; |
|---|
| 3399 | + case __RBD_OBJ_WRITE_COPYUP: |
|---|
| 3400 | + if (!rbd_obj_advance_copyup(obj_req, result)) |
|---|
| 3401 | + return false; |
|---|
| 3402 | + fallthrough; |
|---|
| 3403 | + case RBD_OBJ_WRITE_COPYUP: |
|---|
| 3404 | + if (*result) { |
|---|
| 3405 | + rbd_warn(rbd_dev, "copyup failed: %d", *result); |
|---|
| 3406 | + return true; |
|---|
| 3407 | + } |
|---|
| 3408 | + ret = rbd_obj_write_post_object_map(obj_req); |
|---|
| 3409 | + if (ret < 0) { |
|---|
| 3410 | + *result = ret; |
|---|
| 3411 | + return true; |
|---|
| 3412 | + } |
|---|
| 3413 | + obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP; |
|---|
| 3414 | + if (ret > 0) |
|---|
| 3415 | + goto again; |
|---|
| 3416 | + return false; |
|---|
| 3417 | + case RBD_OBJ_WRITE_POST_OBJECT_MAP: |
|---|
| 3418 | + if (*result) |
|---|
| 3419 | + rbd_warn(rbd_dev, "post object map update failed: %d", |
|---|
| 3420 | + *result); |
|---|
| 3421 | + return true; |
|---|
| 2541 | 3422 | default: |
|---|
| 2542 | 3423 | BUG(); |
|---|
| 2543 | 3424 | } |
|---|
| 2544 | 3425 | } |
|---|
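
`rbd_obj_advance_write()` above is one of several `*_advance_*()` functions in this patch with the same calling convention: `*result` carries in the outcome of whatever event woke the request, the return value says whether a terminal state was reached, and `goto again` collapses steps that completed synchronously. A stripped-down userspace model of the convention; the states and names are simplified stand-ins, not the kernel's:

```c
#include <stdbool.h>
#include <stdio.h>

enum state { START = 1, OBJECT, DONE };

struct req { enum state state; };

static bool advance(struct req *req, int *result)
{
again:
	switch (req->state) {
	case START:
		/* submit I/O here; assume submission succeeded */
		req->state = OBJECT;
		return false;		/* wait for the completion */
	case OBJECT:
		if (*result)
			return true;	/* error: terminal, *result stands */
		req->state = DONE;
		goto again;		/* nothing async left, fall through */
	case DONE:
		return true;
	}
	return true;
}

int main(void)
{
	struct req req = { .state = START };
	int result = 0;

	advance(&req, &result);		/* kicks off the I/O */
	result = 0;			/* completion callback delivers 0 */
	if (advance(&req, &result))
		printf("completed, result %d\n", result);
	return 0;
}
```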
| 2545 | 3426 | |
|---|
| 2546 | | -static void rbd_obj_end_request(struct rbd_obj_request *obj_req) |
|---|
| 3427 | +/* |
|---|
| 3428 | + * Return true if @obj_req is completed. |
|---|
| 3429 | + */ |
|---|
| 3430 | +static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req, |
|---|
| 3431 | + int *result) |
|---|
| 2547 | 3432 | { |
|---|
| 2548 | 3433 | struct rbd_img_request *img_req = obj_req->img_request; |
|---|
| 3434 | + struct rbd_device *rbd_dev = img_req->rbd_dev; |
|---|
| 3435 | + bool done; |
|---|
| 2549 | 3436 | |
|---|
| 2550 | | - rbd_assert((!obj_req->result && |
|---|
| 2551 | | - obj_req->xferred == obj_req->ex.oe_len) || |
|---|
| 2552 | | - (obj_req->result < 0 && !obj_req->xferred)); |
|---|
| 2553 | | - if (!obj_req->result) { |
|---|
| 2554 | | - img_req->xferred += obj_req->xferred; |
|---|
| 2555 | | - return; |
|---|
| 2556 | | - } |
|---|
| 3437 | + mutex_lock(&obj_req->state_mutex); |
|---|
| 3438 | + if (!rbd_img_is_write(img_req)) |
|---|
| 3439 | + done = rbd_obj_advance_read(obj_req, result); |
|---|
| 3440 | + else |
|---|
| 3441 | + done = rbd_obj_advance_write(obj_req, result); |
|---|
| 3442 | + mutex_unlock(&obj_req->state_mutex); |
|---|
| 2557 | 3443 | |
|---|
| 2558 | | - rbd_warn(img_req->rbd_dev, |
|---|
| 2559 | | - "%s at objno %llu %llu~%llu result %d xferred %llu", |
|---|
| 2560 | | - obj_op_name(img_req->op_type), obj_req->ex.oe_objno, |
|---|
| 2561 | | - obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result, |
|---|
| 2562 | | - obj_req->xferred); |
|---|
| 2563 | | - if (!img_req->result) { |
|---|
| 2564 | | - img_req->result = obj_req->result; |
|---|
| 2565 | | - img_req->xferred = 0; |
|---|
| 3444 | + if (done && *result) { |
|---|
| 3445 | + rbd_assert(*result < 0); |
|---|
| 3446 | + rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d", |
|---|
| 3447 | + obj_op_name(img_req->op_type), obj_req->ex.oe_objno, |
|---|
| 3448 | + obj_req->ex.oe_off, obj_req->ex.oe_len, *result); |
|---|
| 2566 | 3449 | } |
|---|
| 3450 | + return done; |
|---|
| 2567 | 3451 | } |
|---|
| 2568 | 3452 | |
|---|
| 2569 | | -static void rbd_img_end_child_request(struct rbd_img_request *img_req) |
|---|
| 3453 | +/* |
|---|
| 3454 | + * This is open-coded in rbd_img_handle_request() to avoid parent chain |
|---|
| 3455 | + * recursion. |
|---|
| 3456 | + */ |
|---|
| 3457 | +static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result) |
|---|
| 2570 | 3458 | { |
|---|
| 2571 | | - struct rbd_obj_request *obj_req = img_req->obj_request; |
|---|
| 2572 | | - |
|---|
| 2573 | | - rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); |
|---|
| 2574 | | - rbd_assert((!img_req->result && |
|---|
| 2575 | | - img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) || |
|---|
| 2576 | | - (img_req->result < 0 && !img_req->xferred)); |
|---|
| 2577 | | - |
|---|
| 2578 | | - obj_req->result = img_req->result; |
|---|
| 2579 | | - obj_req->xferred = img_req->xferred; |
|---|
| 2580 | | - rbd_img_request_put(img_req); |
|---|
| 3459 | + if (__rbd_obj_handle_request(obj_req, &result)) |
|---|
| 3460 | + rbd_img_handle_request(obj_req->img_request, result); |
|---|
| 2581 | 3461 | } |
|---|
| 2582 | 3462 | |
|---|
| 2583 | | -static void rbd_img_end_request(struct rbd_img_request *img_req) |
|---|
| 3463 | +static bool need_exclusive_lock(struct rbd_img_request *img_req) |
|---|
| 2584 | 3464 | { |
|---|
| 3465 | + struct rbd_device *rbd_dev = img_req->rbd_dev; |
|---|
| 3466 | + |
|---|
| 3467 | + if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) |
|---|
| 3468 | + return false; |
|---|
| 3469 | + |
|---|
| 3470 | + if (rbd_is_ro(rbd_dev)) |
|---|
| 3471 | + return false; |
|---|
| 3472 | + |
|---|
| 2585 | 3473 | rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); |
|---|
| 2586 | | - rbd_assert((!img_req->result && |
|---|
| 2587 | | - img_req->xferred == blk_rq_bytes(img_req->rq)) || |
|---|
| 2588 | | - (img_req->result < 0 && !img_req->xferred)); |
|---|
| 3474 | + if (rbd_dev->opts->lock_on_read || |
|---|
| 3475 | + (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) |
|---|
| 3476 | + return true; |
|---|
| 2589 | 3477 | |
|---|
| 2590 | | - blk_mq_end_request(img_req->rq, |
|---|
| 2591 | | - errno_to_blk_status(img_req->result)); |
|---|
| 2592 | | - rbd_img_request_put(img_req); |
|---|
| 3478 | + return rbd_img_is_write(img_req); |
|---|
| 2593 | 3479 | } |
|---|
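
Condensing the predicate above: images without the feature, or mapped read-only, never take the lock; otherwise writes always need it, and reads need it only with `lock_on_read` or an object map (which is loaded and maintained only by the lock owner, per rbd_post_acquire_action() below). A pure-function restatement, with plain booleans standing in for the feature bits and options:

```c
/* Readability aid only; mirrors the kernel logic above. */
#include <stdbool.h>
#include <stdio.h>

static bool need_exclusive_lock(bool excl_lock_feature, bool dev_read_only,
				bool lock_on_read, bool object_map_feature,
				bool is_write)
{
	if (!excl_lock_feature || dev_read_only)
		return false;
	if (lock_on_read || object_map_feature)
		return true;	/* even reads take the lock */
	return is_write;
}

int main(void)
{
	/* a plain read on an object-map image still takes the lock */
	printf("%d\n", need_exclusive_lock(true, false, false, true, false));
	return 0;
}
```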
| 2594 | 3480 | |
|---|
| 2595 | | -static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) |
|---|
| 3481 | +static bool rbd_lock_add_request(struct rbd_img_request *img_req) |
|---|
| 2596 | 3482 | { |
|---|
| 2597 | | - struct rbd_img_request *img_req; |
|---|
| 3483 | + struct rbd_device *rbd_dev = img_req->rbd_dev; |
|---|
| 3484 | + bool locked; |
|---|
| 3485 | + |
|---|
| 3486 | + lockdep_assert_held(&rbd_dev->lock_rwsem); |
|---|
| 3487 | + locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED; |
|---|
| 3488 | + spin_lock(&rbd_dev->lock_lists_lock); |
|---|
| 3489 | + rbd_assert(list_empty(&img_req->lock_item)); |
|---|
| 3490 | + if (!locked) |
|---|
| 3491 | + list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list); |
|---|
| 3492 | + else |
|---|
| 3493 | + list_add_tail(&img_req->lock_item, &rbd_dev->running_list); |
|---|
| 3494 | + spin_unlock(&rbd_dev->lock_lists_lock); |
|---|
| 3495 | + return locked; |
|---|
| 3496 | +} |
|---|
| 3497 | + |
|---|
| 3498 | +static void rbd_lock_del_request(struct rbd_img_request *img_req) |
|---|
| 3499 | +{ |
|---|
| 3500 | + struct rbd_device *rbd_dev = img_req->rbd_dev; |
|---|
| 3501 | + bool need_wakeup; |
|---|
| 3502 | + |
|---|
| 3503 | + lockdep_assert_held(&rbd_dev->lock_rwsem); |
|---|
| 3504 | + spin_lock(&rbd_dev->lock_lists_lock); |
|---|
| 3505 | + rbd_assert(!list_empty(&img_req->lock_item)); |
|---|
| 3506 | + list_del_init(&img_req->lock_item); |
|---|
| 3507 | + need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && |
|---|
| 3508 | + list_empty(&rbd_dev->running_list)); |
|---|
| 3509 | + spin_unlock(&rbd_dev->lock_lists_lock); |
|---|
| 3510 | + if (need_wakeup) |
|---|
| 3511 | + complete(&rbd_dev->releasing_wait); |
|---|
| 3512 | +} |
|---|
| 3513 | + |
|---|
| 3514 | +static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) |
|---|
| 3515 | +{ |
|---|
| 3516 | + struct rbd_device *rbd_dev = img_req->rbd_dev; |
|---|
| 3517 | + |
|---|
| 3518 | + if (!need_exclusive_lock(img_req)) |
|---|
| 3519 | + return 1; |
|---|
| 3520 | + |
|---|
| 3521 | + if (rbd_lock_add_request(img_req)) |
|---|
| 3522 | + return 1; |
|---|
| 3523 | + |
|---|
| 3524 | + if (rbd_dev->opts->exclusive) { |
|---|
| 3525 | + WARN_ON(1); /* lock got released? */ |
|---|
| 3526 | + return -EROFS; |
|---|
| 3527 | + } |
|---|
| 3528 | + |
|---|
| 3529 | + /* |
|---|
| 3530 | + * Note the use of mod_delayed_work() in rbd_acquire_lock() |
|---|
| 3531 | + * and cancel_delayed_work() in wake_lock_waiters(). |
|---|
| 3532 | + */ |
|---|
| 3533 | + dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); |
|---|
| 3534 | + queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); |
|---|
| 3535 | + return 0; |
|---|
| 3536 | +} |
|---|
| 3537 | + |
|---|
| 3538 | +static void rbd_img_object_requests(struct rbd_img_request *img_req) |
|---|
| 3539 | +{ |
|---|
| 3540 | + struct rbd_obj_request *obj_req; |
|---|
| 3541 | + |
|---|
| 3542 | + rbd_assert(!img_req->pending.result && !img_req->pending.num_pending); |
|---|
| 3543 | + |
|---|
| 3544 | + for_each_obj_request(img_req, obj_req) { |
|---|
| 3545 | + int result = 0; |
|---|
| 3546 | + |
|---|
| 3547 | + if (__rbd_obj_handle_request(obj_req, &result)) { |
|---|
| 3548 | + if (result) { |
|---|
| 3549 | + img_req->pending.result = result; |
|---|
| 3550 | + return; |
|---|
| 3551 | + } |
|---|
| 3552 | + } else { |
|---|
| 3553 | + img_req->pending.num_pending++; |
|---|
| 3554 | + } |
|---|
| 3555 | + } |
|---|
| 3556 | +} |
|---|
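
`rbd_img_object_requests()` is the fan-out half of the pending_result pattern sketched earlier: every object request that goes asynchronous bumps `num_pending`, a synchronous success contributes nothing, and a synchronous failure latches the result and stops issuing. In miniature (illustrative, not kernel code):

```c
#include <stdbool.h>
#include <stdio.h>

struct pending_result {
	int result;
	int num_pending;
};

/* Hypothetical: true = completed synchronously with *result set. */
static bool kick(int i, int *result)
{
	*result = 0;
	return (i % 2) == 0;	/* even sub-requests finish immediately */
}

int main(void)
{
	struct pending_result pending = { 0, 0 };

	for (int i = 0; i < 4; i++) {
		int result;

		if (kick(i, &result)) {
			if (result) {
				pending.result = result;
				break;	/* synchronous failure: stop */
			}
		} else {
			pending.num_pending++;
		}
	}
	printf("%d still in flight\n", pending.num_pending);	/* 2 */
	return 0;
}
```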
| 3557 | + |
|---|
| 3558 | +static bool rbd_img_advance(struct rbd_img_request *img_req, int *result) |
|---|
| 3559 | +{ |
|---|
| 3560 | + struct rbd_device *rbd_dev = img_req->rbd_dev; |
|---|
| 3561 | + int ret; |
|---|
| 2598 | 3562 | |
|---|
| 2599 | 3563 | again: |
|---|
| 2600 | | - if (!__rbd_obj_handle_request(obj_req)) |
|---|
| 2601 | | - return; |
|---|
| 3564 | + switch (img_req->state) { |
|---|
| 3565 | + case RBD_IMG_START: |
|---|
| 3566 | + rbd_assert(!*result); |
|---|
| 2602 | 3567 | |
|---|
| 2603 | | - img_req = obj_req->img_request; |
|---|
| 2604 | | - spin_lock(&img_req->completion_lock); |
|---|
| 2605 | | - rbd_obj_end_request(obj_req); |
|---|
| 2606 | | - rbd_assert(img_req->pending_count); |
|---|
| 2607 | | - if (--img_req->pending_count) { |
|---|
| 2608 | | - spin_unlock(&img_req->completion_lock); |
|---|
| 2609 | | - return; |
|---|
| 3568 | + ret = rbd_img_exclusive_lock(img_req); |
|---|
| 3569 | + if (ret < 0) { |
|---|
| 3570 | + *result = ret; |
|---|
| 3571 | + return true; |
|---|
| 3572 | + } |
|---|
| 3573 | + img_req->state = RBD_IMG_EXCLUSIVE_LOCK; |
|---|
| 3574 | + if (ret > 0) |
|---|
| 3575 | + goto again; |
|---|
| 3576 | + return false; |
|---|
| 3577 | + case RBD_IMG_EXCLUSIVE_LOCK: |
|---|
| 3578 | + if (*result) |
|---|
| 3579 | + return true; |
|---|
| 3580 | + |
|---|
| 3581 | + rbd_assert(!need_exclusive_lock(img_req) || |
|---|
| 3582 | + __rbd_is_lock_owner(rbd_dev)); |
|---|
| 3583 | + |
|---|
| 3584 | + rbd_img_object_requests(img_req); |
|---|
| 3585 | + if (!img_req->pending.num_pending) { |
|---|
| 3586 | + *result = img_req->pending.result; |
|---|
| 3587 | + img_req->state = RBD_IMG_OBJECT_REQUESTS; |
|---|
| 3588 | + goto again; |
|---|
| 3589 | + } |
|---|
| 3590 | + img_req->state = __RBD_IMG_OBJECT_REQUESTS; |
|---|
| 3591 | + return false; |
|---|
| 3592 | + case __RBD_IMG_OBJECT_REQUESTS: |
|---|
| 3593 | + if (!pending_result_dec(&img_req->pending, result)) |
|---|
| 3594 | + return false; |
|---|
| 3595 | + fallthrough; |
|---|
| 3596 | + case RBD_IMG_OBJECT_REQUESTS: |
|---|
| 3597 | + return true; |
|---|
| 3598 | + default: |
|---|
| 3599 | + BUG(); |
|---|
| 3600 | + } |
|---|
| 3601 | +} |
|---|
| 3602 | + |
|---|
| 3603 | +/* |
|---|
| 3604 | + * Return true if @img_req is completed. |
|---|
| 3605 | + */ |
|---|
| 3606 | +static bool __rbd_img_handle_request(struct rbd_img_request *img_req, |
|---|
| 3607 | + int *result) |
|---|
| 3608 | +{ |
|---|
| 3609 | + struct rbd_device *rbd_dev = img_req->rbd_dev; |
|---|
| 3610 | + bool done; |
|---|
| 3611 | + |
|---|
| 3612 | + if (need_exclusive_lock(img_req)) { |
|---|
| 3613 | + down_read(&rbd_dev->lock_rwsem); |
|---|
| 3614 | + mutex_lock(&img_req->state_mutex); |
|---|
| 3615 | + done = rbd_img_advance(img_req, result); |
|---|
| 3616 | + if (done) |
|---|
| 3617 | + rbd_lock_del_request(img_req); |
|---|
| 3618 | + mutex_unlock(&img_req->state_mutex); |
|---|
| 3619 | + up_read(&rbd_dev->lock_rwsem); |
|---|
| 3620 | + } else { |
|---|
| 3621 | + mutex_lock(&img_req->state_mutex); |
|---|
| 3622 | + done = rbd_img_advance(img_req, result); |
|---|
| 3623 | + mutex_unlock(&img_req->state_mutex); |
|---|
| 2610 | 3624 | } |
|---|
| 2611 | 3625 | |
|---|
| 2612 | | - spin_unlock(&img_req->completion_lock); |
|---|
| 3626 | + if (done && *result) { |
|---|
| 3627 | + rbd_assert(*result < 0); |
|---|
| 3628 | + rbd_warn(rbd_dev, "%s%s result %d", |
|---|
| 3629 | + test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "", |
|---|
| 3630 | + obj_op_name(img_req->op_type), *result); |
|---|
| 3631 | + } |
|---|
| 3632 | + return done; |
|---|
| 3633 | +} |
|---|
| 3634 | + |
|---|
| 3635 | +static void rbd_img_handle_request(struct rbd_img_request *img_req, int result) |
|---|
| 3636 | +{ |
|---|
| 3637 | +again: |
|---|
| 3638 | + if (!__rbd_img_handle_request(img_req, &result)) |
|---|
| 3639 | + return; |
|---|
| 3640 | + |
|---|
| 2613 | 3641 | if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { |
|---|
| 2614 | | - obj_req = img_req->obj_request; |
|---|
| 2615 | | - rbd_img_end_child_request(img_req); |
|---|
| 2616 | | - goto again; |
|---|
| 3642 | + struct rbd_obj_request *obj_req = img_req->obj_request; |
|---|
| 3643 | + |
|---|
| 3644 | + rbd_img_request_destroy(img_req); |
|---|
| 3645 | + if (__rbd_obj_handle_request(obj_req, &result)) { |
|---|
| 3646 | + img_req = obj_req->img_request; |
|---|
| 3647 | + goto again; |
|---|
| 3648 | + } |
|---|
| 3649 | + } else { |
|---|
| 3650 | + struct request *rq = blk_mq_rq_from_pdu(img_req); |
|---|
| 3651 | + |
|---|
| 3652 | + rbd_img_request_destroy(img_req); |
|---|
| 3653 | + blk_mq_end_request(rq, errno_to_blk_status(result)); |
|---|
| 2617 | 3654 | } |
|---|
| 2618 | | - rbd_img_end_request(img_req); |
|---|
| 2619 | 3655 | } |
|---|
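
The `goto again` loop above is what the earlier "open-coded to avoid parent chain recursion" comment refers to: when a child image request completes, its owning object request is advanced in place, and if that in turn finishes the parent image request, the loop iterates instead of recursing, so a deep clone chain costs constant stack. A toy illustration with simplified structs:

```c
#include <stdio.h>

struct req {
	struct req *parent;	/* NULL at the top of the chain */
	const char *name;
};

static void handle(struct req *req, int result)
{
again:
	printf("completing %s (result %d)\n", req->name, result);
	if (req->parent) {
		req = req->parent;	/* iterate, don't recurse */
		goto again;
	}
}

int main(void)
{
	struct req top = { NULL, "top" };
	struct req mid = { &top, "mid" };
	struct req leaf = { &mid, "leaf" };

	handle(&leaf, 0);
	return 0;
}
```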
| 2620 | 3656 | |
|---|
| 2621 | 3657 | static const struct rbd_client_id rbd_empty_cid; |
|---|
| .. | .. |
|---|
| 2660 | 3696 | { |
|---|
| 2661 | 3697 | struct rbd_client_id cid = rbd_get_cid(rbd_dev); |
|---|
| 2662 | 3698 | |
|---|
| 3699 | + rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; |
|---|
| 2663 | 3700 | strcpy(rbd_dev->lock_cookie, cookie); |
|---|
| 2664 | 3701 | rbd_set_owner_cid(rbd_dev, &cid); |
|---|
| 2665 | 3702 | queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); |
|---|
| .. | .. |
|---|
| 2684 | 3721 | if (ret) |
|---|
| 2685 | 3722 | return ret; |
|---|
| 2686 | 3723 | |
|---|
| 2687 | | - rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; |
|---|
| 2688 | 3724 | __rbd_lock(rbd_dev, cookie); |
|---|
| 2689 | 3725 | return 0; |
|---|
| 2690 | 3726 | } |
|---|
| .. | .. |
|---|
| 2703 | 3739 | ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, |
|---|
| 2704 | 3740 | RBD_LOCK_NAME, rbd_dev->lock_cookie); |
|---|
| 2705 | 3741 | if (ret && ret != -ENOENT) |
|---|
| 2706 | | - rbd_warn(rbd_dev, "failed to unlock: %d", ret); |
|---|
| 3742 | + rbd_warn(rbd_dev, "failed to unlock header: %d", ret); |
|---|
| 2707 | 3743 | |
|---|
| 2708 | 3744 | /* treat errors as the image is unlocked */ |
|---|
| 2709 | 3745 | rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; |
|---|
| .. | .. |
|---|
| 2739 | 3775 | static void rbd_notify_op_lock(struct rbd_device *rbd_dev, |
|---|
| 2740 | 3776 | enum rbd_notify_op notify_op) |
|---|
| 2741 | 3777 | { |
|---|
| 2742 | | - struct page **reply_pages; |
|---|
| 2743 | | - size_t reply_len; |
|---|
| 2744 | | - |
|---|
| 2745 | | - __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); |
|---|
| 2746 | | - ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); |
|---|
| 3778 | + __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL); |
|---|
| 2747 | 3779 | } |
|---|
| 2748 | 3780 | |
|---|
| 2749 | 3781 | static void rbd_notify_acquired_lock(struct work_struct *work) |
|---|
| .. | .. |
|---|
| 2830 | 3862 | goto out; |
|---|
| 2831 | 3863 | } |
|---|
| 2832 | 3864 | |
|---|
| 2833 | | -static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) |
|---|
| 3865 | +/* |
|---|
| 3866 | + * The waiters are either image request state machine(s) or
|---|
| 3867 | + * rbd_add_acquire_lock() (i.e. "rbd map").
|---|
| 3868 | + */ |
|---|
| 3869 | +static void wake_lock_waiters(struct rbd_device *rbd_dev, int result) |
|---|
| 2834 | 3870 | { |
|---|
| 2835 | | - dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); |
|---|
| 3871 | + struct rbd_img_request *img_req; |
|---|
| 3872 | + |
|---|
| 3873 | + dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); |
|---|
| 3874 | + lockdep_assert_held_write(&rbd_dev->lock_rwsem); |
|---|
| 2836 | 3875 | |
|---|
| 2837 | 3876 | cancel_delayed_work(&rbd_dev->lock_dwork); |
|---|
| 2838 | | - if (wake_all) |
|---|
| 2839 | | - wake_up_all(&rbd_dev->lock_waitq); |
|---|
| 2840 | | - else |
|---|
| 2841 | | - wake_up(&rbd_dev->lock_waitq); |
|---|
| 3877 | + if (!completion_done(&rbd_dev->acquire_wait)) { |
|---|
| 3878 | + rbd_assert(list_empty(&rbd_dev->acquiring_list) && |
|---|
| 3879 | + list_empty(&rbd_dev->running_list)); |
|---|
| 3880 | + rbd_dev->acquire_err = result; |
|---|
| 3881 | + complete_all(&rbd_dev->acquire_wait); |
|---|
| 3882 | + return; |
|---|
| 3883 | + } |
|---|
| 3884 | + |
|---|
| 3885 | + list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) { |
|---|
| 3886 | + mutex_lock(&img_req->state_mutex); |
|---|
| 3887 | + rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK); |
|---|
| 3888 | + rbd_img_schedule(img_req, result); |
|---|
| 3889 | + mutex_unlock(&img_req->state_mutex); |
|---|
| 3890 | + } |
|---|
| 3891 | + |
|---|
| 3892 | + list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list); |
|---|
| 2842 | 3893 | } |
|---|
| 2843 | 3894 | |
|---|
| 2844 | 3895 | static int get_lock_owner_info(struct rbd_device *rbd_dev, |
|---|
| .. | .. |
|---|
| 2953 | 4004 | goto again; |
|---|
| 2954 | 4005 | |
|---|
| 2955 | 4006 | ret = find_watcher(rbd_dev, lockers); |
|---|
| 2956 | | - if (ret) { |
|---|
| 2957 | | - if (ret > 0) |
|---|
| 2958 | | - ret = 0; /* have to request lock */ |
|---|
| 2959 | | - goto out; |
|---|
| 2960 | | - } |
|---|
| 4007 | + if (ret) |
|---|
| 4008 | + goto out; /* request lock or error */ |
|---|
| 2961 | 4009 | |
|---|
| 2962 | | - rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", |
|---|
| 4010 | + rbd_warn(rbd_dev, "breaking header lock owned by %s%llu", |
|---|
| 2963 | 4011 | ENTITY_NAME(lockers[0].id.name)); |
|---|
| 2964 | 4012 | |
|---|
| 2965 | | - ret = ceph_monc_blacklist_add(&client->monc, |
|---|
| 4013 | + ret = ceph_monc_blocklist_add(&client->monc, |
|---|
| 2966 | 4014 | &lockers[0].info.addr); |
|---|
| 2967 | 4015 | if (ret) { |
|---|
| 2968 | | - rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", |
|---|
| 4016 | + rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d", |
|---|
| 2969 | 4017 | ENTITY_NAME(lockers[0].id.name), ret); |
|---|
| 2970 | 4018 | goto out; |
|---|
| 2971 | 4019 | } |
|---|
| .. | .. |
|---|
| 2986 | 4034 | return ret; |
|---|
| 2987 | 4035 | } |
|---|
| 2988 | 4036 | |
|---|
| 2989 | | -/* |
|---|
| 2990 | | - * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED |
|---|
| 2991 | | - */ |
|---|
| 2992 | | -static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, |
|---|
| 2993 | | - int *pret) |
|---|
| 4037 | +static int rbd_post_acquire_action(struct rbd_device *rbd_dev) |
|---|
| 2994 | 4038 | { |
|---|
| 2995 | | - enum rbd_lock_state lock_state; |
|---|
| 4039 | + int ret; |
|---|
| 4040 | + |
|---|
| 4041 | + if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) { |
|---|
| 4042 | + ret = rbd_object_map_open(rbd_dev); |
|---|
| 4043 | + if (ret) |
|---|
| 4044 | + return ret; |
|---|
| 4045 | + } |
|---|
| 4046 | + |
|---|
| 4047 | + return 0; |
|---|
| 4048 | +} |
|---|
| 4049 | + |
|---|
| 4050 | +/* |
|---|
| 4051 | + * Return: |
|---|
| 4052 | + * 0 - lock acquired |
|---|
| 4053 | + * 1 - caller should call rbd_request_lock() |
|---|
| 4054 | + * <0 - error |
|---|
| 4055 | + */ |
|---|
| 4056 | +static int rbd_try_acquire_lock(struct rbd_device *rbd_dev) |
|---|
| 4057 | +{ |
|---|
| 4058 | + int ret; |
|---|
| 2996 | 4059 | |
|---|
| 2997 | 4060 | down_read(&rbd_dev->lock_rwsem); |
|---|
| 2998 | 4061 | dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, |
|---|
| 2999 | 4062 | rbd_dev->lock_state); |
|---|
| 3000 | 4063 | if (__rbd_is_lock_owner(rbd_dev)) { |
|---|
| 3001 | | - lock_state = rbd_dev->lock_state; |
|---|
| 3002 | 4064 | up_read(&rbd_dev->lock_rwsem); |
|---|
| 3003 | | - return lock_state; |
|---|
| 4065 | + return 0; |
|---|
| 3004 | 4066 | } |
|---|
| 3005 | 4067 | |
|---|
| 3006 | 4068 | up_read(&rbd_dev->lock_rwsem); |
|---|
| 3007 | 4069 | down_write(&rbd_dev->lock_rwsem); |
|---|
| 3008 | 4070 | dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, |
|---|
| 3009 | 4071 | rbd_dev->lock_state); |
|---|
| 3010 | | - if (!__rbd_is_lock_owner(rbd_dev)) { |
|---|
| 3011 | | - *pret = rbd_try_lock(rbd_dev); |
|---|
| 3012 | | - if (*pret) |
|---|
| 3013 | | - rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); |
|---|
| 4072 | + if (__rbd_is_lock_owner(rbd_dev)) { |
|---|
| 4073 | + up_write(&rbd_dev->lock_rwsem); |
|---|
| 4074 | + return 0; |
|---|
| 3014 | 4075 | } |
|---|
| 3015 | 4076 | |
|---|
| 3016 | | - lock_state = rbd_dev->lock_state; |
|---|
| 4077 | + ret = rbd_try_lock(rbd_dev); |
|---|
| 4078 | + if (ret < 0) { |
|---|
| 4079 | + rbd_warn(rbd_dev, "failed to lock header: %d", ret); |
|---|
| 4080 | + if (ret == -EBLOCKLISTED) |
|---|
| 4081 | + goto out; |
|---|
| 4082 | + |
|---|
| 4083 | + ret = 1; /* request lock anyway */ |
|---|
| 4084 | + } |
|---|
| 4085 | + if (ret > 0) { |
|---|
| 4086 | + up_write(&rbd_dev->lock_rwsem); |
|---|
| 4087 | + return ret; |
|---|
| 4088 | + } |
|---|
| 4089 | + |
|---|
| 4090 | + rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED); |
|---|
| 4091 | + rbd_assert(list_empty(&rbd_dev->running_list)); |
|---|
| 4092 | + |
|---|
| 4093 | + ret = rbd_post_acquire_action(rbd_dev); |
|---|
| 4094 | + if (ret) { |
|---|
| 4095 | + rbd_warn(rbd_dev, "post-acquire action failed: %d", ret); |
|---|
| 4096 | + /* |
|---|
| 4097 | + * Can't stay in RBD_LOCK_STATE_LOCKED because |
|---|
| 4098 | + * rbd_lock_add_request() would let the request through, |
|---|
| 4099 | + * assuming that e.g. object map is locked and loaded. |
|---|
| 4100 | + */ |
|---|
| 4101 | + rbd_unlock(rbd_dev); |
|---|
| 4102 | + } |
|---|
| 4103 | + |
|---|
| 4104 | +out: |
|---|
| 4105 | + wake_lock_waiters(rbd_dev, ret); |
|---|
| 3017 | 4106 | up_write(&rbd_dev->lock_rwsem); |
|---|
| 3018 | | - return lock_state; |
|---|
| 4107 | + return ret; |
|---|
| 3019 | 4108 | } |
|---|
| 3020 | 4109 | |
|---|
| 3021 | 4110 | static void rbd_acquire_lock(struct work_struct *work) |
|---|
| 3022 | 4111 | { |
|---|
| 3023 | 4112 | struct rbd_device *rbd_dev = container_of(to_delayed_work(work), |
|---|
| 3024 | 4113 | struct rbd_device, lock_dwork); |
|---|
| 3025 | | - enum rbd_lock_state lock_state; |
|---|
| 3026 | | - int ret = 0; |
|---|
| 4114 | + int ret; |
|---|
| 3027 | 4115 | |
|---|
| 3028 | 4116 | dout("%s rbd_dev %p\n", __func__, rbd_dev); |
|---|
| 3029 | 4117 | again: |
|---|
| 3030 | | - lock_state = rbd_try_acquire_lock(rbd_dev, &ret); |
|---|
| 3031 | | - if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { |
|---|
| 3032 | | - if (lock_state == RBD_LOCK_STATE_LOCKED) |
|---|
| 3033 | | - wake_requests(rbd_dev, true); |
|---|
| 3034 | | - dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, |
|---|
| 3035 | | - rbd_dev, lock_state, ret); |
|---|
| 4118 | + ret = rbd_try_acquire_lock(rbd_dev); |
|---|
| 4119 | + if (ret <= 0) { |
|---|
| 4120 | + dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret); |
|---|
| 3036 | 4121 | return; |
|---|
| 3037 | 4122 | } |
|---|
| 3038 | 4123 | |
|---|
| .. | .. |
|---|
| 3041 | 4126 | goto again; /* treat this as a dead client */ |
|---|
| 3042 | 4127 | } else if (ret == -EROFS) { |
|---|
| 3043 | 4128 | rbd_warn(rbd_dev, "peer will not release lock"); |
|---|
| 3044 | | - /* |
|---|
| 3045 | | - * If this is rbd_add_acquire_lock(), we want to fail |
|---|
| 3046 | | - * immediately -- reuse BLACKLISTED flag. Otherwise we |
|---|
| 3047 | | - * want to block. |
|---|
| 3048 | | - */ |
|---|
| 3049 | | - if (!(rbd_dev->disk->flags & GENHD_FL_UP)) { |
|---|
| 3050 | | - set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); |
|---|
| 3051 | | - /* wake "rbd map --exclusive" process */ |
|---|
| 3052 | | - wake_requests(rbd_dev, false); |
|---|
| 3053 | | - } |
|---|
| 4129 | + down_write(&rbd_dev->lock_rwsem); |
|---|
| 4130 | + wake_lock_waiters(rbd_dev, ret); |
|---|
| 4131 | + up_write(&rbd_dev->lock_rwsem); |
|---|
| 3054 | 4132 | } else if (ret < 0) { |
|---|
| 3055 | 4133 | rbd_warn(rbd_dev, "error requesting lock: %d", ret); |
|---|
| 3056 | 4134 | mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, |
|---|
| .. | .. |
|---|
| 3060 | 4138 | * lock owner acked, but resend if we don't see them |
|---|
| 3061 | 4139 | * release the lock |
|---|
| 3062 | 4140 | */ |
|---|
| 3063 | | - dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, |
|---|
| 4141 | + dout("%s rbd_dev %p requeuing lock_dwork\n", __func__, |
|---|
| 3064 | 4142 | rbd_dev); |
|---|
| 3065 | 4143 | mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, |
|---|
| 3066 | 4144 | msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); |
|---|
| 3067 | 4145 | } |
|---|
| 3068 | 4146 | } |
|---|
| 3069 | 4147 | |
|---|
| 3070 | | -/* |
|---|
| 3071 | | - * lock_rwsem must be held for write |
|---|
| 3072 | | - */ |
|---|
| 3073 | | -static bool rbd_release_lock(struct rbd_device *rbd_dev) |
|---|
| 4148 | +static bool rbd_quiesce_lock(struct rbd_device *rbd_dev) |
|---|
| 3074 | 4149 | { |
|---|
| 3075 | | - dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, |
|---|
| 3076 | | - rbd_dev->lock_state); |
|---|
| 4150 | + dout("%s rbd_dev %p\n", __func__, rbd_dev); |
|---|
| 4151 | + lockdep_assert_held_write(&rbd_dev->lock_rwsem); |
|---|
| 4152 | + |
|---|
| 3077 | 4153 | if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) |
|---|
| 3078 | 4154 | return false; |
|---|
| 3079 | 4155 | |
|---|
| 3080 | | - rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; |
|---|
| 3081 | | - downgrade_write(&rbd_dev->lock_rwsem); |
|---|
| 3082 | 4156 | /* |
|---|
| 3083 | 4157 | * Ensure that all in-flight IO is flushed. |
|---|
| 3084 | | - * |
|---|
| 3085 | | - * FIXME: ceph_osdc_sync() flushes the entire OSD client, which |
|---|
| 3086 | | - * may be shared with other devices. |
|---|
| 3087 | 4158 | */ |
|---|
| 3088 | | - ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); |
|---|
| 3089 | | - up_read(&rbd_dev->lock_rwsem); |
|---|
| 4159 | + rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; |
|---|
| 4160 | + rbd_assert(!completion_done(&rbd_dev->releasing_wait)); |
|---|
| 4161 | + if (list_empty(&rbd_dev->running_list)) |
|---|
| 4162 | + return true; |
|---|
| 4163 | + |
|---|
| 4164 | + up_write(&rbd_dev->lock_rwsem); |
|---|
| 4165 | + wait_for_completion(&rbd_dev->releasing_wait); |
|---|
| 3090 | 4166 | |
|---|
| 3091 | 4167 | down_write(&rbd_dev->lock_rwsem); |
|---|
| 3092 | | - dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, |
|---|
| 3093 | | - rbd_dev->lock_state); |
|---|
| 3094 | 4168 | if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) |
|---|
| 3095 | 4169 | return false; |
|---|
| 3096 | 4170 | |
|---|
| 4171 | + rbd_assert(list_empty(&rbd_dev->running_list)); |
|---|
| 4172 | + return true; |
|---|
| 4173 | +} |
|---|
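
`rbd_quiesce_lock()` has to drop `lock_rwsem` before sleeping because in-flight requests take it for read as they complete; the last request off the running list completes `releasing_wait` from rbd_lock_del_request() above. The same drain shape in userspace, with pthreads standing in for the kernel completion (illustrative only):

```c
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int running = 2;			/* two requests in flight */
static int releasing;

static void *io_done(void *arg)
{
	(void)arg;
	usleep(1000);			/* request finishing up */
	pthread_mutex_lock(&lock);
	if (--running == 0 && releasing)
		pthread_cond_signal(&drained);	/* releasing_wait */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, io_done, NULL);
	pthread_create(&t2, NULL, io_done, NULL);

	pthread_mutex_lock(&lock);
	releasing = 1;			/* RBD_LOCK_STATE_RELEASING */
	while (running)			/* wait_for_completion() */
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);

	printf("running list drained, safe to unlock\n");
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}
```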
| 4174 | + |
|---|
| 4175 | +static void rbd_pre_release_action(struct rbd_device *rbd_dev) |
|---|
| 4176 | +{ |
|---|
| 4177 | + if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) |
|---|
| 4178 | + rbd_object_map_close(rbd_dev); |
|---|
| 4179 | +} |
|---|
| 4180 | + |
|---|
| 4181 | +static void __rbd_release_lock(struct rbd_device *rbd_dev) |
|---|
| 4182 | +{ |
|---|
| 4183 | + rbd_assert(list_empty(&rbd_dev->running_list)); |
|---|
| 4184 | + |
|---|
| 4185 | + rbd_pre_release_action(rbd_dev); |
|---|
| 3097 | 4186 | rbd_unlock(rbd_dev); |
|---|
| 4187 | +} |
|---|
| 4188 | + |
|---|
| 4189 | +/* |
|---|
| 4190 | + * lock_rwsem must be held for write |
|---|
| 4191 | + */ |
|---|
| 4192 | +static void rbd_release_lock(struct rbd_device *rbd_dev) |
|---|
| 4193 | +{ |
|---|
| 4194 | + if (!rbd_quiesce_lock(rbd_dev)) |
|---|
| 4195 | + return; |
|---|
| 4196 | + |
|---|
| 4197 | + __rbd_release_lock(rbd_dev); |
|---|
| 4198 | + |
|---|
| 3098 | 4199 | /* |
|---|
| 3099 | 4200 | * Give others a chance to grab the lock - we would re-acquire |
|---|
| 3100 | | - * almost immediately if we got new IO during ceph_osdc_sync() |
|---|
| 3101 | | - * otherwise. We need to ack our own notifications, so this |
|---|
| 3102 | | - * lock_dwork will be requeued from rbd_wait_state_locked() |
|---|
| 3103 | | - * after wake_requests() in rbd_handle_released_lock(). |
|---|
| 4201 | + * almost immediately if we got new IO while draining the running |
|---|
| 4202 | + * list otherwise. We need to ack our own notifications, so this |
|---|
| 4203 | + * lock_dwork will be requeued from rbd_handle_released_lock() by |
|---|
| 4204 | + * way of maybe_kick_acquire(). |
|---|
| 3104 | 4205 | */ |
|---|
| 3105 | 4206 | cancel_delayed_work(&rbd_dev->lock_dwork); |
|---|
| 3106 | | - return true; |
|---|
| 3107 | 4207 | } |
|---|
| 3108 | 4208 | |
|---|
| 3109 | 4209 | static void rbd_release_lock_work(struct work_struct *work) |
|---|
| .. | .. |
|---|
| 3114 | 4214 | down_write(&rbd_dev->lock_rwsem); |
|---|
| 3115 | 4215 | rbd_release_lock(rbd_dev); |
|---|
| 3116 | 4216 | up_write(&rbd_dev->lock_rwsem); |
|---|
| 4217 | +} |
|---|
| 4218 | + |
|---|
| 4219 | +static void maybe_kick_acquire(struct rbd_device *rbd_dev) |
|---|
| 4220 | +{ |
|---|
| 4221 | + bool have_requests; |
|---|
| 4222 | + |
|---|
| 4223 | + dout("%s rbd_dev %p\n", __func__, rbd_dev); |
|---|
| 4224 | + if (__rbd_is_lock_owner(rbd_dev)) |
|---|
| 4225 | + return; |
|---|
| 4226 | + |
|---|
| 4227 | + spin_lock(&rbd_dev->lock_lists_lock); |
|---|
| 4228 | + have_requests = !list_empty(&rbd_dev->acquiring_list); |
|---|
| 4229 | + spin_unlock(&rbd_dev->lock_lists_lock); |
|---|
| 4230 | + if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) { |
|---|
| 4231 | + dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev); |
|---|
| 4232 | + mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); |
|---|
| 4233 | + } |
|---|
| 3117 | 4234 | } |
|---|
| 3118 | 4235 | |
|---|
| 3119 | 4236 | static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, |
|---|
| .. | .. |
|---|
| 3131 | 4248 | if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { |
|---|
| 3132 | 4249 | down_write(&rbd_dev->lock_rwsem); |
|---|
| 3133 | 4250 | if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { |
|---|
| 3134 | | - /* |
|---|
| 3135 | | - * we already know that the remote client is |
|---|
| 3136 | | - * the owner |
|---|
| 3137 | | - */ |
|---|
| 3138 | | - up_write(&rbd_dev->lock_rwsem); |
|---|
| 3139 | | - return; |
|---|
| 4251 | + dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n", |
|---|
| 4252 | + __func__, rbd_dev, cid.gid, cid.handle); |
|---|
| 4253 | + } else { |
|---|
| 4254 | + rbd_set_owner_cid(rbd_dev, &cid); |
|---|
| 3140 | 4255 | } |
|---|
| 3141 | | - |
|---|
| 3142 | | - rbd_set_owner_cid(rbd_dev, &cid); |
|---|
| 3143 | 4256 | downgrade_write(&rbd_dev->lock_rwsem); |
|---|
| 3144 | 4257 | } else { |
|---|
| 3145 | 4258 | down_read(&rbd_dev->lock_rwsem); |
|---|
| 3146 | 4259 | } |
|---|
| 3147 | 4260 | |
|---|
| 3148 | | - if (!__rbd_is_lock_owner(rbd_dev)) |
|---|
| 3149 | | - wake_requests(rbd_dev, false); |
|---|
| 4261 | + maybe_kick_acquire(rbd_dev); |
|---|
| 3150 | 4262 | up_read(&rbd_dev->lock_rwsem); |
|---|
| 3151 | 4263 | } |
|---|
| 3152 | 4264 | |
|---|
| .. | .. |
|---|
| 3165 | 4277 | if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { |
|---|
| 3166 | 4278 | down_write(&rbd_dev->lock_rwsem); |
|---|
| 3167 | 4279 | if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { |
|---|
| 3168 | | - dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", |
|---|
| 4280 | + dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n", |
|---|
| 3169 | 4281 | __func__, rbd_dev, cid.gid, cid.handle, |
|---|
| 3170 | 4282 | rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); |
|---|
| 3171 | | - up_write(&rbd_dev->lock_rwsem); |
|---|
| 3172 | | - return; |
|---|
| 4283 | + } else { |
|---|
| 4284 | + rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); |
|---|
| 3173 | 4285 | } |
|---|
| 3174 | | - |
|---|
| 3175 | | - rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); |
|---|
| 3176 | 4286 | downgrade_write(&rbd_dev->lock_rwsem); |
|---|
| 3177 | 4287 | } else { |
|---|
| 3178 | 4288 | down_read(&rbd_dev->lock_rwsem); |
|---|
| 3179 | 4289 | } |
|---|
| 3180 | 4290 | |
|---|
| 3181 | | - if (!__rbd_is_lock_owner(rbd_dev)) |
|---|
| 3182 | | - wake_requests(rbd_dev, false); |
|---|
| 4291 | + maybe_kick_acquire(rbd_dev); |
|---|
| 3183 | 4292 | up_read(&rbd_dev->lock_rwsem); |
|---|
| 3184 | 4293 | } |
|---|
| 3185 | 4294 | |
|---|
| .. | .. |
|---|
| 3433 | 4542 | */ |
|---|
| 3434 | 4543 | static void rbd_unregister_watch(struct rbd_device *rbd_dev) |
|---|
| 3435 | 4544 | { |
|---|
| 3436 | | - WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); |
|---|
| 3437 | 4545 | cancel_tasks_sync(rbd_dev); |
|---|
| 3438 | 4546 | |
|---|
| 3439 | 4547 | mutex_lock(&rbd_dev->watch_mutex); |
|---|
| .. | .. |
|---|
| 3455 | 4563 | char cookie[32]; |
|---|
| 3456 | 4564 | int ret; |
|---|
| 3457 | 4565 | |
|---|
| 3458 | | - WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); |
|---|
| 4566 | + if (!rbd_quiesce_lock(rbd_dev)) |
|---|
| 4567 | + return; |
|---|
| 3459 | 4568 | |
|---|
| 3460 | 4569 | format_lock_cookie(rbd_dev, cookie); |
|---|
| 3461 | 4570 | ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, |
|---|
| .. | .. |
|---|
| 3471 | 4580 | * Lock cookie cannot be updated on older OSDs, so do |
|---|
| 3472 | 4581 | * a manual release and queue an acquire. |
|---|
| 3473 | 4582 | */ |
|---|
| 3474 | | - if (rbd_release_lock(rbd_dev)) |
|---|
| 3475 | | - queue_delayed_work(rbd_dev->task_wq, |
|---|
| 3476 | | - &rbd_dev->lock_dwork, 0); |
|---|
| 4583 | + __rbd_release_lock(rbd_dev); |
|---|
| 4584 | + queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); |
|---|
| 3477 | 4585 | } else { |
|---|
| 3478 | 4586 | __rbd_lock(rbd_dev, cookie); |
|---|
| 4587 | + wake_lock_waiters(rbd_dev, 0); |
|---|
| 3479 | 4588 | } |
|---|
| 3480 | 4589 | } |
|---|
| 3481 | 4590 | |
|---|
| .. | .. |
|---|
| 3496 | 4605 | ret = __rbd_register_watch(rbd_dev); |
|---|
| 3497 | 4606 | if (ret) { |
|---|
| 3498 | 4607 | rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); |
|---|
| 3499 | | - if (ret == -EBLACKLISTED || ret == -ENOENT) { |
|---|
| 3500 | | - set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); |
|---|
| 3501 | | - wake_requests(rbd_dev, true); |
|---|
| 3502 | | - } else { |
|---|
| 4608 | + if (ret != -EBLOCKLISTED && ret != -ENOENT) { |
|---|
| 3503 | 4609 | queue_delayed_work(rbd_dev->task_wq, |
|---|
| 3504 | 4610 | &rbd_dev->watch_dwork, |
|---|
| 3505 | 4611 | RBD_RETRY_DELAY); |
|---|
| 4612 | + mutex_unlock(&rbd_dev->watch_mutex); |
|---|
| 4613 | + return; |
|---|
| 3506 | 4614 | } |
|---|
| 4615 | + |
|---|
| 3507 | 4616 | mutex_unlock(&rbd_dev->watch_mutex); |
|---|
| 4617 | + down_write(&rbd_dev->lock_rwsem); |
|---|
| 4618 | + wake_lock_waiters(rbd_dev, ret); |
|---|
| 4619 | + up_write(&rbd_dev->lock_rwsem); |
|---|
| 3508 | 4620 | return; |
|---|
| 3509 | 4621 | } |
|---|
| 3510 | 4622 | |
|---|
| .. | .. |
|---|
| 3567 | 4679 | |
|---|
| 3568 | 4680 | ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, |
|---|
| 3569 | 4681 | CEPH_OSD_FLAG_READ, req_page, outbound_size, |
|---|
| 3570 | | - reply_page, &inbound_size); |
|---|
| 4682 | + &reply_page, &inbound_size); |
|---|
| 3571 | 4683 | if (!ret) { |
|---|
| 3572 | 4684 | memcpy(inbound, page_address(reply_page), inbound_size); |
|---|
| 3573 | 4685 | ret = inbound_size; |
|---|
| .. | .. |
|---|
| 3579 | 4691 | return ret; |
|---|
| 3580 | 4692 | } |
|---|
| 3581 | 4693 | |
|---|
| 3582 | | -/* |
|---|
| 3583 | | - * lock_rwsem must be held for read |
|---|
| 3584 | | - */ |
|---|
| 3585 | | -static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire) |
|---|
| 3586 | | -{ |
|---|
| 3587 | | - DEFINE_WAIT(wait); |
|---|
| 3588 | | - unsigned long timeout; |
|---|
| 3589 | | - int ret = 0; |
|---|
| 3590 | | - |
|---|
| 3591 | | - if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) |
|---|
| 3592 | | - return -EBLACKLISTED; |
|---|
| 3593 | | - |
|---|
| 3594 | | - if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) |
|---|
| 3595 | | - return 0; |
|---|
| 3596 | | - |
|---|
| 3597 | | - if (!may_acquire) { |
|---|
| 3598 | | - rbd_warn(rbd_dev, "exclusive lock required"); |
|---|
| 3599 | | - return -EROFS; |
|---|
| 3600 | | - } |
|---|
| 3601 | | - |
|---|
| 3602 | | - do { |
|---|
| 3603 | | - /* |
|---|
| 3604 | | - * Note the use of mod_delayed_work() in rbd_acquire_lock() |
|---|
| 3605 | | - * and cancel_delayed_work() in wake_requests(). |
|---|
| 3606 | | - */ |
|---|
| 3607 | | - dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); |
|---|
| 3608 | | - queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); |
|---|
| 3609 | | - prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, |
|---|
| 3610 | | - TASK_UNINTERRUPTIBLE); |
|---|
| 3611 | | - up_read(&rbd_dev->lock_rwsem); |
|---|
| 3612 | | - timeout = schedule_timeout(ceph_timeout_jiffies( |
|---|
| 3613 | | - rbd_dev->opts->lock_timeout)); |
|---|
| 3614 | | - down_read(&rbd_dev->lock_rwsem); |
|---|
| 3615 | | - if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { |
|---|
| 3616 | | - ret = -EBLACKLISTED; |
|---|
| 3617 | | - break; |
|---|
| 3618 | | - } |
|---|
| 3619 | | - if (!timeout) { |
|---|
| 3620 | | - rbd_warn(rbd_dev, "timed out waiting for lock"); |
|---|
| 3621 | | - ret = -ETIMEDOUT; |
|---|
| 3622 | | - break; |
|---|
| 3623 | | - } |
|---|
| 3624 | | - } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); |
|---|
| 3625 | | - |
|---|
| 3626 | | - finish_wait(&rbd_dev->lock_waitq, &wait); |
|---|
| 3627 | | - return ret; |
|---|
| 3628 | | -} |
|---|
| 3629 | | - |
|---|
| 3630 | 4694 | static void rbd_queue_workfn(struct work_struct *work) |
|---|
| 3631 | 4695 | { |
|---|
| 3632 | | - struct request *rq = blk_mq_rq_from_pdu(work); |
|---|
| 3633 | | - struct rbd_device *rbd_dev = rq->q->queuedata; |
|---|
| 3634 | | - struct rbd_img_request *img_request; |
|---|
| 3635 | | - struct ceph_snap_context *snapc = NULL; |
|---|
| 4696 | + struct rbd_img_request *img_request = |
|---|
| 4697 | + container_of(work, struct rbd_img_request, work); |
|---|
| 4698 | + struct rbd_device *rbd_dev = img_request->rbd_dev; |
|---|
| 4699 | + enum obj_operation_type op_type = img_request->op_type; |
|---|
| 4700 | + struct request *rq = blk_mq_rq_from_pdu(img_request); |
|---|
| 3636 | 4701 | u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; |
|---|
| 3637 | 4702 | u64 length = blk_rq_bytes(rq); |
|---|
| 3638 | | - enum obj_operation_type op_type; |
|---|
| 3639 | 4703 | u64 mapping_size; |
|---|
| 3640 | | - bool must_be_locked; |
|---|
| 3641 | 4704 | int result; |
|---|
| 3642 | 4705 | |
|---|
| 3643 | | - switch (req_op(rq)) { |
|---|
| 4706 | + /* Ignore/skip any zero-length requests */ |
|---|
| 4707 | + if (!length) { |
|---|
| 4708 | + dout("%s: zero-length request\n", __func__); |
|---|
| 4709 | + result = 0; |
|---|
| 4710 | + goto err_img_request; |
|---|
| 4711 | + } |
|---|
| 4712 | + |
|---|
| 4713 | + blk_mq_start_request(rq); |
|---|
| 4714 | + |
|---|
| 4715 | + down_read(&rbd_dev->header_rwsem); |
|---|
| 4716 | + mapping_size = rbd_dev->mapping.size; |
|---|
| 4717 | + rbd_img_capture_header(img_request); |
|---|
| 4718 | + up_read(&rbd_dev->header_rwsem); |
|---|
| 4719 | + |
|---|
| 4720 | + if (offset + length > mapping_size) { |
|---|
| 4721 | + rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, |
|---|
| 4722 | + length, mapping_size); |
|---|
| 4723 | + result = -EIO; |
|---|
| 4724 | + goto err_img_request; |
|---|
| 4725 | + } |
|---|
| 4726 | + |
|---|
| 4727 | + dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, |
|---|
| 4728 | + img_request, obj_op_name(op_type), offset, length); |
|---|
| 4729 | + |
|---|
| 4730 | + if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) |
|---|
| 4731 | + result = rbd_img_fill_nodata(img_request, offset, length); |
|---|
| 4732 | + else |
|---|
| 4733 | + result = rbd_img_fill_from_bio(img_request, offset, length, |
|---|
| 4734 | + rq->bio); |
|---|
| 4735 | + if (result) |
|---|
| 4736 | + goto err_img_request; |
|---|
| 4737 | + |
|---|
| 4738 | + rbd_img_handle_request(img_request, 0); |
|---|
| 4739 | + return; |
|---|
| 4740 | + |
|---|
| 4741 | +err_img_request: |
|---|
| 4742 | + rbd_img_request_destroy(img_request); |
|---|
| 4743 | + if (result) |
|---|
| 4744 | + rbd_warn(rbd_dev, "%s %llx at %llx result %d", |
|---|
| 4745 | + obj_op_name(op_type), length, offset, result); |
|---|
| 4746 | + blk_mq_end_request(rq, errno_to_blk_status(result)); |
|---|
| 4747 | +} |
|---|
| 4748 | + |
|---|
| 4749 | +static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, |
|---|
| 4750 | + const struct blk_mq_queue_data *bd) |
|---|
| 4751 | +{ |
|---|
| 4752 | + struct rbd_device *rbd_dev = hctx->queue->queuedata; |
|---|
| 4753 | + struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq); |
|---|
| 4754 | + enum obj_operation_type op_type; |
|---|
| 4755 | + |
|---|
| 4756 | + switch (req_op(bd->rq)) { |
|---|
| 3644 | 4757 | case REQ_OP_DISCARD: |
|---|
| 3645 | | - case REQ_OP_WRITE_ZEROES: |
|---|
| 3646 | 4758 | op_type = OBJ_OP_DISCARD; |
|---|
| 4759 | + break; |
|---|
| 4760 | + case REQ_OP_WRITE_ZEROES: |
|---|
| 4761 | + op_type = OBJ_OP_ZEROOUT; |
|---|
| 3647 | 4762 | break; |
|---|
| 3648 | 4763 | case REQ_OP_WRITE: |
|---|
| 3649 | 4764 | op_type = OBJ_OP_WRITE; |
|---|
| .. | .. |
|---|
| 3652 | 4767 | op_type = OBJ_OP_READ; |
|---|
| 3653 | 4768 | break; |
|---|
| 3654 | 4769 | default: |
|---|
| 3655 | | - dout("%s: non-fs request type %d\n", __func__, req_op(rq)); |
|---|
| 3656 | | - result = -EIO; |
|---|
| 3657 | | - goto err; |
|---|
| 4770 | + rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq)); |
|---|
| 4771 | + return BLK_STS_IOERR; |
|---|
| 3658 | 4772 | } |
|---|
| 3659 | 4773 | |
|---|
| 3660 | | - /* Ignore/skip any zero-length requests */ |
|---|
| 4774 | + rbd_img_request_init(img_req, rbd_dev, op_type); |
|---|
| 3661 | 4775 | |
|---|
| 3662 | | - if (!length) { |
|---|
| 3663 | | - dout("%s: zero-length request\n", __func__); |
|---|
| 3664 | | - result = 0; |
|---|
| 3665 | | - goto err_rq; |
|---|
| 4776 | + if (rbd_img_is_write(img_req)) { |
|---|
| 4777 | + if (rbd_is_ro(rbd_dev)) { |
|---|
| 4778 | + rbd_warn(rbd_dev, "%s on read-only mapping", |
|---|
| 4779 | + obj_op_name(img_req->op_type)); |
|---|
| 4780 | + return BLK_STS_IOERR; |
|---|
| 4781 | + } |
|---|
| 4782 | + rbd_assert(!rbd_is_snap(rbd_dev)); |
|---|
| 3666 | 4783 | } |
|---|
| 3667 | 4784 | |
|---|
| 3668 | | - rbd_assert(op_type == OBJ_OP_READ || |
|---|
| 3669 | | - rbd_dev->spec->snap_id == CEPH_NOSNAP); |
|---|
| 3670 | | - |
|---|
| 3671 | | - /* |
|---|
| 3672 | | - * Quit early if the mapped snapshot no longer exists. It's |
|---|
| 3673 | | - * still possible the snapshot will have disappeared by the |
|---|
| 3674 | | - * time our request arrives at the osd, but there's no sense in |
|---|
| 3675 | | - * sending it if we already know. |
|---|
| 3676 | | - */ |
|---|
| 3677 | | - if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { |
|---|
| 3678 | | - dout("request for non-existent snapshot"); |
|---|
| 3679 | | - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); |
|---|
| 3680 | | - result = -ENXIO; |
|---|
| 3681 | | - goto err_rq; |
|---|
| 3682 | | - } |
|---|
| 3683 | | - |
|---|
| 3684 | | - if (offset && length > U64_MAX - offset + 1) { |
|---|
| 3685 | | - rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, |
|---|
| 3686 | | - length); |
|---|
| 3687 | | - result = -EINVAL; |
|---|
| 3688 | | - goto err_rq; /* Shouldn't happen */ |
|---|
| 3689 | | - } |
|---|
| 3690 | | - |
|---|
| 3691 | | - blk_mq_start_request(rq); |
|---|
| 3692 | | - |
|---|
| 3693 | | - down_read(&rbd_dev->header_rwsem); |
|---|
| 3694 | | - mapping_size = rbd_dev->mapping.size; |
|---|
| 3695 | | - if (op_type != OBJ_OP_READ) { |
|---|
| 3696 | | - snapc = rbd_dev->header.snapc; |
|---|
| 3697 | | - ceph_get_snap_context(snapc); |
|---|
| 3698 | | - } |
|---|
| 3699 | | - up_read(&rbd_dev->header_rwsem); |
|---|
| 3700 | | - |
|---|
| 3701 | | - if (offset + length > mapping_size) { |
|---|
| 3702 | | - rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, |
|---|
| 3703 | | - length, mapping_size); |
|---|
| 3704 | | - result = -EIO; |
|---|
| 3705 | | - goto err_rq; |
|---|
| 3706 | | - } |
|---|
| 3707 | | - |
|---|
| 3708 | | - must_be_locked = |
|---|
| 3709 | | - (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && |
|---|
| 3710 | | - (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read); |
|---|
| 3711 | | - if (must_be_locked) { |
|---|
| 3712 | | - down_read(&rbd_dev->lock_rwsem); |
|---|
| 3713 | | - result = rbd_wait_state_locked(rbd_dev, |
|---|
| 3714 | | - !rbd_dev->opts->exclusive); |
|---|
| 3715 | | - if (result) |
|---|
| 3716 | | - goto err_unlock; |
|---|
| 3717 | | - } |
|---|
| 3718 | | - |
|---|
| 3719 | | - img_request = rbd_img_request_create(rbd_dev, op_type, snapc); |
|---|
| 3720 | | - if (!img_request) { |
|---|
| 3721 | | - result = -ENOMEM; |
|---|
| 3722 | | - goto err_unlock; |
|---|
| 3723 | | - } |
|---|
| 3724 | | - img_request->rq = rq; |
|---|
| 3725 | | - snapc = NULL; /* img_request consumes a ref */ |
|---|
| 3726 | | - |
|---|
| 3727 | | - if (op_type == OBJ_OP_DISCARD) |
|---|
| 3728 | | - result = rbd_img_fill_nodata(img_request, offset, length); |
|---|
| 3729 | | - else |
|---|
| 3730 | | - result = rbd_img_fill_from_bio(img_request, offset, length, |
|---|
| 3731 | | - rq->bio); |
|---|
| 3732 | | - if (result) |
|---|
| 3733 | | - goto err_img_request; |
|---|
| 3734 | | - |
|---|
| 3735 | | - rbd_img_request_submit(img_request); |
|---|
| 3736 | | - if (must_be_locked) |
|---|
| 3737 | | - up_read(&rbd_dev->lock_rwsem); |
|---|
| 3738 | | - return; |
|---|
| 3739 | | - |
|---|
| 3740 | | -err_img_request: |
|---|
| 3741 | | - rbd_img_request_put(img_request); |
|---|
| 3742 | | -err_unlock: |
|---|
| 3743 | | - if (must_be_locked) |
|---|
| 3744 | | - up_read(&rbd_dev->lock_rwsem); |
|---|
| 3745 | | -err_rq: |
|---|
| 3746 | | - if (result) |
|---|
| 3747 | | - rbd_warn(rbd_dev, "%s %llx at %llx result %d", |
|---|
| 3748 | | - obj_op_name(op_type), length, offset, result); |
|---|
| 3749 | | - ceph_put_snap_context(snapc); |
|---|
| 3750 | | -err: |
|---|
| 3751 | | - blk_mq_end_request(rq, errno_to_blk_status(result)); |
|---|
| 3752 | | -} |
|---|
| 3753 | | - |
|---|
| 3754 | | -static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, |
|---|
| 3755 | | - const struct blk_mq_queue_data *bd) |
|---|
| 3756 | | -{ |
|---|
| 3757 | | - struct request *rq = bd->rq; |
|---|
| 3758 | | - struct work_struct *work = blk_mq_rq_to_pdu(rq); |
|---|
| 3759 | | - |
|---|
| 3760 | | - queue_work(rbd_wq, work); |
|---|
| 4785 | + INIT_WORK(&img_req->work, rbd_queue_workfn); |
|---|
| 4786 | + queue_work(rbd_wq, &img_req->work); |
|---|
| 3761 | 4787 | return BLK_STS_OK; |
|---|
| 3762 | 4788 | } |
|---|
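
`blk_mq_rq_to_pdu()` / `blk_mq_rq_from_pdu()` work here because of the `cmd_size = sizeof(struct rbd_img_request)` change further down in rbd_init_disk(): blk-mq carves the driver's per-request "pdu" out of the same allocation, immediately behind `struct request`, so the conversions are pointer arithmetic. A userspace model of the trick with simplified structs:

```c
#include <stdio.h>
#include <stdlib.h>

struct request { int tag; };
struct img_request { int op_type; };

static void *rq_to_pdu(struct request *rq)
{
	return rq + 1;			/* pdu sits right behind rq */
}

static struct request *rq_from_pdu(void *pdu)
{
	return (struct request *)pdu - 1;
}

int main(void)
{
	/* one allocation: request header + cmd_size bytes of pdu */
	struct request *rq = malloc(sizeof(*rq) + sizeof(struct img_request));
	struct img_request *img = rq_to_pdu(rq);

	rq->tag = 7;
	img->op_type = 1;
	printf("round trip ok: tag %d\n", rq_from_pdu(img)->tag);	/* 7 */
	free(rq);
	return 0;
}
```

This is also why the old per-request `struct work_struct` and `rbd_init_request()` could go away: the work item now lives inside the pdu's `rbd_img_request`.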
| 3763 | 4789 | |
|---|
| .. | .. |
|---|
| 3789 | 4815 | ceph_oloc_copy(&req->r_base_oloc, oloc); |
|---|
| 3790 | 4816 | req->r_flags = CEPH_OSD_FLAG_READ; |
|---|
| 3791 | 4817 | |
|---|
| 3792 | | - ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); |
|---|
| 3793 | | - if (ret) |
|---|
| 3794 | | - goto out_req; |
|---|
| 3795 | | - |
|---|
| 3796 | 4818 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); |
|---|
| 3797 | 4819 | if (IS_ERR(pages)) { |
|---|
| 3798 | 4820 | ret = PTR_ERR(pages); |
|---|
| .. | .. |
|---|
| 3802 | 4824 | osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); |
|---|
| 3803 | 4825 | osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, |
|---|
| 3804 | 4826 | true); |
|---|
| 4827 | + |
|---|
| 4828 | + ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); |
|---|
| 4829 | + if (ret) |
|---|
| 4830 | + goto out_req; |
|---|
| 3805 | 4831 | |
|---|
| 3806 | 4832 | ceph_osdc_start_request(osdc, req, false); |
|---|
| 3807 | 4833 | ret = ceph_osdc_wait_request(osdc, req); |
|---|
| .. | .. |
|---|
| 3873 | 4899 | return ret; |
|---|
| 3874 | 4900 | } |
|---|
| 3875 | 4901 | |
|---|
| 3876 | | -/* |
|---|
| 3877 | | - * Clear the rbd device's EXISTS flag if the snapshot it's mapped to |
|---|
| 3878 | | - * has disappeared from the (just updated) snapshot context. |
|---|
| 3879 | | - */ |
|---|
| 3880 | | -static void rbd_exists_validate(struct rbd_device *rbd_dev) |
|---|
| 3881 | | -{ |
|---|
| 3882 | | - u64 snap_id; |
|---|
| 3883 | | - |
|---|
| 3884 | | - if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) |
|---|
| 3885 | | - return; |
|---|
| 3886 | | - |
|---|
| 3887 | | - snap_id = rbd_dev->spec->snap_id; |
|---|
| 3888 | | - if (snap_id == CEPH_NOSNAP) |
|---|
| 3889 | | - return; |
|---|
| 3890 | | - |
|---|
| 3891 | | - if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) |
|---|
| 3892 | | - clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); |
|---|
| 3893 | | -} |
|---|
| 3894 | | - |
|---|
| 3895 | 4902 | static void rbd_dev_update_size(struct rbd_device *rbd_dev) |
|---|
| 3896 | 4903 | { |
|---|
| 3897 | 4904 | sector_t size; |
|---|
| .. | .. |
|---|
| 3906 | 4913 | size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; |
|---|
| 3907 | 4914 | dout("setting size to %llu sectors", (unsigned long long)size); |
|---|
| 3908 | 4915 | set_capacity(rbd_dev->disk, size); |
|---|
| 3909 | | - revalidate_disk(rbd_dev->disk); |
|---|
| 4916 | + revalidate_disk_size(rbd_dev->disk, true); |
|---|
| 3910 | 4917 | } |
|---|
| 3911 | 4918 | } |
|---|
| 3912 | 4919 | |
|---|
| .. | .. |
|---|
| 3932 | 4939 | goto out; |
|---|
| 3933 | 4940 | } |
|---|
| 3934 | 4941 | |
|---|
| 3935 | | - if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { |
|---|
| 3936 | | - rbd_dev->mapping.size = rbd_dev->header.image_size; |
|---|
| 3937 | | - } else { |
|---|
| 3938 | | - /* validate mapped snapshot's EXISTS flag */ |
|---|
| 3939 | | - rbd_exists_validate(rbd_dev); |
|---|
| 3940 | | - } |
|---|
| 4942 | + rbd_assert(!rbd_is_snap(rbd_dev)); |
|---|
| 4943 | + rbd_dev->mapping.size = rbd_dev->header.image_size; |
|---|
| 3941 | 4944 | |
|---|
| 3942 | 4945 | out: |
|---|
| 3943 | 4946 | up_write(&rbd_dev->header_rwsem); |
|---|
| .. | .. |
|---|
| 3947 | 4950 | return ret; |
|---|
| 3948 | 4951 | } |
|---|
| 3949 | 4952 | |
|---|
| 3950 | | -static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, |
|---|
| 3951 | | - unsigned int hctx_idx, unsigned int numa_node) |
|---|
| 3952 | | -{ |
|---|
| 3953 | | - struct work_struct *work = blk_mq_rq_to_pdu(rq); |
|---|
| 3954 | | - |
|---|
| 3955 | | - INIT_WORK(work, rbd_queue_workfn); |
|---|
| 3956 | | - return 0; |
|---|
| 3957 | | -} |
|---|
| 3958 | | - |
|---|
| 3959 | 4953 | static const struct blk_mq_ops rbd_mq_ops = { |
|---|
| 3960 | 4954 | .queue_rq = rbd_queue_rq, |
|---|
| 3961 | | - .init_request = rbd_init_request, |
|---|
| 3962 | 4955 | }; |
|---|
| 3963 | 4956 | |
|---|
| 3964 | 4957 | static int rbd_init_disk(struct rbd_device *rbd_dev) |
|---|
| .. | .. |
|---|
| 3989 | 4982 | rbd_dev->tag_set.ops = &rbd_mq_ops; |
|---|
| 3990 | 4983 | rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; |
|---|
| 3991 | 4984 | rbd_dev->tag_set.numa_node = NUMA_NO_NODE; |
|---|
| 3992 | | - rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; |
|---|
| 3993 | | - rbd_dev->tag_set.nr_hw_queues = 1; |
|---|
| 3994 | | - rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); |
|---|
| 4985 | + rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; |
|---|
| 4986 | + rbd_dev->tag_set.nr_hw_queues = num_present_cpus(); |
|---|
| 4987 | + rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request); |
|---|
| 3995 | 4988 | |
|---|
| 3996 | 4989 | err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); |
|---|
| 3997 | 4990 | if (err) |
|---|
| .. | .. |
|---|
| 4010 | 5003 | q->limits.max_sectors = queue_max_hw_sectors(q); |
|---|
| 4011 | 5004 | blk_queue_max_segments(q, USHRT_MAX); |
|---|
| 4012 | 5005 | blk_queue_max_segment_size(q, UINT_MAX); |
|---|
| 4013 | | - blk_queue_io_min(q, objset_bytes); |
|---|
| 4014 | | - blk_queue_io_opt(q, objset_bytes); |
|---|
| 5006 | + blk_queue_io_min(q, rbd_dev->opts->alloc_size); |
|---|
| 5007 | + blk_queue_io_opt(q, rbd_dev->opts->alloc_size); |
|---|
| 4015 | 5008 | |
|---|
| 4016 | 5009 | if (rbd_dev->opts->trim) { |
|---|
| 4017 | 5010 | blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); |
|---|
| 4018 | | - q->limits.discard_granularity = objset_bytes; |
|---|
| 5011 | + q->limits.discard_granularity = rbd_dev->opts->alloc_size; |
|---|
| 4019 | 5012 | blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); |
|---|
| 4020 | 5013 | blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); |
|---|
| 4021 | 5014 | } |
|---|
| 4022 | 5015 | |
|---|
| 4023 | 5016 | if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) |
|---|
| 4024 | | - q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; |
|---|
| 5017 | + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); |
|---|
| 4025 | 5018 | |
|---|
| 4026 | 5019 | /* |
|---|
| 4027 | 5020 | * disk_release() expects a queue ref from add_disk() and will |
|---|
| .. | .. |
|---|
| 4059 | 5052 | (unsigned long long)rbd_dev->mapping.size); |
|---|
| 4060 | 5053 | } |
|---|
| 4061 | 5054 | |
|---|
| 4062 | | -/* |
|---|
| 4063 | | - * Note this shows the features for whatever's mapped, which is not |
|---|
| 4064 | | - * necessarily the base image. |
|---|
| 4065 | | - */ |
|---|
| 4066 | 5055 | static ssize_t rbd_features_show(struct device *dev, |
|---|
| 4067 | 5056 | struct device_attribute *attr, char *buf) |
|---|
| 4068 | 5057 | { |
|---|
| 4069 | 5058 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
|---|
| 4070 | 5059 | |
|---|
| 4071 | | - return sprintf(buf, "0x%016llx\n", |
|---|
| 4072 | | - (unsigned long long)rbd_dev->mapping.features); |
|---|
| 5060 | + return sprintf(buf, "0x%016llx\n", rbd_dev->header.features); |
|---|
| 4073 | 5061 | } |
|---|
| 4074 | 5062 | |
|---|
| 4075 | 5063 | static ssize_t rbd_major_show(struct device *dev, |
|---|
| .. | .. |
|---|
| 4414 | 5402 | INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); |
|---|
| 4415 | 5403 | INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); |
|---|
| 4416 | 5404 | INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); |
|---|
| 4417 | | - init_waitqueue_head(&rbd_dev->lock_waitq); |
|---|
| 5405 | + spin_lock_init(&rbd_dev->lock_lists_lock); |
|---|
| 5406 | + INIT_LIST_HEAD(&rbd_dev->acquiring_list); |
|---|
| 5407 | + INIT_LIST_HEAD(&rbd_dev->running_list); |
|---|
| 5408 | + init_completion(&rbd_dev->acquire_wait); |
|---|
| 5409 | + init_completion(&rbd_dev->releasing_wait); |
|---|
| 5410 | + |
|---|
| 5411 | + spin_lock_init(&rbd_dev->object_map_lock); |
|---|
| 4418 | 5412 | |
|---|
| 4419 | 5413 | rbd_dev->dev.bus = &rbd_bus_type; |
|---|
| 4420 | 5414 | rbd_dev->dev.type = &rbd_device_type; |
|---|
| .. | .. |
|---|
| 4521 | 5515 | |
|---|
| 4522 | 5516 | static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) |
|---|
| 4523 | 5517 | { |
|---|
| 5518 | + size_t size; |
|---|
| 4524 | 5519 | void *reply_buf; |
|---|
| 4525 | 5520 | int ret; |
|---|
| 4526 | 5521 | void *p; |
|---|
| 4527 | 5522 | |
|---|
| 4528 | | - reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); |
|---|
| 5523 | + /* Response will be an encoded string, which includes a length */ |
|---|
| 5524 | + size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; |
|---|
| 5525 | + reply_buf = kzalloc(size, GFP_KERNEL); |
|---|
| 4529 | 5526 | if (!reply_buf) |
|---|
| 4530 | 5527 | return -ENOMEM; |
|---|
| 4531 | 5528 | |
|---|
| 4532 | 5529 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
|---|
| 4533 | 5530 | &rbd_dev->header_oloc, "get_object_prefix", |
|---|
| 4534 | | - NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); |
|---|
| 5531 | + NULL, 0, reply_buf, size); |
|---|
| 4535 | 5532 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
|---|
| 4536 | 5533 | if (ret < 0) |
|---|
| 4537 | 5534 | goto out; |
|---|
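The extra `sizeof(__le32)` covers the length word that prefixes a ceph-encoded string, so a maximum-length object prefix no longer overruns the reply buffer. A sketch of the decode step assumed to follow in this function, using the libceph string helper (illustrative, under that assumption):

```c
	p = reply_buf;
	/* consumes the __le32 length word, then copies out the string */
	rbd_dev->header.object_prefix =
		ceph_extract_encoded_string(&p, p + ret, NULL, GFP_KERNEL);
	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	}
```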
| .. | .. |
|---|
| 4554 | 5551 | } |
|---|
| 4555 | 5552 | |
|---|
| 4556 | 5553 | static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, |
|---|
| 4557 | | - u64 *snap_features) |
|---|
| 5554 | + bool read_only, u64 *snap_features) |
|---|
| 4558 | 5555 | { |
|---|
| 4559 | | - __le64 snapid = cpu_to_le64(snap_id); |
|---|
| 5556 | + struct { |
|---|
| 5557 | + __le64 snap_id; |
|---|
| 5558 | + u8 read_only; |
|---|
| 5559 | + } features_in; |
|---|
| 4560 | 5560 | struct { |
|---|
| 4561 | 5561 | __le64 features; |
|---|
| 4562 | 5562 | __le64 incompat; |
|---|
| .. | .. |
|---|
| 4564 | 5564 | u64 unsup; |
|---|
| 4565 | 5565 | int ret; |
|---|
| 4566 | 5566 | |
|---|
| 5567 | + features_in.snap_id = cpu_to_le64(snap_id); |
|---|
| 5568 | + features_in.read_only = read_only; |
|---|
| 5569 | + |
|---|
| 4567 | 5570 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
|---|
| 4568 | 5571 | &rbd_dev->header_oloc, "get_features", |
|---|
| 4569 | | - &snapid, sizeof(snapid), |
|---|
| 5572 | + &features_in, sizeof(features_in), |
|---|
| 4570 | 5573 | &features_buf, sizeof(features_buf)); |
|---|
| 4571 | 5574 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
|---|
| 4572 | 5575 | if (ret < 0) |
|---|
| .. | .. |
|---|
| 4594 | 5597 | static int rbd_dev_v2_features(struct rbd_device *rbd_dev) |
|---|
| 4595 | 5598 | { |
|---|
| 4596 | 5599 | return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, |
|---|
| 4597 | | - &rbd_dev->header.features); |
|---|
| 5600 | + rbd_is_ro(rbd_dev), |
|---|
| 5601 | + &rbd_dev->header.features); |
|---|
| 5602 | +} |
|---|
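`rbd_is_ro()` is assumed to be a small helper added earlier in this patch, wrapping the new `RBD_DEV_FLAG_READONLY` bit that `do_rbd_add()` sets further down; passing it as the new `read_only` argument presumably lets the OSDs report a weaker incompat feature mask for read-only mappings. A sketch under that assumption:

```c
/* assumed helper; RBD_DEV_FLAG_READONLY is set in do_rbd_add() below */
static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}
```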
| 5603 | + |
|---|
| 5604 | +/* |
|---|
| 5605 | + * These are generic image flags, but since they are used only for |
|---|
| 5606 | + * object map, store them in rbd_dev->object_map_flags. |
|---|
| 5607 | + * |
|---|
| 5608 | + * For the same reason, this function is called only on object map |
|---|
| 5609 | + * (re)load and not on header refresh. |
|---|
| 5610 | + */ |
|---|
| 5611 | +static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev) |
|---|
| 5612 | +{ |
|---|
| 5613 | + __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id); |
|---|
| 5614 | + __le64 flags; |
|---|
| 5615 | + int ret; |
|---|
| 5616 | + |
|---|
| 5617 | + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
|---|
| 5618 | + &rbd_dev->header_oloc, "get_flags", |
|---|
| 5619 | + &snapid, sizeof(snapid), |
|---|
| 5620 | + &flags, sizeof(flags)); |
|---|
| 5621 | + if (ret < 0) |
|---|
| 5622 | + return ret; |
|---|
| 5623 | + if (ret < sizeof(flags)) |
|---|
| 5624 | + return -EBADMSG; |
|---|
| 5625 | + |
|---|
| 5626 | + rbd_dev->object_map_flags = le64_to_cpu(flags); |
|---|
| 5627 | + return 0; |
|---|
| 4598 | 5628 | } |
|---|
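A sketch of how the stored flags are presumably consumed by `rbd_object_map_load()`; the flag name follows the rbd on-disk format and should be treated as an assumption here:

```c
	/* refuse the object map if the OSDs have flagged it invalid */
	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
		return -EINVAL;
```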
| 4599 | 5629 | |
|---|
| 4600 | 5630 | struct parent_image_info { |
|---|
| .. | .. |
|---|
| 4654 | 5684 | |
|---|
| 4655 | 5685 | ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, |
|---|
| 4656 | 5686 | "rbd", "parent_get", CEPH_OSD_FLAG_READ, |
|---|
| 4657 | | - req_page, sizeof(u64), reply_page, &reply_len); |
|---|
| 5687 | + req_page, sizeof(u64), &reply_page, &reply_len); |
|---|
| 4658 | 5688 | if (ret) |
|---|
| 4659 | 5689 | return ret == -EOPNOTSUPP ? 1 : ret; |
|---|
| 4660 | 5690 | |
|---|
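This hunk and the two that follow track a libceph interface change: the reply page is now passed by reference (`struct page **`) so the callee can describe the reply as a page vector. The prototype assumed by these call sites:

```c
int ceph_osdc_call(struct ceph_osd_client *osdc,
		   struct ceph_object_id *oid,
		   struct ceph_object_locator *oloc,
		   const char *class, const char *method,
		   unsigned int flags,
		   struct page *req_page, size_t req_len,
		   struct page **resp_pages, size_t *resp_len);
```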
| .. | .. |
|---|
| 4666 | 5696 | |
|---|
| 4667 | 5697 | ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, |
|---|
| 4668 | 5698 | "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ, |
|---|
| 4669 | | - req_page, sizeof(u64), reply_page, &reply_len); |
|---|
| 5699 | + req_page, sizeof(u64), &reply_page, &reply_len); |
|---|
| 4670 | 5700 | if (ret) |
|---|
| 4671 | 5701 | return ret; |
|---|
| 4672 | 5702 | |
|---|
| .. | .. |
|---|
| 4697 | 5727 | |
|---|
| 4698 | 5728 | ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, |
|---|
| 4699 | 5729 | "rbd", "get_parent", CEPH_OSD_FLAG_READ, |
|---|
| 4700 | | - req_page, sizeof(u64), reply_page, &reply_len); |
|---|
| 5730 | + req_page, sizeof(u64), &reply_page, &reply_len); |
|---|
| 4701 | 5731 | if (ret) |
|---|
| 4702 | 5732 | return ret; |
|---|
| 4703 | 5733 | |
|---|
| .. | .. |
|---|
| 5275 | 6305 | return dup; |
|---|
| 5276 | 6306 | } |
|---|
| 5277 | 6307 | |
|---|
| 6308 | +static int rbd_parse_param(struct fs_parameter *param, |
|---|
| 6309 | + struct rbd_parse_opts_ctx *pctx) |
|---|
| 6310 | +{ |
|---|
| 6311 | + struct rbd_options *opt = pctx->opts; |
|---|
| 6312 | + struct fs_parse_result result; |
|---|
| 6313 | + struct p_log log = {.prefix = "rbd"}; |
|---|
| 6314 | + int token, ret; |
|---|
| 6315 | + |
|---|
| 6316 | + ret = ceph_parse_param(param, pctx->copts, NULL); |
|---|
| 6317 | + if (ret != -ENOPARAM) |
|---|
| 6318 | + return ret; |
|---|
| 6319 | + |
|---|
| 6320 | + token = __fs_parse(&log, rbd_parameters, param, &result); |
|---|
| 6321 | + dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); |
|---|
| 6322 | + if (token < 0) { |
|---|
| 6323 | + if (token == -ENOPARAM) |
|---|
| 6324 | + return inval_plog(&log, "Unknown parameter '%s'", |
|---|
| 6325 | + param->key); |
|---|
| 6326 | + return token; |
|---|
| 6327 | + } |
|---|
| 6328 | + |
|---|
| 6329 | + switch (token) { |
|---|
| 6330 | + case Opt_queue_depth: |
|---|
| 6331 | + if (result.uint_32 < 1) |
|---|
| 6332 | + goto out_of_range; |
|---|
| 6333 | + opt->queue_depth = result.uint_32; |
|---|
| 6334 | + break; |
|---|
| 6335 | + case Opt_alloc_size: |
|---|
| 6336 | + if (result.uint_32 < SECTOR_SIZE) |
|---|
| 6337 | + goto out_of_range; |
|---|
| 6338 | + if (!is_power_of_2(result.uint_32)) |
|---|
| 6339 | + return inval_plog(&log, "alloc_size must be a power of 2"); |
|---|
| 6340 | + opt->alloc_size = result.uint_32; |
|---|
| 6341 | + break; |
|---|
| 6342 | + case Opt_lock_timeout: |
|---|
| 6343 | + /* 0 is "wait forever" (i.e. infinite timeout) */ |
|---|
| 6344 | + if (result.uint_32 > INT_MAX / 1000) |
|---|
| 6345 | + goto out_of_range; |
|---|
| 6346 | + opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000); |
|---|
| 6347 | + break; |
|---|
| 6348 | + case Opt_pool_ns: |
|---|
| 6349 | + kfree(pctx->spec->pool_ns); |
|---|
| 6350 | + pctx->spec->pool_ns = param->string; |
|---|
| 6351 | + param->string = NULL; |
|---|
| 6352 | + break; |
|---|
| 6353 | + case Opt_compression_hint: |
|---|
| 6354 | + switch (result.uint_32) { |
|---|
| 6355 | + case Opt_compression_hint_none: |
|---|
| 6356 | + opt->alloc_hint_flags &= |
|---|
| 6357 | + ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE | |
|---|
| 6358 | + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE); |
|---|
| 6359 | + break; |
|---|
| 6360 | + case Opt_compression_hint_compressible: |
|---|
| 6361 | + opt->alloc_hint_flags |= |
|---|
| 6362 | + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; |
|---|
| 6363 | + opt->alloc_hint_flags &= |
|---|
| 6364 | + ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; |
|---|
| 6365 | + break; |
|---|
| 6366 | + case Opt_compression_hint_incompressible: |
|---|
| 6367 | + opt->alloc_hint_flags |= |
|---|
| 6368 | + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; |
|---|
| 6369 | + opt->alloc_hint_flags &= |
|---|
| 6370 | + ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; |
|---|
| 6371 | + break; |
|---|
| 6372 | + default: |
|---|
| 6373 | + BUG(); |
|---|
| 6374 | + } |
|---|
| 6375 | + break; |
|---|
| 6376 | + case Opt_read_only: |
|---|
| 6377 | + opt->read_only = true; |
|---|
| 6378 | + break; |
|---|
| 6379 | + case Opt_read_write: |
|---|
| 6380 | + opt->read_only = false; |
|---|
| 6381 | + break; |
|---|
| 6382 | + case Opt_lock_on_read: |
|---|
| 6383 | + opt->lock_on_read = true; |
|---|
| 6384 | + break; |
|---|
| 6385 | + case Opt_exclusive: |
|---|
| 6386 | + opt->exclusive = true; |
|---|
| 6387 | + break; |
|---|
| 6388 | + case Opt_notrim: |
|---|
| 6389 | + opt->trim = false; |
|---|
| 6390 | + break; |
|---|
| 6391 | + default: |
|---|
| 6392 | + BUG(); |
|---|
| 6393 | + } |
|---|
| 6394 | + |
|---|
| 6395 | + return 0; |
|---|
| 6396 | + |
|---|
| 6397 | +out_of_range: |
|---|
| 6398 | + return inval_plog(&log, "%s out of range", param->key); |
|---|
| 6399 | +} |
|---|
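`rbd_parameters` is the `fs_parameter_spec` table that `__fs_parse()` matches against; it is defined earlier in the patch, outside these hunks. A sketch of its assumed shape, with the key strings inferred from the `Opt_*` cases above (the exact names, and the omitted `compression_hint` enum entry, are assumptions):

```c
static const struct fs_parameter_spec rbd_parameters[] = {
	fsparam_u32	("alloc_size",		Opt_alloc_size),
	fsparam_flag	("exclusive",		Opt_exclusive),
	fsparam_flag	("lock_on_read",	Opt_lock_on_read),
	fsparam_u32	("lock_timeout",	Opt_lock_timeout),
	fsparam_flag	("notrim",		Opt_notrim),
	fsparam_string	("_pool_ns",		Opt_pool_ns),
	fsparam_u32	("queue_depth",		Opt_queue_depth),
	fsparam_flag	("read_only",		Opt_read_only),
	fsparam_flag	("read_write",		Opt_read_write),
	{}
};
```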
| 6400 | + |
|---|
| 6401 | +/* |
|---|
| 6402 | + * This duplicates most of generic_parse_monolithic(), untying it from |
|---|
| 6403 | + * fs_context and skipping standard superblock and security options. |
|---|
| 6404 | + */ |
|---|
| 6405 | +static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx) |
|---|
| 6406 | +{ |
|---|
| 6407 | + char *key; |
|---|
| 6408 | + int ret = 0; |
|---|
| 6409 | + |
|---|
| 6410 | + dout("%s '%s'\n", __func__, options); |
|---|
| 6411 | + while ((key = strsep(&options, ",")) != NULL) { |
|---|
| 6412 | + if (*key) { |
|---|
| 6413 | + struct fs_parameter param = { |
|---|
| 6414 | + .key = key, |
|---|
| 6415 | + .type = fs_value_is_flag, |
|---|
| 6416 | + }; |
|---|
| 6417 | + char *value = strchr(key, '='); |
|---|
| 6418 | + size_t v_len = 0; |
|---|
| 6419 | + |
|---|
| 6420 | + if (value) { |
|---|
| 6421 | + if (value == key) |
|---|
| 6422 | + continue; |
|---|
| 6423 | + *value++ = 0; |
|---|
| 6424 | + v_len = strlen(value); |
|---|
| 6425 | + param.string = kmemdup_nul(value, v_len, |
|---|
| 6426 | + GFP_KERNEL); |
|---|
| 6427 | + if (!param.string) |
|---|
| 6428 | + return -ENOMEM; |
|---|
| 6429 | + param.type = fs_value_is_string; |
|---|
| 6430 | + } |
|---|
| 6431 | + param.size = v_len; |
|---|
| 6432 | + |
|---|
| 6433 | + ret = rbd_parse_param(¶m, pctx); |
|---|
| 6434 | + kfree(param.string); |
|---|
| 6435 | + if (ret) |
|---|
| 6436 | + break; |
|---|
| 6437 | + } |
|---|
| 6438 | + } |
|---|
| 6439 | + |
|---|
| 6440 | + return ret; |
|---|
| 6441 | +} |
|---|
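A worked example of the loop above: the options string is split on commas, and each `key[=value]` token becomes one `struct fs_parameter` before `rbd_parse_param()` is called:

```c
/*
 * "queue_depth=128,read_only,_pool_ns=ns1" yields three calls:
 *   rbd_parse_param({ .key = "queue_depth", .string = "128",
 *                     .type = fs_value_is_string, .size = 3 }, pctx)
 *   rbd_parse_param({ .key = "read_only",
 *                     .type = fs_value_is_flag,   .size = 0 }, pctx)
 *   rbd_parse_param({ .key = "_pool_ns", .string = "ns1",
 *                     .type = fs_value_is_string, .size = 3 }, pctx)
 */
```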
| 6442 | + |
|---|
| 5278 | 6443 | /* |
|---|
| 5279 | 6444 | * Parse the options provided for an "rbd add" (i.e., rbd image |
|---|
| 5280 | 6445 | * mapping) request. These arrive via a write to /sys/bus/rbd/add, |
|---|
| .. | .. |
|---|
| 5326 | 6491 | const char *mon_addrs; |
|---|
| 5327 | 6492 | char *snap_name; |
|---|
| 5328 | 6493 | size_t mon_addrs_size; |
|---|
| 5329 | | - struct parse_rbd_opts_ctx pctx = { 0 }; |
|---|
| 5330 | | - struct ceph_options *copts; |
|---|
| 6494 | + struct rbd_parse_opts_ctx pctx = { 0 }; |
|---|
| 5331 | 6495 | int ret; |
|---|
| 5332 | 6496 | |
|---|
| 5333 | 6497 | /* The first four tokens are required */ |
|---|
| .. | .. |
|---|
| 5338 | 6502 | return -EINVAL; |
|---|
| 5339 | 6503 | } |
|---|
| 5340 | 6504 | mon_addrs = buf; |
|---|
| 5341 | | - mon_addrs_size = len + 1; |
|---|
| 6505 | + mon_addrs_size = len; |
|---|
| 5342 | 6506 | buf += len; |
|---|
| 5343 | 6507 | |
|---|
| 5344 | 6508 | ret = -EINVAL; |
|---|
| .. | .. |
|---|
| 5388 | 6552 | *(snap_name + len) = '\0'; |
|---|
| 5389 | 6553 | pctx.spec->snap_name = snap_name; |
|---|
| 5390 | 6554 | |
|---|
| 6555 | + pctx.copts = ceph_alloc_options(); |
|---|
| 6556 | + if (!pctx.copts) |
|---|
| 6557 | + goto out_mem; |
|---|
| 6558 | + |
|---|
| 5391 | 6559 | /* Initialize all rbd options to the defaults */ |
|---|
| 5392 | 6560 | |
|---|
| 5393 | 6561 | pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL); |
|---|
| .. | .. |
|---|
| 5396 | 6564 | |
|---|
| 5397 | 6565 | pctx.opts->read_only = RBD_READ_ONLY_DEFAULT; |
|---|
| 5398 | 6566 | pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; |
|---|
| 6567 | + pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT; |
|---|
| 5399 | 6568 | pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; |
|---|
| 5400 | 6569 | pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; |
|---|
| 5401 | 6570 | pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; |
|---|
| 5402 | 6571 | pctx.opts->trim = RBD_TRIM_DEFAULT; |
|---|
| 5403 | 6572 | |
|---|
| 5404 | | - copts = ceph_parse_options(options, mon_addrs, |
|---|
| 5405 | | - mon_addrs + mon_addrs_size - 1, |
|---|
| 5406 | | - parse_rbd_opts_token, &pctx); |
|---|
| 5407 | | - if (IS_ERR(copts)) { |
|---|
| 5408 | | - ret = PTR_ERR(copts); |
|---|
| 6573 | + ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL); |
|---|
| 6574 | + if (ret) |
|---|
| 5409 | 6575 | goto out_err; |
|---|
| 5410 | | - } |
|---|
| 5411 | | - kfree(options); |
|---|
| 5412 | 6576 | |
|---|
| 5413 | | - *ceph_opts = copts; |
|---|
| 6577 | + ret = rbd_parse_options(options, &pctx); |
|---|
| 6578 | + if (ret) |
|---|
| 6579 | + goto out_err; |
|---|
| 6580 | + |
|---|
| 6581 | + *ceph_opts = pctx.copts; |
|---|
| 5414 | 6582 | *opts = pctx.opts; |
|---|
| 5415 | 6583 | *rbd_spec = pctx.spec; |
|---|
| 5416 | | - |
|---|
| 6584 | + kfree(options); |
|---|
| 5417 | 6585 | return 0; |
|---|
| 6586 | + |
|---|
| 5418 | 6587 | out_mem: |
|---|
| 5419 | 6588 | ret = -ENOMEM; |
|---|
| 5420 | 6589 | out_err: |
|---|
| 5421 | 6590 | kfree(pctx.opts); |
|---|
| 6591 | + ceph_destroy_options(pctx.copts); |
|---|
| 5422 | 6592 | rbd_spec_put(pctx.spec); |
|---|
| 5423 | 6593 | kfree(options); |
|---|
| 5424 | | - |
|---|
| 5425 | 6594 | return ret; |
|---|
| 5426 | 6595 | } |
|---|
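Monitor address parsing and option parsing are now two separate steps, replacing the single `ceph_parse_options()` call; note that `mon_addrs_size` no longer includes the terminating NUL because the helper takes an explicit length. The libceph prototype assumed by the call above:

```c
/* parses a comma-separated monitor address list into *opt */
int ceph_parse_mon_ips(const char *buf, size_t len,
		       struct ceph_options *opt, struct fs_context *fc);
```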
| 5427 | 6596 | |
|---|
| .. | .. |
|---|
| 5429 | 6598 | { |
|---|
| 5430 | 6599 | down_write(&rbd_dev->lock_rwsem); |
|---|
| 5431 | 6600 | if (__rbd_is_lock_owner(rbd_dev)) |
|---|
| 5432 | | - rbd_unlock(rbd_dev); |
|---|
| 6601 | + __rbd_release_lock(rbd_dev); |
|---|
| 5433 | 6602 | up_write(&rbd_dev->lock_rwsem); |
|---|
| 5434 | 6603 | } |
|---|
| 5435 | 6604 | |
|---|
| 6605 | +/* |
|---|
| 6606 | + * If the wait is interrupted, an error is returned even if the lock |
|---|
| 6607 | + * was successfully acquired. rbd_dev_image_unlock() will release it |
|---|
| 6608 | + * if needed. |
|---|
| 6609 | + */ |
|---|
| 5436 | 6610 | static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) |
|---|
| 5437 | 6611 | { |
|---|
| 5438 | | - int ret; |
|---|
| 6612 | + long ret; |
|---|
| 5439 | 6613 | |
|---|
| 5440 | 6614 | if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { |
|---|
| 6615 | + if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read) |
|---|
| 6616 | + return 0; |
|---|
| 6617 | + |
|---|
| 5441 | 6618 | rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); |
|---|
| 5442 | 6619 | return -EINVAL; |
|---|
| 5443 | 6620 | } |
|---|
| 5444 | 6621 | |
|---|
| 5445 | | - /* FIXME: "rbd map --exclusive" should be in interruptible */ |
|---|
| 5446 | | - down_read(&rbd_dev->lock_rwsem); |
|---|
| 5447 | | - ret = rbd_wait_state_locked(rbd_dev, true); |
|---|
| 5448 | | - up_read(&rbd_dev->lock_rwsem); |
|---|
| 5449 | | - if (ret) { |
|---|
| 5450 | | - rbd_warn(rbd_dev, "failed to acquire exclusive lock"); |
|---|
| 5451 | | - return -EROFS; |
|---|
| 6622 | + if (rbd_is_ro(rbd_dev)) |
|---|
| 6623 | + return 0; |
|---|
| 6624 | + |
|---|
| 6625 | + rbd_assert(!rbd_is_lock_owner(rbd_dev)); |
|---|
| 6626 | + queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); |
|---|
| 6627 | + ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait, |
|---|
| 6628 | + ceph_timeout_jiffies(rbd_dev->opts->lock_timeout)); |
|---|
| 6629 | + if (ret > 0) { |
|---|
| 6630 | + ret = rbd_dev->acquire_err; |
|---|
| 6631 | + } else { |
|---|
| 6632 | + cancel_delayed_work_sync(&rbd_dev->lock_dwork); |
|---|
| 6633 | + if (!ret) |
|---|
| 6634 | + ret = -ETIMEDOUT; |
|---|
| 5452 | 6635 | } |
|---|
| 5453 | 6636 | |
|---|
| 6637 | + if (ret) { |
|---|
| 6638 | + rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret); |
|---|
| 6639 | + return ret; |
|---|
| 6640 | + } |
|---|
| 6641 | + |
|---|
| 6642 | + /* |
|---|
| 6643 | + * The lock may have been released by now, unless automatic lock |
|---|
| 6644 | + * transitions are disabled. |
|---|
| 6645 | + */ |
|---|
| 6646 | + rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); |
|---|
| 5454 | 6647 | return 0; |
|---|
| 5455 | 6648 | } |
|---|
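A `lock_timeout` of 0 still means "wait forever" (per the option-parsing comment earlier) because of how the libceph helper treats zero; its assumed definition:

```c
/* assumed, from linux/ceph/libceph.h: zero selects an infinite wait */
static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
{
	return timeout ?: MAX_SCHEDULE_TIMEOUT;
}
```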
| 5456 | 6649 | |
|---|
| .. | .. |
|---|
| 5500 | 6693 | dout("rbd id object name is %s\n", oid.name); |
|---|
| 5501 | 6694 | |
|---|
| 5502 | 6695 | /* Response will be an encoded string, which includes a length */ |
|---|
| 5503 | | - |
|---|
| 5504 | 6696 | size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; |
|---|
| 5505 | 6697 | response = kzalloc(size, GFP_NOIO); |
|---|
| 5506 | 6698 | if (!response) { |
|---|
| .. | .. |
|---|
| 5512 | 6704 | |
|---|
| 5513 | 6705 | ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, |
|---|
| 5514 | 6706 | "get_id", NULL, 0, |
|---|
| 5515 | | - response, RBD_IMAGE_ID_LEN_MAX); |
|---|
| 6707 | + response, size); |
|---|
| 5516 | 6708 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
|---|
| 5517 | 6709 | if (ret == -ENOENT) { |
|---|
| 5518 | 6710 | image_id = kstrdup("", GFP_KERNEL); |
|---|
| .. | .. |
|---|
| 5548 | 6740 | struct rbd_image_header *header; |
|---|
| 5549 | 6741 | |
|---|
| 5550 | 6742 | rbd_dev_parent_put(rbd_dev); |
|---|
| 6743 | + rbd_object_map_free(rbd_dev); |
|---|
| 6744 | + rbd_dev_mapping_clear(rbd_dev); |
|---|
| 5551 | 6745 | |
|---|
| 5552 | 6746 | /* Free dynamic fields from the header, then zero it out */ |
|---|
| 5553 | 6747 | |
|---|
| .. | .. |
|---|
| 5631 | 6825 | __rbd_get_client(rbd_dev->rbd_client); |
|---|
| 5632 | 6826 | rbd_spec_get(rbd_dev->parent_spec); |
|---|
| 5633 | 6827 | |
|---|
| 6828 | + __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags); |
|---|
| 6829 | + |
|---|
| 5634 | 6830 | ret = rbd_dev_image_probe(parent, depth); |
|---|
| 5635 | 6831 | if (ret < 0) |
|---|
| 5636 | 6832 | goto out_err; |
|---|
| .. | .. |
|---|
| 5648 | 6844 | static void rbd_dev_device_release(struct rbd_device *rbd_dev) |
|---|
| 5649 | 6845 | { |
|---|
| 5650 | 6846 | clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); |
|---|
| 5651 | | - rbd_dev_mapping_clear(rbd_dev); |
|---|
| 5652 | 6847 | rbd_free_disk(rbd_dev); |
|---|
| 5653 | 6848 | if (!single_major) |
|---|
| 5654 | 6849 | unregister_blkdev(rbd_dev->major, rbd_dev->name); |
|---|
| .. | .. |
|---|
| 5682 | 6877 | if (ret) |
|---|
| 5683 | 6878 | goto err_out_blkdev; |
|---|
| 5684 | 6879 | |
|---|
| 5685 | | - ret = rbd_dev_mapping_set(rbd_dev); |
|---|
| 5686 | | - if (ret) |
|---|
| 5687 | | - goto err_out_disk; |
|---|
| 5688 | | - |
|---|
| 5689 | 6880 | set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); |
|---|
| 5690 | | - set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); |
|---|
| 6881 | + set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev)); |
|---|
| 5691 | 6882 | |
|---|
| 5692 | 6883 | ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); |
|---|
| 5693 | 6884 | if (ret) |
|---|
| 5694 | | - goto err_out_mapping; |
|---|
| 6885 | + goto err_out_disk; |
|---|
| 5695 | 6886 | |
|---|
| 5696 | 6887 | set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); |
|---|
| 5697 | 6888 | up_write(&rbd_dev->header_rwsem); |
|---|
| 5698 | 6889 | return 0; |
|---|
| 5699 | 6890 | |
|---|
| 5700 | | -err_out_mapping: |
|---|
| 5701 | | - rbd_dev_mapping_clear(rbd_dev); |
|---|
| 5702 | 6891 | err_out_disk: |
|---|
| 5703 | 6892 | rbd_free_disk(rbd_dev); |
|---|
| 5704 | 6893 | err_out_blkdev: |
|---|
| .. | .. |
|---|
| 5727 | 6916 | return ret; |
|---|
| 5728 | 6917 | } |
|---|
| 5729 | 6918 | |
|---|
| 6919 | +static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap) |
|---|
| 6920 | +{ |
|---|
| 6921 | + if (!is_snap) { |
|---|
| 6922 | + pr_info("image %s/%s%s%s does not exist\n", |
|---|
| 6923 | + rbd_dev->spec->pool_name, |
|---|
| 6924 | + rbd_dev->spec->pool_ns ?: "", |
|---|
| 6925 | + rbd_dev->spec->pool_ns ? "/" : "", |
|---|
| 6926 | + rbd_dev->spec->image_name); |
|---|
| 6927 | + } else { |
|---|
| 6928 | + pr_info("snap %s/%s%s%s@%s does not exist\n", |
|---|
| 6929 | + rbd_dev->spec->pool_name, |
|---|
| 6930 | + rbd_dev->spec->pool_ns ?: "", |
|---|
| 6931 | + rbd_dev->spec->pool_ns ? "/" : "", |
|---|
| 6932 | + rbd_dev->spec->image_name, |
|---|
| 6933 | + rbd_dev->spec->snap_name); |
|---|
| 6934 | + } |
|---|
| 6935 | +} |
|---|
| 6936 | + |
|---|
| 5730 | 6937 | static void rbd_dev_image_release(struct rbd_device *rbd_dev) |
|---|
| 5731 | 6938 | { |
|---|
| 5732 | | - if (rbd_dev->opts) |
|---|
| 6939 | + if (!rbd_is_ro(rbd_dev)) |
|---|
| 5733 | 6940 | rbd_unregister_watch(rbd_dev); |
|---|
| 5734 | 6941 | |
|---|
| 5735 | 6942 | rbd_dev_unprobe(rbd_dev); |
|---|
| .. | .. |
|---|
| 5749 | 6956 | */ |
|---|
| 5750 | 6957 | static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) |
|---|
| 5751 | 6958 | { |
|---|
| 6959 | + bool need_watch = !rbd_is_ro(rbd_dev); |
|---|
| 5752 | 6960 | int ret; |
|---|
| 5753 | 6961 | |
|---|
| 5754 | 6962 | /* |
|---|
| .. | .. |
|---|
| 5765 | 6973 | if (ret) |
|---|
| 5766 | 6974 | goto err_out_format; |
|---|
| 5767 | 6975 | |
|---|
| 5768 | | - if (!depth) { |
|---|
| 6976 | + if (need_watch) { |
|---|
| 5769 | 6977 | ret = rbd_register_watch(rbd_dev); |
|---|
| 5770 | 6978 | if (ret) { |
|---|
| 5771 | 6979 | if (ret == -ENOENT) |
|---|
| 5772 | | - pr_info("image %s/%s%s%s does not exist\n", |
|---|
| 5773 | | - rbd_dev->spec->pool_name, |
|---|
| 5774 | | - rbd_dev->spec->pool_ns ?: "", |
|---|
| 5775 | | - rbd_dev->spec->pool_ns ? "/" : "", |
|---|
| 5776 | | - rbd_dev->spec->image_name); |
|---|
| 6980 | + rbd_print_dne(rbd_dev, false); |
|---|
| 5777 | 6981 | goto err_out_format; |
|---|
| 5778 | 6982 | } |
|---|
| 5779 | 6983 | } |
|---|
| .. | .. |
|---|
| 5782 | 6986 | down_write(&rbd_dev->header_rwsem); |
|---|
| 5783 | 6987 | |
|---|
| 5784 | 6988 | ret = rbd_dev_header_info(rbd_dev); |
|---|
| 5785 | | - if (ret) |
|---|
| 6989 | + if (ret) { |
|---|
| 6990 | + if (ret == -ENOENT && !need_watch) |
|---|
| 6991 | + rbd_print_dne(rbd_dev, false); |
|---|
| 5786 | 6992 | goto err_out_probe; |
|---|
| 6993 | + } |
|---|
| 5787 | 6994 | |
|---|
| 5788 | 6995 | /* |
|---|
| 5789 | 6996 | * If this image is the one being mapped, we have pool name and |
|---|
| .. | .. |
|---|
| 5797 | 7004 | ret = rbd_spec_fill_names(rbd_dev); |
|---|
| 5798 | 7005 | if (ret) { |
|---|
| 5799 | 7006 | if (ret == -ENOENT) |
|---|
| 5800 | | - pr_info("snap %s/%s%s%s@%s does not exist\n", |
|---|
| 5801 | | - rbd_dev->spec->pool_name, |
|---|
| 5802 | | - rbd_dev->spec->pool_ns ?: "", |
|---|
| 5803 | | - rbd_dev->spec->pool_ns ? "/" : "", |
|---|
| 5804 | | - rbd_dev->spec->image_name, |
|---|
| 5805 | | - rbd_dev->spec->snap_name); |
|---|
| 7007 | + rbd_print_dne(rbd_dev, true); |
|---|
| 5806 | 7008 | goto err_out_probe; |
|---|
| 7009 | + } |
|---|
| 7010 | + |
|---|
| 7011 | + ret = rbd_dev_mapping_set(rbd_dev); |
|---|
| 7012 | + if (ret) |
|---|
| 7013 | + goto err_out_probe; |
|---|
| 7014 | + |
|---|
| 7015 | + if (rbd_is_snap(rbd_dev) && |
|---|
| 7016 | + (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) { |
|---|
| 7017 | + ret = rbd_object_map_load(rbd_dev); |
|---|
| 7018 | + if (ret) |
|---|
| 7019 | + goto err_out_probe; |
|---|
| 5807 | 7020 | } |
|---|
| 5808 | 7021 | |
|---|
| 5809 | 7022 | if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { |
|---|
| 5810 | 7023 | ret = rbd_dev_v2_parent_info(rbd_dev); |
|---|
| 5811 | 7024 | if (ret) |
|---|
| 5812 | 7025 | goto err_out_probe; |
|---|
| 5813 | | - |
|---|
| 5814 | | - /* |
|---|
| 5815 | | - * Need to warn users if this image is the one being |
|---|
| 5816 | | - * mapped and has a parent. |
|---|
| 5817 | | - */ |
|---|
| 5818 | | - if (!depth && rbd_dev->parent_spec) |
|---|
| 5819 | | - rbd_warn(rbd_dev, |
|---|
| 5820 | | - "WARNING: kernel layering is EXPERIMENTAL!"); |
|---|
| 5821 | 7026 | } |
|---|
| 5822 | 7027 | |
|---|
| 5823 | 7028 | ret = rbd_dev_probe_parent(rbd_dev, depth); |
|---|
| .. | .. |
|---|
| 5831 | 7036 | err_out_probe: |
|---|
| 5832 | 7037 | if (!depth) |
|---|
| 5833 | 7038 | up_write(&rbd_dev->header_rwsem); |
|---|
| 5834 | | - if (!depth) |
|---|
| 7039 | + if (need_watch) |
|---|
| 5835 | 7040 | rbd_unregister_watch(rbd_dev); |
|---|
| 5836 | 7041 | rbd_dev_unprobe(rbd_dev); |
|---|
| 5837 | 7042 | err_out_format: |
|---|
| .. | .. |
|---|
| 5887 | 7092 | spec = NULL; /* rbd_dev now owns this */ |
|---|
| 5888 | 7093 | rbd_opts = NULL; /* rbd_dev now owns this */ |
|---|
| 5889 | 7094 | |
|---|
| 7095 | + /* if we are mapping a snapshot it will be a read-only mapping */ |
|---|
| 7096 | + if (rbd_dev->opts->read_only || |
|---|
| 7097 | + strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME)) |
|---|
| 7098 | + __set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags); |
|---|
| 7099 | + |
|---|
| 5890 | 7100 | rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); |
|---|
| 5891 | 7101 | if (!rbd_dev->config_info) { |
|---|
| 5892 | 7102 | rc = -ENOMEM; |
|---|
| .. | .. |
|---|
| 5897 | 7107 | if (rc < 0) |
|---|
| 5898 | 7108 | goto err_out_rbd_dev; |
|---|
| 5899 | 7109 | |
|---|
| 5900 | | - /* If we are mapping a snapshot it must be marked read-only */ |
|---|
| 5901 | | - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) |
|---|
| 5902 | | - rbd_dev->opts->read_only = true; |
|---|
| 7110 | + if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { |
|---|
| 7111 | + rbd_warn(rbd_dev, "alloc_size adjusted to %u", |
|---|
| 7112 | + rbd_dev->layout.object_size); |
|---|
| 7113 | + rbd_dev->opts->alloc_size = rbd_dev->layout.object_size; |
|---|
| 7114 | + } |
|---|
| 5903 | 7115 | |
|---|
| 5904 | 7116 | rc = rbd_dev_device_setup(rbd_dev); |
|---|
| 5905 | 7117 | if (rc) |
|---|
| 5906 | 7118 | goto err_out_image_probe; |
|---|
| 5907 | 7119 | |
|---|
| 5908 | | - if (rbd_dev->opts->exclusive) { |
|---|
| 5909 | | - rc = rbd_add_acquire_lock(rbd_dev); |
|---|
| 5910 | | - if (rc) |
|---|
| 5911 | | - goto err_out_device_setup; |
|---|
| 5912 | | - } |
|---|
| 7120 | + rc = rbd_add_acquire_lock(rbd_dev); |
|---|
| 7121 | + if (rc) |
|---|
| 7122 | + goto err_out_image_lock; |
|---|
| 5913 | 7123 | |
|---|
| 5914 | 7124 | /* Everything's ready. Announce the disk to the world. */ |
|---|
| 5915 | 7125 | |
|---|
| .. | .. |
|---|
| 5917 | 7127 | if (rc) |
|---|
| 5918 | 7128 | goto err_out_image_lock; |
|---|
| 5919 | 7129 | |
|---|
| 5920 | | - add_disk(rbd_dev->disk); |
|---|
| 7130 | + device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); |
|---|
| 5921 | 7131 | /* see rbd_init_disk() */ |
|---|
| 5922 | 7132 | blk_put_queue(rbd_dev->disk->queue); |
|---|
| 5923 | 7133 | |
|---|
| .. | .. |
|---|
| 5935 | 7145 | |
|---|
| 5936 | 7146 | err_out_image_lock: |
|---|
| 5937 | 7147 | rbd_dev_image_unlock(rbd_dev); |
|---|
| 5938 | | -err_out_device_setup: |
|---|
| 5939 | 7148 | rbd_dev_device_release(rbd_dev); |
|---|
| 5940 | 7149 | err_out_image_probe: |
|---|
| 5941 | 7150 | rbd_dev_image_release(rbd_dev); |
|---|
| .. | .. |
|---|
| 5949 | 7158 | goto out; |
|---|
| 5950 | 7159 | } |
|---|
| 5951 | 7160 | |
|---|
| 5952 | | -static ssize_t rbd_add(struct bus_type *bus, |
|---|
| 5953 | | - const char *buf, |
|---|
| 5954 | | - size_t count) |
|---|
| 7161 | +static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count) |
|---|
| 5955 | 7162 | { |
|---|
| 5956 | 7163 | if (single_major) |
|---|
| 5957 | 7164 | return -EINVAL; |
|---|
| .. | .. |
|---|
| 5959 | 7166 | return do_rbd_add(bus, buf, count); |
|---|
| 5960 | 7167 | } |
|---|
| 5961 | 7168 | |
|---|
| 5962 | | -static ssize_t rbd_add_single_major(struct bus_type *bus, |
|---|
| 5963 | | - const char *buf, |
|---|
| 5964 | | - size_t count) |
|---|
| 7169 | +static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, |
|---|
| 7170 | + size_t count) |
|---|
| 5965 | 7171 | { |
|---|
| 5966 | 7172 | return do_rbd_add(bus, buf, count); |
|---|
| 5967 | 7173 | } |
|---|
| .. | .. |
|---|
| 6067 | 7273 | return count; |
|---|
| 6068 | 7274 | } |
|---|
| 6069 | 7275 | |
|---|
| 6070 | | -static ssize_t rbd_remove(struct bus_type *bus, |
|---|
| 6071 | | - const char *buf, |
|---|
| 6072 | | - size_t count) |
|---|
| 7276 | +static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count) |
|---|
| 6073 | 7277 | { |
|---|
| 6074 | 7278 | if (single_major) |
|---|
| 6075 | 7279 | return -EINVAL; |
|---|
| .. | .. |
|---|
| 6077 | 7281 | return do_rbd_remove(bus, buf, count); |
|---|
| 6078 | 7282 | } |
|---|
| 6079 | 7283 | |
|---|
| 6080 | | -static ssize_t rbd_remove_single_major(struct bus_type *bus, |
|---|
| 6081 | | - const char *buf, |
|---|
| 6082 | | - size_t count) |
|---|
| 7284 | +static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, |
|---|
| 7285 | + size_t count) |
|---|
| 6083 | 7286 | { |
|---|
| 6084 | 7287 | return do_rbd_remove(bus, buf, count); |
|---|
| 6085 | 7288 | } |
|---|
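The `rbd_add()`/`rbd_remove()` renames to `*_store` follow the `BUS_ATTR_WO()` convention, which derives both the sysfs attribute name and the store callback from the function name. The attribute declarations elsewhere in the patch are assumed to become:

```c
static BUS_ATTR_WO(add);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(remove_single_major);
```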
| .. | .. |
|---|
| 6088 | 7291 | * create control files in sysfs |
|---|
| 6089 | 7292 | * /sys/bus/rbd/... |
|---|
| 6090 | 7293 | */ |
|---|
| 6091 | | -static int rbd_sysfs_init(void) |
|---|
| 7294 | +static int __init rbd_sysfs_init(void) |
|---|
| 6092 | 7295 | { |
|---|
| 6093 | 7296 | int ret; |
|---|
| 6094 | 7297 | |
|---|
| .. | .. |
|---|
| 6103 | 7306 | return ret; |
|---|
| 6104 | 7307 | } |
|---|
| 6105 | 7308 | |
|---|
| 6106 | | -static void rbd_sysfs_cleanup(void) |
|---|
| 7309 | +static void __exit rbd_sysfs_cleanup(void) |
|---|
| 6107 | 7310 | { |
|---|
| 6108 | 7311 | bus_unregister(&rbd_bus_type); |
|---|
| 6109 | 7312 | device_unregister(&rbd_root_dev); |
|---|
| 6110 | 7313 | } |
|---|
| 6111 | 7314 | |
|---|
| 6112 | | -static int rbd_slab_init(void) |
|---|
| 7315 | +static int __init rbd_slab_init(void) |
|---|
| 6113 | 7316 | { |
|---|
| 6114 | 7317 | rbd_assert(!rbd_img_request_cache); |
|---|
| 6115 | 7318 | rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); |
|---|