~hc/RK356X_SDK_RELEASE.git

..	..	@@ -1,5 +1,6 @@
1	1	// SPDX-License-Identifier: GPL-2.0
2	2	#include <linux/ceph/ceph_debug.h>
	3	+#include <linux/ceph/striper.h>
3	4
4	5	#include <linux/module.h>
5	6	#include <linux/sched.h>
..	..	@@ -9,10 +10,14 @@
9	10	#include <linux/namei.h>
10	11	#include <linux/writeback.h>
11	12	#include <linux/falloc.h>
	13	+#include <linux/iversion.h>
	14	+#include <linux/ktime.h>
12	15
13	16	#include "super.h"
14	17	#include "mds_client.h"
15	18	#include "cache.h"
	19	+#include "io.h"
	20	+#include "metric.h"
16	21
17	22	static __le32 ceph_flags_sys2wire(u32 flags)
18	23	{
..	..	@@ -177,8 +182,7 @@
177	182	static struct ceph_mds_request *
178	183	prepare_open_request(struct super_block *sb, int flags, int create_mode)
179	184	{
180		- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
181		- struct ceph_mds_client *mdsc = fsc->mdsc;
	185	+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
182	186	struct ceph_mds_request *req;
183	187	int want_auth = USE_ANY_MDS;
184	188	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
..	..	@@ -199,6 +203,7 @@
199	203	static int ceph_init_file_info(struct inode *inode, struct file *file,
200	204	int fmode, bool isdir)
201	205	{
	206	+ struct ceph_inode_info *ci = ceph_inode(inode);
202	207	struct ceph_file_info *fi;
203	208
204	209	dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
..	..	@@ -208,10 +213,8 @@
208	213	if (isdir) {
209	214	struct ceph_dir_file_info *dfi =
210	215	kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
211		- if (!dfi) {
212		- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
	216	+ if (!dfi)
213	217	return -ENOMEM;
214		- }
215	218
216	219	file->private_data = dfi;
217	220	fi = &dfi->file_info;
..	..	@@ -219,17 +222,18 @@
219	222	dfi->readdir_cache_idx = -1;
220	223	} else {
221	224	fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
222		- if (!fi) {
223		- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
	225	+ if (!fi)
224	226	return -ENOMEM;
225		- }
226	227
227	228	file->private_data = fi;
228	229	}
229	230
	231	+ ceph_get_fmode(ci, fmode, 1);
230	232	fi->fmode = fmode;
	233	+
231	234	spin_lock_init(&fi->rw_contexts_lock);
232	235	INIT_LIST_HEAD(&fi->rw_contexts);
	236	+ fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
233	237
234	238	return 0;
235	239	}
..	..	@@ -246,17 +250,15 @@
246	250	case S_IFREG:
247	251	ceph_fscache_register_inode_cookie(inode);
248	252	ceph_fscache_file_set_cookie(inode, file);
	253	+ fallthrough;
249	254	case S_IFDIR:
250	255	ret = ceph_init_file_info(inode, file, fmode,
251	256	S_ISDIR(inode->i_mode));
252		- if (ret)
253		- return ret;
254	257	break;
255	258
256	259	case S_IFLNK:
257	260	dout("init_file %p %p 0%o (symlink)\n", inode, file,
258	261	inode->i_mode);
259		- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
260	262	break;
261	263
262	264	default:
..	..	@@ -266,7 +268,6 @@
266	268	* we need to drop the open ref now, since we don't
267	269	* have .release set to ceph_release.
268	270	*/
269		- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
270	271	BUG_ON(inode->i_fop->release == ceph_release);
271	272
272	273	/* call the proper open fop */
..	..	@@ -278,14 +279,15 @@
278	279	/*
279	280	* try renew caps after session gets killed.
280	281	*/
281		-int ceph_renew_caps(struct inode *inode)
	282	+int ceph_renew_caps(struct inode *inode, int fmode)
282	283	{
283		- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	284	+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
284	285	struct ceph_inode_info *ci = ceph_inode(inode);
285	286	struct ceph_mds_request *req;
286	287	int err, flags, wanted;
287	288
288	289	spin_lock(&ci->i_ceph_lock);
	290	+ __ceph_touch_fmode(ci, mdsc, fmode);
289	291	wanted = __ceph_caps_file_wanted(ci);
290	292	if (__ceph_is_any_real_caps(ci) &&
291	293	(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
..	..	@@ -319,7 +321,6 @@
319	321	req->r_inode = inode;
320	322	ihold(inode);
321	323	req->r_num_caps = 1;
322		- req->r_fmode = -1;
323	324
324	325	err = ceph_mdsc_do_request(mdsc, NULL, req);
325	326	ceph_mdsc_put_request(req);
..	..	@@ -365,9 +366,6 @@
365	366
366	367	/* trivially open snapdir */
367	368	if (ceph_snap(inode) == CEPH_SNAPDIR) {
368		- spin_lock(&ci->i_ceph_lock);
369		- __ceph_get_fmode(ci, fmode);
370		- spin_unlock(&ci->i_ceph_lock);
371	369	return ceph_init_file(inode, file, fmode);
372	370	}
373	371
..	..	@@ -385,7 +383,7 @@
385	383	dout("open %p fmode %d want %s issued %s using existing\n",
386	384	inode, fmode, ceph_cap_string(wanted),
387	385	ceph_cap_string(issued));
388		- __ceph_get_fmode(ci, fmode);
	386	+ __ceph_touch_fmode(ci, mdsc, fmode);
389	387	spin_unlock(&ci->i_ceph_lock);
390	388
391	389	/* adjust wanted? */
..	..	@@ -397,7 +395,7 @@
397	395	return ceph_init_file(inode, file, fmode);
398	396	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
399	397	(ci->i_snap_caps & wanted) == wanted) {
400		- __ceph_get_fmode(ci, fmode);
	398	+ __ceph_touch_fmode(ci, mdsc, fmode);
401	399	spin_unlock(&ci->i_ceph_lock);
402	400	return ceph_init_file(inode, file, fmode);
403	401	}
..	..	@@ -423,6 +421,264 @@
423	421	return err;
424	422	}
425	423
	424	+/* Cache src's layout in dst (the parent dir) after a synchronous create, if dst has Dc caps and no valid cached layout */
	425	+static void
	426	+cache_file_layout(struct inode *dst, struct inode *src)
	427	+{
	428	+ struct ceph_inode_info *cdst = ceph_inode(dst);
	429	+ struct ceph_inode_info *csrc = ceph_inode(src);
	430	+
	431	+ spin_lock(&cdst->i_ceph_lock);
	432	+ if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
	433	+ !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
	434	+ memcpy(&cdst->i_cached_layout, &csrc->i_layout,
	435	+ sizeof(cdst->i_cached_layout));
	436	+ rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
	437	+ ceph_try_get_string(csrc->i_layout.pool_ns));
	438	+ }
	439	+ spin_unlock(&cdst->i_ceph_lock);
	440	+}
	441	+
	442	+/*
	443	+ * Try to set up an async create. We need caps, a file layout, an inode number,
	444	+ * and either a lease on the dentry or complete dir info. If any of those
	445	+ * criteria are not satisfied, return 0 and the caller can go synchronous;
	446	+ * otherwise fill in *lo and *pino and return the cap bits that were taken.
	447	+ */
	448	+static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
	449	+ struct ceph_file_layout *lo, u64 *pino)
	450	+{
	451	+ struct ceph_inode_info *ci = ceph_inode(dir);
	452	+ struct ceph_dentry_info *di = ceph_dentry(dentry);
	453	+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
	454	+ u64 ino;
	455	+
	456	+ spin_lock(&ci->i_ceph_lock);
	457	+ /* No auth cap means no chance for Dc caps */
	458	+ if (!ci->i_auth_cap)
	459	+ goto no_async;
	460	+
	461	+ /* Any delegated inos? */
	462	+ if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
	463	+ goto no_async;
	464	+
	465	+ if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
	466	+ goto no_async;
	467	+
	468	+ if ((__ceph_caps_issued(ci, NULL) & want) != want)
	469	+ goto no_async;
	470	+
	471	+ if (d_in_lookup(dentry)) {
	472	+ if (!__ceph_dir_is_complete(ci))
	473	+ goto no_async;
	474	+ spin_lock(&dentry->d_lock);
	475	+ di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
	476	+ spin_unlock(&dentry->d_lock);
	477	+ } else if (atomic_read(&ci->i_shared_gen) !=
	478	+ READ_ONCE(di->lease_shared_gen)) {
	479	+ goto no_async;
	480	+ }
	481	+
	482	+ ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
	483	+ if (!ino)
	484	+ goto no_async;
	485	+
	486	+ *pino = ino;
	487	+ ceph_take_cap_refs(ci, want, false);
	488	+ memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
	489	+ rcu_assign_pointer(lo->pool_ns,
	490	+ ceph_try_get_string(ci->i_cached_layout.pool_ns));
	491	+ got = want;
	492	+no_async:
	493	+ spin_unlock(&ci->i_ceph_lock);
	494	+ return got;
	495	+}
	496	+
	497	+static void restore_deleg_ino(struct inode *dir, u64 ino) /* give ino back to the session behind dir's auth cap, if any */
	498	+{
	499	+ struct ceph_inode_info *ci = ceph_inode(dir);
	500	+ struct ceph_mds_session *s = NULL;
	501	+
	502	+ spin_lock(&ci->i_ceph_lock);
	503	+ if (ci->i_auth_cap)
	504	+ s = ceph_get_mds_session(ci->i_auth_cap->session); /* paired with ceph_put_mds_session() below */
	505	+ spin_unlock(&ci->i_ceph_lock);
	506	+ if (s) { /* no auth cap means s stays NULL and nothing is restored */
	507	+ int err = ceph_restore_deleg_ino(s, ino);
	508	+ if (err)
	509	+ pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
	510	+ ino, err);
	511	+ ceph_put_mds_session(s);
	512	+ }
	513	+}
	514	+
	515	+static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
	516	+ struct ceph_mds_request *req) /* r_callback for async create requests */
	517	+{
	518	+ int result = req->r_err ? req->r_err :
	519	+ le32_to_cpu(req->r_reply_info.head->result);
	520	+
	521	+ if (result == -EJUKEBOX) /* no error bookkeeping for this result; only release the dir caps */
	522	+ goto out;
	523	+
	524	+ mapping_set_error(req->r_parent->i_mapping, result);
	525	+
	526	+ if (result) { /* failure: clear the dir's complete state, unhash the dentry and log the path */
	527	+ struct dentry *dentry = req->r_dentry;
	528	+ int pathlen = 0;
	529	+ u64 base = 0;
	530	+ char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
	531	+ &base, 0);
	532	+
	533	+ ceph_dir_clear_complete(req->r_parent);
	534	+ if (!d_unhashed(dentry))
	535	+ d_drop(dentry);
	536	+
	537	+ /* FIXME: start returning I/O errors on all accesses? */
	538	+ pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
	539	+ base, IS_ERR(path) ? "<<bad>>" : path, result);
	540	+ ceph_mdsc_free_path(path, pathlen);
	541	+ }
	542	+
	543	+ if (req->r_target_inode) {
	544	+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
	545	+ u64 ino = ceph_vino(req->r_target_inode).ino;
	546	+
	547	+ if (req->r_deleg_ino != ino)
	548	+ pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
	549	+ __func__, req->r_err, req->r_deleg_ino, ino);
	550	+ mapping_set_error(req->r_target_inode->i_mapping, result);
	551	+
	552	+ spin_lock(&ci->i_ceph_lock);
	553	+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { /* clear the flag and wake waiters on its bit */
	554	+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
	555	+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
	556	+ }
	557	+ ceph_kick_flushing_inode_caps(req->r_session, ci);
	558	+ spin_unlock(&ci->i_ceph_lock);
	559	+ } else {
	560	+ pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
	561	+ req->r_deleg_ino);
	562	+ }
	563	+out:
	564	+ ceph_mdsc_release_dir_caps(req);
	565	+}
	566	+
	567	+static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
	568	+ struct file *file, umode_t mode,
	569	+ struct ceph_mds_request *req,
	570	+ struct ceph_acl_sec_ctx *as_ctx,
	571	+ struct ceph_file_layout *lo) /* fill the new inode from a locally built reply, then finish the open */
	572	+{
	573	+ int ret;
	574	+ char xattr_buf[4];
	575	+ struct ceph_mds_reply_inode in = { };
	576	+ struct ceph_mds_reply_info_in iinfo = { .in = &in };
	577	+ struct ceph_inode_info *ci = ceph_inode(dir);
	578	+ struct inode *inode;
	579	+ struct timespec64 now;
	580	+ struct ceph_string *pool_ns;
	581	+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	582	+ struct ceph_vino vino = { .ino = req->r_deleg_ino,
	583	+ .snap = CEPH_NOSNAP };
	584	+
	585	+ ktime_get_real_ts64(&now);
	586	+
	587	+ inode = ceph_get_inode(dentry->d_sb, vino);
	588	+ if (IS_ERR(inode))
	589	+ return PTR_ERR(inode);
	590	+
	591	+ iinfo.inline_version = CEPH_INLINE_NONE;
	592	+ iinfo.change_attr = 1;
	593	+ ceph_encode_timespec64(&iinfo.btime, &now);
	594	+
	595	+ if (req->r_pagelist) {
	596	+ iinfo.xattr_len = req->r_pagelist->length;
	597	+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
	598	+ } else {
	599	+ /* fake it */
	600	+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
	601	+ iinfo.xattr_data = xattr_buf;
	602	+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
	603	+ }
	604	+
	605	+ in.ino = cpu_to_le64(vino.ino);
	606	+ in.snapid = cpu_to_le64(CEPH_NOSNAP);
	607	+ in.version = cpu_to_le64(1); // ???
	608	+ in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
	609	+ in.cap.cap_id = cpu_to_le64(1);
	610	+ in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
	611	+ in.cap.flags = CEPH_CAP_FLAG_AUTH;
	612	+ in.ctime = in.mtime = in.atime = iinfo.btime;
	613	+ in.truncate_seq = cpu_to_le32(1);
	614	+ in.truncate_size = cpu_to_le64(-1ULL);
	615	+ in.xattr_version = cpu_to_le64(1);
	616	+ in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
	617	+ if (dir->i_mode & S_ISGID) {
	618	+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
	619	+
	620	+ /* Directories always inherit the setgid bit. */
	621	+ if (S_ISDIR(mode))
	622	+ mode |= S_ISGID;
	623	+ else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
	624	+ !in_group_p(dir->i_gid) &&
	625	+ !capable_wrt_inode_uidgid(dir, CAP_FSETID))
	626	+ mode &= ~S_ISGID;
	627	+ } else {
	628	+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
	629	+ }
	630	+ in.mode = cpu_to_le32((u32)mode);
	631	+
	632	+ in.nlink = cpu_to_le32(1);
	633	+ in.max_size = cpu_to_le64(lo->stripe_unit);
	634	+
	635	+ ceph_file_layout_to_legacy(lo, &in.layout);
	636	+ /* lo is private, so pool_ns can't change */
	637	+ pool_ns = rcu_dereference_raw(lo->pool_ns);
	638	+ if (pool_ns) {
	639	+ iinfo.pool_ns_len = pool_ns->len;
	640	+ iinfo.pool_ns_data = pool_ns->str;
	641	+ }
	642	+
	643	+ down_read(&mdsc->snap_rwsem);
	644	+ ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
	645	+ req->r_fmode, NULL);
	646	+ up_read(&mdsc->snap_rwsem);
	647	+ if (ret) {
	648	+ dout("%s failed to fill inode: %d\n", __func__, ret);
	649	+ ceph_dir_clear_complete(dir);
	650	+ if (!d_unhashed(dentry))
	651	+ d_drop(dentry);
	652	+ if (inode->i_state & I_NEW)
	653	+ discard_new_inode(inode);
	654	+ } else {
	655	+ struct dentry *dn;
	656	+
	657	+ dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
	658	+ vino.ino, ceph_ino(dir), dentry->d_name.name);
	659	+ ceph_dir_clear_ordered(dir);
	660	+ ceph_init_inode_acls(inode, as_ctx);
	661	+ if (inode->i_state & I_NEW) {
	662	+ /*
	663	+ * If it's not I_NEW, then someone created this before
	664	+ * we got here. Assume the server is aware of it at
	665	+ * that point and don't worry about setting
	666	+ * CEPH_I_ASYNC_CREATE.
	667	+ */
	668	+ ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
	669	+ unlock_new_inode(inode);
	670	+ }
	671	+ if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
	672	+ if (!d_unhashed(dentry))
	673	+ d_drop(dentry);
	674	+ dn = d_splice_alias(inode, dentry);
	675	+ WARN_ON_ONCE(dn && dn != dentry);
	676	+ }
	677	+ file->f_mode |= FMODE_CREATED;
	678	+ ret = finish_open(file, dentry, ceph_open);
	679	+ }
	680	+ return ret;
	681	+}
426	682
427	683	/*
428	684	* Do a lookup + open with a single request. If we get a non-existent
..	..	@@ -435,7 +691,8 @@
435	691	struct ceph_mds_client *mdsc = fsc->mdsc;
436	692	struct ceph_mds_request *req;
437	693	struct dentry *dn;
438		- struct ceph_acls_info acls = {};
	694	+ struct ceph_acl_sec_ctx as_ctx = {};
	695	+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
439	696	int mask;
440	697	int err;
441	698
..	..	@@ -446,41 +703,79 @@
446	703	if (dentry->d_name.len > NAME_MAX)
447	704	return -ENAMETOOLONG;
448	705
	706	+ /*
	707	+ * Do not truncate the file, since atomic_open is called before the
	708	+ * permission check. The caller will do the truncation afterward.
	709	+ */
	710	+ flags &= ~O_TRUNC;
	711	+
449	712	if (flags & O_CREAT) {
450	713	if (ceph_quota_is_max_files_exceeded(dir))
451	714	return -EDQUOT;
452		- err = ceph_pre_init_acls(dir, &mode, &acls);
	715	+ err = ceph_pre_init_acls(dir, &mode, &as_ctx);
453	716	if (err < 0)
454	717	return err;
	718	+ err = ceph_security_init_secctx(dentry, mode, &as_ctx);
	719	+ if (err < 0)
	720	+ goto out_ctx;
	721	+ /* Async create can't handle more than a page of xattrs */
	722	+ if (as_ctx.pagelist &&
	723	+ !list_is_singular(&as_ctx.pagelist->head))
	724	+ try_async = false;
	725	+ } else if (!d_in_lookup(dentry)) {
	726	+ /* If it's not being looked up, it's negative */
	727	+ return -ENOENT;
455	728	}
456		-
	729	+retry:
457	730	/* do the open */
458	731	req = prepare_open_request(dir->i_sb, flags, mode);
459	732	if (IS_ERR(req)) {
460	733	err = PTR_ERR(req);
461		- goto out_acl;
	734	+ goto out_ctx;
462	735	}
463	736	req->r_dentry = dget(dentry);
464	737	req->r_num_caps = 2;
	738	+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	739	+ if (ceph_security_xattr_wanted(dir))
	740	+ mask |= CEPH_CAP_XATTR_SHARED;
	741	+ req->r_args.open.mask = cpu_to_le32(mask);
	742	+ req->r_parent = dir;
	743	+
465	744	if (flags & O_CREAT) {
	745	+ struct ceph_file_layout lo;
	746	+
466	747	req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
467	748	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
468		- if (acls.pagelist) {
469		- req->r_pagelist = acls.pagelist;
470		- acls.pagelist = NULL;
	749	+ if (as_ctx.pagelist) {
	750	+ req->r_pagelist = as_ctx.pagelist;
	751	+ as_ctx.pagelist = NULL;
	752	+ }
	753	+ if (try_async &&
	754	+ (req->r_dir_caps =
	755	+ try_prep_async_create(dir, dentry, &lo,
	756	+ &req->r_deleg_ino))) {
	757	+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
	758	+ req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
	759	+ req->r_callback = ceph_async_create_cb;
	760	+ err = ceph_mdsc_submit_request(mdsc, dir, req);
	761	+ if (!err) {
	762	+ err = ceph_finish_async_create(dir, dentry,
	763	+ file, mode, req,
	764	+ &as_ctx, &lo);
	765	+ } else if (err == -EJUKEBOX) {
	766	+ restore_deleg_ino(dir, req->r_deleg_ino);
	767	+ ceph_mdsc_put_request(req);
	768	+ try_async = false;
	769	+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
	770	+ goto retry;
	771	+ }
	772	+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
	773	+ goto out_req;
471	774	}
472	775	}
473	776
474		- mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
475		- if (ceph_security_xattr_wanted(dir))
476		- mask |= CEPH_CAP_XATTR_SHARED;
477		- req->r_args.open.mask = cpu_to_le32(mask);
478		-
479		- req->r_parent = dir;
480	777	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
481		- err = ceph_mdsc_do_request(mdsc,
482		- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
483		- req);
	778	+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
484	779	err = ceph_handle_snapdir(req, dentry, err);
485	780	if (err)
486	781	goto out_req;
..	..	@@ -505,17 +800,18 @@
505	800	} else {
506	801	dout("atomic_open finish_open on dn %p\n", dn);
507	802	if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
508		- ceph_init_inode_acls(d_inode(dentry), &acls);
	803	+ struct inode *newino = d_inode(dentry);
	804	+
	805	+ cache_file_layout(dir, newino);
	806	+ ceph_init_inode_acls(newino, &as_ctx);
509	807	file->f_mode |= FMODE_CREATED;
510	808	}
511	809	err = finish_open(file, dentry, ceph_open);
512	810	}
513	811	out_req:
514		- if (!req->r_err && req->r_target_inode)
515		- ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
516	812	ceph_mdsc_put_request(req);
517		-out_acl:
518		- ceph_release_acls_info(&acls);
	813	+out_ctx:
	814	+ ceph_release_acl_sec_ctx(&as_ctx);
519	815	dout("atomic_open result=%d\n", err);
520	816	return err;
521	817	}
..	..	@@ -529,7 +825,7 @@
529	825	dout("release inode %p dir file %p\n", inode, file);
530	826	WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
531	827
532		- ceph_put_fmode(ci, dfi->file_info.fmode);
	828	+ ceph_put_fmode(ci, dfi->file_info.fmode, 1);
533	829
534	830	if (dfi->last_readdir)
535	831	ceph_mdsc_put_request(dfi->last_readdir);
..	..	@@ -541,7 +837,8 @@
541	837	dout("release inode %p regular file %p\n", inode, file);
542	838	WARN_ON(!list_empty(&fi->rw_contexts));
543	839
544		- ceph_put_fmode(ci, fi->fmode);
	840	+ ceph_put_fmode(ci, fi->fmode, 1);
	841	+
545	842	kmem_cache_free(ceph_file_cachep, fi);
546	843	}
547	844
..	..	@@ -557,90 +854,26 @@
557	854	};
558	855
559	856	/*
560		- * Read a range of bytes striped over one or more objects. Iterate over
561		- * objects we stripe over. (That's not atomic, but good enough for now.)
	857	+ * Completely synchronous read and write methods. Direct from __user
	858	+ * buffer to osd, or directly to user pages (if O_DIRECT).
	859	+ *
	860	+ * If the read spans object boundary, just do multiple reads. (That's not
	861	+ * atomic, but good enough for now.)
562	862	*
563	863	* If we get a short result from the OSD, check against i_size; we need to
564	864	* only return a short read to the caller if we hit EOF.
565	865	*/
566		-static int striped_read(struct inode *inode,
567		- u64 pos, u64 len,
568		- struct page **pages, int num_pages,
569		- int page_align, int *checkeof)
570		-{
571		- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
572		- struct ceph_inode_info *ci = ceph_inode(inode);
573		- u64 this_len;
574		- loff_t i_size;
575		- int page_idx;
576		- int ret, read = 0;
577		- bool hit_stripe, was_short;
578		-
579		- /*
580		- * we may need to do multiple reads. not atomic, unfortunately.
581		- */
582		-more:
583		- this_len = len;
584		- page_idx = (page_align + read) >> PAGE_SHIFT;
585		- ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
586		- &ci->i_layout, pos, &this_len,
587		- ci->i_truncate_seq, ci->i_truncate_size,
588		- pages + page_idx, num_pages - page_idx,
589		- ((page_align + read) & ~PAGE_MASK));
590		- if (ret == -ENOENT)
591		- ret = 0;
592		- hit_stripe = this_len < len;
593		- was_short = ret >= 0 && ret < this_len;
594		- dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
595		- ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
596		-
597		- i_size = i_size_read(inode);
598		- if (ret >= 0) {
599		- if (was_short && (pos + ret < i_size)) {
600		- int zlen = min(this_len - ret, i_size - pos - ret);
601		- int zoff = page_align + read + ret;
602		- dout(" zero gap %llu to %llu\n",
603		- pos + ret, pos + ret + zlen);
604		- ceph_zero_page_vector_range(zoff, zlen, pages);
605		- ret += zlen;
606		- }
607		-
608		- read += ret;
609		- pos += ret;
610		- len -= ret;
611		-
612		- /* hit stripe and need continue*/
613		- if (len && hit_stripe && pos < i_size)
614		- goto more;
615		- }
616		-
617		- if (read > 0) {
618		- ret = read;
619		- /* did we bounce off eof? */
620		- if (pos + len > i_size)
621		- *checkeof = CHECK_EOF;
622		- }
623		-
624		- dout("striped_read returns %d\n", ret);
625		- return ret;
626		-}
627		-
628		-/*
629		- * Completely synchronous read and write methods. Direct from __user
630		- * buffer to osd, or directly to user pages (if O_DIRECT).
631		- *
632		- * If the read spans object boundary, just do multiple reads.
633		- */
634	866	static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
635		- int *checkeof)
	867	+ int *retry_op)
636	868	{
637	869	struct file *file = iocb->ki_filp;
638	870	struct inode *inode = file_inode(file);
639		- struct page **pages;
640		- u64 off = iocb->ki_pos;
641		- int num_pages;
	871	+ struct ceph_inode_info *ci = ceph_inode(inode);
	872	+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	873	+ struct ceph_osd_client *osdc = &fsc->client->osdc;
642	874	ssize_t ret;
643		- size_t len = iov_iter_count(to);
	875	+ u64 off = iocb->ki_pos;
	876	+ u64 len = iov_iter_count(to);
644	877
645	878	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
646	879	(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
..	..	@@ -653,61 +886,108 @@
653	886	* but it will at least behave sensibly when they are
654	887	* in sequence.
655	888	*/
656		- ret = filemap_write_and_wait_range(inode->i_mapping, off,
657		- off + len);
	889	+ ret = filemap_write_and_wait_range(inode->i_mapping,
	890	+ off, off + len - 1);
658	891	if (ret < 0)
659	892	return ret;
660	893
661		- if (unlikely(to->type & ITER_PIPE)) {
	894	+ ret = 0;
	895	+ while ((len = iov_iter_count(to)) > 0) {
	896	+ struct ceph_osd_request *req;
	897	+ struct page **pages;
	898	+ int num_pages;
662	899	size_t page_off;
663		- ret = iov_iter_get_pages_alloc(to, &pages, len,
664		- &page_off);
665		- if (ret <= 0)
666		- return -ENOMEM;
667		- num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
	900	+ u64 i_size;
	901	+ bool more;
	902	+ int idx;
	903	+ size_t left;
668	904
669		- ret = striped_read(inode, off, ret, pages, num_pages,
670		- page_off, checkeof);
671		- if (ret > 0) {
672		- iov_iter_advance(to, ret);
673		- off += ret;
674		- } else {
675		- iov_iter_advance(to, 0);
	905	+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
	906	+ ci->i_vino, off, &len, 0, 1,
	907	+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
	908	+ NULL, ci->i_truncate_seq,
	909	+ ci->i_truncate_size, false);
	910	+ if (IS_ERR(req)) {
	911	+ ret = PTR_ERR(req);
	912	+ break;
676	913	}
677		- ceph_put_page_vector(pages, num_pages, false);
678		- } else {
	914	+
	915	+ more = len < iov_iter_count(to);
	916	+
679	917	num_pages = calc_pages_for(off, len);
	918	+ page_off = off & ~PAGE_MASK;
680	919	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
681		- if (IS_ERR(pages))
682		- return PTR_ERR(pages);
	920	+ if (IS_ERR(pages)) {
	921	+ ceph_osdc_put_request(req);
	922	+ ret = PTR_ERR(pages);
	923	+ break;
	924	+ }
683	925
684		- ret = striped_read(inode, off, len, pages, num_pages,
685		- (off & ~PAGE_MASK), checkeof);
686		- if (ret > 0) {
687		- int l, k = 0;
688		- size_t left = ret;
	926	+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
	927	+ false, false);
	928	+ ret = ceph_osdc_start_request(osdc, req, false);
	929	+ if (!ret)
	930	+ ret = ceph_osdc_wait_request(osdc, req);
689	931
690		- while (left) {
691		- size_t page_off = off & ~PAGE_MASK;
692		- size_t copy = min_t(size_t, left,
693		- PAGE_SIZE - page_off);
694		- l = copy_page_to_iter(pages[k++], page_off,
695		- copy, to);
696		- off += l;
697		- left -= l;
698		- if (l < copy)
699		- break;
	932	+ ceph_update_read_latency(&fsc->mdsc->metric,
	933	+ req->r_start_latency,
	934	+ req->r_end_latency,
	935	+ ret);
	936	+
	937	+ ceph_osdc_put_request(req);
	938	+
	939	+ i_size = i_size_read(inode);
	940	+ dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
	941	+ off, len, ret, i_size, (more ? " MORE" : ""));
	942	+
	943	+ if (ret == -ENOENT)
	944	+ ret = 0;
	945	+ if (ret >= 0 && ret < len && (off + ret < i_size)) {
	946	+ int zlen = min(len - ret, i_size - off - ret);
	947	+ int zoff = page_off + ret;
	948	+ dout("sync_read zero gap %llu~%llu\n",
	949	+ off + ret, off + ret + zlen);
	950	+ ceph_zero_page_vector_range(zoff, zlen, pages);
	951	+ ret += zlen;
	952	+ }
	953	+
	954	+ idx = 0;
	955	+ left = ret > 0 ? ret : 0;
	956	+ while (left > 0) {
	957	+ size_t len, copied;
	958	+ page_off = off & ~PAGE_MASK;
	959	+ len = min_t(size_t, left, PAGE_SIZE - page_off);
	960	+ SetPageUptodate(pages[idx]);
	961	+ copied = copy_page_to_iter(pages[idx++],
	962	+ page_off, len, to);
	963	+ off += copied;
	964	+ left -= copied;
	965	+ if (copied < len) {
	966	+ ret = -EFAULT;
	967	+ break;
700	968	}
701	969	}
702	970	ceph_release_page_vector(pages, num_pages);
	971	+
	972	+ if (ret < 0) {
	973	+ if (ret == -EBLOCKLISTED)
	974	+ fsc->blocklisted = true;
	975	+ break;
	976	+ }
	977	+
	978	+ if (off >= i_size || !more)
	979	+ break;
703	980	}
704	981
705	982	if (off > iocb->ki_pos) {
	983	+ if (ret >= 0 &&
	984	+ iov_iter_count(to) > 0 && off >= i_size_read(inode))
	985	+ *retry_op = CHECK_EOF;
706	986	ret = off - iocb->ki_pos;
707	987	iocb->ki_pos = off;
708	988	}
709	989
710		- dout("sync_read result %zd\n", ret);
	990	+ dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
711	991	return ret;
712	992	}
713	993
..	..	@@ -739,6 +1019,9 @@
739	1019
740	1020	if (!atomic_dec_and_test(&aio_req->pending_reqs))
741	1021	return;
	1022	+
	1023	+ if (aio_req->iocb->ki_flags & IOCB_DIRECT)
	1024	+ inode_dio_end(inode);
742	1025
743	1026	ret = aio_req->error;
744	1027	if (!ret)
..	..	@@ -780,12 +1063,23 @@
780	1063	struct inode *inode = req->r_inode;
781	1064	struct ceph_aio_request *aio_req = req->r_priv;
782	1065	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
	1066	+ struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
783	1067
784	1068	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
785	1069	BUG_ON(!osd_data->num_bvecs);
786	1070
787	1071	dout("ceph_aio_complete_req %p rc %d bytes %u\n",
788	1072	inode, rc, osd_data->bvec_pos.iter.bi_size);
	1073	+
	1074	+ /* r_start_latency == 0 means the request was not submitted */
	1075	+ if (req->r_start_latency) {
	1076	+ if (aio_req->write)
	1077	+ ceph_update_write_latency(metric, req->r_start_latency,
	1078	+ req->r_end_latency, rc);
	1079	+ else
	1080	+ ceph_update_read_latency(metric, req->r_start_latency,
	1081	+ req->r_end_latency, rc);
	1082	+ }
789	1083
790	1084	if (rc == -EOLDSNAPC) {
791	1085	struct ceph_aio_work *aio_work;
..	..	@@ -795,7 +1089,7 @@
795	1089	if (aio_work) {
796	1090	INIT_WORK(&aio_work->work, ceph_aio_retry_work);
797	1091	aio_work->req = req;
798		- queue_work(ceph_inode_to_client(inode)->wb_wq,
	1092	+ queue_work(ceph_inode_to_client(inode)->inode_wq,
799	1093	&aio_work->work);
800	1094	return;
801	1095	}
..	..	@@ -821,7 +1115,7 @@
821	1115	aio_req->total_len = rc + zlen;
822	1116	}
823	1117
824		- iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
	1118	+ iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
825	1119	osd_data->num_bvecs,
826	1120	osd_data->bvec_pos.iter.bi_size);
827	1121	iov_iter_advance(&i, rc);
..	..	@@ -865,7 +1159,7 @@
865	1159	}
866	1160	spin_unlock(&ci->i_ceph_lock);
867	1161
868		- req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
	1162	+ req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
869	1163	false, GFP_NOFS);
870	1164	if (!req) {
871	1165	ret = -ENOMEM;
..	..	@@ -877,17 +1171,17 @@
877	1171	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
878	1172	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
879	1173
	1174	+ req->r_ops[0] = orig_req->r_ops[0];
	1175	+
	1176	+ req->r_mtime = aio_req->mtime;
	1177	+ req->r_data_offset = req->r_ops[0].extent.offset;
	1178	+
880	1179	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
881	1180	if (ret) {
882	1181	ceph_osdc_put_request(req);
883	1182	req = orig_req;
884	1183	goto out;
885	1184	}
886		-
887		- req->r_ops[0] = orig_req->r_ops[0];
888		-
889		- req->r_mtime = aio_req->mtime;
890		- req->r_data_offset = req->r_ops[0].extent.offset;
891	1185
892	1186	ceph_osdc_put_request(orig_req);
893	1187
..	..	@@ -915,13 +1209,14 @@
915	1209	struct inode *inode = file_inode(file);
916	1210	struct ceph_inode_info *ci = ceph_inode(inode);
917	1211	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	1212	+ struct ceph_client_metric *metric = &fsc->mdsc->metric;
918	1213	struct ceph_vino vino;
919	1214	struct ceph_osd_request *req;
920	1215	struct bio_vec *bvecs;
921	1216	struct ceph_aio_request *aio_req = NULL;
922	1217	int num_pages = 0;
923	1218	int flags;
924		- int ret;
	1219	+ int ret = 0;
925	1220	struct timespec64 mtime = current_time(inode);
926	1221	size_t count = iov_iter_count(iter);
927	1222	loff_t pos = iocb->ki_pos;
..	..	@@ -933,16 +1228,12 @@
933	1228
934	1229	dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
935	1230	(write ? "write" : "read"), file, pos, (unsigned)count,
936		- snapc, snapc->seq);
937		-
938		- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
939		- if (ret < 0)
940		- return ret;
	1231	+ snapc, snapc ? snapc->seq : 0);
941	1232
942	1233	if (write) {
943	1234	int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
944	1235	pos >> PAGE_SHIFT,
945		- (pos + count) >> PAGE_SHIFT);
	1236	+ (pos + count - 1) >> PAGE_SHIFT);
946	1237	if (ret2 < 0)
947	1238	dout("invalidate_inode_pages2_range returned %d\n", ret2);
948	1239
..	..	@@ -1010,7 +1301,7 @@
1010	1301	* may block.
1011	1302	*/
1012	1303	truncate_inode_pages_range(inode->i_mapping, pos,
1013		- (pos+len) | (PAGE_SIZE - 1));
	1304	+ PAGE_ALIGN(pos + len) - 1);
1014	1305
1015	1306	req->r_mtime = mtime;
1016	1307	}
..	..	@@ -1025,7 +1316,7 @@
1025	1316	req->r_callback = ceph_aio_complete_req;
1026	1317	req->r_inode = inode;
1027	1318	req->r_priv = aio_req;
1028		- list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
	1319	+ list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
1029	1320
1030	1321	pos += len;
1031	1322	continue;
..	..	@@ -1034,6 +1325,13 @@
1034	1325	ret = ceph_osdc_start_request(req->r_osdc, req, false);
1035	1326	if (!ret)
1036	1327	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
	1328	+
	1329	+ if (write)
	1330	+ ceph_update_write_latency(metric, req->r_start_latency,
	1331	+ req->r_end_latency, ret);
	1332	+ else
	1333	+ ceph_update_read_latency(metric, req->r_start_latency,
	1334	+ req->r_end_latency, ret);
1037	1335
1038	1336	size = i_size_read(inode);
1039	1337	if (!write) {
..	..	@@ -1044,8 +1342,7 @@
1044	1342	int zlen = min_t(size_t, len - ret,
1045	1343	size - pos - ret);
1046	1344
1047		- iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
1048		- len);
	1345	+ iov_iter_bvec(&i, READ, bvecs, num_pages, len);
1049	1346	iov_iter_advance(&i, ret);
1050	1347	iov_iter_zero(zlen, &i);
1051	1348	ret += zlen;
..	..	@@ -1083,11 +1380,12 @@
1083	1380	CEPH_CAP_FILE_RD);
1084	1381
1085	1382	list_splice(&aio_req->osd_reqs, &osd_reqs);
	1383	+ inode_dio_begin(inode);
1086	1384	while (!list_empty(&osd_reqs)) {
1087	1385	req = list_first_entry(&osd_reqs,
1088	1386	struct ceph_osd_request,
1089		- r_unsafe_item);
1090		- list_del_init(&req->r_unsafe_item);
	1387	+ r_private_item);
	1388	+ list_del_init(&req->r_private_item);
1091	1389	if (ret >= 0)
1092	1390	ret = ceph_osdc_start_request(req->r_osdc,
1093	1391	req, false);
..	..	@@ -1139,13 +1437,14 @@
1139	1437	dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
1140	1438	file, pos, (unsigned)count, snapc, snapc->seq);
1141	1439
1142		- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
	1440	+ ret = filemap_write_and_wait_range(inode->i_mapping,
	1441	+ pos, pos + count - 1);
1143	1442	if (ret < 0)
1144	1443	return ret;
1145	1444
1146	1445	ret = invalidate_inode_pages2_range(inode->i_mapping,
1147	1446	pos >> PAGE_SHIFT,
1148		- (pos + count) >> PAGE_SHIFT);
	1447	+ (pos + count - 1) >> PAGE_SHIFT);
1149	1448	if (ret < 0)
1150	1449	dout("invalidate_inode_pages2_range returned %d\n", ret);
1151	1450
..	..	@@ -1205,6 +1504,8 @@
1205	1504	if (!ret)
1206	1505	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
1207	1506
	1507	+ ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
	1508	+ req->r_end_latency, ret);
1208	1509	out:
1209	1510	ceph_osdc_put_request(req);
1210	1511	if (ret != 0) {
..	..	@@ -1247,6 +1548,7 @@
1247	1548	struct inode *inode = file_inode(filp);
1248	1549	struct ceph_inode_info *ci = ceph_inode(inode);
1249	1550	struct page *pinned_page = NULL;
	1551	+ bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
1250	1552	ssize_t ret;
1251	1553	int want, got = 0;
1252	1554	int retry_op = 0, read = 0;
..	..	@@ -1255,13 +1557,24 @@
1255	1557	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
1256	1558	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
1257	1559
	1560	+ if (direct_lock)
	1561	+ ceph_start_io_direct(inode);
	1562	+ else
	1563	+ ceph_start_io_read(inode);
	1564	+
1258	1565	if (fi->fmode & CEPH_FILE_MODE_LAZY)
1259	1566	want = CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO;
1260	1567	else
1261	1568	want = CEPH_CAP_FILE_CACHE;
1262		- ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
1263		- if (ret < 0)
	1569	+ ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
	1570	+ &got, &pinned_page);
	1571	+ if (ret < 0) {
	1572	+ if (iocb->ki_flags & IOCB_DIRECT)
	1573	+ ceph_end_io_direct(inode);
	1574	+ else
	1575	+ ceph_end_io_read(inode);
1264	1576	return ret;
	1577	+ }
1265	1578
1266	1579	if ((got & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) == 0 \|\|
1267	1580	(iocb->ki_flags & IOCB_DIRECT) \|\|
..	..	@@ -1292,6 +1605,7 @@
1292	1605	ret = generic_file_read_iter(iocb, to);
1293	1606	ceph_del_rw_context(fi, &rw_ctx);
1294	1607	}
	1608	+
1295	1609	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
1296	1610	inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
1297	1611	if (pinned_page) {
..	..	@@ -1299,6 +1613,12 @@
1299	1613	pinned_page = NULL;
1300	1614	}
1301	1615	ceph_put_cap_refs(ci, got);
	1616	+
	1617	+ if (direct_lock)
	1618	+ ceph_end_io_direct(inode);
	1619	+ else
	1620	+ ceph_end_io_read(inode);
	1621	+
1302	1622	if (retry_op > HAVE_RETRIED && ret >= 0) {
1303	1623	int statret;
1304	1624	struct page *page = NULL;
..	..	@@ -1388,6 +1708,7 @@
1388	1708	struct ceph_cap_flush *prealloc_cf;
1389	1709	ssize_t count, written = 0;
1390	1710	int err, want, got;
	1711	+ bool direct_lock = false;
1391	1712	u32 map_flags;
1392	1713	u64 pool_flags;
1393	1714	loff_t pos;
..	..	@@ -1400,8 +1721,14 @@
1400	1721	if (!prealloc_cf)
1401	1722	return -ENOMEM;
1402	1723
	1724	+ if ((iocb->ki_flags & (IOCB_DIRECT \| IOCB_APPEND)) == IOCB_DIRECT)
	1725	+ direct_lock = true;
	1726	+
1403	1727	retry_snap:
1404		- inode_lock(inode);
	1728	+ if (direct_lock)
	1729	+ ceph_start_io_direct(inode);
	1730	+ else
	1731	+ ceph_start_io_write(inode);
1405	1732
1406	1733	/* We can write back this queue in page reclaim */
1407	1734	current->backing_dev_info = inode_to_bdi(inode);
..	..	@@ -1430,20 +1757,6 @@
1430	1757	goto out;
1431	1758	}
1432	1759
1433		- err = file_remove_privs(file);
1434		- if (err)
1435		- goto out;
1436		-
1437		- err = file_update_time(file);
1438		- if (err)
1439		- goto out;
1440		-
1441		- if (ci->i_inline_version != CEPH_INLINE_NONE) {
1442		- err = ceph_uninline_data(file, NULL);
1443		- if (err < 0)
1444		- goto out;
1445		- }
1446		-
1447	1760	down_read(&osdc->lock);
1448	1761	map_flags = osdc->osdmap->flags;
1449	1762	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
..	..	@@ -1454,6 +1767,16 @@
1454	1767	goto out;
1455	1768	}
1456	1769
	1770	+ err = file_remove_privs(file);
	1771	+ if (err)
	1772	+ goto out;
	1773	+
	1774	+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
	1775	+ err = ceph_uninline_data(file, NULL);
	1776	+ if (err < 0)
	1777	+ goto out;
	1778	+ }
	1779	+
1457	1780	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
1458	1781	inode, ceph_vinop(inode), pos, count, i_size_read(inode));
1459	1782	if (fi->fmode & CEPH_FILE_MODE_LAZY)
..	..	@@ -1461,10 +1784,16 @@
1461	1784	else
1462	1785	want = CEPH_CAP_FILE_BUFFER;
1463	1786	got = 0;
1464		- err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
	1787	+ err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
1465	1788	&got, NULL);
1466	1789	if (err < 0)
1467	1790	goto out;
	1791	+
	1792	+ err = file_update_time(file);
	1793	+ if (err)
	1794	+ goto out_caps;
	1795	+
	1796	+ inode_inc_iversion_raw(inode);
1468	1797
1469	1798	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
1470	1799	inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
..	..	@@ -1474,7 +1803,6 @@
1474	1803	(ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
1475	1804	struct ceph_snap_context *snapc;
1476	1805	struct iov_iter data;
1477		- inode_unlock(inode);
1478	1806
1479	1807	spin_lock(&ci->i_ceph_lock);
1480	1808	if (__ceph_have_pending_cap_snap(ci)) {
..	..	@@ -1496,6 +1824,10 @@
1496	1824	&prealloc_cf);
1497	1825	else
1498	1826	written = ceph_sync_write(iocb, &data, pos, snapc);
	1827	+ if (direct_lock)
	1828	+ ceph_end_io_direct(inode);
	1829	+ else
	1830	+ ceph_end_io_write(inode);
1499	1831	if (written > 0)
1500	1832	iov_iter_advance(from, written);
1501	1833	ceph_put_snap_context(snapc);
..	..	@@ -1510,7 +1842,7 @@
1510	1842	written = generic_perform_write(file, from, pos);
1511	1843	if (likely(written >= 0))
1512	1844	iocb->ki_pos = pos + written;
1513		- inode_unlock(inode);
	1845	+ ceph_end_io_write(inode);
1514	1846	}
1515	1847
1516	1848	if (written >= 0) {
..	..	@@ -1524,7 +1856,7 @@
1524	1856	if (dirty)
1525	1857	__mark_inode_dirty(inode, dirty);
1526	1858	if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
1527		- ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
	1859	+ ceph_check_caps(ci, 0, NULL);
1528	1860	}
1529	1861
1530	1862	dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
..	..	@@ -1546,9 +1878,13 @@
1546	1878	}
1547	1879
1548	1880	goto out_unlocked;
1549		-
	1881	+out_caps:
	1882	+ ceph_put_cap_refs(ci, got);
1550	1883	out:
1551		- inode_unlock(inode);
	1884	+ if (direct_lock)
	1885	+ ceph_end_io_direct(inode);
	1886	+ else
	1887	+ ceph_end_io_write(inode);
1552	1888	out_unlocked:
1553	1889	ceph_free_cap_flush(prealloc_cf);
1554	1890	current->backing_dev_info = NULL;
..	..	@@ -1786,7 +2122,7 @@
1786	2122	else
1787	2123	want = CEPH_CAP_FILE_BUFFER;
1788	2124
1789		- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
	2125	+ ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
1790	2126	if (ret < 0)
1791	2127	goto unlock;
1792	2128
..	..	@@ -1810,6 +2146,370 @@
1810	2146	return ret;
1811	2147	}
1812	2148
	2149	+/*
	2150	+ * Try to get FILE_WR caps for dst_ci and FILE_RD caps for src_ci, reporting
	2151	+ * them through dst_got/src_got. Two attempts are made to obtain both caps;
	2152	+ * returns a negative error on failure, a positive value on success.
	2153	+ */
	2154	+static int get_rd_wr_caps(struct file *src_filp, int *src_got,
	2155	+ struct file *dst_filp,
	2156	+ loff_t dst_endoff, int *dst_got)
	2157	+{
	2158	+ int ret = 0;
	2159	+ bool retrying = false;
	2160	+
	2161	+retry_caps:
	2162	+ ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
	2163	+ dst_endoff, dst_got, NULL);
	2164	+ if (ret < 0)
	2165	+ return ret;
	2166	+
	2167	+ /*
	2168	+ * Since we're already holding the FILE_WR capability for the dst file,
	2169	+ * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
	2170	+ * retry dance instead to try to get both capabilities.
	2171	+ */
	2172	+ ret = ceph_try_get_caps(file_inode(src_filp),
	2173	+ CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
	2174	+ false, src_got);
	2175	+ if (ret <= 0) {
	2176	+ /* Start by dropping dst_ci caps and getting src_ci caps */
	2177	+ ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
	2178	+ if (retrying) {
	2179	+ if (!ret)
	2180	+ /* ceph_try_get_caps masks EAGAIN */
	2181	+ ret = -EAGAIN;
	2182	+ return ret;
	2183	+ }
	2184	+ ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
	2185	+ CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
	2186	+ if (ret < 0)
	2187	+ return ret;
	2188	+ /* ... drop src_ci caps too, and retry */
	2189	+ ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
	2190	+ retrying = true;
	2191	+ goto retry_caps;
	2192	+ }
	2193	+ return ret; /* only reached with ret > 0: both caps are held */
	2194	+}
	2195	+
	2196	+static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
	2197	+ struct ceph_inode_info *dst_ci, int dst_got)
	2198	+{ /* Put the cap references in src_got/dst_got; used after get_rd_wr_caps() */
	2199	+ ceph_put_cap_refs(src_ci, src_got); /* refs held on the source inode */
	2200	+ ceph_put_cap_refs(dst_ci, dst_got); /* refs held on the destination inode */
	2201	+}
	2202	+
	2203	+/*
	2204	+ * This function does several size-related checks, returning an error if:
	2205	+ * - source file is smaller than off+len (-EOPNOTSUPP)
	2206	+ * - destination file size is not OK (inode_newsize_ok()) (-EOPNOTSUPP)
	2207	+ * - max bytes quota is exceeded (-EDQUOT)
	2208	+ */
	2209	+static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
	2210	+ loff_t src_off, loff_t dst_off, size_t len)
	2211	+{
	2212	+ loff_t size, endoff;
	2213	+
	2214	+ size = i_size_read(src_inode);
	2215	+ /*
	2216	+ * Don't copy beyond source file EOF. Instead of simply setting length
	2217	+ * to (size - src_off), just drop to VFS default implementation, as the
	2218	+ * local i_size may be stale due to other clients writing to the source
	2219	+ * inode.
	2220	+ */
	2221	+ if (src_off + len > size) {
	2222	+ dout("Copy beyond EOF (%llu + %zu > %llu)\n",
	2223	+ src_off, len, size);
	2224	+ return -EOPNOTSUPP;
	2225	+ }
	2226	+ size = i_size_read(dst_inode);
	2227	+
	2228	+ endoff = dst_off + len;
	2229	+ if (inode_newsize_ok(dst_inode, endoff))
	2230	+ return -EOPNOTSUPP;
	2231	+
	2232	+ if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
	2233	+ return -EDQUOT;
	2234	+
	2235	+ return 0; /* all size checks passed */
	2236	+}
	2237	+
	2238	+static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
	2239	+ struct ceph_inode_info *dst_ci, u64 *dst_off,
	2240	+ struct ceph_fs_client *fsc,
	2241	+ size_t len, unsigned int flags)
	2242	+{
		+ /*
		+ * Copy whole objects from src to dst with ceph_osdc_copy_from() while
		+ * at least object_size bytes remain, advancing *src_off and *dst_off
		+ * by object_size per copied object. Returns the number of bytes
		+ * copied, or the copy-from error if nothing was copied at all.
		+ */
	2243	+ struct ceph_object_locator src_oloc, dst_oloc;
	2244	+ struct ceph_object_id src_oid, dst_oid;
	2245	+ ssize_t bytes = 0; /* signed: may carry a negative errno from ret */
	2246	+ u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
	2247	+ u32 src_objlen, dst_objlen;
	2248	+ u32 object_size = src_ci->i_layout.object_size;
	2249	+ int ret;
	2250	+
	2251	+ src_oloc.pool = src_ci->i_layout.pool_id;
	2252	+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
	2253	+ dst_oloc.pool = dst_ci->i_layout.pool_id;
	2254	+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
	2255	+
	2256	+ while (len >= object_size) {
	2257	+ ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
	2258	+ object_size, &src_objnum,
	2259	+ &src_objoff, &src_objlen);
	2260	+ ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
	2261	+ object_size, &dst_objnum,
	2262	+ &dst_objoff, &dst_objlen);
	2263	+ ceph_oid_init(&src_oid);
	2264	+ ceph_oid_printf(&src_oid, "%llx.%08llx",
	2265	+ src_ci->i_vino.ino, src_objnum);
	2266	+ ceph_oid_init(&dst_oid);
	2267	+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
	2268	+ dst_ci->i_vino.ino, dst_objnum);
	2269	+ /* Do an object remote copy */
	2270	+ ret = ceph_osdc_copy_from(&fsc->client->osdc,
	2271	+ src_ci->i_vino.snap, 0,
	2272	+ &src_oid, &src_oloc,
	2273	+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
	2274	+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
	2275	+ &dst_oid, &dst_oloc,
	2276	+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
	2277	+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
	2278	+ dst_ci->i_truncate_seq,
	2279	+ dst_ci->i_truncate_size,
	2280	+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
	2281	+ if (ret) {
	2282	+ if (ret == -EOPNOTSUPP) {
	2283	+ fsc->have_copy_from2 = false;
	2284	+ pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
	2285	+ }
	2286	+ dout("ceph_osdc_copy_from returned %d\n", ret);
	2287	+ if (!bytes)
	2288	+ bytes = ret;
	2289	+ goto out;
	2290	+ }
	2291	+ len -= object_size;
	2292	+ bytes += object_size;
	2293	+ *src_off += object_size;
	2294	+ *dst_off += object_size;
	2295	+ }
	2296	+
	2297	+out:
	2298	+ ceph_oloc_destroy(&src_oloc);
	2299	+ ceph_oloc_destroy(&dst_oloc);
	2300	+ return bytes;
	2301	+}
	2302	+
	2303	+static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
	2304	+ struct file *dst_file, loff_t dst_off,
	2305	+ size_t len, unsigned int flags)
	2306	+{
		+ /*
		+ * Copy 'len' bytes from src_file to dst_file using whole-object remote
		+ * copies (ceph_do_objects_copy()) plus do_splice_direct() for an
		+ * unaligned head and a sub-object tail. Returns the number of bytes
		+ * copied or a negative error; -EOPNOTSUPP and -EXDEV make the caller
		+ * (ceph_copy_file_range()) fall back to generic_copy_file_range().
		+ */
	2307	+ struct inode *src_inode = file_inode(src_file);
	2308	+ struct inode *dst_inode = file_inode(dst_file);
	2309	+ struct ceph_inode_info *src_ci = ceph_inode(src_inode);
	2310	+ struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
	2311	+ struct ceph_cap_flush *prealloc_cf;
	2312	+ struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
	2313	+ loff_t size;
	2314	+ ssize_t ret = -EIO, bytes;
	2315	+ u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
	2316	+ u32 src_objlen, dst_objlen;
	2317	+ int src_got = 0, dst_got = 0, err, dirty;
	2318	+
	2319	+ if (src_inode->i_sb != dst_inode->i_sb) {
	2320	+ struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
	2321	+
	2322	+ if (ceph_fsid_compare(&src_fsc->client->fsid,
	2323	+ &dst_fsc->client->fsid)) {
	2324	+ dout("Copying files across clusters: src: %pU dst: %pU\n",
	2325	+ &src_fsc->client->fsid, &dst_fsc->client->fsid);
	2326	+ return -EXDEV;
	2327	+ }
	2328	+ }
	2329	+ if (ceph_snap(dst_inode) != CEPH_NOSNAP)
	2330	+ return -EROFS;
	2331	+
	2332	+ /*
	2333	+ * Some of the checks below will return -EOPNOTSUPP, which will force a
	2334	+ * fallback to the default VFS copy_file_range implementation. This is
	2335	+ * desirable in several cases (for ex, the 'len' is smaller than the
	2336	+ * size of the objects, or in cases where that would be more
	2337	+ * efficient).
	2338	+ */
	2339	+
	2340	+ if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
	2341	+ return -EOPNOTSUPP;
	2342	+
	2343	+ if (!src_fsc->have_copy_from2)
	2344	+ return -EOPNOTSUPP;
	2345	+
	2346	+ /*
	2347	+ * Striped file layouts require that we copy partial objects, but the
	2348	+ * OSD copy-from operation only supports full-object copies. Limit
	2349	+ * this to non-striped file layouts for now.
	2350	+ */
	2351	+ if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
	2352	+ (src_ci->i_layout.stripe_count != 1) ||
	2353	+ (dst_ci->i_layout.stripe_count != 1) ||
	2354	+ (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
	2355	+ dout("Invalid src/dst files layout\n");
	2356	+ return -EOPNOTSUPP;
	2357	+ }
	2358	+
	2359	+ if (len < src_ci->i_layout.object_size)
	2360	+ return -EOPNOTSUPP; /* no remote copy will be done */
	2361	+
	2362	+ prealloc_cf = ceph_alloc_cap_flush();
	2363	+ if (!prealloc_cf)
	2364	+ return -ENOMEM;
	2365	+
	2366	+ /* Start by sync'ing the source and destination files */
	2367	+ ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
	2368	+ if (ret < 0) {
	2369	+ dout("failed to write src file (%zd)\n", ret);
	2370	+ goto out;
	2371	+ }
	2372	+ ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
	2373	+ if (ret < 0) {
	2374	+ dout("failed to write dst file (%zd)\n", ret);
	2375	+ goto out;
	2376	+ }
	2377	+
	2378	+ /*
	2379	+ * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
	2380	+ * clients may have dirty data in their caches. And OSDs know nothing
	2381	+ * about caps, so they can't safely do the remote object copies.
	2382	+ */
	2383	+ err = get_rd_wr_caps(src_file, &src_got,
	2384	+ dst_file, (dst_off + len), &dst_got);
	2385	+ if (err < 0) {
	2386	+ dout("get_rd_wr_caps returned %d\n", err);
	2387	+ ret = -EOPNOTSUPP;
	2388	+ goto out;
	2389	+ }
	2390	+
	2391	+ ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
	2392	+ if (ret < 0)
	2393	+ goto out_caps;
	2394	+
	2395	+ /* Drop dst file cached pages */
	2396	+ ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
	2397	+ dst_off >> PAGE_SHIFT,
	2398	+ (dst_off + len) >> PAGE_SHIFT);
	2399	+ if (ret < 0) {
	2400	+ dout("Failed to invalidate inode pages (%zd)\n", ret);
	2401	+ ret = 0; /* XXX */
	2402	+ }
	2403	+ ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
	2404	+ src_ci->i_layout.object_size,
	2405	+ &src_objnum, &src_objoff, &src_objlen);
	2406	+ ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
	2407	+ dst_ci->i_layout.object_size,
	2408	+ &dst_objnum, &dst_objoff, &dst_objlen);
	2409	+ /* object-level offsets need to be the same */
	2410	+ if (src_objoff != dst_objoff) {
	2411	+ ret = -EOPNOTSUPP;
	2412	+ goto out_caps;
	2413	+ }
	2414	+
	2415	+ /*
	2416	+ * Do a manual copy if the object offset isn't object aligned.
	2417	+ * 'src_objlen' contains the bytes left until the end of the object,
	2418	+ * starting at the src_off
	2419	+ */
	2420	+ if (src_objoff) {
	2421	+ dout("Initial partial copy of %u bytes\n", src_objlen);
	2422	+
	2423	+ /*
	2424	+ * we need to temporarily drop all caps as we'll be calling
	2425	+ * {read,write}_iter, which will get caps again.
	2426	+ */
	2427	+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
	2428	+ ret = do_splice_direct(src_file, &src_off, dst_file,
	2429	+ &dst_off, src_objlen, flags);
	2430	+ /* Abort on short copies or on error */
	2431	+ if (ret < src_objlen) {
	2432	+ dout("Failed partial copy (%zd)\n", ret);
	2433	+ goto out;
	2434	+ }
	2435	+ len -= ret;
	2436	+ err = get_rd_wr_caps(src_file, &src_got,
	2437	+ dst_file, (dst_off + len), &dst_got);
	2438	+ if (err < 0)
	2439	+ goto out;
	2440	+ err = is_file_size_ok(src_inode, dst_inode,
	2441	+ src_off, dst_off, len);
	2442	+ if (err < 0)
	2443	+ goto out_caps;
	2444	+ }
	2445	+
	2446	+ size = i_size_read(dst_inode);
	2447	+ bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
	2448	+ src_fsc, len, flags);
	2449	+ if (bytes <= 0) {
	2450	+ if (!ret)
	2451	+ ret = bytes;
	2452	+ goto out_caps;
	2453	+ }
	2454	+ dout("Copied %zu bytes out of %zu\n", bytes, len);
	2455	+ len -= bytes;
	2456	+ ret += bytes;
	2457	+
	2458	+ file_update_time(dst_file);
	2459	+ inode_inc_iversion_raw(dst_inode);
	2460	+
	2461	+ if (dst_off > size) {
	2462	+ /* Let the MDS know about dst file size change */
	2463	+ if (ceph_inode_set_size(dst_inode, dst_off) ||
	2464	+ ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
	2465	+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
	2466	+ }
	2467	+ /* Mark Fw dirty */
	2468	+ spin_lock(&dst_ci->i_ceph_lock);
	2469	+ dst_ci->i_inline_version = CEPH_INLINE_NONE;
	2470	+ dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
	2471	+ spin_unlock(&dst_ci->i_ceph_lock);
	2472	+ if (dirty)
	2473	+ __mark_inode_dirty(dst_inode, dirty);
	2474	+
	2475	+out_caps:
	2476	+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
	2477	+
	2478	+ /*
	2479	+ * Do the final manual copy if we still have some bytes left, unless
	2480	+ * there were errors in remote object copies (len >= object_size).
	2481	+ */
	2482	+ if (len && (len < src_ci->i_layout.object_size)) {
	2483	+ dout("Final partial copy of %zu bytes\n", len);
	2484	+ bytes = do_splice_direct(src_file, &src_off, dst_file,
	2485	+ &dst_off, len, flags);
	2486	+ if (bytes > 0)
	2487	+ ret += bytes;
	2488	+ else
	2489	+ dout("Failed partial copy (%zd)\n", bytes);
	2490	+ }
	2491	+
	2492	+out:
	2493	+ ceph_free_cap_flush(prealloc_cf);
	2494	+
	2495	+ return ret;
	2496	+}
	2497	+
	2498	+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
	2499	+ struct file *dst_file, loff_t dst_off,
	2500	+ size_t len, unsigned int flags)
	2501	+{
		+ /*
		+ * Try the ceph object-copy path first; if it reports -EOPNOTSUPP or
		+ * -EXDEV, fall back to generic_copy_file_range() with the original
		+ * offsets and length. Any other result is returned unchanged.
		+ */
	2502	+ ssize_t ret;
	2503	+
	2504	+ ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
	2505	+ len, flags);
	2506	+
	2507	+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
	2508	+ ret = generic_copy_file_range(src_file, src_off, dst_file,
	2509	+ dst_off, len, flags);
	2510	+ return ret;
	2511	+}
	2512	+
1813	2513	const struct file_operations ceph_file_fops = {
1814	2514	.open = ceph_open,
1815	2515	.release = ceph_release,
..	..	@@ -1824,7 +2524,7 @@
1824	2524	.splice_read = generic_file_splice_read,
1825	2525	.splice_write = iter_file_splice_write,
1826	2526	.unlocked_ioctl = ceph_ioctl,
1827		- .compat_ioctl = ceph_ioctl,
	2527	+ .compat_ioctl = compat_ptr_ioctl,
1828	2528	.fallocate = ceph_fallocate,
	2529	+ .copy_file_range = ceph_copy_file_range,
1829	2530	};
1830		-