hc
2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/fs/proc/proc_sysctl.c
....@@ -12,7 +12,10 @@
1212 #include <linux/cred.h>
1313 #include <linux/namei.h>
1414 #include <linux/mm.h>
15
+#include <linux/uio.h>
1516 #include <linux/module.h>
17
+#include <linux/bpf-cgroup.h>
18
+#include <linux/mount.h>
1619 #include "internal.h"
1720
1821 static const struct dentry_operations proc_sys_dentry_operations;
....@@ -20,6 +23,10 @@
2023 static const struct inode_operations proc_sys_inode_operations;
2124 static const struct file_operations proc_sys_dir_file_operations;
2225 static const struct inode_operations proc_sys_dir_operations;
26
+
27
+/* shared constants to be used in various sysctls */
28
+const int sysctl_vals[] = { 0, 1, INT_MAX };
29
+EXPORT_SYMBOL(sysctl_vals);
2330
2431 /* Support for permanently empty directories */
2532
....@@ -262,42 +269,9 @@
262269 complete(p->unregistering);
263270 }
264271
265
-static void proc_sys_prune_dcache(struct ctl_table_header *head)
272
+static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
266273 {
267
- struct inode *inode;
268
- struct proc_inode *ei;
269
- struct hlist_node *node;
270
- struct super_block *sb;
271
-
272
- rcu_read_lock();
273
- for (;;) {
274
- node = hlist_first_rcu(&head->inodes);
275
- if (!node)
276
- break;
277
- ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
278
- spin_lock(&sysctl_lock);
279
- hlist_del_init_rcu(&ei->sysctl_inodes);
280
- spin_unlock(&sysctl_lock);
281
-
282
- inode = &ei->vfs_inode;
283
- sb = inode->i_sb;
284
- if (!atomic_inc_not_zero(&sb->s_active))
285
- continue;
286
- inode = igrab(inode);
287
- rcu_read_unlock();
288
- if (unlikely(!inode)) {
289
- deactivate_super(sb);
290
- rcu_read_lock();
291
- continue;
292
- }
293
-
294
- d_prune_aliases(inode);
295
- iput(inode);
296
- deactivate_super(sb);
297
-
298
- rcu_read_lock();
299
- }
300
- rcu_read_unlock();
274
+ proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
301275 }
302276
303277 /* called under sysctl_lock, will reacquire if has to wait */
....@@ -319,10 +293,10 @@
319293 spin_unlock(&sysctl_lock);
320294 }
321295 /*
322
- * Prune dentries for unregistered sysctls: namespaced sysctls
296
+ * Invalidate dentries for unregistered sysctls: namespaced sysctls
323297 * can have duplicate names and contaminate dcache very badly.
324298 */
325
- proc_sys_prune_dcache(p);
299
+ proc_sys_invalidate_dcache(p);
326300 /*
327301 * do not remove from the list until nobody holds it; walking the
328302 * list in do_sysctl() relies on that.
....@@ -478,7 +452,7 @@
478452 }
479453 ei->sysctl = head;
480454 ei->sysctl_entry = table;
481
- hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
455
+ hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
482456 head->count++;
483457 spin_unlock(&sysctl_lock);
484458
....@@ -509,7 +483,7 @@
509483 void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
510484 {
511485 spin_lock(&sysctl_lock);
512
- hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
486
+ hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
513487 if (!--head->count)
514488 kfree_rcu(head, rcu);
515489 spin_unlock(&sysctl_lock);
....@@ -567,14 +541,15 @@
567541 return err;
568542 }
569543
570
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
571
- size_t count, loff_t *ppos, int write)
544
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
545
+ int write)
572546 {
573
- struct inode *inode = file_inode(filp);
547
+ struct inode *inode = file_inode(iocb->ki_filp);
574548 struct ctl_table_header *head = grab_header(inode);
575549 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
550
+ size_t count = iov_iter_count(iter);
551
+ char *kbuf;
576552 ssize_t error;
577
- size_t res;
578553
579554 if (IS_ERR(head))
580555 return PTR_ERR(head);
....@@ -592,27 +567,54 @@
592567 if (!table->proc_handler)
593568 goto out;
594569
570
+ /* don't even try if the size is too large */
571
+ error = -ENOMEM;
572
+ if (count >= KMALLOC_MAX_SIZE)
573
+ goto out;
574
+ kbuf = kvzalloc(count + 1, GFP_KERNEL);
575
+ if (!kbuf)
576
+ goto out;
577
+
578
+ if (write) {
579
+ error = -EFAULT;
580
+ if (!copy_from_iter_full(kbuf, count, iter))
581
+ goto out_free_buf;
582
+ kbuf[count] = '\0';
583
+ }
584
+
585
+ error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
586
+ &iocb->ki_pos);
587
+ if (error)
588
+ goto out_free_buf;
589
+
595590 /* careful: calling conventions are nasty here */
596
- res = count;
597
- error = table->proc_handler(table, write, buf, &res, ppos);
598
- if (!error)
599
- error = res;
591
+ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
592
+ if (error)
593
+ goto out_free_buf;
594
+
595
+ if (!write) {
596
+ error = -EFAULT;
597
+ if (copy_to_iter(kbuf, count, iter) < count)
598
+ goto out_free_buf;
599
+ }
600
+
601
+ error = count;
602
+out_free_buf:
603
+ kvfree(kbuf);
600604 out:
601605 sysctl_head_finish(head);
602606
603607 return error;
604608 }
605609
606
-static ssize_t proc_sys_read(struct file *filp, char __user *buf,
607
- size_t count, loff_t *ppos)
610
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
608611 {
609
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
612
+ return proc_sys_call_handler(iocb, iter, 0);
610613 }
611614
612
-static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
613
- size_t count, loff_t *ppos)
615
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
614616 {
615
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
617
+ return proc_sys_call_handler(iocb, iter, 1);
616618 }
617619
618620 static int proc_sys_open(struct inode *inode, struct file *filp)
....@@ -849,8 +851,10 @@
849851 static const struct file_operations proc_sys_file_operations = {
850852 .open = proc_sys_open,
851853 .poll = proc_sys_poll,
852
- .read = proc_sys_read,
853
- .write = proc_sys_write,
854
+ .read_iter = proc_sys_read,
855
+ .write_iter = proc_sys_write,
856
+ .splice_read = generic_file_splice_read,
857
+ .splice_write = iter_file_splice_write,
854858 .llseek = default_llseek,
855859 };
856860
....@@ -1699,8 +1703,157 @@
16991703
17001704 proc_sys_root = proc_mkdir("sys", NULL);
17011705 proc_sys_root->proc_iops = &proc_sys_dir_operations;
1702
- proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
1706
+ proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
17031707 proc_sys_root->nlink = 0;
17041708
17051709 return sysctl_init();
17061710 }
1711
+
1712
+struct sysctl_alias {
1713
+ const char *kernel_param;
1714
+ const char *sysctl_param;
1715
+};
1716
+
1717
+/*
1718
+ * Historically some settings had both sysctl and a command line parameter.
1719
+ * With the generic "sysctl." parameter support, we can handle them at a single
1720
+ * place and only keep the historical name for compatibility. This is not meant
1721
+ * to add brand new aliases. When adding existing aliases, consider whether
1722
+ * applying the value at a different moment (e.g. when do_sysctl_args() runs
1723
+ * rather than at early_param time) is an issue for the specific
1724
+ * parameter.
1725
+ */
1726
+static const struct sysctl_alias sysctl_aliases[] = {
1727
+ {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" },
1728
+ {"hung_task_panic", "kernel.hung_task_panic" },
1729
+ {"numa_zonelist_order", "vm.numa_zonelist_order" },
1730
+ {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
1731
+ {"softlockup_panic", "kernel.softlockup_panic" },
1732
+ { }
1733
+};
1734
+
1735
+static const char *sysctl_find_alias(char *param)
1736
+{
1737
+ const struct sysctl_alias *alias;
1738
+
1739
+ for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) {
1740
+ if (strcmp(alias->kernel_param, param) == 0)
1741
+ return alias->sysctl_param;
1742
+ }
1743
+
1744
+ return NULL;
1745
+}
1746
+
1747
+/* Set sysctl value passed on kernel command line. */
1748
+static int process_sysctl_arg(char *param, char *val,
1749
+ const char *unused, void *arg)
1750
+{
1751
+ char *path;
1752
+ struct vfsmount **proc_mnt = arg;
1753
+ struct file_system_type *proc_fs_type;
1754
+ struct file *file;
1755
+ int len;
1756
+ int err;
1757
+ loff_t pos = 0;
1758
+ ssize_t wret;
1759
+
1760
+ if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) {
1761
+ param += sizeof("sysctl") - 1;
1762
+
1763
+ if (param[0] != '/' && param[0] != '.')
1764
+ return 0;
1765
+
1766
+ param++;
1767
+ } else {
1768
+ param = (char *) sysctl_find_alias(param);
1769
+ if (!param)
1770
+ return 0;
1771
+ }
1772
+
1773
+ if (!val)
1774
+ return -EINVAL;
1775
+ len = strlen(val);
1776
+ if (len == 0)
1777
+ return -EINVAL;
1778
+
1779
+ /*
1780
+ * To set sysctl options, we use a temporary mount of proc, look up the
1781
+ * respective sys/ file and write to it. To avoid mounting it when no
1782
+ * options were given, we mount it only when the first sysctl option is
1783
+ * found. Why not a persistent mount? There are problems with a
1784
+ * persistent mount of proc in that it forces userspace not to use any
1785
+ * proc mount options.
1786
+ */
1787
+ if (!*proc_mnt) {
1788
+ proc_fs_type = get_fs_type("proc");
1789
+ if (!proc_fs_type) {
1790
+ pr_err("Failed to find procfs to set sysctl from command line\n");
1791
+ return 0;
1792
+ }
1793
+ *proc_mnt = kern_mount(proc_fs_type);
1794
+ put_filesystem(proc_fs_type);
1795
+ if (IS_ERR(*proc_mnt)) {
1796
+ pr_err("Failed to mount procfs to set sysctl from command line\n");
1797
+ return 0;
1798
+ }
1799
+ }
1800
+
1801
+ path = kasprintf(GFP_KERNEL, "sys/%s", param);
1802
+ if (!path)
1803
+ panic("%s: Failed to allocate path for %s\n", __func__, param);
1804
+ strreplace(path, '.', '/');
1805
+
1806
+ file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0);
1807
+ if (IS_ERR(file)) {
1808
+ err = PTR_ERR(file);
1809
+ if (err == -ENOENT)
1810
+ pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n",
1811
+ param, val);
1812
+ else if (err == -EACCES)
1813
+ pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n",
1814
+ param, val);
1815
+ else
1816
+ pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n",
1817
+ file, param, val);
1818
+ goto out;
1819
+ }
1820
+ wret = kernel_write(file, val, len, &pos);
1821
+ if (wret < 0) {
1822
+ err = wret;
1823
+ if (err == -EINVAL)
1824
+ pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n",
1825
+ param, val);
1826
+ else
1827
+ pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n",
1828
+ ERR_PTR(err), param, val);
1829
+ } else if (wret != len) {
1830
+ pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n",
1831
+ wret, len, path, param, val);
1832
+ }
1833
+
1834
+ err = filp_close(file, NULL);
1835
+ if (err)
1836
+ pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n",
1837
+ ERR_PTR(err), param, val);
1838
+out:
1839
+ kfree(path);
1840
+ return 0;
1841
+}
1842
+
1843
+void do_sysctl_args(void)
1844
+{
1845
+ char *command_line;
1846
+ struct vfsmount *proc_mnt = NULL;
1847
+
1848
+ command_line = kstrdup(saved_command_line, GFP_KERNEL);
1849
+ if (!command_line)
1850
+ panic("%s: Failed to allocate copy of command line\n", __func__);
1851
+
1852
+ parse_args("Setting sysctl args", command_line,
1853
+ NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg);
1854
+
1855
+ if (proc_mnt)
1856
+ kern_unmount(proc_mnt);
1857
+
1858
+ kfree(command_line);
1859
+}