hc
2024-10-12 a5969cabbb4660eab42b6ef0412cbbd1200cf14d
kernel/fs/proc/proc_sysctl.c
....@@ -12,7 +12,11 @@
1212 #include <linux/cred.h>
1313 #include <linux/namei.h>
1414 #include <linux/mm.h>
15
+#include <linux/uio.h>
1516 #include <linux/module.h>
17
+#include <linux/bpf-cgroup.h>
18
+#include <linux/mount.h>
19
+#include <linux/kmemleak.h>
1620 #include "internal.h"
1721
1822 static const struct dentry_operations proc_sys_dentry_operations;
....@@ -20,6 +24,12 @@
2024 static const struct inode_operations proc_sys_inode_operations;
2125 static const struct file_operations proc_sys_dir_file_operations;
2226 static const struct inode_operations proc_sys_dir_operations;
27
+
28
+/* shared constants to be used in various sysctls */
29
+const int sysctl_vals[] = { 0, 1, INT_MAX };
30
+EXPORT_SYMBOL(sysctl_vals);
31
+const int android_gki_sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX };
32
+EXPORT_SYMBOL(android_gki_sysctl_vals);
2333
2434 /* Support for permanently empty directories */
2535
....@@ -262,42 +272,9 @@
262272 complete(p->unregistering);
263273 }
264274
265
-static void proc_sys_prune_dcache(struct ctl_table_header *head)
275
+static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
266276 {
267
- struct inode *inode;
268
- struct proc_inode *ei;
269
- struct hlist_node *node;
270
- struct super_block *sb;
271
-
272
- rcu_read_lock();
273
- for (;;) {
274
- node = hlist_first_rcu(&head->inodes);
275
- if (!node)
276
- break;
277
- ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
278
- spin_lock(&sysctl_lock);
279
- hlist_del_init_rcu(&ei->sysctl_inodes);
280
- spin_unlock(&sysctl_lock);
281
-
282
- inode = &ei->vfs_inode;
283
- sb = inode->i_sb;
284
- if (!atomic_inc_not_zero(&sb->s_active))
285
- continue;
286
- inode = igrab(inode);
287
- rcu_read_unlock();
288
- if (unlikely(!inode)) {
289
- deactivate_super(sb);
290
- rcu_read_lock();
291
- continue;
292
- }
293
-
294
- d_prune_aliases(inode);
295
- iput(inode);
296
- deactivate_super(sb);
297
-
298
- rcu_read_lock();
299
- }
300
- rcu_read_unlock();
277
+ proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
301278 }
302279
303280 /* called under sysctl_lock, will reacquire if has to wait */
....@@ -319,10 +296,10 @@
319296 spin_unlock(&sysctl_lock);
320297 }
321298 /*
322
- * Prune dentries for unregistered sysctls: namespaced sysctls
299
+ * Invalidate dentries for unregistered sysctls: namespaced sysctls
323300 * can have duplicate names and contaminate dcache very badly.
324301 */
325
- proc_sys_prune_dcache(p);
302
+ proc_sys_invalidate_dcache(p);
326303 /*
327304 * do not remove from the list until nobody holds it; walking the
328305 * list in do_sysctl() relies on that.
....@@ -478,7 +455,7 @@
478455 }
479456 ei->sysctl = head;
480457 ei->sysctl_entry = table;
481
- hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
458
+ hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
482459 head->count++;
483460 spin_unlock(&sysctl_lock);
484461
....@@ -509,7 +486,7 @@
509486 void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
510487 {
511488 spin_lock(&sysctl_lock);
512
- hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
489
+ hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
513490 if (!--head->count)
514491 kfree_rcu(head, rcu);
515492 spin_unlock(&sysctl_lock);
....@@ -567,14 +544,15 @@
567544 return err;
568545 }
569546
570
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
571
- size_t count, loff_t *ppos, int write)
547
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
548
+ int write)
572549 {
573
- struct inode *inode = file_inode(filp);
550
+ struct inode *inode = file_inode(iocb->ki_filp);
574551 struct ctl_table_header *head = grab_header(inode);
575552 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
553
+ size_t count = iov_iter_count(iter);
554
+ char *kbuf;
576555 ssize_t error;
577
- size_t res;
578556
579557 if (IS_ERR(head))
580558 return PTR_ERR(head);
....@@ -592,27 +570,54 @@
592570 if (!table->proc_handler)
593571 goto out;
594572
573
+ /* don't even try if the size is too large */
574
+ error = -ENOMEM;
575
+ if (count >= KMALLOC_MAX_SIZE)
576
+ goto out;
577
+ kbuf = kvzalloc(count + 1, GFP_KERNEL);
578
+ if (!kbuf)
579
+ goto out;
580
+
581
+ if (write) {
582
+ error = -EFAULT;
583
+ if (!copy_from_iter_full(kbuf, count, iter))
584
+ goto out_free_buf;
585
+ kbuf[count] = '\0';
586
+ }
587
+
588
+ error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
589
+ &iocb->ki_pos);
590
+ if (error)
591
+ goto out_free_buf;
592
+
595593 /* careful: calling conventions are nasty here */
596
- res = count;
597
- error = table->proc_handler(table, write, buf, &res, ppos);
598
- if (!error)
599
- error = res;
594
+ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
595
+ if (error)
596
+ goto out_free_buf;
597
+
598
+ if (!write) {
599
+ error = -EFAULT;
600
+ if (copy_to_iter(kbuf, count, iter) < count)
601
+ goto out_free_buf;
602
+ }
603
+
604
+ error = count;
605
+out_free_buf:
606
+ kvfree(kbuf);
600607 out:
601608 sysctl_head_finish(head);
602609
603610 return error;
604611 }
605612
606
-static ssize_t proc_sys_read(struct file *filp, char __user *buf,
607
- size_t count, loff_t *ppos)
613
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
608614 {
609
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
615
+ return proc_sys_call_handler(iocb, iter, 0);
610616 }
611617
612
-static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
613
- size_t count, loff_t *ppos)
618
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
614619 {
615
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
620
+ return proc_sys_call_handler(iocb, iter, 1);
616621 }
617622
618623 static int proc_sys_open(struct inode *inode, struct file *filp)
....@@ -681,7 +686,7 @@
681686
682687 child = d_lookup(dir, &qname);
683688 if (!child) {
684
- DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
689
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
685690 child = d_alloc_parallel(dir, &qname, &wq);
686691 if (IS_ERR(child))
687692 return false;
....@@ -849,8 +854,10 @@
849854 static const struct file_operations proc_sys_file_operations = {
850855 .open = proc_sys_open,
851856 .poll = proc_sys_poll,
852
- .read = proc_sys_read,
853
- .write = proc_sys_write,
857
+ .read_iter = proc_sys_read,
858
+ .write_iter = proc_sys_write,
859
+ .splice_read = generic_file_splice_read,
860
+ .splice_write = iter_file_splice_write,
854861 .llseek = default_llseek,
855862 };
856863
....@@ -1101,6 +1108,11 @@
11011108 err |= sysctl_err(path, table, "array not allowed");
11021109 }
11031110
1111
+ if (table->proc_handler == proc_dou8vec_minmax) {
1112
+ if (table->maxlen != sizeof(u8))
1113
+ err |= sysctl_err(path, table, "array not allowed");
1114
+ }
1115
+
11041116 return err;
11051117 }
11061118
....@@ -1116,6 +1128,7 @@
11161128 (table->proc_handler == proc_douintvec) ||
11171129 (table->proc_handler == proc_douintvec_minmax) ||
11181130 (table->proc_handler == proc_dointvec_minmax) ||
1131
+ (table->proc_handler == proc_dou8vec_minmax) ||
11191132 (table->proc_handler == proc_dointvec_jiffies) ||
11201133 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
11211134 (table->proc_handler == proc_dointvec_ms_jiffies) ||
....@@ -1375,6 +1388,38 @@
13751388 path, table);
13761389 }
13771390 EXPORT_SYMBOL(register_sysctl);
1391
+
1392
+/**
1393
+ * __register_sysctl_init() - register sysctl table to path
1394
+ * @path: path name for sysctl base
1395
+ * @table: This is the sysctl table that needs to be registered to the path
1396
+ * @table_name: The name of sysctl table, only used for log printing when
1397
+ * registration fails
1398
+ *
1399
+ * The sysctl interface is used by userspace to query or modify at runtime
1400
+ * a predefined value set on a variable. These variables however have default
1401
+ * values pre-set. Code which depends on these variables will always work even
1402
+ * if register_sysctl() fails. If register_sysctl() fails you'd just lose the
1403
+ * ability to query or modify the sysctls dynamically at run time. Chances of
1404
+ * register_sysctl() failing on init are extremely low, and so for both reasons
1405
+ * this function does not return any error as it is used by initialization code.
1406
+ *
1407
+ * Context: Can only be called after your respective sysctl base path has been
1408
+ * registered. So for instance, most base directories are registered early on
1409
+ * init before init levels are processed through proc_sys_init() and
1410
+ * sysctl_init().
1411
+ */
1412
+void __init __register_sysctl_init(const char *path, struct ctl_table *table,
1413
+ const char *table_name)
1414
+{
1415
+ struct ctl_table_header *hdr = register_sysctl(path, table);
1416
+
1417
+ if (unlikely(!hdr)) {
1418
+ pr_err("failed when register_sysctl %s to %s\n", table_name, path);
1419
+ return;
1420
+ }
1421
+ kmemleak_not_leak(hdr);
1422
+}
13781423
13791424 static char *append_path(const char *path, char *pos, const char *name)
13801425 {
....@@ -1699,8 +1744,157 @@
16991744
17001745 proc_sys_root = proc_mkdir("sys", NULL);
17011746 proc_sys_root->proc_iops = &proc_sys_dir_operations;
1702
- proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
1747
+ proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
17031748 proc_sys_root->nlink = 0;
17041749
17051750 return sysctl_init();
17061751 }
1752
+
1753
+struct sysctl_alias {
1754
+ const char *kernel_param;
1755
+ const char *sysctl_param;
1756
+};
1757
+
1758
+/*
1759
+ * Historically some settings had both sysctl and a command line parameter.
1760
+ * With the generic sysctl. parameter support, we can handle them at a single
1761
+ * place and only keep the historical name for compatibility. This is not meant
1762
+ * to add brand new aliases. When adding existing aliases, consider whether
1763
+ * the possibly different moment of changing the value (e.g. from early_param
1764
+ * to the moment do_sysctl_args() is called) is an issue for the specific
1765
+ * parameter.
1766
+ */
1767
+static const struct sysctl_alias sysctl_aliases[] = {
1768
+ {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" },
1769
+ {"hung_task_panic", "kernel.hung_task_panic" },
1770
+ {"numa_zonelist_order", "vm.numa_zonelist_order" },
1771
+ {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
1772
+ {"softlockup_panic", "kernel.softlockup_panic" },
1773
+ { }
1774
+};
1775
+
1776
+static const char *sysctl_find_alias(char *param)
1777
+{
1778
+ const struct sysctl_alias *alias;
1779
+
1780
+ for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) {
1781
+ if (strcmp(alias->kernel_param, param) == 0)
1782
+ return alias->sysctl_param;
1783
+ }
1784
+
1785
+ return NULL;
1786
+}
1787
+
1788
+/* Set sysctl value passed on kernel command line. */
1789
+static int process_sysctl_arg(char *param, char *val,
1790
+ const char *unused, void *arg)
1791
+{
1792
+ char *path;
1793
+ struct vfsmount **proc_mnt = arg;
1794
+ struct file_system_type *proc_fs_type;
1795
+ struct file *file;
1796
+ int len;
1797
+ int err;
1798
+ loff_t pos = 0;
1799
+ ssize_t wret;
1800
+
1801
+ if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) {
1802
+ param += sizeof("sysctl") - 1;
1803
+
1804
+ if (param[0] != '/' && param[0] != '.')
1805
+ return 0;
1806
+
1807
+ param++;
1808
+ } else {
1809
+ param = (char *) sysctl_find_alias(param);
1810
+ if (!param)
1811
+ return 0;
1812
+ }
1813
+
1814
+ if (!val)
1815
+ return -EINVAL;
1816
+ len = strlen(val);
1817
+ if (len == 0)
1818
+ return -EINVAL;
1819
+
1820
+ /*
1821
+ * To set sysctl options, we use a temporary mount of proc, look up the
1822
+ * respective sys/ file and write to it. To avoid mounting it when no
1823
+ * options were given, we mount it only when the first sysctl option is
1824
+ * found. Why not a persistent mount? There are problems with a
1825
+ * persistent mount of proc in that it forces userspace not to use any
1826
+ * proc mount options.
1827
+ */
1828
+ if (!*proc_mnt) {
1829
+ proc_fs_type = get_fs_type("proc");
1830
+ if (!proc_fs_type) {
1831
+ pr_err("Failed to find procfs to set sysctl from command line\n");
1832
+ return 0;
1833
+ }
1834
+ *proc_mnt = kern_mount(proc_fs_type);
1835
+ put_filesystem(proc_fs_type);
1836
+ if (IS_ERR(*proc_mnt)) {
1837
+ pr_err("Failed to mount procfs to set sysctl from command line\n");
1838
+ return 0;
1839
+ }
1840
+ }
1841
+
1842
+ path = kasprintf(GFP_KERNEL, "sys/%s", param);
1843
+ if (!path)
1844
+ panic("%s: Failed to allocate path for %s\n", __func__, param);
1845
+ strreplace(path, '.', '/');
1846
+
1847
+ file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0);
1848
+ if (IS_ERR(file)) {
1849
+ err = PTR_ERR(file);
1850
+ if (err == -ENOENT)
1851
+ pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n",
1852
+ param, val);
1853
+ else if (err == -EACCES)
1854
+ pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n",
1855
+ param, val);
1856
+ else
1857
+ pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n",
1858
+ file, param, val);
1859
+ goto out;
1860
+ }
1861
+ wret = kernel_write(file, val, len, &pos);
1862
+ if (wret < 0) {
1863
+ err = wret;
1864
+ if (err == -EINVAL)
1865
+ pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n",
1866
+ param, val);
1867
+ else
1868
+ pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n",
1869
+ ERR_PTR(err), param, val);
1870
+ } else if (wret != len) {
1871
+ pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n",
1872
+ wret, len, path, param, val);
1873
+ }
1874
+
1875
+ err = filp_close(file, NULL);
1876
+ if (err)
1877
+ pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n",
1878
+ ERR_PTR(err), param, val);
1879
+out:
1880
+ kfree(path);
1881
+ return 0;
1882
+}
1883
+
1884
+void do_sysctl_args(void)
1885
+{
1886
+ char *command_line;
1887
+ struct vfsmount *proc_mnt = NULL;
1888
+
1889
+ command_line = kstrdup(saved_command_line, GFP_KERNEL);
1890
+ if (!command_line)
1891
+ panic("%s: Failed to allocate copy of command line\n", __func__);
1892
+
1893
+ parse_args("Setting sysctl args", command_line,
1894
+ NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg);
1895
+
1896
+ if (proc_mnt)
1897
+ kern_unmount(proc_mnt);
1898
+
1899
+ kfree(command_line);
1900
+}