.. | .. |
---|
12 | 12 | #include <linux/cred.h> |
---|
13 | 13 | #include <linux/namei.h> |
---|
14 | 14 | #include <linux/mm.h> |
---|
| 15 | +#include <linux/uio.h> |
---|
15 | 16 | #include <linux/module.h> |
---|
| 17 | +#include <linux/bpf-cgroup.h> |
---|
| 18 | +#include <linux/mount.h> |
---|
16 | 19 | #include "internal.h" |
---|
17 | 20 | |
---|
18 | 21 | static const struct dentry_operations proc_sys_dentry_operations; |
---|
.. | .. |
---|
20 | 23 | static const struct inode_operations proc_sys_inode_operations; |
---|
21 | 24 | static const struct file_operations proc_sys_dir_file_operations; |
---|
22 | 25 | static const struct inode_operations proc_sys_dir_operations; |
---|
| 26 | + |
---|
| 27 | +/* shared constants to be used in various sysctls */ |
---|
| 28 | +const int sysctl_vals[] = { 0, 1, INT_MAX }; |
---|
| 29 | +EXPORT_SYMBOL(sysctl_vals); |
---|
23 | 30 | |
---|
24 | 31 | /* Support for permanently empty directories */ |
---|
25 | 32 | |
---|
.. | .. |
---|
262 | 269 | complete(p->unregistering); |
---|
263 | 270 | } |
---|
264 | 271 | |
---|
265 | | -static void proc_sys_prune_dcache(struct ctl_table_header *head) |
---|
| 272 | +static void proc_sys_invalidate_dcache(struct ctl_table_header *head) |
---|
266 | 273 | { |
---|
267 | | - struct inode *inode; |
---|
268 | | - struct proc_inode *ei; |
---|
269 | | - struct hlist_node *node; |
---|
270 | | - struct super_block *sb; |
---|
271 | | - |
---|
272 | | - rcu_read_lock(); |
---|
273 | | - for (;;) { |
---|
274 | | - node = hlist_first_rcu(&head->inodes); |
---|
275 | | - if (!node) |
---|
276 | | - break; |
---|
277 | | - ei = hlist_entry(node, struct proc_inode, sysctl_inodes); |
---|
278 | | - spin_lock(&sysctl_lock); |
---|
279 | | - hlist_del_init_rcu(&ei->sysctl_inodes); |
---|
280 | | - spin_unlock(&sysctl_lock); |
---|
281 | | - |
---|
282 | | - inode = &ei->vfs_inode; |
---|
283 | | - sb = inode->i_sb; |
---|
284 | | - if (!atomic_inc_not_zero(&sb->s_active)) |
---|
285 | | - continue; |
---|
286 | | - inode = igrab(inode); |
---|
287 | | - rcu_read_unlock(); |
---|
288 | | - if (unlikely(!inode)) { |
---|
289 | | - deactivate_super(sb); |
---|
290 | | - rcu_read_lock(); |
---|
291 | | - continue; |
---|
292 | | - } |
---|
293 | | - |
---|
294 | | - d_prune_aliases(inode); |
---|
295 | | - iput(inode); |
---|
296 | | - deactivate_super(sb); |
---|
297 | | - |
---|
298 | | - rcu_read_lock(); |
---|
299 | | - } |
---|
300 | | - rcu_read_unlock(); |
---|
| 274 | + proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); |
---|
301 | 275 | } |
---|
302 | 276 | |
---|
303 | 277 | /* called under sysctl_lock, will reacquire if has to wait */ |
---|
.. | .. |
---|
319 | 293 | spin_unlock(&sysctl_lock); |
---|
320 | 294 | } |
---|
321 | 295 | /* |
---|
322 | | - * Prune dentries for unregistered sysctls: namespaced sysctls |
---|
| 296 | + * Invalidate dentries for unregistered sysctls: namespaced sysctls |
---|
323 | 297 | * can have duplicate names and contaminate dcache very badly. |
---|
324 | 298 | */ |
---|
325 | | - proc_sys_prune_dcache(p); |
---|
| 299 | + proc_sys_invalidate_dcache(p); |
---|
326 | 300 | /* |
---|
327 | 301 | * do not remove from the list until nobody holds it; walking the |
---|
328 | 302 | * list in do_sysctl() relies on that. |
---|
.. | .. |
---|
478 | 452 | } |
---|
479 | 453 | ei->sysctl = head; |
---|
480 | 454 | ei->sysctl_entry = table; |
---|
481 | | - hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes); |
---|
| 455 | + hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); |
---|
482 | 456 | head->count++; |
---|
483 | 457 | spin_unlock(&sysctl_lock); |
---|
484 | 458 | |
---|
.. | .. |
---|
509 | 483 | void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) |
---|
510 | 484 | { |
---|
511 | 485 | spin_lock(&sysctl_lock); |
---|
512 | | - hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes); |
---|
| 486 | + hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); |
---|
513 | 487 | if (!--head->count) |
---|
514 | 488 | kfree_rcu(head, rcu); |
---|
515 | 489 | spin_unlock(&sysctl_lock); |
---|
.. | .. |
---|
567 | 541 | return err; |
---|
568 | 542 | } |
---|
569 | 543 | |
---|
570 | | -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, |
---|
571 | | - size_t count, loff_t *ppos, int write) |
---|
| 544 | +static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, |
---|
| 545 | + int write) |
---|
572 | 546 | { |
---|
573 | | - struct inode *inode = file_inode(filp); |
---|
| 547 | + struct inode *inode = file_inode(iocb->ki_filp); |
---|
574 | 548 | struct ctl_table_header *head = grab_header(inode); |
---|
575 | 549 | struct ctl_table *table = PROC_I(inode)->sysctl_entry; |
---|
| 550 | + size_t count = iov_iter_count(iter); |
---|
| 551 | + char *kbuf; |
---|
576 | 552 | ssize_t error; |
---|
577 | | - size_t res; |
---|
578 | 553 | |
---|
579 | 554 | if (IS_ERR(head)) |
---|
580 | 555 | return PTR_ERR(head); |
---|
.. | .. |
---|
592 | 567 | if (!table->proc_handler) |
---|
593 | 568 | goto out; |
---|
594 | 569 | |
---|
| 570 | + /* don't even try if the size is too large */ |
---|
| 571 | + error = -ENOMEM; |
---|
| 572 | + if (count >= KMALLOC_MAX_SIZE) |
---|
| 573 | + goto out; |
---|
| 574 | + kbuf = kvzalloc(count + 1, GFP_KERNEL); |
---|
| 575 | + if (!kbuf) |
---|
| 576 | + goto out; |
---|
| 577 | + |
---|
| 578 | + if (write) { |
---|
| 579 | + error = -EFAULT; |
---|
| 580 | + if (!copy_from_iter_full(kbuf, count, iter)) |
---|
| 581 | + goto out_free_buf; |
---|
| 582 | + kbuf[count] = '\0'; |
---|
| 583 | + } |
---|
| 584 | + |
---|
| 585 | + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, |
---|
| 586 | + &iocb->ki_pos); |
---|
| 587 | + if (error) |
---|
| 588 | + goto out_free_buf; |
---|
| 589 | + |
---|
595 | 590 | /* careful: calling conventions are nasty here */ |
---|
596 | | - res = count; |
---|
597 | | - error = table->proc_handler(table, write, buf, &res, ppos); |
---|
598 | | - if (!error) |
---|
599 | | - error = res; |
---|
| 591 | + error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); |
---|
| 592 | + if (error) |
---|
| 593 | + goto out_free_buf; |
---|
| 594 | + |
---|
| 595 | + if (!write) { |
---|
| 596 | + error = -EFAULT; |
---|
| 597 | + if (copy_to_iter(kbuf, count, iter) < count) |
---|
| 598 | + goto out_free_buf; |
---|
| 599 | + } |
---|
| 600 | + |
---|
| 601 | + error = count; |
---|
| 602 | +out_free_buf: |
---|
| 603 | + kvfree(kbuf); |
---|
600 | 604 | out: |
---|
601 | 605 | sysctl_head_finish(head); |
---|
602 | 606 | |
---|
603 | 607 | return error; |
---|
604 | 608 | } |
---|
605 | 609 | |
---|
606 | | -static ssize_t proc_sys_read(struct file *filp, char __user *buf, |
---|
607 | | - size_t count, loff_t *ppos) |
---|
| 610 | +static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) |
---|
608 | 611 | { |
---|
609 | | - return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); |
---|
| 612 | + return proc_sys_call_handler(iocb, iter, 0); |
---|
610 | 613 | } |
---|
611 | 614 | |
---|
612 | | -static ssize_t proc_sys_write(struct file *filp, const char __user *buf, |
---|
613 | | - size_t count, loff_t *ppos) |
---|
| 615 | +static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) |
---|
614 | 616 | { |
---|
615 | | - return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); |
---|
| 617 | + return proc_sys_call_handler(iocb, iter, 1); |
---|
616 | 618 | } |
---|
617 | 619 | |
---|
618 | 620 | static int proc_sys_open(struct inode *inode, struct file *filp) |
---|
.. | .. |
---|
849 | 851 | static const struct file_operations proc_sys_file_operations = { |
---|
850 | 852 | .open = proc_sys_open, |
---|
851 | 853 | .poll = proc_sys_poll, |
---|
852 | | - .read = proc_sys_read, |
---|
853 | | - .write = proc_sys_write, |
---|
| 854 | + .read_iter = proc_sys_read, |
---|
| 855 | + .write_iter = proc_sys_write, |
---|
| 856 | + .splice_read = generic_file_splice_read, |
---|
| 857 | + .splice_write = iter_file_splice_write, |
---|
854 | 858 | .llseek = default_llseek, |
---|
855 | 859 | }; |
---|
856 | 860 | |
---|
.. | .. |
---|
1699 | 1703 | |
---|
1700 | 1704 | proc_sys_root = proc_mkdir("sys", NULL); |
---|
1701 | 1705 | proc_sys_root->proc_iops = &proc_sys_dir_operations; |
---|
1702 | | - proc_sys_root->proc_fops = &proc_sys_dir_file_operations; |
---|
| 1706 | + proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; |
---|
1703 | 1707 | proc_sys_root->nlink = 0; |
---|
1704 | 1708 | |
---|
1705 | 1709 | return sysctl_init(); |
---|
1706 | 1710 | } |
---|
| 1711 | + |
---|
/* Maps a historical kernel command line parameter to its sysctl name. */
struct sysctl_alias {
	const char *kernel_param;	/* name as written on the command line */
	const char *sysctl_param;	/* dotted sysctl name it stands for */
};

/*
 * Historically some settings had both a sysctl and a command line parameter.
 * With the generic "sysctl." parameter support, we can handle them in a
 * single place and only keep the historical name for compatibility. This is
 * not meant to add brand new aliases. When adding existing aliases, consider
 * whether the possibly different moment of changing the value (e.g. from
 * early_param to the moment do_sysctl_args() is called) is an issue for the
 * specific parameter.
 *
 * The table ends with an entry whose kernel_param is NULL; the lookup below
 * relies on that sentinel.
 */
static const struct sysctl_alias sysctl_aliases[] = {
	{"hardlockup_all_cpu_backtrace",	"kernel.hardlockup_all_cpu_backtrace" },
	{"hung_task_panic",			"kernel.hung_task_panic" },
	{"numa_zonelist_order",			"vm.numa_zonelist_order" },
	{"softlockup_all_cpu_backtrace",	"kernel.softlockup_all_cpu_backtrace" },
	{"softlockup_panic",			"kernel.softlockup_panic" },
	{ NULL, NULL }
};

/*
 * Look up a command line parameter name in sysctl_aliases.
 *
 * @param: parameter name to look up (exact, case-sensitive match); may be
 *         NULL, in which case nothing matches.
 *
 * Returns the sysctl name the parameter is an alias for, or NULL when the
 * name is not a known alias. The returned string belongs to the table and
 * must not be modified or freed.
 */
static const char *sysctl_find_alias(const char *param)
{
	const struct sysctl_alias *alias;

	/* strcmp() on a NULL pointer is undefined; treat it as "no alias". */
	if (!param)
		return NULL;

	for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) {
		if (strcmp(alias->kernel_param, param) == 0)
			return alias->sysctl_param;
	}

	return NULL;
}
---|
| 1746 | + |
---|
| 1747 | +/* Set sysctl value passed on kernel command line. */ |
---|
| 1748 | +static int process_sysctl_arg(char *param, char *val, |
---|
| 1749 | + const char *unused, void *arg) |
---|
| 1750 | +{ |
---|
| 1751 | + char *path; |
---|
| 1752 | + struct vfsmount **proc_mnt = arg; |
---|
| 1753 | + struct file_system_type *proc_fs_type; |
---|
| 1754 | + struct file *file; |
---|
| 1755 | + int len; |
---|
| 1756 | + int err; |
---|
| 1757 | + loff_t pos = 0; |
---|
| 1758 | + ssize_t wret; |
---|
| 1759 | + |
---|
| 1760 | + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { |
---|
| 1761 | + param += sizeof("sysctl") - 1; |
---|
| 1762 | + |
---|
| 1763 | + if (param[0] != '/' && param[0] != '.') |
---|
| 1764 | + return 0; |
---|
| 1765 | + |
---|
| 1766 | + param++; |
---|
| 1767 | + } else { |
---|
| 1768 | + param = (char *) sysctl_find_alias(param); |
---|
| 1769 | + if (!param) |
---|
| 1770 | + return 0; |
---|
| 1771 | + } |
---|
| 1772 | + |
---|
| 1773 | + if (!val) |
---|
| 1774 | + return -EINVAL; |
---|
| 1775 | + len = strlen(val); |
---|
| 1776 | + if (len == 0) |
---|
| 1777 | + return -EINVAL; |
---|
| 1778 | + |
---|
| 1779 | + /* |
---|
| 1780 | + * To set sysctl options, we use a temporary mount of proc, look up the |
---|
| 1781 | + * respective sys/ file and write to it. To avoid mounting it when no |
---|
| 1782 | + * options were given, we mount it only when the first sysctl option is |
---|
| 1783 | + * found. Why not a persistent mount? There are problems with a |
---|
| 1784 | + * persistent mount of proc in that it forces userspace not to use any |
---|
| 1785 | + * proc mount options. |
---|
| 1786 | + */ |
---|
| 1787 | + if (!*proc_mnt) { |
---|
| 1788 | + proc_fs_type = get_fs_type("proc"); |
---|
| 1789 | + if (!proc_fs_type) { |
---|
| 1790 | + pr_err("Failed to find procfs to set sysctl from command line\n"); |
---|
| 1791 | + return 0; |
---|
| 1792 | + } |
---|
| 1793 | + *proc_mnt = kern_mount(proc_fs_type); |
---|
| 1794 | + put_filesystem(proc_fs_type); |
---|
| 1795 | + if (IS_ERR(*proc_mnt)) { |
---|
| 1796 | + pr_err("Failed to mount procfs to set sysctl from command line\n"); |
---|
| 1797 | + return 0; |
---|
| 1798 | + } |
---|
| 1799 | + } |
---|
| 1800 | + |
---|
| 1801 | + path = kasprintf(GFP_KERNEL, "sys/%s", param); |
---|
| 1802 | + if (!path) |
---|
| 1803 | + panic("%s: Failed to allocate path for %s\n", __func__, param); |
---|
| 1804 | + strreplace(path, '.', '/'); |
---|
| 1805 | + |
---|
| 1806 | + file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); |
---|
| 1807 | + if (IS_ERR(file)) { |
---|
| 1808 | + err = PTR_ERR(file); |
---|
| 1809 | + if (err == -ENOENT) |
---|
| 1810 | + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", |
---|
| 1811 | + param, val); |
---|
| 1812 | + else if (err == -EACCES) |
---|
| 1813 | + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", |
---|
| 1814 | + param, val); |
---|
| 1815 | + else |
---|
| 1816 | + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", |
---|
| 1817 | + file, param, val); |
---|
| 1818 | + goto out; |
---|
| 1819 | + } |
---|
| 1820 | + wret = kernel_write(file, val, len, &pos); |
---|
| 1821 | + if (wret < 0) { |
---|
| 1822 | + err = wret; |
---|
| 1823 | + if (err == -EINVAL) |
---|
| 1824 | + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", |
---|
| 1825 | + param, val); |
---|
| 1826 | + else |
---|
| 1827 | + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", |
---|
| 1828 | + ERR_PTR(err), param, val); |
---|
| 1829 | + } else if (wret != len) { |
---|
| 1830 | + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", |
---|
| 1831 | + wret, len, path, param, val); |
---|
| 1832 | + } |
---|
| 1833 | + |
---|
| 1834 | + err = filp_close(file, NULL); |
---|
| 1835 | + if (err) |
---|
| 1836 | + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", |
---|
| 1837 | + ERR_PTR(err), param, val); |
---|
| 1838 | +out: |
---|
| 1839 | + kfree(path); |
---|
| 1840 | + return 0; |
---|
| 1841 | +} |
---|
| 1842 | + |
---|
| 1843 | +void do_sysctl_args(void) |
---|
| 1844 | +{ |
---|
| 1845 | + char *command_line; |
---|
| 1846 | + struct vfsmount *proc_mnt = NULL; |
---|
| 1847 | + |
---|
| 1848 | + command_line = kstrdup(saved_command_line, GFP_KERNEL); |
---|
| 1849 | + if (!command_line) |
---|
| 1850 | + panic("%s: Failed to allocate copy of command line\n", __func__); |
---|
| 1851 | + |
---|
| 1852 | + parse_args("Setting sysctl args", command_line, |
---|
| 1853 | + NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); |
---|
| 1854 | + |
---|
| 1855 | + if (proc_mnt) |
---|
| 1856 | + kern_unmount(proc_mnt); |
---|
| 1857 | + |
---|
| 1858 | + kfree(command_line); |
---|
| 1859 | +} |
---|