#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "lve_internal.h" #include "lve_global_params.h" #include "lsm_int.h" #include "link_protect.h" /* * We need this to prevent from racing with iterate_fd() * which typically comes from selinux when matching fd's * when user schedules "sa1" in crond for instance */ static int lve_close_fd(unsigned fd) { struct file *file; struct fdtable *fdt; struct files_struct *files = current->files; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (fd >= fdt->max_fds) goto out_unlock; file = fdt->fd[fd]; if (!file) goto out_unlock; rcu_assign_pointer(fdt->fd[fd], NULL); spin_unlock(&files->file_lock); put_unused_fd(fd); fput(file); return 0; out_unlock: spin_unlock(&files->file_lock); return -EBADF; } int lve_symlink_lookup(const struct path *base, const char *name, struct path *target) { int dfd, ret; struct file *f; static const struct file_operations empty_fops = {}; #ifdef IMPL_LINK_PROT_OLD struct nameidata nd; int saved_link_count; #endif dfd = get_unused_fd_flags(0); if (dfd < 0) { ret = dfd; goto out; } #ifdef IMPL_LINK_PROT_OLD f = lve_get_empty_filp(); #else f = lve_alloc_empty_file_noaccount(0, current_cred()); #endif if (IS_ERR(f)) { ret = PTR_ERR(f); put_unused_fd(dfd); goto out; } f->f_path = *base; #ifdef IMPL_LINK_PROT_OLD /* __fput *puts* the ref uncond. in older kernels */ path_get(&f->f_path); #endif /* For selinux fd checks or alike (file_has_perm for inst.) */ f->f_inode = base->dentry->d_inode; /* Needed for filp_close() is used, when the dfd is duplicated somehow */ f->f_op = &empty_fops; f->f_mode |= FMODE_PATH; /* Protect f->* writes from reordering with fd installation */ smp_mb(); fd_install(dfd, f); #ifdef IMPL_LINK_PROT_OLD ret = lve_do_path_lookup(dfd, name, LOOKUP_FOLLOW, &nd); target->dentry = nd.path.dentry; target->mnt = nd.path.mnt; #else ret = lve_do_path_lookup(dfd, name, LOOKUP_FOLLOW, target); #endif if (lve_close_fd(dfd) != 0) ret = -EBADF; out: return ret; } int may_create_sym_link(const char *name, const struct path *path, struct dentry *link) { int ret = 0; struct path target_path; struct inode *inode; ret = check_user_link_permission(true); if (ret == 0) goto out; ret = lve_symlink_lookup(path, name, &target_path); if (ret < 0) { /* Still allow creating symlinks for non-existing target paths */ if (ret == -ENOENT) ret = 0; goto out; } inode = target_path.dentry->d_inode; if (!uid_eq(current_fsuid(), inode->i_uid)) { ret = check_link_group_permission(inode, true); } path_put(&target_path); out: return ret; } #ifdef IMPL_LINK_PROT_EXPERIMENTAL int sandbox_inode_follow_link(struct dentry *link_dentry, struct inode *link_inode, bool rcu) { bool nonroot = param_is_enabled(LVE_GLOBAL_NONROOT) ? true : is_global_nonroot(link_inode->i_uid); int ret = 0; bool handle_filter = false; bool handle_owner = false; bool handle_proc = false; struct nameidata *nd = current->nameidata; const char *link_body; DEFINE_DELAYED_CALL(done); ret = follow_link_init(&handle_owner, &handle_filter, &handle_proc); if (ret != 0) return ret == -1 ? 0 : ret; if (!nonroot) return 0; /* Switch into ref-walk mode, to be able to perform non-atomic things */ if (rcu && !lve_try_to_unlazy(nd)) return -ECHILD; link_body = vfs_get_link(link_dentry, &done); if (IS_ERR(link_body)) return 0; if (handle_proc && is_in_lve(current) && unlikely(!link_body && nd_get_jumped(nd) && link_dentry->d_op != lve_tid_fd_dentry_operations)) { struct path root_path; struct path *nd_path = nd_get_path(nd); get_fs_root(current->fs, &root_path); /* * Skip NSFS (i.e. /proc/pid/ns), as it's mnt is *INTERNAL* and isn't * reachable from root_path * * It's safe, because setns() syscall requires CAP_SYS_ADMIN anyway */ if (nd_path->mnt->mnt_sb->s_magic != NSFS_MAGIC && !path_is_under(nd_path, &root_path)) ret = -EPERM; path_put(&root_path); if (ret != 0) goto out; } if (link_body == NULL || (!handle_owner && !handle_filter) || (handle_filter && is_link_body_secure(link_body))) { ret = 0; reset_current_link_uid(); goto out; } if (uid_eq(link_inode->i_uid, get_current_link_uid())) goto out; ret = set_current_link_uid(link_inode->i_uid); out: if (ret < 0) reset_current_link_uid(); do_delayed_call(&done); return ret; } #endif #ifdef IMPL_LINK_PROT_NEW int sandbox_path_symlink(const struct path *dir /* symlink parent dir */ , struct dentry *dentry /* symlink dentry */ , const char *old_name /* target path */ ) { if (!param_is_enabled(LVE_SYMLINK_PROTECTION)) return 0; return may_create_sym_link(old_name, dir, dentry); } #endif #ifdef IMPL_LINK_PROT_EXPERIMENTAL /* * For a given path string extract the parent path string and lookup * * PS: as path_parentat() cannot be called via symbol address, we need to have * our own version. * * Return value on success: 0 - parent dir is found, 1 - no parent, CWD or * corresponding fd should be used as a parent dir information. */ static int lve_lookup_parent(int dfd, const char __user *name, struct path *path) { int ret; long len; char *pos; char *parent_name = kzalloc(PATH_MAX, GFP_KERNEL); if (!parent_name) return -ENOMEM; #if RHEL_MAJOR<9 if ((len = strncpy_from_user(parent_name, name, PATH_MAX)) < 0) { kfree(parent_name); return -EFAULT; } #else strncpy(parent_name, name, PATH_MAX); len = strlen(parent_name); #endif if (len == 0) { kfree(parent_name); return -EINVAL; } pos = parent_name + len - 1; while (pos != parent_name && *pos != '/') pos--; /* * Either parent is "/", empty string or "/" * cut the last component */ if (pos == parent_name && *pos == '/') *(pos + 1) = '\0'; else if (pos == parent_name && *pos != '/') /* No slash found, link is in the newdfd (mostly current) dir */ pos = NULL; else *pos = '\0'; if (!pos) { kfree(parent_name); return 0; } ret = lve_do_path_lookup(dfd, parent_name, LOOKUP_FOLLOW, path); if (ret < 0) { kfree(parent_name); goto out; } kfree(parent_name); return 1; out: return ret; } int lve_handle_symlink_rename(int olddfd, const char __user *oldname, int newdfd, const char __user *newname, unsigned int flags) { int ret, parent_found; const char *link_body; struct path old_path, new_path, new_parent; struct inode *inode; DEFINE_DELAYED_CALL(done); if (!param_is_enabled(LVE_SYMLINK_PROTECTION)) return 0; ret = check_user_link_permission(true); if (ret == 0) return 0; /* Don't follow the trailing path element */ ret = lve_do_path_lookup(olddfd, oldname, 0, &old_path); if (ret < 0) return ret; ret = 0; if (!d_is_symlink(old_path.dentry)) { path_put(&old_path); return 0; } link_body = vfs_get_link(old_path.dentry, &done); if (IS_ERR_OR_NULL(link_body)) { ret = -EINVAL; path_put(&old_path); return ret; } /* Absoulute path is simplier */ parent_found = 0; if (link_body[0] == '/') goto lookup_path; /* Lookup *newname* parent dir */ parent_found = lve_lookup_parent(newdfd, newname, &new_parent); if (parent_found < 0) { parent_found = parent_found == -EINVAL ? 0 : parent_found; path_put(&old_path); return parent_found; } lookup_path: ret = (parent_found == 1) ? lve_symlink_lookup(&new_parent, link_body, &new_path) : lve_do_path_lookup(newdfd, link_body, LOOKUP_FOLLOW, &new_path); if (ret < 0) { if (ret == -ENOENT) ret = 0; path_put(&old_path); if (parent_found == 1) path_put(&new_parent); goto out; } inode = new_path.dentry->d_inode; if (!uid_eq(current_fsuid(), inode->i_uid)) { ret = check_link_group_permission(inode, true); } path_put(&old_path); path_put(&new_path); if (parent_found == 1) path_put(&new_parent); out: do_delayed_call(&done); return ret; } int lve_do_handle_symlink_create(const char __user *oldname, int newdfd, const char __user *newname) { int ret, parent_found; struct path target_path; struct path base_path; struct inode *inode; if (!param_is_enabled(LVE_SYMLINK_PROTECTION)) return 0; ret = check_user_link_permission(true); if (ret == 0) return 0; parent_found = lve_lookup_parent(newdfd, newname, &base_path); if (parent_found < 0) return parent_found; ret = (parent_found == 1) ? lve_symlink_lookup(&base_path, oldname, &target_path) : lve_do_path_lookup(newdfd, oldname, LOOKUP_FOLLOW, &target_path); if (ret < 0) { if (parent_found == 1) path_put(&base_path); if (ret == -ENOENT) ret = 0; goto out; } inode = target_path.dentry->d_inode; if (!uid_eq(current_fsuid(), inode->i_uid)) { ret = check_link_group_permission(inode, true); } if (parent_found == 1) path_put(&base_path); path_put(&target_path); out: return ret; } #endif #if defined(IMPL_LINK_PROT_NEW) || defined(IMPL_LINK_PROT_EXPERIMENTAL) /* Hardlink protection */ int sandbox_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { if (!param_is_enabled(LVE_HARDLINK_PROTECTION)) return 0; return may_create_hard_link(old_dentry); } #endif #ifdef IMPL_LINK_PROT_NEW /* * Replicate the nameidata structure * TODO: get the fields from kernel debuginfo instead * */ #define EMBEDDED_LEVELS 2 struct __nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; #if RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0) unsigned int state; #endif unsigned seq, m_seq; #if RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7) unsigned r_seq; #endif int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; /* * We don't care about fields below so they may not be compatible * with the new kernels (>= rhel9). * The last field we use is the "->stack". */ struct filename *name; struct nameidata *saved; struct inode *link_inode; unsigned root_seq; int dfd; }; static inline struct vfsmount *get_link_mnt(struct nameidata *__nd) { struct __nameidata *nd = (void *)__nd; struct saved *last = nd->stack + nd->depth - 1; return last->link.mnt; } static inline int get_link_count(struct nameidata *__nd) { struct __nameidata *nd = (void *)__nd; return nd->total_link_count; } static inline void set_link_count(struct nameidata *__nd, int new) { struct __nameidata *nd = (void *)__nd; nd->total_link_count = new; } static inline int get_nd_flags(struct nameidata *__nd) { return ((struct __nameidata *)__nd)->flags; } static inline bool nd_get_jumped(struct nameidata *nd) { return (((struct __nameidata *)nd)->flags & LOOKUP_JUMPED) != 0; } static inline void nd_set_jumped(struct nameidata *nd) { ((struct __nameidata *)nd)->flags |= LOOKUP_JUMPED; } static inline struct path* get_nd_path(struct nameidata *__nd) { return &((struct __nameidata *)__nd)->path; } int sandbox_inode_follow_link(struct dentry *link_dentry, struct inode *inode, bool rcu) { int ret = 0; bool handle_filter = false; bool handle_owner = false; bool handle_proc = false; const char *link_body; struct path base, target; struct nameidata *nd = current->nameidata; int saved_link_count; DEFINE_DELAYED_CALL(done); ret = follow_link_init(&handle_owner, &handle_filter, &handle_proc); if (ret != 0) return ret == -1 ? 0 : ret; /* Switch into ref-walk mode, to be able to perform "nested" lookup */ if (rcu) { BUG_ON(nd == NULL); if (!lve_try_to_unlazy(nd)) return -ECHILD; } link_body = vfs_get_link(link_dentry, &done); if (IS_ERR(link_body)) return 0; if (handle_proc && is_in_lve(current) && unlikely(!link_body && nd_get_jumped(nd) && /* we don't want to restrict fd symlinks since they may point to /dev and it breaks /dev/std* * we want restrict them pointing to dirs since it may cause cagefs bypassing */ (link_dentry->d_op != lve_tid_fd_dentry_operations || d_is_dir(get_nd_path(nd)->dentry)))) { struct path root_path; struct path *nd_path = get_nd_path(nd); get_fs_root(current->fs, &root_path); /* * Skip NSFS (i.e. /proc/pid/ns), as it's mnt is *INTERNAL* and isn't * reachable from root_path * * It's safe, because setns() syscall requires CAP_SYS_ADMIN anyway */ if (nd_path->mnt->mnt_sb->s_magic != NSFS_MAGIC && !path_is_under(nd_path, &root_path)) ret = -EPERM; path_put(&root_path); if (ret != 0) goto out; } if (link_body == NULL || (!handle_owner && !handle_filter)) { ret = 0; goto out; } /* d_parent should be *get* in unlazy_walk() / try_to_unlazy() */ base.dentry = link_dentry->d_parent; base.mnt = get_link_mnt(nd); saved_link_count = get_link_count(nd); if (saved_link_count > MAX_LINK_COUNT) { ret = -ELOOP; goto out; } ret = lve_symlink_lookup(&base, link_body, &target); set_link_count(nd, saved_link_count); if (ret != 0) goto out; /* if the current matches the filter */ if (handle_filter && !is_link_body_secure(link_body)) ret = handle_symlink_filter(inode, target.dentry->d_inode); if (handle_owner && handle_symlink_owner(inode, target.dentry->d_inode)) ret = -EACCES; path_put(&target); out: do_delayed_call(&done); return ret; } #endif #if defined(IMPL_LINK_PROT_NEW) || defined(IMPL_LINK_PROT_EXPERIMENTAL) int sandbox_inode_readlink(struct dentry *link_dentry, struct vfsmount *link_mnt) { int ret = 0; bool handle_filter = false; bool handle_owner = false; const char *link_body; struct path base, target; struct inode *inode = link_dentry->d_inode; DEFINE_DELAYED_CALL(done); ret = follow_link_init(&handle_owner, &handle_filter, NULL); if (ret != 0) return ret == -1 ? 0 : ret; if (!handle_filter) return 0; link_body = vfs_get_link(link_dentry, &done); if (IS_ERR_OR_NULL(link_body)) return 0; /* d_parent should be *get* in unlazy_walk() / try_to_unlazy */ base.dentry = link_dentry->d_parent; base.mnt = link_mnt; ret = lve_symlink_lookup(&base, link_body, &target); if (ret != 0) goto out; /* if the current matches the filter */ if (handle_filter && !is_link_body_secure(link_body)) ret = handle_symlink_filter(inode, target.dentry->d_inode); path_put(&target); out: do_delayed_call(&done); return ret; } #endif int lve_sys_readlink(int dfd, const char __user *pathname, char __user *buf, int bufsiz) { struct path path; int error; unsigned lookup_flags = 0; if (bufsiz <= 0) return 0; retry: error = user_path_at(dfd, pathname, lookup_flags, &path); if (!error) { struct inode *inode = d_backing_inode(path.dentry); if (d_is_symlink(path.dentry) && inode->i_sb->s_magic != PROC_SUPER_MAGIC) { error = sandbox_inode_readlink(path.dentry, path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; }