#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/limits.h>
#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/spinlock.h>
#include <linux/fs_struct.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
#include <linux/audit.h>
#include <asm/uaccess.h>

#include "lve_internal.h"
#include "lve_global_params.h"
#include "lsm_int.h"
#include "link_protect.h"


/*
 * Detach @fd from the current task's descriptor table and drop the file
 * reference that slot was holding.
 *
 * The slot is cleared under files->file_lock; we need this to avoid racing
 * with iterate_fd(), which typically comes from selinux when it is matching
 * fd's (for instance when a user schedules "sa1" in crond).
 *
 * Returns 0 on success or -EBADF when @fd is out of range or not installed.
 */
static int lve_close_fd(unsigned fd)
{
	struct files_struct *table_owner = current->files;
	struct fdtable *table;
	struct file *victim = NULL;

	spin_lock(&table_owner->file_lock);
	table = files_fdtable(table_owner);
	if (fd < table->max_fds) {
		victim = table->fd[fd];
		if (victim)
			rcu_assign_pointer(table->fd[fd], NULL);
	}
	spin_unlock(&table_owner->file_lock);

	if (!victim)
		return -EBADF;

	/* Give the descriptor number back first, then release the file */
	put_unused_fd(fd);
	fput(victim);

	return 0;
}

/*
 * Resolve @name relative to the directory @base, following symlinks
 * (LOOKUP_FOLLOW).
 *
 * lve_do_path_lookup() takes a directory descriptor, so a temporary
 * path-only file (FMODE_PATH) referring to @base is installed into the
 * current task's fd table, used as the lookup starting point and removed
 * again with lve_close_fd().
 *
 * @base:   directory the lookup starts from
 * @name:   path to resolve
 * @target: receives the resolved path
 *
 * Returns 0 on success; @target then holds a reference which the caller
 * must release with path_put(). On failure a negative errno is returned and
 * @target holds no reference. -EBADF is returned when the temporary
 * descriptor could not be closed by us.
 */
int lve_symlink_lookup(const struct path *base, const char *name,
			      struct path *target)
{
	int dfd, ret;
	struct file *f;
	static const struct file_operations empty_fops = {};
#ifdef IMPL_LINK_PROT_OLD
	struct nameidata nd;
#endif

	dfd = get_unused_fd_flags(0);
	if (dfd < 0)
		return dfd;

#ifdef IMPL_LINK_PROT_OLD
	f = lve_get_empty_filp();
#else
	f = lve_alloc_empty_file_noaccount(0, current_cred());
#endif
	if (IS_ERR(f)) {
		put_unused_fd(dfd);
		return PTR_ERR(f);
	}

	f->f_path = *base;
#ifdef IMPL_LINK_PROT_OLD
	/* __fput *puts* the ref uncond. in older kernels */
	path_get(&f->f_path);
#endif
	/* For selinux fd checks or alike (file_has_perm for inst.) */
	f->f_inode = base->dentry->d_inode;

	/* Needed in case filp_close() is used, when the dfd is duplicated somehow */
	f->f_op = &empty_fops;
	f->f_mode |= FMODE_PATH;

	/* Protect f->* writes from reordering with fd installation */
	smp_mb();

	fd_install(dfd, f);
#ifdef IMPL_LINK_PROT_OLD
	ret = lve_do_path_lookup(dfd, name, LOOKUP_FOLLOW, &nd);

	target->dentry = nd.path.dentry;
	target->mnt = nd.path.mnt;
#else
	ret = lve_do_path_lookup(dfd, name, LOOKUP_FOLLOW, target);
#endif
	if (lve_close_fd(dfd) != 0) {
		/*
		 * The descriptor vanished underneath us. Callers treat any
		 * error as "no reference held in @target", so drop the one a
		 * successful lookup has just taken instead of leaking it.
		 */
		if (ret == 0)
			path_put(target);
		ret = -EBADF;
	}

	return ret;
}

int may_create_sym_link(const char *name, const struct path *path,
			struct dentry *link)
{
	int ret = 0;
	struct path target_path;
	struct inode *inode;

	ret = check_user_link_permission(true);
	if (ret == 0)
		goto out;

	ret = lve_symlink_lookup(path, name, &target_path);
	if (ret < 0) {
		/* Still allow creating symlinks for non-existing target paths */
		if (ret == -ENOENT)
			ret = 0;
		goto out;
	}

	inode = target_path.dentry->d_inode;

	if (!uid_eq(current_fsuid(), inode->i_uid)) {
		ret = check_link_group_permission(inode, true);
	}

	path_put(&target_path);
out:
	return ret;
}

#ifdef IMPL_LINK_PROT_EXPERIMENTAL
/*
 * Follow-link hook, IMPL_LINK_PROT_EXPERIMENTAL flavour.
 *
 * @link_dentry: dentry of the symlink being followed
 * @link_inode:  inode of that symlink
 * @rcu:         true when the caller is in RCU-walk mode
 *
 * Instead of resolving the link body here, this variant records the link
 * owner via set_current_link_uid() (or clears it via
 * reset_current_link_uid()).
 *
 * Returns 0, -ECHILD when RCU-walk could not be left, -EPERM when a jumped
 * link ends up outside the task's root, or whatever follow_link_init() /
 * set_current_link_uid() report.
 */
int sandbox_inode_follow_link(struct dentry *link_dentry,
                              struct inode *link_inode, bool rcu)
{
	/* Forced to true by LVE_GLOBAL_NONROOT, else decided by the link owner uid */
	bool nonroot = param_is_enabled(LVE_GLOBAL_NONROOT) ? true :
		is_global_nonroot(link_inode->i_uid);

	int ret = 0;

	/* Filled in by follow_link_init() below */
	bool handle_filter = false;
	bool handle_owner = false;
	bool handle_proc = false;

	struct nameidata *nd = current->nameidata;
	const char *link_body;

	DEFINE_DELAYED_CALL(done);

	ret = follow_link_init(&handle_owner, &handle_filter,
				&handle_proc);
	/* -1 from follow_link_init() is reported as 0; other non-zero values as is */
	if (ret != 0)
		return ret == -1 ? 0 : ret;

	if (!nonroot)
		return 0;

	/* Switch into ref-walk mode, to be able to perform non-atomic things */
	if (rcu && !lve_try_to_unlazy(nd))
		return -ECHILD;

	link_body = vfs_get_link(link_dentry, &done);
	/* A failure to read the link body is not reported to the caller */
	if (IS_ERR(link_body))
		return 0;

	/*
	 * Links without a body that made the walk jump (LOOKUP_JUMPED), except
	 * those using lve_tid_fd_dentry_operations: for a task inside an LVE
	 * the jump destination has to stay under the task's root.
	 */
	if (handle_proc && is_in_lve(current) &&
		unlikely(!link_body && nd_get_jumped(nd) &&
		link_dentry->d_op != lve_tid_fd_dentry_operations)) {
		struct path root_path;
		struct path *nd_path = nd_get_path(nd);

		get_fs_root(current->fs, &root_path);
		/*
		 * Skip NSFS (i.e. /proc/pid/ns), as its mnt is *INTERNAL* and isn't
		 * reachable from root_path
		 *
		 * It's safe, because setns() syscall requires CAP_SYS_ADMIN anyway
		 */
		if (nd_path->mnt->mnt_sb->s_magic != NSFS_MAGIC &&
			!path_is_under(nd_path, &root_path))
			ret = -EPERM;

		path_put(&root_path);

		if (ret != 0)
			goto out;
	}

	/*
	 * Nothing to track: no body, no handler enabled, or the filter
	 * considers the body secure. Clear the recorded link uid.
	 */
	if (link_body == NULL || (!handle_owner && !handle_filter) ||
		(handle_filter && is_link_body_secure(link_body))) {
		ret = 0;
		reset_current_link_uid();
		goto out;
	}

	/* The recorded uid already matches this link's owner */
	if (uid_eq(link_inode->i_uid, get_current_link_uid()))
		goto out;

	ret = set_current_link_uid(link_inode->i_uid);
out:
	/* On any error the recorded link uid is cleared */
	if (ret < 0)
		reset_current_link_uid();
	do_delayed_call(&done);

	return ret;
}
#endif

#ifdef IMPL_LINK_PROT_NEW
/*
 * Symlink-creation hook: when LVE_SYMLINK_PROTECTION is enabled, defer the
 * decision to may_create_sym_link(); otherwise always return 0.
 */
int sandbox_path_symlink(const struct path *dir /* symlink parent dir */ ,
			 struct dentry *dentry /* symlink dentry */ ,
			 const char *old_name /* target path */ )
{
	int verdict = 0;

	if (param_is_enabled(LVE_SYMLINK_PROTECTION))
		verdict = may_create_sym_link(old_name, dir, dentry);

	return verdict;
}
#endif
      
#ifdef IMPL_LINK_PROT_EXPERIMENTAL

/*
 * For a given path string, extract the parent directory part and look it up.
 *
 * PS: as path_parentat() cannot be called via symbol address, we need to have
 * our own version.
 *
 * @dfd:  directory descriptor handed to lve_do_path_lookup()
 * @name: path whose last component is cut off
 * @path: receives the parent directory when 1 is returned
 *
 * Return value:
 *   1  - the parent dir is found and stored in @path; the caller has to
 *        path_put() it
 *   0  - @name has no parent part (no slash); CWD or the corresponding fd
 *        should be used as the parent dir information, @path is untouched
 *   <0 - -ENOMEM, -EFAULT, -ENAMETOOLONG, -EINVAL (empty name) or the error
 *        returned by lve_do_path_lookup()
 */
static int lve_lookup_parent(int dfd, const char __user *name, struct path *path)
{
	int ret;
	long len;
	char *pos;
	char *parent_name = kzalloc(PATH_MAX, GFP_KERNEL);

	if (!parent_name)
		return -ENOMEM;

#if RHEL_MAJOR<9
	len = strncpy_from_user(parent_name, name, PATH_MAX);
	if (len < 0) {
		ret = -EFAULT;
		goto out;
	}
	/* No NUL within PATH_MAX bytes: the copy is not terminated */
	if (len >= PATH_MAX) {
		ret = -ENAMETOOLONG;
		goto out;
	}
#else
	/*
	 * strscpy() always terminates the destination and reports truncation,
	 * so the length can never run past the PATH_MAX buffer.
	 */
	len = strscpy(parent_name, name, PATH_MAX);
	if (len < 0) {
		ret = -ENAMETOOLONG;
		goto out;
	}
#endif

	if (len == 0) {
		ret = -EINVAL;
		goto out;
	}

	/* Walk back to the last '/' or, failing that, to the first character */
	pos = parent_name + len - 1;
	while (pos != parent_name && *pos != '/')
		pos--;

	if (*pos != '/') {
		/* No slash found, link is in the newdfd (mostly current) dir */
		ret = 0;
		goto out;
	}

	/*
	 * Either the parent is "/" (keep the slash itself) or "<STRING>/"
	 * (cut the last component together with the slash).
	 */
	if (pos == parent_name)
		pos[1] = '\0';
	else
		*pos = '\0';

	ret = lve_do_path_lookup(dfd, parent_name, LOOKUP_FOLLOW, path);
	if (ret >= 0)
		ret = 1;
out:
	kfree(parent_name);
	return ret;
}

/*
 * Rename hook for symlinks: check whether moving the symlink @oldname to
 * @newname is acceptable given where the link body would point from the new
 * location.
 *
 * @olddfd/@oldname: source of the rename; looked up without following the
 *                   trailing component
 * @newdfd/@newname: destination of the rename; only its parent directory is
 *                   looked up
 * @flags:           currently unused
 *
 * A relative link body is resolved against the parent directory of @newname
 * (or against @newdfd when @newname has no parent part); an absolute body is
 * resolved through @newdfd directly. If the resolved inode is not owned by
 * the current fsuid, check_link_group_permission() decides.
 *
 * Returns 0 when the protection is disabled, the source is not a symlink,
 * the target does not exist (-ENOENT) or @newname is empty; otherwise the
 * value of the failing lookup / permission helper, or -EINVAL when the link
 * body cannot be read.
 */
int lve_handle_symlink_rename(int olddfd, const char __user *oldname, int newdfd,
                        const char __user *newname, unsigned int flags)
{
	int ret, parent_found = 0;

	const char *link_body;
	struct path old_path, new_path, new_parent;
	struct inode *inode;

	DEFINE_DELAYED_CALL(done);

	if (!param_is_enabled(LVE_SYMLINK_PROTECTION))
		return 0;

	ret = check_user_link_permission(true);
	if (ret == 0)
		return 0;

	/* Don't follow the trailing path element */
	ret = lve_do_path_lookup(olddfd, oldname, 0, &old_path);
	if (ret < 0)
		return ret;

	ret = 0;

	if (!d_is_symlink(old_path.dentry))
		goto out_put_old;

	link_body = vfs_get_link(old_path.dentry, &done);
	if (IS_ERR_OR_NULL(link_body)) {
		ret = -EINVAL;
		goto out_put_old;
	}

	/* An absolute path is simpler: no parent dir is needed */
	if (link_body[0] != '/') {
		/* Lookup *newname* parent dir */
		parent_found = lve_lookup_parent(newdfd, newname, &new_parent);
		if (parent_found < 0) {
			/* An empty newname (-EINVAL) is not treated as an error */
			ret = parent_found == -EINVAL ? 0 : parent_found;
			/*
			 * link_body was obtained successfully, so the delayed
			 * call must still run on this path.
			 */
			goto out_done;
		}
	}

	ret = (parent_found == 1) ?
		lve_symlink_lookup(&new_parent, link_body, &new_path) :
		lve_do_path_lookup(newdfd, link_body, LOOKUP_FOLLOW, &new_path);
	if (ret < 0) {
		/* A dangling link target is allowed */
		if (ret == -ENOENT)
			ret = 0;
		goto out_put_parent;
	}

	inode = new_path.dentry->d_inode;
	if (!uid_eq(current_fsuid(), inode->i_uid))
		ret = check_link_group_permission(inode, true);

	path_put(&new_path);
out_put_parent:
	if (parent_found == 1)
		path_put(&new_parent);
out_done:
	/* Release whatever vfs_get_link() pinned for link_body */
	do_delayed_call(&done);
out_put_old:
	path_put(&old_path);
	return ret;
}


/*
 * Symlink-creation check working on raw syscall arguments.
 *
 * @oldname: the link body (target path)
 * @newdfd/@newname: where the symlink is going to be created
 *
 * The body is resolved against the parent directory of @newname when
 * lve_lookup_parent() finds one, otherwise through @newdfd directly. When the
 * resolved inode is not owned by the current fsuid, the verdict comes from
 * check_link_group_permission(). A missing target (-ENOENT) is allowed.
 */
int lve_do_handle_symlink_create(const char __user *oldname, int newdfd,
                  const char __user *newname) 
{
	struct path parent, resolved;
	struct inode *owner_inode;
	bool have_parent;
	int rc;

	if (!param_is_enabled(LVE_SYMLINK_PROTECTION))
		return 0;

	if (check_user_link_permission(true) == 0)
		return 0;

	rc = lve_lookup_parent(newdfd, newname, &parent);
	if (rc < 0)
		return rc;
	have_parent = (rc == 1);

	if (have_parent)
		rc = lve_symlink_lookup(&parent, oldname, &resolved);
	else
		rc = lve_do_path_lookup(newdfd, oldname, LOOKUP_FOLLOW, &resolved);

	if (rc < 0) {
		/* Non-existing targets do not block the creation */
		if (rc == -ENOENT)
			rc = 0;
	} else {
		owner_inode = resolved.dentry->d_inode;
		if (!uid_eq(current_fsuid(), owner_inode->i_uid))
			rc = check_link_group_permission(owner_inode, true);
		path_put(&resolved);
	}

	if (have_parent)
		path_put(&parent);

	return rc;
}
#endif


#if defined(IMPL_LINK_PROT_NEW) || defined(IMPL_LINK_PROT_EXPERIMENTAL)
/*
 * Hardlink protection hook: with LVE_HARDLINK_PROTECTION enabled the verdict
 * comes from may_create_hard_link(@old_dentry); otherwise 0 is returned.
 * @dir and @new_dentry are not looked at.
 */
int sandbox_inode_link(struct dentry *old_dentry, struct inode *dir,
				struct dentry *new_dentry)
{
	int verdict = 0;

	if (param_is_enabled(LVE_HARDLINK_PROTECTION))
		verdict = may_create_hard_link(old_dentry);

	return verdict;
}
#endif


#ifdef IMPL_LINK_PROT_NEW
/*
 * Replicate the nameidata structure.
 *
 * The accessors below cast a "struct nameidata *" to this type in order to
 * reach its fields, so the field order and the RHEL_RELEASE_CODE conditionals
 * here decide which offsets those accessors read and write.
 *
 * TODO: get the fields from kernel debuginfo instead
 */
#define EMBEDDED_LEVELS 2
struct __nameidata {
	struct path	path;	/* returned by get_nd_path() */
	struct qstr	last;
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;	/* read by get_nd_flags(), tested/set for LOOKUP_JUMPED */
#if RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)
	unsigned int	state;
#endif
	unsigned	seq, m_seq;
#if RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 7)
	unsigned	r_seq;
#endif
	int		last_type;
	unsigned	depth;	/* used by get_link_mnt() to index ->stack */
	int		total_link_count; /* get_link_count() / set_link_count() */
	struct saved {
		struct path link;	/* ->mnt is what get_link_mnt() returns */
		struct delayed_call done;
		const char *name;
		unsigned seq;
	} *stack, internal[EMBEDDED_LEVELS];
/*
 * We don't care about fields below so they may not be compatible
 * with the new kernels (>= rhel9).
 * The last field we use is the "->stack".
 */
	struct filename	*name;
	struct nameidata *saved;
	struct inode	*link_inode;
	unsigned	root_seq;
	int		dfd;
};

/* Return the vfsmount stored in the stack entry at position depth - 1 */
static inline struct vfsmount *get_link_mnt(struct nameidata *__nd)
{
	struct __nameidata *shadow = (struct __nameidata *)__nd;

	return (shadow->stack + shadow->depth - 1)->link.mnt;
}

/* Read total_link_count through the replicated layout */
static inline int get_link_count(struct nameidata *__nd)
{
	return ((struct __nameidata *)__nd)->total_link_count;
}

/* Overwrite total_link_count through the replicated layout */
static inline void set_link_count(struct nameidata *__nd, int new)
{
	((struct __nameidata *)__nd)->total_link_count = new;
}

/* Read the lookup flags through the replicated layout */
static inline int get_nd_flags(struct nameidata *__nd)
{
	struct __nameidata *shadow = (struct __nameidata *)__nd;

	return shadow->flags;
}

/* Tell whether LOOKUP_JUMPED is set in the lookup flags */
static inline bool nd_get_jumped(struct nameidata *nd)
{
	struct __nameidata *shadow = (struct __nameidata *)nd;

	return !!(shadow->flags & LOOKUP_JUMPED);
}

/* Set LOOKUP_JUMPED in the lookup flags */
static inline void nd_set_jumped(struct nameidata *nd)
{
	struct __nameidata *shadow = (struct __nameidata *)nd;

	shadow->flags |= LOOKUP_JUMPED;
}


/* Return a pointer to the ->path member of the replicated layout */
static inline struct path* get_nd_path(struct nameidata *__nd)
{
	struct __nameidata *shadow = (struct __nameidata *)__nd;

	return &shadow->path;
}

/*
 * Follow-link hook, IMPL_LINK_PROT_NEW flavour.
 *
 * @link_dentry: dentry of the symlink being followed
 * @inode:       inode of that symlink
 * @rcu:         true when the caller is in RCU-walk mode
 *
 * The link body is resolved here with a nested lookup starting from the
 * link's parent directory, and the resolved inode is handed to
 * handle_symlink_filter() / handle_symlink_owner() together with @inode.
 *
 * Returns 0, -ECHILD when RCU-walk could not be left, -EPERM when a jumped
 * link ends up outside the task's root, -ELOOP when the link count is above
 * MAX_LINK_COUNT, -EACCES when handle_symlink_owner() returns non-zero, or
 * the value reported by follow_link_init() / lve_symlink_lookup() /
 * handle_symlink_filter().
 */
int sandbox_inode_follow_link(struct dentry *link_dentry,
			      struct inode *inode, bool rcu)
{
	int ret = 0;
	/* Filled in by follow_link_init() below */
	bool handle_filter = false;
	bool handle_owner = false;
	bool handle_proc = false;
	const char *link_body;
	struct path base, target;
	struct nameidata *nd = current->nameidata;
	int saved_link_count;

	DEFINE_DELAYED_CALL(done);

	ret = follow_link_init(&handle_owner, &handle_filter,
				&handle_proc);
	/* -1 from follow_link_init() is reported as 0; other non-zero values as is */
	if (ret != 0)
		return ret == -1 ? 0 : ret;

	/* Switch into ref-walk mode, to be able to perform "nested" lookup */
	if (rcu) {
		BUG_ON(nd == NULL);
		if (!lve_try_to_unlazy(nd))
			return -ECHILD;
	}

	link_body = vfs_get_link(link_dentry, &done);
	/* A failure to read the link body is not reported to the caller */
	if (IS_ERR(link_body))
		return 0;

	/*
	 * Links without a body that made the walk jump (LOOKUP_JUMPED): for a
	 * task inside an LVE the jump destination has to stay under the task's
	 * root.
	 */
	if (handle_proc && is_in_lve(current) &&
		unlikely(!link_body && nd_get_jumped(nd) &&
		/* we don't want to restrict fd symlinks since they may point to /dev and it breaks /dev/std*
		 * we do want to restrict them when pointing to dirs since that may allow bypassing cagefs */
		(link_dentry->d_op != lve_tid_fd_dentry_operations || d_is_dir(get_nd_path(nd)->dentry)))) {
		struct path root_path;
		struct path *nd_path = get_nd_path(nd);

		get_fs_root(current->fs, &root_path);
		/*
		 * Skip NSFS (i.e. /proc/pid/ns), as its mnt is *INTERNAL* and isn't
		 * reachable from root_path
		 *
		 * It's safe, because setns() syscall requires CAP_SYS_ADMIN anyway
		 */
		if (nd_path->mnt->mnt_sb->s_magic != NSFS_MAGIC &&
			!path_is_under(nd_path, &root_path))
			ret = -EPERM;

		path_put(&root_path);

		if (ret != 0)
			goto out;
	}

	/* No body to resolve, or neither handler is enabled: nothing to check */
        if (link_body == NULL || (!handle_owner && !handle_filter)) {
		ret = 0;
                goto out;
	}

	/* d_parent should be *get* in unlazy_walk() / try_to_unlazy() */
	base.dentry = link_dentry->d_parent;
	base.mnt = get_link_mnt(nd);

	saved_link_count = get_link_count(nd);
	if (saved_link_count > MAX_LINK_COUNT) {
		ret = -ELOOP;
		goto out;
	}

	ret = lve_symlink_lookup(&base, link_body, &target);
	/* Put back the link count as it was before the nested lookup */
	set_link_count(nd, saved_link_count);
	if (ret != 0)
		goto out;

	/* if the current matches the filter */
	if (handle_filter && !is_link_body_secure(link_body))
		ret = handle_symlink_filter(inode,
					target.dentry->d_inode);

	/* A non-zero owner check overrides the filter verdict with -EACCES */
	if (handle_owner &&
		handle_symlink_owner(inode, target.dentry->d_inode))
		ret = -EACCES;

	path_put(&target);
out:
	do_delayed_call(&done);

	return ret;
}
#endif

#if defined(IMPL_LINK_PROT_NEW) || defined(IMPL_LINK_PROT_EXPERIMENTAL)
int sandbox_inode_readlink(struct dentry *link_dentry, struct vfsmount *link_mnt)
{
	int ret = 0;
	bool handle_filter = false;
	bool handle_owner = false;
	const char *link_body;
	struct path base, target;
	struct inode *inode = link_dentry->d_inode;

	DEFINE_DELAYED_CALL(done);

	ret = follow_link_init(&handle_owner, &handle_filter, NULL);
	if (ret != 0)
		return ret == -1 ? 0 : ret;

	if (!handle_filter)
		return 0;

	link_body = vfs_get_link(link_dentry, &done);
	if (IS_ERR_OR_NULL(link_body))
		return 0;

	/* d_parent should be *get* in unlazy_walk() / try_to_unlazy */
	base.dentry = link_dentry->d_parent;
	base.mnt = link_mnt;

	ret = lve_symlink_lookup(&base, link_body, &target);
	if (ret != 0)
		goto out;

	/* if the current matches the filter */
	if (handle_filter && !is_link_body_secure(link_body))
		ret = handle_symlink_filter(inode,
					target.dentry->d_inode);
	path_put(&target);
out:
	do_delayed_call(&done);

	return ret;
}
#endif

/*
 * readlink() pre-check: look up @pathname relative to @dfd without following
 * the last component and, if it is a symlink that does not live on procfs,
 * ask sandbox_inode_readlink() for a verdict.
 *
 * A non-positive @bufsiz short-circuits to 0; @buf itself is not touched.
 * The lookup is repeated with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * Returns 0 or the error from user_path_at() / sandbox_inode_readlink().
 */
int lve_sys_readlink(int dfd, const char __user *pathname, char __user *buf, int bufsiz)
{
	unsigned lookup_flags = 0;
	struct path link;
	int rc;

	if (bufsiz <= 0)
		return 0;

	for (;;) {
		struct inode *inode;

		rc = user_path_at(dfd, pathname, lookup_flags, &link);
		if (rc)
			break;

		inode = d_backing_inode(link.dentry);
		if (d_is_symlink(link.dentry) &&
		    inode->i_sb->s_magic != PROC_SUPER_MAGIC)
			rc = sandbox_inode_readlink(link.dentry, link.mnt);

		path_put(&link);

		if (!retry_estale(rc, lookup_flags))
			break;
		lookup_flags |= LOOKUP_REVAL;
	}

	return rc;
}