cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

acct.c (16193B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  linux/kernel/acct.c
      4 *
      5 *  BSD Process Accounting for Linux
      6 *
      7 *  Author: Marco van Wieringen <mvw@planets.elm.net>
      8 *
      9 *  Some code based on ideas and code from:
     10 *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
     11 *
     12 *  This file implements BSD-style process accounting. Whenever any
     13 *  process exits, an accounting record of type "struct acct" is
     14 *  written to the file specified with the acct() system call. It is
     15 *  up to user-level programs to do useful things with the accounting
     16 *  log. The kernel just provides the raw accounting information.
     17 *
     18 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
     19 *
     20 *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
     21 *  the file happened to be read-only. 2) If the accounting was suspended
     22 *  due to the lack of space it happily allowed to reopen it and completely
     23 *  lost the old acct_file. 3/10/98, Al Viro.
     24 *
     25 *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
     26 *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
     27 *
     28 *  Fixed a nasty interaction with sys_umount(). If the accounting
     29 *  was suspended we failed to stop it on umount(). Messy.
     30 *  Another one: remount to readonly didn't stop accounting.
     31 *	Question: what should we do if we have CAP_SYS_ADMIN but not
     32 *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
     33 *  unless we are messing with the root. In that case we are getting a
     34 *  real mess with do_remount_sb(). 9/11/98, AV.
     35 *
     36 *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
     37 *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
     38 *  one race (and leak) in BSD implementation.
     39 *  OK, that's better. ANOTHER race and leak in BSD variant. There always
     40 *  is one more bug... 10/11/98, AV.
     41 *
     42 *	Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
     43 * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks
     44 * a struct file opened for write. Fixed. 2/6/2000, AV.
     45 */
     46
     47#include <linux/mm.h>
     48#include <linux/slab.h>
     49#include <linux/acct.h>
     50#include <linux/capability.h>
     51#include <linux/file.h>
     52#include <linux/tty.h>
     53#include <linux/security.h>
     54#include <linux/vfs.h>
     55#include <linux/jiffies.h>
     56#include <linux/times.h>
     57#include <linux/syscalls.h>
     58#include <linux/mount.h>
     59#include <linux/uaccess.h>
     60#include <linux/sched/cputime.h>
     61
     62#include <asm/div64.h>
     63#include <linux/pid_namespace.h>
     64#include <linux/fs_pin.h>
     65
     66/*
     67 * These constants control the amount of freespace that suspend and
     68 * resume the process accounting system, and the time delay between
     69 * each check.
     70 * Turned into sysctl-controllable parameters. AV, 12/11/98
     71 */
     72
     73static int acct_parm[3] = {4, 2, 30};
     74#define RESUME		(acct_parm[0])	/* >foo% free space - resume */
     75#define SUSPEND		(acct_parm[1])	/* <foo% free space - suspend */
     76#define ACCT_TIMEOUT	(acct_parm[2])	/* foo second timeout between checks */
     77
     78#ifdef CONFIG_SYSCTL
     79static struct ctl_table kern_acct_table[] = {
     80	{
     81		.procname       = "acct",
     82		.data           = &acct_parm,
     83		.maxlen         = 3*sizeof(int),
     84		.mode           = 0644,
     85		.proc_handler   = proc_dointvec,
     86	},
     87	{ }
     88};
     89
     90static __init int kernel_acct_sysctls_init(void)
     91{
     92	register_sysctl_init("kernel", kern_acct_table);
     93	return 0;
     94}
     95late_initcall(kernel_acct_sysctls_init);
     96#endif /* CONFIG_SYSCTL */
     97
     98/*
     99 * External references and all of the globals.
    100 */
    101
    102struct bsd_acct_struct {
    103	struct fs_pin		pin;
    104	atomic_long_t		count;
    105	struct rcu_head		rcu;
    106	struct mutex		lock;
    107	int			active;
    108	unsigned long		needcheck;
    109	struct file		*file;
    110	struct pid_namespace	*ns;
    111	struct work_struct	work;
    112	struct completion	done;
    113};
    114
    115static void do_acct_process(struct bsd_acct_struct *acct);
    116
    117/*
    118 * Check the amount of free space and suspend/resume accordingly.
    119 */
    120static int check_free_space(struct bsd_acct_struct *acct)
    121{
    122	struct kstatfs sbuf;
    123
    124	if (time_is_after_jiffies(acct->needcheck))
    125		goto out;
    126
    127	/* May block */
    128	if (vfs_statfs(&acct->file->f_path, &sbuf))
    129		goto out;
    130
    131	if (acct->active) {
    132		u64 suspend = sbuf.f_blocks * SUSPEND;
    133		do_div(suspend, 100);
    134		if (sbuf.f_bavail <= suspend) {
    135			acct->active = 0;
    136			pr_info("Process accounting paused\n");
    137		}
    138	} else {
    139		u64 resume = sbuf.f_blocks * RESUME;
    140		do_div(resume, 100);
    141		if (sbuf.f_bavail >= resume) {
    142			acct->active = 1;
    143			pr_info("Process accounting resumed\n");
    144		}
    145	}
    146
    147	acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
    148out:
    149	return acct->active;
    150}
    151
    152static void acct_put(struct bsd_acct_struct *p)
    153{
    154	if (atomic_long_dec_and_test(&p->count))
    155		kfree_rcu(p, rcu);
    156}
    157
    158static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
    159{
    160	return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
    161}
    162
    163static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
    164{
    165	struct bsd_acct_struct *res;
    166again:
    167	smp_rmb();
    168	rcu_read_lock();
    169	res = to_acct(READ_ONCE(ns->bacct));
    170	if (!res) {
    171		rcu_read_unlock();
    172		return NULL;
    173	}
    174	if (!atomic_long_inc_not_zero(&res->count)) {
    175		rcu_read_unlock();
    176		cpu_relax();
    177		goto again;
    178	}
    179	rcu_read_unlock();
    180	mutex_lock(&res->lock);
    181	if (res != to_acct(READ_ONCE(ns->bacct))) {
    182		mutex_unlock(&res->lock);
    183		acct_put(res);
    184		goto again;
    185	}
    186	return res;
    187}
    188
    189static void acct_pin_kill(struct fs_pin *pin)
    190{
    191	struct bsd_acct_struct *acct = to_acct(pin);
    192	mutex_lock(&acct->lock);
    193	do_acct_process(acct);
    194	schedule_work(&acct->work);
    195	wait_for_completion(&acct->done);
    196	cmpxchg(&acct->ns->bacct, pin, NULL);
    197	mutex_unlock(&acct->lock);
    198	pin_remove(pin);
    199	acct_put(acct);
    200}
    201
    202static void close_work(struct work_struct *work)
    203{
    204	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
    205	struct file *file = acct->file;
    206	if (file->f_op->flush)
    207		file->f_op->flush(file, NULL);
    208	__fput_sync(file);
    209	complete(&acct->done);
    210}
    211
    212static int acct_on(struct filename *pathname)
    213{
    214	struct file *file;
    215	struct vfsmount *mnt, *internal;
    216	struct pid_namespace *ns = task_active_pid_ns(current);
    217	struct bsd_acct_struct *acct;
    218	struct fs_pin *old;
    219	int err;
    220
    221	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
    222	if (!acct)
    223		return -ENOMEM;
    224
    225	/* Difference from BSD - they don't do O_APPEND */
    226	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
    227	if (IS_ERR(file)) {
    228		kfree(acct);
    229		return PTR_ERR(file);
    230	}
    231
    232	if (!S_ISREG(file_inode(file)->i_mode)) {
    233		kfree(acct);
    234		filp_close(file, NULL);
    235		return -EACCES;
    236	}
    237
    238	if (!(file->f_mode & FMODE_CAN_WRITE)) {
    239		kfree(acct);
    240		filp_close(file, NULL);
    241		return -EIO;
    242	}
    243	internal = mnt_clone_internal(&file->f_path);
    244	if (IS_ERR(internal)) {
    245		kfree(acct);
    246		filp_close(file, NULL);
    247		return PTR_ERR(internal);
    248	}
    249	err = __mnt_want_write(internal);
    250	if (err) {
    251		mntput(internal);
    252		kfree(acct);
    253		filp_close(file, NULL);
    254		return err;
    255	}
    256	mnt = file->f_path.mnt;
    257	file->f_path.mnt = internal;
    258
    259	atomic_long_set(&acct->count, 1);
    260	init_fs_pin(&acct->pin, acct_pin_kill);
    261	acct->file = file;
    262	acct->needcheck = jiffies;
    263	acct->ns = ns;
    264	mutex_init(&acct->lock);
    265	INIT_WORK(&acct->work, close_work);
    266	init_completion(&acct->done);
    267	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
    268	pin_insert(&acct->pin, mnt);
    269
    270	rcu_read_lock();
    271	old = xchg(&ns->bacct, &acct->pin);
    272	mutex_unlock(&acct->lock);
    273	pin_kill(old);
    274	__mnt_drop_write(mnt);
    275	mntput(mnt);
    276	return 0;
    277}
    278
    279static DEFINE_MUTEX(acct_on_mutex);
    280
    281/**
    282 * sys_acct - enable/disable process accounting
    283 * @name: file name for accounting records or NULL to shutdown accounting
    284 *
    285 * sys_acct() is the only system call needed to implement process
    286 * accounting. It takes the name of the file where accounting records
    287 * should be written. If the filename is NULL, accounting will be
    288 * shutdown.
    289 *
    290 * Returns: 0 for success or negative errno values for failure.
    291 */
    292SYSCALL_DEFINE1(acct, const char __user *, name)
    293{
    294	int error = 0;
    295
    296	if (!capable(CAP_SYS_PACCT))
    297		return -EPERM;
    298
    299	if (name) {
    300		struct filename *tmp = getname(name);
    301
    302		if (IS_ERR(tmp))
    303			return PTR_ERR(tmp);
    304		mutex_lock(&acct_on_mutex);
    305		error = acct_on(tmp);
    306		mutex_unlock(&acct_on_mutex);
    307		putname(tmp);
    308	} else {
    309		rcu_read_lock();
    310		pin_kill(task_active_pid_ns(current)->bacct);
    311	}
    312
    313	return error;
    314}
    315
    316void acct_exit_ns(struct pid_namespace *ns)
    317{
    318	rcu_read_lock();
    319	pin_kill(ns->bacct);
    320}
    321
    322/*
    323 *  encode an unsigned long into a comp_t
    324 *
    325 *  This routine has been adopted from the encode_comp_t() function in
    326 *  the kern_acct.c file of the FreeBSD operating system. The encoding
    327 *  is a 13-bit fraction with a 3-bit (base 8) exponent.
    328 */
    329
    330#define	MANTSIZE	13			/* 13 bit mantissa. */
    331#define	EXPSIZE		3			/* Base 8 (3 bit) exponent. */
    332#define	MAXFRACT	((1 << MANTSIZE) - 1)	/* Maximum fractional value. */
    333
    334static comp_t encode_comp_t(unsigned long value)
    335{
    336	int exp, rnd;
    337
    338	exp = rnd = 0;
    339	while (value > MAXFRACT) {
    340		rnd = value & (1 << (EXPSIZE - 1));	/* Round up? */
    341		value >>= EXPSIZE;	/* Base 8 exponent == 3 bit shift. */
    342		exp++;
    343	}
    344
    345	/*
    346	 * If we need to round up, do it (and handle overflow correctly).
    347	 */
    348	if (rnd && (++value > MAXFRACT)) {
    349		value >>= EXPSIZE;
    350		exp++;
    351	}
    352
    353	/*
    354	 * Clean it up and polish it off.
    355	 */
    356	exp <<= MANTSIZE;		/* Shift the exponent into place */
    357	exp += value;			/* and add on the mantissa. */
    358	return exp;
    359}
    360
    361#if ACCT_VERSION == 1 || ACCT_VERSION == 2
    362/*
    363 * encode an u64 into a comp2_t (24 bits)
    364 *
    365 * Format: 5 bit base 2 exponent, 20 bits mantissa.
    366 * The leading bit of the mantissa is not stored, but implied for
    367 * non-zero exponents.
    368 * Largest encodable value is 50 bits.
    369 */
    370
    371#define MANTSIZE2       20                      /* 20 bit mantissa. */
    372#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
    373#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
    374#define MAXEXP2         ((1 << EXPSIZE2) - 1)    /* Maximum exponent. */
    375
    376static comp2_t encode_comp2_t(u64 value)
    377{
    378	int exp, rnd;
    379
    380	exp = (value > (MAXFRACT2>>1));
    381	rnd = 0;
    382	while (value > MAXFRACT2) {
    383		rnd = value & 1;
    384		value >>= 1;
    385		exp++;
    386	}
    387
    388	/*
    389	 * If we need to round up, do it (and handle overflow correctly).
    390	 */
    391	if (rnd && (++value > MAXFRACT2)) {
    392		value >>= 1;
    393		exp++;
    394	}
    395
    396	if (exp > MAXEXP2) {
    397		/* Overflow. Return largest representable number instead. */
    398		return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
    399	} else {
    400		return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
    401	}
    402}
    403#elif ACCT_VERSION == 3
    404/*
    405 * encode an u64 into a 32 bit IEEE float
    406 */
    407static u32 encode_float(u64 value)
    408{
    409	unsigned exp = 190;
    410	unsigned u;
    411
    412	if (value == 0)
    413		return 0;
    414	while ((s64)value > 0) {
    415		value <<= 1;
    416		exp--;
    417	}
    418	u = (u32)(value >> 40) & 0x7fffffu;
    419	return u | (exp << 23);
    420}
    421#endif
    422
    423/*
    424 *  Write an accounting entry for an exiting process
    425 *
    426 *  The acct_process() call is the workhorse of the process
    427 *  accounting system. The struct acct is built here and then written
    428 *  into the accounting file. This function should only be called from
    429 *  do_exit() or when switching to a different output file.
    430 */
    431
    432static void fill_ac(acct_t *ac)
    433{
    434	struct pacct_struct *pacct = &current->signal->pacct;
    435	u64 elapsed, run_time;
    436	time64_t btime;
    437	struct tty_struct *tty;
    438
    439	/*
    440	 * Fill the accounting struct with the needed info as recorded
    441	 * by the different kernel functions.
    442	 */
    443	memset(ac, 0, sizeof(acct_t));
    444
    445	ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
    446	strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
    447
    448	/* calculate run_time in nsec*/
    449	run_time = ktime_get_ns();
    450	run_time -= current->group_leader->start_time;
    451	/* convert nsec -> AHZ */
    452	elapsed = nsec_to_AHZ(run_time);
    453#if ACCT_VERSION == 3
    454	ac->ac_etime = encode_float(elapsed);
    455#else
    456	ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
    457				(unsigned long) elapsed : (unsigned long) -1l);
    458#endif
    459#if ACCT_VERSION == 1 || ACCT_VERSION == 2
    460	{
    461		/* new enlarged etime field */
    462		comp2_t etime = encode_comp2_t(elapsed);
    463
    464		ac->ac_etime_hi = etime >> 16;
    465		ac->ac_etime_lo = (u16) etime;
    466	}
    467#endif
    468	do_div(elapsed, AHZ);
    469	btime = ktime_get_real_seconds() - elapsed;
    470	ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
    471#if ACCT_VERSION==2
    472	ac->ac_ahz = AHZ;
    473#endif
    474
    475	spin_lock_irq(&current->sighand->siglock);
    476	tty = current->signal->tty;	/* Safe as we hold the siglock */
    477	ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
    478	ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime));
    479	ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime));
    480	ac->ac_flag = pacct->ac_flag;
    481	ac->ac_mem = encode_comp_t(pacct->ac_mem);
    482	ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
    483	ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
    484	ac->ac_exitcode = pacct->ac_exitcode;
    485	spin_unlock_irq(&current->sighand->siglock);
    486}
    487/*
    488 *  do_acct_process does all actual work. Caller holds the reference to file.
    489 */
    490static void do_acct_process(struct bsd_acct_struct *acct)
    491{
    492	acct_t ac;
    493	unsigned long flim;
    494	const struct cred *orig_cred;
    495	struct file *file = acct->file;
    496
    497	/*
    498	 * Accounting records are not subject to resource limits.
    499	 */
    500	flim = rlimit(RLIMIT_FSIZE);
    501	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
    502	/* Perform file operations on behalf of whoever enabled accounting */
    503	orig_cred = override_creds(file->f_cred);
    504
    505	/*
    506	 * First check to see if there is enough free_space to continue
    507	 * the process accounting system.
    508	 */
    509	if (!check_free_space(acct))
    510		goto out;
    511
    512	fill_ac(&ac);
    513	/* we really need to bite the bullet and change layout */
    514	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
    515	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
    516#if ACCT_VERSION == 1 || ACCT_VERSION == 2
    517	/* backward-compatible 16 bit fields */
    518	ac.ac_uid16 = ac.ac_uid;
    519	ac.ac_gid16 = ac.ac_gid;
    520#elif ACCT_VERSION == 3
    521	{
    522		struct pid_namespace *ns = acct->ns;
    523
    524		ac.ac_pid = task_tgid_nr_ns(current, ns);
    525		rcu_read_lock();
    526		ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
    527					     ns);
    528		rcu_read_unlock();
    529	}
    530#endif
    531	/*
    532	 * Get freeze protection. If the fs is frozen, just skip the write
    533	 * as we could deadlock the system otherwise.
    534	 */
    535	if (file_start_write_trylock(file)) {
    536		/* it's been opened O_APPEND, so position is irrelevant */
    537		loff_t pos = 0;
    538		__kernel_write(file, &ac, sizeof(acct_t), &pos);
    539		file_end_write(file);
    540	}
    541out:
    542	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
    543	revert_creds(orig_cred);
    544}
    545
    546/**
    547 * acct_collect - collect accounting information into pacct_struct
    548 * @exitcode: task exit code
    549 * @group_dead: not 0, if this thread is the last one in the process.
    550 */
    551void acct_collect(long exitcode, int group_dead)
    552{
    553	struct pacct_struct *pacct = &current->signal->pacct;
    554	u64 utime, stime;
    555	unsigned long vsize = 0;
    556
    557	if (group_dead && current->mm) {
    558		struct vm_area_struct *vma;
    559
    560		mmap_read_lock(current->mm);
    561		vma = current->mm->mmap;
    562		while (vma) {
    563			vsize += vma->vm_end - vma->vm_start;
    564			vma = vma->vm_next;
    565		}
    566		mmap_read_unlock(current->mm);
    567	}
    568
    569	spin_lock_irq(&current->sighand->siglock);
    570	if (group_dead)
    571		pacct->ac_mem = vsize / 1024;
    572	if (thread_group_leader(current)) {
    573		pacct->ac_exitcode = exitcode;
    574		if (current->flags & PF_FORKNOEXEC)
    575			pacct->ac_flag |= AFORK;
    576	}
    577	if (current->flags & PF_SUPERPRIV)
    578		pacct->ac_flag |= ASU;
    579	if (current->flags & PF_DUMPCORE)
    580		pacct->ac_flag |= ACORE;
    581	if (current->flags & PF_SIGNALED)
    582		pacct->ac_flag |= AXSIG;
    583
    584	task_cputime(current, &utime, &stime);
    585	pacct->ac_utime += utime;
    586	pacct->ac_stime += stime;
    587	pacct->ac_minflt += current->min_flt;
    588	pacct->ac_majflt += current->maj_flt;
    589	spin_unlock_irq(&current->sighand->siglock);
    590}
    591
    592static void slow_acct_process(struct pid_namespace *ns)
    593{
    594	for ( ; ns; ns = ns->parent) {
    595		struct bsd_acct_struct *acct = acct_get(ns);
    596		if (acct) {
    597			do_acct_process(acct);
    598			mutex_unlock(&acct->lock);
    599			acct_put(acct);
    600		}
    601	}
    602}
    603
    604/**
    605 * acct_process - handles process accounting for an exiting task
    606 */
    607void acct_process(void)
    608{
    609	struct pid_namespace *ns;
    610
    611	/*
    612	 * This loop is safe lockless, since current is still
    613	 * alive and holds its namespace, which in turn holds
    614	 * its parent.
    615	 */
    616	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
    617		if (ns->bacct)
    618			break;
    619	}
    620	if (unlikely(ns))
    621		slow_acct_process(ns);
    622}